Friday, January 19, 2024

Coroutines on the NES

When you're writing event-driven code, you often end up implementing a lot of state machines that run once per event. For example, in a game, you have a bunch of state machines that run once per frame to draw graphics, play music, check for input, etc. I always find these tedious and error prone to write. For me, it's much easier to write them as coroutines for simple cooperative multitasking.

The Concept

The basic idea is writing functions that can be suspended and resumed in the middle. Then, you write the state machine as normal branching and looping code. For example, to play a sound effect with an ADSR envelope, the state machine approach would look something like:

switch(sound->state) {
  case STATE_ATTACK:
    sound->vol += sound->attackRate;
    if (sound->vol >= maxVolume) nextState = STATE_DECAY;
    break;
  case STATE_DECAY:
    sound->vol -= sound->decayRate;
    if (sound->vol == sound->sustainLevel) {
        sound->sustainTimer = sound->sustainTime;
        nextState = STATE_SUSTAIN;
    }
    break;
  case STATE_SUSTAIN:
    if (--sound->sustainTimer == 0) {
        nextState = STATE_RELEASE;
    }
    break;
  case STATE_RELEASE:
    sound->vol -= sound->releaseRate;
    if (sound->vol <= 0) {
        sound->vol = 0;
        nextState = STATE_OFF;
    }
    break;
  default:
    sound->vol = 0;
    break;
}
updateSound(sound);
sound->state = nextState;

Or you could write it as a coroutine:

do {
    sound->vol += sound->attackRate;
    updateSound(sound);
    YIELD();
} while(sound->vol <= maxVolume);
do {
    sound->vol -= sound->decayRate;
    updateSound(sound);
    YIELD();
} while(sound->vol >= sound->sustainVolume);
sound->sustainTimer = sound->sustainTime;
do {
    sound->sustainTimer--;
    updateSound(sound);
    YIELD();
} while(sound->sustainTimer);
do {
    sound->vol -= sound->decayRate;
    updateSound(sound);
    YIELD();
}
sound->vol = 0;
updateSound(sound);
YIELD();

The sound handler is run once per frame. In the coroutine version, the function runs from one YIELD() to the next on each frame. I think this is a little easier to work with than the explicit state machine version. The state machine is implicit in the control flow instead of being explicitly written out with state transitions.

So how do you implement this on a 6502-like CPU? You need to write the YIELD() function so that it saves registers to the stack, picks another coroutine to run, switches to that routine's stack, then restores the registers and resumes.

The 6502 has a very limited stack pointer, though. Its high byte is fixed at 0x01, so the stack can only hold 256 bytes. This could be partitioned between the coroutines, but the approach I took was to save and restore the entire stack on each YIELD(). Since the YIELD() normally happens when there's not much on the stack, this isn't too expensive.

The other thing that makes this cheap is that you don't need to automatically save processor registers. Since YIELD() is implemented as a function call, there's no expectation that registers will be preserved. So the calling code can save them if it wants, but it isn't done automatically. And conveniently, calling YIELD() saves the PC of the calling site to the stack, so the task switching logic can just use a normal rts instruction to resume.

The Code

Here's my implementation in two files. First, tasks.inc:

; tasks.inc - include this in your asm file.
task_count .set 0

.macro addMainTask name, stack
        lda #<stack
        sta name+1
        lda #>stack
        sta name+2
        lda #<name
        sta TASKS
        lda #>name
        sta TASKS+1
task_count .set task_count + 1
.endmacro

.macro addTask name, stack, entry
        lda #2
        sta name
        lda #<stack
        sta name+1
        lda #>stack
        sta name+2
        lda #<(entry-1)
        sta stack
        lda #>(entry-1)
        sta stack+1
        lda #<name
        sta TASKS+(task_count*2)
        lda #>name
        sta TASKS+(task_count*2)+1
task_count .set task_count + 1
.endmacro

.macro initTaskList main_task
        lda #0
        sta CURRENT_TASK
        lda #task_count
        sta NUM_TASKS
        lda #<main_task
        sta TASK
        lda #>main_task
        sta TASK+1
.endmacro

.import TASKS
.importzp TASK
.importzp CURRENT_TASK
.importzp NUM_TASKS
.import yield, switchTask, sleep

And here's the actual implementation in tasks.asm:

; tasks.asm - coroutine yield implementation file
; Define MAX_TASKS in your build system.
; Requires a 2-byte zero-page variable called PTR defined elsewhere

.segment "ZEROPAGE"
TASK: .res 2  ; points to current task
CURRENT_TASK: .res 1 ; index of running task
NUM_TASKS: .res 1 ; number of tasks

.segment "DATA"
TASKS: .res 3*MAX_TASKS

.exportzp TASK, CURRENT_TASK, NUM_TASKS
.importzp PTR
.export yield, switchTask, TASKS, sleep

.segment "CODE"
; Call this subroutine to suspend current task and switch to the next
.proc yield
        ; task struct address is in zero-page reg TASK
        ldy #1
        lda (TASK), y
        sta PTR
        iny
        lda (TASK), y
        sta PTR+1

        ldy #0

        ; pop from the stack and store in stack array
        tsx
        inx
        beq copyDone
copyLoop:
        pla
        sta (PTR), y
        iny
        inx
        bne copyLoop
copyDone:
        tya
        ; store the size of the stack in the task state
        ldy #0
        sta (TASK), y
nextTask:
        ldy CURRENT_TASK
        iny
        cpy NUM_TASKS
        bcc :+
        ldy #0
:       sty CURRENT_TASK
        tya
        asl
        tay
        lda TASKS, y
        sta TASK
        lda TASKS+1, y
        sta TASK+1
        jmp switchTask
.endproc

; helper function, no need to call it directly
.proc switchTask
        ; stack is supposed to be empty here
        ; task struct address is in zero-page reg TASK
        ldy #1
        lda (TASK), y
        sta PTR
        iny
        lda (TASK), y
        sta PTR+1
        ldy #0
        lda (TASK), y  ; stackBytes
        tay
        dey
copyLoop:
        lda (PTR), y
        pha
        dey
        bpl copyLoop
copyDone:
        rts
.endproc

.proc sleep
; call this subroutine to sleep for number of frames in A
loop:
        pha
        jsr yield
        pla
        clc
        adc #$ff
        bpl loop
        rts
.endproc

How to use it

To use it, you need to declare all your tasks, then call yield. For example:


.segment "DATA"
; task state structure -> 1-byte stack size followed by 2-byte pointer to stack array
main_task: .res 3
music_task: .res 3
.align 256
main_task_stack: .res 128   ; this can be way smaller
music_task_stack: .res 128  ; this can also be way smaller

        ; ...
        ; in the initialization of the main task
        addMainTask main_task, main_task_stack
        addTask music_task, music_task_stack, music_main
        initTaskList main_task
        ; Run all tasks to their first yield
        jsr yield
        
        ; ... do whatever here
        
mainLoop:
        ; ... do your beginning-of-frame stuff here ...
        
        ; run all tasks until the next yield
        jsr yield
        
        ; ... do your end-of-frame stuff here ...
        
        clc
        bcc mainLoop

Note that this doesn't handle returning from a coroutine. All your coroutines should be infinite loops. The main task can add or remove coroutines from the task list TASKS dynamically.

Conclusion

This is certainly not production quality, and my 6502 coding skills are probably not great, but it shows that implementing coroutines on a 6502-like CPU is not that complicated. If you keep the stacks small when you jsr yield so only the PC and a few state variables are saved, you could use this for a bunch of routines. It wouldn't be too difficult to launch coroutines for each enemy, sound effect, or any other thing that needs a frame-triggered state machine. Even better, this technique could potentially use fewer cycles than the explicit state machine style because it maps better onto the underlying hardware.

No comments:

Post a Comment