Friday, January 19, 2024

Coroutines on the NES

When you're writing event-driven code, you often end up implementing a lot of state machines that run once per event. For example, in a game, you have a bunch of state machines that run once per frame to draw graphics, play music, check for input, etc. I always find these tedious and error prone to write. For me, it's much easier to write them as coroutines for simple cooperative multitasking.

The Concept

The basic idea is writing functions that can be suspended and resumed in the middle. Then, you write the state machine as normal branching and looping code. For example, to play a sound effect with an ADSR envelope, the state machine approach would look something like:

/* ADSR volume envelope, explicit state-machine style: one step per frame.
   NOTE(review): assumes nextState was initialized to sound->state before
   this fragment; otherwise a frame with no transition would assign an
   indeterminate value at the bottom — confirm against the full listing. */
switch(sound->state) {
  case STATE_ATTACK:
    /* Ramp volume up until it reaches the peak. */
    sound->vol += sound->attackRate;
    if (sound->vol >= maxVolume) nextState = STATE_DECAY;
    break;
  case STATE_DECAY:
    /* Ramp down to the sustain level, then arm the sustain timer. */
    sound->vol -= sound->decayRate;
    if (sound->vol == sound->sustainLevel) {
        sound->sustainTimer = sound->sustainTime;
        nextState = STATE_SUSTAIN;
    }
    break;
  case STATE_SUSTAIN:
    /* Hold the volume until the timer expires. */
    if (--sound->sustainTimer == 0) {
        nextState = STATE_RELEASE;
    }
    break;
  case STATE_RELEASE:
    /* Fade out; clamp at zero and switch off. */
    sound->vol -= sound->releaseRate;
    if (sound->vol <= 0) {
        sound->vol = 0;
        nextState = STATE_OFF;
    }
    break;
  default:
    sound->vol = 0;  /* STATE_OFF (or unknown state): stay silent */
    break;
}
updateSound(sound);
sound->state = nextState;

Or you could write it as a coroutine:

/* Same ADSR envelope as a coroutine: each YIELD() suspends until the
   next frame, so the states are just positions in the control flow. */

/* Attack: ramp up to peak volume. */
do {
    sound->vol += sound->attackRate;
    updateSound(sound);
    YIELD();
} while(sound->vol < maxVolume);
/* Decay: ramp down to the sustain level. */
do {
    sound->vol -= sound->decayRate;
    updateSound(sound);
    YIELD();
} while(sound->vol > sound->sustainLevel);
/* Sustain: hold for sustainTime frames. */
sound->sustainTimer = sound->sustainTime;
do {
    sound->sustainTimer--;
    updateSound(sound);
    YIELD();
} while(sound->sustainTimer);
/* Release: fade out. (Fixed: the original subtracted decayRate here and
   was missing the loop's while condition entirely.) */
do {
    sound->vol -= sound->releaseRate;
    updateSound(sound);
    YIELD();
} while(sound->vol > 0);
sound->vol = 0;
updateSound(sound);
YIELD();

The sound handler is run once per frame. In the coroutine version, the function runs from one YIELD() to the next on each frame. I think this is a little easier to work with than the explicit state machine version. The state machine is implicit in the control flow instead of being explicitly written out with state transitions.

So how do you implement this on a 6502-like CPU? You need to write the YIELD() function so that it saves registers to the stack, picks another coroutine to run, switches to that routine's stack, then restores the registers and resumes.

The 6502 has a very limited stack pointer, though. Its high byte is fixed at 0x01, so the stack can only hold 256 bytes. This could be partitioned between the coroutines, but the approach I took was to save and restore the entire stack on each YIELD(). Since the YIELD() normally happens when there's not much on the stack, this isn't too expensive.

The other thing that makes this cheap is that you don't need to automatically save processor registers. Since YIELD() is implemented as a function call, there's no expectation that registers will be preserved. So the calling code can save them if it wants, but it isn't done automatically. And conveniently, calling YIELD() saves the PC of the calling site to the stack, so the task switching logic can just use a normal rts instruction to resume.

The Code

Here's my implementation in two files. First, tasks.inc:

; tasks.inc - include this in your asm file.
task_count .set 0

.macro addMainTask name, stack
        ; Register the already-running main task.  Only the stack-array
        ; pointer (bytes 1-2 of the task struct) is initialized; the
        ; saved-size byte at name+0 is written by yield before it is
        ; ever read.  Writes its pointer to TASKS+0 directly, so this
        ; must be the FIRST task added (task_count still 0).
        lda #<stack
        sta name+1              ; task struct bytes 1-2 = stack array address
        lda #>stack
        sta name+2
        lda #<name
        sta TASKS               ; TASKS[0] -> main task struct
        lda #>name
        sta TASKS+1
task_count .set task_count + 1
.endmacro

.macro addTask name, stack, entry
        ; Register a new task whose saved stack contains only its entry
        ; point, so the first switch to it "returns" into entry.
        lda #2
        sta name                ; saved-stack size = 2 (just the resume PC)
        lda #<stack
        sta name+1              ; task struct bytes 1-2 = stack array address
        lda #>stack
        sta name+2
        lda #<(entry-1)         ; rts adds 1 to the popped PC, so store entry-1
        sta stack               ; low byte at offset 0 (popped first by rts)
        lda #>(entry-1)
        sta stack+1             ; high byte at offset 1
        lda #<name
        sta TASKS+(task_count*2)    ; append struct pointer to the task table
        lda #>name
        sta TASKS+(task_count*2)+1
task_count .set task_count + 1
.endmacro

.macro initTaskList main_task
        ; Finish setup: record how many tasks were registered and make
        ; task 0 (the main task) the currently running one.
        lda #0
        sta CURRENT_TASK
        lda #task_count         ; assembly-time count of addTask/addMainTask uses
        sta NUM_TASKS
        lda #<main_task
        sta TASK                ; TASK -> main task's struct
        lda #>main_task
        sta TASK+1
.endmacro

.import TASKS
.importzp TASK
.importzp CURRENT_TASK
.importzp NUM_TASKS
.import yield, switchTask, sleep

And here's the actual implementation in tasks.asm:

; tasks.asm - coroutine yield implementation file
; Define MAX_TASKS in your build system.
; Requires a 2-byte zero-page variable called PTR defined elsewhere

.segment "ZEROPAGE"
TASK: .res 2  ; points to current task
CURRENT_TASK: .res 1 ; index of running task
NUM_TASKS: .res 1 ; number of tasks

.segment "DATA"
; Table of 2-byte pointers to task structs, indexed by CURRENT_TASK*2.
; NOTE(review): entries are 2 bytes each, so 2*MAX_TASKS would suffice;
; 3*MAX_TASKS over-allocates by MAX_TASKS bytes — confirm intent.
TASKS: .res 3*MAX_TASKS

.exportzp TASK, CURRENT_TASK, NUM_TASKS
.importzp PTR
.export yield, switchTask, TASKS, sleep

.segment "CODE"
; Call this subroutine to suspend current task and switch to the next
.proc yield
        ; Suspend the current task: copy the whole machine stack (which
        ; includes the return address pushed by the caller's jsr) into
        ; the task's private stack array, record its size, then advance
        ; to the next task and resume it via switchTask.
        ;
        ; Load the task's stack-array pointer (struct bytes 1-2) into PTR.
        ; task struct address is in zero-page reg TASK
        ldy #1
        lda (TASK), y
        sta PTR
        iny
        lda (TASK), y
        sta PTR+1

        ldy #0                  ; Y counts bytes copied out of the stack

        ; pop from the stack and store in stack array
        tsx
        inx                     ; X = SP+1; wraps to 0 when stack is empty
        beq copyDone
copyLoop:
        pla                     ; first byte popped is the caller's
        sta (PTR), y            ;   return-address low byte -> array[0]
        iny
        inx
        bne copyLoop            ; stop when X wraps: $FF - SP bytes copied
copyDone:
        tya                     ; A = number of bytes saved
        ; store the size of the stack in the task state
        ldy #0
        sta (TASK), y
nextTask:
        ; Advance CURRENT_TASK with wraparound, then load the next task's
        ; struct pointer from the TASKS table (2 bytes per entry).
        ldy CURRENT_TASK
        iny
        cpy NUM_TASKS
        bcc :+
        ldy #0                  ; past the last task: wrap to task 0
:       sty CURRENT_TASK
        tya
        asl                     ; index * 2 -> byte offset into TASKS
        tay
        lda TASKS, y
        sta TASK
        lda TASKS+1, y
        sta TASK+1
        jmp switchTask          ; rebuild that task's stack and resume it
.endproc

; helper function, no need to call it directly
.proc switchTask
        ; Rebuild the saved stack of the task pointed to by TASK, then
        ; rts into it: array bytes 0-1 hold the resume PC (minus 1).
        ; stack is supposed to be empty here
        ; task struct address is in zero-page reg TASK
        ldy #1
        lda (TASK), y
        sta PTR
        iny
        lda (TASK), y
        sta PTR+1
        ldy #0
        lda (TASK), y  ; stackBytes
        tay
        dey                     ; Y = stackBytes-1 ($FF/negative if empty)
copyLoop:
        lda (PTR), y            ; push saved bytes back, highest index first
        pha
        dey
        bpl copyLoop            ; NOTE: bpl caps saved stacks at 128 bytes
copyDone:
        rts                     ; pops the saved PC -> resumes the task
.endproc

.proc sleep
; Call this subroutine to sleep for the number of frames in A.
; A = 0 is treated as 256 frames.  Clobbers A (registers are never
; preserved across yield anyway).
; Fix: the original ended the loop with `bpl loop`, which slept A+1
; frames for A <= 128 and only 1 frame for A >= 129; `bne` sleeps
; exactly A frames.
loop:
        pha                     ; keep remaining frame count across yield
        jsr yield               ; give up the rest of this frame
        pla
        sec
        sbc #1                  ; one frame elapsed
        bne loop                ; keep yielding until the count hits zero
        rts
.endproc

How to use it

To use it, you need to declare all your tasks, then call yield. For example:


.segment "DATA"
; task state structure -> 1-byte stack size followed by 2-byte pointer to stack array
main_task: .res 3
music_task: .res 3
.align 256      ; NOTE(review): presumably keeps each stack inside one page — confirm necessity
main_task_stack: .res 128   ; this can be way smaller
music_task_stack: .res 128  ; this can also be way smaller

        ; ...
        ; in the initialization of the main task
        addMainTask main_task, main_task_stack
        addTask music_task, music_task_stack, music_main
        initTaskList main_task
        ; Run all tasks to their first yield
        jsr yield

        ; ... do whatever here

mainLoop:
        ; ... do your beginning-of-frame stuff here ...

        ; run all tasks until the next yield
        jsr yield

        ; ... do your end-of-frame stuff here ...

        clc
        bcc mainLoop            ; clc + bcc = unconditional "branch always"

Note that this doesn't handle returning from a coroutine. All your coroutines should be infinite loops. The main task can add or remove coroutines from the task list TASKS dynamically.

Conclusion

This is certainly not production quality, and my 6502 coding skills are probably not great, but it shows that implementing coroutines on a 6502-like CPU is not that complicated. If you keep the stacks small when you jsr yield so only the PC and a few state variables are saved, you could use this for a bunch of routines. It wouldn't be too difficult to launch coroutines for each enemy, sound effect, or any other thing that needs a frame-triggered state machine. Even better, this technique could potentially use fewer cycles than the explicit state machine style because it maps better onto the underlying hardware.

More Testing of Mismatched RAM Modules

Here's a simple program to test whether dual-channel mode is being used for RAM.

Using this, I found that read speeds on my system increase even when I mismatch DIMM capacities in the A and B memory channels, but write speeds do not.

The code

// test.c
// compile with gcc -o test -O0 test.c
// then run with ./test
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/time.h>

#define GB 8.0 /* amount of data to test, in GiB */
#define SZ ((size_t) (GB*1073741824UL)) /* same amount in bytes (GB * 2^30) */

/* Return the current wall-clock time in seconds as a double
   (microsecond resolution, via gettimeofday). */
static double getTime(void)
{
  struct timeval now;

  gettimeofday(&now, NULL);
  return (double) now.tv_sec + now.tv_usec / 1e6;
}

/*
 * Measure sequential RAM write and read bandwidth.
 * Write test: memset over an SZ-byte buffer.
 * Read test:  memchr scan of the same buffer (the 'a' bytes were all
 * cleared by the write test, so the scan must touch every byte).
 * Returns 0 on success, 1 if the buffer cannot be allocated.
 */
int main(int argc, char *argv[])
{
  uint8_t *p;
  double t1, t2;

  (void) argc;
  (void) argv;

  p = malloc(SZ);
  if (p == NULL) {                /* fixed: original dereferenced an unchecked malloc */
    fprintf(stderr, "failed to allocate %.1f GB\n", GB);
    return 1;
  }
  memset(p, 'a', SZ); // must write non-zero data first so the OS will actually map the pages to RAM

  // test write
  t1 = getTime();
  memset(p, 0, SZ);
  t2 = getTime();
  printf("Wrote %.1f GB in %f seconds -> %3.2f GB/s\n", GB, t2-t1, ((double)GB)/(t2-t1));

  // test read (fixed: original comment said "test write" again)
  t1 = getTime();
  void *n = memchr(p, 'a', SZ);
  t2 = getTime();
  printf("Read %.1f GB in %f seconds -> %3.2f GB/s\n", GB, t2-t1, ((double)GB)/(t2-t1));
  printf("%p\n", n); // must use n for something to avoid memchr() being optimized out

  free(p);
  return 0;
}

The memset() and memchr() standard library functions are presumed to be heavily optimized for writing and reading from RAM, respectively. And in fact, memset() runs just a little slower than the theoretical limit, while memchr() is faster than rep lodsq, so I assume whoever wrote it is better at optimizing than I am.

With 16GB + 8GB of DDR4-3200 installed in memory channels A and B on my system, this prints:

Wrote 8.0 GB in 0.337975 seconds -> 23.67 GB/s
Read 8.0 GB in 0.249023 seconds -> 32.13 GB/s

The maximum possible bandwidth per channel is something like 3200e6*8 = 23.8 GB/s. It turns out that on a Ryzen 5700 processor, writes are bottlenecked somewhere between the CPU and the RAM controller, so we don't get the benefit of dual-channel writes. We'd expect that the reads would be up to twice as fast, but they're not. In fact, if I run the test for a 1-GB block instead of 8, reads and writes are the same speed. Presumably I'm getting memory that's mapped to a single DIMM in that scenario. I only get higher speeds for larger allocations that are more likely to use both DIMMs. At 16 GB, reads reach almost 33 GB/s.

So in conclusion, mismatched DIMM capacities in dual-channel mode on a Ryzen 5700 give some benefit, but read speeds don't reliably double.

By the way, benchmarks of this sort keep getting harder to write. The OS does not actually map pages to physical RAM until you write to them. If you try to use calloc() to initialize to zero, the OS maps all the pages to a read-only zero-filled page and still only maps them to physical RAM when you write. If you use malloc() followed by bzero() instead of calloc(), gcc's optimizer can replace that with a call to calloc()! And calls to memchr() are optimized out even at -O0 if you don't use the result somewhere. On the other hand, hand-written assembly is not reliably fast on today's CPUs. So be careful doing these kinds of tests with a modern compiler.

Thursday, January 18, 2024

Testing RAM Performance with Mismatched Modules

I upgraded the RAM in one of my computers and ended up with some extra 8 GB DDR4 sticks. My computers are mostly mini-ITX systems with only two RAM slots, so I normally buy matched pairs of RAM sticks, but for various reasons my main workstation had only a single slot filled with a 16 GB module. So I thought I'd see whether performance increased if I added an 8 GB module for 24 GB of RAM total.

A little research suggested that "dual-channel" mode would be a bit faster with matched sizes, but there's not widespread agreement on whether this would also work with mismatched sizes. For example, Socket 939 processors supported a "ganged" mode (see page 16) that combined two 64-bit channels into a 128-bit channel if DIMM sizes were matched across the two channels. Some people online spoke of a "flex mode" for partial dual-channel operation with mismatched modules, but others claimed this was an Intel-only technology.

It looks like AMD's Socket 939 had "A" and "B" pins for memory control signals, but the above document says they are identical signals, with an "A" and "B" pin provided for each to reduce loading when 4 DIMMs are connected. With the AMD 10h family, they changed this to be two independent memory channels but still supported "ganged" mode as a BIOS option for years.

So the question is, if the two DIMMs are accessed using independent memory controller channels, are the addresses still interleaved between DIMMs? If yes, is it still true if sizes are mismatched? I couldn't find any documentation of the Zen3 RAM controller that might help answer the question for current CPUs, so I just had to test it and see for myself.

The conclusion? Yes, mixing different-sized modules on different channels does increase RAM bandwidth for sequential writes, but not as much as using matched modules.

Test Details

I didn't know the standard way to test memory speeds on Linux, so I used a suggestion from here: https://serverfault.com/questions/372020/what-are-the-best-possible-ways-to-benchmark-ram-no-ecc-under-linux-arm

The suggestion amounted to running these commands:

$ cd /mnt
$ sudo mount -t tmpfs /mnt/test1 /mnt/test1
$ sudo dd if=/dev/zero of=/mnt/test1/test bs=1M

This just creates a RAM disk that's half as big as your RAM and fills it up with a bunch of zeros, then reports the average data rate. I tried it with three configurations:

  • A single 8-GB DDR4-3200 module: 5.7 GB/s
  • An 8-GB DDR4-3200 module plus a 16-GB DDR4-3200 module: 5.9 GB/s (+3.5%)
  • Two 8-GB DDR4-3200 modules: 6.2 GB/s (+8.7%)

What's going on here? Well, naively, you'd expect that filling RAM with zeroes would get close to the maximum speed of 3200e6 * 8 = 23.8 GB/s. But it's a little more complicated, since we're actually reading from /dev/zero, then copying the resulting buffer. The buffer is small enough to fit into my Zen3 CPU's L3 cache, so most of this is not limited by the RAM speed. In particular, the RAM controller doesn't have to switch between reading and writing, which could slow things down a lot. But for each byte, we're writing twice and reading once. On top of that, we have system call overhead and filesystem overhead from using tmpfs. With a single channel, this crude benchmark is getting about 25% of the maximum speed.

According to the unofficial AM4 pinout, each RAM channel has dedicated data, address, and control signals. So they should be able to operate independently. This would potentially double the maximum bandwidth to 57.6 GB/s if writes were interleaved between the two banks. My understanding is that Zen3's CPU cores can read and write 32 bytes (256 bits) from L3 on each ~4000-MHz clock cycle, so it should have no problem saturating two 64-bit RAM channels at 3200 MT/s.

If we interpreted the single-channel result as saying that 25% of the time was spent writing to RAM at the maximum transfer rate, then doubling the RAM speed would give us a 14% speedup (1/(1-0.25/2)=1.14). Instead, we get an 8.7% speedup.

The 24-GB case with mixed sizes is more interesting. Here, we get a 3.5% speedup. So presumably the memory controller is able to use both channels some of the time.

What's not clear is how the memory interleaving works. Is the memory controller able to interleave addresses with 8-byte granularity in both dual-channel cases? The 3.5% speed improvement versus 8.7% for matched sizes suggests that less than half of accesses are interleaved, even though we'd expect 2/3 of addresses to be interleavable.

I'd like to redo the testing with a much simpler rep movsb loop that has very little overhead. Unfortunately, after doing all this testing in my cramped mini-ITX case, the 16 GB RAM stick quit working, so I can't repeat it with a better benchmark!

Monday, January 8, 2024

Using sim65 to Test NES Code

I needed a period-63 LFSR implementation in 6502 assembly for some NES code I was writing. Normally, I like to unit test code, but it wasn't obvious how to do that with an NES. Thankfully, the cc65 project already has a great simulator with semi-hosted C stdio support that you can use for this sort of thing. This post explains how I tested my LFSR implementation.

I started with the LFSR code itself, in lfsr.asm:

.export _lfsr_state
.export _lfsr_update

.segment "DATA"
_lfsr_state: .res 1

.segment "CODE"

.proc _lfsr_update
        ; period 63 lfsr with polynomial x^6 + x^5 + 1, computed in top 6 bits
        ; Bottom 2 bits must always be 0.
        ;
        ; After asl: C = old bit 7 (the x^6 tap), N = new bit 7 = old
        ; bit 6 (the x^5 tap).  The feedback bit (C xor N) is folded into
        ; bit 2 — the LSB of the 6-bit field — by conditionally applying
        ; eor #4 once per set tap; when both taps are set the two eors
        ; cancel, which is exactly the xor.
        lda _lfsr_state
        asl
        bpl :+                  ; skip if old bit 6 (x^5 tap) is clear
        eor #4
:       bcc :+                  ; skip if old bit 7 (x^6 tap) is clear
        eor #4
:       sta _lfsr_state
        rts
.endproc

I'm still new to 6502 coding, so I'm sure this is suboptimal, but it seemed like a good start. The whole point of the exercise is to make it easier to optimize small routines, after all.

Then I made main.c:

#include <stdio.h>
#include <stdint.h>

extern void lfsr_update(void);
extern uint8_t lfsr_state;

/* Walk the LFSR from a known seed until that seed reappears, printing
   every intermediate state; demonstrates the full period-63 cycle. */
int main(void)
{
  uint8_t step = 0;
  uint8_t seed;

  lfsr_state = 0x4;         /* non-zero seed with bottom 2 bits clear */
  seed = lfsr_state;
  do {
    printf("%d: %02X\n", step, lfsr_state);
    step++;
    lfsr_update();
  } while (lfsr_state != seed);
  return 0;
}

This initializes the LFSR (in the top 6 bits of lfsr_state) to a non-zero value, then updates it until it sees the same value again, printing the intermediate values.

Assuming you installed cc65 already, you can compile this with the commands:

$ cl65 --target sim6502 -o test main.c lfsr.asm

Then you can simulate it with the command:

$ sim65 test

This should give the output:

0: 04
1: 08
2: 10
3: 20
4: 40
5: 84
6: 0C
7: 18
8: 30
9: 60
10: C4
11: 88
12: 14
13: 28
14: 50
15: A4
16: 4C
17: 9C
18: 3C
19: 78
20: F4
21: E8
22: D0
23: A0
24: 44
25: 8C
26: 1C
27: 38
28: 70
29: E4
30: C8
31: 90
32: 24
33: 48
34: 94
35: 2C
36: 58
37: B4
38: 6C
39: DC
40: B8
41: 74
42: EC
43: D8
44: B0
45: 64
46: CC
47: 98
48: 34
49: 68
50: D4
51: A8
52: 54
53: AC
54: 5C
55: BC
56: 7C
57: FC
58: F8
59: F0
60: E0
61: C0
62: 80

This shows that the LFSR cycles after 63 steps, as intended. It also always leaves bits 0-1 clear.

The major limitation of this approach for testing is that if your routines are not using cc65's normal calling convention, you may have to write a wrapper function to be able to call them from C.

The sim65 tool also allows cycle counts to help optimization, but I don't think it can produce any sort of instruction trace that would make it more useful for debugging.