SBBE has no string type. Strings are byte sequences in linear memory, managed entirely by the frontend. This guide shows how a compiler lowers common string operations into SBBE IR.

What we’re lowering

Consider a language with built-in string operations:

// Builds a greeting by concatenating the literal "Hello, " with `name`
// and returns the resulting string.
func greet(name: String) -> String {
    let greeting = "Hello, "
    return greeting + name   // string concatenation the frontend must lower
}

// Counts how many elements iterating over `s` produces and returns that
// count as an Int32.
func countChars(s: String) -> Int32 {
    var count = 0
    for ch in s {
        count += 1   // `ch` is not read; only the number of iterations matters
    }
    return count
}

At the source level, strings are opaque values with high-level operations like concatenation and iteration. The frontend must decide on a concrete memory representation and lower every operation to pointer arithmetic, byte loads, and loops. SBBE provides the building blocks; the frontend makes the design choices.

Choosing a representation

Before emitting any IR, the frontend must pick a string representation. The two most common are:

| Representation | Layout | Pros | Cons |
| --- | --- | --- | --- |
| Null-terminated (C) | ptr to bytes ending in 0x00 | Simple, interop with C | Length requires a scan, no embedded nulls |
| Length-prefixed (Rust, Go) | (ptr data, i32 len) | O(1) length, embedded nulls ok | 8+ extra bytes per string |

The rest of this guide shows both, starting with null-terminated strings since they are simpler to lower.

C-style null-terminated strings

Pseudo-code: string length

// Returns the number of bytes that precede the first 0x00 byte at `s`.
// The loop has no upper bound, so it relies on a terminator being present.
func strlen(s: *UInt8) -> Int32 {
    var len = 0
    while s[len] != 0 {   // stop at the null terminator
        len += 1
    }
    return len
}

Step-by-step lowering

  1. Parameters: s is a ptr to the first byte. We declare a local $len as i32 to track the current offset.
  2. The loop condition (s[len] != 0): Compute the address s + len with add.s i64, load a single byte with ldm.u8 (unsigned so 0x00-0xFF maps to 0-255), then test with eqz.
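  3. Why ldm.u8? Characters are unsigned byte values. Using ldm.s8 would sign-extend bytes above 0x7F (which includes every byte of a multi-byte UTF-8 sequence) to negative integers. The zero check itself would still work, because a nonzero byte stays nonzero after sign-extension, but any later comparison or arithmetic on the byte value, such as the subtraction in memcmp, would see the wrong number.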
  4. Incrementing: Plain add.s i32 on the counter, then jump back to the check.

SBBE lowering

// $strlen: counts the bytes that precede the first zero byte.
// Param 0 is the pointer to the first byte; the count is returned as i32.
func $strlen(ptr) -> i32 {
    var $len i32       // current offset from s; also the result

entry:
    ldi 0
    str $len           // $len = 0
    jmp check

check:
    ldl 0              // s
    ld $len
    // NOTE(review): $len is declared i32 but feeds an i64 add; confirm whether
    // SBBE widens it implicitly or an explicit extension is needed here.
    add.s i64          // s + len
    ldm.u8             // load byte (zero-extended)
    eqz i32            // is it the null terminator?
    jmp.if done
    jmp next

next:
    ld $len
    ldi 1
    add.s i32
    str $len           // $len = $len + 1
    jmp check

done:
    ld $len
    ret                // return $len
}

Pseudo-code: byte comparison

// Compares the first `len` bytes of `a` and `b`.
// Returns 0 when every pair is equal; otherwise returns the difference of the
// first mismatching pair. The SBBE lowering loads both bytes zero-extended and
// subtracts them as i32, so the result can be negative.
func memcmp(a: *UInt8, b: *UInt8, len: Int32) -> Int32 {
    for i in 0..<len {
        if a[i] != b[i] {
            return a[i] - b[i]   // sign tells the caller which buffer is greater
        }
    }
    return 0   // no mismatch within the first `len` bytes
}

Step-by-step lowering

  1. Loop structure: A counter $i walks from 0 to len. Each iteration loads one byte from each buffer and compares them.
  2. Address computation: For each buffer, the byte address is base + i. Both use add.s i64 because pointer arithmetic operates at pointer width.
  3. Early exit: We subtract the two bytes. If the result is nonzero, we return it immediately (positive means a > b, negative means a < b). If zero, we advance to the next byte.
  4. Why sub.s i32 for the comparison? The difference of two unsigned bytes (0-255) always fits in a signed i32, and the sign of the result tells the caller which buffer is “greater.” This matches the C memcmp contract.

SBBE lowering

// $memcmp: compares two byte buffers over a given length.
// Params: 0 = first buffer, 1 = second buffer, 2 = length.
// Returns 0 if no byte differs, otherwise the i32 difference (first - second)
// of the first mismatching pair.
func $memcmp(ptr, ptr, i32) -> i32 {
    var $i i32         // loop counter / byte offset

entry:
    ldi 0
    str $i             // $i = 0
    jmp check

check:
    ld $i
    ldl 2              // length
    // NOTE(review): this is the unsigned comparison, while the pseudo-code
    // length is a signed Int32; confirm callers never pass a negative length.
    ge.u i32           // i >= length?
    jmp.if equal
    jmp compare

compare:
    // Load byte from first buffer
    ldl 0
    ld $i
    add.s i64          // a + i
    ldm.u8

    // Load byte from second buffer
    ldl 1
    ld $i
    add.s i64          // b + i
    ldm.u8

    // Compare
    sub.s i32          // a - b
    dup                // keep one copy of the difference past the eqz test
    eqz i32
    jmp.if advance
    ret                // return nonzero difference

advance:
    drop               // discard the zero difference
    ld $i
    ldi 1
    add.s i32
    str $i             // $i = $i + 1
    jmp check

equal:
    ldi 0
    ret                // all compared bytes matched
}

Length-prefixed strings

Pseudo-code

// Pointer-plus-length string representation: a data pointer followed by an
// Int32 length. The offsets below are the ones the SBBE lowering hard-codes.
struct StringView {
    var data: *UInt8    // offset 0, size 8 (ptr)
    var length: Int32   // offset 8, size 4
}

// Returns the stored length field of the view; no scan of the data is needed.
func stringLen(s: *StringView) -> Int32 {
    return s.length
}

// Returns the byte at index `i` of the view's data.
// No comparison against `s.length` is made here.
func charAt(s: *StringView, i: Int32) -> UInt8 {
    return s.data[i]
}

Step-by-step lowering

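  1. Struct layout: StringView has 12 bytes of fields: data is a ptr at offset 0, length is an i32 at offset 8. (A frontend that aligns the struct to its 8-byte pointer field would pad the total size to 16 bytes.) The frontend computes these offsets at compile time.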
  2. Length access (s.length): Load the parameter (a pointer to the struct), add offset 8, then ldm i32. This is O(1), unlike the null-terminated strlen which requires a loop.
  3. Character access (s.data[i]): First load the data pointer from offset 0 with ldm ptr, then add the index i, then load a byte with ldm.u8. This is two loads: one to dereference the struct, one to dereference the data pointer.
  4. Why a struct pointer? Passing the struct by pointer avoids copying both fields on every call. For hot paths, the frontend could also flatten the struct into two parameters (ptr, i32) to avoid the indirection.

SBBE lowering

// Get the length of a string view
// Param 0 is a pointer to the StringView struct; returns its length field.
func $string_len(ptr) -> i32 {
entry:
    ldl 0              // pointer to the struct
    ldi 8              // offset of the length field
    add.s i64          // address of the length field
    ldm i32            // load length field
    ret
}

// Get a byte at index i
// Params: 0 = pointer to the StringView struct, 1 = index.
// Returns the byte as an i32. The index is not checked against the length.
func $string_at(ptr, i32) -> i32 {
entry:
    ldl 0              // pointer to the struct; data field is at offset 0
    ldm ptr            // load data pointer
    ldl 1              // index
    // NOTE(review): the index is an i32 param consumed by an i64 add; confirm
    // whether SBBE widens it implicitly.
    add.s i64          // data + index
    ldm.u8             // load byte
    ret
}

Copying memory (memcpy)

Pseudo-code

// Copies `len` bytes from `src` to `dst`, one byte at a time in ascending
// index order. Returns nothing.
func memcpy(dst: *UInt8, src: *UInt8, len: Int32) {
    for i in 0..<len {
        dst[i] = src[i]
    }
}

Step-by-step lowering

  1. Loop counter: $i counts from 0 to len, same pattern as memcmp.
  2. Reading a byte: Compute src + i, then ldm.u8 to load one byte.
  3. Writing a byte: Compute dst + i, push the byte, then stm8 to store. Note that stm8 (not stm i32) truncates the value to 8 bits before writing, so only one byte is stored regardless of the i32 value on the stack.
  4. Why byte-by-byte? This is the simplest correct lowering. A real backend would recognize this loop pattern and emit a block copy instruction (rep movsb on x86, memcpy call on ARM). The IR should be correct first; the optimizer makes it fast.

SBBE lowering

// $memcpy: copies bytes from src to dst one at a time, in ascending order.
// No value is returned.
func $memcpy(ptr, ptr, i32) {
    // params: dst=0, src=1, len=2
    var $i i32         // loop counter / byte offset

entry:
    ldi 0
    str $i             // $i = 0
    jmp check

check:
    ld $i
    ldl 2
    ge.u i32           // i >= len?
    jmp.if done
    jmp copy

copy:
    // dst[i] = src[i]
    // The destination address is pushed first, then the value to store.
    ldl 0
    ld $i
    add.s i64          // dst + i

    ldl 1
    ld $i
    add.s i64          // src + i
    ldm.u8             // load byte from src

    stm8               // store byte to dst

    ld $i
    ldi 1
    add.s i32
    str $i             // $i = $i + 1
    jmp check

done:
    ret
}

Key takeaways