Hello my name is cearn, and I'm an ARMaholic.

Aw crap. I told myself I wasn't going to do this:

bmp16_floodfill in ARM assembly

@ ---------------------------------------------------------------------
@ void bmp16_floodfill_asm(int x, int y, u16 clrNew, void *dstBase,
@   u16 dstW, u16 dstH);
@ ---------------------------------------------------------------------

    .extern memset16

    .text
    .arm
    .align 2
    .global bmp16_floodfill_asm
bmp16_floodfill_asm:
    stmfd   sp!, {r4-r11,lr}

    @ NOTE: all x-values are doubled for use as halfword offsets
    /* Reglist:
        r0: x, dstL
        r1: ppx (current pixel pointer)
        r2: x*2 (right, left, count)
        r3: yx / tmp / clr
        r4: clrNew
        r5: clrOld
        r6: dstW*2
        r7: dstH-1
        r8: dstBase
        r9: top of span-stack
        r10: clrUp
        r11: clrDown    
        ip: y
    */


    @ Prep ppx, clrOld, stack
    ldrh    r6, [sp, #36]
    ldrh    r7, [sp, #40]
    mov     r6, r6, lsl #1
    mov     r0, r0, lsl #1
    mla     r4, r6, r1, r3
    ldrh    r5, [r4, r0]
    cmp     r5, r2                  @ Old color== new color: nevermind
    beq     .Lbmp16_flood_end

    mov     r8, r3
    mov     r9, sp
    mov     r4, r2

    @ Initial push (merge with prologue?)
    add     r3, r0, r1, lsl #16
    stmfd   sp!, {r3}

    @ While stack not empty, get new span
.Lbmp16_flood_pop:
        ldmfd   sp!, {r3}
        mov     ip, r3, lsr #16     @ y
        eor     r2, r3, ip, lsl #16 @ x
        mla     r0, r6, ip, r8      @ dstL= &dstBase[y*dstW]

        @ Find left (can be optimized more)
.Lbmp16_flood_left:
            ldrh    r3, [r0,r2]
            cmp     r3, r5          @ clr==clrOld
            subeq   r2, r2, #2
            moveqs  r3, r2, asr #31 @ Special for x>=0
            beq     .Lbmp16_flood_left

        @ Prepare for moving right
        add     r2, r2, #2          @ Step back into span
        add     r0, r0, r2          @
        sub     r1, r0, #2
        mvn     r10, #0             @ clrUp= clrDown= -1
        mvn     r11, #0

        @# Move right, checking up and down as we go
.Lbmp16_flood_right:
            ldrh    r3, [r1,#2]!
            cmp     r3, r5
            bne     .Lbmp16_flood_fill_line

            @ Check for new upper span
            ldrh    r3, [r1,-r6]
            cmp     r3, r10
            mov     r10, r3
            beq     .Lbmp16_flood_down
            cmp     r3, r5
            bne     .Lbmp16_flood_down
            subs    r3, ip, #1
            @ y-1>=0: we have a new up-span: push it. Push it good
            addge   r3, r2, r3, lsl #16
            stmgefd sp!, {r3}

            @ Check for new down span
.Lbmp16_flood_down:    
            ldrh    r3, [r1,+r6]
            cmp     r3, r11
            mov     r11, r3
            beq     .Lbmp16_flood_right_chk
            cmp     r3, r5
            bne     .Lbmp16_flood_right_chk
            add     r3, ip, #1
            cmp     r3, r7
            @ y+1<dstH: we have a new up-span: push it.
            addlt   r3, r2, r3, lsl #16
            stmltfd sp!, {r3}
                           
.Lbmp16_flood_right_chk:
            add     r2, r2, #2
            cmp     r2, r6
            bls     .Lbmp16_flood_right

.Lbmp16_flood_fill_line:
        sub     r2, r1, r0
        movs    r2, r2, lsr #1
        mov     r1, r4
        ldrne   r3,=memset16
        movne   lr, pc
        bxne    r3

        cmp     r9, sp
        bne     .Lbmp16_flood_pop

.Lbmp16_flood_end:
    ldmfd   sp!, {r4-r11,lr}
    bx      lr

That would be a stack-based 16-bit floodfill. Although it's slower, I prefer this to the recursive floodfill from last week because there's no chance of overflowing the stack. Well actually there is that risk, but it's a very low one because for new span the routine only uses a single extra word of stack-space instead of about 8. Only in really complex fills should you worry about stack-overflow. I also prefer this over the C-equivalent (given below) because C doesn't allow you to use the stack-pointer in this way; you'd have to allocate a complete stack for it (and hope that it doesn't overflow).

I've set the section for this routine to .text here (the standard code-section). This hurts the routine's speed something fierce because of the 16-bit bus of GBA-ROM, but I'm not sure if it's worth putting it in IWRAM just yet. Feel free to put it there if you need the speed, though. For NDS it doesn't really matter because it'd be cached anyway.

Using the actual stack here for keeping track of span coordinates is the first trick I used here. The second would be to double the horizontal coordinates so that they can be used directly as offsets from the scanline start (given here by dstL. The math stays the same throughout so it doesn't really complicate matters. As far as I know, GCC can't do something like that, which is partially why stack-based 16-bit floodfill is slower than the recursive one in C. This would also explain why on 8-bit canvasses, where the doubling isn't necessary, the stack-based version is faster. I think it's possible to shave off one or two instructions here and there, but that'd probably be somewhat of a wasted effort. Yes, you may argue that writing this in assembly was a waste of effort in the first place, but my main goal here was stack safety; not necessarily the speed of the routine.

Note the use of memset16() here. You could, of course, use DMA instead, although that'll only be faster for small spans.


For completeness, here's the C version as well. There's almost a one-to-one correspondence between the C and assembly versions.

typedef struct POINT16 { s16 x, y; } POINT16;

#define PUSH_EA(_sp, _x, _y)    do { _sp->x= _x; _sp->y= _y; _sp++; } while(0)
#define POP_EA(_sp, _x, _y)     do { _sp--; _x= _sp->x; _y= _sp->y; } while(0)

void bmp16_floodfill(int x, int y, COLOR clrNew, void *dstBase,
    u32 dstW, u32 dstH)
{
    u16 *dstD= dstBase, *dstL= &dstD[y*dstW];
    u16 clrOld= dstL[x];

    if(clrOld == clrNew)
        return;

    int left, right;
    POINT16 stack[1024], *sp= stack;
    PUSH_EA(sp, x, y);

    while(sp != stack)
    {
        POP_EA(sp, left, y);
        dstL= &dstD[y*dstW];

        // Find left edge
        while(dstL[left]==clrOld && left>=0)
            left--;
        left++;

        u32 clr, clrUp= -1, clrDown= -1;

        // Trace line until right-edge
        for(right= left; dstL[right]==clrOld && right<dstW; right++)
        {
            clr= dstL[right-dstW];

            // Test up-span
            if(clr!=clrUp && clr==clrOld && y>0)
                PUSH_EA(sp, right, y-1);
            clrUp= clr;
           
            // Test down-span
            clr= dstL[right+dstW];
            if(clr!=clrDown && clr==clrOld && y+1 <  dstH)
                PUSH_EA(sp, right, y+1);
            clrDown= clr;        
        }
        memset16(&dstL[left], clr, right-left);
    }
}

Leave a Reply

Your email address will not be published. Required fields are marked *