Aw crap. I told myself I wasn't going to do this:
bmp16_floodfill in ARM assembly
@ void bmp16_floodfill_asm(int x, int y, u16 clrNew, void *dstBase,
@ u16 dstW, u16 dstH);
@ ---------------------------------------------------------------------
.extern memset16
.text
.arm
.align 2
.global bmp16_floodfill_asm
bmp16_floodfill_asm:
stmfd sp!, {r4-r11,lr}
@ NOTE: all x-values are doubled for use as halfword offsets
/* Reglist:
r0: x, dstL
r1: ppx (current pixel pointer)
r2: x*2 (right, left, count)
r3: yx / tmp / clr
r4: clrNew
r5: clrOld
r6: dstW*2
r7: dstH-1
r8: dstBase
r9: top of span-stack
r10: clrUp
r11: clrDown
ip: y
*/
@ Prep ppx, clrOld, stack
ldrh r6, [sp, #36]
ldrh r7, [sp, #40]
mov r6, r6, lsl #1
mov r0, r0, lsl #1
mla r4, r6, r1, r3
ldrh r5, [r4, r0]
cmp r5, r2 @ Old color== new color: nevermind
beq .Lbmp16_flood_end
mov r8, r3
mov r9, sp
mov r4, r2
@ Initial push (merge with prologue?)
add r3, r0, r1, lsl #16
stmfd sp!, {r3}
@ While stack not empty, get new span
.Lbmp16_flood_pop:
ldmfd sp!, {r3}
mov ip, r3, lsr #16 @ y
eor r2, r3, ip, lsl #16 @ x
mla r0, r6, ip, r8 @ dstL= &dstBase[y*dstW]
@ Find left (can be optimized more)
.Lbmp16_flood_left:
ldrh r3, [r0,r2]
cmp r3, r5 @ clr==clrOld
subeq r2, r2, #2
moveqs r3, r2, asr #31 @ Special for x>=0
beq .Lbmp16_flood_left
@ Prepare for moving right
add r2, r2, #2 @ Step back into span
add r0, r0, r2 @
sub r1, r0, #2
mvn r10, #0 @ clrUp= clrDown= -1
mvn r11, #0
@# Move right, checking up and down as we go
.Lbmp16_flood_right:
ldrh r3, [r1,#2]!
cmp r3, r5
bne .Lbmp16_flood_fill_line
@ Check for new upper span
ldrh r3, [r1,-r6]
cmp r3, r10
mov r10, r3
beq .Lbmp16_flood_down
cmp r3, r5
bne .Lbmp16_flood_down
subs r3, ip, #1
@ y-1>=0: we have a new up-span: push it. Push it good
addge r3, r2, r3, lsl #16
stmgefd sp!, {r3}
@ Check for new down span
.Lbmp16_flood_down:
ldrh r3, [r1,+r6]
cmp r3, r11
mov r11, r3
beq .Lbmp16_flood_right_chk
cmp r3, r5
bne .Lbmp16_flood_right_chk
add r3, ip, #1
cmp r3, r7
@ y+1<dstH: we have a new up-span: push it.
addlt r3, r2, r3, lsl #16
stmltfd sp!, {r3}
.Lbmp16_flood_right_chk:
add r2, r2, #2
cmp r2, r6
bls .Lbmp16_flood_right
.Lbmp16_flood_fill_line:
sub r2, r1, r0
movs r2, r2, lsr #1
mov r1, r4
ldrne r3,=memset16
movne lr, pc
bxne r3
cmp r9, sp
bne .Lbmp16_flood_pop
.Lbmp16_flood_end:
ldmfd sp!, {r4-r11,lr}
bx lr
That would be a stack-based 16-bit floodfill. Although it's slower, I prefer this to the recursive floodfill from last week because there's no chance of overflowing the stack. Well actually there is that risk, but it's a very low one because for new span the routine only uses a single extra word of stack-space instead of about 8. Only in really complex fills should you worry about stack-overflow. I also prefer this over the C-equivalent (given below) because C doesn't allow you to use the stack-pointer in this way; you'd have to allocate a complete stack for it (and hope that it doesn't overflow).
I've set the section for this routine to .text
here (the standard code-section). This hurts the routine's speed something fierce because of the 16-bit bus of GBA-ROM, but I'm not sure if it's worth putting it in IWRAM just yet. Feel free to put it there if you need the speed, though. For NDS it doesn't really matter because it'd be cached anyway.
Using the actual stack here for keeping track of span coordinates is the first trick I used here. The second would be to double the horizontal coordinates so that they can be used directly as offsets from the scanline start (given here by dstL
. The math stays the same throughout so it doesn't really complicate matters. As far as I know, GCC can't do something like that, which is partially why stack-based 16-bit floodfill is slower than the recursive one in C. This would also explain why on 8-bit canvasses, where the doubling isn't necessary, the stack-based version is faster. I think it's possible to shave off one or two instructions here and there, but that'd probably be somewhat of a wasted effort. Yes, you may argue that writing this in assembly was a waste of effort in the first place, but my main goal here was stack safety; not necessarily the speed of the routine.
Note the use of memset16()
here. You could, of course, use DMA instead, although that'll only be faster for small spans.
For completeness, here's the C version as well. There's almost a one-to-one correspondence between the C and assembly versions.
#define PUSH_EA(_sp, _x, _y) do { _sp->x= _x; _sp->y= _y; _sp++; } while(0)
#define POP_EA(_sp, _x, _y) do { _sp--; _x= _sp->x; _y= _sp->y; } while(0)
void bmp16_floodfill(int x, int y, COLOR clrNew, void *dstBase,
u32 dstW, u32 dstH)
{
u16 *dstD= dstBase, *dstL= &dstD[y*dstW];
u16 clrOld= dstL[x];
if(clrOld == clrNew)
return;
int left, right;
POINT16 stack[1024], *sp= stack;
PUSH_EA(sp, x, y);
while(sp != stack)
{
POP_EA(sp, left, y);
dstL= &dstD[y*dstW];
// Find left edge
while(dstL[left]==clrOld && left>=0)
left--;
left++;
u32 clr, clrUp= -1, clrDown= -1;
// Trace line until right-edge
for(right= left; dstL[right]==clrOld && right<dstW; right++)
{
clr= dstL[right-dstW];
// Test up-span
if(clr!=clrUp && clr==clrOld && y>0)
PUSH_EA(sp, right, y-1);
clrUp= clr;
// Test down-span
clr= dstL[right+dstW];
if(clr!=clrDown && clr==clrOld && y+1 < dstH)
PUSH_EA(sp, right, y+1);
clrDown= clr;
}
memset16(&dstL[left], clr, right-left);
}
}