Hello my name is cearn, and I'm an ARMaholic.

Aw crap. I told myself I wasn't going to do this:

bmp16_floodfill in ARM assembly

@ ---------------------------------------------------------------------
@ void bmp16_floodfill_asm(int x, int y, u16 clrNew, void *dstBase,
@   u16 dstW, u16 dstH);
@ ---------------------------------------------------------------------

    .extern memset16

    .text
    .arm
    .align 2
    .global bmp16_floodfill_asm
bmp16_floodfill_asm:
    stmfd   sp!, {r4-r11,lr}

    @ NOTE: all x-values are doubled for use as halfword offsets
    /* Reglist:
        r0: x, dstL
        r1: ppx (current pixel pointer)
        r2: x*2 (right, left, count)
        r3: yx / tmp / clr
        r4: clrNew
        r5: clrOld
        r6: dstW*2
        r7: dstH-1
        r8: dstBase
        r9: top of span-stack
        r10: clrUp
        r11: clrDown    
        ip: y
    */


    @ Prep ppx, clrOld, stack
    ldrh    r6, [sp, #36]
    ldrh    r7, [sp, #40]
    mov     r6, r6, lsl #1
    mov     r0, r0, lsl #1
    mla     r4, r6, r1, r3
    ldrh    r5, [r4, r0]
    cmp     r5, r2                  @ Old color== new color: nevermind
    beq     .Lbmp16_flood_end

    mov     r8, r3
    mov     r9, sp
    mov     r4, r2

    @ Initial push (merge with prologue?)
    add     r3, r0, r1, lsl #16
    stmfd   sp!, {r3}

    @ While stack not empty, get new span
.Lbmp16_flood_pop:
        ldmfd   sp!, {r3}
        mov     ip, r3, lsr #16     @ y
        eor     r2, r3, ip, lsl #16 @ x
        mla     r0, r6, ip, r8      @ dstL= &dstBase[y*dstW]

        @ Find left (can be optimized more)
.Lbmp16_flood_left:
            ldrh    r3, [r0,r2]
            cmp     r3, r5          @ clr==clrOld
            subeq   r2, r2, #2
            moveqs  r3, r2, asr #31 @ Special for x>=0
            beq     .Lbmp16_flood_left

        @ Prepare for moving right
        add     r2, r2, #2          @ Step back into span
        add     r0, r0, r2          @
        sub     r1, r0, #2
        mvn     r10, #0             @ clrUp= clrDown= -1
        mvn     r11, #0

        @# Move right, checking up and down as we go
.Lbmp16_flood_right:
            ldrh    r3, [r1,#2]!
            cmp     r3, r5
            bne     .Lbmp16_flood_fill_line

            @ Check for new upper span
            ldrh    r3, [r1,-r6]
            cmp     r3, r10
            mov     r10, r3
            beq     .Lbmp16_flood_down
            cmp     r3, r5
            bne     .Lbmp16_flood_down
            subs    r3, ip, #1
            @ y-1>=0: we have a new up-span: push it. Push it good
            addge   r3, r2, r3, lsl #16
            stmgefd sp!, {r3}

            @ Check for new down span
.Lbmp16_flood_down:    
            ldrh    r3, [r1,+r6]
            cmp     r3, r11
            mov     r11, r3
            beq     .Lbmp16_flood_right_chk
            cmp     r3, r5
            bne     .Lbmp16_flood_right_chk
            add     r3, ip, #1
            cmp     r3, r7
            @ y+1<dstH: we have a new up-span: push it.
            addlt   r3, r2, r3, lsl #16
            stmltfd sp!, {r3}
                           
.Lbmp16_flood_right_chk:
            add     r2, r2, #2
            cmp     r2, r6
            bls     .Lbmp16_flood_right

.Lbmp16_flood_fill_line:
        sub     r2, r1, r0
        movs    r2, r2, lsr #1
        mov     r1, r4
        ldrne   r3,=memset16
        movne   lr, pc
        bxne    r3

        cmp     r9, sp
        bne     .Lbmp16_flood_pop

.Lbmp16_flood_end:
    ldmfd   sp!, {r4-r11,lr}
    bx      lr

That would be a stack-based 16-bit floodfill. Although it's slower, I prefer this to the recursive floodfill from last week because there's no chance of overflowing the stack. Well actually there is that risk, but it's a very low one because for new span the routine only uses a single extra word of stack-space instead of about 8. Only in really complex fills should you worry about stack-overflow. I also prefer this over the C-equivalent (given below) because C doesn't allow you to use the stack-pointer in this way; you'd have to allocate a complete stack for it (and hope that it doesn't overflow).

I've set the section for this routine to .text here (the standard code-section). This hurts the routine's speed something fierce because of the 16-bit bus of GBA-ROM, but I'm not sure if it's worth putting it in IWRAM just yet. Feel free to put it there if you need the speed, though. For NDS it doesn't really matter because it'd be cached anyway.

Using the actual stack here for keeping track of span coordinates is the first trick I used here. The second would be to double the horizontal coordinates so that they can be used directly as offsets from the scanline start (given here by dstL. The math stays the same throughout so it doesn't really complicate matters. As far as I know, GCC can't do something like that, which is partially why stack-based 16-bit floodfill is slower than the recursive one in C. This would also explain why on 8-bit canvasses, where the doubling isn't necessary, the stack-based version is faster. I think it's possible to shave off one or two instructions here and there, but that'd probably be somewhat of a wasted effort. Yes, you may argue that writing this in assembly was a waste of effort in the first place, but my main goal here was stack safety; not necessarily the speed of the routine.

Note the use of memset16() here. You could, of course, use DMA instead, although that'll only be faster for small spans.


For completeness, here's the C version as well. There's almost a one-to-one correspondence between the C and assembly versions.

typedef struct POINT16 { s16 x, y; } POINT16;

#define PUSH_EA(_sp, _x, _y)    do { _sp->x= _x; _sp->y= _y; _sp++; } while(0)
#define POP_EA(_sp, _x, _y)     do { _sp--; _x= _sp->x; _y= _sp->y; } while(0)

void bmp16_floodfill(int x, int y, COLOR clrNew, void *dstBase,
    u32 dstW, u32 dstH)
{
    u16 *dstD= dstBase, *dstL= &dstD[y*dstW];
    u16 clrOld= dstL[x];

    if(clrOld == clrNew)
        return;

    int left, right;
    POINT16 stack[1024], *sp= stack;
    PUSH_EA(sp, x, y);

    while(sp != stack)
    {
        POP_EA(sp, left, y);
        dstL= &dstD[y*dstW];

        // Find left edge
        while(dstL[left]==clrOld && left>=0)
            left--;
        left++;

        u32 clr, clrUp= -1, clrDown= -1;

        // Trace line until right-edge
        for(right= left; dstL[right]==clrOld && right<dstW; right++)
        {
            clr= dstL[right-dstW];

            // Test up-span
            if(clr!=clrUp && clr==clrOld && y>0)
                PUSH_EA(sp, right, y-1);
            clrUp= clr;
           
            // Test down-span
            clr= dstL[right+dstW];
            if(clr!=clrDown && clr==clrOld && y+1 <  dstH)
                PUSH_EA(sp, right, y+1);
            clrDown= clr;        
        }
        memset16(&dstL[left], clr, right-left);
    }
}

A grit demo, part I


Grit demo menu.

I think I finally have a nice demonstration of the capabilities and use of grit again. The figure on the right shows its main menu. Each entry shows one or more capabilities of grit and how to make use of the data it outputs. Pressing start returns to the menu.

The source for the project is included in the demo, as is the most recent tonclib because the demo would not have been possible without it (at least not with a lot more code in the project itself). In a sense, it's a tonclib demo as well.

The way that grit is invoked and how its output is made available is pretty neat. I'm running a separate makefile called gfxmake which runs grit, assembles its output into a library called libgfx.a. It also collects all the separate headers into a main header file called all_gfx.h, so there will be no need to include dozens of graphics headers – just the one. I also have three sets of grit rules in here: for bitmap+.grit pairs, for .grit-less bitmaps (which use the directories main grit file) and fully custom rules for really special occasions. Like I said, pretty neat ^_^. I am pulling a few Clever Tricks in gfxmake to get all of this done and I hope to discuss these in more detail at a later time.

I'll probably add a few more examples in this demo (feel free to suggest some), but the way grit's invoked here will probably stay. Using a secondary makefile means a minimum of interference with the template makefiles and it also makes it easy to customize your projects w.r.t. graphics. As far as I'm concerned, this is the recommended path … until anyone informs me of a better plan.


Download link: /files/gritdemo.zip


Note to quirky: sorry, no palette merging in this demo yet. It will be in its follow-up.

FloodFill and adventures in optimization

One of the things I've been curious about for sometime but never really looked up is the FloodFill. Thanks to a certain recent post in the PALib forum, I finally started looking for implementations. There's a list of them at Lode Vandevenne's site. The actual implementation I start with actually came from the code codex, which gives an version of the recursive scanline floodfill. One that was originally written by tepples (it can be found in the ToD source). Converted to a form usable for any sized 16bit canvas, it looks a little like this:

//! Struct for semi-constant canvas and floodfill info.
typedef struct flood_t
{
    void *dstBase;
    u16 dstW;
    u16 dstH;
    COLOR clrNew;
    COLOR clrOld;
} flood_t;

//! Basic floodfill wrapper
void bmp16_flood_fill(int x, int y, COLOR clr, void *dstBase,
    u32 dstW, u32 dstH)
{
    u16 *dstL= (u16*)(dstBase + (y*dstW + x)*2);

    if(*dstL == clr)
        return;
       
    flood_t flood= { dstBase, dstW, dstH, clr, *dstL };
    bmp16_flood_fill_internal(x, y, &flood);    
}

//! Main floodfill code (called recursively)
void bmp16_flood_fill_internal(int x, int y, const flood_t *flood)
{
    u16 *dstD= flood->dstBase, *dstL= &dstD[y*flood->dstW];
   
    // Find horz edges
    int ii, left=x, right=x;

    while(dstL[left-1]==flood->clrOld && left>0)
    {
        dstL[left]= flood->clrNew;
        left--;
    }
    dstL[left]= flood->clrNew;

    while(dstL[right+1]==flood->clrOld && right<flood->dstW-1)
    {
        right++;
        dstL[right]= flood->clrNew;
    }

    // Recursive *ick*. All the stack-work's making GBA's head spin.
    for(ii=left; ii<=right; ii++)
    {
        if(y>0 && dstL[ii-flood->dstW] == flood->clrOld)
            bmp16_flood_fill_internal(ii, y-1, flood);
       
        if(y+1<flood->dstH && dstL[ii+flood->dstW] == flood->clrOld)
            bmp16_flood_fill_internal(ii, y+1, flood);
    }

}

I've split the code into two parts: bmp16_flood_fill(), which is merely an interface function to get the color to fill over and set-up some variables, and bmp16_flood_fill_internal(), which is the workhorse here.

My first reaction to the workings was “huh!”, in surprise of how simple it is: you look left and right for the length of the span in the current scanline, fill it, then search in the lines above and below for places to fill (i.e., if the color is clrOld) and call bmp16_flood_fill_internal() recursively to do so.

The code looks understandable, works nicely and is moderately fast considering all the work that needs to be done. That said, you can almost double its speed with just a few simple alterations.

Datatypes

The first optimization is using the correct datatypes. By which I of course mean int or u32. Using chars and shorts for my local variables and parameters here costs me roughly 25%.

You may also notice the flood_t datatype. This is an optimization too in a way: the more parameters a function has, the more stackwork needs to be done to set-up the arguments and pass them on to the function. Because I only need to read those values and not write to them once inside bmp16_flood_fill_internal(), I'm passing a struct pointer instead of separate parameters. The actual gain in speed turned out to be negligible, but I did find out something else: the amount of stack it cost. I had expected that the recursive nature would require an awful lot of stack, but what I hadn't counted on was that in my test fill, the separate-parameter version took all the stack. When I tested it on no$gba, I got an warning that that a bx r0 jumped to an invalid address. When I looked at the address it turned out the stack pointer was in the early 0300:xxxx's : it had completely clobbered IWRAM.

Dedicated scanline fill

The datatype improvements were things that I basically started out with in the first place, because those are rather obvious. The first real optimization was to use a fast scanlinefill instead of setting pixels individually – something like DMA or memset16(). Using memset16() saved roughly 15%. This optimization will only work if you have longer spans, of course.

Preload flood_t members

The second optimization is also somewhat obvious if you know a little about how compilers do their work. This optimization has to do with variable scope. Local variables and parameters only exist inside the functions they're used in. This means that the compiler knows exactly when they are changed and can use that knowledge to optimize certain things. In contrast, globals and pointer-parameters can be changed everywhere. In particular, they can be changed in the functions that are called by a given routine. This means that after every function-call, globals and pointer-dereferences are considered ‘dirty’ and they have to be reloaded to be sure of their values.

This affects the dereferences of the flood struct here: all the members used in the last loop of bmp16_flood_fill_internal() have to be reloaded at every iteration. There is an easy way of avoiding the reloads: create a local variable for these items and use that. This also increased the speed by about 25%.

Changing the branch condition orders

The last optimization I did turned out to be a major one. The loop- and if-conditions consist of two parts: a color check but also a clipping check because we don't want to fill outside the screen. The code above does the clipping check first.

And this is where some knowledge of assembly or compilers comes in again. Because logical ANDs require both expressions to be true, you can quit as soon as one of them is false. Because the conditions are evaluated sequentially, it pays to have the one most likely to fail first. This happens to be the color check, not the clipping check. You might think that this shouldn't matter too much, but it turns out that simply by changing the order of the `&&' expressions saves another 25%. Note that this only really matters for Thumb code, since ARM code has conditional opcodes: it doesn't have to use branches to escape early.

Final version

void bmp16_flood_fill_internal(int x, int y, const flood_t *flood)
{
    // Preloading flood variables
    u16 clrOld= flood->clrOld, clrNew= flood->clrNew;
    u32 dstW= flood->dstW, dstH= flood->dstH;
    u16 *dstD= flood->dstBase, *dstL= &dstD[y*dstW];

    int ii, left=x, right=x;

    // Find horz edges, then fill
    while(dstL[left -1]==clrOld && left>0)
        left--;
    while(dstL[right+1]==clrOld && right<dstW-1)
        right++;
    memset16(&dstL[left], clrNew, right+1-left);

    // Recursive *ick*. All the stackwork's making GBA's head spin.
    for(ii=left; ii<=right; ii++)
    {
        // Do color check first.
        if(dstL[ii-dstW]==clrOld && y>0)
            bmp16_flood_fill_internal(ii, y-1, flood);
       
        if(dstL[ii+dstW]==clrOld && y+1<dstH)
            bmp16_flood_fill_internal(ii, y+1, flood);      
    }
}

Above you can see the final version of bmp16_flood_fill_internal(). Note that it is not harder to read at all; in fact, I'd say it's actually easier to read than the first version. At the same time we've sped up the function by a factor of 1.15*1.25*1.25 ≈ 1.75. 75% faster just by making some very elementary alterations. I don't really think these can be considered “premature optimizations”, because you could have easily designed and written the algorithm the faster way in the first place. To recap:

  • Use words (int/s32/u32) for local variables and parameters, not bytes (char/s8/u8) or halfwords (short/s16/u16).
  • Use a fast-fill routine instead of manual looping for longer spans.
  • Load up pointers, struct members and global variables into locals if they're used frequently and/or you call other functions. Aside from being potentially faster, it also keeps the worker code cleaner.
  • Order the expressions in compound conditions so that the decision can be reached as early as possible. For ANDs put the expressions most likely to be false first; for ORs, put these last.

None of those points are specific to floodfills; they should work for just about any part of code. I suppose you could call them good habits.


First floodfill results .

Second floodfill results.

One word of caution if you want to use the recursive floodfill routine. Because of the amount of stack it uses, you'll have to be careful where you apply it. There is also non-recursive version using a local stack, but you'd still run the risk of overflowing that and it doesn't seem to be faster anyway. Maybe I'll make an asm version later, but preferably not if I don't have to.

memset16 fix

Although I know I tested the memset16() presented earlier, one instruction miraculously disappeared when posting it here. There should have been an `mov r12, r1' in the initializing of the registers for the stmia. I have no idea where it disappeared to, but with this fix the routine should run properly.

I've been having a number of this sort of errors lately, actually. I've also recently found out that the shared-tileset in the current distribution is broken because of a single character that handles it in grit_prep_map. This:

// in grit_prep.cpp, grit_prep_map
    if(gr->img_flags & GRIT_IMG_SHARED)
        rdxDib= grit_tile_reduce(&mapRec, tileDib, flags, NULL);
    else
        rdxDib= grit_tile_reduce(&mapRec, tileDib, flags, gr->shared->dib);

should have been this:

// in grit_prep.cpp, grit_prep_map
    if(~gr->img_flags & GRIT_IMG_SHARED)
        rdxDib= grit_tile_reduce(&mapRec, tileDib, flags, NULL);
    else
        rdxDib= grit_tile_reduce(&mapRec, tileDib, flags, gr->shared->dib);

The missing ~ means that the the code did exactly the wrong thing. And again, during testing I had it right (cue Twilight-zone theme).

Tonc cleanup and fixes

I've uploaded some newer tonc files today, mostly for devkitArm r21 compatibility. Because the linkscript for multiboot got b0rken somehow, I had to change all the demos to default to cart-builds. I intended to change to cart-boot later anyway, but not being able to build the demos properly with the latest devkit kinda forces me to do it now. I've also had to change the tier-3 makefiles because it used `-fno-expections' in the CXXFLAGS. This should of course have been `-fno-exceptions' (thanks muff).

There have also been a few changes in the text parts: build-specs are set to cart-boot there too now, and I fixed some broken links. I've also fixed a slew of spelling and grammar issues that Patater sent in. These part of the text shouldn't be gibberish anymore – just unintelligible :P.