Help needed optimizing compressor!

Harry Potter · May 19, 2022

Hi! I am working on a XZ0-variant compressor and need help. Right now, it is for the Commodore 64, but I plan to port it to the Atari 8-bit series soon. The problem I have is that it is too slow. I am using cc65 to create it. The LZ77 code in assembler follows:


_ChkBlockA:
;inc    $D020
    ;clc
    ;adc    #<_InBuffer
    ;sta    ptr1
    ;txa
    ;adc    #>_InBuffer
    ;sta    ptr1+1

    ldy    #1
    ;iny
@lp:
    lda    (ptr1),y
    cmp    (ptr2),y
    bne    @exit
    iny
    cpy    tmp1
    bcc    @lp
@exit:
    ;cpy    #2
    ;bcs    *+4
    ;ldy    #0
    ;tya
    ;ldx    #0
    rts

;ptr2=absolute address of current input
;ptr3=InPos of call.
;ptr4=current pos. in InBuffer
;curl=earliest pos. of scan.
;tmp1=max. len. of block
_GetLZ:
    ldy    #0
    sty    _techLZ+2
    sta    ptr3
    stx    ptr3+1
    ;sta    ptr4
    ;stx    ptr4+1
    ora    ptr3+1
    bne    *+5
    jmp    @exit
    clc
    lda    ptr3
    adc    #<_lzdict
    sta    ptr2
    sta    ptr1
    txa
    adc    #>_lzdict
    sta    ptr2+1
    sta    ptr1+1
@regstart:
    
    lda    ptr3+1
    cmp    #$12
    bcc    @calcincl
    sbc    #$12
    sta    curl+1
    lda    ptr3
    sta    curl
    jmp    @endcalc
    ;bcs    @endcalc
@calcincl:
    ldy    #0
    sty    curl
    sty    curl+1
    
@endcalc:
;lda    curl
;sta    $400+16
;lda    curl+1
;sta    $400+17
    lda    _InEnd
    sec
    sbc    _InPos
    sta    regbank
    lda    _InEnd+1
    sbc    _InPos+1
    sta    regbank+1
    bne    @maxlenw
    lda    regbank
    cmp    #128
    bcc    @maxlenx
    
@maxlenw:
    lda    #128
@maxlenx:
;sta    $400+16
    sta    tmp1
    cmp    #2
    bcc    @exit
    
    lda    curl
    clc
    adc    #<_lzdict
    sta    curl
    lda    curl+1
    adc    #>_lzdict
    sta    curl+1

    ;lda    ptr3
    ;sta    ptr4
    ;lda    ptr3+1
    ;sta    ptr4+1
    ldy    #0
    lda    (ptr1),y
    sta    tmp4
    ;iny
    ;lda    (ptr1),y
    ;sta    curread+1


@lp:
    ldy    ptr1
    bne    *+4
    dec    ptr1+1
    dec    ptr1
    ;ldy    ptr4
    ;bne    *+4
    ;dec    ptr4+1
    ;dec    ptr4
    ldy    #0
    lda    tmp4
    cmp    (ptr1),y
    bne    @lp2
    ;iny
    ;lda    curread+1
    ;cmp    (ptr1),y
    ;bne    @lp2
    ;lda    ptr4
    ;ldx    ptr4+1
    jsr    _ChkBlockA
    ;cpy    #0
    ;beq    @lp2
    ;bcc    @lp2
    cpy    _techLZ+2
    beq    @lp2
    bcc    @lp2
    sty    _techLZ+2
;sty    $400+8
    ;lda    ptr4
    lda    ptr1
    sta    _techLZ
    ;lda    ptr4+1
    lda    ptr1+1
    sta    _techLZ+1
@lp2:
    lda    curl+1
    cmp    ptr1+1
    bcc    @lp
    lda    curl
    cmp    ptr1
    bcc    @lp

@exit:
;inc    $D021
    ldx    #0
    lda    _techLZ+2
    cmp    #2
    bcs    *+3
    ;lda    #0
    txa
    ;ldx    #0
    rts

and the code to write to the output:


static void writebyte (void) {
    //cbm_write (0, &wr, 1);
    //wr>>=8;
// #ifdef __C128__
    // bank1_writebyte (&OutBuffer[OutPos], wr);
// #else
    // hidewriteb(&OutBuffer[OutPos], wr);
// #endif
    OutBuffer[OutPos]=wr;
    wr=0;
    //OutBit-=8;
    ++OutPos;
}

void writeoutlast (void){
    if ((signed char)OutBit<8) {
        wr<<=OutBit-1;
        writebyte ();
    }
}

void __fastcall__ writeoutf (unsigned val, unsigned char bitlen)
{
    while (bitlen) {
        --OutBit;
        //wr=(wr<<1)|((val>>(OutBit))&1);
        //wr|=(((val>>(bitlen-1))&1)<<OutBit);
        wr=(wr<<1)|((val>>(bitlen-1))&1);
        //val>>=1;
        if (!OutBit) {
            writebyte ();
            OutBit=8;
        }
        --bitlen;
    }
}

BTW, if you want more informartion, just ask, as this code is to be open-source.

TGB1718 · May 19, 2022

Maybe some comments to help identify what your code blocks are doing/trying to achieve, also leaving lots of

commented out code just confuses things.

It's good practice even if your only writing code for yourself is to add comments as you go, that way

even if you have to revisit code many moons later it will help you identify what your code is doing.

Harry Potter · May 19, 2022

You're right. I'm a sloppy coder.

Harry Potter · May 19, 2022

I fixed the LZ77 code. It follows:


;Compare current block with LZ77 test block.  Return in Y len. of match.
_ChkBlockA:
    ldy    #1
@lp:
    lda    (ptr1),y
    cmp    (ptr2),y
    bne    @exit
    iny
    cpy    tmp1
    bcc    @lp
@exit:
    rts

;Find best LZ77 block.
;
;ptr2=absolute address of current input
;ptr3=InPos of call.
;ptr4=current pos. in InBuffer
;curl=earliest pos. of scan.
;tmp1=max. len. of block

_GetLZ:
;Clear returned len. of LZ77 block.
    ldy    #0
    sty    _techLZ+2
;Save pos. in LZ77 input buffer
    sta    ptr3
    stx    ptr3+1
;If 0, exit.
    ora    ptr3+1
    bne    *+5
    jmp    @exit
;Add the address of input buffer and store current input and test.
    clc
    lda    ptr3
    adc    #<_lzdict
    sta    ptr2
    sta    ptr1
    txa
    adc    #>_lzdict
    sta    ptr2+1
    sta    ptr1+1
@regstart:
;Get offset of end of scan, which is the greater of $1300 before
;current and beginning of input.
;.
    lda    ptr3+1
    cmp    #$13
    bcc    @calcincl
    sbc    #$13
    sta    curl+1
    lda    ptr3
    sta    curl
    jmp    @endcalc
@calcincl:
    ldy    #0
    sty    curl
    sty    curl+1
    
@endcalc:
;Get max. len., which is the smaller of 128 bytes and the rest of input.
    lda    _InEnd
    sec
    sbc    _InPos
    sta    regbank
    lda    _InEnd+1
    sbc    _InPos+1
    sta    regbank+1
    bne    @maxlenw
    lda    regbank
    cmp    #128
    bcc    @maxlenx
    
@maxlenw:
    lda    #128
@maxlenx:
    sta    tmp1
;Exit if at end of input (<2 bytes left--uncompressable).
    cmp    #2
    bcc    @exit
;Add input buffer address to offset found at @regstart.
    lda    curl
    clc
    adc    #<_lzdict
    sta    curl
    lda    curl+1
    adc    #>_lzdict
    sta    curl+1

;Save first input byte for later optimization.
    ldy    #0
    lda    (ptr1),y
    sta    tmp4

@lp:
;Decrease test address.
    ldy    ptr1
    bne    *+4
    dec    ptr1+1
    dec    ptr1


;Compare first byte.  If !=, skip.
    lda    tmp4
    cmp    (ptr1),y
    bne    @lp2

;Get LZ77 len. at current pos.
    jsr    _ChkBlockA

;If >last best,
    cpy    _techLZ+2
    beq    @lp2
    bcc    @lp2
    sty    _techLZ+2
;store current loc. and len.
    lda    ptr1
    sta    _techLZ
    lda    ptr1+1
    sta    _techLZ+1
@lp2:
;Loop until end of scan.
    lda    curl+1
    cmp    ptr1+1
    bcc    @lp
    lda    curl
    cmp    ptr1
    bcc    @lp

@exit:
;Return best len. found--loc. and len. also in _techLZ.
    ldx    #0
    lda    _techLZ+2
    cmp    #2
    bcs    *+3
    txa
    rts

phaeron · May 19, 2022

This code is written to use pointer arithmetic, which is natural for C but translates poorly to 6502. The 6502 is much better with base+offset addressing, particularly with the (zp),Y addressing mode. But that is not your big problem. Your big problem is that this appears to be a brute force LZ search, which is going to be very slow. If I read this algorithm correctly, for each byte it is going to search all previous locations all the way up to $1300 behind the current point. This going to be quadratic time -- the search time for each input byte will be linear with the position of the byte, which means that the running time for the whole input will be proportional to the square of the input length. That will run slowly even on a modern CPU, much less a 6502. I'll bet that if you ran this under a profiler you'd find that _ChkBlockA lights up like a christmas tree compared to the main loop, due to the sheer number of matches being tried.

Practical implementations of LZ matching use incremental search structures to speed up the search. Common ones include binary tree and hash table. The hash table is the easier approach and would be simpler to implement here; the popular zlib library has a description of the algorithm. Basically, you continuously hash groups of 2 or 3 bytes and insert them into a running hash table. This speeds up the search because you can then also hash at the current position and check only the positions in the matching hash chain, which is a fraction of the full set of possibilities. A limit is also usually imposed to cut off the search if there are too many candidates, to avoid worst-case performance. This can miss some matches in corner cases, but greatly improves the average time. It does also require some memory for the hash table, which will reduce the maximum length or at least window that the compressor can support.

There is also a specific issue with quadratic time behavior involving cases of patterns that are actually really good for the compressor, but causes the compressor to do tons of redundant checks unless you specifically handle it. The simplest case is a plain run of A characters, where the compressor spends time finding a match for A, then AA, then AAA, etc. Testing the last character first sometimes helps with this. Charles Bloom, who worked on compression libraries at RAD Game Tools, refers to this as the "follows" problem in some of his articles. He also goes into a lot of details, such as running the LZ search backwards if you want optimal parsing, etc. But that's for after you get to reasonable performance.

Harry Potter · May 19, 2022

I thank you. I figured out how to download the text file--I had problems from the site before. I will look at it in more detail later.

Sign In

Help needed optimizing compressor!

Recommended Posts

Harry Potter

Link to comment

Share on other sites

TGB1718

Link to comment

Share on other sites

Harry Potter

Link to comment

Share on other sites

Harry Potter

Link to comment

Share on other sites

phaeron

Link to comment

Share on other sites

Harry Potter

Link to comment

Share on other sites

Join the conversation

Recently Browsing 0 members

Apps

My Activity Streams

More