Jump to content
IGNORED

unpacking LZ4 on Jaguar


Ericde45

Recommended Posts

hello,

 

In case you need to use compressed files on the Jaguar: my YM emulation code on GitHub contains a subroutine that decompresses LZ4 on the 68000, and an equivalent one for the DSP.

https://github.com/ericde45/YM2149_JAG/blob/main/ym1.s

 

the PC exe packing program is here :
 


compressing is done with : lz4.exe -9 -l --no-frame-crc [input file] [output file]

 

The packed data starts at offset +8.

 

The -l option generates the legacy LZ4 frame format, which is simpler than the usual LZ4 frame format.


documentation of the format is here  : https://android.googlesource.com/platform/external/lz4/+/HEAD/doc/lz4_Frame_format.md
( block size is 8 MB, which is enough to produce a single-block file for the Jaguar )

  • Like 3
  • Thanks 1
Link to comment
Share on other sites

if creg is tagging registers to create aliases for them, i don't have a clear answer.

I use it in complex code, for example in the GPU sprite-collision routine of Jalaga, but most of the time I code directly with the registers.

 

i may see things better with registers, and i'm using my own custom colors in notepad++ to help

 

 

  • Like 1
  • Thanks 1
Link to comment
Share on other sites

Slightly optimized/size reduced (250 > 186)

 

; LZ4 block depacker for the Jaguar DSP, transcribed from a 68000 routine.
;
; input: R20 : packed buffer
;		 R21 : output buffer
;		 R0  : LZ4 packed block size (in bytes)
;
; exit:  increments the long at YM_LZ4_nb_bloc_LZ4_disponibles, then jumps
;		 to DSP_boucle_centrale (this is a jump, not a subroutine return)
;
; Register mapping from the 68000 version:
; A4 => R24	end of the packed buffer
; A0 => R20	read pointer
; A1 => R21	write pointer
; A3 => R23	match source pointer
; D0 => R0	current token
; D1 => R1	length counter / match offset
; D2 => R2	length extension byte
; D4 => R4	constant $0F
;
; R10 : jump address 1 (.lenOffset_smallest_DSP)
; R11 : jump address 2 (reloaded before each use)
;
; R12 : $FF mask
; R13 : tmp

lz4_depack_smallest_DSP:
		move	R20,R24
		add	R0,R24			; R24 = packed buffer end
		; R0 and R2 are not cleared here: both are written by loadb
		; (which zero-extends the byte) before they are ever read.
		moveq	#$F,R4
		movei	#.lenOffset_smallest_DSP,R10
		movei	#$FF,R12

.tokenLoop_smallest_DSP:
		loadb	(R20),R0		; token: high nibble = literal length, low nibble = match length
		addq	#1,R20
		move	R0,R1
		shrq	#4,R1			; R1 = literal length nibble
		jump	eq,(R10)		; no literals: go straight to the match offset
		nop

.readLen_smallest1_DSP:
		cmp	R1,R4			; literal length == 15 ?
		jr	ne,.readEnd_smallest1_DSP
		; no nop on purpose: the loadb below fills the delay slot,
		; it only overwrites the scratch register R2
.readLoop_smallest1_DSP:
		loadb	(R20),R2		; length extension byte
		addq	#1,R20
		add	R2,R1			; final len could be > 64KiB
		not	R2
		and	R12,R2			; zero only when the byte read was $FF
		jr	eq,.readLoop_smallest1_DSP
		nop
.readEnd_smallest1_DSP:

.litcopy_smallest_DSP:
		loadb	(R20),R13		; copy R1 literal bytes from the packed stream
		addq	#1,R20
		subq	#1,R1			; sets the flags tested by the jr below
		storeb	R13,(R21)
		jr	ne,.litcopy_smallest_DSP
		addq	#1,R21			; delay slot: runs on every pass

		; end test is always done just after literals
		movei	#.readEnd_smallest_DSP,R11
		cmp	R20,R24
		jump	eq,(R11)
		nop

.lenOffset_smallest_DSP:
		loadb	(R20),R1	; read 16bits offset, little endian, unaligned
		addq	#1,R20
		loadb	(R20),R13
		addq	#1,R20
		shlq	#8,R13
		add	R13,R1

		move	R21,R23
		sub	R1,R23		; R1/d1 bits 31..16 are always 0 here

		moveq	#$F,R1
		and	R0,R1		; R1 = match length nibble of the token

.readLen_smallest2_DSP:
		cmp	R1,R4			; match length == 15 ?
		jr	ne,.readEnd_smallest2_DSP
		nop

.readLoop_smallest2_DSP:
		loadb	(R20),R2		; length extension byte
		addq	#1,R20
		add	R2,R1			; final len could be > 64KiB
		not	R2
		and	R12,R2			; zero only when the byte read was $FF
		jr	eq,.readLoop_smallest2_DSP
		nop

.readEnd_smallest2_DSP:
		addq	#4,R1			; stored match length is biased by 4

.copy_smallest_DSP:
		loadb	(R23),R13		; copy R1 bytes from earlier output, one byte at a time
		addq	#1,R23
		subq	#1,R1			; sets the flags tested by the jr below
		storeb	R13,(R21)
		jr	ne,.copy_smallest_DSP
		addqt	#1,R21			; delay slot: runs on every pass

		movei	#.tokenLoop_smallest_DSP,R11
		jump	(R11)
		nop

		; The stand-alone length reader inherited from the 68000 subroutine
		; was unreachable (both users above have their own inlined copy),
		; so it has been removed.

.readEnd_smallest_DSP:

; Exit path: count one more depacked block, then leave through DSP_boucle_centrale.
YM_DSP_retour_depack_LZ4_boucle_principale_DSP:
	movei	#YM_LZ4_nb_bloc_LZ4_disponibles,R0
	load	(R0),R1
	addq	#1,R1
	store	R1,(R0)

	movei	#DSP_boucle_centrale,R0
	jump	(R0)
	nop

 

 

  • Like 4
Link to comment
Share on other sites

Could save a few more, there are a couple of places you can do this:

 

; Token fetch as posted earlier: the jump's delay slot is filled with a nop.
.tokenLoop_smallest_DSP:
		loadb	(R20),R0		; read the token byte
		addq	#1,R20			; advance the read pointer
		move	R0,R1
		shrq	#4,R1			; R1 = high nibble of the token; result tested by the jump
		jump	eq,(R10)		; taken when the high nibble is zero
		nop				; delay slot, does no work
        
into this....

; Same token fetch with the pointer increment moved into the jump's delay slot.
.tokenLoop_smallest_DSP:
        loadb   (R20),r0        ; read the token byte
        move    r0,r1
        shrq    #4,r1           ; r1 = high nibble of the token; result tested by the jump
        jump    eq,(R10)        ; taken when the high nibble is zero
        addq    #1,r20          ; delay slot: replaces the nop, runs whether or not the jump is taken

 

  • Like 3
Link to comment
Share on other sites

Down to 142 bytes (previously 146), plus some speed optimization:

;;; -*-asm-*-

; LZ4 block depacker for the Jaguar DSP, size/speed reworked.
; Most jr/jump delay slots carry a useful instruction instead of a nop,
; so the instruction order around every branch is significant.
;
; input: R20 : packed buffer
;		 R21 : output buffer
;		 R0  : LZ4 packed block size (in bytes)
;
; exit:  increments YM_LZ4_nb_bloc_LZ4_disponibles, then jumps to
;		 DSP_boucle_centrale

; Register mapping from the 68000 version:
; A4 => R24
; A0 => R20
; A1 => R21
; A3 => R23
; D0 => R0
; D1 => R1
; D2 => R2
; D4 => R4

; jump address 1 => R10 (.lenOffset_smallest_DSP)
; jump address 2 => R11 (.tokenLoop_smallest_DSP)

; R12=$FF mask
; R13=tmp

lz4_depack_smallest_DSP:
		move	R20,R24
		add	R0,R24			; packed buffer end
		moveq	#$F,R4
		movei	#.lenOffset_smallest_DSP,R10
		movei	#.tokenLoop_smallest_DSP,R11
		movei	#$FF,R12

		loadb	(R20),R0		; first token; later tokens are fetched in the delay slot of the final jump
.tokenLoop_smallest_DSP:
		addqt	#1,R20			; step past the token
		move	R0,R1
		shrq	#4,R1			; R1 = literal length nibble
		jump	eq,(R10)		; no literals: go to the match offset
		and	r4,r0			; remove high nibble

.readLen_smallest1_DSP:
		cmp	R1,R4			; r1 == 15 ?
		loadb	(R20),R2		; first literal, or first length extension byte when r1 == 15
		jr	ne,.readEnd_smallest1a_DSP ; skip first addq in copy loop!
.readLoop_smallest1_DSP:
		addqt	#1,R20			; also the delay slot of the jr ne above
		add	R2,R1			; final len could be > 64KiB
		cmp	R12,R2			; r2 = $ff ?
		jr	eq,.readLoop_smallest1_DSP
		loadb	(R20),R2		; delay slot: next extension byte, or the first literal once the loop ends
.readEnd_smallest1_DSP:

.litcopy_smallest_DSP:
		addqt	#1,R20
.readEnd_smallest1a_DSP:
		subq	#1,R1			; sets the flags tested by the jr below
		storeb	R2,(R21)
		addqt	#1,R21
		jr	ne,.litcopy_smallest_DSP
		loadb	(R20),r2		; delay slot: next literal (after the last one, a look-ahead read)

		; end test is always done just after literals
		cmp	R20,R24
		jr	ne,.lenOffset_smallestx_DSP
		loadb	(R20),R1	; read 16bits offset, little endian, unaligned

;;; .readEnd_smallest_DSP:

;;; ----------------------------------------
	;; Exit: count one more depacked block, then jump to DSP_boucle_centrale
YM_DSP_retour_depack_LZ4_boucle_principale_DSP:
		movei	#YM_LZ4_nb_bloc_LZ4_disponibles,R0
		load	(R0),R1
		addq	#1,R1
		store	R1,(R0)

		movei	#DSP_boucle_centrale,R0
		jump	(R0)			; no nop: the loadb just below fills the delay slot (it overwrites R1)
;;; ----------------------------------------
.lenOffset_smallest_DSP:
		loadb	(R20),R1	; read 16bits offset, little endian, unaligned
.lenOffset_smallestx_DSP:
		move	R21,R23			; R23 = write pointer, becomes the match source below
		addqt	#1,R20
		loadb	(R20),R13		; high byte of the offset
		addqt	#1,R20

;;;readLen_smallest2_DSP
		cmp	R0,R4			; r0 == 15 ?
		jr	ne,.readEnd_smallest2_DSP	; delay slot is the loadb below; it only overwrites R2
.readLoop_smallest2_DSP:
		loadb	(R20),R2
		add	R2,R0			; final len could be > 64KiB
		cmp	R12,R2			; r2 = $ff ?
		jr	eq,.readLoop_smallest2_DSP
		addq	#1,R20			; delay slot: runs on every pass, including the last

.readEnd_smallest2_DSP:
	;; finish offset calculation
		shlq	#8,R13
		add	R13,R1
		sub	R1,R23		; R1/d1 bits 31..16 are always 0 here

		addq	#4,R0			; stored match length is biased by 4
.copy_smallest_DSP:
		loadb	(R23),R13		; copy R0 bytes from earlier output, one byte at a time
		addq	#1,R23
		subq	#1,R0			; sets the flags tested by the jr below
		storeb	R13,(R21)
		jr	ne,.copy_smallest_DSP
		addqt	#1,R21			; delay slot: runs on every pass

		jump	(R11)			; back to .tokenLoop_smallest_DSP
		loadb	(R20),R0		; delay slot: fetch the next token

		align 4
; long-word counter incremented on the exit path above
YM_LZ4_nb_bloc_LZ4_disponibles:	ds.l 1

 

Edited by 42bs
No "not" needed
  • Like 3
Link to comment
Share on other sites

So, final version. Register usage reduced, more comments.

 

;;; -*-asm-*-

;;; LZ4 block depacker for the Jaguar DSP (reduced register usage).
;;; Most jr/jump delay slots carry a useful instruction instead of a nop,
;;; so the instruction order around every branch is significant.
;;;
; input:
;;; R20 : packed buffer
;;; R21 : output buffer
;;; R0  : LZ4 packed block size (in bytes)
;;;
;;; exit: increments YM_LZ4_nb_bloc_LZ4_disponibles, then jumps to
;;;       DSP_boucle_centrale
;;;
;;; Register usage (destroyed!)
;;; r1,r2,r4,r10,r11,r12,r13
;;; (the inputs r0, r20 and r21 are modified as well)
;;;
;;; R1,R2     : temp register
;;; r4        : mask $0f
;;; r10       : jump destination
;;; r11       : jump destination
;;; r12       : mask $ff
;;; r13       : end of packed data

lz4_depack_smallest_DSP:
		move	R20,R13
		add	R0,R13			; packed buffer end
		moveq	#$F,R4
		movei	#.lenOffset_smallest_DSP,R10
		movei	#.tokenLoop_smallest_DSP,R11
		movei	#$FF,R12

		loadb	(R20),R0		; first token; later tokens are fetched in the delay slot of the final jump
.tokenLoop_smallest_DSP:
		addqt	#1,R20			; step past the token
		move	R0,R1
		shrq	#4,R1			; R1 = literal length nibble
		jump	eq,(R10)		; no literals: go to the match offset
		and	r4,r0			; remove high nibble

.readLen_smallest1_DSP:
		cmp	R1,R4			; r1 == 15 ?
		loadb	(R20),R2		; first literal, or first length extension byte when r1 == 15
		jr	ne,.readEnd_smallest1a_DSP ; skip first addq in copy loop!
.readLoop_smallest1_DSP:
		addqt	#1,R20			; also the delay slot of the jr ne above
		add	R2,R1			; final len could be > 64KiB
		cmp	R12,R2			; r2 = $ff ?
		jr	eq,.readLoop_smallest1_DSP
		loadb	(R20),R2		; delay slot: next extension byte, or the first literal once the loop ends
.readEnd_smallest1_DSP:

.litcopy_smallest_DSP:
		addqt	#1,R20
.readEnd_smallest1a_DSP:
		subq	#1,R1			; sets the flags tested by the jr below
		storeb	R2,(R21)
		addqt	#1,R21
		jr	ne,.litcopy_smallest_DSP
		loadb	(R20),r2		; delay slot: next literal (after the last one, a look-ahead read)

		; end test is always done just after literals
		cmp	R20,R13
		jr	ne,.lenOffset_smallestx_DSP
		loadb	(R20),R1	; read 16bits offset, little endian, unaligned

;;; .readEnd_smallest_DSP:

;;; ----------------------------------------
	;; Exit: count one more depacked block, then jump to DSP_boucle_centrale
YM_DSP_retour_depack_LZ4_boucle_principale_DSP:
		movei	#YM_LZ4_nb_bloc_LZ4_disponibles,R0
		load	(R0),R1
		addq	#1,R1
		store	R1,(R0)

		movei	#DSP_boucle_centrale,R0
		jump	(R0)			; no nop: the loadb just below fills the delay slot (it overwrites R1)
;;; ----------------------------------------
.lenOffset_smallest_DSP:
		loadb	(R20),R1	; read 16bits offset, little endian, unaligned
.lenOffset_smallestx_DSP:
		addqt	#1,R20
		loadb	(R20),R2		; high byte of the offset
		addqt	#1,R20
		shlq	#8,R2
		add	R2,R1			; R1 = 16-bit offset
		neg	r1
		add	r21,r1		; source = dest - offset

;;;readLen_smallest2_DSP
		cmp	R0,R4		; r0 == 15 ?
		jr	ne,.readEnd_smallest2_DSP	; delay slot is the loadb below; it only overwrites R2
.readLoop_smallest2_DSP:
		loadb	(R20),R2
		add	R2,R0		; final len could be > 64KiB
		cmp	R12,R2		; r2 = $ff ?
		jr	eq,.readLoop_smallest2_DSP
		addq	#1,R20			; delay slot: runs on every pass, including the last

.readEnd_smallest2_DSP:

		addq	#4,R0			; stored match length is biased by 4
.copy_smallest_DSP:
		loadb	(R1),R2			; copy R0 bytes from earlier output, one byte at a time
		addq	#1,R1
		subq	#1,R0			; sets the flags tested by the jr below
		storeb	R2,(R21)
		jr	ne,.copy_smallest_DSP
		addqt	#1,R21			; delay slot: runs on every pass

		jump	(R11)			; back to .tokenLoop_smallest_DSP
		loadb	(R20),R0		; delay slot: fetch the next token

		align 4
; long-word counter incremented on the exit path above
YM_LZ4_nb_bloc_LZ4_disponibles:	ds.l 1

 

  • Like 2
Link to comment
Share on other sites

regarding this depacking using the dsp, while playing a module, during the lz4 depacking, the music seems to be off tone.

it seems the interrupts are not able to do all their duty

 

Aren't the I2S and timer 1 interrupts supposed to have priority over the main DSP code?

 

replay frequency is ~16000 Hz

Link to comment
Share on other sites

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.
Note: Your post will require moderator approval before it will be visible.

Guest
Reply to this topic...

×   Pasted as rich text.   Paste as plain text instead

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

Loading...
  • Recently Browsing   0 members

    • No registered users viewing this page.
×
×
  • Create New...