Jump to content
IGNORED

Sizecoding reaches Jaggy


42bs

Recommended Posts

Spoiler

;;*****************
;;; JagMona - A port of Mona from Ilmenit
;;; Size: 304 bytes

	.68000

	.include "help.mac"
	.include "jaguar.inc"	; get equates
	.include "video.mac"

ScreenMode	EQU RGB16|VIDEN|PWIDTH4|BGEN|CSYNC

;PAL .equ 1
VID_MODE	EQU PAL

screen		EQU $400

stacktop	equ $4000	; ROM sets SP to this address

;---------------
start:
	lea	$f00000,a5
 IF ^^defined SKUNK
	; SKUNK does not setup video!
	VideoInit
 ENDIF
	move.w	#ScreenMode,$28(a5)
;;; --------------------
;;; CLUT init
;;; --------------------
	lea	$400(a5),a0
	move.l	#0|($14<<11)|(0<<6)|$16,(a0)+
	move.l	#((($1d<<11)|(8<<6)|$27)<<16)|(($1f<<11)|($11<<6)|$38),(a0)
;;; --------------------
;;; Init Interrupts
;;; --------------------
	lea	my_irq(pc),a0
	move.l	a0,$0100.w
	move.l	#$1f01<<16,a6
	bsr.s	irq_init
;;; --------------------
;;; Init OP
;;; --------------------
 IF ^^defined SKUNK
	move.w	#$2000,sr	; not needed after BIOS
 ENDIF
 IF ^^defined DEBUG
	moveq	#1,d0
	move.l	d0,$20(a5)
 ELSE
	clr.l	$20(a5)	 ; OPL = $0
 ENDIF

main:
	lea	brush(pc),a1
    move.l (a1)+,d7         ; ggn
	;move.l	#$7ec80000,d7	; seed
	moveq.l	#64,d6
loop0:
	move.b	(a1),d1		; y
	;move.w	(a1)+,d7    ; ggn
	move.b	d7,d0		; x
	move.w	d6,d2
	subq.w	#1,d6
finish:
	beq.s	finish
	lsl.w	#5,d2
loop:
	addx.l	d7,d7
	bcc.s	noc
	moveq	#-1,d4
	eor.l	#$4c11db7,d7
	move.b	d7,d5
	bmi.s	noc
	moveq	#1,d4
noc:
	btst	#1,d5
	beq.s	_y
	add.w	d4,d0
	bra.s	cont
_y:
	add.w	d4,d1
cont:
	moveq	#$7f,d3
	and.w	d3,d0
	and.w	d3,d1
	move.w	d0,a0
	move.w	d1,d3
	lsl.w	#7,d3
	add.w	d3,a0
	moveq	#3,d3
	and.w	d6,d3
	move.b	d3,screen(a0)
	subq.w	#1,d2
	bne.s	loop
    move.w	(a1)+,d7    ; ggn
	bra.s	loop0

;;****************
;;      IRQ      *
irq_init:
	move.w	sr,-(sp)	// prepare rte
my_irq:

CopyOBL:
	lea	OBL0(pc),a3
 IF ^^defined DEBUG
	move.l	#$10000,a4
 ELSE
	suba.l	a4,a4
 ENDIF
	move.l	(a3)+,(a4)+
	move.l	(a3)+,(a4)+
	move.l	(a3)+,(a4)+
	move.l	(a3)+,(a4)+
	move.w	#4,6(a4)	;add STOP object

	move.l	a6,$e0(a5)
	rte

brush:
    dc.w $7ec8 ; ggn
	dc.w $030A, $37BE, $2F9B, $072B, $0E3C, $F59B, $8A91, $1B0B
	dc.w $0EBD, $9378, $B83E, $B05A, $70B5, $0280, $D0B1, $9CD2
	dc.w $2093, $209C, $3D11, $26D6, $DF19, $97F5, $90A3, $A347
	dc.w $8AF7, $0859, $29AD, $A32C, $7DFC, $0D7D, $D57A, $3051
	dc.w $D431, $542B, $B242, $B114, $8A96, $2914, $B0F1, $532C
	dc.w $0413, $0A09, $3EBB, $E916, $1877, $B8E2, $AC72, $80C7
	dc.w $5240, $8D3C, $3EAF, $AD63, $1E14, $B23D, $238F, $C07B
	dc.w $AF9D, $312E, $96CE, $25A7, $9E37, $2C44, $2BB9, $2139
OBL0:
	.objproc
 IF ^^defined DEBUG
	.org	$10000
 ELSE
	.org	$0
 ENDIF
 IF ^^defined SKUNK
xpos	equ 9+(320-128)/2
 ELSE
xpos	equ (320-128)/2
 ENDIF
	bitmap screen-10*128, xpos, 41+(200-96)/2, 128/8, 128/8,96+10, 3, 0, NOTRANS, 0 ,1
	.68000
OBL0_end:

 IF ^^defined SKUNK
	VideoData
 ENDIF

jag_end:
m68k_size = jag_end-start

	size	= (jag_end - start)
	print "Total Size:",/u size,"\nm68k Size: ",/u m68k_size

 IF ^^defined DEBUG = 0 & ^^defined SKUNK = 0 & size < 512
	rept 512-(jag_end-start)
	dc.b $42
	endr
 ENDIF
;;****************

	END

 

 

So this is what I was talking about in my first reply. It does shave off 2 bytes from the binary, but I can't verify it still works as virtualjaguar can't run it here, even with SKUNK defined I'm afraid...

Link to comment
Share on other sites

1 hour ago, ggn said:
  Hide contents


;;*****************
;;; JagMona - A port of Mona from Ilmenit
;;; Size: 304 bytes

	.68000

	.include "help.mac"
	.include "jaguar.inc"	; get equates
	.include "video.mac"

ScreenMode	EQU RGB16|VIDEN|PWIDTH4|BGEN|CSYNC

;PAL .equ 1
VID_MODE	EQU PAL

screen		EQU $400

stacktop	equ $4000	; ROM sets SP to this address

;---------------
start:
	lea	$f00000,a5
 IF ^^defined SKUNK
	; SKUNK does not setup video!
	VideoInit
 ENDIF
	move.w	#ScreenMode,$28(a5)
;;; --------------------
;;; CLUT init
;;; --------------------
	lea	$400(a5),a0
	move.l	#0|($14<<11)|(0<<6)|$16,(a0)+
	move.l	#((($1d<<11)|(8<<6)|$27)<<16)|(($1f<<11)|($11<<6)|$38),(a0)
;;; --------------------
;;; Init Interrupts
;;; --------------------
	lea	my_irq(pc),a0
	move.l	a0,$0100.w
	move.l	#$1f01<<16,a6
	bsr.s	irq_init
;;; --------------------
;;; Init OP
;;; --------------------
 IF ^^defined SKUNK
	move.w	#$2000,sr	; not needed after BIOS
 ENDIF
 IF ^^defined DEBUG
	moveq	#1,d0
	move.l	d0,$20(a5)
 ELSE
	clr.l	$20(a5)	 ; OPL = $0
 ENDIF

main:
	lea	brush(pc),a1
    move.l (a1)+,d7         ; ggn
	;move.l	#$7ec80000,d7	; seed
	moveq.l	#64,d6
loop0:
	move.b	(a1),d1		; y
	;move.w	(a1)+,d7    ; ggn
	move.b	d7,d0		; x
	move.w	d6,d2
	subq.w	#1,d6
finish:
	beq.s	finish
	lsl.w	#5,d2
loop:
	addx.l	d7,d7
	bcc.s	noc
	moveq	#-1,d4
	eor.l	#$4c11db7,d7
	move.b	d7,d5
	bmi.s	noc
	moveq	#1,d4
noc:
	btst	#1,d5
	beq.s	_y
	add.w	d4,d0
	bra.s	cont
_y:
	add.w	d4,d1
cont:
	moveq	#$7f,d3
	and.w	d3,d0
	and.w	d3,d1
	move.w	d0,a0
	move.w	d1,d3
	lsl.w	#7,d3
	add.w	d3,a0
	moveq	#3,d3
	and.w	d6,d3
	move.b	d3,screen(a0)
	subq.w	#1,d2
	bne.s	loop
    move.w	(a1)+,d7    ; ggn
	bra.s	loop0

;;****************
;;      IRQ      *
irq_init:
	move.w	sr,-(sp)	// prepare rte
my_irq:

CopyOBL:
	lea	OBL0(pc),a3
 IF ^^defined DEBUG
	move.l	#$10000,a4
 ELSE
	suba.l	a4,a4
 ENDIF
	move.l	(a3)+,(a4)+
	move.l	(a3)+,(a4)+
	move.l	(a3)+,(a4)+
	move.l	(a3)+,(a4)+
	move.w	#4,6(a4)	;add STOP object

	move.l	a6,$e0(a5)
	rte

brush:
    dc.w $7ec8 ; ggn
	dc.w $030A, $37BE, $2F9B, $072B, $0E3C, $F59B, $8A91, $1B0B
	dc.w $0EBD, $9378, $B83E, $B05A, $70B5, $0280, $D0B1, $9CD2
	dc.w $2093, $209C, $3D11, $26D6, $DF19, $97F5, $90A3, $A347
	dc.w $8AF7, $0859, $29AD, $A32C, $7DFC, $0D7D, $D57A, $3051
	dc.w $D431, $542B, $B242, $B114, $8A96, $2914, $B0F1, $532C
	dc.w $0413, $0A09, $3EBB, $E916, $1877, $B8E2, $AC72, $80C7
	dc.w $5240, $8D3C, $3EAF, $AD63, $1E14, $B23D, $238F, $C07B
	dc.w $AF9D, $312E, $96CE, $25A7, $9E37, $2C44, $2BB9, $2139
OBL0:
	.objproc
 IF ^^defined DEBUG
	.org	$10000
 ELSE
	.org	$0
 ENDIF
 IF ^^defined SKUNK
xpos	equ 9+(320-128)/2
 ELSE
xpos	equ (320-128)/2
 ENDIF
	bitmap screen-10*128, xpos, 41+(200-96)/2, 128/8, 128/8,96+10, 3, 0, NOTRANS, 0 ,1
	.68000
OBL0_end:

 IF ^^defined SKUNK
	VideoData
 ENDIF

jag_end:
m68k_size = jag_end-start

	size	= (jag_end - start)
	print "Total Size:",/u size,"\nm68k Size: ",/u m68k_size

 IF ^^defined DEBUG = 0 & ^^defined SKUNK = 0 & size < 512
	rept 512-(jag_end-start)
	dc.b $42
	endr
 ENDIF
;;****************

	END

 

 

So this is what I was talking about in my first reply. It does shave off 2 bytes from the binary, but I can't verify it still works as virtualjaguar can't run it here, even with SKUNK defined I'm afraid...

It does not work as A1 points to the next seed/coordinates. 

Link to comment
Share on other sites

That's why I moved the "move.w (a1)+,d7" at the end of the loop. First move.l reads 7ec8 as well as the first seed, then the move.w should read the second seed after the first is consumed.

 

(Unless I misunderstood something)

Link to comment
Share on other sites

2 minutes ago, ggn said:

That's why I moved the "move.w (a1)+,d7" at the end of the loop. First move.l reads 7ec8 as well as the first seed, then the move.w should read the second seed after the first is consumed.

 

(Unless I misunderstood something)

Yes, the next load must be at the end, but the (A1)+ has moved A1 to point to brush[n+1]. The move.b (A1) to get y reads therefore y[n+1] and not y[n]. That's why monast uses a move and lsr.

 

Link to comment
Share on other sites

18 minutes ago, ggn said:

Right, got it, sorry for massively wasting your time!

No waste of time. I mean, I spent 4 hours to shrink it down to 314 bytes. And even though your idea did not work it was a reason to re-think other parts. Which in the end resulted in a 304 byte version. Well, goal (256b) missed, but that was clear from the beginning. 

  • Like 1
Link to comment
Share on other sites

  • 2 weeks later...
  • 2 weeks later...
26 minutes ago, Cyprian said:

cool.

is there a visible speed difference between GPU only and GPU/DSP?

Hard to say. I need to make a side by side video (or measure the time).

I have a line-interleaved version where one can see that the DSP is slightly slower (guess due to the 16bit bus) than the GPU.

Link to comment
Share on other sites

  • 2 weeks later...
9 minutes ago, DEATH said:

the 2 processors run at exactly the same speed on a real Jaguar

I cannot confirm. DSP is slower.

Actually I think pixel or line interleaving is a bad idea as it does not take the DRAM pages into account.

 

9 minutes ago, DEATH said:

almost 1 year ago

Did it 25 years ago ;-)

 

;;
;; 24.09.95     new videoinit,button-control ...
;; 07.05.96     adapted for NEWSRC
;; 08.10.96     Tom & Jerry parallel

 

5 minutes ago, DEATH said:

Oh and it takes about 10sec to calculate a mandelbrot image in 640*480 256 colors

How many fix-point bits and max. interations?

Edited by 42bs
  • Like 1
Link to comment
Share on other sites

The 2 processors use exactly the same code. "Strictly" the same. And they start at the same time.

DRAM access time is not really taken into account given the calculation time between each point.

Theoretically 1 processor could finish before the other because from one pixel to another the calculation time is not the same, but statistically the total calculation time of the pixels that each processor must calculate must be about the same (in interlaced pixel mode. In interlaced line it's another story...)
My code is based on the original Jaguar SDK exemple called "JAGMAND"
You can find all the history of its genesis, sources and .cof on Yaronet: https://www.yaronet.com/topics/192849-mandelbrot-popopo
In French only.

Be careful to take the latest version https://www.yaronet.com/topics/192849-mandelbrot-popopo/5#post-125 or https://www.mirari.fr/FI8f

Link to comment
Share on other sites

2 hours ago, 42bs said:

In my 256 version also. But the GPU uses 32 bit bus, the DSP a 16bit bus. Thus a slight difference.

Pixels are 8 bit, calculations are 8 bit, ploting results in DRAM are 8 bit. so...

I have already thought about the possibility of performing 4 calculations (for the GPU, or even 8 ) in a row and then writing the result in DRAM, but the relative addition of complexity in the code would most certainly have no effect on the speed, it might even be worse.

Link to comment
Share on other sites

Just now, DEATH said:

Pixels are 8 bit, calculations are 8 bit, ploting results in DRAM are 8 bit. so...

I have already thought about the possibility of performing 4 calculations (for the GPU, or even 8 ) in a row and then writing the result in DRAM, but the relative addition of complexity in the code would most certainly have no effect on the speed, it might even be worse.

At least phrase writing brings no benefit. The write to the HIDATA register takes "nearly" the same time as writing a word to RAM. And the calculations to get a new pixel value take by far more time.

 

I did not analyze the code in depth, but I guess GPU writes to one half-image and DSP to the other. So DRAM accesses of both to not disturb each other.

 

But the interlace stuff is fun. But I doubt it'll fit into a 256 byte intro. Maybe need to go for 4K ?

 

Do you plan to enhance the program to allow zooming in (like I did in tj_mandel)?

Link to comment
Share on other sites

37 minutes ago, 42bs said:

I recommend to have "stop $2000" in the endless loop, it saves a lot of time. (Well "a lot" is relative ;-) )

All calculations are done by JRISC entirely internaly (I even believe that I don't use the local ram, only the registers). Stopping the 68000 would be counterproductive because you also have to manage everything else (in interlaced you have to make changes at each frame, + refresh the object list).

Link to comment
Share on other sites

1 minute ago, DEATH said:

All calculations are done by JRISC entirely internaly (I even believe that I don't use the local ram, only the registers). Stopping the 68000 would be counterproductive because you also have to manage everything else (in interlaced you have to make changes at each frame, + refresh the object list).

"stop $2000" will only halt the cpu until the next interrupt. So if you have a tight endless loop in the 68k doing nothing, stop will release the bus.

Note: _not_ "stop $2700" ;-)

  • Like 1
Link to comment
Share on other sites

4 minutes ago, 42bs said:

At least phrase writing brings no benefit. The write to the HIDATA register takes "nearly" the same time as writing a word to RAM. And the calculations to get a new pixel value take by far more time.

 

I did not analyze the code in depth, but I guess GPU writes to one half-image and DSP to the other. So DRAM accesses of both to not disturb each other.

 

But the interlace stuff is fun. But I doubt it'll fit into a 256 byte intro. Maybe need to go for 4K ?

 

Do you plan to enhance the program to allow zooming in (like I did in tj_mandel)?

the code is relatively "heavy" because I kept part of the old one, especially the one that displays the text at the beginning. From memory I think it's a 16bit image... or 8... yes, for a single color. It's Atari....

As you can see, it was a year ago. I haven't touched it since. I just made a small program that displays a nice image in 640*480 65k colors for Valentine's Day on February 14th.
(I tried in 640*480 16Mil colors, but it's mega f**ing crap to program that on Jaguar... I gave up, I have other things to do :)

Link to comment
Share on other sites

1 hour ago, 42bs said:

Wow!

 

Really 640x480?  Do you have the sources public somewhere? Or are the ones linked at the beginning of the thread the current?

I had posted the sources at the very beginning of the thread on Yaronet, but of course they changed a lot afterwards. I can put them back somewhere if needed.

(for information I have just checked, the image of the beginning is 18Kb by itself. 192*48 pixels...16bit. For 1 single color...)

  • Haha 1
Link to comment
Share on other sites

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.
Note: Your post will require moderator approval before it will be visible.

Guest
Reply to this topic...

×   Pasted as rich text.   Paste as plain text instead

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

Loading...
  • Recently Browsing   0 members

    • No registered users viewing this page.
×
×
  • Create New...