ggn Posted March 4, 2022 Share Posted March 4, 2022 Spoiler
;;*****************
;;; JagMona - A port of Mona from Ilmenit
;;; Size: 304 bytes
;;;
;;; Overview (68000 code, rmac-style syntax):
;;;   start    - video mode / CLUT / interrupt / object-list-pointer setup
;;;   main     - draws 'brush' strokes into a 128-byte-wide byte buffer at 'screen'
;;;   irq_init - one-time entry that falls through into the interrupt handler
;;;   my_irq   - copies the object list (OBL0) to its run-time address
;;;   brush    - seed word followed by 64 stroke words (high byte = y, low byte = x)
;;;   OBL0     - object list: one bitmap object showing the buffer

        .68000
        .include "help.mac"
        .include "jaguar.inc"           ; get equates
        .include "video.mac"

ScreenMode      EQU RGB16|VIDEN|PWIDTH4|BGEN|CSYNC
;PAL .equ 1
VID_MODE        EQU PAL
screen          EQU $400                ; start of the byte-per-pixel draw buffer
stacktop        equ $4000               ; ROM sets SP to this address
;---------------
start:
        lea     $f00000,a5              ; a5 = base for all $xx(a5) register accesses below
        IF ^^defined SKUNK              ; SKUNK does not setup video!
        VideoInit
        ENDIF
        move.w  #ScreenMode,$28(a5)     ; NOTE(review): presumably the video mode register - confirm in jaguar.inc
;;; --------------------
;;; CLUT init
;;; --------------------
        ;; two long writes = four 16-bit colour entries (indices 0..3,
        ;; matching the 'and.w d6,d3' colour mask used in the draw loop)
        lea     $400(a5),a0
        move.l  #0|($14<<11)|(0<<6)|$16,(a0)+
        move.l  #((($1d<<11)|(8<<6)|$27)<<16)|(($1f<<11)|($11<<6)|$38),(a0)
;;; --------------------
;;; Init Interrupts
;;; --------------------
        lea     my_irq(pc),a0
        move.l  a0,$0100.w              ; store handler address at absolute $100
        move.l  #$1f01<<16,a6           ; a6 = value my_irq writes to $e0(a5) on every call
        bsr.s   irq_init                ; run the handler once; it returns here via rte
;;; --------------------
;;; Init OP
;;; --------------------
        IF ^^defined SKUNK
        move.w  #$2000,sr               ; not needed after BIOS
        ENDIF
        IF ^^defined DEBUG
        moveq   #1,d0
        move.l  d0,$20(a5)              ; DEBUG: same register gets 1 (object list is copied to $10000 in this build)
        ELSE
        clr.l   $20(a5)                 ; OPL = $0
        ENDIF

;;; --------------------
;;; Draw loop
;;;   a1 = pointer into brush table
;;;   d7 = shift register / seed (high word carried over between strokes)
;;;   d6 = stroke counter, counts down from 64; also supplies the colour (d6 & 3)
;;;   d2 = steps left in current stroke (stroke counter * 32)
;;;   d0 = x, d1 = y (both wrapped to 0..127), d4 = step (+1/-1), d5 = direction bits
;;; --------------------
main:
        lea     brush(pc),a1
        move.l  (a1)+,d7                ; ggn  - loads $7ec8 (high word) + first brush word (low word); a1 -> second brush word
        ;move.l #$7ec80000,d7 ; seed
        moveq.l #64,d6
loop0:
        ;; NOTE(review): a1 has already been post-incremented past the word
        ;; now held in d7 (by the move.l above / the move.w at the end of
        ;; the loop), so this reads the high byte of the NEXT brush word,
        ;; while x below comes from the current one. This is the problem
        ;; raised in the replies to this post.
        move.b  (a1),d1                 ; y
        ;move.w (a1)+,d7 ; ggn
        move.b  d7,d0                   ; x
        move.w  d6,d2
        subq.w  #1,d6
finish: beq.s   finish                  ; d6 reached 0 (Z from the subq above): spin here forever
        lsl.w   #5,d2                   ; d2 = (counter before decrement) * 32 steps
loop:
        addx.l  d7,d7                   ; d7 = d7 + d7 + X, i.e. shift left one bit
        bcc.s   noc                     ; no bit shifted out: keep previous d4/d5
        moveq   #-1,d4
        eor.l   #$4c11db7,d7            ; feedback XOR applied when a bit was shifted out
        move.b  d7,d5                   ; refresh direction bits from low byte
        bmi.s   noc                     ; bit 7 of that byte set: step stays -1
        moveq   #1,d4                   ; otherwise step = +1
noc:
        btst    #1,d5                   ; bit 1 selects the axis to move on
        beq.s   _y
        add.w   d4,d0                   ; move in x
        bra.s   cont
_y:
        add.w   d4,d1                   ; move in y
cont:
        moveq   #$7f,d3
        and.w   d3,d0                   ; wrap x to 0..127
        and.w   d3,d1                   ; wrap y to 0..127
        move.w  d0,a0
        move.w  d1,d3
        lsl.w   #7,d3
        add.w   d3,a0                   ; a0 = y*128 + x
        moveq   #3,d3
        and.w   d6,d3                   ; colour index = stroke counter & 3
        move.b  d3,screen(a0)           ; plot one byte
        subq.w  #1,d2
        bne.s   loop
        move.w  (a1)+,d7                ; ggn  - next brush word into low word of d7; a1 advances again
        bra.s   loop0
;;****************
;; IRQ *
;; irq_init is reached by 'bsr.s' (return address already on the stack);
;; pushing SR on top lets the handler's 'rte' pop SR and then that return
;; address, so the same code serves as both init call and interrupt handler.
irq_init:
        move.w  sr,-(sp)                // prepare rte
my_irq:
CopyOBL:
        lea     OBL0(pc),a3
        IF ^^defined DEBUG
        move.l  #$10000,a4
        ELSE
        suba.l  a4,a4                   ; a4 = 0: destination is address 0
        ENDIF
        ;; copy 4 longs (16 bytes) from OBL0 to the destination
        move.l  (a3)+,(a4)+
        move.l  (a3)+,(a4)+
        move.l  (a3)+,(a4)+
        move.l  (a3)+,(a4)+
        move.w  #4,6(a4)                ;add STOP object  (word at destination+22)
        move.l  a6,$e0(a5)              ; NOTE(review): presumably interrupt enable/acknowledge - confirm in jaguar.inc
        rte

;; Stroke table. The first word is the initial high word of d7; it is
;; fetched together with the first stroke word by 'move.l (a1)+,d7'.
brush:
        dc.w    $7ec8                   ; ggn
        dc.w    $030A, $37BE, $2F9B, $072B, $0E3C, $F59B, $8A91, $1B0B
        dc.w    $0EBD, $9378, $B83E, $B05A, $70B5, $0280, $D0B1, $9CD2
        dc.w    $2093, $209C, $3D11, $26D6, $DF19, $97F5, $90A3, $A347
        dc.w    $8AF7, $0859, $29AD, $A32C, $7DFC, $0D7D, $D57A, $3051
        dc.w    $D431, $542B, $B242, $B114, $8A96, $2914, $B0F1, $532C
        dc.w    $0413, $0A09, $3EBB, $E916, $1877, $B8E2, $AC72, $80C7
        dc.w    $5240, $8D3C, $3EAF, $AD63, $1E14, $B23D, $238F, $C07B
        dc.w    $AF9D, $312E, $96CE, $25A7, $9E37, $2C44, $2BB9, $2139

;; Object list template, assembled for the address it is copied to by my_irq.
OBL0:
        .objproc
        IF ^^defined DEBUG
        .org    $10000
        ELSE
        .org    $0
        ENDIF
        IF ^^defined SKUNK
xpos    equ 9+(320-128)/2
        ELSE
xpos    equ (320-128)/2
        ENDIF
        bitmap  screen-10*128, xpos, 41+(200-96)/2, 128/8, 128/8,96+10, 3, 0, NOTRANS, 0 ,1
        .68000
OBL0_end:
        IF ^^defined SKUNK
        VideoData
        ENDIF
jag_end:
;; Size report; non-DEBUG, non-SKUNK builds below 512 bytes are padded to 512 with $42.
m68k_size       = jag_end-start
size            = (jag_end - start)
        print "Total Size:",/u size,"\nm68k Size: ",/u m68k_size
        IF ^^defined DEBUG = 0 & ^^defined SKUNK = 0 & size < 512
        rept    512-(jag_end-start)
        dc.b    $42
        endr
        ENDIF
;;****************
        END
So this is what I was talking about in my first reply. It does shave off 2 bytes from the binary, but I can't verify it still works as virtualjaguar can't run it here, even with SKUNK defined I'm afraid... Quote Link to comment Share on other sites More sharing options...
42bs Posted March 4, 2022 Author Share Posted March 4, 2022 COF won't run, ROM only if BIOS runs. The emulators do not allow OBL at 0. BIOS=1 make vj_rom should work though. Quote Link to comment Share on other sites More sharing options...
42bs Posted March 4, 2022 Author Share Posted March 4, 2022 1 hour ago, ggn said: Hide contents
;;*****************
;;; JagMona - A port of Mona from Ilmenit
;;; Size: 304 bytes
;;;
;;; Overview (68000 code, rmac-style syntax):
;;;   start    - video mode / CLUT / interrupt / object-list-pointer setup
;;;   main     - draws 'brush' strokes into a 128-byte-wide byte buffer at 'screen'
;;;   irq_init - one-time entry that falls through into the interrupt handler
;;;   my_irq   - copies the object list (OBL0) to its run-time address
;;;   brush    - seed word followed by 64 stroke words (high byte = y, low byte = x)
;;;   OBL0     - object list: one bitmap object showing the buffer

        .68000
        .include "help.mac"
        .include "jaguar.inc"           ; get equates
        .include "video.mac"

ScreenMode      EQU RGB16|VIDEN|PWIDTH4|BGEN|CSYNC
;PAL .equ 1
VID_MODE        EQU PAL
screen          EQU $400                ; start of the byte-per-pixel draw buffer
stacktop        equ $4000               ; ROM sets SP to this address
;---------------
start:
        lea     $f00000,a5              ; a5 = base for all $xx(a5) register accesses below
        IF ^^defined SKUNK              ; SKUNK does not setup video!
        VideoInit
        ENDIF
        move.w  #ScreenMode,$28(a5)     ; NOTE(review): presumably the video mode register - confirm in jaguar.inc
;;; --------------------
;;; CLUT init
;;; --------------------
        ;; two long writes = four 16-bit colour entries (indices 0..3,
        ;; matching the 'and.w d6,d3' colour mask used in the draw loop)
        lea     $400(a5),a0
        move.l  #0|($14<<11)|(0<<6)|$16,(a0)+
        move.l  #((($1d<<11)|(8<<6)|$27)<<16)|(($1f<<11)|($11<<6)|$38),(a0)
;;; --------------------
;;; Init Interrupts
;;; --------------------
        lea     my_irq(pc),a0
        move.l  a0,$0100.w              ; store handler address at absolute $100
        move.l  #$1f01<<16,a6           ; a6 = value my_irq writes to $e0(a5) on every call
        bsr.s   irq_init                ; run the handler once; it returns here via rte
;;; --------------------
;;; Init OP
;;; --------------------
        IF ^^defined SKUNK
        move.w  #$2000,sr               ; not needed after BIOS
        ENDIF
        IF ^^defined DEBUG
        moveq   #1,d0
        move.l  d0,$20(a5)              ; DEBUG: same register gets 1 (object list is copied to $10000 in this build)
        ELSE
        clr.l   $20(a5)                 ; OPL = $0
        ENDIF

;;; --------------------
;;; Draw loop
;;;   a1 = pointer into brush table
;;;   d7 = shift register / seed (high word carried over between strokes)
;;;   d6 = stroke counter, counts down from 64; also supplies the colour (d6 & 3)
;;;   d2 = steps left in current stroke (stroke counter * 32)
;;;   d0 = x, d1 = y (both wrapped to 0..127), d4 = step (+1/-1), d5 = direction bits
;;; --------------------
main:
        lea     brush(pc),a1
        move.l  (a1)+,d7                ; ggn  - loads $7ec8 (high word) + first brush word (low word); a1 -> second brush word
        ;move.l #$7ec80000,d7 ; seed
        moveq.l #64,d6
loop0:
        ;; NOTE(review): a1 has already been post-incremented past the word
        ;; now held in d7 (by the move.l above / the move.w at the end of
        ;; the loop), so this reads the high byte of the NEXT brush word,
        ;; while x below comes from the current one. This is the objection
        ;; stated directly after this quoted listing.
        move.b  (a1),d1                 ; y
        ;move.w (a1)+,d7 ; ggn
        move.b  d7,d0                   ; x
        move.w  d6,d2
        subq.w  #1,d6
finish: beq.s   finish                  ; d6 reached 0 (Z from the subq above): spin here forever
        lsl.w   #5,d2                   ; d2 = (counter before decrement) * 32 steps
loop:
        addx.l  d7,d7                   ; d7 = d7 + d7 + X, i.e. shift left one bit
        bcc.s   noc                     ; no bit shifted out: keep previous d4/d5
        moveq   #-1,d4
        eor.l   #$4c11db7,d7            ; feedback XOR applied when a bit was shifted out
        move.b  d7,d5                   ; refresh direction bits from low byte
        bmi.s   noc                     ; bit 7 of that byte set: step stays -1
        moveq   #1,d4                   ; otherwise step = +1
noc:
        btst    #1,d5                   ; bit 1 selects the axis to move on
        beq.s   _y
        add.w   d4,d0                   ; move in x
        bra.s   cont
_y:
        add.w   d4,d1                   ; move in y
cont:
        moveq   #$7f,d3
        and.w   d3,d0                   ; wrap x to 0..127
        and.w   d3,d1                   ; wrap y to 0..127
        move.w  d0,a0
        move.w  d1,d3
        lsl.w   #7,d3
        add.w   d3,a0                   ; a0 = y*128 + x
        moveq   #3,d3
        and.w   d6,d3                   ; colour index = stroke counter & 3
        move.b  d3,screen(a0)           ; plot one byte
        subq.w  #1,d2
        bne.s   loop
        move.w  (a1)+,d7                ; ggn  - next brush word into low word of d7; a1 advances again
        bra.s   loop0
;;****************
;; IRQ *
;; irq_init is reached by 'bsr.s' (return address already on the stack);
;; pushing SR on top lets the handler's 'rte' pop SR and then that return
;; address, so the same code serves as both init call and interrupt handler.
irq_init:
        move.w  sr,-(sp)                // prepare rte
my_irq:
CopyOBL:
        lea     OBL0(pc),a3
        IF ^^defined DEBUG
        move.l  #$10000,a4
        ELSE
        suba.l  a4,a4                   ; a4 = 0: destination is address 0
        ENDIF
        ;; copy 4 longs (16 bytes) from OBL0 to the destination
        move.l  (a3)+,(a4)+
        move.l  (a3)+,(a4)+
        move.l  (a3)+,(a4)+
        move.l  (a3)+,(a4)+
        move.w  #4,6(a4)                ;add STOP object  (word at destination+22)
        move.l  a6,$e0(a5)              ; NOTE(review): presumably interrupt enable/acknowledge - confirm in jaguar.inc
        rte

;; Stroke table. The first word is the initial high word of d7; it is
;; fetched together with the first stroke word by 'move.l (a1)+,d7'.
brush:
        dc.w    $7ec8                   ; ggn
        dc.w    $030A, $37BE, $2F9B, $072B, $0E3C, $F59B, $8A91, $1B0B
        dc.w    $0EBD, $9378, $B83E, $B05A, $70B5, $0280, $D0B1, $9CD2
        dc.w    $2093, $209C, $3D11, $26D6, $DF19, $97F5, $90A3, $A347
        dc.w    $8AF7, $0859, $29AD, $A32C, $7DFC, $0D7D, $D57A, $3051
        dc.w    $D431, $542B, $B242, $B114, $8A96, $2914, $B0F1, $532C
        dc.w    $0413, $0A09, $3EBB, $E916, $1877, $B8E2, $AC72, $80C7
        dc.w    $5240, $8D3C, $3EAF, $AD63, $1E14, $B23D, $238F, $C07B
        dc.w    $AF9D, $312E, $96CE, $25A7, $9E37, $2C44, $2BB9, $2139

;; Object list template, assembled for the address it is copied to by my_irq.
OBL0:
        .objproc
        IF ^^defined DEBUG
        .org    $10000
        ELSE
        .org    $0
        ENDIF
        IF ^^defined SKUNK
xpos    equ 9+(320-128)/2
        ELSE
xpos    equ (320-128)/2
        ENDIF
        bitmap  screen-10*128, xpos, 41+(200-96)/2, 128/8, 128/8,96+10, 3, 0, NOTRANS, 0 ,1
        .68000
OBL0_end:
        IF ^^defined SKUNK
        VideoData
        ENDIF
jag_end:
;; Size report; non-DEBUG, non-SKUNK builds below 512 bytes are padded to 512 with $42.
m68k_size       = jag_end-start
size            = (jag_end - start)
        print "Total Size:",/u size,"\nm68k Size: ",/u m68k_size
        IF ^^defined DEBUG = 0 & ^^defined SKUNK = 0 & size < 512
        rept    512-(jag_end-start)
        dc.b    $42
        endr
        ENDIF
;;****************
        END
So this is what I was talking about in my first reply. It does shave off 2 bytes from the binary, but I can't verify it still works as virtualjaguar can't run it here, even with SKUNK defined I'm afraid... It does not work as A1 points to the next seed/coordinates. Quote Link to comment Share on other sites More sharing options...
ggn Posted March 4, 2022 Share Posted March 4, 2022 That's why I moved the "move.w (a1)+,d7" at the end of the loop. First move.l reads 7ec8 as well as the first seed, then the move.w should read the second seed after the first is consumed. (Unless I misunderstood something) Quote Link to comment Share on other sites More sharing options...
42bs Posted March 4, 2022 Author Share Posted March 4, 2022 2 minutes ago, ggn said: That's why I moved the "move.w (a1)+,d7" at the end of the loop. First move.l reads 7ec8 as well as the first seed, then the move.w should read the second seed after the first is consumed. (Unless I misunderstood something) Yes, the next load must be at the end, but the (A1)+ has moved A1 to point to brush[n+1]. The move.b (A1) to get y reads therefore y[n+1] and not y[n]. That's why monast uses a move and lsr. Quote Link to comment Share on other sites More sharing options...
ggn Posted March 4, 2022 Share Posted March 4, 2022 Right, got it, sorry for massively wasting your time! Quote Link to comment Share on other sites More sharing options...
42bs Posted March 4, 2022 Author Share Posted March 4, 2022 18 minutes ago, ggn said: Right, got it, sorry for massively wasting your time! No waste of time. I mean, I spent 4 hours to shrink it down to 314 bytes. And even though your idea did not work it was a reason to re-think other parts. Which in the end resulted in a 304 byte version. Well, goal (256b) missed, but that was clear from the beginning. 1 Quote Link to comment Share on other sites More sharing options...
42bs Posted March 13, 2022 Author Share Posted March 13, 2022 (edited) Mandelbrot in 256 bytes added, just because ... ? https://github.com/42Bastian/new_bjl/tree/main/exp/gpumandel_256 Edited March 13, 2022 by 42bs 3 Quote Link to comment Share on other sites More sharing options...
42bs Posted March 22, 2022 Author Share Posted March 22, 2022 Mandelbrot set with GPU/DSP in parallel (pixel wise interleaved): https://github.com/42Bastian/new_bjl/tree/main/exp/tj_mandel_256 ROM does only run on real HW. mandel_256_j64.zip 3 Quote Link to comment Share on other sites More sharing options...
Cyprian Posted March 22, 2022 Share Posted March 22, 2022 cool. is there a visible speed difference between GPU only and GPU/DSP? Quote Link to comment Share on other sites More sharing options...
42bs Posted March 22, 2022 Author Share Posted March 22, 2022 26 minutes ago, Cyprian said: cool. is there a visible speed difference between GPU only and GPU/DSP? Hard to say. I need to make a side by side video (or measure the time). I have a line-interleaved version where one can see that the DSP is slightly slower (guess due to the 16bit bus) than the GPU. Quote Link to comment Share on other sites More sharing options...
DEATH Posted April 1, 2022 Share Posted April 1, 2022 I already coded a mandelbrot program which uses the GPU and the DSP at the same time (interlaced pixel) almost 1 year ago, the 2 processors run at exactly the same speed on a real Jaguar Quote Link to comment Share on other sites More sharing options...
DEATH Posted April 1, 2022 Share Posted April 1, 2022 Oh and it takes about 10sec to calculate a mandelbrot image in 640*480 256 colors 1 Quote Link to comment Share on other sites More sharing options...
42bs Posted April 1, 2022 Author Share Posted April 1, 2022 (edited) 9 minutes ago, DEATH said: the 2 processors run at exactly the same speed on a real Jaguar I cannot confirm. DSP is slower. Actually I think pixel or line interleaving is a bad idea as it does not take the DRAM pages into account. 9 minutes ago, DEATH said: almost 1 year ago Did it 25 years ago ;; ;; 24.09.95 new videoinit,button-control ... ;; 07.05.96 adapted for NEWSRC ;; 08.10.96 Tom & Jerry parallel 5 minutes ago, DEATH said: Oh and it takes about 10sec to calculate a mandelbrot image in 640*480 256 colors How many fixed-point bits and max. iterations? Edited April 1, 2022 by 42bs 1 Quote Link to comment Share on other sites More sharing options...
DEATH Posted April 1, 2022 Share Posted April 1, 2022 The 2 processors use exactly the same code. "Strictly" the same. And they start at the same time. DRAM access time is not really taken into account given the calculation time between each point. Theoretically 1 processor could finish before the other because from one pixel to another the calculation time is not the same, but statistically the total calculation time of the pixels that each processor must calculate must be about the same (in interlaced pixel mode. In interlaced line it's another story...) My code is based on the original Jaguar SDK example called "JAGMAND" You can find all the history of its genesis, sources and .cof on Yaronet: https://www.yaronet.com/topics/192849-mandelbrot-popopo In French only. Be careful to take the latest version https://www.yaronet.com/topics/192849-mandelbrot-popopo/5#post-125 or https://www.mirari.fr/FI8f Quote Link to comment Share on other sites More sharing options...
42bs Posted April 1, 2022 Author Share Posted April 1, 2022 In my 256 version also. But the GPU uses 32 bit bus, the DSP a 16bit bus. Thus a slight difference. 1 Quote Link to comment Share on other sites More sharing options...
42bs Posted April 1, 2022 Author Share Posted April 1, 2022 The mirari link is dead. I will check the forum and train mon français un peu. Quote Link to comment Share on other sites More sharing options...
42bs Posted April 1, 2022 Author Share Posted April 1, 2022 2 hours ago, DEATH said: e careful to take the latest version https://www.yaronet.com/topics/192849-mandelbrot-popopo/5#post-125 Wow! Really 640x480? Do you have the sources public somewhere? Or are the ones linked at the beginning of the thread the current? Quote Link to comment Share on other sites More sharing options...
42bs Posted April 1, 2022 Author Share Posted April 1, 2022 3 hours ago, DEATH said: The 2 processors use exactly the same code I recommend to have "stop $2000" in the endless loop, it saves a lot of time. (Well "a lot" is relative ) Quote Link to comment Share on other sites More sharing options...
DEATH Posted April 1, 2022 Share Posted April 1, 2022 2 hours ago, 42bs said: In my 256 version also. But the GPU uses 32 bit bus, the DSP a 16bit bus. Thus a slight difference. Pixels are 8 bit, calculations are 8 bit, plotting results in DRAM are 8 bit. so... I have already thought about the possibility of performing 4 calculations (for the GPU, or even 8 ) in a row and then writing the result in DRAM, but the relative addition of complexity in the code would most certainly have no effect on the speed, it might even be worse. Quote Link to comment Share on other sites More sharing options...
42bs Posted April 1, 2022 Author Share Posted April 1, 2022 Just now, DEATH said: Pixels are 8 bit, calculations are 8 bit, plotting results in DRAM are 8 bit. so... I have already thought about the possibility of performing 4 calculations (for the GPU, or even 8 ) in a row and then writing the result in DRAM, but the relative addition of complexity in the code would most certainly have no effect on the speed, it might even be worse. At least phrase writing brings no benefit. The write to the HIDATA register takes "nearly" the same time as writing a word to RAM. And the calculations to get a new pixel value take by far more time. I did not analyze the code in depth, but I guess GPU writes to one half-image and DSP to the other. So DRAM accesses of both do not disturb each other. But the interlace stuff is fun. But I doubt it'll fit into a 256 byte intro. Maybe need to go for 4K ? Do you plan to enhance the program to allow zooming in (like I did in tj_mandel)? Quote Link to comment Share on other sites More sharing options...
DEATH Posted April 1, 2022 Share Posted April 1, 2022 37 minutes ago, 42bs said: I recommend to have "stop $2000" in the endless loop, it saves a lot of time. (Well "a lot" is relative ) All calculations are done by JRISC entirely internally (I even believe that I don't use the local ram, only the registers). Stopping the 68000 would be counterproductive because you also have to manage everything else (in interlaced you have to make changes at each frame, + refresh the object list). Quote Link to comment Share on other sites More sharing options...
42bs Posted April 1, 2022 Author Share Posted April 1, 2022 1 minute ago, DEATH said: All calculations are done by JRISC entirely internally (I even believe that I don't use the local ram, only the registers). Stopping the 68000 would be counterproductive because you also have to manage everything else (in interlaced you have to make changes at each frame, + refresh the object list). "stop $2000" will only halt the cpu until the next interrupt. So if you have a tight endless loop in the 68k doing nothing, stop will release the bus. Note: _not_ "stop $2700" 1 Quote Link to comment Share on other sites More sharing options...
DEATH Posted April 1, 2022 Share Posted April 1, 2022 4 minutes ago, 42bs said: At least phrase writing brings no benefit. The write to the HIDATA register takes "nearly" the same time as writing a word to RAM. And the calculations to get a new pixel value take by far more time. I did not analyze the code in depth, but I guess GPU writes to one half-image and DSP to the other. So DRAM accesses of both do not disturb each other. But the interlace stuff is fun. But I doubt it'll fit into a 256 byte intro. Maybe need to go for 4K ? Do you plan to enhance the program to allow zooming in (like I did in tj_mandel)? the code is relatively "heavy" because I kept part of the old one, especially the one that displays the text at the beginning. From memory I think it's a 16bit image... or 8... yes, for a single color. It's Atari.... As you can see, it was a year ago. I haven't touched it since. I just made a small program that displays a nice image in 640*480 65k colors for Valentine's Day on February 14th. (I tried in 640*480 16Mil colors, but it's mega f**ing crap to program that on Jaguar... I gave up, I have other things to do ) Quote Link to comment Share on other sites More sharing options...
DEATH Posted April 1, 2022 Share Posted April 1, 2022 1 hour ago, 42bs said: Wow! Really 640x480? Do you have the sources public somewhere? Or are the ones linked at the beginning of the thread the current? I had posted the sources at the very beginning of the thread on Yaronet, but of course they changed a lot afterwards. I can put them back somewhere if needed. (for information I have just checked, the image of the beginning is 18Kb by itself. 192*48 pixels...16bit. For 1 single color...) 1 Quote Link to comment Share on other sites More sharing options...
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.
Note: Your post will require moderator approval before it will be visible.