sp_
Member |
The blitter pass can be replaced by a single CPU merge like this:
(code not tested)
I think the merge can be optimized further by using a rol merge.
input:
;a3a2b3b2a1a0b1b0 e3e2f3f2e1e0f1f0 c3c2d3d2c1c0d1d0 g3g2h3h2g1g0h1h0
;i3i2j3j2i1i0j1j0 m3m2n3n2m1m0n1n0 k3k2l3l2k1k0l1l0 o3o2p3p2o1o0p1p0
Output:
;0,1: a3a2b3b2 c3c2d3d2 e3e2f3f2 g3g2h3h2 i3i2j3j2 k3k2l3l2 m3m2n3n2 o3o2p3p2
;2,3: a1a0b1b0 c1c0d1d0 e1e0f1f0 g1g0h1h0 i1i0j1j0 k1k0l1l0 m1m0n1n0 o1o0p1p0
;-----------------------------------------------------------------------
; SMCTABLEC2P - gather chunky bytes from two tables and nibble-merge
;               them into two planar-pair buffers (MC68000).
;
; Each REPT iteration builds four longwords (d0-d3) and writes two of
; them to (a2)+ and two to (a3)+.
;
; Gather: every value is fetched from (a0) and OR-ed with the matching
;   value from (a1). A word fetch fills the upper byte of the low word
;   (";ab00"), then a byte fetch fills the lower byte (";abef"); SWAP
;   moves that finished word to the high half and the sequence repeats.
;   Result, using the letter pairs from the format comment above:
;       d0 = ab ef ij mn        d1 = cd gh kl op
;   d2/d3 are built the same way for the next group of pixels.
;
; Merge: with mask $0f0f0f0f in d6, the low nibbles of the first
;   register are exchanged with the high nibbles of the second, so
;       d0 = high nibbles of ab,cd,ef,gh,...   -> (a2)+
;       d1 = low  nibbles of ab,cd,ef,gh,...   -> (a3)+
;
; NOTE(review): every displacement is written as 0000. Judging by the
;   routine name these are placeholders that get patched with real
;   table offsets before the code runs - confirm against the code that
;   generates/patches this routine.
;
; In:    txture1, txture2, bpl01, bpl23 (labels defined elsewhere)
; Clobb: d0-d3, d6, d7, a0-a3, flags
;-----------------------------------------------------------------------
SMCTABLEC2P:
        lea     txture1,a0
        lea     txture2,a1
        lea     bpl01,a2                ;pointer to bitplane 0 and 1
        lea     bpl23,a3                ;pointer to bitplane 2 and 3
        move.l  #$0f0f0f0f,d6           ;nibble mask for the merge

        REPT    160*100/32

        ; ---- gather d0/d1: first word (ends up in the high half) ----
        move.w  0000(a0),d0
        or.w    0000(a1),d0             ;ab00
        move.w  0000(a0),d1
        or.w    0000(a1),d1             ;cd00
        move.b  0000(a0),d0
        or.b    0000(a1),d0             ;abef
        move.b  0000(a0),d1
        or.b    0000(a1),d1             ;cdgh
        swap    d0
        swap    d1

        ; ---- gather d0/d1: second word (low half) ----
        move.w  0000(a0),d0
        or.w    0000(a1),d0
        move.w  0000(a0),d1
        or.w    0000(a1),d1
        move.b  0000(a0),d0
        or.b    0000(a1),d0             ;abefijmn
        move.b  0000(a0),d1
        or.b    0000(a1),d1             ;cdghklop

        ; ---- gather d2/d3: first word (ends up in the high half) ----
        move.w  0000(a0),d2
        or.w    0000(a1),d2
        move.w  0000(a0),d3
        or.w    0000(a1),d3
        move.b  0000(a0),d2
        or.b    0000(a1),d2
        move.b  0000(a0),d3
        or.b    0000(a1),d3
        swap    d2
        swap    d3

        ; ---- gather d2/d3: second word (low half) ----
        ; Register order fixed to mirror the d0/d1 gather above. The
        ; original loaded the second word into d2 twice, which threw
        ; away the first load and left d3's low word unloaded, and it
        ; had the byte loads swapped between d2 and d3.
        move.w  0000(a0),d2
        or.w    0000(a1),d2
        move.w  0000(a0),d3
        or.w    0000(a1),d3
        move.b  0000(a0),d2
        or.b    0000(a1),d2
        move.b  0000(a0),d3
        or.b    0000(a1),d3

        ; ---- merge d0/d1: swap d0 low nibbles with d1 high nibbles ----
        move.l  d1,d7
        lsr.l   #4,d7                   ;d1 high nibbles -> low nibble slots
        eor.l   d0,d7
        and.l   d6,d7                   ;d7 = difference of the two nibble sets
        eor.l   d7,d0                   ;d0 low nibbles := d1 high nibbles
        lsl.l   #4,d7
        eor.l   d7,d1                   ;d1 high nibbles := d0 low nibbles
        move.l  d0,(a2)+
        move.l  d1,(a3)+

        ; ---- merge d2/d3: same exchange for the second pair ----
        move.l  d3,d7
        lsr.l   #4,d7
        eor.l   d2,d7
        and.l   d6,d7
        eor.l   d7,d2
        lsl.l   #4,d7
        eor.l   d7,d3
        move.l  d2,(a2)+
        move.l  d3,(a3)+

        ENDR
|
sp_
Member |
Here is how many extra cycles the MC68000 CPU will use for 32 pixels.
Sum = 120 cycles (15 cycles per word written to memory).
An ABCD blitter pass will use 8 (bus) cycles per word written.
; Extra CPU work per 32 pixels, with the per-instruction cycle figures
; as given by the author: four swaps, then two nibble merges.
; Here d7 holds the nibble mask and d6 is the scratch register.
swap ;4
swap ;4
swap ;4
swap ;4
; first merge (d0/d1): 4+16+4+4+4+16+4 = 52 cycles by these figures
move.l d1,d6 ;4
lsr.l #4,d6 ;16
eor.l d0,d6 ;4
and.l d7,d6 ;4
eor.l d6,d0 ;4  (was "eor.l d4,d0": d4 is never set, the scratch is d6)
lsl.l #4,d6 ;16
eor.l d6,d1 ;4
; second merge (d2/d3): same sequence, so the same 52 cycles again
move.l d3,d6
lsr.l #4,d6
eor.l d2,d6
and.l d7,d6
eor.l d6,d2 ;(was "eor.l d4,d2": same correction as in the first merge)
lsl.l #4,d6
eor.l d6,d3
|