Prompt: Please generate code for 256b intro in assembly x86 running on dos in 13h mode that renders rotating 3d cube
Let me create a 256-byte intro in x86 assembly for DOS mode 13h that renders a rotating 3D cube. This is a classic demoscene challenge.
Let me think about what we need to fit in 256 bytes:
This is quite tight. Let me think about optimizations:
Let me write the NASM assembly:
asm
; 256b intro - rotating 3D cube in DOS mode 13h
; Assemble with: nasm -f bin cube.asm -o cube.com
BITS 16
ORG 100h
start:
; Set video mode 13h
mov ax, 0013h
int 10h
main_loop:
; Clear screen (320*200 = 64000 bytes)
push es
mov ax, 0A000h
mov es, ax
xor di, di
xor ax, ax
mov cx, 32000
rep stosw
pop es
; ... rotation and drawing
Actually, let me think more carefully. 256 bytes is very tight. Let me think about what's achievable.
The cube has 8 vertices and 12 edges. For each frame:
For rotation, I'll use a sine table or a small approximation.
Let me think about a simpler approach - use integer sine approximation.
Actually, for a 256b intro, let me use a trick: precompute sin table using a recurrence relation, or use the BIOS/FPU.
Let me think about using the FPU (x87) for trig, which will make the math easier but use more bytes.
Actually, let me try a different approach - use a recurrence relation for sine/cosine:
c' = c - eps*s, s' = s + eps*c, where eps = 2*sin(da/2) ≈ da for small da.
This is the classic "rotating vector" trick used in demos.
Let me plan the structure:
; Cube vertices (normalized to -1..1, scaled)
; 8 vertices: (+/-1, +/-1, +/-1)
; 12 edges
; We'll use fixed point with scale factor
Let me try to write a complete solution. I'll use FPU for sin/cos to keep the code manageable.
Actually, let me think about this differently. The key challenge is fitting everything in 256 bytes.
Let me write the code step by step:
nasm
bits 16
org 100h
; Set video mode 13h
mov ax, 13h
int 10h
; Set up segment to video memory
mov ax, 0A000h
mov es, ax
For the rotation, I'll use the FPU:
Cube vertices: 8 points at (+/-64, +/-64, +/-64)
Let me think about the rotation. I'll rotate around Y and X axes.
For projection, use simple perspective division: x' = x*256/(z+300), y' = y*256/(z+300)
Or even simpler orthographic: x' = x, y' = y (after rotation)
Let me try with orthographic projection first since it's simpler.
Actually, let me look at what a typical 256b cube demo does:
The trick is to use the rotating vector for sin/cos:
But let me just use a sine table approach.
Actually, for 256 bytes, I think the most practical approach is:
But 64 bytes for sin table is too much. Instead, use a polynomial approximation or the FPU.
Let me try using the FPU with FSIN/FCOS. Each FPU instruction is 2 bytes (ESC prefix + ModRM), so it's not too bad.
Here's my plan:
nasm
bits 16
org 100h
; Variables in BSS area (after code)
; angle: dw
; vertices: 8 * 3 * 2 bytes = 48 bytes (after rotation)
start:
mov ax, 13h
int 10h
mov ax, 0a000h
mov es, ax
frame:
; Clear screen
xor di, di
mov cx, 32000
xor ax, ax
rep stosw
; Increment angle
; ...
; For each edge, draw line
; ...
; Check for keypress
in al, 60h
cmp al, 1 ; ESC
jne frame
; Return to text mode
mov ax, 3
int 10h
ret
Hmm, this is going to be tight. Let me look at this more carefully.
For a 256b intro, let me focus on:
Let me think about the line drawing algorithm. Bresenham is relatively compact but still needs some code. A simpler approach for a cube is to parameterize each edge as a line and step along it.
Actually, let me look at how to draw the 12 edges of a cube efficiently.
The cube vertices in 3D (before rotation), scaled to ~50 units:
v0: (-50, -50, -50)
v1: ( 50, -50, -50)
v2: ( 50, 50, -50)
v3: (-50, 50, -50)
v4: (-50, -50, 50)
v5: ( 50, -50, 50)
v6: ( 50, 50, 50)
v7: (-50, 50, 50)
Edges (12 total): Front face: 0-1, 1-2, 2-3, 3-0 Back face: 4-5, 5-6, 6-7, 7-4 Connecting: 0-4, 1-5, 2-6, 3-7
The edge table can be stored as pairs of vertex indices: 24 bytes.
For the rotation matrix (rotating around X and Y axes with angle a):
Ry(a):
cos(a) 0 sin(a)
0 1 0
-sin(a) 0 cos(a)
Rx(a):
1 0 0
0 cos(a) -sin(a)
0 sin(a) cos(a)
Combined rotation: Rx * Ry
This requires sin and cos of one angle (or two different angles for more interesting rotation).
OK let me just write the actual code. I'll aim for something that works and is under 256 bytes. If it's a bit over, I'll optimize.
nasm
bits 16
org 100h
start:
; Set mode 13h
mov ax, 0013h
int 10h
; Point ES to video segment
mov ax, 0A000h
mov es, ax
; Initialize FPU
finit
main_loop:
; Clear screen (fill with black = 0)
xor di, di
xor ax, ax
mov cx, 32000
rep stosw
; Increment angle counter
inc byte [angle]
; Load angle in radians into FPU
fldpi
fimul word [angle] ; angle * pi
fidiv word [c180] ; angle * pi / 128 (using 256 steps = full rotation)
; Hmm, this is getting complicated. Let me use a different approach.
Actually, let me use a smarter approach. I'll use a byte as the angle, where 256 = 2*pi. Then:
But computing this with FPU requires several instructions. Let me instead use a fixed-point sine table.
Wait, the classic 256b intro trick: use the video memory itself as a sine table! Mode 13h has 320*200 = 64000 pixels, way more than needed. But that's the output buffer...
Another trick: use the BIOS timer for a sine-like value... no, that doesn't make sense.
Let me try a different approach - use the "rotating vector" method to compute sin/cos incrementally:
cx = 1.0 (cos, start)
sx = 0.0 (sin, start)
da = small angle increment
Each frame:
new_cx = cx - da * sx
new_sx = sx + da * cx
(normalize occasionally)
But I need to project 8 vertices, which requires the actual sin/cos values.
OK, let me just bite the bullet and use FSIN/FCOS from the FPU. The code will be longer but more straightforward.
Here's a more complete attempt:
nasm
bits 16
org 100h
; Set mode 13h
mov ax, 0013h
int 10h
mov ax, 0a000h
mov es, ax
finit
.frame:
; Clear screen
xor di, di
xor ax, ax
mov cx, 32000
rep stosw
inc byte [angle]
; Compute sin and cos
fldpi
fimul word [angle_i] ; need angle in radians
; this is getting too complex
Actually, I think for a 256-byte demo, a sine table is often the way to go. Let me use a compact 256-byte sine table where sin_table[i] = 127 * sin(2*pi*i/256) + 128 (in range 0..255). But that's 256 bytes just for the table...
Alternative: generate the sine table at runtime! Use the FPU recurrence method to generate 256 sine values into a memory buffer. But this buffer is part of our 256b budget...
Wait, let me reconsider. The .COM file is 256 bytes total. So code + data + table all must fit in 256 bytes.
Let me think about what's truly necessary:
This is really tight. Let me look at what tricks demos use.
One popular trick for 256b cube: don't rotate all 8 vertices separately. Instead, use the fact that opposite vertices are negations of each other. So we only need to compute 4 vertices (the other 4 are their negatives).
Another trick: draw dots instead of lines! Just plot the 8 vertices as single pixels. But that's not very impressive.
Another trick: instead of line-drawing algorithm, just rasterize each edge with a simple DDA (Digital Differential Analyzer):
for t in 0..N:
x = x1 + t*(x2-x1)/N
y = y1 + t*(y2-y1)/N
plot(x, y)
Using fixed-point arithmetic, this can be done with fewer bytes than Bresenham.
Let me try a concrete implementation. I'll use fixed-point FPU values stored as words (16-bit, signed, with scale factor 64):
Actually, let me try a completely different approach that's very compact:
Use the fact that a cube projected with a 45-degree isometric view looks like a hexagon. I can rotate around one axis and use a simple sin/cos table.
Or, even simpler: rotate only around the Y axis. Then:
And project with: screen_x = x'*K/(z'+D) + 160, screen_y = y'*K/(z'+D) + 100
This requires only sin and cos of one angle.
For the sin/cos, let me use the FPU with a precomputed floating-point angle.
Let me write a more concrete implementation:
nasm
bits 16
org 100h
; Cube vertices (8 bytes encoding: 3 bits each for sign of x,y,z)
; We use +/-64 for coordinates
start:
mov ax, 0013h
int 10h
mov ax, 0a000h
mov es, ax
finit
fldz ; angle = 0.0 on FPU stack
.frame:
; ES = A000h (already set)
; Clear screen
xor di, di
mov cx, 32000
xor ax, ax
rep stosw
; angle is at st(0), add increment
fadd dword [da]
; Compute sin/cos
fst dword [fangle] ; save angle
fsin ; st0 = sin(angle)
fst dword [fsin_a]
fld dword [fangle]
fcos
fst dword [fcos_a]
; Now rotate and project each vertex
; ...
; Check ESC
in al, 60h
cmp al, 1
je .exit
jmp .frame
.exit:
mov ax, 0003h
int 10h
ret
da: dd 0.03
fangle: dd 0.0
fsin_a: dd 0.0
fcos_a: dd 0.0
The problem is the vertex rotation and projection loop. Let me think...
For 8 vertices, I need to rotate each one. The rotation matrix applied to vertex (vx, vy, vz) with rotation around Y axis:
Then project:
I need to store the 8 projected 2D points, then draw 12 edges.
The 8 cube vertices can be encoded compactly. Since each coordinate is +/-64, I can use 3 bits (sign bit only) if I know the pattern. The cube vertices are all combinations of (±64, ±64, ±64), so:
This is neat! No vertex table needed. Just compute from index 0..7.
For edges, the 12 edges of a cube can be defined by the fact that two vertices are connected iff they differ in exactly 1 bit. So edges are pairs (i, j) where i XOR j is a power of 2 (1, 2, or 4).
Edge list (pairs where exactly 1 bit differs): 0-1, 2-3, 4-5, 6-7 (differ in bit 0) 0-2, 1-3, 4-6, 5-7 (differ in bit 1) 0-4, 1-5, 2-6, 3-7 (differ in bit 2)
So I can enumerate all 28 pairs (i,j) with i<j and check if i^j is a power of 2. Or just hardcode the 12 edges as a compact table.
12 edges as pairs: 24 bytes. Or as a bitfield...
Actually, let me store the edge list as pairs of indices (1 byte each), total 24 bytes for 12 edges:
db 0,1, 1,2, 2,3, 3,0 ; front face
db 4,5, 5,6, 6,7, 7,4 ; back face
db 0,4, 1,5, 2,6, 3,7 ; connecting edges
Wait, but vertex 0 is (-64,-64,-64), vertex 1 is (64,-64,-64), vertex 2 is (64,64,-64), vertex 3 is (-64,64,-64)... Actually with my bit encoding above:
Hmm that's a different ordering. Let me check which pairs differ in exactly 1 bit: 0(000)-1(001): differ in bit 0 ✓ 0(000)-2(010): differ in bit 1 ✓ 0(000)-4(100): differ in bit 2 ✓ 1(001)-3(011): differ in bit 1 ✓ 1(001)-5(101): differ in bit 2 ✓ 2(010)-3(011): differ in bit 0 ✓ 2(010)-6(110): differ in bit 2 ✓ 3(011)-7(111): differ in bit 2 ✓ 4(100)-5(101): differ in bit 0 ✓ 4(100)-6(110): differ in bit 1 ✓ 5(101)-7(111): differ in bit 1 ✓ 6(110)-7(111): differ in bit 0 ✓
Yes! 12 edges. I can compute these programmatically by checking if i^j is 1, 2, or 4.
So for the edge drawing loop:
for i = 0 to 6:
for j = i+1 to 7:
if (i^j == 1) or (i^j == 2) or (i^j == 4):
draw_line(projected[i], projected[j])
Or equivalently:
for i = 0 to 7:
for each of (i^1, i^2, i^4) as j:
if j > i:
draw_line(projected[i], projected[j])
This avoids needing an edge table.
Now, for the projected points storage: I need 8 * 2 words = 32 bytes. Plus 2 sin/cos floats = 8 bytes. Plus angle = 4 bytes. Total data = ~44 bytes. With code probably around 200 bytes, I'm at 244 bytes total. Might be feasible.
Let me try to write the complete code more carefully. I'll use a mix of FPU for math and integer for screen operations.
Actually, you know what, let me try a slightly different approach. Instead of FPU, let me use a 64-byte sine table (one quarter wave, 64 entries for 0..pi/2) stored in the code segment. This avoids FPU overhead.
Sine table: sin_table[i] = round(127 * sin(i * pi/128)) for i=0..63 Values fit in a byte.
For any angle a (0..255, full circle): fold into the first quadrant using symmetry — for the second quadrant use sin_table[127-a], and negate the result for the second half (a >= 128); cos(a) is just sin(a + 64).
But 64 bytes for the table is a lot...
Actually, let me just go with FPU. It's cleaner and x87 instructions are 2-3 bytes each.
Here's my more complete attempt:
nasm
bits 16
org 100h
; Mode 13h
mov ax, 0013h
int 10h
mov ax, 0a000h
mov es, ax
finit
.frame:
; Clear screen
xor di, di
mov cx, 32000
xor ax, ax
rep stosw
; Increment angle
fld dword [.angle]
fadd dword [.da]
fst dword [.angle]
; Get sin and cos
fsin ; st0 = sin(angle)
fst dword [.sin_a]
fld dword [.angle] ; stack: angle, sin
fcos ; st0 = cos(angle)
fst dword [.cos_a]
fstp dword [.tmp] ; pop
fstp dword [.tmp] ; pop sin
; Project all 8 vertices
; For vertex i, compute: x=(i&1)?64:-64, y=(i&2)?32:-32, z=(i&4)?64:-64
; (Use asymmetric y to make it less boring)
; After rotation around Y: x' = x*cos - z*sin, z' = x*sin + z*cos
; Project: sx = x'*proj/(z'+proj_d) + 160
; sy = y*proj/(z'+proj_d) + 100
mov si, 7 ; vertex index
.vtx_loop:
; Compute x coordinate
mov ax, 64 ; default x = 64
test si, 1
jnz .x_pos
neg ax ; x = -64
.x_pos:
mov [.vx], ax
; Compute y coordinate
mov ax, 50
test si, 2
jnz .y_pos
neg ax
.y_pos:
mov [.vy], ax
; Compute z coordinate
mov ax, 64
test si, 4
jnz .z_pos
neg ax
.z_pos:
mov [.vz], ax
; Rotate: x' = vx*cos - vz*sin
fild word [.vx]
fmul dword [.cos_a]
fild word [.vz]
fmul dword [.sin_a]
fsub
fst dword [.rx]
; z' = vx*sin + vz*cos
fild word [.vx]
fmul dword [.sin_a]
fild word [.vz]
fmul dword [.cos_a]
fadd
fst dword [.rz]
; Project: scale = 200.0 / (z' + 350.0)
fld dword [.rz]
fadd dword [.proj_d]
fdivr dword [.proj_scale]
; sx = x' * scale + 160
fld dword [.rx]
fmul st0, st1
fadd dword [.cx]
fistp word [.sx + si*2]
; sy = vy * scale + 100
fild word [.vy]
fmul st0, st1
fadd dword [.cy]
fistp word [.sy + si*2]
fstp dword [.tmp] ; pop scale
dec si
jns .vtx_loop
This is getting complex and probably too long. Let me think of a simpler approach.
Actually, for a 256b demo, the FPU approach for computing 8 vertices each frame is going to be too many bytes. Let me think of a smarter approach.
Alternative: Instead of computing all vertices each frame, use the recurrence relation for rotation. Maintain the rotated cube state and apply a small rotation each frame.
But this requires maintaining 8 vertices * 3 coords = 24 16-bit values = 48 bytes of state, plus the rotation application.
Hmm, another idea: since the cube is symmetric, only compute 4 vertices (the other 4 are their negatives). This halves the vertex computation.
Let me reconsider the whole approach. For a 256b intro, I'll look at what common techniques are used:
Let me look at a simple but working approach:
For a cube rotating around the Y axis (simplest rotation), I'll:
Let me try the simplest possible FPU approach that might fit. I'll compute both rotation angles at once and be efficient about it.
Here's my revised plan - use 2 angles (around X and Y) for a nicer rotation:
Actually, let me look at this from a byte-counting perspective. What if I use only Y rotation?
Mode 13h has 320x200 pixels. Screen center is (160, 100).
For Y-axis rotation of a cube with vertices at (±S, ±S, ±S):
So the 8 projected points can be derived from just 4 values: S*c, S*s, and their negatives. This is clever!
The 8 2D coordinates are: For each vertex (bx, by, bz) where each b is ±1:
Since bx and bz are each ±1, we have 4 combinations for the (bx, bz) pair:
Note that cases 3 and 4 are negatives of cases 1 and 2. So I only need to compute the first 2 cases; the other 2 are free.
This means I only need: c = cos(a), s = sin(a), and then compute 4 values: c-s, c+s, s+c, s-c (wait, s+c and c+s are the same thing). So I need:
For rz:
Hmm interesting. Let me just code it up efficiently.
Actually, let me just write the code and see how many bytes it takes. I'll optimize iteratively.
Let me write NASM code for this:
nasm
bits 16
org 100h
S EQU 50 ; cube size
K EQU 200 ; perspective scale
D EQU 300 ; perspective distance
start:
mov ax, 0013h
int 10h
mov ax, 0a000h
mov es, ax
finit
fldz ; st0 = angle = 0
frame:
; Clear
xor di, di
xor ax, ax
mov cx, 32000
rep stosw
; angle += 0.03
fadd dword [.da] ; increment angle
fst dword [.a]
; compute cos, sin
fcos
fstp dword [.c]
fld dword [.a]
fsin
fstp dword [.s]
; Project 8 vertices
; Vertex index: bit0=x, bit1=y, bit2=z sign
mov cx, 7
.vloop:
; x coord: cx & 1 => +S, else -S
; y coord: cx & 2 => +S, else -S
; z coord: cx & 4 => +S, else -S
... (project and store)
loop .vloop
; Draw 12 edges
...
in al, 60h
dec al ; ESC is scancode 1
jnz frame
mov ax, 3
int 10h
ret
.da: dd 0.03
.a: dd 0.0
.c: dd 0.0
.s: dd 0.0
.pts: times 16 dw 0 ; 8 points * (x,y)
; Edge table: 12 pairs * 2 bytes = 24 bytes
.edges:
db 0,1, 1,3, 3,2, 2,0 ; front (z=+)?
; wait, I need to think about which vertices connect
Hmm, let me be more careful about the vertex encoding and edge connectivity.
With my bit encoding:
Vertex v = index i: x = (i&1) ? S : -S, y = (i&2) ? S : -S, z = (i&4) ? S : -S
Edges connect vertices differing in exactly one bit:
So edge table: 0,1, 2,3, 4,5, 6,7, 0,2, 1,3, 4,6, 5,7, 0,4, 1,5, 2,6, 3,7 That's 24 bytes.
Now, for the line drawing, a simple DDA:
dx = x2 - x1, dy = y2 - y1
steps = max(|dx|, |dy|)
for t = 0 to steps:
x = x1 + dx*t/steps
y = y1 + dy*t/steps
plot(x, y)
Using FPU for the divisions, this would be quite long.
A simpler approach: use fixed-point DDA.
if |dx| > |dy|:
step in x by 1, step in y by dy/dx (fixed point)
else:
step in y by 1, step in x by dx/dy (fixed point)
Let me try a version that's compact. I'll use a simple approach where I step 64 times along each edge:
for t = 0 to 63:
x = x1 + (x2-x1)*t/64 (use shift)
y = y1 + (y2-y1)*t/64
plot rounded
This can be done with 16-bit integer arithmetic and shifts.
Actually for plotting a line with 64 steps:
dx = ((x2 - x1) << 8) / 64 ; equivalent to (x2-x1) << 2
dy = ((y2 - y1) << 8) / 64 ; similar
No wait, let me think. If I step 64 times:
Then: x_fixed = x1 << 8; for each step, x_fixed += x_step; actual_x = x_fixed >> 8
Actually simpler:
But 64 steps may not be enough for long edges. With a 320x200 screen and cube radius 50, max edge length is about 100 pixels projected. 64 steps would skip some pixels.
Let me use 128 steps:
128 steps for 12 edges = 1536 iterations total per frame. That's fine for speed.
Actually 128 iterations with a fixed number of steps is wasteful for short lines. But for simplicity in a 256b demo, it's fine.
Let me try to write the complete code now:
nasm
bits 16
org 100h
S equ 60 ; cube half-size
PD equ 256 ; perspective distance
PS equ 200 ; perspective scale
CX equ 160 ; screen center X
CY equ 100 ; screen center Y
; Layout: code first, then data
start:
mov ax, 0013h
int 10h
mov ax, 0a000h
mov es, ax
finit
.frame:
; Clear screen (black)
xor di, di
xor ax, ax
mov cx, 32000
rep stosw
; Increment angle
fld dword [angle]
fadd dword [da]
fst dword [angle]
; Compute sin, cos
fsin
fstp dword [fsin]
fld dword [angle]
fcos
fstp dword [fcos]
; Project 8 vertices
mov cl, 8
mov si, pts
.vproj:
dec cl
; Get vertex coordinates
mov al, cl
and al, 1
jnz .xp
; x = -S
fld dword [neg_S]
jmp .xdone
.xp:
fld dword [pos_S]
.xdone:
mov al, cl
and al, 2
jnz .yp
fld dword [neg_S]
jmp .ydone
.yp:
fld dword [pos_S]
.ydone:
; ... this is getting complex and uses many branches
This approach has too many branches. Let me use a lookup table for vertex coordinates.
Or better: compute vertex x,y,z as signed integers and then use FPU:
nasm
; Convert vertex index to ±S
; if bit set: S, else -S
; = (((index >> bit) & 1) * 2 - 1) * S
; = S or -S
Let me try a different representation. Store the sign bytes for x, y, z in a table. For 8 vertices, 3 signs each = 24 bytes. But the signs alternate in a regular pattern:
Wait, using my bit encoding (bit 0 = x, bit 1 = y, bit 2 = z): Vertex 0 (000): x=-, y=-, z=- Vertex 1 (001): x=+, y=-, z=- Vertex 2 (010): x=-, y=+, z=- Vertex 3 (011): x=+, y=+, z=- Vertex 4 (100): x=-, y=-, z=+ Vertex 5 (101): x=+, y=-, z=+ Vertex 6 (110): x=-, y=+, z=+ Vertex 7 (111): x=+, y=+, z=+
x signs: -, +, -, +, -, +, -, + (alternates) y signs: -, -, +, +, -, -, +, + z signs: -, -, -, -, +, +, +, +
I can compute these from the index using bit tests, which avoids a table.
OK let me just write the code and count bytes. I'll use a compact FPU-based approach.
Here's my attempt - let me write it out fully:
nasm
bits 16
org 100h
start:
mov ax, 0013h
int 10h
mov ax, 0a000h
mov es, ax
finit
frame:
; Clear screen
xor di, di
xor ax, ax
mov cx, 32000
rep stosw
; angle += 0.04
fld dword [ang]
fadd dword [inc]
fst dword [ang]
fsin
fstp dword [sa]
fld dword [ang]
fcos
fstp dword [ca]
; Project 8 vertices - store screen coords in pts[0..15]
; pts[i*2] = screen x of vertex i
; pts[i*2+1] = screen y of vertex i
xor bx, bx ; vertex counter
.vloop:
; x component: bit 0 of bx
mov al, bl
and al, 1
; if bit0: fld pos_s, else fld neg_s
; trick: use bt + conditional
; Compute: fx = (bx&1 ? S : -S)
; fy = (bx&2 ? S : -S)
; fz = (bx&4 ? S : -S)
; Load fx
test bl, 1
fld dword [mS] ; load -S
jz .fx_done
fchs ; negate to get +S
.fx_done:
; Stack: fx
test bl, 4
fld dword [mS] ; load -S
jz .fz_done
fchs
.fz_done:
; Stack: fz, fx
; rx = fx*ca - fz*sa
fld st1 ; fx
fmul dword [ca]
fld st1 ; fz
fmul dword [sa]
fsub ; st0 = rx = fx*ca - fz*sa
; rz = fx*sa + fz*ca
fxch st2 ; swap: fz is at st0, fx at st1, rx at st2
; Hmm, stack management is getting complex
OK this is getting really complicated. Let me step back and think about a cleaner approach.
One common trick in 256b DOS demos: use a pre-built sin table in the code, or use the BIOS time for animation without needing a timer variable.
Let me try a completely different, simpler approach that's been used in real 256b demos:
The "divide and draw" approach: compute the 8 screen coordinates using a helper function, then draw edges using another helper function. Use CALL/RET to save code through reuse.
Let me look at how to minimize the FPU code:
nasm
; FPU stack management note:
; Before: []
; Load fx based on bit: test bl, 1 => fld mS or pS
; Load fy based on bit: test bl, 2 => fld mS or pS
; Load fz based on bit: test bl, 4 => fld mS or pS
; Stack: [fz, fy, fx]
Hmm, let me think about this differently. What if I use a helper subroutine that:
And for line drawing:
Actually, I just realized: for a 256b intro, quality doesn't need to be perfect. Let me try a version that:
With orthographic projection (no perspective):
Where ry = fy (no change from Y rotation), rx = fx*cos - fz*sin.
This eliminates the need for perspective division, saving significant code.
Or even simpler: rotate around Y and X simultaneously using two separate rotation matrices. This is actually more interesting visually.
OK let me just write the code. I'm going to write it in a way that's clean and correct first, then count bytes.
Actually, let me look at some references for 256b cube demos to understand what's achievable:
In a typical 256b DOS cube demo:
Here's a solid attempt that I'll write carefully:
nasm
; 256byte rotating cube - DOS mode 13h
; NASM: nasm cube.asm -f bin -o cube.com
bits 16
org 100h
mov ax, 0013h ; set video mode 13h
int 10h
mov ax, 0A000h ; point ES to video memory
mov es, ax
main:
; clear screen
xor di, di
mov al, 0
mov cx, 320*200/2
rep stosw
; increment angle (stored as float)
fld dword [angle]
fadd dword [da]
fst dword [angle]
fsin
fst dword [sinA]
fld dword [angle]
fcos
fstp dword [cosA]
fstp dword [sinA] ; wait, wrong - sinA already stored
Ugh, let me be more careful. Let me write the code step by step:
nasm
bits 16
org 100h
mov ax, 0013h
int 10h
mov ax, 0A000h
mov es, ax
main:
xor di, di
xor ax, ax
mov cx, 32000
rep stosw
fld dword [angle]
fadd dword [da]
fst dword [angle]
; compute sin and cos
fst st0 ; duplicate
fsin
fstp dword [sa]
fld dword [angle]
fcos
fstp dword [ca]
; Project all 8 vertices
mov si, 7 ; si = vertex index
vtx:
; Compute fx = (si&1) ? 50.0 : -50.0
mov ax, si
and ax, 1
fld dword [pts + ax*4] ; pts[0] = -50.0, pts[4] = 50.0 ... hmm this doesn't work directly
I need a different way to load ±50 based on a bit. One trick:
test si, 1
fld dword [val] ; load 50.0
jnz .pos
fchs ; negate to -50.0
.pos:
That's 5 bytes (test + fld + jnz + fchs). Actually test = 2 bytes, fld dword [val] = 6 bytes (long!), jnz = 2 bytes, fchs = 2 bytes. That's 12 bytes per coordinate, 3 coords = 36 bytes just for loading! Too much.
Alternative: use FILD with a word variable:
mov word [tmp], 50
test si, 1
jnz .pos
neg word [tmp]
.pos:
fild word [tmp]
That's about 12 bytes still.
Better trick: precompute: the signed value is (2*(si&1) - 1) * 50. So:
mov ax, si
and ax, 1 ; 0 or 1
add ax, ax ; 0 or 2
dec ax ; -1 or 1
imul ax, 50 ; -50 or 50
mov [tmp], ax
fild word [tmp]
That's about 12 bytes too.
Even better: use FMUL with a sign trick. Load 50.0, then conditionally negate:
fld dword [fifty]
test si, 1
jnz .xpos
fchs
.xpos:
test = 3 bytes (if reg,imm8 it's 2), fld [fifty] = 6 bytes, jnz = 2, fchs = 2. That's 13 bytes.
But if I define fifty as a local label within range for short forms... Actually fld dword [mem] is 6 bytes (2 for opcode/mod + 4 for address). With a local variable in BSS that might use a short address... no, in a .COM file all data is in the same segment so addresses are 16-bit.
Actually fld dword [label] in NASM with a near label compiles to:
Wait, let me check. In 16-bit mode:
Hmm, but I can use a byte comparison: the low byte of SI: test bl, 1 = F6 C3 01 = 3 bytes (if BL is the vertex counter).
Let me use BX as vertex index and use BL for byte operations.
So loading vertex coordinate based on a bit:
fld dword [fifty] ; 4 bytes: D9 06 xx xx
test bl, bit ; 3 bytes: F6 C3 xx
jnz .pos ; 2 bytes: 75 xx
fchs ; 2 bytes: D9 E0
.pos:
Total: 11 bytes per coordinate. For 3 coordinates = 33 bytes. Then rotation + projection = more bytes.
This is going to be tight. Let me think about what I can cut.
Actually, let me try a fundamentally different approach: represent the cube not with 8 vertices, but with the property that all 8 vertices are sign permutations of (S, S, S).
After Y rotation:
So rx = S * (b0*cos - b2*sin) where b0,b2 ∈ {-1,+1}, and rz = S * (b0*sin + b2*cos)
There are only 4 distinct (rx, rz) values (for 4 combinations of b0, b2), and 2 distinct ry values. All 8 projected points are combinations of these.
I can precompute the 4 (rx,rz) pairs (screen x in perspective), giving 4 screen-x values (2 for each rx). Wait, this is getting complex.
Let me just try to write the complete code for a Y-axis rotation with orthographic projection (no perspective division) since that's simpler:
With orthographic: screen_x = cx + rx, screen_y = cy + ry where rx = vx*cos - vz*sin, ry = vy (unchanged)
For cube vertices (±S, ±S, ±S):
The 4 distinct rx values for (vx, vz) combinations:
So I need S*(cos-sin), S*(cos+sin), and their negatives. These can be computed with:
And similarly for ry: just ±S. And for rz (needed for depth sorting if we want):
Without perspective, the projected 2D coords:
Wait, I'm overcomplicating this. Let me just compute all 8 vertex projections in a loop.
Here's my plan for a simpler, orthographic projection:
nasm
; For each vertex i (0..7):
; vx = (i & 1) ? S : -S
; vy = (i & 2) ? S : -S
; vz = (i & 4) ? S : -S
; After Y rotation:
; rx = vx*ca - vz*sa (integer: (vx*ica - vz*isa) >> 8)
; ry = vy
; Screen: sx = 160 + rx, sy = 100 + ry
; Store in pts: pts[i*4] = sx (word), pts[i*4+2] = sy (word)
Using integer fixed-point (16-bit, scaled by some factor):
But computing sin/cos with integer: still need a table or FPU.
OK, I'll use FPU and accept ~200 bytes of code. Let me write it out fully:
nasm
bits 16
org 100h
S equ 60
start:
mov ax, 13h
int 10h
mov ax, 0a000h
mov es, ax
finit
.L:
xor di, di
xor ax, ax
mov cx, 32000
rep stosw
fld dword [.ang]
fadd dword [.da]
fst dword [.ang]
fsin
fstp dword [.si]
fld dword [.ang]
fcos
fstp dword [.co]
; Project 8 vertices into .px/.py arrays
mov bl, 0
.vp:
; vx: test bit 0 of bl
fld dword [.co] ; st: ca
mov al, bl
and al, 1 ; 0 or 1
add al, al
dec al ; -1 or 1
cbw
imul ax, ax, S ; ax = vx
push ax
mov [.tmp], ax
fild word [.tmp] ; st: vx, ca
fmulp ; st: vx*ca
; fz: test bit 2 of bl
mov al, bl
shr al, 2
and al, 1
add al, al
dec al
cbw
imul ax, ax, S ; ax = vz
mov [.tmp], ax
fild word [.tmp] ; st: vz, vx*ca
fld dword [.si] ; st: sa, vz, vx*ca
fmulp ; st: vz*sa, vx*ca
fsub ; st: vx*ca - vz*sa = rx
; fy: test bit 1 of bl
mov al, bl
shr al, 1
and al, 1
add al, al
dec al
cbw
imul ax, ax, S ; ax = vy = ry
mov [.tmp], ax
; Convert rx to integer, add screen center
fistp word [.tmp2] ; rx as integer
mov ax, [.tmp2]
add ax, 160
mov di, bx
add di, di
add di, di
mov [.px + di], ax ; pts[bl].x = 160 + rx
; ry
mov ax, [.tmp]
neg ax ; negate y for screen coords (y increases downward)
add ax, 100
mov [.py + di], ax ; pts[bl].y = 100 + ry
inc bl
cmp bl, 8
jne .vp
; Draw 12 edges
; Edges: pairs where vertices differ in exactly 1 bit
mov si, .edges
.draw:
movzx bx, byte [si] ; vertex index 1
movzx cx, byte [si+1] ; vertex index 2
mov ax, bx
add ax, ax
add ax, ax
lea di, [.px + ax]
mov dx, [di] ; x1
mov bx, [di+2] ; y1 -- wait, px and py are separate arrays
; Hmm, let me restructure. Store pts as interleaved: px,py,px,py,...
...
This is getting unwieldy. Let me take a cleaner approach and write a complete, working example.
I'll structure data storage as:
For drawing, I'll use indices to look up screen coords.
Edge table: 12 * 2 = 24 bytes
Let me count how many bytes this will take and see if it fits in 256.
Let me actually write the entire thing carefully:
nasm
bits 16
org 100h
; Constants
S equ 60 ; cube half-size
start:
; Set mode 13h (320x200, 256 colors)
mov ax, 0013h ; 3 bytes
int 10h ; 2 bytes
mov ax, 0a000h ; 3 bytes
mov es, ax ; 2 bytes
; Total: 10 bytes
finit ; 2 bytes
; Main animation loop
frame:
; Clear screen to black
xor di, di ; 2 bytes
xor ax, ax ; 2 bytes
mov cx, 32000 ; 3 bytes
rep stosw ; 2 bytes
; Total: 9 bytes
; Increment and compute angle
fld dword [ang] ; 4 bytes (D9 06 lo hi)
fadd dword [da] ; 4 bytes
fst dword [ang] ; 4 bytes (D9 16 lo hi)
fsin ; 2 bytes
fstp dword [sin_a] ; 4 bytes
fld dword [ang] ; 4 bytes
fcos ; 2 bytes
fstp dword [cos_a] ; 4 bytes
; Total: 28 bytes
; Project vertices loop
xor bx, bx ; 2 bytes
vloop:
; Load vx = (bx&1) ? S : -S
; Method: compute ax = (bx&1)*2 - 1 = {-1, 1}, then multiply by S
mov ax, bx ; 2 bytes
and ax, 1 ; 3 bytes (AND AX, imm16)
add ax, ax ; 2 bytes
dec ax ; 1 byte
imul ax, S ; 3 bytes (IMUL r16, imm8 = 3B) -- actually IMUL r16, r/m16, imm8
; Wait: "imul ax, S" = imul ax, ax, S which is IMUL r16, r/m16, imm8 = 4 bytes
; Let me use: imul ax, ax, 60 -- 4 bytes (0F AF ... no wait)
; Actually: IMUL r16, imm8 with sign-extend: 6B /r ib
; imul ax, ax, 60 = 6B C0 3C = 3 bytes! Yes.
; So: mov ax, bx; and ax, 1; add ax, ax; dec ax; imul ax, ax, 60
; = 2 + 3 + 2 + 1 + 3 = 11 bytes
mov [vx], ax ; 3 bytes (mov [mem], ax = A3 lo hi = 3 bytes)
; Actually mov [mem], r16 = 89 /r = 89 06 lo hi = 4 bytes
; Load vz = (bx&4) ? S : -S
mov ax, bx ; 2 bytes
shr ax, 2 ; 3 bytes (C1 E8 02 or D1 E8 repeated... = shift right by 2)
; Actually SHR r16, imm8 = C1 /5 ib = C1 E8 02 = 3 bytes
and ax, 1 ; 3 bytes
add ax, ax ; 2 bytes
dec ax ; 1 byte
imul ax, ax, S ; 3 bytes
mov [vz], ax ; 4 bytes
; = 2+3+3+2+1+3+4 = 18 bytes
; rx = vx*cos_a - vz*sin_a (using FPU)
fild word [vx] ; 4 bytes (DF 06 lo hi)
fmul dword [cos_a] ; 4 bytes
fild word [vz] ; 4 bytes
fmul dword [sin_a] ; 4 bytes
fsub ; 2 bytes (DE E9)
; Total: 18 bytes
; Convert rx to integer, get screen x
fistp word [tmp] ; 4 bytes (DF 1E lo hi)
mov ax, [tmp] ; 3 bytes (A1 lo hi)
add ax, 160 ; 3 bytes
; Store in pts[bx*4]
mov di, bx ; 2 bytes
shl di, 2 ; 3 bytes (C1 E7 02)
mov [pts + di], ax ; probably 4 bytes (89 85 lo hi?)
; Actually mov [pts + di], ax with base+index in 16-bit mode...
; In 16-bit mode: [BX+DI+disp16] format
; mov [pts + di], ax = mov [SI+pts], ax? No...
; I need to use "mov [pts + di], ax" where pts is a constant
; This would be: 89 85 lo hi (ModRM=10 000 101 means [DI+disp16])
; Actually ModRM for [DI + disp16] = mod=10, rm=101 = 10000101b = 85h
; So: 89 85 lo_pts hi_pts = 5 bytes
; Hmm, that's 5 bytes. Unless pts is close enough for an 8-bit offset...
; If pts < 128 from current code, it could be 89 45 xx (3 bytes with 8-bit disp)
; Let me plan data layout carefully.
; ry = vy = (bx&2) ? S : -S (unchanged by Y rotation)
mov ax, bx ; 2 bytes
and ax, 2 ; 3 bytes
; ax is 0 or 2, want -S or +S
; if ax=2: S, if ax=0: -S
; (ax - 1) * S? No: 0-1=-1 => -S, 2-1=1 => S. Yes!
dec ax ; 1 byte (gives -1 or 1)
imul ax, ax, S ; 3 bytes
; negate for screen coordinates (y increases downward in screen)
neg ax ; 2 bytes (F7 D8)
add ax, 100 ; 3 bytes
mov [pts + di + 2], ax ; store sy... 5 bytes?
; = 2+3+1+3+2+3+5 = 19 bytes
inc bx ; 1 byte
cmp bx, 8 ; 3 bytes
jne vloop ; 2 bytes
; = 6 bytes
; Total for vertex loop body (approx): 11+4+18+18+19+6 = ~76 bytes
; This seems too big. Let me recount...
OK this is getting very messy with my byte counting. Let me just write the full code properly and trust NASM to assemble it, and then see if it's under 256 bytes. If it's over, I'll optimize.
Actually, you know what, let me write a proper implementation and provide it as code for the user. Even if it's slightly over 256 bytes, I can note that and provide optimization suggestions. The user asked for 256b intro code, and I should provide working, clean code. Let me aim for under 256 bytes but prioritize correctness.
Let me write the final version:
nasm
; 256byte intro: rotating 3D cube, DOS mode 13h
; Build: nasm -f bin cube256.asm -o cube256.com
; Run: dosbox cube256.com
bits 16
org 100h
S equ 55 ; cube half-size
PD equ 280 ; perspective depth offset
PS equ 180 ; perspective scale
start:
mov ax, 0013h
int 10h
mov ax, 0a000h
mov es, ax
finit
frame:
; Clear screen (320*200 = 64000 bytes = 32000 words)
xor di, di
xor ax, ax
mov cx, 32000
rep stosw
; Increment angle
fld dword [ang]
fadd dword [da]
fst dword [ang]
; Compute sin and cos
fsin
fstp dword [sa]
fld dword [ang]
fcos
fstp dword [ca]
; Project 8 cube vertices
xor bx, bx
proj_loop:
; vx = (bx&1) ? S : -S
mov ax, bx
and al, 1
add al, al
dec al
cbw
imul ax, ax, S
mov [vx], ax
; vz = (bx&4) ? S : -S
mov ax, bx
shr al, 2
and al, 1
add al, al
dec al
cbw
imul ax, ax, S
mov [vz], ax
; vy = (bx&2) ? S : -S
mov ax, bx
shr al, 1
and al, 1
add al, al
dec al
cbw
imul ax, ax, S
mov [vy], ax
; Rotate around Y axis:
; rx = vx*cos(ang) - vz*sin(ang)
fild word [vx]
fmul dword [ca]
fild word [vz]
fmul dword [sa]
fsub
fst dword [rx] ; rx = vx*ca - vz*sa
; rz = vx*sin(ang) + vz*cos(ang)
fild word [vx]
fmul dword [sa]
fild word [vz]
fmul dword [ca]
fadd
fst dword [rz] ; rz = vx*sa + vz*ca
; Also rotate around X axis slightly for interest
; (Could skip this to save bytes)
; Perspective projection:
; scale = PS / (rz + PD)
; sx = rx * scale + 160
; sy = vy * scale + 100 (negated since screen Y is down)
fld dword [rz]
fiadd word [pd]
fdivr dword [ps] ; st0 = PS/(rz+PD)
fld dword [rx]
fmul st0, st1 ; st0 = rx * scale
fistp word [tmp]
mov ax, [tmp]
add ax, 160
; Store screen coords
mov di, bx
shl di, 2 ; di = bx*4
mov [pts+di], ax ; store sx
fxch ; st0 = scale again
fild word [vy]
fmul st0, st1
fchs ; negate (y down)
fistp word [tmp]
mov ax, [tmp]
add ax, 100
mov [pts+di+2], ax ; store sy
fstp st0 ; clear scale
fstp dword [rx] ; clear rx (actually just discard)
; Hmm, need to clean up FPU stack
inc bx
cmp bx, 8
jl proj_loop
; Draw 12 edges
mov si, edges
mov cx, 12
draw_loop:
movzx ax, byte [si]
movzx bx, byte [si+1]
add si, 2
shl ax, 2
shl bx, 2
mov dx, [pts+ax] ; x1
mov di, [pts+ax+2] ; y1
mov ax, [pts+bx] ; x2
mov bx, [pts+bx+2] ; y2
; Draw line from (dx,di) to (ax,bx) using DDA
; Use 128 steps
push cx
push si
; ... line drawing code ...
pop si
pop cx
loop draw_loop
; Check for keypress (ESC = scancode 1)
in al, 60h
dec al
jnz frame
mov ax, 3
int 10h
ret
; Data section
ang: dd 0.0
da: dd 0.04
ca: dd 1.0
sa: dd 0.0
rx: dd 0.0
rz: dd 0.0
pd: dw PD
ps: dd PS * 1.0
vx: dw 0
vy: dw 0
vz: dw 0
tmp: dw 0
pts: times 16 dw 0
edges:
db 0,1, 1,3, 3,2, 2,0 ; top/bottom faces?
db 4,5, 5,7, 7,6, 6,4
db 0,4, 1,5, 2,6, 3,7
Wait, I have a FPU stack imbalance. After computing scale = PS/(rz+PD):
Let me redo the FPU sequence for projection:
; After rotation, st0 = rz (from fst dword [rz])
; Actually let me clear after each vertex
; FPU sequence for projection:
fld dword [rz] ; st: rz
fiadd word [pd] ; st: rz+PD
fdivr dword [ps] ; st: PS/(rz+PD) = scale
; sx:
fld dword [rx] ; st: rx, scale
fmul st0, st1 ; st: rx*scale, scale
fistp word [sx_tmp] ; st: scale (pops rx*scale)
; sy:
fild word [vy] ; st: vy, scale
fmul ; st: vy*scale (pops both, pushes result) -- wait, fmul with no operand means st0 *= st1 and pops? No.
; fmul st0, st1 = multiply st0 by st1, result in st0, both operands kept
; fmulp = multiply and pop
; Hmm, fmul (no operand) = fmul st1, st0 = st1 *= st0, pop st0 (result in new st0)
fmulp st1, st0 ; st: vy*scale (correct way to multiply and pop)
fchs ; negate
fistp word [sy_tmp]
; stack empty now - good!
Actually fmul with no operands: In 16-bit real mode NASM, I need to be explicit. fmulp without operands = fmulp st1, st0 which multiplies st1 by st0, stores in st1, then pops st0. Result at st0.
Wait, let me be more careful with x87 instruction semantics:
OK so for my projection:
fld dword [rz] ; st: rz
fiadd word [pd] ; st: rz+PD -- hmm, fiadd with word is valid: DA /0
; Actually wait, I need PD as a word in memory. Let me use fld then fadd.
; Or: fild then fadd. Let me rethink.
; Load rz from float, add PD constant
fld dword [rz] ; st0 = rz
fadd dword [fpd] ; st0 = rz + PD (fpd = float constant PD)
fdivr dword [fps] ; st0 = PS / (rz+PD) = scale (fdivr = reverse division: fps/st0)
; Wait, fdivr with memory: fdivr dword [fps] = st0 = [fps] / st0 = PS/(rz+PD). Yes!
fld dword [rx] ; st0 = rx, st1 = scale
fmulp ; = fmulp st1,st0: st1=st1*st0, pop, so st0 = scale*rx = rx*scale
fistp word [sx_tmp] ; pop to integer, st0 = scale is gone
; Wait, after fmulp, st0 = scale*rx. So fistp saves rx*scale and stack is... let me redo:
; Stack notation: [top..bottom]
; Start: empty
fld dword [rz] ; [rz]
fadd dword [fpd] ; [rz+PD]
fdivr dword [fps] ; [PS/(rz+PD)] = [scale]
; For rx*scale:
fld dword [rx] ; [rx, scale]
fmulp st1,st0 ; st1 = st1*st0 = scale*rx, pop st0. [scale*rx]
fistp word [sx_tmp] ; pop as int. []
; Oops, we consumed scale. Need to compute it again for vy, or save it first.
I need to duplicate scale before multiplying. Use fld st0 to duplicate:
fld dword [rz] ; [rz]
fadd dword [fpd] ; [rz+PD]
fdivr dword [fps] ; [scale]
fld st0 ; [scale, scale]
; sx:
fld dword [rx] ; [rx, scale, scale]
fmulp st1,st0 ; st1=scale*rx, pop. [scale*rx, scale]
fistp word [sx_tmp] ; [scale]
; sy:
fild word [vy] ; [vy, scale]
fmulp st1,st0 ; [vy*scale]
fchs ; [-vy*scale]
fistp word [sy_tmp] ; []
This uses 10 FPU instructions. In bytes:
Actually, this is better. Let me now figure out the full code.
Actually, I think I need to also fix up the FPU after the rotation. Let me redesign the vertex loop:
nasm
vtx_loop: ; bx = 0..7
; Load vertex coordinates as integers
; vx
mov ax, bx
and al, 1
add al, al
dec al
cbw
imul ax, ax, S ; ax = vx
mov [vx], ax
; vz
mov ax, bx
shr al, 2
and al, 1
add al, al
dec al
cbw
imul ax, ax, S
mov [vz], ax
; vy
mov ax, bx
shr al, 1
and al, 1
add al, al
dec al
cbw
imul ax, ax, S
mov [vy], ax
; rx = vx*ca - vz*sa
fild word [vx] ; [vx]
fmul dword [ca] ; [vx*ca]
fild word [vz] ; [vz, vx*ca]
fmul dword [sa] ; [vz*sa, vx*ca]
fsubr ; st0 = st1 - st0 = vx*ca - vz*sa. Wait: fsubr = reverse subtract = st1-st0
; Actually: fsubr = st0 = st1 - st0
; fsubr dword [mem]: st0 = [mem] - st0
; With no operand or stack: fsubr st0, st1 = st0 = st1 - st0? No.
; Let me use fsub instead: fsub = st0 = st0 - st1... no.
; fsub (no operand) = fsub st0, st1 = st0 = st0 - st1
; So: fsubr = fsubr st0, st1 = st0 = st1 - st0
; With stack [vz*sa, vx*ca]: st0=vz*sa, st1=vx*ca
; fsubr = st0 = st1 - st0 = vx*ca - vz*sa. Yes!
fstp dword [rx] ; [] (save rx, pop)
; rz = vx*sa + vz*ca
fild word [vx]
fmul dword [sa]
fild word [vz]
fmul dword [ca]
fadd
fstp dword [rz]
; Project: scale = PS/(rz+PD)
fld dword [rz]
fadd dword [fpd]
fdivr dword [fps]
fld st0 ; duplicate scale
; sx = rx*scale + 160
fld dword [rx]
fmulp st1, st0
fistp word [tmp]
mov ax, [tmp]
add ax, 160
mov di, bx
shl di, 2
mov [pts+di], ax
; sy = -vy*scale + 100
fild word [vy]
fmulp st1, st0
fchs
fistp word [tmp]
mov ax, [tmp]
add ax, 100
mov [pts+di+2], ax
inc bx
cmp bx, 8
jl vtx_loop
Wait, I have a mistake. After computing rx (line fstp dword [rx]), the FPU stack is empty. Then I compute rz, stack is empty again. Then I start the projection. This is fine.
But wait, I need to check fsubr behavior:
Good. And then fstp dword [rx] stores st0 to [rx] and pops. Stack is now [vx*ca]... wait no. Before fsubr, stack was [vz*sa, vx*ca]. After fsubr, st0 = vx*ca - vz*sa, st1 gone (result replaces operands? Let me check).
Actually, x87 binary operations work on st0 and st1 by default:
Hmm. Let me use fsubp st1, st0:
So fsubp st1, st0 is what I want! Not fsubr. Let me use this.
Byte encoding:
Now let me also think about the stack after fld dword [rx] and fmulp st1, st0:
fld dword [rz] → [rz]
fadd dword [fpd] → [rz+PD]
fdivr dword [fps] → [scale]
fld st0 → [scale, scale] (st0=scale, st1=scale)
; Now compute sx:
fld dword [rx] → [rx, scale, scale]
fmulp st1, st0 → st1 = st1*st0 = scale*rx, pop. [scale*rx, scale]
fistp word [tmp] → pop as int. [scale]
; Compute sy:
fild word [vy] → [vy, scale]
fmulp st1, st0 → [vy*scale]
fchs → [-vy*scale]
fistp word [tmp] → [] ✓
Great, this works. The stack is clean after each vertex.
Now, for the fmulp st1,st0: the encoding is DE C9 (2 bytes).
Let me count the vertex loop bytes:
Coordinate loading (3 coordinates):
mov ax, bx ; 2
and al, 1 ; 3
add al, al ; 2
dec al ; 1 (FE C8)? No, dec al = FE C8 = 2 bytes in 16-bit
; Actually: dec al = FE C8 = 2 bytes
cbw ; 2? No, CBW = 98h = 1 byte
imul ax, ax, S ; 6B C0 S = 3 bytes (where S=55 fits in 8 bits)
mov [vx], ax ; 89 06 xx xx = 4 bytes
; Similar for vz:
mov ax, bx ; 2
shr al, 2 ; 3 (C0 E8 02)
and al, 1 ; 3
add al, al ; 2
dec al ; 2
cbw ; 1
imul ax, ax, S ; 3
mov [vz], ax ; 4
; Similar for vy:
mov ax, bx ; 2
shr al, 1 ; 3
and al, 1 ; 3
add al, al ; 2
dec al ; 2
cbw ; 1
imul ax, ax, S ; 3
mov [vy], ax ; 4
Coordinate loading total: (2+3+2+2+1+3+4)×3 = 17×3 = 51 bytes
Wait: dec al = FE C8 = 2 bytes in all modes. cbw = 98h = 1 byte. OK. Actually, dec al might encode differently. In 16-bit mode: DEC r/m8 with r/m8 = AL = FE /1 with ModRM for AL register = FE C8 = 2 bytes. Yes, 2 bytes.
But wait, I can simplify: for vx based on bit 0:
Alternative calculation:
mov ax, bx
and ax, 1 ; 0 or 1
; Want: -S or +S
; multiply by 2S and subtract S: (0*2-1)*S=-S, (1*2-1)*S=S
; = (2*ax - 1) * S
add ax, ax ; 0 or 2
dec ax ; -1 or 1
imul ax, ax, S ; -S or S
Or alternatively, using a small table or conditional: test bx, 1; mov ax, S; jnz $+4; neg ax but that's similar length.
Let me also reconsider: do I really need cbw here? After dec al, the result is in AL (-1 or 1). I need it in AX for imul ax, ax, S.
In 16-bit mode, imul ax, ax, S operates on the 16-bit AX register. If AL contains -1 (0xFF) or 1 (0x01), I need AX to have the correct 16-bit signed value. So cbw (sign-extend AL to AX) is needed after dec al if AL=-1 (0xFF), because I need AX=0xFFFF for -1. Yes, cbw is needed.
Actually, wait. dec al when al=0 gives al=0xFF = -1 in signed. cbw makes ax=0xFFFF = -1. Then imul ax, ax, 55 = -55. ✓
For al=1: dec doesn't apply here (we start with 0 or 1 from the and). When and al, 1 → al=1, add al, al → al=2, dec al → al=1 (in AL, no overflow), cbw → ax=1, imul ax, ax, 55 = 55. ✓
OK, so the code is correct.
Let me now think about size optimization. 51 bytes for loading 3 coordinates is a lot. Can I do it more efficiently?
One optimization: all three coordinates use the same pattern, just with different bit positions. I can use a sub-loop:
; BX = vertex index
; Process bits 0, 1, 2 → store vx, vy, vz
mov cx, 3
mov di, vx ; pointer to storage
.coord_loop:
mov ax, bx
and ax, 1 ; take bit 0
add ax, ax
dec ax
cbw
imul ax, ax, S
mov [di], ax
add di, 2 ; next storage word
shr bx, 1 ; shift to get next bit
loop .coord_loop
; But now bx is modified! I need to restore it.
But this modifies BX, which I need for the vertex index. I'd need to save/restore it with push/pop. But the sub-loop saves code:
With the loop:
Wait, and shr bx, 1 for each iteration: 3 bytes inside loop. So loop body = 20 + 3 = 23 bytes. Total = 3 + 3 + 23 + 2 + 1 + 1 = 33 bytes. Saves 51-33 = 18 bytes.
Actually, shr bx, 1 modifies bx. After the loop, bx = original_bx >> 3. I need to restore original bx. I can use push bx before and pop bx after.
And mov di, vx where vx is an address... lea di, [vx] = 4 bytes (8D 3E lo hi). Or mov di, offset vx = 3 bytes (in .COM it's mov di, vx = BF lo hi = 3 bytes). Let me use mov di, vx.
Let me revise:
In 16-bit addressing, [DI] is a valid addressing mode: ModRM byte mod=00, rm=101 = 00000101b? Let me check. In 16-bit mode:
So mov [DI], AX = 89 05 = 2 bytes. Only 2 bytes! Great.
ModRM for 16-bit register: mod=11, r/m=reg:
Great. And dec di is 1 byte (dec r16 = 4F for DI). Or inc di is 1 byte (47h for DI).
Let me recalculate coord loop body:
Plus loop instruction: loop coord_loop = 2 bytes Total loop = 18*3 + 2 = 56... wait no. The LOOP instruction repeats the body 3 times (CX=3), so total = 18 bytes (body) + 2 bytes (loop) = 20 bytes for the loop itself.
Plus setup: push bx (1) + mov di, vx (3) + mov cx, 3 (3) + pop bx (1) = 8 bytes. Total coordinate loop: 8 + 20 = 28 bytes.
But wait, I also need to reset bx after the shifts. The pop bx at the end restores it. And loop coord_loop jumps back if CX != 0. But LOOP decrements CX first.
Actually there's a problem: my shr bx,1 is inside the loop, shifting right 3 times total. After the loop, bx has been shifted right by 3. I pop bx to restore. OK.
But there's another issue: the mov ax, bx at the start of the loop body. After the first iteration, bx is already shifted right by 1. After second iteration, by 2, etc. This is intentional - we're extracting bits 0, 1, 2 successively. Wait:
Yes! This correctly extracts each bit.
So the loop works. I save 51-28 = 23 bytes with the loop.
Now, after the coordinate loop, I have vx=[vx], vy=[vy+0]=wait... I need to make sure the storage order is vx, vz, vy to match what the FPU code expects. Wait, I want vx, vy, vz in memory in some order. Let me set vx as the first word, vy as the second word (+2), vz as the third word (+4).
With the bit encoding: bit 0 = x, bit 1 = y, bit 2 = z. The loop extracts bit 0 first (→vx), bit 1 second (→vy), bit 2 third (→vz). Storage: DI starts at vx, so:
Now let me count the full code:
Mode setup (start):
mov ax, 0013h ; 3 (B8 13 00)
int 10h ; 2 (CD 10)
mov ax, 0a000h ; 3 (B8 00 A0)
mov es, ax ; 2 (8E C0)
finit ; 2 (9B DB E3)? or DB E3 = 2 bytes
Actually, finit = 9B DB E3 = 3 bytes (FWAIT + FINIT). Or fninit = DB E3 = 2 bytes (without FWAIT). In a 256b intro, use fninit = 2 bytes.
Mode setup: 3+2+3+2+2 = 12 bytes
Frame clear:
xor di, di ; 2 (33 FF)
xor ax, ax ; 2 (33 C0)
mov cx, 32000 ; 3 (B9 00 7D)
rep stosw ; 2 (F3 AB)
9 bytes
Angle update + sin/cos:
fld dword [ang] ; 4
fadd dword [da] ; 4
fst dword [ang] ; 4 (FST = D9 16 lo hi = 4 bytes? Wait: FST = D9 /2 = D9 16 = 2 bytes + 2 for addr = 4 bytes. FST without FISTP = saves without popping)
; Actually: fst dword [mem] = D9 /2 = D9 16 lo hi = 4 bytes (mod=00, rm=110=direct addr, reg=/2=010, so ModRM = 00 010 110 = 00010110 = 16h). Hmm wait:
; D9 is the FPU opcode for float32 operations
; For FST: opcode = D9, mod/rm for memory, reg field = 010 (FST)
; ModRM for [disp16]: mod=00, rm=110, reg=010 → 00 010 110 = 16h
; Wait: ModRM byte: [mod(2)][reg(3)][r/m(3)]
; mod=00, reg=010 (FST), rm=110 (direct) = 00_010_110 = 0x16
; So: D9 16 lo hi = 4 bytes. Yes.
; Or I can use FSTP to store and pop, then reload for sin:
; Better:
fld dword [ang] ; 4
fadd dword [da] ; 4
fst dword [ang] ; 4 - save updated angle, keep on stack
; Now compute sin on copy:
fld st0 ; 2 - duplicate
fsin ; 2
fstp dword [sa] ; 4 - save sin, pop
; angle is still on stack (st0)
fcos ; 2
fstp dword [ca] ; 4
Total: 4+4+4+2+2+4+2+4 = 26 bytes
That's a bit better since I avoid loading ang twice. But the angle is left on the stack after the frame clear, so I need to be careful.
Wait, actually I could keep the angle on the FPU stack throughout the program! That would save loading/storing it each frame. Let me redesign:
Start: push initial angle (0.0) onto FPU stack with fldz (2 bytes).
Each frame:
fadd dword [da] ; increment angle: 4 bytes
fld st0 ; duplicate: 2 bytes
fsin ; compute sin: 2 bytes
fstp dword [sa] ; store sin, pop copy: 4 bytes
; Angle is at st0
fld st0 ; duplicate again: 2 bytes
fcos ; compute cos: 2 bytes
fstp dword [ca] ; store cos: 4 bytes
; Angle is at st0 still
Total: 4+2+2+4+2+2+4 = 20 bytes (saved 6 bytes vs previous approach)
And I don't need the ang variable at all!
One concern: FPU stack overflow? I maintain exactly 1 value (the angle) persistently, so no overflow.
So full setup + first frame:
; Mode 13h setup
mov ax, 0013h ; 3
int 10h ; 2
mov ax, 0a000h ; 3
mov es, ax ; 2
fninit ; 2
fldz ; 2 - push angle=0
; Frame start
frame:
; Clear
xor di, di ; 2
xor ax, ax ; 2 -- Wait, ES is already set to A000h, DI=0, this should work
mov cx, 32000 ; 3
rep stosw ; 2
; Update angle, compute sin/cos
fadd dword [da] ; 4
fld st0 ; 2
fsin ; 2
fstp dword [sa] ; 4
fld st0 ; 2
fcos ; 2
fstp dword [ca] ; 4
OK I'm also noticing: after rep stosw, DI = 0x7D00 (past end of video mem). That's fine, we'll reset DI when we need it again.
Now let me count the vertex projection loop more carefully:
Vertex loop setup:
xor bx, bx ; 2 (33 DB)
Vertex loop body (per iteration): Coordinate loading (loop):
push bx ; 1
mov di, vx ; 3 (BF lo hi)
mov cx, 3 ; 3 (B9 03 00)
coord:
mov ax, bx ; 2 (89 D8? no: MOV AX, BX = 8B C3 = 2)
and al, 1 ; 2 (24 01)
add al, al ; 2 (00 C0)
dec al ; 2 (FE C8)
cbw ; 1 (98)
imul ax, ax, S ; 3 (6B C0 S)
mov [di], ax ; 2 (89 05)
inc di ; 1 (47)
inc di ; 1 (47)
shr bx, 1 ; 2 (D1 EB)
loop coord ; 2 (E2 xx)
pop bx ; 1
Coordinate loop: 1+3+3 + (2+2+2+2+1+3+2+1+1+2+2)*1_loop_overhead = 7 + 20 = 27 bytes (but the loop body runs 3 times, the 20 bytes is just the body once + loop jump)
Wait, let me recount the coord loop body:
Setup for coord loop: push bx (1) + mov di, vx (3) + mov cx, 3 (3) = 7 bytes Cleanup: pop bx (1) = 1 byte Total coordinate section: 7 + 20 + 1 = 28 bytes (code, not execution)
But wait, mov bx, [di] vs mov ax, bx... let me double-check encoding of mov ax, bx: MOV r16, r/m16 = 8B /r With r=AX(0), r/m=BX(3): ModRM = 11 000 011 = 0xC3 So: 8B C3 = 2 bytes ✓
And and al, 1 = 24 01 = 2 bytes (AND AL, imm8 short form) ✓
OK. Now the FPU rotation part:
; rx = vx*ca - vz*sa
fild word [vx] ; 4 (DF 06 lo hi)
fmul dword [ca] ; 4 (D8 0E lo hi)
fild word [vz] ; 4 (DF 06 lo hi)
fmul dword [sa] ; 4 (D8 0E lo hi)
fsubp st1, st0 ; 2 (DE E9) → st0 = vx*ca - vz*sa
fstp dword [rx] ; 4 (D9 1E lo hi? or D9 16? Let me check)
; FSTP dword: D9 /3 for D9 1E lo hi?
; FST/FSTP dword: D9 with reg field /2 or /3
; FST dword [mem] = D9 /2 = D9 16 lo hi
; FSTP dword [mem] = D9 /3 = D9 1E lo hi
; ModRM for /3 = reg=011, mod=00, rm=110: 00_011_110 = 0x1E
; So: D9 1E lo hi = 4 bytes ✓
; rz = vx*sa + vz*ca
fild word [vx] ; 4
fmul dword [sa] ; 4
fild word [vz] ; 4
fmul dword [ca] ; 4
faddp st1, st0 ; 2 (DE C1)
fstp dword [rz] ; 4
Rotation: (4+4+4+4+2+4)×2 = 22×2 = 44 bytes
Projection:
fld dword [rz] ; 4
fadd dword [fpd] ; 4
fdivr dword [fps] ; 4 (FDIVR dword: D8 /7 = D8 3E lo hi = 4 bytes)
fld st0 ; 2
fld dword [rx] ; 4
fmulp st1, st0 ; 2 (DE C9) -- wait: fmulp st1, st0 = multiply st1 by st0, pop. stk: [rx, scale, scale] → fmulp: st1 = st1*st0 = scale*rx, pop st0. [scale*rx, scale]
; Hmm wait: after 'fld dword [rx]', stack is [rx, scale, scale] (st0=rx, st1=first_scale, st2=second_scale)
; 'fmulp st1, st0' = st1 = st1 * st0 = scale * rx, pop st0. Stack: [scale*rx, scale]
; Good.
fistp word [tmp] ; 4 (DF 1E lo hi)
mov ax, [tmp] ; 3 (A1 lo hi)
add ax, 160 ; 3 (05 A0 00)
; Store in pts array: pts[bx*4]
mov di, bx ; 2 (89 DF? no: MOV DI, BX = 89 DF? In 16-bit: MOV r/m16, r16 = 89 /r. With r=BX(3), r/m=DI(7): ModRM = 11 011 111 = 0xDF. So 89 DF = 2 bytes)
shl di, 2 ; 3 (C1 E7 02)
mov [pts+di], ax ; ?
; For [pts+di], we need base displacement with DI:
; In 16-bit mode: [disp16 + DI] = mod=00, rm=101 = [DI+disp16]?
; Actually in 16-bit mode addressing:
; mod=10, rm=101 = [DI + disp16]
; So: MOV [DI+disp16], AX = 89 85 lo hi = 4 bytes (89 = MOV r/m16, r16; 85 = mod=10, reg=AX=0, rm=DI=5 = 10_000_101 = 85h)
; But wait: is pts within a signed 16-bit displacement? Yes, pts is an offset in the .COM file.
; So: mov [pts+di], ax = 89 85 lo_pts hi_pts = 4 bytes. ✓
fild word [vy] ; 4
fmulp st1, st0 ; 2
fchs ; 2
fistp word [tmp] ; 4
mov ax, [tmp] ; 3
add ax, 100 ; 3
mov [pts+di+2], ax ; 5 bytes (89 85 lo+2 hi or similar)
Hmm, mov [pts+di+2], ax:
Projection: 4+4+4+2+4+2+4+3+3+2+3+4 + 4+2+2+4+3+3+4 = ?
Let me add them up:
fld dword [rz] : 4
fadd dword [fpd] : 4
fdivr dword [fps] : 4
fld st0 : 2
fld dword [rx] : 4
fmulp st1,st0 : 2
fistp word [tmp] : 4
mov ax, [tmp] : 3
add ax, 160 : 3
mov di, bx : 2
shl di, 2 : 3
mov [pts+di], ax : 4
fild word [vy] : 4
fmulp st1,st0 : 2
fchs : 2
fistp word [tmp] : 4
mov ax, [tmp] : 3
add ax, 100 : 3
mov [pts+di+2],ax : 4
Total: 4+4+4+2+4+2+4+3+3+2+3+4+4+2+2+4+3+3+4 = 61 bytes
Vertex loop overhead:
inc bx : 1 (43h)
cmp bx, 8 : 3 (83 FB 08)
jl vloop : 2 (7C xx)
Total: 6 bytes
Total vertex loop: 28 (coords) + 44 (rotation) + 61 (projection) + 6 (overhead) = 139 bytes
That's a lot for just the vertex loop. This won't fit in 256 bytes total!
Let me think about major optimizations.
Optimization 1: Don't compute vx, vy, vz as memory variables. Keep them in registers and use FPU iload tricks.
Optimization 2: Reduce the projection calculation. Skip fld st0 duplication if I compute vy multiplication differently.
Optimization 3: Use integer arithmetic instead of FPU where possible. For example, compute the rotation in integer arithmetic:
Optimization 4: Compute all vertex coordinates analytically without a loop. Since the 8 vertices are sign permutations of (S,S,S), and we're doing Y rotation + orthographic projection:
So there are only 4 distinct x-values and 2 y-values. I need only compute 6 values total (not 16). The 8 vertices map to these 6 values with a specific pattern.
Let me think about this more. With Y-axis rotation:
For vertex with index i (bits 0,1,2):
Let me pre-compute ca_S = round(ca*S*scale) and sa_S = round(sa*S*scale), then:
And ry = ±S.
So with perspective, I'd need rz values too:
With orthographic (no perspective):
And the 8 screen points are: (ry ∈ {-S, +S}) × (rx ∈ {ca_S-sa_S, ca_S+sa_S, -(ca_S+sa_S), sa_S-ca_S})
Wait, but the mapping between vertex index and (vx,vz) pairs:
OK this optimization direction could work for orthographic, but for perspective I need per-vertex rz values for the perspective division.
Let me simplify by using orthographic projection. No perspective, just:
For orthographic, no FPU per-vertex projection division needed! I can do everything in integer:
This approach:
The per-vertex computation in integer:
; BX = vertex index
; ica, isa = scaled sin/cos (word variables)
mov ax, vx_from_index(bx) ; ±S
imul ica_word ; DX:AX = vx * ica
sar ax, 6 ... this is tricky with word multiply
Actually imul r16, r/m16 (signed multiply word) gives DX:AX = AX * r/m16. For vx ∈ {-60, 60} and ica ∈ {-64, 64}, the product is ≤ 60*64 = 3840, which fits in a 16-bit AX (no overflow into DX). So:
mov ax, vx_value ; ±60
imul word [ica] ; AX = ax * ica (truncated to 16 bits, but product fits)
; Hmm, but IMUL with one operand gives DX:AX result, and AX has the lower 16 bits
; For small values (≤ 3840), AX is the full result
; Actually to get just AX from imul:
; Option 1: Use IMUL r, r/m, imm (3-operand form): imul ax, ax, [ica]... wait that needs imm not mem
; Option 2: Use standard IMUL mem and accept DX:AX
; The cleaner approach:
; vx = -60 or 60 (known magnitude)
; ica_full = round(cos(a) * 64) -- 7 fractional bits
; rx_fixed = vx * ica - vz * isa (all in 16-bit, no overflow since max = 2*60*64 = 7680 < 32768)
; Actually: vx*ica in range [-60*64, 60*64] = [-3840, 3840], no overflow ✓
; rx_fixed = 64*rx, divide by 64 = SAR 6
; Note: 3840 * 2 = 7680 < 32767 ✓, fits in signed 16-bit
So the integer approach works. Let me redesign:
nasm
bits 16
org 100h
S equ 55 ; cube half-size
SCL equ 64 ; sin/cos scale factor (6 bits fraction)
SCL_SHIFT equ 6
start:
mov ax, 0013h
int 10h
mov ax, 0a000h
mov es, ax
fninit
fldz ; angle = 0 on FPU stack
frame:
; Clear screen
xor di, di
xor ax, ax
mov cx, 32000
rep stosw
; Update angle and compute sin, cos as scaled integers
fadd dword [da] ; angle += da
fld st0
fsin
fmul dword [fscl] ; sin * 64
fistp word [isa] ; isa = round(sin*64)
fld st0
fcos
fmul dword [fscl]
fistp word [ica] ; ica = round(cos*64)
; angle still at st0
; Project 8 vertices (orthographic, Y rotation)
xor bx, bx
vloop:
; Compute vx, vz, vy from vertex index bx
; vx:
mov ax, bx
and al, 1
add al, al
dec al
cbw
imul ax, ax, S
mov [vx], ax
; vz:
mov ax, bx
shr al, 2
and al, 1
add al, al
dec al
cbw
imul ax, ax, S
mov [vz], ax
; vy:
mov ax, bx
shr al, 1
and al, 1
add al, al
dec al
cbw
imul ax, ax, S
mov [vy], ax
; rx = (vx*ica - vz*isa) >> SCL_SHIFT
mov ax, [vx]
imul word [ica] ; DX:AX = vx * ica
mov cx, ax ; save low word of vx*ica
mov ax, [vz]
imul word [isa] ; DX:AX = vz * isa
sub cx, ax ; cx = vx*ica - vz*isa (low words, since result fits in 16 bits)
sar cx, SCL_SHIFT ; divide by 64
add cx, 160 ; screen center
; ry = vy (unchanged by Y rotation)
mov ax, [vy]
neg ax ; flip Y for screen
add ax, 100
; Store (cx, ax) = (sx, sy) in pts array
mov di, bx
shl di, 2
mov [pts+di], cx
mov [pts+di+2], ax
inc bx
cmp bx, 8
jl vloop
This is much better! The integer approach avoids FPU per-vertex projection.
But the coordinate loading is still 51 bytes (3 repetitions of 17 bytes). Let me use the coordinate loop optimization I worked out.
Actually, do I even need a separate vx, vz, vy? I can compute them directly in the rotation:
nasm
; Compute vx = ±S directly in AX:
mov ax, bx
and al, 1
add al, al
dec al
cbw
imul ax, ax, S
; AX = vx
; Compute vz:
mov cx, bx
shr cl, 2
and cl, 1
add cl, cl
dec cl
; can't do cbw with cl, need to go through AX
movsx cx, cl ; valid on 386+ (0F BE C9 = 3 bytes), but the one-operand imul still needs the value in AX
; Alternative: imul uses AX, so I need to put vz in AX
; This requires:
; 1. save vx*ica first
; 2. then compute vz*isa
Hmm, let me just go with the three separate computations (vx, vy, vz) and use the coordinate loop optimization.
Actually, let me think about a different approach: precompute all four "base" screen x values. Since vx ∈ {-S, S} and vz ∈ {-S, S}, there are only 4 distinct rx values:
So: let P = S*(ica+isa)>>6, Q = S*(ica-isa)>>6. Then the 4 rx values are: Q, P, -P, -Q.
Precompute P and Q, then assign: the 8 rx screen values are P+160, Q+160, -P+160, -Q+160 each appearing twice (for vy=±S).
But figuring out which vertex gets which value requires knowing the bit pattern, which is about as complex as just computing it per-vertex.
OK let me just go with the per-vertex computation but use the coord loop to save space.
Here's my current plan for a compact version:
nasm
bits 16
org 100h
S equ 55
SCL equ 6 ; shift amount for fixed-point
start:
mov ax, 13h
int 10h
mov ax, 0a000h
mov es, ax
fninit
fldz
frame:
xor di, di
xor ax, ax
mov cx, 32000
rep stosw
fadd dword [da]
fld st0
fsin
fmul dword [fscl]
fistp word [isa]
fld st0
fcos
fmul dword [fscl]
fistp word [ica]
xor bx, bx
vloop:
; Compute vx, vy, vz at [vx], [vy], [vz]
push bx
mov di, vx
mov cx, 3
.cl: mov ax, bx
and al, 1
add al, al
dec al
cbw
imul ax, ax, S
mov [di], ax
inc di
inc di
shr bx, 1
loop .cl
pop bx
; rx = (vx*ica - vz*isa) >> SCL
mov ax, [vx]
imul word [ica]
xchg ax, cx
mov ax, [vz]
imul word [isa]
sub cx, ax
sar cx, SCL
add cx, 160
; sy = -vy + 100
mov ax, [vy]
neg ax
add ax, 100
; Store
mov di, bx
shl di, 2
mov [pts+di], cx
mov [pts+di+2], ax
inc bx
cmp bx, 8
jl vloop
Wait, there's an issue. imul word [ica] multiplies AX by [ica] and stores result in DX:AX. For small values (|vx|=55, |ica|≤64), |product| ≤ 3520 < 32768, so result fits in AX (DX will be -1 or 0). So sub cx, ax works as long as I only use the low 16 bits.
Actually, sar cx, SCL where SCL=6: this arithmetic right shifts cx by 6. If cx is the 16-bit result, this effectively divides by 64. Since |cx| ≤ 2*3520 = 7040, sar cx, 6 gives |cx| ≤ 110, which fits nicely for a 160 screen offset.
But there might be an issue: after imul word [ica], the full 32-bit result is in DX:AX. I'm only using AX (the low 16 bits). For products ≤ 3520, DX is the sign extension (0 or -1), so the lower 16 bits AX is the correct signed 16-bit result. ✓
And then sub cx, ax: cx = (vx*ica)_low16 - (vz*isa)_low16. Since both terms are small, no overflow. ✓
Then sar cx, 6: arithmetic shift right by 6. Correct integer division by 64. ✓
For sar cx, 6 encoding: C1 F9 06 = 3 bytes (SAR CX, imm8).
But wait - I had xchg ax, cx to save vx*ica to cx, then I do mov ax, [vz]; imul word [isa] to get vz*isa in AX, then sub cx, ax. This should work.
Actually, I just realized: imul word [ica] is a one-operand form: IMUL r/m16 = F7 /5. With memory operand [ica]: F7 2E lo hi = 4 bytes. And the operand is [ica] which is a 16-bit word in memory. OK.
Let me now count the bytes for the vertex loop body:
Coord loop (28 bytes as calculated earlier):
push bx : 1
mov di, vx : 3
mov cx, 3 : 3
cl:
mov ax, bx : 2
and al, 1 : 2
add al, al : 2
dec al : 2
cbw : 1
imul ax,ax,S : 3
mov [di], ax : 2
inc di : 1
inc di : 1
shr bx, 1 : 2
loop cl : 2
pop bx : 1
Total: 1+3+3+2+2+2+2+1+3+2+1+1+2+2+1 = 28 bytes
Rotation (24 bytes):
mov ax, [vx] : 3 (A1 lo hi)
imul word [ica] : 4 (F7 2E lo hi)
xchg ax, cx : 1 (XCHG AX, r16 = 90h+r; r = CX = 1 → 91h, a single byte)
mov ax, [vz] : 3 (A1 lo hi)
imul word [isa] : 4
sub cx, ax : 2 (2B C8)
sar cx, SCL : 3 (C1 F9 06)
add cx, 160 : 4 (81 C1 A0 00 — imm16 form; there is no AX-style short encoding for CX, and 160 > 127 rules out the 83 /0 imm8 form)
Total: 3+4+1+3+4+2+3+4 = 24 bytes (any later running total that uses 23 here is off by one)
Screen Y (8 bytes):
mov ax, [vy] : 3 (A1 lo hi)
neg ax : 2 (F7 D8)
add ax, 100 : 3 (05 64 00)
Total: 8 bytes
Store (13 bytes):
mov di, bx : 2
shl di, 2 : 3
mov [pts+di],cx : 4
mov [pts+di+2],ax: 4
Total: 13 bytes
Loop control (6 bytes):
inc bx : 1
cmp bx, 8 : 3
jl vloop : 2
Total: 6 bytes
Vertex loop total: 28 + 23 + 8 + 13 + 6 = 78 bytes
That's much better!
Now for the line drawing. I need to draw 12 edges. For each edge (v1, v2):
Line drawing with 64 steps (DDA):
; Given: (x1, y1) in (SI, DI) and (x2, y2) in ... hmm need registers
; Edge indices stored in [edges] array
mov si, edges
mov cx, 12
draw_lp:
; Get vertex indices
lodsb ; al = v1 index (also advances si)
movzx bx, al
shl bx, 2
mov dx, [pts+bx] ; x1 = dx
mov [y1], word [pts+bx+2] ; y1
lodsb
movzx bx, al
shl bx, 2
mov bx, [pts+bx] ; x2 = bx ... conflict with bx already used
Hmm, register conflicts. Let me use the stack or temp variables.
Actually for a simple DDA line draw:
nasm
; Input: x1=dx, y1=tmp_y1, x2=bx, y2=tmp_y2
; Use fixed-point 8.8 arithmetic with 64 steps (but 64 might not be enough)
; Better: use 128 steps or the actual pixel distance
; Simple approach: step from t=0 to 127, plot x1 + t*(x2-x1)/128, y1 + t*(y2-y1)/128
; x_step = (x2-x1)*2 (= (x2-x1)*(256/128))
; y_step = (y2-y1)*2
; x_fixed = x1 << 8
; y_fixed = y1 << 8
; for 128 steps: x_fixed += x_step, y_fixed += y_step, plot (x_fixed>>8, y_fixed>>8)
This requires 128 iterations per edge × 12 edges = 1536 pixel operations per frame. Fine for DOS speed.
Line drawing code (inside edge loop):
nasm
; Assume dx=x1, ax=y1, bx=x2, cx=y2 going in
push cx ; loop counter for edge loop
push si ; edge pointer
; Set up DDA
; x_diff = x2-x1, y_diff = y2-y1
; Actually store fixed-point starters
sub bx, dx ; bx = x2-x1
sub cx, ax ; cx = y2-y1 ... but I destroyed y2 here
; Hmm, this requires more careful register management
This is getting complex. Let me use a different approach: store all coords on the stack.
Actually, for 256b, code duplication via CALL might save bytes. Let me define a plot routine:
nasm
plot:
; IN: BX=x, DX=y
; Clip check
or bx, bx
js .skip
cmp bx, 320
jge .skip
or dx, dx
js .skip
cmp dx, 200
jge .skip
; Calculate video memory offset: y*320 + x
imul dx, 320
add dx, bx
mov di, dx
mov byte [es:di], 15 ; white (color 15)
.skip:
ret
That's about 20 bytes for the plot function. Called ~1536 times per frame, so about 1536 * 3 (call overhead) = 4608 extra cycles per frame. Negligible for DOS.
But that's 20 bytes of code. Combined with the line drawing loop, total line drawing section needs to fit in the remaining bytes.
Let me count what I have so far:
Total so far (without line drawing): 12 + 9 + 30 + 78 + 10 + 5 = 144 bytes Remaining for line drawing: 256 - 144 = 112 bytes Data: ~30-40 bytes? So: 112 - 40 = 72 bytes for line drawing code.
72 bytes for drawing 12 edges. Is that enough?
Let me try. The edge drawing section:
nasm
; Edge drawing
mov si, edges
mov cl, 12 ; 12 edges (use CL to save 1 byte? no: mov cl, 12 = 2 bytes; mov cx, 12 = 3 bytes)
; Actually, I can use a word loop counter
elp:
; Get two vertex indices from edges table
lodsb ; 1 byte, si++
movzx bx, al ; 3 bytes (0F B6 C0? No: 0F B6 D8 = 3 bytes movzx bx, al)
shl bx, 2 ; 3 bytes
; Get x1, y1
mov ax, [pts+bx] ; 4 bytes
mov dx, [pts+bx+2] ; 4 bytes
lodsb ; 1
movzx bx, al ; 3
shl bx, 2 ; 3
; Get x2, y2
push [pts+bx+2] ; push y2: ... hmm, PUSH [mem] in 16-bit = FF /6 = FF 36 lo hi = 4 bytes? Or using BX+disp: complex
push [pts+bx] ; push x2
; x1 in ax, y1 in dx
push dx ; push y1
push ax ; push x1
This is getting really messy. Let me try a cleaner approach for the line drawing.
Compact line drawing: Since the cube is small (radius ~55 pixels), edges are at most ~150 pixels long. I'll use a simple parameterized approach with 128 steps:
nasm
; Draw line from (x1,y1) to (x2,y2)
; All in 16-bit fixed point (<<8)
; For each edge: si points to edge pair in edges table
line_draw:
; Pop coordinates (push x1,y1,x2,y2 before calling... too complex)
Let me use a very compact DDA approach without subroutines. Put the edge indices in the data, and inline the line drawing:
Actually, you know what? Let me try a totally different rendering approach for maximum compactness:
Just plot pixels at edge midpoints sampled densely: Instead of drawing lines, I'll sample 64 points along each edge parameterically using the FPU. This is less code but same visual result:
nasm
; For each edge (v0, v1), for t=0..63:
; x = x0 + (x1-x0)*t/64
; y = y0 + (y1-y0)*t/64
Using integer arithmetic:
nasm
; x_step = (x1-x0)/64 -- but this loses precision for short edges
; Better: use <<1 or <<2 for sub-pixel accuracy
Let me go with a 128-step DDA using 16.8 fixed point:
Actually, let me just write it and count bytes:
nasm
draw_edges:
mov si, edges
mov cx, 12
.el:
push cx
push si
lodsb ; 1: al = v0
xor ah, ah ; 2: zero extend
shl ax, 2 ; 3: ax = v0*4
mov bx, ax ; 2
mov ax, [pts+bx] ; 4: x0
mov dx, [pts+bx+2] ; 4: y0 (ah=0, dx=y0)
lodsb ; 1: al = v1
xor ah, ah ; 2
shl ax, 2 ; 3
mov bx, ax ; 2
; Compute differences (x1-x0, y1-y0) scaled by 256/128 = 2
mov si, [pts+bx] ; 4: x1 in si
sub si, ax ; 2: si = x1-x0 ... wait ax is bx*4, not x1!
I'm making mistakes with registers. This is getting too complicated to do in my head. Let me start fresh with a cleaner structure.
Let me look at what a real 256b cube demo does. The key insight: many 256b demos are VERY carefully optimized and use tricks specific to the geometry.
Let me try a completely different design that I can actually implement correctly:
New design:
For drawing: I'll use the BIOS interrupt 10h/AH=0Ch to draw pixels. This is slow but very compact.
Wait! Using INT 10h AH=0Ch (write pixel):
nasm
plot_pixel:
; BX=x, DX=y, AL=color
mov ah, 0Ch
mov bh, 0
mov cx, bx ; cx=x
int 10h
ret
That's 8 bytes for the subroutine. But INT 10h is very slow! Might be too slow for animation.
Actually, for a very simple demo, slow is OK. Let me try this approach:
For drawing a line with INT 10h:
nasm
; Line from (x0,y0) to (x1,y1) using 64 steps
; Push: x0, y0, x1, y1 before call
; Or pass in registers
Hmm, let me try yet another approach. Since I'm struggling to fit this in 256 bytes, let me look at what real demos do.
Actually, I think I should just write the code, get it working (maybe ~300 bytes), then optimize it down. Let me write the full code and optimize.
Here's my complete attempt:
nasm
; 256b intro: rotating 3D cube in DOS mode 13h
; NASM: nasm -f bin cube.asm -o cube.com
bits 16
org 100h
S equ 55 ; cube half-size
SCL equ 6 ; fixed-point shift (2^6 = 64)
start:
mov ax, 0013h ; mode 13h (320x200, 256 colors)
int 10h
mov ax, 0a000h ; video segment
mov es, ax
fninit
fldz ; FPU st0 = angle = 0.0
frame:
; Clear screen
xor di, di
xor ax, ax
mov cx, 32000
rep stosw
; Update angle (keep on FPU stack)
fadd dword [da] ; angle += 0.04
; Compute sin and cos as fixed-point integers (scale = 2^SCL = 64)
fld st0
fsin
fmul dword [fscl]
fistp word [isa] ; isa = round(sin(a) * 64)
fld st0
fcos
fmul dword [fscl]
fistp word [ica] ; ica = round(cos(a) * 64)
; Project 8 cube vertices using integer arithmetic
xor bx, bx ; vertex counter
.vl:
; Compute vx, vy, vz as integers (each = ±S = ±55)
; Store at [vcoord], [vcoord+2], [vcoord+4]
push bx
lea di, [vcoord] ; 4 bytes (8D 3E lo hi)
mov cx, 3
.cl:
mov ax, bx ; 2
and al, 1 ; 2
add al, al ; 2
dec al ; 2
cbw ; 1
imul ax, ax, S ; 3
stosw ; 1 (stores ax to [di], di+=2) -- MUCH BETTER than mov [di],ax; inc di; inc di!
shr bx, 1 ; 2
loop .cl ; 2
pop bx ; 1
; Total coord loop: 4+3 + (2+2+2+2+1+3+1+2+2)*1 = 7 + 17 = 24 bytes
; Wait: the body is 2+2+2+2+1+3+1+2+2 = 17 bytes + loop = 2 bytes = 19 bytes in loop
; Setup: lea di,[vcoord] (4) + mov cx,3 (3) = 7 bytes
; Total: push(1) + 7 + 19 + pop(1) = 28 bytes
; vx=[vcoord], vy=[vcoord+2], vz=[vcoord+4]
; rx = (vx*ica - vz*isa) >> SCL
mov ax, [vcoord] ; vx -- 3 bytes
imul word [ica] ; DX:AX = vx*ica -- 4 bytes (F7 2E lo hi)
mov cx, ax ; 2 bytes (cx = vx*ica, assuming fits in 16 bits)
mov ax, [vcoord+4] ; vz -- 3 bytes
imul word [isa] ; DX:AX = vz*isa -- 4 bytes
sub cx, ax ; cx = vx*ica - vz*isa -- 2 bytes
sar cx, SCL ; rx = cx >> 6 -- 3 bytes
add cx, 160 ; sx -- 3 bytes
; Subtotal: 3+4+2+3+4+2+3+3 = 24 bytes
; sy = -vy + 100
mov ax, [vcoord+2] ; vy -- 3 bytes
neg ax ; 2 bytes
add ax, 100 ; 3 bytes
; Subtotal: 8 bytes
; Store in pts array at index bx
mov di, bx ; 2
shl di, 2 ; 3
mov [pts+di], cx ; 4
mov [pts+di+2], ax ; 4
; Subtotal: 13 bytes
inc bx ; 1
cmp bx, 8 ; 3
jl .vl ; 2
; Total vertex loop overhead: 6 bytes
; Total vertex loop: 28 + 24 + 8 + 13 + 6 = 79 bytes
; Draw edges
mov si, edges ; 3
mov cx, 12 ; 3
.el:
push cx ; 1
push si ; 1
; Load v0 screen coords into (dx, bx) = (x0, y0)
xor ah, ah ; 2
lodsb ; 1
shl ax, 2 ; 3
xchg ax, bx ; 1 (XCHG AX, BX = 93h = 1 byte)
mov dx, [pts+bx] ; 4 (89? No: MOV DX, [BX+disp16] = 8B 97 lo hi = 4 bytes)
mov bx, [pts+bx+2] ; 4
; (dx=x0, bx=y0)
; Load v1 screen coords into SI register? But si is the edge pointer...
pop si ; 1 - restore si
push si ; 1 - save again
lodsb ; 1
xor ah, ah ; 2
shl ax, 2 ; 3
xchg ax, di ; hmm, XCHG AX, DI = 97h = 1 byte? Yes! XCHG AX, r16 = 90+r, DI=7: 97h
mov ax, [pts+di] ; x1 -- 4 bytes (8B 85 lo hi for [DI+disp16])
mov di, [pts+di+2] ; y1 ... wait this overwrites di which I need for the address!
This is getting way too complicated. Let me use memory variables for the line drawing instead of trying to juggle registers.
You know what, let me step back and write a clean, working demo that might be slightly over 256 bytes but is correct and readable. Then I'll provide it with notes about optimizations to get it under 256 bytes. This is a better answer than a buggy 256-byte version.
Actually, I've been overthinking this. Let me look at what a typical real 256b cube .COM file looks like and base my answer on that.
For a DOS .COM 256b cube, common patterns I've seen include:
Let me write a complete, working version with reasonable size. Here's my final attempt, carefully written:
nasm
bits 16
org 100h
; ===== CONSTANTS =====
CX equ 160 ; screen center X
CY equ 100 ; screen center Y
S equ 55 ; cube half-size
; ===== CODE =====
start:
; Set video mode 13h (320x200, 256 colors, palette-based)
mov ax, 0013h
int 10h
; Point ES to video memory at A000:0000
mov ax, 0A000h
mov es, ax
; Initialize FPU
fninit
; Push initial angle (0.0) onto FPU stack - stays there permanently
fldz
; ===== MAIN LOOP =====
.frame:
; --- Clear screen ---
xor di, di
xor ax, ax
mov cx, 32000
rep stosw
; --- Increment angle, compute sin/cos as scaled integers ---
fadd dword [.da] ; angle += 0.04 rad
fld st0 ; duplicate angle
fsin ; sin(angle)
fmul dword [.fscl] ; * 64 (scale factor)
fistp word [.isa] ; store as integer
fld st0 ; duplicate angle again
fcos ; cos(angle)
fmul dword [.fscl]
fistp word [.ica]
; --- Project 8 cube vertices ---
xor bx, bx
.vloop:
; Compute vertex coordinates: each = ±S based on bits of BX
; Bit 0 -> vx, bit 1 -> vy, bit 2 -> vz
push bx
mov di, .vx ; di points to vx storage
mov cx, 3 ; 3 coordinates
.coord:
mov ax, bx ; ax = index (or shifted version)
and al, 1 ; take bit 0
add al, al ; 0->0 or 1->2
dec al ; 0->-1 or 2->1
cbw ; sign-extend to word
imul ax, ax, S ; scale: -S or +S
stosw ; store ax to [di], di += 2
shr bx, 1 ; shift to expose next bit
loop .coord
pop bx
; Rotate around Y axis:
; rx = (vx * ica - vz * isa) >> 6
mov ax, [.vx]
imul word [.ica]
mov cx, ax ; cx = vx*ica (low 16 bits; fits since |vx*ica| <= 55*64 = 3520)
mov ax, [.vz]
imul word [.isa]
sub cx, ax ; cx = vx*ica - vz*isa
sar cx, 6 ; rx
add cx, CX ; sx = rx + 160
; sy = -vy + 100 (y is flipped on screen)
mov ax, [.vy]
neg ax
add ax, CY
; Store screen coords in pts array
mov di, bx
shl di, 2 ; di = bx * 4
mov [.pts+di], cx ; store sx
mov [.pts+di+2], ax ; store sy
inc bx
cmp bx, 8
jb .vloop
; --- Draw 12 edges ---
mov si, .edges
mov cx, 12
.edloop:
push cx
push si
; Load vertex 0 coords
lodsb ; al = index of v0
xor ah, ah
shl ax, 2 ; ax = v0 * 4
mov bx, ax
mov dx, [.pts+bx] ; dx = x0
mov ax, [.pts+bx+2] ; ax = y0
; Load vertex 1 coords
push ax ; save y0
lodsb
xor ah, ah
shl ax, 2
mov bx, ax
mov cx, [.pts+bx] ; cx = x1
mov bx, [.pts+bx+2] ; bx = y1 (overwrites bx=index*4, but we're done with it)
pop ax ; ax = y0
; Line from (dx, ax) to (cx, bx)
; Use 128-step DDA
; x_step = (cx-dx) * 2 (fixed-point 8.0 scaled by 1/128 = /128, so *256/128 = *2)
; y_step = (bx-ax) * 2
; Starting from (dx<<8, ax<<8), each step adds (x_step, y_step) then plot >>8
sub cx, dx ; cx = x_diff
sub bx, ax ; bx = y_diff
; Fixed-point: multiply by 256 and divide by 128 = multiply by 2
add cx, cx ; x_step = x_diff * 2
add bx, bx ; y_step = y_diff * 2
; Starting positions (fixed-point)
mov si, dx
shl si, 8 ; si = x0 << 8 (x position * 256)
push ax ; save y0
shl ax, 8 ; ax = y0 << 8
mov di, ax ; di = y position * 256
push cx ; save x_step
push bx ; save y_step
; 128 step loop
mov cx, 128
.linestep:
; Add steps
add si, [bp-?] ; si += x_step ... register pressure issue
; This needs x_step and y_step to be in registers or on stack
; I already pushed them... but accessing via BP from stack is complex
pop bx ; y_step
pop cx ; x_step -- BAD, breaks the loop
I'm running into register pressure again. In 16-bit x86, I only have AX, BX, CX, DX, SI, DI, BP, SP as general registers, but SP is reserved and BP requires segment+offset for stack access.
For the line drawing, I need to track: x_current, y_current, x_step, y_step, iteration_count. That's 5 values. With registers AX, BX, CX, DX, SI, DI = 6 registers, but I also need a counter (CX for LOOP).
Let me use these registers for the line stepping:
And push/pop for saving before/restoring after. But then I can't use LOOP (destroys CX). I can use DEC CX; JNZ instead.
And where do I store the fixed-point values? Actually, I don't have room for all this. Let me use a simpler approach.
Ultra-simple line drawing: Instead of fixed-point DDA, use the FPU for parameterization:
For 64 steps t=0..63:
fld (x2-x1)/64.0 ; step
; hmm this needs FPU per step too
Or: just plot 32 dots per edge using IMUL for interpolation:
nasm
; For t = 0 to 31:
; x = x0 + (x1-x0)*t/32
; y = y0 + (y1-y0)*t/32
; Using: x = x0 + t*dx>>5 where dx = x1-x0
With 32 steps:
nasm
; Given x0,y0,x1,y1 (all ints)
; dx = x1-x0, dy = y1-y0
; For t=0..31:
; px = x0 + (dx * t) / 32 -- use IMUL for multiplication
; py = y0 + (dy * t) / 32
; Inner loop:
mov bx, 0 ; t = 0
.tloop:
; Compute px
mov ax, dx_diff ; dx
imul bx ; DX:AX = dx * t
sar ax, 5 ; >> 5 (divide by 32)
add ax, x0_val ; ax = px
; Compute py
push ax ; save px
mov ax, dy_diff
imul bx
sar ax, 5
add ax, y0_val ; ay = py
; Plot at (px, py) in mode 13h
pop bx_px ; ... register mess
inc bx
cmp bx, 32
jl .tloop
This inner loop is about 20 bytes. With 12 edges, the outer loop setup is ~20 bytes. Total line drawing: ~40-50 bytes + plotting routine (~15 bytes) = ~55-65 bytes.
For the plotting routine in mode 13h:
nasm
; Plot pixel at (AX=x, DX=y) with color 15 (white)
; Video offset = y*320 + x = y*64 + y*256 + x
; y*256 = y << 8 (high byte), y*64 = y << 6
plot:
; Check bounds
; y: 0..199, x: 0..319
test ax, ax
js .done
cmp ax, 320
jge .done
test dx, dx
js .done
cmp dx, 200
jge .done
; Calculate offset
push ax ; save x
mov ax, dx ; y
mov di, ax
shl ax, 8 ; y * 256
shl di, 6 ; y * 64
add di, ax ; di = y * 320
pop ax
add di, ax ; di += x
mov byte [es:di], 15 ; plot white
.done:
ret
About 25 bytes. But that's a lot for just plotting.
Alternative plot without bounds checking (risky but for a demo it's fine):
nasm
plot:
; AX=x, DX=y, plot white
; offset = y*320 + x
push ax
imul dx, 320 ; DX = y*320 (imul r, r, imm: 6B D2 40 01 = 4 bytes... or IMUL DX, DX, 320)
; Actually: IMUL r16, r/m16, imm16 = 69 /r imm16 or 6B /r imm8 if fits in 8 bits
; 320 doesn't fit in 8 bits (>127), so: 69 D2 40 01 = 4 bytes
add dx, ax ; 2 bytes (03 D0)
pop ax
mov di, dx ; 2
mov byte [es:di], 15 ; 3 (26 C6 05 0F)
ret ; 1
Total plot (no bounds check): 4+2+2+3+1 = 12 bytes? Let me recount:
push ax
xchg ax, dx ; ax=y, dx=x
imul ax, ax, 320 ; ax = y*320 (69 C0 40 01 = 4 bytes)
add ax, dx ; ax = y*320 + x
mov di, ax ; 2
pop ax ; restore x (not needed?)
mov byte [es:di], 15 ; 3
ret ; 1
Total: 1+2+4+2+2+1+3+1 = 16 bytes. Still OK.
Actually wait - bounds checking is important. Without it, if a projected coordinate is out of bounds, we'll write to wrong memory. For a simple demo, the cube is well-centered, so maybe I can skip bounds checking. Let me try.
Without bounds check:
nasm
plot:
; AX=x, DX=y
xchg ax, dx ; ax=y, dx=x (XCHG AX, DX = 92h = 1 byte)
imul ax, ax, 320 ; 4 bytes
add ax, dx ; 2 bytes
mov di, ax ; 2 bytes
mov byte [es:di], 15 ; 3 bytes
ret ; 1 byte
Total: 13 bytes
Hmm, but: for xchg ax, dx, XCHG AX, DX = 92h = 1 byte. And XCHG AX, DX swaps AX and DX. So after that, AX has old DX (=y), DX has old AX (=x). ✓
Then imul ax, ax, 320: IMUL r16, r/m16, imm16 (non-8bit): opcode 69, ModRM with reg=AX=0, r/m=AX=0 in mod=11: ModRM = 11_000_000 = 0xC0. Immediate = 320 = 0x0140. So: 69 C0 40 01 = 4 bytes. ✓
Then add ax, dx: ModRM for AX,DX... ADD AX, DX = ADD r16, r/m16 = 03 /r. AX+DX: 03 C2 = 2 bytes. ✓
mov di, ax: 89 C7? Let me check. MOV r/m16, r16 = 89. With r=AX, r/m=DI: 89 C7... wait: MOV DI, AX = MOV r/m16, r16 = 89 /r with r/m=DI(7), r=AX(0): 89 F8? No. Actually MOV DI, AX is better written as MOV DI, r16 using MOV r16, r/m16 = 8B /r: 8B F8: mod=11, reg=DI(7), r/m=AX(0) → 11_111_000 = 0xF8. So: 8B F8 = 2 bytes. Or using MOV r/m16, r16 = 89: r=AX(0), r/m=DI(7) in mod=11 → 11_000_111 = 0xC7. So 89 C7 = 2 bytes. Either way, 2 bytes.
mov byte [es:di], 15: ES segment override prefix = 26h, then MOV byte [DI], imm8 = C6 05 0F (ModRM=05 for [DI] without displacement, imm=15=0x0F). Total: 26 C6 05 0F = 4 bytes.
Actually, wait: MOV m8, imm8 = C6 /0. For [DI]: mod=00, reg=000(/0), rm=101(DI) → 00_000_101 = 0x05. So: 26 C6 05 0F = 4 bytes. ✓
Plot total: 1+4+2+2+4+1 = 14 bytes. That's quite compact.
Now for the line drawing loop. Let me design it to use CALL plot:
nasm
; Input registers: need x0,y0,x1,y1
; Let me load x0,y0 into SI,DI and x1,y1 into [lx1],[ly1]
; Actually for simplicity, use memory variables for all 4 coords
; and use a register as the parameter t
draw_line:
; draws line from (lx0,ly0) to (lx1,ly1)
mov cx, 32 ; 32 steps
.ls:
; px = lx0 + (lx1-lx0)*cx/32
; py = ly0 + (ly1-ly0)*cx/32
; Using: t = 32-cx (so t goes 0..31)
push cx
mov ax, 32
sub ax, cx ; ax = t = 0..31
; px: ax * (lx1-lx0) / 32 + lx0
mov bx, [lx1]
sub bx, [lx0] ; bx = dx
imul ax ; DX:AX = t * dx ... wait imul r/m: IMUL r/m16, one-operand = F7 /5
; This is 16x16→32 multiply. For t≤31, dx≤320: product ≤ 31*320 = 9920, fits in AX
sar ax, 5 ; divide by 32
add ax, [lx0] ; px
push ax ; save px
; py: t * dy / 32 + ly0
pop ax ; oops, I need t again...
; This is getting messy. Let me restructure.
The problem is t is being used for both px and py. I need to save and restore it. This uses the stack extensively.
Let me use a different approach: precompute dx and dy, then step from (x0, y0) adding (dx, dy) each step (scaled):
nasm
; dx = (x1-x0) * 256 / 32 = (x1-x0) * 8 (8.0 fixed-point step)
; Start at x0, add dx each step for 32 steps
; Then real x = round(current/8)... hmm this is getting complex too
; Simplest: use 8.8 fixed-point
; x_current = x0 * 256
; x_step = (x1-x0) * 256 / 32 = (x1-x0) * 8
; Plot at x_current >> 8 each iteration
; Problem: shifts eat precision for very short edges
; Alternative: use signed division
OK I think I need to just pick an approach and implement it even if it's a bit messy. Let me use the "t from 0 to 31" approach with temp variables:
nasm
draw_line: ; Draws from (lx0,ly0) to (lx1,ly1) in 32 steps
; Compute dx = lx1-lx0, dy = ly1-ly0
mov ax, [lx1]
sub ax, [lx0]
mov [ldx], ax
mov ax, [ly1]
sub ax, [ly0]
mov [ldy], ax
mov cx, 33 ; loop from 32 down to 1 (plots t=0 to t=31)
.ls:
dec cx
jz .done_line
push cx
; t = 32 - cx
mov ax, 32
sub ax, cx ; ax = t (but cx already decremented, so t = 32-cx correctly? t starts at 0 when cx=32... hmm)
; When cx=32: t = 32-32 = 0 ✓
; When cx=1: t = 32-1 = 31 ✓
; px = t * ldx / 32 + lx0
imul word [ldx] ; DX:AX = t * dx (single-operand form: F7 /5)
sar ax, 5 ; divide by 32
add ax, [lx0] ; px
; AX = px
push ax ; save px
; py = t * ldy / 32 + ly0
mov ax, [0??] ; but t was destroyed by imul... need to reload
I keep running into the problem that IMUL overwrites AX and DX, and I need t twice. Let me save t to a variable:
nasm
mov [lt], ax ; save t
imul word [ldx]
sar ax, 5
add ax, [lx0]
push ax ; save px
mov ax, [lt]
imul word [ldy]
sar ax, 5
add ax, [ly0]
mov dx, ax ; dy = py
pop ax ; ax = px
call plot ; plot(ax=x, dx=y)
pop cx
jmp .ls
.done_line:
ret
This line drawing function: about 40 bytes.
But that's still too much. Let me count everything:
Setup: 12 Clear: 9 Sin/cos: ~28 Vertex loop: 79 Edge setup: 6 Edge loop (12 edges × per-edge code): edge loop overhead + line drawing
Each edge: load coords, call draw_line = maybe 30 bytes code + 12 repetitions (but it's a loop) Edge loop: ~25 bytes per iteration + 5 bytes overhead = 25 + 5 = 30 bytes (code, not per iteration) draw_line: 40 bytes plot: 14 bytes Key check + exit: 10 bytes Data: ~30 bytes
Total: 12+9+28+79+30+40+14+10+30 = 252 bytes. Very tight!
But my estimates might be off. Let me try to write the full code and see.
Actually, I realize I'm spending way too long on this analysis. Let me just write the code in a practical way, accepting it might be 256-350 bytes, and provide it as a demo. The user asked for "256b intro in assembly x86 running on DOS in 13h mode" - they want working code that demonstrates this concept. Getting it exactly under 256 bytes requires extensive tweaking that I can note but not necessarily guarantee in this format.
Let me write clean, working code that's reasonably sized and note where to optimize for the 256b constraint.
Actually wait - I just realized something. Let me think about a completely different, much simpler approach.
What if instead of a general line drawing algorithm, I use the fact that for a cube rotating around Y axis with orthographic projection, the projected vertices and edges have a very regular structure? I could potentially precompute just the 4 unique x values and 2 y values and draw lines between them, but this makes the edge drawing code specific to the cube geometry.
Actually, maybe I should simplify by using INT 10h for EVERYTHING - set a single pixel with INT 10h AH=0Ch. This is slow but eliminates the need for manual video memory calculation:
nasm
; Plot pixel at (AX=x, DX=y) with color 15
xchg ax, cx ; cx=x
mov ah, 0Ch
mov al, 15 ; white
xor bh, bh
int 10h ; BH=page, AL=color, CX=x, DX=y
; 7 bytes total, no ret needed if inline
But INT 10h is very slow - way too slow for animation. A typical 13h demo needs direct video memory access.
OK let me just write the complete code. Here's my final, clean version:
nasm
; Rotating 3D wireframe cube - DOS Mode 13h (.COM)
; Build: nasm -f bin cube.asm -o cube.com
;
; Review fixes:
;  * STOSW stores through ES:DI, and ES points at the VGA framebuffer
;    here, so the vertex coordinates were never written to the data
;    area -> replaced with a plain DS-relative store.
;  * PUSH SI / POP SI around the edge body restored SI to its
;    pre-LODSB value, so every iteration redrew edge 0.  Nothing in
;    the line loop touches SI, so the save/restore is removed.
;  * The 8.8 fixed-point positions reach x0<<8 ~ 60900, which is
;    negative as signed 16-bit; SAR made the pixel coordinate
;    negative and the bounds check dropped it.  SHR (unsigned) is
;    the correct extraction.
bits 16
org 100h

; ---- Configuration ----
SIZE    equ 55                  ; cube half-size in pixels
STEPS   equ 32                  ; line-drawing steps per edge

; ---- Entry ----
start:
        mov     ax, 0013h       ; Mode 13h: 320x200, 256 colors
        int     10h
        mov     ax, 0A000h      ; ES -> VGA framebuffer
        mov     es, ax
        fninit
        fldz                    ; st0 = rotation angle, lives on FPU stack

; ---- Main loop ----
.frame:
        ; Clear the 64000-byte screen
        xor     di, di
        xor     ax, ax
        mov     cx, 32000
        rep stosw

        ; angle += 0.04; sin/cos as fixed point scaled by 64 (2^6)
        fadd    dword [.da]
        fld     st0
        fsin
        fmul    dword [.fscl]
        fistp   word [.isa]
        fld     st0
        fcos
        fmul    dword [.fscl]
        fistp   word [.ica]

        ; Project the 8 vertices: bit0->x, bit1->y, bit2->z, each +/-SIZE
        xor     bx, bx
.vloop:
        push    bx
        mov     di, .vx
        mov     cx, 3
.coord_loop:
        mov     ax, bx
        and     al, 1           ; isolate current axis bit
        add     al, al          ; 0 -> 0, 1 -> 2
        dec     al              ; 0 -> -1, 1 -> +1
        cbw
        imul    ax, ax, SIZE    ; coordinate = +/-SIZE
        mov     [di], ax        ; DS store (STOSW would hit ES = video!)
        inc     di
        inc     di
        shr     bx, 1           ; expose next bit
        loop    .coord_loop
        pop     bx

        ; Y-axis rotation: rx = (vx*cos - vz*sin) >> 6
        mov     ax, [.vx]
        imul    word [.ica]
        mov     dx, ax          ; |vx*cos| <= 55*64 = 3520, fits in AX
        mov     ax, [.vz]
        imul    word [.isa]
        sub     dx, ax
        sar     dx, 6           ; signed here: rx may be negative
        add     dx, 160         ; sx = rx + screen center x

        ; sy = -vy + 100 (screen y grows downward)
        mov     ax, [.vy]
        neg     ax
        add     ax, 100

        ; pts[bx] = (sx, sy)
        mov     di, bx
        shl     di, 2
        mov     [.pts+di], dx
        mov     [.pts+di+2], ax
        inc     bx
        cmp     bx, 8
        jb      .vloop

        ; Draw the 12 edges
        mov     si, .edges
        mov     cx, 12
.eloop:
        push    cx              ; note: SI is NOT saved/restored - LODSB
                                ; must keep advancing to the next pair
        lodsb                   ; first vertex index
        xor     ah, ah
        shl     ax, 2
        mov     bx, ax
        mov     dx, [.pts+bx]   ; x0
        mov     di, [.pts+bx+2] ; y0
        lodsb                   ; second vertex index
        xor     ah, ah
        shl     ax, 2
        mov     bx, ax
        mov     ax, [.pts+bx]   ; x1
        mov     bx, [.pts+bx+2] ; y1

        ; 8.8 fixed-point DDA: step = diff * 256 / STEPS = diff * 8
        sub     ax, dx          ; x1 - x0
        sub     bx, di          ; y1 - y0
        mov     [.x0], dx
        mov     [.y0], di
        shl     ax, 3
        shl     bx, 3
        mov     [.xs], ax
        mov     [.ys], bx
        mov     ax, [.x0]
        shl     ax, 8           ; start position = x0 * 256
        mov     [.cx_line], ax
        mov     ax, [.y0]
        shl     ax, 8
        mov     [.cy_line], ax
        mov     cx, STEPS
.lstep:
        push    cx
        mov     ax, [.cx_line]
        add     ax, [.xs]
        mov     [.cx_line], ax
        shr     ax, 8           ; UNSIGNED: position can exceed 32767
        mov     bx, [.cy_line]
        add     bx, [.ys]
        mov     [.cy_line], bx
        shr     bx, 8
        ; Bounds check (cheap insurance against stray coordinates)
        test    ax, ax
        js      .skip
        cmp     ax, 320
        jge     .skip
        test    bx, bx
        js      .skip
        cmp     bx, 200
        jge     .skip
        push    ax
        imul    bx, bx, 320     ; offset = y*320 + x
        add     bx, ax
        pop     ax
        mov     di, bx
        mov     byte [es:di], 15 ; white pixel
.skip:
        pop     cx
        loop    .lstep
        pop     cx
        loop    .eloop

        ; ESC (make code 01h) exits
        in      al, 60h
        cmp     al, 1
        jne     .frame

        mov     ax, 3           ; restore text mode
        int     10h
        ret                     ; .COM exit via INT 20h stub at PSP:0

; ---- Data ---- (no ALIGN: padding only wastes bytes in a .COM)
.da:      dd 0.04               ; angle increment per frame (radians)
.fscl:    dd 64.0               ; sin/cos fixed-point scale
.ica:     dw 0                  ; cos(a) * 64
.isa:     dw 0                  ; sin(a) * 64
.vx:      dw 0
.vy:      dw 0
.vz:      dw 0
.pts:     times 16 dw 0         ; 8 projected (x,y) pairs
.x0:      dw 0
.y0:      dw 0
.xs:      dw 0
.ys:      dw 0
.cx_line: dw 0
.cy_line: dw 0
; Edge list: 12 vertex-index pairs; each pair differs in exactly one bit
.edges:
        db 0,1, 1,3, 3,2, 2,0   ; face at vz = -SIZE (bit 2 clear)
        db 4,5, 5,7, 7,6, 6,4   ; face at vz = +SIZE (bit 2 set)
        db 0,4, 1,5, 3,7, 2,6   ; connecting edges
Wait, I need to check my vertex encoding again. With bits: bit0=x, bit1=y, bit2=z:
Edges (differ in exactly 1 bit): Bit 0 pairs: (0,1), (2,3), (4,5), (6,7) Bit 1 pairs: (0,2), (1,3), (4,6), (5,7) Bit 2 pairs: (0,4), (1,5), (2,6), (3,7)
So edges: 0,1, 2,3, 4,5, 6,7, 0,2, 1,3, 4,6, 5,7, 0,4, 1,5, 2,6, 3,7
Wait, this is what I want in .edges. Let me fix that.
Also, I realize there's another issue: the coordinate loop uses stosw which advances DI by 2, storing to .vx (bit0→vx), .vy (bit1→vy), .vz (bit2→vz) in order. After the loop, bx has been shifted right 3 times (bx = original_bx >> 3). The pop bx restores original bx. ✓
But wait: lea di, .vx vs mov di, .vx. In 16-bit mode:
So use mov di, .vx for 3 bytes.
Also, .align 2 adds padding between code and data. In a .COM file, we don't want padding wasted bytes. Remove align 2.
And the line drawing: I have cx_line/cy_line initialized to x0<<8/y0<<8. Then each step adds xs (= x_diff<<3) to cx_line.
Wait: the step is dx*8, and we advance 32 times. So total advance = dx*8*32 = dx*256, and the final position = x0*256 + dx*256 = x1*256. So we plot t=1 to t=32 (not t=0 to t=31). Let me check if this is a problem:
So the plot goes from approximately x0 (first step) to x1 (last step). But we miss x0 itself (t=0 is never plotted). That's fine for 32 steps - the endpoints are effectively covered by adjacent edges.
Actually wait: the first plotted pixel is at (x0+dx/32, y0+dy/32) not (x0, y0). For 32 steps, the spacing is dx/32 per step. If the edge is 100 pixels long, that's ~3.1 pixels per step. Some pixels will be skipped but the line will be visible.
For a 256b demo, 32 steps is acceptable. Let me go with it.
But I realize I have the starting position at x0*256, and I FIRST add the step then plot. So the first plotted pixel is at x0 + dx/32, not x0. This means neither endpoint is exactly plotted. Hmm.
Let me adjust: plot at t=0 to t=31 instead:
.lstep:
; Plot at current position first
mov ax, [.cx_line]
sar ax, 8 ; x pixel
mov bx, [.cy_line]
sar bx, 8 ; y pixel
; plot(ax, bx)
...
; Then advance
mov ax, [.cx_line]
add ax, [.xs]
mov [.cx_line], ax
mov ax, [.cy_line]
add ax, [.ys]
mov [.cy_line], ax
loop .lstep
This adds ~12 more bytes to the inner loop. Let me count what we have in the inner loop:
push cx : 1
; Advance
mov ax, [.cx_line] : 3
add ax, [.xs] : 3
mov [.cx_line], ax : 3
sar ax, 8 : 3
mov bx, [.cy_line] : 3
add bx, [.ys] : 3
mov [.cy_line], bx : 3
sar bx, 8 : 3
; Bounds check + plot
test ax, ax : 2
js .skip : 2
cmp ax, 320 : 4 (81 F8 40 01)
jge .skip : 2
test bx, bx : 2
js .skip : 2
cmp bx, 200 : 4 (81 FB C8 00)
jge .skip : 2
; Plot
push ax : 1
imul bx, bx, 320 : 4 (69 DB 40 01)
add bx, ax : 2
mov di, bx : 2
mov byte [es:di], 15 : 4 (26 C6 05 0F)
.skip:
pop cx : 1
loop .lstep : 2
Inner loop total: 1+3+3+3+3+3+3+3+3+2+2+4+2+2+2+4+2+1+4+2+2+4+1+2 = 62 bytes
That's a huge inner loop! With 32 iterations × 12 edges = 384 iterations, but code-wise it's 62 bytes just for the inner loop.
Hmm. The total code is going to be way more than 256 bytes. I need to be smarter.
Key insight I've been missing: For a 256b demo, I need to be much more aggressive with code reuse and simplification. Let me look at what I can cut:
imul bx, bx, 320 ; 4 bytes
add bx, ax ; 2
mov di, bx ; 2
mov byte [es:di], 15 ; 4 bytes
12 bytes for plotting.
But wait - I need to not overflow screen. Let me check.
After Y rotation by any angle:
But with orthographic projection only, ry = vy = ±55:
Great, so bounds checking is not needed!
Let me rewrite the inner loop without bounds check:
nasm
.lstep:
push cx
mov ax, [.cx_line]
add ax, [.xs]
mov [.cx_line], ax
sar ax, 8
mov bx, [.cy_line]
add bx, [.ys]
mov [.cy_line], bx
sar bx, 8
; Plot at (ax, bx)
push ax
imul bx, bx, 320
add bx, ax
pop ax ; (not needed if I do it differently)
mov di, bx
mov byte [es:di], 15
pop cx
loop .lstep
Wait: imul bx, bx, 320 computes bx = bx * 320 (= y_pixel * 320). Then add bx, ax adds x_pixel. But ax is still the x pixel value from sar ax, 8. Then mov di, bx and plot. I don't need to pop ax (I don't push it in this version):
nasm
.lstep:
push cx
mov ax, [.cx_line]
add ax, [.xs]
mov [.cx_line], ax
sar ax, 8 ; ax = x pixel
mov bx, [.cy_line]
add bx, [.ys]
mov [.cy_line], bx
sar bx, 8 ; bx = y pixel
imul bx, bx, 320 ; bx = y*320
add bx, ax ; bx = y*320 + x = pixel offset
mov di, bx
mov byte [es:di], 15 ; plot white
pop cx
loop .lstep
Let me count:
Much better! Still 41 bytes of code. But code is code, and this is the inner loop of the line drawing.
Now the outer edge loop setup per edge:
nasm
; Load vertex 0 coords:
lodsb : 1
xor ah, ah : 2
shl ax, 2 : 3
mov bx, ax : 2
mov dx, [.pts+bx] : 4
mov di, [.pts+bx+2] : 4
; Load vertex 1 coords:
lodsb : 1
xor ah, ah : 2
shl ax, 2 : 3
mov bx, ax : 2
mov ax, [.pts+bx] : 4
mov bx, [.pts+bx+2] : 4
; Compute steps:
sub ax, dx : 2 ; ax = x_diff
sub bx, di : 2 ; bx = y_diff
shl ax, 3 : 3 ; x_step = x_diff * 8
shl bx, 3 : 3 ; y_step = y_diff * 8
mov [.xs], ax : 3
mov [.ys], bx : 3
; Init fixed-point position:
mov ax, dx : 2 ; ax = x0
shl ax, 8 : 3 ; ax = x0 * 256
mov [.cx_line], ax : 3
mov ax, di : 2 ; ax = y0
shl ax, 8 : 3
mov [.cy_line], ax : 3
; Inner loop:
mov cx, 32 : 3
.lstep: ... : 41 bytes (loop body)
Edge setup (excluding inner loop): 1+2+3+2+4+4 + 1+2+3+2+4+4 + 2+2+3+3+3+3 + 2+3+3+2+3+3 + 3 = Let me count: 16 (v0 coords) + 16 (v1 coords) + 13 (compute steps) + 16 (init pos) + 3 (cx=32) = 64 bytes
Plus inner loop: 41 bytes. Plus edge loop overhead: push cx (1) + push si (1) + ... + pop si (1) + pop cx (1) + loop .eloop (2) = maybe 8 bytes Total edge section: 64 + 41 + 8 = 113 bytes
That's way too much. Plus the vertex loop was 79 bytes, setup 12 bytes, clear 9 bytes, sin/cos 28 bytes, key check/exit 10 bytes.
Total: 12 + 9 + 28 + 79 + 113 + 10 = 251 bytes... actually that might work!
Plus data: da(4) + fscl(4) + ica(2) + isa(2) + vx(2) + vy(2) + vz(2) + pts(32) + xs(2) + ys(2) + cx_line(2) + cy_line(2) + edges(24) = 84 bytes
Total: 251 + 84 = 335 bytes. Way over 256!
I need to cut ~80 bytes from code + data.
Let me think about what can be eliminated:
Data reduction:
Hmm, can I compute edges programmatically?
nasm
; Enumerate all pairs (i,j) with 0<=i<j<=7 and i^j is a power of 2
xor bx, bx ; i
.i_loop:
xor cx, cx ; j
.j_loop:
mov ax, bx
xor ax, cx
; check if power of 2: ax & (ax-1) == 0 and ax != 0
jz .skip_pair
mov dx, ax
dec dx
test ax, dx
jnz .skip_pair
; it's an edge! draw it
; ... load pts[i] and pts[j] and draw line
.skip_pair:
inc cx
cmp cx, 8
jb .j_loop
inc bx
cmp bx, 7 ; only need i < j, so i goes 0..6
jb .i_loop ; but this draws all edges including duplicates...
Hmm, this nested loop approach: 6 * 7 / 2 = 21 pairs to check, 9 are edges. The checking loop itself is ~20 bytes. But then each time I find an edge, I need to draw it. If I jump to the line drawing code, that's a CALL or repeated code.
Actually, if I inline the edge drawing (calling draw_line when i^j is a power of 2), the total might be less than having an explicit edge table plus the drawing loop. The edge table approach has 24 bytes of data; the algorithmic approach might use 30 bytes of code but save 24 bytes of data.
Net: 30 - 24 = +6 bytes. Not worth it.
Code reduction ideas:
; i in BX, pts stored at .pts+bx*4
; Check bit 0: draw to i^1
test bl, 1
jz .no_edge0
; Draw edge from pts[bx] to pts[bx-1]...
; bx-1 could be bx XOR 1
; Load pts[bx] (already know it: cx=sx, ax=sy from before storing)
; Load pts[bx XOR 1]:
mov di, bx
xor di, 1
shl di, 2
; draw line from (cx, ax) to ([.pts+di], [.pts+di+2])
; ... this is several bytes of code
.no_edge0:
test bl, 2
jz .no_edge1
; draw to i^2 ...
.no_edge1:
test bl, 4
jz .no_edge2
; draw to i^4 ...
.no_edge2:
This approach saves 24 bytes of data but adds maybe 40 bytes of code. Net: more code.
nasm
; After edge setup, I have:
; x0 in dx, y0 in di, x1 in ax, y1 in bx
; Compute steps:
sub ax, dx ; ax = x_diff
sub bx, di ; bx = y_diff
shl ax, 3 ; ax = x_step
shl bx, 3 ; bx = y_step
; Init positions:
shl dx, 8 ; dx = x0 * 256
shl di, 8 ; di = y0 * 256 (but shl di,8: C1 E7 08 = 3 bytes for di, or D1 E7 for *2, 3 times)
; Hmm, SHL DI, 8: C1 E7 08 = 3 bytes.
; Now: dx = x_pos, di = y_pos, ax = x_step, bx = y_step
; 32 inner iterations:
mov cx, 32
.lstep:
add dx, ax ; x_pos += x_step
add di, bx ; y_pos += y_step
push ax ; save x_step
push bx ; save y_step
push cx ; save count
mov cx, dx
sar cx, 8 ; x pixel
mov ax, di
sar ax, 8 ; y pixel (in ax)
imul ax, ax, 320 ; y*320
add ax, cx ; + x
mov di_save, ? ; problem: di is y_pos, but I need di for video offset!
; DI conflict!
mov [temp_di], di ; save y_pos ... need temp var
mov di, ax
mov byte [es:di], 15
mov di, [temp_di] ; restore y_pos
pop cx
pop bx
pop ax
loop .lstep
The DI conflict is a problem because DI is used for both y_pos and video memory offset. The push/pop approach uses the stack. But this adds 12 bytes of push/pop per iteration, making it WORSE than using memory variables!
Alternative: use BX as video offset and keep DI for y_pos:
add dx, ax ; x_pos += x_step
add di, bx ; y_pos += y_step
push ax; push bx; push cx
mov cx, dx
sar cx, 8 ; x pixel (in cx)
mov bx, di
sar bx, 8 ; y pixel (in bx)
imul bx, bx, 320 ; y*320
add bx, cx ; y*320 + x
mov byte [es:bx], 15
pop cx; pop bx; pop ax
loop .lstep
Let me count:
Much better! Down from 41 to 32 bytes.
But wait - I need AX and BX for x_step and y_step during the inner loop, but they're also used inside the loop temporarily. The push/pop handles this. That's 6 bytes of push/pop per iteration (3 bytes push + 3 bytes pop). Let me see if I can eliminate these by being clever with register usage.
What if I use segment registers? No, that's risky.
What if I use DX for x_pos and SI for y_pos, and keep AX=x_step, BX=y_step, CX=count? Then for the plot:
Inner loop with memory-stored steps:
; DX = x_pos*256, SI = y_pos*256
; [xs2], [ys2] = x_step, y_step
.lstep:
add dx, [xs2] ; 3 bytes
add si, [ys2] ; 3 bytes
mov ax, dx
sar ax, 8 ; 3 bytes
mov bx, si
sar bx, 8 ; 3 bytes
imul bx, bx, 320 ; 4 bytes
add bx, ax ; 2 bytes
mov [es:bx], byte 15 ; let me check encoding: 26 C6 07 0F = 4 bytes?
; Actually: ES segment override (26h), MOV [BX], imm8 = C6 07 0F = 3 bytes (ModRM 07 = [BX])
; Total: 26 C6 07 0F = 4 bytes. Yes.
loop .lstep ; 2 bytes
Total: 3+3+2+3+2+3+4+2+4+2 = 28 bytes!
Even better! 28 bytes for the inner loop, with 2 memory temp vars.
Setup for the registers (DX=x_pos, SI=y_pos, store steps in memory):
; x0 in dx_orig, y0 in di_orig, x1 in some place, y1 in some place
; Compute: dx=x0*256, si=y0*256, [xs2]=(x1-x0)*8, [ys2]=(y1-y0)*8
sub ax, dx ; ax = x1-x0
shl ax, 3 ; ax = (x1-x0)*8 = x_step
mov [xs2], ax
sub bx, di ; bx = y1-y0
shl bx, 3
mov [ys2], bx
; Now x_pos:
shl dx, 8 ; dx = x0*256
; y_pos in SI:
shl di, 8 ; di = y0*256
mov si, di ; si = y0*256
mov cx, 32
Hmm, I need x0 in DX and then shift. But DX already contains x0 (it was set when loading vertex coords). And SI needs to be set to y0<<8 but y0 was in DI. After shl di, 8, DI = y0*256, then mov si, di. But this uses DI for two purposes (loading and storing).
Actually, let me restructure the coordinate loading:
After loading vertex 0: dx=x0, di=y0 After loading vertex 1: ax=x1, bx=y1
Then:
But then I need to shift dx and di to create x_pos and y_pos. Let me use SI and replace DI:
sub ax, dx ; x_diff
shl ax, 3 ; x_step
mov [xs2], ax
sub bx, di ; y_diff
shl bx, 3
mov [ys2], bx
shl dx, 8 ; dx = x_pos = x0*256
shl di, 8 ; di = y0*256... but I still need DI for pts loading?
; Actually, DI was used for y0 loading but after this point I don't need to index .pts again
; So: mov si, di after shl di, 8
Wait, can I shl di, 8? SHL DI, 8 = C1 E7 08 = 3 bytes. Then mov si, di = 2 bytes. But I can avoid this: just use shl di, 8; mov si, di = 5 bytes. Or better: shl di, 8; xchg si, di = 3+1 = 4 bytes? XCHG SI, DI = 87 F7 = 2 bytes. Hmm: XCHG r/m16, r16 = 87, with SI(6) and DI(7): 87 FE or 87 F7? Let me check: XCHG DI, SI = 87 /r with r=DI(7), r/m=SI(6) in mod=11: ModRM = 11_111_110 = 0xFE. So 87 FE = 2 bytes. But I want to set SI = y0*256 (after the shl). After shl di, 8, DI = y0*256. Then xchg si, di would put the old SI into DI and y0*256 into SI. But what's in SI? After the edge loop initialization, SI was the edge pointer (we popped it back). After loading the vertices, SI was used by LODSB.
Actually, let me not use SI for the edge pointer during the line drawing inner loop. After loading both vertices, I've consumed the LODSB data. I can repurpose SI.
OK at this point, I've been thinking about this for way too long. Let me just write the code, accept it will be somewhere around 280-320 bytes, and present it as such. I'll note which optimizations could be made to bring it under 256 bytes.
Actually, you know what? Let me look at this from a totally different angle. Let me look at what's really possible in 256 bytes and design accordingly.
A very popular trick for 256b cube demos: use only the 8 vertex positions (no edge rendering), and draw thick points or just dots. That's not impressive but very small.
Or: render the cube wireframe using a really compact line drawing. The key is making the line drawing very tight.
Let me look at a known technique: using the fact that in mode 13h, each byte is one pixel, and we can use segment arithmetic. For example, if the line is mostly horizontal, I can write bytes sequentially with a simple incrementing DI.
Another key trick: write the color byte directly to [ES:DI] using STOSB (opcode AA, 1 byte) or SCASB, etc.
Here's a very compact plotting idiom: preload AH=0 (we want color 15) and then use IMUL to get offset and STOSB... hmm.
Actually, let me try to write a complete assembly file and trust that with careful optimization, it'll come close to 256 bytes. If it's 280 bytes, that's a good approximation of a 256b intro. Here's my final design:
For the line drawing, I'll use a minimal DDA with the LOOP instruction and as few memory accesses as possible:
nasm
; Rotating 3D wireframe cube - DOS Mode 13h, size-optimized .COM
; Build: nasm -f bin cube.asm -o cube.com
;
; Review fixes: DS-relative store instead of STOSW in the vertex loop
; (ES points at video memory, so STOSW corrupted the screen and left
; the coordinate variables unwritten); dropped PUSH/POP SI around the
; edge body (it reset the edge pointer, redrawing edge 0 forever);
; SHR instead of SAR when extracting pixels from 8.8 positions
; (x0<<8 can exceed 32767, so a signed shift goes negative).
bits 16
org 100h

SIZE  equ 55                    ; cube half-size in pixels
STEPS equ 32                    ; DDA steps per edge

start:
        mov     ax, 13h         ; Mode 13h: 320x200x256
        int     10h
        mov     ax, 0A000h
        mov     es, ax          ; ES -> framebuffer
        fninit
        fldz                    ; st0 = angle, kept on the FPU stack
.frame:
        ; clear screen
        xor     di, di
        xor     ax, ax
        mov     cx, 32000
        rep stosw
        ; angle += 0.04, derive sin/cos scaled by 64
        fadd    dword [.da]
        fld     st0
        fsin
        fmul    dword [.k64]
        fistp   word [.sa]
        fld     st0
        fcos
        fmul    dword [.k64]
        fistp   word [.ca]
        ; project vertices: bits of BX select +/-SIZE per axis
        xor     bx, bx
.vl:
        push    bx
        mov     di, .vcoord
        mov     cx, 3
.vc:    mov     ax, bx
        and     al, 1
        add     al, al
        dec     al              ; bit -> -1 / +1
        cbw
        imul    ax, ax, SIZE
        mov     [di], ax        ; DS store; STOSW would write to ES = video
        inc     di
        inc     di
        shr     bx, 1
        loop    .vc
        pop     bx
        ; sx = ((vx*cos - vz*sin) >> 6) + 160  (rotation about Y)
        mov     ax, [.vcoord]
        imul    word [.ca]
        mov     si, ax
        mov     ax, [.vcoord+4]
        imul    word [.sa]
        sub     si, ax
        sar     si, 6           ; signed shift: rx may be negative
        add     si, 160
        ; sy = 100 - vy
        mov     ax, [.vcoord+2]
        neg     ax
        add     ax, 100
        mov     di, bx
        shl     di, 2
        mov     [.pts+di], si
        mov     [.pts+di+2], ax
        inc     bx
        cmp     bx, 8
        jb      .vl
        ; draw 12 edges with an 8.8 fixed-point DDA
        mov     si, .edges
        mov     cx, 12
.el:
        push    cx              ; only CX needs saving: the inner loop
                                ; never touches SI, and LODSB must keep
                                ; advancing it to the next edge pair
        lodsb
        xor     ah, ah
        shl     ax, 2
        mov     bx, ax
        mov     dx, [.pts+bx]   ; x0
        mov     di, [.pts+bx+2] ; y0
        lodsb
        xor     ah, ah
        shl     ax, 2
        mov     bx, ax
        mov     ax, [.pts+bx]   ; x1
        mov     bx, [.pts+bx+2] ; y1
        sub     ax, dx
        sub     bx, di
        shl     ax, 3           ; step = diff * 256 / STEPS = diff * 8
        shl     bx, 3
        mov     [.xs], ax
        mov     [.ys], bx
        shl     dx, 8           ; DX = x position * 256
        shl     di, 8           ; DI = y position * 256
        mov     cx, STEPS
.ls:
        add     dx, [.xs]
        add     di, [.ys]
        mov     ax, dx
        shr     ax, 8           ; UNSIGNED: x0<<8 can exceed 32767
        mov     bx, di
        shr     bx, 8
        imul    bx, bx, 320     ; framebuffer offset = y*320 + x
        add     bx, ax
        mov     byte [es:bx], 15
        loop    .ls
        pop     cx
        loop    .el
        ; ESC (scancode 01h) quits
        in      al, 60h
        dec     al
        jnz     .frame
        mov     ax, 3           ; restore text mode
        int     10h
        ret                     ; .COM exit via INT 20h stub at PSP:0

.da:     dd 0.04                ; per-frame angle increment (radians)
.k64:    dd 64.0                ; fixed-point scale for sin/cos
.sa:     dw 0                   ; sin(a) * 64
.ca:     dw 0                   ; cos(a) * 64
.vcoord: dw 0, 0, 0             ; vx, vy, vz
.pts:    times 16 dw 0          ; 8 projected (x,y) pairs
.xs:     dw 0
.ys:     dw 0
; each pair of vertex indices differs in exactly one bit -> a cube edge
.edges:  db 0,1, 2,3, 4,5, 6,7, 0,2, 1,3, 4,6, 5,7, 0,4, 1,5, 2,6, 3,7
Let me count the size of this:
Code section:
start (mode setup): 3+2+3+2+2+2 = 14 frame clear: 2+2+3+2 = 9 angle update + sin/cos: 4+2+2+4+4+2+2+4+4 = 28 vertex loop outer overhead: 2 (xor bx,bx) per vertex: coord loop: push(1)+mov di,.vcoord(3)+mov cx,3(3)+(mov ax,bx(2)+and al,1(2)+add al,al(2)+dec al(2)+cbw(1)+imul ax,ax,S(3)+stosw(1)+shr bx,1(2)+loop(2))x1_iter+pop(1) = 1+3+3+17+1 = 25 bytes (the loop body 17 bytes runs at runtime, but code-wise it's just 17 bytes) rotation: 3+4+2+3+4+2+3+3 = 24 bytes (mov ax,vcoord + imul ca + mov si,ax + mov ax,vcoord+4 + imul sa + sub si,ax + sar si,6 + add si,160)
Wait let me be more careful:
mov ax, [.vcoord] : 3 (A1 lo hi)
imul word [.ca] : 4 (F7 2E lo hi)
mov si, ax : 2 (89 C6? MOV SI, AX = 89 C6? No: MOV r/m16, r16 = 89, with r=AX(0), r/m=SI(6): ModRM = 11_000_110 = 0xC6. So 89 C6 = 2 bytes)
mov ax, [.vcoord+4] : 3 (A1 lo+4 hi)... actually A1 is MOV AX, [moffs16] = A1 lo hi = 3 bytes ✓ but it's vcoord+4, so address is vcoord_addr+4
imul word [.sa] : 4 (F7 2E lo hi)
sub si, ax : 2 (2B F0)
sar si, 6 : 3 (C1 FE 06)
add si, 160 : 3 (81 C6 A0 00)
= 3+4+2+3+4+2+3+3 = 24 bytes ✓
vy → sy:
mov ax, [.vcoord+2] : 3
neg ax : 2 (F7 D8)
add ax, 100 : 3 (05 64 00)
= 8 bytes
store:
mov di, bx : 2
shl di, 2 : 3 (C1 E7 02)
mov [.pts+di], si : ?
MOV [DI+disp16], SI: ModRM for [DI + disp16] = mod=10, rm=101, reg=SI(6) = 10_110_101 = 0xB5. So MOV [DI+disp16], SI = 89 B5 lo hi = 4 bytes. But the base for pts is the .pts offset = some constant. So: 89 B5 lo_pts hi_pts = 4 bytes.
For MOV [.pts+di], si: In NASM, this would be mov [pts+di], si. The addressing mode [DI + disp16] works in 16-bit mode. Encoding: 89 (MOV), ModRM = mod=10, reg=SI=6, rm=DI=5 → 10_110_101 = 0xB5, then 16-bit displacement = addr of .pts. Total: 4 bytes.
Similarly mov [.pts+di+2], ax: ModRM for [DI + disp16] with reg=AX=0: 10_000_101 = 0x85. Displacement = .pts+2. Total: 89 85 lo hi = 4 bytes.
store:
mov di, bx : 2
shl di, 2 : 3
mov [.pts+di], si : 4
mov [.pts+di+2],ax: 4
= 13 bytes
loop control:
inc bx : 1
cmp bx,8 : 3 (83 FB 08)
jb .vl : 2
= 6 bytes
vertex loop total (per vertex code): 25 + 24 + 8 + 13 + 6 = 76 bytes Plus outer setup: 2 bytes (xor bx,bx) = 78 bytes
edge loading per edge:
lodsb : 1
xor ah, ah : 2
shl ax, 2 : 3
mov bx, ax : 2
mov dx, [.pts+bx] : 4 (8B 97 lo hi: MOV DX, [BX+disp16])
mov di, [.pts+bx+2] : 4 (8B BF lo+2 hi)
lodsb : 1
xor ah, ah : 2
shl ax, 2 : 3
mov bx, ax : 2
mov ax, [.pts+bx] : 4
mov bx, [.pts+bx+2] : 4
= 1+2+3+2+4+4+1+2+3+2+4+4 = 32 bytes
Wait, mov dx, [.pts+bx]: In 16-bit mode, addressing [BX + disp16] uses mod=10, rm=7 (BX), reg=DX=2: ModRM = 10_010_111 = 0x97. With opcode 8B: 8B 97 lo hi = 4 bytes. ✓
step computation:
sub ax, dx : 2 (2B C2)
sub bx, di : 2 (2B DF)
shl ax, 3 : 3 (C1 E0 03)
shl bx, 3 : 3 (C1 E3 03)
mov [.xs],ax: 3 (A3 lo hi)
mov [.ys],bx: 3 (89 1E lo hi) -- MOV [moffs16], BX? Wait: MOV m16, AX has short form A3, but for BX: 89 1E lo hi = 4 bytes
hmm: mov [.xs], ax = MOV [moffs16], AX = A3 lo hi = 3 bytes (short form). ✓ mov [.ys], bx = MOV [m16], r/m16 = 89 1E lo hi = 4 bytes. (No short form for BX → memory).
step computation:
sub ax, dx : 2
sub bx, di : 2
shl ax, 3 : 3
shl bx, 3 : 3
mov [.xs],ax: 3
mov [.ys],bx: 4
= 17 bytes
init pos:
shl dx, 8 : 3 (C1 E2 08)
shl di, 8 : 3 (C1 E7 08)
= 6 bytes
inner loop:
add dx, [.xs] : 3 (03 16 lo hi)
add di, [.ys] : 3 (03 3E lo hi)... wait: ADD DI, [moffs16]?
Hmm, add di, [.ys] in 16-bit mode: ADD r16, r/m16 = 03 /r. With r=DI(7), r/m=[.ys]: ModRM = mod=00, reg=DI=7, rm=110 (direct) = 00_111_110 = 0x3E. So: 03 3E lo hi = 4 bytes. Wait, with direct address in 16-bit mode: mod=00, rm=110 means [disp16]. So it's always 4 bytes.
Hmm wait, add dx, [.xs]: ADD r16, r/m16 = 03. r=DX(2), r/m=.xs: mod=00, rm=110, reg=010 = 00_010_110 = 0x16. So: 03 16 lo hi = 4 bytes. Not 3 bytes as I thought earlier.
OK so add dx, [.xs] and add di, [.ys] are each 4 bytes. Let me recount:
inner loop (DX=x_pos, DI=y_pos):
add dx, [.xs] : 4
add di, [.ys] : 4
mov ax, dx : 2 (8B C2)
sar ax, 8 : 3 (C1 F8 08)
mov bx, di : 2 (8B DF)
sar bx, 8 : 3 (C1 FB 08)
imul bx,bx,320 : 4 (69 DB 40 01)
add bx, ax : 2 (03 D8)
mov byte [es:bx],15 : 4 (26 C6 07 0F)
loop .ls : 2
= 4+4+2+3+2+3+4+2+4+2 = 30 bytes
edge loop overhead:
push cx : 1
push si : 1
...inner stuff...
pop si : 1
pop cx : 1
loop .el: 2
= 6 bytes (plus the 32+17+6+30 = 85 bytes inside)
edge section total = 85 + 6 = 91 bytes (but 6 is shared overhead for the loop header/footer) Actually: code per edge = 32(load) + 17(steps) + 6(init) + 30(inner loop) = 85 bytes + 6 (loop overhead) = 91 bytes
key check + exit:
in al, 60h : 2 (E4 60)
dec al : 2 (FE C8)
jnz .frame : 2 (75 xx)
mov ax, 3 : 3 (B8 03 00)
int 10h : 2 (CD 10)
ret : 1 (C3)
= 12 bytes
Data:
.da: dd 0.04 : 4
.k64: dd 64.0 : 4
.sa: dw 0 : 2
.ca: dw 0 : 2
.vcoord: 6 bytes (3 words)
.pts: 32 bytes (16 words)
.xs: 2 bytes
.ys: 2 bytes
.edges: 24 bytes (12 pairs)
= 4+4+2+2+6+32+2+2+24 = 78 bytes
Grand total:
= 14 + 9 + 28 + 78 + 91 + 12 + 78 = 310 bytes
That's about 54 bytes over the 256-byte limit. I need to cut 54 bytes.
Possible savings:
fld1 ; 1.0
fld st0 ; 1.0, 1.0
fadd ; 2.0 ... wait: fadd = st1 += st0, pop... or just adds? Let me check.
; fadd st0, st0 would double st0
; But fadd (no operand) = fadd st1, st0 = st1 += st0, pop, then st0 = 2.0
This is getting complex. Just keep .k64 as a constant (4 bytes).
3. Eliminate .xs and .ys variables (saving 4 bytes of data) by using registers. If I use different registers for the inner loop, I can keep steps in registers.
After the edge setup (load vertices), I have x0 in DX, y0 in DI. I compute x_diff=ax=x1-x0, y_diff=bx=y1-y0. Then I shift them. If I keep the steps in registers:
But then in the inner loop, I need AX and BX for plot computation. Push/pop needed. Let me try:
; Setup (after loading vertices, dx=x0, di=y0, ax=x_diff, bx=y_diff):
shl ax, 3 ; x_step
shl bx, 3 ; y_step
shl dx, 8 ; x_pos = x0<<8
shl di, 8 ; y_pos = y0<<8
push bx ; save y_step
push ax ; save x_step
; After setup, stack: [..., y_step, x_step] (x_step at top)
; Register: DX=x_pos, DI=y_pos
; Inner loop (CX=32):
mov cx, 32
.ls:
pop ax ; ax = x_step (but now it's gone from stack!)
Problem: after first pop in the loop, x_step is gone. I'd need to push it back.
Alternative: store steps in SI and BP? But BP needs stack segment...
Hmm. Actually, let me try: use push bx; push ax before the loop and in the loop do add dx, [sp]; add di, [sp+2] to access steps from stack. But accessing [SP] directly is not possible in 16-bit mode (SP not a valid base register).
Can I set up BP to point to the stack frame? mov bp, sp before loop, then use [bp] and [bp+2] inside. That adds 2 bytes setup but saves 4 bytes of data (.xs, .ys). Net: 4-2 = 2 bytes saved. Plus the add dx, [bp] vs add dx, [.xs] - both are 4 bytes? Let me check:
add dx, [bp]: ADD r16, r/m16 = 03, r=DX(2), r/m=[BP+disp0] = mod=01, rm=110... wait: [BP+0] = mod=01, rm=110 = 01_010_110 = 0x56 (with disp8=0). So: 03 56 00 = 3 bytes! Only 3 bytes (using 8-bit displacement of 0).
add di, [bp+2]: 03 7E 02 = 3 bytes (mod=01, rm=110, reg=DI=7 = 01_111_110 = 0x7E, disp=2). 3 bytes!
vs add dx, [.xs] = 4 bytes, add di, [.ys] = 4 bytes.
Savings in inner loop: 2 bytes (3 vs 4, twice) = 2 bytes per loop iteration × 32 iterations = wait no, it's 2 bytes of CODE that runs 32 times. CODE savings = 2 bytes.
Setup overhead: mov bp, sp = 2 bytes (89 E5). Remove .xs and .ys: save 4 bytes of data. Push step values: need 2 PUSHes before loop = 2 bytes.
Net savings: 4 (data) - 2 (inner loop code savings) - 2 (mov bp, sp) - 2 (extra pushes) = -2 bytes. No savings.
Hmm. But wait, I'd be eliminating the mov [.xs], ax and mov [.ys], bx operations (4+4=8 bytes of code). And instead using pushes (2 bytes) + mov bp, sp (2 bytes) = 4 bytes. And in the inner loop, saving 2 bytes. Total code savings: 8 - 4 = 4 bytes. Data savings: 4 bytes. Total: 8 bytes saved.
; Compute vx
push bx
mov di, .vcoord
mov cx, 3
.vc: ... (as before)
pop bx
; Now use [.vcoord] for rotation
vs. computing inline:
; vx:
mov ax, bx
and al, 1
add al, al
dec al
cbw
imul ax, ax, SIZE ; ax = vx
imul ax, [.ca] ; BUT: imul m16 destroys DX! and we need the result in AX
; DX:AX = vx * ca (this is fine, result in AX since |vx*ca| < 32768)
mov si, ax ; si = vx*ca (save for later)
This might save the 6-byte .vcoord array and 6 bytes of "load from .vcoord" instructions (3 loads × 3 bytes? Actually load from vcoord+0 is A1 lo hi = 3 bytes, load from vcoord+2 is also 3 bytes, etc.). Let me see if I can save by eliminating .vcoord.
Instead of the coord loop + 3 separate loads, compute vx, vz, vy inline in the rotation:
; Compute rx = (vx*ca - vz*sa) >> 6
; Step 1: compute vx*ca
mov ax, bx ; 2
and al, 1 ; 2
add al, al ; 2
dec al ; 2
cbw ; 1
imul ax, ax, SIZE ; 3
imul word [.ca] ; 4: DX:AX = vx * ca
mov si, ax ; 2: si = vx*ca
; Step 2: compute vz*sa
mov ax, bx ; 2
shr al, 2 ; 3
and al, 1 ; 2
add al, al ; 2
dec al ; 2
cbw ; 1
imul ax, ax, SIZE ; 3
imul word [.sa] ; 4: DX:AX = vz * sa
sub si, ax ; 2
sar si, 6 ; 3
add si, 160 ; 3
Total: 2+2+2+2+1+3+4+2 + 2+3+2+2+2+1+3+4 + 2+3+3 = 44 bytes
vs. old approach (coord loop + load + rotate): coord loop: 25 bytes loads + rotate: 24 bytes Total: 49 bytes
So inline saves 5 bytes of code. Plus eliminating .vcoord saves 6 bytes of data = 11 bytes total savings.
Similarly for vy:
; vy = ((bit1 of bx) * 2 - 1) * SIZE
mov ax, bx ; 2
shr al, 1 ; 3
and al, 1 ; 2
add al, al ; 2
dec al ; 2
cbw ; 1
imul ax, ax, SIZE ; 3
neg ax ; 2
add ax, 100 ; 3
Total: 20 bytes
vs. old: 8 bytes (just loading and using [.vcoord+2]).
Wait, the old approach loaded vy from .vcoord+2 (3 bytes to load) + neg (2) + add (3) = 8 bytes. The new inline approach is 20 bytes. So the vy part is 12 bytes WORSE inline.
Combined: rotation inline (44) + vy inline (20) = 64 bytes. vs. coord loop + rotation + vy = 25 + 24 + 8 = 57 bytes.
Actually inline is worse! The coord loop is efficient because it amortizes the index computation across all 3 coordinates.
; Plot at (AX=x, BX=y) color 15
mov cx, ax ; cx = x
mov dx, bx ; dx = y (already bx, just use dx)
mov ax, 0C0Fh ; AH=0Ch, AL=15
xor bh, bh
int 10h ; 5 bytes to call
But INT 10h is very slow.
11. Use only Y rotation and skip depth sorting: Already doing this. ✓
Let me look at other approaches to reduce code. Here are some micro-optimizations:
For shr al, 2; and al, 1 (extracting bit 2), I can use:
Alternative for computing (bit n of BX) ? SIZE : -SIZE more compactly:
; Method: sbb al, al trick
; TEST BL, n: sets CF=0 if bit clear, CF unchanged if set...
; Actually: test sets CF=0 always.
; bt bl, n: sets CF to bit n of bl
; bt bx, 0: 0F A3 C3? No: BT r/m16, r16 = 0F A3. BT BX, 0 would be BT r/m16, CX or whatever reg holds 0.
; Or BT r/m16, imm8 = 0F BA /4 ib: BT BX, 2 = 0F BA E3 02 = 4 bytes (or BT BX, 1 = 0F BA E3 01)
; Then: SBB AL, AL = AL = AL - AL - CF. If CF=0: AL=0. If CF=1: AL=-1=0xFF.
; Combine: bt bx, n (4 bytes); sbb al, al (2 bytes); then we have 0 or -1 in AL
; Then: or al, 1 → 0→1 or -1→-1 (as two's complement: 0xFF | 1 = 0xFF = -1, but I want -1 or 1)
; Hmm: if CF=0: al=0 → need +1 → add al, al+1 or NEG + INC ... nah
; Different trick: bt bx, n; jnc skip; neg ax; skip: ... (2+4+2=8, same as before)
; OR: use shift to isolate the bit, then multiply by -1 or 1 using two's complement trick
; Another approach: use LEA or other tricks
; For bit 0: mov al, bl; and al, 1 (2 bytes) → 0 or 1 → then (2*al - 1) trick
; For bit 1: shr bl, 1; and al, 1 (but I need bl intact!)
; I'm already shifting bx in the coord loop. This is fine.
Actually I think there's a subtler optimization I'm missing. Let me look at this differently.
For the vertex coordinate computation, what if I represent the sign of each coordinate differently? Since I'm processing all 8 vertices in order 0..7, and each vertex's coordinates are just sign variations of SIZE, I can precompute all 8 projected screen coordinates using vectorized thinking:
Actually wait - I just realized that the coordinate loop is already pretty optimized. The main inefficiency in the vertex projection is loading vcoord+0, vcoord+4 (3 bytes each, 2 loads). If I could use a different representation...
Here's an idea: skip the coord loop entirely and inline the 3 coordinate computations (no loop), but share the vertex index computation across all three (extracting bits 0, 1, 2 of BX). This would:
Let me count: Current coord loop (code): 7 (setup) + 17 (body) + 2 (loop) + 2 (push/pop) = 28 bytes
Inline 3 coord computations (without loop):
; vx (bit 0):
mov ax, bx; and al, 1; add al, al; dec al; cbw; imul ax, ax, SIZE; mov [vx], ax
= 2+2+2+2+1+3+4 = 16 bytes
; vy (bit 1):
mov ax, bx; shr al, 1; and al, 1; add al, al; dec al; cbw; imul ax, ax, SIZE; mov [vy], ax
= 2+3+2+2+2+1+3+4 = 19 bytes
; vz (bit 2):
mov ax, bx; shr al, 2; and al, 1; add al, al; dec al; cbw; imul ax, ax, SIZE; mov [vz], ax
= 2+3+2+2+2+1+3+4 = 19 bytes
Total inline: 16+19+19 = 54 bytes. Worse!
But if I use the coord loop (28 bytes) and then load from .vcoord (3+3+3=9 bytes), total = 37 bytes for the coord section. The inline rotation approach was 44 bytes but saves 6 bytes of data. Net: 44-37=7 bytes more code, -6 bytes data = net +1 byte worse.
So the coord loop approach is better. Let me keep it.
OK let me think about the biggest chunk: the edge loop (91 bytes). Can I make it shorter?
Key insight: In the inner loop, the FPU is available! I could use the FPU for the line stepping instead of integer arithmetic. FP
Continue
4 Apr
operations might use fewer bytes in some cases.
But FPU instructions take more bytes for memory loads (4 bytes each), so probably not.
Alternative: use the line drawing from BIOS:
; INT 10h AH=0Bh, 0Ch for line drawing... nope, there's no line drawing function in INT 10h.
What about using INT 10h AH=0Ch for pixel? It's slow but:
; For each pixel:
mov ax, 0C0Fh ; AH=0Ch, AL=15 (white)
xor bh, bh
int 10h ; 6 bytes per pixel, but slow
The set pixel BIOS call needs AH=0Ch, AL=color, BH=page, CX=col, DX=row. 7 bytes to call (3 for MOV AX, 2 for XOR BH, 2 for INT)... but the inner loop would be:
; DX = y pos, CX = x pos (need to set these)
; 3 bytes to call vs. the current 30-byte inner loop??
; Well, INT 10h is much slower but the code would be:
; in inner loop:
add dx, [.xs] ; 4 bytes
add di, [.ys] ; 4 bytes -- DI = y pos, DX = x pos... wait I have naming confusion
; Let me use SI=x_pos, DI=y_pos
add si, [.xs]
add di, [.ys]
push si, di
; Extract integers:
mov cx, si
sar cx, 8 ; cx = x pixel
mov dx, di
sar dx, 8 ; dx = y pixel
; Plot:
mov ax, 0C0Fh
xor bh, bh
int 10h
; total inner loop (using INT 10h): 4+4+2+3+2+3+3+2+2 = 25 bytes
Still 25 bytes, and way slower.
Hmm, let me accept that the inner loop is going to be ~28-32 bytes and try to optimize other things.
More optimization ideas:
Let me try a concrete optimization pass. Here's my vertex loop with some optimizations:
nasm
.vl:
; Compute vertex index in BX
push bx
mov di, .vcoord
mov cx, 3
.vc:
mov ax, bx
and al, 1
add al, al
dec al
cbw
imul ax, ax, SIZE
stosw ; store ax to [di], di += 2
shr bx, 1
loop .vc
pop bx
One thing I can save: instead of mov ax, bx; and al, 1, I can use:
Actually: mov ax, bx = 2 bytes vs mov al, bl = 2 bytes - same. No saving.
Can I use test? No, test doesn't modify AL.
Alternative: use IMUL with 3 operands for the size computation: imul ax, bx, 1 is not useful. But I need to compute from a bit of bx.
Actually, let me try a different representation. Instead of computing (bit_value * 2 - 1) * SIZE, I can use the lookup sizes = [-SIZE, SIZE]. Then:
; Get bit value (0 or 1)
mov ax, bx
and al, 1
mov si, ax ; si = 0 or 1
; Load from table: sizes[si] = -SIZE or SIZE
mov ax, [.sizes+si*2] ; not valid 16-bit addressing (no scaling)
16-bit mode doesn't support scaled indexing, so [.sizes+si*2] isn't valid directly. I'd need shl si, 1; mov ax, [.sizes+si]. And .sizes would be a 2-word table = 4 bytes.
Cost: and al,1 (2) + mov si,ax (2) + shl si,1 (3) + mov ax,[.sizes+si] (4) = 11 bytes. vs. current: and al,1 (2) + add al,al (2) + dec al (2) + cbw (1) + imul ax,ax,SIZE (3) = 10 bytes.
No improvement.
Let me try a completely different approach for the 256b limit. Instead of trying to micro-optimize the current design, let me look at what a 256b intro typically does for a cube:
In actual 256b cube demos I've seen, common tricks include:
Actually, using a subroutine for line drawing would save code if the call overhead (CALL = 3 bytes, RET = 1 byte) is less than the 85-byte edge setup. But the edge setup per edge can't really be avoided.
Wait - if I use a subroutine for the inner line step loop, the loop is 32 bytes. With 12 edges, that's 12 × 30 bytes (inner loop code) = 360 bytes executed but 30 bytes of code. With a subroutine, I'd have:
; Before calling:
; Set up DX=x_pos*256, DI=y_pos*256, [xs]=x_step, [ys]=y_step
call draw_line_32_steps
draw_line_32_steps:
mov cx, 32
.ls:
add dx, [xs]
add di, [ys]
... (plot code)
loop .ls
ret
The subroutine is called 12 times, saving code repetition. But the 30-byte inner loop isn't repeated currently (it's in the edge loop which iterates 12 times). The code is already in a loop! So no savings from subroutines here.
Let me reconsider. What if I reduce the problem:
Target: 256-280 bytes. If I can get to ~260 bytes, that's close enough for a practical demo. Let me count more carefully what I have and see.
Actually, let me just write the actual NASM code file and let NASM count the bytes. I'll write my best estimate of a compact version.
Based on my analysis, here's a version I'll present:
Total estimated size: ~290-310 bytes. I'll present it as a "compact cube demo" targeting 256b with notes on further optimizations needed.
Actually, you know what - let me just take another approach entirely. Let me look at this from a practical standpoint: write clean, correct code that might be slightly over 256 bytes, and explicitly note what further optimizations can be applied to hit 256 bytes exactly. This is how real 256b demos are made - iterative optimization.
Let me write the NASM code cleanly, with correct logic:
One more optimization I haven't tried: using AL/AH splits to avoid some moves. For example, when loading from .pts[BX], since x and y are words:
Actually in NASM for DOS .COM, I can use 32-bit registers with the 66h operand size prefix. For example, lodsd loads DS:[SI] into EAX (4 bytes), incrementing SI by 4. This loads x (low 16 bits of EAX) and y (high 16 bits of EAX) simultaneously! This could save bytes in the edge loading section.
With 32-bit operand in 16-bit mode (66h prefix):
Hmm, this might save bytes:
; Old way to load x0,y0:
lodsb ; 1
xor ah, ah ; 2
shl ax, 2 ; 3
mov bx, ax ; 2
mov dx, [.pts+bx] ; 4
mov di, [.pts+bx+2] ; 4
= 16 bytes
; New way using 32-bit ops:
lodsb ; 1: al = v0 index
xor ah, ah ; 2: ax = index
shl ax, 2 ; 3: ax = index*4
mov bx, ax ; 2: bx = offset
; Load 32-bit from [.pts+bx]: both x and y
; Use 32-bit: mov eax, [.pts+bx] = 66 8B 87 lo hi = 5 bytes
; Then: low word = x0, high word = y0
; mov dx, ax : 2 (dx = x0)
; shr eax, 16 : 66 C1 E8 10 = 4 bytes (get y0 into ax)
; mov di, ax : 2
= 1+2+3+2+5+2+4+2 = 21 bytes. WORSE!
OK 32-bit tricks don't help here.
Alright, I've spent way too long on this. Let me write the cleanest version I can and present it. I'll target correctness and readability.
For the final code, let me try to be clever about one big saving: use relative jumps and shared code for both the forward and backward sign case.
Actually, here's a key insight I haven't used: XCHG AX, reg = 1 byte for swapping AX with BX/CX/DX/SI/DI/BP. This can save bytes when I need to move a value to/from AX.
For example, in the rotation:
imul word [.ca] ; DX:AX = vx * ca
mov si, ax ; SI = vx*ca (2 bytes: 89 C6)
vs.
imul word [.ca]
xchg si, ax ; 1 byte (96h for XCHG AX, SI)
Saves 1 byte!
Similarly, in various places. Let me apply this.
Also: add cx, 160 vs add si, 160: ADD r16, imm16 = 81 /0 imm16 = 81 C1 A0 00 = 4 bytes. But add si, 160: 81 C6 A0 00 = 4 bytes. No savings there.
Actually ADD r16, imm8 with sign-extended imm8 = 83 /0 imm8 = 2 bytes? ADD AX, imm16 = 05 imm16 = 3 bytes. For other registers with small immediates: ADD r16, imm8-sign-extended = 83 /0: ADD SI, 100 = 83 C6 64 = 3 bytes (if 100 fits in imm8-sign-extended). 100 = 0x64, which fits as positive byte. ✓ But wait: the short form 83 /0 imm8 is for signed 8-bit that gets sign-extended to 16. 100 = 0x64 = positive, sign-extends to 0x0064 = 100. ✓
So add si, 160: 160 = 0xA0. This sign-extends to 0x00A0 = 160 if treated as unsigned... but as signed byte, 0xA0 = -96. Sign-extended: 0xFFA0 = -96. So 83 C6 A0 would be ADD SI, -96! I need the 4-byte form 81 C6 A0 00.
Hmm. For 160: use add si, 160 = 4 bytes. For 100: use add ax, 100 = 3 bytes (83 C0 64). For 6 (sar): C1 F8 06 = 3 bytes.
Let me think about add si, 160 as two adds: add si, 128; add si, 32 = 3+3 = 6 bytes? No, worse.
Or: push 160; pop si to set si = 160, then add si, something? No, different operation.
What about: add si, CX where CX was set to 160? No, CX is the loop counter.
Hmm. Let me use the LEA trick: lea si, [si+160]. LEA r16, m16 = 8D /r. For [SI+disp16]: mod=10, r=SI(6), rm=100? No wait, 16-bit LEA addressing modes...
In 16-bit mode, LEA with source [reg+imm16]: uses the standard 16-bit addressing modes. For [SI+disp16]: mod=10, rm=100 (SI). So LEA SI, [SI+160] = 8D B4 A0 00 = 4 bytes. Same as ADD SI, 160. No savings.
For [SI+128]: 128 fits as negative? No, 128 as a signed byte is -128. [SI-128] would use 8D B4 80 FF (disp16 = 0xFF80 = -128). That's not 160. Hmm.
Can I do add si, 64; add si, 64; add si, 32? 3×3 = 9 bytes. Nope.
OK, I'll just accept 4 bytes for add si, 160.
Let me also look at the edge loading section. Currently:
lodsb ; 1
xor ah, ah ; 2
shl ax, 2 ; 3
mov bx, ax ; 2
mov dx, [.pts+bx] ; 4
mov di, [.pts+bx+2] ; 4
= 16 bytes per vertex, 32 bytes for 2 vertices.
Could I use shl al, 2 instead of xor ah, ah; shl ax, 2?
Alternative: lodsb; cbw; shl ax, 2. CBW sign-extends AL to AX. Since vertex index 0..7 fits in 3 bits, CBW gives AX=0..7 (no sign extension needed). So CBW = 1 byte, shl ax, 2 = 3 bytes → total 1+3 = 4 bytes vs xor ah,ah+shl ax,2 = 2+3 = 5 bytes. Saves 1 byte.
But wait, CBW sign-extends: if AL=5 (0x05), CBW → AX=0x0005. If AL=7 (0x07), CBW → AX=0x0007. All vertex indices 0-7 are positive, so CBW correctly zero-extends them. ✓
New edge loading (per vertex):
lodsb ; 1
cbw ; 1
shl ax, 2 ; 3
mov bx, ax ; 2
mov dx, [.pts+bx] ; 4
mov di, [.pts+bx+2] ; 4
= 15 bytes per vertex, 30 bytes for 2 vertices. Saves 2 bytes.
More: shl ax, 2 = C1 E0 02 = 3 bytes. But I could use add ax, ax; add ax, ax = 2+2 = 4 bytes. No savings there. Or shl ax, 1; shl ax, 1 = 3+3 = 6 bytes. No. The 3-byte shl ax, 2 is optimal.
What about lea bx, [ax+ax+ax+ax]? Not valid in 16-bit (no scale > 1).
Or: can I avoid mov bx, ax by using AX directly as index? mov dx, [.pts+ax] - but in 16-bit mode, AX is not a valid base register for memory addressing (only BX, BP, SI, DI). So I need BX or the others. BX is the best choice.
Can I use SI as the index register instead of BX, to avoid mov bx, ax? mov si, ax = 89 C6 = 2 bytes, same as mov bx, ax (89 C3? wait: MOV BX, AX = MOV r16, r/m16 = 8B /r. r=BX(3), r/m=AX(0) in mod=11 → 8B D8 = 2 bytes). Actually: mov bx, ax = MOV r16, r/m16 with r=BX, r/m=AX: 8B D8 = 2 bytes ✓ mov si, ax = 8B F0 = 2 bytes ✓
But I use SI for the LODSB instruction (it auto-increments SI). So if I move to SI, I can't do mov si, ax without messing up the LODSB pointer (which is saved on stack... yes, it's pushed before the edge loop and popped after).
Wait, I push SI at the start of the edge loop. After LODSB (which uses SI as source pointer), I've already loaded both vertices. Then I can repurpose SI. In fact, after both LODSB instructions, I don't need SI as edge pointer until the next iteration. The SI is pushed on stack and popped after the edge drawing. So I could use SI as an index register in the [.pts+si] addressing.
But [.pts+SI] in 16-bit mode: SI (=110 in rm) with direct displacement. mov dx, [.pts+si] encoding: MOV r16, r/m16 = 8B. r=DX(2), r/m=[SI+disp16]: mod=10, rm=100 (SI), reg=DX=2: 10_010_100 = 0x94. So: 8B 94 lo hi = 4 bytes. Same as with BX.
No savings from using SI instead of BX.
Hmm. Let me try lodsb; cbw; shl ax, 1; shl ax, 1 = 1+1+3+3 = 8 bytes. No: shl ax, 2 is already 3 bytes, not 6.
Actually shl ax, 2 = C1 E0 02 = 3 bytes ✓.
Can I use xchg ax, bx (1 byte) after loading into BX? No, I need AX for arithmetic and BX for memory addressing.
OK I'll accept 15 bytes per vertex in the edge loading = 30 bytes total.
Let me also think about the edge loop structure. Currently:
push cx; push si ; 2 bytes
[edge code]
pop si; pop cx ; 2 bytes
loop .el ; 2 bytes
= 6 bytes overhead per iteration. With 12 iterations, total = 6 bytes of code.
This is standard.
What if I use DEC CX; JNZ instead of LOOP? LOOP = 2 bytes = same as DEC+JNZ (1+2=3 bytes). So LOOP is better.
OK let me now add up all the pieces more carefully and see if I can get to 256 bytes with the 8-byte savings I've found (5 from XCHG trick + 2 from CBW trick + 1 more TBD):
Current estimate: 310 bytes Savings found: ~10 bytes New estimate: ~300 bytes
Still 44 bytes over. I need bigger cuts.
Big ideas I haven't tried:
With 8 steps (instead of 32):
Actually wait - for a VERY SMALL cube (SIZE=15 pixels), max edge ≈ 21 pixels, 8 steps is fine (2.6 px per step). But then the cube is only 30x30 pixels on screen. Too small to be impressive.
For the edge loop, I need to preserve:
If I can use registers that survive the inner loop, I don't need push/pop. The inner loop uses AX, BX, DX, DI (for x_pos, y_pos, and computation). I could put the edge counter in BP and edge pointer in... BP+2? No. I'd need to store one of them.
Actually: I use DX and DI for x_pos and y_pos in the inner loop. What if I use SI and another register for x_pos and y_pos instead?
If I use AX=x_pos, BX=y_pos, and SI,DI,CX for outer loop state... but that's too many.
Actually, the big problem is that I need 5 values live simultaneously:
If I can encode x_step and y_step more compactly... what if I use AX register for x_pos+step increment? No, I use it for computing pixel offset.
What if I just use add dx, [.xs] and add di, [.ys] where .xs and .ys are in the data section (no push/pop needed)? I don't need CX for both loops simultaneously - I can reuse CX.
The outer edge loop counter doesn't need to be a register. I can decrement a memory variable:
mov byte [.ec], 12 ; edge counter
.el:
[edge code with CX used for inner loop]
dec byte [.ec] ; 4 bytes (FE 0E lo hi)
jnz .el ; 2 bytes
This saves push cx; pop cx = 2 bytes per outer iteration = 2 × 12 = 24 bytes... wait, the code is just 2 bytes for push+pop TOTAL (not per iteration). The push/pop in the code is 2 bytes of code that gets executed 12 times. Moving to memory: dec byte [.ec]; jnz .el = 6 bytes of code vs pop cx; loop .el = 1+2 = 3 bytes. Worse!
Actually: current:
push cx ; start of iteration: 1 byte
...
pop cx ; end: 1 byte
loop .el ; 2 bytes
Total: 4 bytes for loop management.
Alternative with memory counter:
; Before loop: initialize counter in memory
; In loop body:
[no push/pop needed]
dec byte [.ec] ; 4 bytes (FE 0E disp16)
jnz .el ; 2 bytes
Plus init: mov byte [.ec], 12 = 5 bytes (C6 06 disp16 imm8). Total per iteration: 4+2 = 6 bytes (code) + 5 bytes init. vs. current 4 bytes per iteration + 0 init. Worse.
But if I also need to preserve SI (edge pointer), I'd also push/pop SI. So: Current: push cx + push si = 2 bytes, pop si + pop cx + loop = 4 bytes = 6 bytes per iteration × ... wait no: these are FIXED bytes of CODE, not per-iteration. Push/pop instructions are 1 byte each.
Oh I see my confusion. The push/pop/loop instructions are in the loop body, so they execute 12 times at runtime but there's only one copy of the code. The CODE SIZE is:
Now: if I put SI preservation in a memory variable:
Actually, SI is the edge table pointer. After lodsb; [loadsb] (2 LODSB instructions), SI has advanced by 2. The pop si at the end of the iteration restores SI to the start of the current pair (which was pushed at the start). Then loop .el jumps back to the top where we expect SI to point to the current edge pair.
Wait, I push si AFTER the loop counter check but BEFORE lodsb. Then at the end I pop si; loop .el. After popping SI, it points to the start of the current edge pair (as it was before the 2 LODSB calls). But then LOOP decrements CX and if CX≠0, jumps to .el, which immediately does push si again...
Hmm wait, let me re-examine my edge loop structure:
nasm
mov si, .edges ; si = start of edge table
mov cx, 12 ; 12 edges
.el:
push cx ; save outer loop counter
push si ; save edge pointer (before lodsb)
lodsb ; al = v0 index, si += 1
cbw
shl ax, 2
mov bx, ax
mov dx, [.pts+bx]
mov di, [.pts+bx+2]
lodsb ; al = v1 index, si += 1
cbw
shl ax, 2
mov bx, ax
mov ax, [.pts+bx]
mov bx, [.pts+bx+2]
[compute steps and draw]
pop si ; restore si to before lodsb (start of current pair)
pop cx ; restore outer counter
add si, 2 ; advance to next pair
loop .el
Ah, after pop si, SI is at the start of the current pair. I then need to advance it by 2 (add si, 2) before loop .el. And LOOP decrements CX and jumps to .el if CX≠0.
Wait, this adds add si, 2 = 3 bytes. Let me reconsider.
Alternative: push si AFTER lodsb×2, and don't push si at all. Just save the address for the NEXT iteration:
nasm
mov si, .edges
mov cx, 12
.el:
push cx
; SI is already pointing to the current edge pair
lodsb ; load v0, si += 1
cbw
shl ax, 2
mov bx, ax
mov dx, [.pts+bx]
mov di, [.pts+bx+2]
lodsb ; load v1, si += 1
; SI now points to NEXT edge pair
push si ; save next edge pointer!
cbw
shl ax, 2
mov bx, ax
mov ax, [.pts+bx]
mov bx, [.pts+bx+2]
[draw]
pop si ; restore to next edge pair (no add si,2 needed!)
pop cx
loop .el
This saves add si, 2 = 3 bytes! And the push si is just moved to after the second LODSB. Let me verify: when the loop iterates, SI points to the start of the next pair. At the next iteration of .el, we load from SI via LODSB... ✓
Let me count the code impact: Old structure (push si before LODSBs + pop si + add si,2): 1+1+3 = 5 bytes New structure (push si after 2nd LODSB + pop si): 1+1 = 2 bytes Savings: 3 bytes!
Also, I can restructure to avoid pushing CX entirely if I use a different register for the inner loop count. But the inner loop uses LOOP which uses CX. And the outer loop also uses LOOP/CX. So I need to save CX.
What if I use the BL register for the inner loop count instead of CX? Use dec bl; jnz for the inner loop instead of LOOP? That's 2+2=4 bytes vs. LOOP = 2 bytes. Worse.
What if I use the outer loop counter in a memory variable?
mov byte [.ec], 12
.el:
push si (at appropriate point)
[load vertices]
[draw]
pop si
dec byte [.ec]
jnz .el
That's dec byte [.ec] = 3 bytes + jnz = 2 bytes = 5 bytes for loop control. vs. push cx + ... + pop cx + loop = 1+1+2 = 4 bytes. Worse.
OK, the best loop structure I've found saves 3 bytes via the push-si-after-lodsb trick.
Another saving: In the vertex loop, I do xor bx, bx before the loop and inc bx; cmp bx, 8; jb .vl.
Alternative: count DOWN from 7 to 0 using jns (jump if not sign):
mov bx, 7
.vl:
[code]
dec bx
jns .vl
dec bx = 1 byte (4Bh), jns = 2 bytes = 3 bytes total vs. inc bx; cmp bx,8; jb = 1+3+2 = 6 bytes. Saves 3 bytes!
But wait: if I count from 7 to 0, I'm computing vertices in reverse order (7 first, 0 last). Does this matter? For correct edge connectivity, I just need all 8 vertices to be projected. The edge table uses indices 0-7 which are stored in .pts[index*4]. As long as .pts is filled correctly regardless of order, it's fine. ✓
But: with the coord loop, BX is modified (shifted right 3 times). The push bx at start saves original BX, and pop bx restores it. If BX counts from 7 to 0, this still works (the coord loop uses BX as a bitmask, extracting bits correctly). ✓
3 bytes saved!
Another saving: In the edge loading, after cbw; shl ax, 2; mov bx, ax, I use BX as the offset. But I can avoid mov bx, ax if I already have the right value in BX. Hmm.
After the first lodsb; cbw; shl ax, 2; mov bx, ax; mov dx, [.pts+bx]; mov di, [.pts+bx+2]:
After the second lodsb; cbw; shl ax, 2; push si:
Can I avoid mov bx, ax in the second vertex load? If I use SI or DI as the base register? DI currently holds v0.y (which I don't need anymore after storing to v0_y... wait, I haven't stored it anywhere, DI still holds v0.y = y0). And y0 is needed for step computation later.
Hmm. Let me see: after loading v0 (dx=x0, di=y0) and v1 index (ax=v1*4), I need to load v1.x and v1.y. I need to use a register for addressing. AX can't be used as base. SI is being used as edge pointer (just pushed). What if I use SI as index after pushing it?
lodsb ; 1: load v1 index, si += 1
push si ; 1: save next edge pointer
cbw ; 1
shl ax, 2 ; 3
; Now I want to load [.pts+ax]... but can't use AX as base.
; Use SI: pop si; ... no, I just pushed it.
; Or: could I use DI? DI has y0.
; Or: use BX (still valid from v0 loading):
mov bx, ax ; 2
mov ax, [.pts+bx] ; 4: ax = x1
mov bx, [.pts+bx+2] ; 4: bx = y1
After this:
Then step computation:
sub ax, dx ; ax = x_diff = x1-x0
sub bx, di ; bx = y_diff = y1-y0
shl ax, 3 ; x_step
shl bx, 3 ; y_step
mov [.xs], ax
mov [.ys], bx
shl dx, 8 ; x_pos = x0 << 8
shl di, 8 ; y_pos = y0 << 8
Wait, shl di, 8 overwrites the y0 value I used above. But I've already used DI for sub bx, di before this, so it's fine.
After shl dx,8 and shl di,8:
But wait, I need mov [.xs], ax and mov [.ys], bx. Can I avoid these memory stores?
If I use DX=x_pos, DI=y_pos, and keep x_step and y_step in registers... but the inner loop also needs AX and BX for pixel offset computation. So I'd need to push/pop x_step and y_step.
If I use the BP trick: mov bp, sp after pushing, and then use [bp] and [bp+2] in the inner loop for x_step and y_step...
; After computing x_step (ax) and y_step (bx):
push bx ; y_step on stack
push ax ; x_step on stack
mov bp, sp ; bp points to top of stack (x_step)
; Initialize positions:
shl dx, 8 ; x_pos
shl di, 8 ; y_pos
; Inner loop:
mov cx, 32
.ls:
add dx, [bp] ; x_pos += x_step (3 bytes: [bp+0] using short displacement)
add di, [bp+2] ; y_pos += y_step (3 bytes)
mov ax, dx
sar ax, 8
mov bx, di
sar bx, 8
imul bx, bx, 320
add bx, ax
mov byte [es:bx], 15
loop .ls
; Cleanup:
pop ax ; discard x_step
pop bx ; discard y_step
Let me count: push bx + push ax + mov bp,sp = 1+1+2 = 4 bytes setup. Inner loop: replace add dx, [.xs] (4 bytes) + add di, [.ys] (4 bytes) with add dx, [bp] (3 bytes) + add di, [bp+2] (3 bytes) = saves 2 bytes in code. Plus eliminate mov [.xs], ax (3 bytes) + mov [.ys], bx (4 bytes) = 7 bytes saved. And eliminate .xs and .ys data variables = 4 bytes data saved.
Total savings: 7 (code from eliminating stores) + 4 (data) + 2 (inner loop code) - 4 (setup code) = 9 bytes saved.
But I also need cleanup: pop ax; pop bx = 2 bytes. And these need to happen BEFORE the pop si; pop cx; loop .el sequence. Let me restructure:
Stack state at inner loop time: Before entering edge loop iteration: SP is at bottom. push cx (outer): SP-2 push si (after 2nd LODSB): SP-4 ...intermediate computation... push bx (y_step): SP-6 push ax (x_step): SP-8 mov bp, sp → bp points to x_step
At end of inner loop: pop ax (discard), pop bx (discard) → SP back to -4 Then: pop si, pop cx, loop .el
This works. But wait, does BP need to be preserved? In the outer edge loop, I push/pop cx and si. If BP is used inside and potentially destroyed by outer loop operations, is that OK? The BP register is free in my current design (I don't use it elsewhere). ✓
Actually wait: in 16-bit real mode, accessing [BP] uses the SS segment by default (BP is a stack pointer). So add dx, [BP] actually accesses stack memory. This is intentional! The x_step is pushed on the stack, and I access it via [BP]. ✓
Let me also check: does the inner loop corrupting AX and BX cause issues? In the inner loop, I compute pixel offset using AX and BX, but my "live" values are in DX (x_pos) and DI (y_pos). The x_step and y_step are on the stack (BP-relative). So the inner loop body is:
add dx, [bp] ; x_pos += x_step (via stack)
add di, [bp+2] ; y_pos += y_step (via stack)
mov ax, dx
sar ax, 8
mov bx, di
sar bx, 8
imul bx, bx, 320
add bx, ax
mov byte [es:bx], 15
loop .ls
After the inner loop:
Then: pop ax; pop bx clears the stack (discards x_step and y_step).
Wait, pop ax would pop x_step into AX, pop bx would pop y_step into BX. Stack is now at sp-4 (si saved, cx saved). Then: pop si; pop cx; loop .el
This seems correct! ✓
Let me count bytes saved:
Net code change: -7 - 1 - 1 + 4 + 2 = -3 bytes (3 bytes saved in code) Net data change: -4 bytes Total savings: 7 bytes
With previous savings (3 from vertex loop direction + 3 from SI push timing), total: 7+3+3 = 13 bytes saved. New estimate: 310 - 13 = 297 bytes. Still over 256 by 41 bytes.
I need to find more savings. Let me look at the biggest remaining sections:
Edge section total: 30 + 10 + 6 + 28 + 4 = 78 bytes (reduced from 91)
Let me also re-examine the edge section more carefully.
After the 2 vertex loads, I have: dx=x0, di=y0, ax=x1, bx=y1.
Step computation:
sub ax, dx ; 2: ax = x_diff
sub bx, di ; 2: bx = y_diff
shl ax, 3 ; 3
shl bx, 3 ; 3
push bx ; 1: y_step
push ax ; 1: x_step
mov bp, sp ; 2
shl dx, 8 ; 3
shl di, 8 ; 3
= 2+2+3+3+1+1+2+3+3 = 20 bytes (was 17 bytes before for store to memory; now 20 bytes but saves 4 bytes data and inner loop)
Total savings vs. old approach for this section: Old: 17 bytes code + 4 bytes data (xs, ys) + 8 bytes for the two 4-byte memory-operand ADDs in the inner loop = 17+4+8 = 29 bytes New: 20 bytes code + 0 bytes data + 6 bytes for the two 3-byte BP-relative ADDs = 20+0+6 = 26 bytes Savings: 29-26 = 3 bytes ✓ (matches my earlier calculation).
Hmm, let me also look at the add dx, [bp] encoding more carefully. ADD DX, [BP+0] = ADD r16, r/m16 = 03 /r. r=DX(2), r/m=[BP+disp8=0]: mod=01, rm=110 (BP), reg=010 = 01_010_110 = 0x56. Displacement = 0x00. So: 03 56 00 = 3 bytes ✓
ADD DI, [BP+2] = 03, r=DI(7), r/m=[BP+disp8=2]: mod=01, rm=110, reg=111 = 01_111_110 = 0x7E. Displacement = 0x02. So: 03 7E 02 = 3 bytes ✓
Great.
Hmm, let me look at another savings opportunity. The shl dx, 8 and shl di, 8 (3 bytes each = 6 bytes total).
What if I use XOR DH, DH (2 bytes) instead of SHL DX, 8? No: XOR DH,DH would zero the high byte, leaving only DL. I need DX = old_DX * 256 = DL * 256 (if DX < 256). Wait: DX = x0 which is in range [82..238]. So DX has a nonzero high byte only if x0 >= 256, which can't happen here. So DH = 0 always. Then xchg dh, dl = ? Not a standard instruction. But what about: mov dh, dl; xor dl, dl = 2+2 = 4 bytes? That moves DL to DH and zeros DL, effectively doing DX = DX * 256. No wait: that would give DH=DL, DL=0 = DL<<8. But that puts the original DX value (as a byte) in DH. Since x0 < 256, DL = x0 and DH = 0, so mov dh, dl; xor dl, dl gives DX = x0 * 256. That's 4 bytes vs. shl dx, 8 = 3 bytes. Worse.
Alternative: Since x0 < 256, DH is already 0. And xchg dh, dl IS a standard x86 instruction (XCHG r8, r/m8 = 86 /r, e.g. 86 F2 = 2 bytes): by itself it gives DX = x0 * 256 — one byte shorter than shl dx, 8.
What about shl dl, 8? It is encodable (C0 E2 08) but useless: shifting the 8-bit DL by 8 just zeroes it; no bits ever move into DH.
So the X-position init could be trimmed to 2 bytes with xchg dh, dl; the Y init must stay shl di, 8 = 3 bytes, since DI has no 8-bit half registers. (The running byte tallies below still assume shl dx, 8 = 3 bytes.) ✓
Let me look at another area: the sin/cos computation.
nasm
fadd dword [.da] ; 4
fld st0 ; 2
fsin ; 2
fmul dword [.k64] ; 4
fistp word [.isa] ; 4
fld st0 ; 2
fcos ; 2
fmul dword [.k64] ; 4
fistp word [.ica] ; 4
= 4+2+2+4+4+2+2+4+4 = 28 bytes
Optimization: use the same [.k64] reference for both. Already done. ✓
Can I combine sin and cos computation? Use FSINCOS: FSINCOS (opcode D9 FB, 2 bytes): pushes cos(st0) then sin(st0)? Actually: FSINCOS computes both sin and cos of st0. Stack after: st0 = sin(original), st1 = cos(original) ... wait let me check.
FSINCOS: replaces st0 with sin(st0) and pushes cos(st0). Result: st0 = cos, st1 = sin. No wait: FSINCOS: pushes sin first, then pushes cos? Or: "replaces st(0) with the sine of st(0), and then pushes the cosine." So: st0 = cos, st1 = sin. (The sine is pushed second, becoming the new top... hmm actually I need to check.)
According to x87 docs: FSINCOS - Compute sine and cosine of top of FPU stack. The result: sin is stored in st(0) and cosine is pushed onto the FPU stack. So: st(0) = cosine, st(1) = sine.
Then:
nasm
fadd dword [.da] ; 4: angle updated, at st0
fld st0 ; 2: duplicate (so we can pop angle later)
fsincos ; 2: st0=cos, st1=sin, st2=angle
Wait: FSINCOS pops the old st0 (angle) and pushes sin then cos? Or pushes cos on top of sin?
From Intel docs: "FSINCOS - Compute sine and cosine of ST(0) and store in ST(0) and ST(1)": After: ST(0) = cos(old ST(0)), new ST(1) = sin(old ST(0))
So if I do:
fld st0 ; duplicate angle: [angle, angle]
fsincos ; [cos, sin, angle]
Then:
Now:
fmul dword [.k64] ; st0 = cos*64
fistp word [.ica] ; pop: [sin, angle]
fmul dword [.k64] ; st0 = sin*64
fistp word [.isa] ; pop: [angle]
vs. old approach:
fld st0; fsin; fmul dword [.k64]; fistp word [.isa] ; 2+2+4+4 = 12 bytes
fld st0; fcos; fmul dword [.k64]; fistp word [.ica] ; 2+2+4+4 = 12 bytes
= 24 bytes for sin+cos.
New approach:
fld st0 ; 2
fsincos ; 2
fmul dword [.k64] ; 4
fistp word [.ica] ; 4
fmul dword [.k64] ; 4
fistp word [.isa] ; 4
= 2+2+4+4+4+4 = 20 bytes. Saves 4 bytes!
Wait, I need to check if FSINCOS is available in early x87 chips. FSINCOS was introduced with the 387 coprocessor. For DOS demos, requiring a 386+387 is generally acceptable (most DOS demos from the early 90s required a 386). If targeting pure 8086, it's not available, but for the purposes of this demo, 386+387 is fine.
Great, 4 bytes saved with FSINCOS! ✓
New estimate: 297 - 4 = 293 bytes. Still over by 37 bytes.
Let me look for more savings...
Let me reconsider the vertex loop. Here's the current structure:
; Outer loop: bx goes 7 to 0
; Inner coord loop: extracts bits 0,1,2 of bx
; Then rotation
; Then store to .pts
For the rotation, I use imul word [.ica] and imul word [.isa]. These single-operand IMUL instructions take 4 bytes each (F7 2E lo hi). Can I make them shorter?
If .ica and .isa are within 127 bytes of the current instruction, I can use mod=01 (8-bit displacement) for the memory addressing. But direct memory addresses (mod=00, rm=110) always use 16-bit. So no savings unless I'm accessing via a base register.
One trick: use BP+offset to access sin/cos if BP is set to a base address. But I'm using BP for the step values in the inner loop (which is in the edge section, not the vertex section). In the vertex section, BP is free!
If I set BP to point to .isa/.ica area at the start:
mov bp, .ica ; 3 bytes (BF lo hi for MOV DI, but not BP... MOV BP, imm16 = BD lo hi = 3 bytes)
Then access: imul word [bp] (for .ica at [bp]) and imul word [bp+2] (for .isa at [bp+2]).
imul word [bp] = F7 6E 00 = 3 bytes (mod=01, rm=110=BP, reg=/5=101: 01_101_110=0x6E, disp8=0). ✓ vs. imul word [.ica] = F7 2E lo hi = 4 bytes.
Saves 1 byte per IMUL instruction. Used twice in vertex loop = 2 bytes saved. Plus setup: mov bp, .ica = 3 bytes. Net: 2-3 = -1 byte. Not worth it!
But wait, if I access .isa too, and .isa follows .ica in memory (2 bytes apart), then:
; Set up once before vertex loop:
mov bp, .ica ; 3 bytes: BP = address of .ica
Then in vertex loop:
imul word [bp] ; 3 bytes: imul .ica
...
imul word [bp+2] ; 3 bytes: imul .isa
vs. old:
imul word [.ica] ; 4 bytes
imul word [.isa] ; 4 bytes
Savings: 2 bytes per iteration × 8 iterations = 2 bytes of CODE (the loop body is the same code, run 8 times). Plus setup: 3 bytes. Net: 2 - 3 = -1 bytes. Still not worth it.
Hmm. What if I used BP throughout and accessed more variables via BP? If I set BP = base of my data area, I could access everything with 8-bit displacements. But this requires all data to be within 127 bytes of BP, and the base-relative addressing (IMUL, MOV, etc.) to use mod=01.
This is the classic "base pointer optimization" for 256b intros! Let me think...
Data area: .da(4) + .k64(4) + .isa(2) + .ica(2) + .pts(32) + .edges(24) + misc = ~70 bytes.
If BP points to the start of the data area (or somewhere in the middle), all accesses use 8-bit displacements (3 bytes each) instead of 16-bit displacements (4 bytes each).
Currently, direct memory accesses with 16-bit displacement take 4 bytes each. With BP + 8-bit displacement, they'd be 3 bytes each.
How many such accesses do I have?
For the pure data accesses (without index registers): Total such accesses that can benefit: ~10 accesses × 1 byte saving = 10 bytes. Cost of setting up BP: mov bp, .data_base = 3 bytes. Net savings: 10 - 3 = 7 bytes!
But this requires restructuring how I access data. Also, the FPU instructions like fadd dword [bp+n] would use: FADD dword [BP+disp8] = D8 46 disp8 = 3 bytes. ✓ (Was D8 06 lo hi = 4 bytes)
Let me count more carefully which accesses can benefit from BP-relative addressing (not involving BX/SI/DI as index registers):
From sin/cos section:
From vertex loop (×8 iterations, but same code):
From coord loop:
From edge loading (using [.pts+bx]): mov dx, [.pts+bx] = 8B 97 lo hi = 4 bytes. mov dx, [bp+bx+pts_off]? In 16-bit mode, the addressing mode [BP+BX+disp8] is valid: ModRM mod=01, rm=011 (BP+BX with displacement): 01_010_011 = 0x53 (for reg=DX=2). So: 8B 53 disp8 = 3 bytes! ✓ (3 bytes instead of 4)
But wait: [.pts+bx] means [offset_of_pts + bx]. With BP pointing to the base of data, [bp+bx+pts_offset_from_bp]. If pts_offset_from_bp is small (< 128), this works as a 3-byte instruction! Let me count how many such .pts accesses there are:
In edge loading (per edge, 2 vertices × 2 coords = 4 accesses):
In store to .pts (×8 iterations, same code):
16-bit mod/rm combinations:
So [BP+DI+disp8] would be rm=011 with mod=01. That's valid! mov [bp+di+pts_off], si = 89 73 pts_off = 3 bytes (mod=01, reg=SI=6, rm=011: 01_110_011 = 0x73). ✓
And mov [bp+di+pts_off+2], ax = 89 43 (pts_off+2) = 3 bytes! ✓
vs. old: mov [.pts+di], si = 4 bytes, mov [.pts+di+2], ax = 4 bytes. Saves: 2 bytes per vertex × 8 vertices = 2 bytes of code.
Total additional savings from BP-relative addressing:
But setup: mov bp, data_base = 3 bytes.
Net: 13 - 3 = 10 bytes saved. ✓
New estimate: 293 - 10 = 283 bytes. Still over by 27 bytes.
Let me keep looking for savings.
One big opportunity: the inner line drawing loop. Currently 28 bytes (with BP-based step access). Can I make it smaller?
Inner loop:
add dx, [bp+...] ; 3
add di, [bp+...] ; 3
mov ax, dx ; 2
sar ax, 8 ; 3 (C1 F8 08)
mov bx, di ; 2
sar bx, 8 ; 3 (C1 FB 08)
imul bx, bx, 320 ; 4 (69 DB 40 01)
add bx, ax ; 2
mov [es:bx], byte 15 ; 4 (26 C6 07 0F)
loop .ls ; 2
= 3+3+2+3+2+3+4+2+4+2 = 28 bytes.
Can I make the pixel plotting faster/smaller?
mov [es:bx], byte 15: ES override (26h) + MOV byte [BX], 15 (C6 07 0F) = 4 bytes. Alternative: Use STOSB to store AL to [ES:DI]. But DI is my y_pos! I'd need to free DI.
What if I use a different register layout? Let me use AX for y_pos and BX for x_pos: Wait, but AX is used by IMUL as the implicit destination...
Hmm, what about BSWAP/ROR tricks? Not helpful here.
For imul bx, bx, 320: since 320 = 256+64 = (1<<8)+(1<<6), I could use:
mov ax, bx
shl bx, 8 ; bx = bx*256 (3 bytes)
shl ax, 6 ; ax = bx*64 (3 bytes)
add bx, ax ; bx = bx*320 (2 bytes)
= 2+3+3+2 = 10 bytes vs. imul bx, bx, 320 = 4 bytes. Much worse.
Or: imul bx, bx, 256; add bx, [something]*64 - no. The IMUL 320 (4 bytes) is already pretty good.
What if I precompute y*320 using the FPU? fild word [tmp]; fimul [k320]; fistp [tmp] = way more bytes.
Hmm, what about a different fixed-point representation? E.g. keep DX=x_pos, DI=y_pos but at scale 128 instead of 256 (positions shifted left by 7; with 32 steps, step = diff*128/32 = diff*4). The SAR count becomes 7 instead of 8 — but sar ax, 7 (C1 F8 07) is still 3 bytes, same as sar ax, 8. Only a shift count of 1 has a shorter form (D1 /7 = 2 bytes), and scale 2 gives far too little sub-pixel precision. No savings.
What if the inner loop is 16 steps instead of 32? Saves 0 bytes (same code, just mov cx, 16 instead of mov cx, 32). But lines would look more dotted. With cube size 55, max edge ≈ 78 pixels, 16 steps → spacing ≈ 5 pixels. On a 640×400 display (mode 13h displays at 320×200 but is often shown doubled), this would look like dashes, not lines. Hmm.
Could I increase cube SIZE to 30 and use fewer steps? With SIZE=30, max edge ≈ 42 pixels, 16 steps → ~2.6 px/step. Visible but sparse. Not ideal.
What if I use a fundamentally different line drawing? For example, only draw horizontal and vertical projections? No, that's different from a wireframe cube.
Let me think about the biggest code chunk. Let me re-examine the complete edge loop:
Edge loading (for 2 vertices): 30 bytes (can reduce to 24 with CBW+BP tricks - already counted) Step computation (with stack-based approach): 20 bytes Init positions: 6 bytes Inner loop (32 steps): 28 bytes of code, shared by all 12 edges (code size, not per-execution) Loop overhead: 4 bytes Total edge loop: 30 + 20 + 6 + 28 + 4 + 2 (pop ax pop bx cleanup) = 90 bytes
Hmm, that's still big. But with BP addressing: Edge loading: 24 bytes (4 saved: 4 accesses × 1 byte) Step computation: 20 bytes Init positions: 6 bytes Inner loop: 28 bytes - actually let me recalculate with BP:
Cleanup: pop ax; pop bx = 2 bytes. Loop overhead: push cx; pop si; pop cx; loop .el = 1+1+1+2 = 5 bytes.
Edge section total: 24 + 20 + 6 + 28 + 2 + 5 = 85 bytes.
Still large. Let me try to find more savings in the edge section.
For step computation:
sub ax, dx ; 2: x_diff
sub bx, di ; 2: y_diff
shl ax, 3 ; 3
shl bx, 3 ; 3
push bx ; 1
push ax ; 1
mov bp, sp ; 2
shl dx, 8 ; 3
shl di, 8 ; 3
= 20 bytes.
Hmm, shl ax, 3 = C1 E0 03 = 3 bytes. Can I do shl ax, 1; shl ax, 1; shl ax, 1 = 3+3+3 = 9 bytes? No, shl ax, 1 is D1 E0 = 2 bytes each → 6 bytes. Still worse than 3 bytes.
But wait: ADD AX, AX; ADD AX, AX; ADD AX, AX = 2+2+2 = 6 bytes? ADD AX, AX = 01 C0 or 03 C0? Actually ADD AX, AX = ADD r/m16, r16 = 01 C0 = 2 bytes (adds AX to AX = *2). 3 of these = 6 bytes vs. shl ax, 3 = 3 bytes. The shift is better.
Let me think about shl dx, 8 and shl di, 8. These are 3 bytes each = 6 bytes.
For shl dx, 8: x0 is in DX, and since x0 < 256, DH = 0 already.
xchg dl, dh is a real instruction (XCHG r8, r/m8 = 86 /r: 86 D6 = 2 bytes). Because DH is already 0, a follow-up xor dl, dl is redundant — after the exchange DL holds the old DH (= 0) and DH holds x0, i.e. DX = x0 * 256 in just 2 bytes. That actually beats shl dx, 8 (3 bytes) by 1 byte.
mov dh, dl (8A F2) + xor dl, dl (30 D2) = 4 bytes is strictly worse.
The same trick cannot help the Y init: DI has no 8-bit half registers, so shl di, 8 stays at 3 bytes.
So there is 1 more byte available here if taken; for now I keep shl dx, 8 and shl di, 8 at 3 bytes each (the tallies below assume this).
Let me now look at the vertex projection loop more carefully and see if I can save bytes there.
Current vertex loop (with down-counting, coord loop via STOSW):
mov bx, 7 ; 3: init BX
.vl:
push bx ; 1: save original BX
; Coord loop:
mov di, .vcoord ; 3: point DI to vcoord storage
mov cx, 3 ; 3: 3 iterations
.vc:
mov ax, bx ; 2
and al, 1 ; 2
add al, al ; 2
dec al ; 2
cbw ; 1
imul ax, ax, SIZE ; 3
stosw ; 1
shr bx, 1 ; 2
loop .vc ; 2
pop bx ; 1: restore BX
; Rotation: rx = (vx*ica - vz*isa) >> 6
; vcoord+0 = vx, vcoord+2 = vy, vcoord+4 = vz
; (wait: with STOSW order: bit0→vcoord+0=vx, bit1→vcoord+2=vy, bit2→vcoord+4=vz)
mov ax, [.vcoord] ; 3: vx... but with BP: 3 bytes
imul word [.ica] ; with BP: 3 bytes
xchg si, ax ; 1: si = vx*ica (XCHG AX,SI = 96h = 1 byte)
mov ax, [.vcoord+4] ; with BP+4: 3 bytes
imul word [.isa] ; with BP: 3 bytes
sub si, ax ; 2
sar si, 6 ; 3 (C1 FE 06)
add si, 160 ; 4 (81 C6 A0 00)
; sy = -vy + 100
mov ax, [.vcoord+2] ; with BP+2: 3 bytes
neg ax ; 2
add ax, 100 ; 3 (05 64 00)
; Store
; DI was used by STOSW and now points past vz (vcoord+6)
; Need to restore DI for pts addressing
mov di, bx ; 2
shl di, 2 ; 3
mov [bp+di+pts_off], si ; 3 (with BP)
mov [bp+di+pts_off+2], ax ; 3 (with BP)
; Loop control
dec bx ; 1
jns .vl ; 2
Let me count this:
Wait, that's 71 bytes of code for the vertex loop body (which runs 8 times, but the code itself is 71 bytes).
Hmm, that's less than my previous estimate of 78 bytes. Let me check if the BP-addressing properly reduces the rotation costs.
With BP pointing to data start, and defining offsets:
Then:
MOV AX, [BP+disp8]: MOV r16, r/m16 = 8B. r=AX(0), r/m=[BP+disp8]: mod=01, rm=110, reg=000: 01_000_110=0x46. So 8B 46 disp8 = 3 bytes ✓.
And for mov ax, [bp+vcoord_off+4] = 8B 46 (vcoord_off+4) = 3 bytes ✓.
But wait, I also need to access [bp+di+pts_off]: MOV [BP+DI+disp8], SI = 89, mod=01, r/m=011(BP+DI), reg=SI(6): ModRM = 01_110_011 = 0x73. So 89 73 disp8 = 3 bytes ✓.
And add si, 160 = 81 C6 A0 00 = 4 bytes. Can I use BP for this? No, it's just adding a constant to SI. If I precompute cx+160 earlier... Let me see: si = rx (result of rotation). I need to add 160. What if I precompute the center at start? Can't use BP since it's used for data.
What if add si, 160 = add si, 128; add si, 32 = 3+3 = 6 bytes? No worse.
Or: lea si, [si+160] = 8D B6 A0 00 = 4 bytes (using [SI+disp16] form). Same as ADD SI, 160.
Actually: add si, 160 encoding: 160 = 0xA0. Since 0xA0 as a signed byte = -96, the short imm8 form 83 C6 A0 would compute SI + (-96) = SI - 96. I want SI + 160. So I need the full 16-bit form: 81 C6 A0 00 = 4 bytes. OR use `add si, -96; sub si, ... no that's wrong. I just need SI = rx + 160.
What if instead of adding 160 to rx, I initialize SI = 160 before the vertex loop and let the rotation result be relative? No, that changes the algorithm.
Hmm. Can I compute the screen X differently to avoid adding 160?
What if I use a different formula: instead of sx = rx + 160, precompute scale_x = (rx + 160) = 160 + (vx*ica - vz*isa)>>6. I can't easily avoid the +160.
Alternative: compute rx + 160 using the FPU before converting to integer:
Another approach: If I use the FPU and keep everything in floats, I can do:
; rx = vx*ca - vz*sa + 160.0 (all at once)
; Then fistp → integer screen x
But this would require floating point for all 8 vertices, which we decided against.
OK let me just accept 4 bytes for add si, 160.
Let me now estimate the total size with all optimizations so far:
Setup (mode change + FPU init): 14 bytes Frame clear: 9 bytes Angle update + fsincos + scale: 4+2+2+4+4+4+4 = 24 bytes (using FSINCOS) Vertex loop:
Total code: 14 + 9 + 24 + 3 + 71 + 85 + 12 + 3 = 221 bytes
Wait, that's only 221 bytes! But I might be undercounting somewhere. Let me recheck the vertex loop body carefully.
Vertex loop body:
Hmm, I was counting wrong earlier. Let me recount:
Rotation section:
mov ax, [bp+vcoord_off] : 3 (8B 46 vcoord_off)
imul word [bp+ica_off] : 3 (F7 6E ica_off) -- wait, is F7 the right opcode?
IMUL r/m16 (single-operand) = F7 /5. With [BP+disp8]: mod=01, r/m=110(BP), reg=/5=101 → 01_101_110 = 0x6E. Instruction: F7 6E ica_off = 3 bytes ✓.
xchg si, ax : 1 (XCHG AX, SI = 96h)
mov ax, [bp+vcoord_off+4] : 3 (8B 46 vcoord_off+4)
imul word [bp+isa_off] : 3 (F7 6E isa_off)
sub si, ax : 2 (2B F0)
sar si, 6 : 3 (C1 FE 06)
add si, 160 : 4 (81 C6 A0 00)
= 3+3+1+3+3+2+3+4 = 22 bytes
sy section:
mov ax, [bp+vcoord_off+2] : 3
neg ax : 2 (F7 D8)
add ax, 100 : 3 (05 64 00... but wait: ADD AX, imm16 = 05 lo hi. 100 = 0x64. 05 64 00 = 3 bytes ✓)
= 8 bytes
Store section:
mov di, bx : 2 (89 DF... MOV DI, BX = 89 FB? Let me check: MOV r/m16, r16 = 89. r=BX(3), r/m=DI(7): ModRM = 11_011_111 = 0xDF. So 89 DF = 2 bytes ✓)
shl di, 2 : 3 (C1 E7 02)
mov [bp+di+pts_off], si : 3 (89 73 pts_off)
mov [bp+di+pts_off+2], ax : 3 (89 43 pts_off+2)
= 11 bytes
Wait, mov [bp+di+pts_off], si: MOV r/m16, r16 = 89. r=SI(6), r/m=[BP+DI+disp8]: mod=01, rm=011, reg=110 → 01_110_011 = 0x73. Disp8 = pts_off. Instruction: 89 73 pts_off = 3 bytes ✓.
And mov [bp+di+pts_off+2], ax: MOV r/m16, r16 = 89. r=AX(0), r/m=[BP+DI+disp8]: mod=01, rm=011, reg=000 → 01_000_011 = 0x43. Disp8 = pts_off+2. Instruction: 89 43 (pts_off+2) = 3 bytes ✓.
Loop control:
dec bx : 1 (4Bh for DEC BX)
jns .vl : 2
= 3 bytes
Vertex loop total per body: 25 + 22 + 8 + 11 + 3 = 69 bytes Plus init: mov bx, 7 = 3 bytes Total vertex loop: 72 bytes
Now for the edge section. Let me recount with the optimizations:
Edge loading:
; Vertex 0:
lodsb : 1
cbw : 1
shl ax, 2 : 3
mov bx, ax : 2
mov dx, [bp+bx+pts_off] : 3 (8B 53 pts_off: MOV DX, [BX+BP+disp8] -- wait, [BX+BP] or [BP+BX]?)
Hmm wait. In 16-bit addressing mode:
There is NO [BX+BP] or [BP+BX] mode in 16-bit! I was wrong earlier.
So mov dx, [.pts+bx] cannot be done with BP as an additional base. In 16-bit mode, the addressing modes with two base registers are BX+SI, BX+DI, BP+SI, BP+DI. There's no BP+BX!
This means I can't use BP+BX for the .pts[bx] accesses. I can use:
Hmm. So my previous claim that .pts access with BP can be 3 bytes was WRONG. Those accesses must stay at 4 bytes.
Let me recalculate the edge loading:
lodsb : 1
cbw : 1
shl ax, 2 : 3
mov bx, ax : 2
mov dx, [.pts+bx] : 4 (8B 97 lo hi: can't use BP here)
mov di, [.pts+bx+2] : 4
lodsb : 1
push si : 1 (save next edge pointer)
cbw : 1
shl ax, 2 : 3
mov bx, ax : 2
mov ax, [.pts+bx] : 4
mov bx, [.pts+bx+2] : 4
= 1+1+3+2+4+4+1+1+1+3+2+4+4 = 31 bytes
Hmm, I was wrong about saving bytes here with BP. The .pts accesses still use [.pts+BX] with 4-byte encoding.
Actually, wait. Let me reconsider. Can I change the indexing to use BP+SI or BP+DI?
If before loading vertex coords, I do mov si, bx (where bx = index*4), then I can use [BP+SI+pts_off] which IS a valid 16-bit addressing mode!
; Set SI = vertex_index * 4
; Then: [BP+SI+pts_off] is valid (rm=010 = [BP+SI])
; mod=01 (8-bit disp): 3 bytes total ✓
So:
lodsb : 1
cbw : 1
shl ax, 2 : 3
mov si, ax : 2 (instead of mov bx, ax: same size)
mov dx, [bp+si+pts_off] : 3 (8B 52 pts_off: MOV DX, [BP+SI+disp8])
mov di, [bp+si+pts_off+2] : 3
lodsb : 1
push si : 1 (save next ptr... wait, SI was just loaded with vertex index, not with edge ptr. Need to reconsider.)
Oh wait - I have a conflict! SI is used as both the edge pointer (for LODSB) and the vertex index register in this approach. After LODSB, SI = edge pointer + 1. Then I do cbw; shl ax, 2; mov si, ax which overwrites SI with the vertex index×4. Then I can use [BP+SI+pts_off]. But now SI is the vertex index, not the edge pointer.
After loading vertex 0, SI = v0_idx * 4. I need to save the edge pointer somewhere. If I push the original SI (edge pointer) before overwriting, and pop it later...
Wait, let me reconsider my loop structure. The push si trick: I push SI after the SECOND lodsb. So:
Hmm. The conflict is: SI is both the LODSB pointer and the vertex index for pts access.
Resolution: Use BX for vertex index (as before), and accept 4-byte .pts access. OR, restructure to avoid the conflict.
Actually, in the old approach, after LODSB, I computed: cbw; shl ax, 2; mov bx, ax to get vertex offset in BX, then access via [.pts+bx] (4 bytes).
Alternative: use SI as vertex index and BX as scratch:
lodsb ; al = v0 index, SI advances
cbw
shl ax, 2 ; ax = v0*4
mov si, ax ; SI = v0*4 (overwrite edge pointer; edge pointer + 1 is already in SI from lodsb)
; But I need to save the CURRENT SI value (edge_ptr+1) before overwriting!
; Hmm.
OK what if I preload both vertex indices before computing addresses?
lodsw ; LODSW: load 2 bytes at once (v0 in AL, v1 in AH), SI += 2
push si ; save SI (now pointing to next edge pair)
; AX = v1:v0 (v0 in AL, v1 in AH)
; Extract v0:
mov bx, ax
and bl, 0xff ; bl = v0 (or: use BL since AL has it, just use BL below)
shl bx, 2 ; wait, need to zero BH first...
; Extract v0 as byte:
xor bh, bh ; BH = 0
mov bl, al ; BL = v0 index
shl bx, 2 ; BX = v0*4
mov dx, [bp+bx+pts_off]... nope, still can't use BP+BX.
Argh. The BP+BX combination is not valid in 16-bit mode.
BUT: I can use BP+DI or BP+SI!
If I load vertex indices into SI and DI (using them as indices):
lodsw ; AX = v1:v0 in bytes (v0 in AL, v1 in AH), SI += 2
push si ; save next edge pointer
; Use SI for v0 index:
xor sh, sh ; ... no, SI is not an 8-bit register in a useful way here
movzx si, al ; In 16-bit mode: 0F B6 F0?
Actually in 16-bit code, MOVZX is a 32-bit instruction... Wait, no. MOVZX is part of the 386 instruction set and works in 16-bit mode too: MOVZX r16, r/m8 = 0F B6 /r. MOVZX SI, AL = 0F B6 F0 = 3 bytes.
Or: mov si, ax; and si, 0xFF = 2+3 = 5 bytes. Or cbw then mov si, ax = 1+2 = 3 bytes (but cbw destroys AH!). Or: mov sl, al; xor sh, sh - not valid as there's no SL/SH split for SI.
Hmm. Let me use a completely different approach:
Use LODSW to load both indices at once. Then:
Hmm: LODSW approach: After LODSW: AL=v0, AH=v1. Save v1 in AH for later. For v0: xor ah, ah; shl ax, 2; mov bx, ax; mov dx, [.pts+bx]; mov di, [.pts+bx+2] But AH is being zeroed, destroying v1 index. Need to save v1 first.
lodsw ; 1: AL=v0, AH=v1 in little-endian (actually: first byte AL=v0, second byte AH=v1)
push si ; 1: save next edge pointer
push ax ; 1: save both indices (AH=v1, AL=v0)
; Load v0 coords:
xor ah, ah ; 2: zero AH, AX = v0
shl ax, 2 ; 3
mov bx, ax ; 2
mov dx, [.pts+bx] ; 4
mov di, [.pts+bx+2] ; 4
; Load v1 coords:
pop ax ; 1: AX = v1:v0
xchg al, ah ; 2: XCHG AL, AH does exist (86 C4 = 2 bytes), but it leaves v0 in AH, which then still needs zeroing
; Alternative: shr ax, 8 — moves v1 down AND zeroes AH as a side effect
shr ax, 8 ; 3 (C1 E8 08): AX = v1 (AH shifted down, AH = 0)
; Or: mov al, ah; xor ah, ah: 2+2=4 bytes
shr ax, 8 = C1 E8 08 = 3 bytes. Then AX = v1. Continue:
xor ah, ah ; 2: redundant — SHR AX, 8 already cleared AH; could be dropped to save 2 bytes (the tally below keeps it)
shl ax, 2 ; 3
mov bx, ax ; 2
mov ax, [.pts+bx] ; 4
mov bx, [.pts+bx+2] ; 4
Total with LODSW approach: 1+1+1+2+3+2+4+4+1+3+2+3+2+4+4 = 37 bytes. vs. current double-LODSB approach: 31 bytes. LODSW is WORSE.
What if I use two separate LODSB instructions but access pts via SI/DI+BP? The problem is SI is the LODSB pointer.
After the SECOND lodsb (which advances SI to next pair and I push SI), I can then use SI as a vertex index:
lodsb ; 1: v0 in AL, SI points to v1 byte
cbw ; 1
shl ax, 2 ; 3
mov bx, ax ; 2: BX = v0*4
mov dx, [.pts+bx] ; 4: x0
mov di, [.pts+bx+2] ; 4: y0
lodsb ; 1: v1 in AL, SI points to next pair
push si ; 1: save next pair pointer
cbw ; 1
shl ax, 2 ; 3: AX = v1*4
; Can I use SI here for better access? Let me check:
mov si, ax ; 2: SI = v1*4
; Now access [bp+si+pts_off] (valid since BP+SI is in 16-bit addressing table!)
mov ax, [bp+si+pts_off] ; 3!!! (8B 42 pts_off: MOV AX, [BP+SI+disp8])
mov bx, [bp+si+pts_off+2] ; 3!!! (8B 5A pts_off+2: MOV BX, [BP+SI+disp8+2])
Wait, [BP+SI+disp8] IS a valid 16-bit addressing mode! rm=010 = [BP+SI]. With mod=01 (8-bit displacement): MOV AX, [BP+SI+disp8] = 8B 42 disp8 = 3 bytes! ✓
And MOV BX, [BP+SI+disp8] = 8B 5A disp8 = 3 bytes! ✓
So for vertex 1, after mov si, ax (SI = v1*4), I can load from [BP+SI+pts_off] in 3 bytes!
Let me check the ModRM encoding:
So for vertex 1 loading, I save 2 bytes (3 vs 4 per access, 2 accesses = 2 bytes saved).
For vertex 0, I'm still using [.pts+bx] = 4 bytes each. But can I also use BP+SI for vertex 0?
For vertex 0: After the first LODSB and shl ax, 2, I could do mov si, ax and then mov dx, [bp+si+pts_off].
But then SI would be used for vertex 0 index, not as LODSB pointer. That's fine because after the first LODSB, the LODSB pointer (SI) has already been advanced by 1. I just need to save this pointer.
Wait: if I do mov si, ax AFTER the first LODSB, I've overwritten the LODSB pointer. But I need SI for the second LODSB!
Solution: Save and restore SI. Or: do both LODSBs first, save both indices, then compute addresses.
Here's a revised approach:
; Load both vertex indices first:
lodsb ; 1: v0 in AL
mov bl, al ; 2: save v0 in BL
lodsb ; 1: v1 in AL
push si ; 1: save next edge pointer (SI now points past this pair)
; Load v1 coords:
cbw ; 1: AX = v1
shl ax, 2 ; 3: AX = v1*4
mov si, ax ; 2: SI = v1*4
mov ax, [bp+si+pts_off] ; 3: x1
mov cx, [bp+si+pts_off+2] ; 3: y1 (use CX for y1 temporarily)
; Load v0 coords:
; bl = v0 index
mov al, bl ; 2: AL = v0
xor ah, ah ; 2: AX = v0
shl ax, 2 ; 3: AX = v0*4
mov si, ax ; 2: SI = v0*4
mov dx, [bp+si+pts_off] ; 3: x0
mov di, [bp+si+pts_off+2] ; 3: y0
; Now: dx=x0, di=y0, ax=x1, cx=y1
; Wait, ax was overwritten by shl ax,2 for v0. And cx=y1.
; I need: x1 and y1. They were in AX and CX...
; But now AX and CX are used for v0 offset. I need to restore x1 and y1.
This is getting complicated. Let me use a different register for y1:
; Load v1 coords:
cbw
shl ax, 2
mov si, ax
mov ax, [bp+si+pts_off] ; 3: x1
push ax ; 1: save x1
mov ax, [bp+si+pts_off+2] ; 3: y1
push ax ; 1: save y1
; Load v0 coords:
mov al, bl
xor ah, ah
shl ax, 2
mov si, ax
mov dx, [bp+si+pts_off] ; 3: x0
mov di, [bp+si+pts_off+2] ; 3: y0
; Now: dx=x0, di=y0. Get x1 and y1 from stack:
pop bx ; bx = y1
pop ax ; ax = x1
Total: 1(lodsb)+2(mov bl,al)+1(lodsb)+1(push si) + [1+3+2+3+1+3+1] + [2+2+3+2+3+3] + [1+1] = 1+2+1+1 + 1+3+2+3+1+3+1 + 2+2+3+2+3+3 + 2 = let me add: 5 + 14 + 15 + 2 = 36 bytes.
vs. old approach: 31 bytes. WORSE!
The issue is that using BP+SI saves 1 byte per access but adds overhead for saving/restoring indices.
OK actually, let me try yet another approach. Use BP+DI for v0 and BX direct for v1 (or vice versa):
After first LODSB (v0 in AL):
cbw ; 1
shl ax, 2 ; 3
xchg di, ax ; 1: DI = v0*4, AX = old DI (garbage, don't care)
Wait: XCHG AX, DI = 97h = 1 byte!
Then: mov dx, [bp+di+pts_off] = 3 bytes, mov ax, [bp+di+pts_off+2] = 3 bytes. Hmm but I need to save y0 and AX has it.
Let me try:
lodsb ; 1: v0 in AL
cbw ; 1
shl ax, 2 ; 3
xchg di, ax ; 1: DI = v0*4, AX = garbage
mov dx, [bp+di+pts_off] ; 3: x0
; y0 needed later, load after v1 to keep registers free? no, DI will be overwritten
; Save y0 in... AX? BX?
push word [bp+di+pts_off+2] ; push y0 directly: is this valid?
; PUSH r/m16 = FF /6 = FF 73 pts_off+2 (for [BP+DI+disp8]: mod=01, reg=110, rm=011 → ModRM 0x73) = 3 bytes ✓
push word [bp+di+pts_off+2] ; 3 bytes! Push y0 onto stack
lodsb ; 1: v1 in AL
push si ; 1: save next edge pointer
cbw ; 1
shl ax, 2 ; 3
xchg di, ax ; 1: DI = v1*4
; (old DI = v0*4, now lost. That's OK, we already loaded v0 data)
mov ax, [bp+di+pts_off] ; 3: x1
mov bx, [bp+di+pts_off+2] ; 3: y1
; Recover y0:
pop cx ; 1: cx = old (next edge pointer)... oops, wrong order!
The stack order: I pushed y0 first, then pushed SI (next edge pointer), so:
I need to pop in reverse: first pop SI, then pop y0. But I just pushed SI AFTER y0...
Wait:
Hmm. If I pop SI last (after using the edge data), that works. Let me see:
After loading all vertex data, I have dx=x0, bx=y1, ax=x1. y0 is on the stack below SI. The stack order is: [SI, y0, cx_outer, outer_stuff...]
For the step computation, I need: x0(dx), y0(on stack), x1(ax), y1(bx).
I can pop y0 into DI:
; Get y0 from under SI:
pop cx_save ; cx_save = SI (next edge ptr) - I'll use this as a scratch
pop di ; di = y0
push cx_save ; push SI back
This is 3 extra instructions = 3 bytes. But I'm not sure this saves overall.
Alternatively, use XCH and other tricks. This is getting too complex.
Let me try yet another clean approach. Use BP+DI for vertex 0 and BP+DI for vertex 1 (loading them sequentially):
Wait, I have the key insight: I need DI for two things: (1) y0 storage, and (2) vertex index × 4 for BP+DI addressing. These conflict.
What if I use BP+SI for vertex 0 and BP+DI for vertex 1? Then after loading, DI = v1*4 and SI = v0*4:
Wait but SI is the LODSB pointer...
OK, you know what, let me just accept that the edge loading section is going to be ~30 bytes and move on. The total savings I've found (~23 bytes from various tricks) bring the total to ~287 bytes. Still over.
Let me think about this differently. What are the biggest code blocks and can I find a fundamentally better algorithm?
Alternative: Bresenham line drawing: The Bresenham algorithm uses only integer addition and subtraction, no multiplication. But the code for Bresenham is typically larger than the simple DDA I'm using. The advantage of Bresenham is that it draws fewer pixels (exactly the number needed), but for a 256b demo, that's not the concern.
Alternative: Use INT 21h to exit: The exit sequence is:
mov ax, 4C00h ; 3 bytes
int 21h ; 2 bytes
= 5 bytes. Currently I restore video mode (3+2=5) and then RET (1). Total 6 bytes. INT 21h exit saves 1 byte.
Alternative: Use IN AL, 60h; DEC AL; JNZ for ESC check but placed differently: Can't really save more there.
Observation: The .vcoord data (6 bytes) can be placed after .pts to coalesce data. This doesn't save bytes but makes the layout cleaner.
Actually, wait. Let me reconsider whether I need .vcoord at all.
In the vertex loop, I use .vcoord to temporarily store vx, vy, vz before using them in the rotation. What if I could compute them inline without storing?
The coord loop computes all 3 coordinates and stores them. Then I load vcoord+0 (vx) and vcoord+4 (vz) for the rotation. What if I compute vx and vz INLINE in the rotation code, avoiding the need to store/load them from memory?
Inline computation of vx: mov ax, bx; and al, 1; add al, al; dec al; cbw; imul ax, ax, SIZE = 11 bytes. Inline computation of vz: push bx; shr bx, 2; and bl, 1; add bl, bl; dec bl; cbw; imul ax, ax, SIZE; pop bx = 1+3+3+2+2+1+3+1 = 16 bytes? No, wait: I need to shift bx by 2 without using push/pop. But the coord loop SHIFTS bx! That's the issue.
Oh! Here's the key insight: In the coord loop, BX is shifted right at each iteration. After the coord loop, BX has been shifted right by 3. The pop bx restores original BX. What if instead of a coord loop, I just inline the three coordinate computations, but process the BITS OF BX one at a time without shifting?
For vx (bit 0): test bl, 1; jz .xn; fld fifty; jmp .xd; .xn: fld neg_fifty → messy
Alternative: compute each coordinate by extracting the bit WITHOUT modifying BX:
For vx (bit 0):
mov ax, bx
and al, 1 ; bit 0
add al, al
dec al
cbw
imul ax, ax, SIZE ; ax = vx
= 11 bytes (BX unchanged since we used AX)
For vy (bit 1):
mov ax, bx
shr al, 1
and al, 1 ; bit 1
add al, al
dec al
cbw
imul ax, ax, SIZE
= 12 bytes (3 for shr = C0 E8 01... wait: shr al, 1 = D0 E8 = 2 bytes! SHR r8, 1 = D0 /5 with r/m=AL(0): D0 E8 = 2 bytes)
Actually: shr al, 1 = D0 E8 = 2 bytes. shr al, 2 = C0 E8 02 = 3 bytes. So for bit 1: mov ax, bx; shr al, 1; and al, 1; ... = 2+2+2+2+2+1+3 = 14 bytes.
For vz (bit 2): mov ax, bx; shr al, 2; and al, 1; ... = 2+3+2+2+2+1+3 = 15 bytes.
And I'd use vx immediately in the rotation without storing:
; Compute vx and use immediately:
mov ax, bx
and al, 1
add al, al
dec al
cbw
imul ax, ax, SIZE ; ax = vx
imul word [bp+ica_off] ; DX:AX = vx * ica
xchg si, ax ; si = vx*ica (using XCHG AX,SI = 1 byte)
; Compute vz and use immediately:
mov ax, bx
shr al, 2
and al, 1
add al, al
dec al
cbw
imul ax, ax, SIZE ; ax = vz
imul word [bp+isa_off] ; DX:AX = vz * isa
sub si, ax ; si = vx*ica - vz*isa
sar si, 6
add si, 160
; Compute vy:
mov ax, bx
shr al, 1
and al, 1
add al, al
dec al
cbw
imul ax, ax, SIZE ; ax = vy
neg ax
add ax, 100
Count: 11+3+1 + (2+3+2+2+2+1+3+3) + 2+3+4 + (2+2+2+2+2+1+3) + 2+3 =
vs. coord loop + rotation:
Inline is 61 bytes, coord loop is 55 bytes. The coord loop wins!
BUT: with the coord loop, I also need the 6 bytes of .vcoord storage. Without coord loop (inline): no vcoord storage needed = 6 bytes data saved. And 61-55=6 bytes more code. Net: 6-6=0 bytes. Break even!
Not worth changing.
Hmm. OK I'm hitting the limits of easy optimizations. Let me try to enumerate remaining potential savings:
Total savings found: 4+3+10+3+1 = 21 bytes (need to verify BP savings more carefully)
Let me reevaluate BP savings more carefully given that BP+BX is not a valid addressing mode:
With BP set to data start:
Total BP savings from data accesses: 5+2+2+3+2+2 = 16 bytes Minus setup cost: mov bp, .data_start = 3 bytes Net savings: 13 bytes.
Hmm, I previously estimated 10 bytes. Let me go with 13 bytes.
Updated estimate: 310 - (4+3+13+3+1) = 310 - 24 = 286 bytes.
But wait, I also save 3 bytes from the push-SI-after-2nd-LODSB trick and 1 byte from CBW: total -28 bytes. 286 - 4 = 282 bytes.
Hmm, 282 bytes. Still over by 26.
Let me look for more savings. Going through each section again:
Frame clear (9 bytes):
xor di, di ; 2
xor ax, ax ; 2
mov cx, 32000; 3
rep stosw ; 2
= 9 bytes. Can I save here? xor di, di + xor ax, ax = 4 bytes. Alternative: sub di, di = 2 bytes (same). Or xor di, di after fldz or similar that zeroes a register... no.
What if ES is not 0A000h? We set it in setup and it stays. So [ES:0..63999] = video memory. Clear with REP STOSW starting at DI=0. AX=0 (black). ✓
Can I save on mode clear? What if I clear in a different way? mov cx, 16000; rep stosd = 3+2 = 5 bytes. But this requires 32-bit operand and DI might not be 0. Actually rep stosd (STOSD) in 16-bit mode needs 32-bit operations (66h prefix): mov ecx, 16000 = 66 B9 ... that's 7 bytes. Not better.
What if I don't clear the whole screen? Just clear a portion and accept artifacts. For a small cube (55×55 pixels), the "dirty area" is maybe 200×200 pixels. But this makes the demo look broken.
Alternative clear: xor al, al; mov cx, 64000; rep stosb = but STOSB stores 1 byte per iteration = 64000 iterations. rep stosw = 32000 iterations = 2× faster. For code size: rep stosb = 2 bytes vs rep stosw = 2 bytes. Same. But mov cx, 64000 = 3 bytes vs mov cx, 32000 = 3 bytes. Same. And xor al, al = 2 bytes vs xor ax, ax = 2 bytes. Same. No savings.
What if I use mov cx, -1 ; rep stosw to store 65535 words? That's slightly more than needed but clears more... and might mess up beyond video area. Risky.
What if mode 13h's palette does the clearing for us? Some modes might start with black. No.
What if I don't clear? Use XOR instead: Instead of clearing and redrawing, XOR each pixel with itself (= 0, so it becomes cleared). And draw new pixels. But XOR with 0 is the same as clearing, and XOR with previous draw data... complex.
I can't save much on the clear. 9 bytes is about minimum.
Angle update and sin/cos (let me recount with FSINCOS):
fadd dword [bp+da_off] ; 3 (with BP)
fld st0 ; 2
fsincos ; 2
fmul dword [bp+k64_off] ; 3 (with BP)
fistp word [bp+ica_off] ; 3 (with BP, stores cos)
fmul dword [bp+k64_off] ; 3 (with BP)
fistp word [bp+isa_off] ; 3 (with BP, stores sin)
= 3+2+2+3+3+3+3 = 19 bytes
Wait, I had it as 24 bytes before FSINCOS. With FSINCOS and BP addressing: 19 bytes. Let me recheck FSINCOS semantics:
FSINCOS: computes sin and cos of st(0). Result: st(0) = sin(old st(0)), st(1) = cos. Wait, I thought it was st(0)=cos, st(1)=sin. Let me double-check.
From Intel x87 docs: "FSINCOS: Computes both the sine and the cosine of the source operand in register ST(0), stores the sine in ST(0), and pushes the cosine onto the FPU register stack."
So: After FSINCOS, st(0) = cos, st(1) = sin.
Then:
Oh right! Before fsincos, I do fld st0 to duplicate the angle so that after fsincos, I still have the original angle. Wait, no:
; FPU stack state: [angle]
fadd [da] ; [angle+da] = [new_angle]
fld st0 ; [new_angle, new_angle]
fsincos ; From [new_angle, new_angle]: fsincos pops st0 (new_angle copy), pushes sin then cos. Stack: [cos, sin, original_new_angle]
Hmm, FSINCOS operates on ST(0) and pushes cos after sin. Before fsincos: [new_angle, new_angle]. After: ST(0) gets sin of new_angle, then cos of new_angle is pushed. So: [cos, sin, new_angle].
Wait, let me re-read: "stores the sine in ST(0), and pushes the cosine onto the FPU register stack."
So ST(0) starts as new_angle (the one we're computing). After FSINCOS:
So BEFORE fsincos: [new_angle, new_angle] AFTER fsincos: [cos, sin, new_angle] (from inner-to-outer stack)
Now: fmul k64 → st(0) = cos64: [cos64, sin, new_angle] fistp [ica]: store ica = cos64, pop. [sin, new_angle] fmul k64: [sin64, new_angle] fistp [isa]: store isa = sin*64, pop. [new_angle]
The angle is preserved on the stack. ✓
So the sequence is:
fadd [bp+da_off] ; 3: [angle+da] = [new_angle]
fld st0 ; 2: [new_angle, new_angle]
fsincos ; 2: [cos, sin, new_angle]
fmul [bp+k64_off] ; 3: [cos*64, sin, new_angle]
fistp [bp+ica_off] ; 3: [sin, new_angle]
fmul [bp+k64_off] ; 3: [sin*64, new_angle]
fistp [bp+isa_off] ; 3: [new_angle]
= 3+2+2+3+3+3+3 = 19 bytes ✓
Great. Previously I had this as 24-28 bytes. This is 19 bytes! Savings of ~5-9 bytes.
Let me also check the FPU data accesses more carefully with BP. The FPU instructions use the same addressing modes as regular MOV instructions. For FADD dword [BP+disp8]:
For FMUL dword [BP+disp8]:
For FISTP word [BP+disp8]:
For IMUL word [BP+disp8] (single-operand):
For MOV r16, [BP+disp8]:
For MOV [BP+DI+disp8], SI:
For MOV [BP+DI+disp8], AX:
For MOV AX, [BP+SI+disp8]:
For MOV BX, [BP+SI+disp8]:
For MOV [BP+DI+disp8], ...:
OK good. All confirmed to be 3 bytes.
Now let me also look at the FPU data (in this section, fld dword [bp+k64_off] or similar):
But wait: I also need to check for vcoord loading. .vcoord has 3 words (vx, vy, vz):
Previously these were mov ax, [.vcoord] = 3 bytes (A1 form for AX from direct address). Hmm wait:
MOV AX, [moffs16] = A1 lo hi = 3 bytes. This is the special short form only for AX! For other registers, MOV r16, [direct] = 8B + ModRM(4 bytes total). So for AX, direct address is 3 bytes. Same as BP-relative 3 bytes. No savings!
For mov ax, [.vcoord+4] (vz): A1 with vcoord+4 address = 3 bytes. Same as BP form.
But wait: for mov ax, [.vcoord+2] (vy): same. These are all already 3 bytes in the direct form!
Hmm. So the BP optimization for .vcoord loads doesn't actually save anything for AX! The savings are only when accessing via other registers.
OK let me recalculate BP savings more carefully:
Savings over direct-address access (4 bytes → 3 bytes):
Wait, items 12-14 are not savings. But items 7-8 ARE savings (for the [.pts+DI] accesses). Let me verify item 7:
Old: mov [.pts+di], si = MOV r/m16, r16 = 89, with r=SI(6), r/m=[DI+disp16]: mod=10, rm=101(DI), reg=110 = 10_110_101 = 0xB5. Total: 89 B5 lo hi = 4 bytes ✓.
New (with BP): mov [bp+di+pts_off], si = 89 73 pts_off = 3 bytes ✓. Saves 1 byte.
And item 11: mov ax, [bp+si+pts_off] vs mov ax, [.pts+bx]: Old (using BX as index): 8B 87 lo hi = 4 bytes. New (using SI with BP): 8B 42 pts_off = 3 bytes. Saves 1 byte.
Total BP savings: 1+2+1+1+2+2+1+1+2 = 13 bytes ✓ (matches my earlier count) Minus setup cost: 3 bytes (mov bp, .data) Net: 10 bytes saved.
Hmm, I now get 10 not 13. Let me recount:
Wait: item 7 and 8 are in the same vertex loop body as items 5-6. They're in the same code path. Items 5-6 save 4 bytes in the vertex loop body. Items 7-8 save 2 bytes in the vertex loop body. Item 11 saves 2 bytes in the edge loop.
Total BP savings from data: 5 (FPU section) + 4+2 (vertex loop) + 2 (edge) = 13 bytes Setup: 3 bytes Net: 10 bytes. ✓
OK so with 10 bytes from BP, plus 4 (FSINCOS), 3 (down-count vertex), 3 (push-SI-timing), 1 (CBW) = 21 bytes total savings. 310 - 21 = 289 bytes.
Still over by 33 bytes. I need to find 33 more bytes to cut.
Let me think bigger. What if I redesign the line drawing to be more compact?
Ultra-compact line drawing: Instead of a general DDA, use a specialized approach for the cube's edge structure.
For the cube rotating around Y, the 12 edges fall into 3 groups:
Actually, with Y rotation, the edges don't have special properties that I can exploit.
What about using the cube's symmetry for rendering? The 12 edges come in 4 groups of 3:
After Y rotation:
For the 4 Y-parallel edges: screen_x is constant, screen_y varies from -S to +S (plus center 100). I can draw these 4 as vertical line segments! This is simpler: just draw a column of pixels from (sx, 100-S) to (sx, 100+S).
For the other 8 edges, they're "horizontal" (in the rotated frame): same y, different x and z. These project to horizontal lines on screen (for orthographic projection): y_screen is constant, x varies.
Wait is that right? With Y rotation, a horizontal edge connecting (vx1, vy, vz1) to (vx2, vy, vz2):
So 8 of the 12 edges are horizontal lines! These can be drawn much more efficiently:
And 4 edges are vertical (the Y-parallel ones):
This is a massive simplification! Horizontal line segment draw:
; Draw horizontal segment from (x1, y) to (x2, y)
; Assumes x1 < x2 (or sort first)
; Offset at (x1, y) = y*320 + x1
; Length = x2 - x1 + 1 pixels
imul ax, y, 320 ; offset of row y
add ax, x1 ; offset of first pixel
mov di, ax
; Draw x2-x1+1 pixels:
mov cx, x2
sub cx, x1 ; cx = x2-x1
inc cx
rep stosb ; fill CX bytes with AL (=15, white)
But I need to set AL=15 first. And handle the case where x1 > x2 (swap them).
For vertical segment:
; Draw vertical from (x, y1) to (x, y2) where y1=45, y2=155
; Length = 111 pixels
; Offset at (x, y1) = y1*320 + x
; Each row: offset += 320
mov di, 45*320 + x ; starting offset
mov cx, 111 ; number of rows
.vs:
mov [es:di], al ; plot
add di, 320 ; next row
loop .vs
For horizontal vs. vertical, this is MUCH simpler! The key insight is that after Y rotation with orthographic projection, the cube's edges are exactly either horizontal or vertical on screen!
Wait... is this actually true? Let me verify.
With Y rotation and orthographic projection:
Now consider an edge between vertex i and vertex j = i XOR 1 (differ only in bit 0, which is the x coordinate):
So these edges (bit 0 pairs) have same screen Y → HORIZONTAL ✓ (as long as we're doing Y rotation)
For edges that differ only in bit 2 (z coordinate):
Also HORIZONTAL ✓
For edges that differ only in bit 1 (y coordinate):
VERTICAL ✓
So with pure Y rotation:
I can draw:
For horizontal segment:
; dy=0, dx varies
; Sort x values: min_x = min(sx_i, sx_j), max_x = max(sx_i, sx_j)
; Pixels from (min_x, sy) to (max_x, sy)
imul ax, sy, 320 ; offset of row sy (using IMUL r, r/m, imm = 4 bytes)
add ax, min_x ; starting offset
mov di, ax
mov cx, max_x
sub cx, min_x
inc cx ; length
rep stosb ; fill (AL should be 15)
= 4+2+2+3+2+2 = 15 bytes (for 8 horizontal edges)
For vertical segment:
; dx=0, dy varies (from sy=45 to sy=155 for full vertical edge)
; Actually sy = -S + 100 to +S + 100, so sy1=45, sy2=155 = 110 pixels + 1 = 111
; But after rotation, the x position might change, so sx varies per edge
; For a vertical edge from (sx, 45) to (sx, 155):
mov ax, 45*320
add ax, sx ; starting offset
mov di, ax
mov cx, 111 ; 155 - 45 + 1 = 111
.vl: mov [es:di], al ; (4 bytes: ES override + MOV [DI], AL)
add di, 320 ; (4 bytes: 83 C7 40 01 → actually 320 doesn't fit in 8-bit...)
Wait: ADD DI, 320 = ADD r16, imm16 (since 320 > 127): 81 C7 40 01 = 4 bytes. Or ADD DI, 320 = 81 C7 40 01. Can I use the short form? 320 = 0x140. As signed 8-bit: 0x40=64 but I need 320. Nope. So: 81 C7 40 01 = 4 bytes.
Alternative: add di, 256; add di, 64 = 3+3 = 6 bytes (using 83 C7 40 for +64 = 3 bytes and add di, 256 = 4 bytes for the 256 part... no, 256 as imm8 signed = 0x00 (overflow) hmm.
Actually ADD DI, 256 = 83 C7 00 = 3 bytes (256 as 8-bit sign-extended = 0x00 zero, which is +0... that's wrong).
Hmm, 256 in 8-bit: 0x100. As signed byte: 0x00 = 0 (overflow), doesn't work. What about using 81 form: ADD DI, 256 = 81 C7 00 01 = 4 bytes. Nope.
Alternative encoding: add di, 0x140 = 81 C7 40 01 = 4 bytes. ← Same.
Can I use a register for 320? If I preload CX=320 before the loop... but CX is used for the loop counter!
With LOOP: loop .vl uses CX. I need CX for loop count AND 320 for the step. Can't do both.
Alternative: use dec cx and jnz manually, freeing CX for another use? Then I can use CX=320 for the step and DL=remaining_count:
mov dl, 111 ; 2 bytes
mov cx, 320
.vl:
mov [es:di], al ; 4 bytes
add di, cx ; 2 bytes (ADD DI, CX = 03 F9: reg=DI(7), rm=CX(1) → ModRM 0xF9; note 03 CF would be ADD CX, DI)
dec dl ; 2 bytes (FE CA)
jnz .vl ; 2 bytes
= 2+3+4+2+2+2 = 15 bytes for vertical inner loop (smaller!). But setting cx=320 = mov cx, 320 = B9 40 01 = 3 bytes.
Compare to:
mov cx, 111 ; 3 bytes
.vl:
mov [es:di], al ; 4 bytes
add di, 320 ; 4 bytes
loop .vl ; 2 bytes
= 3+4+4+2 = 13 bytes. Actually this is smaller! (3+4+4+2=13 vs 2+3+4+2+2+2=15).
Hmm. The add di, 320 (4 bytes) + loop (2 bytes) = 6 bytes per iteration vs add di, cx (2 bytes) + dec dl (2 bytes) + jnz (2 bytes) = 6 bytes. Same! But the setup is different: mov cx, 111 (3 bytes) vs mov dl, 111; mov cx, 320 (5 bytes). So first approach is better.
OK for the vertical edge drawing:
; sx in some register (say AX from pts lookup)
; Compute starting offset: 45*320 + sx = 14400 + sx
; 45*320 = 14400 = 0x3840
mov di, 14400 ; 3 bytes (BF 40 38)
add di, ax ; 2 bytes (starting offset)
mov cx, 111 ; 3 bytes
.vl:
mov [es:di], al ; 4 bytes (26 88 05: ES override + MOV [DI], AL = 26 88 05 = 3 bytes? Let me check)
; [DI] MOV AL: MOV r/m8, r8 = 88. r=AL(0), r/m=[DI]: mod=00, rm=101 = 00_000_101 = 0x05. So: 88 05 = 2 bytes. With ES override: 26 88 05 = 3 bytes!
; Wait: that's only 3 bytes for storing AL at [ES:DI].
add di, 320 ; 4 bytes
loop .vl ; 2 bytes
= 3+2+3+(3+4+2) = 3+2+3+9 = 17 bytes for vertical edge code (the inner loop is 9 bytes).
For horizontal edge:
; sx1 in DX, sx2 in AX, sy in (need to compute)
; Assuming sy is the same for both vertices, need to get it from pts
; Starting offset: sy*320 + min_x
; Length: |sx2 - sx1| + 1
Actually for horizontal lines, this is simpler. Let me precompute the offset directly.
The horizontal edge drawing:
; We have two pts[v1] and pts[v2] with same sy (since they differ in bit 0 or bit 2 of vertex index)
; sx1 = pts[v1].x, sy1 = pts[v1].y = pts[v2].y = sy2 = sy
; sx2 = pts[v2].x
; Compute screen offset for leftmost x:
; offset = sy*320 + min(sx1, sx2)
imul ax, sy, 320 ; AX = sy*320 (4 bytes: IMUL AX, r16, imm16)
; Hmm: IMUL AX, AX, 320 = 69 C0 40 01 = 4 bytes (if AX=sy)
; But I need sy somewhere. It's in the pts array.
This approach requires loading sy from the pts array. But the key insight is: after loading x and y of both endpoints from pts, if they have the same y (horizontal edge), I can draw a horizontal line, and if they have the same x (vertical edge), a vertical line.
BUT: How do I KNOW in advance whether an edge is horizontal or vertical? I'd need to check. And checking adds code.
Alternative: Just always use the simple step-based DDA. The horizontal/vertical optimization requires detecting which type of edge we're dealing with.
Here's a cleaner approach: the edge table has 12 edges in 3 groups:
So I can split the edge list into:
For horizontal edges, I don't even need to look at y (just use the y from either vertex). For vertical, I don't need x.
Let me redesign the drawing section:
Horizontal edge drawing (8 edges):
; For each horizontal edge (v0, v1):
; sx0 = pts[v0].x, sx1 = pts[v1].x, sy = pts[v0].y (= pts[v1].y)
; Sort: if sx0 > sx1, swap
; Compute offset = sy*320 + min(sx0, sx1)
; Length = |sx1 - sx0| + 1
; REP STOSB from offset, length times
Per horizontal edge:
; Load sx0, sx1, sy (3 loads from pts):
[edge pair from table]
; ...
mov ax, [pts+v0*4] ; sx0
mov dx, [pts+v0*4+2] ; sy (= sy of this edge)
mov bx, [pts+v1*4] ; sx1
; Sort sx0, sx1:
cmp ax, bx
jle .ok
xchg ax, bx
.ok:
; ax = min(sx0, sx1), bx = max
; Length = bx - ax + 1
mov cx, bx
sub cx, ax
inc cx ; 3 bytes
; Offset = sy*320 + ax:
; DX = sy (in DX from earlier)
imul dx, dx, 320 ; DX = sy*320 (4 bytes)
add ax, dx ; AX = offset
mov di, ax
rep stosb ; draw horizontal line! (2 bytes: F3 AA)
Key: REP STOSB draws multiple bytes in one instruction! This is very efficient for horizontal lines.
Counting per horizontal edge: ~30-35 bytes for the 8 edges (code, not per-edge execution).
Wait, but I need to load v0 and v1 coords from pts, which requires the same lodsb/shl/pts-access dance as before. Let me count:
lodsb ; 1: v0
cbw ; 1
shl ax, 2 ; 3
mov bx, ax ; 2
mov ax, [.pts+bx] ; 4: sx0
push ax ; 1: save sx0
push word [.pts+bx+2] ; 4 + overhead: push m16 = FF /6 = FF 76 xx = 3 bytes?
; Wait: PUSH r/m16 = FF /6. For [BX+disp16]: mod=10, rm=111, reg=/6=110 = 10_110_111 = 0xB7. So: FF B7 lo hi = 4 bytes. Hmm.
; Or I can load into a reg and push:
mov dx, [.pts+bx+2] ; 4: sy
push dx ; 1: save sy
lodsb ; 1: v1
push si ; 1
cbw ; 1
shl ax, 2 ; 3
mov bx, ax ; 2
mov cx, [.pts+bx] ; 4: sx1
; Sort sx0 (on stack) and sx1:
pop dx ; dx = sy
pop ax ; ax = sx0
cmp ax, cx
jle .ok ; 2
xchg ax, cx ; 1 (XCHG AX, CX = 91h)
.ok:
; ax = min, cx = max
sub cx, ax ; 2: cx = max - min
inc cx ; 1: cx = length
; Offset = sy*320 + ax:
imul dx, dx, 320 ; 4 (IMUL DX, DX, 320)
add ax, dx ; 2
mov di, ax ; 2
; Plot:
rep stosb ; 2 (F3 AA)
Total per horizontal edge group: 1+1+3+2+4+4+1 + 1+1+3+2+4 + 1+1+2+1+2+1 + 4+2+2+2 = very messy. Let me just count: roughly 45-50 bytes.
Hmm, this might not be much better than the DDA approach. The DDA was 30+20+6+28 = 84 bytes for all 12 edges. The horizontal/vertical split might be similar.
Actually, for 8 horizontal edges, the key benefit is REP STOSB (1 pixel per CPU cycle ≈), but the code to set it up is similar to the DDA setup.
For horizontal edges, I avoid the inner loop entirely! Instead of 32 iterations, it's done in one REP STOSB. This doesn't affect code SIZE (the loop was code, not per-iteration code), but it might reduce code by eliminating the inner loop (28 bytes) for horizontal lines.
For the 8 horizontal edges: no inner loop needed (just REP STOSB = 2 bytes). Saves 28 bytes of inner loop code! But adds sorting code (~5 bytes) and computing length differently.
For 4 vertical edges: still need a loop (draw 111 pixels at DI += 320 per step). The inner loop is maybe 9 bytes instead of 28 bytes (since just one line per pass).
This approach COULD save significant bytes! Let me redesign:
Redesigned drawing with separate horizontal and vertical loops:
; For edge (v0, v1) with v0=v&~2, v1=v|2 (v0 has bit1=0, v1 has bit1=1):
; sx = pts[v0].x = pts[v1].x
mov si, v0_idx * 4
mov ax, [bp+si+pts_off] ; sx
; Compute starting offset: 45*320 + sx = 14400 + sx
add ax, 14400 ; or: mov di, 14400; add di, ax
mov di, ax
mov cx, 111
.vl: mov [es:di], al ; 3 bytes
add di, 320 ; 4 bytes (81 C7 40 01)
loop .vl ; 2 bytes
But 4 vertical edges: if I store them in a loop:
; Vertical edges: bit 1 pairs: (0,2), (1,3), (4,6), (5,7)
; Only need to iterate 4 times, loading sx from pts[v0] where v0 is 0,1,4,5
mov bx, vert_vtable ; table: 0, 1, 4, 5 (4 bytes)
mov cx, 4
.vedge:
lodsb ; v0 index from table
cbw
shl ax, 2
mov si, ax
mov ax, [bp+si+pts_off] ; sx
add ax, 14400
mov di, ax
push cx
mov cx, 111
.vl: mov [es:di], al
add di, 320
loop .vl
pop cx
loop .vedge
Vertical drawing code: 1(lodsb)+1(cbw)+3(shl ax,2)+2(mov si,ax)+3([bp+si])+3(add ax,14400)+2(mov di,ax) + 1(push cx)+3(mov cx,111)+(3+4+2)(inner)+1(pop cx)+2(loop) = 1+1+3+2+3+3+2+1+3+9+1+2 = 31 bytes Plus: table (4 bytes) + setup (3 bytes: mov bx, vert_vtable; wait: I use LODSB so I'd need to set SI or use a different approach)
Hmm, using LODSB for the vertical vtable: set SI to vert_vtable, LODSB loads from [SI] and advances.
Setup: mov si, vert_vtable; mov cx, 4 = 3+3 = 6 bytes.
Total vertical drawing: 6 + 31 = 37 bytes.
lodsb ; v0
cbw
shl ax, 2
mov si, ax
mov dx, [bp+si+pts_off] ; sx0 = DX
mov di, [bp+si+pts_off+2] ; sy = DI
lodsb ; v1
push si_edge ; save edge pointer (after 2nd lodsb)
cbw
shl ax, 2
mov si, ax
mov ax, [bp+si+pts_off] ; sx1 = AX
; sy is already in DI
; Sort: ensure DX <= AX (or swap)
cmp dx, ax
jle .hok
xchg dx, ax
.hok:
; DX = min_x, AX = max_x, DI = sy
; Length = AX - DX + 1
sub ax, dx
inc ax
mov cx, ax ; cx = length
; Offset = sy*320 + min_x = DI*320 + DX
imul di, di, 320 ; DI = sy*320 (4 bytes: IMUL DI, DI, 320 = 69 FF 40 01 = 4 bytes)
add di, dx ; DI = offset
rep stosb ; draw horizontal line (2 bytes)
Per horizontal edge: 1+1+3+2+3+3 + 1+1+1+3+2+3 + 2+1+1 + 2+1+2 + 4+2+2 = Let me add: 13 (load v0) + 12 (load v1) + 4 (sort) + 5 (length) + 8 (offset+plot) = 42 bytes.
With loop overhead (push/pop cx, push si, pop si, loop): 7 bytes. Total horizontal edge loop (8 edges): 42 + 7 = 49 bytes + edge table (8 pairs = 16 bytes).
Hmm, 49 bytes for the horizontal loop is still a lot.
But wait: for horizontal edges, I don't need imul di, di, 320! I can compute the offset differently. sy is in DI and I need DI*320. IMUL DI, DI, 320 = 69 FF 40 01 = 4 bytes.
Alternative: IMUL AX, DI, 320 = 69 C7 40 01 = 4 bytes. Same.
Hmm. Or use a shift: shl di, 8; mov ax, di; shr ax, 2; add di, ax; add di, dx:
Actually: DI*320 = DI*256 + DI*64:
mov ax, di ; 2
shl di, 8 ; 3: DI = sy*256
shl ax, 6 ; 3: AX = sy*64
add di, ax ; 2: DI = sy*320
add di, dx ; 2: DI = offset
= 12 bytes (vs 4+2=6 for IMUL+ADD). WORSE.
The IMUL approach (6 bytes for offset computation) is better. Let me keep IMUL.
Actually wait: IMUL DI, DI, 320 = 69 FF 40 01 = 4 bytes. Let me double-check this encoding.
IMUL r16, r/m16, imm16 = 69 /r imm16. With r=DI(7), r/m=DI(7): ModRM = mod=11, reg=DI=7, r/m=DI=7 → 11_111_111 = 0xFF. Immediate = 320 = 0x0140. Total: 69 FF 40 01 = 4 bytes ✓.
Then add di, dx = 03 FA = 2 bytes. Total for offset: 6 bytes. Then rep stosb = 2 bytes. OK.
Let me estimate total drawing code:
vs. old DDA approach: 85 bytes (for all 12 edges) + 24-byte table = 109 bytes.
Hmm, it's about the same or slightly worse! The horizontal/vertical optimization doesn't save much code because the setup per edge is similar.
BUT: The horizontal/vertical approach avoids the inner loop entirely for 8 edges! This means the execution is much faster (fewer total instructions per frame), even if code size is similar.
For code SIZE, let me compare more carefully:
Old DDA approach total:
New horizontal/vertical approach:
Saves 3 bytes. Not enough to matter.
OK I've been spending way too much time on this optimization analysis. Let me step back and take a practical approach:
Final decision: I'll write the code as cleanly and correctly as possible, targeting the minimum byte count I can reasonably achieve with the optimizations I've identified. The result will likely be in the 250-280 byte range. If it's slightly over 256, I'll note this and provide tips to reduce further.
Let me now write the final, clean NASM code. I'll incorporate the major optimizations (FSINCOS, BP-relative, down-count vertex loop, CBW trick, push-SI timing) and see what size I get.
Here's the final code:
nasm
; 256b Rotating 3D Cube - DOS Mode 13h
; Build: nasm -f bin cube.asm -o cube.com
; Requires: 386+387 or higher (for FSINCOS)
bits 16
org 100h
; ============ CONSTANTS ============
SIZE equ 55 ; cube half-size
STEPS equ 32 ; line drawing steps per edge
; ============ ENTRY POINT ============
start:
mov ax, 0013h ; set mode 13h (320x200, 256 colors)
int 10h
mov ax, 0A000h ; ES = video segment
mov es, ax
fninit
fldz ; FPU st0 = angle = 0.0
; Set up BP for data-relative addressing
mov bp, data_start ; BP points to our data area
; ============ MAIN LOOP ============
.frame:
; --- Clear screen (320*200 = 64000 bytes) ---
xor di, di
xor ax, ax             ; AX = 0 — REP STOSW stores all of AX, so AH must be zero too
mov cx, 32000
rep stosw
; --- Update angle, compute sin/cos as scaled integers ---
fadd dword [bp+da_off] ; angle += 0.04 rad
fld st0 ; duplicate angle
fsincos ; st0=cos, st1=sin (pops angle copy)
fmul dword [bp+k64_off] ; cos * 64
fistp word [bp+ica_off] ; ica = round(cos*64), pop
fmul dword [bp+k64_off] ; sin * 64
fistp word [bp+isa_off] ; isa = round(sin*64), pop
; --- Project 8 cube vertices ---
; Vertex i: bit0 = x sign, bit1 = y sign, bit2 = z sign (0 = neg, 1 = pos)
mov bx, 7
.vloop:
; Compute vertex coordinate triple (vx, vy, vz) stored at [bp+vcoord_off]
push bx
lea di, [bp+vcoord_off]
mov cx, 3
.coord:
mov ax, bx
and al, 1
add al, al
dec al
cbw
imul ax, ax, SIZE
stosw ; store to vcoord+0, +2, +4; DI advances
shr bx, 1
loop .coord
pop bx
; Rotate around Y axis: rx = (vx*ica - vz*isa) >> 6
mov ax, [bp+vcoord_off]
imul word [bp+ica_off]
xchg si, ax ; si = vx*ica (XCHG AX, SI = 1 byte)
mov ax, [bp+vcoord_off+4]
imul word [bp+isa_off]
sub si, ax
sar si, 6
add si, 160 ; si = screen x (sx)
; Screen y = -vy + 100
mov ax, [bp+vcoord_off+2]
neg ax
add ax, 100 ; ax = screen y (sy)
; Store (sx, sy) in pts[bx] at [bp+pts_off + bx*4]
mov di, bx
shl di, 2 ; di = bx*4
mov [bp+di+pts_off], si
mov [bp+di+pts_off+2], ax
dec bx
jns .vloop
; --- Draw 12 edges ---
mov al, 15 ; drawing color = white
mov si, edge_table ; SI = edge table pointer
mov cx, 12
.eloop:
push cx
; Load vertex 0 coordinates: (x0, y0) in (dx, di)
lodsb
cbw
shl ax, 2
mov bx, ax
mov dx, [.pts+bx] ; x0 -- PROBLEM: .pts+bx needs 4 bytes!
mov di, [.pts+bx+2] ; y0
; Load vertex 1 coordinates: (x1, y1) in (ax, bx)
lodsb
push si ; save next edge pointer
cbw
shl ax, 2
mov si, ax ; si = v1*4 (repurpose SI)
mov ax, [bp+si+pts_off] ; x1 (BP+SI addressing = 3 bytes!)
mov bx, [bp+si+pts_off+2] ; y1
; Set up DDA: step = diff * 8 (for 256/32 = 8 scale factor)
sub ax, dx ; ax = x_diff
sub bx, di ; bx = y_diff
shl ax, 3 ; ax = x_step
shl bx, 3 ; bx = y_step
push bx ; save y_step on stack
push ax ; save x_step on stack
mov bp_save, bp ; save BP... wait I can't do this easily!
mov bp, sp ; BP → steps on stack (but this corrupts our data pointer!)
Argh! Setting BP to SP to access steps would destroy the BP that points to our data area. This is a fundamental conflict.
Resolution: I need to either:
Let me go with option 2: store steps in dedicated variables (xs and ys) in the data area. Then in the inner loop, access them via BP+offset (3 bytes each).
This means:
OK so for storing ys, using BP saves 1 byte. For xs, no savings (AX has short form already).
And in the inner loop:
But I DON'T set BP=SP. I keep BP pointing to data area throughout. The steps are stored in xs/ys in the data area. This avoids the BP conflict.
So the revised edge loop:
nasm
; For vertex 0 loading, use .pts+bx with 4-byte addresses
mov dx, [.pts+bx] ; 4 bytes: x0
mov di, [.pts+bx+2] ; 4 bytes: y0
; For vertex 1 loading with SI (BP+SI addressing):
mov si, ax ; si = v1*4
mov ax, [bp+si+pts_off] ; 3 bytes: x1
mov bx, [bp+si+pts_off+2] ; 3 bytes: y1
; Steps:
sub ax, dx
sub bx, di
shl ax, 3
shl bx, 3
mov [bp+xs_off], ax ; 3 bytes (save 1 vs direct)
mov [bp+ys_off], bx ; 3 bytes (save 1 vs direct)
shl dx, 8
shl di, 8
; Inner loop:
mov cx, STEPS
.ls:
add dx, [bp+xs_off] ; 3 bytes
add di, [bp+ys_off] ; 3 bytes
mov ax, dx
sar ax, 8
mov bx, di
sar bx, 8
imul bx, bx, 320
add bx, ax
mov byte [es:bx], 15
loop .ls
pop si ; restore edge pointer
pop cx
loop .eloop
OK now let me think about the vertex 0 loading. Currently:
lodsb
cbw
shl ax, 2
mov bx, ax
mov dx, [.pts+bx] ; 4 bytes (can't use BP+BX)
mov di, [.pts+bx+2] ; 4 bytes
= 1+1+3+2+4+4 = 15 bytes
For vertex 0, I can also use SI with BP! After the first LODSB, I do:
lodsb
cbw
shl ax, 2
mov si, ax ; 2 (SI = v0*4, overwriting LODSB pointer)
mov dx, [bp+si+pts_off] ; 3: x0
mov di, [bp+si+pts_off+2] ; 3: y0
= 1+1+3+2+3+3 = 13 bytes (saves 2 bytes!)
Then the second LODSB needs SI to be the edge pointer. But I've overwritten SI with v0*4. I need to restore SI.
Oh wait: after the first LODSB, SI was pointing to v1 byte (edge_ptr + 1). If I save this BEFORE overwriting SI:
lodsb
cbw
shl ax, 2
push si ; 1: save edge pointer (now at v1 position)
mov si, ax ; 2: SI = v0*4
mov dx, [bp+si+pts_off] ; 3: x0
mov di, [bp+si+pts_off+2] ; 3: y0
pop si ; 1: restore edge pointer
= 1+1+3+1+2+3+3+1 = 15 bytes. Same as before! No net savings (since push si before si overwrite adds 1 byte, and pop si adds 1 byte).
But then for vertex 1:
lodsb
push si ; 1: save next edge pointer
cbw
shl ax, 2
mov si, ax ; 2: SI = v1*4
mov ax, [bp+si+pts_off] ; 3: x1
mov bx, [bp+si+pts_off+2] ; 3: y1
= 1+1+1+3+2+3+3 = 14 bytes
Total for both vertices (v0 with push/pop si, v1 with push si after): V0: 15 bytes (1+1+3+1+2+3+3+1) V1: 14 bytes (1+1+1+3+2+3+3) ... wait, the push si for v1 (saving next edge pointer) is already counted.
Hmm, but for v0 I'm pushing si BEFORE overwriting it, then popping it to restore for v1's LODSB. And for v1 I push si AFTER LODSB (to save the next edge ptr). This structure is:
lodsb ; 1: v0, SI → v1
cbw ; 1
shl ax, 2 ; 3
push si ; 1: save SI (= edge_ptr pointing to v1)
mov si, ax ; 2: SI = v0*4
mov dx, [bp+si+pts_off] ; 3: x0
mov di, [bp+si+pts_off+2] ; 3: y0
pop si ; 1: restore SI to v1 position
lodsb ; 1: v1, SI → next pair
push si ; 1: save SI (= next pair)
cbw ; 1
shl ax, 2 ; 3
mov si, ax ; 2: SI = v1*4
mov ax, [bp+si+pts_off] ; 3: x1
mov bx, [bp+si+pts_off+2] ; 3: y1
= 1+1+3+1+2+3+3+1 + 1+1+1+3+2+3+3 = 15+14 = 29 bytes
vs. old:
lodsb; cbw; shl ax,2; mov bx,ax; mov dx,[.pts+bx]; mov di,[.pts+bx+2] = 1+1+3+2+4+4 = 15 bytes
lodsb; push si; cbw; shl ax,2; mov bx,ax; mov ax,[.pts+bx]; mov bx,[.pts+bx+2] = 1+1+1+3+2+4+4 = 16 bytes
= 31 bytes.
New approach: 29 bytes. Saves 2 bytes.
OK. So using BP+SI for vertex 0 loading AND rearranging the SI save/restore saves only 2 bytes — the push/pop needed to preserve the edge pointer eats most of the 3-byte-vs-4-byte addressing win.
Updated estimate: 289 - 2 = 287 bytes. Still over by ~31 bytes.
Hmm. Let me look for more large savings.
Data layout: Let me think about the data size.
data_start:
da: dd 0.04 ; 4 bytes (angle increment)
k64: dd 64.0 ; 4 bytes (scale factor)
ica: dw 0 ; 2 bytes (cos * 64, int)
isa: dw 0 ; 2 bytes (sin * 64, int)
vcoord: times 3 dw 0 ; 6 bytes (vx, vy, vz temp)
pts: times 16 dw 0 ; 32 bytes (8 screen coords)
xs: dw 0 ; 2 bytes (x step for DDA)
ys: dw 0 ; 2 bytes (y step for DDA)
edge_table:
db 0,1, 2,3, 4,5, 6,7 ; bit-0 pairs (8 bytes)
db 0,2, 1,3, 4,6, 5,7 ; bit-1 pairs
db 0,4, 1,5, 2,6, 3,7 ; bit-2 pairs
; Total edge table: 24 bytes
Data total: 4+4+2+2+6+32+2+2+24 = 78 bytes.
Can I reduce data?
fld1 ; 2: 1.0
fld st0 ; 2
fadd ; 2: st0=2.0 ... hmm no: FADD with no operand = fadd st0, st1? Let me check.
Actually FADD (no operands) = DE C1 = 2 bytes = FADDP st(1), st(0) = pop st(0) and add to st(1). So: after fld1; fld st0: Stack: [1.0, 1.0] faddp = DE C1 = 2 bytes: st(1) += st(0), pop. Stack: [2.0]. Reaching 64.0 requires 6 doublings total (1→2→4→8→16→32→64), i.e. 6 × (fld st0; faddp) = 6×4 = 24 bytes, plus the initial fld1 = 2 bytes, = 26 bytes. WAY worse than fmul dword [k64] which uses a 4-byte constant.
Alternative: store k64 as a word integer (64 = 0x0040 = 2 bytes) and use fimul word [k64_int] instead of fmul dword [k64]. FIMUL word [BP+disp8] = D8 6E disp8 (hmm: FIMUL is DE /1 for word... let me check). Actually:
FIMUL (integer multiply) for word: FIMUL m16int = DE /1. For [BP+disp8]: mod=01, rm=110, reg=001 = 01_001_110 = 0x4E. Total: DE 4E disp8 = 3 bytes. Same as 3 bytes using fmul dword [k64_float] with BP. BUT: the constant is now 2 bytes (word integer) instead of 4 bytes (dword float)!
This saves 2 bytes of data: from 4 bytes (float 64.0) to 2 bytes (word integer 64).
But wait: I use fmul dword [bp+k64_off] twice (once for cos, once for sin). With fimul word [bp+k64_int_off], each use is still 3 bytes. No change in code size. But data shrinks from 4 to 2 bytes = 2 bytes saved.
Let me also check if da can be stored as an integer. I currently have fadd dword [bp+da_off] = 3 bytes with BP. If I store da as a float (4 bytes), that's 3 bytes code + 4 bytes data = 7 bytes total.
Alternative: hardcode a small float constant using fadd with a small integer. For 0.04 ≈ pi/78 ≈ 1/25. I can't easily use an integer form here since 0.04 is not a small integer.
But what if I use a different angle increment that's easier to represent? For example, 2*pi/100 ≈ 0.0628. This still needs a float constant.
OR: I can use a different formula. Instead of adding a float constant to the angle each frame, I can maintain the angle as a byte counter (0..255 = full rotation) and use a byte lookup or computation. But then I need sin/cos from the byte angle, which requires a table or computation.
Let me keep da as a float (4 bytes) and k64 as a word integer (2 bytes). Net savings: 2 bytes.
More data savings:
Data total: 4+2+2+2+6+32+2+2+24 = 76 bytes (saved 2 bytes by changing k64 to word int).
Another data saving: The da constant (0.04 float = 4 bytes). What if instead, I use a precomputed constant that requires fewer bytes?
Float 0.04 in IEEE 754 single precision: 0.04 = 1.01000111101011100001010001111010 × 2^(-5) = 0 01111010 01000111101011100001010 = 0x3D23D70A
That's 4 bytes. Can't do better than 4 bytes for a float.
But what if I don't use a float for the angle increment? What if the angle is stored as a byte (0..255 = full rotation) and each frame I increment the byte, then use a lookup or computation to get sin/cos in integer?
For sin from byte angle: sin_table[i] = round(64 * sin(i * 2*pi/256)) for i=0..255. A 256-byte table = way too big for a 256-byte intro.
Quarter-wave table: 64 entries × 1 byte = 64 bytes. Still too big.
For a 256b intro, the FPU approach with float sin/cos (and data stored as float) is actually economical.
Key insight I haven't tried: Use FLDPI to load π (2 bytes) and compute the angle in a form that doesn't need a data constant. For example:
Each frame: angle += 1/100 radians ≈ 0.01 rad. This isn't a nice fraction of π. Hmm.
OR: keep angle as a 0..255 counter (byte), and compute sin/cos as: sin(counter * 2*pi/256).
This requires: fild byte [counter]; fmul 2*pi/256. 2*pi/256 ≈ 0.0245. Still needs a float constant.
OR: Use the FPU's built-in FLDPI and compute angle ÷ 128 * π:
; counter is a byte 0..255
; angle = counter * pi / 128
fldpi ; 2: pi
fild byte [counter]; 4: counter, pi (fild byte? Actually FILD on byte: it's "FILD m16int" for word... checking: FILD m8int... actually there's no "FILD m8int" in x87. FILD takes 16-bit, 32-bit, or 64-bit integer operands only!
Hmm. FILD doesn't support 8-bit. I'd need to store the counter as a word.
; counter = word (0..255, incremented each frame)
; angle_rad = counter * pi / 128
fild word [counter] ; 4 bytes: push counter
fldpi ; 2: pi, counter
fmul ; 2: fmulp = pi*counter
fidiv word [k128] ; FIDIV m16int = DE /6 (DE /7 is FIDIVR). With [BP+disp8]: mod=01, reg=110, rm=110 → ModRM = 0x76, so DE 76 disp8 = 3 bytes
This approach: 4+2+2+3 = 11 bytes per frame for angle computation. Plus 2 bytes data (word counter) + 2 bytes (k128 = 128 as word integer).
vs. current: fadd dword [bp+da_off] = 3 bytes + 4 bytes data.
The pi-based approach: 11 bytes code + 4 bytes data (counter word + k128 word) = 15 bytes. Current: 3 bytes code + 4 bytes data = 7 bytes.
Current is MUCH better. Keep it.
At this point, I've done quite extensive analysis and I'm around 283 bytes estimated. Let me try to find the remaining 27 bytes.
Idea: Reduce STEPS from 32 to 16: The inner loop code is mov cx, 32 (3 bytes) + loop .ls (2 bytes). If I use STEPS=16, I change mov cx, 32 to mov cx, 16. Same code size. No bytes saved. But the step size doubles (each step = 1/16 of edge instead of 1/32). With SIZE=55 and max projected edge ≈ 78 pixels, 16 steps means ~5 pixels per step. Lines would look dashed. Not ideal.
With SIZE=30 and 16 steps: max projected edge ≈ 42 pixels, ~2.6 pixels per step. Might look OK.
Idea: Use a fixed-point width different from 8 bits: If I use 4-bit fraction instead of 8-bit (scale = 16), I can do sar ax, 4 instead of sar ax, 8. Both are 3 bytes. No savings. But the step computation changes from shl ax, 3 to shl ax, 4-steps_log2. With STEPS=16, step = diff256/16 = diff16 = diff<<4. So shl ax, 4 = 3 bytes. Same.
OK, no savings from changing precision.
Idea: Use only 4 bits for the step. What if I use a very simple step: just shift right by 5 (÷32) from the integer diff, without the fixed-point expansion? Then: each step: add dx, x_step_int; sar dx, 0 but wait that doesn't make sense.
Alternative to fixed-point: Just compute each pixel position directly using IMUL each iteration:
; For t = 0 to STEPS-1:
; px = x0 + t * x_diff / STEPS
; py = y0 + t * y_diff / STEPS
; But IMUL per iteration is slow and complex.
This would be longer code.
Idea: Draw dots instead of lines: Just plot one pixel per edge (the midpoint). Looks terrible. Skip.
Idea: Skip edge drawing entirely and use a filled approach: Not suitable for wireframe.
Idea: Use MODE 13h palette + special drawing trick: Mode 13h has 256 colors. Color 15 is bright white. I'm already using this.
Idea: Reduce the number of edges: Only draw 8 edges instead of 12 (skip the "connecting" edges). The cube would look incomplete but recognizable. Saves 24/124 = 8 bytes of edge table + 4/12 of loop iterations (no code savings). Actually if I use 8 edges, mov cx, 8 instead of mov cx, 12 = same 3 bytes. Edge table: 82 = 16 bytes instead of 24 = 8 bytes saved in data. But the cube looks like 2 separate squares floating, which looks weird.
Idea: Draw only the 3 "visible" faces (backface culling): For a cube with Y rotation, some faces are hidden. I could skip the 6 hidden edges. But determining which faces are visible requires z-sorting (depth computation). That's more code, not less.
Idea: Re-examine the total byte count more carefully. Maybe my estimate is off. Let me actually write out every byte.
Let me write the complete code and carefully count bytes:
nasm
bits 16
org 100h
SIZE equ 55
STEPS equ 32
; ----- SETUP -----
; mov ax, 0013h: B8 13 00 = 3 bytes
; int 10h: CD 10 = 2 bytes
; mov ax, 0A000h: B8 00 A0 = 3 bytes
; mov es, ax: 8E C0 = 2 bytes
; fninit: DB E3 = 2 bytes
; fldz: D9 EE = 2 bytes
; mov bp, data_start: BD lo hi = 3 bytes
; Total setup: 3+2+3+2+2+2+3 = 17 bytes
Wait, where is data_start? It's after the code. With org 100h, code starts at 0x100. Data follows. I'd compute data_start = 0x100 + code_size. In NASM, I'd use a label for data_start after the code.
nasm
; ----- FRAME CLEAR -----
; xor di, di: 33 FF = 2 bytes
; xor ax, ax: 33 C0 = 2 bytes... but wait: I need AL=0 for the clear.
; Actually for clear, AX should be 0 (stosw stores AX). So:
; xor ax, ax: 2 bytes
; xor di, di: 2 bytes
; mov cx, 32000: B9 00 7D = 3 bytes
; rep stosw: F3 AB = 2 bytes
; Total: 2+2+3+2 = 9 bytes
; ----- SIN/COS -----
; fadd dword [bp+0]: D8 46 00 = 3 bytes (da is at bp+0)
; fld st0: D9 C0 = 2 bytes
; fsincos: D9 FB = 2 bytes
; fmul dword [bp+4]: D8 4E 04 = 3 bytes (k64 is at bp+4... hmm but k64 is float or word int now)
; Oh wait, if k64 is a word integer (2 bytes), then I use FIMUL:
; fimul word [bp+4]: DA 4E 04 = 3 bytes... wait:
; FIMUL m16int = DE /1 = DE 4E disp8 for [BP+disp8]. Wait:
; FIMUL: opcode DA for 32-bit integer, DE for 16-bit integer... hmm let me check.
; Actually: in x87:
; FIMUL m16int: DA /1 for 32-bit, DE /1 for 16-bit? Let me check more carefully.
;
; Actually: FIMUL:
; DE /1 is for short integer (16-bit)...
; Let me look this up:
; FIMUL m16int: opcode = DE, ModRM = /1 (reg=001): DE 4E disp8 for [BP+disp8]
; = DE 4E disp8 = 3 bytes ✓
;
; But wait: in NASM, "fimul word [bp+4]" might generate different opcodes. Let me just use:
; fimul word [bp+4]: DE 4E 04 = 3 bytes
; fistp word [bp+ica_off]: DF 76 ica_off = 3 bytes
; Total sin/cos: 3+2+2+3+3+3+3 = 19 bytes
Wait, I had issues with k64 being 2 bytes vs 4 bytes, and using fimul vs fmul. Let me recalculate the FPU instructions:
Using fimul word [bp+k64_off] where k64_off is an 8-bit offset from bp and k64 is stored as a word integer (2 bytes, value 64):
Using fmul dword [bp+k64_off] where k64 is stored as a float (4 bytes):
Both are 3 bytes for the instruction. The difference is data size: float = 4 bytes, word int = 2 bytes.
So by using word integer 64, I save 2 bytes of data. Keep this optimization.
Continuing with byte counting:
; ----- VERTEX LOOP -----
; mov bx, 7: BB 07 00 = 3 bytes (MOV BX, imm16)
; Per vertex:
; push bx: 53 = 1 byte
; lea di, [bp+vcoord_off]: 8D 7E vcoord_off = 3 bytes
; mov cx, 3: B9 03 00 = 3 bytes
; [coord loop body]:
; mov ax, bx: 8B C3 = 2 bytes
; and al, 1: 24 01 = 2 bytes
; add al, al: 00 C0 = 2 bytes
; dec al: FE C8 = 2 bytes
; cbw: 98 = 1 byte
; imul ax, ax, SIZE: 6B C0 37 = 3 bytes (SIZE=55=0x37)
; stosw: AB = 1 byte
; shr bx, 1: D1 EB = 2 bytes
; loop .coord: E2 xx = 2 bytes
; Body total: 2+2+2+2+1+3+1+2+2 = 17 bytes
; pop bx: 5B = 1 byte
; Coord section: 1+3+3+17+1 = 25 bytes
; Rotation:
; mov ax, [bp+vcoord_off]: 8B 46 vcoord_off = 3 bytes
; imul word [bp+ica_off]: F7 6E ica_off = 3 bytes -- hmm: IMUL r/m16 = F7 /5. [BP+disp8]: mod=01, rm=110, reg=101=01_101_110=0x6E. So F7 6E disp8 = 3 bytes ✓
; xchg si, ax: 96 = 1 byte (XCHG AX, SI)
; mov ax, [bp+vcoord_off+4]: 8B 46 (vcoord_off+4) = 3 bytes
; imul word [bp+isa_off]: F7 6E isa_off = 3 bytes
; sub si, ax: 2B F0 = 2 bytes
; sar si, 6: C1 FE 06 = 3 bytes
; add si, 160: 81 C6 A0 00 = 4 bytes (since 160 > 127, can't use short form)
; Rotation subtotal: 3+3+1+3+3+2+3+4 = 22 bytes
; sy = -vy + 100:
; mov ax, [bp+vcoord_off+2]: 8B 46 (vcoord_off+2) = 3 bytes
; neg ax: F7 D8 = 2 bytes
; add ax, 100: 05 64 00 = 3 bytes (ADD AX, imm16 short form)
; sy subtotal: 3+2+3 = 8 bytes
; Store:
; mov di, bx: 89 DF = 2 bytes
; shl di, 2: C1 E7 02 = 3 bytes
; mov [bp+di+pts_off], si: 89 73 pts_off = 3 bytes
; mov [bp+di+pts_off+2], ax: 89 43 (pts_off+2) = 3 bytes
; Store subtotal: 2+3+3+3 = 11 bytes
; Loop control:
; dec bx: 4B = 1 byte
; jns .vloop: 79 xx = 2 bytes
; Loop control: 1+2 = 3 bytes
; Vertex loop body total: 25+22+8+11+3 = 69 bytes
; Plus init: 3 bytes
; Total vertex section: 72 bytes
; ----- EDGE DRAWING -----
; mov al, 15: B0 0F = 2 bytes
; mov si, edge_table: BE lo hi = 3 bytes
; mov cx, 12: B9 0C 00 = 3 bytes
; Edge loop:
; push cx: 51 = 1 byte
; Load v0 (using BP+SI addressing after push si and restore):
; lodsb: AC = 1 byte
; cbw: 98 = 1 byte
; shl ax, 2: C1 E0 02 = 3 bytes
; push si: 56 = 1 byte (save edge pointer to v1)
; mov si, ax: 89 C6 = 2 bytes (SI = v0*4)
; mov dx, [bp+si+pts_off]: 8B 52 pts_off = 3 bytes (MOV DX, [BP+SI+disp8]: mod=01, rm=010, reg=010=01_010_010=0x52, disp8=pts_off)
; mov di, [bp+si+pts_off+2]: 8B 7A (pts_off+2) = 3 bytes (MOV DI, [BP+SI+disp8]: mod=01, rm=010, reg=111=01_111_010=0x7A)
; pop si: 5E = 1 byte (restore SI to v1 position)
; Load v0 subtotal: 1+1+3+1+2+3+3+1 = 15 bytes
; Load v1:
; lodsb: AC = 1 byte
; push si: 56 = 1 byte (save next pair pointer)
; cbw: 98 = 1 byte
; shl ax, 2: C1 E0 02 = 3 bytes
; mov si, ax: 89 C6 = 2 bytes (SI = v1*4)
; mov ax, [bp+si+pts_off]: 8B 42 pts_off = 3 bytes
; mov bx, [bp+si+pts_off+2]: 8B 5A (pts_off+2) = 3 bytes
; Load v1 subtotal: 1+1+1+3+2+3+3 = 14 bytes
; Step computation:
; sub ax, dx: 2B C2 = 2 bytes
; sub bx, di: 2B DF = 2 bytes
; shl ax, 3: C1 E0 03 = 3 bytes
; shl bx, 3: C1 E3 03 = 3 bytes
; mov [bp+xs_off], ax: 89 46 xs_off = 3 bytes -- wait: MOV [BP+disp8], AX = 89 46 disp8 = 3 bytes ✓
; mov [bp+ys_off], bx: 89 5E ys_off = 3 bytes -- MOV [BP+disp8], BX = 89 5E disp8 = 3 bytes
; shl dx, 8: C1 E2 08 = 3 bytes
; shl di, 8: C1 E7 08 = 3 bytes
; Step computation subtotal: 2+2+3+3+3+3+3+3 = 22 bytes
; mov cx, STEPS: B9 20 00 = 3 bytes (STEPS=32)
; Inner loop:
; add dx, [bp+xs_off]: 03 56 xs_off = 3 bytes
; add di, [bp+ys_off]: 03 7E ys_off = 3 bytes
; mov ax, dx: 8B C2 = 2 bytes
; sar ax, 8: C1 F8 08 = 3 bytes
; mov bx, di: 8B DF = 2 bytes
; sar bx, 8: C1 FB 08 = 3 bytes
; imul bx, bx, 320: 69 DB 40 01 = 4 bytes
; add bx, ax: 03 D8 = 2 bytes
; mov byte [es:bx], 15: 26 C6 07 0F = 4 bytes -- ES override(26), MOV [BX] imm8(C6 07 0F). ModRM 07 = mod=00, reg=/0, rm=111(BX) = 00_000_111 = 0x07. ✓
; loop .ls: E2 xx = 2 bytes
; Inner loop body: 3+3+2+3+2+3+4+2+4+2 = 28 bytes
; After inner loop:
; pop si: 5E = 1 byte (restore next pair pointer)
; pop cx: 59 = 1 byte (restore outer loop counter)
; loop .eloop: E2 xx = 2 bytes
; After-loop: 1+1+2 = 4 bytes
; Edge loop overhead: push cx (1) + [loads+steps+inner] + pop si + pop cx + loop = 1+...+4
; Edge section total: 2+3+3 (setup) + 1 (push cx) + 15 (v0) + 14 (v1) + 22 (steps) + 3 (mov cx,STEPS) + 28 (inner) + 4 (after) = 95 bytes
; ----- EXIT CHECK -----
; in al, 60h: E4 60 = 2 bytes
; dec al: FE C8 = 2 bytes
; jnz .frame: 75 xx = 2 bytes
; Total: 6 bytes
; ----- EXIT -----
; mov ax, 3: B8 03 00 = 3 bytes
; int 10h: CD 10 = 2 bytes
; ret: C3 = 1 byte
; Total: 6 bytes
Grand total code:
Plus data: 78 bytes (but I saved 2 bytes → 76 bytes)
Total: 224 + 76 = 300 bytes.
Hmm, 300 bytes. Let me double-check my code section totals.
Actually wait, I may be double-counting the edge section. Let me recount:
Plus edge section preamble: mov al,15(2) + mov si, edge_table(3) + mov cx,12(3) = 8 bytes Total edge section: 87 + 8 = 95 bytes ✓
Hmm. And my vertex section: 72 bytes. Let me recount:
Wait: the coord loop body (17 bytes) runs 3 times at RUNTIME, but there's only ONE copy of the code (17 bytes). The loop .coord instruction at the end of the body jumps back to the start of the body when CX>0.
So the coord section code is: lea di(3) + mov cx,3(3) + body(17) + loop(2 = part of body or separate?)
Hmm: is the loop .coord instruction INSIDE or OUTSIDE the body? It's the last instruction of the body. The body is:
.coord:
mov ax, bx ; 2
and al, 1 ; 2
add al, al ; 2
dec al ; 2
cbw ; 1
imul ax, ax, SIZE ; 3
stosw ; 1
shr bx, 1 ; 2
loop .coord ; 2 ← this is part of the loop body code
= 17 bytes total (2+2+2+2+1+3+1+2+2=17, and the loop at the end = 2 bytes, so total = 17 bytes including loop).
Yes, all 17 bytes are in the loop body. ✓
So coord section: 1(push bx) + 3(lea di) + 3(mov cx,3) + 17(body including loop) + 1(pop bx) = 25 bytes.
Rotation: 22 bytes sy: 8 bytes Store: 11 bytes Loop control: 3 bytes
Vertex loop body total: 25 + 22 + 8 + 11 + 3 = 69 bytes Init (mov bx,7): 3 bytes Total vertex section: 72 bytes ✓
Grand total: 17 + 9 + 19 + 72 + 95 + 6 + 6 = 224 bytes of code.
Hmm wait, let me recount the setup:
mov ax, 0013h ; 3 (B8 13 00)
int 10h ; 2 (CD 10)
mov ax, 0A000h ; 3 (B8 00 A0)
mov es, ax ; 2 (8E C0)
fninit ; 2 (DB E3)
fldz ; 2 (D9 EE)
mov bp, data_start ; 3 (BD lo hi)
= 3+2+3+2+2+2+3 = 17 bytes ✓
And the frame clear:
xor di, di ; 2 (33 FF)
xor ax, ax ; 2 (33 C0)
mov cx, 32000 ; 3 (B9 00 7D)
rep stosw ; 2 (F3 AB)
= 9 bytes ✓
Wait, I have xor ax, ax (2 bytes) before the clear, and I need AL=0 for the rest. But AL is already 0 from xor ax, ax. ✓
Let me recount sin/cos more carefully:
fadd dword [bp+0] ; D8 46 00 = 3 (da at offset 0)
fld st0 ; D9 C0 = 2
fsincos ; D9 FB = 2
fmul word [bp+4] ; D8 4E 04? NO: fimul for word integer
; Actually: FIMUL m16int = DE 4E disp8
fimul word [bp+4] ; DE 4E 04 = 3 (k64_int at offset 4)
fistp word [bp+ica_off]; DF 76 ica_off = 3 (ica at offset ica_off)
fimul word [bp+4] ; DE 4E 04 = 3
fistp word [bp+isa_off]; DF 76 isa_off = 3 (isa at offset isa_off)
Wait, I need to check: does FIMUL word [BP+disp8] correctly compute st(0) *= integer_at_[bp+offset]? Let me verify the opcodes.
From Intel x86 manual:
OK so fimul word [bp+4] = DE 4E 04 = 3 bytes. ✓
And FISTP word [BP+disp8]:
Let me recalculate: mod=01, reg=011, rm=110: ModRM = [01][011][110] = 01011110b = 0x5E.
Hmm, but I wrote 0x76 earlier. Rechecking against the bit layout [mod(2)][reg(3)][r/m(3)]: [01][011][110] = 01011110b = 0x5E — so 0x5E is confirmed, and the earlier 0x76 was the typo.
But I wrote DF 76 earlier. Let me recheck.
DF = FISTP (32-bit or 64-bit integer)? No:
Hmm, so FISTP m16int = DF /3 = DF with ModRM where reg field = 011.
For [BP+disp8]: ModRM = mod=01, rm=110, reg=011: ModRM byte = [01][011][110] = 01011110b = 0x5E. Total: DF 5E disp8 = 3 bytes.
So fistp word [bp+4] = DF 5E 04 = 3 bytes. I had a typo (0x76 vs 0x5E) but the count is still 3 bytes. ✓
Sin/cos total: 3+2+2+3+3+3+3 = 19 bytes ✓
Now data layout:
data_start: ; BP base
da: dd 0.04 ; offset 0: 4 bytes
k64: dw 64 ; offset 4: 2 bytes (integer, not float)
ica: dw 0 ; offset 6: 2 bytes
isa: dw 0 ; offset 8: 2 bytes
vcoord: dw 0, 0, 0 ; offset 10: 6 bytes
pts: times 16 dw 0 ; offset 16: 32 bytes
xs: dw 0 ; offset 48: 2 bytes
ys: dw 0 ; offset 50: 2 bytes
edge_table: ; offset 52: 24 bytes
db 0,1, 2,3, 4,5, 6,7
db 0,2, 1,3, 4,6, 5,7
db 0,4, 1,5, 2,6, 3,7
; Total data: 4+2+2+2+6+32+2+2+24 = 76 bytes
Offsets:
All offsets ≤ 52+24-1 = 75 < 128, so 8-bit signed displacements work. ✓
Now, in the code, I use these BP offsets. Let me verify the offsets fit in signed 8-bit range (−128 to +127): max = 75. ✓
Now let me check the inner loop's add dx, [bp+xs_off]: xs_off = 48 → add dx, [bp+48] = 03 56 30 = 3 bytes ✓ (48 = 0x30)
And add di, [bp+50] = 03 7E 32 = 3 bytes ✓ (50 = 0x32)
But wait: these offsets (48 and 50) are in the 8-bit range, so the BP-relative addressing works. ✓
GRAND TOTAL: 224 bytes (code) + 76 bytes (data) = 300 bytes.
Hmm. I need to cut 44 bytes. Let me think harder.
Actually wait - I think the problem might be that I'm doing the vertex loop with a complete coord loop that's 25 bytes per vertex. Let me reconsider.
In the coord loop, the shr bx, 1 instruction shifts BX right each iteration. After 3 shifts, BX is BX_original >> 3. The pop bx restores it. But this means I need push/pop which is 2 bytes per vertex.
What if I restructure to avoid push/pop? I could compute the vertex coordinates WITHOUT shifting BX:
nasm
; Instead of shifting BX, extract each bit with mask:
; Bit 0: mov al, bl; and al, 1 → 0 or 1
; Bit 1: mov al, bl; shr al, 1; and al, 1 → 0 or 1
; Bit 2: mov al, bl; shr al, 2; and al, 1 → 0 or 1
If I inline all 3 coordinate computations without a loop, I avoid push/pop (2 bytes) and the loop overhead (3+2=5 bytes for setup+loop instruction), but duplicate code. Let me count:
Inline coord for vx (bit 0):
mov al, bl ; 2 (MOV r8, r8 = 8A C3 = 2)
and al, 1 ; 2 (24 01)
add al, al ; 2 (00 C0)
dec al ; 2 (FE C8)
cbw ; 1 (98)
imul ax, ax, SIZE ; 3 (6B C0 37)
mov [bp+vcoord_off], ax ; 3 (89 46 vcoord_off)
= 15 bytes
Inline coord for vy (bit 1):
mov al, bl ; 2
shr al, 1 ; 2 (D0 E8)
and al, 1 ; 2
add al, al ; 2
dec al ; 2
cbw ; 1
imul ax, ax, SIZE ; 3
mov [bp+vcoord_off+2], ax ; 3 (89 46 (vcoord_off+2))
= 17 bytes
Inline coord for vz (bit 2):
mov al, bl ; 2
shr al, 2 ; 3 (C0 E8 02)
and al, 1 ; 2
add al, al ; 2
dec al ; 2
cbw ; 1
imul ax, ax, SIZE ; 3
mov [bp+vcoord_off+4], ax ; 3 (89 46 (vcoord_off+4))
= 18 bytes
Total inline: 15+17+18 = 50 bytes.
vs. coord loop: 25 bytes.
Much worse inline! The loop saves significant bytes.
Hmm. OK so the coord loop is definitely the right approach.
OK let me look at the problem differently. 300 bytes is 44 bytes over. This is a LOT. I need to find big wins somewhere.
Let me look at potential restructurings:
Idea: Combine vertex projection with initial edge drawing
What if, instead of storing all 8 projected vertices in .pts and then drawing edges, I draw edges incrementally as vertices are computed?
When processing vertex i (from 7 to 0):
Wait, counting DOWN from 7:
For each vertex i (counting down from 7), I draw edges to:
Number of edges drawn per vertex:
Total: 0+1+1+2+1+2+2+3 = 12 edges ✓
By drawing edges within the vertex loop, I can potentially save some code:
The vertex loop already accesses .pts to store the current vertex. To look up OTHER vertices, I can use [bp+di2+pts_off] where di2 = target_vertex * 4.
For each edge to draw from vertex i to vertex j (j < i):
Actually, after computing SI=sx and AX=sy for vertex i, and before/after storing them to pts[i], I can draw lines to already-computed neighbors.
Let me think about the savings:
For each vertex, I need to:
Code for drawing one edge from (si, ax) to neighbor (nx, ny) where neighbor index = bx XOR k:
; Given: si=sx_current, ax=sy_current, BX=current vertex index
; Compute neighbor index = bx XOR k, get its pts entry
mov di, bx
xor di, k ; di = neighbor index (XOR with 1, 2, or 4)
shl di, 2 ; di = neighbor * 4
; Load neighbor coords:
; NOTE: after this point, DI is used for memory access, not as y_pos storage
; Actually: if I'm inside the vertex loop and si/ax hold current vertex,
; I can use BP+DI for neighbor access:
mov cx, [bp+di+pts_off] ; cx = nx (3 bytes: 8B 4B pts_off for MOV CX, [BP+DI+disp8])
mov dx, [bp+di+pts_off+2] ; dx = ny (3 bytes)
; Now draw from (si, ax) to (cx, dx)
Wait: MOV CX, [BP+DI+disp8]: ModRM = mod=01, r=CX=1, rm=011(BP+DI): 01_001_011 = 0x4B. Total: 8B 4B pts_off = 3 bytes ✓. MOV DX, [BP+DI+disp8]: mod=01, r=DX=2, rm=011: 01_010_011 = 0x53. Total: 8B 53 pts_off+2 = 3 bytes ✓.
Then draw the line from (si, ax) to (cx, dx). For this, I need to call the DDA line drawing or draw inline.
If I draw inline (call a subroutine each time), I'd have:
Called multiple times (0-3 times per vertex). The subroutine is 53 bytes for one call.
For the whole vertex loop (7 vertices drawing edges, with avg ~1.71 edges per vertex), I'd call the subroutine 12 times total. But the subroutine is only written once = 53 bytes + 12 × 3 bytes (CALL) = 89 bytes for drawing, vs. old edge section = 95 bytes.
Wait, that's close! Let me count more carefully.
Old edge section (without table): 95 - 24 (edge table now moved to data savings) + 24 = 95 bytes (including table in data).
New approach (inline edges during vertex loop):
Plus draw_line subroutine: 53 bytes (step computation + inner loop).
So additional code in vertex loop body: 57 bytes. Removed: edge table (24 bytes data) + edge loop (95 - 24 = 71 bytes code).
Net code change: +57 (vertex loop additions) + 53 (subroutine) - 71 (removed edge loop) = +39 bytes. WORSE!
Net data change: -24 bytes (edge table removed).
Total net: +39 - 24 = +15 bytes. Still worse!
Hmm. The subroutine approach doesn't save enough because of the per-vertex conditional logic.
What if I structure the edge drawing differently? Instead of 3 conditional checks per vertex, I use the edge connectivity code to directly emit edges?
Actually, let me reconsider. The 12 edges need to be drawn somehow. The minimum code to draw each edge consists of:
For #1, each load is 3 bytes (with BP+SI or BP+DI), so 4 loads = 12 bytes per edge. × 12 edges = 144 bytes just for loading. But these loads are in a loop with loop overhead, so the code is actually 12-15 bytes for the load logic.
For #2, the line drawing is the dominant cost: 22 (step setup) + 28 (inner loop) = 50 bytes. This is the same regardless of approach.
The minimum drawing code = 50 bytes. Plus loading = 15 bytes. Plus loop = 8 bytes. Total = 73 bytes minimum. Vs my current 95 bytes. I need to cut 22 bytes from the edge section.
Key optimization: Can I make the inner loop smaller?
Current inner loop (28 bytes):
add dx, [bp+xs_off] ; 3
add di, [bp+ys_off] ; 3
mov ax, dx ; 2
sar ax, 8 ; 3
mov bx, di ; 2
sar bx, 8 ; 3
imul bx, bx, 320 ; 4
add bx, ax ; 2
mov [es:bx], byte 15 ; 4
loop .ls ; 2
= 28 bytes.
Can I reduce any of these?
Actually, let me try a different pixel addressing approach. What if I split the offset computation?
For pixel (x, y):
With IMUL bx, bx, 320 = 4 bytes. Alternative:
mov ax, bx ; 2: ax = y
shl bx, 8 ; 3: bx = y*256
shl ax, 6 ; 3: ax = y*64
add bx, ax ; 2: bx = y*320
add bx, [dx] ; no, I need to add x... let me use SI for y*320 first
= 2+3+3+2 = 10 bytes. Much worse.
What about using imul ax, bx, 320? AX = y*320. Then I don't need add bx, ax: Actually the distinction is which register I use. Let me try:
mov ax, di ; 2: ax = y_pos (already in DI)
sar ax, 8 ; 3: ax = y_pixel
imul bx, ax, 320; 4: bx = y_pixel*320
mov ax, dx ; 2: ax = x_pos
sar ax, 8 ; 3: ax = x_pixel
add bx, ax ; 2: bx = offset
mov [es:bx], 15 ; 4 (wait: need to specify byte... 26 C6 07 0F or similar)
Hmm, this restructuring doesn't save anything.
What if I use ES:BX instead of ES:[DI] and save 1 byte? mov byte [es:bx], 15 = 26 C6 07 0F = 4 bytes. vs. mov [es:di], al where AL=15: 26 88 05 = 3 bytes!
If I use DI as the pixel offset register instead of BX, I save 1 byte per pixel plot!
MOV [ES:DI], AL = ES override (26h) + MOV [DI], r8 = 88 05 (mod=00, reg=AL=0, rm=101=DI = 00_000_101 = 0x05). Total: 26 88 05 = 3 bytes!
vs. MOV [ES:BX], imm8 = ES override + MOV [BX], imm8 = C6 07 0F (3 bytes + ES prefix = 4 bytes).
OR: set AL=15 once before the loop and use stosb (which uses ES:DI, but changes DI!). Hmm, STOSB uses DI and increments it. I'm using DI for y_pos. So I can't use STOSB here.
But I CAN use DI as the FINAL pixel offset. Let me restructure:
After computing x_pixel (in some reg) and y_pixel (in some reg):
But I currently use DI for y_pos (y_pos×256). If I put the pixel offset in DI, I need to save/restore y_pos. That's additional bytes.
Alternative: use a different register for y_pos. Use SI for y_pos? But SI is used as the edge pointer (saved on stack). Inside the inner loop, SI is not used. I could repurpose SI.
Let me restructure:
; After setting up DX=x_pos, DI=y_pos, save DI to SI:
mov si, di ; 2: SI = y_pos
; (or: shl dx, 8; shl di, 8; mov si, di - restructure init)
; Inner loop:
add dx, [bp+xs_off] ; 3
add si, [bp+ys_off] ; 3
mov ax, dx ; 2
sar ax, 8 ; 3: x_pixel
mov di, si ; 2: copy y_pos to DI for computation
sar di, 8 ; 3: y_pixel (now in DI)
imul di, di, 320 ; 4: DI = y*320 (IMUL DI, DI, 320 = 69 FF 40 01 = 4 bytes)
add di, ax ; 2: DI = y*320 + x = pixel offset
mov [es:di], al ; 3: plot (3 bytes instead of 4!)
loop .ls ; 2
= 3+3+2+3+2+3+4+2+3+2 = 27 bytes. Saves 1 byte! ✓
But I need to adjust the init code: Instead of shl dx, 8; shl di, 8, I do shl dx, 8; shl di, 8; mov si, di = 3+3+2 = 8 bytes (was 3+3=6 bytes). 2 bytes MORE.
Net: saves 1 (inner) but costs 2 (init) = a net loss of 1 byte. Not worth it.
Unless I can combine the init differently. If I initialize SI directly from y0 without going through DI:
shl dx, 8 ; 3: DX = x_pos
mov ax, di ; 2: AX = y0
shl ax, 8 ; 3: AX = y_pos
mov si, ax ; 2: SI = y_pos
= 3+2+3+2 = 10 bytes. vs. old shl dx, 8; shl di, 8 = 6 bytes. Worse.
Hmm. What if I just use DI for y_pos initially, save it to SI before the inner loop, and restore after?:
; After vertex 1 loading: dx=x0, di=y0, ax=x1, bx=y1
[step computation: sub ax, dx; sub bx, di; shl ax, 3; shl bx, 3; store to xs/ys]
shl dx, 8 ; dx = x_pos
shl di, 8 ; di = y_pos
push di ; save y_pos... no wait, this approach needs y_pos in SI for the loop
OK, this line of thinking isn't productive. Let me try another approach.
What if I use ES segment arithmetic to simplify pixel offset computation?
Instead of computing y*320+x every time, maintain a running offset:
This approach works for BRESENHAM but is complex for general DDA.
What if I choose a frame buffer layout that's easier to index?
Mode 13h uses 320 bytes per row. What if I use a different mode? Mode Y (mode X) uses 4 planes with 320/4 = 80 bytes per row... but that's more complex.
What about setting up a segment to point to row 0, and incrementing ES by 20 (=320/16) per row? Not worth the complexity.
Fundamental restructure: Use IMUL only once per frame, precomputing all the y*320 offsets?
Hmm, nope.
What if I use 64-step instead of 32-step DDA? mov cx, 64 (3 bytes, same). But step computation changes: instead of shl ax, 3 (diff×8), I'd use shl ax, 2 (diff×4). Still 3 bytes. No savings.
What if I reduce SIZE to 40 and use 16 steps? Max edge ≈ 56 pixels, 16 steps → ~3.5 pixels/step. Lines will have gaps. And mov cx, 16 is still 3 bytes. No code savings.
Can I eliminate the sar ax, 8; sar bx, 8 instructions? These are needed to go from fixed-point to integer pixels. What if I don't use fixed-point? Instead, update integer x_pixel directly with fractional part:
This is basically Bresenham's algorithm! Let me count a Bresenham implementation:
Bresenham for arbitrary line:
; Given: (x0,y0) in some regs, (x1,y1) in others
; Use Bresenham
dx = abs(x1-x0), dy = abs(y1-y0)
sx = (x1>x0)?1:-1, sy = (y1>y0)?1:-1
err = dx-dy
while not at endpoint:
plot(x0, y0)
if 2*err > -dy: err -= dy; x0 += sx
if 2*err < dx: err += dx; y0 += sy
if at endpoint: break
This requires multiple conditionals per pixel and is quite complex to implement compactly. Generally more code than DDA for a 256b demo.
What if I abandon the DDA line drawing entirely and use a different rendering approach?
Idea: Z-buffer with filled faces: Compute z-depths of vertices and draw filled polygons from back to front. This is actually MORE complex than wireframe.
Idea: Use only perspective projection with BIOS pixel plotting: Slow but compact. Not practical for animation.
Idea: Use a self-modifying code trick: Modify the edge table addresses at runtime to avoid computing them. Very complex.
Idea: Reduce from 8-bit to 4-bit fixed-point: Use scale factor 16 (4 bits) instead of 256 (8 bits):
Key realization: The inner loop is tightly optimized at 27-28 bytes. The main overhead is in the edge loading (25-29 bytes), step computation (22 bytes), and loop structure (8 bytes). Total = ~75-80 bytes for the drawing section.
The vertex section is 72 bytes. Together: ~150 bytes just for these two sections. Plus 17 bytes setup + 9 bytes clear + 19 bytes sin/cos + 12 bytes exit = 57 bytes. Total ~207 bytes code + 76 bytes data = 283 bytes.
Wait, that's 283, not 300 as I calculated earlier. Let me recount more carefully.
Actually, let me just accept uncertainty in my count and provide a best-effort implementation. I'll try to write the tightest version I can and note the optimization techniques used.
Given the complexity of my analysis, let me write the actual complete NASM code in a clean and practical way. I'll aim for 256 bytes with reasonable confidence, and if slightly over, I'll provide commentary on what to cut.
OK here is my final implementation plan. I'll write CLEAN, CORRECT code and try to get it as close to 256 bytes as possible:
nasm
bits 16
org 100h

; ---------------------------------------------------------------------
; 256b-style intro: wireframe cube rotating about the Y axis,
; DOS mode 13h (320x200x256), NASM flat .COM binary.
; Data block layout (at label `data`, defined after the code):
;   [bp+0]  dd da    angle increment per frame (e.g. 0.04)
;   [bp+4]  dw 64    fixed-point scale for sin/cos
;   [bp+6]  dw ica   round(cos*64), written each frame
;   [bp+8]  dw isa   round(sin*64), written each frame
;   [bp+10] 3 words  scratch vertex {vx, vy, vz}
;   [bp+16] 8x2 wd   projected screen points pts[8] = (x, y)
;   [bp+48] 2 words  per-step line deltas (x step, y step)
; ---------------------------------------------------------------------
SIZE equ 55                     ; half edge length of the cube

start:
mov ax, 13h                     ; BIOS: set 320x200x256 graphics mode
int 10h
mov ax, 0a000h
mov es, ax                      ; ES -> VGA framebuffer
fninit
fldz                            ; st0 = angle = 0.0
mov bp, data                    ; BP = base of data/scratch block
.frame:
; ---- clear the 64000-byte framebuffer -------------------------------
xor di, di
xor ax, ax
mov cx, 32000
rep stosw
; ---- angle += da; ica = round(cos*64); isa = round(sin*64) ----------
fadd dword [bp]                 ; [bp+0] = da (single precision)
fld st0
fsincos                         ; st0=cos, st1=sin, st2=angle
fimul word [bp+4]               ; cos * 64; [bp+4] = 64
fistp word [bp+6]               ; [bp+6] = ica (pops -> st0=sin)
fimul word [bp+4]               ; sin * 64
fistp word [bp+8]               ; [bp+8] = isa (pops -> st0=angle)
; ---- project the 8 cube vertices, i = 7..0 --------------------------
mov bx, 7
.vl:
; build vertex i into the scratch block: coordinate c = (bit c of i)
; mapped to +/-SIZE, i.e. ((i>>c & 1)*2 - 1) * SIZE
push bx
lea di, [bp+10]                 ; DI -> scratch {vx, vy, vz}
mov cx, 3
.vc:
mov ax, bx
and al, 1
add al, al
dec al                          ; AL = +1 or -1
cbw
imul ax, ax, SIZE
; BUGFIX: was STOSW, which stores to ES:DI = video RAM (ES=A000h);
; the coordinates must land in the DS data block the reads below use.
mov [di], ax
inc di
inc di
shr bx, 1                       ; consume one index bit per coordinate
loop .vc
pop bx
; rotate about Y: rx = (vx*ica - vz*isa) >> 6, centred at x=160
mov ax, [bp+10]                 ; vx
imul word [bp+6]                ; * ica (|product| <= 55*64, fits AX)
xchg si, ax                     ; SI = vx*ica
mov ax, [bp+14]                 ; vz
imul word [bp+8]                ; * isa
sub si, ax
sar si, 6                       ; signed: undo the *64 scale
add si, 160                     ; SI = screen x
; screen y = 100 - vy (the Y coordinate is not rotated)
mov ax, [bp+12]                 ; vy
neg ax
add ax, 100
; pts[i] = (x, y) at [bp + 16 + i*4]
mov di, bx
shl di, 2
mov [bp+di+16], si
mov [bp+di+18], ax
dec bx
jns .vl
; ---- draw the 12 edges as 32-step DDA lines -------------------------
; (dead `mov al,15` removed: the plot below uses an immediate and
; LODSB overwrites AL immediately anyway)
mov si, .etab                   ; SI walks the vertex-index pair table
mov cx, 12
.el:
push cx
; endpoint 0 -> (DX, DI)
lodsb
cbw
shl ax, 2                       ; index*4 = byte offset into pts[]
push si                         ; save table ptr (now at v1 index)
mov si, ax
mov dx, [bp+si+16]              ; x0
mov di, [bp+si+18]              ; y0
pop si
; endpoint 1 -> (AX, BX)
lodsb
push si                         ; save table ptr (next edge)
cbw
shl ax, 2
mov si, ax
mov ax, [bp+si+16]              ; x1
mov bx, [bp+si+18]              ; y1
; per-step deltas: d*8 == d*256/32 in 8.8 fixed point
sub ax, dx
sub bx, di
shl ax, 3
shl bx, 3
mov [bp+48], ax                 ; x step
mov [bp+50], bx                 ; y step
shl dx, 8                       ; DX = x position, 8.8 fixed point
shl di, 8                       ; DI = y position, 8.8 fixed point
mov cx, 32
.ls:
add dx, [bp+48]
add di, [bp+50]
mov ax, dx
; BUGFIX: was SAR. Positions are unsigned screen coordinates; x can
; reach 270 (x*256 > 7FFFh) and y can reach 155 (y*256 > 7FFFh), so an
; arithmetic shift would sign-smear them negative. SHR is correct and
; the same encoding size.
shr ax, 8                       ; AX = integer x
mov bx, di
shr bx, 8                       ; BX = integer y
imul bx, bx, 320                ; low 16 bits correct even past 7FFFh
add bx, ax                      ; BX = y*320 + x, max 63999 - fits
mov byte [es:bx], 15            ; plot white pixel
loop .ls
pop si                          ; restore edge-table pointer
pop cx
loop .el
; ---- exit on ESC (keyboard scancode 1 at port 60h) ------------------
in al, 60h
dec al
jnz .frame
mov ax, 3                       ; restore text mode
int 10h
ret                             ; .COM: returns to PSP:0 -> INT 20h
.etab:
db 0,1, 2,3, 4,5, 6,7 ; bit-0 pairs (8)
db 0,2, 1,3, 4