base:fastest_multiplication_2023
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
base:fastest_multiplication_2023 [2023-12-11 12:22] – repose | base:fastest_multiplication_2023 [2024-02-13 08:11] (current) – missing comments in first multiplication repose | ||
---|---|---|---|
Line 1: | Line 1: | ||
- | -If you see this message, the Wiki is being updated. Please check back later! Thanks- | ||
- | |||
The research continues in 2023 and the fastest multiply got faster! | The research continues in 2023 and the fastest multiply got faster! | ||
- | Thanks to a 6502 simulator written in C and the analyzing of statistics in the program branches and boundary crossings, the exact speed of routines are now known. This analysis has inspired new optimizations! The new routine executes in a blazing | + | Thanks to a 6502 simulator written in C and the analyzing of statistics in the program branches and boundary crossings, the exact speed of routines are now known. This analysis has inspired new optimizations! The new routine executes in a blazing |
- | Code to come, but for now see the end of this post: | + | Code in ACME assembly format. Please adjust addresses |
- | https://csdb.dk/forums/?roomid=11& | + | < |
+ | ; World' | ||
+ | ; | ||
+ | ; 16 bit x 16 bit unsigned multiply, 32 bit result | ||
+ | ; Average cycles: 193.07 (including RTS) | ||
+ | ; 2170 bytes | ||
+ | |||
+ | |||
+ | ; How to use: | ||
+ | ; call jsr init, before first use | ||
+ | ; put numbers in (x0,x1) and (y0,y1) and result is (z3, A, Y, z0) | ||
+ | |||
+ | ; pointers to square tables | ||
+ | p_sqr_lo | ||
+ | p_sqr_hi | ||
+ | p_neg_sqr_lo = $8f ; 2 bytes | ||
+ | p_neg_sqr_hi = $91 ; 2 bytes | ||
+ | |||
+ | ; the inputs and outputs | ||
+ | x0 = $02 ; multiplier, 2 bytes | ||
+ | x1 = $03 | ||
+ | y0 = $04 ; multiplicand, | ||
+ | y1 = $05 | ||
+ | z0 = $06 ; product, 2 bytes + 2 registers | ||
+ | ; z1 = $07 returned in Y reg | ||
+ | ; z2 = $08 returned in A reg | ||
+ | z3 = $09 | ||
+ | |||
+ | * = $C000 | ||
+ | |||
+ | ; Align tables to start of page | ||
+ | ; Note - the last byte of each table is never referenced, as a+b< | ||
+ | sqrlo | ||
+ | !for i, 0, 511 { | ||
+ | !byte <((i*i)/4) | ||
+ | } | ||
+ | sqrhi | ||
+ | !for i, 0, 511 { | ||
+ | !byte >((i*i)/4) | ||
+ | } | ||
+ | |||
+ | negsqrlo | ||
+ | !for i, 0, 511 { | ||
+ | !byte < | ||
+ | } | ||
+ | |||
+ | negsqrhi | ||
+ | !for i, 0, 511 { | ||
+ | !byte > | ||
+ | } | ||
+ | |||
+ | ; Diagram of the additions | ||
+ | ; | ||
+ | ; x x1 x0 | ||
+ | ; | ||
+ | ; x0y0h x0y0l | ||
+ | ; + x0y1h x0y1l | ||
+ | ; + x1y0h x1y0l | ||
+ | ; +x1y1h x1y1l | ||
+ | ; ------------------------ | ||
+ | ; | ||
+ | |||
+ | umult16 | ||
+ | ; set multiplier as x1 | ||
+ | lda x1 | ||
+ | sta p_sqr_lo | ||
+ | sta p_sqr_hi | ||
+ | eor #$ff | ||
+ | sta p_neg_sqr_lo | ||
+ | sta p_neg_sqr_hi | ||
+ | |||
+ | ; set multiplicand as y0 | ||
+ | ldy y0 | ||
+ | |||
+ | ; | ||
+ | ; | ||
+ | sec | ||
+ | lda (p_sqr_lo), | ||
+ | sbc (p_neg_sqr_lo), | ||
+ | sta x1y0l+1 | ||
+ | lda (p_sqr_hi), y | ||
+ | sbc (p_neg_sqr_hi), | ||
+ | sta x1y0h+1 | ||
+ | |||
+ | ; set multiplicand as y1 | ||
+ | ldy y1 | ||
+ | |||
+ | ; x1y1l = low(x1*y1) | ||
+ | ; z3 = high(x1*y1) | ||
+ | lda (p_sqr_lo), | ||
+ | sbc (p_neg_sqr_lo), | ||
+ | sta x1y1l+1 | ||
+ | lda (p_sqr_hi), | ||
+ | sbc (p_neg_sqr_hi), | ||
+ | sta z3 | ||
+ | |||
+ | ; set multiplier as x0 | ||
+ | lda x0 | ||
+ | sta p_sqr_lo | ||
+ | sta p_sqr_hi | ||
+ | eor #$ff | ||
+ | sta p_neg_sqr_lo | ||
+ | sta p_neg_sqr_hi | ||
+ | |||
+ | ; x0y1l = low(x0*y1) | ||
+ | ; X = high(x0*y1) | ||
+ | lda (p_sqr_lo), | ||
+ | sbc (p_neg_sqr_lo), | ||
+ | sta x0y1l+1 | ||
+ | lda (p_sqr_hi), | ||
+ | sbc (p_neg_sqr_hi ),y | ||
+ | tax | ||
+ | |||
+ | ; set multiplicand as y0 | ||
+ | ldy y0 | ||
+ | |||
+ | ; z0 = low(x0*y0) | ||
+ | ; A = high(x0*y0) | ||
+ | lda (p_sqr_lo), | ||
+ | sbc (p_neg_sqr_lo), | ||
+ | sta z0 | ||
+ | lda (p_sqr_hi), | ||
+ | sbc (p_neg_sqr_hi), | ||
+ | |||
+ | clc | ||
+ | do_adds | ||
+ | ; add the first two numbers of column 1 | ||
+ | x0y1l | ||
+ | adc #0 ; x0y0h + x0y1l | ||
+ | tay | ||
+ | |||
+ | ; continue to first two numbers of column 2 | ||
+ | txa | ||
+ | x1y0h | ||
+ | adc #0 ; x0y1h + x1y0h | ||
+ | tax ; X=z2 so far | ||
+ | bcc + | ||
+ | inc z3 ; column 3 | ||
+ | clc | ||
+ | |||
+ | ; add last number of column 1 | ||
+ | + | ||
+ | tya | ||
+ | x1y0l | ||
+ | adc #0 ; + x1y0l | ||
+ | tay ; Y=z1 | ||
+ | |||
+ | ; add last number of column 2 | ||
+ | txa | ||
+ | x1y1l | ||
+ | adc #0 ; + x1y1l | ||
+ | bcc fin ; A=z2 | ||
+ | inc z3 ; column 3 | ||
+ | fin | ||
+ | rts | ||
+ | |||
+ | |||
+ | ; Once only initialization | ||
+ | ; this could set up the pointer values in a loop to save memory | ||
+ | ; it could also generate the square tables in code rather than load them | ||
+ | init | ||
+ | lda #> | ||
+ | sta p_sqr_lo+1 | ||
+ | |||
+ | lda #> | ||
+ | sta p_sqr_hi+1 | ||
+ | |||
+ | lda #> | ||
+ | sta p_neg_sqr_lo+1 | ||
+ | lda #> | ||
+ | sta p_neg_sqr_hi+1 | ||
+ | rts | ||
+ | </ |
base/fastest_multiplication_2023.1702293727.txt.gz · Last modified: 2023-12-11 12:22 by repose