base:fastest_multiplication
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
base:fastest_multiplication [2017-04-17 12:10] – repose | base:fastest_multiplication [2024-02-13 08:24] (current) – accurate timing if Jack Asser code repose | ||
---|---|---|---|
Line 1: | Line 1: | ||
- | Requires tables or a generator routine such as http:// | + | ====== Fastest 16x16 unsigned multiplication ====== |
+ | By Repose | ||
+ | |||
+ | Requires tables or a generator routine such as [[table_generator_routine_for_fast_8_bit_mul_table]] | ||
+ | |||
+ | Jack Asser' | ||
+ | Chris Jam's: 204.5* | ||
+ | Mine: 198.6 zp variation: 194.6 \\ | ||
+ | Times above need to add 6 for rts \\ | ||
+ | Note: updated 2023; corrected code, typos and timings \\ | ||
+ | *Timing approximate for external code \\ | ||
+ | This is the winner out of 120 published algorithms as independently tested here: | ||
+ | https:// | ||
+ | Note: I have a new version which is 188.1 cycles, excepting the RTS. \\ | ||
+ | |||
+ | < | ||
; | ; | ||
;you can go faster, but not without more code and/or data | ;you can go faster, but not without more code and/or data | ||
;and being less elegant and harder to follow. | ;and being less elegant and harder to follow. | ||
;by Repose 2017 | ;by Repose 2017 | ||
+ | ;table generator by Graham | ||
+ | ;addition improvement suggested by JackAsser | ||
+ | |||
+ | ;data: 2044 bytes | ||
+ | ;zero page ram required: minimum 8 bytes, ideally 14 | ||
+ | ;do_add: 30 bytes in zp, if used | ||
+ | ;time: 198.6 cycles, option for 194.6 if you use 30 more zp bytes for do_add | ||
+ | ; | ||
+ | |||
+ | ;How to use: | ||
+ | ;put numbers in x/y and result is Y reg (z3), A reg (z2), z1, z0 | ||
;tables of squares | ;tables of squares | ||
; | ; | ||
; | ; | ||
- | sqrlo=$c000;511 bytes | + | sqrlo=$c100;511 bytes |
- | sqrhi=$c200;511 bytes | + | sqrhi=$c300;511 bytes |
- | negsqrlo=$c400;511 bytes | + | negsqrlo=$c500;511 bytes |
- | negsqrhi=$c600;511 bytes | + | negsqrhi=$c700;511 bytes |
;pointers to square tables above | ;pointers to square tables above | ||
Line 25: | Line 51: | ||
y0=$fd; | y0=$fd; | ||
y1=$fe | y1=$fe | ||
- | z0=$80; | + | z0=$80; |
z1=$81 | z1=$81 | ||
- | z2=$82 | + | z2=$82 |
- | z3=$83 | + | z3=$83 |
- | ;not shown is a routine to make the tables | + | ;Example showing use |
- | ;also you need to init the pointers' | + | *=$c000 |
+ | lda #$ff | ||
+ | sta x0 | ||
+ | sta x1 | ||
+ | sta y0 | ||
+ | sta y1 | ||
+ | jsr makesqrtables | ||
+ | jsr umult16 | ||
+ | sta z2 | ||
+ | sty z3 | ||
+ | rts | ||
+ | ;result should be $fffe0001, e.g. as viewed with a typical m 0080 monitor command: | ||
+ | ;0080 01 00 fe ff | ||
+ | |||
+ | makesqrtables: | ||
+ | ;init zp square | ||
+ | lda #> | ||
+ | sta p_sqr_lo+1 | ||
+ | lda #> | ||
+ | sta p_sqr_hi+1 | ||
+ | lda #> | ||
+ | sta p_invsqr_lo+1 | ||
+ | lda #> | ||
+ | sta p_invsqr_hi+1 | ||
+ | |||
+ | ;generate sqr(x)=x^2/ | ||
+ | ldx #$00 | ||
+ | txa | ||
+ | !by $c9 ; CMP #immediate - skip TYA and clear carry flag | ||
+ | lb1: tya | ||
+ | adc #$00 | ||
+ | ml1: sta sqrhi,x | ||
+ | tay | ||
+ | cmp #$40 | ||
+ | txa | ||
+ | ror | ||
+ | ml9: adc #$00 | ||
+ | sta ml9+1 | ||
+ | inx | ||
+ | ml0: sta sqrlo,x | ||
+ | bne lb1 | ||
+ | inc ml0+2 | ||
+ | inc ml1+2 | ||
+ | clc | ||
+ | iny | ||
+ | bne lb1 | ||
+ | |||
+ | ;generate negsqr(x)=(255-x)^2/ | ||
+ | ldx #$00 | ||
+ | ldy #$ff | ||
+ | mt1: | ||
+ | lda sqrhi+1,x | ||
+ | sta negsqrhi+$100, | ||
+ | lda sqrhi,x | ||
+ | sta negsqrhi, | ||
+ | lda sqrlo+1,x | ||
+ | sta negsqrlo+$100, | ||
+ | lda sqrlo,x | ||
+ | sta negsqrlo, | ||
+ | dey | ||
+ | inx | ||
+ | bne mt1 | ||
+ | rts | ||
umult16: | umult16: | ||
Line 42: | Line 130: | ||
sta p_invsqr_hi; | sta p_invsqr_hi; | ||
- | ldy y0 | ||
sec | sec | ||
+ | ldy y0 | ||
lda (p_sqr_lo), | lda (p_sqr_lo), | ||
- | sbc (p_invsqr_lo), | + | sbc (p_invsqr_lo), |
sta z0;x0*y0l | sta z0;x0*y0l | ||
lda (p_sqr_hi), | lda (p_sqr_hi), | ||
sbc (p_invsqr_hi), | sbc (p_invsqr_hi), | ||
- | sta c1a+1; | + | sta c1a+1; |
;c1a means column 1, row a (partial product to be added later) | ;c1a means column 1, row a (partial product to be added later) | ||
ldy y1 | ldy y1 | ||
- | ;sec ;notice that the high byte of sub above is always | + | ;sec ;notice that the high byte of subtraction |
lda (p_sqr_lo), | lda (p_sqr_lo), | ||
sbc (p_invsqr_lo), | sbc (p_invsqr_lo), | ||
Line 59: | Line 147: | ||
lda (p_sqr_hi), | lda (p_sqr_hi), | ||
sbc (p_invsqr_hi), | sbc (p_invsqr_hi), | ||
- | sta c2a+1; | + | sta c2a+1; |
;set multiplier as x1 | ;set multiplier as x1 | ||
Line 76: | Line 164: | ||
lda (p_sqr_hi), | lda (p_sqr_hi), | ||
sbc (p_invsqr_hi), | sbc (p_invsqr_hi), | ||
- | sta c2b+1;x1*y1h;31 | + | sta c2b+1;x1*y0h;32.992 |
ldy y1 | ldy y1 | ||
Line 85: | Line 173: | ||
lda (p_sqr_hi), | lda (p_sqr_hi), | ||
sbc (p_invsqr_hi), | sbc (p_invsqr_hi), | ||
- | sta z3;x1*y1h;31 | + | tay;Y=x1*y1h, 30.992 cycles |
+ | ;17+34+33+17+33+31=164.97 cycles for main multiply part (minimum=157, | ||
- | ;4*31+2*17 so far=158 | + | ;jmp do_adds; |
- | ;add partials | + | |
- | ;-add first two numbers in column 1 | + | |
- | ;jmp do_adds;put in zp to save 3 cycles :) | + | |
do_adds: | do_adds: | ||
- | clc | + | ;-add the first two numbers of column 1 |
- | c1a lda #0 | + | clc |
- | c1b adc #0;add first two rows of column 1 | + | c1a: lda #0 |
- | sta z1;9 | + | c1b: adc #0 |
- | ;-continue to first two numbers | + | sta z1;9 |
- | c2a lda #0 | + | |
- | c2b adc #0 | + | ;-continue to first two numbers |
- | sta z2;7 | + | c2a: lda #0 |
- | bcc c1c;3 taken/9 not taken, avg 6 | + | c2b: adc #0 |
- | inc z3 | + | tax |
- | clc | + | bcc c1c;9 |
- | ;-add last number of column 1 (row c) | + | iny;z3++ |
- | c1c lda #0 | + | clc;(+3) taken 7% of the time, 3*.07=+.21 |
- | adc z1 | + | |
- | sta z1;8 | + | ;-add last number of column 1 |
+ | c1c: lda #0 | ||
+ | adc z1 | ||
+ | sta z1;8 | ||
;-add last number of column 2 | ;-add last number of column 2 | ||
- | c2c lda #0 | + | txa |
- | adc z2 | + | c2c: adc #0 |
- | sta z2;8 | + | ;A=z2 |
- | bcc fin;3/7 avg 5 | + | bcc fin;7 |
- | inc z3 | + | iny;(+1) taken 42% of the time, 1*.42=.42 |
- | ;9+7+6+8+8+5=43 | + | |
- | fin rts | + | |
+ | ;Y=z3, A=z2 | ||
+ | ;add partials part total cycles=33.63 (minimum=33, | ||
+ | ;total time=164.97+33.63=198.6 | ||
+ | fin: | ||
+ | Diagram of the additions | ||
+ | y1 y0 | ||
+ | | ||
+ | -------- | ||
+ | x0y0h x0y0l | ||
+ | + x0y1h x0y1l | ||
+ | + x1y0h x1y0l | ||
+ | +x1y1h x1y1l | ||
+ | ------------------------ | ||
+ | z3 z2 z1 z0 | ||
+ | </ |
base/fastest_multiplication.1492423821.txt.gz · Last modified: 2017-04-17 12:10 by repose