@ double_v1_rom_shim.S (from YAHAL — Yet Another Hardware Abstraction Library)
/**
 * Copyright (c) 2020 Mark Owen https://www.quinapalus.com .
 *
 * Raspberry Pi (Trading) Ltd (Licensor) hereby grants to you a non-exclusive license to use the software solely on a
 * Raspberry Pi Pico device. No other use is permitted under the terms of this license.
 *
 * This software is also available from the copyright owner under GPLv2 licence.
 *
 * THIS SOFTWARE IS PROVIDED BY THE LICENSOR AND COPYRIGHT OWNER "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE LICENSOR OR COPYRIGHT OWNER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

18#include "asm_helper.S"
19
20.syntax unified
21.cpu cortex-m0plus
22.thumb
23
24.macro double_section name
25// todo separate flag for shims?
26#if PICO_DOUBLE_IN_RAM
27.section RAM_SECTION_NAME(\name), "ax"
28#else
29.section SECTION_NAME(\name), "ax"
30#endif
31.endm
32
33double_section double_table_shim_on_use_helper
34regular_func double_table_shim_on_use_helper
35 push {r0-r2, lr}
36 mov r0, ip
37#ifndef NDEBUG
38 // sanity check to make sure we weren't called by non (shimmable_) table_tail_call macro
39 cmp r0, #0
40 bne 1f
41 bkpt #0
42#endif
431:
44 ldrh r1, [r0]
45 lsrs r2, r1, #8
46 adds r0, #2
47 cmp r2, #0xdf
48 bne 1b
49 uxtb r1, r1 // r1 holds table offset
50 lsrs r2, r0, #2
51 bcc 1f
52 // unaligned
53 ldrh r2, [r0, #0]
54 ldrh r0, [r0, #2]
55 lsls r0, #16
56 orrs r0, r2
57 b 2f
581:
59 ldr r0, [r0]
602:
61 ldr r2, =sd_table
62 str r0, [r2, r1]
63 str r0, [sp, #12]
64 pop {r0-r2, pc}
65
66#if PICO_DOUBLE_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
67// Note that the V1 ROM has no double support, so this is basically the identical
68// library, and shim inter-function calls do not bother to redirect back thru the
69// wrapper functions
70
71.equ use_hw_div,1
72.equ IOPORT ,0xd0000000
73.equ DIV_UDIVIDEND,0x00000060
74.equ DIV_UDIVISOR ,0x00000064
75.equ DIV_QUOTIENT ,0x00000070
76.equ DIV_CSR ,0x00000078
77
78@ Notation:
79@ rx:ry means the concatenation of rx and ry with rx having the less significant bits
80
81.equ debug,0
82.macro mdump k
83.if debug
84 push {r0-r3}
85 push {r14}
86 push {r0-r3}
87 bl osp
88 movs r0,#\k
89 bl o1ch
90 pop {r0-r3}
91 bl dump
92 bl osp
93 bl osp
94 ldr r0,[r13]
95 bl o8hex @ r14
96 bl onl
97 pop {r0}
98 mov r14,r0
99 pop {r0-r3}
100.endif
101.endm
102
103
104@ IEEE double in ra:rb ->
105@ mantissa in ra:rb 12Q52 (53 significant bits) with implied 1 set
106@ exponent in re
107@ sign in rs
108@ trashes rt
109.macro mdunpack ra,rb,re,rs,rt
110 lsrs \re,\rb,#20 @ extract sign and exponent
111 subs \rs,\re,#1
112 lsls \rs,#20
113 subs \rb,\rs @ clear sign and exponent in mantissa; insert implied 1
114 lsrs \rs,\re,#11 @ sign
115 lsls \re,#21
116 lsrs \re,#21 @ exponent
117 beq l\@_1 @ zero exponent?
118 adds \rt,\re,#1
119 lsrs \rt,#11
120 beq l\@_2 @ exponent != 0x7ff? then done
121l\@_1:
122 movs \ra,#0
123 movs \rb,#1
124 lsls \rb,#20
125 subs \re,#128
126 lsls \re,#12
127l\@_2:
128.endm
129
130@ IEEE double in ra:rb ->
131@ signed mantissa in ra:rb 12Q52 (53 significant bits) with implied 1
132@ exponent in re
133@ trashes rt0 and rt1
134@ +zero, +denormal -> exponent=-0x80000
135@ -zero, -denormal -> exponent=-0x80000
136@ +Inf, +NaN -> exponent=+0x77f000
137@ -Inf, -NaN -> exponent=+0x77e000
138.macro mdunpacks ra,rb,re,rt0,rt1
139 lsrs \re,\rb,#20 @ extract sign and exponent
140 lsrs \rt1,\rb,#31 @ sign only
141 subs \rt0,\re,#1
142 lsls \rt0,#20
143 subs \rb,\rt0 @ clear sign and exponent in mantissa; insert implied 1
144 lsls \re,#21
145 bcc l\@_1 @ skip on positive
146 mvns \rb,\rb @ negate mantissa
147 negs \ra,\ra
148 bcc l\@_1
149 adds \rb,#1
150l\@_1:
151 lsrs \re,#21
152 beq l\@_2 @ zero exponent?
153 adds \rt0,\re,#1
154 lsrs \rt0,#11
155 beq l\@_3 @ exponent != 0x7ff? then done
156 subs \re,\rt1
157l\@_2:
158 movs \ra,#0
159 lsls \rt1,#1 @ +ve: 0 -ve: 2
160 adds \rb,\rt1,#1 @ +ve: 1 -ve: 3
161 lsls \rb,#30 @ create +/-1 mantissa
162 asrs \rb,#10
163 subs \re,#128
164 lsls \re,#12
165l\@_3:
166.endm
167
168double_section WRAPPER_FUNC_NAME(__aeabi_dsub)
169
170# frsub first because it is the only one that needs alignment
171regular_func drsub_shim
172 push {r0-r3}
173 pop {r0-r1}
174 pop {r2-r3}
175 // fall thru
176
177regular_func dsub_shim
178 push {r4-r7,r14}
179 movs r4,#1
180 lsls r4,#31
181 eors r3,r4 @ flip sign on second argument
182 b da_entry @ continue in dadd
183
184.align 2
185double_section dadd_shim
186regular_func dadd_shim
187 push {r4-r7,r14}
188da_entry:
189 mdunpacks r0,r1,r4,r6,r7
190 mdunpacks r2,r3,r5,r6,r7
191 subs r7,r5,r4 @ ye-xe
192 subs r6,r4,r5 @ xe-ye
193 bmi da_ygtx
194@ here xe>=ye: need to shift y down r6 places
195 mov r12,r4 @ save exponent
196 cmp r6,#32
197 bge da_xrgty @ xe rather greater than ye?
198 adds r7,#32
199 movs r4,r2
200 lsls r4,r4,r7 @ rounding bit + sticky bits
201da_xgty0:
202 movs r5,r3
203 lsls r5,r5,r7
204 lsrs r2,r6
205 asrs r3,r6
206 orrs r2,r5
207da_add:
208 adds r0,r2
209 adcs r1,r3
210da_pack:
211@ here unnormalised signed result (possibly 0) is in r0:r1 with exponent r12, rounding + sticky bits in r4
212@ Note that if a large normalisation shift is required then the arguments were close in magnitude and so we
213@ cannot have not gone via the xrgty/yrgtx paths. There will therefore always be enough high bits in r4
214@ to provide a correct continuation of the exact result.
215@ now pack result back up
216 lsrs r3,r1,#31 @ get sign bit
217 beq 1f @ skip on positive
218 mvns r1,r1 @ negate mantissa
219 mvns r0,r0
220 movs r2,#0
221 negs r4,r4
222 adcs r0,r2
223 adcs r1,r2
2241:
225 mov r2,r12 @ get exponent
226 lsrs r5,r1,#21
227 bne da_0 @ shift down required?
228 lsrs r5,r1,#20
229 bne da_1 @ normalised?
230 cmp r0,#0
231 beq da_5 @ could mantissa be zero?
232da_2:
233 adds r4,r4
234 adcs r0,r0
235 adcs r1,r1
236 subs r2,#1 @ adjust exponent
237 lsrs r5,r1,#20
238 beq da_2
239da_1:
240 lsls r4,#1 @ check rounding bit
241 bcc da_3
242da_4:
243 adds r0,#1 @ round up
244 bcc 2f
245 adds r1,#1
2462:
247 cmp r4,#0 @ sticky bits zero?
248 bne da_3
249 lsrs r0,#1 @ round to even
250 lsls r0,#1
251da_3:
252 subs r2,#1
253 bmi da_6
254 adds r4,r2,#2 @ check if exponent is overflowing
255 lsrs r4,#11
256 bne da_7
257 lsls r2,#20 @ pack exponent and sign
258 add r1,r2
259 lsls r3,#31
260 add r1,r3
261 pop {r4-r7,r15}
262
263da_7:
264@ here exponent overflow: return signed infinity
265 lsls r1,r3,#31
266 ldr r3,=0x7ff00000
267 orrs r1,r3
268 b 1f
269da_6:
270@ here exponent underflow: return signed zero
271 lsls r1,r3,#31
2721:
273 movs r0,#0
274 pop {r4-r7,r15}
275
276da_5:
277@ here mantissa could be zero
278 cmp r1,#0
279 bne da_2
280 cmp r4,#0
281 bne da_2
282@ inputs must have been of identical magnitude and opposite sign, so return +0
283 pop {r4-r7,r15}
284
285da_0:
286@ here a shift down by one place is required for normalisation
287 adds r2,#1 @ adjust exponent
288 lsls r6,r0,#31 @ save rounding bit
289 lsrs r0,#1
290 lsls r5,r1,#31
291 orrs r0,r5
292 lsrs r1,#1
293 cmp r6,#0
294 beq da_3
295 b da_4
296
297da_xrgty: @ xe>ye and shift>=32 places
298 cmp r6,#60
299 bge da_xmgty @ xe much greater than ye?
300 subs r6,#32
301 adds r7,#64
302
303 movs r4,r2
304 lsls r4,r4,r7 @ these would be shifted off the bottom of the sticky bits
305 beq 1f
306 movs r4,#1
3071:
308 lsrs r2,r2,r6
309 orrs r4,r2
310 movs r2,r3
311 lsls r3,r3,r7
312 orrs r4,r3
313 asrs r3,r2,#31 @ propagate sign bit
314 b da_xgty0
315
316da_ygtx:
317@ here ye>xe: need to shift x down r7 places
318 mov r12,r5 @ save exponent
319 cmp r7,#32
320 bge da_yrgtx @ ye rather greater than xe?
321 adds r6,#32
322 movs r4,r0
323 lsls r4,r4,r6 @ rounding bit + sticky bits
324da_ygtx0:
325 movs r5,r1
326 lsls r5,r5,r6
327 lsrs r0,r7
328 asrs r1,r7
329 orrs r0,r5
330 b da_add
331
332da_yrgtx:
333 cmp r7,#60
334 bge da_ymgtx @ ye much greater than xe?
335 subs r7,#32
336 adds r6,#64
337
338 movs r4,r0
339 lsls r4,r4,r6 @ these would be shifted off the bottom of the sticky bits
340 beq 1f
341 movs r4,#1
3421:
343 lsrs r0,r0,r7
344 orrs r4,r0
345 movs r0,r1
346 lsls r1,r1,r6
347 orrs r4,r1
348 asrs r1,r0,#31 @ propagate sign bit
349 b da_ygtx0
350
351da_ymgtx: @ result is just y
352 movs r0,r2
353 movs r1,r3
354da_xmgty: @ result is just x
355 movs r4,#0 @ clear sticky bits
356 b da_pack
357
358.ltorg
359
360@ equivalent of UMULL
361@ needs five temporary registers
362@ can have rt3==rx, in which case rx trashed
363@ can have rt4==ry, in which case ry trashed
364@ can have rzl==rx
365@ can have rzh==ry
366@ can have rzl,rzh==rt3,rt4
367.macro mul32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
368 @ t0 t1 t2 t3 t4
369 @ (x) (y)
370 uxth \rt0,\rx @ xl
371 uxth \rt1,\ry @ yl
372 muls \rt0,\rt1 @ xlyl=L
373 lsrs \rt2,\rx,#16 @ xh
374 muls \rt1,\rt2 @ xhyl=M0
375 lsrs \rt4,\ry,#16 @ yh
376 muls \rt2,\rt4 @ xhyh=H
377 uxth \rt3,\rx @ xl
378 muls \rt3,\rt4 @ xlyh=M1
379 adds \rt1,\rt3 @ M0+M1=M
380 bcc l\@_1 @ addition of the two cross terms can overflow, so add carry into H
381 movs \rt3,#1 @ 1
382 lsls \rt3,#16 @ 0x10000
383 adds \rt2,\rt3 @ H'
384l\@_1:
385 @ t0 t1 t2 t3 t4
386 @ (zl) (zh)
387 lsls \rzl,\rt1,#16 @ ML
388 lsrs \rzh,\rt1,#16 @ MH
389 adds \rzl,\rt0 @ ZL
390 adcs \rzh,\rt2 @ ZH
391.endm
392
393@ SUMULL: x signed, y unsigned
394@ in table below ¯ means signed variable
395@ needs five temporary registers
396@ can have rt3==rx, in which case rx trashed
397@ can have rt4==ry, in which case ry trashed
398@ can have rzl==rx
399@ can have rzh==ry
400@ can have rzl,rzh==rt3,rt4
401.macro muls32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
402 @ t0 t1 t2 t3 t4
403 @ ¯(x) (y)
404 uxth \rt0,\rx @ xl
405 uxth \rt1,\ry @ yl
406 muls \rt0,\rt1 @ xlyl=L
407 asrs \rt2,\rx,#16 @ ¯xh
408 muls \rt1,\rt2 @ ¯xhyl=M0
409 lsrs \rt4,\ry,#16 @ yh
410 muls \rt2,\rt4 @ ¯xhyh=H
411 uxth \rt3,\rx @ xl
412 muls \rt3,\rt4 @ xlyh=M1
413 asrs \rt4,\rt1,#31 @ M0sx (M1 sign extension is zero)
414 adds \rt1,\rt3 @ M0+M1=M
415 movs \rt3,#0 @ 0
416 adcs \rt4,\rt3 @ ¯Msx
417 lsls \rt4,#16 @ ¯Msx<<16
418 adds \rt2,\rt4 @ H'
419
420 @ t0 t1 t2 t3 t4
421 @ (zl) (zh)
422 lsls \rzl,\rt1,#16 @ M~
423 lsrs \rzh,\rt1,#16 @ M~
424 adds \rzl,\rt0 @ ZL
425 adcs \rzh,\rt2 @ ¯ZH
426.endm
427
428@ SSMULL: x signed, y signed
429@ in table below ¯ means signed variable
430@ needs five temporary registers
431@ can have rt3==rx, in which case rx trashed
432@ can have rt4==ry, in which case ry trashed
433@ can have rzl==rx
434@ can have rzh==ry
435@ can have rzl,rzh==rt3,rt4
436.macro muls32_s32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
437 @ t0 t1 t2 t3 t4
438 @ ¯(x) (y)
439 uxth \rt0,\rx @ xl
440 uxth \rt1,\ry @ yl
441 muls \rt0,\rt1 @ xlyl=L
442 asrs \rt2,\rx,#16 @ ¯xh
443 muls \rt1,\rt2 @ ¯xhyl=M0
444 asrs \rt4,\ry,#16 @ ¯yh
445 muls \rt2,\rt4 @ ¯xhyh=H
446 uxth \rt3,\rx @ xl
447 muls \rt3,\rt4 @ ¯xlyh=M1
448 adds \rt1,\rt3 @ ¯M0+M1=M
449 asrs \rt3,\rt1,#31 @ Msx
450 bvc l\@_1 @
451 mvns \rt3,\rt3 @ ¯Msx flip sign extension bits if overflow
452l\@_1:
453 lsls \rt3,#16 @ ¯Msx<<16
454 adds \rt2,\rt3 @ H'
455
456 @ t0 t1 t2 t3 t4
457 @ (zl) (zh)
458 lsls \rzl,\rt1,#16 @ M~
459 lsrs \rzh,\rt1,#16 @ M~
460 adds \rzl,\rt0 @ ZL
461 adcs \rzh,\rt2 @ ¯ZH
462.endm
463
464@ can have rt2==rx, in which case rx trashed
465@ can have rzl==rx
466@ can have rzh==rt1
467.macro square32_64 rx,rzl,rzh,rt0,rt1,rt2
468 @ t0 t1 t2 zl zh
469 uxth \rt0,\rx @ xl
470 muls \rt0,\rt0 @ xlxl=L
471 uxth \rt1,\rx @ xl
472 lsrs \rt2,\rx,#16 @ xh
473 muls \rt1,\rt2 @ xlxh=M
474 muls \rt2,\rt2 @ xhxh=H
475 lsls \rzl,\rt1,#17 @ ML
476 lsrs \rzh,\rt1,#15 @ MH
477 adds \rzl,\rt0 @ ZL
478 adcs \rzh,\rt2 @ ZH
479.endm
480
481double_section dmul_shim
482 regular_func dmul_shim
483 push {r4-r7,r14}
484 mdunpack r0,r1,r4,r6,r5
485 mov r12,r4
486 mdunpack r2,r3,r4,r7,r5
487 eors r7,r6 @ sign of result
488 add r4,r12 @ exponent of result
489 push {r0-r2,r4,r7}
490
491@ accumulate full product in r12:r5:r6:r7
492 mul32_32_64 r0,r2, r0,r5, r4,r6,r7,r0,r5 @ XL*YL
493 mov r12,r0 @ save LL bits
494
495 mul32_32_64 r1,r3, r6,r7, r0,r2,r4,r6,r7 @ XH*YH
496
497 pop {r0} @ XL
498 mul32_32_64 r0,r3, r0,r3, r1,r2,r4,r0,r3 @ XL*YH
499 adds r5,r0
500 adcs r6,r3
501 movs r0,#0
502 adcs r7,r0
503
504 pop {r1,r2} @ XH,YL
505 mul32_32_64 r1,r2, r1,r2, r0,r3,r4, r1,r2 @ XH*YL
506 adds r5,r1
507 adcs r6,r2
508 movs r0,#0
509 adcs r7,r0
510
511@ here r5:r6:r7 holds the product [1..4) in Q(104-32)=Q72, with extra LSBs in r12
512 pop {r3,r4} @ exponent in r3, sign in r4
513 lsls r1,r7,#11
514 lsrs r2,r6,#21
515 orrs r1,r2
516 lsls r0,r6,#11
517 lsrs r2,r5,#21
518 orrs r0,r2
519 lsls r5,#11 @ now r5:r0:r1 Q83=Q(51+32), extra LSBs in r12
520 lsrs r2,r1,#20
521 bne 1f @ skip if in range [2..4)
522 adds r5,r5 @ shift up so always [2..4) Q83, i.e. [1..2) Q84=Q(52+32)
523 adcs r0,r0
524 adcs r1,r1
525 subs r3,#1 @ correct exponent
5261:
527 ldr r6,=0x3ff
528 subs r3,r6 @ correct for exponent bias
529 lsls r6,#1 @ 0x7fe
530 cmp r3,r6
531 bhs dm_0 @ exponent over- or underflow
532 lsls r5,#1 @ rounding bit to carry
533 bcc 1f @ result is correctly rounded
534 adds r0,#1
535 movs r6,#0
536 adcs r1,r6 @ round up
537 mov r6,r12 @ remaining sticky bits
538 orrs r5,r6
539 bne 1f @ some sticky bits set?
540 lsrs r0,#1
541 lsls r0,#1 @ round to even
5421:
543 lsls r3,#20
544 adds r1,r3
545dm_2:
546 lsls r4,#31
547 add r1,r4
548 pop {r4-r7,r15}
549
550@ here for exponent over- or underflow
551dm_0:
552 bge dm_1 @ overflow?
553 adds r3,#1 @ would-be zero exponent?
554 bne 1f
555 adds r0,#1
556 bne 1f @ all-ones mantissa?
557 adds r1,#1
558 lsrs r7,r1,#21
559 beq 1f
560 lsrs r1,#1
561 b dm_2
5621:
563 lsls r1,r4,#31
564 movs r0,#0
565 pop {r4-r7,r15}
566
567@ here for exponent overflow
568dm_1:
569 adds r6,#1 @ 0x7ff
570 lsls r1,r6,#20
571 movs r0,#0
572 b dm_2
573
574.ltorg
575
576@ Approach to division y/x is as follows.
577@
578@ First generate u1, an approximation to 1/x to about 29 bits. Multiply this by the top
579@ 32 bits of y to generate a0, a first approximation to the result (good to 28 bits or so).
580@ Calculate the exact remainder r0=y-a0*x, which will be about 0. Calculate a correction
581@ d0=r0*u1, and then write a1=a0+d0. If near a rounding boundary, compute the exact
582@ remainder r1=y-a1*x (which can be done using r0 as a basis) to determine whether to
583@ round up or down.
584@
585@ The calculation of 1/x is as given in dreciptest.c. That code verifies exhaustively
586@ that | u1*x-1 | < 10*2^-32.
587@
588@ More precisely:
589@
590@ x0=(q16)x;
591@ x1=(q30)x;
592@ y0=(q31)y;
593@ u0=(q15~)"(0xffffffffU/(unsigned int)roundq(x/x_ulp))/powq(2,16)"(x0); // q15 approximation to 1/x; "~" denotes rounding rather than truncation
594@ v=(q30)(u0*x1-1);
595@ u1=(q30)u0-(q30~)(u0*v);
596@
597@ a0=(q30)(u1*y0);
598@ r0=(q82)y-a0*x;
599@ r0x=(q57)r0;
600@ d0=r0x*u1;
601@ a1=d0+a0;
602@
603@ Error analysis
604@
605@ Use Greek letters to represent the errors introduced by rounding and truncation.
606@
607@ r₀ = y - a₀x
608@ = y - [ u₁ ( y - α ) - β ] x where 0 ≤ α < 2^-31, 0 ≤ β < 2^-30
609@ = y ( 1 - u₁x ) + ( u₁α + β ) x
610@
611@ Hence
612@
613@ | r₀ / x | < 2 * 10*2^-32 + 2^-31 + 2^-30
614@ = 26*2^-32
615@
616@ r₁ = y - a₁x
617@ = y - a₀x - d₀x
618@ = r₀ - d₀x
619@ = r₀ - u₁ ( r₀ - γ ) x where 0 ≤ γ < 2^-57
620@ = r₀ ( 1 - u₁x ) + u₁γx
621@
622@ Hence
623@
624@ | r₁ / x | < 26*2^-32 * 10*2^-32 + 2^-57
625@ = (260+128)*2^-64
626@ < 2^-55
627@
628@ Empirically it seems to be nearly twice as good as this.
629@
630@ To determine correctly whether the exact remainder calculation can be skipped we need a result
631@ accurate to < 0.25ulp. In the case where x>y the quotient will be shifted up one place for normalisation
632@ and so 1ulp is 2^-53 and so the calculation above suffices.
633
634double_section ddiv_shim
635 regular_func ddiv_shim
636 push {r4-r7,r14}
637ddiv0: @ entry point from dtan
638 mdunpack r2,r3,r4,r7,r6 @ unpack divisor
639
640.if use_hw_div
641
642 movs r5,#IOPORT>>24
643 lsls r5,#24
644 movs r6,#0
645 mvns r6,r6
646 str r6,[r5,#DIV_UDIVIDEND]
647 lsrs r6,r3,#4 @ x0=(q16)x
648 str r6,[r5,#DIV_UDIVISOR]
649@ if there are not enough cycles from now to the read of the quotient for
650@ the divider to do its stuff we need a busy-wait here
651
652.endif
653
654@ unpack dividend by hand to save on register use
655 lsrs r6,r1,#31
656 adds r6,r7
657 mov r12,r6 @ result sign in r12b0; r12b1 trashed
658 lsls r1,#1
659 lsrs r7,r1,#21 @ exponent
660 beq 1f @ zero exponent?
661 adds r6,r7,#1
662 lsrs r6,#11
663 beq 2f @ exponent != 0x7ff? then done
6641:
665 movs r0,#0
666 movs r1,#0
667 subs r7,#64 @ less drastic fiddling of exponents to get 0/0, Inf/Inf correct
668 lsls r7,#12
6692:
670 subs r6,r7,r4
671 lsls r6,#2
672 add r12,r12,r6 @ (signed) exponent in r12[31..8]
673 subs r7,#1 @ implied 1
674 lsls r7,#21
675 subs r1,r7
676 lsrs r1,#1
677
678.if use_hw_div
679
680 ldr r6,[r5,#DIV_QUOTIENT]
681 adds r6,#1
682 lsrs r6,#1
683
684.else
685
686@ this is not beautiful; could be replaced by better code that uses knowledge of divisor range
687 push {r0-r3}
688 movs r0,#0
689 mvns r0,r0
690 lsrs r1,r3,#4 @ x0=(q16)x
691 bl __aeabi_uidiv @ !!! this could (but apparently does not) trash R12
692 adds r6,r0,#1
693 lsrs r6,#1
694 pop {r0-r3}
695
696.endif
697
698@ here
699@ r0:r1 y mantissa
700@ r2:r3 x mantissa
701@ r6 u0, first approximation to 1/x Q15
702@ r12: result sign, exponent
703
704 lsls r4,r3,#10
705 lsrs r5,r2,#22
706 orrs r5,r4 @ x1=(q30)x
707 muls r5,r6 @ u0*x1 Q45
708 asrs r5,#15 @ v=u0*x1-1 Q30
709 muls r5,r6 @ u0*v Q45
710 asrs r5,#14
711 adds r5,#1
712 asrs r5,#1 @ round u0*v to Q30
713 lsls r6,#15
714 subs r6,r5 @ u1 Q30
715
716@ here
717@ r0:r1 y mantissa
718@ r2:r3 x mantissa
719@ r6 u1, second approximation to 1/x Q30
720@ r12: result sign, exponent
721
722 push {r2,r3}
723 lsls r4,r1,#11
724 lsrs r5,r0,#21
725 orrs r4,r5 @ y0=(q31)y
726 mul32_32_64 r4,r6, r4,r5, r2,r3,r7,r4,r5 @ y0*u1 Q61
727 adds r4,r4
728 adcs r5,r5 @ a0=(q30)(y0*u1)
729
730@ here
731@ r0:r1 y mantissa
732@ r5 a0, first approximation to y/x Q30
733@ r6 u1, second approximation to 1/x Q30
734@ r12 result sign, exponent
735
736 ldr r2,[r13,#0] @ xL
737 mul32_32_64 r2,r5, r2,r3, r1,r4,r7,r2,r3 @ xL*a0
738 ldr r4,[r13,#4] @ xH
739 muls r4,r5 @ xH*a0
740 adds r3,r4 @ r2:r3 now x*a0 Q82
741 lsrs r2,#25
742 lsls r1,r3,#7
743 orrs r2,r1 @ r2 now x*a0 Q57; r7:r2 is x*a0 Q89
744 lsls r4,r0,#5 @ y Q57
745 subs r0,r4,r2 @ r0x=y-x*a0 Q57 (signed)
746
747@ here
748@ r0 r0x Q57
749@ r5 a0, first approximation to y/x Q30
750@ r4 yL Q57
751@ r6 u1 Q30
752@ r12 result sign, exponent
753
754 muls32_32_64 r0,r6, r7,r6, r1,r2,r3, r7,r6 @ r7:r6 r0x*u1 Q87
755 asrs r3,r6,#25
756 adds r5,r3
757 lsls r3,r6,#7 @ r3:r5 a1 Q62 (but bottom 7 bits are zero so 55 bits of precision after binary point)
758@ here we could recover another 7 bits of precision (but not accuracy) from the top of r7
759@ but these bits are thrown away in the rounding and conversion to Q52 below
760
761@ here
762@ r3:r5 a1 Q62 candidate quotient [0.5,2) or so
763@ r4 yL Q57
764@ r12 result sign, exponent
765
766 movs r6,#0
767 adds r3,#128 @ for initial rounding to Q53
768 adcs r5,r5,r6
769 lsrs r1,r5,#30
770 bne dd_0
771@ here candidate quotient a1 is in range [0.5,1)
772@ so 30 significant bits in r5
773
774 lsls r4,#1 @ y now Q58
775 lsrs r1,r5,#9 @ to Q52
776 lsls r0,r5,#23
777 lsrs r3,#9 @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result
778 orrs r0,r3
779 bcs dd_1
780 b dd_2
781dd_0:
782@ here candidate quotient a1 is in range [1,2)
783@ so 31 significant bits in r5
784
785 movs r2,#4
786 add r12,r12,r2 @ fix exponent; r3:r5 now effectively Q61
787 adds r3,#128 @ complete rounding to Q53
788 adcs r5,r5,r6
789 lsrs r1,r5,#10
790 lsls r0,r5,#22
791 lsrs r3,#10 @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result
792 orrs r0,r3
793 bcc dd_2
794dd_1:
795
796@ here
797@ r0:r1 rounded result Q53 [0.5,1) or Q52 [1,2), but may not be correctly rounded-to-nearest
798@ r4 yL Q58 or Q57
799@ r12 result sign, exponent
800@ carry set
801
802 adcs r0,r0,r0
803 adcs r1,r1,r1 @ z Q53 with 1 in LSB
804 lsls r4,#16 @ Q105-32=Q73
805 ldr r2,[r13,#0] @ xL Q52
806 ldr r3,[r13,#4] @ xH Q20
807
808 movs r5,r1 @ zH Q21
809 muls r5,r2 @ zH*xL Q73
810 subs r4,r5
811 muls r3,r0 @ zL*xH Q73
812 subs r4,r3
813 mul32_32_64 r2,r0, r2,r3, r5,r6,r7,r2,r3 @ xL*zL
814 negs r2,r2 @ borrow from low half?
815 sbcs r4,r3 @ y-xz Q73 (remainder bits 52..73)
816
817 cmp r4,#0
818
819 bmi 1f
820 movs r2,#0 @ round up
821 adds r0,#1
822 adcs r1,r2
8231:
824 lsrs r0,#1 @ shift back down to Q52
825 lsls r2,r1,#31
826 orrs r0,r2
827 lsrs r1,#1
828dd_2:
829 add r13,#8
830 mov r2,r12
831 lsls r7,r2,#31 @ result sign
832 asrs r2,#2 @ result exponent
833 ldr r3,=0x3fd
834 adds r2,r3
835 ldr r3,=0x7fe
836 cmp r2,r3
837 bhs dd_3 @ over- or underflow?
838 lsls r2,#20
839 adds r1,r2 @ pack exponent
840dd_5:
841 adds r1,r7 @ pack sign
842 pop {r4-r7,r15}
843
844dd_3:
845 movs r0,#0
846 cmp r2,#0
847 bgt dd_4 @ overflow?
848 movs r1,r7
849 pop {r4-r7,r15}
850
851dd_4:
852 adds r3,#1 @ 0x7ff
853 lsls r1,r3,#20
854 b dd_5
855
856.section SECTION_NAME(dsqrt_shim)
857/*
858Approach to square root x=sqrt(y) is as follows.
859
860First generate a3, an approximation to 1/sqrt(y) to about 30 bits. Multiply this by y
861to give a4~sqrt(y) to about 28 bits and a remainder r4=y-a4^2. Then, because
862d sqrt(y) / dy = 1 / (2 sqrt(y)) let d4=r4*a3/2 and then the value a5=a4+d4 is
863a better approximation to sqrt(y). If this is near a rounding boundary we
864compute an exact remainder y-a5*a5 to decide whether to round up or down.
865
866The calculation of a3 and a4 is as given in dsqrttest.c. That code verifies exhaustively
867that | 1 - a3a4 | < 10*2^-32, | r4 | < 40*2^-32 and | r4/y | < 20*2^-32.
868
869More precisely, with "y" representing y truncated to 30 binary places:
870
871u=(q3)y; // 24-entry table
872a0=(q8~)"1/sqrtq(x+x_ulp/2)"(u); // first approximation from table
873p0=(q16)(a0*a0) * (q16)y;
874r0=(q20)(p0-1);
875dy0=(q15)(r0*a0); // Newton-Raphson correction term
876a1=(q16)a0-dy0/2; // good to ~9 bits
877
878p1=(q19)(a1*a1)*(q19)y;
879r1=(q23)(p1-1);
880dy1=(q15~)(r1*a1); // second Newton-Raphson correction
881a2x=(q16)a1-dy1/2; // good to ~16 bits
882a2=a2x-a2x/1t16; // prevent overflow of a2*a2 in 32 bits
883
884p2=(a2*a2)*(q30)y; // Q62
885r2=(q36)(p2-1+1t-31);
886dy2=(q30)(r2*a2); // Q52->Q30
887a3=(q31)a2-dy2/2; // good to about 30 bits
888a4=(q30)(a3*(q30)y+1t-31); // good to about 28 bits
889
890Error analysis
891
892 r₄ = y - a₄²
893 d₄ = 1/2 a₃r₄
894 a₅ = a₄ + d₄
895 r₅ = y - a₅²
896 = y - ( a₄ + d₄ )²
897 = y - a₄² - a₃a₄r₄ - 1/4 a₃²r₄²
898 = r₄ - a₃a₄r₄ - 1/4 a₃²r₄²
899
900 | r₅ | < | r₄ | | 1 - a₃a₄ | + 1/4 r₄²
901
902 a₅ = √y √( 1 - r₅/y )
903 = √y ( 1 - 1/2 r₅/y + ... )
904
905So to first order (second order being very tiny)
906
907 √y - a₅ = 1/2 r₅/y
908
909and
910
911 | √y - a₅ | < 1/2 ( | r₄/y | | 1 - a₃a₄ | + 1/4 r₄²/y )
912
913From dsqrttest.c (conservatively):
914
915 < 1/2 ( 20*2^-32 * 10*2^-32 + 1/4 * 40*2^-32*20*2^-32 )
916 = 1/2 ( 200 + 200 ) * 2^-64
917 < 2^-56
918
919Empirically we see about 1ulp worst-case error including rounding at Q57.
920
921To determine correctly whether the exact remainder calculation can be skipped we need a result
922accurate to < 0.25ulp at Q52, or 2^-54.
923*/
924
925dq_2:
926 bge dq_3 @ +Inf?
927 movs r1,#0
928 b dq_4
929
930dq_0:
931 lsrs r1,#31
932 lsls r1,#31 @ preserve sign bit
933 lsrs r2,#21 @ extract exponent
934 beq dq_4 @ -0? return it
935 asrs r1,#11 @ make -Inf
936 b dq_4
937
938dq_3:
939 ldr r1,=0x7ff
940 lsls r1,#20 @ return +Inf
941dq_4:
942 movs r0,#0
943dq_1:
944 bx r14
945
946.align 2
947regular_func dsqrt_shim
948 lsls r2,r1,#1
949 bcs dq_0 @ negative?
950 lsrs r2,#21 @ extract exponent
951 subs r2,#1
952 ldr r3,=0x7fe
953 cmp r2,r3
954 bhs dq_2 @ catches 0 and +Inf
955 push {r4-r7,r14}
956 lsls r4,r2,#20
957 subs r1,r4 @ insert implied 1
958 lsrs r2,#1
959 bcc 1f @ even exponent? skip
960 adds r0,r0,r0 @ odd exponent: shift up mantissa
961 adcs r1,r1,r1
9621:
963 lsrs r3,#2
964 adds r2,r3
965 lsls r2,#20
966 mov r12,r2 @ save result exponent
967
968@ here
969@ r0:r1 y mantissa Q52 [1,4)
970@ r12 result exponent
971.equ drsqrtapp_minus_8, (drsqrtapp-8)
972 adr r4,drsqrtapp_minus_8 @ first eight table entries are never accessed because of the mantissa's leading 1
973 lsrs r2,r1,#17 @ y Q3
974 ldrb r2,[r4,r2] @ initial approximation to reciprocal square root a0 Q8
975 lsrs r3,r1,#4 @ first Newton-Raphson iteration
976 muls r3,r2
977 muls r3,r2 @ i32 p0=a0*a0*(y>>14); // Q32
978 asrs r3,r3,#12 @ i32 r0=p0>>12; // Q20
979 muls r3,r2
980 asrs r3,#13 @ i32 dy0=(r0*a0)>>13; // Q15
981 lsls r2,#8
982 subs r2,r3 @ i32 a1=(a0<<8)-dy0; // Q16
983
984 movs r3,r2
985 muls r3,r3
986 lsrs r3,#13
987 lsrs r4,r1,#1
988 muls r3,r4 @ i32 p1=((a1*a1)>>11)*(y>>11); // Q19*Q19=Q38
989 asrs r3,#15 @ i32 r1=p1>>15; // Q23
990 muls r3,r2
991 asrs r3,#23
992 adds r3,#1
993 asrs r3,#1 @ i32 dy1=(r1*a1+(1<<23))>>24; // Q23*Q16=Q39; Q15
994 subs r2,r3 @ i32 a2=a1-dy1; // Q16
995 lsrs r3,r2,#16
996 subs r2,r3 @ if(a2>=0x10000) a2=0xffff; to prevent overflow of a2*a2
997
998@ here
999@ r0:r1 y mantissa
1000@ r2 a2 ~ 1/sqrt(y) Q16
1001@ r12 result exponent
1002
1003 movs r3,r2
1004 muls r3,r3
1005 lsls r1,#10
1006 lsrs r4,r0,#22
1007 orrs r1,r4 @ y Q30
1008 mul32_32_64 r1,r3, r4,r3, r5,r6,r7,r4,r3 @ i64 p2=(ui64)(a2*a2)*(ui64)y; // Q62 r4:r3
1009 lsls r5,r3,#6
1010 lsrs r4,#26
1011 orrs r4,r5
1012 adds r4,#0x20 @ i32 r2=(p2>>26)+0x20; // Q36 r4
1013 uxth r5,r4
1014 muls r5,r2
1015 asrs r4,#16
1016 muls r4,r2
1017 lsrs r5,#16
1018 adds r4,r5
1019 asrs r4,#6 @ i32 dy2=((i64)r2*(i64)a2)>>22; // Q36*Q16=Q52; Q30
1020 lsls r2,#15
1021 subs r2,r4
1022
1023@ here
1024@ r0 y low bits
1025@ r1 y Q30
1026@ r2 a3 ~ 1/sqrt(y) Q31
1027@ r12 result exponent
1028
1029 mul32_32_64 r2,r1, r3,r4, r5,r6,r7,r3,r4
1030 adds r3,r3,r3
1031 adcs r4,r4,r4
1032 adds r3,r3,r3
1033 movs r3,#0
1034 adcs r3,r4 @ ui32 a4=((ui64)a3*(ui64)y+(1U<<31))>>31; // Q30
1035
1036@ here
1037@ r0 y low bits
1038@ r1 y Q30
1039@ r2 a3 Q31 ~ 1/sqrt(y)
1040@ r3 a4 Q30 ~ sqrt(y)
1041@ r12 result exponent
1042
1043 square32_64 r3, r4,r5, r6,r5,r7
1044 lsls r6,r0,#8
1045 lsrs r7,r1,#2
1046 subs r6,r4
1047 sbcs r7,r5 @ r4=(q60)y-a4*a4
1048
1049@ by exhaustive testing, r4 = fffffffc0e134fdc .. 00000003c2bf539c Q60
1050
1051 lsls r5,r7,#29
1052 lsrs r6,#3
1053 adcs r6,r5 @ r4 Q57 with rounding
1054 muls32_32_64 r6,r2, r6,r2, r4,r5,r7,r6,r2 @ d4=a3*r4/2 Q89
1055@ r4+d4 is correct to 1ULP at Q57, tested on ~9bn cases including all extreme values of r4 for each possible y Q30
1056
1057 adds r2,#8
1058 asrs r2,#5 @ d4 Q52, rounded to Q53 with spare bit in carry
1059
1060@ here
1061@ r0 y low bits
1062@ r1 y Q30
1063@ r2 d4 Q52, rounded to Q53
1064@ C flag contains d4_b53
1065@ r3 a4 Q30
1066
1067 bcs dq_5
1068
1069 lsrs r5,r3,#10 @ a4 Q52
1070 lsls r4,r3,#22
1071
1072 asrs r1,r2,#31
1073 adds r0,r2,r4
1074 adcs r1,r5 @ a4+d4
1075
1076 add r1,r12 @ pack exponent
1077 pop {r4-r7,r15}
1078
1079.ltorg
1080
1081
1082@ round(sqrt(2^22./[68:8:252]))
1083drsqrtapp:
1084.byte 0xf8,0xeb,0xdf,0xd6,0xcd,0xc5,0xbe,0xb8
1085.byte 0xb2,0xad,0xa8,0xa4,0xa0,0x9c,0x99,0x95
1086.byte 0x92,0x8f,0x8d,0x8a,0x88,0x85,0x83,0x81
1087
1088dq_5:
1089@ here we are near a rounding boundary, C is set
1090 adcs r2,r2,r2 @ d4 Q53+1ulp
1091 lsrs r5,r3,#9
1092 lsls r4,r3,#23 @ r4:r5 a4 Q53
1093 asrs r1,r2,#31
1094 adds r4,r2,r4
1095 adcs r5,r1 @ r4:r5 a5=a4+d4 Q53+1ulp
1096 movs r3,r5
1097 muls r3,r4
1098 square32_64 r4,r1,r2,r6,r2,r7
1099 adds r2,r3
1100 adds r2,r3 @ r1:r2 a5^2 Q106
1101 lsls r0,#22 @ y Q84
1102
1103 negs r1,r1
1104 sbcs r0,r2 @ remainder y-a5^2
1105 bmi 1f @ y<a5^2: no need to increment a5
1106 movs r3,#0
1107 adds r4,#1
1108 adcs r5,r3 @ bump a5 if over rounding boundary
11091:
1110 lsrs r0,r4,#1
1111 lsrs r1,r5,#1
1112 lsls r5,#31
1113 orrs r0,r5
1114 add r1,r12
1115 pop {r4-r7,r15}
1116
1117@ "scientific" functions start here
1118
1119@ double-length CORDIC rotation step
1120
1121@ r0:r1 ω
1122@ r6 32-i (complementary shift)
1123@ r7 i (shift)
1124@ r8:r9 x
1125@ r10:r11 y
1126@ r12 coefficient pointer
1127
1128@ an option in rotation mode would be to compute the sequence of σ values
1129@ in one pass, rotate the initial vector by the residual ω and then run a
1130@ second pass to compute the final x and y. This would relieve pressure
1131@ on registers and hence possibly be faster. The same trick does not work
1132@ in vectoring mode (but perhaps one could work to single precision in
1133@ a first pass and then double precision in a second pass?).
1134
1135double_section dcordic_vec_step
1136 regular_func dcordic_vec_step
1137 mov r2,r12
1138 ldmia r2!,{r3,r4}
1139 mov r12,r2
1140 mov r2,r11
1141 cmp r2,#0
1142 blt 1f
1143 b 2f
1144
1145double_section dcordic_rot_step
1146 regular_func dcordic_rot_step
1147 mov r2,r12
1148 ldmia r2!,{r3,r4}
1149 mov r12,r2
1150 cmp r1,#0
1151 bge 1f
11522:
1153@ ω<0 / y>=0
1154@ ω+=dω
1155@ x+=y>>i, y-=x>>i
1156 adds r0,r3
1157 adcs r1,r4
1158
1159 mov r3,r11
1160 asrs r3,r7
1161 mov r4,r11
1162 lsls r4,r6
1163 mov r2,r10
1164 lsrs r2,r7
1165 orrs r2,r4 @ r2:r3 y>>i, rounding in carry
1166 mov r4,r8
1167 mov r5,r9 @ r4:r5 x
1168 adcs r2,r4
1169 adcs r3,r5 @ r2:r3 x+(y>>i)
1170 mov r8,r2
1171 mov r9,r3
1172
1173 mov r3,r5
1174 lsls r3,r6
1175 asrs r5,r7
1176 lsrs r4,r7
1177 orrs r4,r3 @ r4:r5 x>>i, rounding in carry
1178 mov r2,r10
1179 mov r3,r11
1180 sbcs r2,r4
1181 sbcs r3,r5 @ r2:r3 y-(x>>i)
1182 mov r10,r2
1183 mov r11,r3
1184 bx r14
1185
1186
1187@ ω>0 / y<0
1188@ ω-=dω
1189@ x-=y>>i, y+=x>>i
11901:
1191 subs r0,r3
1192 sbcs r1,r4
1193
1194 mov r3,r9
1195 asrs r3,r7
1196 mov r4,r9
1197 lsls r4,r6
1198 mov r2,r8
1199 lsrs r2,r7
1200 orrs r2,r4 @ r2:r3 x>>i, rounding in carry
1201 mov r4,r10
1202 mov r5,r11 @ r4:r5 y
1203 adcs r2,r4
1204 adcs r3,r5 @ r2:r3 y+(x>>i)
1205 mov r10,r2
1206 mov r11,r3
1207
1208 mov r3,r5
1209 lsls r3,r6
1210 asrs r5,r7
1211 lsrs r4,r7
1212 orrs r4,r3 @ r4:r5 y>>i, rounding in carry
1213 mov r2,r8
1214 mov r3,r9
1215 sbcs r2,r4
1216 sbcs r3,r5 @ r2:r3 x-(y>>i)
1217 mov r8,r2
1218 mov r9,r3
1219 bx r14
1220
1221@ convert packed double in r0:r1 to signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point], with rounding towards -Inf
1222@ fixed-point versions only work with reasonable values in r2 because of the way dunpacks works
1223
1224double_section double2int_shim
1225 regular_func double2int_shim
1226 movs r2,#0 @ and fall through
1227regular_func double2fix_shim
1228 push {r14}
1229 adds r2,#32
1230 bl double2fix64_shim
1231 movs r0,r1
1232 pop {r15}
1233
1234double_section double2uint_shim
1235 regular_func double2uint_shim
1236 movs r2,#0 @ and fall through
1237regular_func double2ufix_shim
1238 push {r14}
1239 adds r2,#32
1240 bl double2ufix64_shim
1241 movs r0,r1
1242 pop {r15}
1243
1244double_section double2int64_shim
1245 regular_func double2int64_shim
1246 movs r2,#0 @ and fall through
1247regular_func double2fix64_shim
1248 push {r14}
1249 bl d2fix
1250
1251 asrs r2,r1,#31
1252 cmp r2,r3
1253 bne 1f @ sign extension bits fail to match sign of result?
1254 pop {r15}
12551:
1256 mvns r0,r3
1257 movs r1,#1
1258 lsls r1,#31
1259 eors r1,r1,r0 @ generate extreme fixed-point values
1260 pop {r15}
1261
@ double (r0:r1) -> unsigned 64-bit integer in r0:r1, rounding towards -Inf
1262double_section double2uint64_shim
1263 regular_func double2uint64_shim
1264 movs r2,#0 @ and fall through
@ double (r0:r1) -> unsigned 64-bit fixed point, r2 places after the point, in r0:r1
1265regular_func double2ufix64_shim
1266 asrs r3,r1,#20 @ negative? return 0
1267 bmi ret_dzero
1268@ and fall through
1269
1270@ convert double in r0:r1 to signed fixed point in r0:r1:r3, r2 places after point, rounding towards -Inf
1271@ result clamped so that r3 can only be 0 or -1
1272@ trashes r12
1273.thumb_func
1274d2fix:
1275 push {r4,r14}
1276 mov r12,r2 @ stash requested binary point position for d2fix_a
1277 bl dunpacks
1278 asrs r4,r2,#16
1279 adds r4,#1
1280 bge 1f @ exponent not hugely negative? keep mantissa
1281 movs r1,#0 @ -0 -> +0
12821:
1283 asrs r3,r1,#31 @ r3 = mantissa sign extension bits
1284 ldr r4, =d2fix_a
1285 bx r4 @ tail-call shared shift/clamp code
1286
@ shared "return +0.0" exit used by the unsigned conversions above
1287ret_dzero:
1288 movs r0,#0
1289 movs r1,#0
1290 bx r14
1291
1292.weak d2fix_a // weak because it exists in float code too
1293.thumb_func
1294d2fix_a:
1295@ here
1296@ r0:r1 two's complement mantissa
1297@ r2 unbiased exponent
1298@ r3 mantissa sign extension bits
1299 add r2,r12 @ exponent plus offset for required binary point position
1300 subs r2,#52 @ required shift
1301 bmi 1f @ shift down?
1302@ here a shift up by r2 places
1303 cmp r2,#12 @ will clamp?
1304 bge 2f
1305 movs r4,r0
1306 lsls r1,r2
1307 lsls r0,r2
1308 negs r2,r2
1309 adds r2,#32 @ complementary shift
1310 lsrs r4,r2
1311 orrs r1,r4 @ carry low-word bits into the high word
1312 pop {r4,r15}
13132:
1314 mvns r0,r3
1315 mvns r1,r3 @ overflow: clamp to extreme fixed-point values
1316 pop {r4,r15}
13171:
1318@ here a shift down by -r2 places
1319 adds r2,#32
1320 bmi 1f @ long shift?
1321 mov r4,r1
1322 lsls r4,r2
1323 negs r2,r2
1324 adds r2,#32 @ complementary shift
1325 asrs r1,r2 @ arithmetic shift keeps the sign
1326 lsrs r0,r2
1327 orrs r0,r4
1328 pop {r4,r15}
13291:
1330@ here a long shift down
1331 movs r0,r1
1332 asrs r1,#31 @ shift down 32 places
1333 adds r2,#32
1334 bmi 1f @ very long shift?
1335 negs r2,r2
1336 adds r2,#32
1337 asrs r0,r2
1338 pop {r4,r15}
13391:
1340 movs r0,r3 @ result very near zero: use sign extension bits
1341 movs r1,r3
1342 pop {r4,r15}
1343
@ packed double in r0:r1 -> packed float in r0, round-to-nearest-even,
@ with underflow to signed zero and overflow to signed infinity
1344double_section double2float_shim
1345 regular_func double2float_shim
1346 lsls r2,r1,#1
1347 lsrs r2,#21 @ exponent
1348 ldr r3,=0x3ff-0x7f @ difference of the two formats' exponent biases
1349 subs r2,r3 @ fix exponent bias
1350 ble 1f @ underflow or zero
1351 cmp r2,#0xff
1352 bge 2f @ overflow or infinity
1353 lsls r2,#23 @ position exponent of result
1354 lsrs r3,r1,#31
1355 lsls r3,#31
1356 orrs r2,r3 @ insert sign
1357 lsls r3,r0,#3 @ rounding bits
1358 lsrs r0,#29
1359 lsls r1,#12
1360 lsrs r1,#9
1361 orrs r0,r1 @ assemble mantissa
1362 orrs r0,r2 @ insert exponent and sign
1363 lsls r3,#1 @ round bit -> carry, sticky bits -> Z flag
1364 bcc 3f @ no rounding
1365 beq 4f @ all sticky bits 0?
13665:
1367 adds r0,#1 @ round up
13683:
1369 bx r14
13704:
1371 lsrs r3,r0,#1 @ odd? then round up
1372 bcs 5b
1373 bx r14
13741:
1375 beq 6f @ check case where value is just less than smallest normal
13767:
1377 lsrs r0,r1,#31 @ underflow: return signed zero
1378 lsls r0,#31
1379 bx r14
13806:
1381 lsls r2,r1,#12 @ 20 1:s at top of mantissa?
1382 asrs r2,#12
1383 adds r2,#1
1384 bne 7b
1385 lsrs r2,r0,#29 @ and 3 more 1:s?
1386 cmp r2,#7
1387 bne 7b
1388 movs r2,#1 @ return smallest normal with correct sign
1389 b 8f
13902:
1391 movs r2,#0xff @ exponent field for infinity
13928:
@ shared packing path: r2=0xff gives ±Inf, r2=1 gives smallest normal
1393 lsrs r0,r1,#31 @ return signed infinity
1394 lsls r0,#8
1395 adds r0,r2
1396 lsls r0,#23
1397 bx r14
1398
1399double_section x2double_shims
1400@ convert signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point] to packed double in r0:r1, with rounding
1401
1402.align 2
@ unsigned 32-bit int in r0 -> double
1403regular_func uint2double_shim
1404 movs r1,#0 @ and fall through
@ unsigned 32-bit fixed point in r0, r1 places after point -> double
1405regular_func ufix2double_shim
1406 movs r2,r1 @ binary point position moves to r2 for the 64-bit entry
1407 movs r1,#0 @ zero-extend value to 64 bits
1408 b ufix642double_shim
1409
1410.align 2
@ signed 32-bit int in r0 -> double
1411regular_func int2double_shim
1412 movs r1,#0 @ and fall through
@ signed 32-bit fixed point in r0, r1 places after point -> double
1413regular_func fix2double_shim
1414 movs r2,r1 @ binary point position moves to r2 for the 64-bit entry
1415 asrs r1,r0,#31 @ sign extend
1416 b fix642double_shim
1417
1418.align 2
@ unsigned 64-bit int in r0:r1 -> double
1419regular_func uint642double_shim
1420 movs r2,#0 @ and fall through
@ unsigned 64-bit fixed point in r0:r1, r2 places after point -> double
1421regular_func ufix642double_shim
1422 movs r3,#0 @ sign = positive
1423 b uf2d
1424
1425.align 2
@ signed 64-bit int in r0:r1 -> double
1426regular_func int642double_shim
1427 movs r2,#0 @ and fall through
@ signed 64-bit fixed point in r0:r1, r2 places after point -> double
1428regular_func fix642double_shim
1429 asrs r3,r1,#31 @ sign bit across all bits
1430 eors r0,r3 @ take absolute value: x = (x ^ sign) - sign
1431 eors r1,r3
1432 subs r0,r3
1433 sbcs r1,r3
@ common pack path: unsigned mantissa r0:r1, -Q in r2, sign mask in r3
1434uf2d:
1435 push {r4,r5,r14}
1436 ldr r4,=0x432
1437 subs r2,r4,r2 @ form biased exponent
1438@ here
1439@ r0:r1 unnormalised mantissa
1440@ r2 -Q (will become exponent)
1441@ r3 sign across all bits
1442 cmp r1,#0
1443 bne 1f @ short normalising shift?
1444 movs r1,r0
1445 beq 2f @ zero? return it
1446 movs r0,#0
1447 subs r2,#32 @ fix exponent
14481:
1449 asrs r4,r1,#21
1450 bne 3f @ will need shift down (and rounding?)
1451 bcs 4f @ normalised already?
14525:
1453 subs r2,#1
1454 adds r0,r0 @ shift up
1455 adcs r1,r1
1456 lsrs r4,r1,#21
1457 bcc 5b
14584:
1459 ldr r4,=0x7fe
1460 cmp r2,r4
1461 bhs 6f @ over/underflow? return signed zero/infinity
14627:
1463 lsls r2,#20 @ pack and return
1464 adds r1,r2
1465 lsls r3,#31
1466 adds r1,r3 @ insert sign
14672:
1468 pop {r4,r5,r15}
14696: @ return signed zero/infinity according to unclamped exponent in r2
1470 mvns r2,r2
1471 lsrs r2,#21
1472 movs r0,#0
1473 movs r1,#0
1474 b 7b
1475
14763:
1477@ here we need to shift down to normalise and possibly round
1478 bmi 1f @ already normalised to Q63?
14792:
1480 subs r2,#1
1481 adds r0,r0 @ shift up
1482 adcs r1,r1
1483 bpl 2b
14841:
1485@ here we have a 1 in b63 of r0:r1
1486 adds r2,#11 @ correct exponent for subsequent shift down
1487 lsls r4,r0,#21 @ save bits for rounding
1488 lsrs r0,#11
1489 lsls r5,r1,#21
1490 orrs r0,r5
1491 lsrs r1,#11
1492 lsls r4,#1 @ round bit -> carry, sticky bits -> Z
1493 beq 1f @ sticky bits are zero?
14948:
1495 movs r4,#0
1496 adcs r0,r4 @ add carry (round up) through the 64-bit mantissa
1497 adcs r1,r4
1498 b 4b
14991:
1500 bcc 4b @ sticky bits are zero but not on rounding boundary
1501 lsrs r4,r0,#1 @ increment if odd (force round to even)
1502 b 8b
1503
1504
1505.ltorg
1506
@ unpack packed double in r0:r1 to signed mantissa r0:r1 plus
@ unbiased exponent in r2 (via the mdunpacks macro, then bias removal)
1507double_section dunpacks
1508 regular_func dunpacks
1509 mdunpacks r0,r1,r2,r3,r4
1510 ldr r3,=0x3ff
1511 subs r2,r3 @ exponent without offset
1512 bx r14
1513
1514@ r0:r1 signed mantissa Q52
1515@ r2 unbiased exponent < 10 (i.e., |x|<2^10)
1516@ r4 pointer to:
1517@ - divisor reciprocal approximation r=1/d Q15
1518@ - divisor d Q62 0..20
1519@ - divisor d Q62 21..41
1520@ - divisor d Q62 42..62
1521@ returns:
1522@ r0:r1 reduced result y Q62, -0.6 d < y < 0.6 d (better in practice)
1523@ r2 quotient q (number of reductions)
1524@ if exponent >=10, returns r0:r1=0, r2=1024*mantissa sign
1525@ designed to work for 0.5<d<2, in particular d=ln2 (~0.7) and d=π/2 (~1.6)
1526double_section dreduce
1527 regular_func dreduce
1528 adds r2,#2 @ e+2
1529 bmi 1f @ |x|<0.25, too small to need adjustment
1530 cmp r2,#12
1531 bge 4f
15322:
1533 movs r5,#17
1534 subs r5,r2 @ 15-e
1535 movs r3,r1 @ Q20
1536 asrs r3,r5 @ x Q5
1537 adds r2,#8 @ e+10
1538 adds r5,#7 @ 22-e = 32-(e+10)
1539 movs r6,r0
1540 lsrs r6,r5
1541 lsls r0,r2
1542 lsls r1,r2
1543 orrs r1,r6 @ r0:r1 x Q62
1544 ldmia r4,{r4-r7} @ r4=1/d Q15, r5:r6:r7 = d in 21-bit pieces
1545 muls r3,r4 @ rx Q20
1546 asrs r2,r3,#20
1547 movs r3,#0
1548 adcs r2,r3 @ rx Q0 rounded = q; for e.g. r=1.5 |q|<1.5*2^10
1549 muls r5,r2 @ qd in pieces: L Q62
1550 muls r6,r2 @ M Q41
1551 muls r7,r2 @ H Q20
1552 lsls r7,#10
1553 asrs r4,r6,#11
1554 lsls r6,#21
1555 adds r6,r5
1556 adcs r7,r4
1557 asrs r5,#31 @ sign-extend the low piece into the high word
1558 adds r7,r5 @ r6:r7 qd Q62
1559 subs r0,r6
1560 sbcs r1,r7 @ remainder Q62
1561 bx r14
15624:
1563 movs r2,#12 @ overflow: clamp to +/-1024
1564 movs r0,#0
1565 asrs r1,#31
1566 lsls r1,#1
1567 adds r1,#1 @ r1 = ±1 according to original sign...
1568 lsls r1,#20 @ ...positioned as a Q20 mantissa
1569 b 2b
1570
15711:
@ small input: no reduction needed (q=0), just rescale to Q62
1572 lsls r1,#8
1573 lsrs r3,r0,#24
1574 orrs r1,r3
1575 lsls r0,#8 @ r0:r1 Q60, to be shifted down -r2 places
1576 negs r3,r2
1577 adds r2,#32 @ shift down in r3, complementary shift in r2
1578 bmi 1f @ long shift?
15792:
1580 movs r4,r1
1581 asrs r1,r3
1582 lsls r4,r2
1583 lsrs r0,r3
1584 orrs r0,r4
1585 movs r2,#0 @ rounding
1586 adcs r0,r2
1587 adcs r1,r2
1588 bx r14
1589
15901:
1591 movs r0,r1 @ down 32 places
1592 asrs r1,#31
1593 subs r3,#32
1594 adds r2,#32
1595 bpl 2b
1596 movs r0,#0 @ very long shift? return 0
1597 movs r1,#0
1598 movs r2,#0
1599 bx r14
1600
@ tan of packed double in r0:r1 -> packed double in r0:r1,
@ computed as sin θ / cos θ from one shared CORDIC pass
1601double_section dtan_shim
1602 regular_func dtan_shim
1603 push {r4-r7,r14}
1604 bl push_r8_r11
1605 bl dsincos_internal
1606 mov r12,r0 @ save ε
1607 bl dcos_finish
1608 push {r0,r1} @ save cos θ (packed)
1609 mov r0,r12 @ restore ε for the sine path
1610 bl dsin_finish
1611 pop {r2,r3} @ r0:r1 = sin θ, r2:r3 = cos θ
1612 bl pop_r8_r11
1613 b ddiv0 @ compute sin θ/cos θ
1614
@ cos of packed double in r0:r1 -> packed double in r0:r1
1615double_section dcos_shim
1616 regular_func dcos_shim
1617 push {r4-r7,r14}
1618 bl push_r8_r11
1619 bl dsincos_internal
1620 bl dcos_finish
1621 b 1f @ share epilogue with dsin_shim (label 1 below)
1622
@ sin of packed double in r0:r1 -> packed double in r0:r1
1623double_section dsin_shim
1624 regular_func dsin_shim
1625 push {r4-r7,r14}
1626 bl push_r8_r11
1627 bl dsincos_internal
1628 bl dsin_finish
16291: @ epilogue shared with dcos_shim
1630 bl pop_r8_r11
1631 pop {r4-r7,r15}
1632
1633double_section dsincos_shim
1634
1635 @ Note that this function returns in r0-r3
@ sin and cos of packed double: r0:r1 = sin θ, r2:r3 = cos θ
1636 regular_func dsincos_shim
1637
1638 push {r4-r7,r14}
1639 bl push_r8_r11
1640 bl dsincos_internal
1641 mov r12,r0 @ save ε
1642 bl dcos_finish
1643 push {r0,r1} @ save cos θ (packed)
1644 mov r0,r12 @ restore ε for the sine path
1645 bl dsin_finish
1646 pop {r2,r3} @ r2:r3 = cos θ
1647 bl pop_r8_r11
1648 pop {r4-r7,r15}
1649
1650double_section dtrig_guts
1651
1652@ unpack double θ in r0:r1, range reduce and calculate ε, cos α and sin α such that
1653@ θ=α+ε and |ε|≤2^-32
1654@ on return:
1655@ r0:r1 ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
1656@ r8:r9 cos α Q62
1657@ r10:r11 sin α Q62
1658.align 2
1659.thumb_func
1660dsincos_internal:
1661 push {r14}
1662 bl dunpacks
1663 adr r4,dreddata0 @ reduce modulo π/2
1664 bl dreduce
1665
@ initialise the CORDIC starting vector according to quadrant (low 2 bits of q in r2)
1666 movs r4,#0
1667 ldr r5,=0x9df04dbb @ this value compensates for the non-unity scaling of the CORDIC rotations
1668 ldr r6,=0x36f656c5
1669 lsls r2,#31 @ bit 1 of quadrant -> carry
1670 bcc 1f
1671@ quadrant 2 or 3
1672 mvns r6,r6 @ negate the 64-bit starting value
1673 negs r5,r5
1674 adcs r6,r4
16751:
1676 lsls r2,#1 @ bit 0 of quadrant -> carry
1677 bcs 1f
1678@ even quadrant
1679 mov r10,r4 @ sin α = 0
1680 mov r11,r4
1681 mov r8,r5 @ cos α = ±scale constant
1682 mov r9,r6
1683 b 2f
16841:
1685@ odd quadrant
1686 mov r8,r4 @ cos α = 0
1687 mov r9,r4
1688 mov r10,r5 @ sin α = ±scale constant
1689 mov r11,r6
16902:
@ 32 CORDIC rotation steps driven from the atan table
1691 adr r4,dtab_cc
1692 mov r12,r4
1693 movs r7,#1
1694 movs r6,#31
16951:
1696 bl dcordic_rot_step
1697 adds r7,#1
1698 subs r6,#1
1699 cmp r7,#33
1700 bne 1b
1701 pop {r15}
1702
1703dcos_finish:
1704@ here
1705@ r0:r1 ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
1706@ r8:r9 cos α Q62
1707@ r10:r11 sin α Q62
1708@ and we wish to calculate cos θ=cos(α+ε)~cos α - ε sin α
1709 mov r1,r11
1710@ mov r2,r10
1711@ lsrs r2,#31
1712@ adds r1,r2 @ rounding improves accuracy very slightly
1713 muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
1714@ r2:r3 ε sin α Q(62+62-32)=Q92
1715 mov r0,r8
1716 mov r1,r9
1717 lsls r5,r3,#2 @ realign Q92 product to Q62
1718 asrs r3,r3,#30
1719 lsrs r2,r2,#30
1720 orrs r2,r5
1721 sbcs r0,r2 @ include rounding
1722 sbcs r1,r3
1723 movs r2,#62 @ result is Q62 fixed point...
1724 b fix642double_shim @ ...pack it as a double and return
1725
1726dsin_finish:
1727@ here
1728@ r0:r1 ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
1729@ r8:r9 cos α Q62
1730@ r10:r11 sin α Q62
1731@ and we wish to calculate sin θ=sin(α+ε)~sin α + ε cos α
1732 mov r1,r9
1733 muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
1734@ r2:r3 ε cos α Q(62+62-32)=Q92
1735 mov r0,r10
1736 mov r1,r11
1737 lsls r5,r3,#2 @ realign Q92 product to Q62
1738 asrs r3,r3,#30
1739 lsrs r2,r2,#30
1740 orrs r2,r5
1741 adcs r0,r2 @ include rounding
1742 adcs r1,r3
1743 movs r2,#62 @ result is Q62 fixed point...
1744 b fix642double_shim @ ...pack it as a double and return
1745
1746.ltorg
.align 2
@ range-reduction constants for the trig functions, consumed by dreduce
1748dreddata0:
1749.word 0x0000517d @ 2/π Q15
1750.word 0x0014611A @ π/2 Q62=6487ED5110B4611A split into 21-bit pieces
1751.word 0x000A8885
1752.word 0x001921FB
1753
1754
1755.align 2
@ atan2(y, x) of packed doubles -> packed double in r0:r1,
@ via octant reduction then a vectoring-mode CORDIC pass
1756regular_func datan2_shim
1757@ r0:r1 y
1758@ r2:r3 x
1759 push {r4-r7,r14}
1760 bl push_r8_r11
1761 ldr r5,=0x7ff00000 @ exponent-field mask
1762 movs r4,r1
1763 ands r4,r5 @ y==0?
1764 beq 1f
1765 cmp r4,r5 @ or Inf/NaN?
1766 bne 2f
17671:
1768 lsrs r1,#20 @ flush
1769 lsls r1,#20
1770 movs r0,#0
17712:
1772 movs r4,r3
1773 ands r4,r5 @ x==0?
1774 beq 1f
1775 cmp r4,r5 @ or Inf/NaN?
1776 bne 2f
17771:
1778 lsrs r3,#20 @ flush
1779 lsls r3,#20
1780 movs r2,#0
17812:
1782 movs r6,#0 @ quadrant offset
1783 lsls r5,#11 @ constant 0x80000000
1784 cmp r3,#0
1785 bpl 1f @ skip if x positive
1786 movs r6,#2
1787 eors r3,r5 @ negate x ...
1788 eors r1,r5 @ ... and y (sign-bit flips on packed values)
1789 bmi 1f @ quadrant offset=+2 if y was positive
1790 negs r6,r6 @ quadrant offset=-2 if y was negative
17911:
1792@ now in quadrant 0 or 3
1793 adds r7,r1,r5 @ r7=-r1
1794 bpl 1f
1795@ y>=0: in quadrant 0
1796 cmp r1,r3
1797 ble 2f @ y<~x so 0≤θ<~π/4: skip
1798 adds r6,#1
1799 eors r1,r5 @ negate x
1800 b 3f @ and exchange x and y = rotate by -π/2
18011:
1802 cmp r3,r7
1803 bge 2f @ -y<~x so -π/4<~θ≤0: skip
1804 subs r6,#1
1805 eors r3,r5 @ negate y and ...
18063:
1807 movs r7,r0 @ exchange x and y
1808 movs r0,r2
1809 movs r2,r7
1810 movs r7,r1
1811 movs r1,r3
1812 movs r3,r7
18132:
1814@ here -π/4<~θ<~π/4
1815@ r6 has quadrant offset
1816 push {r6}
1817 cmp r2,#0
1818 bne 1f
1819 cmp r3,#0
1820 beq 10f @ x==0 going into division?
1821 lsls r4,r3,#1
1822 asrs r4,#21
1823 adds r4,#1
1824 bne 1f @ x==Inf going into division?
1825 lsls r4,r1,#1
1826 asrs r4,#21
1827 adds r4,#1 @ y also ±Inf?
1828 bne 10f
1829 subs r1,#1 @ make them both just finite
1830 subs r3,#1
1831 b 1f
1832
183310:
1834 movs r0,#0 @ degenerate ratio: use 0, quadrant offset supplies the angle
1835 movs r1,#0
1836 b 12f
1837
18381:
1839 bl ddiv_shim
1840 movs r2,#62
1841 bl double2fix64_shim
1842@ r0:r1 y/x
@ vectoring-mode CORDIC: rotate (1, y/x) to the x-axis, accumulating the angle in r0:r1
1843 mov r10,r0
1844 mov r11,r1
1845 movs r0,#0 @ ω=0
1846 movs r1,#0
1847 mov r8,r0
1848 movs r2,#1
1849 lsls r2,#30
1850 mov r9,r2 @ x=1
1851
1852 adr r4,dtab_cc
1853 mov r12,r4
1854 movs r7,#1
1855 movs r6,#31
18561:
1857 bl dcordic_vec_step
1858 adds r7,#1
1859 subs r6,#1
1860 cmp r7,#33
1861 bne 1b
1862@ r0:r1 atan(y/x) Q62
1863@ r8:r9 x residual Q62
1864@ r10:r11 y residual Q62
1865 mov r2,r9
1866 mov r3,r10
1867 subs r2,#12 @ this makes atan(0)==0
1868@ the following is basically a division residual y/x ~ atan(residual y/x)
1869 movs r4,#1
1870 lsls r4,#29
1871 movs r7,#0
18722:
1873 lsrs r2,#1
1874 movs r3,r3 @ preserve carry
1875 bmi 1f
1876 sbcs r3,r2
1877 adds r0,r4
1878 adcs r1,r7
1879 lsrs r4,#1
1880 bne 2b
1881 b 3f
18821:
1883 adcs r3,r2
1884 subs r0,r4
1885 sbcs r1,r7
1886 lsrs r4,#1
1887 bne 2b
18883:
1889 lsls r6,r1,#31
1890 asrs r1,#1
1891 lsrs r0,#1
1892 orrs r0,r6 @ Q61
1893
189412:
1895 pop {r6} @ recover quadrant offset
1896
1897 cmp r6,#0
1898 beq 1f @ no offset to add
1899 ldr r4,=0x885A308D @ π/2 Q61
1900 ldr r5,=0x3243F6A8
1901 bpl 2f
1902 mvns r4,r4 @ negative quadrant offset
1903 mvns r5,r5
19042:
1905 lsls r6,#31
1906 bne 2f @ skip if quadrant offset is ±1
1907 adds r0,r4 @ offset ±2: add ±π/2 twice
1908 adcs r1,r5
19092:
1910 adds r0,r4
1911 adcs r1,r5
19121:
1913 movs r2,#61 @ result is Q61 fixed point; pack as double
1914 bl fix642double_shim
1915
1916 bl pop_r8_r11
1917 pop {r4-r7,r15}
1918
1919.ltorg
1920
@ CORDIC angle table: atan 2^-i, Q62, i=1..32 (shared by rotation and vectoring steps)
1921dtab_cc:
1922.word 0x61bb4f69, 0x1dac6705 @ atan 2^-1 Q62
1923.word 0x96406eb1, 0x0fadbafc @ atan 2^-2 Q62
1924.word 0xab0bdb72, 0x07f56ea6 @ atan 2^-3 Q62
1925.word 0xe59fbd39, 0x03feab76 @ atan 2^-4 Q62
1926.word 0xba97624b, 0x01ffd55b @ atan 2^-5 Q62
1927.word 0xdddb94d6, 0x00fffaaa @ atan 2^-6 Q62
1928.word 0x56eeea5d, 0x007fff55 @ atan 2^-7 Q62
1929.word 0xaab7776e, 0x003fffea @ atan 2^-8 Q62
1930.word 0x5555bbbc, 0x001ffffd @ atan 2^-9 Q62
1931.word 0xaaaaadde, 0x000fffff @ atan 2^-10 Q62
1932.word 0xf555556f, 0x0007ffff @ atan 2^-11 Q62
1933.word 0xfeaaaaab, 0x0003ffff @ atan 2^-12 Q62
1934.word 0xffd55555, 0x0001ffff @ atan 2^-13 Q62
1935.word 0xfffaaaab, 0x0000ffff @ atan 2^-14 Q62
1936.word 0xffff5555, 0x00007fff @ atan 2^-15 Q62
1937.word 0xffffeaab, 0x00003fff @ atan 2^-16 Q62
1938.word 0xfffffd55, 0x00001fff @ atan 2^-17 Q62
1939.word 0xffffffab, 0x00000fff @ atan 2^-18 Q62
1940.word 0xfffffff5, 0x000007ff @ atan 2^-19 Q62
1941.word 0xffffffff, 0x000003ff @ atan 2^-20 Q62
1942.word 0x00000000, 0x00000200 @ atan 2^-21 Q62 @ consider optimising these
1943.word 0x00000000, 0x00000100 @ atan 2^-22 Q62
1944.word 0x00000000, 0x00000080 @ atan 2^-23 Q62
1945.word 0x00000000, 0x00000040 @ atan 2^-24 Q62
1946.word 0x00000000, 0x00000020 @ atan 2^-25 Q62
1947.word 0x00000000, 0x00000010 @ atan 2^-26 Q62
1948.word 0x00000000, 0x00000008 @ atan 2^-27 Q62
1949.word 0x00000000, 0x00000004 @ atan 2^-28 Q62
1950.word 0x00000000, 0x00000002 @ atan 2^-29 Q62
1951.word 0x00000000, 0x00000001 @ atan 2^-30 Q62
1952.word 0x80000000, 0x00000000 @ atan 2^-31 Q62
1953.word 0x40000000, 0x00000000 @ atan 2^-32 Q62
1954
@ exp of packed double in r0:r1 -> packed double in r0:r1,
@ by range reduction modulo ln2 then pseudo-multiplication against the ln(1+2^-i) table
1955double_section dexp_guts
1956regular_func dexp_shim
1957 push {r4-r7,r14}
1958 bl dunpacks
1959 adr r4,dreddata1 @ reduce modulo ln2
1960 bl dreduce
1961 cmp r1,#0
1962 bge 1f @ make residual non-negative by...
1963 ldr r4,=0xF473DE6B
1964 ldr r5,=0x2C5C85FD @ ln2 Q62
1965 adds r0,r4 @ ...adding one more ln2...
1966 adcs r1,r5
1967 subs r2,#1 @ ...and decrementing the quotient
19681:
1969 push {r2} @ save integer part (power of 2 of result)
1970 movs r7,#1 @ shift
1971 adr r6,dtab_exp
1972 movs r2,#0
1973 movs r3,#1
1974 lsls r3,#30 @ x=1 Q62
1975
19763:
1977 ldmia r6!,{r4,r5} @ fetch ln(1+2^-i)
1978 mov r12,r6
1979 subs r0,r4 @ try subtracting it from the residual
1980 sbcs r1,r5
1981 bmi 1f @ went negative? undo and skip this factor
1982
1983 negs r6,r7
1984 adds r6,#32 @ complementary shift
1985 movs r5,r3
1986 asrs r5,r7
1987 movs r4,r3
1988 lsls r4,r6
1989 movs r6,r2
1990 lsrs r6,r7 @ rounding bit in carry
1991 orrs r4,r6
1992 adcs r2,r4
1993 adcs r3,r5 @ x+=x>>i
1994 b 2f
1995
19961:
1997 adds r0,r4 @ restore argument
1998 adcs r1,r5
19992:
2000 mov r6,r12
2001 adds r7,#1
2002 cmp r7,#33
2003 bne 3b
2004
2005@ here
2006@ r0:r1 ε (residual x, where x=a+ε) Q62, |ε|≤2^-32 (so fits in r0)
2007@ r2:r3 exp a Q62
2008@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε)
2009 muls32_32_64 r0,r3, r4,r1, r5,r6,r7,r4,r1
2010@ r4:r1 ε exp a Q(62+62-32)=Q92
2011 lsrs r4,#30 @ realign Q92 product to Q62
2012 lsls r0,r1,#2
2013 orrs r0,r4
2014 asrs r1,#30
2015 adds r0,r2
2016 adcs r1,r3
2017
2018 pop {r2} @ recover power of 2
2019 negs r2,r2
2020 adds r2,#62 @ binary point position for packing
2021 bl fix642double_shim @ in principle we can pack faster than this because we know the exponent
2022 pop {r4-r7,r15}
2023
2024.ltorg
2025
2026.align 2
@ natural log of packed double in r0:r1 -> packed double in r0:r1,
@ by pseudo-division against the ln(1+2^-i) table plus exponent*ln2
2027regular_func dln_shim
2028 push {r4-r7,r14}
2029 lsls r7,r1,#1
2030 bcs 5f @ <0 ...
2031 asrs r7,#21
2032 beq 5f @ ... or =0? return -Inf
2033 adds r7,#1
2034 beq 6f @ Inf/NaN? return +Inf
2035 bl dunpacks
2036 push {r2} @ save unbiased exponent
2037 lsls r1,#9 @ align mantissa to the top
2038 lsrs r2,r0,#23
2039 orrs r1,r2
2040 lsls r0,#9
2041@ r0:r1 m Q61 = m/2 Q62 0.5≤m/2<1
2042
2043 movs r7,#1 @ shift
2044 adr r6,dtab_exp
2045 mov r12,r6
2046 movs r2,#0
2047 movs r3,#0 @ y=0 Q62
2048
20493:
2050 negs r6,r7
2051 adds r6,#32 @ complementary shift
2052 movs r5,r1
2053 asrs r5,r7
2054 movs r4,r1
2055 lsls r4,r6
2056 movs r6,r0
2057 lsrs r6,r7
2058 orrs r4,r6 @ x>>i, rounding bit in carry
2059 adcs r4,r0
2060 adcs r5,r1 @ x+(x>>i)
2061
2062 lsrs r6,r5,#30
2063 bne 1f @ x+(x>>i)>1?
2064 movs r0,r4
2065 movs r1,r5 @ x+=x>>i
2066 mov r6,r12
2067 ldmia r6!,{r4,r5}
2068 subs r2,r4 @ y -= ln(1+2^-i)
2069 sbcs r3,r5
2070
20711:
2072 movs r4,#8 @ advance table pointer to next entry
2073 add r12,r4
2074 adds r7,#1
2075 cmp r7,#33
2076 bne 3b
2077@ here:
2078@ r0:r1 residual x, nearly 1 Q62
2079@ r2:r3 y ~ ln m/2 = ln m - ln2 Q62
2080@ result is y + ln2 + ln x ~ y + ln2 + (x-1)
2081 lsls r1,#2
2082 asrs r1,#2 @ x-1
2083 adds r2,r0
2084 adcs r3,r1
2085
2086 pop {r7} @ recover unbiased exponent
2087@ here:
2088@ r2:r3 ln m/2 = ln m - ln2 Q62
2089@ r7 unbiased exponent
2090.equ dreddata1_plus_4, (dreddata1+4)
2091 adr r4,dreddata1_plus_4 @ ln2 in 21-bit pieces
2092 ldmia r4,{r0,r1,r4}
2093 adds r7,#1 @ e+1 (absorbs the -ln2 in y above)
2094 muls r0,r7 @ Q62
2095 muls r1,r7 @ Q41
2096 muls r4,r7 @ Q20
2097 lsls r7,r1,#21
2098 asrs r1,#11
2099 asrs r5,r1,#31
2100 adds r0,r7
2101 adcs r1,r5
2102 lsls r7,r4,#10
2103 asrs r4,#22
2104 asrs r5,r1,#31
2105 adds r1,r7
2106 adcs r4,r5
2107@ r0:r1:r4 exponent*ln2 Q62
2108 asrs r5,r3,#31
2109 adds r0,r2
2110 adcs r1,r3
2111 adcs r4,r5
2112@ r0:r1:r4 result Q62
2113 movs r2,#62
21141:
2115 asrs r5,r1,#31
2116 cmp r4,r5
2117 beq 2f @ r4 a sign extension of r1?
2118 lsrs r0,#4 @ no: shift down 4 places and try again
2119 lsls r6,r1,#28
2120 orrs r0,r6
2121 lsrs r1,#4
2122 lsls r6,r4,#28
2123 orrs r1,r6
2124 asrs r4,#4
2125 subs r2,#4 @ track binary point position for packing
2126 b 1b
21272:
2128 bl fix642double_shim
2129 pop {r4-r7,r15}
2130
21315:
2132 ldr r1,=0xfff00000 @ -Inf
2133 movs r0,#0
2134 pop {r4-r7,r15}
2135
21366:
2137 ldr r1,=0x7ff00000 @ +Inf
2138 movs r0,#0
2139 pop {r4-r7,r15}
2140
2141.ltorg
2142
2143.align 2
@ range-reduction constants for exp/ln, consumed by dreduce and dln_shim
2144dreddata1:
2145.word 0x0000B8AA @ 1/ln2 Q15
2146.word 0x0013DE6B @ ln2 Q62=2C5C85FDF473DE6B split into 21-bit pieces
2147.word 0x000FEFA3
2148.word 0x000B1721
2149
@ pseudo-multiplication/division table: ln(1+2^-i), Q62, i=1..32 (used by dexp_shim and dln_shim)
2150dtab_exp:
2151.word 0xbf984bf3, 0x19f323ec @ log 1+2^-1 Q62
2152.word 0xcd4d10d6, 0x0e47fbe3 @ log 1+2^-2 Q62
2153.word 0x8abcb97a, 0x0789c1db @ log 1+2^-3 Q62
2154.word 0x022c54cc, 0x03e14618 @ log 1+2^-4 Q62
2155.word 0xe7833005, 0x01f829b0 @ log 1+2^-5 Q62
2156.word 0x87e01f1e, 0x00fe0545 @ log 1+2^-6 Q62
2157.word 0xac419e24, 0x007f80a9 @ log 1+2^-7 Q62
2158.word 0x45621781, 0x003fe015 @ log 1+2^-8 Q62
2159.word 0xa9ab10e6, 0x001ff802 @ log 1+2^-9 Q62
2160.word 0x55455888, 0x000ffe00 @ log 1+2^-10 Q62
2161.word 0x0aa9aac4, 0x0007ff80 @ log 1+2^-11 Q62
2162.word 0x01554556, 0x0003ffe0 @ log 1+2^-12 Q62
2163.word 0x002aa9ab, 0x0001fff8 @ log 1+2^-13 Q62
2164.word 0x00055545, 0x0000fffe @ log 1+2^-14 Q62
2165.word 0x8000aaaa, 0x00007fff @ log 1+2^-15 Q62
2166.word 0xe0001555, 0x00003fff @ log 1+2^-16 Q62
2167.word 0xf80002ab, 0x00001fff @ log 1+2^-17 Q62
2168.word 0xfe000055, 0x00000fff @ log 1+2^-18 Q62
2169.word 0xff80000b, 0x000007ff @ log 1+2^-19 Q62
2170.word 0xffe00001, 0x000003ff @ log 1+2^-20 Q62
2171.word 0xfff80000, 0x000001ff @ log 1+2^-21 Q62
2172.word 0xfffe0000, 0x000000ff @ log 1+2^-22 Q62
2173.word 0xffff8000, 0x0000007f @ log 1+2^-23 Q62
2174.word 0xffffe000, 0x0000003f @ log 1+2^-24 Q62
2175.word 0xfffff800, 0x0000001f @ log 1+2^-25 Q62
2176.word 0xfffffe00, 0x0000000f @ log 1+2^-26 Q62
2177.word 0xffffff80, 0x00000007 @ log 1+2^-27 Q62
2178.word 0xffffffe0, 0x00000003 @ log 1+2^-28 Q62
2179.word 0xfffffff8, 0x00000001 @ log 1+2^-29 Q62
2180.word 0xfffffffe, 0x00000000 @ log 1+2^-30 Q62
2181.word 0x80000000, 0x00000000 @ log 1+2^-31 Q62
2182.word 0x40000000, 0x00000000 @ log 1+2^-32 Q62
2183
2184
2185#endif