double__v1__rom__shim_8S_source.html

/**

 * Copyright (c) 2020 Mark Owen https://www.quinapalus.com .

 *

 * Raspberry Pi (Trading) Ltd (Licensor) hereby grants to you a non-exclusive license to use the software solely on a

 * Raspberry Pi Pico device. No other use is permitted under the terms of this license.

 *

 * This software is also available from the copyright owner under GPLv2 licence.

 *

 * THIS SOFTWARE IS PROVIDED BY THE LICENSOR AND COPYRIGHT OWNER "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,

 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 * DISCLAIMED. IN NO EVENT SHALL THE LICENSOR OR COPYRIGHT OWNER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR

 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,

 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */


#include "asm_helper.S"


.syntax unified

.cpu cortex-m0plus

.thumb


.macro double_section name

// todo separate flag for shims?

#if PICO_DOUBLE_IN_RAM

.section RAM_SECTION_NAME(\name), "ax"

#else

.section SECTION_NAME(\name), "ax"

#endif

.endm


double_section double_table_shim_on_use_helper

regular_func double_table_shim_on_use_helper

    push {r0-r2, lr}

    mov r0, ip

#ifndef NDEBUG

    // sanity check to make sure we weren't called by non (shimmable_) table_tail_call macro

    cmp r0, #0

    bne 1f

    bkpt #0

#endif

1:

    ldrh r1, [r0]

    lsrs r2, r1, #8

    adds r0, #2

    cmp r2, #0xdf

    bne 1b

    uxtb r1, r1 // r1 holds table offset

    lsrs r2, r0, #2

    bcc 1f

    // unaligned

    ldrh r2, [r0, #0]

    ldrh r0, [r0, #2]

    lsls r0, #16

    orrs r0, r2

    b 2f

1:

    ldr r0, [r0]

2:

    ldr r2, =sd_table

    str r0, [r2, r1]

    str r0, [sp, #12]

    pop {r0-r2, pc}


#if PICO_DOUBLE_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED

// Note that the V1 ROM has no double support, so this is basically the identical

// library, and shim inter-function calls do not bother to redirect back thru the

// wrapper functions


.equ use_hw_div,1

.equ IOPORT       ,0xd0000000

.equ DIV_UDIVIDEND,0x00000060

.equ DIV_UDIVISOR ,0x00000064

.equ DIV_QUOTIENT ,0x00000070

.equ DIV_CSR      ,0x00000078


@ Notation:

@ rx:ry means the concatenation of rx and ry with rx having the less significant bits


.equ debug,0

.macro mdump k

.if debug

 push {r0-r3}

 push {r14}

 push {r0-r3}

 bl osp

 movs r0,#\k

 bl o1ch

 pop {r0-r3}

 bl dump

 bl osp

 bl osp

 ldr r0,[r13]

 bl o8hex                      @ r14

 bl onl

 pop {r0}

 mov r14,r0

 pop {r0-r3}

.endif

.endm


@ IEEE double in ra:rb ->

@ mantissa in ra:rb 12Q52 (53 significant bits) with implied 1 set

@ exponent in re

@ sign in rs

@ trashes rt

.macro mdunpack ra,rb,re,rs,rt

 lsrs \re,\rb,#20              @ extract sign and exponent

 subs \rs,\re,#1

 lsls \rs,#20

 subs \rb,\rs                  @ clear sign and exponent in mantissa; insert implied 1

 lsrs \rs,\re,#11              @ sign

 lsls \re,#21

 lsrs \re,#21                  @ exponent

 beq l\@_1                     @ zero exponent?

 adds \rt,\re,#1

 lsrs \rt,#11

 beq l\@_2                     @ exponent != 0x7ff? then done

l\@_1:

 movs \ra,#0

 movs \rb,#1

 lsls \rb,#20

 subs \re,#128

 lsls \re,#12

l\@_2:

.endm


@ IEEE double in ra:rb ->

@ signed mantissa in ra:rb 12Q52 (53 significant bits) with implied 1

@ exponent in re

@ trashes rt0 and rt1

@ +zero, +denormal -> exponent=-0x80000

@ -zero, -denormal -> exponent=-0x80000

@ +Inf, +NaN -> exponent=+0x77f000

@ -Inf, -NaN -> exponent=+0x77e000

.macro mdunpacks ra,rb,re,rt0,rt1

 lsrs \re,\rb,#20              @ extract sign and exponent

 lsrs \rt1,\rb,#31             @ sign only

 subs \rt0,\re,#1

 lsls \rt0,#20

 subs \rb,\rt0                 @ clear sign and exponent in mantissa; insert implied 1

 lsls \re,#21

 bcc l\@_1                     @ skip on positive

 mvns \rb,\rb                  @ negate mantissa

 negs \ra,\ra

 bcc l\@_1

 adds \rb,#1

l\@_1:

 lsrs \re,#21

 beq l\@_2                     @ zero exponent?

 adds \rt0,\re,#1

 lsrs \rt0,#11

 beq l\@_3                     @ exponent != 0x7ff? then done

 subs \re,\rt1

l\@_2:

 movs \ra,#0

 lsls \rt1,#1                  @ +ve: 0  -ve: 2

 adds \rb,\rt1,#1              @ +ve: 1  -ve: 3

 lsls \rb,#30                  @ create +/-1 mantissa

 asrs \rb,#10

 subs \re,#128

 lsls \re,#12

l\@_3:

.endm


double_section WRAPPER_FUNC_NAME(__aeabi_dsub)


# frsub first because it is the only one that needs alignment

regular_func drsub_shim

    push {r0-r3}

    pop {r0-r1}

    pop {r2-r3}

    // fall thru


regular_func dsub_shim

 push {r4-r7,r14}

 movs r4,#1

 lsls r4,#31

 eors r3,r4                    @ flip sign on second argument

 b da_entry                    @ continue in dadd


.align 2

double_section dadd_shim

regular_func dadd_shim

 push {r4-r7,r14}

da_entry:

 mdunpacks r0,r1,r4,r6,r7

 mdunpacks r2,r3,r5,r6,r7

 subs r7,r5,r4                 @ ye-xe

 subs r6,r4,r5                 @ xe-ye

 bmi da_ygtx

@ here xe>=ye: need to shift y down r6 places

 mov r12,r4                    @ save exponent

 cmp r6,#32

 bge da_xrgty                  @ xe rather greater than ye?

 adds r7,#32

 movs r4,r2

 lsls r4,r4,r7                 @ rounding bit + sticky bits

da_xgty0:

 movs r5,r3

 lsls r5,r5,r7

 lsrs r2,r6

 asrs r3,r6

 orrs r2,r5

da_add:

 adds r0,r2

 adcs r1,r3

da_pack:

@ here unnormalised signed result (possibly 0) is in r0:r1 with exponent r12, rounding + sticky bits in r4

@ Note that if a large normalisation shift is required then the arguments were close in magnitude and so we

@ cannot have not gone via the xrgty/yrgtx paths. There will therefore always be enough high bits in r4

@ to provide a correct continuation of the exact result.

@ now pack result back up

 lsrs r3,r1,#31                @ get sign bit

 beq 1f                        @ skip on positive

 mvns r1,r1                    @ negate mantissa

 mvns r0,r0

 movs r2,#0

 negs r4,r4

 adcs r0,r2

 adcs r1,r2

1:

 mov r2,r12                    @ get exponent

 lsrs r5,r1,#21

 bne da_0                      @ shift down required?

 lsrs r5,r1,#20

 bne da_1                      @ normalised?

 cmp r0,#0

 beq da_5                      @ could mantissa be zero?

da_2:

 adds r4,r4

 adcs r0,r0

 adcs r1,r1

 subs r2,#1                    @ adjust exponent

 lsrs r5,r1,#20

 beq da_2

da_1:

 lsls r4,#1                    @ check rounding bit

 bcc da_3

da_4:

 adds r0,#1                    @ round up

 bcc 2f

 adds r1,#1

2:

 cmp r4,#0                     @ sticky bits zero?

 bne da_3

 lsrs r0,#1                    @ round to even

 lsls r0,#1

da_3:

 subs r2,#1

 bmi da_6

 adds r4,r2,#2                 @ check if exponent is overflowing

 lsrs r4,#11

 bne da_7

 lsls r2,#20                   @ pack exponent and sign

 add r1,r2

 lsls r3,#31

 add r1,r3

 pop {r4-r7,r15}


da_7:

@ here exponent overflow: return signed infinity

 lsls r1,r3,#31

 ldr r3,=0x7ff00000

 orrs r1,r3

 b 1f

da_6:

@ here exponent underflow: return signed zero

 lsls r1,r3,#31

1:

 movs r0,#0

 pop {r4-r7,r15}


da_5:

@ here mantissa could be zero

 cmp r1,#0

 bne da_2

 cmp r4,#0

 bne da_2

@ inputs must have been of identical magnitude and opposite sign, so return +0

 pop {r4-r7,r15}


da_0:

@ here a shift down by one place is required for normalisation

 adds r2,#1                    @ adjust exponent

 lsls r6,r0,#31                @ save rounding bit

 lsrs r0,#1

 lsls r5,r1,#31

 orrs r0,r5

 lsrs r1,#1

 cmp r6,#0

 beq da_3

 b da_4


da_xrgty:                      @ xe>ye and shift>=32 places

 cmp r6,#60

 bge da_xmgty                  @ xe much greater than ye?

 subs r6,#32

 adds r7,#64


 movs r4,r2

 lsls r4,r4,r7                 @ these would be shifted off the bottom of the sticky bits

 beq 1f

 movs r4,#1

1:

 lsrs r2,r2,r6

 orrs r4,r2

 movs r2,r3

 lsls r3,r3,r7

 orrs r4,r3

 asrs r3,r2,#31                @ propagate sign bit

 b da_xgty0


da_ygtx:

@ here ye>xe: need to shift x down r7 places

 mov r12,r5                    @ save exponent

 cmp r7,#32

 bge da_yrgtx                  @ ye rather greater than xe?

 adds r6,#32

 movs r4,r0

 lsls r4,r4,r6                 @ rounding bit + sticky bits

da_ygtx0:

 movs r5,r1

 lsls r5,r5,r6

 lsrs r0,r7

 asrs r1,r7

 orrs r0,r5

 b da_add


da_yrgtx:

 cmp r7,#60

 bge da_ymgtx                  @ ye much greater than xe?

 subs r7,#32

 adds r6,#64


 movs r4,r0

 lsls r4,r4,r6                 @ these would be shifted off the bottom of the sticky bits

 beq 1f

 movs r4,#1

1:

 lsrs r0,r0,r7

 orrs r4,r0

 movs r0,r1

 lsls r1,r1,r6

 orrs r4,r1

 asrs r1,r0,#31                @ propagate sign bit

 b da_ygtx0


da_ymgtx:                      @ result is just y

 movs r0,r2

 movs r1,r3

da_xmgty:                      @ result is just x

 movs r4,#0                    @ clear sticky bits

 b da_pack


.ltorg


@ equivalent of UMULL

@ needs five temporary registers

@ can have rt3==rx, in which case rx trashed

@ can have rt4==ry, in which case ry trashed

@ can have rzl==rx

@ can have rzh==ry

@ can have rzl,rzh==rt3,rt4

.macro mul32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4

                               @   t0   t1   t2   t3   t4

                               @                  (x)  (y)

 uxth \rt0,\rx                 @   xl

 uxth \rt1,\ry                 @        yl

 muls \rt0,\rt1                @  xlyl=L

 lsrs \rt2,\rx,#16             @             xh

 muls \rt1,\rt2                @       xhyl=M0

 lsrs \rt4,\ry,#16             @                       yh

 muls \rt2,\rt4                @           xhyh=H

 uxth \rt3,\rx                 @                   xl

 muls \rt3,\rt4                @                  xlyh=M1

 adds \rt1,\rt3                @      M0+M1=M

 bcc l\@_1                     @ addition of the two cross terms can overflow, so add carry into H

 movs \rt3,#1                  @                   1

 lsls \rt3,#16                 @                0x10000

 adds \rt2,\rt3                @             H'

l\@_1:

                               @   t0   t1   t2   t3   t4

                               @                 (zl) (zh)

 lsls \rzl,\rt1,#16            @                  ML

 lsrs \rzh,\rt1,#16            @                       MH

 adds \rzl,\rt0                @                  ZL

 adcs \rzh,\rt2                @                       ZH

.endm


@ SUMULL: x signed, y unsigned

@ in table below ¯ means signed variable

@ needs five temporary registers

@ can have rt3==rx, in which case rx trashed

@ can have rt4==ry, in which case ry trashed

@ can have rzl==rx

@ can have rzh==ry

@ can have rzl,rzh==rt3,rt4

.macro muls32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4

                               @   t0   t1   t2   t3   t4

                               @                 ¯(x)  (y)

 uxth \rt0,\rx                 @   xl

 uxth \rt1,\ry                 @        yl

 muls \rt0,\rt1                @  xlyl=L

 asrs \rt2,\rx,#16             @            ¯xh

 muls \rt1,\rt2                @      ¯xhyl=M0

 lsrs \rt4,\ry,#16             @                       yh

 muls \rt2,\rt4                @          ¯xhyh=H

 uxth \rt3,\rx                 @                   xl

 muls \rt3,\rt4                @                 xlyh=M1

 asrs \rt4,\rt1,#31            @                      M0sx   (M1 sign extension is zero)

 adds \rt1,\rt3                @      M0+M1=M

 movs \rt3,#0                  @                    0

 adcs \rt4,\rt3                @                      ¯Msx

 lsls \rt4,#16                 @                    ¯Msx<<16

 adds \rt2,\rt4                @             H'


                               @   t0   t1   t2   t3   t4

                               @                 (zl) (zh)

 lsls \rzl,\rt1,#16            @                  M~

 lsrs \rzh,\rt1,#16            @                       M~

 adds \rzl,\rt0                @                  ZL

 adcs \rzh,\rt2                @                      ¯ZH

.endm


@ SSMULL: x signed, y signed

@ in table below ¯ means signed variable

@ needs five temporary registers

@ can have rt3==rx, in which case rx trashed

@ can have rt4==ry, in which case ry trashed

@ can have rzl==rx

@ can have rzh==ry

@ can have rzl,rzh==rt3,rt4

.macro muls32_s32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4

                               @   t0   t1   t2   t3   t4

                               @                 ¯(x)  (y)

 uxth \rt0,\rx                 @   xl

 uxth \rt1,\ry                 @        yl

 muls \rt0,\rt1                @  xlyl=L

 asrs \rt2,\rx,#16             @            ¯xh

 muls \rt1,\rt2                @      ¯xhyl=M0

 asrs \rt4,\ry,#16             @                      ¯yh

 muls \rt2,\rt4                @          ¯xhyh=H

 uxth \rt3,\rx                 @                   xl

 muls \rt3,\rt4                @                ¯xlyh=M1

 adds \rt1,\rt3                @     ¯M0+M1=M

 asrs \rt3,\rt1,#31            @                  Msx

 bvc l\@_1                     @

 mvns \rt3,\rt3                @                 ¯Msx        flip sign extension bits if overflow

l\@_1:

 lsls \rt3,#16                 @                    ¯Msx<<16

 adds \rt2,\rt3                @             H'


                               @   t0   t1   t2   t3   t4

                               @                 (zl) (zh)

 lsls \rzl,\rt1,#16            @                  M~

 lsrs \rzh,\rt1,#16            @                       M~

 adds \rzl,\rt0                @                  ZL

 adcs \rzh,\rt2                @                      ¯ZH

.endm


@ can have rt2==rx, in which case rx trashed

@ can have rzl==rx

@ can have rzh==rt1

.macro square32_64 rx,rzl,rzh,rt0,rt1,rt2

                               @   t0   t1   t2   zl   zh

 uxth \rt0,\rx                 @   xl

 muls \rt0,\rt0                @ xlxl=L

 uxth \rt1,\rx                 @        xl

 lsrs \rt2,\rx,#16             @             xh

 muls \rt1,\rt2                @      xlxh=M

 muls \rt2,\rt2                @           xhxh=H

 lsls \rzl,\rt1,#17            @                  ML

 lsrs \rzh,\rt1,#15            @                       MH

 adds \rzl,\rt0                @                  ZL

 adcs \rzh,\rt2                @                       ZH

.endm


double_section dmul_shim

 regular_func dmul_shim

 push {r4-r7,r14}

 mdunpack r0,r1,r4,r6,r5

 mov r12,r4

 mdunpack r2,r3,r4,r7,r5

 eors r7,r6                    @ sign of result

 add r4,r12                    @ exponent of result

 push {r0-r2,r4,r7}


@ accumulate full product in r12:r5:r6:r7

 mul32_32_64 r0,r2, r0,r5, r4,r6,r7,r0,r5    @ XL*YL

 mov r12,r0                    @ save LL bits


 mul32_32_64 r1,r3, r6,r7, r0,r2,r4,r6,r7    @ XH*YH


 pop {r0}                      @ XL

 mul32_32_64 r0,r3, r0,r3, r1,r2,r4,r0,r3    @ XL*YH

 adds r5,r0

 adcs r6,r3

 movs r0,#0

 adcs r7,r0


 pop {r1,r2}                   @ XH,YL

 mul32_32_64 r1,r2, r1,r2, r0,r3,r4, r1,r2   @ XH*YL

 adds r5,r1

 adcs r6,r2

 movs r0,#0

 adcs r7,r0


@ here r5:r6:r7 holds the product [1..4) in Q(104-32)=Q72, with extra LSBs in r12

 pop {r3,r4}                   @ exponent in r3, sign in r4

 lsls r1,r7,#11

 lsrs r2,r6,#21

 orrs r1,r2

 lsls r0,r6,#11

 lsrs r2,r5,#21

 orrs r0,r2

 lsls r5,#11                   @ now r5:r0:r1 Q83=Q(51+32), extra LSBs in r12

 lsrs r2,r1,#20

 bne 1f                        @ skip if in range [2..4)

 adds r5,r5                    @ shift up so always [2..4) Q83, i.e. [1..2) Q84=Q(52+32)

 adcs r0,r0

 adcs r1,r1

 subs r3,#1                    @ correct exponent

1:

 ldr r6,=0x3ff

 subs r3,r6                    @ correct for exponent bias

 lsls r6,#1                    @ 0x7fe

 cmp r3,r6

 bhs dm_0                      @ exponent over- or underflow

 lsls r5,#1                    @ rounding bit to carry

 bcc 1f                        @ result is correctly rounded

 adds r0,#1

 movs r6,#0

 adcs r1,r6                    @ round up

 mov r6,r12                    @ remaining sticky bits

 orrs r5,r6

 bne 1f                        @ some sticky bits set?

 lsrs r0,#1

 lsls r0,#1                    @ round to even

1:

 lsls r3,#20

 adds r1,r3

dm_2:

 lsls r4,#31

 add r1,r4

 pop {r4-r7,r15}


@ here for exponent over- or underflow

dm_0:

 bge dm_1                      @ overflow?

 adds r3,#1                    @ would-be zero exponent?

 bne 1f

 adds r0,#1

 bne 1f                        @ all-ones mantissa?

 adds r1,#1

 lsrs r7,r1,#21

 beq 1f

 lsrs r1,#1

 b dm_2

1:

 lsls r1,r4,#31

 movs r0,#0

 pop {r4-r7,r15}


@ here for exponent overflow

dm_1:

 adds r6,#1                    @ 0x7ff

 lsls r1,r6,#20

 movs r0,#0

 b dm_2


.ltorg


@ Approach to division y/x is as follows.

@

@ First generate u1, an approximation to 1/x to about 29 bits. Multiply this by the top

@ 32 bits of y to generate a0, a first approximation to the result (good to 28 bits or so).

@ Calculate the exact remainder r0=y-a0*x, which will be about 0. Calculate a correction

@ d0=r0*u1, and then write a1=a0+d0. If near a rounding boundary, compute the exact

@ remainder r1=y-a1*x (which can be done using r0 as a basis) to determine whether to

@ round up or down.

@

@ The calculation of 1/x is as given in dreciptest.c. That code verifies exhaustively

@ that | u1*x-1 | < 10*2^-32.

@

@ More precisely:

@

@ x0=(q16)x;

@ x1=(q30)x;

@ y0=(q31)y;

@ u0=(q15~)"(0xffffffffU/(unsigned int)roundq(x/x_ulp))/powq(2,16)"(x0); // q15 approximation to 1/x; "~" denotes rounding rather than truncation

@ v=(q30)(u0*x1-1);

@ u1=(q30)u0-(q30~)(u0*v);

@

@ a0=(q30)(u1*y0);

@ r0=(q82)y-a0*x;

@ r0x=(q57)r0;

@ d0=r0x*u1;

@ a1=d0+a0;

@

@ Error analysis

@

@ Use Greek letters to represent the errors introduced by rounding and truncation.

@

@               r₀ = y - a₀x

@                  = y - [ u₁ ( y - α ) - β ] x    where 0 ≤ α < 2^-31, 0 ≤ β < 2^-30

@                  = y ( 1 - u₁x ) + ( u₁α + β ) x

@

@     Hence

@

@       | r₀ / x | < 2 * 10*2^-32 + 2^-31 + 2^-30

@                  = 26*2^-32

@

@               r₁ = y - a₁x

@                  = y - a₀x - d₀x

@                  = r₀ - d₀x

@                  = r₀ - u₁ ( r₀ - γ ) x    where 0 ≤ γ < 2^-57

@                  = r₀ ( 1 - u₁x ) + u₁γx

@

@     Hence

@

@       | r₁ / x | < 26*2^-32 * 10*2^-32 + 2^-57

@                  = (260+128)*2^-64

@                  < 2^-55

@

@ Empirically it seems to be nearly twice as good as this.

@

@ To determine correctly whether the exact remainder calculation can be skipped we need a result

@ accurate to < 0.25ulp. In the case where x>y the quotient will be shifted up one place for normalisation

@ and so 1ulp is 2^-53 and so the calculation above suffices.


double_section ddiv_shim

 regular_func ddiv_shim

 push {r4-r7,r14}

ddiv0:                         @ entry point from dtan

 mdunpack r2,r3,r4,r7,r6       @ unpack divisor


.if use_hw_div


 movs r5,#IOPORT>>24

 lsls r5,#24

 movs r6,#0

 mvns r6,r6

 str r6,[r5,#DIV_UDIVIDEND]

 lsrs r6,r3,#4                 @ x0=(q16)x

 str r6,[r5,#DIV_UDIVISOR]

@ if there are not enough cycles from now to the read of the quotient for

@ the divider to do its stuff we need a busy-wait here


.endif


@ unpack dividend by hand to save on register use

 lsrs r6,r1,#31

 adds r6,r7

 mov r12,r6                    @ result sign in r12b0; r12b1 trashed

 lsls r1,#1

 lsrs r7,r1,#21                @ exponent

 beq 1f                        @ zero exponent?

 adds r6,r7,#1

 lsrs r6,#11

 beq 2f                        @ exponent != 0x7ff? then done

1:

 movs r0,#0

 movs r1,#0

 subs r7,#64                   @ less drastic fiddling of exponents to get 0/0, Inf/Inf correct

 lsls r7,#12

2:

 subs r6,r7,r4

 lsls r6,#2

 add r12,r12,r6                @ (signed) exponent in r12[31..8]

 subs r7,#1                    @ implied 1

 lsls r7,#21

 subs r1,r7

 lsrs r1,#1


.if use_hw_div


 ldr r6,[r5,#DIV_QUOTIENT]

 adds r6,#1

 lsrs r6,#1


.else


@ this is not beautiful; could be replaced by better code that uses knowledge of divisor range

 push {r0-r3}

 movs r0,#0

 mvns r0,r0

 lsrs r1,r3,#4                 @ x0=(q16)x

 bl __aeabi_uidiv              @ !!! this could (but apparently does not) trash R12

 adds r6,r0,#1

 lsrs r6,#1

 pop {r0-r3}


.endif


@ here

@ r0:r1 y mantissa

@ r2:r3 x mantissa

@ r6    u0, first approximation to 1/x Q15

@ r12: result sign, exponent


 lsls r4,r3,#10

 lsrs r5,r2,#22

 orrs r5,r4                    @ x1=(q30)x

 muls r5,r6                    @ u0*x1 Q45

 asrs r5,#15                   @ v=u0*x1-1 Q30

 muls r5,r6                    @ u0*v Q45

 asrs r5,#14

 adds r5,#1

 asrs r5,#1                    @ round u0*v to Q30

 lsls r6,#15

 subs r6,r5                    @ u1 Q30


@ here

@ r0:r1 y mantissa

@ r2:r3 x mantissa

@ r6    u1, second approximation to 1/x Q30

@ r12: result sign, exponent


 push {r2,r3}

 lsls r4,r1,#11

 lsrs r5,r0,#21

 orrs r4,r5                    @ y0=(q31)y

 mul32_32_64 r4,r6, r4,r5, r2,r3,r7,r4,r5  @ y0*u1 Q61

 adds r4,r4

 adcs r5,r5                    @ a0=(q30)(y0*u1)


@ here

@ r0:r1 y mantissa

@ r5    a0, first approximation to y/x Q30

@ r6    u1, second approximation to 1/x Q30

@ r12   result sign, exponent


 ldr r2,[r13,#0]               @ xL

 mul32_32_64 r2,r5, r2,r3, r1,r4,r7,r2,r3  @ xL*a0

 ldr r4,[r13,#4]               @ xH

 muls r4,r5                    @ xH*a0

 adds r3,r4                    @ r2:r3 now x*a0 Q82

 lsrs r2,#25

 lsls r1,r3,#7

 orrs r2,r1                    @ r2 now x*a0 Q57; r7:r2 is x*a0 Q89

 lsls r4,r0,#5                 @ y Q57

 subs r0,r4,r2                 @ r0x=y-x*a0 Q57 (signed)


@ here

@ r0  r0x Q57

@ r5  a0, first approximation to y/x Q30

@ r4  yL  Q57

@ r6  u1 Q30

@ r12 result sign, exponent


 muls32_32_64 r0,r6, r7,r6, r1,r2,r3, r7,r6   @ r7:r6 r0x*u1 Q87

 asrs r3,r6,#25

 adds r5,r3

 lsls r3,r6,#7                 @ r3:r5 a1 Q62 (but bottom 7 bits are zero so 55 bits of precision after binary point)

@ here we could recover another 7 bits of precision (but not accuracy) from the top of r7

@ but these bits are thrown away in the rounding and conversion to Q52 below


@ here

@ r3:r5  a1 Q62 candidate quotient [0.5,2) or so

@ r4     yL Q57

@ r12    result sign, exponent


 movs r6,#0

 adds r3,#128                  @ for initial rounding to Q53

 adcs r5,r5,r6

 lsrs  r1,r5,#30

 bne dd_0

@ here candidate quotient a1 is in range [0.5,1)

@ so 30 significant bits in r5


 lsls r4,#1                    @ y now Q58

 lsrs r1,r5,#9                 @ to Q52

 lsls r0,r5,#23

 lsrs r3,#9                    @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result

 orrs r0,r3

 bcs dd_1

 b dd_2

dd_0:

@ here candidate quotient a1 is in range [1,2)

@ so 31 significant bits in r5


 movs r2,#4

 add r12,r12,r2                @ fix exponent; r3:r5 now effectively Q61

 adds r3,#128                  @ complete rounding to Q53

 adcs r5,r5,r6

 lsrs r1,r5,#10

 lsls r0,r5,#22

 lsrs r3,#10                   @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result

 orrs r0,r3

 bcc dd_2

dd_1:


@ here

@ r0:r1  rounded result Q53 [0.5,1) or Q52 [1,2), but may not be correctly rounded-to-nearest

@ r4     yL Q58 or Q57

@ r12    result sign, exponent

@ carry set


 adcs r0,r0,r0

 adcs r1,r1,r1                 @ z Q53 with 1 in LSB

 lsls r4,#16                   @ Q105-32=Q73

 ldr r2,[r13,#0]               @ xL Q52

 ldr r3,[r13,#4]               @ xH Q20


 movs r5,r1                    @ zH Q21

 muls r5,r2                    @ zH*xL Q73

 subs r4,r5

 muls r3,r0                    @ zL*xH Q73

 subs r4,r3

 mul32_32_64 r2,r0, r2,r3, r5,r6,r7,r2,r3  @ xL*zL

 negs r2,r2                    @ borrow from low half?

 sbcs r4,r3                    @ y-xz Q73 (remainder bits 52..73)


 cmp r4,#0


 bmi 1f

 movs r2,#0                    @ round up

 adds r0,#1

 adcs r1,r2

1:

 lsrs r0,#1                    @ shift back down to Q52

 lsls r2,r1,#31

 orrs r0,r2

 lsrs r1,#1

dd_2:

 add r13,#8

 mov r2,r12

 lsls r7,r2,#31                @ result sign

 asrs r2,#2                    @ result exponent

 ldr r3,=0x3fd

 adds r2,r3

 ldr r3,=0x7fe

 cmp r2,r3

 bhs dd_3                      @ over- or underflow?

 lsls r2,#20

 adds r1,r2                    @ pack exponent

dd_5:

 adds r1,r7                    @ pack sign

 pop {r4-r7,r15}


dd_3:

 movs r0,#0

 cmp r2,#0

 bgt dd_4                      @ overflow?

 movs r1,r7

 pop {r4-r7,r15}


dd_4:

 adds r3,#1                    @ 0x7ff

 lsls r1,r3,#20

 b dd_5


.section SECTION_NAME(dsqrt_shim)

/*

Approach to square root x=sqrt(y) is as follows.


First generate a3, an approximation to 1/sqrt(y) to about 30 bits. Multiply this by y

to give a4~sqrt(y) to about 28 bits and a remainder r4=y-a4^2. Then, because

d sqrt(y) / dy = 1 / (2 sqrt(y)) let d4=r4*a3/2 and then the value a5=a4+d4 is

a better approximation to sqrt(y). If this is near a rounding boundary we

compute an exact remainder y-a5*a5 to decide whether to round up or down.


The calculation of a3 and a4 is as given in dsqrttest.c. That code verifies exhaustively

that | 1 - a3a4 | < 10*2^-32, | r4 | < 40*2^-32 and | r4/y | < 20*2^-32.


More precisely, with "y" representing y truncated to 30 binary places:


u=(q3)y;                          // 24-entry table

a0=(q8~)"1/sqrtq(x+x_ulp/2)"(u);  // first approximation from table

p0=(q16)(a0*a0) * (q16)y;

r0=(q20)(p0-1);

dy0=(q15)(r0*a0);                 // Newton-Raphson correction term

a1=(q16)a0-dy0/2;                 // good to ~9 bits


p1=(q19)(a1*a1)*(q19)y;

r1=(q23)(p1-1);

dy1=(q15~)(r1*a1);                // second Newton-Raphson correction

a2x=(q16)a1-dy1/2;                // good to ~16 bits

a2=a2x-a2x/1t16;                  // prevent overflow of a2*a2 in 32 bits


p2=(a2*a2)*(q30)y;                // Q62

r2=(q36)(p2-1+1t-31);

dy2=(q30)(r2*a2);                 // Q52->Q30

a3=(q31)a2-dy2/2;                 // good to about 30 bits

a4=(q30)(a3*(q30)y+1t-31);        // good to about 28 bits


Error analysis


          r₄ = y - a₄²

          d₄ = 1/2 a₃r₄

          a₅ = a₄ + d₄

          r₅ = y - a₅²

             = y - ( a₄ + d₄ )²

             = y - a₄² - a₃a₄r₄ - 1/4 a₃²r₄²

             = r₄ - a₃a₄r₄ - 1/4 a₃²r₄²


      | r₅ | < | r₄ | | 1 - a₃a₄ | + 1/4 r₄²


          a₅ = √y √( 1 - r₅/y )

             = √y ( 1 - 1/2 r₅/y + ... )


So to first order (second order being very tiny)


     √y - a₅ = 1/2 r₅/y


and


 | √y - a₅ | < 1/2 ( | r₄/y | | 1 - a₃a₄ | + 1/4 r₄²/y )


From dsqrttest.c (conservatively):


             < 1/2 ( 20*2^-32 * 10*2^-32 + 1/4 * 40*2^-32*20*2^-32 )

             = 1/2 ( 200 + 200 ) * 2^-64

             < 2^-56


Empirically we see about 1ulp worst-case error including rounding at Q57.


To determine correctly whether the exact remainder calculation can be skipped we need a result

accurate to < 0.25ulp at Q52, or 2^-54.

*/


dq_2:

 bge dq_3                      @ +Inf?

 movs r1,#0

 b dq_4


dq_0:

 lsrs r1,#31

 lsls r1,#31                   @ preserve sign bit

 lsrs r2,#21                   @ extract exponent

 beq dq_4                      @ -0? return it

 asrs r1,#11                   @ make -Inf

 b dq_4


dq_3:

 ldr r1,=0x7ff

 lsls r1,#20                   @ return +Inf

dq_4:

 movs r0,#0

dq_1:

 bx r14


.align 2

regular_func dsqrt_shim

 lsls r2,r1,#1

 bcs dq_0                      @ negative?

 lsrs r2,#21                   @ extract exponent

 subs r2,#1

 ldr r3,=0x7fe

 cmp r2,r3

 bhs dq_2                      @ catches 0 and +Inf

 push {r4-r7,r14}

 lsls r4,r2,#20

 subs r1,r4                    @ insert implied 1

 lsrs r2,#1

 bcc 1f                        @ even exponent? skip

 adds r0,r0,r0                 @ odd exponent: shift up mantissa

 adcs r1,r1,r1

1:

 lsrs r3,#2

 adds r2,r3

 lsls r2,#20

 mov r12,r2                    @ save result exponent


@ here

@ r0:r1  y mantissa Q52 [1,4)

@ r12    result exponent

.equ drsqrtapp_minus_8, (drsqrtapp-8)

 adr r4,drsqrtapp_minus_8      @ first eight table entries are never accessed because of the mantissa's leading 1

 lsrs r2,r1,#17                @ y Q3

 ldrb r2,[r4,r2]               @ initial approximation to reciprocal square root a0 Q8

 lsrs r3,r1,#4                 @ first Newton-Raphson iteration

 muls r3,r2

 muls r3,r2                    @  i32 p0=a0*a0*(y>>14);          // Q32

 asrs r3,r3,#12                @  i32 r0=p0>>12;                 // Q20

 muls r3,r2

 asrs r3,#13                   @  i32 dy0=(r0*a0)>>13;           // Q15

 lsls r2,#8

 subs r2,r3                    @  i32 a1=(a0<<8)-dy0;         // Q16


 movs r3,r2

 muls r3,r3

 lsrs r3,#13

 lsrs r4,r1,#1

 muls r3,r4                    @  i32 p1=((a1*a1)>>11)*(y>>11);  // Q19*Q19=Q38

 asrs r3,#15                   @  i32 r1=p1>>15;                 // Q23

 muls r3,r2

 asrs r3,#23

 adds r3,#1

 asrs r3,#1                    @  i32 dy1=(r1*a1+(1<<23))>>24;   // Q23*Q16=Q39; Q15

 subs r2,r3                    @  i32 a2=a1-dy1;                 // Q16

 lsrs r3,r2,#16

 subs r2,r3                    @  if(a2>=0x10000) a2=0xffff; to prevent overflow of a2*a2


@ here

@ r0:r1 y mantissa

@ r2    a2 ~ 1/sqrt(y) Q16

@ r12   result exponent


 movs r3,r2

 muls r3,r3

 lsls r1,#10

 lsrs r4,r0,#22

 orrs r1,r4                    @ y Q30

 mul32_32_64 r1,r3, r4,r3, r5,r6,r7,r4,r3   @  i64 p2=(ui64)(a2*a2)*(ui64)y;  // Q62 r4:r3

 lsls r5,r3,#6

 lsrs r4,#26

 orrs r4,r5

 adds r4,#0x20                 @  i32 r2=(p2>>26)+0x20;          // Q36 r4

 uxth r5,r4

 muls r5,r2

 asrs r4,#16

 muls r4,r2

 lsrs r5,#16

 adds r4,r5

 asrs r4,#6                    @ i32 dy2=((i64)r2*(i64)a2)>>22; // Q36*Q16=Q52; Q30

 lsls r2,#15

 subs r2,r4


@ here

@ r0    y low bits

@ r1    y Q30

@ r2    a3 ~ 1/sqrt(y) Q31

@ r12   result exponent


 mul32_32_64 r2,r1, r3,r4, r5,r6,r7,r3,r4

 adds r3,r3,r3

 adcs r4,r4,r4

 adds r3,r3,r3

 movs r3,#0

 adcs r3,r4                    @ ui32 a4=((ui64)a3*(ui64)y+(1U<<31))>>31; // Q30


@ here

@ r0    y low bits

@ r1    y Q30

@ r2    a3 Q31 ~ 1/sqrt(y)

@ r3    a4 Q30 ~ sqrt(y)

@ r12   result exponent


 square32_64 r3, r4,r5, r6,r5,r7

 lsls r6,r0,#8

 lsrs r7,r1,#2

 subs r6,r4

 sbcs r7,r5                    @ r4=(q60)y-a4*a4


@ by exhaustive testing, r4 = fffffffc0e134fdc .. 00000003c2bf539c Q60


 lsls r5,r7,#29

 lsrs r6,#3

 adcs r6,r5                    @ r4 Q57 with rounding

 muls32_32_64 r6,r2, r6,r2, r4,r5,r7,r6,r2    @ d4=a3*r4/2 Q89

@ r4+d4 is correct to 1ULP at Q57, tested on ~9bn cases including all extreme values of r4 for each possible y Q30


 adds r2,#8

 asrs r2,#5                    @ d4 Q52, rounded to Q53 with spare bit in carry


@ here

@ r0    y low bits

@ r1    y Q30

@ r2    d4 Q52, rounded to Q53

@ C flag contains d4_b53

@ r3    a4 Q30


 bcs dq_5


 lsrs r5,r3,#10                @ a4 Q52

 lsls r4,r3,#22


 asrs r1,r2,#31

 adds r0,r2,r4

 adcs r1,r5                    @ a4+d4


 add r1,r12                    @ pack exponent

 pop {r4-r7,r15}


.ltorg


@ round(sqrt(2^22./[68:8:252]))

drsqrtapp:

.byte 0xf8,0xeb,0xdf,0xd6,0xcd,0xc5,0xbe,0xb8

.byte 0xb2,0xad,0xa8,0xa4,0xa0,0x9c,0x99,0x95

.byte 0x92,0x8f,0x8d,0x8a,0x88,0x85,0x83,0x81


dq_5:

@ here we are near a rounding boundary, C is set

 adcs r2,r2,r2                 @ d4 Q53+1ulp

 lsrs r5,r3,#9

 lsls r4,r3,#23                @ r4:r5 a4 Q53

 asrs r1,r2,#31

 adds r4,r2,r4

 adcs r5,r1                    @ r4:r5 a5=a4+d4 Q53+1ulp

 movs r3,r5

 muls r3,r4

 square32_64 r4,r1,r2,r6,r2,r7

 adds r2,r3

 adds r2,r3                    @ r1:r2 a5^2 Q106

 lsls r0,#22                   @ y Q84


 negs r1,r1

 sbcs r0,r2                    @ remainder y-a5^2

 bmi 1f                        @ y<a5^2: no need to increment a5

 movs r3,#0

 adds r4,#1

 adcs r5,r3                    @ bump a5 if over rounding boundary

1:

 lsrs r0,r4,#1

 lsrs r1,r5,#1

 lsls r5,#31

 orrs r0,r5

 add r1,r12

 pop {r4-r7,r15}


@ "scientific" functions start here


@ double-length CORDIC rotation step


@ r0:r1   ω

@ r6      32-i (complementary shift)

@ r7      i (shift)

@ r8:r9   x

@ r10:r11 y

@ r12     coefficient pointer


@ an option in rotation mode would be to compute the sequence of σ values

@ in one pass, rotate the initial vector by the residual ω and then run a

@ second pass to compute the final x and y. This would relieve pressure

@ on registers and hence possibly be faster. The same trick does not work

@ in vectoring mode (but perhaps one could work to single precision in

@ a first pass and then double precision in a second pass?).


double_section dcordic_vec_step

 regular_func dcordic_vec_step

 mov r2,r12

 ldmia r2!,{r3,r4}

 mov r12,r2

 mov r2,r11

 cmp r2,#0

 blt 1f

 b 2f


double_section dcordic_rot_step

 regular_func dcordic_rot_step

 mov r2,r12

 ldmia r2!,{r3,r4}

 mov r12,r2

 cmp r1,#0

 bge 1f

2:

@ ω<0 / y>=0

@ ω+=dω

@ x+=y>>i, y-=x>>i

 adds r0,r3

 adcs r1,r4


 mov r3,r11

 asrs r3,r7

 mov r4,r11

 lsls r4,r6

 mov r2,r10

 lsrs r2,r7

 orrs r2,r4                    @ r2:r3 y>>i, rounding in carry

 mov r4,r8

 mov r5,r9                     @ r4:r5 x

 adcs r2,r4

 adcs r3,r5                    @ r2:r3 x+(y>>i)

 mov r8,r2

 mov r9,r3


 mov r3,r5

 lsls r3,r6

 asrs r5,r7

 lsrs r4,r7

 orrs r4,r3                    @ r4:r5 x>>i, rounding in carry

 mov r2,r10

 mov r3,r11

 sbcs r2,r4

 sbcs r3,r5                    @ r2:r3 y-(x>>i)

 mov r10,r2

 mov r11,r3

 bx r14


@ ω>0 / y<0

@ ω-=dω

@ x-=y>>i, y+=x>>i

1:

 subs r0,r3

 sbcs r1,r4


 mov r3,r9

 asrs r3,r7

 mov r4,r9

 lsls r4,r6

 mov r2,r8

 lsrs r2,r7

 orrs r2,r4                    @ r2:r3 x>>i, rounding in carry

 mov r4,r10

 mov r5,r11                    @ r4:r5 y

 adcs r2,r4

 adcs r3,r5                    @ r2:r3 y+(x>>i)

 mov r10,r2

 mov r11,r3


 mov r3,r5

 lsls r3,r6

 asrs r5,r7

 lsrs r4,r7

 orrs r4,r3                    @ r4:r5 y>>i, rounding in carry

 mov r2,r8

 mov r3,r9

 sbcs r2,r4

 sbcs r3,r5                    @ r2:r3 x-(y>>i)

 mov r8,r2

 mov r9,r3

 bx r14


@ convert packed double in r0:r1 to signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point], with rounding towards -Inf

@ fixed-point versions only work with reasonable values in r2 because of the way dunpacks works


double_section double2int_shim

 regular_func double2int_shim

 movs r2,#0                    @ and fall through

regular_func double2fix_shim

 push {r14}

 adds r2,#32

 bl double2fix64_shim

 movs r0,r1

 pop {r15}


double_section double2uint_shim

 regular_func double2uint_shim

 movs r2,#0                    @ and fall through

regular_func double2ufix_shim

 push {r14}

 adds r2,#32

 bl double2ufix64_shim

 movs r0,r1

 pop {r15}


double_section double2int64_shim

 regular_func double2int64_shim

 movs r2,#0                    @ and fall through

regular_func double2fix64_shim

 push {r14}

 bl d2fix


 asrs r2,r1,#31

 cmp r2,r3

 bne 1f                        @ sign extension bits fail to match sign of result?

 pop {r15}

1:

 mvns r0,r3

 movs r1,#1

 lsls r1,#31

 eors r1,r1,r0                 @ generate extreme fixed-point values

 pop {r15}


double_section double2uint64_shim

 regular_func double2uint64_shim

 movs r2,#0                    @ and fall through

regular_func double2ufix64_shim

 asrs r3,r1,#20                @ negative? return 0

 bmi ret_dzero

@ and fall through


@ convert double in r0:r1 to signed fixed point in r0:r1:r3, r2 places after point, rounding towards -Inf

@ result clamped so that r3 can only be 0 or -1

@ trashes r12

.thumb_func

d2fix:

 push {r4,r14}

 mov r12,r2

 bl dunpacks

 asrs r4,r2,#16

 adds r4,#1

 bge 1f

 movs r1,#0                    @ -0 -> +0

1:

 asrs r3,r1,#31

 ldr r4, =d2fix_a

 bx r4


ret_dzero:

 movs r0,#0

 movs r1,#0

 bx r14


.weak d2fix_a // weak because it exists in float code too

.thumb_func

d2fix_a:

@ here

@ r0:r1 two's complement mantissa

@ r2    unbaised exponent

@ r3    mantissa sign extension bits

 add r2,r12                    @ exponent plus offset for required binary point position

 subs r2,#52                   @ required shift

 bmi 1f                        @ shift down?

@ here a shift up by r2 places

 cmp r2,#12                    @ will clamp?

 bge 2f

 movs r4,r0

 lsls r1,r2

 lsls r0,r2

 negs r2,r2

 adds r2,#32                   @ complementary shift

 lsrs r4,r2

 orrs r1,r4

 pop {r4,r15}

2:

 mvns r0,r3

 mvns r1,r3                    @ overflow: clamp to extreme fixed-point values

 pop {r4,r15}

1:

@ here a shift down by -r2 places

 adds r2,#32

 bmi 1f                        @ long shift?

 mov r4,r1

 lsls r4,r2

 negs r2,r2

 adds r2,#32                   @ complementary shift

 asrs r1,r2

 lsrs r0,r2

 orrs r0,r4

 pop {r4,r15}

1:

@ here a long shift down

 movs r0,r1

 asrs r1,#31                   @ shift down 32 places

 adds r2,#32

 bmi 1f                        @ very long shift?

 negs r2,r2

 adds r2,#32

 asrs r0,r2

 pop {r4,r15}

1:

 movs r0,r3                    @ result very near zero: use sign extension bits

 movs r1,r3

 pop {r4,r15}


double_section double2float_shim

 regular_func double2float_shim

 lsls r2,r1,#1

 lsrs r2,#21                   @ exponent

 ldr r3,=0x3ff-0x7f

 subs r2,r3                    @ fix exponent bias

 ble 1f                        @ underflow or zero

 cmp r2,#0xff

 bge 2f                        @ overflow or infinity

 lsls r2,#23                   @ position exponent of result

 lsrs r3,r1,#31

 lsls r3,#31

 orrs r2,r3                    @ insert sign

 lsls r3,r0,#3                 @ rounding bits

 lsrs r0,#29

 lsls r1,#12

 lsrs r1,#9

 orrs r0,r1                    @ assemble mantissa

 orrs r0,r2                    @ insert exponent and sign

 lsls r3,#1

 bcc 3f                        @ no rounding

 beq 4f                        @ all sticky bits 0?

5:

 adds r0,#1

3:

 bx r14

4:

 lsrs r3,r0,#1                 @ odd? then round up

 bcs 5b

 bx r14

1:

 beq 6f                        @ check case where value is just less than smallest normal

7:

 lsrs r0,r1,#31

 lsls r0,#31

 bx r14

6:

 lsls r2,r1,#12                @ 20 1:s at top of mantissa?

 asrs r2,#12

 adds r2,#1

 bne 7b

 lsrs r2,r0,#29                @ and 3 more 1:s?

 cmp r2,#7

 bne 7b

 movs r2,#1                    @ return smallest normal with correct sign

 b 8f

2:

 movs r2,#0xff

8:

 lsrs r0,r1,#31                @ return signed infinity

 lsls r0,#8

 adds r0,r2

 lsls r0,#23

 bx r14


double_section x2double_shims

@ convert signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point] to packed double in r0:r1, with rounding


.align 2

regular_func uint2double_shim

 movs r1,#0                    @ and fall through

regular_func ufix2double_shim

 movs r2,r1

 movs r1,#0

 b ufix642double_shim


.align 2

regular_func int2double_shim

 movs r1,#0                    @ and fall through

regular_func fix2double_shim

 movs r2,r1

 asrs r1,r0,#31                @ sign extend

 b fix642double_shim


.align 2

regular_func uint642double_shim

 movs r2,#0                    @ and fall through

regular_func ufix642double_shim

 movs r3,#0

 b uf2d


.align 2

regular_func int642double_shim

 movs r2,#0                    @ and fall through

regular_func fix642double_shim

 asrs r3,r1,#31                @ sign bit across all bits

 eors r0,r3

 eors r1,r3

 subs r0,r3

 sbcs r1,r3

uf2d:

 push {r4,r5,r14}

 ldr r4,=0x432

 subs r2,r4,r2                 @ form biased exponent

@ here

@ r0:r1 unnormalised mantissa

@ r2 -Q (will become exponent)

@ r3 sign across all bits

 cmp r1,#0

 bne 1f                        @ short normalising shift?

 movs r1,r0

 beq 2f                        @ zero? return it

 movs r0,#0

 subs r2,#32                   @ fix exponent

1:

 asrs r4,r1,#21

 bne 3f                        @ will need shift down (and rounding?)

 bcs 4f                        @ normalised already?

5:

 subs r2,#1

 adds r0,r0                    @ shift up

 adcs r1,r1

 lsrs r4,r1,#21

 bcc 5b

4:

 ldr r4,=0x7fe

 cmp r2,r4

 bhs 6f                        @ over/underflow? return signed zero/infinity

7:

 lsls r2,#20                   @ pack and return

 adds r1,r2

 lsls r3,#31

 adds r1,r3

2:

 pop {r4,r5,r15}

6:                             @ return signed zero/infinity according to unclamped exponent in r2

 mvns r2,r2

 lsrs r2,#21

 movs r0,#0

 movs r1,#0

 b 7b


3:

@ here we need to shift down to normalise and possibly round

 bmi 1f                        @ already normalised to Q63?

2:

 subs r2,#1

 adds r0,r0                    @ shift up

 adcs r1,r1

 bpl 2b

1:

@ here we have a 1 in b63 of r0:r1

 adds r2,#11                   @ correct exponent for subsequent shift down

 lsls r4,r0,#21                @ save bits for rounding

 lsrs r0,#11

 lsls r5,r1,#21

 orrs r0,r5

 lsrs r1,#11

 lsls r4,#1

 beq 1f                        @ sticky bits are zero?

8:

 movs r4,#0

 adcs r0,r4

 adcs r1,r4

 b 4b

1:

 bcc 4b                        @ sticky bits are zero but not on rounding boundary

 lsrs r4,r0,#1                 @ increment if odd (force round to even)

 b 8b


.ltorg


double_section dunpacks

 regular_func dunpacks

 mdunpacks r0,r1,r2,r3,r4

 ldr r3,=0x3ff

 subs r2,r3                    @ exponent without offset

 bx r14


@ r0:r1  signed mantissa Q52

@ r2     unbiased exponent < 10 (i.e., |x|<2^10)

@ r4     pointer to:

@          - divisor reciprocal approximation r=1/d Q15

@          - divisor d Q62  0..20

@          - divisor d Q62 21..41

@          - divisor d Q62 42..62

@ returns:

@ r0:r1  reduced result y Q62, -0.6 d < y < 0.6 d (better in practice)

@ r2     quotient q (number of reductions)

@ if exponent >=10, returns r0:r1=0, r2=1024*mantissa sign

@ designed to work for 0.5<d<2, in particular d=ln2 (~0.7) and d=π/2 (~1.6)

double_section dreduce

 regular_func dreduce

 adds r2,#2                    @ e+2

 bmi 1f                        @ |x|<0.25, too small to need adjustment

 cmp r2,#12

 bge 4f

2:

 movs r5,#17

 subs r5,r2                    @ 15-e

 movs r3,r1                    @ Q20

 asrs r3,r5                    @ x Q5

 adds r2,#8                    @ e+10

 adds r5,#7                    @ 22-e = 32-(e+10)

 movs r6,r0

 lsrs r6,r5

 lsls r0,r2

 lsls r1,r2

 orrs r1,r6                    @ r0:r1 x Q62

 ldmia r4,{r4-r7}

 muls r3,r4                    @ rx Q20

 asrs r2,r3,#20

 movs r3,#0

 adcs r2,r3                    @ rx Q0 rounded = q; for e.g. r=1.5 |q|<1.5*2^10

 muls r5,r2                    @ qd in pieces: L Q62

 muls r6,r2                    @               M Q41

 muls r7,r2                    @               H Q20

 lsls r7,#10

 asrs r4,r6,#11

 lsls r6,#21

 adds r6,r5

 adcs r7,r4

 asrs r5,#31

 adds r7,r5                    @ r6:r7 qd Q62

 subs r0,r6

 sbcs r1,r7                    @ remainder Q62

 bx r14

4:

 movs r2,#12                   @ overflow: clamp to +/-1024

 movs r0,#0

 asrs r1,#31

 lsls r1,#1

 adds r1,#1

 lsls r1,#20

 b 2b


1:

 lsls r1,#8

 lsrs r3,r0,#24

 orrs r1,r3

 lsls r0,#8                    @ r0:r1 Q60, to be shifted down -r2 places

 negs r3,r2

 adds r2,#32                   @ shift down in r3, complementary shift in r2

 bmi 1f                        @ long shift?

2:

 movs r4,r1

 asrs r1,r3

 lsls r4,r2

 lsrs r0,r3

 orrs r0,r4

 movs r2,#0                    @ rounding

 adcs r0,r2

 adcs r1,r2

 bx r14


1:

 movs r0,r1                    @ down 32 places

 asrs r1,#31

 subs r3,#32

 adds r2,#32

 bpl 2b

 movs r0,#0                    @ very long shift? return 0

 movs r1,#0

 movs r2,#0

 bx r14


double_section dtan_shim

 regular_func dtan_shim

 push {r4-r7,r14}

 bl push_r8_r11

 bl dsincos_internal

 mov r12,r0                    @ save ε

 bl dcos_finish

 push {r0,r1}

 mov r0,r12

 bl dsin_finish

 pop {r2,r3}

 bl pop_r8_r11

 b ddiv0                       @ compute sin θ/cos θ


double_section dcos_shim

 regular_func dcos_shim

 push {r4-r7,r14}

 bl push_r8_r11

 bl dsincos_internal

 bl dcos_finish

 b 1f


double_section dsin_shim

 regular_func dsin_shim

 push {r4-r7,r14}

 bl push_r8_r11

 bl dsincos_internal

 bl dsin_finish

1:

 bl pop_r8_r11

 pop {r4-r7,r15}


double_section dsincos_shim


 @ Note that this function returns in r0-r3

 regular_func dsincos_shim


 push {r4-r7,r14}

 bl push_r8_r11

 bl dsincos_internal

 mov r12,r0                    @ save ε

 bl dcos_finish

 push {r0,r1}

 mov r0,r12

 bl dsin_finish

 pop {r2,r3}

 bl pop_r8_r11

 pop {r4-r7,r15}


double_section dtrig_guts


@ unpack double θ in r0:r1, range reduce and calculate ε, cos α and sin α such that

@ θ=α+ε and |ε|≤2^-32

@ on return:

@ r0:r1   ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)

@ r8:r9   cos α Q62

@ r10:r11 sin α Q62

.align 2

.thumb_func

dsincos_internal:

 push {r14}

 bl dunpacks

 adr r4,dreddata0

 bl dreduce


 movs r4,#0

 ldr r5,=0x9df04dbb           @ this value compensates for the non-unity scaling of the CORDIC rotations

 ldr r6,=0x36f656c5

 lsls r2,#31

 bcc 1f

@ quadrant 2 or 3

 mvns r6,r6

 negs r5,r5

 adcs r6,r4

1:

 lsls r2,#1

 bcs 1f

@ even quadrant

 mov r10,r4

 mov r11,r4

 mov r8,r5

 mov r9,r6

 b 2f

1:

@ odd quadrant

 mov r8,r4

 mov r9,r4

 mov r10,r5

 mov r11,r6

2:

 adr r4,dtab_cc

 mov r12,r4

 movs r7,#1

 movs r6,#31

1:

 bl dcordic_rot_step

 adds r7,#1

 subs r6,#1

 cmp r7,#33

 bne 1b

 pop {r15}


dcos_finish:

@ here

@ r0:r1   ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)

@ r8:r9   cos α Q62

@ r10:r11 sin α Q62

@ and we wish to calculate cos θ=cos(α+ε)~cos α - ε sin α

 mov r1,r11

@ mov r2,r10

@ lsrs r2,#31

@ adds r1,r2                    @ rounding improves accuracy very slightly

 muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3

@ r2:r3   ε sin α Q(62+62-32)=Q92

 mov r0,r8

 mov r1,r9

 lsls r5,r3,#2

 asrs r3,r3,#30

 lsrs r2,r2,#30

 orrs r2,r5

 sbcs r0,r2                    @ include rounding

 sbcs r1,r3

 movs r2,#62

 b fix642double_shim


dsin_finish:

@ here

@ r0:r1   ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)

@ r8:r9   cos α Q62

@ r10:r11 sin α Q62

@ and we wish to calculate sin θ=sin(α+ε)~sin α + ε cos α

 mov r1,r9

 muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3

@ r2:r3   ε cos α Q(62+62-32)=Q92

 mov r0,r10

 mov r1,r11

 lsls r5,r3,#2

 asrs r3,r3,#30

 lsrs r2,r2,#30

 orrs r2,r5

 adcs r0,r2                    @ include rounding

 adcs r1,r3

 movs r2,#62

 b fix642double_shim


.ltorg

.align 2

dreddata0:

.word 0x0000517d               @ 2/π Q15

.word 0x0014611A               @ π/2 Q62=6487ED5110B4611A split into 21-bit pieces

.word 0x000A8885

.word 0x001921FB


.align 2

regular_func datan2_shim

@ r0:r1 y

@ r2:r3 x

 push {r4-r7,r14}

 bl push_r8_r11

 ldr r5,=0x7ff00000

 movs r4,r1

 ands r4,r5                    @ y==0?

 beq 1f

 cmp r4,r5                     @ or Inf/NaN?

 bne 2f

1:

 lsrs r1,#20                   @ flush

 lsls r1,#20

 movs r0,#0

2:

 movs r4,r3

 ands r4,r5                    @ x==0?

 beq 1f

 cmp r4,r5                     @ or Inf/NaN?

 bne 2f

1:

 lsrs r3,#20                   @ flush

 lsls r3,#20

 movs r2,#0

2:

 movs r6,#0                    @ quadrant offset

 lsls r5,#11                   @ constant 0x80000000

 cmp r3,#0

 bpl 1f                        @ skip if x positive

 movs r6,#2

 eors r3,r5

 eors r1,r5

 bmi 1f                        @ quadrant offset=+2 if y was positive

 negs r6,r6                    @ quadrant offset=-2 if y was negative

1:

@ now in quadrant 0 or 3

 adds r7,r1,r5                 @ r7=-r1

 bpl 1f

@ y>=0: in quadrant 0

 cmp r1,r3

 ble 2f                        @ y<~x so 0≤θ<~π/4: skip

 adds r6,#1

 eors r1,r5                    @ negate x

 b 3f                          @ and exchange x and y = rotate by -π/2

1:

 cmp r3,r7

 bge 2f                        @ -y<~x so -π/4<~θ≤0: skip

 subs r6,#1

 eors r3,r5                    @ negate y and ...

3:

 movs r7,r0                    @ exchange x and y

 movs r0,r2

 movs r2,r7

 movs r7,r1

 movs r1,r3

 movs r3,r7

2:

@ here -π/4<~θ<~π/4

@ r6 has quadrant offset

 push {r6}

 cmp r2,#0

 bne 1f

 cmp r3,#0

 beq 10f                       @ x==0 going into division?

 lsls r4,r3,#1

 asrs r4,#21

 adds r4,#1

 bne 1f                        @ x==Inf going into division?

 lsls r4,r1,#1

 asrs r4,#21

 adds r4,#1                    @ y also ±Inf?

 bne 10f

 subs r1,#1                    @ make them both just finite

 subs r3,#1

 b 1f


10:

 movs r0,#0

 movs r1,#0

 b 12f


1:

 bl ddiv_shim

 movs r2,#62

 bl double2fix64_shim

@ r0:r1 y/x

 mov r10,r0

 mov r11,r1

 movs r0,#0                    @ ω=0

 movs r1,#0

 mov r8,r0

 movs r2,#1

 lsls r2,#30

 mov r9,r2                     @ x=1


 adr r4,dtab_cc

 mov r12,r4

 movs r7,#1

 movs r6,#31

1:

 bl dcordic_vec_step

 adds r7,#1

 subs r6,#1

 cmp r7,#33

 bne 1b

@ r0:r1   atan(y/x) Q62

@ r8:r9   x residual Q62

@ r10:r11 y residual Q62

 mov r2,r9

 mov r3,r10

 subs r2,#12                   @ this makes atan(0)==0

@ the following is basically a division residual y/x ~ atan(residual y/x)

 movs r4,#1

 lsls r4,#29

 movs r7,#0

2:

 lsrs r2,#1

 movs r3,r3                    @ preserve carry

 bmi 1f

 sbcs r3,r2

 adds r0,r4

 adcs r1,r7

 lsrs r4,#1

 bne 2b

 b 3f

1:

 adcs r3,r2

 subs r0,r4

 sbcs r1,r7

 lsrs r4,#1

 bne 2b

3:

 lsls r6,r1,#31

 asrs r1,#1

 lsrs r0,#1

 orrs r0,r6                    @ Q61


12:

 pop {r6}


 cmp r6,#0

 beq 1f

 ldr r4,=0x885A308D           @ π/2 Q61

 ldr r5,=0x3243F6A8

 bpl 2f

 mvns r4,r4                    @ negative quadrant offset

 mvns r5,r5

2:

 lsls r6,#31

 bne 2f                        @ skip if quadrant offset is ±1

 adds r0,r4

 adcs r1,r5

2:

 adds r0,r4

 adcs r1,r5

1:

 movs r2,#61

 bl fix642double_shim


 bl pop_r8_r11

 pop {r4-r7,r15}


.ltorg


dtab_cc:

.word 0x61bb4f69, 0x1dac6705   @ atan 2^-1 Q62

.word 0x96406eb1, 0x0fadbafc   @ atan 2^-2 Q62

.word 0xab0bdb72, 0x07f56ea6   @ atan 2^-3 Q62

.word 0xe59fbd39, 0x03feab76   @ atan 2^-4 Q62

.word 0xba97624b, 0x01ffd55b   @ atan 2^-5 Q62

.word 0xdddb94d6, 0x00fffaaa   @ atan 2^-6 Q62

.word 0x56eeea5d, 0x007fff55   @ atan 2^-7 Q62

.word 0xaab7776e, 0x003fffea   @ atan 2^-8 Q62

.word 0x5555bbbc, 0x001ffffd   @ atan 2^-9 Q62

.word 0xaaaaadde, 0x000fffff   @ atan 2^-10 Q62

.word 0xf555556f, 0x0007ffff   @ atan 2^-11 Q62

.word 0xfeaaaaab, 0x0003ffff   @ atan 2^-12 Q62

.word 0xffd55555, 0x0001ffff   @ atan 2^-13 Q62

.word 0xfffaaaab, 0x0000ffff   @ atan 2^-14 Q62

.word 0xffff5555, 0x00007fff   @ atan 2^-15 Q62

.word 0xffffeaab, 0x00003fff   @ atan 2^-16 Q62

.word 0xfffffd55, 0x00001fff   @ atan 2^-17 Q62

.word 0xffffffab, 0x00000fff   @ atan 2^-18 Q62

.word 0xfffffff5, 0x000007ff   @ atan 2^-19 Q62

.word 0xffffffff, 0x000003ff   @ atan 2^-20 Q62

.word 0x00000000, 0x00000200   @ atan 2^-21 Q62 @ consider optimising these

.word 0x00000000, 0x00000100   @ atan 2^-22 Q62

.word 0x00000000, 0x00000080   @ atan 2^-23 Q62

.word 0x00000000, 0x00000040   @ atan 2^-24 Q62

.word 0x00000000, 0x00000020   @ atan 2^-25 Q62

.word 0x00000000, 0x00000010   @ atan 2^-26 Q62

.word 0x00000000, 0x00000008   @ atan 2^-27 Q62

.word 0x00000000, 0x00000004   @ atan 2^-28 Q62

.word 0x00000000, 0x00000002   @ atan 2^-29 Q62

.word 0x00000000, 0x00000001   @ atan 2^-30 Q62

.word 0x80000000, 0x00000000   @ atan 2^-31 Q62

.word 0x40000000, 0x00000000   @ atan 2^-32 Q62


double_section dexp_guts

regular_func dexp_shim

 push {r4-r7,r14}

 bl dunpacks

 adr r4,dreddata1

 bl dreduce

 cmp r1,#0

 bge 1f

 ldr r4,=0xF473DE6B

 ldr r5,=0x2C5C85FD           @ ln2 Q62

 adds r0,r4

 adcs r1,r5

 subs r2,#1

1:

 push {r2}

 movs r7,#1                    @ shift

 adr r6,dtab_exp

 movs r2,#0

 movs r3,#1

 lsls r3,#30                   @ x=1 Q62


3:

 ldmia r6!,{r4,r5}

 mov r12,r6

 subs r0,r4

 sbcs r1,r5

 bmi 1f


 negs r6,r7

 adds r6,#32                   @ complementary shift

 movs r5,r3

 asrs r5,r7

 movs r4,r3

 lsls r4,r6

 movs r6,r2

 lsrs r6,r7                    @ rounding bit in carry

 orrs r4,r6

 adcs r2,r4

 adcs r3,r5                    @ x+=x>>i

 b 2f


1:

 adds r0,r4                    @ restore argument

 adcs r1,r5

2:

 mov r6,r12

 adds r7,#1

 cmp r7,#33

 bne 3b


@ here

@ r0:r1   ε (residual x, where x=a+ε) Q62, |ε|≤2^-32 (so fits in r0)

@ r2:r3   exp a Q62

@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε)

 muls32_32_64 r0,r3, r4,r1, r5,r6,r7,r4,r1

@ r4:r1 ε exp a Q(62+62-32)=Q92

 lsrs r4,#30

 lsls r0,r1,#2

 orrs r0,r4

 asrs r1,#30

 adds r0,r2

 adcs r1,r3


 pop {r2}

 negs r2,r2

 adds r2,#62

 bl fix642double_shim                 @ in principle we can pack faster than this because we know the exponent

 pop {r4-r7,r15}


.ltorg


.align 2

regular_func dln_shim

 push {r4-r7,r14}

 lsls r7,r1,#1

 bcs 5f                        @ <0 ...

 asrs r7,#21

 beq 5f                        @ ... or =0? return -Inf

 adds r7,#1

 beq 6f                        @ Inf/NaN? return +Inf

 bl dunpacks

 push {r2}

 lsls r1,#9

 lsrs r2,r0,#23

 orrs r1,r2

 lsls r0,#9

@ r0:r1 m Q61 = m/2 Q62 0.5≤m/2<1


 movs r7,#1                    @ shift

 adr r6,dtab_exp

 mov r12,r6

 movs r2,#0

 movs r3,#0                    @ y=0 Q62


3:

 negs r6,r7

 adds r6,#32                   @ complementary shift

 movs r5,r1

 asrs r5,r7

 movs r4,r1

 lsls r4,r6

 movs r6,r0

 lsrs r6,r7

 orrs r4,r6                    @ x>>i, rounding bit in carry

 adcs r4,r0

 adcs r5,r1                    @ x+(x>>i)


 lsrs r6,r5,#30

 bne 1f                        @ x+(x>>i)>1?

 movs r0,r4

 movs r1,r5                    @ x+=x>>i

 mov r6,r12

 ldmia r6!,{r4,r5}

 subs r2,r4

 sbcs r3,r5


1:

 movs r4,#8

 add r12,r4

 adds r7,#1

 cmp r7,#33

 bne 3b

@ here:

@ r0:r1 residual x, nearly 1 Q62

@ r2:r3 y ~ ln m/2 = ln m - ln2 Q62

@ result is y + ln2 + ln x ~ y + ln2 + (x-1)

 lsls r1,#2

 asrs r1,#2                    @ x-1

 adds r2,r0

 adcs r3,r1


 pop {r7}

@ here:

@ r2:r3 ln m/2 = ln m - ln2 Q62

@ r7    unbiased exponent

.equ dreddata1_plus_4, (dreddata1+4)

 adr r4,dreddata1_plus_4

 ldmia r4,{r0,r1,r4}

 adds r7,#1

 muls r0,r7                    @ Q62

 muls r1,r7                    @ Q41

 muls r4,r7                    @ Q20

 lsls r7,r1,#21

 asrs r1,#11

 asrs r5,r1,#31

 adds r0,r7

 adcs r1,r5

 lsls r7,r4,#10

 asrs r4,#22

 asrs r5,r1,#31

 adds r1,r7

 adcs r4,r5

@ r0:r1:r4 exponent*ln2 Q62

 asrs r5,r3,#31

 adds r0,r2

 adcs r1,r3

 adcs r4,r5

@ r0:r1:r4 result Q62

 movs r2,#62

1:

 asrs r5,r1,#31

 cmp r4,r5

 beq 2f                        @ r4 a sign extension of r1?

 lsrs r0,#4                    @ no: shift down 4 places and try again

 lsls r6,r1,#28

 orrs r0,r6

 lsrs r1,#4

 lsls r6,r4,#28

 orrs r1,r6

 asrs r4,#4

 subs r2,#4

 b 1b

2:

 bl fix642double_shim

 pop {r4-r7,r15}


5:

 ldr r1,=0xfff00000

 movs r0,#0

 pop {r4-r7,r15}


6:

 ldr r1,=0x7ff00000

 movs r0,#0

 pop {r4-r7,r15}


.ltorg


.align 2

dreddata1:

.word 0x0000B8AA               @ 1/ln2 Q15

.word 0x0013DE6B               @ ln2 Q62 Q62=2C5C85FDF473DE6B split into 21-bit pieces

.word 0x000FEFA3

.word 0x000B1721


dtab_exp:

.word 0xbf984bf3, 0x19f323ec   @ log 1+2^-1 Q62

.word 0xcd4d10d6, 0x0e47fbe3   @ log 1+2^-2 Q62

.word 0x8abcb97a, 0x0789c1db   @ log 1+2^-3 Q62

.word 0x022c54cc, 0x03e14618   @ log 1+2^-4 Q62

.word 0xe7833005, 0x01f829b0   @ log 1+2^-5 Q62

.word 0x87e01f1e, 0x00fe0545   @ log 1+2^-6 Q62

.word 0xac419e24, 0x007f80a9   @ log 1+2^-7 Q62

.word 0x45621781, 0x003fe015   @ log 1+2^-8 Q62

.word 0xa9ab10e6, 0x001ff802   @ log 1+2^-9 Q62

.word 0x55455888, 0x000ffe00   @ log 1+2^-10 Q62

.word 0x0aa9aac4, 0x0007ff80   @ log 1+2^-11 Q62

.word 0x01554556, 0x0003ffe0   @ log 1+2^-12 Q62

.word 0x002aa9ab, 0x0001fff8   @ log 1+2^-13 Q62

.word 0x00055545, 0x0000fffe   @ log 1+2^-14 Q62

.word 0x8000aaaa, 0x00007fff   @ log 1+2^-15 Q62

.word 0xe0001555, 0x00003fff   @ log 1+2^-16 Q62

.word 0xf80002ab, 0x00001fff   @ log 1+2^-17 Q62

.word 0xfe000055, 0x00000fff   @ log 1+2^-18 Q62

.word 0xff80000b, 0x000007ff   @ log 1+2^-19 Q62

.word 0xffe00001, 0x000003ff   @ log 1+2^-20 Q62

.word 0xfff80000, 0x000001ff   @ log 1+2^-21 Q62

.word 0xfffe0000, 0x000000ff   @ log 1+2^-22 Q62

.word 0xffff8000, 0x0000007f   @ log 1+2^-23 Q62

.word 0xffffe000, 0x0000003f   @ log 1+2^-24 Q62

.word 0xfffff800, 0x0000001f   @ log 1+2^-25 Q62

.word 0xfffffe00, 0x0000000f   @ log 1+2^-26 Q62

.word 0xffffff80, 0x00000007   @ log 1+2^-27 Q62

.word 0xffffffe0, 0x00000003   @ log 1+2^-28 Q62

.word 0xfffffff8, 0x00000001   @ log 1+2^-29 Q62

.word 0xfffffffe, 0x00000000   @ log 1+2^-30 Q62

.word 0x80000000, 0x00000000   @ log 1+2^-31 Q62

.word 0x40000000, 0x00000000   @ log 1+2^-32 Q62


#endif