YAHAL
Yet Another Hardware Abstraction Library
Loading...
Searching...
No Matches
float_v1_rom_shim.S
1/*
2 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
3 *
4 * SPDX-License-Identifier: BSD-3-Clause
5 */
6
7#include "asm_helper.S"
8
9#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
10.syntax unified
11.cpu cortex-m0plus
12.thumb
13
14#ifndef PICO_FLOAT_IN_RAM
15#define PICO_FLOAT_IN_RAM 0
16#endif
17
// float_section <name>
//
// Switches the assembler to an executable ("ax") section named after <name>.
// The RAM variant of the section name is used when PICO_FLOAT_IN_RAM is
// non-zero, the regular variant otherwise. Both name-building macros come from
// outside this file (NOTE(review): presumably asm_helper.S - confirm).
.macro float_section name
// todo separate flag for shims?
#if !PICO_FLOAT_IN_RAM
    .section SECTION_NAME(\name), "ax"
#else
    .section RAM_SECTION_NAME(\name), "ax"
#endif
.endm
26
float_section float_table_shim_on_use_helper

// float_table_shim_on_use_helper
//
// In:    ip = address to start scanning from
//             (NOTE(review): presumably loaded by the shimmable table_tail_call
//             macro named in the sanity check below - confirm in asm_helper.S)
// Does:  walks forward one halfword at a time from ip until it meets a halfword
//        whose top byte is 0xdf. The low byte of that halfword is taken as a
//        byte offset into sf_table, and the 32-bit word that follows it is
//        taken as the value to store there.
// Out:   sf_table[offset] is overwritten with that word, and the same word
//        replaces the saved lr on the stack so the final pop branches to it
//        with the caller-supplied r0-r2 restored.
regular_func float_table_shim_on_use_helper
    push    {r0-r2, lr}
    mov     r0, ip
#ifndef NDEBUG
    // sanity check to make sure we were not called by a non (shimmable_) table_tail_call macro
    cmp     r0, #0
    bne     .Lshim_scan
    bkpt    #0
#endif
.Lshim_scan:
    ldrh    r1, [r0]                // r1 = candidate halfword
    lsrs    r2, r1, #8              // r2 = its top byte
    adds    r0, #2                  // advance past it (r0 ends up just after the marker)
    cmp     r2, #0xdf
    bne     .Lshim_scan             // keep scanning until the top byte is 0xdf
    uxtb    r1, r1                  // r1 holds table offset
    lsrs    r2, r0, #2              // carry = bit 1 of r0, i.e. set when r0 is not word aligned
    bcc     .Lshim_load_aligned
    // unaligned: assemble the word from two halfword loads
    ldrh    r2, [r0]                // low half
    ldrh    r0, [r0, #2]            // high half
    lsls    r0, #16
    orrs    r0, r2
    b       .Lshim_patch
.Lshim_load_aligned:
    ldr     r0, [r0]
.Lshim_patch:
    ldr     r2, =sf_table
    str     r0, [r2, r1]            // patch the table entry
    str     r0, [sp, #12]           // overwrite the saved lr slot (4th pushed word)
    pop     {r0-r2, pc}             // restore arguments and jump to the loaded word
59
float_section 642float_shims

@ Four entry points share one body:
@   uint642float_shim   unsigned 64-bit integer in r0:r1  -> float, rounding
@   ufix642float_shim   unsigned 64-bit fixed point in r0:r1, r2 bits after the point
@   int642float_shim    signed 64-bit integer in r0:r1    -> float, rounding
@   fix642float_shim    signed 64-bit fixed point in r0:r1, r2 bits after the point
@ The integer entries just zero r2 and fall into the matching fixed-point entry.
@ All paths leave through ret_pop45, which pops r4, r5 and the return address.

@ convert uint64 to float, rounding
regular_func uint642float_shim
    movs    r2, #0                  @ no fractional bits, then fall through

@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
regular_func ufix642float_shim
    push    {r4, r5, lr}
    cmp     r1, #0
    bpl     .Lfix64_signed          @ top bit clear: the signed code handles it
    lsls    r5, r1, #31             @ bit shifted out of r1 contributes to sticky bits
    orrs    r5, r0                  @ ... together with the whole low word
    lsrs    r0, r1, #1              @ r0 = high word shifted down one place
    subs    r2, #1                  @ account for that one-place shift
    b       .Lfix64_pack

@ convert int64 to float, rounding
regular_func int642float_shim
    movs    r2, #0                  @ no fractional bits, then fall through

@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
regular_func fix642float_shim
    push    {r4, r5, lr}
.Lfix64_signed:
    movs    r5, r0
    orrs    r5, r1
    beq     ret_pop45               @ input is zero: r0 is already 0, return +0
    asrs    r5, r1, #31             @ r5 = sign bits (0 or -1)
.Lfix64_normalise:
    asrs    r4, r1, #24             @ try shifting 7 bits at a time
    cmp     r4, r5
    bne     .Lfix64_normalised      @ top byte is not pure sign: next shift would overflow
    lsls    r1, #7                  @ shift r0:r1 up by 7 places
    lsrs    r4, r0, #25
    orrs    r1, r4
    lsls    r0, #7
    adds    r2, #7                  @ track the shift in the point position
    b       .Lfix64_normalise
.Lfix64_normalised:
    movs    r5, r0                  @ r5 = low word
    movs    r0, r1                  @ r0 = high word
.Lfix64_pack:
    negs    r2, r2
    adds    r2, #32+29

    // bl packx
    ldr     r1, =0x29ef             // packx
    blx     r1
ret_pop45:
    pop     {r4, r5, pc}
111
float_section fatan2_shim

@ fatan2_shim
@
@ Two float arguments arrive in r0 and r1. Each is run through the ROM routine
@ at 0x29c1 (unpackx). Going by the comments on the instructions below, the
@ working state after both calls is:
@   r0 = x mantissa, r2 = x exponent   (from the argument that arrived in r1)
@   r1 = y mantissa, r3 = y exponent   (from the argument that arrived in r0)
@ NOTE(review): the exact register contract of unpackx is not visible in this
@ file - confirm against the v1 bootrom listing.
@ The routine finishes by branching (not calling) to ROM address 0x2b19 with
@ the r4/r5/lr frame pushed on entry still on the stack.
regular_func fatan2_shim
    push    {r4, r5, lr}

    ldr     r4, =0x29c1             // unpackx
    mov     ip, r4
@ unpack arguments and shift one down to have common exponent
    blx     ip
    mov     r4, r0                  @ swap r0 <-> r1
    mov     r0, r1
    mov     r1, r4
    mov     r4, r2                  @ swap r2 <-> r3
    mov     r2, r3
    mov     r3, r4
    blx     ip
    lsls    r0, #5                  @ Q28
    lsls    r1, #5                  @ Q28
    adds    r4, r2, r3              @ this is -760 if both arguments are 0 and at least -380-126=-506 otherwise
    asrs    r4, #9
    adds    r4, #1
    bmi     .Latan2_y_negligible    @ force y to 0 proper, so result will be zero
    subs    r4, r2, r3              @ calculate shift
    bge     .Latan2_shift_y         @ ex>=ey?
    negs    r4, r4                  @ make shift positive
    asrs    r0, r4                  @ bring x down to the exponent of y
    cmp     r4, #28
    blo     .Latan2_cordic
    asrs    r0, #31                 @ shift of 28 or more: leave only the sign of x
    b       .Latan2_cordic
.Latan2_shift_y:
    asrs    r1, r4                  @ bring y down to the exponent of x
    cmp     r4, #28
    blo     .Latan2_cordic
.Latan2_y_negligible:
@ here |x|>>|y| or both x and y are +/-0
    cmp     r0, #0
    bge     .Latan2_signed_zero     @ x positive, return signed 0
    ldr     r3, =0x2cfc             @ &pi_q29, circular coefficients
    ldr     r0, [r3]                @ x negative, return +/- pi
    asrs    r1, #31                 @ r1 = sign mask of y
    eors    r0, r1
    b       .Latan2_pack
.Latan2_signed_zero:
    asrs    r0, r1, #31             @ r0 = sign mask of y
    b       .Latan2_pack
.Latan2_cordic:
    movs    r2, #0                  @ initial angle
    ldr     r3, =0x2cfc             @ &pi_q29, circular coefficients
    cmp     r0, #0                  @ x negative
    bge     .Latan2_vector
    negs    r0, r0                  @ rotate to 1st/4th quadrants
    negs    r1, r1
    ldr     r2, [r3]                @ pi Q29
.Latan2_vector:
    movs    r4, #1                  @ m=1
    ldr     r5, =0x2b97             @ cordic_vec
    blx     r5                      @ also produces magnitude (with scaling factor 1.646760119), which is discarded
    mov     r0, r2                  @ result here is -pi/2..3pi/2 Q29
@ asrs r2,#29
@ subs r0,r2
    ldr     r3, =0x2cfc             @ &pi_q29, circular coefficients
    ldr     r2, [r3]                @ pi Q29
    adds    r4, r0, r2              @ attempt to fix -3pi/2..-pi case
    bcs     .Latan2_fiddle          @ -pi/2..0? leave result as is
    subs    r4, r0, r2              @ <pi? leave as is
    bmi     .Latan2_fiddle
    subs    r0, r4, r2              @ >pi: take off 2pi
.Latan2_fiddle:
    subs    r0, #1                  @ fiddle factor so atan2(0,1)==0
.Latan2_pack:
    movs    r2, #0                  @ exponent for pack
    @ NOTE(review): 0x2b19 is not named in this file; since the entry frame is
    @ still pushed, it presumably packs the result and pops r4, r5, pc - confirm.
    ldr     r3, =0x2b19
    bx      r3
185
float_section float232_shims

// float2int_shim  : float in r0 -> integer; zeroes r1 and falls into float2fix_shim.
// float2fix_shim  : float in r0, with r1 passed through unchanged to the ROM routine.
//
// The shim returns 0 itself for inputs whose sign bit is set and whose exponent
// field is zero (-0 and negative denormals). Every other input is handed to the
// ROM routine at 0x2acd via a tail branch, so that routine returns straight to
// the caller.
regular_func float2int_shim
    movs    r1, #0                  // fall through
regular_func float2fix_shim
    // check for -0 or -denormal upfront
    asrs    r2, r0, #23             // r2 = sign-extended (sign:exponent); 0xffffff00 for the special case
    adds    r2, #128
    adds    r2, #128                // +256 in total: zero only when sign = 1 and exponent = 0
    beq     .Lfloat2fix_return_zero
    // call original
    ldr     r2, =0x2acd
    bx      r2
.Lfloat2fix_return_zero:
    movs    r0, #0
    bx      lr
202
float_section float264_shims

@ float2int64_shim / float2fix64_shim
@   Signed conversion of the float in r0 to a 64-bit result in r0:r1.
@   The int entry zeroes r1 and falls into the fix entry. The fix entry saves
@   lr, calls f2fix, then branches to d2f64_a, which pops the return address.
regular_func float2int64_shim
    movs    r1, #0                  @ and fall through
regular_func float2fix64_shim
    push    {lr}
    bl      f2fix
    b       d2f64_a

@ float2uint64_shim / float2ufix64_shim
@   Unsigned variants. Inputs with the sign bit set return 0:0 via ret_dzero;
@   everything else falls straight into f2fix, which returns to the caller.
regular_func float2uint64_shim
    movs    r1, #0                  @ and fall through
regular_func float2ufix64_shim
    asrs    r3, r0, #23             @ negative? return 0
    bmi     ret_dzero
@ and fall through

@ convert float in r0 to signed fixed point in r0:r1:r3, r1 places after point, rounding towards -Inf
@ result clamped so that r3 can only be 0 or -1
@ trashes r12
.thumb_func
f2fix:
    push    {r4, lr}
    mov     ip, r1                  @ keep the requested point position for d2fix_a
    asrs    r3, r0, #31             @ r3 = sign mask (0 or -1)
    lsls    r0, #1                  @ drop the sign bit
    lsrs    r2, r0, #24             @ r2 = exponent field
    beq     .Lf2fix_zero            @ exponent field 0: zero (denormals take this path too)
    cmp     r2, #0xff               @ exponent field all ones: Inf (NaN takes this path too)
    beq     .Lf2fix_inf
    subs    r1, r2, #1
    subs    r2, #0x7f               @ remove exponent bias
    lsls    r1, #24
    subs    r0, r1                  @ insert implied 1
    eors    r0, r3
    subs    r0, r3                  @ top two-s complement
    asrs    r1, r0, #4              @ convert to double format
    lsls    r0, #28
    ldr     r4, =d2fix_a
    bx      r4                      @ tail branch; d2fix_a pops r4 and the return address
.Lf2fix_zero:
    movs    r0, #0
    movs    r1, r0
    movs    r3, r0
    pop     {r4, pc}
.Lf2fix_inf:
    mvns    r0, r3                  @ return max/min value
    mvns    r1, r3
    pop     {r4, pc}

ret_dzero:
    movs    r0, #0
    movs    r1, #0
    bx      lr
256
float_section d2fix_a_float

.weak d2fix_a // weak because it exists in float shims too
.thumb_func
d2fix_a:
@ Entry state (f2fix in this file arrives here with exactly this layout):
@   r0:r1   two-s complement mantissa
@   r2      unbiased exponent
@   r3      mantissa sign extension bits
@   ip      offset for the required binary point position
@   stack   r4 and a return address, popped by every exit path below
    add     r2, ip                  @ exponent plus offset for required binary point position
    subs    r2, #52                 @ required shift
    bmi     .Ld2fix_shift_down      @ shift down?
@ here a shift up by r2 places
    cmp     r2, #12                 @ will clamp?
    bge     .Ld2fix_clamp
    movs    r4, r0                  @ keep the low word for the carry-over bits
    lsls    r1, r2
    lsls    r0, r2
    negs    r2, r2
    adds    r2, #32                 @ complementary shift
    lsrs    r4, r2                  @ bits that move from the low word into the high word
    orrs    r1, r4
    pop     {r4, pc}
.Ld2fix_clamp:
    mvns    r0, r3
    mvns    r1, r3                  @ overflow: clamp to extreme fixed-point values
    pop     {r4, pc}
.Ld2fix_shift_down:
@ here a shift down by -r2 places
    adds    r2, #32
    bmi     .Ld2fix_long_shift      @ long shift?
    mov     r4, r1                  @ keep the high word for the carry-over bits
    lsls    r4, r2
    negs    r2, r2
    adds    r2, #32                 @ complementary shift
    asrs    r1, r2
    lsrs    r0, r2
    orrs    r0, r4
    pop     {r4, pc}
.Ld2fix_long_shift:
@ here a long shift down
    movs    r0, r1
    asrs    r1, #31                 @ shift down 32 places
    adds    r2, #32
    bmi     .Ld2fix_tiny            @ very long shift?
    negs    r2, r2
    adds    r2, #32
    asrs    r0, r2
    pop     {r4, pc}
.Ld2fix_tiny:
    movs    r0, r3                  @ result very near zero: use sign extension bits
    movs    r1, r3
    pop     {r4, pc}

@ d2f64_a: final range check for the signed 64-bit conversions.
@ Expects the result in r0:r1, the sign extension bits in r3 and a return
@ address on the stack (float2fix64_shim pushes lr before branching here).
d2f64_a:
    asrs    r2, r1, #31             @ r2 = sign of the 64-bit result
    cmp     r2, r3
    bne     .Ld2f64_clamp           @ sign extension bits fail to match sign of result?
    pop     {pc}
.Ld2f64_clamp:
    mvns    r0, r3
    movs    r1, #1
    lsls    r1, #31                 @ r1 = 0x80000000
    eors    r1, r0                  @ generate extreme fixed-point values
    pop     {pc}
321
float_section float2double_shim

@ float2double_shim
@
@ In:   r0    = single-precision bit pattern
@ Out:  r0:r1 = double-precision bit pattern (r1 is the high word)
@ An exponent field of 0 yields a signed zero (so denormal inputs are flushed
@ to zero), and an exponent field of 0xff yields a signed infinity (the
@ mantissa is discarded, so NaN inputs also come back as infinity).
@ Leaf routine: uses r0-r3 only and returns with bx lr.
regular_func float2double_shim
    lsrs    r3, r0, #31             @ sign bit
    lsls    r3, #31                 @ ... moved back to bit 31, all other bits clear
    lsls    r1, r0, #1              @ r1 = input with the sign shifted out
    lsrs    r2, r1, #24             @ exponent
    beq     .Lf2d_zero              @ zero?
    cmp     r2, #0xff               @ Inf?
    beq     .Lf2d_inf
    lsrs    r1, #4                  @ exponent and top 20 bits of mantissa
    ldr     r2, =(0x3ff-0x7f)<<20   @ difference in exponent offsets
    adds    r1, r2
    orrs    r1, r3                  @ put the sign back
    lsls    r0, #29                 @ bottom 3 bits of mantissa
    bx      lr
.Lf2d_zero:
    movs    r1, r3                  @ return signed zero
.Lf2d_clear_low:
    movs    r0, #0
    bx      lr
.Lf2d_inf:
    ldr     r1, =0x7ff00000         @ return signed infinity
    adds    r1, r3
    b       .Lf2d_clear_low
346
347#endif