VisionServer  v2.1.1-1-g21dc5465
FRC vision library
neon64.S
Go to the documentation of this file.
// Exported entry points. Every routine below is a leaf function that works on
// byte (u8) frames with AdvSIMD/NEON registers.
    .text

    .global memcpy_threshold_asm
    .global memcpy_threshold_binary_asm
    .global memcpy_compare3_add_asm     // a bit broken?
    .global memcpy_wst_asm
    .global memcpy_deinterlace_wstb_asm
    .global memcpy_deinterlace_togray_asm
    .global memcpy_bitwise_or_asm
    .global memcpy_bitwise_or_3c_asm
    .global memcpy_add_asm
    .global memcpy_add_3c_asm
    .global memcpy_add_3c2_asm
    .global memcpy_subtract_asm
13
14
15/**
16 * --> https://arm-software.github.io/acle/neon_intrinsics/advsimd.html
17 * --> https://developer.arm.com/documentation/102159/0400/Load-and-store---example-RGB-conversion
18 * --> https://developer.arm.com/documentation/den0024/a
19 */
20
/* For the AArch64 procedure call standard (AAPCS64), the first 8 integer/pointer args are passed in x0-x7.
 * These are caller-saved (volatile) registers, so no stack frame push is necessary unless more than 8 args are needed.
 * To push the fp and lr (x29, x30), use (change the alloc amount based on how many extra args there are):
    stp x29, x30, [sp, #-16]!
    mov x29, sp
 * And to restore, use (before ret):
    ldp x29, x30, [sp], #16
 * This is just a reminder in case this is ever needed, currently none of the functions have more than 8 args
 */
29
30
/* memcpy_threshold_asm
 * Copies a single-channel frame, keeping every byte that is strictly greater
 * (unsigned) than the threshold and writing 0 for every other byte.
 *
 * Arg 0 ~ x0: Source address (single channel frame)
 * Arg 1 ~ x1: Destination address
 * Arg 2 ~ w2: Pixel count (read as 32 bits). Processed in blocks of 16 bytes;
 *             a remainder of (count % 16) bytes is not processed.
 * Arg 3 ~ w3: Threshold value (only the low byte is used)
 *
 * Writes: x0, x1, x2, v0-v3, condition flags */
memcpy_threshold_asm:

    lsr     w2, w2, #4                  // iterations = count / 16 (one q register per pass)
    cbz     x2, _thresh_end             // no full block: skip the loop so the counter cannot wrap below 0
    dup     v3.16b, w3                  // broadcast the threshold into all 16 byte lanes
_thresh_loop:
    ldr     q0, [x0], #16               // load 16 source bytes, advance source
    cmhi    v1.16b, v0.16b, v3.16b      // lane mask: 0xFF where source > threshold, else 0x00
    and     v2.16b, v1.16b, v0.16b      // keep the source byte where the mask is set
    str     q2, [x1], #16               // store 16 result bytes, advance destination

    subs    x2, x2, #1
    b.ne    _thresh_loop

_thresh_end:
    ret
// END
50
51
/* memcpy_threshold_binary_asm
 * Writes a binary mask of a single-channel frame: 0xFF for every source byte
 * strictly greater (unsigned) than the threshold, 0x00 otherwise.
 *
 * Arg 0 ~ x0: Source address (single channel frame)
 * Arg 1 ~ x1: Destination address
 * Arg 2 ~ w2: Pixel count (read as 32 bits). Processed in blocks of 16 bytes;
 *             a remainder of (count % 16) bytes is not processed.
 * Arg 3 ~ w3: Threshold value (only the low byte is used)
 *
 * Writes: x0, x1, x2, v0, v1, v3, condition flags */
memcpy_threshold_binary_asm:

    lsr     w2, w2, #4                  // iterations = count / 16
    cbz     x2, _thresh_bin_end         // no full block: skip the loop so the counter cannot wrap below 0
    dup     v3.16b, w3                  // broadcast to all 16 lanes (an .8b dup would zero the upper 8 lanes)
_thresh_bin_loop:
    ldr     q0, [x0], #16               // read from the source (x0)
    cmhi    v1.16b, v0.16b, v3.16b      // lane mask: 0xFF where source > threshold, else 0x00
    str     q1, [x1], #16               // write the mask to the destination (x1)

    subs    x2, x2, #1
    b.ne    _thresh_bin_loop

_thresh_bin_end:
    ret
// END
70
71
/* memcpy_compare3_add_asm
 * Per byte: where the primary channel is greater than both compare channels,
 * a scaled copy of the primary is added (saturating) to the addition frame;
 * the exact scaling is the shift sequence in the loop body.
 *
 * Arg 0 ~ x0: Primary channel address
 * Arg 1 ~ x1: Compare channel 1 address
 * Arg 2 ~ x2: Compare channel 2 address
 * Arg 3 ~ x3: Addition address
 * Arg 4 ~ x4: Destination Address
 * Arg 5 ~ w5: Count (read as 32 bits). Processed in blocks of 16 bytes;
 *             a remainder of (count % 16) bytes is not processed.
 *
 * Writes: x0-x5, v0-v3, condition flags */
memcpy_compare3_add_asm:

    lsr     w5, w5, #4                  // iterations = count / 16 (16 bytes = 128 bits)
    cbz     x5, _cmp3_add_end           // no full block: skip the loop so the counter cannot wrap below 0
_cmp3_add_loop:
    ldr     q0, [x0], #16               // load primary compare
    ldr     q1, [x1], #16               // load compare 1
    ldr     q2, [x2], #16               // load compare 2
    ldr     q3, [x3], #16               // load addition

    cmhi    v1.16b, v0.16b, v1.16b      // mask: primary > channel 1
    cmhi    v2.16b, v0.16b, v2.16b      // mask: primary > channel 2
    and     v1.16b, v1.16b, v2.16b      // mask: primary > both
    and     v2.16b, v0.16b, v1.16b      // primary where the mask is set, else 0
    // NOTE(review): the original comment said "div 32", but a shift by 6 divides
    // by 64 (result 0..3) -- confirm which one was intended.
    ushr    v1.16b, v2.16b, #6          // per-lane shift amount = masked primary >> 6

    neg     v1.16b, v1.16b              // negate so the register shift below shifts right
    uqshl   v2.16b, v0.16b, v1.16b      // primary >> shift amount (left shift by a negative count)
    uqshl   v2.16b, v2.16b, #4          // saturating left shift by 4
    uqadd   v3.16b, v2.16b, v3.16b      // saturating add onto the addition frame

    str     q3, [x4], #16               // store the result from Q3

    subs    x5, x5, #1
    b.ne    _cmp3_add_loop

_cmp3_add_end:
    ret
// END
105
106
/* memcpy_wst_asm
 * Weighted subtract, per byte and saturating:
 *   dest = primary - ((a >> clz8(alpha)) + (b >> clz8(beta)) + gamma)
 * where a / b are the bytes from the two compare channels and clz8 is the
 * leading-zero count of the 8-bit weight (so each weight acts as a power-of-two
 * right shift: 128 -> no shift, 64 -> /2, 32 -> /4, ...).
 *
 * Arg 0 ~ x0: Primary channel address
 * Arg 1 ~ x1: Compare channel address weighted by alpha
 * Arg 2 ~ x2: Compare channel address weighted by beta
 * Arg 3 ~ x3: Destination address
 * Arg 4 ~ x4: Count. Processed in blocks of 16 bytes; a remainder of
 *             (count % 16) bytes is not processed.
 * Arg 5 ~ w5: Alpha (only the low byte is used)
 * Arg 6 ~ w6: Beta  (only the low byte is used)
 * Arg 7 ~ w7: Gamma (only the low byte is used)
 *
 * Writes: x0-x4, v0-v6, condition flags */
memcpy_wst_asm:

    lsr     x4, x4, #4                  // iterations = count / 16 (16 bytes = 128 bits)
    cbz     x4, _wst_end                // no full block: skip the loop so the counter cannot wrap below 0

    dup     v4.16b, w5                  // duplicate alpha across 16x8bit lanes
    clz     v4.16b, v4.16b              // count the leading 0's of alpha
    neg     v4.16b, v4.16b              // negate the count (ushl by a negative amount shifts right)

    dup     v5.16b, w6                  // same for beta
    clz     v5.16b, v5.16b
    neg     v5.16b, v5.16b

    dup     v6.16b, w7                  // duplicate gamma across 16x8bit lanes

_wst_loop:
    ldr     q0, [x0], #16
    ldr     q1, [x1], #16
    ldr     q2, [x2], #16

    // Q0 --> primary, Q1 --> alpha channel, Q2 --> beta channel
    ushl    v1.16b, v1.16b, v4.16b      // weight alpha channel (right shift by clz(alpha))
    ushl    v2.16b, v2.16b, v5.16b      // weight beta channel
    uqadd   v3.16b, v1.16b, v2.16b      // add weights
    uqadd   v3.16b, v3.16b, v6.16b      // add gamma
    uqsub   v3.16b, v0.16b, v3.16b      // subtract result from primary

    str     q3, [x3], #16               // store to the destination (x3), not back into the x1 input

    subs    x4, x4, #1
    b.ne    _wst_loop

_wst_end:
    ret
// END
148
149
/* memcpy_deinterlace_wstb_asm
 * Deinterlaces a 3-channel frame, does a weighted subtract of the two
 * non-primary channels from the primary one, and writes a binary mask:
 *   d    = primary - ((alpha_ch >> clz8(alpha)) + (beta_ch >> clz8(beta)))   (saturating)
 *   dest = (d > threshold) ? 0xFF : 0x00
 * The alpha/beta channels are the ones following the primary, wrapping around:
 *   offset 0: primary=c0 alpha=c1 beta=c2
 *   offset 1: primary=c1 alpha=c2 beta=c0
 *   offset 2: primary=c2 alpha=c0 beta=c1
 *
 * Arg 0 ~ x0: Source address (3-channel continuous buffer required)
 * Arg 1 ~ x1: Destination address
 * Arg 2 ~ x2: Count (frame size) --> NOT(size * #channels). Processed in blocks
 *             of 16 pixels; a remainder of (count % 16) pixels is not processed.
 * Arg 3 ~ x3: C1/Primary channel offset(index) --> ex. 0 for first; alpha/beta follow.
 *             Compared signed against 1: < 1 selects channel 0, > 1 selects channel 2.
 * Arg 4 ~ w4: Alpha (only the low byte is used)
 * Arg 5 ~ w5: Beta (only the low byte is used)
 * Arg 6 ~ w6: Threshold (only the low byte is used)
 *
 * Writes: x0-x2, v0-v6, condition flags */
memcpy_deinterlace_wstb_asm:

    lsr     x2, x2, #4                  // iterations = count / 16
    cbz     x2, _split_wstb_end         // no full block: skip the loops so the counter cannot wrap below 0

    dup     v4.16b, w4                  // duplicate alpha across 16x8bit lanes
    clz     v4.16b, v4.16b              // count the leading 0's of alpha
    neg     v4.16b, v4.16b              // negate the count (ushl by a negative amount shifts right)

    dup     v5.16b, w5                  // same for beta
    clz     v5.16b, v5.16b
    neg     v5.16b, v5.16b

    dup     v6.16b, w6                  // duplicate threshold across 16x8bit lanes

    cmp     x3, #1                      // pick the loop for the primary channel index
    b.eq    _c1_split_wstb_loop
    b.gt    _c2_split_wstb_loop
                                        // x3 < 1: fall through to the channel-0 loop

_c0_split_wstb_loop:
    ld3     { v0.16b, v1.16b, v2.16b }, [x0], #48  // deinterlace 48 bytes into 3x16byte V registers

    // Q0 --> primary, Q1 --> alpha, Q2 --> beta
    ushl    v1.16b, v1.16b, v4.16b      // weight alpha (right shift by clz(alpha))
    ushl    v2.16b, v2.16b, v5.16b      // weight beta
    uqadd   v3.16b, v1.16b, v2.16b      // add weights
    uqsub   v3.16b, v0.16b, v3.16b      // subtract result from primary
    cmhi    v3.16b, v3.16b, v6.16b      // threshold result

    str     q3, [x1], #16               // store the mask from Q3

    subs    x2, x2, #1
    b.ne    _c0_split_wstb_loop
    b       _split_wstb_end

_c1_split_wstb_loop:
    ld3     { v0.16b, v1.16b, v2.16b }, [x0], #48

    // Q0 --> beta, Q1 --> primary, Q2 --> alpha
    ushl    v2.16b, v2.16b, v4.16b      // weight alpha
    ushl    v0.16b, v0.16b, v5.16b      // weight beta
    uqadd   v3.16b, v2.16b, v0.16b      // add weights
    uqsub   v3.16b, v1.16b, v3.16b      // subtract result from primary
    cmhi    v3.16b, v3.16b, v6.16b      // threshold result

    str     q3, [x1], #16               // store the mask from Q3

    subs    x2, x2, #1
    b.ne    _c1_split_wstb_loop
    b       _split_wstb_end

_c2_split_wstb_loop:
    ld3     { v0.16b, v1.16b, v2.16b }, [x0], #48

    // Q0 --> alpha, Q1 --> beta, Q2 --> primary
    ushl    v0.16b, v0.16b, v4.16b      // weight alpha
    ushl    v1.16b, v1.16b, v5.16b      // weight beta
    uqadd   v3.16b, v0.16b, v1.16b      // add weights
    uqsub   v3.16b, v2.16b, v3.16b      // subtract result from primary
    cmhi    v3.16b, v3.16b, v6.16b      // threshold result

    str     q3, [x1], #16               // store the mask from Q3

    subs    x2, x2, #1
    b.ne    _c2_split_wstb_loop

_split_wstb_end:
    ret
// END
227
228
/* memcpy_deinterlace_togray_asm
 * Converts a 3-channel interlaced frame to one channel:
 *   gray = c0/4 + c1/2 + c2/4      (each term truncated, sum saturating)
 * Full blocks of 16 pixels are handled with whole vectors; the remaining
 * (count % 16) pixels are handled one at a time.
 *
 * Arg 0 ~ x0: Source array address (3-channel interlaced buffer)
 * Arg 1 ~ x1: Destination array address
 * Arg 2 ~ x2: Pixel total -- size of destination, size of source / 3
 *
 * Writes: x0-x2, v0-v3, condition flags */
memcpy_deinterlace_togray_asm:

    cbz     x2, _togray_end             // nothing to convert (the tail loop would wrap the counter)
    cmp     x2, #16
    b.lo    _togray_single              // fewer than 16 pixels: tail loop only
_togray_loop:
    ld3     { v0.16b - v2.16b }, [x0], #48

    ushr    v0.16b, v0.16b, #2          // divide red/blue by 4
    ushr    v1.16b, v1.16b, #1          // divide green by 2
    ushr    v2.16b, v2.16b, #2          // divide blue/red by 4

    uqadd   v3.16b, v0.16b, v1.16b
    uqadd   v3.16b, v2.16b, v3.16b      // 1/4 + 1/2 + 1/4 = 1/1
    str     q3, [x1], #16

    subs    x2, x2, #16
    b.eq    _togray_end                 // exactly 0 pixels left --> end
    cmp     x2, #16
    b.hs    _togray_loop                // >= 16 --> keep going

_togray_single:
    ld3     { v0.b - v2.b }[0], [x0], #3   // one pixel into lane 0 of v0-v2 (other lanes are don't-care)

    ushr    v0.16b, v0.16b, #2          // divide red/blue by 4
    ushr    v1.16b, v1.16b, #1          // divide green by 2
    ushr    v2.16b, v2.16b, #2          // divide blue/red by 4

    uqadd   v3.16b, v0.16b, v1.16b
    uqadd   v3.16b, v2.16b, v3.16b      // 1/4 + 1/2 + 1/4 = 1/1
    st1     { v3.b }[0], [x1], #1       // store the gray value (v3), not the scaled third channel (v2)

    subs    x2, x2, #1
    b.ne    _togray_single

_togray_end:
    ret
// END
269
270
271
/* memcpy_bitwise_or_asm
 * dest = A | B, byte-wise.
 *
 * Arg 0 ~ x0: Source A
 * Arg 1 ~ x1: Source B
 * Arg 2 ~ x2: Destination
 * Arg 3 ~ x3: Count. Processed in blocks of 16 bytes; a remainder of
 *             (count % 16) bytes is not processed.
 *
 * Writes: x0-x3, v0-v2, condition flags */
memcpy_bitwise_or_asm:

    lsr     x3, x3, #4                  // iterations = count / 16 (16 bytes = 128 bits)
    cbz     x3, _bitwise_or_end         // no full block: skip the loop so the counter cannot wrap below 0
_bitwise_or_loop:
    ldr     q0, [x0], #16
    ldr     q1, [x1], #16
    orr     v2.16b, v0.16b, v1.16b
    str     q2, [x2], #16

    subs    x3, x3, #1
    b.ne    _bitwise_or_loop

_bitwise_or_end:
    ret
// END
290
/* memcpy_bitwise_or_3c_asm
 * ORs a single-channel frame B into each of the 3 channels of interlaced
 * frame A and writes the interlaced 3-channel result.
 *
 * Arg 0 ~ x0: Source A -- >> 3 channels <<
 * Arg 1 ~ x1: Source B (single channel)
 * Arg 2 ~ x2: Destination -- >> 3 channels <<
 * Arg 3 ~ x3: Pixel count. Processed in blocks of 16 pixels (48 bytes of A,
 *             16 bytes of B); a remainder of (count % 16) pixels is not processed.
 *
 * Writes: x0-x3, v0-v3, condition flags */
memcpy_bitwise_or_3c_asm:

    lsr     x3, x3, #4                  // iterations = pixel count / 16
    cbz     x3, _bitwise_or_3c_end      // no full block: skip the loop so the counter cannot wrap below 0
_bitwise_or_3c_loop:
    ld3     { v0.16b - v2.16b }, [x0], #48  // deinterlace load rgb channels in q0-q2
    ldr     q3, [x1], #16               // load binary frame normally in q3
    orr     v0.16b, v0.16b, v3.16b
    orr     v1.16b, v1.16b, v3.16b
    orr     v2.16b, v2.16b, v3.16b      // or each channel with binary frame
    st3     { v0.16b - v2.16b }, [x2], #48  // interlace store the channels back

    subs    x3, x3, #1
    b.ne    _bitwise_or_3c_loop         // loop on THIS function's body (not the single-channel loop)

_bitwise_or_3c_end:
    ret
// END
311
312
/* memcpy_add_asm
 * dest = A + B, byte-wise with unsigned saturation (clamps at 255).
 *
 * Arg 0 ~ x0: Source A
 * Arg 1 ~ x1: Source B
 * Arg 2 ~ x2: Destination
 * Arg 3 ~ x3: Pixel count. Processed in blocks of 16 bytes; a remainder of
 *             (count % 16) bytes is not processed.
 *
 * Writes: x0-x3, v0-v2, condition flags */
memcpy_add_asm:

    lsr     x3, x3, #4                  // get iterations by dividing size by 16
    cbz     x3, _add_end                // no full block: skip the loop so the counter cannot wrap below 0
_add_loop:
    ldr     q0, [x0], #16
    ldr     q1, [x1], #16
    uqadd   v2.16b, v0.16b, v1.16b      // saturating unsigned add
    str     q2, [x2], #16

    subs    x3, x3, #1
    b.ne    _add_loop

_add_end:
    ret
// END
331
/* memcpy_add_3c_asm
 * Adds a single-channel frame B (saturating) to each of the 3 channels of
 * interlaced frame A and writes the interlaced 3-channel result.
 *
 * Arg 0 ~ x0: Source A -- >> 3 channels <<
 * Arg 1 ~ x1: Source B (single channel)
 * Arg 2 ~ x2: Destination -- >> 3 channels <<
 * Arg 3 ~ x3: Pixel count. Processed in blocks of 16 pixels (48 bytes of A,
 *             16 bytes of B); a remainder of (count % 16) pixels is not processed.
 *
 * Writes: x0-x3, v0-v3, condition flags */
memcpy_add_3c_asm:

    lsr     x3, x3, #4                  // iterations = pixel count / 16
    cbz     x3, _add_3c_end             // no full block: skip the loop so the counter cannot wrap below 0
_add_3c_loop:
    ld3     { v0.16b - v2.16b }, [x0], #48  // deinterlace load the 3 channels into q0-q2
    ldr     q3, [x1], #16               // load 16 bytes of the single-channel frame
    uqadd   v0.16b, v0.16b, v3.16b
    uqadd   v1.16b, v1.16b, v3.16b
    uqadd   v2.16b, v2.16b, v3.16b      // saturating add onto each channel
    st3     { v0.16b - v2.16b }, [x2], #48  // interlace store the channels back

    subs    x3, x3, #1
    b.ne    _add_3c_loop

_add_3c_end:
    ret
// END
352
/* memcpy_add_3c2_asm
 * dest = A + B channel-by-channel for two interlaced 3-channel frames,
 * byte-wise with unsigned saturation.
 *
 * Arg 0 ~ x0: Source A -- >> 3 channels <<
 * Arg 1 ~ x1: Source B -- >> 3 channels <<
 * Arg 2 ~ x2: Destination -- >> 3 channels <<
 * Arg 3 ~ x3: Pixel count. Processed in blocks of 16 pixels (48 bytes per
 *             frame); a remainder of (count % 16) pixels is not processed.
 *
 * Writes: x0-x3, v0-v5, condition flags */
memcpy_add_3c2_asm:

    lsr     x3, x3, #4                  // iterations = pixel count / 16
    cbz     x3, _add_3c2_end            // no full block: skip the loop so the counter cannot wrap below 0
_add_3c2_loop:
    ld3     { v0.16b - v2.16b }, [x0], #48  // deinterlace load frame A into q0-q2
    ld3     { v3.16b - v5.16b }, [x1], #48  // deinterlace load frame B into q3-q5
    uqadd   v0.16b, v0.16b, v3.16b
    uqadd   v1.16b, v1.16b, v4.16b
    uqadd   v2.16b, v2.16b, v5.16b      // saturating add, channel by channel
    st3     { v0.16b - v2.16b }, [x2], #48  // interlace store the channels back

    subs    x3, x3, #1
    b.ne    _add_3c2_loop               // loop on THIS function's body (not memcpy_add_3c_asm's loop)

_add_3c2_end:
    ret
// END
373
374
/* memcpy_subtract_asm
 * dest = A - B, byte-wise with unsigned saturation (clamps at 0).
 *
 * Arg 0 ~ x0: Address of base array that is being subtracted from
 * Arg 1 ~ x1: Address of second array that is being subtracted
 * Arg 2 ~ x2: Address of destination array
 * Arg 3 ~ x3: Size of arrays (width * height). Processed in blocks of 16
 *             bytes; a remainder of (size % 16) bytes is not processed.
 *
 * Writes: x0-x3, v0-v2, condition flags */
memcpy_subtract_asm:

    lsr     x3, x3, #4                  // get iterations by dividing size by 16
    cbz     x3, _subtract_end           // no full block: skip the loop so the counter cannot wrap below 0
_subtract_loop:
    ldr     q0, [x0], #16
    ldr     q1, [x1], #16
    uqsub   v2.16b, v0.16b, v1.16b      // saturating unsigned subtract
    str     q2, [x2], #16

    subs    x3, x3, #1
    b.ne    _subtract_loop

_subtract_end:
    ret
// END