VisionServer  v2.1.1-1-g21dc5465
FRC vision library
neon32.S
Go to the documentation of this file.
// Symbols made visible to the linker by this file.

// Threshold copies
.global memcpy_threshold_asm
.global memcpy_threshold_binary_asm

// Multi-channel compare / weighted-subtract copies
.global memcpy_compare3_add_asm     // a bit broken
.global memcpy_wst_asm
.global memcpy_split_wst_asm

// Element-wise combination of two buffers
.global memcpy_bitwise_or_asm
.global memcpy_subtract_asm
9
10
11
// memcpy_threshold_asm
// Copies bytes from the source to the destination, replacing every byte that
// is not strictly greater (unsigned) than the threshold with 0.
//
// Arg 0: r0: Destination Address
// Arg 1: r1: Source Address
// Arg 2: r2: Count in bytes. Only whole 16-byte blocks (count / 16) are
//            processed; a trailing remainder is neither read nor written.
// Arg 3: r3: Threshold Minimum (replicated as a byte into every lane)
//
// Clobbers: r0-r2, q0-q3, flags.
// Leaf routine: it touches no stack and no callee-saved register, so no
// stack frame is set up.
memcpy_threshold_asm:
    lsrs    r2, r2, #4              // r2 = number of 16-byte blocks (16 bytes = 128 bits)
    beq     .Lthresh_done           // fewer than 16 bytes: do not touch either buffer
    vdup.8  q3, r3                  // q3 = threshold in all 16 lanes

.Lthresh_loop:
    vld1.64 {d0, d1}, [r1]!         // q0 = next 16 source bytes
    vcgt.u8 q1, q0, q3              // lane = 0xFF where source > threshold, else 0x00
    vand    q2, q1, q0              // keep the passing bytes, zero the others
    vst1.64 {d4, d5}, [r0]!         // write 16 result bytes

    subs    r2, r2, #1              // one block done
    bne     .Lthresh_loop

.Lthresh_done:
    bx      lr
41
42
// memcpy_threshold_binary_asm
// Writes a binary mask of the source to the destination: 0xFF for every byte
// strictly greater (unsigned) than the threshold, 0x00 for every other byte.
//
// Arg 0: r0: Destination Address
// Arg 1: r1: Source Address
// Arg 2: r2: Count in bytes. Only whole 16-byte blocks (count / 16) are
//            processed; a trailing remainder is neither read nor written.
// Arg 3: r3: Threshold Minimum (replicated as a byte into every lane)
//
// Clobbers: r0-r2, q0, q1, q3, flags.
// Leaf routine: it touches no stack and no callee-saved register, so no
// stack frame is set up.
memcpy_threshold_binary_asm:
    lsrs    r2, r2, #4              // r2 = number of 16-byte blocks (16 bytes = 128 bits)
    beq     .Lthresh_bin_done       // fewer than 16 bytes: do not touch either buffer
    vdup.8  q3, r3                  // q3 = threshold in all 16 lanes

.Lthresh_bin_loop:
    vld1.64 {d0, d1}, [r1]!         // q0 = next 16 source bytes
    vcgt.u8 q1, q0, q3              // lane = 0xFF where source > threshold, else 0x00
    vst1.64 {d2, d3}, [r0]!         // the compare mask itself is the output

    subs    r2, r2, #1              // one block done
    bne     .Lthresh_bin_loop

.Lthresh_bin_done:
    bx      lr
71
72
// memcpy_compare3_add_asm   (flagged "a bit broken" where it is exported)
// Per byte, with all comparisons unsigned:
//   masked  = primary if (primary > compare1 and primary > compare2) else 0
//   shift   = masked >> 6                      (0..3)
//   term    = ((primary >> shift) << 4) & 0xFF
//   dest    = saturating_add(addition, term)
// NOTE(review): the compare result only selects the shift amount, so a byte
// that fails the compare still contributes (primary << 4). That is probably
// the "a bit broken" behavior; the arithmetic is left exactly as it was.
//
// Arg 0: r0: Primary channel address
// Arg 1: r1: Compare channel 1 address
// Arg 2: r2: Compare channel 2 address
// Arg 3: r3: Addition address
// Arg 4: stack [sp, #0] at entry: Destination Address (held in r4)
// Arg 5: stack [sp, #4] at entry: Count in bytes (held in r5). Only whole
//        16-byte blocks (count / 16) are processed.
//
// Clobbers: r0-r3, q0-q3, flags. r4 and r5 are saved and restored.
memcpy_compare3_add_asm:
    push    {r4, r5}                // 8 bytes pushed: stack args now start at sp + 8
    ldr     r4, [sp, #8]            // r4 = destination
    ldr     r5, [sp, #12]           // r5 = count

    lsrs    r5, r5, #4              // r5 = number of 16-byte blocks (16 bytes = 128 bits)
    beq     .Lcmp3_add_done         // fewer than 16 bytes: do not touch any buffer

.Lcmp3_add_loop:
    vld1.64 {d0, d1}, [r0]!         // q0 = primary
    vld1.64 {d2, d3}, [r1]!         // q1 = compare 1
    vld1.64 {d4, d5}, [r2]!         // q2 = compare 2
    vld1.64 {d6, d7}, [r3]!         // q3 = addition

    vcgt.u8 q1, q0, q1              // 0xFF where primary > compare 1
    vcgt.u8 q2, q0, q2              // 0xFF where primary > compare 2
    vand    q1, q1, q2              // both compares must pass
    vand    q2, q0, q1              // masked primary
    vshr.u8 q1, q2, #6              // masked primary / 64 -> shift amount 0..3

    vneg.s8 q1, q1                  // negative shift count = shift right
    vshl.s8 q2, q0, q1              // primary >> shift (the sign fill is discarded by the << 4 below)
    vshl.i8 q2, q2, #4              // move the low nibble into the high nibble
    vqadd.u8 q3, q2, q3             // saturating add onto the addition channel

    vst1.64 {d6, d7}, [r4]!         // write 16 result bytes

    subs    r5, r5, #1              // one block done
    bne     .Lcmp3_add_loop

.Lcmp3_add_done:
    pop     {r4, r5}
    bx      lr
123
124
// memcpy_wst_asm
// Weighted subtract, per byte, with unsigned saturation at every step:
//   dest = primary - ((channel2 >> clz8(alpha)) + (channel3 >> clz8(beta)) + gamma)
// where clz8(x) is the number of leading zero bits in the byte x. Each weight
// therefore acts as a power-of-two divisor: 128..255 -> /1, 64..127 -> /2, ...,
// and 0 shifts the channel out completely.
//
// Arg 0: r0: Primary channel address
// Arg 1: r1: Compare channel 2 address
// Arg 2: r2: Compare channel 3 address
// Arg 3: r3: Destination address
// Arg 4: stack [sp, #0]:  Count in bytes. Only whole 16-byte blocks
//        (count / 16) are processed.
// Arg 5: stack [sp, #4]:  Alpha
// Arg 6: stack [sp, #8]:  Beta
// Arg 7: stack [sp, #12]: Gamma
//
// Clobbers: r0-r3, r12, q0-q3, q8-q10, flags.
// The stack arguments are read one at a time through r12 (ip), so nothing is
// pushed and no callee-saved register is used.
memcpy_wst_asm:
    ldr     r12, [sp, #4]           // alpha
    vdup.8  q8, r12                 // alpha in all 16 lanes
    vclz.i8 q8, q8                  // leading zeros of alpha = right-shift distance
    vneg.s8 q8, q8                  // negative count makes vshl shift right

    ldr     r12, [sp, #8]           // beta, same treatment
    vdup.8  q9, r12
    vclz.i8 q9, q9
    vneg.s8 q9, q9

    ldr     r12, [sp, #12]          // gamma
    vdup.8  q10, r12                // gamma in all 16 lanes

    ldr     r12, [sp]               // count
    lsrs    r12, r12, #4            // r12 = number of 16-byte blocks (16 bytes = 128 bits)
    beq     .Lwst_done              // fewer than 16 bytes: do not touch any buffer

.Lwst_loop:
    vld1.64 {d0, d1}, [r0]!         // q0 = primary channel
    vld1.64 {d2, d3}, [r1]!         // q1 = channel 2
    vld1.64 {d4, d5}, [r2]!         // q2 = channel 3

    vshl.u8 q1, q1, q8              // weight channel 2 by alpha
    vshl.u8 q2, q2, q9              // weight channel 3 by beta
    vqadd.u8 q3, q1, q2             // add the weighted channels
    vqadd.u8 q3, q3, q10            // add gamma
    vqsub.u8 q3, q0, q3             // subtract the sum from primary (floors at 0)

    vst1.64 {d6, d7}, [r3]!         // write 16 result bytes

    subs    r12, r12, #1            // one block done
    bne     .Lwst_loop

.Lwst_done:
    bx      lr
182
183
// memcpy_split_wst_asm
// Weighted subtract on a 3-channel interleaved buffer. Each block de-interleaves
// 16 pixels into channel 0 (q0), channel 1 (q1) and channel 2 (q2), then
// computes per byte, with unsigned saturation at every step:
//   dest = primary - ((A >> clz8(alpha)) + (B >> clz8(beta)) + gamma)
// where clz8(x) is the number of leading zero bits in the byte x, and the
// channel roles rotate with the primary offset:
//   offset <= 0: primary = ch0, A = ch1, B = ch2
//   offset == 1: primary = ch1, A = ch2, B = ch0
//   offset >= 2: primary = ch2, A = ch0, B = ch1
//
// Arg 0: r0: Source address (3-channel continuous buffer required)
// Arg 1: r1: Destination address
// Arg 2: r2: Count (frame size) --> NOT (size * #channels). Only whole
//            16-pixel blocks (count / 16) are processed.
// Arg 3: r3: C1/Primary channel offset (index) --> e.g. 0 for first; alpha/beta follow
// Arg 4: stack [sp, #0]: Alpha
// Arg 5: stack [sp, #4]: Beta
// Arg 6: stack [sp, #8]: Gamma
//
// Clobbers: r0-r2, r12, q0-q3, q8-q10, flags.
// The stack arguments are read one at a time through r12 (ip), so nothing is
// pushed and no callee-saved register is used.
memcpy_split_wst_asm:
    ldr     r12, [sp]               // alpha
    vdup.8  q8, r12                 // alpha in all 16 lanes
    vclz.i8 q8, q8                  // leading zeros of alpha = right-shift distance
    vneg.s8 q8, q8                  // negative count makes vshl shift right

    ldr     r12, [sp, #4]           // beta, same treatment
    vdup.8  q9, r12
    vclz.i8 q9, q9
    vneg.s8 q9, q9

    ldr     r12, [sp, #8]           // gamma
    vdup.8  q10, r12                // gamma in all 16 lanes

    lsrs    r2, r2, #4              // r2 = number of 16-pixel blocks
    beq     .Lsplit_wst_done        // fewer than 16 pixels: do not touch either buffer

    cmp     r3, #1                  // pick the loop for the requested primary channel
    beq     .Lsplit_wst_c1
    bgt     .Lsplit_wst_c2
                                    // offset < 1 falls through to the channel-0 loop

.Lsplit_wst_c0:
    vld3.8  {d0, d2, d4}, [r0]!     // pixels 0-7:  ch0 -> d0, ch1 -> d2, ch2 -> d4
    vld3.8  {d1, d3, d5}, [r0]!     // pixels 8-15: ch0 -> d1, ch1 -> d3, ch2 -> d5

    // Q0 --> primary, Q1 --> alpha, Q2 --> beta
    vshl.u8 q1, q1, q8              // weight alpha
    vshl.u8 q2, q2, q9              // weight beta
    vqadd.u8 q3, q1, q2             // add weights
    vqadd.u8 q3, q3, q10            // add gamma
    vqsub.u8 q3, q0, q3             // subtract result from primary

    vst1.64 {d6, d7}, [r1]!         // write 16 result bytes

    subs    r2, r2, #1
    bne     .Lsplit_wst_c0
    bx      lr

.Lsplit_wst_c1:
    vld3.8  {d0, d2, d4}, [r0]!
    vld3.8  {d1, d3, d5}, [r0]!

    // Q0 --> beta, Q1 --> primary, Q2 --> alpha
    vshl.u8 q2, q2, q8              // weight alpha
    vshl.u8 q0, q0, q9              // weight beta
    vqadd.u8 q3, q2, q0             // add weights
    vqadd.u8 q3, q3, q10            // add gamma
    vqsub.u8 q3, q1, q3             // subtract result from primary

    vst1.64 {d6, d7}, [r1]!

    subs    r2, r2, #1
    bne     .Lsplit_wst_c1
    bx      lr

.Lsplit_wst_c2:
    vld3.8  {d0, d2, d4}, [r0]!
    vld3.8  {d1, d3, d5}, [r0]!

    // Q0 --> alpha, Q1 --> beta, Q2 --> primary
    vshl.u8 q0, q0, q8              // weight alpha
    vshl.u8 q1, q1, q9              // weight beta
    vqadd.u8 q3, q0, q1             // add weights
    vqadd.u8 q3, q3, q10            // add gamma
    vqsub.u8 q3, q2, q3             // subtract result from primary

    vst1.64 {d6, d7}, [r1]!

    subs    r2, r2, #1
    bne     .Lsplit_wst_c2

.Lsplit_wst_done:
    bx      lr
281
282
283
284
285
// memcpy_bitwise_or_asm
// Writes the bitwise OR of two source buffers to the destination.
//
// Arg 0: r0: Source A
// Arg 1: r1: Source B
// Arg 2: r2: Dest
// Arg 3: r3: Count in bytes. Only whole 16-byte blocks (count / 16) are
//            processed; a trailing remainder is neither read nor written.
//
// Clobbers: r0-r3, q0-q2, flags.
// Leaf routine: it touches no stack and no callee-saved register, so no
// stack frame is set up.
memcpy_bitwise_or_asm:
    lsrs    r3, r3, #4              // r3 = number of 16-byte blocks (16 bytes = 128 bits)
    beq     .Lbitwise_or_done       // fewer than 16 bytes: do not touch any buffer

.Lbitwise_or_loop:
    vld1.64 {d0, d1}, [r0]!         // q0 = 16 bytes of source A
    vld1.64 {d2, d3}, [r1]!         // q1 = 16 bytes of source B
    vorr    q2, q0, q1              // q2 = A | B
    vst1.64 {d4, d5}, [r2]!         // write 16 result bytes

    subs    r3, r3, #1              // one block done
    bne     .Lbitwise_or_loop

.Lbitwise_or_done:
    bx      lr
316
317
// memcpy_subtract_asm
// Writes (base - second) per byte to the destination, using an unsigned
// saturating subtract so results that would go negative become 0.
//
// Arg 0: r0: Address of base array that is being subtracted from
// Arg 1: r1: Address of second array that is being subtracted
// Arg 2: r2: Address of destination array
// Arg 3: r3: Size of arrays (width * height). Only whole 16-byte blocks
//            (size / 16) are processed; a trailing remainder is neither read
//            nor written.
//
// Clobbers: r0-r3, q0-q2, flags.
// Leaf routine: it touches no stack and no callee-saved register, so no
// stack frame is set up.
memcpy_subtract_asm:
    lsrs    r3, r3, #4              // r3 = iterations = size / 16
    beq     .Lsubtract_done         // fewer than 16 bytes: do not touch any buffer

.Lsubtract_loop:
    vld1.64 {d0, d1}, [r0]!         // q0 = 16 bytes of base
    vld1.64 {d2, d3}, [r1]!         // q1 = 16 bytes of subtractor
    vqsub.u8 q2, q0, q1             // q2 = base - subtractor, floored at 0
    vst1.64 {d4, d5}, [r2]!         // write 16 result bytes

    subs    r3, r3, #1              // one block done
    bne     .Lsubtract_loop

.Lsubtract_done:
    bx      lr