// Symbols exported from this file via .global so they can be linked against from other objects.
1.global memcpy_threshold_asm
2.global memcpy_threshold_binary_asm
3.global memcpy_compare3_add_asm // NOTE(review): flagged by the original author as possibly broken -- verify output before relying on it
5.global memcpy_deinterlace_wstb_asm
6.global memcpy_deinterlace_togray_asm
7.global memcpy_bitwise_or_asm
8.global memcpy_bitwise_or_3c_asm
10.global memcpy_add_3c_asm
11.global memcpy_add_3c2_asm
12.global memcpy_subtract_asm
16 * --> https://arm-software.github.io/acle/neon_intrinsics/advsimd.html
17 * --> https://developer.arm.com/documentation/102159/0400/Load-and-store---example-RGB-conversion
18 * --> https://developer.arm.com/documentation/den0024/a
21/* For the aarch64 ABI (AAPCS64), the first 8 integer/pointer args are passed in x0-x7, which are caller-saved (scratch) registers, so no stack frame push is necessary unless more than 8 args are needed
22 * To push the fp (x29) and lr (x30), use (change the alloc amount based on how many extra args there are):
23 stp x29, x30, [sp, #-16]!
25 * And to restore, use (before ret):
26 ldp x29, x30, [sp], #16
27 * This is just a reminder in case this is ever needed, currently none of the functions have more than 8 args
31/* Arg 0 ~ x0: Source address (single channel frame)
32 * Arg 1 ~ x1: Destination address
33 * Arg 2 ~ x2: Pixel count
34 * Arg 3 ~ x3: Threshold value */
// Per byte lane: v2 keeps the value of v0 where v0 is strictly greater (unsigned) than v3, and is 0 elsewhere.
// NOTE(review): the loads into v0/v3 and the store of v2 are not visible here -- presumably v0 = source pixels and v3 = broadcast threshold; confirm.
41 cmhi v1.16b, v0.16b, v3.16b // v1 = 0xFF where v0 > v3 (unsigned compare), else 0x00
42 and v2.16b, v1.16b, v0.16b // v2 = v0 masked by the compare result
// Threshold a single-channel frame into a byte mask (see the cmhi below).
52/* Arg 0 ~ x0: Source address (single channel frame)
53 * Arg 1 ~ x1: Destination address
54 * Arg 2 ~ x2: Pixel count
55 * Arg 3 ~ x3: Threshold value */
56memcpy_threshold_binary_asm:
// NOTE(review): presumably v0 = source pixels and v3 = broadcast threshold (setup not visible here); confirm.
62 cmhi v1.16b, v0.16b, v3.16b // v1 = 0xFF where v0 > v3 (unsigned compare), else 0x00 -- the binary mask
// For each 16-byte chunk: compare the primary channel against two other channels, derive a scaled value
// from the primary, and saturating-add it onto the "addition" input before storing to the destination.
72/* Arg 0 ~ x0: Primary channel address
73 * Arg 1 ~ x1: Compare channel 1 address
74 * Arg 2 ~ x2: Compare channel 2 address
75 * Arg 3 ~ x3: Addition address
76 * Arg 4 ~ x4: Destination address
77 * Arg 5 ~ x5: Count */
78memcpy_compare3_add_asm:
80 lsr w5, w5, #4 // count / 16 (16 bytes = 128 bits per iteration); uses the 32-bit view w5, unlike the x-register shifts in the other routines of this file
82 ldr q0, [x0], #16 // load 16 bytes of primary, post-increment the pointer
83 ldr q1, [x1], #16 // load compare 1
84 ldr q2, [x2], #16 // load compare 2
85 ldr q3, [x3], #16 // load addition
87 cmhi v1.16b, v0.16b, v1.16b // v1 = 0xFF where primary > channel 1 (unsigned), else 0x00
88 cmhi v2.16b, v0.16b, v2.16b // v2 = 0xFF where primary > channel 2 (unsigned), else 0x00
89 and v1.16b, v1.16b, v2.16b // mask = primary greater than BOTH channels
90 and v2.16b, v0.16b, v1.16b // masked primary: primary value where the mask is set, 0 elsewhere
91 ushr v1.16b, v2.16b, #6 // masked primary >> 6 (divide by 64), giving 0..3 per lane
93 neg v1.16b, v1.16b // negate so the register shift below shifts right
// NOTE(review): the next instruction shifts the UNMASKED primary (v0), so lanes where the compare failed
// (shift count 0) pass the primary through unchanged -- possibly the reason this routine is marked "a bit broken?"; confirm intent.
94 uqshl v2.16b, v0.16b, v1.16b // shift primary by the negated count (i.e. right shift, mimics a division)
95 uqshl v2.16b, v2.16b, #4 // saturating left shift by 4 (clamps at 255 instead of wrapping)
96 uqadd v3.16b, v2.16b, v3.16b // saturating add of the scaled value onto the addition input
98 str q3, [x4], #16 // store Q3 to the destination, post-increment by 16 bytes
// Weighted subtract: result = sat(primary - sat(sat(alpha-weighted ch2 + beta-weighted ch3) + gamma)), 16 bytes per iteration.
// The alpha weight is applied as a right shift by clz(alpha); the beta shift is prepared the same way
// (NOTE(review): the clz/neg steps for beta are not visible here -- confirm).
107/* Arg 0 ~ x0: Primary channel address
108 * Arg 1 ~ x1: Compare channel 2 address
109 * Arg 2 ~ x2: Compare channel 3 address
110 * Arg 3 ~ x3: Destination address
114 * Arg 7 ~ x7: Gamma */
117 dup v4.16b, w5 // duplicate alpha across 16x8bit lanes
118 clz v4.16b, v4.16b // count the leading 0's (inverse of log2(value))
119 neg v4.16b, v4.16b // negate count (for right shifting)
121 dup v5.16b, w6 // ''' for beta
125 dup v6.16b, w7 // duplicate gamma across 16x8bit lanes
127 lsr x4, x4, #4 // right shift the count by 4 bits (divide by 16, 16 bytes = 128 bits)
134 // Q0 --> primary, Q1 --> alpha, Q2 --> beta
135 ushl v1.16b, v1.16b, v4.16b // weight alpha --> shift right by the amount of leading 0's from alpha (lshift by negative)
136 ushl v2.16b, v2.16b, v5.16b // weight beta
137 uqadd v3.16b, v1.16b, v2.16b // add weights (saturating)
138 uqadd v3.16b, v3.16b, v6.16b // add gamma (saturating)
139 uqsub v3.16b, v0.16b, v3.16b // subtract result from primary (saturating at 0)
// The destination is x3 per the argument list; x1 is the compare-channel-2 source pointer and must not be written through.
141 str q3, [x3], #16 // store Q3 to the destination, post-increment by 16 bytes
// Deinterlace a 3-channel buffer, compute primary - (weighted alpha + weighted beta) per pixel,
// and store the threshold comparison of that value as a 0xFF/0x00 byte per pixel.
// Three loop variants exist, one per position of the primary channel within the interleaved triple.
150/* Arg 0 ~ x0: Source address (3-channel continuous buffer required)
151 * Arg 1 ~ x1: Destination address
152 * Arg 2 ~ x2: Count (frame size) --> NOT(size * #channels)
153 * Arg 3 ~ x3: C1/Primary channel offset(index) --> ex. 0 for first; alpha/beta follow
156 * Arg 6 ~ x6: Threshold */
157memcpy_deinterlace_wstb_asm:
159 dup v4.16b, w4 // duplicate alpha across 16x8bit lanes
160 clz v4.16b, v4.16b // count the leading 0's (inverse of log2(value))
161 neg v4.16b, v4.16b // negate count (for right shifting)
163 dup v5.16b, w5 // ''' for beta
167 dup v6.16b, w6 // duplicate threshold '''
169 lsr x2, x2, #4 // right shift the count by 4 bits (divide by 16, 16 bytes = 128 bits)
// Dispatch to the loop matching the primary channel position.
// NOTE(review): the compare that sets the flags is not visible here -- presumably it tests the channel index in x3; confirm.
172 b.lt _c0_split_wstb_loop
173 b.eq _c1_split_wstb_loop
174 b.gt _c2_split_wstb_loop
177 ld3 { v0.16b, v1.16b, v2.16b }, [x0], #48 // deinterlace 48 bytes of RGB into 3x16byte V registers
179 // Q0 --> primary, Q1 --> alpha, Q2 --> beta
180 ushl v1.16b, v1.16b, v4.16b // weight alpha --> shift right by the amount of leading 0's from alpha (lshift by negative)
181 ushl v2.16b, v2.16b, v5.16b // weight beta
182 uqadd v3.16b, v1.16b, v2.16b // add weights (saturating)
183 uqsub v3.16b, v0.16b, v3.16b // subtract result from primary (saturating at 0)
184 cmhi v3.16b, v3.16b, v6.16b // threshold result: 0xFF where result > threshold (unsigned), else 0x00
186 str q3, [x1], #16 // store Q3 to the destination, post-increment by 16 bytes
189 bne _c0_split_wstb_loop // repeat while the (not visible here) counter update leaves Z clear
193 ld3 { v0.16b, v1.16b, v2.16b }, [x0], #48 // deinterlace the next 48 bytes
195 // Q0 --> beta, Q1 --> primary, Q2 --> alpha
196 ushl v2.16b, v2.16b, v4.16b // weight alpha
197 ushl v0.16b, v0.16b, v5.16b // weight beta
198 uqadd v3.16b, v2.16b, v0.16b // add weights (saturating)
199 uqsub v3.16b, v1.16b, v3.16b // subtract result from primary (saturating at 0)
200 cmhi v3.16b, v3.16b, v6.16b // threshold result: 0xFF where result > threshold (unsigned), else 0x00
202 str q3, [x1], #16 // store Q3 to the destination, post-increment by 16 bytes
205 bne _c1_split_wstb_loop // repeat while the (not visible here) counter update leaves Z clear
209 ld3 { v0.16b, v1.16b, v2.16b }, [x0], #48 // deinterlace the next 48 bytes
211 // Q0 --> alpha, Q1 --> beta, Q2 --> primary
212 ushl v0.16b, v0.16b, v4.16b // weight alpha
213 ushl v1.16b, v1.16b, v5.16b // weight beta
214 uqadd v3.16b, v0.16b, v1.16b // add weights (saturating)
215 uqsub v3.16b, v2.16b, v3.16b // subtract result from primary (saturating at 0)
216 cmhi v3.16b, v3.16b, v6.16b // threshold result: 0xFF where result > threshold (unsigned), else 0x00
218 str q3, [x1], #16 // store Q3 to the destination, post-increment by 16 bytes
221 bne _c2_split_wstb_loop // repeat while the (not visible here) counter update leaves Z clear
// Convert an interleaved 3-channel buffer to single-channel gray: gray = c0/4 + c1/2 + c2/4.
// A 16-pixel vector loop handles the bulk; a one-pixel-at-a-time path handles the remainder.
229/* Arg 0 ~ x0: Source array address (3-channel interlaced buffer)
230 * Arg 1 ~ x1: Destination array address
231 * Arg 2 ~ x2: Pixel total -- size of destination, size of source / 3 */
232memcpy_deinterlace_togray_asm:
237 ld3 { v0.16b - v2.16b }, [x0], #48 // deinterlace 16 pixels (48 bytes) into v0/v1/v2
239 ushr v0.16b, v0.16b, #2 // divide red/blue by 4
240 ushr v1.16b, v1.16b, #1 // divide green by 2
241 ushr v2.16b, v2.16b, #2 // divide blue/red by 4
243 uqadd v3.16b, v0.16b, v1.16b
244 uqadd v3.16b, v2.16b, v3.16b // 1/4 + 1/2 + 1/4 = 1/1 (max 63 + 127 + 63 = 253, so the add never saturates)
248 beq _togray_end // exactly 0 bytes left --> end
250 b.hs _togray_loop // >= 16 --> keep going
253 ld3 { v0.b - v2.b }[0], [x0], #3 // remainder path: load one pixel (3 bytes) into lane 0 of v0/v1/v2
255 ushr v0.16b, v0.16b, #2 // divide red/blue by 4
256 ushr v1.16b, v1.16b, #1 // divide green by 2
257 ushr v2.16b, v2.16b, #2 // divide blue/red by 4
259 uqadd v3.16b, v0.16b, v1.16b
260 uqadd v3.16b, v2.16b, v3.16b // 1/4 + 1/2 + 1/4 = 1/1
// Store lane 0 of v3 (the computed gray value); v2 only holds the quartered third channel.
261 st1 {v3.b}[0], [x1], #1
// Bitwise OR of two buffers, processed 16 bytes per iteration.
272/* Arg 0 ~ x0: Source A
273 * Arg 1 ~ x1: Source B
274 * Arg 2 ~ x2: Destination
275 * Arg 3 ~ x3: Count */
276memcpy_bitwise_or_asm:
278 lsr x3, x3, #4 // Shift count right by 4 bits (divide by 16, 16 bytes = 128 bits); any remainder below 16 is dropped by the shift
// NOTE(review): the loads into v0/v1 and the store of v2 are not visible here -- presumably v0 = A and v1 = B; confirm.
282 orr v2.16b, v0.16b, v1.16b // v2 = v0 | v1 across all 128 bits
// OR a single-channel frame (Source B) into every channel of an interleaved 3-channel frame (Source A),
// 16 pixels per iteration, writing an interleaved 3-channel result.
291/* Arg 0 ~ x0: Source A -- >> 3 channels <<
292 * Arg 1 ~ x1: Source B
293 * Arg 2 ~ x2: Destination -- >> 3 channels <<
294 * Arg 3 ~ x3: Pixel count */
295memcpy_bitwise_or_3c_asm:
299 ld3 { v0.16b - v2.16b }, [x0], #48 // deinterlace load rgb channels in q0-q2 (48 bytes consumed)
300 ldr q3, [x1], #16 // load binary frame normally in q3 (16 bytes consumed)
301 orr v0.16b, v0.16b, v3.16b
302 orr v1.16b, v1.16b, v3.16b
303 orr v2.16b, v2.16b, v3.16b // or each channel with binary frame
304 st3 { v0.16b - v2.16b }, [x2], #48 // interlace store the channels back (48 bytes written)
// Saturating per-byte add of two buffers, 16 bytes per iteration.
313/* Arg 0 ~ x0: Source A
314 * Arg 1 ~ x1: Source B
315 * Arg 2 ~ x2: Destination
316 * Arg 3 ~ x3: Pixel count */
319 lsr x3, x3, #4 // get iterations by dividing size by 16; any remainder below 16 is dropped by the shift
// NOTE(review): the loads into v0/v1 and the store of v2 are not visible here -- presumably v0 = A and v1 = B; confirm.
323 uqadd v2.16b, v0.16b, v1.16b // v2 = v0 + v1 per byte, clamped at 255 instead of wrapping
// Saturating add of one 16-byte vector (v3) onto every channel of an interleaved 3-channel frame, 16 pixels per iteration.
332/* Arg 0 ~ x0: Source A -- >> 3 channels <<
333 * Arg 1 ~ x1: Source B
334 * Arg 2 ~ x2: Destination -- >> 3 channels <<
335 * Arg 3 ~ x3: Pixel count */
340 ld3 { v0.16b - v2.16b }, [x0], #48 // deinterlace 48 bytes of Source A into v0/v1/v2
// NOTE(review): the load of v3 is not visible here -- presumably 16 bytes of Source B from [x1]; confirm.
342 uqadd v0.16b, v0.16b, v3.16b // channel 0 += v3 (clamped at 255)
343 uqadd v1.16b, v1.16b, v3.16b // channel 1 += v3 (clamped at 255)
344 uqadd v2.16b, v2.16b, v3.16b // channel 2 += v3 (clamped at 255)
345 st3 { v0.16b - v2.16b }, [x2], #48 // re-interlace and store 48 bytes to the destination
// Channel-wise saturating add of two interleaved 3-channel frames, 16 pixels per iteration.
353/* Arg 0 ~ x0: Source A -- >> 3 channels <<
354 * Arg 1 ~ x1: Source B -- >> 3 channels <<
355 * Arg 2 ~ x2: Destination -- >> 3 channels <<
356 * Arg 3 ~ x3: Pixel count */
361 ld3 { v0.16b - v2.16b }, [x0], #48 // deinterlace 48 bytes of Source A into v0/v1/v2
362 ld3 { v3.16b - v5.16b }, [x1], #48 // deinterlace 48 bytes of Source B into v3/v4/v5
363 uqadd v0.16b, v0.16b, v3.16b // channel 0: A + B (clamped at 255)
364 uqadd v1.16b, v1.16b, v4.16b // channel 1: A + B (clamped at 255)
365 uqadd v2.16b, v2.16b, v5.16b // channel 2: A + B (clamped at 255)
366 st3 { v0.16b - v2.16b }, [x2], #48 // re-interlace and store 48 bytes to the destination
// Saturating per-byte subtract of two buffers, 16 bytes per iteration.
375/* Arg 0 ~ x0: Address of base array that is being subtracted from
376 * Arg 1 ~ x1: Address of second array that is being subtracted
377 * Arg 2 ~ x2: Address of destination array
378 * Arg 3 ~ x3: Size of arrays (width * height) */
381 lsr x3, x3, #4 // get iterations by dividing size by 16; any remainder below 16 is dropped by the shift
// NOTE(review): the loads into v0/v1 and the store of v2 are not visible here -- presumably v0 = base and v1 = second array; confirm.
385 uqsub v2.16b, v0.16b, v1.16b // v2 = v0 - v1 per byte, clamped at 0 instead of wrapping