1.global memcpy_threshold_asm
2.global memcpy_threshold_binary_asm
3.global memcpy_compare3_add_asm // a bit broken
5.global memcpy_split_wst_asm
7.global memcpy_bitwise_or_asm
8.global memcpy_subtract_asm
// Per-byte threshold: lanes of q0 that are above the threshold are kept, all other lanes become 0 (result in q2).
12// Arg 0: r0: Destination Address
13// Arg 1: r1: Source Address
15// Arg 3: r3: Threshold Minimum (broadcast to all 16 byte lanes of q3 below)
18 add fp, sp, #0 // fp = sp (set up our stack frame)
20 lsr r2, #4 // r2 >>= 4: count -> number of 16-byte (128-bit) blocks; the low 4 bits of the count are discarded
21 vdup.8 q3, r3 // Broadcast the threshold byte into every lane of q3
27 vcgt.u8 q1, q0, q3 // q1 lane = 0xFF where q0 > q3 (unsigned compare), else 0x00
28 vand.u8 q2, q1, q0 // q2 = q0 AND mask: bytes above the threshold survive, the rest become 0
38 sub sp, fp, #0 // sp = fp (pop our stack frame)
// Binary threshold: builds a per-byte mask in q1 that is all-ones where the input in q0 is above the threshold.
43// Arg 0: r0: Destination Address
44// Arg 1: r1: Source Address
46// Arg 3: r3: Threshold Minimum (broadcast to all 16 byte lanes of q3 below)
47memcpy_threshold_binary_asm:
49 add fp, sp, #0 // fp = sp (set up our stack frame)
51 lsr r2, #4 // r2 >>= 4: count -> number of 16-byte (128-bit) blocks; the low 4 bits of the count are discarded
52 vdup.8 q3, r3 // Broadcast the threshold byte into every lane of q3
58 vcgt.u8 q1, q0, q3 // q1 lane = 0xFF where q0 > q3 (unsigned compare), else 0x00 -- NOTE(review): presumably this mask is what gets written to the destination; confirm
68 sub sp, fp, #0 // sp = fp (pop our stack frame)
// Compares the primary channel against two other channels, derives a shifted copy of the primary
// from the result, and saturating-adds it to the additive data (q3).
73// Arg 0: r0: Primary channel address
74// Arg 1: r1: Compare channel 1 address
75// Arg 2: r2: Compare channel 2 address
76// Arg 3: r3: Addition address
77// Arg 4: r4: Destination Address (passed on the stack, loaded into r4 below)
79memcpy_compare3_add_asm:
81 add fp, sp, #0 // fp = sp (set up our stack frame)
83 ldr r4, [sp, #12] // load 5th and 6th args from stack (this one is the 5th: destination address)
86 lsr r5, #4 // r5 >>= 4: count -> number of 16-byte (128-bit) blocks; the low 4 bits of the count are discarded
88 // load primary channel (from r0) into Q0
89 vld1.64 d0, [r0]! // low half of Q0 (primary); r0 advances past the loaded bytes
92 vld1.64 d2, [r1]! // low half of Q1 (compare channel 1); r1 advances
95 vld1.64 d4, [r2]! // low half of Q2 (compare channel 2); r2 advances
98 vld1.64 d6, [r3]! // low half of Q3 (additive); r3 advances
101 vcgt.u8 q1, q0, q1 // q1 lane = 0xFF where primary > channel 1 (unsigned), else 0x00
102 vcgt.u8 q2, q0, q2 // q2 lane = 0xFF where primary > channel 2 (unsigned), else 0x00
103 vand.u8 q1, q1, q2 // q1 = mask of lanes where primary exceeds both channels
104 vand.u8 q2, q0, q1 // q2 = primary where the mask is set, 0 elsewhere
105 vshr.u8 q1, q2, #6 // q1 = q2 >> 6 (divide by 64): keeps the top two bits, a value 0..3
107 vneg.s8 q1, q1 // negate the count: a negative per-lane vshl count shifts right
108 vshl.s8 q2, q0, q1 // shift primary right by 0..3 bits (mimics a division) -- NOTE(review): .s8 makes this an arithmetic shift while the other steps are unsigned; confirm intended
109 vshl.u8 q2, q2, #4 // q2 <<= 4 (left shift result to fill all 8 bits)
110 vqadd.u8 q3, q2, q3 // q3 = q2 + additive, unsigned saturating
120 sub sp, fp, #0 // sp = fp (pop our stack frame)
// Weighted subtract: q3 = primary - (shifted q1 + shifted q2 + gamma), every step with unsigned saturation.
125// Arg 0: r0: Primary channel address
126// Arg 1: r1: Compare channel 2 address
127// Arg 2: r2: Compare channel 3 address
128// Arg 3: r3: Destination address
135 add fp, sp, #0 // fp = sp (set up our stack frame)
137 ldr r4, [sp, #(0+20)] // load 5th - 8th args from stack; r4 = count (shifted below) -- NOTE(review): the +20 presumably skips registers saved by the prologue; confirm
138 ldr r5, [sp, #(4+20)] // r5 = alpha
139 ldr r6, [sp, #(8+20)] // r6 = beta
140 ldr r7, [sp, #(12+20)] // r7 = gamma
142 vdup.8 q8, r5 // duplicate alpha across 16x8bit chunks
143 vclz.u8 q8, q8 // count the leading 0's per lane (inverse of log2(value)): larger alpha -> smaller count
144 vneg.s8 q8, q8 // negate count so that a vshl by q8 shifts right
146 vdup.8 q9, r6 // duplicate beta across 16x8bit chunks (same treatment as alpha)
150 vdup.8 q10, r7 // duplicate gamma across 16x8bit chunks
152 lsr r4, #4 // right shift the count by 4 bits (divide by 16, 16 bytes = 128 bits); the low 4 bits are discarded
154 // load primary channel (from r0) into Q0
155 vld1.64 d0, [r0]! // low half of Q0 (primary); r0 advances past the loaded bytes
158 vld1.64 d2, [r1]! // low half of Q1; r1 advances
161 vld1.64 d4, [r2]! // low half of Q2; r2 advances
164 vshl.u8 q1, q1, q8 // weight q1 --> q1: per-lane shift by q8 (negative count = right shift by clz(alpha))
165 vshl.u8 q2, q2, q9 // weight q2 --> q2: per-lane shift by q9
166 vqadd.u8 q3, q1, q2 // add weights (unsigned saturating)
167 vqadd.u8 q3, q3, q10 // add gamma (unsigned saturating)
169 vqsub.u8 q3, q0, q3 // subtract result from primary (unsigned saturating, floors at 0)
179 sub sp, fp, #0 // sp = fp (pop our stack frame)
// Weighted subtract on an interleaved 3-channel buffer. One of three loops is entered depending on
// which channel is the primary; each loop computes q3 = primary - (shifted alpha + shifted beta + gamma)
// with unsigned saturation and stores 16 result bytes.
184// Arg 0: r0: Source address (3-channel continuous buffer required)
185// Arg 1: r1: Destination address
186// Arg 2: r2: Count (frame size) --> NOT (size * #channels)
187// Arg 3: r3: C1/Primary channel offset (index) --> ex. 0 for first; alpha/beta follow
193 add fp, sp, #0 // fp = sp (set up our stack frame)
195 ldr r4, [sp, #(0+16)] // load 5th - 7th args from stack; r4 = alpha -- NOTE(review): the +16 presumably skips registers saved by the prologue; confirm
196 ldr r5, [sp, #(4+16)] // r5 = beta
197 ldr r6, [sp, #(8+16)] // r6 = gamma
199 vdup.8 q8, r4 // duplicate alpha across 16x8bit chunks
200 vclz.u8 q8, q8 // count the leading 0's per lane (inverse of log2(value)): larger alpha -> smaller count
201 vneg.s8 q8, q8 // negate count so that a vshl by q8 shifts right
203 vdup.8 q9, r5 // duplicate beta across 16x8bit chunks (same treatment as alpha)
207 vdup.8 q10, r6 // duplicate gamma across 16x8bit chunks
209 lsr r2, #4 // right shift the count by 4 bits (divide by 16: each loop pass produces 16 output bytes); the low 4 bits are discarded
212 blt _c0_split_wst_loop // three-way dispatch on the preceding compare (NOTE(review): presumably r3 against 1; confirm): less -> channel 0 is primary
213 beq _c1_split_wst_loop // equal -> channel 1 is primary
214 bgt _c2_split_wst_loop // greater -> channel 2 is primary
217 //ld3 { v0.16b, v1.16b, v2.16b }, [r0]! (AArch64 form of the two vld3 loads below)
218 vld3.8 { d0, d2, d4 }, [r0]! // de-interleave 8 three-byte groups into the low halves of Q0/Q1/Q2; r0 advances
219 vld3.8 { d1, d3, d5 }, [r0]! // next 8 groups into the high halves, so Q0/Q1/Q2 each hold 16 bytes of one channel
221 // Q0 --> primary, Q1 --> alpha, Q2 --> beta
222 vshl.u8 q1, q1, q8 // weight alpha: per-lane shift by q8 (negative count = right shift)
223 vshl.u8 q2, q2, q9 // weight beta: per-lane shift by q9
224 vqadd.u8 q3, q1, q2 // add weights (unsigned saturating)
225 vqadd.u8 q3, q3, q10 // add gamma (unsigned saturating)
227 vqsub.u8 q3, q0, q3 // subtract result from primary (unsigned saturating, floors at 0)
230 vst1.64 { d6, d7 }, [r1]! // store Q3 (16 result bytes) to the destination; r1 advances
234 bgt _c0_split_wst_loop // loop while the preceding flag-setting instruction left "greater than" (NOTE(review): presumably a decrement of r2; confirm)
238 vld3.8 { d0, d2, d4 }, [r0]! // de-interleave 8 three-byte groups into the low halves of Q0/Q1/Q2; r0 advances
239 vld3.8 { d1, d3, d5 }, [r0]! // next 8 groups into the high halves
241 // Q0 --> beta, Q1 --> primary, Q2 --> alpha
242 vshl.u8 q2, q2, q8 // weight alpha: per-lane shift by q8 (negative count = right shift)
243 vshl.u8 q0, q0, q9 // weight beta: per-lane shift by q9
244 vqadd.u8 q3, q2, q0 // add weights (unsigned saturating)
245 vqadd.u8 q3, q3, q10 // add gamma (unsigned saturating)
247 vqsub.u8 q3, q1, q3 // subtract result from primary (unsigned saturating, floors at 0)
250 vst1.64 { d6, d7 }, [r1]! // store Q3 (16 result bytes) to the destination; r1 advances
254 bgt _c1_split_wst_loop // loop while the preceding flag-setting instruction left "greater than"
258 vld3.8 { d0, d2, d4 }, [r0]! // de-interleave 8 three-byte groups into the low halves of Q0/Q1/Q2; r0 advances
259 vld3.8 { d1, d3, d5 }, [r0]! // next 8 groups into the high halves
261 // Q0 --> alpha, Q1 --> beta, Q2 --> primary
262 vshl.u8 q0, q0, q8 // weight alpha: per-lane shift by q8 (negative count = right shift)
263 vshl.u8 q1, q1, q9 // weight beta: per-lane shift by q9
264 vqadd.u8 q3, q0, q1 // add weights (unsigned saturating)
265 vqadd.u8 q3, q3, q10 // add gamma (unsigned saturating)
267 vqsub.u8 q3, q2, q3 // subtract result from primary (unsigned saturating, floors at 0)
270 vst1.64 { d6, d7 }, [r1]! // store Q3 (16 result bytes) to the destination; r1 advances
274 bgt _c2_split_wst_loop // loop while the preceding flag-setting instruction left "greater than"
278 sub sp, fp, #0 // sp = fp (pop our stack frame)
// Bitwise OR of two source buffers (per the routine name); the count is taken in r3.
286// Arg 0: r0: Source A
287// Arg 1: r1: Source B
290memcpy_bitwise_or_asm:
292 add fp, sp, #0 // fp = sp (set up our stack frame)
294 lsr r3, #4 // r3 >>= 4: count -> number of 16-byte (128-bit) blocks; the low 4 bits of the count are discarded
313 sub sp, fp, #0 // sp = fp (pop our stack frame)
318// Arg 0: r0: Address of base array (presumably the array being subtracted from; confirm operand order)
319// Arg 1: r1: Address of second array (presumably the array being subtracted)
320// Arg 2: r2: Address of destination array
321// Arg 3: r3: Size of arrays (width * height)
326 lsr r3, #4 // get iterations by dividing size by 16; the low 4 bits of the size are discarded
339 // store Q2 to destination
347 sub sp, fp, #0 // sp = fp (pop our stack frame)