1.global memcpy_threshold_asm
 
    2.global memcpy_threshold_binary_asm
 
    3.global memcpy_compare3_add_asm     // a bit broken?
 
    5.global memcpy_deinterlace_wstb_asm
 
    6.global memcpy_deinterlace_togray_asm
 
    7.global memcpy_bitwise_or_asm
 
    8.global memcpy_bitwise_or_3c_asm
 
   10.global memcpy_add_3c_asm
 
   11.global memcpy_add_3c2_asm
 
   12.global memcpy_subtract_asm
 
   16 * --> https://arm-software.github.io/acle/neon_intrinsics/advsimd.html
 
   17 * --> https://developer.arm.com/documentation/102159/0400/Load-and-store---example-RGB-conversion
 
   18 * --> https://developer.arm.com/documentation/den0024/a
 
   21/* For the aarch64 ABI, x0-x7 are the caller-saved argument registers, so no stack frame push is necessary unless more than 8 args are needed
 
   22 * To push the sp and fp, use (change the alloc amount based on how many extra args there are):
 
   23    stp x29, x30, [sp, #-16]!
 
   25 * And to restore, use (before ret):
 
   26    ldp x29, x30, [sp], #16
 
   27 * This is just a reminder in case this is ever needed, currently none of the functions have more than 8 args
 
   31/* Arg 0 ~ x0: Source address (single channel frame)
 
   32 * Arg 1 ~ x1: Destination address
 
   33 * Arg 2 ~ x2: Pixel count
 
   34 * Arg 3 ~ x3: Threshold value */
 
   // Threshold step: keep the bytes of v0 that are strictly above the per-lane value in v3, zero the rest.
   // NOTE(review): the load into v0, the setup of v3 and the store of v2 are not visible in this excerpt;
   // presumably v3 holds the threshold (Arg 3) broadcast to all 16 lanes -- confirm.
   41        cmhi v1.16b, v0.16b, v3.16b     // v1 lane = 0xFF where v0 > v3 (unsigned compare), 0x00 otherwise

   42        and v2.16b, v1.16b, v0.16b      // v2 = v0 where the mask is set, 0 elsewhere
 
   52/* Arg 0 ~ x0: Source address (single channel frame)
 
   53 * Arg 1 ~ x1: Destination address
 
   54 * Arg 2 ~ x2: Pixel count
 
   55 * Arg 3 ~ x3: Threshold value */
 
   // Binary threshold: builds an all-ones/all-zeros byte mask from an unsigned compare of v0 against v3.
   // NOTE(review): no masking `and` is visible here, so presumably the raw mask in v1 is what gets stored -- confirm.
   56memcpy_threshold_binary_asm:

   62        cmhi v1.16b, v0.16b, v3.16b     // v1 lane = 0xFF where v0 > v3 (unsigned compare), 0x00 otherwise
 
   72/* Arg 0 ~ x0: Primary channel address
 
   73 * Arg 1 ~ x1: Compare channel 1 address
 
   74 * Arg 2 ~ x2: Compare channel 2 address
 
   75 * Arg 3 ~ x3: Addition address
 
   76 * Arg 4 ~ x4: Destination Address
 
   77 * Arg 5 ~ x5: Count */
 
   // For each 16-byte block: where the primary channel is strictly greater than both compare channels,
   // derive a scaled value from the primary and saturating-add it onto the addition buffer, then store to x4.
   // NOTE(review): loop label, counter update and return are not visible in this excerpt.
   78memcpy_compare3_add_asm:

   80    lsr w5, w5, #4       // Shift count right by 4 bits (divide by 16, 16 bytes = 128 bits); uses the 32-bit view, so the upper half of x5 is cleared

   82        ldr q0, [x0], #16   // load primary compare

   83        ldr q1, [x1], #16   // load compare 1

   84        ldr q2, [x2], #16   // load compare 2

   85        ldr q3, [x3], #16   // load addition

   87        cmhi v1.16b, v0.16b, v1.16b     // compare primary and channel 1 (mask = 0xFF where primary > channel 1)

   88        cmhi v2.16b, v0.16b, v2.16b     // compare primary and channel 2 (mask = 0xFF where primary > channel 2)

   89        and v1.16b, v1.16b, v2.16b      // AND the results (primary must exceed both)

   90        and v2.16b, v0.16b, v1.16b      // AND the mask with the input: v2 = primary where it wins, 0 elsewhere

   91        ushr v1.16b, v2.16b, #6         // rshift result by 6 (divide by 64), leaving a value in 0..3

   93        neg v1.16b, v1.16b              // invert sign so the register shift below shifts right

   94        uqshl v2.16b, v0.16b, v1.16b    // left shift primary by negated result (right shift, mimics a division)

   95        uqshl v2.16b, v2.16b, #4        // saturating left shift by 4 to fill all 8 bits

   96        uqadd v3.16b, v2.16b, v3.16b    // saturating add of the scaled value onto the additive

   98        str q3, [x4], #16   // store Q3 to the destination and advance by 16 bytes
 
  107/* Arg 0 ~ x0: Primary channel address
 
  108 * Arg 1 ~ x1: Compare channel 2 address
 
  109 * Arg 2 ~ x2: Compare channel 3 address
 
  110 * Arg 3 ~ x3: Destination address
 
  114 * Arg 7 ~ x7: Gamma */
 
  // Weighted subtract over three separate channel buffers (function label is not visible in this excerpt).
  // Registers used below: w5 = alpha, w6 = beta, w7 = gamma, x4 = count, x3 = destination (Arg 3 in the header).
  // Each weight is turned into a right-shift amount via clz + neg, so weights act as power-of-two divisors.
  // FIX: the result was stored through x1, which the header documents as a compare-channel *source*;
  // the store now targets the documented destination pointer x3.
  // NOTE(review): the channel loads and the beta clz/neg are not visible here -- confirm they read x0/x1/x2.
  117    dup v4.16b, w5      // duplicate alpha across 16x8bit lanes

  118    clz v4.16b, v4.16b  // count the leading 0's (inverse of log2(value))

  119    neg v4.16b, v4.16b  // negate count (for right shifting)

  121    dup v5.16b, w6      // ''' for beta

  125    dup v6.16b, w7      // duplicate gamma across 16x8bit lanes

  127    lsr x4, x4, #4      // right shift the count by 4 bits (divide by 16, 16 bytes = 128 bits)

  134        // Q0 --> primary, Q1 --> alpha, Q2 --> beta

  135        ushl v1.16b, v1.16b, v4.16b     // weight alpha --> shift right by the amount of leading 0's from alpha (lshift by negative)

  136        ushl v2.16b, v2.16b, v5.16b     // weight beta

  137        uqadd v3.16b, v1.16b, v2.16b    // add weights (saturating)

  138        uqadd v3.16b, v3.16b, v6.16b    // add gamma (saturating)

  139        uqsub v3.16b, v0.16b, v3.16b    // subtract result from primary (saturates at 0)

  141        str q3, [x3], #16   // store Q3 to the destination (Arg 3) and advance by 16 bytes
 
  150/* Arg 0 ~ x0: Source address (3-channel continuous buffer required)
 
  151 * Arg 1 ~ x1: Destination address
 
  152 * Arg 2 ~ x2: Count (frame size) --> NOT(size * #channels)
 
  153 * Arg 3 ~ x3: C1/Primary channel offset(index) --> ex. 0 for first; alpha/beta follow
 
  156 * Arg 6 ~ x6: Threshold */
 
  // Deinterlacing weighted-subtract + binary threshold on an interleaved 3-channel buffer.
  // Setup broadcasts alpha (w4), beta (w5) and threshold (w6) across 16 byte lanes, then one of three
  // loops is chosen; the loops differ only in which deinterleaved register is primary / alpha / beta.
  // Each iteration consumes 48 source bytes (16 pixels) and writes 16 mask bytes to x1.
  // NOTE(review): the flag-setting compare before the b.lt/b.eq/b.gt dispatch, the beta clz/neg, the loop
  // labels, the counter updates and the return are not visible in this excerpt.
  157memcpy_deinterlace_wstb_asm:

  159    dup v4.16b, w4      // duplicate alpha across 16x8bit lanes

  160    clz v4.16b, v4.16b  // count the leading 0's (inverse of log2(value))

  161    neg v4.16b, v4.16b  // negate count (for right shifting)

  163    dup v5.16b, w5      // ''' for beta

  167    dup v6.16b, w6      // duplicate threshold '''

  169    lsr x2, x2, #4      // right shift the count by 4 bits (divide by 16, 16 bytes = 128 bits)

  172    b.lt _c0_split_wstb_loop   // "less than" --> channel 0 is primary

  173    b.eq _c1_split_wstb_loop   // "equal" --> channel 1 is primary

  174    b.gt _c2_split_wstb_loop   // "greater than" --> channel 2 is primary

  177        ld3 { v0.16b, v1.16b, v2.16b }, [x0], #48   // deinterlace 48 bytes of RGB into 3x16byte V registers

  179        // Q0 --> primary, Q1 --> alpha, Q2 --> beta

  180        ushl v1.16b, v1.16b, v4.16b     // weight alpha --> shift right by the amount of leading 0's from alpha (lshift by negative)

  181        ushl v2.16b, v2.16b, v5.16b     // weight beta

  182        uqadd v3.16b, v1.16b, v2.16b    // add weights (saturating)

  183        uqsub v3.16b, v0.16b, v3.16b    // subtract result from primary (saturates at 0)

  184        cmhi v3.16b, v3.16b, v6.16b     // threshold result: 0xFF where result > threshold, else 0x00

  186        str q3, [x1], #16   // store Q3 to the destination and advance by 16 bytes

  189        bne _c0_split_wstb_loop   // repeat while the (not shown) counter update leaves Z clear

  193        ld3 { v0.16b, v1.16b, v2.16b }, [x0], #48   // deinterlace the next 48 bytes

  195        // Q0 --> beta, Q1 --> primary, Q2 --> alpha

  196        ushl v2.16b, v2.16b, v4.16b     // weight alpha

  197        ushl v0.16b, v0.16b, v5.16b     // weight beta

  198        uqadd v3.16b, v2.16b, v0.16b    // add weights (saturating)

  199        uqsub v3.16b, v1.16b, v3.16b    // subtract result from primary (saturates at 0)

  200        cmhi v3.16b, v3.16b, v6.16b     // threshold result: 0xFF where result > threshold, else 0x00

  202        str q3, [x1], #16   // store Q3 to the destination and advance by 16 bytes

  205        bne _c1_split_wstb_loop   // repeat while the (not shown) counter update leaves Z clear

  209        ld3 { v0.16b, v1.16b, v2.16b }, [x0], #48   // deinterlace the next 48 bytes

  211        // Q0 --> alpha, Q1 --> beta, Q2 --> primary

  212        ushl v0.16b, v0.16b, v4.16b     // weight alpha

  213        ushl v1.16b, v1.16b, v5.16b     // weight beta

  214        uqadd v3.16b, v0.16b, v1.16b    // add weights (saturating)

  215        uqsub v3.16b, v2.16b, v3.16b    // subtract result from primary (saturates at 0)

  216        cmhi v3.16b, v3.16b, v6.16b     // threshold result: 0xFF where result > threshold, else 0x00

  218        str q3, [x1], #16   // store Q3 to the destination and advance by 16 bytes

  221        bne _c2_split_wstb_loop   // repeat while the (not shown) counter update leaves Z clear
 
  229/* Arg 0 ~ x0: Source array address (3-channel interlaced buffer)
 
  230 * Arg 1 ~ x1: Destination array address
 
  231 * Arg 2 ~ x2: Pixel total -- size of destination, size of source / 3 */
 
  // Convert an interleaved 3-channel buffer (x0) to a single gray channel (x1).
  // gray = c0/4 + c1/2 + c2/4, computed with per-lane right shifts and saturating adds.
  // The main loop handles 16 pixels (48 source bytes) at a time; the tail handles one pixel (3 bytes) at a time.
  // FIX: the tail stored lane 0 of v2 (the quartered third channel) instead of lane 0 of v3 (the gray sum),
  // so leftover pixels were written with the wrong value. It now stores v3, matching the value just computed.
  // NOTE(review): loop labels, counter updates, the main-loop store and the return are not visible in this excerpt.
  232memcpy_deinterlace_togray_asm:

  237        ld3 { v0.16b - v2.16b }, [x0], #48  // deinterlace 48 bytes into three 16-lane registers

  239        ushr v0.16b, v0.16b, #2     // divide red/blue by 4

  240        ushr v1.16b, v1.16b, #1     // divide green by 2

  241        ushr v2.16b, v2.16b, #2     // divide blue/red by 4

  243        uqadd v3.16b, v0.16b, v1.16b    // v3 = c0/4 + c1/2

  244        uqadd v3.16b, v2.16b, v3.16b    // 1/4 + 1/2 + 1/4 = 1/1

  248        beq _togray_end     // exactly 0 bytes left --> end

  250        b.hs _togray_loop   // >= 16 --> keep going

  253        ld3 { v0.b - v2.b }[0], [x0], #3    // tail: load one pixel into lane 0 of v0-v2

  255        ushr v0.16b, v0.16b, #2     // divide red/blue by 4

  256        ushr v1.16b, v1.16b, #1     // divide green by 2

  257        ushr v2.16b, v2.16b, #2     // divide blue/red by 4

  259        uqadd v3.16b, v0.16b, v1.16b    // v3 = c0/4 + c1/2

  260        uqadd v3.16b, v2.16b, v3.16b    // 1/4 + 1/2 + 1/4 = 1/1

  261        st1 {v3.b}[0], [x1], #1     // store the gray byte (lane 0 of the sum) and advance the destination by 1
 
  272/* Arg 0 ~ x0: Source A
 
  273 * Arg 1 ~ x1: Source B
 
  274 * Arg 2 ~ x2: Destination
 
  275 * Arg 3 ~ x3: Count */
 
  // Bitwise OR of two byte buffers, processed in 16-byte vectors.
  // NOTE(review): the loads of v0/v1, the store of v2, the loop control and the return are not visible in this excerpt.
  276memcpy_bitwise_or_asm:

  278    lsr x3, x3, #4      // Shift count right by 4 bits (divide by 16, 16 bytes = 128 bits)

  282        orr v2.16b, v0.16b, v1.16b      // v2 = v0 | v1
 
  291/* Arg 0 ~ x0: Source A -- >> 3 channels <<
 
  292 * Arg 1 ~ x1: Source B
 
  293 * Arg 2 ~ x2: Destination -- >> 3 channels <<
 
  294 * Arg 3 ~ x3: Pixel count */
 
  // OR a single-channel frame (x1) into every channel of an interleaved 3-channel frame (x0), writing to x2.
  // Each pass consumes 48 bytes from x0 and 16 bytes from x1, and writes 48 bytes to x2.
  // NOTE(review): count handling, loop control and the return are not visible in this excerpt.
  295memcpy_bitwise_or_3c_asm:

  299        ld3 { v0.16b - v2.16b }, [x0], #48  // deinterlace load rgb channels in q0-q2

  300        ldr q3, [x1], #16                   // load binary frame normally in q3

  301        orr v0.16b, v0.16b, v3.16b          // channel 0 |= binary frame

  302        orr v1.16b, v1.16b, v3.16b          // channel 1 |= binary frame

  303        orr v2.16b, v2.16b, v3.16b          // or each channel with binary frame

  304        st3 { v0.16b - v2.16b }, [x2], #48  // interlace store the channels back
 
  313/* Arg 0 ~ x0: Source A
 
  314 * Arg 1 ~ x1: Source B
 
  315 * Arg 2 ~ x2: Destination
 
  316 * Arg 3 ~ x3: Pixel count */
 
  // Per-byte saturating add of two buffers, 16 bytes per vector (function label is not visible in this excerpt).
  // NOTE(review): the loads of v0/v1, the store of v2 and the loop control are not visible here.
  319    lsr x3, x3, #4      // get iterations by dividing size by 16

  323        uqadd v2.16b, v0.16b, v1.16b    // v2 = min(v0 + v1, 255) per unsigned byte lane
 
  332/* Arg 0 ~ x0: Source A -- >> 3 channels <<
 
  333 * Arg 1 ~ x1: Source B
 
  334 * Arg 2 ~ x2: Destination -- >> 3 channels <<
 
  335 * Arg 3 ~ x3: Pixel count */
 
  // Saturating add of v3 onto each channel of an interleaved 3-channel block (function label is not visible in this excerpt).
  // NOTE(review): the load that fills v3 is not visible; presumably 16 bytes of single-channel Source B from x1 -- confirm.
  340        ld3 { v0.16b - v2.16b }, [x0], #48  // deinterlace 48 bytes of Source A into v0-v2

  342        uqadd v0.16b, v0.16b, v3.16b        // channel 0 += v3 (saturating)

  343        uqadd v1.16b, v1.16b, v3.16b        // channel 1 += v3 (saturating)

  344        uqadd v2.16b, v2.16b, v3.16b        // channel 2 += v3 (saturating)

  345        st3 { v0.16b - v2.16b }, [x2], #48  // re-interlace and store 48 bytes to the destination
 
  353/* Arg 0 ~ x0: Source A -- >> 3 channels <<
 
  354 * Arg 1 ~ x1: Source B -- >> 3 channels <<
 
  355 * Arg 2 ~ x2: Destination -- >> 3 channels <<
 
  356 * Arg 3 ~ x3: Pixel count */
 
  // Channel-wise saturating add of two interleaved 3-channel blocks (function label is not visible in this excerpt).
  // NOTE(review): count handling and loop control are not visible here.
  361        ld3 { v0.16b - v2.16b }, [x0], #48  // deinterlace 48 bytes of Source A into v0-v2

  362        ld3 { v3.16b - v5.16b }, [x1], #48  // deinterlace 48 bytes of Source B into v3-v5

  363        uqadd v0.16b, v0.16b, v3.16b        // channel 0: A + B (saturating)

  364        uqadd v1.16b, v1.16b, v4.16b        // channel 1: A + B (saturating)

  365        uqadd v2.16b, v2.16b, v5.16b        // channel 2: A + B (saturating)

  366        st3 { v0.16b - v2.16b }, [x2], #48  // re-interlace and store 48 bytes to the destination
 
  375/* Arg 0 ~ x0: Address of base array that is being subtracted
 
  376 * Arg 1 ~ x1: Address of second array that is being subtracted
 
  377 * Arg 2 ~ x2: Address of destination array
 
  378 * Arg 3 ~ x3: Size of arrays (width * height) */
 
  // Per-byte saturating subtract of two buffers, 16 bytes per vector (function label is not visible in this excerpt).
  // NOTE(review): the loads of v0/v1, the store of v2 and the loop control are not visible here.
  381    lsr x3, x3, #4      // get iterations by dividing size by 16

  385        uqsub v2.16b, v0.16b, v1.16b    // v2 = max(v0 - v1, 0) per unsigned byte lane