%PDF- <> %âãÏÓ endobj 2 0 obj <> endobj 3 0 obj <>/ExtGState<>/ProcSet[/PDF/Text/ImageB/ImageC/ImageI] >>/Annots[ 28 0 R 29 0 R] /MediaBox[ 0 0 595.5 842.25] /Contents 4 0 R/Group<>/Tabs/S>> endobj ºaâÚÎΞ-ÌE1ÍØÄ÷{òò2ÿ ÛÖ^ÔÀá TÎ{¦?§®¥kuµùÕ5sLOšuY>endobj 2 0 obj<>endobj 2 0 obj<>endobj 2 0 obj<>endobj 2 0 obj<> endobj 2 0 obj<>endobj 2 0 obj<>es 3 0 R>> endobj 2 0 obj<> ox[ 0.000000 0.000000 609.600000 935.600000]/Fi endobj 3 0 obj<> endobj 7 1 obj<>/ProcSet[/PDF/Text/ImageB/ImageC/ImageI]>>/Subtype/Form>> stream
// A word of apology up front: this file mixes the C preprocessor with inline
// assembly, two of the less pleasant corners of C. The alternative was worse.
// The macros below serve as templates for long stretches of inline assembly
// that are identical except for the registers they name; spelling every
// variant out by hand would make the code much longer and harder to audit.
//
// Every macro expands to a single string literal (built by adjacent-literal
// concatenation), so register arguments must be passed as string literals,
// e.g. LOAD("v2", "v3", "v4").

// LOAD: emit one `ld3` that reads 48 bytes at [src], deinterleaves them into
// the three registers R0, R1, R2, and post-increments the src pointer by 48.
// The three registers must be numbered consecutively.
#define LOAD(R0, R1, R2) \
	"ld3 {" R0 ".16b, " R1 ".16b, " R2 ".16b}, [%[src]], #48 \n\t"

// SHUF: emit the byte shuffle that turns three deinterleaved input registers
// into four index registers, written to the temporaries t0..t3:
//   t0 = R0 >> 2
//   t1 = ((R0 << 4) | (R1 >> 4)) & n63
//   t2 = ((R1 << 2) | (R2 >> 6)) & n63
//   t3 = R2 & n63
// %[n63] is an operand of the enclosing asm statement; presumably it holds
// the 6-bit mask in every lane -- confirm at the point of use.
#define SHUF(R0, R1, R2) \
	"ushr %[t0].16b, " R0 ".16b, #2 \n\t" \
	"ushr %[t1].16b, " R1 ".16b, #4 \n\t" \
	"ushr %[t2].16b, " R2 ".16b, #6 \n\t" \
	"sli %[t1].16b, " R0 ".16b, #4 \n\t" \
	"sli %[t2].16b, " R1 ".16b, #2 \n\t" \
	"and %[t1].16b, %[t1].16b, %[n63].16b \n\t" \
	"and %[t2].16b, %[t2].16b, %[n63].16b \n\t" \
	"and %[t3].16b, " R2 ".16b, %[n63].16b \n\t"

// TRAN: emit four table lookups that translate the temporaries t0..t3 to the
// base64 alphabet. The lookup table is expected to have been loaded into
// v8..v11 beforehand. Results land in the registers R0..R3.
#define TRAN(R0, R1, R2, R3) \
	"tbl " R0 ".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" \
	"tbl " R1 ".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" \
	"tbl " R2 ".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" \
	"tbl " R3 ".16b, {v8.16b-v11.16b}, %[t3].16b \n\t"

// STOR: emit one `st4` that interleaves the four registers R0..R3, writes the
// resulting 64 bytes at [dst], and post-increments the dst pointer by 64.
#define STOR(R0, R1, R2, R3) \
	"st4 {" R0 ".16b, " R1 ".16b, " R2 ".16b, " R3 ".16b}, [%[dst]], #64 \n\t"

// Generate a block of inline assembly that forms one self-contained encoder
// round: fetch the data, process it, and store the result.
#define ROUND() \ LOAD("v12", "v13", "v14") \ SHUF("v12", "v13", "v14") \ TRAN("v12", "v13", "v14", "v15") \ STOR("v12", "v13", "v14", "v15") // Generate a block of assembly that generates a type A interleaved encoder // round. It uses registers that were loaded by the previous type B round, and // in turn loads registers for the next type B round. #define ROUND_A() \ SHUF("v2", "v3", "v4") \ LOAD("v12", "v13", "v14") \ TRAN("v2", "v3", "v4", "v5") \ STOR("v2", "v3", "v4", "v5") // Type B interleaved encoder round. Same as type A, but register sets swapped. #define ROUND_B() \ SHUF("v12", "v13", "v14") \ LOAD("v2", "v3", "v4") \ TRAN("v12", "v13", "v14", "v15") \ STOR("v12", "v13", "v14", "v15") // The first type A round needs to load its own registers. #define ROUND_A_FIRST() \ LOAD("v2", "v3", "v4") \ ROUND_A() // The last type B round omits the load for the next step. #define ROUND_B_LAST() \ SHUF("v12", "v13", "v14") \ TRAN("v12", "v13", "v14", "v15") \ STOR("v12", "v13", "v14", "v15") // Suppress clang's warning that the literal string in the asm statement is // overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99 // compilers). It may be true, but the goal here is not C99 portability. #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Woverlength-strings" static inline void enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) { size_t rounds = *slen / 48; if (rounds == 0) { return; } *slen -= rounds * 48; // 48 bytes consumed per round. *olen += rounds * 64; // 64 bytes produced per round. // Number of times to go through the 8x loop. size_t loops = rounds / 8; // Number of rounds remaining after the 8x loop. rounds %= 8; // Temporary registers, used as scratch space. uint8x16_t tmp0, tmp1, tmp2, tmp3; __asm__ volatile ( // Load the encoding table into v8..v11. " ld1 {v8.16b-v11.16b}, [%[tbl]] \n\t" // If there are eight rounds or more, enter an 8x unrolled loop // of interleaved encoding rounds. 
The rounds interleave memory // operations (load/store) with data operations to maximize // pipeline throughput. " cbz %[loops], 4f \n\t" // The SIMD instructions do not touch the flags. "88: subs %[loops], %[loops], #1 \n\t" " " ROUND_A_FIRST() " " ROUND_B() " " ROUND_A() " " ROUND_B() " " ROUND_A() " " ROUND_B() " " ROUND_A() " " ROUND_B_LAST() " b.ne 88b \n\t" // Enter a 4x unrolled loop for rounds of 4 or more. "4: cmp %[rounds], #4 \n\t" " b.lt 30f \n\t" " " ROUND_A_FIRST() " " ROUND_B() " " ROUND_A() " " ROUND_B_LAST() " sub %[rounds], %[rounds], #4 \n\t" // Dispatch the remaining rounds 0..3. "30: cbz %[rounds], 0f \n\t" " cmp %[rounds], #2 \n\t" " b.eq 2f \n\t" " b.lt 1f \n\t" // Block of non-interlaced encoding rounds, which can each // individually be jumped to. Rounds fall through to the next. "3: " ROUND() "2: " ROUND() "1: " ROUND() "0: \n\t" // Outputs (modified). : [loops] "+r" (loops), [src] "+r" (*s), [dst] "+r" (*o), [t0] "=&w" (tmp0), [t1] "=&w" (tmp1), [t2] "=&w" (tmp2), [t3] "=&w" (tmp3) // Inputs (not modified). : [rounds] "r" (rounds), [tbl] "r" (base64_table_enc_6bit), [n63] "w" (vdupq_n_u8(63)) // Clobbers. : "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" ); } #pragma GCC diagnostic pop