17#if defined(HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_) == defined(HWY_TARGET_TOGGLE)
18#ifdef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
19#undef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
21#define HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
35template <
size_t kBits>
37template <
size_t kBits>
46 using VU16 =
Vec<
decltype(d16)>;
47 const size_t N8 =
Lanes(d8);
69 using VU16 =
Vec<
decltype(d16)>;
70 const size_t N8 =
Lanes(d8);
71 const VU16 mask =
Set(d16, 0x0101u);
75 const VU16 raw0 =
And(packed, mask);
107 using VU16 =
Vec<
decltype(d16)>;
108 const size_t N8 =
Lanes(d8);
110 const VU16 raw0 =
BitCast(d16,
LoadU(d8, raw + 0 * N8));
111 const VU16 raw1 =
BitCast(d16,
LoadU(d8, raw + 1 * N8));
112 const VU16 raw2 =
BitCast(d16,
LoadU(d8, raw + 2 * N8));
113 const VU16 raw3 =
BitCast(d16,
LoadU(d8, raw + 3 * N8));
114 const VU16 raw4 =
BitCast(d16,
LoadU(d8, raw + 4 * N8));
115 const VU16 raw5 =
BitCast(d16,
LoadU(d8, raw + 5 * N8));
116 const VU16 raw6 =
BitCast(d16,
LoadU(d8, raw + 6 * N8));
117 const VU16 raw7 =
BitCast(d16,
LoadU(d8, raw + 7 * N8));
131 using VU16 =
Vec<
decltype(d16)>;
132 const size_t N8 =
Lanes(d8);
133 const VU16 mask =
Set(d16, 0x0303u);
135 const VU16 packed0 =
BitCast(d16,
LoadU(d8, packed_in + 0 * N8));
136 const VU16 packed1 =
BitCast(d16,
LoadU(d8, packed_in + 1 * N8));
138 const VU16 raw0 =
And(packed0, mask);
141 const VU16 raw1 =
And(packed1, mask);
170 using VU16 =
Vec<
decltype(d16)>;
171 const size_t N8 =
Lanes(d8);
172 const VU16 raw0 =
BitCast(d16,
LoadU(d8, raw + 0 * N8));
173 const VU16 raw1 =
BitCast(d16,
LoadU(d8, raw + 1 * N8));
174 const VU16 raw2 =
BitCast(d16,
LoadU(d8, raw + 2 * N8));
175 const VU16 raw3 =
BitCast(d16,
LoadU(d8, raw + 3 * N8));
176 const VU16 raw4 =
BitCast(d16,
LoadU(d8, raw + 4 * N8));
177 const VU16 raw5 =
BitCast(d16,
LoadU(d8, raw + 5 * N8));
178 const VU16 raw6 =
BitCast(d16,
LoadU(d8, raw + 6 * N8));
179 const VU16 raw7 =
BitCast(d16,
LoadU(d8, raw + 7 * N8));
187 const VU16 hi2 =
Set(d16, 0xC0C0u);
200 using VU16 =
Vec<
decltype(d16)>;
201 const size_t N8 =
Lanes(d8);
202 const VU16 mask =
Set(d16, 0x0707u);
204 const VU16 packed0 =
BitCast(d16,
LoadU(d8, packed_in + 0 * N8));
205 const VU16 packed1 =
BitCast(d16,
LoadU(d8, packed_in + 1 * N8));
206 const VU16 packed2 =
BitCast(d16,
LoadU(d8, packed_in + 2 * N8));
208 const VU16 raw0 =
And(packed0, mask);
211 const VU16 raw1 =
And(packed1, mask);
214 const VU16 raw2 =
And(packed2, mask);
227 const VU16 hi2 =
Set(d16, 0xC0C0u);
232 const VU16 raw3 =
And(mask, raw73);
246 using VU16 =
Vec<
decltype(d16)>;
247 const size_t N8 =
Lanes(d8);
249 const VU16 raw0 =
BitCast(d16,
LoadU(d8, raw + 0 * N8));
250 const VU16 raw1 =
BitCast(d16,
LoadU(d8, raw + 1 * N8));
251 const VU16 raw2 =
BitCast(d16,
LoadU(d8, raw + 2 * N8));
252 const VU16 raw3 =
BitCast(d16,
LoadU(d8, raw + 3 * N8));
253 const VU16 raw4 =
BitCast(d16,
LoadU(d8, raw + 4 * N8));
254 const VU16 raw5 =
BitCast(d16,
LoadU(d8, raw + 5 * N8));
255 const VU16 raw6 =
BitCast(d16,
LoadU(d8, raw + 6 * N8));
256 const VU16 raw7 =
BitCast(d16,
LoadU(d8, raw + 7 * N8));
273 using VU16 =
Vec<
decltype(d16)>;
274 const size_t N8 =
Lanes(d8);
275 const VU16 mask =
Set(d16, 0x0F0Fu);
277 const VU16 packed0 =
BitCast(d16,
LoadU(d8, packed_in + 0 * N8));
278 const VU16 packed1 =
BitCast(d16,
LoadU(d8, packed_in + 1 * N8));
279 const VU16 packed2 =
BitCast(d16,
LoadU(d8, packed_in + 2 * N8));
280 const VU16 packed3 =
BitCast(d16,
LoadU(d8, packed_in + 3 * N8));
282 const VU16 raw0 =
And(packed0, mask);
285 const VU16 raw1 =
And(packed1, mask);
294 const VU16 raw4 =
And(packed2, mask);
297 const VU16 raw5 =
And(packed3, mask);
314 using VU16 =
Vec<
decltype(d16)>;
315 const size_t N8 =
Lanes(d8);
316 const VU16 raw0 =
BitCast(d16,
LoadU(d8, raw + 0 * N8));
317 const VU16 raw1 =
BitCast(d16,
LoadU(d8, raw + 1 * N8));
318 const VU16 raw2 =
BitCast(d16,
LoadU(d8, raw + 2 * N8));
319 const VU16 raw3 =
BitCast(d16,
LoadU(d8, raw + 3 * N8));
320 const VU16 raw4 =
BitCast(d16,
LoadU(d8, raw + 4 * N8));
321 const VU16 raw5 =
BitCast(d16,
LoadU(d8, raw + 5 * N8));
322 const VU16 raw6 =
BitCast(d16,
LoadU(d8, raw + 6 * N8));
323 const VU16 raw7 =
BitCast(d16,
LoadU(d8, raw + 7 * N8));
326 const VU16 hi3 =
Set(d16, 0xE0E0u);
338 const VU16 lo2 =
Set(d16, 0x0303u);
349 using VU16 =
Vec<
decltype(d16)>;
350 const size_t N8 =
Lanes(d8);
352 const VU16 packed0 =
BitCast(d16,
LoadU(d8, packed_in + 0 * N8));
353 const VU16 packed1 =
BitCast(d16,
LoadU(d8, packed_in + 1 * N8));
354 const VU16 packed2 =
BitCast(d16,
LoadU(d8, packed_in + 2 * N8));
355 const VU16 packed3 =
BitCast(d16,
LoadU(d8, packed_in + 3 * N8));
356 const VU16 packed4 =
BitCast(d16,
LoadU(d8, packed_in + 4 * N8));
358 const VU16 mask =
Set(d16, 0x1F1Fu);
360 const VU16 raw0 =
And(packed0, mask);
363 const VU16 raw1 =
And(packed1, mask);
366 const VU16 raw2 =
And(packed2, mask);
369 const VU16 raw3 =
And(packed3, mask);
379 const VU16 lo2 =
Set(d16, 0x0303u);
380 const VU16 raw4 =
OrAnd(top4, lo2, packed4);
398 using VU16 =
Vec<
decltype(d16)>;
399 const size_t N8 =
Lanes(d8);
400 const VU16 raw0 =
BitCast(d16,
LoadU(d8, raw + 0 * N8));
401 const VU16 raw1 =
BitCast(d16,
LoadU(d8, raw + 1 * N8));
402 const VU16 raw2 =
BitCast(d16,
LoadU(d8, raw + 2 * N8));
403 const VU16 raw3 =
BitCast(d16,
LoadU(d8, raw + 3 * N8));
404 const VU16 raw4 =
BitCast(d16,
LoadU(d8, raw + 4 * N8));
405 const VU16 raw5 =
BitCast(d16,
LoadU(d8, raw + 5 * N8));
406 const VU16 raw6 =
BitCast(d16,
LoadU(d8, raw + 6 * N8));
407 const VU16 raw7 =
BitCast(d16,
LoadU(d8, raw + 7 * N8));
409 const VU16 hi2 =
Set(d16, 0xC0C0u);
430 using VU16 =
Vec<
decltype(d16)>;
431 const size_t N8 =
Lanes(d8);
432 const VU16 mask =
Set(d16, 0x3F3Fu);
434 const VU16 packed0 =
BitCast(d16,
LoadU(d8, packed_in + 0 * N8));
435 const VU16 packed1 =
BitCast(d16,
LoadU(d8, packed_in + 1 * N8));
436 const VU16 packed2 =
BitCast(d16,
LoadU(d8, packed_in + 2 * N8));
437 const VU16 packed3 =
BitCast(d16,
LoadU(d8, packed_in + 3 * N8));
438 const VU16 packed4 =
BitCast(d16,
LoadU(d8, packed_in + 4 * N8));
439 const VU16 packed5 =
BitCast(d16,
LoadU(d8, packed_in + 5 * N8));
441 const VU16 raw0 =
And(packed0, mask);
444 const VU16 raw1 =
And(packed1, mask);
447 const VU16 raw2 =
And(packed2, mask);
450 const VU16 raw4 =
And(packed3, mask);
453 const VU16 raw5 =
And(packed4, mask);
456 const VU16 raw6 =
And(packed5, mask);
477 using VU16 =
Vec<
decltype(d16)>;
478 const size_t N8 =
Lanes(d8);
479 const VU16 raw0 =
BitCast(d16,
LoadU(d8, raw + 0 * N8));
480 const VU16 raw1 =
BitCast(d16,
LoadU(d8, raw + 1 * N8));
481 const VU16 raw2 =
BitCast(d16,
LoadU(d8, raw + 2 * N8));
482 const VU16 raw3 =
BitCast(d16,
LoadU(d8, raw + 3 * N8));
483 const VU16 raw4 =
BitCast(d16,
LoadU(d8, raw + 4 * N8));
484 const VU16 raw5 =
BitCast(d16,
LoadU(d8, raw + 5 * N8));
485 const VU16 raw6 =
BitCast(d16,
LoadU(d8, raw + 6 * N8));
487 const VU16 raw7 =
BitCast(d16,
LoadU(d8, raw + 7 * N8));
489 const VU16 hi1 =
Set(d16, 0x8080u);
490 const VU16 packed0 =
OrAnd(raw0,
Add(raw7, raw7), hi1);
511 using VU16 =
Vec<
decltype(d16)>;
512 const size_t N8 =
Lanes(d8);
514 const VU16 packed0 =
BitCast(d16,
LoadU(d8, packed_in + 0 * N8));
515 const VU16 packed1 =
BitCast(d16,
LoadU(d8, packed_in + 1 * N8));
516 const VU16 packed2 =
BitCast(d16,
LoadU(d8, packed_in + 2 * N8));
517 const VU16 packed3 =
BitCast(d16,
LoadU(d8, packed_in + 3 * N8));
518 const VU16 packed4 =
BitCast(d16,
LoadU(d8, packed_in + 4 * N8));
519 const VU16 packed5 =
BitCast(d16,
LoadU(d8, packed_in + 5 * N8));
520 const VU16 packed6 =
BitCast(d16,
LoadU(d8, packed_in + 6 * N8));
522 const VU16 mask =
Set(d16, 0x7F7Fu);
524 const VU16 raw0 =
And(packed0, mask);
527 const VU16 raw1 =
And(packed1, mask);
530 const VU16 raw2 =
And(packed2, mask);
533 const VU16 raw3 =
And(packed3, mask);
536 const VU16 raw4 =
And(packed4, mask);
539 const VU16 raw5 =
And(packed5, mask);
542 const VU16 raw6 =
And(packed6, mask);
561 using VU8 =
Vec<
decltype(d8)>;
562 const size_t N8 =
Lanes(d8);
563 const VU8 raw0 =
LoadU(d8, raw + 0 * N8);
564 const VU8 raw1 =
LoadU(d8, raw + 1 * N8);
565 const VU8 raw2 =
LoadU(d8, raw + 2 * N8);
566 const VU8 raw3 =
LoadU(d8, raw + 3 * N8);
567 const VU8 raw4 =
LoadU(d8, raw + 4 * N8);
568 const VU8 raw5 =
LoadU(d8, raw + 5 * N8);
569 const VU8 raw6 =
LoadU(d8, raw + 6 * N8);
570 const VU8 raw7 =
LoadU(d8, raw + 7 * N8);
572 StoreU(raw0, d8, packed_out + 0 * N8);
573 StoreU(raw1, d8, packed_out + 1 * N8);
574 StoreU(raw2, d8, packed_out + 2 * N8);
575 StoreU(raw3, d8, packed_out + 3 * N8);
576 StoreU(raw4, d8, packed_out + 4 * N8);
577 StoreU(raw5, d8, packed_out + 5 * N8);
578 StoreU(raw6, d8, packed_out + 6 * N8);
579 StoreU(raw7, d8, packed_out + 7 * N8);
585 using VU8 =
Vec<
decltype(d8)>;
586 const size_t N8 =
Lanes(d8);
587 const VU8 raw0 =
LoadU(d8, packed_in + 0 * N8);
588 const VU8 raw1 =
LoadU(d8, packed_in + 1 * N8);
589 const VU8 raw2 =
LoadU(d8, packed_in + 2 * N8);
590 const VU8 raw3 =
LoadU(d8, packed_in + 3 * N8);
591 const VU8 raw4 =
LoadU(d8, packed_in + 4 * N8);
592 const VU8 raw5 =
LoadU(d8, packed_in + 5 * N8);
593 const VU8 raw6 =
LoadU(d8, packed_in + 6 * N8);
594 const VU8 raw7 =
LoadU(d8, packed_in + 7 * N8);
596 StoreU(raw0, d8, raw + 0 * N8);
597 StoreU(raw1, d8, raw + 1 * N8);
598 StoreU(raw2, d8, raw + 2 * N8);
599 StoreU(raw3, d8, raw + 3 * N8);
600 StoreU(raw4, d8, raw + 4 * N8);
601 StoreU(raw5, d8, raw + 5 * N8);
602 StoreU(raw6, d8, raw + 6 * N8);
603 StoreU(raw7, d8, raw + 7 * N8);
612 using VU16 =
Vec<
decltype(
d)>;
613 const size_t N =
Lanes(
d);
614 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
615 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
616 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
617 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
618 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
619 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
620 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
621 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
622 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
623 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
624 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
625 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
626 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
627 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
628 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
629 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
648 using VU16 =
Vec<
decltype(
d)>;
649 const size_t N =
Lanes(
d);
650 const VU16 mask =
Set(
d, 1u);
652 const VU16 packed =
LoadU(
d, packed_in);
654 const VU16 raw0 =
And(packed, mask);
685 StoreU(rawA,
d, raw + 0xA * N);
688 StoreU(rawB,
d, raw + 0xB * N);
691 StoreU(rawC,
d, raw + 0xC * N);
694 StoreU(rawD,
d, raw + 0xD * N);
697 StoreU(rawE,
d, raw + 0xE * N);
700 StoreU(rawF,
d, raw + 0xF * N);
709 using VU16 =
Vec<
decltype(
d)>;
710 const size_t N =
Lanes(
d);
711 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
712 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
713 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
714 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
715 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
716 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
717 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
718 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
719 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
720 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
721 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
722 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
723 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
724 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
725 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
726 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
738 StoreU(packed0,
d, packed_out + 0 * N);
739 StoreU(packed1,
d, packed_out + 1 * N);
745 using VU16 =
Vec<
decltype(
d)>;
746 const size_t N =
Lanes(
d);
747 const VU16 mask =
Set(
d, 0x3u);
749 const VU16 packed0 =
LoadU(
d, packed_in + 0 * N);
750 const VU16 packed1 =
LoadU(
d, packed_in + 1 * N);
752 const VU16 raw0 =
And(packed0, mask);
755 const VU16 raw1 =
And(packed1, mask);
783 StoreU(rawA,
d, raw + 0xA * N);
786 StoreU(rawB,
d, raw + 0xB * N);
789 StoreU(rawC,
d, raw + 0xC * N);
792 StoreU(rawD,
d, raw + 0xD * N);
795 StoreU(rawE,
d, raw + 0xE * N);
798 StoreU(rawF,
d, raw + 0xF * N);
807 using VU16 =
Vec<
decltype(
d)>;
808 const size_t N =
Lanes(
d);
809 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
810 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
811 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
812 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
813 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
814 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
815 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
816 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
817 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
818 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
819 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
820 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
821 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
822 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
823 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
824 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
836 const VU16 hi1 =
Set(
d, 0x8000u);
840 StoreU(packed0,
d, packed_out + 0 * N);
841 StoreU(packed1,
d, packed_out + 1 * N);
842 StoreU(packed2,
d, packed_out + 2 * N);
848 using VU16 =
Vec<
decltype(
d)>;
849 const size_t N =
Lanes(
d);
850 const VU16 mask =
Set(
d, 0x7u);
852 const VU16 packed0 =
LoadU(
d, packed_in + 0 * N);
853 const VU16 packed1 =
LoadU(
d, packed_in + 1 * N);
854 const VU16 packed2 =
LoadU(
d, packed_in + 2 * N);
856 const VU16 raw0 =
And(mask, packed0);
859 const VU16 raw1 =
And(mask, packed1);
862 const VU16 raw2 =
And(mask, packed2);
887 StoreU(rawA,
d, raw + 0xA * N);
890 StoreU(rawB,
d, raw + 0xB * N);
893 StoreU(rawC,
d, raw + 0xC * N);
896 StoreU(rawD,
d, raw + 0xD * N);
899 StoreU(rawE,
d, raw + 0xE * N);
906 StoreU(rawF,
d, raw + 0xF * N);
915 using VU16 =
Vec<
decltype(
d)>;
916 const size_t N =
Lanes(
d);
917 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
918 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
919 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
920 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
921 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
922 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
923 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
924 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
925 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
926 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
927 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
928 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
929 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
930 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
931 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
932 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
943 StoreU(packed0,
d, packed_out + 0 * N);
944 StoreU(packed1,
d, packed_out + 1 * N);
945 StoreU(packed2,
d, packed_out + 2 * N);
946 StoreU(packed3,
d, packed_out + 3 * N);
952 using VU16 =
Vec<
decltype(
d)>;
953 const size_t N =
Lanes(
d);
954 const VU16 mask =
Set(
d, 0xFu);
956 const VU16 packed0 =
LoadU(
d, packed_in + 0 * N);
957 const VU16 packed1 =
LoadU(
d, packed_in + 1 * N);
958 const VU16 packed2 =
LoadU(
d, packed_in + 2 * N);
959 const VU16 packed3 =
LoadU(
d, packed_in + 3 * N);
961 const VU16 raw0 =
And(packed0, mask);
964 const VU16 raw1 =
And(packed1, mask);
985 const VU16 raw8 =
And(packed2, mask);
988 const VU16 raw9 =
And(packed3, mask);
992 StoreU(rawA,
d, raw + 0xA * N);
995 StoreU(rawB,
d, raw + 0xB * N);
998 StoreU(rawC,
d, raw + 0xC * N);
1001 StoreU(rawD,
d, raw + 0xD * N);
1004 StoreU(rawE,
d, raw + 0xE * N);
1007 StoreU(rawF,
d, raw + 0xF * N);
1016 using VU16 =
Vec<
decltype(
d)>;
1017 const size_t N =
Lanes(
d);
1018 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
1019 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
1020 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
1021 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
1022 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
1023 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
1024 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
1025 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
1026 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
1027 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
1028 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
1029 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
1030 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
1031 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
1032 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
1033 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
1043 const VU16 hi1 =
Set(
d, 0x8000u);
1050 StoreU(packed0,
d, packed_out + 0 * N);
1051 StoreU(packed1,
d, packed_out + 1 * N);
1052 StoreU(packed2,
d, packed_out + 2 * N);
1053 StoreU(packed3,
d, packed_out + 3 * N);
1054 StoreU(packed4,
d, packed_out + 4 * N);
1060 using VU16 =
Vec<
decltype(
d)>;
1061 const size_t N =
Lanes(
d);
1063 const VU16 packed0 =
LoadU(
d, packed_in + 0 * N);
1064 const VU16 packed1 =
LoadU(
d, packed_in + 1 * N);
1065 const VU16 packed2 =
LoadU(
d, packed_in + 2 * N);
1066 const VU16 packed3 =
LoadU(
d, packed_in + 3 * N);
1067 const VU16 packed4 =
LoadU(
d, packed_in + 4 * N);
1069 const VU16 mask =
Set(
d, 0x1Fu);
1071 const VU16 raw0 =
And(packed0, mask);
1074 const VU16 raw1 =
And(packed1, mask);
1077 const VU16 raw2 =
And(packed2, mask);
1080 const VU16 raw3 =
And(packed3, mask);
1083 const VU16 raw4 =
And(packed4, mask);
1102 StoreU(rawA,
d, raw + 0xA * N);
1105 StoreU(rawB,
d, raw + 0xB * N);
1108 StoreU(rawC,
d, raw + 0xC * N);
1111 StoreU(rawD,
d, raw + 0xD * N);
1114 StoreU(rawE,
d, raw + 0xE * N);
1119 const VU16 hi1 =
Set(
d, 0x8000u);
1124 StoreU(rawF,
d, raw + 0xF * N);
1133 using VU16 =
Vec<
decltype(
d)>;
1134 const size_t N =
Lanes(
d);
1135 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
1136 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
1137 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
1138 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
1139 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
1140 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
1141 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
1142 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
1143 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
1144 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
1145 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
1146 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
1147 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
1148 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
1149 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
1150 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
1163 const VU16 hi4 =
Set(
d, 0xF000u);
1169 StoreU(packed0,
d, packed_out + 0 * N);
1170 StoreU(packed1,
d, packed_out + 1 * N);
1171 StoreU(packed2,
d, packed_out + 2 * N);
1172 StoreU(packed4,
d, packed_out + 3 * N);
1173 StoreU(packed5,
d, packed_out + 4 * N);
1174 StoreU(packed6,
d, packed_out + 5 * N);
1180 using VU16 =
Vec<
decltype(
d)>;
1181 const size_t N =
Lanes(
d);
1182 const VU16 mask =
Set(
d, 0x3Fu);
1184 const VU16 packed0 =
LoadU(
d, packed_in + 0 * N);
1185 const VU16 packed1 =
LoadU(
d, packed_in + 1 * N);
1186 const VU16 packed2 =
LoadU(
d, packed_in + 2 * N);
1187 const VU16 packed4 =
LoadU(
d, packed_in + 3 * N);
1188 const VU16 packed5 =
LoadU(
d, packed_in + 4 * N);
1189 const VU16 packed6 =
LoadU(
d, packed_in + 5 * N);
1191 const VU16 raw0 =
And(packed0, mask);
1194 const VU16 raw1 =
And(packed1, mask);
1197 const VU16 raw2 =
And(packed2, mask);
1209 const VU16 raw8 =
And(packed4, mask);
1212 const VU16 raw9 =
And(packed5, mask);
1215 const VU16 rawA =
And(packed6, mask);
1216 StoreU(rawA,
d, raw + 0xA * N);
1219 StoreU(rawC,
d, raw + 0xC * N);
1222 StoreU(rawD,
d, raw + 0xD * N);
1225 StoreU(rawE,
d, raw + 0xE * N);
1230 const VU16 hi4 =
Set(
d, 0xF000u);
1235 const VU16 raw3 =
And(packed3, mask);
1238 const VU16 rawB =
And(packed7, mask);
1239 StoreU(rawB,
d, raw + 0xB * N);
1245 StoreU(rawF,
d, raw + 0xF * N);
1254 using VU16 =
Vec<
decltype(
d)>;
1255 const size_t N =
Lanes(
d);
1256 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
1257 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
1258 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
1259 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
1260 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
1261 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
1262 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
1263 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
1264 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
1265 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
1266 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
1267 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
1268 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
1269 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
1270 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
1271 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
1284 const VU16 hi2 =
Set(
d, 0xC000u);
1292 StoreU(packed0,
d, packed_out + 0 * N);
1293 StoreU(packed1,
d, packed_out + 1 * N);
1294 StoreU(packed2,
d, packed_out + 2 * N);
1295 StoreU(packed3,
d, packed_out + 3 * N);
1296 StoreU(packed4,
d, packed_out + 4 * N);
1297 StoreU(packed5,
d, packed_out + 5 * N);
1298 StoreU(packed6,
d, packed_out + 6 * N);
1304 using VU16 =
Vec<
decltype(
d)>;
1305 const size_t N =
Lanes(
d);
1315 const VU16 mask =
Set(
d, 0x7Fu);
1317 const VU16 raw0 =
And(packed0, mask);
1320 const VU16 raw1 =
And(packed1, mask);
1323 const VU16 raw2 =
And(packed2, mask);
1326 const VU16 raw3 =
And(packed3, mask);
1329 const VU16 raw4 =
And(packed4, mask);
1332 const VU16 raw5 =
And(packed5, mask);
1335 const VU16 raw6 =
And(packed6, mask);
1345 StoreU(rawA,
d, raw + 0xA * N);
1348 StoreU(rawB,
d, raw + 0xB * N);
1351 StoreU(rawC,
d, raw + 0xC * N);
1354 StoreU(rawD,
d, raw + 0xD * N);
1357 StoreU(rawE,
d, raw + 0xE * N);
1361 const VU16 hi2 =
Set(
d, 0xC000u);
1369 const VU16 raw7 =
And(packed7, mask);
1373 StoreU(rawF,
d, raw + 0xF * N);
1382 using VU16 =
Vec<
decltype(
d)>;
1383 const size_t N =
Lanes(
d);
1384 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
1385 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
1386 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
1387 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
1388 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
1389 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
1390 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
1391 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
1392 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
1393 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
1394 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
1395 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
1396 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
1397 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
1398 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
1399 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
1412 StoreU(packed0,
d, packed_out + 0 * N);
1413 StoreU(packed1,
d, packed_out + 1 * N);
1414 StoreU(packed2,
d, packed_out + 2 * N);
1415 StoreU(packed3,
d, packed_out + 3 * N);
1416 StoreU(packed4,
d, packed_out + 4 * N);
1417 StoreU(packed5,
d, packed_out + 5 * N);
1418 StoreU(packed6,
d, packed_out + 6 * N);
1419 StoreU(packed7,
d, packed_out + 7 * N);
1425 using VU16 =
Vec<
decltype(
d)>;
1426 const size_t N =
Lanes(
d);
1436 const VU16 mask =
Set(
d, 0xFFu);
1438 const VU16 raw0 =
And(packed0, mask);
1441 const VU16 raw1 =
And(packed1, mask);
1450 const VU16 raw4 =
And(packed2, mask);
1453 const VU16 raw5 =
And(packed3, mask);
1462 const VU16 raw8 =
And(packed4, mask);
1465 const VU16 raw9 =
And(packed5, mask);
1469 StoreU(rawA,
d, raw + 0xA * N);
1472 StoreU(rawB,
d, raw + 0xB * N);
1474 const VU16 rawC =
And(packed6, mask);
1475 StoreU(rawC,
d, raw + 0xC * N);
1477 const VU16 rawD =
And(packed7, mask);
1478 StoreU(rawD,
d, raw + 0xD * N);
1481 StoreU(rawE,
d, raw + 0xE * N);
1484 StoreU(rawF,
d, raw + 0xF * N);
1493 using VU16 =
Vec<
decltype(
d)>;
1494 const size_t N =
Lanes(
d);
1495 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
1496 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
1497 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
1498 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
1499 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
1500 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
1501 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
1502 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
1503 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
1504 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
1505 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
1506 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
1507 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
1508 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
1509 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
1510 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
1524 const VU16 mid2 =
Set(
d, 0x180u);
1533 const VU16 packed8 =
Xor3(
Xor3(part8, part9, partA),
1534 Xor3(partB, partC, partD),
Or(partE, partF));
1536 StoreU(packed0,
d, packed_out + 0 * N);
1537 StoreU(packed1,
d, packed_out + 1 * N);
1538 StoreU(packed2,
d, packed_out + 2 * N);
1539 StoreU(packed3,
d, packed_out + 3 * N);
1540 StoreU(packed4,
d, packed_out + 4 * N);
1541 StoreU(packed5,
d, packed_out + 5 * N);
1542 StoreU(packed6,
d, packed_out + 6 * N);
1543 StoreU(packed7,
d, packed_out + 7 * N);
1544 StoreU(packed8,
d, packed_out + 8 * N);
1550 using VU16 =
Vec<
decltype(
d)>;
1551 const size_t N =
Lanes(
d);
1563 const VU16 mask =
Set(
d, 0x1FFu);
1565 const VU16 raw0 =
And(packed0, mask);
1568 const VU16 raw1 =
And(packed1, mask);
1571 const VU16 raw2 =
And(packed2, mask);
1574 const VU16 raw3 =
And(packed3, mask);
1577 const VU16 raw4 =
And(packed4, mask);
1580 const VU16 raw5 =
And(packed5, mask);
1583 const VU16 raw6 =
And(packed6, mask);
1586 const VU16 raw7 =
And(packed7, mask);
1589 const VU16 mid2 =
Set(
d, 0x180u);
1609 StoreU(rawA,
d, raw + 0xA * N);
1610 StoreU(rawB,
d, raw + 0xB * N);
1611 StoreU(rawC,
d, raw + 0xC * N);
1612 StoreU(rawD,
d, raw + 0xD * N);
1613 StoreU(rawE,
d, raw + 0xE * N);
1614 StoreU(rawF,
d, raw + 0xF * N);
1623 using VU16 =
Vec<
decltype(
d)>;
1624 const size_t N =
Lanes(
d);
1625 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
1626 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
1627 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
1628 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
1629 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
1630 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
1631 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
1632 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
1633 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
1634 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
1635 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
1636 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
1637 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
1638 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
1639 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
1640 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
1656 const VU16 mid4 =
Set(
d, 0x3C0u);
1665 const VU16 packed8 =
Or(
Xor3(part8, part9, partA), partB);
1666 const VU16 packed9 =
Or(
Xor3(partC, partD, partE), partF);
1668 StoreU(packed0,
d, packed_out + 0 * N);
1669 StoreU(packed1,
d, packed_out + 1 * N);
1670 StoreU(packed2,
d, packed_out + 2 * N);
1671 StoreU(packed3,
d, packed_out + 3 * N);
1672 StoreU(packed4,
d, packed_out + 4 * N);
1673 StoreU(packed5,
d, packed_out + 5 * N);
1674 StoreU(packed6,
d, packed_out + 6 * N);
1675 StoreU(packed7,
d, packed_out + 7 * N);
1676 StoreU(packed8,
d, packed_out + 8 * N);
1677 StoreU(packed9,
d, packed_out + 9 * N);
1683 using VU16 =
Vec<
decltype(
d)>;
1684 const size_t N =
Lanes(
d);
1697 const VU16 mask =
Set(
d, 0x3FFu);
1699 const VU16 raw0 =
And(packed0, mask);
1702 const VU16 raw1 =
And(packed1, mask);
1705 const VU16 raw2 =
And(packed2, mask);
1708 const VU16 raw3 =
And(packed3, mask);
1711 const VU16 raw4 =
And(packed4, mask);
1714 const VU16 raw5 =
And(packed5, mask);
1717 const VU16 raw6 =
And(packed6, mask);
1720 const VU16 raw7 =
And(packed7, mask);
1723 const VU16 mid4 =
Set(
d, 0x3C0u);
1743 StoreU(rawA,
d, raw + 0xA * N);
1744 StoreU(rawB,
d, raw + 0xB * N);
1745 StoreU(rawC,
d, raw + 0xC * N);
1746 StoreU(rawD,
d, raw + 0xD * N);
1747 StoreU(rawE,
d, raw + 0xE * N);
1748 StoreU(rawF,
d, raw + 0xF * N);
1757 using VU16 =
Vec<
decltype(
d)>;
1758 const size_t N =
Lanes(
d);
1759 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
1760 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
1761 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
1762 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
1763 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
1764 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
1765 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
1766 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
1767 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
1768 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
1769 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
1770 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
1771 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
1772 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
1773 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
1774 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
1780 const VU16 lo8 =
Set(
d, 0xFFu);
1792 StoreU(packed0,
d, packed_out + 0 * N);
1793 StoreU(packed1,
d, packed_out + 1 * N);
1794 StoreU(packed2,
d, packed_out + 2 * N);
1795 StoreU(packed3,
d, packed_out + 3 * N);
1796 StoreU(packed4,
d, packed_out + 4 * N);
1797 StoreU(packed5,
d, packed_out + 5 * N);
1798 StoreU(packed6,
d, packed_out + 6 * N);
1799 StoreU(packed7,
d, packed_out + 7 * N);
1807 VU16 next =
Set(
d, 0x38u);
1816 packed8 =
OrAnd(packed8,
Add(raw9, raw9), next);
1817 packed9 =
OrAnd(packed9,
Add(rawA, rawA), next);
1818 packedA =
OrAnd(packedA,
Add(rawB, rawB), next);
1830 StoreU(packed8,
d, packed_out + 8 * N);
1831 StoreU(packed9,
d, packed_out + 9 * N);
1832 StoreU(packedA,
d, packed_out + 0xA * N);
1838 using VU16 =
Vec<
decltype(
d)>;
1839 const size_t N =
Lanes(
d);
1853 const VU16 mask =
Set(
d, 0xFFu);
1855 const VU16 down0 =
And(packed0, mask);
1857 const VU16 down2 =
And(packed1, mask);
1859 const VU16 down4 =
And(packed2, mask);
1861 const VU16 down6 =
And(packed3, mask);
1863 const VU16 down8 =
And(packed4, mask);
1865 const VU16 downA =
And(packed5, mask);
1867 const VU16 downC =
And(packed6, mask);
1869 const VU16 downE =
And(packed7, mask);
1873 const VU16 hi3 =
Set(
d, 0x700u);
1909 StoreU(rawA,
d, raw + 0xA * N);
1910 StoreU(rawB,
d, raw + 0xB * N);
1911 StoreU(rawC,
d, raw + 0xC * N);
1912 StoreU(rawD,
d, raw + 0xD * N);
1913 StoreU(rawE,
d, raw + 0xE * N);
1914 StoreU(rawF,
d, raw + 0xF * N);
1923 using VU16 =
Vec<
decltype(
d)>;
1924 const size_t N =
Lanes(
d);
1925 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
1926 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
1927 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
1928 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
1929 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
1930 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
1931 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
1932 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
1933 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
1934 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
1935 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
1936 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
1937 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
1938 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
1939 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
1940 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
1954 const VU16 hi8 =
Set(
d, 0xFF00u);
1959 StoreU(packed0,
d, packed_out + 0 * N);
1960 StoreU(packed1,
d, packed_out + 1 * N);
1961 StoreU(packed2,
d, packed_out + 2 * N);
1962 StoreU(packed3,
d, packed_out + 3 * N);
1963 StoreU(packed4,
d, packed_out + 4 * N);
1964 StoreU(packed5,
d, packed_out + 5 * N);
1965 StoreU(packed6,
d, packed_out + 6 * N);
1966 StoreU(packed7,
d, packed_out + 7 * N);
1967 StoreU(packed8,
d, packed_out + 8 * N);
1968 StoreU(packed9,
d, packed_out + 9 * N);
1969 StoreU(packedA,
d, packed_out + 0xA * N);
1970 StoreU(packedB,
d, packed_out + 0xB * N);
1976 using VU16 =
Vec<
decltype(
d)>;
1977 const size_t N =
Lanes(
d);
1992 const VU16 mask =
Set(
d, 0xFFFu);
1994 const VU16 raw0 =
And(packed0, mask);
1997 const VU16 raw1 =
And(packed1, mask);
2000 const VU16 raw2 =
And(packed2, mask);
2003 const VU16 raw3 =
And(packed3, mask);
2006 const VU16 raw4 =
And(packed4, mask);
2009 const VU16 raw5 =
And(packed5, mask);
2012 const VU16 raw6 =
And(packed6, mask);
2015 const VU16 raw7 =
And(packed7, mask);
2018 const VU16 mid8 =
Set(
d, 0xFF0u);
2037 StoreU(rawA,
d, raw + 0xA * N);
2038 StoreU(rawB,
d, raw + 0xB * N);
2039 StoreU(rawC,
d, raw + 0xC * N);
2040 StoreU(rawD,
d, raw + 0xD * N);
2041 StoreU(rawE,
d, raw + 0xE * N);
2042 StoreU(rawF,
d, raw + 0xF * N);
2051 using VU16 =
Vec<
decltype(
d)>;
2052 const size_t N =
Lanes(
d);
2053 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
2054 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
2055 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
2056 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
2057 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
2058 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
2059 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
2060 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
2061 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
2062 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
2063 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
2064 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
2065 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
2066 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
2067 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
2068 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
2072 const VU16 lo8 =
Set(
d, 0xFFu);
2084 StoreU(packed0,
d, packed_out + 0 * N);
2085 StoreU(packed1,
d, packed_out + 1 * N);
2086 StoreU(packed2,
d, packed_out + 2 * N);
2087 StoreU(packed3,
d, packed_out + 3 * N);
2088 StoreU(packed4,
d, packed_out + 4 * N);
2089 StoreU(packed5,
d, packed_out + 5 * N);
2090 StoreU(packed6,
d, packed_out + 6 * N);
2091 StoreU(packed7,
d, packed_out + 7 * N);
2102 VU16 next =
Set(
d, 0x3E0u);
2123 StoreU(packed8,
d, packed_out + 8 * N);
2124 StoreU(packed9,
d, packed_out + 9 * N);
2125 StoreU(packedA,
d, packed_out + 0xA * N);
2126 StoreU(packedB,
d, packed_out + 0xB * N);
2127 StoreU(packedC,
d, packed_out + 0xC * N);
2133 using VU16 =
Vec<
decltype(
d)>;
2134 const size_t N =
Lanes(
d);
2150 const VU16 mask =
Set(
d, 0xFFu);
2152 const VU16 down0 =
And(packed0, mask);
2154 const VU16 down2 =
And(packed1, mask);
2156 const VU16 down4 =
And(packed2, mask);
2158 const VU16 down6 =
And(packed3, mask);
2160 const VU16 down8 =
And(packed4, mask);
2162 const VU16 downA =
And(packed5, mask);
2164 const VU16 downC =
And(packed6, mask);
2166 const VU16 downE =
And(packed7, mask);
2170 const VU16 hi5 =
Set(
d, 0x1F00u);
2195 const VU16 rawF =
Or(p0, p1);
2207 StoreU(rawA,
d, raw + 0xA * N);
2208 StoreU(rawB,
d, raw + 0xB * N);
2209 StoreU(rawC,
d, raw + 0xC * N);
2210 StoreU(rawD,
d, raw + 0xD * N);
2211 StoreU(rawE,
d, raw + 0xE * N);
2212 StoreU(rawF,
d, raw + 0xF * N);
2221 using VU16 =
Vec<
decltype(
d)>;
2222 const size_t N =
Lanes(
d);
2223 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
2224 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
2225 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
2226 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
2227 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
2228 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
2229 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
2230 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
2231 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
2232 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
2233 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
2234 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
2235 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
2236 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
2237 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
2238 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
2242 const VU16 hi2 =
Set(
d, 0xC000u);
2258 StoreU(packed0,
d, packed_out + 0 * N);
2259 StoreU(packed1,
d, packed_out + 1 * N);
2260 StoreU(packed2,
d, packed_out + 2 * N);
2261 StoreU(packed3,
d, packed_out + 3 * N);
2262 StoreU(packed4,
d, packed_out + 4 * N);
2263 StoreU(packed5,
d, packed_out + 5 * N);
2264 StoreU(packed6,
d, packed_out + 6 * N);
2265 StoreU(packed7,
d, packed_out + 7 * N);
2266 StoreU(packed8,
d, packed_out + 8 * N);
2267 StoreU(packed9,
d, packed_out + 9 * N);
2268 StoreU(packedA,
d, packed_out + 0xA * N);
2269 StoreU(packedB,
d, packed_out + 0xB * N);
2270 StoreU(packedC,
d, packed_out + 0xC * N);
2271 StoreU(packedD,
d, packed_out + 0xD * N);
2277 using VU16 =
Vec<
decltype(
d)>;
2278 const size_t N =
Lanes(
d);
2295 const VU16 mask =
Set(
d, 0x3FFFu);
2297 const VU16 raw0 =
And(packed0, mask);
2300 const VU16 raw1 =
And(packed1, mask);
2303 const VU16 raw2 =
And(packed2, mask);
2306 const VU16 raw3 =
And(packed3, mask);
2309 const VU16 raw4 =
And(packed4, mask);
2312 const VU16 raw5 =
And(packed5, mask);
2315 const VU16 raw6 =
And(packed6, mask);
2318 const VU16 raw7 =
And(packed7, mask);
2321 const VU16 raw8 =
And(packed8, mask);
2324 const VU16 raw9 =
And(packed9, mask);
2327 const VU16 rawA =
And(packedA, mask);
2328 StoreU(rawA,
d, raw + 0xA * N);
2330 const VU16 rawB =
And(packedB, mask);
2331 StoreU(rawB,
d, raw + 0xB * N);
2333 const VU16 rawC =
And(packedC, mask);
2334 StoreU(rawC,
d, raw + 0xC * N);
2336 const VU16 rawD =
And(packedD, mask);
2337 StoreU(rawD,
d, raw + 0xD * N);
2354 StoreU(rawE,
d, raw + 0xE * N);
2355 StoreU(rawF,
d, raw + 0xF * N);
2364 using VU16 =
Vec<
decltype(
d)>;
2365 const size_t N =
Lanes(
d);
2366 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
2367 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
2368 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
2369 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
2370 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
2371 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
2372 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
2373 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
2374 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
2375 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
2376 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
2377 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
2378 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
2379 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
2380 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
2381 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
2385 const VU16 hi1 =
Set(
d, 0x8000u);
2402 StoreU(packed0,
d, packed_out + 0 * N);
2403 StoreU(packed1,
d, packed_out + 1 * N);
2404 StoreU(packed2,
d, packed_out + 2 * N);
2405 StoreU(packed3,
d, packed_out + 3 * N);
2406 StoreU(packed4,
d, packed_out + 4 * N);
2407 StoreU(packed5,
d, packed_out + 5 * N);
2408 StoreU(packed6,
d, packed_out + 6 * N);
2409 StoreU(packed7,
d, packed_out + 7 * N);
2410 StoreU(packed8,
d, packed_out + 8 * N);
2411 StoreU(packed9,
d, packed_out + 9 * N);
2412 StoreU(packedA,
d, packed_out + 0xA * N);
2413 StoreU(packedB,
d, packed_out + 0xB * N);
2414 StoreU(packedC,
d, packed_out + 0xC * N);
2415 StoreU(packedD,
d, packed_out + 0xD * N);
2416 StoreU(packedE,
d, packed_out + 0xE * N);
2422 using VU16 =
Vec<
decltype(
d)>;
2423 const size_t N =
Lanes(
d);
2441 const VU16 mask =
Set(
d, 0x7FFFu);
2443 const VU16 raw0 =
And(packed0, mask);
2446 const VU16 raw1 =
And(packed1, mask);
2449 const VU16 raw2 =
And(packed2, mask);
2452 const VU16 raw3 =
And(packed3, mask);
2455 const VU16 raw4 =
And(packed4, mask);
2458 const VU16 raw5 =
And(packed5, mask);
2461 const VU16 raw6 =
And(packed6, mask);
2464 const VU16 raw7 =
And(packed7, mask);
2467 const VU16 raw8 =
And(packed8, mask);
2470 const VU16 raw9 =
And(packed9, mask);
2473 const VU16 rawA =
And(packedA, mask);
2474 StoreU(rawA,
d, raw + 0xA * N);
2476 const VU16 rawB =
And(packedB, mask);
2477 StoreU(rawB,
d, raw + 0xB * N);
2479 const VU16 rawC =
And(packedC, mask);
2480 StoreU(rawC,
d, raw + 0xC * N);
2482 const VU16 rawD =
And(packedD, mask);
2483 StoreU(rawD,
d, raw + 0xD * N);
2485 const VU16 rawE =
And(packedE, mask);
2486 StoreU(rawE,
d, raw + 0xE * N);
2504 const VU16 rawF =
Xor3(F0, F1,
Xor3(F2, F3, F4));
2505 StoreU(rawF,
d, raw + 0xF * N);
2514 using VU16 =
Vec<
decltype(
d)>;
2515 const size_t N =
Lanes(
d);
2516 const VU16 raw0 =
LoadU(
d, raw + 0 * N);
2517 const VU16 raw1 =
LoadU(
d, raw + 1 * N);
2518 const VU16 raw2 =
LoadU(
d, raw + 2 * N);
2519 const VU16 raw3 =
LoadU(
d, raw + 3 * N);
2520 const VU16 raw4 =
LoadU(
d, raw + 4 * N);
2521 const VU16 raw5 =
LoadU(
d, raw + 5 * N);
2522 const VU16 raw6 =
LoadU(
d, raw + 6 * N);
2523 const VU16 raw7 =
LoadU(
d, raw + 7 * N);
2524 const VU16 raw8 =
LoadU(
d, raw + 8 * N);
2525 const VU16 raw9 =
LoadU(
d, raw + 9 * N);
2526 const VU16 rawA =
LoadU(
d, raw + 0xA * N);
2527 const VU16 rawB =
LoadU(
d, raw + 0xB * N);
2528 const VU16 rawC =
LoadU(
d, raw + 0xC * N);
2529 const VU16 rawD =
LoadU(
d, raw + 0xD * N);
2530 const VU16 rawE =
LoadU(
d, raw + 0xE * N);
2531 const VU16 rawF =
LoadU(
d, raw + 0xF * N);
2533 StoreU(raw0,
d, packed_out + 0 * N);
2534 StoreU(raw1,
d, packed_out + 1 * N);
2535 StoreU(raw2,
d, packed_out + 2 * N);
2536 StoreU(raw3,
d, packed_out + 3 * N);
2537 StoreU(raw4,
d, packed_out + 4 * N);
2538 StoreU(raw5,
d, packed_out + 5 * N);
2539 StoreU(raw6,
d, packed_out + 6 * N);
2540 StoreU(raw7,
d, packed_out + 7 * N);
2541 StoreU(raw8,
d, packed_out + 8 * N);
2542 StoreU(raw9,
d, packed_out + 9 * N);
2543 StoreU(rawA,
d, packed_out + 0xA * N);
2544 StoreU(rawB,
d, packed_out + 0xB * N);
2545 StoreU(rawC,
d, packed_out + 0xC * N);
2546 StoreU(rawD,
d, packed_out + 0xD * N);
2547 StoreU(rawE,
d, packed_out + 0xE * N);
2548 StoreU(rawF,
d, packed_out + 0xF * N);
2554 using VU16 =
Vec<
decltype(
d)>;
2555 const size_t N =
Lanes(
d);
2584 StoreU(rawA,
d, raw + 0xA * N);
2585 StoreU(rawB,
d, raw + 0xB * N);
2586 StoreU(rawC,
d, raw + 0xC * N);
2587 StoreU(rawD,
d, raw + 0xD * N);
2588 StoreU(rawE,
d, raw + 0xE * N);
2589 StoreU(rawF,
d, raw + 0xF * N);
#define HWY_RESTRICT
Definition base.h:95
#define HWY_INLINE
Definition base.h:101
D d
Definition arm_sve-inl.h:1915
HWY_API VFromD< D > BitCast(D d, Vec128< FromT, Repartition< FromT, D >().MaxLanes()> v)
Definition arm_neon-inl.h:1581
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2766
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:474
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API V Add(V a, V b)
Definition generic_ops-inl.h:7300
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
decltype(Zero(D())) Vec
Definition generic_ops-inl.h:46
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1681
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1621
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1755
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1836
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1974
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1921
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:2049
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:2131
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:2275
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:2219
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:2362
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:2420
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:2552
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:2512
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:646
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:610
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:707
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:743
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:846
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:805
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:950
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:913
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1058
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1014
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1178
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1131
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1302
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1252
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1380
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1423
HWY_INLINE void Unpack(D d, const uint16_t *HWY_RESTRICT packed_in, uint16_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:1548
HWY_INLINE void Pack(D d, const uint16_t *HWY_RESTRICT raw, uint16_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:1491
Definition bit_pack-inl.h:38
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:66
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:43
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:128
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:104
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:197
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:167
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:270
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:243
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:311
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:346
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:427
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:395
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:474
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:508
HWY_INLINE void Unpack(D8 d8, const uint8_t *HWY_RESTRICT packed_in, uint8_t *HWY_RESTRICT raw) const
Definition bit_pack-inl.h:583
HWY_INLINE void Pack(D8 d8, const uint8_t *HWY_RESTRICT raw, uint8_t *HWY_RESTRICT packed_out) const
Definition bit_pack-inl.h:559
Definition bit_pack-inl.h:36