// Include guard for the compiler-check section.
8#ifndef SIMDUTF_COMPILER_CHECK_H
9#define SIMDUTF_COMPILER_CHECK_H
// Hard failure for non-C++ compilers (the guarding condition is not visible in
// this excerpt).
12 #error simdutf requires a C++ compiler
// SIMDUTF_CPLUSPLUS is the language-standard value tested by the checks below.
// With _MSVC_LANG defined and __clang__ absent it comes from _MSVC_LANG, except
// that _MSC_VER == 1900 is reported as 201103L; the other definition uses
// __cplusplus directly. A user-supplied SIMDUTF_CPLUSPLUS is respected.
15#ifndef SIMDUTF_CPLUSPLUS
16 #if defined(_MSVC_LANG) && !defined(__clang__)
17 #define SIMDUTF_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG)
19 #define SIMDUTF_CPLUSPLUS __cplusplus
// One flag per standard revision: SIMDUTF_CPLUSPLUSxx is set to 1 when
// SIMDUTF_CPLUSPLUS is at least that revision's value and the flag has not
// already been defined.
24#if !defined(SIMDUTF_CPLUSPLUS26) && (SIMDUTF_CPLUSPLUS >= 202602L)
25 #define SIMDUTF_CPLUSPLUS26 1
29#if !defined(SIMDUTF_CPLUSPLUS23) && (SIMDUTF_CPLUSPLUS >= 202302L)
30 #define SIMDUTF_CPLUSPLUS23 1
34#if !defined(SIMDUTF_CPLUSPLUS20) && (SIMDUTF_CPLUSPLUS >= 202002L)
35 #define SIMDUTF_CPLUSPLUS20 1
39#if !defined(SIMDUTF_CPLUSPLUS17) && (SIMDUTF_CPLUSPLUS >= 201703L)
40 #define SIMDUTF_CPLUSPLUS17 1
44#if !defined(SIMDUTF_CPLUSPLUS14) && (SIMDUTF_CPLUSPLUS >= 201402L)
45 #define SIMDUTF_CPLUSPLUS14 1
49#if !defined(SIMDUTF_CPLUSPLUS11) && (SIMDUTF_CPLUSPLUS >= 201103L)
50 #define SIMDUTF_CPLUSPLUS11 1
// C++11 is the minimum: the build fails if the C++11 flag was never set.
53#ifndef SIMDUTF_CPLUSPLUS11
54 #error simdutf requires a compiler compliant with the C++11 standard
// Include guards for the common-definitions and portability sections.
60#ifndef SIMDUTF_COMMON_DEFS_H
61#define SIMDUTF_COMMON_DEFS_H
64#ifndef SIMDUTF_PORTABILITY_H
65#define SIMDUTF_PORTABILITY_H
// Apple toolchains whose __apple_build_version__ is below 14000000 define
// SIMDUTF_SPAN_DISABLED, which prevents SIMDUTF_SPAN from being set below.
77#if defined(__apple_build_version__)
78 #if __apple_build_version__ < 14000000
79 #define SIMDUTF_SPAN_DISABLED \
84#if SIMDUTF_CPLUSPLUS20
86 #if __cpp_concepts >= 201907L && __cpp_lib_span >= 202002L && \
87 !defined(SIMDUTF_SPAN_DISABLED)
// Set when concepts and the library span feature-test macro are recent enough
// and span support was not disabled.
88 #define SIMDUTF_SPAN 1
// SIMDUTF_ATOMIC_REF: set when __cpp_lib_atomic_ref is at least 201806L.
90 #if __cpp_lib_atomic_ref >= 201806L
91 #define SIMDUTF_ATOMIC_REF 1
// SIMDUTF_MAYBE_UNUSED_AVAILABLE: set when the compiler reports the
// maybe_unused attribute with a value of at least 201603L.
93 #if __has_cpp_attribute(maybe_unused) >= 201603L
94 #define SIMDUTF_MAYBE_UNUSED_AVAILABLE 1
// Endianness detection. When the compiler provides __BYTE_ORDER__ and
// __ORDER_BIG_ENDIAN__, SIMDUTF_IS_BIG_ENDIAN is the result of comparing them.
103#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
104 #define SIMDUTF_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
// Alternative definition that hard-codes little-endian (its guarding condition
// is not visible in this excerpt).
106 #define SIMDUTF_IS_BIG_ENDIAN 0
// Remaining platforms: pull in a system header that may supply the byte-order
// macros (machine/endian.h, sys/byteorder.h, or endian.h when available).
108 #if defined(__APPLE__) || \
111 #include <machine/endian.h>
112 #elif defined(sun) || \
114 #include <sys/byteorder.h>
118 #if __has_include(<endian.h>)
// Fallback when the byte-order macros are still unavailable after including the
// platform header: assume little-endian. This must be `#if`, not `#ifndef`:
// `#ifndef` accepts a single macro name, so `#ifndef !defined(...) || ...` is
// ill-formed and is rejected as soon as this branch is actually processed.
125 #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__)
126 #define SIMDUTF_IS_BIG_ENDIAN 0
// Otherwise decide from the macros. If both are undefined they evaluate to 0
// in the comparison, which re-defines the flag to the same value 0 (a benign,
// identical redefinition).
129 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
130 #define SIMDUTF_IS_BIG_ENDIAN 0
132 #define SIMDUTF_IS_BIG_ENDIAN 1
// Visual Studio markers. The conditions that select each one are not visible
// in this excerpt.
// NOTE(review): presumably CLANG_ marks clang-cl and REGULAR_ marks MSVC
// proper - confirm against the full file.
142 #define SIMDUTF_VISUAL_STUDIO 1
155 #define SIMDUTF_CLANG_VISUAL_STUDIO 1
158 #define SIMDUTF_REGULAR_VISUAL_STUDIO 1
162#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
// Target-architecture selection chain. _M_ARM64EC is explicitly excluded from
// the x86-64 branch and handled by the ARM64 branch instead.
169#if (defined(__x86_64__) || defined(_M_AMD64)) && !defined(_M_ARM64EC)
170 #define SIMDUTF_IS_X86_64 1
171#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
172 #define SIMDUTF_IS_ARM64 1
173#elif defined(__PPC64__) || defined(_M_PPC64)
// PPC64 is flagged only when both __VEC__ and __ALTIVEC__ are defined.
174 #if defined(__VEC__) && defined(__ALTIVEC__)
175 #define SIMDUTF_IS_PPC64 1
177#elif defined(__s390__)
// 64-bit RISC-V: requires __riscv_xlen == 64.
179#elif (defined(__riscv) || defined(__riscv__)) && __riscv_xlen == 64
181 #define SIMDUTF_IS_RISCV64 1
// RVV intrinsics are considered available from __riscv_v_intrinsic 11000 on.
188 #if __riscv_v_intrinsic >= 11000
189 #define SIMDUTF_HAS_RVV_INTRINSICS 1
192 #define SIMDUTF_HAS_ZVBB_INTRINSICS \
195 #if SIMDUTF_HAS_RVV_INTRINSICS && __riscv_vector && \
196 __riscv_v_min_vlen >= 128 && __riscv_v_elen >= 64
// RVV is enabled at compile time only with intrinsics, the vector extension,
// a minimum VLEN of 128 and an ELEN of at least 64.
198 #define SIMDUTF_IS_RVV 1
// ZVBB additionally needs its intrinsics and __riscv_zvbb >= 1000000.
199 #if SIMDUTF_HAS_ZVBB_INTRINSICS && __riscv_zvbb >= 1000000
201 #define SIMDUTF_IS_ZVBB 1
// LoongArch (LP64): LSX alone, or LSX and LASX together when both
// __loongarch_sx and __loongarch_asx are defined.
205#elif defined(__loongarch_lp64)
206 #if defined(__loongarch_sx) && defined(__loongarch_asx)
207 #define SIMDUTF_IS_LSX 1
208 #define SIMDUTF_IS_LASX 1
209 #elif defined(__loongarch_sx)
210 #define SIMDUTF_IS_LSX 1
// 32-bit marker, followed by the specific 32-bit family (x86, ARM, PPC).
217 #define SIMDUTF_IS_32BITS 1
221 #if defined(_M_IX86) || defined(__i386__)
222 #define SIMDUTF_IS_X86_32BITS 1
223 #elif defined(__arm__) || defined(_M_ARM)
224 #define SIMDUTF_IS_ARM_32BITS 1
225 #elif defined(__PPC__) || defined(_M_PPC)
226 #define SIMDUTF_IS_PPC_32BITS 1
// On 32-bit targets something is emitted unless the user defines
// SIMDUTF_NO_PORTABILITY_WARNING (the guarded lines are not visible here).
231#ifdef SIMDUTF_IS_32BITS
232 #ifndef SIMDUTF_NO_PORTABILITY_WARNING
// Two-level stringification: the outer macro expands its argument first, the
// inner one turns the expanded tokens into a string literal.
239#define SIMDUTF_STRINGIFY_IMPLEMENTATION_(a) #a
240#define SIMDUTF_STRINGIFY(a) SIMDUTF_STRINGIFY_IMPLEMENTATION_(a)
// SIMDUTF_TARGET_REGION(T) / SIMDUTF_UNTARGET_REGION bracket code that should
// be compiled for target T. Real definitions are provided only for x86-64 and
// LSX builds; the first pair uses clang's attribute push/pop pragmas, the
// second GCC's push_options / target / pop_options pragmas.
258#if defined(SIMDUTF_IS_X86_64) || defined(SIMDUTF_IS_LSX)
264 #define SIMDUTF_TARGET_REGION(T) \
265 _Pragma(SIMDUTF_STRINGIFY(clang attribute push( \
266 __attribute__((target(T))), apply_to = function)))
267 #define SIMDUTF_UNTARGET_REGION _Pragma("clang attribute pop")
268 #elif defined(__GNUC__)
270 #define SIMDUTF_TARGET_REGION(T) \
271 _Pragma("GCC push_options") _Pragma(SIMDUTF_STRINGIFY(GCC target(T)))
272 #define SIMDUTF_UNTARGET_REGION _Pragma("GCC pop_options")
// Everywhere else the two macros expand to nothing.
278#ifndef SIMDUTF_TARGET_REGION
279 #define SIMDUTF_TARGET_REGION(T)
280 #define SIMDUTF_UNTARGET_REGION
// Threading support is switched on when the toolchain defines _REENTRANT or
// _MT, unless SIMDUTF_THREADS_ENABLED is already defined. A later condition
// (not visible in this excerpt) can undefine it again.
284#if defined(_REENTRANT) || defined(_MT)
285 #ifndef SIMDUTF_THREADS_ENABLED
286 #define SIMDUTF_THREADS_ENABLED
299 #undef SIMDUTF_THREADS_ENABLED
// Case-insensitive string comparison aliases: _stricmp/_strnicmp and
// strcasecmp/strncasecmp are the two alternatives selected in this region.
303#ifdef SIMDUTF_VISUAL_STUDIO
308 #define simdutf_strcasecmp _stricmp
309 #define simdutf_strncasecmp _strnicmp
315 #define simdutf_strcasecmp strcasecmp
316 #define simdutf_strncasecmp strncasecmp
// GCC (not clang) version buckets; the version tests themselves are not
// visible in this excerpt.
319#if defined(__GNUC__) && !defined(__clang__)
321 #define SIMDUTF_GCC11ORMORE 1
324 #define SIMDUTF_GCC10 1
327 #define SIMDUTF_GCC9OROLDER 1
// Compile-time AVX-512 feature flags. Each SIMDUTF_HAS_AVX512xxx is set to 1
// when the matching compiler macro __AVX512xxx__ is defined and equal to 1,
// unless the flag was already defined beforehand.
334#ifndef SIMDUTF_AVX512_H_
335#define SIMDUTF_AVX512_H_
346#ifndef SIMDUTF_HAS_AVX512F
347 #if defined(__AVX512F__) && __AVX512F__ == 1
348 #define SIMDUTF_HAS_AVX512F 1
352#ifndef SIMDUTF_HAS_AVX512DQ
353 #if defined(__AVX512DQ__) && __AVX512DQ__ == 1
354 #define SIMDUTF_HAS_AVX512DQ 1
358#ifndef SIMDUTF_HAS_AVX512IFMA
359 #if defined(__AVX512IFMA__) && __AVX512IFMA__ == 1
360 #define SIMDUTF_HAS_AVX512IFMA 1
364#ifndef SIMDUTF_HAS_AVX512CD
365 #if defined(__AVX512CD__) && __AVX512CD__ == 1
366 #define SIMDUTF_HAS_AVX512CD 1
370#ifndef SIMDUTF_HAS_AVX512BW
371 #if defined(__AVX512BW__) && __AVX512BW__ == 1
372 #define SIMDUTF_HAS_AVX512BW 1
376#ifndef SIMDUTF_HAS_AVX512VL
377 #if defined(__AVX512VL__) && __AVX512VL__ == 1
378 #define SIMDUTF_HAS_AVX512VL 1
382#ifndef SIMDUTF_HAS_AVX512VBMI
383 #if defined(__AVX512VBMI__) && __AVX512VBMI__ == 1
384 #define SIMDUTF_HAS_AVX512VBMI 1
388#ifndef SIMDUTF_HAS_AVX512VBMI2
389 #if defined(__AVX512VBMI2__) && __AVX512VBMI2__ == 1
390 #define SIMDUTF_HAS_AVX512VBMI2 1
394#ifndef SIMDUTF_HAS_AVX512VNNI
395 #if defined(__AVX512VNNI__) && __AVX512VNNI__ == 1
396 #define SIMDUTF_HAS_AVX512VNNI 1
400#ifndef SIMDUTF_HAS_AVX512BITALG
401 #if defined(__AVX512BITALG__) && __AVX512BITALG__ == 1
402 #define SIMDUTF_HAS_AVX512BITALG 1
406#ifndef SIMDUTF_HAS_AVX512VPOPCNTDQ
407 #if defined(__AVX512VPOPCNTDQ__) && __AVX512VPOPCNTDQ__ == 1
408 #define SIMDUTF_HAS_AVX512VPOPCNTDQ 1
// Optional diagnostics, active only when SIMDUTF_LOGGING is defined.
// simdutf_log(msg) prints "[function]: msg" followed by a tab-indented
// "file:line" to std::cout; simdutf_log_assert writes the same layout to
// std::cerr (its condition handling is not visible in this excerpt).
// Both macros have empty alternative definitions at the end of this region.
417#ifdef SIMDUTF_LOGGING
419 #define simdutf_log(msg) \
420 std::cout << "[" << __FUNCTION__ << "]: " << msg << std::endl \
421 << "\t" << __FILE__ << ":" << __LINE__ << std::endl;
422 #define simdutf_log_assert(cond, msg) \
425 std::cerr << "[" << __FUNCTION__ << "]: " << msg << std::endl \
426 << "\t" << __FILE__ << ":" << __LINE__ << std::endl; \
431 #define simdutf_log(msg)
432 #define simdutf_log_assert(cond, msg)
// Compiler-specific attribute and warning-control macros.
// First group: definitions used with regular Visual Studio (__declspec,
// __forceinline, __pragma(warning(...))). The branch-hint macros are identity
// here and the unused / warn_unused markers expand to nothing.
435#if defined(SIMDUTF_REGULAR_VISUAL_STUDIO)
436 #define SIMDUTF_DEPRECATED __declspec(deprecated)
438 #define simdutf_really_inline __forceinline
439 #define simdutf_always_inline __forceinline
440 #define simdutf_never_inline __declspec(noinline)
442 #define simdutf_unused
443 #define simdutf_warn_unused
445 #ifndef simdutf_likely
446 #define simdutf_likely(x) x
448 #ifndef simdutf_unlikely
449 #define simdutf_unlikely(x) x
452 #define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning(push))
453 #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning(push, 0))
454 #define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER) \
455 __pragma(warning(disable : WARNING_NUMBER))
// When the C++ Core Check header is available, its warnings are disabled as
// "undesired"; otherwise the macro is defined empty.
460 #if __has_include(<CppCoreCheck\Warnings.h>)
461 #include <CppCoreCheck\Warnings.h>
462 #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS \
463 SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS)
467 #ifndef SIMDUTF_DISABLE_UNDESIRED_WARNINGS
468 #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
471 #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_VS_WARNING(4996)
472 #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING
473 #define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning(pop))
474 #define SIMDUTF_DISABLE_UNUSED_WARNING
// Second group: GCC-style definitions (__attribute__, __builtin_expect,
// "GCC diagnostic" pragmas). simdutf_really_inline forces inlining only when
// __OPTIMIZE__ or NDEBUG is defined; otherwise it is a plain `inline`.
476 #if defined(__OPTIMIZE__) || defined(NDEBUG)
477 #define simdutf_really_inline inline __attribute__((always_inline))
479 #define simdutf_really_inline inline
481 #define simdutf_always_inline \
482 inline __attribute__((always_inline))
483 #define SIMDUTF_DEPRECATED __attribute__((deprecated))
484 #define simdutf_never_inline inline __attribute__((noinline))
486 #define simdutf_unused __attribute__((unused))
487 #define simdutf_warn_unused __attribute__((warn_unused_result))
489 #ifndef simdutf_likely
490 #define simdutf_likely(x) __builtin_expect(!!(x), 1)
492 #ifndef simdutf_unlikely
493 #define simdutf_unlikely(x) __builtin_expect(!!(x), 0)
// Warning control: push the diagnostic state, then ignore individual warnings
// by name through SIMDUTF_DISABLE_GCC_WARNING.
496 #define SIMDUTF_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push")
499 #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS \
500 SIMDUTF_PUSH_DISABLE_WARNINGS \
501 SIMDUTF_DISABLE_GCC_WARNING(-Weffc++) \
502 SIMDUTF_DISABLE_GCC_WARNING(-Wall) \
503 SIMDUTF_DISABLE_GCC_WARNING(-Wconversion) \
504 SIMDUTF_DISABLE_GCC_WARNING(-Wextra) \
505 SIMDUTF_DISABLE_GCC_WARNING(-Wattributes) \
506 SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \
507 SIMDUTF_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \
508 SIMDUTF_DISABLE_GCC_WARNING(-Wreturn-type) \
509 SIMDUTF_DISABLE_GCC_WARNING(-Wshadow) \
510 SIMDUTF_DISABLE_GCC_WARNING(-Wunused-parameter) \
511 SIMDUTF_DISABLE_GCC_WARNING(-Wunused-variable)
// SIMDUTF_PRAGMA stringifies its argument for _Pragma; the warning name is
// stringified separately so it appears quoted inside the pragma text.
512 #define SIMDUTF_PRAGMA(P) _Pragma(#P)
513 #define SIMDUTF_DISABLE_GCC_WARNING(WARNING) \
514 SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING)
515 #if defined(SIMDUTF_CLANG_VISUAL_STUDIO)
516 #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS \
517 SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft-include)
519 #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
521 #define SIMDUTF_DISABLE_DEPRECATED_WARNING \
522 SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated-declarations)
523 #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING \
524 SIMDUTF_DISABLE_GCC_WARNING(-Wstrict-overflow)
525 #define SIMDUTF_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop")
526 #define SIMDUTF_DISABLE_UNUSED_WARNING \
527 SIMDUTF_PUSH_DISABLE_WARNINGS \
528 SIMDUTF_DISABLE_GCC_WARNING(-Wunused-function) \
529 SIMDUTF_DISABLE_GCC_WARNING(-Wunused-const-variable)
// simdutf_constexpr expands to `constexpr` from C++17 on, otherwise to nothing.
535#if SIMDUTF_CPLUSPLUS17
536 #define simdutf_constexpr constexpr
538 #define simdutf_constexpr
// simdutf_constexpr23 expands to `constexpr` from C++23 on, otherwise to
// nothing.
543#if SIMDUTF_CPLUSPLUS23
544 #define simdutf_constexpr23 constexpr
546 #define simdutf_constexpr23
// Symbol visibility for Windows DLL builds: dllexport while building the
// dynamic library, dllimport while using it, empty otherwise. A user-provided
// SIMDUTF_DLLIMPORTEXPORT takes precedence.
549#ifndef SIMDUTF_DLLIMPORTEXPORT
550 #if defined(SIMDUTF_VISUAL_STUDIO)
566 #if SIMDUTF_BUILDING_WINDOWS_DYNAMIC_LIBRARY
572 #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
573 #elif SIMDUTF_USING_WINDOWS_DYNAMIC_LIBRARY
577 #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
580 #define SIMDUTF_DLLIMPORTEXPORT
584 #define SIMDUTF_DLLIMPORTEXPORT
// simdutf_maybe_unused is [[maybe_unused]] when the attribute was detected,
// otherwise empty.
588#if SIMDUTF_MAYBE_UNUSED_AVAILABLE
589 #define simdutf_maybe_unused [[maybe_unused]]
591 #define simdutf_maybe_unused
// Include guard for the encoding-types section.
597#ifndef SIMDUTF_ENCODING_TYPES_H
598#define SIMDUTF_ENCODING_TYPES_H
// std::text_encoding interop is enabled when the library advertises it
// (__cpp_lib_text_encoding >= 202306L) and the user has not opted out with
// SIMDUTF_NO_STD_TEXT_ENCODING.
601#if !defined(SIMDUTF_NO_STD_TEXT_ENCODING) && \
602 defined(__cpp_lib_text_encoding) && __cpp_lib_text_encoding >= 202306L
603 #define SIMDUTF_HAS_STD_TEXT_ENCODING 1
604 #include <text_encoding>
// The endianness flag must have been established before this point.
620#ifndef SIMDUTF_IS_BIG_ENDIAN
621 #error "SIMDUTF_IS_BIG_ENDIAN needs to be defined."
628#if SIMDUTF_IS_BIG_ENDIAN
// Returns true when `e` equals endianness::NATIVE, i.e. the requested byte
// order is the one the library was compiled for.
635simdutf_warn_unused simdutf_really_inline
constexpr bool
636match_system(endianness e) {
637 return e == endianness::NATIVE;
// Declaration: string form of an encoding_type value.
640simdutf_warn_unused std::string to_string(encoding_type bom);
// Declarations: inspect the first `length` bytes at `byte` and report an
// encoding_type. Overloads exist for uint8_t and char buffers.
// NOTE(review): presumably detects a byte-order mark - confirm against the
// definition, which is not in this excerpt.
652simdutf_warn_unused encoding_type check_bom(
const uint8_t *
byte,
size_t length);
653simdutf_warn_unused encoding_type check_bom(
const char *
byte,
size_t length);
// Declaration: size in bytes associated with the given encoding_type's BOM.
660simdutf_warn_unused
size_t bom_byte_size(encoding_type bom);
// Conversions between simdutf's encoding_type and std::text_encoding; compiled
// only when std::text_encoding support was detected.
664#ifdef SIMDUTF_HAS_STD_TEXT_ENCODING
// Maps an encoding_type to the matching std::text_encoding id (UTF8, UTF16LE,
// UTF16BE, UTF32LE, UTF32BE or ISOLatin1); the last return yields
// id::unknown. The selecting case labels are not visible in this excerpt.
672simdutf_warn_unused
constexpr std::text_encoding
673to_std_encoding(encoding_type enc)
noexcept {
676 return std::text_encoding(std::text_encoding::id::UTF8);
678 return std::text_encoding(std::text_encoding::id::UTF16LE);
680 return std::text_encoding(std::text_encoding::id::UTF16BE);
682 return std::text_encoding(std::text_encoding::id::UTF32LE);
684 return std::text_encoding(std::text_encoding::id::UTF32BE);
686 return std::text_encoding(std::text_encoding::id::ISOLatin1);
689 return std::text_encoding(std::text_encoding::id::unknown);
// Inverse mapping: handles the explicit-endianness ids plus UTF8 and
// ISOLatin1. The generic UTF16/UTF32 ids have no case here.
700simdutf_warn_unused
constexpr encoding_type
701from_std_encoding(
const std::text_encoding &enc)
noexcept {
703 case std::text_encoding::id::UTF8:
705 case std::text_encoding::id::UTF16LE:
707 case std::text_encoding::id::UTF16BE:
709 case std::text_encoding::id::UTF32LE:
711 case std::text_encoding::id::UTF32BE:
713 case std::text_encoding::id::ISOLatin1:
// UTF-16 encoding_type for the host; the result is chosen at preprocessing
// time from SIMDUTF_IS_BIG_ENDIAN.
725simdutf_warn_unused
constexpr encoding_type native_utf16_encoding() noexcept {
726 #if SIMDUTF_IS_BIG_ENDIAN
// UTF-32 counterpart of native_utf16_encoding().
738simdutf_warn_unused
constexpr encoding_type native_utf32_encoding() noexcept {
739 #if SIMDUTF_IS_BIG_ENDIAN
// Like from_std_encoding, but the endianness-agnostic ids UTF16 and UTF32 are
// resolved to the host's native variant.
757simdutf_warn_unused
constexpr encoding_type
758from_std_encoding_native(
const std::text_encoding &enc)
noexcept {
760 case std::text_encoding::id::UTF8:
762 case std::text_encoding::id::UTF16:
763 return native_utf16_encoding();
764 case std::text_encoding::id::UTF16LE:
766 case std::text_encoding::id::UTF16BE:
768 case std::text_encoding::id::UTF32:
769 return native_utf32_encoding();
770 case std::text_encoding::id::UTF32LE:
772 case std::text_encoding::id::UTF32BE:
774 case std::text_encoding::id::ISOLatin1:
// Include guard for the error-code section.
786#ifndef SIMDUTF_ERROR_H
787#define SIMDUTF_ERROR_H
// Enumerators of the error-code enumeration visible in this excerpt (the
// enum header and the other enumerators are not shown).
818 INVALID_BASE64_CHARACTER,
821 BASE64_INPUT_REMAINDER,
826 OUTPUT_BUFFER_TOO_SMALL,
// Available from C++17 on: returns the enumerator's name as a string_view.
// Each visible case returns the literal spelled exactly like its enumerator.
829#if SIMDUTF_CPLUSPLUS17
830inline std::string_view error_to_string(error_code code)
noexcept {
835 return "HEADER_BITS";
846 case INVALID_BASE64_CHARACTER:
847 return "INVALID_BASE64_CHARACTER";
848 case BASE64_INPUT_REMAINDER:
849 return "BASE64_INPUT_REMAINDER";
850 case BASE64_EXTRA_BITS:
851 return "BASE64_EXTRA_BITS";
852 case OUTPUT_BUFFER_TOO_SMALL:
853 return "OUTPUT_BUFFER_TOO_SMALL";
// Members of `result` (struct header not visible): an error code plus a count.
// Default construction yields SUCCESS with a count of 0.
866 simdutf_really_inline simdutf_constexpr23
result() noexcept
867 : error{error_code::SUCCESS}, count{0} {}
// Constructs from an explicit error code and position/count.
869 simdutf_really_inline simdutf_constexpr23
result(error_code err,
871 : error{err}, count{pos} {}
// True when the stored error is SUCCESS.
873 simdutf_really_inline simdutf_constexpr23
bool is_ok()
const noexcept {
874 return error == error_code::SUCCESS;
// True when the stored error is anything other than SUCCESS.
877 simdutf_really_inline simdutf_constexpr23
bool is_err()
const noexcept {
878 return error != error_code::SUCCESS;
// Members of `full_result` (struct header not visible): error code, input and
// output counts, and a padding_error flag that defaults to false.
886 bool padding_error =
false;
// Default construction: SUCCESS with both counts at 0.
889 simdutf_really_inline simdutf_constexpr23
full_result() noexcept
890 : error{error_code::SUCCESS}, input_count{0}, output_count{0} {}
// Constructs from an error code and the input/output positions.
892 simdutf_really_inline simdutf_constexpr23
full_result(error_code err,
894 size_t pos_out) noexcept
895 : error{err}, input_count{pos_in}, output_count{pos_out} {}
// Same as above, additionally setting the padding_error flag.
896 simdutf_really_inline simdutf_constexpr23
full_result(
897 error_code err,
size_t pos_in,
size_t pos_out,
bool padding_err) noexcept
898 : error{err}, input_count{pos_in}, output_count{pos_out},
899 padding_error{padding_err} {}
// Conversion to `result`: on SUCCESS the count is the output count, on any
// error it is the input count.
901 simdutf_really_inline simdutf_constexpr23
operator result()
const noexcept {
902 if (error == error_code::SUCCESS) {
903 return result{error, output_count};
905 return result{error, input_count};
// Save the diagnostic state and silence the warnings classed as undesired for
// the declarations that follow.
914SIMDUTF_PUSH_DISABLE_WARNINGS
915SIMDUTF_DISABLE_UNDESIRED_WARNINGS
// Library version, as a string and as separate major/minor/revision numbers.
921#ifndef SIMDUTF_SIMDUTF_VERSION_H
922#define SIMDUTF_SIMDUTF_VERSION_H
925#define SIMDUTF_VERSION "8.0.0"
932 SIMDUTF_VERSION_MAJOR = 8,
936 SIMDUTF_VERSION_MINOR = 0,
940 SIMDUTF_VERSION_REVISION = 0
// Include guard for the implementation section.
947#ifndef SIMDUTF_IMPLEMENTATION_H
948#define SIMDUTF_IMPLEMENTATION_H
949#if !defined(SIMDUTF_NO_THREADS)
953#ifdef SIMDUTF_INTERNAL_TESTS
// Include guard for the runtime instruction-set detection section.
1002#ifndef SIMDutf_INTERNAL_ISADETECTION_H
1003#define SIMDutf_INTERNAL_ISADETECTION_H
// Platform-specific preamble; the lines selected by each branch are not
// visible in this excerpt.
1007#if defined(_MSC_VER)
1009#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
// RISC-V on Linux: hwprobe is invoked as raw syscall number 258, and the
// key / extension bit constants below are defined locally under SIMDUTF_
// names.
1015#if SIMDUTF_IS_RISCV64 && defined(__linux__)
1018struct simdutf_riscv_hwprobe {
1022 #define simdutf_riscv_hwprobe(...) syscall(258, __VA_ARGS__)
1023 #define SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0 4
1024 #define SIMDUTF_RISCV_HWPROBE_IMA_V (1 << 2)
1025 #define SIMDUTF_RISCV_HWPROBE_EXT_ZVBB (1 << 17)
// LoongArch on Linux uses getauxval(), declared in <sys/auxv.h>.
1028#if defined(__loongarch__) && defined(__linux__)
1029 #include <sys/auxv.h>
// Bit flags identifying instruction sets; values are OR-ed together into a
// uint32_t by the detection functions.
1038enum instruction_set {
1055 AVX512VBMI2 = 0x10000,
1056 AVX512VPOPCNTDQ = 0x2000,
// PPC64: no runtime probing, ALTIVEC is reported unconditionally.
1063#if defined(__PPC64__)
1065static inline uint32_t detect_supported_architectures() {
1066 return instruction_set::ALTIVEC;
// RISC-V 64: starts from DEFAULT and adds RVV / ZVBB bits.
1069#elif SIMDUTF_IS_RISCV64
1071static inline uint32_t detect_supported_architectures() {
1072 uint32_t host_isa = instruction_set::DEFAULT;
// Bits added under conditions that are not visible in this excerpt.
1074 host_isa |= instruction_set::RVV;
1077 host_isa |= instruction_set::ZVBB;
// On Linux, query the IMA_EXT_0 key through the hwprobe syscall and translate
// the returned extension bits into instruction_set flags.
1079 #if defined(__linux__)
1080 simdutf_riscv_hwprobe probes[] = {{SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0, 0}};
1081 long ret = simdutf_riscv_hwprobe(&probes,
sizeof probes /
sizeof *probes, 0,
1084 uint64_t extensions = probes[0].value;
1085 if (extensions & SIMDUTF_RISCV_HWPROBE_IMA_V)
1086 host_isa |= instruction_set::RVV;
1087 if (extensions & SIMDUTF_RISCV_HWPROBE_EXT_ZVBB)
1088 host_isa |= instruction_set::ZVBB;
// Build-time override for the Spike simulator: RVV is added in this branch.
1091 #if defined(RUN_IN_SPIKE_SIMULATOR)
1093 host_isa |= instruction_set::RVV;
// ARM64 (including ARM64EC): no runtime probing, NEON is reported
// unconditionally.
1098#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
1100static inline uint32_t detect_supported_architectures() {
1101 return instruction_set::NEON;
// x86-64: feature bits are read with the CPUID instruction.
1104#elif defined(__x86_64__) || defined(_M_AMD64)
// Bit masks for the registers returned by CPUID and for XCR0.
1107namespace cpuid_bit {
1111constexpr uint32_t pclmulqdq = uint32_t(1)
1113constexpr uint32_t sse42 = uint32_t(1)
// osxsave combines bits 26 and 27; callers compare against the full mask so
// both bits must be set.
1115constexpr uint32_t osxsave =
1116 (uint32_t(1) << 26) |
1117 (uint32_t(1) << 27);
// Masks tested against EBX (referenced below as cpuid_bit::ebx::*).
1122constexpr uint32_t bmi1 = uint32_t(1) << 3;
1123constexpr uint32_t avx2 = uint32_t(1) << 5;
1124constexpr uint32_t bmi2 = uint32_t(1) << 8;
1125constexpr uint32_t avx512f = uint32_t(1) << 16;
1126constexpr uint32_t avx512dq = uint32_t(1) << 17;
1127constexpr uint32_t avx512ifma = uint32_t(1) << 21;
1128constexpr uint32_t avx512cd = uint32_t(1) << 28;
1129constexpr uint32_t avx512bw = uint32_t(1) << 30;
1130constexpr uint32_t avx512vl = uint32_t(1) << 31;
// Masks tested against ECX (referenced below as cpuid_bit::ecx::*).
1134constexpr uint32_t avx512vbmi = uint32_t(1) << 1;
1135constexpr uint32_t avx512vbmi2 = uint32_t(1) << 6;
1136constexpr uint32_t avx512vnni = uint32_t(1) << 11;
1137constexpr uint32_t avx512bitalg = uint32_t(1) << 12;
1138constexpr uint32_t avx512vpopcnt = uint32_t(1) << 14;
// Mask belonging to a further register group whose namespace header is not
// visible in this excerpt.
1141constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8;
// Masks tested against XCR0 (referenced below as cpuid_bit::xcr0_bit::*).
1144constexpr uint64_t avx256_saved = uint64_t(1) << 2;
1145constexpr uint64_t avx512_saved =
// Executes CPUID. *eax (leaf) and *ecx (sub-leaf) are inputs. Three
// implementations: MSVC's __cpuidex, GCC's __get_cpuid helper, or inline
// assembly where eax/ecx are read-write operands and ebx/edx outputs.
1151static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
1153 #if defined(_MSC_VER)
1155 __cpuidex(cpu_info, *eax, *ecx);
1160 #elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
1161 uint32_t level = *eax;
1162 __get_cpuid(level, eax, ebx, ecx, edx);
1164 uint32_t a = *eax, b, c = *ecx, d;
1165 asm volatile(
"cpuid\n\t" :
"+a"(a),
"=b"(b),
"+c"(c),
"=d"(d));
// Reads an extended control register with XGETBV. The inline-assembly path
// passes 0 in ECX and combines the EAX (low) and EDX (high) halves into one
// 64-bit value. The MSVC path is not visible in this excerpt.
1173static inline uint64_t xgetbv() {
1174 #if defined(_MSC_VER)
1177 uint32_t xcr0_lo, xcr0_hi;
1178 asm volatile(
"xgetbv\n\t" :
"=a"(xcr0_lo),
"=d"(xcr0_hi) :
"c"(0));
1179 return xcr0_lo | ((uint64_t)xcr0_hi << 32);
// x86-64 runtime detection: builds a bitmask of instruction_set flags from
// CPUID results and XCR0.
1183static inline uint32_t detect_supported_architectures() {
1188 uint32_t host_isa = 0x0;
// First CPUID query: SSE4.2 and PCLMULQDQ are reported from ECX.
1192 cpuid(&eax, &ebx, &ecx, &edx);
1194 if (ecx & cpuid_bit::sse42) {
1195 host_isa |= instruction_set::SSE42;
1198 if (ecx & cpuid_bit::pclmulqdq) {
1199 host_isa |= instruction_set::PCLMULQDQ;
// Both osxsave bits must be set before XCR0 is read.
// NOTE(review): the body of this check is not visible; presumably it returns
// early - confirm.
1202 if ((ecx & cpuid_bit::osxsave) != cpuid_bit::osxsave) {
1207 uint64_t xcr0 = xgetbv();
// Checks that XCR0 has the avx256_saved bit set (body not visible).
1209 if ((xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == 0) {
// Second CPUID query: AVX2, BMI1 and BMI2 are reported from EBX.
1215 cpuid(&eax, &ebx, &ecx, &edx);
1216 if (ebx & cpuid_bit::ebx::avx2) {
1217 host_isa |= instruction_set::AVX2;
1219 if (ebx & cpuid_bit::ebx::bmi1) {
1220 host_isa |= instruction_set::BMI1;
1222 if (ebx & cpuid_bit::ebx::bmi2) {
1223 host_isa |= instruction_set::BMI2;
// AVX-512 gate: every bit of avx512_saved must be present in XCR0 (body of
// the check not visible).
1225 if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) ==
1226 cpuid_bit::xcr0_bit::avx512_saved)) {
// Individual AVX-512 subsets, from EBX then ECX.
1229 if (ebx & cpuid_bit::ebx::avx512f) {
1230 host_isa |= instruction_set::AVX512F;
1232 if (ebx & cpuid_bit::ebx::avx512bw) {
1233 host_isa |= instruction_set::AVX512BW;
1235 if (ebx & cpuid_bit::ebx::avx512cd) {
1236 host_isa |= instruction_set::AVX512CD;
1238 if (ebx & cpuid_bit::ebx::avx512dq) {
1239 host_isa |= instruction_set::AVX512DQ;
1241 if (ebx & cpuid_bit::ebx::avx512vl) {
1242 host_isa |= instruction_set::AVX512VL;
1244 if (ecx & cpuid_bit::ecx::avx512vbmi2) {
1245 host_isa |= instruction_set::AVX512VBMI2;
1247 if (ecx & cpuid_bit::ecx::avx512vpopcnt) {
1248 host_isa |= instruction_set::AVX512VPOPCNTDQ;
// LoongArch: on Linux the hardware-capability word from getauxval(AT_HWCAP)
// decides whether LSX and LASX are added to the DEFAULT mask.
1252#elif defined(__loongarch__)
1254static inline uint32_t detect_supported_architectures() {
1255 uint32_t host_isa = instruction_set::DEFAULT;
1256 #if defined(__linux__)
1258 hwcap = getauxval(AT_HWCAP);
1259 if (hwcap & HWCAP_LOONGARCH_LSX) {
1260 host_isa |= instruction_set::LSX;
1262 if (hwcap & HWCAP_LOONGARCH_LASX) {
1263 host_isa |= instruction_set::LASX;
// Fallback for all other targets: only DEFAULT is reported.
1271static inline uint32_t detect_supported_architectures() {
1272 return instruction_set::DEFAULT;
1285 #include <type_traits>
// <string_view> is only pulled in from C++17 on.
1289#if SIMDUTF_CPLUSPLUS17
1290 #include <string_view>
// Feature switches for this build of the header: only UTF-8 and UTF-16 are
// set to 1; encoding detection, ASCII, Latin-1, UTF-32 and Base64 are 0.
1301#define SIMDUTF_FEATURE_DETECT_ENCODING 0
1302#define SIMDUTF_FEATURE_ASCII 0
1303#define SIMDUTF_FEATURE_LATIN1 0
1304#define SIMDUTF_FEATURE_UTF8 1
1305#define SIMDUTF_FEATURE_UTF16 1
1306#define SIMDUTF_FEATURE_UTF32 0
1307#define SIMDUTF_FEATURE_BASE64 0
1309#if SIMDUTF_CPLUSPLUS23
1311#ifndef SIMDUTF_CONSTEXPR_PTR_H
1312#define SIMDUTF_CONSTEXPR_PTR_H
// Read-only pointer wrapper: holds a `const from *` and yields each element
// converted to `to` with static_cast, so it can be used in constant
// expressions. The two types must have the same size.
1322template <
typename to,
typename from>
1323 requires(
sizeof(to) ==
sizeof(from))
1324struct constexpr_ptr {
1327 constexpr explicit constexpr_ptr(
const from *ptr) noexcept : p(ptr) {}
// Dereference: the pointed-to `from` value converted to `to`.
1329 constexpr to operator*() const noexcept {
return static_cast<to
>(*p); }
// Pointer-style increment / decrement (prefix and postfix) and compound
// assignment; bodies are not visible in this excerpt.
1331 constexpr constexpr_ptr &operator++() noexcept {
1336 constexpr constexpr_ptr operator++(
int)
noexcept {
1342 constexpr constexpr_ptr &operator--() noexcept {
1347 constexpr constexpr_ptr operator--(
int)
noexcept {
1353 constexpr constexpr_ptr &operator+=(std::ptrdiff_t n)
noexcept {
1358 constexpr constexpr_ptr &operator-=(std::ptrdiff_t n)
noexcept {
// Offsetting returns a new wrapper around the shifted raw pointer.
1363 constexpr constexpr_ptr operator+(std::ptrdiff_t n)
const noexcept {
1364 return constexpr_ptr{p + n};
1367 constexpr constexpr_ptr operator-(std::ptrdiff_t n)
const noexcept {
1368 return constexpr_ptr{p - n};
// Difference between two wrappers, as std::ptrdiff_t.
1371 constexpr std::ptrdiff_t operator-(
const constexpr_ptr &o)
const noexcept {
// Indexing: element n converted to `to`.
1375 constexpr to operator[](std::ptrdiff_t n)
const noexcept {
1376 return static_cast<to
>(*(p + n));
// Implicit conversion to `const void *` exposing the wrapped address.
1381 constexpr operator const void *()
const noexcept {
return p; }
// Factory: wraps `p` in a constexpr_ptr<to, from>.
1384template <
typename to,
typename from>
1385constexpr constexpr_ptr<to, from> constexpr_cast_ptr(from *p)
noexcept {
1386 return constexpr_ptr<to, from>{p};
// Assignment proxy: assigning a SrcType value stores it through the wrapped
// TargetType pointer after a static_cast to TargetType.
1393template <
typename SrcType,
typename TargetType>
1394struct constexpr_write_ptr_proxy {
1396 constexpr explicit constexpr_write_ptr_proxy(TargetType *raw) : p(raw) {}
1398 constexpr constexpr_write_ptr_proxy &operator=(SrcType v) {
1399 *p =
static_cast<TargetType
>(v);
// Write-only pointer wrapper: dereferencing or indexing yields a
// constexpr_write_ptr_proxy, so values of SrcType can be assigned into
// TargetType storage.
1411template <
typename SrcType,
typename TargetType>
struct constexpr_write_ptr {
1412 constexpr explicit constexpr_write_ptr(TargetType *raw) : p(raw) {}
// Proxy for the current element.
1414 constexpr constexpr_write_ptr_proxy<SrcType, TargetType> operator*()
const {
1415 return constexpr_write_ptr_proxy<SrcType, TargetType>{p};
// Proxy for the element at offset n.
1418 constexpr constexpr_write_ptr_proxy<SrcType, TargetType>
1419 operator[](std::ptrdiff_t n)
const {
1420 return constexpr_write_ptr_proxy<SrcType, TargetType>{p + n};
// Prefix and postfix increment; the postfix form first copies *this into
// `old`.
1423 constexpr constexpr_write_ptr &operator++() {
1428 constexpr constexpr_write_ptr operator++(
int) {
1429 constexpr_write_ptr old = *
this;
// Difference between two wrappers, as std::ptrdiff_t.
1434 constexpr std::ptrdiff_t operator-(
const constexpr_write_ptr &other)
const {
// Factory: wraps `raw` in a constexpr_write_ptr<SrcType, TargetType>.
1441template <
typename SrcType,
typename TargetType>
1442constexpr auto constexpr_cast_writeptr(TargetType *raw) {
1443 return constexpr_write_ptr<SrcType, TargetType>{raw};
// True for exactly the byte-sized character types listed: std::byte, char,
// signed char, unsigned char and char8_t.
1460template <
typename T>
1461concept byte_like = std::is_same_v<T, std::byte> ||
1462 std::is_same_v<T, char> ||
1463 std::is_same_v<T, signed char> ||
1464 std::is_same_v<T, unsigned char> ||
1465 std::is_same_v<T, char8_t>;
// byte_like after stripping cv-qualifiers and references.
1467template <
typename T>
1468concept is_byte_like = byte_like<std::remove_cvref_t<T>>;
// True when T is a pointer type.
1470template <
typename T>
1471concept is_pointer = std::is_pointer_v<T>;
// A read-only contiguous range of bytes: size() converts to std::size_t,
// data() is a pointer, and the pointee is byte-like. All three expressions
// must be noexcept.
1478template <
typename T>
1479concept input_span_of_byte_like =
requires(
const T &t) {
1480 { t.size() }
noexcept -> std::convertible_to<std::size_t>;
1481 { t.data() }
noexcept -> is_pointer;
1482 { *t.data() }
noexcept -> is_byte_like;
// True when T, with any reference removed, is not const.
1485template <
typename T>
1486concept is_mutable = !std::is_const_v<std::remove_reference_t<T>>;
// Same requirements as input_span_of_byte_like on a non-const object, plus
// the pointee must be mutable.
1491template <
typename T>
1492concept output_span_of_byte_like =
requires(T &t) {
1493 { t.size() }
noexcept -> std::convertible_to<std::size_t>;
1494 { t.data() }
noexcept -> is_pointer;
1495 { *t.data() }
noexcept -> is_byte_like;
1496 { *t.data() }
noexcept -> is_mutable;
// Pointer-like types whose p[0], after decay, is a byte-like type.
1504template <
class InputPtr>
1505concept indexes_into_byte_like =
requires(InputPtr p) {
1506 { std::decay_t<
decltype(p[0])>{} } -> simdutf::detail::byte_like;
// Pointer-like types whose p[0], after decay, is exactly char16_t.
1508template <
class InputPtr>
1509concept indexes_into_utf16 =
requires(InputPtr p) {
1510 { std::decay_t<
decltype(p[0])>{} } -> std::same_as<char16_t>;
// Pointer-like types whose p[0], after decay, is exactly char32_t.
1512template <
class InputPtr>
1513concept indexes_into_utf32 =
requires(InputPtr p) {
1514 { std::decay_t<
decltype(p[0])>{} } -> std::same_as<char32_t>;
// Requirement involving a pointer-like p and a char s; the required
// expression is not visible in this excerpt.
1517template <
class InputPtr>
1518concept index_assignable_from_char =
requires(InputPtr p,
char s) {
// Pointer-like types whose p[0], after decay, is exactly std::uint32_t.
1526template <
class InputPtr>
1527concept indexes_into_uint32 =
requires(InputPtr p) {
1528 { std::decay_t<
decltype(p[0])>{} } -> std::same_as<std::uint32_t>;
1537#ifndef SIMDUTF_SWAP_BYTES_H
1538#define SIMDUTF_SWAP_BYTES_H
// Swaps the two bytes of a 16-bit word; the cast back to uint16_t drops the
// bits that the integer-promoted left shift moves above bit 15.
1543constexpr inline simdutf_warn_unused uint16_t
1544u16_swap_bytes(
const uint16_t word) {
1545 return uint16_t((word >> 8) | (word << 8));
// Reverses the four bytes of a 32-bit word: each shifted copy is masked so
// that exactly one byte lands in each destination position.
1548constexpr inline simdutf_warn_unused uint32_t
1549u32_swap_bytes(
const uint32_t word) {
1550 return ((word >> 24) & 0xff) |
1551 ((word << 8) & 0xff0000) |
1552 ((word >> 8) & 0xff00) |
1553 ((word << 24) & 0xff000000);
// Returns `c` unchanged when the requested endianness matches the system,
// otherwise byte-swapped (32-bit overload).
1557template <endianness big_endian>
constexpr uint32_t swap_if_needed(uint32_t c) {
1558 return !match_system(big_endian) ? scalar::u32_swap_bytes(c) : c;
// 16-bit overload of swap_if_needed.
1563template <endianness big_endian>
constexpr uint16_t swap_if_needed(uint16_t c) {
1564 return !match_system(big_endian) ? scalar::u16_swap_bytes(c) : c;
1574#ifndef SIMDUTF_ASCII_H
1575#define SIMDUTF_ASCII_H
// ASCII validation over `len` bytes starting at `data`.
// Bulk path: 16 bytes at a time are copied into two 64-bit words, OR-ed
// together and tested against 0x8080808080808080, which has the top bit of
// every byte set. Remaining bytes are checked one by one against 0b10000000.
1582template <
class InputPtr>
1583#if SIMDUTF_CPLUSPLUS20
1584 requires simdutf::detail::indexes_into_byte_like<InputPtr>
1586simdutf_warn_unused simdutf_constexpr23
bool validate(InputPtr data,
1587 size_t len)
noexcept {
1590#if SIMDUTF_CPLUSPLUS23
1596 for (; pos + 16 <= len; pos += 16) {
1598 std::memcpy(&v1, data + pos,
sizeof(uint64_t));
1600 std::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
1601 uint64_t v{v1 | v2};
1602 if ((v & 0x8080808080808080) != 0) {
1609 for (; pos < len; pos++) {
1610 if (
static_cast<std::uint8_t
>(data[pos]) >= 0b10000000) {
// ASCII validation that also reports where the first offending byte is.
// Returns result(TOO_LARGE, pos) for the first byte >= 0b10000000, or
// result(SUCCESS, pos) once the whole input has been scanned.
1616template <
class InputPtr>
1617#if SIMDUTF_CPLUSPLUS20
1618 requires simdutf::detail::indexes_into_byte_like<InputPtr>
1620simdutf_warn_unused simdutf_constexpr23 result
1621validate_with_errors(InputPtr data,
size_t len)
noexcept {
1623#if SIMDUTF_CPLUSPLUS23
// Bulk path: test 16 bytes at once; on a hit, rescan byte by byte from `pos`
// to locate the exact position.
1629 for (; pos + 16 <= len; pos += 16) {
1631 std::memcpy(&v1, data + pos,
sizeof(uint64_t));
1633 std::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
1634 uint64_t v{v1 | v2};
1635 if ((v & 0x8080808080808080) != 0) {
1636 for (; pos < len; pos++) {
1637 if (
static_cast<std::uint8_t
>(data[pos]) >= 0b10000000) {
1638 return result(error_code::TOO_LARGE, pos);
// Tail: bytes left over after the 16-byte blocks.
1646 for (; pos < len; pos++) {
1647 if (
static_cast<std::uint8_t
>(data[pos]) >= 0b10000000) {
1648 return result(error_code::TOO_LARGE, pos);
1651 return result(error_code::SUCCESS, pos);
1662#ifndef SIMDUTF_ATOMIC_UTIL_H
1663#define SIMDUTF_ATOMIC_UTIL_H
1664#if SIMDUTF_ATOMIC_REF
// Copies `len` bytes from `src` to `dst`, reading the source through
// std::atomic_ref with relaxed memory ordering: byte-wise until `src` reaches
// an 8-byte boundary, then in 64-bit words, then byte-wise for the remainder.
1671inline void memcpy_atomic_read(
char *dst,
const char *src,
size_t len) {
1672 static_assert(std::atomic_ref<char>::required_alignment ==
sizeof(char),
1673 "std::atomic_ref requires the same alignment as char_type");
1677 constexpr size_t alignment =
sizeof(uint64_t);
// Byte-by-byte helper. const is cast away because atomic_ref is constructed
// from a non-const reference here; the source is only loaded from.
1680 auto bbb_memcpy_atomic_read = [](
char *bytedst,
const char *bytesrc,
1681 size_t bytelen)
noexcept {
1682 char *mutable_src =
const_cast<char *
>(bytesrc);
1683 for (
size_t j = 0; j < bytelen; ++j) {
1685 std::atomic_ref<char>(mutable_src[j]).load(std::memory_order_relaxed);
// Misalignment of the source address relative to the word size.
1690 size_t offset =
reinterpret_cast<std::uintptr_t
>(src) % alignment;
// Head: at most `len` bytes, up to the next aligned source address.
1692 size_t to_align = std::min(len, alignment - offset);
1693 bbb_memcpy_atomic_read(dst, src, to_align);
// Body: relaxed 64-bit atomic loads from the source; the destination is
// written with memcpy.
1700 while (len >= alignment) {
1701 auto *src_aligned =
reinterpret_cast<uint64_t *
>(
const_cast<char *
>(src));
1702 const auto dst_value =
1703 std::atomic_ref<uint64_t>(*src_aligned).load(std::memory_order_relaxed);
1704 std::memcpy(dst, &dst_value,
sizeof(uint64_t));
// Tail: whatever is left after the word loop.
1712 bbb_memcpy_atomic_read(dst, src, len);
// Copies `len` bytes from `src` to `dst`, writing the destination through
// std::atomic_ref with relaxed memory ordering: byte-wise until `dst` reaches
// an 8-byte boundary, then in 64-bit words, then byte-wise for the remainder.
1718inline void memcpy_atomic_write(
char *dst,
const char *src,
size_t len) {
1719 static_assert(std::atomic_ref<char>::required_alignment ==
sizeof(char),
1720 "std::atomic_ref requires the same alignment as char");
1725 constexpr size_t alignment =
sizeof(uint64_t);
// Byte-by-byte helper: relaxed atomic store of each source byte.
1728 auto bbb_memcpy_atomic_write = [](
char *bytedst,
const char *bytesrc,
1729 size_t bytelen)
noexcept {
1730 for (
size_t j = 0; j < bytelen; ++j) {
1731 std::atomic_ref<char>(bytedst[j])
1732 .store(bytesrc[j], std::memory_order_relaxed);
// Misalignment of the destination address relative to the word size.
1737 size_t offset =
reinterpret_cast<std::uintptr_t
>(dst) % alignment;
// Head: at most `len` bytes, up to the next aligned destination address.
1739 size_t to_align = std::min(len, alignment - offset);
1740 bbb_memcpy_atomic_write(dst, src, to_align);
// Body: the source word is read with memcpy, then stored with a relaxed
// 64-bit atomic store.
1747 while (len >= alignment) {
1748 auto *dst_aligned =
reinterpret_cast<uint64_t *
>(dst);
1750 std::memcpy(&src_val, src,
sizeof(uint64_t));
1751 std::atomic_ref<uint64_t>(*dst_aligned)
1752 .store(src_val, std::memory_order_relaxed);
// Tail: whatever is left after the word loop.
1760 bbb_memcpy_atomic_write(dst, src, len);
1769#ifndef SIMDUTF_LATIN1_H
1770#define SIMDUTF_LATIN1_H
// Number of UTF-8 bytes needed for a Latin-1 buffer: `len` plus an extra
// count accumulated in `answer` by the loop (loop body not visible).
// NOTE(review): presumably one extra byte per input byte >= 0x80 - confirm.
1777simdutf_really_inline
size_t utf8_length_from_latin1(
const char *buf,
1779 const uint8_t *c =
reinterpret_cast<const uint8_t *
>(buf);
1781 for (
size_t i = 0; i < len; i++) {
1786 return answer + len;
1797#ifndef SIMDUTF_LATIN1_TO_UTF16_H
1798#define SIMDUTF_LATIN1_TO_UTF16_H
1803namespace latin1_to_utf16 {
// Latin-1 to UTF-16 in the requested endianness. Each 16-bit word is written
// as-is when the endianness matches the system and byte-swapped otherwise.
// Returns the number of char16_t units written.
1805template <endianness big_endian,
typename InputPtr>
1806#if SIMDUTF_CPLUSPLUS20
1807 requires simdutf::detail::indexes_into_byte_like<InputPtr>
1809simdutf_constexpr23
size_t convert(InputPtr data,
size_t len,
1810 char16_t *utf16_output) {
1812 char16_t *start{utf16_output};
1818 char16_t(match_system(big_endian) ? word : u16_swap_bytes(word));
1822 return utf16_output - start;
// Same conversion on a char buffer, returning a `result`: always SUCCESS with
// the number of char16_t units written.
1825template <endianness big_endian>
1826inline result convert_with_errors(
const char *buf,
size_t len,
1827 char16_t *utf16_output) {
1828 const uint8_t *data =
reinterpret_cast<const uint8_t *
>(buf);
1830 char16_t *start{utf16_output};
1834 uint16_t(data[pos]);
1836 char16_t(match_system(big_endian) ? word : u16_swap_bytes(word));
1840 return result(error_code::SUCCESS, utf16_output - start);
1851#ifndef SIMDUTF_LATIN1_TO_UTF32_H
1852#define SIMDUTF_LATIN1_TO_UTF32_H
1857namespace latin1_to_utf32 {
// Latin-1 to UTF-32: every input byte, taken as an unsigned 8-bit value,
// becomes one char32_t. Returns the number of char32_t units written.
1859template <
typename InputPtr>
1860#if SIMDUTF_CPLUSPLUS20
1861 requires simdutf::detail::indexes_into_byte_like<InputPtr>
1863simdutf_constexpr23
size_t convert(InputPtr data,
size_t len,
1864 char32_t *utf32_output) {
1865 char32_t *start{utf32_output};
1866 for (
size_t i = 0; i < len; i++) {
// uint8_t cast so a signed char input is not sign-extended.
1867 *utf32_output++ = uint8_t(data[i]);
1869 return utf32_output - start;
1880#ifndef SIMDUTF_LATIN1_TO_UTF8_H
1881#define SIMDUTF_LATIN1_TO_UTF8_H
1886namespace latin1_to_utf8 {
// Latin-1 to UTF-8. Bytes below 0x80 are copied unchanged; bytes with the top
// bit set become two bytes: 0b110xxxxx carrying the upper two bits and
// 0b10xxxxxx carrying the lower six. The output is written by index through
// `utf8_output`.
1888template <
typename InputPtr,
typename OutputPtr>
1889#if SIMDUTF_CPLUSPLUS20
1890 requires(simdutf::detail::indexes_into_byte_like<InputPtr> &&
1891 simdutf::detail::index_assignable_from_char<OutputPtr>)
1893simdutf_constexpr23
size_t convert(InputPtr data,
size_t len,
1894 OutputPtr utf8_output) {
1897 size_t utf8_pos = 0;
1900#if SIMDUTF_CPLUSPLUS23
// Fast path: examine 16 input bytes as two 64-bit words and test the top bit
// of every byte; a pure-ASCII block is then copied byte for byte.
1905 if (pos + 16 <= len) {
1908 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
1910 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
1914 if ((v & 0x8080808080808080) ==
1917 size_t final_pos = pos + 16;
1918 while (pos < final_pos) {
1919 utf8_output[utf8_pos++] = char(data[pos]);
// General path: one input byte at a time.
1927 unsigned char byte = data[pos];
1928 if ((
byte & 0x80) == 0) {
1930 utf8_output[utf8_pos++] = char(
byte);
1934 utf8_output[utf8_pos++] = char((
byte >> 6) | 0b11000000);
1935 utf8_output[utf8_pos++] = char((
byte & 0b111111) | 0b10000000);
// char* convenience overload: reinterprets the input as unsigned char and
// forwards to the templated convert().
1942simdutf_really_inline
size_t convert(
const char *buf,
size_t len,
1943 char *utf8_output) {
1944 return convert(
reinterpret_cast<const unsigned char *
>(buf), len,
// Output-bounded conversion of bytes from `buf` to UTF-8 in `utf8_output`.
// The main loop runs only while input remains and `utf8_pos < utf8_len`; a
// two-byte sequence is emitted only when `utf8_pos + 2 <= utf8_len`.
// NOTE(review): the end of the parameter list (where `utf8_len` is declared),
// the branch taken when two bytes do not fit, and the return statement are
// not part of this view; confirm against the complete source.
1948inline size_t convert_safe(
const char *buf,
size_t len,
char *utf8_output,
1950 const unsigned char *data =
reinterpret_cast<const unsigned char *
>(buf);
// Input position before which the 16-byte fast path is not retried.
1952 size_t skip_pos = 0;
1953 size_t utf8_pos = 0;
1954 while (pos < len && utf8_pos < utf8_len) {
// Fast path needs 16 readable input bytes and 16 writable output bytes.
1956 if (pos >= skip_pos && pos + 16 <= len &&
1957 utf8_pos + 16 <= utf8_len) {
1960 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
1962 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// 0x80 in every byte lane tests the top bit of all loaded bytes at once
// (the right-hand side of the comparison is not visible here).
1966 if ((v & 0x8080808080808080) ==
// Bulk copy of the 16-byte block.
1969 ::memcpy(utf8_output + utf8_pos, buf + pos, 16);
// Presumably reached when the block was not copied, so that its bytes go
// through the per-byte path without re-testing -- TODO confirm.
1975 skip_pos = pos + 16;
1978 const auto byte = data[pos];
1979 if ((
byte & 0x80) == 0) {
1981 utf8_output[utf8_pos++] = char(
byte);
1983 }
else if (utf8_pos + 2 <= utf8_len) {
// Leading byte carries the top two bits, continuation byte the low six.
1985 utf8_output[utf8_pos++] = char((
byte >> 6) | 0b11000000);
1986 utf8_output[utf8_pos++] = char((
byte & 0b111111) | 0b10000000);
// Output-bounded, element-at-a-time conversion of byte-like input to UTF-8.
// The visible code has no bulk fast path. The loop stops when input is
// exhausted or `utf8_pos` reaches `utf8_len`; a two-byte sequence is written
// only when `utf8_pos + 2 <= utf8_len`.
// NOTE(review): the end of the parameter list, the branch taken when two
// bytes do not fit, and the return statement are not part of this view.
1996template <
typename InputPtr,
typename OutputPtr>
1997#if SIMDUTF_CPLUSPLUS20
1998 requires(simdutf::detail::indexes_into_byte_like<InputPtr> &&
1999 simdutf::detail::index_assignable_from_char<OutputPtr>)
2001simdutf_constexpr23
size_t convert_safe_constexpr(InputPtr data,
size_t len,
2002 OutputPtr utf8_output,
2005 size_t utf8_pos = 0;
2006 while (pos < len && utf8_pos < utf8_len) {
2007 const unsigned char byte = data[pos];
// Top bit clear: the element is stored unchanged as a single byte.
2008 if ((
byte & 0x80) == 0) {
2010 utf8_output[utf8_pos++] = char(
byte);
2012 }
else if (utf8_pos + 2 <= utf8_len) {
// Leading byte carries the top two bits, continuation byte the low six.
2014 utf8_output[utf8_pos++] = char((
byte >> 6) | 0b11000000);
2015 utf8_output[utf8_pos++] = char((
byte & 0b111111) | 0b10000000);
// Computes a length starting from `length` (one per input element) and adding
// one for each element whose top bit is set, as the scalar tail loop shows.
// Input is consumed in strides of 32, then 8, then 1 element(s).
// NOTE(review): the accumulation statements of the 32- and 8-element loops,
// the preprocessor branches and the return are not part of this view.
2024template <
typename InputPtr>
2025#if SIMDUTF_CPLUSPLUS20
2026 requires simdutf::detail::indexes_into_byte_like<InputPtr>
2028simdutf_constexpr23 simdutf_warn_unused
size_t
2029utf8_length_from_latin1(InputPtr input,
size_t length)
noexcept {
2030 size_t answer = length;
2033#if SIMDUTF_CPLUSPLUS23
// `(v >> 7) & 0x01..01` leaves each byte's top bit in its lowest position; the
// multiply by 0x01..01 accumulates those bits towards the most significant
// byte. The final shift amount is not visible here (presumably 56).
2037 auto pop = [](uint64_t v) {
2038 return (
size_t)(((v >> 7) & UINT64_C(0x0101010101010101)) *
2039 UINT64_C(0x0101010101010101) >>
// Four 8-byte loads per iteration.
2042 for (; i + 32 <= length; i += 32) {
2044 memcpy(&v, input + i, 8);
2046 memcpy(&v, input + i + 8,
sizeof(v));
2048 memcpy(&v, input + i + 16,
sizeof(v));
2050 memcpy(&v, input + i + 24,
sizeof(v));
// One 8-byte load per iteration.
2053 for (; i + 8 <= length; i += 8) {
2055 memcpy(&v, input + i,
sizeof(v));
// Scalar tail: `>> 7` yields 1 exactly when the element is >= 0x80.
2059 for (; i + 1 <= length; i += 1) {
2060 answer +=
static_cast<uint8_t
>(input[i]) >> 7;
2073#ifndef SIMDUTF_UTF16_H
2074#define SIMDUTF_UTF16_H
// Walks `len` UTF-16 code units, passing each through
// swap_if_needed<big_endian> before it is examined.
// NOTE(review): the per-unit test and the return statements are not part of
// this view; the name suggests the result says whether every unit is ASCII --
// confirm against the complete source.
2080template <endianness big_endian>
2081simdutf_warn_unused simdutf_constexpr23
bool
2082validate_as_ascii(
const char16_t *data,
size_t len)
noexcept {
2083 for (
size_t pos = 0; pos < len; pos++) {
2084 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
// Checks surrogate pairing in a UTF-16 buffer of `len` code units.
// For a unit in 0xD800..0xDFFF the visible code tests: that a following unit
// exists, the unit's offset from 0xD800 (`diff`), and that the following
// unit's offset from 0xDC00 (`diff2`) is at most 0x3FF.
// NOTE(review): the loop header, the test applied to `diff`, and the
// statements executed when a test fails or passes are not part of this view.
2092template <endianness big_endian>
2093inline simdutf_warn_unused simdutf_constexpr23
bool
2094validate(
const char16_t *data,
size_t len)
noexcept {
2097 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
// Mask 0xF800 == 0xD800 selects exactly the range 0xD800..0xDFFF.
2098 if ((word & 0xF800) == 0xD800) {
// No code unit follows this one.
2099 if (pos + 1 >= len) {
2102 char16_t diff = char16_t(word - 0xD800);
2106 char16_t next_word = !match_system(big_endian)
2107 ? u16_swap_bytes(data[pos + 1])
// The subtraction wraps in char16_t, so one unsigned comparison rejects
// everything outside 0xDC00..0xDFFF.
2109 char16_t diff2 = char16_t(next_word - 0xDC00);
2110 if (diff2 > 0x3FF) {
// Like a UTF-16 validation pass, but reports where it stopped.
// Visible returns:
//   - result(error_code::SURROGATE, pos) when a unit in 0xD800..0xDFFF is the
//     last unit, after the `diff` computation (its guarding test is not
//     visible), or when the following unit is outside 0xDC00..0xDFFF;
//   - result(error_code::SUCCESS, pos) at the end.
// NOTE(review): the loop header and position updates are not part of this
// view.
2121template <endianness big_endian>
2122inline simdutf_warn_unused simdutf_constexpr23 result
2123validate_with_errors(
const char16_t *data,
size_t len)
noexcept {
2126 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
// Mask 0xF800 == 0xD800 selects exactly the range 0xD800..0xDFFF.
2127 if ((word & 0xF800) == 0xD800) {
2128 if (pos + 1 >= len) {
2129 return result(error_code::SURROGATE, pos);
2131 char16_t diff = char16_t(word - 0xD800);
2133 return result(error_code::SURROGATE, pos);
2135 char16_t next_word = !match_system(big_endian)
2136 ? u16_swap_bytes(data[pos + 1])
// Offset of the following unit from 0xDC00; values above 0x3FF mean it is not
// in 0xDC00..0xDFFF.
2138 char16_t diff2 = uint16_t(next_word - 0xDC00);
2139 if (diff2 > 0x3FF) {
2140 return result(error_code::SURROGATE, pos);
2147 return result(error_code::SUCCESS, pos);
// Adds to `counter` one for every code unit of `p[0..len)` that is NOT in
// 0xDC00..0xDFFF, after swap_if_needed<big_endian>.
// NOTE(review): the declaration of `counter` and the return statement are not
// part of this view.
2150template <endianness big_endian>
2151simdutf_constexpr23
size_t count_code_points(
const char16_t *p,
size_t len) {
2154 for (
size_t i = 0; i < len; i++) {
2155 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
// Mask 0xFC00 == 0xDC00 identifies 0xDC00..0xDFFF; the comparison result
// (0 or 1) is added directly.
2156 counter += ((word & 0xFC00) != 0xDC00);
// Accumulates a per-code-unit contribution into `counter` for each unit of
// `p` (after swap_if_needed<big_endian>), using boolean tests converted to
// size_t.
// NOTE(review): the rest of the parameter list, the declaration of `counter`,
// most of the two accumulated expressions and the return are not part of this
// view; confirm the exact byte-count rules against the complete source.
2161template <endianness big_endian>
2162simdutf_constexpr23
size_t utf8_length_from_utf16(
const char16_t *p,
2166 for (
size_t i = 0; i < len; i++) {
2167 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
2169 counter +=
static_cast<size_t>(
// Visible part of the test: units in 0x0800..0xD7FF contribute here.
2172 counter +=
static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) ||
// Adds to `counter` one for every code unit of `p` that is NOT in
// 0xDC00..0xDFFF, after swap_if_needed<big_endian>.
// NOTE(review): the rest of the parameter list, the declaration of `counter`
// and the return statement are not part of this view.
2178template <endianness big_endian>
2179simdutf_constexpr23
size_t utf32_length_from_utf16(
const char16_t *p,
2183 for (
size_t i = 0; i < len; i++) {
2184 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
// Mask 0xFC00 == 0xDC00 identifies 0xDC00..0xDFFF.
2185 counter += ((word & 0xFC00) != 0xDC00);
// Writes `size` code units to `output`, each being the corresponding `input`
// unit with its two bytes exchanged.
2190simdutf_really_inline simdutf_constexpr23
void
2191change_endianness_utf16(
const char16_t *input,
size_t size,
char16_t *output) {
2192 for (
size_t i = 0; i < size; i++) {
// The operands promote to int; the cast to char16_t drops the bits shifted
// above bit 15.
2193 *output++ = char16_t(input[i] >> 8 | input[i] << 8);
// Shortens `length` by one when the last code unit (after
// swap_if_needed<big_endian>) lies in 0xD800..0xDBFF.
// NOTE(review): any handling of `length == 0` and the return statement are
// not part of this view; `input[length - 1]` requires length > 0.
2197template <endianness big_endian>
2198simdutf_warn_unused simdutf_constexpr23
size_t
2199trim_partial_utf16(
const char16_t *input,
size_t length) {
2203 uint16_t last_word = uint16_t(input[length - 1]);
2204 last_word = scalar::utf16::swap_if_needed<big_endian>(last_word);
// Mask 0xFC00 == 0xD800 identifies 0xD800..0xDBFF; the boolean (0 or 1) is
// subtracted directly.
2205 length -= ((last_word & 0xFC00) == 0xD800);
// Returns true when `c`, after swap_if_needed<big_endian>, lies in
// 0xD800..0xDBFF (inclusive).
2209template <endianness big_endian>
2210simdutf_constexpr
bool is_high_surrogate(
char16_t c) {
2211 c = scalar::utf16::swap_if_needed<big_endian>(c);
2212 return (0xd800 <= c && c <= 0xdbff);
// Returns true when `c`, after swap_if_needed<big_endian>, lies in
// 0xDC00..0xDFFF (inclusive).
2215template <endianness big_endian>
2216simdutf_constexpr
bool is_low_surrogate(
char16_t c) {
2217 c = scalar::utf16::swap_if_needed<big_endian>(c);
2218 return (0xdc00 <= c && c <= 0xdfff);
// Returns true when `c` lies in 0xD800..0xDBFF (inclusive). No byte swap is
// applied to `c` here.
2221simdutf_really_inline
constexpr bool high_surrogate(
char16_t c) {
2222 return (0xd800 <= c && c <= 0xdbff);
// Returns true when `c` lies in 0xDC00..0xDFFF (inclusive). No byte swap is
// applied to `c` here.
2225simdutf_really_inline
constexpr bool low_surrogate(
char16_t c) {
2226 return (0xdc00 <= c && c <= 0xdfff);
// Scans `len` UTF-16 code units, accumulating a length in `counter` and
// recording in `any_surrogates` whether any high or low surrogate was seen.
// The returned result carries error_code::SURROGATE when a surrogate was
// seen and error_code::SUCCESS otherwise.
// NOTE(review): the counting done inside the surrogate branches, the
// declaration of `counter` and the second member of the returned result are
// not part of this view; confirm against the complete source.
2229template <endianness big_endian>
2230simdutf_constexpr23 result
2231utf8_length_from_utf16_with_replacement(
const char16_t *p,
size_t len) {
2232 bool any_surrogates =
false;
2235 for (
size_t i = 0; i < len; i++) {
2236 if (is_high_surrogate<big_endian>(p[i])) {
2237 any_surrogates =
true;
// A high surrogate directly followed by a low surrogate.
2239 if (i + 1 < len && is_low_surrogate<big_endian>(p[i + 1])) {
2246 }
else if (is_low_surrogate<big_endian>(p[i])) {
2247 any_surrogates =
true;
2251 char16_t word = !match_system(big_endian) ? u16_swap_bytes(p[i]) : p[i];
// Boolean tests converted to size_t: one extra for units above 0x7F, one more
// for units above 0x7FF.
2254 static_cast<size_t>(word > 0x7F);
2255 counter +=
static_cast<size_t>(word > 0x7FF);
2257 return {any_surrogates ? error_code::SURROGATE : error_code::SUCCESS,
// Returns 0xFFFD, byte-swapped when match_system(big_endian) does not hold.
2262template <endianness big_endian>
constexpr char16_t replacement() {
2263 return !match_system(big_endian) ? scalar::u16_swap_bytes(0xfffd) : 0xfffd;
// Copies UTF-16 code units from `input` to `output`, overwriting surrogates
// that are not part of a high+low pair with utf16::replacement<big_endian>():
//   - a high surrogate not followed by a low one: output[i - 1] is replaced;
//   - a low surrogate not preceded by a high one: output[i] is replaced;
//   - a high surrogate at the very end: output[i - 1] is replaced after the
//     loop.
// NOTE(review): the end of the parameter list (where `output` is declared),
// the declaration of `i`, and the `else` joining the low-surrogate branch to
// the plain copy are not part of this view.
2266template <endianness big_endian>
2267simdutf_constexpr23
void to_well_formed_utf16(
const char16_t *input,
size_t len,
2269 const char16_t replacement = utf16::replacement<big_endian>();
2270 bool high_surrogate_prev =
false, high_surrogate, low_surrogate;
2272 for (; i < len; i++) {
2273 char16_t c = input[i];
2274 high_surrogate = is_high_surrogate<big_endian>(c);
2275 low_surrogate = is_low_surrogate<big_endian>(c);
// The previous unit was a high surrogate that this unit fails to complete.
2276 if (high_surrogate_prev && !low_surrogate) {
2277 output[i - 1] = replacement;
// A low surrogate with no high surrogate before it.
2280 if (!high_surrogate_prev && low_surrogate) {
2281 output[i] = replacement;
2283 output[i] = input[i];
2285 high_surrogate_prev = high_surrogate;
// The input ended on a high surrogate.
2289 if (high_surrogate_prev) {
2290 output[i - 1] = replacement;
2301#ifndef SIMDUTF_UTF16_TO_LATIN1_H
2302#define SIMDUTF_UTF16_TO_LATIN1_H
2309namespace utf16_to_latin1 {
// Writes the low byte of each UTF-16 code unit (byte-swapped first when
// match_system(big_endian) does not hold) through `latin_output`, then tests
// `too_large` against 0xFF00 once, after the writes.
// The visible return is the number of elements written.
// NOTE(review): the loop header, the statement updating `too_large`, and the
// action taken when the 0xFF00 test fires are not part of this view.
2311template <endianness big_endian,
typename InputPtr,
typename OutputPtr>
2312#if SIMDUTF_CPLUSPLUS20
2313 requires(simdutf::detail::indexes_into_utf16<InputPtr> &&
2314 simdutf::detail::index_assignable_from_char<OutputPtr>)
2316simdutf_constexpr23
size_t convert(InputPtr data,
size_t len,
2317 OutputPtr latin_output) {
2322 const auto latin_output_start = latin_output;
2324 uint16_t too_large = 0;
2327 word = !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
2329 *latin_output++ = char(word & 0xFF);
// Non-zero high byte: something does not fit in 8 bits.
2332 if ((too_large & 0xFF00) != 0) {
2336 return latin_output - latin_output_start;
// Converts UTF-16 code units to 8-bit output, stopping at the first unit
// whose high byte is non-zero.
// Visible returns:
//   - result(error_code::SUCCESS, 0) early (its guarding condition is not
//     visible);
//   - result(error_code::TOO_LARGE, pos) from the per-unit path;
//   - result(error_code::SUCCESS, <elements written>) at the end.
// A fast path handles 16 units at a time when all of their high bytes are
// zero.
// NOTE(review): the loop header, the preprocessor branches and the position
// updates are not part of this view.
2339template <endianness big_endian,
typename InputPtr,
typename OutputPtr>
2340#if SIMDUTF_CPLUSPLUS20
2341 requires(simdutf::detail::indexes_into_utf16<InputPtr> &&
2342 simdutf::detail::index_assignable_from_char<OutputPtr>)
2344simdutf_constexpr23 result convert_with_errors(InputPtr data,
size_t len,
2345 OutputPtr latin_output) {
2347 return result(error_code::SUCCESS, 0);
2350 auto start = latin_output;
2354#if SIMDUTF_CPLUSPLUS23
2358 if (pos + 16 <= len) {
// Four 8-byte loads, taken 4 elements apart.
2360 uint64_t v1, v2, v3, v4;
2361 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
2362 ::memcpy(&v2, data + pos + 4,
sizeof(uint64_t));
2363 ::memcpy(&v3, data + pos + 8,
sizeof(uint64_t));
2364 ::memcpy(&v4, data + pos + 12,
sizeof(uint64_t));
// For the non-native byte order each word is rotated right by 8 bits before
// the mask test below.
2366 if simdutf_constexpr (!match_system(big_endian)) {
2367 v1 = (v1 >> 8) | (v1 << (64 - 8));
2369 if simdutf_constexpr (!match_system(big_endian)) {
2370 v2 = (v2 >> 8) | (v2 << (64 - 8));
2372 if simdutf_constexpr (!match_system(big_endian)) {
2373 v3 = (v3 >> 8) | (v3 << (64 - 8));
2375 if simdutf_constexpr (!match_system(big_endian)) {
2376 v4 = (v4 >> 8) | (v4 << (64 - 8));
// 0xFF00 in every 16-bit lane: true when no lane has a non-zero high byte.
2379 if (((v1 | v2 | v3 | v4) & 0xFF00FF00FF00FF00) == 0) {
2380 size_t final_pos = pos + 16;
2381 while (pos < final_pos) {
2382 *latin_output++ = !match_system(big_endian)
2383 ? char(u16_swap_bytes(data[pos]))
// Per-unit path.
2392 word = !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
2393 if ((word & 0xFF00) == 0) {
2394 *latin_output++ = char(word & 0xFF);
2397 return result(error_code::TOO_LARGE, pos);
2400 return result(error_code::SUCCESS, latin_output - start);
2411#ifndef SIMDUTF_VALID_UTF16_TO_LATIN1_H
2412#define SIMDUTF_VALID_UTF16_TO_LATIN1_H
2417namespace utf16_to_latin1 {
// Narrows 16-bit code units read through `data` to `char` and writes them
// through `latin_output`, byte-swapping first when match_system(big_endian)
// does not hold. No range check on `word` is visible here.
// `*data` is required to decay to uint16_t (compile-time check).
// Returns the number of elements written.
// NOTE(review): the loop header and position updates are not part of this
// view.
2419template <endianness big_endian,
class InputIterator,
class OutputIterator>
2420simdutf_constexpr23
inline size_t
2421convert_valid_impl(InputIterator data,
size_t len,
2422 OutputIterator latin_output) {
2424 std::is_same<
typename std::decay<
decltype(*data)>::type, uint16_t>::value,
2425 "must decay to uint16_t");
2427 const auto start = latin_output;
2431 word = !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
2432 *latin_output++ = char(word);
2436 return latin_output - start;
// Pointer overload: reinterprets `buf` as `const uint16_t *` and forwards to
// convert_valid_impl<big_endian>.
// NOTE(review): the trailing arguments of the forwarded call are not part of
// this view.
2439template <endianness big_endian>
2440simdutf_really_inline
size_t convert_valid(
const char16_t *buf,
size_t len,
2441 char *latin_output) {
2442 return convert_valid_impl<big_endian>(
reinterpret_cast<const uint16_t *
>(buf),
2453#ifndef SIMDUTF_UTF16_TO_UTF32_H
2454#define SIMDUTF_UTF16_TO_UTF32_H
2459namespace utf16_to_utf32 {
// Converts UTF-16 code units to char32_t values:
//   - a unit outside 0xD800..0xDFFF is stored as-is;
//   - otherwise the unit and the one after it are combined as
//     (diff << 10) + diff2 + 0x10000, where diff / diff2 are the offsets from
//     0xD800 / 0xDC00.
// Returns the number of char32_t values written.
// NOTE(review): the loop header, the test applied to `diff`, and the
// statements executed when a check fails are not part of this view.
2461template <endianness big_endian>
2462simdutf_constexpr23
size_t convert(
const char16_t *data,
size_t len,
2463 char32_t *utf32_output) {
2465 char32_t *start{utf32_output};
2468 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
// Mask 0xF800 != 0xD800: the unit is not a surrogate.
2469 if ((word & 0xF800) != 0xD800) {
2471 *utf32_output++ = char32_t(word);
2475 uint16_t diff = uint16_t(word - 0xD800);
// No code unit follows this one.
2479 if (pos + 1 >= len) {
2482 uint16_t next_word = !match_system(big_endian)
2483 ? u16_swap_bytes(data[pos + 1])
// Values above 0x3FF mean the following unit is outside 0xDC00..0xDFFF.
2485 uint16_t diff2 = uint16_t(next_word - 0xDC00);
2486 if (diff2 > 0x3FF) {
2489 uint32_t value = (diff << 10) + diff2 + 0x10000;
2490 *utf32_output++ = char32_t(value);
2494 return utf32_output - start;
// Converts UTF-16 code units to char32_t values, reporting the first invalid
// surrogate.
// Visible returns:
//   - result(error_code::SURROGATE, pos) after the `diff` computation (its
//     guarding test is not visible), when a surrogate is the last unit, or
//     when the following unit is outside 0xDC00..0xDFFF;
//   - result(error_code::SUCCESS, <char32_t values written>) at the end.
// NOTE(review): the loop header and position updates are not part of this
// view.
2497template <endianness big_endian>
2498simdutf_constexpr23 result convert_with_errors(
const char16_t *data,
size_t len,
2499 char32_t *utf32_output) {
2501 char32_t *start{utf32_output};
2504 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
// Mask 0xF800 != 0xD800: the unit is not a surrogate and is stored as-is.
2505 if ((word & 0xF800) != 0xD800) {
2507 *utf32_output++ = char32_t(word);
2511 uint16_t diff = uint16_t(word - 0xD800);
2513 return result(error_code::SURROGATE, pos);
2515 if (pos + 1 >= len) {
2516 return result(error_code::SURROGATE, pos);
2518 uint16_t next_word = !match_system(big_endian)
2519 ? u16_swap_bytes(data[pos + 1])
2521 uint16_t diff2 = uint16_t(next_word - 0xDC00);
2522 if (diff2 > 0x3FF) {
2523 return result(error_code::SURROGATE, pos);
// Combine the two 10-bit offsets and add back the 0x10000 base.
2525 uint32_t value = (diff << 10) + diff2 + 0x10000;
2526 *utf32_output++ = char32_t(value);
2530 return result(error_code::SUCCESS, utf32_output - start);
2541#ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H
2542#define SIMDUTF_VALID_UTF16_TO_UTF32_H
2547namespace utf16_to_utf32 {
// Converts UTF-16 code units to char32_t values. The visible code does not
// range-check `diff` or `diff2`; the only visible guard is that a surrogate
// has a following unit (pos + 1 < len).
// Returns the number of char32_t values written.
// NOTE(review): the loop header and the statement executed when no following
// unit exists are not part of this view.
2549template <endianness big_endian>
2550simdutf_constexpr23
size_t convert_valid(
const char16_t *data,
size_t len,
2551 char32_t *utf32_output) {
2553 char32_t *start{utf32_output};
2556 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
// Mask 0xF800 != 0xD800: the unit is not a surrogate and is stored as-is.
2557 if ((word & 0xF800) != 0xD800) {
2559 *utf32_output++ = char32_t(word);
2563 uint16_t diff = uint16_t(word - 0xD800);
2564 if (pos + 1 >= len) {
2567 uint16_t next_word = !match_system(big_endian)
2568 ? u16_swap_bytes(data[pos + 1])
2570 uint16_t diff2 = uint16_t(next_word - 0xDC00);
// Combine the two offsets and add back the 0x10000 base.
2571 uint32_t value = (diff << 10) + diff2 + 0x10000;
2572 *utf32_output++ = char32_t(value);
2576 return utf32_output - start;
2587#ifndef SIMDUTF_UTF16_TO_UTF8_H
2588#define SIMDUTF_UTF16_TO_UTF8_H
2593namespace utf16_to_utf8 {
// Encodes UTF-16 code units as UTF-8 written through `utf8_output`:
//   - (word & 0xFF80) == 0          -> 1 byte
//   - (word & 0xF800) == 0          -> 2 bytes
//   - not in 0xD800..0xDFFF         -> 3 bytes
//   - otherwise (surrogate + next)  -> 4 bytes, from
//                                      (diff << 10) + diff2 + 0x10000
// A fast path copies 4 units at once when all four have (unit & 0xFF80) == 0.
// Returns the number of output elements written.
// NOTE(review): the loop header, the test applied to `diff`, the statements
// executed when a check fails, and the preprocessor branches are not part of
// this view.
2595template <endianness big_endian,
typename InputPtr,
typename OutputPtr>
2596#if SIMDUTF_CPLUSPLUS20
2597 requires simdutf::detail::indexes_into_utf16<InputPtr>
2600simdutf_constexpr23
size_t convert(InputPtr data,
size_t len,
2601 OutputPtr utf8_output) {
2603 const auto start = utf8_output;
2605#if SIMDUTF_CPLUSPLUS23
2610 if (pos + 4 <= len) {
2613 ::memcpy(&v, data + pos,
sizeof(uint64_t));
// For the non-native byte order the word is rotated right by 8 bits before
// the mask test below.
2614 if simdutf_constexpr (!match_system(big_endian)) {
2615 v = (v >> 8) | (v << (64 - 8));
// 0xFF80 in every 16-bit lane: true when all four units are below 0x80.
2617 if ((v & 0xFF80FF80FF80FF80) == 0) {
2618 size_t final_pos = pos + 4;
2619 while (pos < final_pos) {
2620 *utf8_output++ = !match_system(big_endian)
2621 ? char(u16_swap_bytes(data[pos]))
2630 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
2631 if ((word & 0xFF80) == 0) {
2633 *utf8_output++ = char(word);
2635 }
else if ((word & 0xF800) == 0) {
// 110xxxxx 10xxxxxx
2638 *utf8_output++ = char((word >> 6) | 0b11000000);
2639 *utf8_output++ = char((word & 0b111111) | 0b10000000);
2641 }
else if ((word & 0xF800) != 0xD800) {
// 1110xxxx 10xxxxxx 10xxxxxx
2644 *utf8_output++ = char((word >> 12) | 0b11100000);
2645 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
2646 *utf8_output++ = char((word & 0b111111) | 0b10000000);
// Surrogate: a following unit is required.
2650 if (pos + 1 >= len) {
2653 uint16_t diff = uint16_t(word - 0xD800);
2657 uint16_t next_word = !match_system(big_endian)
2658 ? u16_swap_bytes(data[pos + 1])
2660 uint16_t diff2 = uint16_t(next_word - 0xDC00);
2661 if (diff2 > 0x3FF) {
2664 uint32_t value = (diff << 10) + diff2 + 0x10000;
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2667 *utf8_output++ = char((value >> 18) | 0b11110000);
2668 *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
2669 *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
2670 *utf8_output++ = char((value & 0b111111) | 0b10000000);
2674 return utf8_output - start;
// Encodes UTF-16 code units as UTF-8 and returns a full_result built from
// (error code, input position, number of output elements written).
//
// check_output: when true, each write group is preceded by a capacity test
//   against `end` (= utf8_output + utf8_len); if fewer than the 1/2/3/4
//   required elements remain, the function returns
//   full_result(OUTPUT_BUFFER_TOO_SMALL, pos, written). With check_output and
//   utf8_len == 0 it returns full_result(OUTPUT_BUFFER_TOO_SMALL, 0, 0)
//   immediately.
// Surrogate problems return full_result(SURROGATE, pos, written): surrogate
//   at end of input, after the `diff` computation (its guarding test is not
//   visible), or a following unit outside 0xDC00..0xDFFF.
// On completion: full_result(SUCCESS, pos, written).
// NOTE(review): the end of the template parameter list, the loop header,
// position updates and preprocessor branches are not part of this view.
2677template <endianness big_endian,
bool check_output =
false,
typename InputPtr,
2679#if SIMDUTF_CPLUSPLUS20
2680 requires(simdutf::detail::indexes_into_utf16<InputPtr> &&
2681 simdutf::detail::index_assignable_from_char<OutputPtr>)
2683simdutf_constexpr23 full_result convert_with_errors(InputPtr data,
size_t len,
2684 OutputPtr utf8_output,
2685 size_t utf8_len = 0) {
2686 if (check_output && utf8_len == 0) {
2687 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, 0, 0);
2691 auto start = utf8_output;
2692 auto end = utf8_output + utf8_len;
2695#if SIMDUTF_CPLUSPLUS23
// Fast path: 4 units at once when all are below 0x80.
2700 if (pos + 4 <= len) {
2703 ::memcpy(&v, data + pos,
sizeof(uint64_t));
2704 if simdutf_constexpr (!match_system(big_endian))
2705 v = (v >> 8) | (v << (64 - 8));
2706 if ((v & 0xFF80FF80FF80FF80) == 0) {
2707 size_t final_pos = pos + 4;
2708 while (pos < final_pos) {
// Capacity is still checked per element inside the fast path.
2709 if (check_output &&
size_t(end - utf8_output) < 1) {
2710 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
2711 utf8_output - start);
2713 *utf8_output++ = !match_system(big_endian)
2714 ? char(u16_swap_bytes(data[pos]))
2724 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
// 1-byte case.
2725 if ((word & 0xFF80) == 0) {
2727 if (check_output &&
size_t(end - utf8_output) < 1) {
2728 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
2729 utf8_output - start);
2731 *utf8_output++ = char(word);
2733 }
else if ((word & 0xF800) == 0) {
// 2-byte case: 110xxxxx 10xxxxxx
2736 if (check_output &&
size_t(end - utf8_output) < 2) {
2737 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
2738 utf8_output - start);
2740 *utf8_output++ = char((word >> 6) | 0b11000000);
2741 *utf8_output++ = char((word & 0b111111) | 0b10000000);
2744 }
else if ((word & 0xF800) != 0xD800) {
// 3-byte case: 1110xxxx 10xxxxxx 10xxxxxx
2747 if (check_output &&
size_t(end - utf8_output) < 3) {
2748 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
2749 utf8_output - start);
2751 *utf8_output++ = char((word >> 12) | 0b11100000);
2752 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
2753 *utf8_output++ = char((word & 0b111111) | 0b10000000);
// Surrogate case: the capacity test for 4 elements comes before the
// surrogate validity tests.
2757 if (check_output &&
size_t(end - utf8_output) < 4) {
2758 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
2759 utf8_output - start);
2762 if (pos + 1 >= len) {
2763 return full_result(error_code::SURROGATE, pos, utf8_output - start);
2765 uint16_t diff = uint16_t(word - 0xD800);
2767 return full_result(error_code::SURROGATE, pos, utf8_output - start);
2769 uint16_t next_word = !match_system(big_endian)
2770 ? u16_swap_bytes(data[pos + 1])
2772 uint16_t diff2 = uint16_t(next_word - 0xDC00);
2773 if (diff2 > 0x3FF) {
2774 return full_result(error_code::SURROGATE, pos, utf8_output - start);
2776 uint32_t value = (diff << 10) + diff2 + 0x10000;
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2779 *utf8_output++ = char((value >> 18) | 0b11110000);
2780 *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
2781 *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
2782 *utf8_output++ = char((value & 0b111111) | 0b10000000);
2786 return full_result(error_code::SUCCESS, pos, utf8_output - start);
// Convenience wrapper: calls convert_with_errors<big_endian, false> (output
// capacity checking disabled, utf8_len passed as 0) and returns its value as
// a `result`.
// NOTE(review): how the callee's full_result becomes a `result` is defined
// elsewhere; confirm which fields are kept.
2789template <endianness big_endian>
2790inline result simple_convert_with_errors(
const char16_t *buf,
size_t len,
2791 char *utf8_output) {
2792 return convert_with_errors<big_endian, false>(buf, len, utf8_output, 0);
2803#ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H
2804#define SIMDUTF_VALID_UTF16_TO_UTF8_H
2809namespace utf16_to_utf8 {
// Encodes UTF-16 code units as UTF-8 written through `utf8_output`. The
// visible code does not range-check `diff` or `diff2`; the only visible guard
// in the surrogate branch is that a following unit exists.
//   - (word & 0xFF80) == 0   -> 1 byte
//   - (word & 0xF800) == 0   -> 2 bytes
//   - not in 0xD800..0xDFFF  -> 3 bytes
//   - otherwise              -> 4 bytes, from (diff << 10) + diff2 + 0x10000
// Returns the number of output elements written.
// NOTE(review): the loop header, the statement executed when no following
// unit exists, and the preprocessor branches are not part of this view.
2811template <endianness big_endian,
typename InputPtr,
typename OutputPtr>
2812#if SIMDUTF_CPLUSPLUS20
2813 requires(simdutf::detail::indexes_into_utf16<InputPtr> &&
2814 simdutf::detail::index_assignable_from_char<OutputPtr>)
2816simdutf_constexpr23
size_t convert_valid(InputPtr data,
size_t len,
2817 OutputPtr utf8_output) {
2819 auto start = utf8_output;
2821#if SIMDUTF_CPLUSPLUS23
// Fast path: 4 units at once when all are below 0x80.
2826 if (pos + 4 <= len) {
2829 ::memcpy(&v, data + pos,
sizeof(uint64_t));
2830 if simdutf_constexpr (!match_system(big_endian)) {
2831 v = (v >> 8) | (v << (64 - 8));
2833 if ((v & 0xFF80FF80FF80FF80) == 0) {
2834 size_t final_pos = pos + 4;
2835 while (pos < final_pos) {
2836 *utf8_output++ = !match_system(big_endian)
2837 ? char(u16_swap_bytes(data[pos]))
2847 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
2848 if ((word & 0xFF80) == 0) {
2850 *utf8_output++ = char(word);
2852 }
else if ((word & 0xF800) == 0) {
// 110xxxxx 10xxxxxx
2855 *utf8_output++ = char((word >> 6) | 0b11000000);
2856 *utf8_output++ = char((word & 0b111111) | 0b10000000);
2858 }
else if ((word & 0xF800) != 0xD800) {
// 1110xxxx 10xxxxxx 10xxxxxx
2861 *utf8_output++ = char((word >> 12) | 0b11100000);
2862 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
2863 *utf8_output++ = char((word & 0b111111) | 0b10000000);
2867 uint16_t diff = uint16_t(word - 0xD800);
2868 if (pos + 1 >= len) {
2871 uint16_t next_word = !match_system(big_endian)
2872 ? u16_swap_bytes(data[pos + 1])
2874 uint16_t diff2 = uint16_t(next_word - 0xDC00);
2875 uint32_t value = (diff << 10) + diff2 + 0x10000;
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2878 *utf8_output++ = char((value >> 18) | 0b11110000);
2879 *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
2880 *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
2881 *utf8_output++ = char((value & 0b111111) | 0b10000000);
2885 return utf8_output - start;
2896#ifndef SIMDUTF_UTF32_H
2897#define SIMDUTF_UTF32_H
// Scans `len` 32-bit values read through `data`, testing each for being above
// 0x10FFFF or inside 0xD800..0xDFFF.
// NOTE(review): the declaration of `pos` and the return statements are not
// part of this view; presumably a failing test makes the function return
// false -- confirm against the complete source.
2903template <
typename InputPtr>
2904#if SIMDUTF_CPLUSPLUS20
2905 requires simdutf::detail::indexes_into_uint32<InputPtr>
2907simdutf_warn_unused simdutf_constexpr23
bool validate(InputPtr data,
2908 size_t len)
noexcept {
2910 for (; pos < len; pos++) {
2911 uint32_t word = data[pos];
2912 if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
// Pointer overload: reinterprets `buf` as `const uint32_t *` and forwards to
// validate(data, len).
2919simdutf_warn_unused simdutf_really_inline
bool validate(
const char32_t *buf,
2920 size_t len)
noexcept {
2921 return validate(
reinterpret_cast<const uint32_t *
>(buf), len);
// Scans `len` 32-bit values read through `data` and returns at the first
// offending one:
//   - result(error_code::TOO_LARGE, pos) when the value exceeds 0x10FFFF;
//   - result(error_code::SURROGATE, pos) when it lies in 0xD800..0xDFFF.
// The too-large test comes first. If nothing offends, returns
// result(error_code::SUCCESS, pos) with the final loop position.
// NOTE(review): the declaration of `pos` is not part of this view.
2924template <
typename InputPtr>
2925#if SIMDUTF_CPLUSPLUS20
2926 requires simdutf::detail::indexes_into_uint32<InputPtr>
2928simdutf_warn_unused simdutf_constexpr23 result
2929validate_with_errors(InputPtr data,
size_t len)
noexcept {
2931 for (; pos < len; pos++) {
2932 uint32_t word = data[pos];
2933 if (word > 0x10FFFF) {
2934 return result(error_code::TOO_LARGE, pos);
2936 if (word >= 0xD800 && word <= 0xDFFF) {
2937 return result(error_code::SURROGATE, pos);
2940 return result(error_code::SUCCESS, pos);
// Pointer overload: reinterprets `buf` as `const uint32_t *` and forwards to
// validate_with_errors(data, len).
2943simdutf_warn_unused simdutf_really_inline result
2944validate_with_errors(
const char32_t *buf,
size_t len)
noexcept {
2945 return validate_with_errors(
reinterpret_cast<const uint32_t *
>(buf), len);
// Accumulates into `counter`, for each value of `p`, one for each threshold
// it exceeds among 0x7F, 0x7FF and 0xFFFF.
// NOTE(review): the rest of the parameter list, the declaration of `counter`,
// any additional per-element increment, and the return statement are not part
// of this view.
2948inline simdutf_constexpr23
size_t utf8_length_from_utf32(
const char32_t *p,
2952 for (
size_t i = 0; i < len; i++) {
// Each comparison yields 0 or 1 and is added without branching.
2955 counter +=
static_cast<size_t>(p[i] > 0x7F);
2956 counter +=
static_cast<size_t>(p[i] > 0x7FF);
2957 counter +=
static_cast<size_t>(p[i] > 0xFFFF);
// Accumulates into `counter`, for each value of `p[0..len)`, one when the
// value exceeds 0xFFFF.
// NOTE(review): the declaration of `counter`, any additional per-element
// increment, and the return statement are not part of this view.
2962inline simdutf_warn_unused simdutf_constexpr23
size_t
2963utf16_length_from_utf32(
const char32_t *p,
size_t len) {
2966 for (
size_t i = 0; i < len; i++) {
2968 counter +=
static_cast<size_t>(p[i] > 0xFFFF);
2980#ifndef SIMDUTF_UTF32_TO_LATIN1_H
2981#define SIMDUTF_UTF32_TO_LATIN1_H
2986namespace utf32_to_latin1 {
// Writes the low 8 bits of every input value to `latin1_output` while OR-ing
// all values into `too_large`; after the writes, `too_large` is tested
// against 0xFFFFFF00 once.
// The visible return is the number of chars written.
// NOTE(review): the loop header and the action taken when the 0xFFFFFF00
// test fires are not part of this view.
2988inline simdutf_constexpr23
size_t convert(
const char32_t *data,
size_t len,
2989 char *latin1_output) {
2990 char *start = latin1_output;
2991 uint32_t utf32_char;
2993 uint32_t too_large = 0;
2996 utf32_char = (uint32_t)data[pos];
// Accumulate all bits seen so a single test after the loop suffices.
2997 too_large |= utf32_char;
2998 *latin1_output++ = (char)(utf32_char & 0xFF);
// Any bit above the low 8 set: some value did not fit in one byte.
3001 if ((too_large & 0xFFFFFF00) != 0) {
3004 return latin1_output - start;
// Converts 32-bit values to single chars in `latin1_output`, stopping at the
// first value that does not fit in 8 bits.
// Visible returns: result(error_code::TOO_LARGE, pos) from the per-value
// path, and result(error_code::SUCCESS, <chars written>) at the end.
// A fast path handles two values at once when both fit in 8 bits.
// NOTE(review): the rest of the parameter list, the loop header, the
// right-hand side of the per-value comparison and the preprocessor branches
// are not part of this view.
3007inline simdutf_constexpr23 result convert_with_errors(
const char32_t *data,
3009 char *latin1_output) {
3010 char *start{latin1_output};
3013#if SIMDUTF_CPLUSPLUS23
3017 if (pos + 2 <= len) {
3020 ::memcpy(&v, data + pos,
sizeof(uint64_t));
// 0xFFFFFF00 in both 32-bit lanes: true when both values are below 0x100.
3021 if ((v & 0xFFFFFF00FFFFFF00) == 0) {
3022 *latin1_output++ = char(data[pos]);
3023 *latin1_output++ = char(data[pos + 1]);
3030 uint32_t utf32_char = data[pos];
3031 if ((utf32_char & 0xFFFFFF00) ==
3033 *latin1_output++ = (char)(utf32_char & 0xFF);
3036 return result(error_code::TOO_LARGE, pos);
3039 return result(error_code::SUCCESS, latin1_output - start);
3050#ifndef SIMDUTF_VALID_UTF32_TO_LATIN1_H
3051#define SIMDUTF_VALID_UTF32_TO_LATIN1_H
3056namespace utf32_to_latin1 {
// Narrows 32-bit values read through `data` to chars written through
// `latin1_output`. `*data` must decay to uint32_t (compile-time check).
// A fast path handles two values at once when both are below 0x100; the
// per-value path writes a value when (utf32_char & 0xFFFFFF00) == 0.
// Returns the number of elements written.
// NOTE(review): the loop header, the preprocessor branches (two
// SIMDUTF_CPLUSPLUS23 conditionals) and what happens when the per-value test
// fails are not part of this view; confirm against the complete source.
3058template <
typename ReadPtr,
typename WritePtr>
3059simdutf_constexpr23
size_t convert_valid(ReadPtr data,
size_t len,
3060 WritePtr latin1_output) {
3062 std::is_same<
typename std::decay<
decltype(*data)>::type, uint32_t>::value,
3063 "dereferencing the data pointer must result in a uint32_t");
3064 auto start = latin1_output;
3065 uint32_t utf32_char;
3069 utf32_char = data[pos];
3071#if SIMDUTF_CPLUSPLUS23
3077 if (pos + 2 <= len) {
3080 std::memcpy(&v, data + pos,
sizeof(uint64_t));
// 0xFFFFFF00 in both 32-bit lanes: true when both values are below 0x100.
3081 if ((v & 0xFFFFFF00FFFFFF00) == 0) {
3082 *latin1_output++ = char(data[pos]);
3083 *latin1_output++ = char(data[pos + 1]);
3091#if SIMDUTF_CPLUSPLUS23
3094 if ((utf32_char & 0xFFFFFF00) == 0) {
3095 *latin1_output++ = char(utf32_char);
3102 return latin1_output - start;
// Pointer overload: reinterprets `buf` as `const uint32_t *` and forwards to
// another convert_valid(...) overload.
// NOTE(review): the trailing arguments of the forwarded call are not part of
// this view.
3105simdutf_really_inline
size_t convert_valid(
const char32_t *buf,
size_t len,
3106 char *latin1_output) {
3107 return convert_valid(
reinterpret_cast<const uint32_t *
>(buf), len,
3119#ifndef SIMDUTF_UTF32_TO_UTF16_H
3120#define SIMDUTF_UTF32_TO_UTF16_H
3125namespace utf32_to_utf16 {
// Converts 32-bit values to UTF-16 code units in `utf16_output`:
//   - values with no bits above the low 16 are written as a single unit,
//     after a test for the range 0xD800..0xDFFF;
//   - other values are tested against 0x10FFFF and written as two units built
//     from 0xD800 + (word >> 10) and 0xDC00 + (word & 0x3FF).
// Units are byte-swapped when match_system(big_endian) does not hold.
// Returns the number of char16_t written.
// NOTE(review): the loop header, the statements executed when a range test
// fires, and any adjustment of `word` before the surrogate computation
// (presumably a subtraction of 0x10000) are not part of this view.
3127template <endianness big_endian>
3128simdutf_constexpr23
size_t convert(
const char32_t *data,
size_t len,
3129 char16_t *utf16_output) {
3131 char16_t *start{utf16_output};
3133 uint32_t word = data[pos];
3134 if ((word & 0xFFFF0000) == 0) {
3135 if (word >= 0xD800 && word <= 0xDFFF) {
3139 *utf16_output++ = !match_system(big_endian)
3140 ? char16_t(u16_swap_bytes(uint16_t(word)))
3144 if (word > 0x10FFFF) {
3148 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
3149 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
3150 if simdutf_constexpr (!match_system(big_endian)) {
3151 high_surrogate = u16_swap_bytes(high_surrogate);
3152 low_surrogate = u16_swap_bytes(low_surrogate);
3154 *utf16_output++ = char16_t(high_surrogate);
3155 *utf16_output++ = char16_t(low_surrogate);
3159 return utf16_output - start;
// Converts 32-bit values to UTF-16 code units, reporting the first invalid
// value:
//   - result(error_code::SURROGATE, pos) for a value in 0xD800..0xDFFF;
//   - result(error_code::TOO_LARGE, pos) for a value above 0x10FFFF;
//   - result(error_code::SUCCESS, <char16_t written>) at the end.
// Units are byte-swapped when match_system(big_endian) does not hold.
// NOTE(review): the loop header and any adjustment of `word` before the
// surrogate computation (presumably a subtraction of 0x10000) are not part of
// this view.
3162template <endianness big_endian>
3163simdutf_constexpr23 result convert_with_errors(
const char32_t *data,
size_t len,
3164 char16_t *utf16_output) {
3166 char16_t *start{utf16_output};
3168 uint32_t word = data[pos];
// No bits above the low 16: the value is written as a single code unit.
3169 if ((word & 0xFFFF0000) == 0) {
3170 if (word >= 0xD800 && word <= 0xDFFF) {
3171 return result(error_code::SURROGATE, pos);
3174 *utf16_output++ = !match_system(big_endian)
3175 ? char16_t(u16_swap_bytes(uint16_t(word)))
3179 if (word > 0x10FFFF) {
3180 return result(error_code::TOO_LARGE, pos);
// Two-unit form.
3183 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
3184 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
3185 if simdutf_constexpr (!match_system(big_endian)) {
3186 high_surrogate = u16_swap_bytes(high_surrogate);
3187 low_surrogate = u16_swap_bytes(low_surrogate);
3189 *utf16_output++ = char16_t(high_surrogate);
3190 *utf16_output++ = char16_t(low_surrogate);
3194 return result(error_code::SUCCESS, utf16_output - start);
3205#ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H
3206#define SIMDUTF_VALID_UTF32_TO_UTF16_H
3211namespace utf32_to_utf16 {
// Converts 32-bit values to UTF-16 code units. No range test is visible here.
// Values with no bits above the low 16 become one unit, others become two
// units built from 0xD800 + (word >> 10) and 0xDC00 + (word & 0x3FF).
// Units are byte-swapped when match_system(big_endian) does not hold.
// Returns the number of char16_t written.
// NOTE(review): the loop header and any adjustment of `word` before the
// surrogate computation (presumably a subtraction of 0x10000) are not part of
// this view.
3213template <endianness big_endian>
3214simdutf_constexpr23
size_t convert_valid(
const char32_t *data,
size_t len,
3215 char16_t *utf16_output) {
3217 char16_t *start{utf16_output};
3219 uint32_t word = data[pos];
3220 if ((word & 0xFFFF0000) == 0) {
3222 *utf16_output++ = !match_system(big_endian)
3223 ? char16_t(u16_swap_bytes(uint16_t(word)))
3229 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
3230 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
3231 if simdutf_constexpr (!match_system(big_endian)) {
3232 high_surrogate = u16_swap_bytes(high_surrogate);
3233 low_surrogate = u16_swap_bytes(low_surrogate);
3235 *utf16_output++ = char16_t(high_surrogate);
3236 *utf16_output++ = char16_t(low_surrogate);
3240 return utf16_output - start;
3251#ifndef SIMDUTF_UTF32_TO_UTF8_H
3252#define SIMDUTF_UTF32_TO_UTF8_H
3257namespace utf32_to_utf8 {
// Encodes 32-bit values as UTF-8 written through `utf8_output`:
//   - (word & 0xFFFFFF80) == 0 -> 1 byte
//   - (word & 0xFFFFF800) == 0 -> 2 bytes
//   - (word & 0xFFFF0000) == 0 -> 3 bytes, after a test for 0xD800..0xDFFF
//   - otherwise                -> 4 bytes, after a test for > 0x10FFFF
// A fast path writes two values at once when both are below 0x80.
// Returns the number of output elements written.
// NOTE(review): the loop header, the statements executed when a range test
// fires, and the preprocessor branches are not part of this view.
3259template <
typename InputPtr,
typename OutputPtr>
3260#if SIMDUTF_CPLUSPLUS20
3261 requires(simdutf::detail::indexes_into_utf32<InputPtr> &&
3262 simdutf::detail::index_assignable_from_char<OutputPtr>)
3264simdutf_constexpr23
size_t convert(InputPtr data,
size_t len,
3265 OutputPtr utf8_output) {
3267 auto start = utf8_output;
3269#if SIMDUTF_CPLUSPLUS23
3273 if (pos + 2 <= len) {
3276 ::memcpy(&v, data + pos,
sizeof(uint64_t));
// 0xFFFFFF80 in both 32-bit lanes: true when both values are below 0x80.
3277 if ((v & 0xFFFFFF80FFFFFF80) == 0) {
3278 *utf8_output++ = char(data[pos]);
3279 *utf8_output++ = char(data[pos + 1]);
3286 uint32_t word = data[pos];
3287 if ((word & 0xFFFFFF80) == 0) {
3289 *utf8_output++ = char(word);
3291 }
else if ((word & 0xFFFFF800) == 0) {
// 110xxxxx 10xxxxxx
3294 *utf8_output++ = char((word >> 6) | 0b11000000);
3295 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3297 }
else if ((word & 0xFFFF0000) == 0) {
3300 if (word >= 0xD800 && word <= 0xDFFF) {
// 1110xxxx 10xxxxxx 10xxxxxx
3303 *utf8_output++ = char((word >> 12) | 0b11100000);
3304 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
3305 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3310 if (word > 0x10FFFF) {
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
3313 *utf8_output++ = char((word >> 18) | 0b11110000);
3314 *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
3315 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
3316 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3320 return utf8_output - start;
// Encodes 32-bit values as UTF-8, reporting the first invalid value:
//   - result(error_code::SURROGATE, pos) for a value in 0xD800..0xDFFF;
//   - result(error_code::TOO_LARGE, pos) for a value above 0x10FFFF;
//   - result(error_code::SUCCESS, <output elements written>) at the end.
// Encoding widths follow the masks 0xFFFFFF80 / 0xFFFFF800 / 0xFFFF0000
// (1 / 2 / 3 bytes, else 4). A fast path writes two values at once when both
// are below 0x80.
// NOTE(review): the loop header, position updates and preprocessor branches
// are not part of this view.
3323template <
typename InputPtr,
typename OutputPtr>
3324#if SIMDUTF_CPLUSPLUS20
3325 requires(simdutf::detail::indexes_into_utf32<InputPtr> &&
3326 simdutf::detail::index_assignable_from_char<OutputPtr>)
3328simdutf_constexpr23 result convert_with_errors(InputPtr data,
size_t len,
3329 OutputPtr utf8_output) {
3331 auto start = utf8_output;
3333#if SIMDUTF_CPLUSPLUS23
3337 if (pos + 2 <= len) {
3340 ::memcpy(&v, data + pos,
sizeof(uint64_t));
// 0xFFFFFF80 in both 32-bit lanes: true when both values are below 0x80.
3341 if ((v & 0xFFFFFF80FFFFFF80) == 0) {
3342 *utf8_output++ = char(data[pos]);
3343 *utf8_output++ = char(data[pos + 1]);
3350 uint32_t word = data[pos];
3351 if ((word & 0xFFFFFF80) == 0) {
3353 *utf8_output++ = char(word);
3355 }
else if ((word & 0xFFFFF800) == 0) {
// 110xxxxx 10xxxxxx
3358 *utf8_output++ = char((word >> 6) | 0b11000000);
3359 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3361 }
else if ((word & 0xFFFF0000) == 0) {
3364 if (word >= 0xD800 && word <= 0xDFFF) {
3365 return result(error_code::SURROGATE, pos);
// 1110xxxx 10xxxxxx 10xxxxxx
3367 *utf8_output++ = char((word >> 12) | 0b11100000);
3368 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
3369 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3374 if (word > 0x10FFFF) {
3375 return result(error_code::TOO_LARGE, pos);
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
3377 *utf8_output++ = char((word >> 18) | 0b11110000);
3378 *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
3379 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
3380 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3384 return result(error_code::SUCCESS, utf8_output - start);
3395#ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H
3396#define SIMDUTF_VALID_UTF32_TO_UTF8_H
3401namespace utf32_to_utf8 {
// Encodes 32-bit values as UTF-8 written through `utf8_output`. No surrogate
// or upper-bound test is visible here.
// Encoding widths follow the masks 0xFFFFFF80 / 0xFFFFF800 / 0xFFFF0000
// (1 / 2 / 3 bytes, else 4). A fast path writes two values at once when both
// are below 0x80.
// Returns the number of output elements written.
// NOTE(review): the loop header, position updates and preprocessor branches
// are not part of this view.
3403template <
typename InputPtr,
typename OutputPtr>
3404#if SIMDUTF_CPLUSPLUS20
3405 requires(simdutf::detail::indexes_into_utf32<InputPtr> &&
3406 simdutf::detail::index_assignable_from_char<OutputPtr>)
3408simdutf_constexpr23
size_t convert_valid(InputPtr data,
size_t len,
3409 OutputPtr utf8_output) {
3411 auto start = utf8_output;
3413#if SIMDUTF_CPLUSPLUS23
3417 if (pos + 2 <= len) {
3420 ::memcpy(&v, data + pos,
sizeof(uint64_t));
// 0xFFFFFF80 in both 32-bit lanes: true when both values are below 0x80.
3421 if ((v & 0xFFFFFF80FFFFFF80) == 0) {
3422 *utf8_output++ = char(data[pos]);
3423 *utf8_output++ = char(data[pos + 1]);
3430 uint32_t word = data[pos];
3431 if ((word & 0xFFFFFF80) == 0) {
3433 *utf8_output++ = char(word);
3435 }
else if ((word & 0xFFFFF800) == 0) {
// 110xxxxx 10xxxxxx
3438 *utf8_output++ = char((word >> 6) | 0b11000000);
3439 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3441 }
else if ((word & 0xFFFF0000) == 0) {
// 1110xxxx 10xxxxxx 10xxxxxx
3444 *utf8_output++ = char((word >> 12) | 0b11100000);
3445 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
3446 *utf8_output++ = char((word & 0b111111) | 0b10000000);
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
3451 *utf8_output++ = char((word >> 18) | 0b11110000);
3452 *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
3453 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
3454 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3458 return utf8_output - start;
3469#ifndef SIMDUTF_UTF8_H
3470#define SIMDUTF_UTF8_H
// Checks that `len` bytes read through `data` form well-formed UTF-8.
// `*data` must decay to uint8_t (compile-time check).
// Visible checks, by leading-byte pattern:
//   - 110xxxxx: one continuation byte (10xxxxxx); decoded value must lie in
//     0x80..0x7FF;
//   - 1110xxxx: two continuation bytes; decoded value must lie in
//     0x800..0xFFFF and outside 0xD800..0xDFFF;
//   - 11110xxx: three continuation bytes; decoded value must lie in
//     0x10000..0x10FFFF.
// Each multi-byte branch first tests `next_pos > len`. A fast path tests 16
// bytes at once for having no top bit set.
// NOTE(review): the outer loop, the assignments to `next_pos` inside the
// branches, the handling of other leading bytes, and the statements executed
// when a check fails or passes are not part of this view; confirm against the
// complete source.
3478template <
class BytePtr>
3479simdutf_constexpr23 simdutf_warn_unused
bool validate(BytePtr data,
3480 size_t len)
noexcept {
3482 std::is_same<
typename std::decay<
decltype(*data)>::type, uint8_t>::value,
3483 "dereferencing the data pointer must result in a uint8_t");
3485 uint32_t code_point = 0;
3488#if SIMDUTF_CPLUSPLUS23
3492 next_pos = pos + 16;
3493 if (next_pos <= len) {
// Two 8-byte loads, OR-ed together so one mask test covers all 16 bytes.
3496 std::memcpy(&v1, data + pos,
sizeof(uint64_t));
3498 std::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
3499 uint64_t v{v1 | v2};
3500 if ((v & 0x8080808080808080) == 0) {
3507 unsigned char byte = data[pos];
// Bytes below 0x80 are handled in this inner loop.
3509 while (
byte < 0b10000000) {
// Two-byte sequence.
3516 if ((
byte & 0b11100000) == 0b11000000) {
3518 if (next_pos > len) {
3521 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3525 code_point = (
byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
// Range test for the decoded two-byte value.
3526 if ((code_point < 0x80) || (0x7ff < code_point)) {
3529 }
else if ((
byte & 0b11110000) == 0b11100000) {
// Three-byte sequence.
3531 if (next_pos > len) {
3534 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3537 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
3541 code_point = (
byte & 0b00001111) << 12 |
3542 (data[pos + 1] & 0b00111111) << 6 |
3543 (data[pos + 2] & 0b00111111);
// Range test plus exclusion of 0xD800..0xDFFF.
3544 if ((code_point < 0x800) || (0xffff < code_point) ||
3545 (0xd7ff < code_point && code_point < 0xe000)) {
3548 }
else if ((
byte & 0b11111000) == 0b11110000) {
// Four-byte sequence.
3550 if (next_pos > len) {
3553 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3556 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
3559 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
3564 (
byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
3565 (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
// Range test for the decoded four-byte value.
3566 if (code_point <= 0xffff || 0x10ffff < code_point) {
// Overload for `const char *` buffers: reinterprets the pointer as
// `const uint8_t *` and forwards to validate() with the same length.
3578simdutf_really_inline simdutf_warn_unused
bool validate(
const char *buf,
3579 size_t len)
noexcept {
3580 return validate(
reinterpret_cast<const uint8_t *
>(buf), len);
// Scalar UTF-8 validation that reports its outcome as a `result`
// (error_code plus a position). Every visible failure path returns `pos`;
// the success paths return `len`.
// Dereferencing `data` must yield uint8_t (see the assertion message below).
// NOTE(review): some lines of the body (loop header, `pos` updates) are not
// visible in this listing.
3583template <
class BytePtr>
3584simdutf_constexpr23 simdutf_warn_unused result
3585validate_with_errors(BytePtr data,
size_t len)
noexcept {
// Element-type requirement on `data`.
3587 std::is_same<
typename std::decay<
decltype(*data)>::type, uint8_t>::value,
3588 "dereferencing the data pointer must result in a uint8_t");
3590 uint32_t code_point = 0;
// ASCII fast path: examine the next 16 bytes as two 8-byte words.
3593 size_t next_pos = pos + 16;
3597 std::memcpy(&v1, data + pos,
sizeof(uint64_t));
3599 std::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// OR the two words; a set high bit in any byte means a non-ASCII byte.
3600 uint64_t v{v1 | v2};
3601 if ((v & 0x8080808080808080) == 0) {
3606 unsigned char byte = data[pos];
// Loop over single-byte (below 0x80) characters.
3608 while (
byte < 0b10000000) {
3610 return result(error_code::SUCCESS, len);
// 110xxxxx: lead byte of a two-byte sequence.
3615 if ((
byte & 0b11100000) == 0b11000000) {
// TOO_SHORT: the sequence is cut off by the end of the input, or the next
// byte is not a continuation byte.
3617 if (next_pos > len) {
3618 return result(error_code::TOO_SHORT, pos);
3620 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3621 return result(error_code::TOO_SHORT, pos);
3624 code_point = (
byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
// A value outside 0x80..0x7ff is reported as OVERLONG.
3625 if ((code_point < 0x80) || (0x7ff < code_point)) {
3626 return result(error_code::OVERLONG, pos);
3628 }
else if ((
byte & 0b11110000) == 0b11100000) {
// 1110xxxx: lead byte of a three-byte sequence.
3630 if (next_pos > len) {
3631 return result(error_code::TOO_SHORT, pos);
3633 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3634 return result(error_code::TOO_SHORT, pos);
3636 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
3637 return result(error_code::TOO_SHORT, pos);
3640 code_point = (
byte & 0b00001111) << 12 |
3641 (data[pos + 1] & 0b00111111) << 6 |
3642 (data[pos + 2] & 0b00111111);
// A value outside 0x800..0xffff is OVERLONG; 0xd800..0xdfff is SURROGATE.
3643 if ((code_point < 0x800) || (0xffff < code_point)) {
3644 return result(error_code::OVERLONG, pos);
3646 if (0xd7ff < code_point && code_point < 0xe000) {
3647 return result(error_code::SURROGATE, pos);
3649 }
else if ((
byte & 0b11111000) == 0b11110000) {
// 11110xxx: lead byte of a four-byte sequence.
3651 if (next_pos > len) {
3652 return result(error_code::TOO_SHORT, pos);
3654 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3655 return result(error_code::TOO_SHORT, pos);
3657 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
3658 return result(error_code::TOO_SHORT, pos);
3660 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
3661 return result(error_code::TOO_SHORT, pos);
3665 (
byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
3666 (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
// At most 0xffff is OVERLONG; above 0x10ffff is TOO_LARGE.
3667 if (code_point <= 0xffff) {
3668 return result(error_code::OVERLONG, pos);
3670 if (0x10ffff < code_point) {
3671 return result(error_code::TOO_LARGE, pos);
// Not a valid lead byte: a continuation byte (10xxxxxx) here is TOO_LONG,
// anything else is HEADER_BITS.
3675 if ((
byte & 0b11000000) == 0b10000000) {
3676 return result(error_code::TOO_LONG, pos);
3678 return result(error_code::HEADER_BITS, pos);
3683 return result(error_code::SUCCESS, len);
// Overload for `const char *` buffers: reinterprets the pointer as
// `const uint8_t *` and forwards to validate_with_errors() with the same
// length.
3686simdutf_really_inline simdutf_warn_unused result
3687validate_with_errors(
const char *buf,
size_t len)
noexcept {
3688 return validate_with_errors(
reinterpret_cast<const uint8_t *
>(buf), len);
// Validates `len` bytes at `buf` plus `extra_len` additional bytes, then
// subtracts `extra_len` from the reported count.
// NOTE(review): the lines that move `buf` and set `extra_len` are not visible
// in this listing; presumably the loop steps backwards (no further than
// `start`) to the nearest non-continuation byte - confirm.
3696inline simdutf_warn_unused result rewind_and_validate_with_errors(
3697 const char *start,
const char *buf,
size_t len)
noexcept {
// A stream whose very first byte is a continuation byte is rejected at once.
3699 if ((*start & 0b11000000) == 0b10000000) {
3700 return result(error_code::TOO_LONG, 0);
3702 size_t extra_len{0};
// At most five iterations; each one tests whether *buf is a
// non-continuation byte.
3704 for (
int i = 0; i < 5; i++) {
3705 unsigned char byte = *buf;
3706 if ((
byte & 0b11000000) != 0b10000000) {
3714 result res = validate_with_errors(buf, len + extra_len);
3715 res.count -= extra_len;
// Scans the `len` bytes of `data` one at a time.
// NOTE(review): the counter update and the return statement are not visible
// in this listing; presumably one is counted per byte passing the test below.
3719template <
typename InputPtr>
3720#if SIMDUTF_CPLUSPLUS20
3721 requires simdutf::detail::indexes_into_byte_like<InputPtr>
3723simdutf_constexpr23
size_t count_code_points(InputPtr data,
size_t len) {
3725 for (
size_t i = 0; i < len; i++) {
// As int8_t, the bytes 0x80..0xBF (UTF-8 continuation bytes) are -128..-65,
// so this is true for every byte except continuation bytes.
3728 if (int8_t(data[i]) > -65) {
// Scans the `len` bytes of `data` one at a time, applying two tests per byte.
// NOTE(review): the counter updates and the return statement are not visible
// in this listing; presumably each test contributes one UTF-16 unit - confirm.
3735template <
typename InputPtr>
3736#if SIMDUTF_CPLUSPLUS20
3737 requires simdutf::detail::indexes_into_byte_like<InputPtr>
3739simdutf_constexpr23
size_t utf16_length_from_utf8(InputPtr data,
size_t len) {
3741 for (
size_t i = 0; i < len; i++) {
// True for every byte except continuation bytes (0x80..0xBF).
3742 if (int8_t(data[i]) > -65) {
// True for bytes 0xF0 and above (240 == 0xF0).
3745 if (uint8_t(data[i]) >= 240) {
// Inspects up to the last three bytes of `input` (indices length - 1,
// length - 2 and length - 3), comparing them against 0xc0, 0xe0 and 0xf0.
// NOTE(review): the branching on `length` and every return statement are
// missing from this listing; presumably the result is a length shortened so
// that the input does not end inside a multi-byte sequence - confirm.
3752template <
typename InputPtr>
3753#if SIMDUTF_CPLUSPLUS20
3754 requires simdutf::detail::indexes_into_byte_like<InputPtr>
3756simdutf_warn_unused simdutf_constexpr23
size_t
3757trim_partial_utf8(InputPtr input,
size_t length) {
3761 if (uint8_t(input[length - 1]) >= 0xc0) {
3764 if (uint8_t(input[length - 2]) >= 0xe0) {
3769 if (uint8_t(input[length - 1]) >= 0xc0) {
// Same tests again, extended to the third byte from the end.
3777 if (uint8_t(input[length - 1]) >= 0xc0) {
3780 if (uint8_t(input[length - 2]) >= 0xe0) {
3783 if (uint8_t(input[length - 3]) >= 0xf0) {
3797#ifndef SIMDUTF_UTF8_TO_LATIN1_H
3798#define SIMDUTF_UTF8_TO_LATIN1_H
3803namespace utf8_to_latin1 {
// Scalar UTF-8 -> Latin-1 conversion with validation checks.
// Writes through `latin_output` and returns the number of elements written
// (`latin_output - start`).
// NOTE(review): the statements executed when a check fails are not visible in
// this listing (presumably `return 0`), nor are the `pos` updates.
3805template <
typename InputPtr,
typename OutputPtr>
3806#if SIMDUTF_CPLUSPLUS20
3807 requires(simdutf::detail::indexes_into_byte_like<InputPtr> &&
3808 simdutf::detail::indexes_into_byte_like<OutputPtr>)
3810simdutf_constexpr23
size_t convert(InputPtr data,
size_t len,
3811 OutputPtr latin_output) {
3813 auto start = latin_output;
3816#if SIMDUTF_CPLUSPLUS23
// ASCII fast path: examine the next 16 bytes as two 8-byte words.
3821 if (pos + 16 <= len) {
3824 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
3826 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
3827 uint64_t v{v1 | v2};
3829 if ((v & 0x8080808080808080) ==
// Copy the 16 bytes through one by one.
3832 size_t final_pos = pos + 16;
3833 while (pos < final_pos) {
3834 *latin_output++ = char(data[pos]);
3843 uint8_t leading_byte = data[pos];
3844 if (leading_byte < 0b10000000) {
// Single-byte character: copied as is.
3846 *latin_output++ = char(leading_byte);
3848 }
else if ((leading_byte & 0b11100000) ==
// Two-byte sequence: a second byte must exist and must be a continuation
// byte.
3851 if (pos + 1 >= len) {
3854 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3859 uint32_t code_point =
3860 (leading_byte & 0b00011111) << 6 |
// Only 0x80..0xFF passes this check.
3867 if (code_point < 0x80 || 0xFF < code_point) {
3872 *latin_output++ = char(code_point);
3878 return latin_output - start;
// Scalar UTF-8 -> Latin-1 conversion that reports its outcome as a `result`.
// On failure the result carries an error_code and the input position `pos`;
// on success it carries SUCCESS and the number of elements written
// (`latin_output - start`).
// NOTE(review): the loop header and `pos` updates are not visible in this
// listing.
3881template <
typename InputPtr>
3882#if SIMDUTF_CPLUSPLUS20
3883 requires simdutf::detail::indexes_into_byte_like<InputPtr>
3885simdutf_constexpr23 result convert_with_errors(InputPtr data,
size_t len,
3886 char *latin_output) {
3888 char *start{latin_output};
3891#if SIMDUTF_CPLUSPLUS23
// ASCII fast path: examine the next 16 bytes as two 8-byte words.
3896 if (pos + 16 <= len) {
3899 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
3901 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
3902 uint64_t v{v1 | v2};
3904 if ((v & 0x8080808080808080) ==
3907 size_t final_pos = pos + 16;
3908 while (pos < final_pos) {
3909 *latin_output++ = char(data[pos]);
3917 uint8_t leading_byte = data[pos];
3918 if (leading_byte < 0b10000000) {
// Single-byte character: copied as is.
3920 *latin_output++ = char(leading_byte);
3922 }
else if ((leading_byte & 0b11100000) ==
// Two-byte sequence: TOO_SHORT if the second byte is missing or is not a
// continuation byte.
3925 if (pos + 1 >= len) {
3926 return result(error_code::TOO_SHORT, pos);
3928 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3929 return result(error_code::TOO_SHORT, pos);
3933 uint32_t code_point =
3934 (leading_byte & 0b00011111) << 6 |
// Below 0x80 is OVERLONG; above 0xFF is TOO_LARGE.
3941 if (code_point < 0x80) {
3942 return result(error_code::OVERLONG, pos);
3944 if (0xFF < code_point) {
3945 return result(error_code::TOO_LARGE, pos);
3948 *latin_output++ = char(code_point);
3950 }
else if ((leading_byte & 0b11110000) == 0b11100000) {
// Three-byte lead byte: reported as TOO_LARGE without decoding.
3952 return result(error_code::TOO_LARGE, pos);
3953 }
else if ((leading_byte & 0b11111000) == 0b11110000) {
// Four-byte lead byte: reported as TOO_LARGE without decoding.
3955 return result(error_code::TOO_LARGE, pos);
// Not a valid lead byte: a continuation byte (10xxxxxx) here is TOO_LONG,
// anything else is HEADER_BITS.
3958 if ((leading_byte & 0b11000000) == 0b10000000) {
3959 return result(error_code::TOO_LONG, pos);
3962 return result(error_code::HEADER_BITS, pos);
3965 return result(error_code::SUCCESS, latin_output - start);
// Looks backwards from `buf` (up to `prior_bytes` bytes) for a
// non-continuation byte, then runs convert_with_errors over `len + extra_len`
// bytes and subtracts `extra_len` from the reported count.
// NOTE(review): the lines that move `buf` and set `extra_len` are not visible
// in this listing, nor are the condition (if any) guarding the count
// adjustment and the final return.
3968inline result rewind_and_convert_with_errors(
size_t prior_bytes,
3969 const char *buf,
size_t len,
3970 char *latin1_output) {
3971 size_t extra_len{0};
// The backward search may go as far as all of the prior bytes.
3975 size_t how_far_back = prior_bytes;
3978 bool found_leading_bytes{
false};
// i == 0 examines *buf itself; larger i examines bytes before buf.
3980 for (
size_t i = 0; i <= how_far_back; i++) {
3981 unsigned char byte = buf[-
static_cast<std::ptrdiff_t
>(i)];
3982 found_leading_bytes = ((
byte & 0b11000000) != 0b10000000);
3983 if (found_leading_bytes) {
// Stepping back onto a single-byte character (below 128) is reported as
// TOO_LONG. `0 - i + 1` is evaluated in size_t, so it wraps around for i > 1.
3984 if (i > 0 &&
byte < 128) {
3987 return result(error_code::TOO_LONG, 0 - i + 1);
// Only continuation bytes were seen: TOO_LONG, with the position likewise
// computed in size_t (`0 - how_far_back`).
4002 if (!found_leading_bytes) {
4007 return result(error_code::TOO_LONG, 0 - how_far_back);
4009 result res = convert_with_errors(buf, len + extra_len, latin1_output);
4011 res.count -= extra_len;
4024#ifndef SIMDUTF_VALID_UTF8_TO_LATIN1_H
4025#define SIMDUTF_VALID_UTF8_TO_LATIN1_H
4030namespace utf8_to_latin1 {
// Scalar UTF-8 -> Latin-1 conversion.
// Writes through `latin_output` and returns the number of elements written
// (`latin_output - start`). Unlike the checked variants, no code point range
// test is visible before the two-byte result is written.
// NOTE(review): the statements taken when the two visible checks fail, and
// the `pos` updates, are not visible in this listing.
4032template <
typename InputPtr>
4033#if SIMDUTF_CPLUSPLUS20
4034 requires simdutf::detail::indexes_into_byte_like<InputPtr>
4036simdutf_constexpr23
size_t convert_valid(InputPtr data,
size_t len,
4037 char *latin_output) {
4040 char *start{latin_output};
4043#if SIMDUTF_CPLUSPLUS23
// ASCII fast path: examine the next 16 bytes as two 8-byte words.
4048 if (pos + 16 <= len) {
4051 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
4053 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
4057 if ((v & 0x8080808080808080) ==
4060 size_t final_pos = pos + 16;
4061 while (pos < final_pos) {
4062 *latin_output++ = uint8_t(data[pos]);
4071 auto leading_byte = uint8_t(data[pos]);
4072 if (leading_byte < 0b10000000) {
// Single-byte character: copied as is.
4074 *latin_output++ = char(leading_byte);
4076 }
else if ((leading_byte & 0b11100000) ==
// Two-byte sequence: a second byte must exist and must be a continuation
// byte.
4079 if (pos + 1 >= len) {
4082 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4087 uint32_t code_point =
4088 (leading_byte & 0b00011111) << 6 |
4089 (uint8_t(data[pos + 1]) &
4095 *latin_output++ = char(code_point);
4102 return latin_output - start;
4113#ifndef SIMDUTF_UTF8_TO_UTF16_H
4114#define SIMDUTF_UTF8_TO_UTF16_H
4119namespace utf8_to_utf16 {
// Scalar UTF-8 -> UTF-16 conversion with validation checks.
// The `big_endian` template argument selects the output byte order: units
// are byte-swapped whenever match_system(big_endian) is false.
// Returns the number of char16_t units written (`utf16_output - start`).
// NOTE(review): the statements executed when a check fails are not visible in
// this listing (presumably `return 0`), nor are the `pos` updates.
4121template <endianness big_endian,
typename InputPtr>
4122#if SIMDUTF_CPLUSPLUS20
4123 requires simdutf::detail::indexes_into_byte_like<InputPtr>
4125simdutf_constexpr23
size_t convert(InputPtr data,
size_t len,
4126 char16_t *utf16_output) {
4128 char16_t *start{utf16_output};
4130#if SIMDUTF_CPLUSPLUS23
// ASCII fast path: examine the next 16 bytes as two 8-byte words.
4135 if (pos + 16 <= len) {
4138 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
4140 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
4141 uint64_t v{v1 | v2};
4142 if ((v & 0x8080808080808080) == 0) {
// Widen each of the 16 bytes to one UTF-16 unit.
4143 size_t final_pos = pos + 16;
4144 while (pos < final_pos) {
4145 *utf16_output++ = !match_system(big_endian)
4146 ? char16_t(u16_swap_bytes(data[pos]))
4147 : char16_t(data[pos]);
4155 uint8_t leading_byte = data[pos];
4156 if (leading_byte < 0b10000000) {
// Single-byte character: one UTF-16 unit.
4158 *utf16_output++ = !match_system(big_endian)
4159 ? char16_t(u16_swap_bytes(leading_byte))
4160 : char16_t(leading_byte);
4162 }
else if ((leading_byte & 0b11100000) == 0b11000000) {
// Two-byte sequence: needs one continuation byte, and must decode to
// 0x80..0x7ff.
4165 if (pos + 1 >= len) {
4168 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
4172 uint32_t code_point =
4173 (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
4174 if (code_point < 0x80 || 0x7ff < code_point) {
4177 if simdutf_constexpr (!match_system(big_endian)) {
4178 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
4180 *utf16_output++ = char16_t(code_point);
4182 }
else if ((leading_byte & 0b11110000) == 0b11100000) {
// Three-byte sequence: needs two continuation bytes, must decode to
// 0x800..0xffff, and must not fall in 0xd800..0xdfff.
4185 if (pos + 2 >= len) {
4189 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
4192 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
4196 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
4197 (data[pos + 1] & 0b00111111) << 6 |
4198 (data[pos + 2] & 0b00111111);
4199 if (code_point < 0x800 || 0xffff < code_point ||
4200 (0xd7ff < code_point && code_point < 0xe000)) {
4203 if simdutf_constexpr (!match_system(big_endian)) {
4204 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
4206 *utf16_output++ = char16_t(code_point);
4208 }
else if ((leading_byte & 0b11111000) == 0b11110000) {
// Four-byte sequence: needs three continuation bytes and must decode to
// 0x10000..0x10ffff.
4210 if (pos + 3 >= len) {
4213 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
4216 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
4219 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
4224 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
4225 (data[pos + 1] & 0b00111111) << 12 |
4226 (data[pos + 2] & 0b00111111) << 6 |
4227 (data[pos + 3] & 0b00111111);
4228 if (code_point <= 0xffff || 0x10ffff < code_point) {
// Split into a surrogate pair: the high unit takes the upper bits, the low
// unit takes the lower 10 bits.
4231 code_point -= 0x10000;
4232 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
4233 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
4234 if simdutf_constexpr (!match_system(big_endian)) {
4235 high_surrogate = u16_swap_bytes(high_surrogate);
4236 low_surrogate = u16_swap_bytes(low_surrogate);
4238 *utf16_output++ = char16_t(high_surrogate);
4239 *utf16_output++ = char16_t(low_surrogate);
4245 return utf16_output - start;
// Scalar UTF-8 -> UTF-16 conversion that reports its outcome as a `result`.
// On failure the result carries an error_code and the input position `pos`;
// on success it carries SUCCESS and the number of char16_t units written.
// Units are byte-swapped whenever match_system(big_endian) is false.
// NOTE(review): the loop header and `pos` updates are not visible in this
// listing.
4248template <endianness big_endian,
typename InputPtr>
4249#if SIMDUTF_CPLUSPLUS20
4250 requires simdutf::detail::indexes_into_byte_like<InputPtr>
4252simdutf_constexpr23 result convert_with_errors(InputPtr data,
size_t len,
4253 char16_t *utf16_output) {
4255 char16_t *start{utf16_output};
4257#if SIMDUTF_CPLUSPLUS23
// ASCII fast path: examine the next 16 bytes as two 8-byte words.
4262 if (pos + 16 <= len) {
4265 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
4267 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
4268 uint64_t v{v1 | v2};
4269 if ((v & 0x8080808080808080) == 0) {
4270 size_t final_pos = pos + 16;
4271 while (pos < final_pos) {
4272 const char16_t byte = uint8_t(data[pos]);
4274 !match_system(big_endian) ? u16_swap_bytes(
byte) : byte;
4282 auto leading_byte = uint8_t(data[pos]);
4283 if (leading_byte < 0b10000000) {
// Single-byte character: one UTF-16 unit.
4285 *utf16_output++ = !match_system(big_endian)
4286 ? char16_t(u16_swap_bytes(leading_byte))
4287 : char16_t(leading_byte);
4289 }
else if ((leading_byte & 0b11100000) == 0b11000000) {
// Two-byte sequence: TOO_SHORT if the continuation byte is missing or
// malformed; OVERLONG if the value is outside 0x80..0x7ff.
4292 if (pos + 1 >= len) {
4293 return result(error_code::TOO_SHORT, pos);
4295 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4296 return result(error_code::TOO_SHORT, pos);
4299 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
4300 (uint8_t(data[pos + 1]) & 0b00111111);
4301 if (code_point < 0x80 || 0x7ff < code_point) {
4302 return result(error_code::OVERLONG, pos);
4304 if simdutf_constexpr (!match_system(big_endian)) {
4305 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
4307 *utf16_output++ = char16_t(code_point);
4309 }
else if ((leading_byte & 0b11110000) == 0b11100000) {
// Three-byte sequence: TOO_SHORT, OVERLONG (outside 0x800..0xffff) or
// SURROGATE (0xd800..0xdfff).
4312 if (pos + 2 >= len) {
4313 return result(error_code::TOO_SHORT, pos);
4316 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4317 return result(error_code::TOO_SHORT, pos);
4319 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
4320 return result(error_code::TOO_SHORT, pos);
4323 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
4324 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
4325 (uint8_t(data[pos + 2]) & 0b00111111);
4326 if ((code_point < 0x800) || (0xffff < code_point)) {
4327 return result(error_code::OVERLONG, pos);
4329 if (0xd7ff < code_point && code_point < 0xe000) {
4330 return result(error_code::SURROGATE, pos);
4332 if simdutf_constexpr (!match_system(big_endian)) {
4333 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
4335 *utf16_output++ = char16_t(code_point);
4337 }
else if ((leading_byte & 0b11111000) == 0b11110000) {
// Four-byte sequence: TOO_SHORT, OVERLONG (at most 0xffff) or TOO_LARGE
// (above 0x10ffff).
4339 if (pos + 3 >= len) {
4340 return result(error_code::TOO_SHORT, pos);
4342 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4343 return result(error_code::TOO_SHORT, pos);
4345 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
4346 return result(error_code::TOO_SHORT, pos);
4348 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
4349 return result(error_code::TOO_SHORT, pos);
4353 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
4354 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
4355 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
4356 (uint8_t(data[pos + 3]) & 0b00111111);
4357 if (code_point <= 0xffff) {
4358 return result(error_code::OVERLONG, pos);
4360 if (0x10ffff < code_point) {
4361 return result(error_code::TOO_LARGE, pos);
// Split into a surrogate pair: the high unit takes the upper bits, the low
// unit takes the lower 10 bits.
4363 code_point -= 0x10000;
4364 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
4365 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
4366 if simdutf_constexpr (!match_system(big_endian)) {
4367 high_surrogate = u16_swap_bytes(high_surrogate);
4368 low_surrogate = u16_swap_bytes(low_surrogate);
4370 *utf16_output++ = char16_t(high_surrogate);
4371 *utf16_output++ = char16_t(low_surrogate);
// Not a valid lead byte: a continuation byte (10xxxxxx) here is TOO_LONG,
// anything else is HEADER_BITS.
4375 if ((leading_byte & 0b11000000) == 0b10000000) {
4376 return result(error_code::TOO_LONG, pos);
4378 return result(error_code::HEADER_BITS, pos);
4382 return result(error_code::SUCCESS, utf16_output - start);
// Looks backwards from `buf` (up to `prior_bytes` bytes) for a
// non-continuation byte, then runs convert_with_errors<endian> over
// `len + extra_len` bytes and subtracts `extra_len` from the reported count.
// NOTE(review): the lines that move `buf` and set `extra_len` are not visible
// in this listing, nor are the condition (if any) guarding the count
// adjustment and the final return.
4400template <endianness endian>
4401inline result rewind_and_convert_with_errors(
size_t prior_bytes,
4402 const char *buf,
size_t len,
4403 char16_t *utf16_output) {
4404 size_t extra_len{0};
// The backward search may go as far as all of the prior bytes.
4408 size_t how_far_back = prior_bytes;
4411 bool found_leading_bytes{
false};
// i == 0 examines *buf itself; larger i examines bytes before buf.
4413 for (
size_t i = 0; i <= how_far_back; i++) {
4414 unsigned char byte = buf[-
static_cast<std::ptrdiff_t
>(i)];
4415 found_leading_bytes = ((
byte & 0b11000000) != 0b10000000);
4416 if (found_leading_bytes) {
// Stepping back onto a single-byte character (below 128) is reported as
// TOO_LONG. `0 - i + 1` is evaluated in size_t, so it wraps around for i > 1.
4417 if (i > 0 &&
byte < 128) {
4420 return result(error_code::TOO_LONG, 0 - i + 1);
// Only continuation bytes were seen: TOO_LONG, with the position likewise
// computed in size_t (`0 - how_far_back`).
4435 if (!found_leading_bytes) {
4440 return result(error_code::TOO_LONG, 0 - how_far_back);
4442 result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
4444 res.count -= extra_len;
4457#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H
4458#define SIMDUTF_VALID_UTF8_TO_UTF16_H
4463namespace utf8_to_utf16 {
// Scalar UTF-8 -> UTF-16 conversion. No continuation-byte or range check is
// visible in this body; only the remaining input length is tested.
// Units are byte-swapped whenever match_system(big_endian) is false.
// Returns the number of char16_t units written (`utf16_output - start`).
// NOTE(review): the statements taken when a length test fails, and the `pos`
// updates, are not visible in this listing.
4465template <endianness big_endian,
typename InputPtr>
4466#if SIMDUTF_CPLUSPLUS20
4467 requires simdutf::detail::indexes_into_byte_like<InputPtr>
4469simdutf_constexpr23
size_t convert_valid(InputPtr data,
size_t len,
4470 char16_t *utf16_output) {
4472 char16_t *start{utf16_output};
4474#if SIMDUTF_CPLUSPLUS23
// ASCII fast path: examine the next 8 bytes as one 8-byte word.
4478 if (pos + 8 <= len) {
4481 ::memcpy(&v, data + pos,
sizeof(uint64_t));
4482 if ((v & 0x8080808080808080) == 0) {
4483 size_t final_pos = pos + 8;
4484 while (pos < final_pos) {
4485 const char16_t byte = uint8_t(data[pos]);
4487 !match_system(big_endian) ? u16_swap_bytes(
byte) : byte;
4495 auto leading_byte = uint8_t(data[pos]);
4496 if (leading_byte < 0b10000000) {
// Single-byte character: one UTF-16 unit.
4498 *utf16_output++ = !match_system(big_endian)
4499 ? char16_t(u16_swap_bytes(leading_byte))
4500 : char16_t(leading_byte);
4502 }
else if ((leading_byte & 0b11100000) == 0b11000000) {
// Two-byte sequence: 5 bits from the lead byte, 6 from the next.
4505 if (pos + 1 >= len) {
4508 uint16_t code_point = uint16_t(((leading_byte & 0b00011111) << 6) |
4509 (uint8_t(data[pos + 1]) & 0b00111111));
4510 if simdutf_constexpr (!match_system(big_endian)) {
4511 code_point = u16_swap_bytes(uint16_t(code_point));
4513 *utf16_output++ = char16_t(code_point);
4515 }
else if ((leading_byte & 0b11110000) == 0b11100000) {
// Three-byte sequence: 4 + 6 + 6 bits.
4518 if (pos + 2 >= len) {
4521 uint16_t code_point =
4522 uint16_t(((leading_byte & 0b00001111) << 12) |
4523 ((uint8_t(data[pos + 1]) & 0b00111111) << 6) |
4524 (uint8_t(data[pos + 2]) & 0b00111111));
4525 if simdutf_constexpr (!match_system(big_endian)) {
4526 code_point = u16_swap_bytes(uint16_t(code_point));
4528 *utf16_output++ = char16_t(code_point);
4530 }
else if ((leading_byte & 0b11111000) == 0b11110000) {
// Four-byte sequence: 3 + 6 + 6 + 6 bits, emitted as a surrogate pair.
4532 if (pos + 3 >= len) {
4535 uint32_t code_point = ((leading_byte & 0b00000111) << 18) |
4536 ((uint8_t(data[pos + 1]) & 0b00111111) << 12) |
4537 ((uint8_t(data[pos + 2]) & 0b00111111) << 6) |
4538 (uint8_t(data[pos + 3]) & 0b00111111);
4539 code_point -= 0x10000;
4540 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
4541 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
4542 if simdutf_constexpr (!match_system(big_endian)) {
4543 high_surrogate = u16_swap_bytes(high_surrogate);
4544 low_surrogate = u16_swap_bytes(low_surrogate);
4546 *utf16_output++ = char16_t(high_surrogate);
4547 *utf16_output++ = char16_t(low_surrogate);
4554 return utf16_output - start;
4565#ifndef SIMDUTF_UTF8_TO_UTF32_H
4566#define SIMDUTF_UTF8_TO_UTF32_H
4571namespace utf8_to_utf32 {
// Scalar UTF-8 -> UTF-32 conversion with validation checks.
// Returns the number of char32_t values written (`utf32_output - start`).
// NOTE(review): the statements executed when a check fails are not visible in
// this listing (presumably `return 0`), nor are the `pos` updates.
4573template <
typename InputPtr>
4574#if SIMDUTF_CPLUSPLUS20
4575 requires simdutf::detail::indexes_into_byte_like<InputPtr>
4577simdutf_constexpr23
size_t convert(InputPtr data,
size_t len,
4578 char32_t *utf32_output) {
4580 char32_t *start{utf32_output};
4582#if SIMDUTF_CPLUSPLUS23
// ASCII fast path: examine the next 16 bytes as two 8-byte words.
4587 if (pos + 16 <= len) {
4590 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
4592 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
4593 uint64_t v{v1 | v2};
4594 if ((v & 0x8080808080808080) == 0) {
// Widen each of the 16 bytes to one output value.
4595 size_t final_pos = pos + 16;
4596 while (pos < final_pos) {
4597 *utf32_output++ = uint8_t(data[pos]);
4604 auto leading_byte = uint8_t(data[pos]);
4605 if (leading_byte < 0b10000000) {
// Single-byte character.
4607 *utf32_output++ = char32_t(leading_byte);
4609 }
else if ((leading_byte & 0b11100000) == 0b11000000) {
// Two-byte sequence: needs one continuation byte, and must decode to
// 0x80..0x7ff.
4611 if (pos + 1 >= len) {
4614 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
4618 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
4619 (uint8_t(data[pos + 1]) & 0b00111111);
4620 if (code_point < 0x80 || 0x7ff < code_point) {
4623 *utf32_output++ = char32_t(code_point);
4625 }
else if ((leading_byte & 0b11110000) == 0b11100000) {
// Three-byte sequence: needs two continuation bytes, must decode to
// 0x800..0xffff, and must not fall in 0xd800..0xdfff.
4627 if (pos + 2 >= len) {
4631 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4634 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
4638 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
4639 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
4640 (uint8_t(data[pos + 2]) & 0b00111111);
4641 if (code_point < 0x800 || 0xffff < code_point ||
4642 (0xd7ff < code_point && code_point < 0xe000)) {
4645 *utf32_output++ = char32_t(code_point);
4647 }
else if ((leading_byte & 0b11111000) == 0b11110000) {
// Four-byte sequence: needs three continuation bytes and must decode to
// 0x10000..0x10ffff.
4649 if (pos + 3 >= len) {
4652 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4655 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
4658 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
4663 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
4664 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
4665 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
4666 (uint8_t(data[pos + 3]) & 0b00111111);
4667 if (code_point <= 0xffff || 0x10ffff < code_point) {
4670 *utf32_output++ = char32_t(code_point);
4676 return utf32_output - start;
// Scalar UTF-8 -> UTF-32 conversion that reports its outcome as a `result`.
// On failure the result carries an error_code and the input position `pos`;
// on success it carries SUCCESS and the number of char32_t values written.
// NOTE(review): the loop header and `pos` updates are not visible in this
// listing.
4679template <
typename InputPtr>
4680#if SIMDUTF_CPLUSPLUS20
4681 requires simdutf::detail::indexes_into_byte_like<InputPtr>
4683simdutf_constexpr23 result convert_with_errors(InputPtr data,
size_t len,
4684 char32_t *utf32_output) {
4686 char32_t *start{utf32_output};
4688#if SIMDUTF_CPLUSPLUS23
// ASCII fast path: examine the next 16 bytes as two 8-byte words.
4693 if (pos + 16 <= len) {
4696 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
4698 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
4699 uint64_t v{v1 | v2};
4700 if ((v & 0x8080808080808080) == 0) {
4701 size_t final_pos = pos + 16;
4702 while (pos < final_pos) {
4703 *utf32_output++ = uint8_t(data[pos]);
4710 auto leading_byte = uint8_t(data[pos]);
4711 if (leading_byte < 0b10000000) {
// Single-byte character.
4713 *utf32_output++ = char32_t(leading_byte);
4715 }
else if ((leading_byte & 0b11100000) == 0b11000000) {
// Two-byte sequence: TOO_SHORT if the continuation byte is missing or
// malformed; OVERLONG if the value is outside 0x80..0x7ff.
4717 if (pos + 1 >= len) {
4718 return result(error_code::TOO_SHORT, pos);
4720 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4721 return result(error_code::TOO_SHORT, pos);
4724 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
4725 (uint8_t(data[pos + 1]) & 0b00111111);
4726 if (code_point < 0x80 || 0x7ff < code_point) {
4727 return result(error_code::OVERLONG, pos);
4729 *utf32_output++ = char32_t(code_point);
4731 }
else if ((leading_byte & 0b11110000) == 0b11100000) {
// Three-byte sequence: TOO_SHORT, OVERLONG (outside 0x800..0xffff) or
// SURROGATE (0xd800..0xdfff).
4733 if (pos + 2 >= len) {
4734 return result(error_code::TOO_SHORT, pos);
4737 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4738 return result(error_code::TOO_SHORT, pos);
4740 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
4741 return result(error_code::TOO_SHORT, pos);
4744 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
4745 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
4746 (uint8_t(data[pos + 2]) & 0b00111111);
4747 if (code_point < 0x800 || 0xffff < code_point) {
4748 return result(error_code::OVERLONG, pos);
4750 if (0xd7ff < code_point && code_point < 0xe000) {
4751 return result(error_code::SURROGATE, pos);
4753 *utf32_output++ = char32_t(code_point);
4755 }
else if ((leading_byte & 0b11111000) == 0b11110000) {
// Four-byte sequence: TOO_SHORT, OVERLONG (at most 0xffff) or TOO_LARGE
// (above 0x10ffff).
4757 if (pos + 3 >= len) {
4758 return result(error_code::TOO_SHORT, pos);
4760 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4761 return result(error_code::TOO_SHORT, pos);
4763 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
4764 return result(error_code::TOO_SHORT, pos);
4766 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
4767 return result(error_code::TOO_SHORT, pos);
4771 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
4772 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
4773 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
4774 (uint8_t(data[pos + 3]) & 0b00111111);
4775 if (code_point <= 0xffff) {
4776 return result(error_code::OVERLONG, pos);
4778 if (0x10ffff < code_point) {
4779 return result(error_code::TOO_LARGE, pos);
4781 *utf32_output++ = char32_t(code_point);
// Not a valid lead byte: a continuation byte (10xxxxxx) here is TOO_LONG,
// anything else is HEADER_BITS.
4785 if ((leading_byte & 0b11000000) == 0b10000000) {
4786 return result(error_code::TOO_LONG, pos);
4788 return result(error_code::HEADER_BITS, pos);
4792 return result(error_code::SUCCESS, utf32_output - start);
// Looks backwards from `buf` for a non-continuation byte, then runs
// convert_with_errors over `len + extra_len` bytes and subtracts `extra_len`
// from the reported count. Here the backward search is limited to at most 3
// bytes (or `prior_bytes`, if smaller).
// NOTE(review): the lines that move `buf` and set `extra_len` are not visible
// in this listing, nor are the condition (if any) guarding the count
// adjustment and the final return.
4810inline result rewind_and_convert_with_errors(
size_t prior_bytes,
4811 const char *buf,
size_t len,
4812 char32_t *utf32_output) {
4813 size_t extra_len{0};
// Look back at most 3 bytes, and never further than the bytes available.
4815 size_t how_far_back = 3;
4816 if (how_far_back > prior_bytes) {
4817 how_far_back = prior_bytes;
4819 bool found_leading_bytes{
false};
// i == 0 examines *buf itself; larger i examines bytes before buf.
4821 for (
size_t i = 0; i <= how_far_back; i++) {
4822 unsigned char byte = buf[-
static_cast<std::ptrdiff_t
>(i)];
4823 found_leading_bytes = ((
byte & 0b11000000) != 0b10000000);
4824 if (found_leading_bytes) {
// Stepping back onto a single-byte character (below 128) is reported as
// TOO_LONG. `0 - i + 1` is evaluated in size_t, so it wraps around for i > 1.
4825 if (i > 0 &&
byte < 128) {
4828 return result(error_code::TOO_LONG, 0 - i + 1);
// Only continuation bytes were seen: TOO_LONG, with the position likewise
// computed in size_t (`0 - how_far_back`).
4843 if (!found_leading_bytes) {
4848 return result(error_code::TOO_LONG, 0 - how_far_back);
4851 result res = convert_with_errors(buf, len + extra_len, utf32_output);
4853 res.count -= extra_len;
4866#ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H
4867#define SIMDUTF_VALID_UTF8_TO_UTF32_H
4872namespace utf8_to_utf32 {
// Scalar UTF-8 -> UTF-32 conversion. No continuation-byte or range check is
// visible in this body; only the remaining input length is tested.
// Returns the number of char32_t values written (`utf32_output - start`).
// NOTE(review): the statements taken when a length test fails, and the `pos`
// updates, are not visible in this listing.
4874template <
typename InputPtr>
4875#if SIMDUTF_CPLUSPLUS20
4876 requires simdutf::detail::indexes_into_byte_like<InputPtr>
4878simdutf_constexpr23
size_t convert_valid(InputPtr data,
size_t len,
4879 char32_t *utf32_output) {
4881 char32_t *start{utf32_output};
4883#if SIMDUTF_CPLUSPLUS23
// ASCII fast path: examine the next 8 bytes as one 8-byte word.
4888 if (pos + 8 <= len) {
4891 ::memcpy(&v, data + pos,
sizeof(uint64_t));
4892 if ((v & 0x8080808080808080) == 0) {
4893 size_t final_pos = pos + 8;
4894 while (pos < final_pos) {
4895 *utf32_output++ = uint8_t(data[pos]);
4902 auto leading_byte = uint8_t(data[pos]);
4903 if (leading_byte < 0b10000000) {
// Single-byte character.
4905 *utf32_output++ = char32_t(leading_byte);
4907 }
else if ((leading_byte & 0b11100000) == 0b11000000) {
// Two-byte sequence: 5 bits from the lead byte, 6 from the next.
4909 if (pos + 1 >= len) {
4912 *utf32_output++ = char32_t(((leading_byte & 0b00011111) << 6) |
4913 (uint8_t(data[pos + 1]) & 0b00111111));
4915 }
else if ((leading_byte & 0b11110000) == 0b11100000) {
// Three-byte sequence: 4 + 6 + 6 bits.
4917 if (pos + 2 >= len) {
4920 *utf32_output++ = char32_t(((leading_byte & 0b00001111) << 12) |
4921 ((uint8_t(data[pos + 1]) & 0b00111111) << 6) |
4922 (uint8_t(data[pos + 2]) & 0b00111111));
4924 }
else if ((leading_byte & 0b11111000) == 0b11110000) {
// Four-byte sequence: 3 + 6 + 6 + 6 bits.
4926 if (pos + 3 >= len) {
4929 uint32_t code_word = ((leading_byte & 0b00000111) << 18) |
4930 ((uint8_t(data[pos + 1]) & 0b00111111) << 12) |
4931 ((uint8_t(data[pos + 2]) & 0b00111111) << 6) |
4932 (uint8_t(data[pos + 3]) & 0b00111111);
4933 *utf32_output++ = char32_t(code_word);
4940 return utf32_output - start;
4953constexpr size_t default_line_length =
// Pointer/length entry point; only the declaration appears here.
// NOTE(review): presumably returns true when the buffer is valid UTF-8
// (inferred from the name) - confirm against the definition.
4967simdutf_warn_unused
bool validate_utf8(
const char *buf,
size_t len)
noexcept;
// Overload taking a span-like range of byte-like elements.
// Two return paths are visible: one calls scalar::utf8::validate on the data
// viewed as uint8_t, the other forwards to the pointer/length overload.
// NOTE(review): the condition choosing between them follows the
// `#if SIMDUTF_CPLUSPLUS23` line and is not visible in this listing;
// presumably a compile-time-evaluation check - confirm.
4969simdutf_constexpr23 simdutf_really_inline simdutf_warn_unused
bool
4970validate_utf8(
const detail::input_span_of_byte_like
auto &input)
noexcept {
4971 #if SIMDUTF_CPLUSPLUS23
4973 return scalar::utf8::validate(
4974 detail::constexpr_cast_ptr<uint8_t>(input.data()), input.size());
4978 return validate_utf8(
reinterpret_cast<const char *
>(input.data()),
// Pointer/length entry point returning a `result`; only the declaration
// appears here.
4996simdutf_warn_unused result validate_utf8_with_errors(
const char *buf,
4997 size_t len)
noexcept;
// Overload taking a span-like range of byte-like elements.
// Two return paths are visible: one calls scalar::utf8::validate_with_errors
// on the data viewed as uint8_t, the other forwards to the pointer/length
// overload.
// NOTE(review): the condition choosing between them follows the
// `#if SIMDUTF_CPLUSPLUS23` line and is not visible in this listing.
4999simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused result
5000validate_utf8_with_errors(
5001 const detail::input_span_of_byte_like
auto &input)
noexcept {
5002 #if SIMDUTF_CPLUSPLUS23
5004 return scalar::utf8::validate_with_errors(
5005 detail::constexpr_cast_ptr<uint8_t>(input.data()), input.size());
5009 return validate_utf8_with_errors(
5010 reinterpret_cast<const char *
>(input.data()), input.size());
// Pointer/length entry point; only the declaration appears here.
5029simdutf_warn_unused
bool validate_utf16(
const char16_t *buf,
5030 size_t len)
noexcept;
// std::span overload. Two return paths are visible: one calls
// scalar::utf16::validate<endianness::NATIVE>, the other forwards to the
// pointer/length overload.
// NOTE(review): the condition choosing between them follows the
// `#if SIMDUTF_CPLUSPLUS23` line and is not visible in this listing.
5032simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
bool
5033validate_utf16(std::span<const char16_t> input)
noexcept {
5034 #if SIMDUTF_CPLUSPLUS23
5036 return scalar::utf16::validate<endianness::NATIVE>(input.data(),
5041 return validate_utf16(input.data(), input.size());
// Pointer/length entry point; only the declaration appears here.
5060simdutf_warn_unused
bool validate_utf16le(
const char16_t *buf,
5061 size_t len)
noexcept;
// std::span overload. Two return paths are visible: one calls
// scalar::utf16::validate<endianness::LITTLE>, the other forwards to the
// pointer/length overload.
// NOTE(review): the condition choosing between them follows the
// `#if SIMDUTF_CPLUSPLUS23` line and is not visible in this listing.
5063simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused
bool
5064validate_utf16le(std::span<const char16_t> input)
noexcept {
5065 #if SIMDUTF_CPLUSPLUS23
5067 return scalar::utf16::validate<endianness::LITTLE>(input.data(),
5072 return validate_utf16le(input.data(), input.size());
// Pointer/length entry point; only the declaration appears here.
5091simdutf_warn_unused
bool validate_utf16be(
const char16_t *buf,
5092 size_t len)
noexcept;
// std::span overload. Two return paths are visible: one calls
// scalar::utf16::validate<endianness::BIG>, the other forwards to the
// pointer/length overload.
// NOTE(review): the condition choosing between them follows the
// `#if SIMDUTF_CPLUSPLUS23` line and is not visible in this listing.
5094simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
bool
5095validate_utf16be(std::span<const char16_t> input)
noexcept {
5096 #if SIMDUTF_CPLUSPLUS23
5098 return scalar::utf16::validate<endianness::BIG>(input.data(), input.size());
5102 return validate_utf16be(input.data(), input.size());
// Pointer/length entry point returning a `result`; only the declaration
// appears here.
5124simdutf_warn_unused result validate_utf16_with_errors(
const char16_t *buf,
5125 size_t len)
noexcept;
// std::span overload. Two return paths are visible: one calls
// scalar::utf16::validate_with_errors<endianness::NATIVE>, the other forwards
// to the pointer/length overload.
// NOTE(review): the condition choosing between them follows the
// `#if SIMDUTF_CPLUSPLUS23` line and is not visible in this listing.
5127simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5128validate_utf16_with_errors(std::span<const char16_t> input)
noexcept {
5129 #if SIMDUTF_CPLUSPLUS23
5131 return scalar::utf16::validate_with_errors<endianness::NATIVE>(
5132 input.data(), input.size());
5136 return validate_utf16_with_errors(input.data(), input.size());
// Pointer/length entry point returning a `result`; only the declaration
// appears here.
5157simdutf_warn_unused result validate_utf16le_with_errors(
const char16_t *buf,
5158 size_t len)
noexcept;
// std::span overload. Two return paths are visible: one calls
// scalar::utf16::validate_with_errors<endianness::LITTLE>, the other forwards
// to the pointer/length overload.
// NOTE(review): the condition choosing between them follows the
// `#if SIMDUTF_CPLUSPLUS23` line and is not visible in this listing.
5160simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5161validate_utf16le_with_errors(std::span<const char16_t> input)
noexcept {
5162 #if SIMDUTF_CPLUSPLUS23
5164 return scalar::utf16::validate_with_errors<endianness::LITTLE>(
5165 input.data(), input.size());
5169 return validate_utf16le_with_errors(input.data(), input.size());
// validate_utf16be_with_errors: pointer/length overload (declaration only;
// returns a `result`).
5190simdutf_warn_unused result validate_utf16be_with_errors(
const char16_t *buf,
5191 size_t len)
noexcept;
// std::span overload: forwards input.data()/input.size() to the overload above.
// Under SIMDUTF_CPLUSPLUS23 it can instead return
// scalar::utf16::validate_with_errors<endianness::BIG>; the lines selecting
// between the two returns are not visible here (NOTE(review): presumably a
// compile-time-evaluation branch - confirm).
5193simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5194validate_utf16be_with_errors(std::span<const char16_t> input)
noexcept {
5195 #if SIMDUTF_CPLUSPLUS23
5197 return scalar::utf16::validate_with_errors<endianness::BIG>(input.data(),
5202 return validate_utf16be_with_errors(input.data(), input.size());
// to_well_formed_utf16le: pointer/length overload writing to `output`
// (declaration only; returns void).
5219void to_well_formed_utf16le(
const char16_t *input,
size_t len,
5220 char16_t *output)
noexcept;
// std::span overload: passes input.data(), input.size() and output.data() on.
// Only output.data() is used; output.size() is not consulted in the visible
// lines. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16::to_well_formed_utf16<endianness::LITTLE> call is also present;
// the lines selecting between the two calls are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5222simdutf_really_inline simdutf_constexpr23
void
5223to_well_formed_utf16le(std::span<const char16_t> input,
5224 std::span<char16_t> output)
noexcept {
5225 #if SIMDUTF_CPLUSPLUS23
5227 scalar::utf16::to_well_formed_utf16<endianness::LITTLE>(
5228 input.data(), input.size(), output.data());
5232 to_well_formed_utf16le(input.data(), input.size(), output.data());
// to_well_formed_utf16be: pointer/length overload writing to `output`
// (declaration only; returns void).
5249void to_well_formed_utf16be(
const char16_t *input,
size_t len,
5250 char16_t *output)
noexcept;
// std::span overload: passes input.data(), input.size() and output.data() on.
// Only output.data() is used; output.size() is not consulted in the visible
// lines. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16::to_well_formed_utf16<endianness::BIG> call is also present;
// the lines selecting between the two calls are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5252simdutf_really_inline simdutf_constexpr23
void
5253to_well_formed_utf16be(std::span<const char16_t> input,
5254 std::span<char16_t> output)
noexcept {
5255 #if SIMDUTF_CPLUSPLUS23
5257 scalar::utf16::to_well_formed_utf16<endianness::BIG>(
5258 input.data(), input.size(), output.data());
5262 to_well_formed_utf16be(input.data(), input.size(), output.data());
// to_well_formed_utf16: pointer/length overload writing to `output`
// (declaration only; returns void).
5279void to_well_formed_utf16(
const char16_t *input,
size_t len,
5280 char16_t *output)
noexcept;
// std::span overload: passes input.data(), input.size() and output.data() on.
// Only output.data() is used; output.size() is not consulted in the visible
// lines. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16::to_well_formed_utf16<endianness::NATIVE> call is also present;
// the lines selecting between the two calls are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5282simdutf_really_inline simdutf_constexpr23
void
5283to_well_formed_utf16(std::span<const char16_t> input,
5284 std::span<char16_t> output)
noexcept {
5285 #if SIMDUTF_CPLUSPLUS23
5287 scalar::utf16::to_well_formed_utf16<endianness::NATIVE>(
5288 input.data(), input.size(), output.data());
5292 to_well_formed_utf16(input.data(), input.size(), output.data());
// convert_utf8_to_utf16: pointer/length overload (declaration only; returns
// size_t).
5310simdutf_warn_unused
size_t convert_utf8_to_utf16(
5311 const char *input,
size_t length,
char16_t *utf16_output)
noexcept;
// Span overload for any detail::input_span_of_byte_like input: casts
// input.data() to `const char *` and forwards it with input.size() and
// output.data() to the overload above (output.size() is not consulted).
// Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf8_to_utf16::convert<endianness::NATIVE> call on the uncast data is
// also present; the lines selecting between the two returns are not visible
// here (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5313simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
5314convert_utf8_to_utf16(
const detail::input_span_of_byte_like
auto &input,
5315 std::span<char16_t> output)
noexcept {
5316 #if SIMDUTF_CPLUSPLUS23
5318 return scalar::utf8_to_utf16::convert<endianness::NATIVE>(
5319 input.data(), input.size(), output.data());
5323 return convert_utf8_to_utf16(
reinterpret_cast<const char *
>(input.data()),
5324 input.size(), output.data());
// utf8_length_from_utf16le_with_replacement: pointer/length overload
// (declaration only; returns a `result`).
5346simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
5347 const char16_t *input,
size_t length)
noexcept;
// std::span overload: forwards valid_utf16_input.data()/size() to the overload
// above. Under SIMDUTF_CPLUSPLUS23 it can instead return
// scalar::utf16::utf8_length_from_utf16_with_replacement<endianness::LITTLE>;
// the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5349simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused result
5350utf8_length_from_utf16le_with_replacement(
5351 std::span<const char16_t> valid_utf16_input)
noexcept {
5352 #if SIMDUTF_CPLUSPLUS23
5354 return scalar::utf16::utf8_length_from_utf16_with_replacement<
5355 endianness::LITTLE>(valid_utf16_input.data(), valid_utf16_input.size());
5359 return utf8_length_from_utf16le_with_replacement(valid_utf16_input.data(),
5360 valid_utf16_input.size());
// utf8_length_from_utf16be_with_replacement: pointer/length overload
// (declaration only; returns a `result`).
5382simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
5383 const char16_t *input,
size_t length)
noexcept;
// std::span overload: forwards valid_utf16_input.data()/size() to the overload
// above. Under SIMDUTF_CPLUSPLUS23 it can instead return
// scalar::utf16::utf8_length_from_utf16_with_replacement<endianness::BIG>;
// the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5385simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5386utf8_length_from_utf16be_with_replacement(
5387 std::span<const char16_t> valid_utf16_input)
noexcept {
5388 #if SIMDUTF_CPLUSPLUS23
5390 return scalar::utf16::utf8_length_from_utf16_with_replacement<
5391 endianness::BIG>(valid_utf16_input.data(), valid_utf16_input.size());
5395 return utf8_length_from_utf16be_with_replacement(valid_utf16_input.data(),
5396 valid_utf16_input.size());
// convert_utf8_to_utf16le: pointer/length overload (declaration only; returns
// size_t).
5413simdutf_warn_unused
size_t convert_utf8_to_utf16le(
5414 const char *input,
size_t length,
char16_t *utf16_output)
noexcept;
// Span overload for any detail::input_span_of_byte_like input: casts
// utf8_input.data() to `const char *` and forwards it with utf8_input.size()
// and utf16_output.data() to the overload above (utf16_output.size() is not
// consulted). Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf8_to_utf16::convert<endianness::LITTLE> call is also present; the
// lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5416simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
5417convert_utf8_to_utf16le(
const detail::input_span_of_byte_like
auto &utf8_input,
5418 std::span<char16_t> utf16_output)
noexcept {
5419 #if SIMDUTF_CPLUSPLUS23
5421 return scalar::utf8_to_utf16::convert<endianness::LITTLE>(
5422 utf8_input.data(), utf8_input.size(), utf16_output.data());
5426 return convert_utf8_to_utf16le(
5427 reinterpret_cast<const char *
>(utf8_input.data()), utf8_input.size(),
5428 utf16_output.data());
// convert_utf8_to_utf16be: pointer/length overload (declaration only; returns
// size_t).
5445simdutf_warn_unused
size_t convert_utf8_to_utf16be(
5446 const char *input,
size_t length,
char16_t *utf16_output)
noexcept;
// Span overload for any detail::input_span_of_byte_like input: casts
// utf8_input.data() to `const char *` and forwards it with utf8_input.size()
// and utf16_output.data() to the overload above (utf16_output.size() is not
// consulted). Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf8_to_utf16::convert<endianness::BIG> call is also present; the
// lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5448simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
5449convert_utf8_to_utf16be(
const detail::input_span_of_byte_like
auto &utf8_input,
5450 std::span<char16_t> utf16_output)
noexcept {
5452 #if SIMDUTF_CPLUSPLUS23
5454 return scalar::utf8_to_utf16::convert<endianness::BIG>(
5455 utf8_input.data(), utf8_input.size(), utf16_output.data());
5459 return convert_utf8_to_utf16be(
5460 reinterpret_cast<const char *
>(utf8_input.data()), utf8_input.size(),
5461 utf16_output.data());
// convert_utf8_to_utf16_with_errors: pointer/length overload (declaration
// only; returns a `result`).
5481simdutf_warn_unused result convert_utf8_to_utf16_with_errors(
5482 const char *input,
size_t length,
char16_t *utf16_output)
noexcept;
// Span overload for any detail::input_span_of_byte_like input: casts
// utf8_input.data() to `const char *` and forwards it with utf8_input.size()
// and utf16_output.data() to the overload above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf8_to_utf16::convert_with_errors<endianness::NATIVE> call is also
// present; the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5484simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5485convert_utf8_to_utf16_with_errors(
5486 const detail::input_span_of_byte_like
auto &utf8_input,
5487 std::span<char16_t> utf16_output)
noexcept {
5488 #if SIMDUTF_CPLUSPLUS23
5490 return scalar::utf8_to_utf16::convert_with_errors<endianness::NATIVE>(
5491 utf8_input.data(), utf8_input.size(), utf16_output.data());
5495 return convert_utf8_to_utf16_with_errors(
5496 reinterpret_cast<const char *
>(utf8_input.data()), utf8_input.size(),
5497 utf16_output.data());
// convert_utf8_to_utf16le_with_errors: pointer/length overload (declaration
// only; returns a `result`).
5516simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
5517 const char *input,
size_t length,
char16_t *utf16_output)
noexcept;
// Span overload for any detail::input_span_of_byte_like input: casts
// utf8_input.data() to `const char *` and forwards it with utf8_input.size()
// and utf16_output.data() to the overload above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE> call is also
// present; the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5519simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5520convert_utf8_to_utf16le_with_errors(
5521 const detail::input_span_of_byte_like
auto &utf8_input,
5522 std::span<char16_t> utf16_output)
noexcept {
5523 #if SIMDUTF_CPLUSPLUS23
5525 return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(
5526 utf8_input.data(), utf8_input.size(), utf16_output.data());
5530 return convert_utf8_to_utf16le_with_errors(
5531 reinterpret_cast<const char *
>(utf8_input.data()), utf8_input.size(),
5532 utf16_output.data());
// convert_utf8_to_utf16be_with_errors: pointer/length overload (declaration
// only; returns a `result`).
5551simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
5552 const char *input,
size_t length,
char16_t *utf16_output)
noexcept;
// Span overload for any detail::input_span_of_byte_like input: casts
// utf8_input.data() to `const char *` and forwards it with utf8_input.size()
// and utf16_output.data() to the overload above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf8_to_utf16::convert_with_errors<endianness::BIG> call is also
// present; the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5554simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5555convert_utf8_to_utf16be_with_errors(
5556 const detail::input_span_of_byte_like
auto &utf8_input,
5557 std::span<char16_t> utf16_output)
noexcept {
5558 #if SIMDUTF_CPLUSPLUS23
5560 return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(
5561 utf8_input.data(), utf8_input.size(), utf16_output.data());
5565 return convert_utf8_to_utf16be_with_errors(
5566 reinterpret_cast<const char *
>(utf8_input.data()), utf8_input.size(),
5567 utf16_output.data());
// convert_valid_utf8_to_utf16: pointer/length overload (declaration only;
// returns size_t).
5582simdutf_warn_unused
size_t convert_valid_utf8_to_utf16(
5583 const char *input,
size_t length,
char16_t *utf16_buffer)
noexcept;
// Span overload for any detail::input_span_of_byte_like input: casts
// valid_utf8_input.data() to `const char *` and forwards it with
// valid_utf8_input.size() and utf16_output.data() to the overload above.
// Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf8_to_utf16::convert_valid<endianness::NATIVE> call is also
// present; the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5585simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
5586convert_valid_utf8_to_utf16(
5587 const detail::input_span_of_byte_like
auto &valid_utf8_input,
5588 std::span<char16_t> utf16_output)
noexcept {
5589 #if SIMDUTF_CPLUSPLUS23
5591 return scalar::utf8_to_utf16::convert_valid<endianness::NATIVE>(
5592 valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data());
5596 return convert_valid_utf8_to_utf16(
5597 reinterpret_cast<const char *
>(valid_utf8_input.data()),
5598 valid_utf8_input.size(), utf16_output.data());
// convert_valid_utf8_to_utf16le: pointer/length overload (declaration only;
// returns size_t).
5613simdutf_warn_unused
size_t convert_valid_utf8_to_utf16le(
5614 const char *input,
size_t length,
char16_t *utf16_buffer)
noexcept;
// Span overload for any detail::input_span_of_byte_like input: casts
// valid_utf8_input.data() to `const char *` and forwards it with
// valid_utf8_input.size() and utf16_output.data() to the overload above.
// Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf8_to_utf16::convert_valid<endianness::LITTLE> call is also
// present; the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5616simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
5617convert_valid_utf8_to_utf16le(
5618 const detail::input_span_of_byte_like
auto &valid_utf8_input,
5619 std::span<char16_t> utf16_output)
noexcept {
5621 #if SIMDUTF_CPLUSPLUS23
5623 return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
5624 valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data());
5628 return convert_valid_utf8_to_utf16le(
5629 reinterpret_cast<const char *
>(valid_utf8_input.data()),
5630 valid_utf8_input.size(), utf16_output.data());
// convert_valid_utf8_to_utf16be: pointer/length overload (declaration only;
// returns size_t).
5645simdutf_warn_unused
size_t convert_valid_utf8_to_utf16be(
5646 const char *input,
size_t length,
char16_t *utf16_buffer)
noexcept;
// Span overload for any detail::input_span_of_byte_like input: casts
// valid_utf8_input.data() to `const char *` and forwards it with
// valid_utf8_input.size() and utf16_output.data() to the overload above.
// Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf8_to_utf16::convert_valid<endianness::BIG> call is also present;
// the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5648simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
5649convert_valid_utf8_to_utf16be(
5650 const detail::input_span_of_byte_like
auto &valid_utf8_input,
5651 std::span<char16_t> utf16_output)
noexcept {
5652 #if SIMDUTF_CPLUSPLUS23
5654 return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
5655 valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data());
5659 return convert_valid_utf8_to_utf16be(
5660 reinterpret_cast<const char *
>(valid_utf8_input.data()),
5661 valid_utf8_input.size(), utf16_output.data());
// utf16_length_from_utf8: pointer/length overload (declaration only; returns
// size_t).
5680simdutf_warn_unused
size_t utf16_length_from_utf8(
const char *input,
5681 size_t length)
noexcept;
// Span overload for any detail::input_span_of_byte_like input: casts
// valid_utf8_input.data() to `const char *` and forwards it with
// valid_utf8_input.size() to the overload above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf8::utf16_length_from_utf8 call is also present; the lines
// selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5683simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
5684utf16_length_from_utf8(
5685 const detail::input_span_of_byte_like
auto &valid_utf8_input)
noexcept {
5686 #if SIMDUTF_CPLUSPLUS23
5688 return scalar::utf8::utf16_length_from_utf8(valid_utf8_input.data(),
5689 valid_utf8_input.size());
5693 return utf16_length_from_utf8(
5694 reinterpret_cast<const char *
>(valid_utf8_input.data()),
5695 valid_utf8_input.size());
// convert_utf16_to_utf8: pointer-based overload (declaration only; returns
// size_t). One parameter line (original line 5716) is absent from this listing.
5715simdutf_warn_unused
size_t convert_utf16_to_utf8(
const char16_t *input,
5717 char *utf8_buffer)
noexcept;
// Span overload: the output may be any detail::output_span_of_byte_like; its
// data() is cast to `char *` and forwarded with utf16_input.data()/size() to
// the overload above (utf8_output.size() is not consulted). Under
// SIMDUTF_CPLUSPLUS23 a scalar::utf16_to_utf8::convert<endianness::NATIVE>
// call is also present; the lines selecting between the two returns are not
// visible here (NOTE(review): presumably a compile-time-evaluation branch -
// confirm).
5719simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
5720convert_utf16_to_utf8(
5721 std::span<const char16_t> utf16_input,
5722 detail::output_span_of_byte_like
auto &&utf8_output)
noexcept {
5723 #if SIMDUTF_CPLUSPLUS23
5725 return scalar::utf16_to_utf8::convert<endianness::NATIVE>(
5726 utf16_input.data(), utf16_input.size(), utf8_output.data());
5730 return convert_utf16_to_utf8(utf16_input.data(), utf16_input.size(),
5731 reinterpret_cast<char *
>(utf8_output.data()));
// convert_utf16_to_utf8_safe: pointer-based overload taking an explicit output
// length `utf8_len` (declaration only; returns size_t). Some parameter lines
// (original lines 5755-5756) are absent from this listing.
5754simdutf_warn_unused
size_t convert_utf16_to_utf8_safe(
const char16_t *input,
5757 size_t utf8_len)
noexcept;
// Span overload: forwards utf16_input.data()/size() plus the output span's
// data() (cast to `char *`) and size() to the overload above.
// Under SIMDUTF_CPLUSPLUS23 the visible lines call
// scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE, true> with the
// output size, test whether the error is neither SUCCESS nor
// OUTPUT_BUFFER_TOO_SMALL, and return r.output_count. What happens inside that
// `if` (original lines 5777-5778) is not visible here
// (NOTE(review): presumably an early return - confirm against the full header).
5759simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
5760convert_utf16_to_utf8_safe(
5761 std::span<const char16_t> utf16_input,
5762 detail::output_span_of_byte_like
auto &&utf8_output)
noexcept {
5769 #if SIMDUTF_CPLUSPLUS23
5771 const full_result r =
5772 scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE, true>(
5773 utf16_input.data(), utf16_input.size(), utf8_output.data(),
5774 utf8_output.size());
5775 if (r.error != error_code::SUCCESS &&
5776 r.error != error_code::OUTPUT_BUFFER_TOO_SMALL) {
5779 return r.output_count;
5783 return convert_utf16_to_utf8_safe(
5784 utf16_input.data(), utf16_input.size(),
5785 reinterpret_cast<char *
>(utf8_output.data()), utf8_output.size());
// convert_utf16le_to_utf8: pointer-based overload (declaration only; returns
// size_t). One parameter line (original line 5805) is absent from this listing.
5804simdutf_warn_unused
size_t convert_utf16le_to_utf8(
const char16_t *input,
5806 char *utf8_buffer)
noexcept;
// Span overload: the output may be any detail::output_span_of_byte_like; its
// data() is cast to `char *` and forwarded with utf16_input.data()/size() to
// the overload above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16_to_utf8::convert<endianness::LITTLE> call is also present; the
// lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5808simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
5809convert_utf16le_to_utf8(
5810 std::span<const char16_t> utf16_input,
5811 detail::output_span_of_byte_like
auto &&utf8_output)
noexcept {
5812 #if SIMDUTF_CPLUSPLUS23
5814 return scalar::utf16_to_utf8::convert<endianness::LITTLE>(
5815 utf16_input.data(), utf16_input.size(), utf8_output.data());
5819 return convert_utf16le_to_utf8(
5820 utf16_input.data(), utf16_input.size(),
5821 reinterpret_cast<char *
>(utf8_output.data()));
// convert_utf16be_to_utf8: pointer-based overload (declaration only; returns
// size_t). One parameter line (original line 5841) is absent from this listing.
5840simdutf_warn_unused
size_t convert_utf16be_to_utf8(
const char16_t *input,
5842 char *utf8_buffer)
noexcept;
// Span overload: the output may be any detail::output_span_of_byte_like; its
// data() is cast to `char *` and forwarded with utf16_input.data()/size() to
// the overload above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16_to_utf8::convert<endianness::BIG> call is also present; the
// lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5844simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
5845convert_utf16be_to_utf8(
5846 std::span<const char16_t> utf16_input,
5847 detail::output_span_of_byte_like
auto &&utf8_output)
noexcept {
5848 #if SIMDUTF_CPLUSPLUS23
5850 return scalar::utf16_to_utf8::convert<endianness::BIG>(
5851 utf16_input.data(), utf16_input.size(), utf8_output.data());
5855 return convert_utf16be_to_utf8(
5856 utf16_input.data(), utf16_input.size(),
5857 reinterpret_cast<char *
>(utf8_output.data()));
// convert_utf16_to_utf8_with_errors: pointer/length overload (declaration
// only; returns a `result`).
5879simdutf_warn_unused result convert_utf16_to_utf8_with_errors(
5880 const char16_t *input,
size_t length,
char *utf8_buffer)
noexcept;
// Span overload: the output may be any detail::output_span_of_byte_like; its
// data() is cast to `char *` and forwarded with utf16_input.data()/size() to
// the overload above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE> call is also
// present; the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5882simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5883convert_utf16_to_utf8_with_errors(
5884 std::span<const char16_t> utf16_input,
5885 detail::output_span_of_byte_like
auto &&utf8_output)
noexcept {
5886 #if SIMDUTF_CPLUSPLUS23
5888 return scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE>(
5889 utf16_input.data(), utf16_input.size(), utf8_output.data());
5893 return convert_utf16_to_utf8_with_errors(
5894 utf16_input.data(), utf16_input.size(),
5895 reinterpret_cast<char *
>(utf8_output.data()));
// convert_utf16le_to_utf8_with_errors: pointer/length overload (declaration
// only; returns a `result`).
5916simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
5917 const char16_t *input,
size_t length,
char *utf8_buffer)
noexcept;
// Span overload: the output may be any detail::output_span_of_byte_like; its
// data() is cast to `char *` and forwarded with utf16_input.data()/size() to
// the overload above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE> call is also
// present; the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5919simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5920convert_utf16le_to_utf8_with_errors(
5921 std::span<const char16_t> utf16_input,
5922 detail::output_span_of_byte_like
auto &&utf8_output)
noexcept {
5923 #if SIMDUTF_CPLUSPLUS23
5925 return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
5926 utf16_input.data(), utf16_input.size(), utf8_output.data());
5930 return convert_utf16le_to_utf8_with_errors(
5931 utf16_input.data(), utf16_input.size(),
5932 reinterpret_cast<char *
>(utf8_output.data()));
// convert_utf16be_to_utf8_with_errors: pointer/length overload (declaration
// only; returns a `result`).
5953simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
5954 const char16_t *input,
size_t length,
char *utf8_buffer)
noexcept;
// Span overload: the output may be any detail::output_span_of_byte_like; its
// data() is cast to `char *` and forwarded with utf16_input.data()/size() to
// the overload above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16_to_utf8::convert_with_errors<endianness::BIG> call is also
// present; the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5956simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5957convert_utf16be_to_utf8_with_errors(
5958 std::span<const char16_t> utf16_input,
5959 detail::output_span_of_byte_like
auto &&utf8_output)
noexcept {
5960 #if SIMDUTF_CPLUSPLUS23
5962 return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
5963 utf16_input.data(), utf16_input.size(), utf8_output.data());
5967 return convert_utf16be_to_utf8_with_errors(
5968 utf16_input.data(), utf16_input.size(),
5969 reinterpret_cast<char *
>(utf8_output.data()));
// convert_valid_utf16_to_utf8: pointer/length overload (declaration only;
// returns size_t).
5987simdutf_warn_unused
size_t convert_valid_utf16_to_utf8(
5988 const char16_t *input,
size_t length,
char *utf8_buffer)
noexcept;
// Span overload: the output may be any detail::output_span_of_byte_like; its
// data() is cast to `char *` and forwarded with valid_utf16_input.data()/size()
// to the overload above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16_to_utf8::convert_valid<endianness::NATIVE> call is also
// present; the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
5990simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
5991convert_valid_utf16_to_utf8(
5992 std::span<const char16_t> valid_utf16_input,
5993 detail::output_span_of_byte_like
auto &&utf8_output)
noexcept {
5994 #if SIMDUTF_CPLUSPLUS23
5996 return scalar::utf16_to_utf8::convert_valid<endianness::NATIVE>(
5997 valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data());
6001 return convert_valid_utf16_to_utf8(
6002 valid_utf16_input.data(), valid_utf16_input.size(),
6003 reinterpret_cast<char *
>(utf8_output.data()));
// convert_valid_utf16le_to_utf8: pointer/length overload (declaration only;
// returns size_t).
6021simdutf_warn_unused
size_t convert_valid_utf16le_to_utf8(
6022 const char16_t *input,
size_t length,
char *utf8_buffer)
noexcept;
// Span overload: the output may be any detail::output_span_of_byte_like; its
// data() is cast to `char *` and forwarded with valid_utf16_input.data()/size()
// to the overload above. Under SIMDUTF_CPLUSPLUS23 a scalar
// convert_valid call is also present; the lines selecting between the two
// returns are not visible here (NOTE(review): presumably a
// compile-time-evaluation branch - confirm).
//
// The scalar path is instantiated with endianness::LITTLE so that it agrees
// with the little-endian overload it stands in for. It previously used
// endianness::NATIVE, unlike every other "le" span overload in this header,
// which makes the two paths disagree wherever native order is not
// little-endian.
6024simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
6025convert_valid_utf16le_to_utf8(
6026 std::span<const char16_t> valid_utf16_input,
6027 detail::output_span_of_byte_like
auto &&utf8_output)
noexcept {
6028 #if SIMDUTF_CPLUSPLUS23
6030 return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(
6031 valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data());
6035 return convert_valid_utf16le_to_utf8(
6036 valid_utf16_input.data(), valid_utf16_input.size(),
6037 reinterpret_cast<char *
>(utf8_output.data()));
// convert_valid_utf16be_to_utf8: pointer/length overload (declaration only;
// returns size_t).
6055simdutf_warn_unused
size_t convert_valid_utf16be_to_utf8(
6056 const char16_t *input,
size_t length,
char *utf8_buffer)
noexcept;
// Span overload: the output may be any detail::output_span_of_byte_like; its
// data() is cast to `char *` and forwarded with valid_utf16_input.data()/size()
// to the overload above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16_to_utf8::convert_valid<endianness::BIG> call is also present;
// the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
6058simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
6059convert_valid_utf16be_to_utf8(
6060 std::span<const char16_t> valid_utf16_input,
6061 detail::output_span_of_byte_like
auto &&utf8_output)
noexcept {
6062 #if SIMDUTF_CPLUSPLUS23
6064 return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(
6065 valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data());
6069 return convert_valid_utf16be_to_utf8(
6070 valid_utf16_input.data(), valid_utf16_input.size(),
6071 reinterpret_cast<char *
>(utf8_output.data()));
// utf8_length_from_utf16: pointer/length overload (declaration only; returns
// size_t).
6087simdutf_warn_unused
size_t utf8_length_from_utf16(
const char16_t *input,
6088 size_t length)
noexcept;
// std::span overload: forwards valid_utf16_input.data()/size() to the overload
// above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16::utf8_length_from_utf16<endianness::NATIVE> call is also
// present; the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
6090simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
6091utf8_length_from_utf16(std::span<const char16_t> valid_utf16_input)
noexcept {
6092 #if SIMDUTF_CPLUSPLUS23
6094 return scalar::utf16::utf8_length_from_utf16<endianness::NATIVE>(
6095 valid_utf16_input.data(), valid_utf16_input.size());
6099 return utf8_length_from_utf16(valid_utf16_input.data(),
6100 valid_utf16_input.size());
// utf8_length_from_utf16_with_replacement: pointer/length overload
// (declaration only; returns a `result`).
6123simdutf_warn_unused result utf8_length_from_utf16_with_replacement(
6124 const char16_t *input,
size_t length)
noexcept;
// std::span overload: forwards valid_utf16_input.data()/size() to the overload
// above. Under SIMDUTF_CPLUSPLUS23 it can instead return
// scalar::utf16::utf8_length_from_utf16_with_replacement<endianness::NATIVE>;
// the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
6126simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
6127utf8_length_from_utf16_with_replacement(
6128 std::span<const char16_t> valid_utf16_input)
noexcept {
6129 #if SIMDUTF_CPLUSPLUS23
6131 return scalar::utf16::utf8_length_from_utf16_with_replacement<
6132 endianness::NATIVE>(valid_utf16_input.data(), valid_utf16_input.size());
6136 return utf8_length_from_utf16_with_replacement(valid_utf16_input.data(),
6137 valid_utf16_input.size());
// utf8_length_from_utf16le: pointer/length overload (declaration only; returns
// size_t).
6153simdutf_warn_unused
size_t utf8_length_from_utf16le(
const char16_t *input,
6154 size_t length)
noexcept;
// std::span overload: forwards valid_utf16_input.data()/size() to the overload
// above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16::utf8_length_from_utf16<endianness::LITTLE> call is also
// present; the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
6156simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused
size_t
6157utf8_length_from_utf16le(std::span<const char16_t> valid_utf16_input)
noexcept {
6158 #if SIMDUTF_CPLUSPLUS23
6160 return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(
6161 valid_utf16_input.data(), valid_utf16_input.size());
6165 return utf8_length_from_utf16le(valid_utf16_input.data(),
6166 valid_utf16_input.size());
// utf8_length_from_utf16be: pointer/length overload (declaration only; returns
// size_t).
6182simdutf_warn_unused
size_t utf8_length_from_utf16be(
const char16_t *input,
6183 size_t length)
noexcept;
// std::span overload: forwards valid_utf16_input.data()/size() to the overload
// above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16::utf8_length_from_utf16<endianness::BIG> call is also present;
// the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
6185simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
6186utf8_length_from_utf16be(std::span<const char16_t> valid_utf16_input)
noexcept {
6187 #if SIMDUTF_CPLUSPLUS23
6189 return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(
6190 valid_utf16_input.data(), valid_utf16_input.size());
6194 return utf8_length_from_utf16be(valid_utf16_input.data(),
6195 valid_utf16_input.size());
// change_endianness_utf16: pointer/length overload writing to `output`
// (declaration only; returns void).
6213void change_endianness_utf16(
const char16_t *input,
size_t length,
6214 char16_t *output)
noexcept;
// std::span overload: passes utf16_input.data()/size() and utf16_output.data()
// on (utf16_output.size() is not consulted). Both visible paths use
// `return <void call>;`, which is valid in a void function. Under
// SIMDUTF_CPLUSPLUS23 a scalar::utf16::change_endianness_utf16 call is also
// present; the lines selecting between the two are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
6216simdutf_really_inline simdutf_constexpr23
void
6217change_endianness_utf16(std::span<const char16_t> utf16_input,
6218 std::span<char16_t> utf16_output)
noexcept {
6219 #if SIMDUTF_CPLUSPLUS23
6221 return scalar::utf16::change_endianness_utf16(
6222 utf16_input.data(), utf16_input.size(), utf16_output.data());
6226 return change_endianness_utf16(utf16_input.data(), utf16_input.size(),
6227 utf16_output.data());
// count_utf16: pointer/length overload (declaration only; returns size_t).
6246simdutf_warn_unused
size_t count_utf16(
const char16_t *input,
6247 size_t length)
noexcept;
// std::span overload: forwards valid_utf16_input.data()/size() to the overload
// above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16::count_code_points<endianness::NATIVE> call is also present;
// the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
6249simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
6250count_utf16(std::span<const char16_t> valid_utf16_input)
noexcept {
6251 #if SIMDUTF_CPLUSPLUS23
6253 return scalar::utf16::count_code_points<endianness::NATIVE>(
6254 valid_utf16_input.data(), valid_utf16_input.size());
6258 return count_utf16(valid_utf16_input.data(), valid_utf16_input.size());
// count_utf16le: pointer/length overload (declaration only; returns size_t).
6277simdutf_warn_unused
size_t count_utf16le(
const char16_t *input,
6278 size_t length)
noexcept;
// std::span overload: forwards valid_utf16_input.data()/size() to the overload
// above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16::count_code_points<endianness::LITTLE> call is also present;
// the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
6280simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
6281count_utf16le(std::span<const char16_t> valid_utf16_input)
noexcept {
6282 #if SIMDUTF_CPLUSPLUS23
6284 return scalar::utf16::count_code_points<endianness::LITTLE>(
6285 valid_utf16_input.data(), valid_utf16_input.size());
6289 return count_utf16le(valid_utf16_input.data(), valid_utf16_input.size());
// count_utf16be: pointer/length overload (declaration only; returns size_t).
6308simdutf_warn_unused
size_t count_utf16be(
const char16_t *input,
6309 size_t length)
noexcept;
// std::span overload: forwards valid_utf16_input.data()/size() to the overload
// above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16::count_code_points<endianness::BIG> call is also present; the
// lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
6311simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
6312count_utf16be(std::span<const char16_t> valid_utf16_input)
noexcept {
6313 #if SIMDUTF_CPLUSPLUS23
6315 return scalar::utf16::count_code_points<endianness::BIG>(
6316 valid_utf16_input.data(), valid_utf16_input.size());
6320 return count_utf16be(valid_utf16_input.data(), valid_utf16_input.size());
// count_utf8: pointer/length overload (declaration only; returns size_t).
6337simdutf_warn_unused
size_t count_utf8(
const char *input,
6338 size_t length)
noexcept;
// Span overload for any detail::input_span_of_byte_like input: casts
// valid_utf8_input.data() to `const char *` and forwards it with
// valid_utf8_input.size() to the overload above. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf8::count_code_points call is also present; the lines selecting
// between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
6340simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t count_utf8(
6341 const detail::input_span_of_byte_like
auto &valid_utf8_input)
noexcept {
6342 #if SIMDUTF_CPLUSPLUS23
6344 return scalar::utf8::count_code_points(valid_utf8_input.data(),
6345 valid_utf8_input.size());
6349 return count_utf8(
reinterpret_cast<const char *
>(valid_utf8_input.data()),
6350 valid_utf8_input.size());
// trim_partial_utf8: pointer/length overload (declaration only; returns
// size_t). Unlike the neighbouring declarations, this one carries no
// `noexcept` in the visible text.
6369simdutf_warn_unused
size_t trim_partial_utf8(
const char *input,
size_t length);
// Span overload for any detail::input_span_of_byte_like input. The line
// carrying the function name (original line 6372) is absent from this listing;
// the body casts valid_utf8_input.data() to `const char *` and forwards it with
// valid_utf8_input.size() to trim_partial_utf8. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf8::trim_partial_utf8 call is also present; the lines selecting
// between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
6371simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
6373 const detail::input_span_of_byte_like
auto &valid_utf8_input)
noexcept {
6374 #if SIMDUTF_CPLUSPLUS23
6376 return scalar::utf8::trim_partial_utf8(valid_utf8_input.data(),
6377 valid_utf8_input.size());
6381 return trim_partial_utf8(
6382 reinterpret_cast<const char *
>(valid_utf8_input.data()),
6383 valid_utf8_input.size());
// trim_partial_utf16be: pointer-based overload (declaration only; returns
// size_t). The end of the declaration (original line 6403) is absent from
// this listing.
6402simdutf_warn_unused
size_t trim_partial_utf16be(
const char16_t *input,
// std::span overload: forwards valid_utf16_input.data()/size() to the
// pointer-based overload. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16::trim_partial_utf16<endianness::BIG> call is also present; the
// lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
6405simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
6406trim_partial_utf16be(std::span<const char16_t> valid_utf16_input)
noexcept {
6407 #if SIMDUTF_CPLUSPLUS23
6409 return scalar::utf16::trim_partial_utf16<endianness::BIG>(
6410 valid_utf16_input.data(), valid_utf16_input.size());
6414 return trim_partial_utf16be(valid_utf16_input.data(),
6415 valid_utf16_input.size());
// trim_partial_utf16le: pointer-based overload (declaration only; returns
// size_t). The end of the declaration (original line 6435) is absent from
// this listing.
6434simdutf_warn_unused
size_t trim_partial_utf16le(
const char16_t *input,
// std::span overload: forwards valid_utf16_input.data()/size() to the
// pointer-based overload. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16::trim_partial_utf16<endianness::LITTLE> call is also present;
// the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
6437simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
6438trim_partial_utf16le(std::span<const char16_t> valid_utf16_input)
noexcept {
6439 #if SIMDUTF_CPLUSPLUS23
6441 return scalar::utf16::trim_partial_utf16<endianness::LITTLE>(
6442 valid_utf16_input.data(), valid_utf16_input.size());
6446 return trim_partial_utf16le(valid_utf16_input.data(),
6447 valid_utf16_input.size());
// trim_partial_utf16: pointer-based overload (declaration only; returns
// size_t). The end of the declaration (original line 6467) is absent from
// this listing.
6466simdutf_warn_unused
size_t trim_partial_utf16(
const char16_t *input,
// std::span overload: forwards valid_utf16_input.data()/size() to the
// pointer-based overload. Under SIMDUTF_CPLUSPLUS23 a
// scalar::utf16::trim_partial_utf16<endianness::NATIVE> call is also present;
// the lines selecting between the two returns are not visible here
// (NOTE(review): presumably a compile-time-evaluation branch - confirm).
6469simdutf_really_inline simdutf_warn_unused simdutf_constexpr23
size_t
6470trim_partial_utf16(std::span<const char16_t> valid_utf16_input)
noexcept {
6471 #if SIMDUTF_CPLUSPLUS23
6473 return scalar::utf16::trim_partial_utf16<endianness::NATIVE>(
6474 valid_utf16_input.data(), valid_utf16_input.size());
6478 return trim_partial_utf16(valid_utf16_input.data(),
6479 valid_utf16_input.size());
6484 #ifndef SIMDUTF_NEED_TRAILING_ZEROES
6485 #define SIMDUTF_NEED_TRAILING_ZEROES 1
// Virtual accessor: returns a std::string constructed from the _name member
// (a fresh copy on every call).
6506 virtual std::string
name()
const {
return std::string(
_name); }
6551 size_t len)
const noexcept = 0;
6565 simdutf_warn_unused
virtual result
6582 simdutf_warn_unused
virtual bool
6599 simdutf_warn_unused
virtual bool
6618 simdutf_warn_unused
virtual result
6620 size_t len)
const noexcept = 0;
6638 simdutf_warn_unused
virtual result
6640 size_t len)
const noexcept = 0;
6654 char16_t *output)
const noexcept = 0;
6668 char16_t *output)
const noexcept = 0;
6682 simdutf_warn_unused
virtual size_t
6684 char16_t *utf16_output)
const noexcept = 0;
6698 simdutf_warn_unused
virtual size_t
6700 char16_t *utf16_output)
const noexcept = 0;
6718 const char *input,
size_t length,
6719 char16_t *utf16_output)
const noexcept = 0;
6737 const char *input,
size_t length,
6738 char16_t *utf16_output)
const noexcept = 0;
6759 const char16_t *input,
size_t length)
const noexcept = 0;
6781 const char16_t *input,
size_t length)
const noexcept = 0;
6793 simdutf_warn_unused
virtual size_t
6795 char16_t *utf16_buffer)
const noexcept = 0;
6807 simdutf_warn_unused
virtual size_t
6809 char16_t *utf16_buffer)
const noexcept = 0;
6823 simdutf_warn_unused
virtual size_t
6841 simdutf_warn_unused
virtual size_t
6843 char *utf8_buffer)
const noexcept = 0;
6860 simdutf_warn_unused
virtual size_t
6862 char *utf8_buffer)
const noexcept = 0;
6882 simdutf_warn_unused
virtual result
6884 char *utf8_buffer)
const noexcept = 0;
6904 simdutf_warn_unused
virtual result
6906 char *utf8_buffer)
const noexcept = 0;
6922 simdutf_warn_unused
virtual size_t
6924 char *utf8_buffer)
const noexcept = 0;
6940 simdutf_warn_unused
virtual size_t
6942 char *utf8_buffer)
const noexcept = 0;
6958 simdutf_warn_unused
virtual size_t
6960 size_t length)
const noexcept = 0;
6976 simdutf_warn_unused
virtual size_t
6978 size_t length)
const noexcept = 0;
6995 char16_t *output)
const noexcept = 0;
7012 simdutf_warn_unused
virtual size_t
7030 simdutf_warn_unused
virtual size_t
7045 simdutf_warn_unused
virtual size_t
7048#ifdef SIMDUTF_INTERNAL_TESTS
7057 struct TestProcedure {
7065 virtual std::vector<TestProcedure> internal_tests()
const;
7108 size_t size() const noexcept;
7129 if (impl->
name() == name) {
7156#if defined(SIMDUTF_NO_THREADS)
// Pointer-like accessors for the SIMDUTF_NO_THREADS configuration: `ptr` is
// read directly. Const and non-const forms of the pointer conversion,
// dereference and member-access operators are provided.
7157 operator const T *()
const {
return ptr; }
7158 const T &operator*()
const {
return *ptr; }
7159 const T *operator->()
const {
return ptr; }
7161 operator T *() {
return ptr; }
7162 T &operator*() {
return *ptr; }
7163 T *operator->() {
return ptr; }
// Pointer-like accessors that read `ptr` through ptr.load() for the conversion
// and member-access operators; the dereference operators use `*ptr` directly.
// NOTE(review): presumably the branch taken when SIMDUTF_NO_THREADS is not
// defined (the #else line is not visible in this listing) - confirm.
7170 operator const T *()
const {
return ptr.load(); }
7171 const T &operator*()
const {
return *ptr; }
7172 const T *operator->()
const {
return ptr.load(); }
7174 operator T *() {
return ptr.load(); }
7175 T &operator*() {
return *ptr; }
7176 T *operator->() {
return ptr.load(); }
7185#if defined(SIMDUTF_NO_THREADS)
7188 std::atomic<T *> ptr;
7200get_available_implementations();
7209get_active_implementation();
7222SIMDUTF_POP_DISABLE_WARNINGS
Definition: simdutf.h:6495
virtual simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char *input, size_t length, char16_t *utf16_buffer) const noexcept=0
virtual simdutf_warn_unused size_t convert_utf8_to_utf16be(const char *input, size_t length, char16_t *utf16_output) const noexcept=0
virtual simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
virtual simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept=0
const char * _description
Definition: simdutf.h:7089
virtual simdutf_warn_unused size_t count_utf8(const char *input, size_t length) const noexcept=0
virtual simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
virtual void to_well_formed_utf16le(const char16_t *input, size_t len, char16_t *output) const noexcept=0
virtual simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept=0
virtual simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept=0
virtual simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
virtual simdutf_warn_unused size_t convert_utf8_to_utf16le(const char *input, size_t length, char16_t *utf16_output) const noexcept=0
virtual simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(const char16_t *input, size_t length) const noexcept=0
virtual simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept=0
virtual simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
virtual simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(const char16_t *input, size_t length) const noexcept=0
virtual simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept=0
virtual simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char *input, size_t length, char16_t *utf16_output) const noexcept=0
virtual simdutf_warn_unused size_t count_utf16le(const char16_t *input, size_t length) const noexcept=0
simdutf_really_inline implementation(const char *name, const char *description, uint32_t required_instruction_sets)
Definition: simdutf.h:7071
bool supported_by_runtime_system() const
Definition: simdutf.cpp:10267
virtual simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept=0
virtual simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
virtual uint32_t required_instruction_sets() const
Definition: simdutf.h:6537
virtual void change_endianness_utf16(const char16_t *input, size_t length, char16_t *output) const noexcept=0
virtual simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char *input, size_t length, char16_t *utf16_buffer) const noexcept=0
virtual std::string name() const
Definition: simdutf.h:6506
virtual simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept=0
const char * _name
Definition: simdutf.h:7084
virtual simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
virtual simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept=0
virtual simdutf_warn_unused size_t count_utf16be(const char16_t *input, size_t length) const noexcept=0
virtual void to_well_formed_utf16be(const char16_t *input, size_t len, char16_t *output) const noexcept=0
virtual simdutf_warn_unused size_t utf16_length_from_utf8(const char *input, size_t length) const noexcept=0
virtual simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char *input, size_t length, char16_t *utf16_output) const noexcept=0
const uint32_t _required_instruction_sets
Definition: simdutf.h:7094
virtual std::string description() const
Definition: simdutf.h:6517
Definition: simdutf.h:7152
Definition: simdutf.h:7103
const implementation *const * begin() const noexcept
Definition: simdutf.cpp:10765
size_t size() const noexcept
Definition: simdutf.cpp:10761
simdutf_really_inline available_implementation_list()
Definition: simdutf.h:7106
const implementation *const * end() const noexcept
Definition: simdutf.cpp:10769
const implementation * detect_best_supported() const noexcept
Definition: simdutf.cpp:10773
Definition: simdutf.cpp:10378
Definition: simdutf.h:882
Definition: simdutf.h:860