mbxmlutils  1.3.0
Multi-Body XML Utils
simdutf.h
1/* auto-generated on 2026-01-30 11:51:34 -0500. Do not edit! */
2/* begin file include/simdutf.h */
3#ifndef SIMDUTF_H
4#define SIMDUTF_H
5#include <cstring>
6
7/* begin file include/simdutf/compiler_check.h */
// Compiler and language-standard checks.
#if !defined(SIMDUTF_COMPILER_CHECK_H)
#define SIMDUTF_COMPILER_CHECK_H

// Only C++ translation units are accepted.
#if !defined(__cplusplus)
  #error simdutf requires a C++ compiler
#endif

// SIMDUTF_CPLUSPLUS is the language-standard value used by every check below.
// It can be predefined by the user; otherwise it is taken from the compiler.
#if !defined(SIMDUTF_CPLUSPLUS)
  #if !defined(_MSVC_LANG) || defined(__clang__)
    #define SIMDUTF_CPLUSPLUS __cplusplus
  #else
    // Non-clang compiler exposing _MSVC_LANG: _MSC_VER 1900 is treated as
    // C++11, every other version reports _MSVC_LANG.
    #define SIMDUTF_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG)
  #endif
#endif

// One flag per supported standard. A flag is set to 1 when the standard in
// use is at least that revision; flags predefined by the user are left alone.

// C++ 11
#if !defined(SIMDUTF_CPLUSPLUS11) && (SIMDUTF_CPLUSPLUS >= 201103L)
  #define SIMDUTF_CPLUSPLUS11 1
#endif

// C++ 14
#if !defined(SIMDUTF_CPLUSPLUS14) && (SIMDUTF_CPLUSPLUS >= 201402L)
  #define SIMDUTF_CPLUSPLUS14 1
#endif

// C++ 17
#if !defined(SIMDUTF_CPLUSPLUS17) && (SIMDUTF_CPLUSPLUS >= 201703L)
  #define SIMDUTF_CPLUSPLUS17 1
#endif

// C++ 20
#if !defined(SIMDUTF_CPLUSPLUS20) && (SIMDUTF_CPLUSPLUS >= 202002L)
  #define SIMDUTF_CPLUSPLUS20 1
#endif

// C++ 23
#if !defined(SIMDUTF_CPLUSPLUS23) && (SIMDUTF_CPLUSPLUS >= 202302L)
  #define SIMDUTF_CPLUSPLUS23 1
#endif

// C++ 26
#if !defined(SIMDUTF_CPLUSPLUS26) && (SIMDUTF_CPLUSPLUS >= 202602L)
  #define SIMDUTF_CPLUSPLUS26 1
#endif

// C++11 is the minimum supported standard.
#if !defined(SIMDUTF_CPLUSPLUS11)
  #error simdutf requires a compiler compliant with the C++11 standard
#endif

#endif // SIMDUTF_COMPILER_CHECK_H
58/* end file include/simdutf/compiler_check.h */
59/* begin file include/simdutf/common_defs.h */
60#ifndef SIMDUTF_COMMON_DEFS_H
61#define SIMDUTF_COMMON_DEFS_H
62
63/* begin file include/simdutf/portability.h */
64#ifndef SIMDUTF_PORTABILITY_H
65#define SIMDUTF_PORTABILITY_H
66
67
68#include <cfloat>
69#include <cstddef>
70#include <cstdint>
71#include <cstdlib>
72#ifndef _WIN32
73 // strcasecmp, strncasecmp
74 #include <strings.h>
75#endif
76
// Apple clang builds older than 14000000 get SIMDUTF_SPAN_DISABLED:
// apple-clang/13 doesn't support std::convertible_to.
#if defined(__apple_build_version__) && (__apple_build_version__ < 14000000)
  #define SIMDUTF_SPAN_DISABLED 1
#endif

// Optional facilities, probed through the feature-test macros of <version>.
// Nothing here is defined unless SIMDUTF_CPLUSPLUS20 is set.
#if SIMDUTF_CPLUSPLUS20
  #include <version>
  // SIMDUTF_SPAN: concepts and std::span are both reported, and span support
  // has not been disabled above.
  #if !defined(SIMDUTF_SPAN_DISABLED) && (__cpp_concepts >= 201907L) &&        \
      (__cpp_lib_span >= 202002L)
    #define SIMDUTF_SPAN 1
  #endif
  // SIMDUTF_ATOMIC_REF: std::atomic_ref is reported by the library.
  #if __cpp_lib_atomic_ref >= 201806L
    #define SIMDUTF_ATOMIC_REF 1
  #endif
  // SIMDUTF_MAYBE_UNUSED_AVAILABLE: the [[maybe_unused]] attribute is known.
  #if __has_cpp_attribute(maybe_unused) >= 201603L
    #define SIMDUTF_MAYBE_UNUSED_AVAILABLE 1
  #endif
#endif
97
// Compile-time endianness detection.
// Defines SIMDUTF_IS_BIG_ENDIAN to a value that is non-zero on big-endian
// targets and zero on little-endian targets.
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
  // The compiler predefines the byte order: use it directly.
  #define SIMDUTF_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#elif defined(_WIN32)
  // Windows targets are treated as little endian.
  #define SIMDUTF_IS_BIG_ENDIAN 0
#else
  // Pull in a system header that may provide the byte-order macros.
  #if defined(__APPLE__) || defined(__FreeBSD__)
    #include <machine/endian.h>
  #elif defined(sun) || defined(__sun)
    #include <sys/byteorder.h>
  #else
    // __has_include must be tested on its own line: a preprocessor that does
    // not know it cannot parse the combined expression.
    #ifdef __has_include
      #if __has_include(<endian.h>)
        #include <endian.h>
      #endif //__has_include(<endian.h>)
    #endif //__has_include
  #endif

  // The original first test was written `#ifndef !defined(...) || ...`, which
  // is ill-formed because #ifndef accepts a single macro name. It is now a
  // proper #if, chained with the comparison so that SIMDUTF_IS_BIG_ENDIAN is
  // defined exactly once.
  #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__)
    // No byte-order information at all: assume little endian.
    #define SIMDUTF_IS_BIG_ENDIAN 0
  #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    #define SIMDUTF_IS_BIG_ENDIAN 0
  #else
    #define SIMDUTF_IS_BIG_ENDIAN 1
  #endif

#endif // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__
136
// Visual Studio detection. _MSC_VER is also defined by clang when it runs
// under Visual Studio, so the two cases are told apart with __clang__.
#if defined(_MSC_VER)
  #define SIMDUTF_VISUAL_STUDIO 1
  #if !defined(__clang__)
    // just regular visual studio (best guess)
    #define SIMDUTF_REGULAR_VISUAL_STUDIO 1
  #else
    // clang under visual studio
    #define SIMDUTF_CLANG_VISUAL_STUDIO 1
  #endif
#endif // _MSC_VER

#if defined(SIMDUTF_REGULAR_VISUAL_STUDIO)
  // Alternative operator tokens (and, or, not, ...), see
  // https://en.wikipedia.org/wiki/C_alternative_tokens
  // This header should have no effect, except maybe under Visual Studio.
  #include <iso646.h>
#endif
168
// Target architecture detection. The branches are tested in order, so an
// ARM64EC build (which may also define _M_AMD64) lands in the ARM64 branch.
#if !defined(_M_ARM64EC) && (defined(__x86_64__) || defined(_M_AMD64))
  #define SIMDUTF_IS_X86_64 1
#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
  #define SIMDUTF_IS_ARM64 1
#elif defined(__PPC64__) || defined(_M_PPC64)
  // 64-bit PowerPC is only flagged when the vector macros are present.
  #if defined(__VEC__) && defined(__ALTIVEC__)
    #define SIMDUTF_IS_PPC64 1
  #endif
#elif defined(__s390__)
// s390 IBM system. Big endian. No flag is defined for it.
#elif (defined(__riscv) || defined(__riscv__)) && __riscv_xlen == 64
  // RISC-V 64-bit
  #define SIMDUTF_IS_RISCV64 1

  // A check for v1.0 intrinsics (__riscv_v_intrinsic >= 1000000, which would
  // also define SIMDUTF_HAS_RVV_TARGET_REGION) is currently disabled; only
  // compiler versions implementing pre-v1.0 intrinsics are checked for.
  #if __riscv_v_intrinsic >= 11000
    #define SIMDUTF_HAS_RVV_INTRINSICS 1
  #endif

  // there is currently no way to detect this
  #define SIMDUTF_HAS_ZVBB_INTRINSICS 0

  #if SIMDUTF_HAS_RVV_INTRINSICS && __riscv_vector &&                          \
      __riscv_v_min_vlen >= 128 && __riscv_v_elen >= 64
    // RISC-V V extension
    #define SIMDUTF_IS_RVV 1
    #if SIMDUTF_HAS_ZVBB_INTRINSICS && __riscv_zvbb >= 1000000
      // RISC-V Vector Basic Bit-manipulation
      #define SIMDUTF_IS_ZVBB 1
    #endif
  #endif

#elif defined(__loongarch_lp64)
  // LSX alone, or LSX together with LASX (we can always run both). LASX is
  // never flagged without LSX.
  #if defined(__loongarch_sx)
    #define SIMDUTF_IS_LSX 1
    #if defined(__loongarch_asx)
      #define SIMDUTF_IS_LASX 1
    #endif
  #endif
#else
  // The simdutf library is designed for 64-bit processors and it seems that
  // you are not compiling for a known 64-bit platform. Please use a 64-bit
  // target such as x64 or 64-bit ARM for best performance.
  #define SIMDUTF_IS_32BITS 1

  // 32-bit platforms are not supported, but identifying them can be handy.
  #if defined(_M_IX86) || defined(__i386__)
    #define SIMDUTF_IS_X86_32BITS 1
  #elif defined(__arm__) || defined(_M_ARM)
    #define SIMDUTF_IS_ARM_32BITS 1
  #elif defined(__PPC__) || defined(_M_PPC)
    #define SIMDUTF_IS_PPC_32BITS 1
  #endif

#endif // architecture selection

#if defined(SIMDUTF_IS_32BITS) && !defined(SIMDUTF_NO_PORTABILITY_WARNING)
  // In the future, we may want to warn users of 32-bit systems that
  // simdutf does not support accelerated kernels for such systems.
#endif // SIMDUTF_IS_32BITS && !SIMDUTF_NO_PORTABILITY_WARNING
237
// Two-level stringification: the outer macro expands its argument first, the
// inner one turns the result into a string literal.
#define SIMDUTF_STRINGIFY_IMPLEMENTATION_(a) #a
#define SIMDUTF_STRINGIFY(a) SIMDUTF_STRINGIFY_IMPLEMENTATION_(a)

// Our fast kernels require 64-bit systems.
//
// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions.
// Furthermore, the number of SIMD registers is reduced.
//
// On 32-bit ARM, we would have smaller registers.
//
// The simdutf users should still have the fallback kernel. It is
// slower, but it should run everywhere.

//
// Enable valid runtime implementations, and select
// SIMDUTF_BUILTIN_IMPLEMENTATION
//

// Runtime dispatch: SIMDUTF_TARGET_REGION(T) opens a region whose functions
// are compiled for target T, SIMDUTF_UNTARGET_REGION closes it again.
#if defined(SIMDUTF_IS_X86_64) || defined(SIMDUTF_IS_LSX)
  #if defined(__clang__)
    // clang does not have GCC push pop
    // warning: clang attribute push can't be used within a namespace in clang
    // up til 8.0 so SIMDUTF_TARGET_REGION and SIMDUTF_UNTARGET_REGION must be
    // *outside* of a namespace.
    #define SIMDUTF_TARGET_REGION(T)                                           \
      _Pragma(SIMDUTF_STRINGIFY(clang attribute push(                          \
          __attribute__((target(T))), apply_to = function)))
    #define SIMDUTF_UNTARGET_REGION _Pragma("clang attribute pop")
  #elif defined(__GNUC__)
    // GCC is easier
    #define SIMDUTF_TARGET_REGION(T)                                           \
      _Pragma("GCC push_options") _Pragma(SIMDUTF_STRINGIFY(GCC target(T)))
    #define SIMDUTF_UNTARGET_REGION _Pragma("GCC pop_options")
  #endif // clang then gcc

#endif // defined(SIMDUTF_IS_X86_64) || defined(SIMDUTF_IS_LSX)

// Everywhere else the target region macros expand to nothing.
#if !defined(SIMDUTF_TARGET_REGION)
  #define SIMDUTF_TARGET_REGION(T)
  #define SIMDUTF_UNTARGET_REGION
#endif
282
// Is threading enabled? _REENTRANT or _MT switches it on, unless the user
// already defined SIMDUTF_THREADS_ENABLED.
#if (defined(_REENTRANT) || defined(_MT)) && !defined(SIMDUTF_THREADS_ENABLED)
  #define SIMDUTF_THREADS_ENABLED
#endif

// workaround for large stack sizes under -O0.
// https://github.com/simdutf/simdutf/issues/691
// Apple systems have small stack sizes in secondary threads, and a lack of
// compiler optimization may generate high stack usage. Threads are therefore
// switched off on Apple in debug mode, which is detected by __OPTIMIZE__ not
// being defined.
#if defined(__APPLE__) && !defined(__OPTIMIZE__)
  #undef SIMDUTF_THREADS_ENABLED
#endif
302
// Case-insensitive string comparison under one name on every platform.
#if !defined(SIMDUTF_VISUAL_STUDIO)
  // The strcasecmp, strncasecmp, and strcasestr functions do not work with
  // multibyte strings (e.g. UTF-8). So they are only useful for ASCII in our
  // context.
  // https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings
  #define simdutf_strcasecmp strcasecmp
  #define simdutf_strncasecmp strncasecmp
#else
  // This is one case where we do not distinguish between regular visual
  // studio and clang under visual studio: clang under Windows has _stricmp
  // (like visual studio) but not strcasecmp (as clang normally has).
  #define simdutf_strcasecmp _stricmp
  #define simdutf_strncasecmp _strnicmp
#endif
318
// GCC version buckets. Only set for real GCC, not for clang (which also
// defines __GNUC__). Exactly one of the three flags is defined.
#if defined(__GNUC__) && !defined(__clang__)
  #if __GNUC__ < 10
    #define SIMDUTF_GCC9OROLDER 1
  #elif __GNUC__ == 10
    #define SIMDUTF_GCC10 1
  #else // __GNUC__ >= 11
    #define SIMDUTF_GCC11ORMORE 1
  #endif
#endif // defined(__GNUC__) && !defined(__clang__)
330
331#endif // SIMDUTF_PORTABILITY_H
332/* end file include/simdutf/portability.h */
333/* begin file include/simdutf/avx512.h */
#if !defined(SIMDUTF_AVX512_H_)
#define SIMDUTF_AVX512_H_

/*
  It's possible to override the AVX512 settings with cmake DCMAKE_CXX_FLAGS.

  All preprocessor macros have the form `SIMDUTF_HAS_AVX512{feature}`,
  where a feature is a code name for an extension.

  Each macro below is left untouched when it is already defined; otherwise it
  is set to 1 when the compiler defines the matching `__AVX512{feature}__`
  macro as 1. The listing below shows which features are supported.
*/

#if !defined(SIMDUTF_HAS_AVX512F) && defined(__AVX512F__) &&                   \
    (__AVX512F__ == 1)
  #define SIMDUTF_HAS_AVX512F 1
#endif

#if !defined(SIMDUTF_HAS_AVX512DQ) && defined(__AVX512DQ__) &&                 \
    (__AVX512DQ__ == 1)
  #define SIMDUTF_HAS_AVX512DQ 1
#endif

#if !defined(SIMDUTF_HAS_AVX512IFMA) && defined(__AVX512IFMA__) &&             \
    (__AVX512IFMA__ == 1)
  #define SIMDUTF_HAS_AVX512IFMA 1
#endif

#if !defined(SIMDUTF_HAS_AVX512CD) && defined(__AVX512CD__) &&                 \
    (__AVX512CD__ == 1)
  #define SIMDUTF_HAS_AVX512CD 1
#endif

#if !defined(SIMDUTF_HAS_AVX512BW) && defined(__AVX512BW__) &&                 \
    (__AVX512BW__ == 1)
  #define SIMDUTF_HAS_AVX512BW 1
#endif

#if !defined(SIMDUTF_HAS_AVX512VL) && defined(__AVX512VL__) &&                 \
    (__AVX512VL__ == 1)
  #define SIMDUTF_HAS_AVX512VL 1
#endif

#if !defined(SIMDUTF_HAS_AVX512VBMI) && defined(__AVX512VBMI__) &&             \
    (__AVX512VBMI__ == 1)
  #define SIMDUTF_HAS_AVX512VBMI 1
#endif

#if !defined(SIMDUTF_HAS_AVX512VBMI2) && defined(__AVX512VBMI2__) &&           \
    (__AVX512VBMI2__ == 1)
  #define SIMDUTF_HAS_AVX512VBMI2 1
#endif

#if !defined(SIMDUTF_HAS_AVX512VNNI) && defined(__AVX512VNNI__) &&             \
    (__AVX512VNNI__ == 1)
  #define SIMDUTF_HAS_AVX512VNNI 1
#endif

#if !defined(SIMDUTF_HAS_AVX512BITALG) && defined(__AVX512BITALG__) &&         \
    (__AVX512BITALG__ == 1)
  #define SIMDUTF_HAS_AVX512BITALG 1
#endif

#if !defined(SIMDUTF_HAS_AVX512VPOPCNTDQ) && defined(__AVX512VPOPCNTDQ__) &&   \
    (__AVX512VPOPCNTDQ__ == 1)
  #define SIMDUTF_HAS_AVX512VPOPCNTDQ 1
#endif

#endif // SIMDUTF_AVX512_H_
413/* end file include/simdutf/avx512.h */
414
// Logging helpers. They are disabled by default so that release builds are
// free of any logging code; define SIMDUTF_LOGGING to turn them on.
//   simdutf_log(msg)              prints the current function, msg and the
//                                 file:line location to std::cout. The macro
//                                 already ends with a semicolon.
//   simdutf_log_assert(cond, msg) when cond is false, prints the same
//                                 information to std::cerr and calls
//                                 std::abort().
#if !defined(SIMDUTF_LOGGING)
  #define simdutf_log(msg)
  #define simdutf_log_assert(cond, msg)
#else
  #include <iostream>
  #define simdutf_log(msg)                                                     \
    std::cout << "[" << __FUNCTION__ << "]: " << msg << std::endl              \
              << "\t" << __FILE__ << ":" << __LINE__ << std::endl;
  #define simdutf_log_assert(cond, msg)                                        \
    do {                                                                       \
      if (!(cond)) {                                                           \
        std::cerr << "[" << __FUNCTION__ << "]: " << msg << std::endl          \
                  << "\t" << __FILE__ << ":" << __LINE__ << std::endl;         \
        std::abort();                                                          \
      }                                                                        \
    } while (0)
#endif
434
// Compiler-specific attribute and warning-control macros. Regular Visual
// Studio uses __declspec/__pragma; every other compiler (including clang
// under Visual Studio) takes the GNU-style branch.
#if !defined(SIMDUTF_REGULAR_VISUAL_STUDIO)
  // --- GNU-style attributes -------------------------------------------------
  // really_inline only forces inlining in optimized or NDEBUG builds.
  #if !defined(__OPTIMIZE__) && !defined(NDEBUG)
    #define simdutf_really_inline inline
  #else
    #define simdutf_really_inline inline __attribute__((always_inline))
  #endif
  #define simdutf_always_inline                                                \
    inline __attribute__((always_inline)) // always inline, no matter what
  #define simdutf_never_inline inline __attribute__((noinline))
  #define SIMDUTF_DEPRECATED __attribute__((deprecated))

  #define simdutf_unused __attribute__((unused))
  #define simdutf_warn_unused __attribute__((warn_unused_result))

  // Branch hints; users may predefine either macro.
  #if !defined(simdutf_likely)
    #define simdutf_likely(x) __builtin_expect(!!(x), 1)
  #endif
  #if !defined(simdutf_unlikely)
    #define simdutf_unlikely(x) __builtin_expect(!!(x), 0)
  #endif
  // clang-format off
  // Building blocks: turn `GCC diagnostic ignored "-Wfoo"` into a _Pragma.
  #define SIMDUTF_PRAGMA(P) _Pragma(#P)
  #define SIMDUTF_DISABLE_GCC_WARNING(WARNING) \
    SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING)
  #define SIMDUTF_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push")
  #define SIMDUTF_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop")
  // gcc doesn't seem to disable all warnings with all and extra, add warnings
  // here as necessary
  #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS \
    SIMDUTF_PUSH_DISABLE_WARNINGS \
    SIMDUTF_DISABLE_GCC_WARNING(-Weffc++) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wall) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wconversion) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wextra) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wattributes) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wreturn-type) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wshadow) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-parameter) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-variable)
  #if defined(SIMDUTF_CLANG_VISUAL_STUDIO)
    #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS \
      SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft-include)
  #else
    #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
  #endif
  #define SIMDUTF_DISABLE_DEPRECATED_WARNING \
    SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated-declarations)
  #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING \
    SIMDUTF_DISABLE_GCC_WARNING(-Wstrict-overflow)
  // Note: this one pushes the diagnostic state itself.
  #define SIMDUTF_DISABLE_UNUSED_WARNING \
    SIMDUTF_PUSH_DISABLE_WARNINGS \
    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-function) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-const-variable)
  // clang-format on

#else // SIMDUTF_REGULAR_VISUAL_STUDIO
  // --- Regular Visual Studio ------------------------------------------------
  #define SIMDUTF_DEPRECATED __declspec(deprecated)

  #define simdutf_really_inline __forceinline // really inline in release mode
  #define simdutf_always_inline __forceinline // always inline, no matter what
  #define simdutf_never_inline __declspec(noinline)

  // No equivalent attributes are used here: both expand to nothing.
  #define simdutf_unused
  #define simdutf_warn_unused

  // Branch hints are plain pass-throughs; users may predefine either macro.
  #if !defined(simdutf_likely)
    #define simdutf_likely(x) x
  #endif
  #if !defined(simdutf_unlikely)
    #define simdutf_unlikely(x) x
  #endif

  #define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning(push))
  #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning(push, 0))
  #define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning(pop))
  #define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER)                           \
    __pragma(warning(disable : WARNING_NUMBER))
  // Get rid of Intellisense-only warnings (Code Analysis)
  // Though __has_include is C++17, it is supported in Visual Studio 2017 or
  // better (_MSC_VER>=1910).
  #ifdef __has_include
    #if __has_include(<CppCoreCheck\Warnings.h>)
      #include <CppCoreCheck\Warnings.h>
      #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS                               \
        SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS)
    #endif
  #endif

  // Fallback when the Code Analysis header is not available.
  #if !defined(SIMDUTF_DISABLE_UNDESIRED_WARNINGS)
    #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
  #endif

  #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_VS_WARNING(4996)
  #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING
  #define SIMDUTF_DISABLE_UNUSED_WARNING

#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
533
// simdutf_constexpr: `constexpr` when SIMDUTF_CPLUSPLUS17 is set, nothing
// otherwise.
#if !SIMDUTF_CPLUSPLUS17
  #define simdutf_constexpr
#else
  #define simdutf_constexpr constexpr
#endif

// simdutf_constexpr23: `constexpr` when SIMDUTF_CPLUSPLUS23 is set, nothing
// otherwise. This makes it possible to mark functions constexpr only if the
// "if consteval" feature is available to use.
#if !SIMDUTF_CPLUSPLUS23
  #define simdutf_constexpr23
#else
  #define simdutf_constexpr23 constexpr
#endif
548
// SIMDUTF_DLLIMPORTEXPORT: symbol visibility decoration. Users may predefine
// it; otherwise it is only non-empty under Visual Studio.
#if !defined(SIMDUTF_DLLIMPORTEXPORT)
  #if !defined(SIMDUTF_VISUAL_STUDIO)
    // Non-Windows systems do not have this complexity.
    #define SIMDUTF_DLLIMPORTEXPORT
  #elif SIMDUTF_BUILDING_WINDOWS_DYNAMIC_LIBRARY
    // We set SIMDUTF_BUILDING_WINDOWS_DYNAMIC_LIBRARY when we build a DLL
    // under Windows. It should never happen that both
    // SIMDUTF_BUILDING_WINDOWS_DYNAMIC_LIBRARY and
    // SIMDUTF_USING_WINDOWS_DYNAMIC_LIBRARY are set.
    #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
  #elif SIMDUTF_USING_WINDOWS_DYNAMIC_LIBRARY
    // Windows users who call a dynamic library should set
    // SIMDUTF_USING_WINDOWS_DYNAMIC_LIBRARY to 1.
    #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
  #else
    // We assume by default static linkage
    #define SIMDUTF_DLLIMPORTEXPORT
  #endif
#endif

// simdutf_maybe_unused: [[maybe_unused]] when the attribute was detected,
// nothing otherwise.
#if !SIMDUTF_MAYBE_UNUSED_AVAILABLE
  #define simdutf_maybe_unused
#else
  #define simdutf_maybe_unused [[maybe_unused]]
#endif
593
594#endif // SIMDUTF_COMMON_DEFS_H
595/* end file include/simdutf/common_defs.h */
596/* begin file include/simdutf/encoding_types.h */
597#ifndef SIMDUTF_ENCODING_TYPES_H
598#define SIMDUTF_ENCODING_TYPES_H
599#include <string>
600
601#if !defined(SIMDUTF_NO_STD_TEXT_ENCODING) && \
602 defined(__cpp_lib_text_encoding) && __cpp_lib_text_encoding >= 202306L
603 #define SIMDUTF_HAS_STD_TEXT_ENCODING 1
604 #include <text_encoding>
605#endif
606
607namespace simdutf {
608
// Text encodings known to the library. Every named encoding has its own bit,
// `unspecified` is zero. The byte-order mark (BOM) of each encoding is listed
// where one exists.
enum encoding_type {
  unspecified = 0,

  UTF8 = 1 << 0,     // BOM 0xef 0xbb 0xbf
  UTF16_LE = 1 << 1, // BOM 0xff 0xfe
  UTF16_BE = 1 << 2, // BOM 0xfe 0xff
  UTF32_LE = 1 << 3, // BOM 0xff 0xfe 0x00 0x00
  UTF32_BE = 1 << 4, // BOM 0x00 0x00 0xfe 0xff
  Latin1 = 1 << 5
};
619
620#ifndef SIMDUTF_IS_BIG_ENDIAN
621 #error "SIMDUTF_IS_BIG_ENDIAN needs to be defined."
622#endif
623
// Byte order selector. NATIVE is an alias for whichever of LITTLE/BIG matches
// the value of SIMDUTF_IS_BIG_ENDIAN at compile time.
enum endianness {
  LITTLE = 0,
  BIG = 1,
#if SIMDUTF_IS_BIG_ENDIAN
  NATIVE = BIG
#else
  NATIVE = LITTLE
#endif
};
634
635simdutf_warn_unused simdutf_really_inline constexpr bool
636match_system(endianness e) {
637 return e == endianness::NATIVE;
638}
639
640simdutf_warn_unused std::string to_string(encoding_type bom);
641
642// Note that BOM for UTF8 is discouraged.
643namespace BOM {
644
652simdutf_warn_unused encoding_type check_bom(const uint8_t *byte, size_t length);
653simdutf_warn_unused encoding_type check_bom(const char *byte, size_t length);
660simdutf_warn_unused size_t bom_byte_size(encoding_type bom);
661
662} // namespace BOM
663
664#ifdef SIMDUTF_HAS_STD_TEXT_ENCODING
672simdutf_warn_unused constexpr std::text_encoding
673to_std_encoding(encoding_type enc) noexcept {
674 switch (enc) {
675 case UTF8:
676 return std::text_encoding(std::text_encoding::id::UTF8);
677 case UTF16_LE:
678 return std::text_encoding(std::text_encoding::id::UTF16LE);
679 case UTF16_BE:
680 return std::text_encoding(std::text_encoding::id::UTF16BE);
681 case UTF32_LE:
682 return std::text_encoding(std::text_encoding::id::UTF32LE);
683 case UTF32_BE:
684 return std::text_encoding(std::text_encoding::id::UTF32BE);
685 case Latin1:
686 return std::text_encoding(std::text_encoding::id::ISOLatin1);
687 case unspecified:
688 default:
689 return std::text_encoding(std::text_encoding::id::unknown);
690 }
691}
692
700simdutf_warn_unused constexpr encoding_type
701from_std_encoding(const std::text_encoding &enc) noexcept {
702 switch (enc.mib()) {
703 case std::text_encoding::id::UTF8:
704 return UTF8;
705 case std::text_encoding::id::UTF16LE:
706 return UTF16_LE;
707 case std::text_encoding::id::UTF16BE:
708 return UTF16_BE;
709 case std::text_encoding::id::UTF32LE:
710 return UTF32_LE;
711 case std::text_encoding::id::UTF32BE:
712 return UTF32_BE;
713 case std::text_encoding::id::ISOLatin1:
714 return Latin1;
715 default:
716 return unspecified;
717 }
718}
719
725simdutf_warn_unused constexpr encoding_type native_utf16_encoding() noexcept {
726 #if SIMDUTF_IS_BIG_ENDIAN
727 return UTF16_BE;
728 #else
729 return UTF16_LE;
730 #endif
731}
732
738simdutf_warn_unused constexpr encoding_type native_utf32_encoding() noexcept {
739 #if SIMDUTF_IS_BIG_ENDIAN
740 return UTF32_BE;
741 #else
742 return UTF32_LE;
743 #endif
744}
745
757simdutf_warn_unused constexpr encoding_type
758from_std_encoding_native(const std::text_encoding &enc) noexcept {
759 switch (enc.mib()) {
760 case std::text_encoding::id::UTF8:
761 return UTF8;
762 case std::text_encoding::id::UTF16:
763 return native_utf16_encoding();
764 case std::text_encoding::id::UTF16LE:
765 return UTF16_LE;
766 case std::text_encoding::id::UTF16BE:
767 return UTF16_BE;
768 case std::text_encoding::id::UTF32:
769 return native_utf32_encoding();
770 case std::text_encoding::id::UTF32LE:
771 return UTF32_LE;
772 case std::text_encoding::id::UTF32BE:
773 return UTF32_BE;
774 case std::text_encoding::id::ISOLatin1:
775 return Latin1;
776 default:
777 return unspecified;
778 }
779}
780#endif // SIMDUTF_HAS_STD_TEXT_ENCODING
781
782} // namespace simdutf
783#endif
784/* end file include/simdutf/encoding_types.h */
785/* begin file include/simdutf/error.h */
786#ifndef SIMDUTF_ERROR_H
787#define SIMDUTF_ERROR_H
788namespace simdutf {
789
// Error codes reported by the validation, transcoding and base64 functions.
// The numeric values are consecutive, starting at SUCCESS = 0.
enum error_code {
  SUCCESS = 0,

  // Any byte must have fewer than 5 header bits.
  HEADER_BITS = 1,

  // The leading byte must be followed by N-1 continuation bytes, where N is
  // the UTF-8 character length. This is also the error when the input is
  // truncated.
  TOO_SHORT = 2,

  // We either have too many consecutive continuation bytes or the string
  // starts with a continuation byte.
  TOO_LONG = 3,

  // The decoded character must be above U+7F for two-byte characters, U+7FF
  // for three-byte characters, and U+FFFF for four-byte characters.
  OVERLONG = 4,

  // The decoded character must be less than or equal to U+10FFFF, less than
  // or equal to U+7F for ASCII, or less than or equal to U+FF for Latin1.
  TOO_LARGE = 5,

  // One of:
  //  - the decoded character must not be in U+D800...DFFF (UTF-8 or UTF-32);
  //  - a high surrogate must be followed by a low surrogate and a low
  //    surrogate must be preceded by a high surrogate (UTF-16);
  //  - there must be no surrogate at all and one is found (Latin1
  //    functions);
  //  - *specifically* for the function
  //    utf8_length_from_utf16_with_replacement, a surrogate (whether in error
  //    or not) has been found (i.e., whether we are in the Basic Multilingual
  //    Plane or not).
  SURROGATE = 6,

  // Found a character that cannot be part of a valid base64 string. This may
  // include a misplaced padding character ('=').
  INVALID_BASE64_CHARACTER = 7,

  // The base64 input terminates with a single character, excluding padding
  // (=). It is also used in strict mode when padding is not adequate.
  BASE64_INPUT_REMAINDER = 8,

  // The base64 input terminates with non-zero padding bits.
  BASE64_EXTRA_BITS = 9,

  // The provided buffer is too small.
  OUTPUT_BUFFER_TOO_SMALL = 10,

  // Not related to validation/transcoding.
  OTHER = 11
};
#if SIMDUTF_CPLUSPLUS17
// Returns the enumerator name of `code` as a string view. OTHER and any value
// that is not a named enumerator both yield "OTHER". Only available when
// SIMDUTF_CPLUSPLUS17 is set, since it returns std::string_view.
inline std::string_view error_to_string(error_code code) noexcept {
  if (code == SUCCESS) {
    return "SUCCESS";
  }
  if (code == HEADER_BITS) {
    return "HEADER_BITS";
  }
  if (code == TOO_SHORT) {
    return "TOO_SHORT";
  }
  if (code == TOO_LONG) {
    return "TOO_LONG";
  }
  if (code == OVERLONG) {
    return "OVERLONG";
  }
  if (code == TOO_LARGE) {
    return "TOO_LARGE";
  }
  if (code == SURROGATE) {
    return "SURROGATE";
  }
  if (code == INVALID_BASE64_CHARACTER) {
    return "INVALID_BASE64_CHARACTER";
  }
  if (code == BASE64_INPUT_REMAINDER) {
    return "BASE64_INPUT_REMAINDER";
  }
  if (code == BASE64_EXTRA_BITS) {
    return "BASE64_EXTRA_BITS";
  }
  if (code == OUTPUT_BUFFER_TOO_SMALL) {
    return "OUTPUT_BUFFER_TOO_SMALL";
  }
  return "OTHER";
}
#endif
859
860struct result {
861 error_code error;
862 size_t count; // In case of error, indicates the position of the error. In
863 // case of success, indicates the number of code units
864 // validated/written.
865
866 simdutf_really_inline simdutf_constexpr23 result() noexcept
867 : error{error_code::SUCCESS}, count{0} {}
868
869 simdutf_really_inline simdutf_constexpr23 result(error_code err,
870 size_t pos) noexcept
871 : error{err}, count{pos} {}
872
873 simdutf_really_inline simdutf_constexpr23 bool is_ok() const noexcept {
874 return error == error_code::SUCCESS;
875 }
876
877 simdutf_really_inline simdutf_constexpr23 bool is_err() const noexcept {
878 return error != error_code::SUCCESS;
879 }
880};
881
883 error_code error;
884 size_t input_count;
885 size_t output_count;
886 bool padding_error = false; // true if the error is due to padding, only
887 // meaningful when error is not SUCCESS
888
889 simdutf_really_inline simdutf_constexpr23 full_result() noexcept
890 : error{error_code::SUCCESS}, input_count{0}, output_count{0} {}
891
892 simdutf_really_inline simdutf_constexpr23 full_result(error_code err,
893 size_t pos_in,
894 size_t pos_out) noexcept
895 : error{err}, input_count{pos_in}, output_count{pos_out} {}
896 simdutf_really_inline simdutf_constexpr23 full_result(
897 error_code err, size_t pos_in, size_t pos_out, bool padding_err) noexcept
898 : error{err}, input_count{pos_in}, output_count{pos_out},
899 padding_error{padding_err} {}
900
901 simdutf_really_inline simdutf_constexpr23 operator result() const noexcept {
902 if (error == error_code::SUCCESS) {
903 return result{error, output_count};
904 } else {
905 return result{error, input_count};
906 }
907 }
908};
909
910} // namespace simdutf
911#endif
912/* end file include/simdutf/error.h */
913
914SIMDUTF_PUSH_DISABLE_WARNINGS
915SIMDUTF_DISABLE_UNDESIRED_WARNINGS
916
917// Public API
918/* begin file include/simdutf/simdutf_version.h */
919// /include/simdutf/simdutf_version.h automatically generated by release.py,
920// do not change by hand
#if !defined(SIMDUTF_SIMDUTF_VERSION_H)
#define SIMDUTF_SIMDUTF_VERSION_H

// Library version as a string literal.
#define SIMDUTF_VERSION "8.0.0"

namespace simdutf {
// Numeric components of the version string above.
enum {
  SIMDUTF_VERSION_MAJOR = 8,
  SIMDUTF_VERSION_MINOR = 0,
  SIMDUTF_VERSION_REVISION = 0
};
} // namespace simdutf

#endif // SIMDUTF_SIMDUTF_VERSION_H
945/* end file include/simdutf/simdutf_version.h */
946/* begin file include/simdutf/implementation.h */
947#ifndef SIMDUTF_IMPLEMENTATION_H
948#define SIMDUTF_IMPLEMENTATION_H
949#if !defined(SIMDUTF_NO_THREADS)
950 #include <atomic>
951#endif
952#include <string>
953#ifdef SIMDUTF_INTERNAL_TESTS
954 #include <vector>
955#endif
956/* begin file include/simdutf/internal/isadetection.h */
957/* From
958https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
959Highly modified.
960
961Copyright (c) 2016- Facebook, Inc (Adam Paszke)
962Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
963Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
964Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
965Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
966Copyright (c) 2011-2013 NYU (Clement Farabet)
967Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
968Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute
969(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
970Samy Bengio, Johnny Mariethoz)
971
972All rights reserved.
973
974Redistribution and use in source and binary forms, with or without
975modification, are permitted provided that the following conditions are met:
976
9771. Redistributions of source code must retain the above copyright
978 notice, this list of conditions and the following disclaimer.
979
9802. Redistributions in binary form must reproduce the above copyright
981 notice, this list of conditions and the following disclaimer in the
982 documentation and/or other materials provided with the distribution.
983
9843. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
985America and IDIAP Research Institute nor the names of its contributors may be
986 used to endorse or promote products derived from this software without
987 specific prior written permission.
988
989THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
990AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
991IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
992ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
993LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
994CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
995SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
996INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
997CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
998ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
999POSSIBILITY OF SUCH DAMAGE.
1000*/
1001
1002#ifndef SIMDutf_INTERNAL_ISADETECTION_H
1003#define SIMDutf_INTERNAL_ISADETECTION_H
1004
1005#include <cstdint>
1006#include <cstdlib>
1007#if defined(_MSC_VER)
1008 #include <intrin.h>
1009#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
1010 #include <cpuid.h>
1011#endif
1012
1013
// RISC-V ISA detection utilities (64-bit RISC-V on Linux only).
#if SIMDUTF_IS_RISCV64 && defined(__linux__)
  #include <unistd.h> // for syscall
  // Key and extension bits used by the probe. We define these ourselves, for
  // backwards compatibility.
  #define SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0 4
  #define SIMDUTF_RISCV_HWPROBE_IMA_V (1 << 2)
  #define SIMDUTF_RISCV_HWPROBE_EXT_ZVBB (1 << 17)
// One key/value pair handed to the probe call.
struct simdutf_riscv_hwprobe {
  int64_t key;
  uint64_t value;
};
  // Function-like macro sharing the struct's name: issues syscall number 258
  // with the given arguments. It only expands when followed by `(`.
  #define simdutf_riscv_hwprobe(...) syscall(258, __VA_ARGS__)
#endif // SIMDUTF_IS_RISCV64 && defined(__linux__)
1027
1028#if defined(__loongarch__) && defined(__linux__)
1029 #include <sys/auxv.h>
1030// bits/hwcap.h
1031// #define HWCAP_LOONGARCH_LSX (1 << 4)
1032// #define HWCAP_LOONGARCH_LASX (1 << 5)
1033#endif
1034
1035namespace simdutf {
1036namespace internal {
1037
// Bit flags describing the instruction-set extensions of the host CPU. They
// are OR-ed together into the uint32_t returned by
// detect_supported_architectures().
//
// AVX512VPOPCNTDQ used to be 0x2000, the same value as AVX512CD, so the two
// features could not be told apart in a mask. It now has its own bit, 0x20000,
// which was unused between AVX512VBMI2 and LSX.
//
// RVV and ZVBB still share their values with AVX512BW and AVX512VL. They are
// left unchanged here.
// NOTE(review): presumably harmless because the x86 and RISC-V flags are
// produced by different per-architecture detection functions - confirm before
// relying on it.
enum instruction_set {
  DEFAULT = 0x0,
  NEON = 0x1,
  AVX2 = 0x4,
  SSE42 = 0x8,
  PCLMULQDQ = 0x10,
  BMI1 = 0x20,
  BMI2 = 0x40,
  ALTIVEC = 0x80,
  AVX512F = 0x100,
  AVX512DQ = 0x200,
  AVX512IFMA = 0x400,
  AVX512PF = 0x800,
  AVX512ER = 0x1000,
  AVX512CD = 0x2000,
  AVX512BW = 0x4000,
  AVX512VL = 0x8000,
  AVX512VBMI2 = 0x10000,
  AVX512VPOPCNTDQ = 0x20000,
  RVV = 0x4000,
  ZVBB = 0x8000,
  LSX = 0x40000,
  LASX = 0x80000,
};
1062
1063#if defined(__PPC64__)
1064
1065static inline uint32_t detect_supported_architectures() {
1066 return instruction_set::ALTIVEC;
1067}
1068
1069#elif SIMDUTF_IS_RISCV64
1070
1071static inline uint32_t detect_supported_architectures() {
1072 uint32_t host_isa = instruction_set::DEFAULT;
1073 #if SIMDUTF_IS_RVV
1074 host_isa |= instruction_set::RVV;
1075 #endif
1076 #if SIMDUTF_IS_ZVBB
1077 host_isa |= instruction_set::ZVBB;
1078 #endif
1079 #if defined(__linux__)
1080 simdutf_riscv_hwprobe probes[] = {{SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0, 0}};
1081 long ret = simdutf_riscv_hwprobe(&probes, sizeof probes / sizeof *probes, 0,
1082 nullptr, 0);
1083 if (ret == 0) {
1084 uint64_t extensions = probes[0].value;
1085 if (extensions & SIMDUTF_RISCV_HWPROBE_IMA_V)
1086 host_isa |= instruction_set::RVV;
1087 if (extensions & SIMDUTF_RISCV_HWPROBE_EXT_ZVBB)
1088 host_isa |= instruction_set::ZVBB;
1089 }
1090 #endif
1091 #if defined(RUN_IN_SPIKE_SIMULATOR)
1092 // Proxy Kernel does not implement yet hwprobe syscall
1093 host_isa |= instruction_set::RVV;
1094 #endif
1095 return host_isa;
1096}
1097
1098#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
1099
1100static inline uint32_t detect_supported_architectures() {
1101 return instruction_set::NEON;
1102}
1103
1104#elif defined(__x86_64__) || defined(_M_AMD64) // x64
1105
namespace {
// Bit masks used by the x64 detect_supported_architectures() to decode the
// registers returned by CPUID and XGETBV.
namespace cpuid_bit {
// Can be found on Intel ISA Reference for CPUID

// EAX = 0x01 (these masks are tested against ECX)
constexpr uint32_t pclmulqdq = uint32_t(1)
                               << 1; // bit 1 of ECX
constexpr uint32_t sse42 = uint32_t(1)
                           << 20; // bit 20 of ECX
// Both bits 26 and 27 of ECX must be set before XGETBV is issued (XSAVE and
// OSXSAVE per the Intel SDM).
constexpr uint32_t osxsave =
    (uint32_t(1) << 26) |
    (uint32_t(1) << 27);

// EAX = 0x07 (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf)
// See: "Table 3-8. Information Returned by CPUID Instruction"
// Masks in `ebx` apply to the returned EBX register.
namespace ebx {
constexpr uint32_t bmi1 = uint32_t(1) << 3;
constexpr uint32_t avx2 = uint32_t(1) << 5;
constexpr uint32_t bmi2 = uint32_t(1) << 8;
constexpr uint32_t avx512f = uint32_t(1) << 16;
constexpr uint32_t avx512dq = uint32_t(1) << 17;
constexpr uint32_t avx512ifma = uint32_t(1) << 21;
constexpr uint32_t avx512cd = uint32_t(1) << 28;
constexpr uint32_t avx512bw = uint32_t(1) << 30;
constexpr uint32_t avx512vl = uint32_t(1) << 31;
} // namespace ebx

// Masks in `ecx` apply to the returned ECX register of the same leaf.
namespace ecx {
constexpr uint32_t avx512vbmi = uint32_t(1) << 1;
constexpr uint32_t avx512vbmi2 = uint32_t(1) << 6;
constexpr uint32_t avx512vnni = uint32_t(1) << 11;
constexpr uint32_t avx512bitalg = uint32_t(1) << 12;
constexpr uint32_t avx512vpopcnt = uint32_t(1) << 14;
} // namespace ecx
// Masks in `edx` apply to the returned EDX register of the same leaf.
namespace edx {
constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8;
}
// Masks applied to the 64-bit value returned by xgetbv().
namespace xcr0_bit {
// Bit 2: must be set before AVX2/BMI results are trusted.
constexpr uint64_t avx256_saved = uint64_t(1) << 2;
// Bits 5, 6 and 7: all three must be set before any AVX-512 flag is reported
// (opmask, ZMM_Hi256 and Hi16_ZMM state per the Intel SDM).
constexpr uint64_t avx512_saved =
    uint64_t(7) << 5;
} // namespace xcr0_bit
} // namespace cpuid_bit
} // namespace
1150
/**
 * Executes the CPUID instruction.
 *
 * On entry *eax holds the leaf and *ecx the sub-leaf to query; on return the
 * four pointees hold the EAX/EBX/ECX/EDX results.
 *
 * In the <cpuid.h> branch the sub-leaf is forwarded explicitly via
 * __get_cpuid_count (plain __get_cpuid does not pass a sub-leaf). When that
 * helper reports the leaf as unsupported it writes nothing, so all four
 * outputs are zeroed instead of leaving stale values from a previous query.
 */
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
                         uint32_t *edx) {
  #if defined(_MSC_VER)
  int cpu_info[4];
  __cpuidex(cpu_info, *eax, *ecx);
  *eax = cpu_info[0];
  *ebx = cpu_info[1];
  *ecx = cpu_info[2];
  *edx = cpu_info[3];
  #elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
  const uint32_t level = *eax;
  const uint32_t sublevel = *ecx;
  if (__get_cpuid_count(level, sublevel, eax, ebx, ecx, edx) == 0) {
    // Unsupported leaf: report "no features" rather than stale register data.
    *eax = 0;
    *ebx = 0;
    *ecx = 0;
    *edx = 0;
  }
  #else
  uint32_t a = *eax, b, c = *ecx, d;
  asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
  *eax = a;
  *ebx = b;
  *ecx = c;
  *edx = d;
  #endif
}
1172
// Reads extended control register 0 (index 0 is passed to the instruction)
// and returns it as a single 64-bit value.
static inline uint64_t xgetbv() {
  #if defined(_MSC_VER)
  return _xgetbv(0);
  #else
  uint32_t low_half, high_half;
  // The instruction delivers the low 32 bits in EAX and the high 32 in EDX.
  asm volatile("xgetbv\n\t" : "=a"(low_half), "=d"(high_half) : "c"(0));
  return (static_cast<uint64_t>(high_half) << 32) | low_half;
  #endif
}
1182
1183static inline uint32_t detect_supported_architectures() {
1184 uint32_t eax;
1185 uint32_t ebx = 0;
1186 uint32_t ecx = 0;
1187 uint32_t edx = 0;
1188 uint32_t host_isa = 0x0;
1189
1190 // EBX for EAX=0x1
1191 eax = 0x1;
1192 cpuid(&eax, &ebx, &ecx, &edx);
1193
1194 if (ecx & cpuid_bit::sse42) {
1195 host_isa |= instruction_set::SSE42;
1196 }
1197
1198 if (ecx & cpuid_bit::pclmulqdq) {
1199 host_isa |= instruction_set::PCLMULQDQ;
1200 }
1201
1202 if ((ecx & cpuid_bit::osxsave) != cpuid_bit::osxsave) {
1203 return host_isa;
1204 }
1205
1206 // xgetbv for checking if the OS saves registers
1207 uint64_t xcr0 = xgetbv();
1208
1209 if ((xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == 0) {
1210 return host_isa;
1211 }
1212 // ECX for EAX=0x7
1213 eax = 0x7;
1214 ecx = 0x0; // Sub-leaf = 0
1215 cpuid(&eax, &ebx, &ecx, &edx);
1216 if (ebx & cpuid_bit::ebx::avx2) {
1217 host_isa |= instruction_set::AVX2;
1218 }
1219 if (ebx & cpuid_bit::ebx::bmi1) {
1220 host_isa |= instruction_set::BMI1;
1221 }
1222 if (ebx & cpuid_bit::ebx::bmi2) {
1223 host_isa |= instruction_set::BMI2;
1224 }
1225 if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) ==
1226 cpuid_bit::xcr0_bit::avx512_saved)) {
1227 return host_isa;
1228 }
1229 if (ebx & cpuid_bit::ebx::avx512f) {
1230 host_isa |= instruction_set::AVX512F;
1231 }
1232 if (ebx & cpuid_bit::ebx::avx512bw) {
1233 host_isa |= instruction_set::AVX512BW;
1234 }
1235 if (ebx & cpuid_bit::ebx::avx512cd) {
1236 host_isa |= instruction_set::AVX512CD;
1237 }
1238 if (ebx & cpuid_bit::ebx::avx512dq) {
1239 host_isa |= instruction_set::AVX512DQ;
1240 }
1241 if (ebx & cpuid_bit::ebx::avx512vl) {
1242 host_isa |= instruction_set::AVX512VL;
1243 }
1244 if (ecx & cpuid_bit::ecx::avx512vbmi2) {
1245 host_isa |= instruction_set::AVX512VBMI2;
1246 }
1247 if (ecx & cpuid_bit::ecx::avx512vpopcnt) {
1248 host_isa |= instruction_set::AVX512VPOPCNTDQ;
1249 }
1250 return host_isa;
1251}
1252#elif defined(__loongarch__)
1253
1254static inline uint32_t detect_supported_architectures() {
1255 uint32_t host_isa = instruction_set::DEFAULT;
1256 #if defined(__linux__)
1257 uint64_t hwcap = 0;
1258 hwcap = getauxval(AT_HWCAP);
1259 if (hwcap & HWCAP_LOONGARCH_LSX) {
1260 host_isa |= instruction_set::LSX;
1261 }
1262 if (hwcap & HWCAP_LOONGARCH_LASX) {
1263 host_isa |= instruction_set::LASX;
1264 }
1265 #endif
1266 return host_isa;
1267}
1268#else // fallback
1269
1270// includes 32-bit ARM.
1271static inline uint32_t detect_supported_architectures() {
1272 return instruction_set::DEFAULT;
1273}
1274
1275#endif // end SIMD extension detection code
1276
1277} // namespace internal
1278} // namespace simdutf
1279
1280#endif // SIMDutf_INTERNAL_ISADETECTION_H
1281/* end file include/simdutf/internal/isadetection.h */
1282
1283#if SIMDUTF_SPAN
1284 #include <concepts>
1285 #include <type_traits>
1286 #include <span>
1287 #include <tuple>
1288#endif
1289#if SIMDUTF_CPLUSPLUS17
1290 #include <string_view>
1291#endif
// The following defines are conditionally enabled/disabled during amalgamation.
// By default all features are enabled; regular code shouldn't check them. Only
// when user code really relies on a selected subset is it good to verify these
// flags, like:
1296//
1297// #if !SIMDUTF_FEATURE_UTF16
1298// # error("Please amalgamate simdutf with UTF-16 support")
1299// #endif
1300//
1301#define SIMDUTF_FEATURE_DETECT_ENCODING 0
1302#define SIMDUTF_FEATURE_ASCII 0
1303#define SIMDUTF_FEATURE_LATIN1 0
1304#define SIMDUTF_FEATURE_UTF8 1
1305#define SIMDUTF_FEATURE_UTF16 1
1306#define SIMDUTF_FEATURE_UTF32 0
1307#define SIMDUTF_FEATURE_BASE64 0
1308
1309#if SIMDUTF_CPLUSPLUS23
1310/* begin file include/simdutf/constexpr_ptr.h */
1311#ifndef SIMDUTF_CONSTEXPR_PTR_H
1312#define SIMDUTF_CONSTEXPR_PTR_H
1313
1314#include <cstddef>
1315
1316namespace simdutf {
1317namespace detail {
1322template <typename to, typename from>
1323 requires(sizeof(to) == sizeof(from))
1324struct constexpr_ptr {
1325 const from *p;
1326
1327 constexpr explicit constexpr_ptr(const from *ptr) noexcept : p(ptr) {}
1328
1329 constexpr to operator*() const noexcept { return static_cast<to>(*p); }
1330
1331 constexpr constexpr_ptr &operator++() noexcept {
1332 ++p;
1333 return *this;
1334 }
1335
1336 constexpr constexpr_ptr operator++(int) noexcept {
1337 auto old = *this;
1338 ++p;
1339 return old;
1340 }
1341
1342 constexpr constexpr_ptr &operator--() noexcept {
1343 --p;
1344 return *this;
1345 }
1346
1347 constexpr constexpr_ptr operator--(int) noexcept {
1348 auto old = *this;
1349 --p;
1350 return old;
1351 }
1352
1353 constexpr constexpr_ptr &operator+=(std::ptrdiff_t n) noexcept {
1354 p += n;
1355 return *this;
1356 }
1357
1358 constexpr constexpr_ptr &operator-=(std::ptrdiff_t n) noexcept {
1359 p -= n;
1360 return *this;
1361 }
1362
1363 constexpr constexpr_ptr operator+(std::ptrdiff_t n) const noexcept {
1364 return constexpr_ptr{p + n};
1365 }
1366
1367 constexpr constexpr_ptr operator-(std::ptrdiff_t n) const noexcept {
1368 return constexpr_ptr{p - n};
1369 }
1370
1371 constexpr std::ptrdiff_t operator-(const constexpr_ptr &o) const noexcept {
1372 return p - o.p;
1373 }
1374
1375 constexpr to operator[](std::ptrdiff_t n) const noexcept {
1376 return static_cast<to>(*(p + n));
1377 }
1378
1379 // to prevent compilation errors for memcpy, even if it is never
1380 // called during constant evaluation
1381 constexpr operator const void *() const noexcept { return p; }
1382};
1383
1384template <typename to, typename from>
1385constexpr constexpr_ptr<to, from> constexpr_cast_ptr(from *p) noexcept {
1386 return constexpr_ptr<to, from>{p};
1387}
1388
// Assignment target produced by constexpr_write_ptr: assigning a SrcType
// stores it, converted with static_cast, into the referenced TargetType slot.
template <typename SrcType, typename TargetType>
struct constexpr_write_ptr_proxy {

  constexpr explicit constexpr_write_ptr_proxy(TargetType *raw) : p{raw} {}

  constexpr constexpr_write_ptr_proxy &operator=(SrcType v) {
    p[0] = static_cast<TargetType>(v);
    return *this;
  }

  TargetType *p;
};

// Write-only pointer-like wrapper over a TargetType buffer. Dereferencing or
// indexing yields a proxy through which SrcType values can be assigned.
template <typename SrcType, typename TargetType> struct constexpr_write_ptr {
  constexpr explicit constexpr_write_ptr(TargetType *raw) : p{raw} {}

  constexpr constexpr_write_ptr_proxy<SrcType, TargetType> operator*() const {
    return constexpr_write_ptr_proxy<SrcType, TargetType>(p);
  }

  constexpr constexpr_write_ptr_proxy<SrcType, TargetType>
  operator[](std::ptrdiff_t n) const {
    return constexpr_write_ptr_proxy<SrcType, TargetType>(p + n);
  }

  constexpr constexpr_write_ptr &operator++() {
    p += 1;
    return *this;
  }

  constexpr constexpr_write_ptr operator++(int) {
    const constexpr_write_ptr before{p};
    p += 1;
    return before;
  }

  // Distance, in elements, between two write pointers.
  constexpr std::ptrdiff_t operator-(const constexpr_write_ptr &other) const {
    return p - other.p;
  }

  TargetType *p;
};

// Convenience factory: SrcType is named explicitly, TargetType is deduced.
template <typename SrcType, typename TargetType>
constexpr auto constexpr_cast_writeptr(TargetType *raw) {
  return constexpr_write_ptr<SrcType, TargetType>(raw);
}
1445
1446} // namespace detail
1447} // namespace simdutf
1448#endif
1449/* end file include/simdutf/constexpr_ptr.h */
1450#endif
1451
1452#if SIMDUTF_SPAN
1454namespace simdutf {
1455namespace detail {
// Satisfied exactly by the five single-byte types listed below (no cv- or
// reference-qualified variants).
template <typename T>
concept byte_like = std::is_same_v<T, std::byte> ||     //
                    std::is_same_v<T, char> ||          //
                    std::is_same_v<T, signed char> ||   //
                    std::is_same_v<T, unsigned char> || //
                    std::is_same_v<T, char8_t>;

// Same as byte_like, but cv-qualifiers and references on T are stripped first.
template <typename T>
concept is_byte_like = byte_like<std::remove_cvref_t<T>>;

// Concept form of std::is_pointer_v, usable as a return-type constraint.
template <typename T>
concept is_pointer = std::is_pointer_v<T>;
1472
// A readable contiguous range of bytes: on a const T, size() and data() must
// be noexcept, size() must convert to std::size_t, data() must return a
// pointer, and dereferencing that pointer must give a byte-like type.
template <typename T>
concept input_span_of_byte_like = requires(const T &t) {
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
  { t.data() } noexcept -> is_pointer;
  { *t.data() } noexcept -> is_byte_like;
};

// True when T, after removing any reference, is not const-qualified.
template <typename T>
concept is_mutable = !std::is_const_v<std::remove_reference_t<T>>;

// A writable contiguous range of bytes: the same requirements as
// input_span_of_byte_like, checked on a non-const T, plus the element reached
// through data() must not be const.
template <typename T>
concept output_span_of_byte_like = requires(T &t) {
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
  { t.data() } noexcept -> is_pointer;
  { *t.data() } noexcept -> is_byte_like;
  { *t.data() } noexcept -> is_mutable;
};
1498
// The indexes_into_* concepts classify anything that supports p[0] (raw
// pointers as well as pointer-like wrappers) by the decayed type of the
// element that p[0] yields.

// p[0] decays to one of the byte_like types.
template <class InputPtr>
concept indexes_into_byte_like = requires(InputPtr p) {
  { std::decay_t<decltype(p[0])>{} } -> simdutf::detail::byte_like;
};
// p[0] decays to char16_t.
template <class InputPtr>
concept indexes_into_utf16 = requires(InputPtr p) {
  { std::decay_t<decltype(p[0])>{} } -> std::same_as<char16_t>;
};
// p[0] decays to char32_t.
template <class InputPtr>
concept indexes_into_utf32 = requires(InputPtr p) {
  { std::decay_t<decltype(p[0])>{} } -> std::same_as<char32_t>;
};

// A char can be assigned to p[0], i.e. p is usable as a char output.
template <class InputPtr>
concept index_assignable_from_char = requires(InputPtr p, char s) {
  { p[0] = s };
};

// p[0] decays to std::uint32_t.
template <class InputPtr>
concept indexes_into_uint32 = requires(InputPtr p) {
  { std::decay_t<decltype(p[0])>{} } -> std::same_as<std::uint32_t>;
};
1530} // namespace detail
1531} // namespace simdutf
1532#endif // SIMDUTF_SPAN
1533
1534// these includes are needed for constexpr support. they are
1535// not part of the public api.
1536/* begin file include/simdutf/scalar/swap_bytes.h */
1537#ifndef SIMDUTF_SWAP_BYTES_H
1538#define SIMDUTF_SWAP_BYTES_H
1539
1540namespace simdutf {
1541namespace scalar {
1542
1543constexpr inline simdutf_warn_unused uint16_t
1544u16_swap_bytes(const uint16_t word) {
1545 return uint16_t((word >> 8) | (word << 8));
1546}
1547
1548constexpr inline simdutf_warn_unused uint32_t
1549u32_swap_bytes(const uint32_t word) {
1550 return ((word >> 24) & 0xff) | // move byte 3 to byte 0
1551 ((word << 8) & 0xff0000) | // move byte 1 to byte 2
1552 ((word >> 8) & 0xff00) | // move byte 2 to byte 1
1553 ((word << 24) & 0xff000000); // byte 0 to byte 3
1554}
1555
1556namespace utf32 {
1557template <endianness big_endian> constexpr uint32_t swap_if_needed(uint32_t c) {
1558 return !match_system(big_endian) ? scalar::u32_swap_bytes(c) : c;
1559}
1560} // namespace utf32
1561
1562namespace utf16 {
1563template <endianness big_endian> constexpr uint16_t swap_if_needed(uint16_t c) {
1564 return !match_system(big_endian) ? scalar::u16_swap_bytes(c) : c;
1565}
1566} // namespace utf16
1567
1568} // namespace scalar
1569} // namespace simdutf
1570
1571#endif
1572/* end file include/simdutf/scalar/swap_bytes.h */
1573/* begin file include/simdutf/scalar/ascii.h */
1574#ifndef SIMDUTF_ASCII_H
1575#define SIMDUTF_ASCII_H
1576
1577namespace simdutf {
1578namespace scalar {
1579namespace {
1580namespace ascii {
1581
1582template <class InputPtr>
1583#if SIMDUTF_CPLUSPLUS20
1584 requires simdutf::detail::indexes_into_byte_like<InputPtr>
1585#endif
1586simdutf_warn_unused simdutf_constexpr23 bool validate(InputPtr data,
1587 size_t len) noexcept {
1588 uint64_t pos = 0;
1589
1590#if SIMDUTF_CPLUSPLUS23
1591 // avoid memcpy during constant evaluation
1592 if !consteval
1593#endif
1594 // process in blocks of 16 bytes when possible
1595 {
1596 for (; pos + 16 <= len; pos += 16) {
1597 uint64_t v1;
1598 std::memcpy(&v1, data + pos, sizeof(uint64_t));
1599 uint64_t v2;
1600 std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
1601 uint64_t v{v1 | v2};
1602 if ((v & 0x8080808080808080) != 0) {
1603 return false;
1604 }
1605 }
1606 }
1607
1608 // process the tail byte-by-byte
1609 for (; pos < len; pos++) {
1610 if (static_cast<std::uint8_t>(data[pos]) >= 0b10000000) {
1611 return false;
1612 }
1613 }
1614 return true;
1615}
1616template <class InputPtr>
1617#if SIMDUTF_CPLUSPLUS20
1618 requires simdutf::detail::indexes_into_byte_like<InputPtr>
1619#endif
1620simdutf_warn_unused simdutf_constexpr23 result
1621validate_with_errors(InputPtr data, size_t len) noexcept {
1622 size_t pos = 0;
1623#if SIMDUTF_CPLUSPLUS23
1624 // avoid memcpy during constant evaluation
1625 if !consteval
1626#endif
1627 {
1628 // process in blocks of 16 bytes when possible
1629 for (; pos + 16 <= len; pos += 16) {
1630 uint64_t v1;
1631 std::memcpy(&v1, data + pos, sizeof(uint64_t));
1632 uint64_t v2;
1633 std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
1634 uint64_t v{v1 | v2};
1635 if ((v & 0x8080808080808080) != 0) {
1636 for (; pos < len; pos++) {
1637 if (static_cast<std::uint8_t>(data[pos]) >= 0b10000000) {
1638 return result(error_code::TOO_LARGE, pos);
1639 }
1640 }
1641 }
1642 }
1643 }
1644
1645 // process the tail byte-by-byte
1646 for (; pos < len; pos++) {
1647 if (static_cast<std::uint8_t>(data[pos]) >= 0b10000000) {
1648 return result(error_code::TOO_LARGE, pos);
1649 }
1650 }
1651 return result(error_code::SUCCESS, pos);
1652}
1653
1654} // namespace ascii
1655} // unnamed namespace
1656} // namespace scalar
1657} // namespace simdutf
1658
1659#endif
1660/* end file include/simdutf/scalar/ascii.h */
1661/* begin file include/simdutf/scalar/atomic_util.h */
1662#ifndef SIMDUTF_ATOMIC_UTIL_H
1663#define SIMDUTF_ATOMIC_UTIL_H
1664#if SIMDUTF_ATOMIC_REF
1665 #include <atomic>
1666namespace simdutf {
1667namespace scalar {
1668
1669// This function is a memcpy that uses atomic operations to read from the
1670// source.
1671inline void memcpy_atomic_read(char *dst, const char *src, size_t len) {
1672 static_assert(std::atomic_ref<char>::required_alignment == sizeof(char),
1673 "std::atomic_ref requires the same alignment as char_type");
1674 // We expect all 64-bit systems to be able to read 64-bit words from an
1675 // aligned memory region atomically. You might be able to do better on
1676 // specific systems, e.g., x64 systems can read 128-bit words atomically.
1677 constexpr size_t alignment = sizeof(uint64_t);
1678
1679 // Lambda for atomic byte-by-byte copy
1680 auto bbb_memcpy_atomic_read = [](char *bytedst, const char *bytesrc,
1681 size_t bytelen) noexcept {
1682 char *mutable_src = const_cast<char *>(bytesrc);
1683 for (size_t j = 0; j < bytelen; ++j) {
1684 bytedst[j] =
1685 std::atomic_ref<char>(mutable_src[j]).load(std::memory_order_relaxed);
1686 }
1687 };
1688
1689 // Handle unaligned start
1690 size_t offset = reinterpret_cast<std::uintptr_t>(src) % alignment;
1691 if (offset) {
1692 size_t to_align = std::min(len, alignment - offset);
1693 bbb_memcpy_atomic_read(dst, src, to_align);
1694 src += to_align;
1695 dst += to_align;
1696 len -= to_align;
1697 }
1698
1699 // Process aligned 64-bit chunks
1700 while (len >= alignment) {
1701 auto *src_aligned = reinterpret_cast<uint64_t *>(const_cast<char *>(src));
1702 const auto dst_value =
1703 std::atomic_ref<uint64_t>(*src_aligned).load(std::memory_order_relaxed);
1704 std::memcpy(dst, &dst_value, sizeof(uint64_t));
1705 src += alignment;
1706 dst += alignment;
1707 len -= alignment;
1708 }
1709
1710 // Handle remaining bytes
1711 if (len) {
1712 bbb_memcpy_atomic_read(dst, src, len);
1713 }
1714}
1715
1716// This function is a memcpy that uses atomic operations to write to the
1717// destination.
1718inline void memcpy_atomic_write(char *dst, const char *src, size_t len) {
1719 static_assert(std::atomic_ref<char>::required_alignment == sizeof(char),
1720 "std::atomic_ref requires the same alignment as char");
1721 // We expect all 64-bit systems to be able to write 64-bit words to an aligned
1722 // memory region atomically.
1723 // You might be able to do better on specific systems, e.g., x64 systems can
1724 // write 128-bit words atomically.
1725 constexpr size_t alignment = sizeof(uint64_t);
1726
1727 // Lambda for atomic byte-by-byte write
1728 auto bbb_memcpy_atomic_write = [](char *bytedst, const char *bytesrc,
1729 size_t bytelen) noexcept {
1730 for (size_t j = 0; j < bytelen; ++j) {
1731 std::atomic_ref<char>(bytedst[j])
1732 .store(bytesrc[j], std::memory_order_relaxed);
1733 }
1734 };
1735
1736 // Handle unaligned start
1737 size_t offset = reinterpret_cast<std::uintptr_t>(dst) % alignment;
1738 if (offset) {
1739 size_t to_align = std::min(len, alignment - offset);
1740 bbb_memcpy_atomic_write(dst, src, to_align);
1741 dst += to_align;
1742 src += to_align;
1743 len -= to_align;
1744 }
1745
1746 // Process aligned 64-bit chunks
1747 while (len >= alignment) {
1748 auto *dst_aligned = reinterpret_cast<uint64_t *>(dst);
1749 uint64_t src_val;
1750 std::memcpy(&src_val, src, sizeof(uint64_t)); // Non-atomic read from src
1751 std::atomic_ref<uint64_t>(*dst_aligned)
1752 .store(src_val, std::memory_order_relaxed);
1753 dst += alignment;
1754 src += alignment;
1755 len -= alignment;
1756 }
1757
1758 // Handle remaining bytes
1759 if (len) {
1760 bbb_memcpy_atomic_write(dst, src, len);
1761 }
1762}
1763} // namespace scalar
1764} // namespace simdutf
1765#endif // SIMDUTF_ATOMIC_REF
1766#endif // SIMDUTF_ATOMIC_UTIL_H
1767/* end file include/simdutf/scalar/atomic_util.h */
1768/* begin file include/simdutf/scalar/latin1.h */
1769#ifndef SIMDUTF_LATIN1_H
1770#define SIMDUTF_LATIN1_H
1771
1772namespace simdutf {
1773namespace scalar {
1774namespace {
1775namespace latin1 {
1776
1777simdutf_really_inline size_t utf8_length_from_latin1(const char *buf,
1778 size_t len) {
1779 const uint8_t *c = reinterpret_cast<const uint8_t *>(buf);
1780 size_t answer = 0;
1781 for (size_t i = 0; i < len; i++) {
1782 if ((c[i] >> 7)) {
1783 answer++;
1784 }
1785 }
1786 return answer + len;
1787}
1788
1789} // namespace latin1
1790} // unnamed namespace
1791} // namespace scalar
1792} // namespace simdutf
1793
1794#endif
1795/* end file include/simdutf/scalar/latin1.h */
1796/* begin file include/simdutf/scalar/latin1_to_utf16/latin1_to_utf16.h */
1797#ifndef SIMDUTF_LATIN1_TO_UTF16_H
1798#define SIMDUTF_LATIN1_TO_UTF16_H
1799
1800namespace simdutf {
1801namespace scalar {
1802namespace {
1803namespace latin1_to_utf16 {
1804
1805template <endianness big_endian, typename InputPtr>
1806#if SIMDUTF_CPLUSPLUS20
1807 requires simdutf::detail::indexes_into_byte_like<InputPtr>
1808#endif
1809simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
1810 char16_t *utf16_output) {
1811 size_t pos = 0;
1812 char16_t *start{utf16_output};
1813
1814 while (pos < len) {
1815 uint16_t word =
1816 uint8_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
1817 *utf16_output++ =
1818 char16_t(match_system(big_endian) ? word : u16_swap_bytes(word));
1819 pos++;
1820 }
1821
1822 return utf16_output - start;
1823}
1824
1825template <endianness big_endian>
1826inline result convert_with_errors(const char *buf, size_t len,
1827 char16_t *utf16_output) {
1828 const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
1829 size_t pos = 0;
1830 char16_t *start{utf16_output};
1831
1832 while (pos < len) {
1833 uint16_t word =
1834 uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
1835 *utf16_output++ =
1836 char16_t(match_system(big_endian) ? word : u16_swap_bytes(word));
1837 pos++;
1838 }
1839
1840 return result(error_code::SUCCESS, utf16_output - start);
1841}
1842
1843} // namespace latin1_to_utf16
1844} // unnamed namespace
1845} // namespace scalar
1846} // namespace simdutf
1847
1848#endif
1849/* end file include/simdutf/scalar/latin1_to_utf16/latin1_to_utf16.h */
1850/* begin file include/simdutf/scalar/latin1_to_utf32/latin1_to_utf32.h */
1851#ifndef SIMDUTF_LATIN1_TO_UTF32_H
1852#define SIMDUTF_LATIN1_TO_UTF32_H
1853
1854namespace simdutf {
1855namespace scalar {
1856namespace {
1857namespace latin1_to_utf32 {
1858
1859template <typename InputPtr>
1860#if SIMDUTF_CPLUSPLUS20
1861 requires simdutf::detail::indexes_into_byte_like<InputPtr>
1862#endif
1863simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
1864 char32_t *utf32_output) {
1865 char32_t *start{utf32_output};
1866 for (size_t i = 0; i < len; i++) {
1867 *utf32_output++ = uint8_t(data[i]);
1868 }
1869 return utf32_output - start;
1870}
1871
1872} // namespace latin1_to_utf32
1873} // unnamed namespace
1874} // namespace scalar
1875} // namespace simdutf
1876
1877#endif
1878/* end file include/simdutf/scalar/latin1_to_utf32/latin1_to_utf32.h */
1879/* begin file include/simdutf/scalar/latin1_to_utf8/latin1_to_utf8.h */
1880#ifndef SIMDUTF_LATIN1_TO_UTF8_H
1881#define SIMDUTF_LATIN1_TO_UTF8_H
1882
1883namespace simdutf {
1884namespace scalar {
1885namespace {
1886namespace latin1_to_utf8 {
1887
1888template <typename InputPtr, typename OutputPtr>
1889#if SIMDUTF_CPLUSPLUS20
1890 requires(simdutf::detail::indexes_into_byte_like<InputPtr> &&
1891 simdutf::detail::index_assignable_from_char<OutputPtr>)
1892#endif
1893simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
1894 OutputPtr utf8_output) {
1895 // const unsigned char *data = reinterpret_cast<const unsigned char *>(buf);
1896 size_t pos = 0;
1897 size_t utf8_pos = 0;
1898
1899 while (pos < len) {
1900#if SIMDUTF_CPLUSPLUS23
1901 if !consteval
1902#endif
1903 {
1904 // try to convert the next block of 16 ASCII bytes
1905 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
1906 // they are ascii
1907 uint64_t v1;
1908 ::memcpy(&v1, data + pos, sizeof(uint64_t));
1909 uint64_t v2;
1910 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
1911 uint64_t v{v1 |
1912 v2}; // We are only interested in these bits: 1000 1000 1000
1913 // 1000, so it makes sense to concatenate everything
1914 if ((v & 0x8080808080808080) ==
1915 0) { // if NONE of these are set, e.g. all of them are zero, then
1916 // everything is ASCII
1917 size_t final_pos = pos + 16;
1918 while (pos < final_pos) {
1919 utf8_output[utf8_pos++] = char(data[pos]);
1920 pos++;
1921 }
1922 continue;
1923 }
1924 } // if (pos + 16 <= len)
1925 } // !consteval scope
1926
1927 unsigned char byte = data[pos];
1928 if ((byte & 0x80) == 0) { // if ASCII
1929 // will generate one UTF-8 bytes
1930 utf8_output[utf8_pos++] = char(byte);
1931 pos++;
1932 } else {
1933 // will generate two UTF-8 bytes
1934 utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000);
1935 utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000);
1936 pos++;
1937 }
1938 } // while
1939 return utf8_pos;
1940}
1941
1942simdutf_really_inline size_t convert(const char *buf, size_t len,
1943 char *utf8_output) {
1944 return convert(reinterpret_cast<const unsigned char *>(buf), len,
1945 utf8_output);
1946}
1947
// Bounded Latin-1 -> UTF-8 conversion: writes at most `utf8_len` bytes to
// `utf8_output` and returns the number written. Conversion stops early when
// the next character no longer fits; a two-byte sequence is never split.
inline size_t convert_safe(const char *buf, size_t len, char *utf8_output,
                           size_t utf8_len) {
  const unsigned char *bytes = reinterpret_cast<const unsigned char *>(buf);
  size_t in_pos = 0;
  size_t out_pos = 0;
  // Input positions below this index belong to a 16-byte block already known
  // to contain a non-ASCII byte; they are handled byte by byte.
  size_t bytewise_until = 0;

  while (in_pos < len && out_pos < utf8_len) {
    const bool block_fits = in_pos >= bytewise_until && in_pos + 16 <= len &&
                            out_pos + 16 <= utf8_len;
    if (block_fits) {
      uint64_t lo;
      uint64_t hi;
      ::memcpy(&lo, bytes + in_pos, sizeof(uint64_t));
      ::memcpy(&hi, bytes + in_pos + sizeof(uint64_t), sizeof(uint64_t));
      // only the top bit of each byte matters, so the two words can be OR-ed
      // together before testing
      if (((lo | hi) & 0x8080808080808080) == 0) {
        // all sixteen bytes are ASCII: copy them through unchanged
        ::memcpy(utf8_output + out_pos, buf + in_pos, 16);
        out_pos += 16;
        in_pos += 16;
      } else {
        // at least one of the sixteen is not ASCII: go byte by byte
        bytewise_until = in_pos + 16;
      }
      continue;
    }

    const unsigned char byte = bytes[in_pos];
    if ((byte & 0x80) == 0) {
      // ASCII: one UTF-8 byte
      utf8_output[out_pos++] = char(byte);
      in_pos++;
      continue;
    }
    if (out_pos + 2 > utf8_len) {
      // no room left for a two-byte sequence
      break;
    }
    // non-ASCII: two UTF-8 bytes
    utf8_output[out_pos++] = char((byte >> 6) | 0b11000000);
    utf8_output[out_pos++] = char((byte & 0b111111) | 0b10000000);
    in_pos++;
  }
  return out_pos;
}
1995
1996template <typename InputPtr, typename OutputPtr>
1997#if SIMDUTF_CPLUSPLUS20
1998 requires(simdutf::detail::indexes_into_byte_like<InputPtr> &&
1999 simdutf::detail::index_assignable_from_char<OutputPtr>)
2000#endif
2001simdutf_constexpr23 size_t convert_safe_constexpr(InputPtr data, size_t len,
2002 OutputPtr utf8_output,
2003 size_t utf8_len) {
2004 size_t pos = 0;
2005 size_t utf8_pos = 0;
2006 while (pos < len && utf8_pos < utf8_len) {
2007 const unsigned char byte = data[pos];
2008 if ((byte & 0x80) == 0) { // if ASCII
2009 // will generate one UTF-8 bytes
2010 utf8_output[utf8_pos++] = char(byte);
2011 pos++;
2012 } else if (utf8_pos + 2 <= utf8_len) {
2013 // will generate two UTF-8 bytes
2014 utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000);
2015 utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000);
2016 pos++;
2017 } else {
2018 break;
2019 }
2020 }
2021 return utf8_pos;
2022}
2023
2024template <typename InputPtr>
2025#if SIMDUTF_CPLUSPLUS20
2026 requires simdutf::detail::indexes_into_byte_like<InputPtr>
2027#endif
2028simdutf_constexpr23 simdutf_warn_unused size_t
2029utf8_length_from_latin1(InputPtr input, size_t length) noexcept {
2030 size_t answer = length;
2031 size_t i = 0;
2032
2033#if SIMDUTF_CPLUSPLUS23
2034 if !consteval
2035#endif
2036 {
2037 auto pop = [](uint64_t v) {
2038 return (size_t)(((v >> 7) & UINT64_C(0x0101010101010101)) *
2039 UINT64_C(0x0101010101010101) >>
2040 56);
2041 };
2042 for (; i + 32 <= length; i += 32) {
2043 uint64_t v;
2044 memcpy(&v, input + i, 8);
2045 answer += pop(v);
2046 memcpy(&v, input + i + 8, sizeof(v));
2047 answer += pop(v);
2048 memcpy(&v, input + i + 16, sizeof(v));
2049 answer += pop(v);
2050 memcpy(&v, input + i + 24, sizeof(v));
2051 answer += pop(v);
2052 }
2053 for (; i + 8 <= length; i += 8) {
2054 uint64_t v;
2055 memcpy(&v, input + i, sizeof(v));
2056 answer += pop(v);
2057 }
2058 } // !consteval scope
2059 for (; i + 1 <= length; i += 1) {
2060 answer += static_cast<uint8_t>(input[i]) >> 7;
2061 }
2062 return answer;
2063}
2064
2065} // namespace latin1_to_utf8
2066} // unnamed namespace
2067} // namespace scalar
2068} // namespace simdutf
2069
2070#endif
2071/* end file include/simdutf/scalar/latin1_to_utf8/latin1_to_utf8.h */
2072/* begin file include/simdutf/scalar/utf16.h */
2073#ifndef SIMDUTF_UTF16_H
2074#define SIMDUTF_UTF16_H
2075
2076namespace simdutf {
2077namespace scalar {
2078namespace utf16 {
2079
2080template <endianness big_endian>
2081simdutf_warn_unused simdutf_constexpr23 bool
2082validate_as_ascii(const char16_t *data, size_t len) noexcept {
2083 for (size_t pos = 0; pos < len; pos++) {
2084 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
2085 if (word >= 0x80) {
2086 return false;
2087 }
2088 }
2089 return true;
2090}
2091
2092template <endianness big_endian>
2093inline simdutf_warn_unused simdutf_constexpr23 bool
2094validate(const char16_t *data, size_t len) noexcept {
2095 uint64_t pos = 0;
2096 while (pos < len) {
2097 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
2098 if ((word & 0xF800) == 0xD800) {
2099 if (pos + 1 >= len) {
2100 return false;
2101 }
2102 char16_t diff = char16_t(word - 0xD800);
2103 if (diff > 0x3FF) {
2104 return false;
2105 }
2106 char16_t next_word = !match_system(big_endian)
2107 ? u16_swap_bytes(data[pos + 1])
2108 : data[pos + 1];
2109 char16_t diff2 = char16_t(next_word - 0xDC00);
2110 if (diff2 > 0x3FF) {
2111 return false;
2112 }
2113 pos += 2;
2114 } else {
2115 pos++;
2116 }
2117 }
2118 return true;
2119}
2120
2121template <endianness big_endian>
2122inline simdutf_warn_unused simdutf_constexpr23 result
2123validate_with_errors(const char16_t *data, size_t len) noexcept {
2124 size_t pos = 0;
2125 while (pos < len) {
2126 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
2127 if ((word & 0xF800) == 0xD800) {
2128 if (pos + 1 >= len) {
2129 return result(error_code::SURROGATE, pos);
2130 }
2131 char16_t diff = char16_t(word - 0xD800);
2132 if (diff > 0x3FF) {
2133 return result(error_code::SURROGATE, pos);
2134 }
2135 char16_t next_word = !match_system(big_endian)
2136 ? u16_swap_bytes(data[pos + 1])
2137 : data[pos + 1];
2138 char16_t diff2 = uint16_t(next_word - 0xDC00);
2139 if (diff2 > 0x3FF) {
2140 return result(error_code::SURROGATE, pos);
2141 }
2142 pos += 2;
2143 } else {
2144 pos++;
2145 }
2146 }
2147 return result(error_code::SUCCESS, pos);
2148}
2149
2150template <endianness big_endian>
2151simdutf_constexpr23 size_t count_code_points(const char16_t *p, size_t len) {
2152 // We are not BOM aware.
2153 size_t counter{0};
2154 for (size_t i = 0; i < len; i++) {
2155 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
2156 counter += ((word & 0xFC00) != 0xDC00);
2157 }
2158 return counter;
2159}
2160
2161template <endianness big_endian>
2162simdutf_constexpr23 size_t utf8_length_from_utf16(const char16_t *p,
2163 size_t len) {
2164 // We are not BOM aware.
2165 size_t counter{0};
2166 for (size_t i = 0; i < len; i++) {
2167 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
2168 counter++; // ASCII
2169 counter += static_cast<size_t>(
2170 word >
2171 0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes
2172 counter += static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) ||
2173 (word >= 0xE000)); // three-byte
2174 }
2175 return counter;
2176}
2177
2178template <endianness big_endian>
2179simdutf_constexpr23 size_t utf32_length_from_utf16(const char16_t *p,
2180 size_t len) {
2181 // We are not BOM aware.
2182 size_t counter{0};
2183 for (size_t i = 0; i < len; i++) {
2184 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
2185 counter += ((word & 0xFC00) != 0xDC00);
2186 }
2187 return counter;
2188}
2189
2190simdutf_really_inline simdutf_constexpr23 void
2191change_endianness_utf16(const char16_t *input, size_t size, char16_t *output) {
2192 for (size_t i = 0; i < size; i++) {
2193 *output++ = char16_t(input[i] >> 8 | input[i] << 8);
2194 }
2195}
2196
2197template <endianness big_endian>
2198simdutf_warn_unused simdutf_constexpr23 size_t
2199trim_partial_utf16(const char16_t *input, size_t length) {
2200 if (length == 0) {
2201 return 0;
2202 }
2203 uint16_t last_word = uint16_t(input[length - 1]);
2204 last_word = scalar::utf16::swap_if_needed<big_endian>(last_word);
2205 length -= ((last_word & 0xFC00) == 0xD800);
2206 return length;
2207}
2208
2209template <endianness big_endian>
2210simdutf_constexpr bool is_high_surrogate(char16_t c) {
2211 c = scalar::utf16::swap_if_needed<big_endian>(c);
2212 return (0xd800 <= c && c <= 0xdbff);
2213}
2214
2215template <endianness big_endian>
2216simdutf_constexpr bool is_low_surrogate(char16_t c) {
2217 c = scalar::utf16::swap_if_needed<big_endian>(c);
2218 return (0xdc00 <= c && c <= 0xdfff);
2219}
2220
2221simdutf_really_inline constexpr bool high_surrogate(char16_t c) {
2222 return (0xd800 <= c && c <= 0xdbff);
2223}
2224
2225simdutf_really_inline constexpr bool low_surrogate(char16_t c) {
2226 return (0xdc00 <= c && c <= 0xdfff);
2227}
2228
2229template <endianness big_endian>
2230simdutf_constexpr23 result
2231utf8_length_from_utf16_with_replacement(const char16_t *p, size_t len) {
2232 bool any_surrogates = false;
2233 // We are not BOM aware.
2234 size_t counter{0};
2235 for (size_t i = 0; i < len; i++) {
2236 if (is_high_surrogate<big_endian>(p[i])) {
2237 any_surrogates = true;
2238 // surrogate pair
2239 if (i + 1 < len && is_low_surrogate<big_endian>(p[i + 1])) {
2240 counter += 4;
2241 i++; // skip low surrogate
2242 } else {
2243 counter += 3; // unpaired high surrogate replaced by U+FFFD
2244 }
2245 continue;
2246 } else if (is_low_surrogate<big_endian>(p[i])) {
2247 any_surrogates = true;
2248 counter += 3; // unpaired low surrogate replaced by U+FFFD
2249 continue;
2250 }
2251 char16_t word = !match_system(big_endian) ? u16_swap_bytes(p[i]) : p[i];
2252 counter++; // at least 1 byte
2253 counter +=
2254 static_cast<size_t>(word > 0x7F); // non-ASCII is at least 2 bytes
2255 counter += static_cast<size_t>(word > 0x7FF); // three-byte
2256 }
2257 return {any_surrogates ? error_code::SURROGATE : error_code::SUCCESS,
2258 counter};
2259}
2260
2261// variable templates are a C++14 extension
2262template <endianness big_endian> constexpr char16_t replacement() {
2263 return !match_system(big_endian) ? scalar::u16_swap_bytes(0xfffd) : 0xfffd;
2264}
2265
2266template <endianness big_endian>
2267simdutf_constexpr23 void to_well_formed_utf16(const char16_t *input, size_t len,
2268 char16_t *output) {
2269 const char16_t replacement = utf16::replacement<big_endian>();
2270 bool high_surrogate_prev = false, high_surrogate, low_surrogate;
2271 size_t i = 0;
2272 for (; i < len; i++) {
2273 char16_t c = input[i];
2274 high_surrogate = is_high_surrogate<big_endian>(c);
2275 low_surrogate = is_low_surrogate<big_endian>(c);
2276 if (high_surrogate_prev && !low_surrogate) {
2277 output[i - 1] = replacement;
2278 }
2279
2280 if (!high_surrogate_prev && low_surrogate) {
2281 output[i] = replacement;
2282 } else {
2283 output[i] = input[i];
2284 }
2285 high_surrogate_prev = high_surrogate;
2286 }
2287
2288 /* string may not end with high surrogate */
2289 if (high_surrogate_prev) {
2290 output[i - 1] = replacement;
2291 }
2292}
2293
2294} // namespace utf16
2295} // namespace scalar
2296} // namespace simdutf
2297
2298#endif
2299/* end file include/simdutf/scalar/utf16.h */
2300/* begin file include/simdutf/scalar/utf16_to_latin1/utf16_to_latin1.h */
2301#ifndef SIMDUTF_UTF16_TO_LATIN1_H
2302#define SIMDUTF_UTF16_TO_LATIN1_H
2303
2304#include <cstring> // for std::memcpy
2305
2306namespace simdutf {
2307namespace scalar {
2308namespace {
2309namespace utf16_to_latin1 {
2310
2311template <endianness big_endian, typename InputPtr, typename OutputPtr>
2312#if SIMDUTF_CPLUSPLUS20
2313 requires(simdutf::detail::indexes_into_utf16<InputPtr> &&
2314 simdutf::detail::index_assignable_from_char<OutputPtr>)
2315#endif
2316simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
2317 OutputPtr latin_output) {
2318 if (len == 0) {
2319 return 0;
2320 }
2321 size_t pos = 0;
2322 const auto latin_output_start = latin_output;
2323 uint16_t word = 0;
2324 uint16_t too_large = 0;
2325
2326 while (pos < len) {
2327 word = !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
2328 too_large |= word;
2329 *latin_output++ = char(word & 0xFF);
2330 pos++;
2331 }
2332 if ((too_large & 0xFF00) != 0) {
2333 return 0;
2334 }
2335
2336 return latin_output - latin_output_start;
2337}
2338
2339template <endianness big_endian, typename InputPtr, typename OutputPtr>
2340#if SIMDUTF_CPLUSPLUS20
2341 requires(simdutf::detail::indexes_into_utf16<InputPtr> &&
2342 simdutf::detail::index_assignable_from_char<OutputPtr>)
2343#endif
2344simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len,
2345 OutputPtr latin_output) {
2346 if (len == 0) {
2347 return result(error_code::SUCCESS, 0);
2348 }
2349 size_t pos = 0;
2350 auto start = latin_output;
2351 uint16_t word;
2352
2353 while (pos < len) {
2354#if SIMDUTF_CPLUSPLUS23
2355 if !consteval
2356#endif
2357 {
2358 if (pos + 16 <= len) { // if it is safe to read 32 more bytes, check that
2359 // they are Latin1
2360 uint64_t v1, v2, v3, v4;
2361 ::memcpy(&v1, data + pos, sizeof(uint64_t));
2362 ::memcpy(&v2, data + pos + 4, sizeof(uint64_t));
2363 ::memcpy(&v3, data + pos + 8, sizeof(uint64_t));
2364 ::memcpy(&v4, data + pos + 12, sizeof(uint64_t));
2365
2366 if simdutf_constexpr (!match_system(big_endian)) {
2367 v1 = (v1 >> 8) | (v1 << (64 - 8));
2368 }
2369 if simdutf_constexpr (!match_system(big_endian)) {
2370 v2 = (v2 >> 8) | (v2 << (64 - 8));
2371 }
2372 if simdutf_constexpr (!match_system(big_endian)) {
2373 v3 = (v3 >> 8) | (v3 << (64 - 8));
2374 }
2375 if simdutf_constexpr (!match_system(big_endian)) {
2376 v4 = (v4 >> 8) | (v4 << (64 - 8));
2377 }
2378
2379 if (((v1 | v2 | v3 | v4) & 0xFF00FF00FF00FF00) == 0) {
2380 size_t final_pos = pos + 16;
2381 while (pos < final_pos) {
2382 *latin_output++ = !match_system(big_endian)
2383 ? char(u16_swap_bytes(data[pos]))
2384 : char(data[pos]);
2385 pos++;
2386 }
2387 continue;
2388 }
2389 }
2390 }
2391
2392 word = !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
2393 if ((word & 0xFF00) == 0) {
2394 *latin_output++ = char(word & 0xFF);
2395 pos++;
2396 } else {
2397 return result(error_code::TOO_LARGE, pos);
2398 }
2399 }
2400 return result(error_code::SUCCESS, latin_output - start);
2401}
2402
2403} // namespace utf16_to_latin1
2404} // unnamed namespace
2405} // namespace scalar
2406} // namespace simdutf
2407
2408#endif
2409/* end file include/simdutf/scalar/utf16_to_latin1/utf16_to_latin1.h */
2410/* begin file include/simdutf/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */
2411#ifndef SIMDUTF_VALID_UTF16_TO_LATIN1_H
2412#define SIMDUTF_VALID_UTF16_TO_LATIN1_H
2413
2414namespace simdutf {
2415namespace scalar {
2416namespace {
2417namespace utf16_to_latin1 {
2418
2419template <endianness big_endian, class InputIterator, class OutputIterator>
2420simdutf_constexpr23 inline size_t
2421convert_valid_impl(InputIterator data, size_t len,
2422 OutputIterator latin_output) {
2423 static_assert(
2424 std::is_same<typename std::decay<decltype(*data)>::type, uint16_t>::value,
2425 "must decay to uint16_t");
2426 size_t pos = 0;
2427 const auto start = latin_output;
2428 uint16_t word = 0;
2429
2430 while (pos < len) {
2431 word = !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
2432 *latin_output++ = char(word);
2433 pos++;
2434 }
2435
2436 return latin_output - start;
2437}
2438
2439template <endianness big_endian>
2440simdutf_really_inline size_t convert_valid(const char16_t *buf, size_t len,
2441 char *latin_output) {
2442 return convert_valid_impl<big_endian>(reinterpret_cast<const uint16_t *>(buf),
2443 len, latin_output);
2444}
2445} // namespace utf16_to_latin1
2446} // unnamed namespace
2447} // namespace scalar
2448} // namespace simdutf
2449
2450#endif
2451/* end file include/simdutf/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */
2452/* begin file include/simdutf/scalar/utf16_to_utf32/utf16_to_utf32.h */
2453#ifndef SIMDUTF_UTF16_TO_UTF32_H
2454#define SIMDUTF_UTF16_TO_UTF32_H
2455
2456namespace simdutf {
2457namespace scalar {
2458namespace {
2459namespace utf16_to_utf32 {
2460
2461template <endianness big_endian>
2462simdutf_constexpr23 size_t convert(const char16_t *data, size_t len,
2463 char32_t *utf32_output) {
2464 size_t pos = 0;
2465 char32_t *start{utf32_output};
2466 while (pos < len) {
2467 uint16_t word =
2468 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
2469 if ((word & 0xF800) != 0xD800) {
2470 // No surrogate pair, extend 16-bit word to 32-bit word
2471 *utf32_output++ = char32_t(word);
2472 pos++;
2473 } else {
2474 // must be a surrogate pair
2475 uint16_t diff = uint16_t(word - 0xD800);
2476 if (diff > 0x3FF) {
2477 return 0;
2478 }
2479 if (pos + 1 >= len) {
2480 return 0;
2481 } // minimal bound checking
2482 uint16_t next_word = !match_system(big_endian)
2483 ? u16_swap_bytes(data[pos + 1])
2484 : data[pos + 1];
2485 uint16_t diff2 = uint16_t(next_word - 0xDC00);
2486 if (diff2 > 0x3FF) {
2487 return 0;
2488 }
2489 uint32_t value = (diff << 10) + diff2 + 0x10000;
2490 *utf32_output++ = char32_t(value);
2491 pos += 2;
2492 }
2493 }
2494 return utf32_output - start;
2495}
2496
2497template <endianness big_endian>
2498simdutf_constexpr23 result convert_with_errors(const char16_t *data, size_t len,
2499 char32_t *utf32_output) {
2500 size_t pos = 0;
2501 char32_t *start{utf32_output};
2502 while (pos < len) {
2503 uint16_t word =
2504 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
2505 if ((word & 0xF800) != 0xD800) {
2506 // No surrogate pair, extend 16-bit word to 32-bit word
2507 *utf32_output++ = char32_t(word);
2508 pos++;
2509 } else {
2510 // must be a surrogate pair
2511 uint16_t diff = uint16_t(word - 0xD800);
2512 if (diff > 0x3FF) {
2513 return result(error_code::SURROGATE, pos);
2514 }
2515 if (pos + 1 >= len) {
2516 return result(error_code::SURROGATE, pos);
2517 } // minimal bound checking
2518 uint16_t next_word = !match_system(big_endian)
2519 ? u16_swap_bytes(data[pos + 1])
2520 : data[pos + 1];
2521 uint16_t diff2 = uint16_t(next_word - 0xDC00);
2522 if (diff2 > 0x3FF) {
2523 return result(error_code::SURROGATE, pos);
2524 }
2525 uint32_t value = (diff << 10) + diff2 + 0x10000;
2526 *utf32_output++ = char32_t(value);
2527 pos += 2;
2528 }
2529 }
2530 return result(error_code::SUCCESS, utf32_output - start);
2531}
2532
2533} // namespace utf16_to_utf32
2534} // unnamed namespace
2535} // namespace scalar
2536} // namespace simdutf
2537
2538#endif
2539/* end file include/simdutf/scalar/utf16_to_utf32/utf16_to_utf32.h */
2540/* begin file include/simdutf/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
2541#ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H
2542#define SIMDUTF_VALID_UTF16_TO_UTF32_H
2543
2544namespace simdutf {
2545namespace scalar {
2546namespace {
2547namespace utf16_to_utf32 {
2548
2549template <endianness big_endian>
2550simdutf_constexpr23 size_t convert_valid(const char16_t *data, size_t len,
2551 char32_t *utf32_output) {
2552 size_t pos = 0;
2553 char32_t *start{utf32_output};
2554 while (pos < len) {
2555 uint16_t word =
2556 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
2557 if ((word & 0xF800) != 0xD800) {
2558 // No surrogate pair, extend 16-bit word to 32-bit word
2559 *utf32_output++ = char32_t(word);
2560 pos++;
2561 } else {
2562 // must be a surrogate pair
2563 uint16_t diff = uint16_t(word - 0xD800);
2564 if (pos + 1 >= len) {
2565 return 0;
2566 } // minimal bound checking
2567 uint16_t next_word = !match_system(big_endian)
2568 ? u16_swap_bytes(data[pos + 1])
2569 : data[pos + 1];
2570 uint16_t diff2 = uint16_t(next_word - 0xDC00);
2571 uint32_t value = (diff << 10) + diff2 + 0x10000;
2572 *utf32_output++ = char32_t(value);
2573 pos += 2;
2574 }
2575 }
2576 return utf32_output - start;
2577}
2578
2579} // namespace utf16_to_utf32
2580} // unnamed namespace
2581} // namespace scalar
2582} // namespace simdutf
2583
2584#endif
2585/* end file include/simdutf/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
2586/* begin file include/simdutf/scalar/utf16_to_utf8/utf16_to_utf8.h */
2587#ifndef SIMDUTF_UTF16_TO_UTF8_H
2588#define SIMDUTF_UTF16_TO_UTF8_H
2589
2590namespace simdutf {
2591namespace scalar {
2592namespace {
2593namespace utf16_to_utf8 {
2594
2595template <endianness big_endian, typename InputPtr, typename OutputPtr>
2596#if SIMDUTF_CPLUSPLUS20
2597 requires simdutf::detail::indexes_into_utf16<InputPtr>
2598// FIXME constrain output as well
2599#endif
2600simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
2601 OutputPtr utf8_output) {
2602 size_t pos = 0;
2603 const auto start = utf8_output;
2604 while (pos < len) {
2605#if SIMDUTF_CPLUSPLUS23
2606 if !consteval
2607#endif
2608 {
2609 // try to convert the next block of 8 bytes
2610 if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that
2611 // they are ascii
2612 uint64_t v;
2613 ::memcpy(&v, data + pos, sizeof(uint64_t));
2614 if simdutf_constexpr (!match_system(big_endian)) {
2615 v = (v >> 8) | (v << (64 - 8));
2616 }
2617 if ((v & 0xFF80FF80FF80FF80) == 0) {
2618 size_t final_pos = pos + 4;
2619 while (pos < final_pos) {
2620 *utf8_output++ = !match_system(big_endian)
2621 ? char(u16_swap_bytes(data[pos]))
2622 : char(data[pos]);
2623 pos++;
2624 }
2625 continue;
2626 }
2627 }
2628 }
2629 uint16_t word =
2630 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
2631 if ((word & 0xFF80) == 0) {
2632 // will generate one UTF-8 bytes
2633 *utf8_output++ = char(word);
2634 pos++;
2635 } else if ((word & 0xF800) == 0) {
2636 // will generate two UTF-8 bytes
2637 // we have 0b110XXXXX 0b10XXXXXX
2638 *utf8_output++ = char((word >> 6) | 0b11000000);
2639 *utf8_output++ = char((word & 0b111111) | 0b10000000);
2640 pos++;
2641 } else if ((word & 0xF800) != 0xD800) {
2642 // will generate three UTF-8 bytes
2643 // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
2644 *utf8_output++ = char((word >> 12) | 0b11100000);
2645 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
2646 *utf8_output++ = char((word & 0b111111) | 0b10000000);
2647 pos++;
2648 } else {
2649 // must be a surrogate pair
2650 if (pos + 1 >= len) {
2651 return 0;
2652 }
2653 uint16_t diff = uint16_t(word - 0xD800);
2654 if (diff > 0x3FF) {
2655 return 0;
2656 }
2657 uint16_t next_word = !match_system(big_endian)
2658 ? u16_swap_bytes(data[pos + 1])
2659 : data[pos + 1];
2660 uint16_t diff2 = uint16_t(next_word - 0xDC00);
2661 if (diff2 > 0x3FF) {
2662 return 0;
2663 }
2664 uint32_t value = (diff << 10) + diff2 + 0x10000;
2665 // will generate four UTF-8 bytes
2666 // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
2667 *utf8_output++ = char((value >> 18) | 0b11110000);
2668 *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
2669 *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
2670 *utf8_output++ = char((value & 0b111111) | 0b10000000);
2671 pos += 2;
2672 }
2673 }
2674 return utf8_output - start;
2675}
2676
2677template <endianness big_endian, bool check_output = false, typename InputPtr,
2678 typename OutputPtr>
2679#if SIMDUTF_CPLUSPLUS20
2680 requires(simdutf::detail::indexes_into_utf16<InputPtr> &&
2681 simdutf::detail::index_assignable_from_char<OutputPtr>)
2682#endif
2683simdutf_constexpr23 full_result convert_with_errors(InputPtr data, size_t len,
2684 OutputPtr utf8_output,
2685 size_t utf8_len = 0) {
2686 if (check_output && utf8_len == 0) {
2687 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, 0, 0);
2688 }
2689
2690 size_t pos = 0;
2691 auto start = utf8_output;
2692 auto end = utf8_output + utf8_len;
2693
2694 while (pos < len) {
2695#if SIMDUTF_CPLUSPLUS23
2696 if !consteval
2697#endif
2698 {
2699 // try to convert the next block of 8 bytes
2700 if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that
2701 // they are ascii
2702 uint64_t v;
2703 ::memcpy(&v, data + pos, sizeof(uint64_t));
2704 if simdutf_constexpr (!match_system(big_endian))
2705 v = (v >> 8) | (v << (64 - 8));
2706 if ((v & 0xFF80FF80FF80FF80) == 0) {
2707 size_t final_pos = pos + 4;
2708 while (pos < final_pos) {
2709 if (check_output && size_t(end - utf8_output) < 1) {
2710 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
2711 utf8_output - start);
2712 }
2713 *utf8_output++ = !match_system(big_endian)
2714 ? char(u16_swap_bytes(data[pos]))
2715 : char(data[pos]);
2716 pos++;
2717 }
2718 continue;
2719 }
2720 }
2721 }
2722
2723 uint16_t word =
2724 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
2725 if ((word & 0xFF80) == 0) {
2726 // will generate one UTF-8 bytes
2727 if (check_output && size_t(end - utf8_output) < 1) {
2728 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
2729 utf8_output - start);
2730 }
2731 *utf8_output++ = char(word);
2732 pos++;
2733 } else if ((word & 0xF800) == 0) {
2734 // will generate two UTF-8 bytes
2735 // we have 0b110XXXXX 0b10XXXXXX
2736 if (check_output && size_t(end - utf8_output) < 2) {
2737 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
2738 utf8_output - start);
2739 }
2740 *utf8_output++ = char((word >> 6) | 0b11000000);
2741 *utf8_output++ = char((word & 0b111111) | 0b10000000);
2742 pos++;
2743
2744 } else if ((word & 0xF800) != 0xD800) {
2745 // will generate three UTF-8 bytes
2746 // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
2747 if (check_output && size_t(end - utf8_output) < 3) {
2748 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
2749 utf8_output - start);
2750 }
2751 *utf8_output++ = char((word >> 12) | 0b11100000);
2752 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
2753 *utf8_output++ = char((word & 0b111111) | 0b10000000);
2754 pos++;
2755 } else {
2756
2757 if (check_output && size_t(end - utf8_output) < 4) {
2758 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
2759 utf8_output - start);
2760 }
2761 // must be a surrogate pair
2762 if (pos + 1 >= len) {
2763 return full_result(error_code::SURROGATE, pos, utf8_output - start);
2764 }
2765 uint16_t diff = uint16_t(word - 0xD800);
2766 if (diff > 0x3FF) {
2767 return full_result(error_code::SURROGATE, pos, utf8_output - start);
2768 }
2769 uint16_t next_word = !match_system(big_endian)
2770 ? u16_swap_bytes(data[pos + 1])
2771 : data[pos + 1];
2772 uint16_t diff2 = uint16_t(next_word - 0xDC00);
2773 if (diff2 > 0x3FF) {
2774 return full_result(error_code::SURROGATE, pos, utf8_output - start);
2775 }
2776 uint32_t value = (diff << 10) + diff2 + 0x10000;
2777 // will generate four UTF-8 bytes
2778 // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
2779 *utf8_output++ = char((value >> 18) | 0b11110000);
2780 *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
2781 *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
2782 *utf8_output++ = char((value & 0b111111) | 0b10000000);
2783 pos += 2;
2784 }
2785 }
2786 return full_result(error_code::SUCCESS, pos, utf8_output - start);
2787}
2788
2789template <endianness big_endian>
2790inline result simple_convert_with_errors(const char16_t *buf, size_t len,
2791 char *utf8_output) {
2792 return convert_with_errors<big_endian, false>(buf, len, utf8_output, 0);
2793}
2794
2795} // namespace utf16_to_utf8
2796} // unnamed namespace
2797} // namespace scalar
2798} // namespace simdutf
2799
2800#endif
2801/* end file include/simdutf/scalar/utf16_to_utf8/utf16_to_utf8.h */
2802/* begin file include/simdutf/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
2803#ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H
2804#define SIMDUTF_VALID_UTF16_TO_UTF8_H
2805
2806namespace simdutf {
2807namespace scalar {
2808namespace {
2809namespace utf16_to_utf8 {
2810
2811template <endianness big_endian, typename InputPtr, typename OutputPtr>
2812#if SIMDUTF_CPLUSPLUS20
2813 requires(simdutf::detail::indexes_into_utf16<InputPtr> &&
2814 simdutf::detail::index_assignable_from_char<OutputPtr>)
2815#endif
2816simdutf_constexpr23 size_t convert_valid(InputPtr data, size_t len,
2817 OutputPtr utf8_output) {
2818 size_t pos = 0;
2819 auto start = utf8_output;
2820 while (pos < len) {
2821#if SIMDUTF_CPLUSPLUS23
2822 if !consteval
2823#endif
2824 {
2825 // try to convert the next block of 4 ASCII characters
2826 if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that
2827 // they are ascii
2828 uint64_t v;
2829 ::memcpy(&v, data + pos, sizeof(uint64_t));
2830 if simdutf_constexpr (!match_system(big_endian)) {
2831 v = (v >> 8) | (v << (64 - 8));
2832 }
2833 if ((v & 0xFF80FF80FF80FF80) == 0) {
2834 size_t final_pos = pos + 4;
2835 while (pos < final_pos) {
2836 *utf8_output++ = !match_system(big_endian)
2837 ? char(u16_swap_bytes(data[pos]))
2838 : char(data[pos]);
2839 pos++;
2840 }
2841 continue;
2842 }
2843 }
2844 }
2845
2846 uint16_t word =
2847 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
2848 if ((word & 0xFF80) == 0) {
2849 // will generate one UTF-8 bytes
2850 *utf8_output++ = char(word);
2851 pos++;
2852 } else if ((word & 0xF800) == 0) {
2853 // will generate two UTF-8 bytes
2854 // we have 0b110XXXXX 0b10XXXXXX
2855 *utf8_output++ = char((word >> 6) | 0b11000000);
2856 *utf8_output++ = char((word & 0b111111) | 0b10000000);
2857 pos++;
2858 } else if ((word & 0xF800) != 0xD800) {
2859 // will generate three UTF-8 bytes
2860 // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
2861 *utf8_output++ = char((word >> 12) | 0b11100000);
2862 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
2863 *utf8_output++ = char((word & 0b111111) | 0b10000000);
2864 pos++;
2865 } else {
2866 // must be a surrogate pair
2867 uint16_t diff = uint16_t(word - 0xD800);
2868 if (pos + 1 >= len) {
2869 return 0;
2870 } // minimal bound checking
2871 uint16_t next_word = !match_system(big_endian)
2872 ? u16_swap_bytes(data[pos + 1])
2873 : data[pos + 1];
2874 uint16_t diff2 = uint16_t(next_word - 0xDC00);
2875 uint32_t value = (diff << 10) + diff2 + 0x10000;
2876 // will generate four UTF-8 bytes
2877 // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
2878 *utf8_output++ = char((value >> 18) | 0b11110000);
2879 *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
2880 *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
2881 *utf8_output++ = char((value & 0b111111) | 0b10000000);
2882 pos += 2;
2883 }
2884 }
2885 return utf8_output - start;
2886}
2887
2888} // namespace utf16_to_utf8
2889} // unnamed namespace
2890} // namespace scalar
2891} // namespace simdutf
2892
2893#endif
2894/* end file include/simdutf/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
2895/* begin file include/simdutf/scalar/utf32.h */
2896#ifndef SIMDUTF_UTF32_H
2897#define SIMDUTF_UTF32_H
2898
2899namespace simdutf {
2900namespace scalar {
2901namespace utf32 {
2902
2903template <typename InputPtr>
2904#if SIMDUTF_CPLUSPLUS20
2905 requires simdutf::detail::indexes_into_uint32<InputPtr>
2906#endif
2907simdutf_warn_unused simdutf_constexpr23 bool validate(InputPtr data,
2908 size_t len) noexcept {
2909 uint64_t pos = 0;
2910 for (; pos < len; pos++) {
2911 uint32_t word = data[pos];
2912 if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
2913 return false;
2914 }
2915 }
2916 return true;
2917}
2918
2919simdutf_warn_unused simdutf_really_inline bool validate(const char32_t *buf,
2920 size_t len) noexcept {
2921 return validate(reinterpret_cast<const uint32_t *>(buf), len);
2922}
2923
2924template <typename InputPtr>
2925#if SIMDUTF_CPLUSPLUS20
2926 requires simdutf::detail::indexes_into_uint32<InputPtr>
2927#endif
2928simdutf_warn_unused simdutf_constexpr23 result
2929validate_with_errors(InputPtr data, size_t len) noexcept {
2930 size_t pos = 0;
2931 for (; pos < len; pos++) {
2932 uint32_t word = data[pos];
2933 if (word > 0x10FFFF) {
2934 return result(error_code::TOO_LARGE, pos);
2935 }
2936 if (word >= 0xD800 && word <= 0xDFFF) {
2937 return result(error_code::SURROGATE, pos);
2938 }
2939 }
2940 return result(error_code::SUCCESS, pos);
2941}
2942
2943simdutf_warn_unused simdutf_really_inline result
2944validate_with_errors(const char32_t *buf, size_t len) noexcept {
2945 return validate_with_errors(reinterpret_cast<const uint32_t *>(buf), len);
2946}
2947
2948inline simdutf_constexpr23 size_t utf8_length_from_utf32(const char32_t *p,
2949 size_t len) {
2950 // We are not BOM aware.
2951 size_t counter{0};
2952 for (size_t i = 0; i < len; i++) {
2953 // credit: @ttsugriy for the vectorizable approach
2954 counter++; // ASCII
2955 counter += static_cast<size_t>(p[i] > 0x7F); // two-byte
2956 counter += static_cast<size_t>(p[i] > 0x7FF); // three-byte
2957 counter += static_cast<size_t>(p[i] > 0xFFFF); // four-bytes
2958 }
2959 return counter;
2960}
2961
2962inline simdutf_warn_unused simdutf_constexpr23 size_t
2963utf16_length_from_utf32(const char32_t *p, size_t len) {
2964 // We are not BOM aware.
2965 size_t counter{0};
2966 for (size_t i = 0; i < len; i++) {
2967 counter++; // non-surrogate word
2968 counter += static_cast<size_t>(p[i] > 0xFFFF); // surrogate pair
2969 }
2970 return counter;
2971}
2972
2973} // namespace utf32
2974} // namespace scalar
2975} // namespace simdutf
2976
2977#endif
2978/* end file include/simdutf/scalar/utf32.h */
2979/* begin file include/simdutf/scalar/utf32_to_latin1/utf32_to_latin1.h */
2980#ifndef SIMDUTF_UTF32_TO_LATIN1_H
2981#define SIMDUTF_UTF32_TO_LATIN1_H
2982
2983namespace simdutf {
2984namespace scalar {
2985namespace {
2986namespace utf32_to_latin1 {
2987
2988inline simdutf_constexpr23 size_t convert(const char32_t *data, size_t len,
2989 char *latin1_output) {
2990 char *start = latin1_output;
2991 uint32_t utf32_char;
2992 size_t pos = 0;
2993 uint32_t too_large = 0;
2994
2995 while (pos < len) {
2996 utf32_char = (uint32_t)data[pos];
2997 too_large |= utf32_char;
2998 *latin1_output++ = (char)(utf32_char & 0xFF);
2999 pos++;
3000 }
3001 if ((too_large & 0xFFFFFF00) != 0) {
3002 return 0;
3003 }
3004 return latin1_output - start;
3005}
3006
3007inline simdutf_constexpr23 result convert_with_errors(const char32_t *data,
3008 size_t len,
3009 char *latin1_output) {
3010 char *start{latin1_output};
3011 size_t pos = 0;
3012 while (pos < len) {
3013#if SIMDUTF_CPLUSPLUS23
3014 if !consteval
3015#endif
3016 {
3017 if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that
3018 // they are Latin1
3019 uint64_t v;
3020 ::memcpy(&v, data + pos, sizeof(uint64_t));
3021 if ((v & 0xFFFFFF00FFFFFF00) == 0) {
3022 *latin1_output++ = char(data[pos]);
3023 *latin1_output++ = char(data[pos + 1]);
3024 pos += 2;
3025 continue;
3026 }
3027 }
3028 }
3029
3030 uint32_t utf32_char = data[pos];
3031 if ((utf32_char & 0xFFFFFF00) ==
3032 0) { // Check if the character can be represented in Latin-1
3033 *latin1_output++ = (char)(utf32_char & 0xFF);
3034 pos++;
3035 } else {
3036 return result(error_code::TOO_LARGE, pos);
3037 };
3038 }
3039 return result(error_code::SUCCESS, latin1_output - start);
3040}
3041
3042} // namespace utf32_to_latin1
3043} // unnamed namespace
3044} // namespace scalar
3045} // namespace simdutf
3046
3047#endif
3048/* end file include/simdutf/scalar/utf32_to_latin1/utf32_to_latin1.h */
3049/* begin file include/simdutf/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */
3050#ifndef SIMDUTF_VALID_UTF32_TO_LATIN1_H
3051#define SIMDUTF_VALID_UTF32_TO_LATIN1_H
3052
3053namespace simdutf {
3054namespace scalar {
3055namespace {
3056namespace utf32_to_latin1 {
3057
3058template <typename ReadPtr, typename WritePtr>
3059simdutf_constexpr23 size_t convert_valid(ReadPtr data, size_t len,
3060 WritePtr latin1_output) {
3061 static_assert(
3062 std::is_same<typename std::decay<decltype(*data)>::type, uint32_t>::value,
3063 "dereferencing the data pointer must result in a uint32_t");
3064 auto start = latin1_output;
3065 uint32_t utf32_char;
3066 size_t pos = 0;
3067
3068 while (pos < len) {
3069 utf32_char = data[pos];
3070
3071#if SIMDUTF_CPLUSPLUS23
3072 // avoid using the 8 byte at a time optimization in constant evaluation
3073 // mode. memcpy can't be used and replacing it with bitwise or gave worse
3074 // codegen (when not during constant evaluation).
3075 if !consteval {
3076#endif
3077 if (pos + 2 <= len) {
3078 // if it is safe to read 8 more bytes, check that they are Latin1
3079 uint64_t v;
3080 std::memcpy(&v, data + pos, sizeof(uint64_t));
3081 if ((v & 0xFFFFFF00FFFFFF00) == 0) {
3082 *latin1_output++ = char(data[pos]);
3083 *latin1_output++ = char(data[pos + 1]);
3084 pos += 2;
3085 continue;
3086 } else {
3087 // output can not be represented in latin1
3088 return 0;
3089 }
3090 }
3091#if SIMDUTF_CPLUSPLUS23
3092 } // if ! consteval
3093#endif
3094 if ((utf32_char & 0xFFFFFF00) == 0) {
3095 *latin1_output++ = char(utf32_char);
3096 } else {
3097 // output can not be represented in latin1
3098 return 0;
3099 }
3100 pos++;
3101 }
3102 return latin1_output - start;
3103}
3104
3105simdutf_really_inline size_t convert_valid(const char32_t *buf, size_t len,
3106 char *latin1_output) {
3107 return convert_valid(reinterpret_cast<const uint32_t *>(buf), len,
3108 latin1_output);
3109}
3110
3111} // namespace utf32_to_latin1
3112} // unnamed namespace
3113} // namespace scalar
3114} // namespace simdutf
3115
3116#endif
3117/* end file include/simdutf/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */
3118/* begin file include/simdutf/scalar/utf32_to_utf16/utf32_to_utf16.h */
3119#ifndef SIMDUTF_UTF32_TO_UTF16_H
3120#define SIMDUTF_UTF32_TO_UTF16_H
3121
3122namespace simdutf {
3123namespace scalar {
3124namespace {
3125namespace utf32_to_utf16 {
3126
3127template <endianness big_endian>
3128simdutf_constexpr23 size_t convert(const char32_t *data, size_t len,
3129 char16_t *utf16_output) {
3130 size_t pos = 0;
3131 char16_t *start{utf16_output};
3132 while (pos < len) {
3133 uint32_t word = data[pos];
3134 if ((word & 0xFFFF0000) == 0) {
3135 if (word >= 0xD800 && word <= 0xDFFF) {
3136 return 0;
3137 }
3138 // will not generate a surrogate pair
3139 *utf16_output++ = !match_system(big_endian)
3140 ? char16_t(u16_swap_bytes(uint16_t(word)))
3141 : char16_t(word);
3142 } else {
3143 // will generate a surrogate pair
3144 if (word > 0x10FFFF) {
3145 return 0;
3146 }
3147 word -= 0x10000;
3148 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
3149 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
3150 if simdutf_constexpr (!match_system(big_endian)) {
3151 high_surrogate = u16_swap_bytes(high_surrogate);
3152 low_surrogate = u16_swap_bytes(low_surrogate);
3153 }
3154 *utf16_output++ = char16_t(high_surrogate);
3155 *utf16_output++ = char16_t(low_surrogate);
3156 }
3157 pos++;
3158 }
3159 return utf16_output - start;
3160}
3161
3162template <endianness big_endian>
3163simdutf_constexpr23 result convert_with_errors(const char32_t *data, size_t len,
3164 char16_t *utf16_output) {
3165 size_t pos = 0;
3166 char16_t *start{utf16_output};
3167 while (pos < len) {
3168 uint32_t word = data[pos];
3169 if ((word & 0xFFFF0000) == 0) {
3170 if (word >= 0xD800 && word <= 0xDFFF) {
3171 return result(error_code::SURROGATE, pos);
3172 }
3173 // will not generate a surrogate pair
3174 *utf16_output++ = !match_system(big_endian)
3175 ? char16_t(u16_swap_bytes(uint16_t(word)))
3176 : char16_t(word);
3177 } else {
3178 // will generate a surrogate pair
3179 if (word > 0x10FFFF) {
3180 return result(error_code::TOO_LARGE, pos);
3181 }
3182 word -= 0x10000;
3183 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
3184 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
3185 if simdutf_constexpr (!match_system(big_endian)) {
3186 high_surrogate = u16_swap_bytes(high_surrogate);
3187 low_surrogate = u16_swap_bytes(low_surrogate);
3188 }
3189 *utf16_output++ = char16_t(high_surrogate);
3190 *utf16_output++ = char16_t(low_surrogate);
3191 }
3192 pos++;
3193 }
3194 return result(error_code::SUCCESS, utf16_output - start);
3195}
3196
3197} // namespace utf32_to_utf16
3198} // unnamed namespace
3199} // namespace scalar
3200} // namespace simdutf
3201
3202#endif
3203/* end file include/simdutf/scalar/utf32_to_utf16/utf32_to_utf16.h */
3204/* begin file include/simdutf/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
3205#ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H
3206#define SIMDUTF_VALID_UTF32_TO_UTF16_H
3207
3208namespace simdutf {
3209namespace scalar {
3210namespace {
3211namespace utf32_to_utf16 {
3212
3213template <endianness big_endian>
3214simdutf_constexpr23 size_t convert_valid(const char32_t *data, size_t len,
3215 char16_t *utf16_output) {
3216 size_t pos = 0;
3217 char16_t *start{utf16_output};
3218 while (pos < len) {
3219 uint32_t word = data[pos];
3220 if ((word & 0xFFFF0000) == 0) {
3221 // will not generate a surrogate pair
3222 *utf16_output++ = !match_system(big_endian)
3223 ? char16_t(u16_swap_bytes(uint16_t(word)))
3224 : char16_t(word);
3225 pos++;
3226 } else {
3227 // will generate a surrogate pair
3228 word -= 0x10000;
3229 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
3230 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
3231 if simdutf_constexpr (!match_system(big_endian)) {
3232 high_surrogate = u16_swap_bytes(high_surrogate);
3233 low_surrogate = u16_swap_bytes(low_surrogate);
3234 }
3235 *utf16_output++ = char16_t(high_surrogate);
3236 *utf16_output++ = char16_t(low_surrogate);
3237 pos++;
3238 }
3239 }
3240 return utf16_output - start;
3241}
3242
3243} // namespace utf32_to_utf16
3244} // unnamed namespace
3245} // namespace scalar
3246} // namespace simdutf
3247
3248#endif
3249/* end file include/simdutf/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
3250/* begin file include/simdutf/scalar/utf32_to_utf8/utf32_to_utf8.h */
3251#ifndef SIMDUTF_UTF32_TO_UTF8_H
3252#define SIMDUTF_UTF32_TO_UTF8_H
3253
3254namespace simdutf {
3255namespace scalar {
3256namespace {
3257namespace utf32_to_utf8 {
3258
3259template <typename InputPtr, typename OutputPtr>
3260#if SIMDUTF_CPLUSPLUS20
3261 requires(simdutf::detail::indexes_into_utf32<InputPtr> &&
3262 simdutf::detail::index_assignable_from_char<OutputPtr>)
3263#endif
3264simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
3265 OutputPtr utf8_output) {
3266 size_t pos = 0;
3267 auto start = utf8_output;
3268 while (pos < len) {
3269#if SIMDUTF_CPLUSPLUS23
3270 if !consteval
3271#endif
3272 { // try to convert the next block of 2 ASCII characters
3273 if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that
3274 // they are ascii
3275 uint64_t v;
3276 ::memcpy(&v, data + pos, sizeof(uint64_t));
3277 if ((v & 0xFFFFFF80FFFFFF80) == 0) {
3278 *utf8_output++ = char(data[pos]);
3279 *utf8_output++ = char(data[pos + 1]);
3280 pos += 2;
3281 continue;
3282 }
3283 }
3284 }
3285
3286 uint32_t word = data[pos];
3287 if ((word & 0xFFFFFF80) == 0) {
3288 // will generate one UTF-8 bytes
3289 *utf8_output++ = char(word);
3290 pos++;
3291 } else if ((word & 0xFFFFF800) == 0) {
3292 // will generate two UTF-8 bytes
3293 // we have 0b110XXXXX 0b10XXXXXX
3294 *utf8_output++ = char((word >> 6) | 0b11000000);
3295 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3296 pos++;
3297 } else if ((word & 0xFFFF0000) == 0) {
3298 // will generate three UTF-8 bytes
3299 // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
3300 if (word >= 0xD800 && word <= 0xDFFF) {
3301 return 0;
3302 }
3303 *utf8_output++ = char((word >> 12) | 0b11100000);
3304 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
3305 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3306 pos++;
3307 } else {
3308 // will generate four UTF-8 bytes
3309 // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
3310 if (word > 0x10FFFF) {
3311 return 0;
3312 }
3313 *utf8_output++ = char((word >> 18) | 0b11110000);
3314 *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
3315 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
3316 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3317 pos++;
3318 }
3319 }
3320 return utf8_output - start;
3321}
3322
3323template <typename InputPtr, typename OutputPtr>
3324#if SIMDUTF_CPLUSPLUS20
3325 requires(simdutf::detail::indexes_into_utf32<InputPtr> &&
3326 simdutf::detail::index_assignable_from_char<OutputPtr>)
3327#endif
3328simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len,
3329 OutputPtr utf8_output) {
3330 size_t pos = 0;
3331 auto start = utf8_output;
3332 while (pos < len) {
3333#if SIMDUTF_CPLUSPLUS23
3334 if !consteval
3335#endif
3336 { // try to convert the next block of 2 ASCII characters
3337 if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that
3338 // they are ascii
3339 uint64_t v;
3340 ::memcpy(&v, data + pos, sizeof(uint64_t));
3341 if ((v & 0xFFFFFF80FFFFFF80) == 0) {
3342 *utf8_output++ = char(data[pos]);
3343 *utf8_output++ = char(data[pos + 1]);
3344 pos += 2;
3345 continue;
3346 }
3347 }
3348 }
3349
3350 uint32_t word = data[pos];
3351 if ((word & 0xFFFFFF80) == 0) {
3352 // will generate one UTF-8 bytes
3353 *utf8_output++ = char(word);
3354 pos++;
3355 } else if ((word & 0xFFFFF800) == 0) {
3356 // will generate two UTF-8 bytes
3357 // we have 0b110XXXXX 0b10XXXXXX
3358 *utf8_output++ = char((word >> 6) | 0b11000000);
3359 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3360 pos++;
3361 } else if ((word & 0xFFFF0000) == 0) {
3362 // will generate three UTF-8 bytes
3363 // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
3364 if (word >= 0xD800 && word <= 0xDFFF) {
3365 return result(error_code::SURROGATE, pos);
3366 }
3367 *utf8_output++ = char((word >> 12) | 0b11100000);
3368 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
3369 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3370 pos++;
3371 } else {
3372 // will generate four UTF-8 bytes
3373 // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
3374 if (word > 0x10FFFF) {
3375 return result(error_code::TOO_LARGE, pos);
3376 }
3377 *utf8_output++ = char((word >> 18) | 0b11110000);
3378 *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
3379 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
3380 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3381 pos++;
3382 }
3383 }
3384 return result(error_code::SUCCESS, utf8_output - start);
3385}
3386
3387} // namespace utf32_to_utf8
3388} // unnamed namespace
3389} // namespace scalar
3390} // namespace simdutf
3391
3392#endif
3393/* end file include/simdutf/scalar/utf32_to_utf8/utf32_to_utf8.h */
3394/* begin file include/simdutf/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
3395#ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H
3396#define SIMDUTF_VALID_UTF32_TO_UTF8_H
3397
3398namespace simdutf {
3399namespace scalar {
3400namespace {
3401namespace utf32_to_utf8 {
3402
3403template <typename InputPtr, typename OutputPtr>
3404#if SIMDUTF_CPLUSPLUS20
3405 requires(simdutf::detail::indexes_into_utf32<InputPtr> &&
3406 simdutf::detail::index_assignable_from_char<OutputPtr>)
3407#endif
3408simdutf_constexpr23 size_t convert_valid(InputPtr data, size_t len,
3409 OutputPtr utf8_output) {
3410 size_t pos = 0;
3411 auto start = utf8_output;
3412 while (pos < len) {
3413#if SIMDUTF_CPLUSPLUS23
3414 if !consteval
3415#endif
3416 { // try to convert the next block of 2 ASCII characters
3417 if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that
3418 // they are ascii
3419 uint64_t v;
3420 ::memcpy(&v, data + pos, sizeof(uint64_t));
3421 if ((v & 0xFFFFFF80FFFFFF80) == 0) {
3422 *utf8_output++ = char(data[pos]);
3423 *utf8_output++ = char(data[pos + 1]);
3424 pos += 2;
3425 continue;
3426 }
3427 }
3428 }
3429
3430 uint32_t word = data[pos];
3431 if ((word & 0xFFFFFF80) == 0) {
3432 // will generate one UTF-8 bytes
3433 *utf8_output++ = char(word);
3434 pos++;
3435 } else if ((word & 0xFFFFF800) == 0) {
3436 // will generate two UTF-8 bytes
3437 // we have 0b110XXXXX 0b10XXXXXX
3438 *utf8_output++ = char((word >> 6) | 0b11000000);
3439 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3440 pos++;
3441 } else if ((word & 0xFFFF0000) == 0) {
3442 // will generate three UTF-8 bytes
3443 // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
3444 *utf8_output++ = char((word >> 12) | 0b11100000);
3445 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
3446 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3447 pos++;
3448 } else {
3449 // will generate four UTF-8 bytes
3450 // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
3451 *utf8_output++ = char((word >> 18) | 0b11110000);
3452 *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
3453 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
3454 *utf8_output++ = char((word & 0b111111) | 0b10000000);
3455 pos++;
3456 }
3457 }
3458 return utf8_output - start;
3459}
3460
3461} // namespace utf32_to_utf8
3462} // unnamed namespace
3463} // namespace scalar
3464} // namespace simdutf
3465
3466#endif
3467/* end file include/simdutf/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
3468/* begin file include/simdutf/scalar/utf8.h */
3469#ifndef SIMDUTF_UTF8_H
3470#define SIMDUTF_UTF8_H
3471
3472namespace simdutf {
3473namespace scalar {
3474namespace {
3475namespace utf8 {
3476
3477// credit: based on code from Google Fuchsia (Apache Licensed)
3478template <class BytePtr>
3479simdutf_constexpr23 simdutf_warn_unused bool validate(BytePtr data,
3480 size_t len) noexcept {
3481 static_assert(
3482 std::is_same<typename std::decay<decltype(*data)>::type, uint8_t>::value,
3483 "dereferencing the data pointer must result in a uint8_t");
3484 uint64_t pos = 0;
3485 uint32_t code_point = 0;
3486 while (pos < len) {
3487 uint64_t next_pos;
3488#if SIMDUTF_CPLUSPLUS23
3489 if !consteval
3490#endif
3491 { // check if the next 16 bytes are ascii.
3492 next_pos = pos + 16;
3493 if (next_pos <= len) { // if it is safe to read 16 more bytes, check
3494 // that they are ascii
3495 uint64_t v1{};
3496 std::memcpy(&v1, data + pos, sizeof(uint64_t));
3497 uint64_t v2{};
3498 std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
3499 uint64_t v{v1 | v2};
3500 if ((v & 0x8080808080808080) == 0) {
3501 pos = next_pos;
3502 continue;
3503 }
3504 }
3505 }
3506
3507 unsigned char byte = data[pos];
3508
3509 while (byte < 0b10000000) {
3510 if (++pos == len) {
3511 return true;
3512 }
3513 byte = data[pos];
3514 }
3515
3516 if ((byte & 0b11100000) == 0b11000000) {
3517 next_pos = pos + 2;
3518 if (next_pos > len) {
3519 return false;
3520 }
3521 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3522 return false;
3523 }
3524 // range check
3525 code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
3526 if ((code_point < 0x80) || (0x7ff < code_point)) {
3527 return false;
3528 }
3529 } else if ((byte & 0b11110000) == 0b11100000) {
3530 next_pos = pos + 3;
3531 if (next_pos > len) {
3532 return false;
3533 }
3534 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3535 return false;
3536 }
3537 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
3538 return false;
3539 }
3540 // range check
3541 code_point = (byte & 0b00001111) << 12 |
3542 (data[pos + 1] & 0b00111111) << 6 |
3543 (data[pos + 2] & 0b00111111);
3544 if ((code_point < 0x800) || (0xffff < code_point) ||
3545 (0xd7ff < code_point && code_point < 0xe000)) {
3546 return false;
3547 }
3548 } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
3549 next_pos = pos + 4;
3550 if (next_pos > len) {
3551 return false;
3552 }
3553 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3554 return false;
3555 }
3556 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
3557 return false;
3558 }
3559 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
3560 return false;
3561 }
3562 // range check
3563 code_point =
3564 (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
3565 (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
3566 if (code_point <= 0xffff || 0x10ffff < code_point) {
3567 return false;
3568 }
3569 } else {
3570 // we may have a continuation
3571 return false;
3572 }
3573 pos = next_pos;
3574 }
3575 return true;
3576}
3577
3578simdutf_really_inline simdutf_warn_unused bool validate(const char *buf,
3579 size_t len) noexcept {
3580 return validate(reinterpret_cast<const uint8_t *>(buf), len);
3581}
3582
3583template <class BytePtr>
3584simdutf_constexpr23 simdutf_warn_unused result
3585validate_with_errors(BytePtr data, size_t len) noexcept {
3586 static_assert(
3587 std::is_same<typename std::decay<decltype(*data)>::type, uint8_t>::value,
3588 "dereferencing the data pointer must result in a uint8_t");
3589 size_t pos = 0;
3590 uint32_t code_point = 0;
3591 while (pos < len) {
3592 // check of the next 16 bytes are ascii.
3593 size_t next_pos = pos + 16;
3594 if (next_pos <=
3595 len) { // if it is safe to read 16 more bytes, check that they are ascii
3596 uint64_t v1;
3597 std::memcpy(&v1, data + pos, sizeof(uint64_t));
3598 uint64_t v2;
3599 std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
3600 uint64_t v{v1 | v2};
3601 if ((v & 0x8080808080808080) == 0) {
3602 pos = next_pos;
3603 continue;
3604 }
3605 }
3606 unsigned char byte = data[pos];
3607
3608 while (byte < 0b10000000) {
3609 if (++pos == len) {
3610 return result(error_code::SUCCESS, len);
3611 }
3612 byte = data[pos];
3613 }
3614
3615 if ((byte & 0b11100000) == 0b11000000) {
3616 next_pos = pos + 2;
3617 if (next_pos > len) {
3618 return result(error_code::TOO_SHORT, pos);
3619 }
3620 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3621 return result(error_code::TOO_SHORT, pos);
3622 }
3623 // range check
3624 code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
3625 if ((code_point < 0x80) || (0x7ff < code_point)) {
3626 return result(error_code::OVERLONG, pos);
3627 }
3628 } else if ((byte & 0b11110000) == 0b11100000) {
3629 next_pos = pos + 3;
3630 if (next_pos > len) {
3631 return result(error_code::TOO_SHORT, pos);
3632 }
3633 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3634 return result(error_code::TOO_SHORT, pos);
3635 }
3636 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
3637 return result(error_code::TOO_SHORT, pos);
3638 }
3639 // range check
3640 code_point = (byte & 0b00001111) << 12 |
3641 (data[pos + 1] & 0b00111111) << 6 |
3642 (data[pos + 2] & 0b00111111);
3643 if ((code_point < 0x800) || (0xffff < code_point)) {
3644 return result(error_code::OVERLONG, pos);
3645 }
3646 if (0xd7ff < code_point && code_point < 0xe000) {
3647 return result(error_code::SURROGATE, pos);
3648 }
3649 } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
3650 next_pos = pos + 4;
3651 if (next_pos > len) {
3652 return result(error_code::TOO_SHORT, pos);
3653 }
3654 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3655 return result(error_code::TOO_SHORT, pos);
3656 }
3657 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
3658 return result(error_code::TOO_SHORT, pos);
3659 }
3660 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
3661 return result(error_code::TOO_SHORT, pos);
3662 }
3663 // range check
3664 code_point =
3665 (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
3666 (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
3667 if (code_point <= 0xffff) {
3668 return result(error_code::OVERLONG, pos);
3669 }
3670 if (0x10ffff < code_point) {
3671 return result(error_code::TOO_LARGE, pos);
3672 }
3673 } else {
3674 // we either have too many continuation bytes or an invalid leading byte
3675 if ((byte & 0b11000000) == 0b10000000) {
3676 return result(error_code::TOO_LONG, pos);
3677 } else {
3678 return result(error_code::HEADER_BITS, pos);
3679 }
3680 }
3681 pos = next_pos;
3682 }
3683 return result(error_code::SUCCESS, len);
3684}
3685
3686simdutf_really_inline simdutf_warn_unused result
3687validate_with_errors(const char *buf, size_t len) noexcept {
3688 return validate_with_errors(reinterpret_cast<const uint8_t *>(buf), len);
3689}
3690
3691// Finds the previous leading byte starting backward from buf and validates with
3692// errors from there Used to pinpoint the location of an error when an invalid
3693// chunk is detected We assume that the stream starts with a leading byte, and
3694// to check that it is the case, we ask that you pass a pointer to the start of
3695// the stream (start).
3696inline simdutf_warn_unused result rewind_and_validate_with_errors(
3697 const char *start, const char *buf, size_t len) noexcept {
3698 // First check that we start with a leading byte
3699 if ((*start & 0b11000000) == 0b10000000) {
3700 return result(error_code::TOO_LONG, 0);
3701 }
3702 size_t extra_len{0};
3703 // A leading byte cannot be further than 4 bytes away
3704 for (int i = 0; i < 5; i++) {
3705 unsigned char byte = *buf;
3706 if ((byte & 0b11000000) != 0b10000000) {
3707 break;
3708 } else {
3709 buf--;
3710 extra_len++;
3711 }
3712 }
3713
3714 result res = validate_with_errors(buf, len + extra_len);
3715 res.count -= extra_len;
3716 return res;
3717}
3718
3719template <typename InputPtr>
3720#if SIMDUTF_CPLUSPLUS20
3721 requires simdutf::detail::indexes_into_byte_like<InputPtr>
3722#endif
3723simdutf_constexpr23 size_t count_code_points(InputPtr data, size_t len) {
3724 size_t counter{0};
3725 for (size_t i = 0; i < len; i++) {
3726 // -65 is 0b10111111, anything larger in two-complement's should start a new
3727 // code point.
3728 if (int8_t(data[i]) > -65) {
3729 counter++;
3730 }
3731 }
3732 return counter;
3733}
3734
3735template <typename InputPtr>
3736#if SIMDUTF_CPLUSPLUS20
3737 requires simdutf::detail::indexes_into_byte_like<InputPtr>
3738#endif
3739simdutf_constexpr23 size_t utf16_length_from_utf8(InputPtr data, size_t len) {
3740 size_t counter{0};
3741 for (size_t i = 0; i < len; i++) {
3742 if (int8_t(data[i]) > -65) {
3743 counter++;
3744 }
3745 if (uint8_t(data[i]) >= 240) {
3746 counter++;
3747 }
3748 }
3749 return counter;
3750}
3751
3752template <typename InputPtr>
3753#if SIMDUTF_CPLUSPLUS20
3754 requires simdutf::detail::indexes_into_byte_like<InputPtr>
3755#endif
3756simdutf_warn_unused simdutf_constexpr23 size_t
3757trim_partial_utf8(InputPtr input, size_t length) {
3758 if (length < 3) {
3759 switch (length) {
3760 case 2:
3761 if (uint8_t(input[length - 1]) >= 0xc0) {
3762 return length - 1;
3763 } // 2-, 3- and 4-byte characters with only 1 byte left
3764 if (uint8_t(input[length - 2]) >= 0xe0) {
3765 return length - 2;
3766 } // 3- and 4-byte characters with only 2 bytes left
3767 return length;
3768 case 1:
3769 if (uint8_t(input[length - 1]) >= 0xc0) {
3770 return length - 1;
3771 } // 2-, 3- and 4-byte characters with only 1 byte left
3772 return length;
3773 case 0:
3774 return length;
3775 }
3776 }
3777 if (uint8_t(input[length - 1]) >= 0xc0) {
3778 return length - 1;
3779 } // 2-, 3- and 4-byte characters with only 1 byte left
3780 if (uint8_t(input[length - 2]) >= 0xe0) {
3781 return length - 2;
3782 } // 3- and 4-byte characters with only 1 byte left
3783 if (uint8_t(input[length - 3]) >= 0xf0) {
3784 return length - 3;
3785 } // 4-byte characters with only 3 bytes left
3786 return length;
3787}
3788
3789} // namespace utf8
3790} // unnamed namespace
3791} // namespace scalar
3792} // namespace simdutf
3793
3794#endif
3795/* end file include/simdutf/scalar/utf8.h */
3796/* begin file include/simdutf/scalar/utf8_to_latin1/utf8_to_latin1.h */
3797#ifndef SIMDUTF_UTF8_TO_LATIN1_H
3798#define SIMDUTF_UTF8_TO_LATIN1_H
3799
3800namespace simdutf {
3801namespace scalar {
3802namespace {
3803namespace utf8_to_latin1 {
3804
3805template <typename InputPtr, typename OutputPtr>
3806#if SIMDUTF_CPLUSPLUS20
3807 requires(simdutf::detail::indexes_into_byte_like<InputPtr> &&
3808 simdutf::detail::indexes_into_byte_like<OutputPtr>)
3809#endif
3810simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
3811 OutputPtr latin_output) {
3812 size_t pos = 0;
3813 auto start = latin_output;
3814
3815 while (pos < len) {
3816#if SIMDUTF_CPLUSPLUS23
3817 if !consteval
3818#endif
3819 {
3820 // try to convert the next block of 16 ASCII bytes
3821 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
3822 // they are ascii
3823 uint64_t v1;
3824 ::memcpy(&v1, data + pos, sizeof(uint64_t));
3825 uint64_t v2;
3826 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
3827 uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000
3828 // 1000 1000 .... etc
3829 if ((v & 0x8080808080808080) ==
3830 0) { // if NONE of these are set, e.g. all of them are zero, then
3831 // everything is ASCII
3832 size_t final_pos = pos + 16;
3833 while (pos < final_pos) {
3834 *latin_output++ = char(data[pos]);
3835 pos++;
3836 }
3837 continue;
3838 }
3839 }
3840 }
3841
3842 // suppose it is not an all ASCII byte sequence
3843 uint8_t leading_byte = data[pos]; // leading byte
3844 if (leading_byte < 0b10000000) {
3845 // converting one ASCII byte !!!
3846 *latin_output++ = char(leading_byte);
3847 pos++;
3848 } else if ((leading_byte & 0b11100000) ==
3849 0b11000000) { // the first three bits indicate:
3850 // We have a two-byte UTF-8
3851 if (pos + 1 >= len) {
3852 return 0;
3853 } // minimal bound checking
3854 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3855 return 0;
3856 } // checks if the next byte is a valid continuation byte in UTF-8. A
3857 // valid continuation byte starts with 10.
3858 // range check -
3859 uint32_t code_point =
3860 (leading_byte & 0b00011111) << 6 |
3861 (data[pos + 1] &
3862 0b00111111); // assembles the Unicode code point from the two bytes.
3863 // It does this by discarding the leading 110 and 10
3864 // bits from the two bytes, shifting the remaining bits
3865 // of the first byte, and then combining the results
3866 // with a bitwise OR operation.
3867 if (code_point < 0x80 || 0xFF < code_point) {
3868 return 0; // We only care about the range 129-255 which is Non-ASCII
3869 // latin1 characters. A code_point beneath 0x80 is invalid as
3870 // it is already covered by bytes whose leading bit is zero.
3871 }
3872 *latin_output++ = char(code_point);
3873 pos += 2;
3874 } else {
3875 return 0;
3876 }
3877 }
3878 return latin_output - start;
3879}
3880
3881template <typename InputPtr>
3882#if SIMDUTF_CPLUSPLUS20
3883 requires simdutf::detail::indexes_into_byte_like<InputPtr>
3884#endif
3885simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len,
3886 char *latin_output) {
3887 size_t pos = 0;
3888 char *start{latin_output};
3889
3890 while (pos < len) {
3891#if SIMDUTF_CPLUSPLUS23
3892 if !consteval
3893#endif
3894 {
3895 // try to convert the next block of 16 ASCII bytes
3896 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
3897 // they are ascii
3898 uint64_t v1;
3899 ::memcpy(&v1, data + pos, sizeof(uint64_t));
3900 uint64_t v2;
3901 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
3902 uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000
3903 // 1000 1000...etc
3904 if ((v & 0x8080808080808080) ==
3905 0) { // if NONE of these are set, e.g. all of them are zero, then
3906 // everything is ASCII
3907 size_t final_pos = pos + 16;
3908 while (pos < final_pos) {
3909 *latin_output++ = char(data[pos]);
3910 pos++;
3911 }
3912 continue;
3913 }
3914 }
3915 }
3916 // suppose it is not an all ASCII byte sequence
3917 uint8_t leading_byte = data[pos]; // leading byte
3918 if (leading_byte < 0b10000000) {
3919 // converting one ASCII byte !!!
3920 *latin_output++ = char(leading_byte);
3921 pos++;
3922 } else if ((leading_byte & 0b11100000) ==
3923 0b11000000) { // the first three bits indicate:
3924 // We have a two-byte UTF-8
3925 if (pos + 1 >= len) {
3926 return result(error_code::TOO_SHORT, pos);
3927 } // minimal bound checking
3928 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
3929 return result(error_code::TOO_SHORT, pos);
3930 } // checks if the next byte is a valid continuation byte in UTF-8. A
3931 // valid continuation byte starts with 10.
3932 // range check -
3933 uint32_t code_point =
3934 (leading_byte & 0b00011111) << 6 |
3935 (data[pos + 1] &
3936 0b00111111); // assembles the Unicode code point from the two bytes.
3937 // It does this by discarding the leading 110 and 10
3938 // bits from the two bytes, shifting the remaining bits
3939 // of the first byte, and then combining the results
3940 // with a bitwise OR operation.
3941 if (code_point < 0x80) {
3942 return result(error_code::OVERLONG, pos);
3943 }
3944 if (0xFF < code_point) {
3945 return result(error_code::TOO_LARGE, pos);
3946 } // We only care about the range 129-255 which is Non-ASCII latin1
3947 // characters
3948 *latin_output++ = char(code_point);
3949 pos += 2;
3950 } else if ((leading_byte & 0b11110000) == 0b11100000) {
3951 // We have a three-byte UTF-8
3952 return result(error_code::TOO_LARGE, pos);
3953 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
3954 // we have a 4-byte UTF-8 word.
3955 return result(error_code::TOO_LARGE, pos);
3956 } else {
3957 // we either have too many continuation bytes or an invalid leading byte
3958 if ((leading_byte & 0b11000000) == 0b10000000) {
3959 return result(error_code::TOO_LONG, pos);
3960 }
3961
3962 return result(error_code::HEADER_BITS, pos);
3963 }
3964 }
3965 return result(error_code::SUCCESS, latin_output - start);
3966}
3967
3968inline result rewind_and_convert_with_errors(size_t prior_bytes,
3969 const char *buf, size_t len,
3970 char *latin1_output) {
3971 size_t extra_len{0};
3972 // We potentially need to go back in time and find a leading byte.
3973 // In theory '3' would be sufficient, but sometimes the error can go back
3974 // quite far.
3975 size_t how_far_back = prior_bytes;
3976 // size_t how_far_back = 3; // 3 bytes in the past + current position
3977 // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
3978 bool found_leading_bytes{false};
3979 // important: it is i <= how_far_back and not 'i < how_far_back'.
3980 for (size_t i = 0; i <= how_far_back; i++) {
3981 unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
3982 found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
3983 if (found_leading_bytes) {
3984 if (i > 0 && byte < 128) {
3985 // If we had to go back and the leading byte is ascii
3986 // then we can stop right away.
3987 return result(error_code::TOO_LONG, 0 - i + 1);
3988 }
3989 buf -= i;
3990 extra_len = i;
3991 break;
3992 }
3993 }
3994 //
3995 // It is possible for this function to return a negative count in its result.
3996 // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described
3997 // in C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t as an
3998 // unsigned integral type of the result of the sizeof operator
3999 //
4000 // An unsigned type will simply wrap round arithmetically (well defined).
4001 //
4002 if (!found_leading_bytes) {
4003 // If how_far_back == 3, we may have four consecutive continuation bytes!!!
4004 // [....] [continuation] [continuation] [continuation] | [buf is
4005 // continuation] Or we possibly have a stream that does not start with a
4006 // leading byte.
4007 return result(error_code::TOO_LONG, 0 - how_far_back);
4008 }
4009 result res = convert_with_errors(buf, len + extra_len, latin1_output);
4010 if (res.error) {
4011 res.count -= extra_len;
4012 }
4013 return res;
4014}
4015
4016} // namespace utf8_to_latin1
4017} // unnamed namespace
4018} // namespace scalar
4019} // namespace simdutf
4020
4021#endif
4022/* end file include/simdutf/scalar/utf8_to_latin1/utf8_to_latin1.h */
4023/* begin file include/simdutf/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */
4024#ifndef SIMDUTF_VALID_UTF8_TO_LATIN1_H
4025#define SIMDUTF_VALID_UTF8_TO_LATIN1_H
4026
4027namespace simdutf {
4028namespace scalar {
4029namespace {
4030namespace utf8_to_latin1 {
4031
4032template <typename InputPtr>
4033#if SIMDUTF_CPLUSPLUS20
4034 requires simdutf::detail::indexes_into_byte_like<InputPtr>
4035#endif
4036simdutf_constexpr23 size_t convert_valid(InputPtr data, size_t len,
4037 char *latin_output) {
4038
4039 size_t pos = 0;
4040 char *start{latin_output};
4041
4042 while (pos < len) {
4043#if SIMDUTF_CPLUSPLUS23
4044 if !consteval
4045#endif
4046 {
4047 // try to convert the next block of 16 ASCII bytes
4048 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
4049 // they are ascii
4050 uint64_t v1;
4051 ::memcpy(&v1, data + pos, sizeof(uint64_t));
4052 uint64_t v2;
4053 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
4054 uint64_t v{v1 |
4055 v2}; // We are only interested in these bits: 1000 1000 1000
4056 // 1000, so it makes sense to concatenate everything
4057 if ((v & 0x8080808080808080) ==
4058 0) { // if NONE of these are set, e.g. all of them are zero, then
4059 // everything is ASCII
4060 size_t final_pos = pos + 16;
4061 while (pos < final_pos) {
4062 *latin_output++ = uint8_t(data[pos]);
4063 pos++;
4064 }
4065 continue;
4066 }
4067 }
4068 }
4069
4070 // suppose it is not an all ASCII byte sequence
4071 auto leading_byte = uint8_t(data[pos]); // leading byte
4072 if (leading_byte < 0b10000000) {
4073 // converting one ASCII byte !!!
4074 *latin_output++ = char(leading_byte);
4075 pos++;
4076 } else if ((leading_byte & 0b11100000) ==
4077 0b11000000) { // the first three bits indicate:
4078 // We have a two-byte UTF-8
4079 if (pos + 1 >= len) {
4080 break;
4081 } // minimal bound checking
4082 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4083 return 0;
4084 } // checks if the next byte is a valid continuation byte in UTF-8. A
4085 // valid continuation byte starts with 10.
4086 // range check -
4087 uint32_t code_point =
4088 (leading_byte & 0b00011111) << 6 |
4089 (uint8_t(data[pos + 1]) &
4090 0b00111111); // assembles the Unicode code point from the two bytes.
4091 // It does this by discarding the leading 110 and 10
4092 // bits from the two bytes, shifting the remaining bits
4093 // of the first byte, and then combining the results
4094 // with a bitwise OR operation.
4095 *latin_output++ = char(code_point);
4096 pos += 2;
4097 } else {
4098 // we may have a continuation but we do not do error checking
4099 return 0;
4100 }
4101 }
4102 return latin_output - start;
4103}
4104
4105} // namespace utf8_to_latin1
4106} // unnamed namespace
4107} // namespace scalar
4108} // namespace simdutf
4109
4110#endif
4111/* end file include/simdutf/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */
4112/* begin file include/simdutf/scalar/utf8_to_utf16/utf8_to_utf16.h */
4113#ifndef SIMDUTF_UTF8_TO_UTF16_H
4114#define SIMDUTF_UTF8_TO_UTF16_H
4115
4116namespace simdutf {
4117namespace scalar {
4118namespace {
4119namespace utf8_to_utf16 {
4120
4121template <endianness big_endian, typename InputPtr>
4122#if SIMDUTF_CPLUSPLUS20
4123 requires simdutf::detail::indexes_into_byte_like<InputPtr>
4124#endif
4125simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
4126 char16_t *utf16_output) {
4127 size_t pos = 0;
4128 char16_t *start{utf16_output};
4129 while (pos < len) {
4130#if SIMDUTF_CPLUSPLUS23
4131 if !consteval
4132#endif
4133 // try to convert the next block of 16 ASCII bytes
4134 {
4135 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
4136 // they are ascii
4137 uint64_t v1;
4138 ::memcpy(&v1, data + pos, sizeof(uint64_t));
4139 uint64_t v2;
4140 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
4141 uint64_t v{v1 | v2};
4142 if ((v & 0x8080808080808080) == 0) {
4143 size_t final_pos = pos + 16;
4144 while (pos < final_pos) {
4145 *utf16_output++ = !match_system(big_endian)
4146 ? char16_t(u16_swap_bytes(data[pos]))
4147 : char16_t(data[pos]);
4148 pos++;
4149 }
4150 continue;
4151 }
4152 }
4153 }
4154
4155 uint8_t leading_byte = data[pos]; // leading byte
4156 if (leading_byte < 0b10000000) {
4157 // converting one ASCII byte !!!
4158 *utf16_output++ = !match_system(big_endian)
4159 ? char16_t(u16_swap_bytes(leading_byte))
4160 : char16_t(leading_byte);
4161 pos++;
4162 } else if ((leading_byte & 0b11100000) == 0b11000000) {
4163 // We have a two-byte UTF-8, it should become
4164 // a single UTF-16 word.
4165 if (pos + 1 >= len) {
4166 return 0;
4167 } // minimal bound checking
4168 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
4169 return 0;
4170 }
4171 // range check
4172 uint32_t code_point =
4173 (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
4174 if (code_point < 0x80 || 0x7ff < code_point) {
4175 return 0;
4176 }
4177 if simdutf_constexpr (!match_system(big_endian)) {
4178 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
4179 }
4180 *utf16_output++ = char16_t(code_point);
4181 pos += 2;
4182 } else if ((leading_byte & 0b11110000) == 0b11100000) {
4183 // We have a three-byte UTF-8, it should become
4184 // a single UTF-16 word.
4185 if (pos + 2 >= len) {
4186 return 0;
4187 } // minimal bound checking
4188
4189 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
4190 return 0;
4191 }
4192 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
4193 return 0;
4194 }
4195 // range check
4196 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
4197 (data[pos + 1] & 0b00111111) << 6 |
4198 (data[pos + 2] & 0b00111111);
4199 if (code_point < 0x800 || 0xffff < code_point ||
4200 (0xd7ff < code_point && code_point < 0xe000)) {
4201 return 0;
4202 }
4203 if simdutf_constexpr (!match_system(big_endian)) {
4204 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
4205 }
4206 *utf16_output++ = char16_t(code_point);
4207 pos += 3;
4208 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
4209 // we have a 4-byte UTF-8 word.
4210 if (pos + 3 >= len) {
4211 return 0;
4212 } // minimal bound checking
4213 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
4214 return 0;
4215 }
4216 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
4217 return 0;
4218 }
4219 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
4220 return 0;
4221 }
4222
4223 // range check
4224 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
4225 (data[pos + 1] & 0b00111111) << 12 |
4226 (data[pos + 2] & 0b00111111) << 6 |
4227 (data[pos + 3] & 0b00111111);
4228 if (code_point <= 0xffff || 0x10ffff < code_point) {
4229 return 0;
4230 }
4231 code_point -= 0x10000;
4232 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
4233 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
4234 if simdutf_constexpr (!match_system(big_endian)) {
4235 high_surrogate = u16_swap_bytes(high_surrogate);
4236 low_surrogate = u16_swap_bytes(low_surrogate);
4237 }
4238 *utf16_output++ = char16_t(high_surrogate);
4239 *utf16_output++ = char16_t(low_surrogate);
4240 pos += 4;
4241 } else {
4242 return 0;
4243 }
4244 }
4245 return utf16_output - start;
4246}
4247
4248template <endianness big_endian, typename InputPtr>
4249#if SIMDUTF_CPLUSPLUS20
4250 requires simdutf::detail::indexes_into_byte_like<InputPtr>
4251#endif
4252simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len,
4253 char16_t *utf16_output) {
4254 size_t pos = 0;
4255 char16_t *start{utf16_output};
4256 while (pos < len) {
4257#if SIMDUTF_CPLUSPLUS23
4258 if !consteval
4259#endif
4260 {
4261 // try to convert the next block of 16 ASCII bytes
4262 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
4263 // they are ascii
4264 uint64_t v1;
4265 ::memcpy(&v1, data + pos, sizeof(uint64_t));
4266 uint64_t v2;
4267 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
4268 uint64_t v{v1 | v2};
4269 if ((v & 0x8080808080808080) == 0) {
4270 size_t final_pos = pos + 16;
4271 while (pos < final_pos) {
4272 const char16_t byte = uint8_t(data[pos]);
4273 *utf16_output++ =
4274 !match_system(big_endian) ? u16_swap_bytes(byte) : byte;
4275 pos++;
4276 }
4277 continue;
4278 }
4279 }
4280 }
4281
4282 auto leading_byte = uint8_t(data[pos]); // leading byte
4283 if (leading_byte < 0b10000000) {
4284 // converting one ASCII byte !!!
4285 *utf16_output++ = !match_system(big_endian)
4286 ? char16_t(u16_swap_bytes(leading_byte))
4287 : char16_t(leading_byte);
4288 pos++;
4289 } else if ((leading_byte & 0b11100000) == 0b11000000) {
4290 // We have a two-byte UTF-8, it should become
4291 // a single UTF-16 word.
4292 if (pos + 1 >= len) {
4293 return result(error_code::TOO_SHORT, pos);
4294 } // minimal bound checking
4295 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4296 return result(error_code::TOO_SHORT, pos);
4297 }
4298 // range check
4299 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
4300 (uint8_t(data[pos + 1]) & 0b00111111);
4301 if (code_point < 0x80 || 0x7ff < code_point) {
4302 return result(error_code::OVERLONG, pos);
4303 }
4304 if simdutf_constexpr (!match_system(big_endian)) {
4305 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
4306 }
4307 *utf16_output++ = char16_t(code_point);
4308 pos += 2;
4309 } else if ((leading_byte & 0b11110000) == 0b11100000) {
4310 // We have a three-byte UTF-8, it should become
4311 // a single UTF-16 word.
4312 if (pos + 2 >= len) {
4313 return result(error_code::TOO_SHORT, pos);
4314 } // minimal bound checking
4315
4316 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4317 return result(error_code::TOO_SHORT, pos);
4318 }
4319 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
4320 return result(error_code::TOO_SHORT, pos);
4321 }
4322 // range check
4323 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
4324 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
4325 (uint8_t(data[pos + 2]) & 0b00111111);
4326 if ((code_point < 0x800) || (0xffff < code_point)) {
4327 return result(error_code::OVERLONG, pos);
4328 }
4329 if (0xd7ff < code_point && code_point < 0xe000) {
4330 return result(error_code::SURROGATE, pos);
4331 }
4332 if simdutf_constexpr (!match_system(big_endian)) {
4333 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
4334 }
4335 *utf16_output++ = char16_t(code_point);
4336 pos += 3;
4337 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
4338 // we have a 4-byte UTF-8 word.
4339 if (pos + 3 >= len) {
4340 return result(error_code::TOO_SHORT, pos);
4341 } // minimal bound checking
4342 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4343 return result(error_code::TOO_SHORT, pos);
4344 }
4345 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
4346 return result(error_code::TOO_SHORT, pos);
4347 }
4348 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
4349 return result(error_code::TOO_SHORT, pos);
4350 }
4351
4352 // range check
4353 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
4354 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
4355 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
4356 (uint8_t(data[pos + 3]) & 0b00111111);
4357 if (code_point <= 0xffff) {
4358 return result(error_code::OVERLONG, pos);
4359 }
4360 if (0x10ffff < code_point) {
4361 return result(error_code::TOO_LARGE, pos);
4362 }
4363 code_point -= 0x10000;
4364 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
4365 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
4366 if simdutf_constexpr (!match_system(big_endian)) {
4367 high_surrogate = u16_swap_bytes(high_surrogate);
4368 low_surrogate = u16_swap_bytes(low_surrogate);
4369 }
4370 *utf16_output++ = char16_t(high_surrogate);
4371 *utf16_output++ = char16_t(low_surrogate);
4372 pos += 4;
4373 } else {
4374 // we either have too many continuation bytes or an invalid leading byte
4375 if ((leading_byte & 0b11000000) == 0b10000000) {
4376 return result(error_code::TOO_LONG, pos);
4377 } else {
4378 return result(error_code::HEADER_BITS, pos);
4379 }
4380 }
4381 }
4382 return result(error_code::SUCCESS, utf16_output - start);
4383}
4384
4400template <endianness endian>
4401inline result rewind_and_convert_with_errors(size_t prior_bytes,
4402 const char *buf, size_t len,
4403 char16_t *utf16_output) {
4404 size_t extra_len{0};
4405 // We potentially need to go back in time and find a leading byte.
4406 // In theory '3' would be sufficient, but sometimes the error can go back
4407 // quite far.
4408 size_t how_far_back = prior_bytes;
4409 // size_t how_far_back = 3; // 3 bytes in the past + current position
4410 // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
4411 bool found_leading_bytes{false};
4412 // important: it is i <= how_far_back and not 'i < how_far_back'.
4413 for (size_t i = 0; i <= how_far_back; i++) {
4414 unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
4415 found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
4416 if (found_leading_bytes) {
4417 if (i > 0 && byte < 128) {
4418 // If we had to go back and the leading byte is ascii
4419 // then we can stop right away.
4420 return result(error_code::TOO_LONG, 0 - i + 1);
4421 }
4422 buf -= i;
4423 extra_len = i;
4424 break;
4425 }
4426 }
4427 //
4428 // It is possible for this function to return a negative count in its result.
4429 // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described
4430 // in C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t as an
4431 // unsigned integral type of the result of the sizeof operator
4432 //
4433 // An unsigned type will simply wrap round arithmetically (well defined).
4434 //
4435 if (!found_leading_bytes) {
4436 // If how_far_back == 3, we may have four consecutive continuation bytes!!!
4437 // [....] [continuation] [continuation] [continuation] | [buf is
4438 // continuation] Or we possibly have a stream that does not start with a
4439 // leading byte.
4440 return result(error_code::TOO_LONG, 0 - how_far_back);
4441 }
4442 result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
4443 if (res.error) {
4444 res.count -= extra_len;
4445 }
4446 return res;
4447}
4448
4449} // namespace utf8_to_utf16
4450} // unnamed namespace
4451} // namespace scalar
4452} // namespace simdutf
4453
4454#endif
4455/* end file include/simdutf/scalar/utf8_to_utf16/utf8_to_utf16.h */
4456/* begin file include/simdutf/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
4457#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H
4458#define SIMDUTF_VALID_UTF8_TO_UTF16_H
4459
4460namespace simdutf {
4461namespace scalar {
4462namespace {
4463namespace utf8_to_utf16 {
4464
4465template <endianness big_endian, typename InputPtr>
4466#if SIMDUTF_CPLUSPLUS20
4467 requires simdutf::detail::indexes_into_byte_like<InputPtr>
4468#endif
4469simdutf_constexpr23 size_t convert_valid(InputPtr data, size_t len,
4470 char16_t *utf16_output) {
4471 size_t pos = 0;
4472 char16_t *start{utf16_output};
4473 while (pos < len) {
4474#if SIMDUTF_CPLUSPLUS23
4475 if !consteval
4476#endif
4477 { // try to convert the next block of 8 ASCII bytes
4478 if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that
4479 // they are ascii
4480 uint64_t v;
4481 ::memcpy(&v, data + pos, sizeof(uint64_t));
4482 if ((v & 0x8080808080808080) == 0) {
4483 size_t final_pos = pos + 8;
4484 while (pos < final_pos) {
4485 const char16_t byte = uint8_t(data[pos]);
4486 *utf16_output++ =
4487 !match_system(big_endian) ? u16_swap_bytes(byte) : byte;
4488 pos++;
4489 }
4490 continue;
4491 }
4492 }
4493 }
4494
4495 auto leading_byte = uint8_t(data[pos]); // leading byte
4496 if (leading_byte < 0b10000000) {
4497 // converting one ASCII byte !!!
4498 *utf16_output++ = !match_system(big_endian)
4499 ? char16_t(u16_swap_bytes(leading_byte))
4500 : char16_t(leading_byte);
4501 pos++;
4502 } else if ((leading_byte & 0b11100000) == 0b11000000) {
4503 // We have a two-byte UTF-8, it should become
4504 // a single UTF-16 word.
4505 if (pos + 1 >= len) {
4506 break;
4507 } // minimal bound checking
4508 uint16_t code_point = uint16_t(((leading_byte & 0b00011111) << 6) |
4509 (uint8_t(data[pos + 1]) & 0b00111111));
4510 if simdutf_constexpr (!match_system(big_endian)) {
4511 code_point = u16_swap_bytes(uint16_t(code_point));
4512 }
4513 *utf16_output++ = char16_t(code_point);
4514 pos += 2;
4515 } else if ((leading_byte & 0b11110000) == 0b11100000) {
4516 // We have a three-byte UTF-8, it should become
4517 // a single UTF-16 word.
4518 if (pos + 2 >= len) {
4519 break;
4520 } // minimal bound checking
4521 uint16_t code_point =
4522 uint16_t(((leading_byte & 0b00001111) << 12) |
4523 ((uint8_t(data[pos + 1]) & 0b00111111) << 6) |
4524 (uint8_t(data[pos + 2]) & 0b00111111));
4525 if simdutf_constexpr (!match_system(big_endian)) {
4526 code_point = u16_swap_bytes(uint16_t(code_point));
4527 }
4528 *utf16_output++ = char16_t(code_point);
4529 pos += 3;
4530 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
4531 // we have a 4-byte UTF-8 word.
4532 if (pos + 3 >= len) {
4533 break;
4534 } // minimal bound checking
4535 uint32_t code_point = ((leading_byte & 0b00000111) << 18) |
4536 ((uint8_t(data[pos + 1]) & 0b00111111) << 12) |
4537 ((uint8_t(data[pos + 2]) & 0b00111111) << 6) |
4538 (uint8_t(data[pos + 3]) & 0b00111111);
4539 code_point -= 0x10000;
4540 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
4541 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
4542 if simdutf_constexpr (!match_system(big_endian)) {
4543 high_surrogate = u16_swap_bytes(high_surrogate);
4544 low_surrogate = u16_swap_bytes(low_surrogate);
4545 }
4546 *utf16_output++ = char16_t(high_surrogate);
4547 *utf16_output++ = char16_t(low_surrogate);
4548 pos += 4;
4549 } else {
4550 // we may have a continuation but we do not do error checking
4551 return 0;
4552 }
4553 }
4554 return utf16_output - start;
4555}
4556
4557} // namespace utf8_to_utf16
4558} // unnamed namespace
4559} // namespace scalar
4560} // namespace simdutf
4561
4562#endif
4563/* end file include/simdutf/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
4564/* begin file include/simdutf/scalar/utf8_to_utf32/utf8_to_utf32.h */
4565#ifndef SIMDUTF_UTF8_TO_UTF32_H
4566#define SIMDUTF_UTF8_TO_UTF32_H
4567
4568namespace simdutf {
4569namespace scalar {
4570namespace {
4571namespace utf8_to_utf32 {
4572
4573template <typename InputPtr>
4574#if SIMDUTF_CPLUSPLUS20
4575 requires simdutf::detail::indexes_into_byte_like<InputPtr>
4576#endif
4577simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
4578 char32_t *utf32_output) {
4579 size_t pos = 0;
4580 char32_t *start{utf32_output};
4581 while (pos < len) {
4582#if SIMDUTF_CPLUSPLUS23
4583 if !consteval
4584#endif
4585 {
4586 // try to convert the next block of 16 ASCII bytes
4587 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
4588 // they are ascii
4589 uint64_t v1;
4590 ::memcpy(&v1, data + pos, sizeof(uint64_t));
4591 uint64_t v2;
4592 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
4593 uint64_t v{v1 | v2};
4594 if ((v & 0x8080808080808080) == 0) {
4595 size_t final_pos = pos + 16;
4596 while (pos < final_pos) {
4597 *utf32_output++ = uint8_t(data[pos]);
4598 pos++;
4599 }
4600 continue;
4601 }
4602 }
4603 }
4604 auto leading_byte = uint8_t(data[pos]); // leading byte
4605 if (leading_byte < 0b10000000) {
4606 // converting one ASCII byte !!!
4607 *utf32_output++ = char32_t(leading_byte);
4608 pos++;
4609 } else if ((leading_byte & 0b11100000) == 0b11000000) {
4610 // We have a two-byte UTF-8
4611 if (pos + 1 >= len) {
4612 return 0;
4613 } // minimal bound checking
4614 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
4615 return 0;
4616 }
4617 // range check
4618 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
4619 (uint8_t(data[pos + 1]) & 0b00111111);
4620 if (code_point < 0x80 || 0x7ff < code_point) {
4621 return 0;
4622 }
4623 *utf32_output++ = char32_t(code_point);
4624 pos += 2;
4625 } else if ((leading_byte & 0b11110000) == 0b11100000) {
4626 // We have a three-byte UTF-8
4627 if (pos + 2 >= len) {
4628 return 0;
4629 } // minimal bound checking
4630
4631 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4632 return 0;
4633 }
4634 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
4635 return 0;
4636 }
4637 // range check
4638 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
4639 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
4640 (uint8_t(data[pos + 2]) & 0b00111111);
4641 if (code_point < 0x800 || 0xffff < code_point ||
4642 (0xd7ff < code_point && code_point < 0xe000)) {
4643 return 0;
4644 }
4645 *utf32_output++ = char32_t(code_point);
4646 pos += 3;
4647 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
4648 // we have a 4-byte UTF-8 word.
4649 if (pos + 3 >= len) {
4650 return 0;
4651 } // minimal bound checking
4652 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4653 return 0;
4654 }
4655 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
4656 return 0;
4657 }
4658 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
4659 return 0;
4660 }
4661
4662 // range check
4663 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
4664 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
4665 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
4666 (uint8_t(data[pos + 3]) & 0b00111111);
4667 if (code_point <= 0xffff || 0x10ffff < code_point) {
4668 return 0;
4669 }
4670 *utf32_output++ = char32_t(code_point);
4671 pos += 4;
4672 } else {
4673 return 0;
4674 }
4675 }
4676 return utf32_output - start;
4677}
4678
4679template <typename InputPtr>
4680#if SIMDUTF_CPLUSPLUS20
4681 requires simdutf::detail::indexes_into_byte_like<InputPtr>
4682#endif
4683simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len,
4684 char32_t *utf32_output) {
4685 size_t pos = 0;
4686 char32_t *start{utf32_output};
4687 while (pos < len) {
4688#if SIMDUTF_CPLUSPLUS23
4689 if !consteval
4690#endif
4691 {
4692 // try to convert the next block of 16 ASCII bytes
4693 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
4694 // they are ascii
4695 uint64_t v1;
4696 ::memcpy(&v1, data + pos, sizeof(uint64_t));
4697 uint64_t v2;
4698 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
4699 uint64_t v{v1 | v2};
4700 if ((v & 0x8080808080808080) == 0) {
4701 size_t final_pos = pos + 16;
4702 while (pos < final_pos) {
4703 *utf32_output++ = uint8_t(data[pos]);
4704 pos++;
4705 }
4706 continue;
4707 }
4708 }
4709 }
4710 auto leading_byte = uint8_t(data[pos]); // leading byte
4711 if (leading_byte < 0b10000000) {
4712 // converting one ASCII byte !!!
4713 *utf32_output++ = char32_t(leading_byte);
4714 pos++;
4715 } else if ((leading_byte & 0b11100000) == 0b11000000) {
4716 // We have a two-byte UTF-8
4717 if (pos + 1 >= len) {
4718 return result(error_code::TOO_SHORT, pos);
4719 } // minimal bound checking
4720 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4721 return result(error_code::TOO_SHORT, pos);
4722 }
4723 // range check
4724 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
4725 (uint8_t(data[pos + 1]) & 0b00111111);
4726 if (code_point < 0x80 || 0x7ff < code_point) {
4727 return result(error_code::OVERLONG, pos);
4728 }
4729 *utf32_output++ = char32_t(code_point);
4730 pos += 2;
4731 } else if ((leading_byte & 0b11110000) == 0b11100000) {
4732 // We have a three-byte UTF-8
4733 if (pos + 2 >= len) {
4734 return result(error_code::TOO_SHORT, pos);
4735 } // minimal bound checking
4736
4737 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4738 return result(error_code::TOO_SHORT, pos);
4739 }
4740 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
4741 return result(error_code::TOO_SHORT, pos);
4742 }
4743 // range check
4744 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
4745 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
4746 (uint8_t(data[pos + 2]) & 0b00111111);
4747 if (code_point < 0x800 || 0xffff < code_point) {
4748 return result(error_code::OVERLONG, pos);
4749 }
4750 if (0xd7ff < code_point && code_point < 0xe000) {
4751 return result(error_code::SURROGATE, pos);
4752 }
4753 *utf32_output++ = char32_t(code_point);
4754 pos += 3;
4755 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
4756 // we have a 4-byte UTF-8 word.
4757 if (pos + 3 >= len) {
4758 return result(error_code::TOO_SHORT, pos);
4759 } // minimal bound checking
4760 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
4761 return result(error_code::TOO_SHORT, pos);
4762 }
4763 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
4764 return result(error_code::TOO_SHORT, pos);
4765 }
4766 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
4767 return result(error_code::TOO_SHORT, pos);
4768 }
4769
4770 // range check
4771 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
4772 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
4773 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
4774 (uint8_t(data[pos + 3]) & 0b00111111);
4775 if (code_point <= 0xffff) {
4776 return result(error_code::OVERLONG, pos);
4777 }
4778 if (0x10ffff < code_point) {
4779 return result(error_code::TOO_LARGE, pos);
4780 }
4781 *utf32_output++ = char32_t(code_point);
4782 pos += 4;
4783 } else {
4784 // we either have too many continuation bytes or an invalid leading byte
4785 if ((leading_byte & 0b11000000) == 0b10000000) {
4786 return result(error_code::TOO_LONG, pos);
4787 } else {
4788 return result(error_code::HEADER_BITS, pos);
4789 }
4790 }
4791 }
4792 return result(error_code::SUCCESS, utf32_output - start);
4793}
4794
4810inline result rewind_and_convert_with_errors(size_t prior_bytes,
4811 const char *buf, size_t len,
4812 char32_t *utf32_output) {
4813 size_t extra_len{0};
4814 // We potentially need to go back in time and find a leading byte.
4815 size_t how_far_back = 3; // 3 bytes in the past + current position
4816 if (how_far_back > prior_bytes) {
4817 how_far_back = prior_bytes;
4818 }
4819 bool found_leading_bytes{false};
4820 // important: it is i <= how_far_back and not 'i < how_far_back'.
4821 for (size_t i = 0; i <= how_far_back; i++) {
4822 unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
4823 found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
4824 if (found_leading_bytes) {
4825 if (i > 0 && byte < 128) {
4826 // If we had to go back and the leading byte is ascii
4827 // then we can stop right away.
4828 return result(error_code::TOO_LONG, 0 - i + 1);
4829 }
4830 buf -= i;
4831 extra_len = i;
4832 break;
4833 }
4834 }
4835 //
4836 // It is possible for this function to return a negative count in its result.
4837 // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described
4838 // in C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t as an
4839 // unsigned integral type of the result of the sizeof operator
4840 //
4841 // An unsigned type will simply wrap round arithmetically (well defined).
4842 //
4843 if (!found_leading_bytes) {
4844 // If how_far_back == 3, we may have four consecutive continuation bytes!!!
4845 // [....] [continuation] [continuation] [continuation] | [buf is
4846 // continuation] Or we possibly have a stream that does not start with a
4847 // leading byte.
4848 return result(error_code::TOO_LONG, 0 - how_far_back);
4849 }
4850
4851 result res = convert_with_errors(buf, len + extra_len, utf32_output);
4852 if (res.error) {
4853 res.count -= extra_len;
4854 }
4855 return res;
4856}
4857
4858} // namespace utf8_to_utf32
4859} // unnamed namespace
4860} // namespace scalar
4861} // namespace simdutf
4862
4863#endif
4864/* end file include/simdutf/scalar/utf8_to_utf32/utf8_to_utf32.h */
4865/* begin file include/simdutf/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
4866#ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H
4867#define SIMDUTF_VALID_UTF8_TO_UTF32_H
4868
4869namespace simdutf {
4870namespace scalar {
4871namespace {
4872namespace utf8_to_utf32 {
4873
4874template <typename InputPtr>
4875#if SIMDUTF_CPLUSPLUS20
4876 requires simdutf::detail::indexes_into_byte_like<InputPtr>
4877#endif
4878simdutf_constexpr23 size_t convert_valid(InputPtr data, size_t len,
4879 char32_t *utf32_output) {
4880 size_t pos = 0;
4881 char32_t *start{utf32_output};
4882 while (pos < len) {
4883#if SIMDUTF_CPLUSPLUS23
4884 if !consteval
4885#endif
4886 {
4887 // try to convert the next block of 8 ASCII bytes
4888 if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that
4889 // they are ascii
4890 uint64_t v;
4891 ::memcpy(&v, data + pos, sizeof(uint64_t));
4892 if ((v & 0x8080808080808080) == 0) {
4893 size_t final_pos = pos + 8;
4894 while (pos < final_pos) {
4895 *utf32_output++ = uint8_t(data[pos]);
4896 pos++;
4897 }
4898 continue;
4899 }
4900 }
4901 }
4902 auto leading_byte = uint8_t(data[pos]); // leading byte
4903 if (leading_byte < 0b10000000) {
4904 // converting one ASCII byte !!!
4905 *utf32_output++ = char32_t(leading_byte);
4906 pos++;
4907 } else if ((leading_byte & 0b11100000) == 0b11000000) {
4908 // We have a two-byte UTF-8
4909 if (pos + 1 >= len) {
4910 break;
4911 } // minimal bound checking
4912 *utf32_output++ = char32_t(((leading_byte & 0b00011111) << 6) |
4913 (uint8_t(data[pos + 1]) & 0b00111111));
4914 pos += 2;
4915 } else if ((leading_byte & 0b11110000) == 0b11100000) {
4916 // We have a three-byte UTF-8
4917 if (pos + 2 >= len) {
4918 break;
4919 } // minimal bound checking
4920 *utf32_output++ = char32_t(((leading_byte & 0b00001111) << 12) |
4921 ((uint8_t(data[pos + 1]) & 0b00111111) << 6) |
4922 (uint8_t(data[pos + 2]) & 0b00111111));
4923 pos += 3;
4924 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
4925 // we have a 4-byte UTF-8 word.
4926 if (pos + 3 >= len) {
4927 break;
4928 } // minimal bound checking
4929 uint32_t code_word = ((leading_byte & 0b00000111) << 18) |
4930 ((uint8_t(data[pos + 1]) & 0b00111111) << 12) |
4931 ((uint8_t(data[pos + 2]) & 0b00111111) << 6) |
4932 (uint8_t(data[pos + 3]) & 0b00111111);
4933 *utf32_output++ = char32_t(code_word);
4934 pos += 4;
4935 } else {
4936 // we may have a continuation but we do not do error checking
4937 return 0;
4938 }
4939 }
4940 return utf32_output - start;
4941}
4942
4943} // namespace utf8_to_utf32
4944} // unnamed namespace
4945} // namespace scalar
4946} // namespace simdutf
4947
4948#endif
4949/* end file include/simdutf/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
4950
4951namespace simdutf {
4952
// Compile-time default line length of 76.
// NOTE(review): the API that consumes this constant is not visible in this
// section - presumably a line-wrapping encoder; confirm against its users.
4953constexpr size_t default_line_length =
4954 76;
4955
// Pointer overload of validate_utf8: declared here only, its definition is
// not part of this section. Presumably reports whether the `len` bytes at
// `buf` are valid UTF-8 - confirm against the implementation.
4967simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
4968 #if SIMDUTF_SPAN
// Span overload (compiled only when SIMDUTF_SPAN is set): accepts any input
// that satisfies detail::input_span_of_byte_like.
4969simdutf_constexpr23 simdutf_really_inline simdutf_warn_unused bool
4970validate_utf8(const detail::input_span_of_byte_like auto &input) noexcept {
4971 #if SIMDUTF_CPLUSPLUS23
// During constant evaluation (C++23) use the scalar validator on a
// constexpr-castable pointer.
4972 if consteval {
4973 return scalar::utf8::validate(
4974 detail::constexpr_cast_ptr<uint8_t>(input.data()), input.size());
4975 } else
4976 #endif
// At run time (and always before C++23) forward to the pointer overload.
4977 {
4978 return validate_utf8(reinterpret_cast<const char *>(input.data()),
4979 input.size());
4980 }
4981}
4982 #endif // SIMDUTF_SPAN
4983
// Pointer overload of validate_utf8_with_errors: declared here only, its
// definition is not part of this section. It returns a `result`; the meaning
// of its fields is defined elsewhere.
4996simdutf_warn_unused result validate_utf8_with_errors(const char *buf,
4997 size_t len) noexcept;
4998 #if SIMDUTF_SPAN
// Span overload (compiled only when SIMDUTF_SPAN is set): accepts any input
// that satisfies detail::input_span_of_byte_like.
4999simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused result
5000validate_utf8_with_errors(
5001 const detail::input_span_of_byte_like auto &input) noexcept {
5002 #if SIMDUTF_CPLUSPLUS23
// During constant evaluation (C++23) use the scalar validator on a
// constexpr-castable pointer.
5003 if consteval {
5004 return scalar::utf8::validate_with_errors(
5005 detail::constexpr_cast_ptr<uint8_t>(input.data()), input.size());
5006 } else
5007 #endif
// At run time (and always before C++23) forward to the pointer overload.
5008 {
5009 return validate_utf8_with_errors(
5010 reinterpret_cast<const char *>(input.data()), input.size());
5011 }
5012}
5013 #endif // SIMDUTF_SPAN
5014
// Pointer overload of validate_utf16: declared here only, its definition is
// not part of this section. `len` is presumably a count of char16_t code
// units - confirm against the implementation.
5029simdutf_warn_unused bool validate_utf16(const char16_t *buf,
5030 size_t len) noexcept;
5031 #if SIMDUTF_SPAN
// Span overload (compiled only when SIMDUTF_SPAN is set).
5032simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
5033validate_utf16(std::span<const char16_t> input) noexcept {
5034 #if SIMDUTF_CPLUSPLUS23
// During constant evaluation (C++23) use the scalar validator instantiated
// for endianness::NATIVE.
5035 if consteval {
5036 return scalar::utf16::validate<endianness::NATIVE>(input.data(),
5037 input.size());
5038 } else
5039 #endif
// At run time (and always before C++23) forward to the pointer overload.
5040 {
5041 return validate_utf16(input.data(), input.size());
5042 }
5043}
5044 #endif // SIMDUTF_SPAN
5045
// Pointer overload of validate_utf16le: declared here only, its definition is
// not part of this section.
5060simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
5061 size_t len) noexcept;
5062 #if SIMDUTF_SPAN
// Span overload (compiled only when SIMDUTF_SPAN is set).
5063simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused bool
5064validate_utf16le(std::span<const char16_t> input) noexcept {
5065 #if SIMDUTF_CPLUSPLUS23
// During constant evaluation (C++23) use the scalar validator instantiated
// for endianness::LITTLE.
5066 if consteval {
5067 return scalar::utf16::validate<endianness::LITTLE>(input.data(),
5068 input.size());
5069 } else
5070 #endif
// At run time (and always before C++23) forward to the pointer overload.
5071 {
5072 return validate_utf16le(input.data(), input.size());
5073 }
5074}
5075 #endif // SIMDUTF_SPAN
5076
// Pointer overload of validate_utf16be: declared here only, its definition is
// not part of this section.
5091simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
5092 size_t len) noexcept;
5093 #if SIMDUTF_SPAN
// Span overload (compiled only when SIMDUTF_SPAN is set).
5094simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
5095validate_utf16be(std::span<const char16_t> input) noexcept {
5096 #if SIMDUTF_CPLUSPLUS23
// During constant evaluation (C++23) use the scalar validator instantiated
// for endianness::BIG.
5097 if consteval {
5098 return scalar::utf16::validate<endianness::BIG>(input.data(), input.size());
5099 } else
5100 #endif
// At run time (and always before C++23) forward to the pointer overload.
5101 {
5102 return validate_utf16be(input.data(), input.size());
5103 }
5104}
5105 #endif // SIMDUTF_SPAN
5106
// Pointer overload of validate_utf16_with_errors: declared here only, its
// definition is not part of this section. It returns a `result`; the meaning
// of its fields is defined elsewhere.
5124simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf,
5125 size_t len) noexcept;
5126 #if SIMDUTF_SPAN
// Span overload (compiled only when SIMDUTF_SPAN is set).
5127simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5128validate_utf16_with_errors(std::span<const char16_t> input) noexcept {
5129 #if SIMDUTF_CPLUSPLUS23
// During constant evaluation (C++23) use the scalar validator instantiated
// for endianness::NATIVE.
5130 if consteval {
5131 return scalar::utf16::validate_with_errors<endianness::NATIVE>(
5132 input.data(), input.size());
5133 } else
5134 #endif
// At run time (and always before C++23) forward to the pointer overload.
5135 {
5136 return validate_utf16_with_errors(input.data(), input.size());
5137 }
5138}
5139 #endif // SIMDUTF_SPAN
5140
// Pointer overload of validate_utf16le_with_errors: declared here only, its
// definition is not part of this section.
5157simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf,
5158 size_t len) noexcept;
5159 #if SIMDUTF_SPAN
// Span overload (compiled only when SIMDUTF_SPAN is set).
5160simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5161validate_utf16le_with_errors(std::span<const char16_t> input) noexcept {
5162 #if SIMDUTF_CPLUSPLUS23
// During constant evaluation (C++23) use the scalar validator instantiated
// for endianness::LITTLE.
5163 if consteval {
5164 return scalar::utf16::validate_with_errors<endianness::LITTLE>(
5165 input.data(), input.size());
5166 } else
5167 #endif
// At run time (and always before C++23) forward to the pointer overload.
5168 {
5169 return validate_utf16le_with_errors(input.data(), input.size());
5170 }
5171}
5172 #endif // SIMDUTF_SPAN
5173
// Pointer/length form of validate_utf16be_with_errors; returns a `result`.
// Only declared here; the definition is not part of this section.
5190simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf,
5191 size_t len) noexcept;
5192 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation (`if consteval`) it calls
// scalar::utf16::validate_with_errors<endianness::BIG>; otherwise it
// forwards input.data() and input.size() to the pointer/length form above.
5193simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5194validate_utf16be_with_errors(std::span<const char16_t> input) noexcept {
5195 #if SIMDUTF_CPLUSPLUS23
5196 if consteval {
5197 return scalar::utf16::validate_with_errors<endianness::BIG>(input.data(),
5198 input.size());
5199 } else
5200 #endif
5201 {
5202 return validate_utf16be_with_errors(input.data(), input.size());
5203 }
5204}
5205 #endif // SIMDUTF_SPAN
5206
// Pointer/length form of to_well_formed_utf16le: reads `len` units from
// `input` and writes through `output`. Only declared here.
5219void to_well_formed_utf16le(const char16_t *input, size_t len,
5220 char16_t *output) noexcept;
5221 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::to_well_formed_utf16<endianness::LITTLE>; otherwise it
// forwards to the pointer/length form above. Only output.data() is used:
// output.size() is never compared against input.size() here.
5222simdutf_really_inline simdutf_constexpr23 void
5223to_well_formed_utf16le(std::span<const char16_t> input,
5224 std::span<char16_t> output) noexcept {
5225 #if SIMDUTF_CPLUSPLUS23
5226 if consteval {
5227 scalar::utf16::to_well_formed_utf16<endianness::LITTLE>(
5228 input.data(), input.size(), output.data());
5229 } else
5230 #endif
5231 {
5232 to_well_formed_utf16le(input.data(), input.size(), output.data());
5233 }
5234}
5235 #endif // SIMDUTF_SPAN
5236
// Pointer/length form of to_well_formed_utf16be: reads `len` units from
// `input` and writes through `output`. Only declared here.
5249void to_well_formed_utf16be(const char16_t *input, size_t len,
5250 char16_t *output) noexcept;
5251 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::to_well_formed_utf16<endianness::BIG>; otherwise it
// forwards to the pointer/length form above. Only output.data() is used:
// output.size() is never compared against input.size() here.
5252simdutf_really_inline simdutf_constexpr23 void
5253to_well_formed_utf16be(std::span<const char16_t> input,
5254 std::span<char16_t> output) noexcept {
5255 #if SIMDUTF_CPLUSPLUS23
5256 if consteval {
5257 scalar::utf16::to_well_formed_utf16<endianness::BIG>(
5258 input.data(), input.size(), output.data());
5259 } else
5260 #endif
5261 {
5262 to_well_formed_utf16be(input.data(), input.size(), output.data());
5263 }
5264}
5265 #endif // SIMDUTF_SPAN
5266
// Pointer/length form of to_well_formed_utf16: reads `len` units from
// `input` and writes through `output`. Only declared here.
5279void to_well_formed_utf16(const char16_t *input, size_t len,
5280 char16_t *output) noexcept;
5281 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::to_well_formed_utf16<endianness::NATIVE>; otherwise it
// forwards to the pointer/length form above. Only output.data() is used:
// output.size() is never compared against input.size() here.
5282simdutf_really_inline simdutf_constexpr23 void
5283to_well_formed_utf16(std::span<const char16_t> input,
5284 std::span<char16_t> output) noexcept {
5285 #if SIMDUTF_CPLUSPLUS23
5286 if consteval {
5287 scalar::utf16::to_well_formed_utf16<endianness::NATIVE>(
5288 input.data(), input.size(), output.data());
5289 } else
5290 #endif
5291 {
5292 to_well_formed_utf16(input.data(), input.size(), output.data());
5293 }
5294}
5295 #endif // SIMDUTF_SPAN
5296
// Pointer/length form of convert_utf8_to_utf16; returns a size_t. Only
// declared here; the definition is not part of this section.
5310simdutf_warn_unused size_t convert_utf8_to_utf16(
5311 const char *input, size_t length, char16_t *utf16_output) noexcept;
5312 #if SIMDUTF_SPAN
// Span form accepting any detail::input_span_of_byte_like input. In a C++23
// constant evaluation it calls
// scalar::utf8_to_utf16::convert<endianness::NATIVE> with input.data()
// uncast (reinterpret_cast is not allowed in constant expressions). At run
// time the data pointer is reinterpret_cast to `const char *` and passed to
// the pointer/length form above. output.size() is not checked here.
5313simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
5314convert_utf8_to_utf16(const detail::input_span_of_byte_like auto &input,
5315 std::span<char16_t> output) noexcept {
5316 #if SIMDUTF_CPLUSPLUS23
5317 if consteval {
5318 return scalar::utf8_to_utf16::convert<endianness::NATIVE>(
5319 input.data(), input.size(), output.data());
5320 } else
5321 #endif
5322 {
5323 return convert_utf8_to_utf16(reinterpret_cast<const char *>(input.data()),
5324 input.size(), output.data());
5325 }
5326}
5327 #endif // SIMDUTF_SPAN
5328
// Pointer/length form of utf8_length_from_utf16le_with_replacement; returns
// a `result`. Only declared here.
5346simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
5347 const char16_t *input, size_t length) noexcept;
5348 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::utf8_length_from_utf16_with_replacement<endianness::LITTLE>;
// otherwise it forwards data()/size() to the pointer/length form above.
5349simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused result
5350utf8_length_from_utf16le_with_replacement(
5351 std::span<const char16_t> valid_utf16_input) noexcept {
5352 #if SIMDUTF_CPLUSPLUS23
5353 if consteval {
5354 return scalar::utf16::utf8_length_from_utf16_with_replacement<
5355 endianness::LITTLE>(valid_utf16_input.data(), valid_utf16_input.size());
5356 } else
5357 #endif
5358 {
5359 return utf8_length_from_utf16le_with_replacement(valid_utf16_input.data(),
5360 valid_utf16_input.size());
5361 }
5362}
5363 #endif // SIMDUTF_SPAN
5364
// Pointer/length form of utf8_length_from_utf16be_with_replacement; returns
// a `result`. Only declared here.
5382simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
5383 const char16_t *input, size_t length) noexcept;
5384 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::utf8_length_from_utf16_with_replacement<endianness::BIG>;
// otherwise it forwards data()/size() to the pointer/length form above.
5385simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5386utf8_length_from_utf16be_with_replacement(
5387 std::span<const char16_t> valid_utf16_input) noexcept {
5388 #if SIMDUTF_CPLUSPLUS23
5389 if consteval {
5390 return scalar::utf16::utf8_length_from_utf16_with_replacement<
5391 endianness::BIG>(valid_utf16_input.data(), valid_utf16_input.size());
5392 } else
5393 #endif
5394 {
5395 return utf8_length_from_utf16be_with_replacement(valid_utf16_input.data(),
5396 valid_utf16_input.size());
5397 }
5398}
5399 #endif // SIMDUTF_SPAN
5400
// Pointer/length form of convert_utf8_to_utf16le; returns a size_t. Only
// declared here; the definition is not part of this section.
5413simdutf_warn_unused size_t convert_utf8_to_utf16le(
5414 const char *input, size_t length, char16_t *utf16_output) noexcept;
5415 #if SIMDUTF_SPAN
// Span form accepting any detail::input_span_of_byte_like input. In a C++23
// constant evaluation it calls
// scalar::utf8_to_utf16::convert<endianness::LITTLE> with data() uncast
// (reinterpret_cast is not allowed in constant expressions). At run time
// the data pointer is reinterpret_cast to `const char *` for the
// pointer/length form above. utf16_output.size() is not checked here.
5416simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
5417convert_utf8_to_utf16le(const detail::input_span_of_byte_like auto &utf8_input,
5418 std::span<char16_t> utf16_output) noexcept {
5419 #if SIMDUTF_CPLUSPLUS23
5420 if consteval {
5421 return scalar::utf8_to_utf16::convert<endianness::LITTLE>(
5422 utf8_input.data(), utf8_input.size(), utf16_output.data());
5423 } else
5424 #endif
5425 {
5426 return convert_utf8_to_utf16le(
5427 reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
5428 utf16_output.data());
5429 }
5430}
5431 #endif // SIMDUTF_SPAN
5432
// Pointer/length form of convert_utf8_to_utf16be; returns a size_t. Only
// declared here; the definition is not part of this section.
5445simdutf_warn_unused size_t convert_utf8_to_utf16be(
5446 const char *input, size_t length, char16_t *utf16_output) noexcept;
5447 #if SIMDUTF_SPAN
// Span form accepting any detail::input_span_of_byte_like input. In a C++23
// constant evaluation it calls
// scalar::utf8_to_utf16::convert<endianness::BIG> with data() uncast
// (reinterpret_cast is not allowed in constant expressions). At run time
// the data pointer is reinterpret_cast to `const char *` for the
// pointer/length form above. utf16_output.size() is not checked here.
5448simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
5449convert_utf8_to_utf16be(const detail::input_span_of_byte_like auto &utf8_input,
5450 std::span<char16_t> utf16_output) noexcept {
5451
5452 #if SIMDUTF_CPLUSPLUS23
5453 if consteval {
5454 return scalar::utf8_to_utf16::convert<endianness::BIG>(
5455 utf8_input.data(), utf8_input.size(), utf16_output.data());
5456 } else
5457 #endif
5458 {
5459 return convert_utf8_to_utf16be(
5460 reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
5461 utf16_output.data());
5462 }
5463}
5464 #endif // SIMDUTF_SPAN
5465
// Pointer/length form of convert_utf8_to_utf16_with_errors; returns a
// `result`. Only declared here.
5481simdutf_warn_unused result convert_utf8_to_utf16_with_errors(
5482 const char *input, size_t length, char16_t *utf16_output) noexcept;
5483 #if SIMDUTF_SPAN
// Span form accepting any detail::input_span_of_byte_like input. In a C++23
// constant evaluation it calls
// scalar::utf8_to_utf16::convert_with_errors<endianness::NATIVE> with
// data() uncast. At run time the data pointer is reinterpret_cast to
// `const char *` for the pointer/length form above. utf16_output.size() is
// not checked here.
5484simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5485convert_utf8_to_utf16_with_errors(
5486 const detail::input_span_of_byte_like auto &utf8_input,
5487 std::span<char16_t> utf16_output) noexcept {
5488 #if SIMDUTF_CPLUSPLUS23
5489 if consteval {
5490 return scalar::utf8_to_utf16::convert_with_errors<endianness::NATIVE>(
5491 utf8_input.data(), utf8_input.size(), utf16_output.data());
5492 } else
5493 #endif
5494 {
5495 return convert_utf8_to_utf16_with_errors(
5496 reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
5497 utf16_output.data());
5498 }
5499}
5500 #endif // SIMDUTF_SPAN
5501
// Pointer/length form of convert_utf8_to_utf16le_with_errors; returns a
// `result`. Only declared here.
5516simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
5517 const char *input, size_t length, char16_t *utf16_output) noexcept;
5518 #if SIMDUTF_SPAN
// Span form accepting any detail::input_span_of_byte_like input. In a C++23
// constant evaluation it calls
// scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE> with
// data() uncast. At run time the data pointer is reinterpret_cast to
// `const char *` for the pointer/length form above. utf16_output.size() is
// not checked here.
5519simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5520convert_utf8_to_utf16le_with_errors(
5521 const detail::input_span_of_byte_like auto &utf8_input,
5522 std::span<char16_t> utf16_output) noexcept {
5523 #if SIMDUTF_CPLUSPLUS23
5524 if consteval {
5525 return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(
5526 utf8_input.data(), utf8_input.size(), utf16_output.data());
5527 } else
5528 #endif
5529 {
5530 return convert_utf8_to_utf16le_with_errors(
5531 reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
5532 utf16_output.data());
5533 }
5534}
5535 #endif // SIMDUTF_SPAN
5536
// Pointer/length form of convert_utf8_to_utf16be_with_errors; returns a
// `result`. Only declared here.
5551simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
5552 const char *input, size_t length, char16_t *utf16_output) noexcept;
5553 #if SIMDUTF_SPAN
// Span form accepting any detail::input_span_of_byte_like input. In a C++23
// constant evaluation it calls
// scalar::utf8_to_utf16::convert_with_errors<endianness::BIG> with data()
// uncast. At run time the data pointer is reinterpret_cast to
// `const char *` for the pointer/length form above. utf16_output.size() is
// not checked here.
5554simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5555convert_utf8_to_utf16be_with_errors(
5556 const detail::input_span_of_byte_like auto &utf8_input,
5557 std::span<char16_t> utf16_output) noexcept {
5558 #if SIMDUTF_CPLUSPLUS23
5559 if consteval {
5560 return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(
5561 utf8_input.data(), utf8_input.size(), utf16_output.data());
5562 } else
5563 #endif
5564 {
5565 return convert_utf8_to_utf16be_with_errors(
5566 reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
5567 utf16_output.data());
5568 }
5569}
5570 #endif // SIMDUTF_SPAN
5571
// Pointer/length form of convert_valid_utf8_to_utf16; returns a size_t.
// Only declared here; the definition is not part of this section.
5582simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
5583 const char *input, size_t length, char16_t *utf16_buffer) noexcept;
5584 #if SIMDUTF_SPAN
// Span form accepting any detail::input_span_of_byte_like input. In a C++23
// constant evaluation it calls
// scalar::utf8_to_utf16::convert_valid<endianness::NATIVE> with data()
// uncast. At run time the data pointer is reinterpret_cast to
// `const char *` for the pointer/length form above. utf16_output.size() is
// not checked here.
5585simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
5586convert_valid_utf8_to_utf16(
5587 const detail::input_span_of_byte_like auto &valid_utf8_input,
5588 std::span<char16_t> utf16_output) noexcept {
5589 #if SIMDUTF_CPLUSPLUS23
5590 if consteval {
5591 return scalar::utf8_to_utf16::convert_valid<endianness::NATIVE>(
5592 valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data());
5593 } else
5594 #endif
5595 {
5596 return convert_valid_utf8_to_utf16(
5597 reinterpret_cast<const char *>(valid_utf8_input.data()),
5598 valid_utf8_input.size(), utf16_output.data());
5599 }
5600}
5601 #endif // SIMDUTF_SPAN
5602
// Pointer/length form of convert_valid_utf8_to_utf16le; returns a size_t.
// Only declared here; the definition is not part of this section.
5613simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
5614 const char *input, size_t length, char16_t *utf16_buffer) noexcept;
5615 #if SIMDUTF_SPAN
// Span form accepting any detail::input_span_of_byte_like input. In a C++23
// constant evaluation it calls
// scalar::utf8_to_utf16::convert_valid<endianness::LITTLE> with data()
// uncast. At run time the data pointer is reinterpret_cast to
// `const char *` for the pointer/length form above. utf16_output.size() is
// not checked here.
5616simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
5617convert_valid_utf8_to_utf16le(
5618 const detail::input_span_of_byte_like auto &valid_utf8_input,
5619 std::span<char16_t> utf16_output) noexcept {
5620
5621 #if SIMDUTF_CPLUSPLUS23
5622 if consteval {
5623 return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
5624 valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data());
5625 } else
5626 #endif
5627 {
5628 return convert_valid_utf8_to_utf16le(
5629 reinterpret_cast<const char *>(valid_utf8_input.data()),
5630 valid_utf8_input.size(), utf16_output.data());
5631 }
5632}
5633 #endif // SIMDUTF_SPAN
5634
// Pointer/length form of convert_valid_utf8_to_utf16be; returns a size_t.
// Only declared here; the definition is not part of this section.
5645simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
5646 const char *input, size_t length, char16_t *utf16_buffer) noexcept;
5647 #if SIMDUTF_SPAN
// Span form accepting any detail::input_span_of_byte_like input. In a C++23
// constant evaluation it calls
// scalar::utf8_to_utf16::convert_valid<endianness::BIG> with data()
// uncast. At run time the data pointer is reinterpret_cast to
// `const char *` for the pointer/length form above. utf16_output.size() is
// not checked here.
5648simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
5649convert_valid_utf8_to_utf16be(
5650 const detail::input_span_of_byte_like auto &valid_utf8_input,
5651 std::span<char16_t> utf16_output) noexcept {
5652 #if SIMDUTF_CPLUSPLUS23
5653 if consteval {
5654 return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
5655 valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data());
5656 } else
5657 #endif
5658 {
5659 return convert_valid_utf8_to_utf16be(
5660 reinterpret_cast<const char *>(valid_utf8_input.data()),
5661 valid_utf8_input.size(), utf16_output.data());
5662 }
5663}
5664 #endif // SIMDUTF_SPAN
5665
// Pointer/length form of utf16_length_from_utf8; returns a size_t. Only
// declared here; the definition is not part of this section.
5680simdutf_warn_unused size_t utf16_length_from_utf8(const char *input,
5681 size_t length) noexcept;
5682 #if SIMDUTF_SPAN
// Span form accepting any detail::input_span_of_byte_like input. In a C++23
// constant evaluation it calls scalar::utf8::utf16_length_from_utf8 with
// data() uncast. At run time the data pointer is reinterpret_cast to
// `const char *` for the pointer/length form above.
5683simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
5684utf16_length_from_utf8(
5685 const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
5686 #if SIMDUTF_CPLUSPLUS23
5687 if consteval {
5688 return scalar::utf8::utf16_length_from_utf8(valid_utf8_input.data(),
5689 valid_utf8_input.size());
5690 } else
5691 #endif
5692 {
5693 return utf16_length_from_utf8(
5694 reinterpret_cast<const char *>(valid_utf8_input.data()),
5695 valid_utf8_input.size());
5696 }
5697}
5698 #endif // SIMDUTF_SPAN
5699
// Pointer/length form of convert_utf16_to_utf8; returns a size_t. Only
// declared here; the definition is not part of this section.
5715simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *input,
5716 size_t length,
5717 char *utf8_buffer) noexcept;
5718 #if SIMDUTF_SPAN
// Span form; the output is any detail::output_span_of_byte_like, taken by
// forwarding reference. In a C++23 constant evaluation it calls
// scalar::utf16_to_utf8::convert<endianness::NATIVE> with
// utf8_output.data() uncast. At run time that pointer is reinterpret_cast
// to `char *` for the pointer/length form above. utf8_output.size() is not
// checked here.
5719simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
5720convert_utf16_to_utf8(
5721 std::span<const char16_t> utf16_input,
5722 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
5723 #if SIMDUTF_CPLUSPLUS23
5724 if consteval {
5725 return scalar::utf16_to_utf8::convert<endianness::NATIVE>(
5726 utf16_input.data(), utf16_input.size(), utf8_output.data());
5727 } else
5728 #endif
5729 {
5730 return convert_utf16_to_utf8(utf16_input.data(), utf16_input.size(),
5731 reinterpret_cast<char *>(utf8_output.data()));
5732 }
5733}
5734 #endif // SIMDUTF_SPAN
5735
// Pointer/length form of convert_utf16_to_utf8_safe. Unlike the other
// converters it also takes the output capacity (`utf8_len`). Only declared
// here; the definition is not part of this section.
5754simdutf_warn_unused size_t convert_utf16_to_utf8_safe(const char16_t *input,
5755 size_t length,
5756 char *utf8_output,
5757 size_t utf8_len) noexcept;
5758 #if SIMDUTF_SPAN
// Span form: utf8_output.size() is passed along as the output capacity.
// In a C++23 constant evaluation it runs
// scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE, true> and
// returns 0 for any error other than SUCCESS or OUTPUT_BUFFER_TOO_SMALL,
// otherwise r.output_count. At run time it forwards to the pointer/length
// form above.
5759simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
5760convert_utf16_to_utf8_safe(
5761 std::span<const char16_t> utf16_input,
5762 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
5763 // Implementation note: utf8_output is a forwarding reference so that it is
5764 // never copied and binds to both lvalues and rvalues. Copying a std::span
5765 // is harmless, but copying a std::vector is not, and this function must
5766 // accept both. An owning rvalue (for example a temporary std::string) is
5767 // therefore accepted as output, but the caller will quickly find that
5768 // there is then no way to get the written data back out of that object.
5769 #if SIMDUTF_CPLUSPLUS23
5770 if consteval {
5771 const full_result r =
5772 scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE, true>(
5773 utf16_input.data(), utf16_input.size(), utf8_output.data(),
5774 utf8_output.size());
5775 if (r.error != error_code::SUCCESS &&
5776 r.error != error_code::OUTPUT_BUFFER_TOO_SMALL) {
5777 return 0;
5778 }
5779 return r.output_count;
5780 } else
5781 #endif
5782 {
5783 return convert_utf16_to_utf8_safe(
5784 utf16_input.data(), utf16_input.size(),
5785 reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
5786 }
5787}
5788 #endif // SIMDUTF_SPAN
5789
// Pointer/length form of convert_utf16le_to_utf8; returns a size_t. Only
// declared here; the definition is not part of this section.
5804simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *input,
5805 size_t length,
5806 char *utf8_buffer) noexcept;
5807 #if SIMDUTF_SPAN
// Span form; the output is any detail::output_span_of_byte_like. In a C++23
// constant evaluation it calls
// scalar::utf16_to_utf8::convert<endianness::LITTLE> with
// utf8_output.data() uncast. At run time that pointer is reinterpret_cast
// to `char *` for the pointer/length form above. utf8_output.size() is not
// checked here.
5808simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
5809convert_utf16le_to_utf8(
5810 std::span<const char16_t> utf16_input,
5811 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
5812 #if SIMDUTF_CPLUSPLUS23
5813 if consteval {
5814 return scalar::utf16_to_utf8::convert<endianness::LITTLE>(
5815 utf16_input.data(), utf16_input.size(), utf8_output.data());
5816 } else
5817 #endif
5818 {
5819 return convert_utf16le_to_utf8(
5820 utf16_input.data(), utf16_input.size(),
5821 reinterpret_cast<char *>(utf8_output.data()));
5822 }
5823}
5824 #endif // SIMDUTF_SPAN
5825
// Pointer/length form of convert_utf16be_to_utf8; returns a size_t. Only
// declared here; the definition is not part of this section.
5840simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *input,
5841 size_t length,
5842 char *utf8_buffer) noexcept;
5843 #if SIMDUTF_SPAN
// Span form; the output is any detail::output_span_of_byte_like. In a C++23
// constant evaluation it calls
// scalar::utf16_to_utf8::convert<endianness::BIG> with utf8_output.data()
// uncast. At run time that pointer is reinterpret_cast to `char *` for the
// pointer/length form above. utf8_output.size() is not checked here.
5844simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
5845convert_utf16be_to_utf8(
5846 std::span<const char16_t> utf16_input,
5847 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
5848 #if SIMDUTF_CPLUSPLUS23
5849 if consteval {
5850 return scalar::utf16_to_utf8::convert<endianness::BIG>(
5851 utf16_input.data(), utf16_input.size(), utf8_output.data());
5852 } else
5853 #endif
5854 {
5855 return convert_utf16be_to_utf8(
5856 utf16_input.data(), utf16_input.size(),
5857 reinterpret_cast<char *>(utf8_output.data()));
5858 }
5859}
5860 #endif // SIMDUTF_SPAN
5861
// Pointer/length form of convert_utf16_to_utf8_with_errors; returns a
// `result`. Only declared here.
5879simdutf_warn_unused result convert_utf16_to_utf8_with_errors(
5880 const char16_t *input, size_t length, char *utf8_buffer) noexcept;
5881 #if SIMDUTF_SPAN
// Span form; the output is any detail::output_span_of_byte_like. In a C++23
// constant evaluation it calls
// scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE> with
// utf8_output.data() uncast. At run time that pointer is reinterpret_cast
// to `char *` for the pointer/length form above. utf8_output.size() is not
// checked here.
5882simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5883convert_utf16_to_utf8_with_errors(
5884 std::span<const char16_t> utf16_input,
5885 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
5886 #if SIMDUTF_CPLUSPLUS23
5887 if consteval {
5888 return scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE>(
5889 utf16_input.data(), utf16_input.size(), utf8_output.data());
5890 } else
5891 #endif
5892 {
5893 return convert_utf16_to_utf8_with_errors(
5894 utf16_input.data(), utf16_input.size(),
5895 reinterpret_cast<char *>(utf8_output.data()));
5896 }
5897}
5898 #endif // SIMDUTF_SPAN
5899
// Pointer/length form of convert_utf16le_to_utf8_with_errors; returns a
// `result`. Only declared here.
5916simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
5917 const char16_t *input, size_t length, char *utf8_buffer) noexcept;
5918 #if SIMDUTF_SPAN
// Span form; the output is any detail::output_span_of_byte_like. In a C++23
// constant evaluation it calls
// scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE> with
// utf8_output.data() uncast. At run time that pointer is reinterpret_cast
// to `char *` for the pointer/length form above. utf8_output.size() is not
// checked here.
5919simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5920convert_utf16le_to_utf8_with_errors(
5921 std::span<const char16_t> utf16_input,
5922 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
5923 #if SIMDUTF_CPLUSPLUS23
5924 if consteval {
5925 return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
5926 utf16_input.data(), utf16_input.size(), utf8_output.data());
5927 } else
5928 #endif
5929 {
5930 return convert_utf16le_to_utf8_with_errors(
5931 utf16_input.data(), utf16_input.size(),
5932 reinterpret_cast<char *>(utf8_output.data()));
5933 }
5934}
5935 #endif // SIMDUTF_SPAN
5936
// Pointer/length form of convert_utf16be_to_utf8_with_errors; returns a
// `result`. Only declared here.
5953simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
5954 const char16_t *input, size_t length, char *utf8_buffer) noexcept;
5955 #if SIMDUTF_SPAN
// Span form; the output is any detail::output_span_of_byte_like. In a C++23
// constant evaluation it calls
// scalar::utf16_to_utf8::convert_with_errors<endianness::BIG> with
// utf8_output.data() uncast. At run time that pointer is reinterpret_cast
// to `char *` for the pointer/length form above. utf8_output.size() is not
// checked here.
5956simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
5957convert_utf16be_to_utf8_with_errors(
5958 std::span<const char16_t> utf16_input,
5959 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
5960 #if SIMDUTF_CPLUSPLUS23
5961 if consteval {
5962 return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
5963 utf16_input.data(), utf16_input.size(), utf8_output.data());
5964 } else
5965 #endif
5966 {
5967 return convert_utf16be_to_utf8_with_errors(
5968 utf16_input.data(), utf16_input.size(),
5969 reinterpret_cast<char *>(utf8_output.data()));
5970 }
5971}
5972 #endif // SIMDUTF_SPAN
5973
// Pointer/length form of convert_valid_utf16_to_utf8; returns a size_t.
// Only declared here; the definition is not part of this section.
5987simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
5988 const char16_t *input, size_t length, char *utf8_buffer) noexcept;
5989 #if SIMDUTF_SPAN
// Span form; the output is any detail::output_span_of_byte_like. In a C++23
// constant evaluation it calls
// scalar::utf16_to_utf8::convert_valid<endianness::NATIVE> with
// utf8_output.data() uncast. At run time that pointer is reinterpret_cast
// to `char *` for the pointer/length form above. utf8_output.size() is not
// checked here.
5990simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
5991convert_valid_utf16_to_utf8(
5992 std::span<const char16_t> valid_utf16_input,
5993 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
5994 #if SIMDUTF_CPLUSPLUS23
5995 if consteval {
5996 return scalar::utf16_to_utf8::convert_valid<endianness::NATIVE>(
5997 valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data());
5998 } else
5999 #endif
6000 {
6001 return convert_valid_utf16_to_utf8(
6002 valid_utf16_input.data(), valid_utf16_input.size(),
6003 reinterpret_cast<char *>(utf8_output.data()));
6004 }
6005}
6006 #endif // SIMDUTF_SPAN
6007
// Pointer/length form of convert_valid_utf16le_to_utf8; returns a size_t.
// Only declared here; the definition is not part of this section.
6021simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
6022 const char16_t *input, size_t length, char *utf8_buffer) noexcept;
6023 #if SIMDUTF_SPAN
// Span form; the output is any detail::output_span_of_byte_like. In a C++23
// constant evaluation it calls
// scalar::utf16_to_utf8::convert_valid<endianness::LITTLE> with
// utf8_output.data() uncast (reinterpret_cast is not allowed in constant
// expressions). At run time that pointer is reinterpret_cast to `char *`
// for the pointer/length form above. utf8_output.size() is not checked
// here.
6024simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
6025convert_valid_utf16le_to_utf8(
6026 std::span<const char16_t> valid_utf16_input,
6027 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
6028 #if SIMDUTF_CPLUSPLUS23
6029 if consteval {
// The input is explicitly little-endian, so the scalar path must use
// endianness::LITTLE (not NATIVE) to match the run-time branch and the
// other *le/*be overloads, including on big-endian targets.
6030 return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(
6031 valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data());
6032 } else
6033 #endif
6034 {
6035 return convert_valid_utf16le_to_utf8(
6036 valid_utf16_input.data(), valid_utf16_input.size(),
6037 reinterpret_cast<char *>(utf8_output.data()));
6038 }
6039}
6040 #endif // SIMDUTF_SPAN
6041
// Pointer/length form of convert_valid_utf16be_to_utf8; returns a size_t.
// Only declared here; the definition is not part of this section.
6055simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
6056 const char16_t *input, size_t length, char *utf8_buffer) noexcept;
6057 #if SIMDUTF_SPAN
// Span form; the output is any detail::output_span_of_byte_like. In a C++23
// constant evaluation it calls
// scalar::utf16_to_utf8::convert_valid<endianness::BIG> with
// utf8_output.data() uncast. At run time that pointer is reinterpret_cast
// to `char *` for the pointer/length form above. utf8_output.size() is not
// checked here.
6058simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
6059convert_valid_utf16be_to_utf8(
6060 std::span<const char16_t> valid_utf16_input,
6061 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
6062 #if SIMDUTF_CPLUSPLUS23
6063 if consteval {
6064 return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(
6065 valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data());
6066 } else
6067 #endif
6068 {
6069 return convert_valid_utf16be_to_utf8(
6070 valid_utf16_input.data(), valid_utf16_input.size(),
6071 reinterpret_cast<char *>(utf8_output.data()));
6072 }
6073}
6074 #endif // SIMDUTF_SPAN
6075
// Pointer/length form of utf8_length_from_utf16; returns a size_t. Only
// declared here; the definition is not part of this section.
6087simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input,
6088 size_t length) noexcept;
6089 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::utf8_length_from_utf16<endianness::NATIVE>; otherwise it
// forwards data()/size() to the pointer/length form above.
6090simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
6091utf8_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
6092 #if SIMDUTF_CPLUSPLUS23
6093 if consteval {
6094 return scalar::utf16::utf8_length_from_utf16<endianness::NATIVE>(
6095 valid_utf16_input.data(), valid_utf16_input.size());
6096 } else
6097 #endif
6098 {
6099 return utf8_length_from_utf16(valid_utf16_input.data(),
6100 valid_utf16_input.size());
6101 }
6102}
6103 #endif // SIMDUTF_SPAN
6104
// Pointer/length form of utf8_length_from_utf16_with_replacement; returns a
// `result`. Only declared here.
6123simdutf_warn_unused result utf8_length_from_utf16_with_replacement(
6124 const char16_t *input, size_t length) noexcept;
6125 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::utf8_length_from_utf16_with_replacement<endianness::NATIVE>;
// otherwise it forwards data()/size() to the pointer/length form above.
6126simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
6127utf8_length_from_utf16_with_replacement(
6128 std::span<const char16_t> valid_utf16_input) noexcept {
6129 #if SIMDUTF_CPLUSPLUS23
6130 if consteval {
6131 return scalar::utf16::utf8_length_from_utf16_with_replacement<
6132 endianness::NATIVE>(valid_utf16_input.data(), valid_utf16_input.size());
6133 } else
6134 #endif
6135 {
6136 return utf8_length_from_utf16_with_replacement(valid_utf16_input.data(),
6137 valid_utf16_input.size());
6138 }
6139}
6140 #endif // SIMDUTF_SPAN
6141
// Pointer/length form of utf8_length_from_utf16le; returns a size_t. Only
// declared here; the definition is not part of this section.
6153simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input,
6154 size_t length) noexcept;
6155 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>; otherwise it
// forwards data()/size() to the pointer/length form above.
6156simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t
6157utf8_length_from_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
6158 #if SIMDUTF_CPLUSPLUS23
6159 if consteval {
6160 return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(
6161 valid_utf16_input.data(), valid_utf16_input.size());
6162 } else
6163 #endif
6164 {
6165 return utf8_length_from_utf16le(valid_utf16_input.data(),
6166 valid_utf16_input.size());
6167 }
6168}
6169 #endif // SIMDUTF_SPAN
6170
// Pointer/length form of utf8_length_from_utf16be; returns a size_t. Only
// declared here; the definition is not part of this section.
6182simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input,
6183 size_t length) noexcept;
6184 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::utf8_length_from_utf16<endianness::BIG>; otherwise it
// forwards data()/size() to the pointer/length form above.
6185simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
6186utf8_length_from_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
6187 #if SIMDUTF_CPLUSPLUS23
6188 if consteval {
6189 return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(
6190 valid_utf16_input.data(), valid_utf16_input.size());
6191 } else
6192 #endif
6193 {
6194 return utf8_length_from_utf16be(valid_utf16_input.data(),
6195 valid_utf16_input.size());
6196 }
6197}
6198 #endif // SIMDUTF_SPAN
6199
// Pointer/length form of change_endianness_utf16: reads `length` units from
// `input` and writes through `output`. Only declared here.
6213void change_endianness_utf16(const char16_t *input, size_t length,
6214 char16_t *output) noexcept;
6215 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::change_endianness_utf16; otherwise it forwards to the
// pointer/length form above. Both callees return void, so the `return`
// statements only forward a void expression. utf16_output.size() is not
// checked here.
6216simdutf_really_inline simdutf_constexpr23 void
6217change_endianness_utf16(std::span<const char16_t> utf16_input,
6218 std::span<char16_t> utf16_output) noexcept {
6219 #if SIMDUTF_CPLUSPLUS23
6220 if consteval {
6221 return scalar::utf16::change_endianness_utf16(
6222 utf16_input.data(), utf16_input.size(), utf16_output.data());
6223 } else
6224 #endif
6225 {
6226 return change_endianness_utf16(utf16_input.data(), utf16_input.size(),
6227 utf16_output.data());
6228 }
6229}
6230 #endif // SIMDUTF_SPAN
6231
// Pointer/length form of count_utf16; returns a size_t. Only declared here;
// the definition is not part of this section.
6246simdutf_warn_unused size_t count_utf16(const char16_t *input,
6247 size_t length) noexcept;
6248 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::count_code_points<endianness::NATIVE>; otherwise it
// forwards data()/size() to the pointer/length form above.
6249simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
6250count_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
6251 #if SIMDUTF_CPLUSPLUS23
6252 if consteval {
6253 return scalar::utf16::count_code_points<endianness::NATIVE>(
6254 valid_utf16_input.data(), valid_utf16_input.size());
6255 } else
6256 #endif
6257 {
6258 return count_utf16(valid_utf16_input.data(), valid_utf16_input.size());
6259 }
6260}
6261 #endif // SIMDUTF_SPAN
6262
// Pointer/length form of count_utf16le; returns a size_t. Only declared
// here; the definition is not part of this section.
6277simdutf_warn_unused size_t count_utf16le(const char16_t *input,
6278 size_t length) noexcept;
6279 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::count_code_points<endianness::LITTLE>; otherwise it
// forwards data()/size() to the pointer/length form above.
6280simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
6281count_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
6282 #if SIMDUTF_CPLUSPLUS23
6283 if consteval {
6284 return scalar::utf16::count_code_points<endianness::LITTLE>(
6285 valid_utf16_input.data(), valid_utf16_input.size());
6286 } else
6287 #endif
6288 {
6289 return count_utf16le(valid_utf16_input.data(), valid_utf16_input.size());
6290 }
6291}
6292 #endif // SIMDUTF_SPAN
6293
// Pointer/length form of count_utf16be; returns a size_t. Only declared
// here; the definition is not part of this section.
6308simdutf_warn_unused size_t count_utf16be(const char16_t *input,
6309 size_t length) noexcept;
6310 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::count_code_points<endianness::BIG>; otherwise it forwards
// data()/size() to the pointer/length form above.
6311simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
6312count_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
6313 #if SIMDUTF_CPLUSPLUS23
6314 if consteval {
6315 return scalar::utf16::count_code_points<endianness::BIG>(
6316 valid_utf16_input.data(), valid_utf16_input.size());
6317 } else
6318 #endif
6319 {
6320 return count_utf16be(valid_utf16_input.data(), valid_utf16_input.size());
6321 }
6322}
6323 #endif // SIMDUTF_SPAN
6324
// Pointer/length form of count_utf8; returns a size_t. Only declared here;
// the definition is not part of this section.
6337simdutf_warn_unused size_t count_utf8(const char *input,
6338 size_t length) noexcept;
6339 #if SIMDUTF_SPAN
// Span form accepting any detail::input_span_of_byte_like input. In a C++23
// constant evaluation it calls scalar::utf8::count_code_points with data()
// uncast. At run time the data pointer is reinterpret_cast to
// `const char *` for the pointer/length form above.
6340simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t count_utf8(
6341 const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
6342 #if SIMDUTF_CPLUSPLUS23
6343 if consteval {
6344 return scalar::utf8::count_code_points(valid_utf8_input.data(),
6345 valid_utf8_input.size());
6346 } else
6347 #endif
6348 {
6349 return count_utf8(reinterpret_cast<const char *>(valid_utf8_input.data()),
6350 valid_utf8_input.size());
6351 }
6352}
6353 #endif // SIMDUTF_SPAN
6354
// Pointer/length form of trim_partial_utf8; returns a size_t. Only declared
// here. NOTE(review): unlike the span form below, this declaration carries
// no `noexcept`.
6369simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length);
6370 #if SIMDUTF_SPAN
// Span form accepting any detail::input_span_of_byte_like input. In a C++23
// constant evaluation it calls scalar::utf8::trim_partial_utf8 with data()
// uncast. At run time the data pointer is reinterpret_cast to
// `const char *` for the pointer/length form above.
6371simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
6372trim_partial_utf8(
6373 const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
6374 #if SIMDUTF_CPLUSPLUS23
6375 if consteval {
6376 return scalar::utf8::trim_partial_utf8(valid_utf8_input.data(),
6377 valid_utf8_input.size());
6378 } else
6379 #endif
6380 {
6381 return trim_partial_utf8(
6382 reinterpret_cast<const char *>(valid_utf8_input.data()),
6383 valid_utf8_input.size());
6384 }
6385}
6386 #endif // SIMDUTF_SPAN
6387
// Pointer/length form of trim_partial_utf16be; returns a size_t. Only
// declared here. NOTE(review): unlike the span form below, this declaration
// carries no `noexcept`.
6402simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input,
6403 size_t length);
6404 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::trim_partial_utf16<endianness::BIG>; otherwise it forwards
// data()/size() to the pointer/length form above.
6405simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
6406trim_partial_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
6407 #if SIMDUTF_CPLUSPLUS23
6408 if consteval {
6409 return scalar::utf16::trim_partial_utf16<endianness::BIG>(
6410 valid_utf16_input.data(), valid_utf16_input.size());
6411 } else
6412 #endif
6413 {
6414 return trim_partial_utf16be(valid_utf16_input.data(),
6415 valid_utf16_input.size());
6416 }
6417}
6418 #endif // SIMDUTF_SPAN
6419
// Pointer/length form of trim_partial_utf16le; returns a size_t. Only
// declared here. NOTE(review): unlike the span form below, this declaration
// carries no `noexcept`.
6434simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input,
6435 size_t length);
6436 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::trim_partial_utf16<endianness::LITTLE>; otherwise it
// forwards data()/size() to the pointer/length form above.
6437simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
6438trim_partial_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
6439 #if SIMDUTF_CPLUSPLUS23
6440 if consteval {
6441 return scalar::utf16::trim_partial_utf16<endianness::LITTLE>(
6442 valid_utf16_input.data(), valid_utf16_input.size());
6443 } else
6444 #endif
6445 {
6446 return trim_partial_utf16le(valid_utf16_input.data(),
6447 valid_utf16_input.size());
6448 }
6449}
6450 #endif // SIMDUTF_SPAN
6451
// Pointer/length form of trim_partial_utf16; returns a size_t. Only
// declared here. NOTE(review): unlike the span form below, this declaration
// carries no `noexcept`.
6466simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input,
6467 size_t length);
6468 #if SIMDUTF_SPAN
// std::span form. In a C++23 constant evaluation it calls
// scalar::utf16::trim_partial_utf16<endianness::NATIVE>; otherwise it
// forwards data()/size() to the pointer/length form above.
6469simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
6470trim_partial_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
6471 #if SIMDUTF_CPLUSPLUS23
6472 if consteval {
6473 return scalar::utf16::trim_partial_utf16<endianness::NATIVE>(
6474 valid_utf16_input.data(), valid_utf16_input.size());
6475 } else
6476 #endif
6477 {
6478 return trim_partial_utf16(valid_utf16_input.data(),
6479 valid_utf16_input.size());
6480 }
6481}
6482 #endif // SIMDUTF_SPAN
6483
// Default SIMDUTF_NEED_TRAILING_ZEROES to 1 unless it has already been
// defined before this point.
6484 #ifndef SIMDUTF_NEED_TRAILING_ZEROES
6485 #define SIMDUTF_NEED_TRAILING_ZEROES 1
6486 #endif
6487
6496public:
// Returns a std::string built from the stored _name C string.
6506 virtual std::string name() const { return std::string(_name); }
6507
// Returns a std::string built from the stored _description C string.
6517 virtual std::string description() const { return std::string(_description); }
6518
// Non-virtual const query, declared here only (defined elsewhere).
// Presumably reports whether the running CPU offers what this
// implementation needs — confirm against the definition.
6528 bool supported_by_runtime_system() const;
6529
// Virtual accessor returning a uint32_t.
// NOTE(review): this listing shows no statement between the braces (listing
// line 6538 is absent), so the return statement appears to have been elided
// by the extraction — confirm against the real header before editing.
6537 virtual uint32_t required_instruction_sets() const {
6539 }
6540
// Pure virtual (`= 0`): each concrete implementation supplies its own
// validate_utf8 over (buf, len). const and noexcept; returns bool.
6550 simdutf_warn_unused virtual bool validate_utf8(const char *buf,
6551 size_t len) const noexcept = 0;
6552
// Pure virtual: validate_utf8 variant that returns a `result` instead of a
// bool. const and noexcept.
6565 simdutf_warn_unused virtual result
6566 validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
6567
// Pure virtual: per-implementation validate_utf16le over a char16_t buffer
// (buf, len). const and noexcept; returns bool.
6582 simdutf_warn_unused virtual bool
6583 validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
6584
// Pure virtual: per-implementation validate_utf16be over a char16_t buffer
// (buf, len). const and noexcept; returns bool.
6599 simdutf_warn_unused virtual bool
6600 validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
6601
// Pure virtual: validate_utf16le variant that returns a `result` instead of
// a bool. const and noexcept.
6618 simdutf_warn_unused virtual result
6619 validate_utf16le_with_errors(const char16_t *buf,
6620 size_t len) const noexcept = 0;
6621
// Pure virtual: validate_utf16be variant that returns a `result` instead of
// a bool. const and noexcept.
6638 simdutf_warn_unused virtual result
6639 validate_utf16be_with_errors(const char16_t *buf,
6640 size_t len) const noexcept = 0;
// Pure virtual: per-implementation to_well_formed_utf16le; takes
// (input, len) and writes through `output`. Returns nothing.
6653 virtual void to_well_formed_utf16le(const char16_t *input, size_t len,
6654 char16_t *output) const noexcept = 0;
// Pure virtual: per-implementation to_well_formed_utf16be; takes
// (input, len) and writes through `output`. Returns nothing.
6667 virtual void to_well_formed_utf16be(const char16_t *input, size_t len,
6668 char16_t *output) const noexcept = 0;
6669
// Pure virtual: per-implementation convert_utf8_to_utf16le from a char
// buffer (input, length) into utf16_output. Returns a size_t.
6682 simdutf_warn_unused virtual size_t
6683 convert_utf8_to_utf16le(const char *input, size_t length,
6684 char16_t *utf16_output) const noexcept = 0;
6685
// Pure virtual: per-implementation convert_utf8_to_utf16be from a char
// buffer (input, length) into utf16_output. Returns a size_t.
6698 simdutf_warn_unused virtual size_t
6699 convert_utf8_to_utf16be(const char *input, size_t length,
6700 char16_t *utf16_output) const noexcept = 0;
6701
// Pure virtual: convert_utf8_to_utf16le variant that returns a `result`
// instead of a size_t. const and noexcept.
6717 simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(
6718 const char *input, size_t length,
6719 char16_t *utf16_output) const noexcept = 0;
6720
// Pure virtual: convert_utf8_to_utf16be variant that returns a `result`
// instead of a size_t. const and noexcept.
6736 simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(
6737 const char *input, size_t length,
6738 char16_t *utf16_output) const noexcept = 0;
6759 const char16_t *input, size_t length) const noexcept = 0;
6760
6781 const char16_t *input, size_t length) const noexcept = 0;
6782
// Pure virtual: per-implementation convert_valid_utf8_to_utf16le from
// (input, length) into utf16_buffer. Returns a size_t.
6793 simdutf_warn_unused virtual size_t
6794 convert_valid_utf8_to_utf16le(const char *input, size_t length,
6795 char16_t *utf16_buffer) const noexcept = 0;
6796
// Pure virtual: per-implementation convert_valid_utf8_to_utf16be from
// (input, length) into utf16_buffer. Returns a size_t.
6807 simdutf_warn_unused virtual size_t
6808 convert_valid_utf8_to_utf16be(const char *input, size_t length,
6809 char16_t *utf16_buffer) const noexcept = 0;
6810
// Pure virtual: per-implementation utf16_length_from_utf8 over a char
// buffer (input, length). Returns a size_t.
6823 simdutf_warn_unused virtual size_t
6824 utf16_length_from_utf8(const char *input, size_t length) const noexcept = 0;
6825
// Pure virtual: per-implementation convert_utf16le_to_utf8 from a char16_t
// buffer (input, length) into utf8_buffer. Returns a size_t.
6841 simdutf_warn_unused virtual size_t
6842 convert_utf16le_to_utf8(const char16_t *input, size_t length,
6843 char *utf8_buffer) const noexcept = 0;
6844
// Pure virtual: per-implementation convert_utf16be_to_utf8 from a char16_t
// buffer (input, length) into utf8_buffer. Returns a size_t.
6860 simdutf_warn_unused virtual size_t
6861 convert_utf16be_to_utf8(const char16_t *input, size_t length,
6862 char *utf8_buffer) const noexcept = 0;
6863
// Pure virtual: convert_utf16le_to_utf8 variant that returns a `result`
// instead of a size_t. const and noexcept.
6882 simdutf_warn_unused virtual result
6883 convert_utf16le_to_utf8_with_errors(const char16_t *input, size_t length,
6884 char *utf8_buffer) const noexcept = 0;
6885
// Pure virtual: convert_utf16be_to_utf8 variant that returns a `result`
// instead of a size_t. const and noexcept.
6904 simdutf_warn_unused virtual result
6905 convert_utf16be_to_utf8_with_errors(const char16_t *input, size_t length,
6906 char *utf8_buffer) const noexcept = 0;
6907
// Pure virtual: per-implementation convert_valid_utf16le_to_utf8 from
// (input, length) into utf8_buffer. Returns a size_t.
6922 simdutf_warn_unused virtual size_t
6923 convert_valid_utf16le_to_utf8(const char16_t *input, size_t length,
6924 char *utf8_buffer) const noexcept = 0;
6925
// Pure virtual, `const noexcept`; same parameter list and `size_t` return
// type as convert_valid_utf16le_to_utf8.
// NOTE(review): presumably the big-endian-input variant operating on input
// that is assumed valid -- confirm the precondition upstream.
6940 simdutf_warn_unused virtual size_t
6941 convert_valid_utf16be_to_utf8(const char16_t *input, size_t length,
6942 char *utf8_buffer) const noexcept = 0;
6943
// Pure virtual, `const noexcept`. Takes a `const char16_t *` buffer and its
// `length` (no output buffer) and returns a `size_t`.
// NOTE(review): presumably the number of UTF-8 bytes needed to encode the
// UTF-16LE input; treatment of unpaired surrogates is not visible here.
6958 simdutf_warn_unused virtual size_t
6959 utf8_length_from_utf16le(const char16_t *input,
6960 size_t length) const noexcept = 0;
6961
// Pure virtual, `const noexcept`; same signature shape as
// utf8_length_from_utf16le (input pointer plus `length`, returns `size_t`).
// NOTE(review): presumably the UTF-8 byte count for big-endian UTF-16 input;
// treatment of invalid input is not shown -- confirm.
6976 simdutf_warn_unused virtual size_t
6977 utf8_length_from_utf16be(const char16_t *input,
6978 size_t length) const noexcept = 0;
6979
// Pure virtual, `const noexcept`, returns nothing and (unlike its neighbours)
// is not marked `simdutf_warn_unused`. Reads from `input` (`length`
// elements or bytes -- not specified here) and writes through `output`.
// NOTE(review): presumably byte-swaps each 16-bit code unit; whether `input`
// and `output` may alias is not visible in this listing -- confirm.
6994 virtual void change_endianness_utf16(const char16_t *input, size_t length,
6995 char16_t *output) const noexcept = 0;
6996
// Pure virtual, `const noexcept`. Takes a `const char16_t *` buffer and its
// `length`; returns a `size_t`.
// NOTE(review): what exactly is counted (presumably code points in UTF-16LE
// input) is not stated in this listing -- confirm upstream.
7012 simdutf_warn_unused virtual size_t
7013 count_utf16le(const char16_t *input, size_t length) const noexcept = 0;
7014
// Pure virtual, `const noexcept`; same signature as count_utf16le.
// NOTE(review): presumably the big-endian counterpart; the counted quantity
// is not stated in this listing -- confirm upstream.
7030 simdutf_warn_unused virtual size_t
7031 count_utf16be(const char16_t *input, size_t length) const noexcept = 0;
7032
// Pure virtual, `const noexcept`. Takes a `const char *` buffer and its
// `length`; returns a `size_t`.
// NOTE(review): presumably counts code points in UTF-8 input; behaviour on
// malformed sequences is not visible here -- confirm upstream.
7045 simdutf_warn_unused virtual size_t
7046 count_utf8(const char *input, size_t length) const noexcept = 0;
7047
7048#ifdef SIMDUTF_INTERNAL_TESTS
7049 // This method is only exported in developer mode (that is, when
7050 // SIMDUTF_INTERNAL_TESTS is defined). Its purpose is to expose
7051 // internal test procedures of the given implementation so that
7052 // they can be run through our standard test framework.
7053 //
7054 // Regular users should not use it; the tests of the public
7055 // API are enough.
7056
  // Pairs a human-readable label with a plain function pointer to run.
7057 struct TestProcedure {
7058 // display name of the test
7059 std::string name;
7060
7061 // test body; it receives the implementation under test by const reference.
  // NOTE(review): the pointer type returns void, so pass/fail is not
  // reported through a return value -- confirm how failures are signalled.
7062 void (*procedure)(const implementation &);
7063 };
7064
  // Virtual but not pure (no `= 0`); returns the procedures by value.
7065 virtual std::vector<TestProcedure> internal_tests() const;
7066#endif
7067
7068protected:
7071 simdutf_really_inline implementation(const char *name,
7072 const char *description,
7076
7077protected:
// Defaulted destructor in the protected section, declared without `virtual`
// here: code outside the class hierarchy therefore cannot destroy an object
// through an `implementation *`.
7078 ~implementation() = default;
7079
7080private:
// Private `const char *` members: only the pointers are stored, no copy of
// the character data is made by these declarations.
// NOTE(review): presumably filled from the constructor's `name` and
// `description` arguments (the constructor body is elided in this listing);
// the pointed-to strings must then stay valid for as long as they are read.
7084 const char *_name;
7085
7089 const char *_description;
7090
7095};
7096
7098namespace internal {
7099
7104public:
// Default constructor with an empty body; no members are initialised here.
7106 simdutf_really_inline available_implementation_list() {}
// Declaration only (`const noexcept`); the body lives outside this header.
// NOTE(review): presumably the number of entries spanned by begin()/end().
7108 size_t size() const noexcept;
// Declaration only. Returns a pointer to a const pointer to a const
// `implementation`; together with end() it makes the list usable in a
// range-based for loop, as operator[] does.
7110 const implementation *const *begin() const noexcept;
// Declaration only. Same return type as begin(); serves as the end bound of
// the range-based for loop in operator[].
7112 const implementation *const *end() const noexcept;
7113
// Looks an implementation up by name. Walks the list with a range-based for
// loop (which relies on begin()/end()) and returns the first entry whose
// name() compares equal to `name`; returns nullptr when nothing matches.
// NOTE(review): the function is noexcept, while the cross-reference index of
// this listing shows name() returning std::string by value; an allocation
// failure in that temporary would terminate the program -- confirm this is
// acceptable.
7127 const implementation *operator[](const std::string &name) const noexcept {
7128 for (const implementation *impl : *this) {
7129 if (impl->name() == name) {
7130 return impl;
7131 }
7132 }
  // no entry carried the requested name
7133 return nullptr;
7134 }
7135
// Declaration only (`const noexcept`); returns a pointer to a const
// `implementation`.
// NOTE(review): the name suggests it picks the most capable implementation
// the running CPU supports; the selection rule and whether nullptr can be
// returned are not visible in this listing -- confirm upstream.
7149 const implementation *detect_best_supported() const noexcept;
7150};
7151
// Small pointer wrapper whose storage is selected at compile time: a plain
// `T *` when SIMDUTF_NO_THREADS is defined, a `std::atomic<T *>` otherwise.
// Both variants expose the same conversion, dereference, arrow and
// assignment operators, so client code is identical in either configuration.
7152template <typename T> class atomic_ptr {
7153public:
  // Converting constructor (not `explicit`): stores the given pointer.
7154 atomic_ptr(T *_ptr) : ptr{_ptr} {}
7155
7156#if defined(SIMDUTF_NO_THREADS)
  // Non-atomic variant: every accessor reads the raw pointer member directly.
7157 operator const T *() const { return ptr; }
7158 const T &operator*() const { return *ptr; }
7159 const T *operator->() const { return ptr; }
7160
7161 operator T *() { return ptr; }
7162 T &operator*() { return *ptr; }
7163 T *operator->() { return ptr; }
  // Plain pointer assignment; returns *this so assignments can be chained.
7164 atomic_ptr &operator=(T *_ptr) {
7165 ptr = _ptr;
7166 return *this;
7167 }
7168
7169#else
  // Atomic variant: conversions and operator-> read the pointer via load().
  // operator* writes `*ptr` without an explicit load(); with a
  // std::atomic<T *> member this goes through the atomic's implicit
  // conversion to T *, which is itself an atomic read.
7170 operator const T *() const { return ptr.load(); }
7171 const T &operator*() const { return *ptr; }
7172 const T *operator->() const { return ptr.load(); }
7173
7174 operator T *() { return ptr.load(); }
7175 T &operator*() { return *ptr; }
7176 T *operator->() { return ptr.load(); }
  // `ptr = _ptr` uses std::atomic's assignment operator, i.e. an atomic store.
7177 atomic_ptr &operator=(T *_ptr) {
7178 ptr = _ptr;
7179 return *this;
7180 }
7181
7182#endif
7183
7184private:
  // The only data member; its type follows the same macro as the accessors.
7185#if defined(SIMDUTF_NO_THREADS)
7186 T *ptr;
7187#else
7188 std::atomic<T *> ptr;
7189#endif
7190};
7191
7193
7194} // namespace internal
7195
// Declaration of a free function exported/imported via
// SIMDUTF_DLLIMPORTEXPORT. Returns a const reference to an
// internal::available_implementation_list, so callers can iterate or index
// the list but cannot modify it through this reference.
// NOTE(review): the lifetime of the referenced object is not visible here
// (presumably a process-wide singleton) -- confirm in the definition.
7199extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
7200get_available_implementations();
7201
// Declaration of a free function exported/imported via
// SIMDUTF_DLLIMPORTEXPORT. Returns a non-const reference to an
// internal::atomic_ptr<const implementation>; because atomic_ptr provides
// operator=(T *), callers can both read the current pointer and assign a
// different implementation through the returned reference.
// NOTE(review): how the initial value is chosen is not visible in this
// listing -- confirm in the definition.
7208extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
7209get_active_implementation();
7210
7211} // namespace simdutf
7212
7213#endif // SIMDUTF_IMPLEMENTATION_H
7214/* end file include/simdutf/implementation.h */
7215
7216// Implementation-internal files (must be included before the implementations
7217// themselves, to keep amalgamation working--otherwise, the first time a file is
7218// included, it might be put inside the #ifdef
7219// SIMDUTF_IMPLEMENTATION_ARM64/FALLBACK/etc., which means the other
7220// implementations can't compile unless that implementation is turned on).
7221
7222SIMDUTF_POP_DISABLE_WARNINGS
7223
7224#endif // SIMDUTF_H
7225/* end file include/simdutf.h */
Definition: simdutf.h:6495
virtual simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char *input, size_t length, char16_t *utf16_buffer) const noexcept=0
virtual simdutf_warn_unused size_t convert_utf8_to_utf16be(const char *input, size_t length, char16_t *utf16_output) const noexcept=0
virtual simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
virtual simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept=0
const char * _description
Definition: simdutf.h:7089
virtual simdutf_warn_unused size_t count_utf8(const char *input, size_t length) const noexcept=0
virtual simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
virtual void to_well_formed_utf16le(const char16_t *input, size_t len, char16_t *output) const noexcept=0
virtual simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept=0
virtual simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept=0
virtual simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
virtual simdutf_warn_unused size_t convert_utf8_to_utf16le(const char *input, size_t length, char16_t *utf16_output) const noexcept=0
virtual simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(const char16_t *input, size_t length) const noexcept=0
virtual simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept=0
virtual simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
virtual simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(const char16_t *input, size_t length) const noexcept=0
virtual simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept=0
virtual simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char *input, size_t length, char16_t *utf16_output) const noexcept=0
virtual simdutf_warn_unused size_t count_utf16le(const char16_t *input, size_t length) const noexcept=0
simdutf_really_inline implementation(const char *name, const char *description, uint32_t required_instruction_sets)
Definition: simdutf.h:7071
bool supported_by_runtime_system() const
Definition: simdutf.cpp:10267
virtual simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept=0
virtual simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
virtual uint32_t required_instruction_sets() const
Definition: simdutf.h:6537
virtual void change_endianness_utf16(const char16_t *input, size_t length, char16_t *output) const noexcept=0
virtual simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char *input, size_t length, char16_t *utf16_buffer) const noexcept=0
virtual std::string name() const
Definition: simdutf.h:6506
virtual simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept=0
const char * _name
Definition: simdutf.h:7084
virtual simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
virtual simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept=0
virtual simdutf_warn_unused size_t count_utf16be(const char16_t *input, size_t length) const noexcept=0
virtual void to_well_formed_utf16be(const char16_t *input, size_t len, char16_t *output) const noexcept=0
virtual simdutf_warn_unused size_t utf16_length_from_utf8(const char *input, size_t length) const noexcept=0
virtual simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char *input, size_t length, char16_t *utf16_output) const noexcept=0
const uint32_t _required_instruction_sets
Definition: simdutf.h:7094
virtual std::string description() const
Definition: simdutf.h:6517
Definition: simdutf.h:7152
const implementation *const * begin() const noexcept
Definition: simdutf.cpp:10765
size_t size() const noexcept
Definition: simdutf.cpp:10761
simdutf_really_inline available_implementation_list()
Definition: simdutf.h:7106
const implementation *const * end() const noexcept
Definition: simdutf.cpp:10769
const implementation * detect_best_supported() const noexcept
Definition: simdutf.cpp:10773
Definition: simdutf.h:882
Definition: simdutf.h:860