DAW JSON Link
daw_json_parse_std_string.h
Go to the documentation of this file.
1 // Copyright (c) Darrell Wright
2 //
3 // Distributed under the Boost Software License, Version 1.0. (See accompanying
4 // file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt)
5 //
6 // Official repository: https://github.com/beached/daw_json_link
7 //
8 
9 #pragma once
10 
11 #include "version.h"
12 
14 #include "daw_json_assert.h"
15 #include "daw_json_parse_common.h"
17 
18 #include <daw/algorithms/daw_algorithm_copy.h>
19 #include <daw/algorithms/daw_algorithm_copy_n.h>
20 #include <daw/daw_data_end.h>
21 #include <daw/daw_likely.h>
22 
23 #include <cstddef>
24 #include <daw/stdinc/data_access.h>
25 #include <daw/stdinc/range_access.h>
26 #include <type_traits>
27 
28 namespace daw::json {
29  inline namespace DAW_JSON_VER {
30  namespace json_details {
31  [[nodiscard]] static inline constexpr UInt8
32  to_nibble( unsigned char chr ) {
33  int const b = static_cast<int>( chr );
34  int const maskLetter = ( ( '9' - b ) >> 31 );
35  int const maskSmall = ( ( 'Z' - b ) >> 31 );
36  int const offset = '0' + ( maskLetter & int( 'A' - '0' - 10 ) ) +
37  ( maskSmall & int( 'a' - 'A' ) );
38  auto const result = static_cast<unsigned>( b - offset );
39  return to_uint8( result );
40  }
41 
42  template<bool is_unchecked_input>
43  DAW_ATTRIB_NONNULL( )
44  [[nodiscard]] static inline constexpr UInt16
45  byte_from_nibbles( char const *&first ) {
46  auto const n0 = to_nibble( static_cast<unsigned char>( *first++ ) );
47  auto const n1 = to_nibble( static_cast<unsigned char>( *first++ ) );
48  if constexpr( is_unchecked_input ) {
49  daw_json_ensure( n0 < 16 and n1 < 16, ErrorReason::InvalidUTFEscape );
50  }
51  return to_uint16( ( n0 << 4U ) | n1 );
52  }
53 
54  static constexpr char u32toC( UInt32 value ) {
55  return static_cast<char>( static_cast<unsigned char>( value ) );
56  }
57 
58  template<typename ParseState>
59  DAW_ATTRIB_NONNULL( )
60  DAW_ATTRIB_RET_NONNULL [[nodiscard]] static constexpr char *decode_utf16(
61  ParseState &parse_state, char *it ) {
62  constexpr bool is_unchecked_input = ParseState::is_unchecked_input;
63  daw_json_assert_weak( parse_state.size( ) >= 5,
64  ErrorReason::UnexpectedEndOfData, parse_state );
65  char const *first = parse_state.first;
66  ++first;
67  UInt32 cp = to_uint32( byte_from_nibbles<is_unchecked_input>( first ) )
68  << 8U;
69  cp |= byte_from_nibbles<is_unchecked_input>( first );
70  if( cp <= 0x7FU ) {
71  *it++ = static_cast<char>( static_cast<unsigned char>( cp ) );
72  parse_state.first = first;
73  return it;
74  }
75 
76  //******************************
77  if( 0xD800U <= cp and cp <= 0xDBFFU ) {
78  cp = ( cp - 0xD800U ) * 0x400U;
79  ++first;
81  ( parse_state.last - first >= 5 ) and *first == 'u',
82  ErrorReason::InvalidUTFEscape,
83  parse_state ); // Expected parse_state to start with a \\u
84  ++first;
85  auto trailing =
86  to_uint32( byte_from_nibbles<is_unchecked_input>( first ) ) << 8U;
87  trailing |= byte_from_nibbles<is_unchecked_input>( first );
88  trailing -= 0xDC00U;
89  cp += trailing;
90  cp += 0x10000;
91  }
92  // UTF32-> UTF8
93  if( cp >= 0x10000U ) {
94  // 4 bytes
95  char const enc3 = u32toC( ( cp & 0b0011'1111U ) | 0b1000'0000U );
96  char const enc2 =
97  u32toC( ( ( cp >> 6U ) & 0b0011'1111U ) | 0b1000'0000U );
98  char const enc1 =
99  u32toC( ( ( cp >> 12U ) & 0b0011'1111U ) | 0b1000'0000U );
100  char const enc0 = u32toC( ( cp >> 18U ) | 0b1111'0000U );
101  *it++ = enc0;
102  *it++ = enc1;
103  *it++ = enc2;
104  *it++ = enc3;
105  parse_state.first = first;
106  return it;
107  }
108  //******************************
109  if( cp >= 0x800U ) {
110  // 3 bytes
111  char const enc2 = u32toC( ( cp & 0b0011'1111U ) | 0b1000'0000U );
112  char const enc1 =
113  u32toC( ( ( cp >> 6U ) & 0b0011'1111U ) | 0b1000'0000U );
114  char const enc0 = u32toC( ( cp >> 12U ) | 0b1110'0000U );
115  *it++ = enc0;
116  *it++ = enc1;
117  *it++ = enc2;
118  parse_state.first = first;
119  return it;
120  }
121  //******************************
122  // cp >= 0x80U
123  // 2 bytes
124  char const enc1 = u32toC( ( cp & 0b0011'1111U ) | 0b1000'0000U );
125  char const enc0 = u32toC( ( cp >> 6U ) | 0b1100'0000U );
126  *it++ = enc0;
127  *it++ = enc1;
128  parse_state.first = first;
129  return it;
130  }
131 
132  template<typename ParseState, typename Appender>
133  static constexpr void decode_utf16( ParseState &parse_state,
134  Appender &app ) {
135  constexpr bool is_unchecked_input = ParseState::is_unchecked_input;
136  char const *first = parse_state.first;
137  ++first;
138  UInt32 cp = to_uint32( byte_from_nibbles<is_unchecked_input>( first ) )
139  << 8U;
140  cp |= byte_from_nibbles<is_unchecked_input>( first );
141  if( cp <= 0x7FU ) {
142  app( u32toC( cp ) );
143  parse_state.first = first;
144  return;
145  }
146  if( 0xD800U <= cp and cp <= 0xDBFFU ) {
147  cp = ( cp - 0xD800U ) * 0x400U;
148  ++first;
149  daw_json_assert_weak( *first == 'u', ErrorReason::InvalidUTFEscape,
150  parse_state );
151  ++first;
152  auto trailing =
153  to_uint32( byte_from_nibbles<is_unchecked_input>( first ) ) << 8U;
154  trailing |= byte_from_nibbles<is_unchecked_input>( first );
155  trailing -= 0xDC00U;
156  cp += trailing;
157  cp += 0x10000;
158  }
159  // UTF32-> UTF8
160  if( cp >= 0x10000U ) {
161  // 4 bytes
162  char const enc3 = u32toC( ( cp & 0b0011'1111U ) | 0b1000'0000U );
163  char const enc2 =
164  u32toC( ( ( cp >> 6U ) & 0b0011'1111U ) | 0b1000'0000U );
165  char const enc1 =
166  u32toC( ( ( cp >> 12U ) & 0b0011'1111U ) | 0b1000'0000U );
167  char const enc0 = u32toC( ( cp >> 18U ) | 0b1111'0000U );
168  app( enc0 );
169  app( enc1 );
170  app( enc2 );
171  app( enc3 );
172  parse_state.first = first;
173  return;
174  }
175  if( cp >= 0x800U ) {
176  // 3 bytes
177  char const enc2 = u32toC( ( cp & 0b0011'1111U ) | 0b1000'0000U );
178  char const enc1 =
179  u32toC( ( ( cp >> 6U ) & 0b0011'1111U ) | 0b1000'0000U );
180  char const enc0 = u32toC( ( cp >> 12U ) | 0b1110'0000U );
181  app( enc0 );
182  app( enc1 );
183  app( enc2 );
184  parse_state.first = first;
185  return;
186  }
187  // cp >= 0x80U
188  // 2 bytes
189  char const enc1 = u32toC( ( cp & 0b0011'1111U ) | 0b1000'0000U );
190  char const enc0 = u32toC( ( cp >> 6U ) | 0b1100'0000U );
191  app( enc0 );
192  app( enc1 );
193  parse_state.first = first;
194  }
195 
namespace parse_tokens {
  /// Two-character token: a backslash followed by a double quote, stored as a
  /// NUL-terminated array.
  inline constexpr char const escape_quotes[] = "\\\"";
} // namespace parse_tokens
199 
200  // Fast path for parsing escaped strings to a std::string with the default
201  // appender
202  template<bool AllowHighEight, typename JsonMember, bool KnownBounds,
203  typename ParseState>
204  [[nodiscard]] static constexpr auto
205  parse_string_known_stdstring( ParseState &parse_state ) {
206  using string_type = json_base_type_t<JsonMember>;
207  string_type result =
208  string_type( std::size( parse_state ) + 1, '\0',
209  parse_state.template get_allocator_for<char>( ) );
210  char *it = std::data( result );
211 
212  bool const has_quote = parse_state.front( ) == '"';
213  if( has_quote ) {
214  parse_state.remove_prefix( );
215  }
216 
217  if( auto const first_slash =
218  static_cast<std::ptrdiff_t>( parse_state.counter ) - 1;
219  first_slash > 1 ) {
220  it = daw::algorithm::copy_n( parse_state.first, it,
221  static_cast<std::size_t>( first_slash ) )
222  .output;
223  parse_state.first += first_slash;
224  }
225  constexpr auto pred =
226  []( auto const &r ) DAW_JSON_CPP23_STATIC_CALL_OP {
227  if constexpr( ParseState::is_unchecked_input ) {
228  return DAW_LIKELY( r.front( ) != '"' );
229  } else {
230  return DAW_LIKELY( r.has_more( ) ) and ( r.front( ) != '"' );
231  }
232  };
233 
234  while( pred( parse_state ) ) {
235  {
236  char const *first = parse_state.first;
237  char const *const last = parse_state.last;
238  if constexpr( std::is_same_v<typename ParseState::exec_tag_t,
239  constexpr_exec_tag> ) {
240 
241  daw_json_assert_weak( KnownBounds or first < last,
242  ErrorReason::UnexpectedEndOfData,
243  parse_state );
244  while( *first != '"' and *first != '\\' ) {
245  ++first;
246  daw_json_assert_weak( KnownBounds or first < last,
247  ErrorReason::UnexpectedEndOfData,
248  parse_state );
249  }
250  } else {
251  first =
252  mem_move_to_next_of<( ParseState::is_unchecked_input or
254  '"', '\\'>( ParseState::exec_tag, first,
255  last );
256  }
258  static_cast<std::ptrdiff_t>( result.size( ) ) -
259  std::distance( result.data( ), it ) >=
260  std::distance( parse_state.first, first ),
261  ErrorReason::UnexpectedEndOfData );
262  it = daw::algorithm::copy( parse_state.first, first, it );
263  parse_state.first = first;
264  }
265  if( parse_state.front( ) == '\\' ) {
266  parse_state.remove_prefix( );
267  daw_json_assert_weak( not parse_state.is_space_unchecked( ),
268  ErrorReason::InvalidUTFCodepoint,
269  parse_state );
270  switch( parse_state.front( ) ) {
271  case 'b':
272  *it++ = '\b';
273  parse_state.remove_prefix( );
274  break;
275  case 'f':
276  *it++ = '\f';
277  parse_state.remove_prefix( );
278  break;
279  case 'n':
280  *it++ = '\n';
281  parse_state.remove_prefix( );
282  break;
283  case 'r':
284  *it++ = '\r';
285  parse_state.remove_prefix( );
286  break;
287  case 't':
288  *it++ = '\t';
289  parse_state.remove_prefix( );
290  break;
291  case 'u':
292  it = decode_utf16( parse_state, it );
293  break;
294  case '/':
295  case '\\':
296  case '"':
297  *it++ = parse_state.front( );
298  parse_state.remove_prefix( );
299  break;
300  default:
301  if constexpr( not AllowHighEight ) {
303  ( not parse_state.is_space_unchecked( ) ) &
304  ( static_cast<unsigned char>( parse_state.front( ) ) <=
305  0x7FU ),
306  ErrorReason::InvalidStringHighASCII, parse_state );
307  }
308  *it++ = parse_state.front( );
309  parse_state.remove_prefix( );
310  }
311  } else {
312  daw_json_assert_weak( not has_quote or
313  parse_state.is_quotes_checked( ),
314  ErrorReason::InvalidString, parse_state );
315  }
316  daw_json_assert_weak( not has_quote or parse_state.has_more( ),
317  ErrorReason::UnexpectedEndOfData, parse_state );
318  }
319  auto const sz =
320  static_cast<std::size_t>( std::distance( std::data( result ), it ) );
321  daw_json_assert_weak( std::size( result ) >= sz,
322  ErrorReason::InvalidString, parse_state );
323  result.resize( sz );
324  if constexpr( std::is_convertible_v<string_type,
325  json_result_t<JsonMember>> ) {
326  return result;
327  } else {
328  using constructor_t = json_constructor_t<JsonMember>;
329  construct_value<json_result_t<JsonMember>, constructor_t>(
330  parse_state, std::data( result ), daw::data_end( result ) );
331  }
332  }
333  } // namespace json_details
334  } // namespace DAW_JSON_VER
335 } // namespace daw::json
#define daw_json_assert_weak(Bool,...)
Assert that Bool is true when in Checked Input mode If false pass rest of args to daw_json_error.
#define daw_json_ensure(Bool,...)
Ensure that Bool is true. If false pass rest of args to daw_json_error.
#define DAW_JSON_CPP23_STATIC_CALL_OP
This is in addition to the parse policy. Always do a full name match instead of sometimes relying on ...
std::bool_constant< is_zero_terminated_string_v< T > > is_zero_terminated_string
Customization point traits.
#define DAW_JSON_VER
The version string used in namespace definitions. Must be a valid namespace name.
Definition: version.h:25