FreeBASIC  0.91.0
utf_convto_wchar.c
Go to the documentation of this file.
1 /* UTF to wstring conversion
2  * (based on ConvertUTF.c free implementation from Unicode, Inc)
3  */
4 
5 #include "fb.h"
6 
/* Lookup tables defined in another translation unit.
 * __fb_utf8_trailingTb is indexed by a UTF-8 lead byte; the converters below
 * use the value as the number of extra bytes to read for that sequence.
 * __fb_utf8_offsetsTb is indexed by that extra-byte count; its entry is
 * subtracted from the accumulated bytes to obtain the code point. */
7 extern const char __fb_utf8_trailingTb[256];
8 extern const UTF_32 __fb_utf8_offsetsTb[6];
9 
/* UTF-to-char converters defined elsewhere; the dispatchers in this file
 * call them when sizeof( FB_WCHAR ) equals sizeof( char ). */
10 char *fb_hUTF8ToChar( const UTF_8 *src, char *dst, ssize_t *chars );
11 char *fb_hUTF16ToChar( const UTF_16 *src, char *dst, ssize_t *chars );
12 char *fb_hUTF32ToChar( const UTF_32 *src, char *dst, ssize_t *chars );
13 
14 static FB_WCHAR *hUTF8ToUTF16( const UTF_8 *src, FB_WCHAR *dst, ssize_t *chars )
15 {
16  UTF_32 c;
17  ssize_t extbytes, charsleft;
18  FB_WCHAR *buffer = dst;
19 
20  if( dst == NULL )
21  {
22  ssize_t dst_size = 0;
23  charsleft = 0;
24  do
25  {
26  extbytes = __fb_utf8_trailingTb[(unsigned int)*src];
27 
28  c = 0;
29  switch( extbytes )
30  {
31  case 5:
32  c += *src++; c <<= 6;
33  case 4:
34  c += *src++; c <<= 6;
35  case 3:
36  c += *src++; c <<= 6;
37  case 2:
38  c += *src++; c <<= 6;
39  case 1:
40  c += *src++; c <<= 6;
41  case 0:
42  c += *src++;
43  }
44 
45  c -= __fb_utf8_offsetsTb[extbytes];
46 
47  if( charsleft <= 1 )
48  {
49  charsleft = 8 + 1;
50  dst_size += charsleft;
51  buffer = realloc( buffer, dst_size * sizeof( FB_WCHAR ) );
52  dst = buffer + dst_size - charsleft;
53  }
54 
55  if( c <= UTF16_MAX_BMP )
56  *dst++ = c;
57  else
58  {
59  c -= UTF16_HALFBASE;
60  *dst++ = ((c >> UTF16_HALFSHIFT) + UTF16_SUR_HIGH_START);
61  *dst++ = ((c & UTF16_HALFMASK) + UTF16_SUR_LOW_START);
62  --charsleft;
63  }
64 
65  if( c == 0 )
66  break;
67 
68  --charsleft;
69  } while( 1 );
70 
71  *chars = dst_size - charsleft;
72  }
73  else
74  {
75  charsleft = *chars;
76  while( charsleft > 0 )
77  {
78  extbytes = __fb_utf8_trailingTb[*src];
79 
80  c = 0;
81  switch( extbytes )
82  {
83  case 5:
84  c += *src++; c <<= 6;
85  case 4:
86  c += *src++; c <<= 6;
87  case 3:
88  c += *src++; c <<= 6;
89  case 2:
90  c += *src++; c <<= 6;
91  case 1:
92  c += *src++; c <<= 6;
93  case 0:
94  c += *src++;
95  }
96 
97  c -= __fb_utf8_offsetsTb[extbytes];
98 
99  if( c <= UTF16_MAX_BMP )
100  *dst++ = c;
101  else
102  {
103  c -= UTF16_HALFBASE;
104  *dst++ = ((c >> UTF16_HALFSHIFT) + UTF16_SUR_HIGH_START);
105  if( charsleft > 1 )
106  {
107  *dst++ = ((c & UTF16_HALFMASK) + UTF16_SUR_LOW_START);
108  --charsleft;
109  }
110  }
111 
112  if( c == 0 )
113  break;
114 
115  --charsleft;
116  }
117 
118  *chars -= charsleft;
119  }
120 
121  return buffer;
122 }
123 
124 static FB_WCHAR *hUTF8ToUTF32( const UTF_8 *src, FB_WCHAR *dst, ssize_t *chars )
125 {
126  UTF_32 c;
127  ssize_t extbytes, charsleft;
128  FB_WCHAR *buffer = dst;
129 
130  if( dst == NULL )
131  {
132  ssize_t dst_size = 0;
133  charsleft = 0;
134  do
135  {
136  extbytes = __fb_utf8_trailingTb[(unsigned int)*src];
137 
138  c = 0;
139  switch( extbytes )
140  {
141  case 5:
142  c += *src++; c <<= 6;
143  case 4:
144  c += *src++; c <<= 6;
145  case 3:
146  c += *src++; c <<= 6;
147  case 2:
148  c += *src++; c <<= 6;
149  case 1:
150  c += *src++; c <<= 6;
151  case 0:
152  c += *src++;
153  }
154 
155  c -= __fb_utf8_offsetsTb[extbytes];
156 
157  if( charsleft <= 1 )
158  {
159  charsleft = 8;
160  dst_size += charsleft;
161  buffer = realloc( buffer, dst_size * sizeof( FB_WCHAR ) );
162  dst = buffer + dst_size - charsleft;
163  }
164 
165  *dst++ = c;
166 
167  if( c == 0 )
168  break;
169 
170  --charsleft;
171  } while( 1 );
172 
173  *chars = dst_size - charsleft;
174  }
175  else
176  {
177  charsleft = *chars;
178  while( charsleft > 0 )
179  {
180  extbytes = __fb_utf8_trailingTb[*src];
181 
182  c = 0;
183  switch( extbytes )
184  {
185  case 5:
186  c += *src++; c <<= 6;
187  case 4:
188  c += *src++; c <<= 6;
189  case 3:
190  c += *src++; c <<= 6;
191  case 2:
192  c += *src++; c <<= 6;
193  case 1:
194  c += *src++; c <<= 6;
195  case 0:
196  c += *src++;
197  }
198 
199  c -= __fb_utf8_offsetsTb[extbytes];
200 
201  *dst++ = c;
202 
203  if( c == 0 )
204  break;
205 
206  --charsleft;
207  }
208 
209  *chars -= charsleft;
210  }
211 
212  return buffer;
213 }
214 
215 static FB_WCHAR *hUTF8ToWChar( const UTF_8 *src, FB_WCHAR *dst, ssize_t *chars )
216 {
217  FB_WCHAR *res = NULL;
218 
219  /* convert.. */
220  switch( sizeof( FB_WCHAR ) )
221  {
222  case sizeof( char ):
223  res = (FB_WCHAR *)fb_hUTF8ToChar( src, (char *)dst, chars );
224  break;
225 
226  case sizeof( UTF_16 ):
227  res = hUTF8ToUTF16( src, dst, chars );
228  break;
229 
230  case sizeof( UTF_32 ):
231  res = hUTF8ToUTF32( src, dst, chars );
232  break;
233  }
234 
235  return res;
236 }
237 
238 static FB_WCHAR *hUTF16ToUTF32( const UTF_16 *src, FB_WCHAR *dst, ssize_t *chars )
239 {
240  UTF_16 c;
241  ssize_t charsleft;
242  FB_WCHAR *buffer = dst;
243 
244  if( dst == NULL )
245  {
246  ssize_t dst_size = 0;
247  charsleft = 0;
248  do
249  {
250  c = *src++ & 0x0000FFFF;
251  if( c >= UTF16_SUR_HIGH_START && c <= UTF16_SUR_HIGH_END )
252  {
253  c = ((c - UTF16_SUR_HIGH_START) << UTF16_HALFSHIFT) +
255  }
256 
257  if( charsleft == 0 )
258  {
259  charsleft = 8;
260  dst_size += charsleft;
261  buffer = realloc( buffer, dst_size * sizeof( FB_WCHAR ) );
262  dst = buffer + dst_size - charsleft;
263  }
264 
265  *dst++ = c;
266 
267  if( c == 0 )
268  break;
269 
270  --charsleft;
271  } while( 1 );
272 
273  *chars = dst_size - charsleft;
274  }
275  else
276  {
277  charsleft = *chars;
278  while( charsleft > 0 )
279  {
280  c = *src++ & 0x0000FFFF;
281  if( c >= UTF16_SUR_HIGH_START && c <= UTF16_SUR_HIGH_END )
282  {
283  c = ((c - UTF16_SUR_HIGH_START) << UTF16_HALFSHIFT) +
285  }
286 
287  *dst++ = c;
288 
289  if( c == 0 )
290  break;
291 
292  --charsleft;
293  }
294 
295  *chars -= charsleft;
296  }
297 
298  return buffer;
299 }
300 
301 static FB_WCHAR *hUTF16ToWChar( const UTF_16 *src, FB_WCHAR *dst, ssize_t *chars )
302 {
303  FB_WCHAR *res = NULL;
304 
305  switch( sizeof( FB_WCHAR ) )
306  {
307  case sizeof( char ):
308  res = (FB_WCHAR *)fb_hUTF16ToChar( src, (char *)dst, chars );
309  break;
310 
311  case sizeof( UTF_16 ):
312  if( dst == NULL ) {
313  res = (FB_WCHAR *)src;
314  } else {
315  memcpy( dst, src, *chars * sizeof( UTF_16 ) );
316  res = dst;
317  }
318  break;
319 
320  case sizeof( UTF_32 ):
321  res = hUTF16ToUTF32( src, dst, chars );
322  break;
323  }
324 
325  return res;
326 }
327 
328 
329 static FB_WCHAR *hUTF32ToUTF16( const UTF_32 *src, FB_WCHAR *dst, ssize_t *chars )
330 {
331  UTF_32 c;
332  ssize_t charsleft;
333  FB_WCHAR *buffer = dst;
334 
335  if( dst == NULL )
336  {
337  ssize_t dst_size = 0;
338  charsleft = 0;
339  do
340  {
341  c = *src++;
342 
343  if( charsleft <= 1 )
344  {
345  charsleft = 8 + 1;
346  dst_size += charsleft;
347  buffer = realloc( buffer, dst_size * sizeof( FB_WCHAR ) );
348  dst = buffer + dst_size - charsleft;
349  }
350 
351  if( c > UTF16_MAX_BMP )
352  {
353  *dst++ = (UTF_16)((c >> UTF16_HALFSHIFT) + UTF16_SUR_HIGH_START);
354  c = ((c & UTF16_HALFMASK) + UTF16_SUR_LOW_START);
355  --charsleft;
356  }
357 
358  *dst++ = c;
359 
360  if( c == 0 )
361  break;
362 
363  --charsleft;
364  } while( 1 );
365 
366  *chars = dst_size - charsleft;
367  }
368  else
369  {
370  charsleft = *chars;
371  while( charsleft > 0 )
372  {
373  c = *src++;
374 
375  if( c > UTF16_MAX_BMP )
376  {
377  if( charsleft > 1 )
378  {
379  *dst++ = (UTF_16)((c >> UTF16_HALFSHIFT) + UTF16_SUR_HIGH_START);
380  c = ((c & UTF16_HALFMASK) + UTF16_SUR_LOW_START);
381  --charsleft;
382  }
383  }
384 
385  *dst++ = c;
386 
387  if( c == 0 )
388  break;
389 
390  --charsleft;
391  }
392 
393  *chars -= charsleft;
394  }
395 
396  return buffer;
397 }
398 
399 static FB_WCHAR *hUTF32ToWChar( const UTF_32 *src, FB_WCHAR *dst, ssize_t *chars )
400 {
401  FB_WCHAR *res = NULL;
402 
403  switch( sizeof( FB_WCHAR ) )
404  {
405  case sizeof( char ):
406  res = (FB_WCHAR *)fb_hUTF32ToChar( src, (char *)dst, chars );
407  break;
408 
409  case sizeof( UTF_16 ):
410  res = hUTF32ToUTF16( src, dst, chars );
411  break;
412 
413  case sizeof( UTF_32 ):
414  if( dst == NULL ) {
415  res = (FB_WCHAR *)src;
416  } else {
417  memcpy( dst, src, *chars * sizeof( UTF_32 ) );
418  res = dst;
419  }
420  break;
421  }
422 
423  return res;
424 }
425 
426 FB_WCHAR *fb_UTFToWChar( FB_FILE_ENCOD encod, const void *src, FB_WCHAR *dst, ssize_t *chars )
427 {
428  switch( encod )
429  {
430  case FB_FILE_ENCOD_UTF8:
431  return hUTF8ToWChar( src, dst, chars );
432 
433  case FB_FILE_ENCOD_UTF16:
434  return hUTF16ToWChar( src, dst, chars );
435 
436  case FB_FILE_ENCOD_UTF32:
437  return hUTF32ToWChar( src, dst, chars );
438 
439  default:
440  return NULL;
441  }
442 }