FreeBASIC  0.91.0
dev_file_encod_read_core.c
Go to the documentation of this file.
1 /* UTF-encoded to char or wchar file reading
2  * (based on ConvertUTF.c free implementation from Unicode, Inc)
3  */
4 
5 #include "fb.h"
6 
7 extern const char __fb_utf8_trailingTb[256];
8 extern const UTF_32 __fb_utf8_offsetsTb[6];
9 
10 /*::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::*
11  * to char *
12  *::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::*/
13 
14 static ssize_t hReadUTF8ToChar( FILE *fp, char *dst, ssize_t max_chars )
15 {
16  UTF_32 wc;
17  unsigned char c[7], *p;
18  ssize_t chars, extbytes;
19 
20  chars = max_chars;
21  while( chars > 0 )
22  {
23  if( fread( &c[0], 1, 1, fp ) != 1 )
24  break;
25 
26  extbytes = __fb_utf8_trailingTb[c[0]];
27 
28  if( extbytes > 0 )
29  if( fread( &c[1], extbytes, 1, fp ) != 1 )
30  break;
31 
32  wc = 0;
33  p = &c[0];
34  switch( extbytes )
35  {
36  case 5:
37  wc += *p++;
38  wc <<= 6;
39  case 4:
40  wc += *p++;
41  wc <<= 6;
42  case 3:
43  wc += *p++;
44  wc <<= 6;
45  case 2:
46  wc += *p++;
47  wc <<= 6;
48  case 1:
49  wc += *p++;
50  wc <<= 6;
51  case 0:
52  wc += *p++;
53  }
54 
55  wc -= __fb_utf8_offsetsTb[extbytes];
56 
57  if( wc > 255 )
58  wc = '?';
59 
60  *dst++ = wc;
61  --chars;
62  }
63 
64  return max_chars - chars;
65 }
66 
67 static ssize_t hReadUTF16ToChar( FILE *fp, char *dst, ssize_t max_chars )
68 {
69  ssize_t chars;
70  UTF_16 c;
71 
72  chars = max_chars;
73  while( chars > 0 )
74  {
75  if( fread( &c, sizeof( UTF_16 ), 1, fp ) != 1 )
76  break;
77 
78  if( c > 255 )
79  {
80  if( c >= UTF16_SUR_HIGH_START && c <= UTF16_SUR_HIGH_END )
81  {
82  if( fread( &c, sizeof( UTF_16 ), 1, fp ) != 1 )
83  break;
84  }
85  c = '?';
86  }
87 
88  *dst++ = c;
89  --chars;
90  }
91 
92  return max_chars - chars;
93 }
94 
95 static ssize_t hReadUTF32ToChar( FILE *fp, char *dst, ssize_t max_chars )
96 {
97  ssize_t chars;
98  UTF_32 c;
99 
100  chars = max_chars;
101  while( chars > 0 )
102  {
103  if( fread( &c, sizeof( UTF_32 ), 1, fp ) != 1 )
104  break;
105 
106  if( c > 255 )
107  c = '?';
108 
109  *dst++ = c;
110  --chars;
111  }
112 
113  return max_chars - chars;
114 }
115 
116 ssize_t fb_hFileRead_UTFToChar( FILE *fp, FB_FILE_ENCOD encod, char *dst, ssize_t max_chars )
117 {
118  switch( encod )
119  {
120  case FB_FILE_ENCOD_UTF8:
121  return hReadUTF8ToChar( fp, dst, max_chars );
122 
123  case FB_FILE_ENCOD_UTF16:
124  return hReadUTF16ToChar( fp, dst, max_chars );
125 
126  case FB_FILE_ENCOD_UTF32:
127  return hReadUTF32ToChar( fp, dst, max_chars );
128 
129  default:
130  return 0;
131  }
132 
133 }
134 
135 /*::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::*
136  * to wchar *
137  *::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::*/
138 
139 static ssize_t hUTF8ToUTF16( FILE *fp, FB_WCHAR *dst, ssize_t max_chars )
140 {
141  UTF_32 wc;
142  unsigned char c[7], *p;
143  ssize_t chars, extbytes;
144 
145  chars = max_chars;
146  while( chars > 0 )
147  {
148  if( fread( &c[0], 1, 1, fp ) != 1 )
149  break;
150 
151  extbytes = __fb_utf8_trailingTb[c[0]];
152 
153  if( extbytes > 0 )
154  if( fread( &c[1], extbytes, 1, fp ) != 1 )
155  break;
156 
157  wc = 0;
158  p = &c[0];
159  switch( extbytes )
160  {
161  case 5:
162  wc += *p++;
163  wc <<= 6;
164  case 4:
165  wc += *p++;
166  wc <<= 6;
167  case 3:
168  wc += *p++;
169  wc <<= 6;
170  case 2:
171  wc += *p++;
172  wc <<= 6;
173  case 1:
174  wc += *p++;
175  wc <<= 6;
176  case 0:
177  wc += *p++;
178  }
179 
180  wc -= __fb_utf8_offsetsTb[extbytes];
181 
182  if( wc <= UTF16_MAX_BMP )
183  {
184  *dst++ = wc;
185  }
186  else
187  {
188  if( chars > 1 )
189  {
190  wc -= UTF16_HALFBASE;
191  *dst++ = ((wc >> UTF16_HALFSHIFT) + UTF16_SUR_HIGH_START);
192  *dst++ = ((wc & UTF16_HALFMASK) + UTF16_SUR_LOW_START);
193  --chars;
194  }
195  }
196 
197  --chars;
198  }
199 
200  return max_chars - chars;
201 }
202 
203 static ssize_t hUTF8ToUTF32( FILE *fp, FB_WCHAR *dst, ssize_t max_chars )
204 {
205  UTF_32 wc;
206  unsigned char c[7], *p;
207  ssize_t chars, extbytes;
208 
209  chars = max_chars;
210  while( chars > 0 )
211  {
212  if( fread( &c[0], 1, 1, fp ) != 1 )
213  break;
214 
215  extbytes = __fb_utf8_trailingTb[c[0]];
216 
217  if( extbytes > 0 )
218  if( fread( &c[1], extbytes, 1, fp ) != 1 )
219  break;
220 
221  wc = 0;
222  p = &c[0];
223  switch( extbytes )
224  {
225  case 5:
226  wc += *p++;
227  wc <<= 6;
228  case 4:
229  wc += *p++;
230  wc <<= 6;
231  case 3:
232  wc += *p++;
233  wc <<= 6;
234  case 2:
235  wc += *p++;
236  wc <<= 6;
237  case 1:
238  wc += *p++;
239  wc <<= 6;
240  case 0:
241  wc += *p++;
242  }
243 
244  wc -= __fb_utf8_offsetsTb[extbytes];
245 
246  *dst++ = wc;
247  --chars;
248  }
249 
250  return max_chars - chars;
251 }
252 
253 static ssize_t hReadUTF8ToWchar( FILE *fp, FB_WCHAR *dst, ssize_t max_chars )
254 {
255  ssize_t res = 0;
256 
257  /* convert.. */
258  switch( sizeof( FB_WCHAR ) )
259  {
260  case sizeof( char ):
261  res = hReadUTF8ToChar( fp, (char *)dst, max_chars );
262  break;
263 
264  case sizeof( UTF_16 ):
265  res = hUTF8ToUTF16( fp, dst, max_chars );
266  break;
267 
268  case sizeof( UTF_32 ):
269  res = hUTF8ToUTF32( fp, dst, max_chars );
270  break;
271  }
272 
273  return res;
274 }
275 
276 static ssize_t hUTF16ToUTF32( FILE *fp, FB_WCHAR *dst, ssize_t max_chars )
277 {
278  UTF_32 c, c2;
279  ssize_t chars;
280 
281  chars = max_chars;
282  while( chars > 0 )
283  {
284  if( fread( &c, sizeof( UTF_16 ), 1, fp ) != 1 )
285  break;
286 
287  c &= 0x0000FFFF;
288  if( c >= UTF16_SUR_HIGH_START && c <= UTF16_SUR_HIGH_END )
289  {
290  if( fread( &c2, sizeof( UTF_16 ), 1, fp ) != 1 )
291  break;
292 
293  c = ((c - UTF16_SUR_HIGH_START) << UTF16_HALFSHIFT) +
295  }
296 
297  *dst++ = c;
298  --chars;
299  }
300 
301  return max_chars - chars;
302 }
303 
304 static ssize_t hReadUTF16ToWchar( FILE *fp, FB_WCHAR *dst, ssize_t max_chars )
305 {
306  ssize_t res = 0;
307 
308  /* same size? */
309  if( sizeof( FB_WCHAR ) == sizeof( UTF_16 ) )
310  return fread( (char *)dst, sizeof( UTF_16 ), max_chars, fp );
311 
312  /* convert.. */
313  switch( sizeof( FB_WCHAR ) )
314  {
315  case sizeof( char ):
316  res = hReadUTF16ToChar( fp, (char *)dst, max_chars );
317  break;
318 
319  case sizeof( UTF_32 ):
320  res = hUTF16ToUTF32( fp, dst, max_chars );
321  break;
322  }
323 
324  return res;
325 }
326 
327 static ssize_t hUTF32ToUTF16( FILE *fp, FB_WCHAR *dst, ssize_t max_chars )
328 {
329  UTF_32 c;
330  ssize_t chars;
331 
332  chars = max_chars;
333  while( chars > 0 )
334  {
335  if( fread( &c, sizeof( UTF_32 ), 1, fp ) != 1 )
336  break;
337 
338  if( c > UTF16_MAX_BMP )
339  {
340  if( chars > 1 )
341  {
342  *dst++ = (UTF_16)((c >> UTF16_HALFSHIFT) + UTF16_SUR_HIGH_START);
343  --chars;
344  }
345 
346  c = ((c & UTF16_HALFMASK) + UTF16_SUR_LOW_START);
347  }
348 
349  *dst++ = (UTF_16)c;
350  --chars;
351  }
352 
353  return max_chars - chars;
354 }
355 
356 static ssize_t hReadUTF32ToWchar( FILE *fp, FB_WCHAR *dst, ssize_t max_chars )
357 {
358  ssize_t res = 0;
359 
360  switch( sizeof( FB_WCHAR ) )
361  {
362  case sizeof( char ):
363  res = hReadUTF32ToChar( fp, (char *)dst, max_chars );
364  break;
365 
366  case sizeof( UTF_16 ):
367  res = hUTF32ToUTF16( fp, dst, max_chars );
368  break;
369 
370  case sizeof( UTF_32 ):
371  res = fread( (char *)dst, sizeof( UTF_32 ), max_chars, fp );
372  break;
373  }
374 
375  return res;
376 }
377 
378 ssize_t fb_hFileRead_UTFToWchar( FILE *fp, FB_FILE_ENCOD encod, FB_WCHAR *dst, ssize_t max_chars )
379 {
380  switch( encod )
381  {
382  case FB_FILE_ENCOD_UTF8:
383  return hReadUTF8ToWchar( fp, dst, max_chars );
384 
385  case FB_FILE_ENCOD_UTF16:
386  return hReadUTF16ToWchar( fp, dst, max_chars );
387 
388  case FB_FILE_ENCOD_UTF32:
389  return hReadUTF32ToWchar( fp, dst, max_chars );
390 
391  default:
392  return 0;
393  }
394 
395 }