FreeBASIC  0.91.0
lex-utf.bas
Go to the documentation of this file.
1 '' lex's UTF readers
2 ''
3 ''
4 '' chng: nov/2005 written [v1ctor]
5 ''
6 
7 
8 #include once "fb.bi"
9 #include once "fbint.bi"
10 #include once "lex.bi"
11 
12 #define UTF16_MAX_BMP &h0000FFFF
13 #define UTF16_SUR_HIGH_START &hD800
14 #define UTF16_SUR_HIGH_END &hDBFF
15 #define UTF16_SUR_LOW_START &hDC00
16 #define UTF16_SUR_LOW_END &hDFFF
17 #define UTF16_HALFSHIFT 10
18 #define UTF16_HALFBASE &h0010000UL
19 #define UTF16_HALFMASK &h3FFUL
20 
21 #define U16_SWAP(c) (((c) shr 8) or ((c) shl 8) and &hFF00)
22 
23 #define U32_SWAP(c) (((c) shr 24) or (((c) shl 8) and &h00FF0000) or _
24  (((c) shr 8) and &h0000FF00) or ((c) shl 24))
25 
26 ''::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
27 '' UTF-8
28 ''::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
29 
30  dim shared as ubyte utf8_trailingTb(0 to 255) => _
31  { _
32  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,_
33  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,_
34  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,_
35  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,_
36  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,_
37  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,_
38  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,_
39  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 _
40  }
41 
42  dim shared as uinteger utf8_offsetsTb(0 to 5) => _
43  { _
44  &h00000000UL, &h00003080UL, &h000E2080UL, &h03C82080UL, &hFA082080UL, &h82082080UL _
45  }
46 
47 '':::::
48 function hUTF8ToChar( ) as integer static
49  dim as ubyte src(0 to 6)
50  dim as ubyte ptr p
51  dim as uinteger c
52  dim as ubyte ptr dst
53  dim as integer chars, extbytes, i
54 
55  dst = cast( ubyte ptr, @lex.ctx->buffw )
56  chars = 0
57 
58  do while( chars < LEX_MAXBUFFCHARS )
59  if( eof( env.inf.num ) ) then
60  exit do
61  end if
62 
63  if( get( #env.inf.num, , src(0) ) <> 0 ) then
64  exit do
65  end if
66 
67  extbytes = utf8_trailingTb(src(0))
68 
69  c = 0
70  p = @src(0)
71  if( extbytes > 0 ) then
72  if( get( #env.inf.num, , src(1), extbytes ) <> 0 ) then
73  exit do
74  end if
75 
76  i = extbytes
77  do
78  c += *p
79  p += 1
80  c shl= 6
81  i -= 1
82  loop while( i > 0 )
83  end if
84 
85  c += *p
86 
87  c -= utf8_offsetsTb(extbytes)
88 
89  if( c > 255 ) then
90  c = asc( "?" )
91  end if
92 
93  *dst = c
94  dst += 1
95  chars += 1
96  loop
97 
98  function = chars
99 
100 end function
101 
102 '':::::
103 function hUTF8ToUTF16LE( ) as integer static
104  dim as ubyte src(0 to 6)
105  dim as ubyte ptr p
106  dim as uinteger c
107  dim as ushort ptr dst
108  dim as integer chars, extbytes, i
109 
110  dst = cast( ushort ptr, @lex.ctx->buffw )
111  chars = 0
112 
113  do while( chars < LEX_MAXBUFFCHARS )
114  if( eof( env.inf.num ) ) then
115  exit do
116  end if
117 
118  if( get( #env.inf.num, , src(0) ) <> 0 ) then
119  exit do
120  end if
121 
122  extbytes = utf8_trailingTb(src(0))
123 
124  c = 0
125  p = @src(0)
126  if( extbytes > 0 ) then
127  if( get( #env.inf.num, , src(1), extbytes ) <> 0 ) then
128  exit do
129  end if
130 
131  i = extbytes
132  do
133  c += *p
134  p += 1
135  c shl= 6
136  i -= 1
137  loop while( i > 0 )
138  end if
139 
140  c += *p
141 
142  c -= utf8_offsetsTb(extbytes)
143 
144  '' create surrogate?
145  if( c > UTF16_MAX_BMP ) then
146  if( chars < LEX_MAXBUFFCHARS-1 ) then
147  *dst = (c shr UTF16_HALFSHIFT) + UTF16_SUR_HIGH_START
148  dst += 1
149  chars += 1
150  end if
151 
152  c = (c and UTF16_HALFMASK) + UTF16_SUR_LOW_START
153  end if
154 
155  *dst = c
156  dst += 1
157  chars += 1
158  loop
159 
160  function = chars
161 
162 end function
163 
164 '':::::
165 function hUTF8ToUTF32LE( ) as integer static
166  dim as ubyte src(0 to 6)
167  dim as ubyte ptr p
168  dim as uinteger c
169  dim as ulong ptr dst
170  dim as integer chars, extbytes, i
171 
172  dst = cast( ulong ptr, @lex.ctx->buffw )
173  chars = 0
174 
175  do while( chars < LEX_MAXBUFFCHARS )
176  if( eof( env.inf.num ) ) then
177  exit do
178  end if
179 
180  if( get( #env.inf.num, , src(0) ) <> 0 ) then
181  exit do
182  end if
183 
184  extbytes = utf8_trailingTb(src(0))
185 
186  c = 0
187  p = @src(0)
188  if( extbytes > 0 ) then
189  if( get( #env.inf.num, , src(1), extbytes ) <> 0 ) then
190  exit do
191  end if
192 
193  i = extbytes
194  do
195  c += *p
196  p += 1
197  c shl= 6
198  i -= 1
199  loop while( i > 0 )
200  end if
201 
202  c += *p
203 
204  c -= utf8_offsetsTb(extbytes)
205 
206  *dst = c
207  dst += 1
208  chars += 1
209  loop
210 
211  function = chars
212 
213 end function
214 
215 '':::::
216 function hUTF8ToUTF32BE( ) as integer static
217  dim as ulong ptr dst
218  dim as uinteger c
219  dim as integer i, chars
220 
221  chars = hUTF8ToUTF32LE( )
222 
223  dst = cast( ulong ptr, @lex.ctx->buffw )
224  for i = 1 to chars
225  c = *dst
226  *dst = U32_SWAP( c )
227  dst += 1
228  next
229 
230  function = chars
231 
232 end function
233 
234 '':::::
236  dim as integer chars
237 
238  if sizeof(wstring) = 1 then
239  chars = hUTF8ToChar( )
240  elseif sizeof(wstring) = 2 then
241  chars = hUTF8ToUTF16LE( )
242  else
243 #ifdef __FB_BIGENDIAN__
244  chars = hUTF8ToUTF32BE( )
245 #else
246  chars = hUTF8ToUTF32LE( )
247 #endif
248  end if
249 
250  lex.ctx->bufflen = chars
251  lex.ctx->buffptrw = @lex.ctx->buffw
252 
253 end sub
254 
255 ''::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
256 '' UTF-16LE
257 ''::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
258 
259 '':::::
260 function hUTF16LEToChar( ) as integer static
261  dim as ushort c
262  dim as ubyte ptr dst
263  dim as integer chars
264 
265  dst = cast( ubyte ptr, @lex.ctx->buffw )
266  chars = 0
267 
268  do while( chars < LEX_MAXBUFFCHARS )
269  if( eof( env.inf.num ) ) then
270  exit do
271  end if
272 
273  if( get( #env.inf.num, , c ) <> 0 ) then
274  exit do
275  end if
276 
277  if( c > 255 ) then
278  '' surrogate?
279  if( c >= UTF16_SUR_HIGH_START ) then
280  if( c <= UTF16_SUR_HIGH_END ) then
281  if( get( #env.inf.num, , c ) <> 0 ) then
282  exit do
283  end if
284  end if
285  end if
286 
287  c = asc( "?" )
288  end if
289 
290  *dst = c
291  dst += 1
292  chars += 1
293  loop
294 
295  function = chars
296 
297 end function
298 
299 '':::::
300 function hUTF16LEToUTF16LE( ) as integer static
301 
302  if( get( #env.inf.num, , lex.ctx->buffw ) = 0 ) then
303  function = cunsg(seek( env.inf.num ) - lex.ctx->filepos) \ len( ushort )
304  else
305  function = 0
306  end if
307 
308 end function
309 
310 '':::::
311 function hUTF16LEToUTF32LE( ) as integer static
312  dim as ushort c
313  dim as uinteger wc
314  dim as ulong ptr dst
315  dim as integer chars
316 
317  dst = cast( ulong ptr, @lex.ctx->buffw )
318  chars = 0
319 
320  do while( chars < LEX_MAXBUFFCHARS )
321  if( eof( env.inf.num ) ) then
322  exit do
323  end if
324 
325  if( get( #env.inf.num, , c ) <> 0 ) then
326  exit do
327  end if
328 
329  wc = c
330  '' surrogate?
331  if( wc >= UTF16_SUR_HIGH_START ) then
332  if( wc <= UTF16_SUR_HIGH_END ) then
333  if( get( #env.inf.num, , c ) <> 0 ) then
334  exit do
335  end if
336 
337  wc = ((wc - UTF16_SUR_HIGH_START) shl UTF16_HALFSHIFT) + _
338  (cuint( c ) - UTF16_SUR_LOW_START) + UTF16_HALFBASE
339  end if
340  end if
341 
342  *dst = wc
343  dst += 1
344  chars += 1
345  loop
346 
347  function = chars
348 
349 end function
350 
351 '':::::
352 function hUTF16LEToUTF32BE( ) as integer static
353  dim as ulong ptr dst
354  dim as uinteger c
355  dim as integer i, chars
356 
357  chars = hUTF16LEToUTF32LE( )
358 
359  dst = cast( ulong ptr, @lex.ctx->buffw )
360  for i = 1 to chars
361  c = *dst
362  *dst = U32_SWAP( c )
363  dst += 1
364  next
365 
366  function = chars
367 
368 end function
369 
370 '':::::
371 sub lexReadUTF16LE( ) static
372  dim as integer chars
373 
374  if sizeof(wstring) = 1 then
375  chars = hUTF16LEToChar( )
376  elseif sizeof(wstring) = 2 then
377  chars = hUTF16LEToUTF16LE( )
378  else
379 #ifdef __FB_BIGENDIAN__
380  chars = hUTF16LEToUTF32BE( )
381 #else
382  chars = hUTF16LEToUTF32LE( )
383 #endif
384  end if
385 
386  lex.ctx->bufflen = chars
387  lex.ctx->buffptrw = @lex.ctx->buffw
388 
389 end sub
390 
391 ''::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
392 '' UTF-16BE
393 ''::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
394 
395 '':::::
396 function hUTF16BEToChar( ) as integer static
397  dim as ushort c
398  dim as ubyte ptr dst
399  dim as integer chars
400 
401  dst = cast( ubyte ptr, @lex.ctx->buffw )
402  chars = 0
403 
404  do while( chars < LEX_MAXBUFFCHARS )
405  if( eof( env.inf.num ) ) then
406  exit do
407  end if
408 
409  if( get( #env.inf.num, , c ) <> 0 ) then
410  exit do
411  end if
412 
413  c = U16_SWAP( c )
414 
415  if( c > 255 ) then
416  '' surrogate?
417  if( c >= UTF16_SUR_HIGH_START ) then
418  if( c <= UTF16_SUR_HIGH_END ) then
419  if( get( #env.inf.num, , c ) <> 0 ) then
420  exit do
421  end if
422  end if
423  end if
424 
425  c = asc( "?" )
426  end if
427 
428  *dst = c
429  dst += 1
430  chars += 1
431  loop
432 
433  function = chars
434 
435 end function
436 
437 '':::::
438 function hUTF16BEToUTF16LE( ) as integer static
439  dim as ushort ptr dst
440  dim as ushort c
441  dim as integer i, chars
442 
443  chars = hUTF16LEToUTF16LE( )
444 
445  dst = cast( ushort ptr, @lex.ctx->buffw )
446  for i = 1 to chars
447  c = *dst
448  *dst = U16_SWAP( c )
449  dst += 1
450  next
451 
452  function = chars
453 
454 end function
455 
456 '':::::
457 function hUTF16BEToUTF32LE( ) as integer static
458  dim as ushort c
459  dim as uinteger wc
460  dim as ulong ptr dst
461  dim as integer chars
462 
463  dst = cast( ulong ptr, @lex.ctx->buffw )
464  chars = 0
465 
466  do while( chars < LEX_MAXBUFFCHARS )
467  if( eof( env.inf.num ) ) then
468  exit do
469  end if
470 
471  if( get( #env.inf.num, , c ) <> 0 ) then
472  exit do
473  end if
474 
475  wc = U16_SWAP( c )
476 
477  '' surrogate?
478  if( wc >= UTF16_SUR_HIGH_START ) then
479  if( wc <= UTF16_SUR_HIGH_END ) then
480  if( get( #env.inf.num, , c ) <> 0 ) then
481  exit do
482  end if
483 
484  wc = ((wc - UTF16_SUR_HIGH_START) shl UTF16_HALFSHIFT) + _
485  (cuint( U16_SWAP( c ) ) - UTF16_SUR_LOW_START) + UTF16_HALFBASE
486  end if
487  end if
488 
489  *dst = wc
490  dst += 1
491  chars += 1
492  loop
493 
494  function = chars
495 
496 end function
497 
498 '':::::
499 function hUTF16BEToUTF32BE( ) as integer static
500 
501  function = hUTF16LEToUTF32LE( )
502 
503 end function
504 
505 '':::::
506 sub lexReadUTF16BE( ) static
507  dim as integer chars
508 
509  if sizeof(wstring) = 1 then
510  chars = hUTF16BEToChar( )
511  elseif sizeof(wstring) = 2 then
512  chars = hUTF16BEToUTF16LE( )
513  else
514 #ifdef __FB_BIGENDIAN__
515  chars = hUTF16BEToUTF32BE( )
516 #else
517  chars = hUTF16BEToUTF32LE( )
518 #endif
519  end if
520 
521  lex.ctx->bufflen = chars
522  lex.ctx->buffptrw = @lex.ctx->buffw
523 
524 end sub
525 
526 ''::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
527 '' UTF-32LE
528 ''::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
529 
530 '':::::
531 function hUTF32LEToChar( ) as integer static
532  dim as uinteger c
533  dim as ubyte ptr dst
534  dim as integer chars
535 
536  dst = cast( ubyte ptr, @lex.ctx->buffw )
537  chars = 0
538 
539  do while( chars < LEX_MAXBUFFCHARS )
540  if( eof( env.inf.num ) ) then
541  exit do
542  end if
543 
544  if( get( #env.inf.num, , c ) <> 0 ) then
545  exit do
546  end if
547 
548  if( c > 255 ) then
549  c = asc( "?" )
550  end if
551 
552  *dst = c
553  dst += 1
554  chars += 1
555  loop
556 
557  function = chars
558 
559 end function
560 
561 '':::::
562 function hUTF32LEToUTF16LE( ) as integer static
563  dim as uinteger c
564  dim as ushort ptr dst
565  dim as integer chars
566 
567  dst = cast( ushort ptr, @lex.ctx->buffw )
568  chars = 0
569 
570  do while( chars < LEX_MAXBUFFCHARS )
571  if( eof( env.inf.num ) ) then
572  exit do
573  end if
574 
575  if( get( #env.inf.num, , c ) <> 0 ) then
576  exit do
577  end if
578 
579  '' create surrogate?
580  if( c > UTF16_MAX_BMP ) then
581  if( chars < LEX_MAXBUFFCHARS-1 ) then
582  *dst = (c shr UTF16_HALFSHIFT) + UTF16_SUR_HIGH_START
583  dst += 1
584  chars += 1
585  end if
586 
587  c = (c and UTF16_HALFMASK) + UTF16_SUR_LOW_START
588  end if
589 
590  *dst = c
591  dst += 1
592  chars += 1
593  loop
594 
595  function = chars
596 
597 end function
598 
599 '':::::
600 function hUTF32LEToUTF32LE( ) as integer static
601 
602  if( get( #env.inf.num, , lex.ctx->buffw ) = 0 ) then
603  function = cunsg(seek( env.inf.num ) - lex.ctx->filepos) \ len( uinteger )
604  else
605  function = 0
606  end if
607 
608 end function
609 
610 '':::::
611 function hUTF32LEToUTF32BE( ) as integer static
612  dim as ulong ptr dst
613  dim as uinteger c
614  dim as integer i, chars
615 
616  chars = hUTF32LEToUTF32LE( )
617 
618  dst = cast( ulong ptr, @lex.ctx->buffw )
619  for i = 1 to chars
620  c = *dst
621  *dst = U32_SWAP( c )
622  dst += 1
623  next
624 
625  function = chars
626 
627 end function
628 
629 '':::::
631  dim as integer chars
632 
633  if sizeof(wstring) = 1 then
634  chars = hUTF32LEToChar( )
635  elseif sizeof(wstring) = 2 then
636  chars = hUTF32LEToUTF16LE( )
637  else
638 #ifdef __FB_BIGENDIAN__
639  chars = hUTF32LEToUTF32BE( )
640 #else
641  chars = hUTF32LEToUTF32LE( )
642 #endif
643  end if
644 
645  lex.ctx->bufflen = chars
646  lex.ctx->buffptrw = @lex.ctx->buffw
647 
648 end sub
649 
650 ''::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
651 '' UTF-32BE
652 ''::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
653 
654 '':::::
655 function hUTF32BEToChar( ) as integer static
656  dim as uinteger c
657  dim as ubyte ptr dst
658  dim as integer chars
659 
660  dst = cast( ubyte ptr, @lex.ctx->buffw )
661  chars = 0
662 
663  do while( chars < LEX_MAXBUFFCHARS )
664  if( eof( env.inf.num ) ) then
665  exit do
666  end if
667 
668  if( get( #env.inf.num, , c ) <> 0 ) then
669  exit do
670  end if
671 
672  c = U32_SWAP( c )
673 
674  if( c > 255 ) then
675  c = asc( "?" )
676  end if
677 
678  *dst = c
679  dst += 1
680  chars += 1
681  loop
682 
683  function = chars
684 
685 end function
686 
687 '':::::
688 function hUTF32BEToUTF16LE( ) as integer static
689  dim as uinteger c
690  dim as ushort ptr dst
691  dim as integer chars
692 
693  dst = cast( ushort ptr, @lex.ctx->buffw )
694  chars = 0
695 
696  do while( chars < LEX_MAXBUFFCHARS )
697  if( eof( env.inf.num ) ) then
698  exit do
699  end if
700 
701  if( get( #env.inf.num, , c ) <> 0 ) then
702  exit do
703  end if
704 
705  c = U32_SWAP( c )
706 
707  '' create surrogate?
708  if( c > UTF16_MAX_BMP ) then
709  if( chars < LEX_MAXBUFFCHARS-1 ) then
710  *dst = (c shr UTF16_HALFSHIFT) + UTF16_SUR_HIGH_START
711  dst += 1
712  chars += 1
713  end if
714 
715  c = (c and UTF16_HALFMASK) + UTF16_SUR_LOW_START
716  end if
717 
718  *dst = c
719  dst += 1
720  chars += 1
721  loop
722 
723  function = chars
724 
725 end function
726 
727 '':::::
728 function hUTF32BEToUTF32LE( ) as integer static
729  dim as ulong ptr dst
730  dim as uinteger c
731  dim as integer i, chars
732 
733  chars = hUTF32LEToUTF32LE( )
734 
735  dst = cast( ulong ptr, @lex.ctx->buffw )
736  for i = 1 to chars
737  c = *dst
738  *dst = U32_SWAP( c )
739  dst += 1
740  next
741 
742  function = chars
743 
744 end function
745 
746 '':::::
747 function hUTF32BEToUTF32BE( ) as integer static
748 
749  function = hUTF32LEToUTF32LE( )
750 
751 end function
752 
753 '':::::
755  dim as integer chars
756 
757  if sizeof(wstring) = 1 then
758  chars = hUTF32BEToChar( )
759  elseif sizeof(wstring) = 2 then
760  chars = hUTF32BEToUTF16LE( )
761  else
762 #ifdef __FB_BIGENDIAN__
763  chars = hUTF32BEToUTF32BE( )
764 #else
765  chars = hUTF32BEToUTF32LE( )
766 #endif
767  end if
768 
769  lex.ctx->bufflen = chars
770  lex.ctx->buffptrw = @lex.ctx->buffw
771 
772 end sub
773 
774