Jump to content

How to read UTF-8 Encoding?


mit

Recommended Posts

Hello everyone

 

I try to make code to read data from .txt file and fill to object data table in ]

 

not sure but just give a try here, sorry if not a good solution

Can't display "[color="red"]ດິນບຸກຄົນ[/color]" ?
;;;(setq remark (vk_ReadTextStream "C:/test.txt" "UTF-8"))

;; Alternative: manually copy the text from the text file, then paste it at
;; the prompt. The T flag lets the pasted text contain spaces; without it
;; getstring ends the input at the first space.
(setq remark (getstring T "\nPaste our text here -> "))
;"\U+0E94\U+0EB4\U+0E99\U+0E9A\U+0EB8\U+0E81\U+0E84\U+0EBB\U+0E99"

;; or paste into a dialog
(setq remark (lisped "paste here "))

Link to comment
Share on other sites

Some Asian text can be read correctly with the normal open function, even though it only supports ANSI.

if the initial pair is FE FF (hex) or 254 255

 

save your test.txt as Unicode

 

;; Open the file and read its first line into ret.
;; open returns nil when the file cannot be opened, so read-line and close
;; are only reached with a valid file descriptor.
(if (setq f (open path "r"))
  (progn
    (setq ret (read-line f)) ; <-- test only: 1st line
    (close f)
  )
  (princ "\nUnable to open the file.")
)

 

(defun foo ( str / bytes out ) ; read unicode - test version
  ;; hanhphuc 17.04.2018
  ;;
  ;; Converts a string whose bytes are 2-byte code units (low byte first)
  ;; into a string of "\U+XXXX" escapes, one per byte pair.
  ;; str - [str] raw string as returned by read-line
  ;; Returns: [str] concatenated "\U+XXXX" escapes ("" for an empty string)
  ;; Requires LM:dec->base.
  ;;
  ;; NOTE(review): pairs are always swapped (low byte first), so a file that
  ;; really is big-endian would decode incorrectly - confirm against the
  ;; files being read.
  (setq bytes (vl-string->list str))
  ;; Strip only a leading byte-order mark. Removing every 254/255 byte would
  ;; also destroy characters whose code unit contains FE or FF.
  (if (member (list (car bytes) (cadr bytes)) '((255 254) (254 255)))
    (setq bytes (cddr bytes))
  )
  ;; Consume two bytes per character; an unpaired trailing byte is dropped
  ;; rather than passed on as nil.
  (while (cdr bytes)
    (setq out
           (cons
             (apply 'strcat
               (vl-list* (chr 92) "U+"
                 (mapcar
                   '(lambda ( b / h )
                      (setq h (LM:dec->base b 16))
                      (if (= (strlen h) 1) (strcat "0" h) h) ; two hex digits per byte
                    )
                   (list (cadr bytes) (car bytes)) ; high byte first in the escape
                 )
               )
             )
             out
           )
          bytes (cddr bytes)
    )
  )
  (apply 'strcat (reverse out))
)

;; Decimal to Base  -  Lee Mac
;; Converts a decimal number to another base.
;; n - [int] decimal integer
;; b - [int] non-zero positive integer base
;; Returns: [str] Representation of decimal in specified base

(defun LM:dec->base ( n b )
   ;; Recursive conversion: a value smaller than the base is a single digit
   ;; character (0-9, then A onwards); anything larger is the representation
   ;; of the quotient followed by the digit for the remainder.
   (cond
       (   (< n b)
           (chr (if (< n 10) (+ n 48) (+ n 55)))
       )
       (   (strcat
               (LM:dec->base (/ n b) b)
               (LM:dec->base (rem n b) b)
           )
       )
   )
)

 

test..


(alert (foo ret ) ) 
[b][color="red"]ດິນບຸກຄົນ[/color][/b] ??

[color="green"]"\U+0E94\U+0EB4\U+0E99\U+0E9A\U+0EB8\U+0E81\U+0E84\U+0EBB\U+0E99" [/color]

 

Try if the above if working for your language?

Otherwise, plan B: read the file as a UTF-8 stream via FSO. That is more stable, but grouping the 1–4-byte sequences is harder; I will look into it if I have some time.

Link to comment
Share on other sites

Great!!!

It can work

 

Thank you very much hanhphuc

and Thank Lee Mac Code

 

you are welcome. hope you will code by yourself next time :)

 

Here are my UTF-8 functions, which may be useful in future if you have issues with the previous Unicode method. Try them & good luck..


[color="green"];Reference, post#138 
;https://stackoverflow.com/questions/643694/what-is-the-difference-between-utf-8-and-unicode[/color]

(defun UTF8->unicode ( l / ls pad8 padn chunk b2d ) ; encode UTF-8 to unicode
  ;; hanhphuc 17.04.2018
  ;;
  ;; Converts ONE UTF-8 sequence into a "\U+XXXX" escape string.
  ;; l - [lst] 1 to 4 binary strings, one per byte of the sequence,
  ;;     e.g. '("11100110" "10110001" "10001001"); leading zeros may be omitted
  ;; Returns: [str] e.g. "\\U+6C49"
  ;; Requires LM:dec->base.
  ;;
  ;; Fixes: the literal 8 in the padding test and in the chunk length had been
  ;; lost from the posted code, and the payload bits are now left-padded to
  ;; whole bytes before splitting, so 2-byte sequences give the right code point.
  ;; NOTE(review): a 4-byte sequence yields six hex digits after "\U+";
  ;; confirm how the consumer of the escape treats more than four digits.
  (setq pad8  '((s) ; left-pad a binary string with zeros to 8 digits
                (while (< (strlen s) 8) (setq s (strcat "0" s)))
                s
               )
        padn  '((s) ; left-pad a bit string to a whole number of bytes
                (while (/= 0 (rem (strlen s) 8)) (setq s (strcat "0" s)))
                s
               )
        chunk '((str) ; split a bit string into a list of 8-digit strings
                (if (> (strlen str) 0)
                  (cons (substr str 1 8) (chunk (substr str 9)))
                )
               )
        b2d   '((s / v) ; base 2 string to decimal integer
                (setq v 0)
                (foreach c (vl-string->list s)
                  (setq v (+ (* 2 v) (- c 48))) ; 48 = ASCII "0"
                )
                v
               )
        ls    (mapcar
                '(lambda ( x / h )
                   (setq h (LM:dec->base (b2d x) 16))
                   (if (= (strlen h) 1) (strcat "0" h) h) ; two hex digits per byte
                 )
                (chunk
                  (padn
                    (apply 'strcat
                      ;; keep only the payload bits of each byte: the table gives,
                      ;; per sequence length, how many low bits each byte carries
                      (mapcar '(lambda ( a x ) (substr (pad8 a) (- 9 x) x))
                              l
                              (cdr (assoc (length l) '((1 . (7)) (2 . (5 6)) (3 . (4 6 6)) (4 . (3 6 6 6)))))
                      )
                    )
                  )
                )
              )
  )
  (apply 'strcat
    (vl-list* "\\U"
              (if (> (length ls) 1)
                "+"
                "+00" ; single-byte result: pad to four hex digits
              )
              ls
    )
  )
)


(defun U8:bytes ( l / x n grp out )
  ;; hanhphuc 17.04.2018
  ;; UTF-8: split the bytes
  ;;
  ;; Groups a flat list of byte values into sub-lists, one per UTF-8 sequence.
  ;; The lead byte decides how many bytes are taken:
  ;;   192-223 -> 2, 224-239 -> 3, 240-247 -> 4, anything else -> 1.
  ;; l - [lst] byte values (integers), e.g. the result of vl-string->list
  ;; Returns: [lst] list of byte lists; nil for an empty list
  ;;
  ;; Fixes: a lead byte of 248-255 used to match no branch, so the list was
  ;; never advanced and the recursion never ended; such a byte is now emitted
  ;; as a group of one. The loop is iterative, so long inputs do not deepen
  ;; the call stack.
  (while l
    (setq x   (car l)
          n   (cond
                ((<= 192 x 223) 2)
                ((<= 224 x 239) 3)
                ((<= 240 x 247) 4)
                (1)
              )
          grp nil
    )
    ;; Take up to n bytes; a sequence cut short at the end of the list
    ;; simply yields a shorter group.
    (repeat n
      (if l
        (setq grp (cons (car l) grp)
              l   (cdr l)
        )
      )
    )
    (setq out (cons (reverse grp) out))
  )
  (reverse out)
)


 

Here's the workaround

Step 1: read file

;assume this is the read result from stream UTF-8 file contents

(setq ret [b][color="purple"]"Lee Mac & Marko Ribar\r\nHappy Birthday\r\n祝ä½*们生日快乐\r\n幸福\r\nChúc mừng sinh nháº*t\r\n"[/color][/b]
     )

 

 

step 2: convert to char list

(setq lst ([color="blue"]vl-string->list[/color] ret))

[color="green"];Decimal [/color]
'([color="darkgreen"][b]239 187  191  76   101  101  32   77	97   99	  32   38   32	 77   97   114	107  111  32   82   105	 98   97   114
    13	  10   72   97	 112  112  121	32   66	  105  114  116	 104  100  97	121  13	  10   231  165	 157  228  189
    160  228  187  172	 231  148  159	230  151  165  229  191	 171  228  185	144  13	  10   229  185	 184  231  166
    143  13   10   67	 104  195  186	99   32	  109  225  187	 171  110  103	32   115  105  110  104	 32   110  104
    225  186  173  116	 13   10[/color]
    [/b])

[color="green"];Hex [/color]
'([color="purple"]"EF" "BB"   "BF"   "4C"   "65"  "65"	 "20"	"4D"   "61"   "63"   "20"   "26"   "20"	  "4D"	 "61"	"72"   "6B"
     "6F"   "20"   "52"   "69"	  "62"	 "61"	"72"   "D"    "A"    "48"   "61"   "70"	  "70"	 "79"	"20"   "42"
     "69"   "72"   "74"   "68"	  "64"	 "61"	"79"   "D"    "A"    "E7"   "A5"   "9D"	  "E4"	 "BD"	"A0"   "E4"
     "BB"   "AC"   "E7"   "94"	  "9F"	 "E6"	"97"   "A5"   "E5"   "BF"   "AB"   "E4"	  "B9"	 "90"	"D"    "A"
     "E5"   "B9"   "B8"   "E7"	  "A6"	 "8F"	"D"    "A"    "43"   "68"   "C3"   "BA"	  "63"	 "20"	"6D"   "E1"
     "BB"   "AB"   "6E"   "67"	  "20"	 "73"	"69"   "6E"   "68"   "20"   "6E"   "68"	  "E1"	 "BA"	"AD"   "74"
     "D"    "A"[/color]
     )

 

 

;Step 3: (U8:bytes lst ) function to filter the bytes list


'((239 187 191) (76) (101) (101) (32) (77) (97) (99) (32) (38) (32) (77) (97) (114) (107) (111) (32) (82) (105) (98) (97) (114)
 (13) (10) (72) (97) (112) (112) (121) (32) (66) (105) (114) (116) (104) (100) (97) (121) (13) (10) (231 165 157) (228 189 160)
 (228 187 172) (231 148 159) (230 151 165) (229 191 171) (228 185 144) (13) (10) (229 185 184) (231 166 143) (13) (10) (67) (104)
 (195 186) (99) (32) (109) (225 187 171) (110) (103) (32) (115) (105) (110) (104) (32) (110) (104) (225 186 173) (116) (13) (10))

 

 

;Step 4: convert decimal to base 2, then apply the function UTF8->unicode to encode

 

example: 汉

(mapcar ''(( x ) ([color="blue"]LM:dec->base[/color] x 2) )'(230 177 137)) [color="green"];Hex= E6 B1 89 [/color]
(alert
([color="blue"]UTF8->unicode[/color] '( [color="purple"]"11100110""10110001""10001001"[/color] ) )
)
[b][color="red"]"\U+6C49"[/color][/b]

;; You can encode any single group of the grouped byte list (the result of
;; U8:bytes) picked with car, last or nth. The group's decimal bytes must be
;; converted to binary strings first.
(UTF8->unicode (mapcar '(lambda ( x ) (LM:dec->base x 2)) (nth 10 (U8:bytes lst))))

Finally concatenate all the encoded bytes list:

 

some screen shots

ooL2kHm.png

 

p/s: randomly tested Arabian, Chinese, Hindi, Japanese, Korean, Lao ,Punjabi, Russian, Tamil, Vietnamese etc.. still has some issues

Edited by hanhphuc
link added, 汉, date & syntax color
Link to comment
Share on other sites

  • 1 month later...

Hello

Could you please help me?

What's wrong with this code?

 

(defun c:test ()
  ;; Test command: prints the "\U+XXXX" escape of every UTF-8 sequence in a
  ;; sample string. Requires U8:bytes, UTF8->unicode and LM:dec->base.
  ;; Sets the global variables ret, lst and lstt.

  ;; Step 1: read file (a literal stands in for the file contents here)
  (setq ret "Lee Mac & Marko Ribar\r\nHappy Birthday\r\n祝ä½*们生日快乐\r\n幸福\r\nChúc mừng sinh nháº*t\r\n"
  )

  ;; Step 2: convert to char list
  (setq lst (vl-string->list ret))

  ;; Step 3: (U8:bytes lst) groups the bytes, one sub-list per sequence
  (setq lstt (U8:bytes lst))

  ;; Step 4: convert each group's bytes to base 2, then encode the group.
  ;; The loop variable txt must be evaluated, not quoted ('txt passes the
  ;; symbol itself), and the encoding has to happen inside the loop so every
  ;; group is printed.
  (foreach txt lstt
    (princ (UTF8->unicode (mapcar '(lambda ( x ) (LM:dec->base x 2)) txt)))
  )
  (princ) ; exit quietly
)

;Reference, post#138 
;https://stackoverflow.com/questions/643694/what-is-the-difference-between-utf-8-and-unicode

(defun UTF8->unicode ( l / ls pad8 padn chunk b2d ) ; encode UTF-8 to unicode
  ;; hanhphuc 17.04.2018
  ;;
  ;; Converts ONE UTF-8 sequence into a "\U+XXXX" escape string.
  ;; l - [lst] 1 to 4 binary strings, one per byte of the sequence,
  ;;     e.g. '("11100110" "10110001" "10001001"); leading zeros may be omitted
  ;; Returns: [str] e.g. "\\U+6C49"
  ;; Requires LM:dec->base.
  ;;
  ;; Fixes: the literal 8 in the padding test and in the chunk length had been
  ;; lost from the posted code, and the payload bits are now left-padded to
  ;; whole bytes before splitting, so 2-byte sequences give the right code point.
  ;; NOTE(review): a 4-byte sequence yields six hex digits after "\U+";
  ;; confirm how the consumer of the escape treats more than four digits.
  (setq pad8  '((s) ; left-pad a binary string with zeros to 8 digits
                (while (< (strlen s) 8) (setq s (strcat "0" s)))
                s
               )
        padn  '((s) ; left-pad a bit string to a whole number of bytes
                (while (/= 0 (rem (strlen s) 8)) (setq s (strcat "0" s)))
                s
               )
        chunk '((str) ; split a bit string into a list of 8-digit strings
                (if (> (strlen str) 0)
                  (cons (substr str 1 8) (chunk (substr str 9)))
                )
               )
        b2d   '((s / v) ; base 2 string to decimal integer
                (setq v 0)
                (foreach c (vl-string->list s)
                  (setq v (+ (* 2 v) (- c 48))) ; 48 = ASCII "0"
                )
                v
               )
        ls    (mapcar
                '(lambda ( x / h )
                   (setq h (LM:dec->base (b2d x) 16))
                   (if (= (strlen h) 1) (strcat "0" h) h) ; two hex digits per byte
                 )
                (chunk
                  (padn
                    (apply 'strcat
                      ;; keep only the payload bits of each byte: the table gives,
                      ;; per sequence length, how many low bits each byte carries
                      (mapcar '(lambda ( a x ) (substr (pad8 a) (- 9 x) x))
                              l
                              (cdr (assoc (length l) '((1 . (7)) (2 . (5 6)) (3 . (4 6 6)) (4 . (3 6 6 6)))))
                      )
                    )
                  )
                )
              )
  )
  (apply 'strcat
    (vl-list* "\\U"
              (if (> (length ls) 1)
                "+"
                "+00" ; single-byte result: pad to four hex digits
              )
              ls
    )
  )
)


(defun U8:bytes ( l / x n grp out )
  ;; hanhphuc 17.04.2018
  ;; UTF-8: split the bytes
  ;;
  ;; Groups a flat list of byte values into sub-lists, one per UTF-8 sequence.
  ;; The lead byte decides how many bytes are taken:
  ;;   192-223 -> 2, 224-239 -> 3, 240-247 -> 4, anything else -> 1.
  ;; l - [lst] byte values (integers), e.g. the result of vl-string->list
  ;; Returns: [lst] list of byte lists; nil for an empty list
  ;;
  ;; Fixes: a lead byte of 248-255 used to match no branch, so the list was
  ;; never advanced and the recursion never ended; such a byte is now emitted
  ;; as a group of one. The loop is iterative, so long inputs do not deepen
  ;; the call stack.
  (while l
    (setq x   (car l)
          n   (cond
                ((<= 192 x 223) 2)
                ((<= 224 x 239) 3)
                ((<= 240 x 247) 4)
                (1)
              )
          grp nil
    )
    ;; Take up to n bytes; a sequence cut short at the end of the list
    ;; simply yields a shorter group.
    (repeat n
      (if l
        (setq grp (cons (car l) grp)
              l   (cdr l)
        )
      )
    )
    (setq out (cons (reverse grp) out))
  )
  (reverse out)
)

(defun foo ( str / bytes out ) ; read unicode - test version
  ;; hanhphuc 17.04.2018
  ;;
  ;; Converts a string whose bytes are 2-byte code units (low byte first)
  ;; into a string of "\U+XXXX" escapes, one per byte pair.
  ;; str - [str] raw string as returned by read-line
  ;; Returns: [str] concatenated "\U+XXXX" escapes ("" for an empty string)
  ;; Requires LM:dec->base.
  ;;
  ;; NOTE(review): pairs are always swapped (low byte first), so a file that
  ;; really is big-endian would decode incorrectly - confirm against the
  ;; files being read.
  (setq bytes (vl-string->list str))
  ;; Strip only a leading byte-order mark. Removing every 254/255 byte would
  ;; also destroy characters whose code unit contains FE or FF.
  (if (member (list (car bytes) (cadr bytes)) '((255 254) (254 255)))
    (setq bytes (cddr bytes))
  )
  ;; Consume two bytes per character; an unpaired trailing byte is dropped
  ;; rather than passed on as nil.
  (while (cdr bytes)
    (setq out
           (cons
             (apply 'strcat
               (vl-list* (chr 92) "U+"
                 (mapcar
                   '(lambda ( b / h )
                      (setq h (LM:dec->base b 16))
                      (if (= (strlen h) 1) (strcat "0" h) h) ; two hex digits per byte
                    )
                   (list (cadr bytes) (car bytes)) ; high byte first in the escape
                 )
               )
             )
             out
           )
          bytes (cddr bytes)
    )
  )
  (apply 'strcat (reverse out))
)

;; Decimal to Base  -  Lee Mac
;; Converts a decimal number to another base.
;; n - [int] decimal integer
;; b - [int] non-zero positive integer base
;; Returns: [str] Representation of decimal in specified base

(defun LM:dec->base ( n b )
   ;; Recursive conversion: a value smaller than the base is a single digit
   ;; character (0-9, then A onwards); anything larger is the representation
   ;; of the quotient followed by the digit for the remainder.
   (cond
       (   (< n b)
           (chr (if (< n 10) (+ n 48) (+ n 55)))
       )
       (   (strcat
               (LM:dec->base (/ n b) b)
               (LM:dec->base (rem n b) b)
           )
       )
   )
)

:cry:

Link to comment
Share on other sites

Hello

Could you please help me?

What's wrong with this code?

 

(defun c:test ()
  ;; Test command: prints the "\U+XXXX" escape of every UTF-8 sequence in a
  ;; sample string. Requires U8:bytes, UTF8->unicode and LM:dec->base.
  ;; Sets the global variables ret, lst and lstt.

  ;; Step 1: read file (a literal stands in for the file contents here)
  (setq ret "Lee Mac & Marko Ribar\r\nHappy Birthday\r\n祝ä½*们生日快乐\r\n幸福\r\nChúc mừng sinh nháº*t\r\n"
  )

  ;; Step 2: convert to char list
  (setq lst (vl-string->list ret))

  ;; Step 3: (U8:bytes lst) groups the bytes, one sub-list per sequence
  (setq lstt (U8:bytes lst))

  ;; Step 4: convert each group's bytes to base 2, then encode the group.
  ;; The originally posted code passed 'txt (the quoted symbol) instead of
  ;; txt (its value) and called UTF8->unicode outside the loop; the loop
  ;; variable must be evaluated and each group encoded inside the loop.
  (foreach txt lstt
    (princ (UTF8->unicode (mapcar '(lambda ( x ) (LM:dec->base x 2)) txt)))
  )
  (princ) ; exit quietly
)

:cry:

 

:thumbsup: Good effort, no worries, we learn from mistakes. 'txt = (quote txt)

QUOTE function returns without evaluating argument

 

(setq txt "HELLO")
( princ [color="red"][b]'[/b][/color]txt ) 
TXT
( princ txt )
"HELLO"

 

so change to this

;Step 4: convert decimal to base 2, then apply the function UTF8->unicode to encode
;; txt is evaluated (not quoted), so each byte group in lstt is converted
;; to binary strings, encoded and printed in turn.
(foreach txt lstt
  (princ
    (UTF8->unicode
      (mapcar '(lambda ( byte ) (LM:dec->base byte 2)) txt)
    )
  )
)
(princ)

be happy coding :)

 

p/s: use read-char for reading unicode file

Link to comment
Share on other sites

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.
Note: Your post will require moderator approval before it will be visible.

Guest
Unfortunately, your content contains terms that we do not allow. Please edit your content to remove the highlighted words below.
Reply to this topic...

×   Pasted as rich text.   Restore formatting

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

×
×
  • Create New...