The string.byte and string.char functions can be used to compute the length of, and extract substrings from, a string that contains characters outside the English alphabet (i.e. characters encoded with more than one byte):
--- Character-counting replacement for string.len on UTF-8 text.
-- Counts one character per UTF-8 lead byte instead of one per byte, so
-- 2-, 3- and 4-byte sequences each contribute exactly 1 to the result.
-- Note: this overrides the standard library function for the whole program.
-- @param s string assumed to be UTF-8 encoded
-- @return number of characters in s
string.len=function(s)
  local len, k = 0, 1
  local nbytes = #s -- the '#' operator is not affected by this override
  while k <= nbytes do
    local lead = string.byte(s, k)
    len = len + 1
    -- The lead byte announces how many bytes the character occupies.
    if lead >= 240 then
      k = k + 4
    elseif lead >= 224 then
      k = k + 3
    elseif lead >= 192 then
      k = k + 2
    else
      -- ASCII, or a stray continuation byte counted as one character
      k = k + 1
    end
  end
  return len
end
--- Character-indexed replacement for string.sub on UTF-8 text.
-- Splits s into UTF-8 characters (1 to 4 bytes each, decided by the lead
-- byte) and returns characters i..j joined together. Indices follow the
-- conventions of the standard string.sub: i defaults to 1, j defaults to -1,
-- negative indices count from the end, and out-of-range indices are clamped.
-- Note: this overrides the standard library function for the whole program.
-- @param s string assumed to be UTF-8 encoded
-- @param i index of the first character (optional)
-- @param j index of the last character (optional)
-- @return the selected characters, or "" when the range is empty
string.sub=function(s,i,j)
  local chars = {}
  local k, nbytes = 1, #s
  while k <= nbytes do
    local lead = string.byte(s, k)
    local width
    if lead >= 240 then
      width = 4
    elseif lead >= 224 then
      width = 3
    elseif lead >= 192 then
      width = 2
    else
      width = 1
    end
    -- string.byte with a range returns only the bytes that exist, so a
    -- sequence truncated at the end of s cannot pass nil to string.char.
    chars[#chars + 1] = string.char(string.byte(s, k, k + width - 1))
    k = k + width
  end

  local n = #chars
  i = i or 1
  j = j or -1
  if i < 0 then i = n + i + 1 end
  if j < 0 then j = n + j + 1 end
  if i < 1 then i = 1 end
  if j > n then j = n end
  if i > j then return "" end
  return table.concat(chars, "", i, j)
end
usage:
-- Demo: print the length reported by string.len, then print each
-- single-index substring on its own line.
local word="a?r??jämnñ"
local count=string.len(word)
print(count)
local pos=1
while pos<=count do
  print(string.sub(word,pos,pos))
  pos=pos+1
end
This works for European languages, where each non-ASCII character takes two bytes, but I think that with a couple of changes to the number of bytes consumed for each glyph, it can work for other languages too.
[import]uid: 6459 topic_id: 1633 reply_id: 5009[/import]
tetu,
This looks to be a very hopeful set of functions for something I’ve had problems with for a while. Do you know if there is any way to make this work with Chinese and Japanese characters? I don’t know what you meant by “…a couple of changes that have to do with the bytes used for each glyph, it can work for other languages too”.
Please advise me if you can. [import]uid: 62617 topic_id: 1633 reply_id: 145198[/import]
julius,
the workaround is no good when it comes to Chinese or Japanese characters, because each glyph in these languages takes 3 bytes
check out this library https://gist.github.com/gdeglin/4128882
tetu,
This looks to be a very hopeful set of functions for something I’ve had problems with for a while. Do you know if there is any way to make this work with Chinese and Japanese characters? I don’t know what you meant by “…a couple of changes that have to do with the bytes used for each glyph, it can work for other languages too”.
Please advise me if you can. [import]uid: 62617 topic_id: 1633 reply_id: 145198[/import]
julius,
the workaround is no good when it comes to Chinese or Japanese characters, because each glyph in these languages takes 3 bytes
check out this library https://gist.github.com/gdeglin/4128882
tetu,
This looks to be a very hopeful set of functions for something I’ve had problems with for a while. Do you know if there is any way to make this work with Chinese and Japanese characters? I don’t know what you meant by “…a couple of changes that have to do with the bytes used for each glyph, it can work for other languages too”.
Please advise me if you can. [import]uid: 62617 topic_id: 1633 reply_id: 145198[/import]
julius,
the workaround is no good when it comes to Chinese or Japanese characters, because each glyph in these languages takes 3 bytes
check out this library https://gist.github.com/gdeglin/4128882
tetu,
This looks to be a very hopeful set of functions for something I’ve had problems with for a while. Do you know if there is any way to make this work with Chinese and Japanese characters? I don’t know what you meant by “…a couple of changes that have to do with the bytes used for each glyph, it can work for other languages too”.
Please advise me if you can. [import]uid: 62617 topic_id: 1633 reply_id: 145198[/import]
julius,
the workaround is no good when it comes to Chinese or Japanese characters, because each glyph in these languages takes 3 bytes
check out this library https://gist.github.com/gdeglin/4128882
dingo, yes I think I have got it working for non western character languages. Is that what you meant by “write” japanese?
I still have to get all my content properly translated but using the following functions for string manipulation seemed to work :
-- Returns the number of bytes used by the UTF-8 character at byte i in s.
-- Also doubles as a UTF-8 lead-byte validator: returns nil when there is no
-- byte at position i or when that byte cannot start a UTF-8 character.
-- Byte ranges are based on RFC 3629.
function utf8charbytes(s, i)
  -- argument defaults
  i = i or 1
  local c = string.byte(s, i)

  if c == nil then
    -- i is past the end of s
    return nil
  end

  if c <= 127 then
    -- UTF8-1 (includes NUL)
    return 1
  elseif c >= 194 and c <= 223 then
    -- UTF8-2
    return 2
  elseif c >= 224 and c <= 239 then
    -- UTF8-3
    return 3
  elseif c >= 240 and c <= 244 then
    -- UTF8-4
    return 4
  end

  -- continuation byte (128-191) or a byte that never appears in UTF-8
  return nil
end

-- Returns the number of characters in a UTF-8 string.
-- A byte that cannot start a character is counted as one character so that
-- malformed input never stalls or crashes the scan.
function utf8len(s)
  local pos = 1
  local bytes = #s
  local len = 0

  while pos <= bytes do
    len = len + 1
    pos = pos + (utf8charbytes(s, pos) or 1)
  end

  return len
end

-- Functions identically to string.sub except that i and j are UTF-8
-- characters instead of bytes. Relies on the standard byte-indexed
-- string.sub for the final extraction.
function utf8sub(s, i, j)
  j = j or -1

  if i == nil then
    return ""
  end

  local pos = 1
  local bytes = #s
  local len = 0

  -- the character count is only needed when i or j is negative
  local total
  if i < 0 or j < 0 then
    total = utf8len(s)
  end
  local startChar = (i >= 0) and i or total + i + 1
  local endChar = (j >= 0) and j or total + j + 1

  -- can't have start before end, and nothing lies before character 1
  if startChar > endChar or endChar < 1 then
    return ""
  end

  -- Byte offsets to pass to string.sub. A start before the first character
  -- clamps to byte 1; otherwise it stays past the end unless the scan below
  -- actually reaches startChar.
  local startByte = (startChar < 1) and 1 or bytes + 1
  local endByte = bytes

  while pos <= bytes do
    len = len + 1

    if len == startChar then
      startByte = pos
    end

    pos = pos + (utf8charbytes(s, pos) or 1)

    if len == endChar then
      endByte = pos - 1
      break
    end
  end

  return string.sub(s, startByte, endByte)
end

-- Replaces UTF-8 characters based on a mapping table: every character that
-- is a key of mapping is replaced by the mapped value, others are kept.
function utf8replace(s, mapping)
  local pos = 1
  local bytes = #s
  local pieces = {}

  while pos <= bytes do
    local charbytes = utf8charbytes(s, pos) or 1
    local c = string.sub(s, pos, pos + charbytes - 1)

    pieces[#pieces + 1] = mapping[c] or c
    pos = pos + charbytes
  end

  return table.concat(pieces)
end
dingo, yes I think I have got it working for non western character languages. Is that what you meant by “write” japanese?
I still have to get all my content properly translated but using the following functions for string manipulation seemed to work :
-- Returns the number of bytes used by the UTF-8 character at byte i in s.
-- Also doubles as a UTF-8 lead-byte validator: returns nil when there is no
-- byte at position i or when that byte cannot start a UTF-8 character.
-- Byte ranges are based on RFC 3629.
function utf8charbytes(s, i)
  -- argument defaults
  i = i or 1
  local c = string.byte(s, i)

  if c == nil then
    -- i is past the end of s
    return nil
  end

  if c <= 127 then
    -- UTF8-1 (includes NUL)
    return 1
  elseif c >= 194 and c <= 223 then
    -- UTF8-2
    return 2
  elseif c >= 224 and c <= 239 then
    -- UTF8-3
    return 3
  elseif c >= 240 and c <= 244 then
    -- UTF8-4
    return 4
  end

  -- continuation byte (128-191) or a byte that never appears in UTF-8
  return nil
end

-- Returns the number of characters in a UTF-8 string.
-- A byte that cannot start a character is counted as one character so that
-- malformed input never stalls or crashes the scan.
function utf8len(s)
  local pos = 1
  local bytes = #s
  local len = 0

  while pos <= bytes do
    len = len + 1
    pos = pos + (utf8charbytes(s, pos) or 1)
  end

  return len
end

-- Functions identically to string.sub except that i and j are UTF-8
-- characters instead of bytes. Relies on the standard byte-indexed
-- string.sub for the final extraction.
function utf8sub(s, i, j)
  j = j or -1

  if i == nil then
    return ""
  end

  local pos = 1
  local bytes = #s
  local len = 0

  -- the character count is only needed when i or j is negative
  local total
  if i < 0 or j < 0 then
    total = utf8len(s)
  end
  local startChar = (i >= 0) and i or total + i + 1
  local endChar = (j >= 0) and j or total + j + 1

  -- can't have start before end, and nothing lies before character 1
  if startChar > endChar or endChar < 1 then
    return ""
  end

  -- Byte offsets to pass to string.sub. A start before the first character
  -- clamps to byte 1; otherwise it stays past the end unless the scan below
  -- actually reaches startChar.
  local startByte = (startChar < 1) and 1 or bytes + 1
  local endByte = bytes

  while pos <= bytes do
    len = len + 1

    if len == startChar then
      startByte = pos
    end

    pos = pos + (utf8charbytes(s, pos) or 1)

    if len == endChar then
      endByte = pos - 1
      break
    end
  end

  return string.sub(s, startByte, endByte)
end

-- Replaces UTF-8 characters based on a mapping table: every character that
-- is a key of mapping is replaced by the mapped value, others are kept.
function utf8replace(s, mapping)
  local pos = 1
  local bytes = #s
  local pieces = {}

  while pos <= bytes do
    local charbytes = utf8charbytes(s, pos) or 1
    local c = string.sub(s, pos, pos + charbytes - 1)

    pieces[#pieces + 1] = mapping[c] or c
    pos = pos + charbytes
  end

  return table.concat(pieces)
end