Sorting strings with French accents

nmichaud · September 18, 2012, 6:57am

Hi,

We are building a multilingual application. In French we need to sort strings. The problem is that table.sort does not handle French accents.

Does anybody have a solution for removing accents before sorting? Any magical recipes?

thanks for your help
Nick [import]uid: 28795 topic_id: 31107 reply_id: 331107[/import]

vovasoft · September 18, 2012, 8:22am

Maybe you should get the byte sum of the strings:
[lua]-- Sum the byte values of a string. Every byte of a multi-byte UTF-8
-- character (such as "â") is added separately.
-- NOTE(review): a byte sum ignores character order, so anagrams produce the
-- same value; on its own it is not a reliable sort key.
local function byteSum(str)
  local sum = 0
  for i = 1, string.len(str) do
    sum = sum + string.byte(str, i)
  end
  return sum
end

local a = byteSum("châtain")
local b = byteSum("château")
-- I think that you will get that a < b[/lua]
http://docs.coronalabs.com/api/library/string/byte.html
http://docs.coronalabs.com/api/library/string/len.html
What do you think ? [import]uid: 138389 topic_id: 31107 reply_id: 124369[/import]

Darkmod · September 18, 2012, 12:18pm

Here are the functions that I have gathered:

[lua]--- Encode a Unicode code point as a UTF-8 string.
-- @param Unicode  code point as a number
-- @return the 1- to 4-byte UTF-8 encoding, or "" when the code point is
--         above 0x10FFFF (the largest value UTF-8 can represent)
local function CodeToUTF8 (Unicode)

  -- 1 byte: 0xxxxxxx
  if (Unicode <= 0x7F) then return string.char(Unicode); end;

  -- 2 bytes: 110xxxxx 10xxxxxx
  if (Unicode <= 0x7FF) then
    local Byte0 = 0xC0 + math.floor(Unicode / 0x40);
    local Byte1 = 0x80 + (Unicode % 0x40);
    return string.char(Byte0, Byte1);
  end;

  -- 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
  if (Unicode <= 0xFFFF) then
    local Byte0 = 0xE0 + math.floor(Unicode / 0x1000);
    local Byte1 = 0x80 + (math.floor(Unicode / 0x40) % 0x40);
    local Byte2 = 0x80 + (Unicode % 0x40);
    return string.char(Byte0, Byte1, Byte2);
  end;

  -- 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  -- (mirrors the 4-byte branch of CodeFromUTF8)
  if (Unicode <= 0x10FFFF) then
    local Byte0 = 0xF0 + math.floor(Unicode / 0x40000);
    local Byte1 = 0x80 + (math.floor(Unicode / 0x1000) % 0x40);
    local Byte2 = 0x80 + (math.floor(Unicode / 0x40) % 0x40);
    local Byte3 = 0x80 + (Unicode % 0x40);
    return string.char(Byte0, Byte1, Byte2, Byte3);
  end;

  return "";

end;
if not string.CodeToUTF8 then string.CodeToUTF8 = CodeToUTF8 end

--- Decode the first UTF-8 character of a string into its code point.
-- Continuation bytes are not validated; only their low six bits are used.
-- @param UTF8  string whose first character is decoded
-- @return the code point as a number, or nil when the string is empty,
--         the sequence is truncated, or the first byte cannot start a
--         1- to 4-byte sequence
local function CodeFromUTF8 (UTF8)
  local Byte0, Byte1, Byte2, Byte3 = string.byte(UTF8, 1, 4);
  if (Byte0 == nil) then return nil; end;

  -- 0xxxxxxx: plain ASCII
  if (Byte0 < 0x80) then return Byte0; end;

  -- 10xxxxxx is a stray continuation byte, not a lead byte
  if (Byte0 < 0xC0 or Byte1 == nil) then return nil; end;
  Byte1 = Byte1 % 0x40;

  -- 110xxxxx: two-byte sequence
  if (Byte0 < 0xE0) then
    return (Byte0 % 0x20)*0x40 + Byte1;
  end;

  if (Byte2 == nil) then return nil; end;
  Byte2 = Byte2 % 0x40;

  -- 1110xxxx: three-byte sequence
  if (Byte0 < 0xF0) then
    return (Byte0 % 0x10)*0x1000 + Byte1*0x40 + Byte2;
  end;

  -- 11110xxx: four-byte sequence; anything from 0xF8 up is not a lead byte
  if (Byte0 >= 0xF8 or Byte3 == nil) then return nil; end;
  return (Byte0 % 0x08)*0x40000 + Byte1*0x1000 + Byte2*0x40 + (Byte3 % 0x40);
end;
if not string.CodeFromUTF8 then string.CodeFromUTF8 = CodeFromUTF8 end

--- Return the number of bytes used by the UTF-8 character starting at
-- byte i of s. Only the lead byte is inspected.
-- @param s  string to inspect
-- @param i  byte index of the character's first byte (default 1)
-- @return 2, 3 or 4 for a multi-byte lead byte; 1 for ASCII (including the
--         NUL byte) and for any byte that cannot start a multi-byte
--         sequence, so callers always advance; nil when i is outside s
local function utf8charbytes (s, i)
  -- argument defaults
  i = i or 1
  local c = string.byte(s, i)

  if c == nil then
    return nil
  end

  -- determine bytes needed for character, based on RFC 3629
  if c >= 194 and c <= 223 then
    -- UTF8-2
    return 2
  elseif c >= 224 and c <= 239 then
    -- UTF8-3
    return 3
  elseif c >= 240 and c <= 244 then
    -- UTF8-4
    return 4
  end

  -- UTF8-1, or a byte that is not a valid lead byte: treat as one byte
  return 1
end
if not string.utf8charbytes then string.utf8charbytes = utf8charbytes end

--- Return the number of characters in a UTF-8 string.
-- @param s  UTF-8 encoded string
-- @return character count (0 for the empty string)
local function utf8len(s)
  local pos = 1
  local bytes = string.len(s)
  local len = 0

  while pos <= bytes do
    len = len + 1
    -- step over one whole character; advance a single byte if the helper
    -- reports no length, so the loop always terminates
    pos = pos + (utf8charbytes(s, pos) or 1)
  end

  return len
end
if not string.utf8len then string.utf8len = utf8len end

--- Works like string.sub, except that i and j count UTF-8 characters
-- instead of bytes.
-- @param s  UTF-8 encoded string
-- @param i  first character (negative values count from the end);
--           nil yields ""
-- @param j  last character, inclusive (default -1, the last character)
-- @return the selected substring, or "" when the range is empty
local function utf8sub (s, i, j)
  j = j or -1

  if i == nil then
    return ""
  end

  local bytes = string.len(s)

  -- the character count is only needed to resolve negative indices
  local total = 0
  if i < 0 or j < 0 then
    total = utf8len(s)
  end
  local startChar = (i >= 0) and i or total + i + 1
  local endChar = (j >= 0) and j or total + j + 1

  -- like string.sub, a start before the first character means "from 1"
  if startChar < 1 then
    startChar = 1
  end

  -- can't have start before end!
  if startChar > endChar then
    return ""
  end

  -- byte offsets to pass to string.sub; startByte begins past the end so a
  -- start index beyond the last character selects nothing
  local startByte, endByte = bytes + 1, bytes
  local pos = 1
  local len = 0

  while pos <= bytes do
    len = len + 1

    if len == startChar then
      startByte = pos
    end

    pos = pos + (utf8charbytes(s, pos) or 1)

    if len == endChar then
      endByte = pos - 1
      break
    end
  end

  return string.sub(s, startByte, endByte)
end
if not string.utf8sub then string.utf8sub = utf8sub end

--- Replace UTF-8 characters based on a mapping table.
-- @param s        UTF-8 encoded string
-- @param mapping  table keyed by single characters; a character with no
--                 entry is copied unchanged
-- @return the string with every mapped character replaced
local function utf8replace (s, mapping)
  local pos = 1
  local bytes = string.len(s)
  -- pieces are collected and joined once, instead of growing a string
  -- inside the loop
  local parts = {}

  while pos <= bytes do
    local charbytes = utf8charbytes(s, pos) or 1
    local c = string.sub(s, pos, pos + charbytes - 1)
    parts[#parts + 1] = mapping[c] or c
    pos = pos + charbytes
  end

  return table.concat(parts)
end
if not string.utf8replace then string.utf8replace = utf8replace end

--- Split a string into the pieces found between matches of a separator.
-- An empty piece before a separator at the very start of the string and an
-- empty piece after a separator at the very end are not returned; empty
-- pieces between two consecutive separators are.
-- @param str  string to split
-- @param pat  separator, interpreted as a Lua pattern (escape magic
--             characters with %)
-- @return array of pieces
local function split(str, pat)
  local t = {}
  local fpat = "(.-)" .. pat
  local last_end = 1
  local s, e, cap = str:find(fpat, 1)
  while s do
    -- a zero-length match would never advance last_end; stop scanning and
    -- let the remainder be returned as the final piece
    if e < s then
      break
    end
    if s ~= 1 or cap ~= "" then
      t[#t + 1] = cap
    end
    last_end = e + 1
    s, e, cap = str:find(fpat, last_end)
  end
  if last_end <= #str then
    t[#t + 1] = str:sub(last_end)
  end
  return t
end
if not string.split then string.split = split end [import]uid: 7177 topic_id: 31107 reply_id: 124408[/import]

vovasoft · September 18, 2012, 12:50pm

Is this your solution?
How do you use these methods? [import]uid: 138389 topic_id: 31107 reply_id: 124419[/import]

nmichaud · September 18, 2012, 4:37pm

Thanks for the proposals, but I have found a method that suits our needs. In case somebody finds it useful, here it is. Feel free to improve it and share it.

-- Maps an accented Latin character (as a UTF-8 string) to its unaccented
-- replacement. The ligatures expand to two letters so that, for example,
-- "cœur" is normalized to "coeur".
local tableAccents = {
  -- lower case
  ["à"] = "a", ["á"] = "a", ["â"] = "a", ["ã"] = "a", ["ä"] = "a",
  ["ç"] = "c",
  ["è"] = "e", ["é"] = "e", ["ê"] = "e", ["ë"] = "e",
  ["ì"] = "i", ["í"] = "i", ["î"] = "i", ["ï"] = "i",
  ["ñ"] = "n",
  ["ò"] = "o", ["ó"] = "o", ["ô"] = "o", ["õ"] = "o", ["ö"] = "o",
  ["ù"] = "u", ["ú"] = "u", ["û"] = "u", ["ü"] = "u",
  ["ý"] = "y", ["ÿ"] = "y",
  ["æ"] = "ae", ["œ"] = "oe",
  -- upper case
  ["À"] = "A", ["Á"] = "A", ["Â"] = "A", ["Ã"] = "A", ["Ä"] = "A",
  ["Ç"] = "C",
  ["È"] = "E", ["É"] = "E", ["Ê"] = "E", ["Ë"] = "E",
  ["Ì"] = "I", ["Í"] = "I", ["Î"] = "I", ["Ï"] = "I",
  ["Ñ"] = "N",
  ["Ò"] = "O", ["Ó"] = "O", ["Ô"] = "O", ["Õ"] = "O", ["Ö"] = "O",
  ["Ù"] = "U", ["Ú"] = "U", ["Û"] = "U", ["Ü"] = "U",
  ["Ý"] = "Y", ["Ÿ"] = "Y",
  ["Æ"] = "AE", ["Œ"] = "OE",
}

--- Normalize a given string by replacing every character that has an entry
-- in tableAccents with its replacement. Everything else, including bytes
-- that do not form a character matched by the pattern, is copied unchanged.
-- @param str  UTF-8 encoded string
-- @return the normalized string
function string.stripAccents( str )

  -- One character = a lead byte followed by its continuation bytes. With a
  -- table as the replacement, gsub keeps the original text whenever the
  -- table has no entry for the match.
  local normalizedString = string.gsub(str, "([%z\1-\127\194-\244][\128-\191]*)", tableAccents)

  return normalizedString

end [import]uid: 28795 topic_id: 31107 reply_id: 124449[/import]

vovasoft · September 18, 2012, 5:36pm

I like your idea, very simple. [import]uid: 138389 topic_id: 31107 reply_id: 124451[/import]

vovasoft · September 18, 2012, 8:22am

Maybe you should get the byte sum of the strings:
[lua]-- Sum the byte values of a string. Every byte of a multi-byte UTF-8
-- character (such as "â") is added separately.
-- NOTE(review): a byte sum ignores character order, so anagrams produce the
-- same value; on its own it is not a reliable sort key.
local function byteSum(str)
  local sum = 0
  for i = 1, string.len(str) do
    sum = sum + string.byte(str, i)
  end
  return sum
end

local a = byteSum("châtain")
local b = byteSum("château")
-- I think that you will get that a < b[/lua]
http://docs.coronalabs.com/api/library/string/byte.html
http://docs.coronalabs.com/api/library/string/len.html
What do you think ? [import]uid: 138389 topic_id: 31107 reply_id: 124369[/import]

Darkmod · September 18, 2012, 12:18pm