You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
64 lines
1.8 KiB
64 lines
1.8 KiB
# -*- coding: latin-1 -*-
# Notice the encoding string above!
#

# $Id$
# Use a dynamically populated translation dictionary to remove accents
# from a string.
#
# This was originally borrowed from:
# http://svn.effbot.org/public/stuff/sandbox/text/unaccent.py
#
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
import unicodedata, sys
|
|
|
|
# Fallback replacements, keyed by character code, for the Latin-1 letters
# that have no Unicode decomposition and so need a hand-picked ASCII
# spelling.
CHAR_REPLACEMENT = {
    0xC6: u"AE",  # U+00C6 LATIN CAPITAL LETTER AE
    0xD0: u"D",   # U+00D0 LATIN CAPITAL LETTER ETH
    0xD8: u"OE",  # U+00D8 LATIN CAPITAL LETTER O WITH STROKE
    0xDE: u"Th",  # U+00DE LATIN CAPITAL LETTER THORN
    0xDF: u"ss",  # U+00DF LATIN SMALL LETTER SHARP S
    0xE6: u"ae",  # U+00E6 LATIN SMALL LETTER AE
    0xF0: u"d",   # U+00F0 LATIN SMALL LETTER ETH
    0xF8: u"oe",  # U+00F8 LATIN SMALL LETTER O WITH STROKE
    0xFE: u"th",  # U+00FE LATIN SMALL LETTER THORN
}
|
|
|
|
##
# Translation dictionary.  Entries are computed and stored in this
# dictionary the first time a character code is looked up.
|
|
|
|
class unaccented_map(dict):
    """Lazily populated ``translate()`` table that removes accents.

    Keys are integer character codes.  Values are either a replacement
    character code or a replacement unicode string.  Missing entries are
    computed by ``mapchar`` on first lookup and stored in the dictionary.
    """

    def mapchar(self, key):
        """Return the unaccented replacement for character code ``key``.

        The first field of the character's Unicode decomposition is taken
        as its base character, and that base is mapped in turn.  This way
        characters carrying several marks are reduced all the way
        (U+1EA5 -> U+00E2 -> U+0061), and a base that is listed in
        ``CHAR_REPLACEMENT`` gets its replacement text
        (U+01FC -> U+00C6 -> "AE").  Characters whose decomposition starts
        with a tag such as ``<compat>`` are left unchanged, and characters
        without any decomposition are looked up in ``CHAR_REPLACEMENT``.

        key -- integer character code
        returns -- an integer character code or a unicode string; the
                   result is also cached in the dictionary under ``key``
        """
        cached = self.get(key)
        if cached is not None:
            return cached

        if sys.version_info >= (3, 0):
            char = chr(key)
        else:
            char = unichr(key)  # noqa: F821 -- builtin on Python 2 only

        decomposition = unicodedata.decomposition(char)
        if decomposition:
            try:
                base = int(decomposition.split(None, 1)[0], 16)
            except (IndexError, ValueError):
                # First field is a formatting tag ("<compat>", "<font>",
                # ...) rather than a code point: keep the character as is.
                replacement = key
            else:
                # A single decomposition step may still leave an accented
                # letter, so resolve the base character the same way.
                replacement = self.mapchar(base)
        else:
            replacement = CHAR_REPLACEMENT.get(key, key)

        self[key] = replacement
        return replacement

    # Compare version tuples, not version strings: string comparison is
    # lexicographic and would misorder e.g. "10.0" and "2.5".
    if sys.version_info >= (2, 5):
        # use __missing__ where available
        __missing__ = mapchar
    else:
        # otherwise, use standard __getitem__ hook (this is slower,
        # since it's called for each character)
        __getitem__ = mapchar
|
|
|