# -*- coding: latin-1 -*-
# Notice the encoding string above!
#
# $Id$
# use a dynamically populated translation dictionary to remove accents
# from a string
#
# This was originally borrowed from:
# http://svn.effbot.org/public/stuff/sandbox/text/unaccent.py
#

from __future__ import unicode_literals

import sys
import unicodedata

try:
    # Python 2: chr() only covers byte values, unichr() is the unicode one.
    _unichr = unichr
except NameError:
    # Python 3: chr() already returns a unicode character.
    _unichr = chr

CHAR_REPLACEMENT = {
    # latin-1 characters that don't have a unicode decomposition
    0xc6: u"AE",  # LATIN CAPITAL LETTER AE
    0xd0: u"D",   # LATIN CAPITAL LETTER ETH
    0xd8: u"OE",  # LATIN CAPITAL LETTER O WITH STROKE
    0xde: u"Th",  # LATIN CAPITAL LETTER THORN
    0xdf: u"ss",  # LATIN SMALL LETTER SHARP S
    0xe6: u"ae",  # LATIN SMALL LETTER AE
    0xf0: u"d",   # LATIN SMALL LETTER ETH
    0xf8: u"oe",  # LATIN SMALL LETTER O WITH STROKE
    0xfe: u"th",  # LATIN SMALL LETTER THORN
}


##
# Translation dictionary.  Translation entries are added to this
# dictionary as needed.

class unaccented_map(dict):
    """Lazily populated translation table that strips accents.

    Keys are unicode code points (ints); values are either a replacement
    code point (int) or a replacement unicode string.  Entries are
    computed by ``mapchar`` the first time a key is looked up and are
    cached in the dictionary afterwards.
    """

    ##
    # Maps a unicode character code (the key) to a replacement code
    # (either a character code or a unicode string).

    def mapchar(self, key):
        """Return the unaccented replacement for code point *key*.

        The result is stored in the dictionary before it is returned.

        key -- integer unicode code point.

        Returns an int code point or a unicode string.  Characters with
        neither a usable decomposition nor a ``CHAR_REPLACEMENT`` entry
        map to themselves.
        """
        ch = self.get(key)
        if ch is not None:
            return ch
        de = unicodedata.decomposition(_unichr(key))
        if de:
            try:
                # The first field is the base character, or a "<tag>"
                # for compatibility decompositions (which fails int()).
                base = int(de.split(None, 1)[0], 16)
            except (IndexError, ValueError):
                ch = key
            else:
                # The base may itself be accented (U+1EA4 -> U+00C2) or
                # listed in CHAR_REPLACEMENT (U+01FC -> U+00C6), so
                # resolve it as well instead of stopping after one level.
                ch = self.mapchar(base) if base != key else key
        else:
            ch = CHAR_REPLACEMENT.get(key, key)
        self[key] = ch
        return ch

    # Compare version tuples, not strings: sys.version >= "2.5" is a
    # lexicographic test and misorders versions such as "2.10" or "10.0".
    if sys.version_info >= (2, 5):
        # use __missing__ where available
        __missing__ = mapchar
    else:
        # otherwise, use standard __getitem__ hook (this is slower,
        # since it's called for each character)
        __getitem__ = mapchar