wiki-cased / tools /remove_accent.py
Quagmire1's picture
Upload folder using huggingface_hub
41f6dd8 verified
# Copyright (c) 2019-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
import sys
import unicodedata
import six
def convert_to_unicode(text):
"""
Converts `text` to Unicode (if it's not already), assuming UTF-8 input.
"""
# six_ensure_text is copied from https://github.com/benjaminp/six
def six_ensure_text(s, encoding='utf-8', errors='strict'):
if isinstance(s, six.binary_type):
return s.decode(encoding, errors)
elif isinstance(s, six.text_type):
return s
else:
raise TypeError("not expecting type '%s'" % type(s))
return six_ensure_text(text, encoding="utf-8", errors="ignore")
def run_strip_accents(text):
"""
Strips accents from a piece of text.
"""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
for line in sys.stdin:
line = convert_to_unicode(line.rstrip())
line = run_strip_accents(line)
print(u'%s' % line)