Spaces:
Sleeping
Sleeping
| # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import io | |
| import warnings | |
| from .homoglyphs import homoglyphs | |
| from .replacements import replacements | |
| _replacements = {uni: asc for uni, asc in replacements} | |
| _homoglyphs = {g: asc for asc, glyphs in homoglyphs.items() for g in glyphs} | |
| def unidecoder(s, homoglyphs=False): | |
| """Transliterate unicode | |
| Args: | |
| s (str): unicode string | |
| homoglyphs (bool): prioritize translating to homoglyphs | |
| """ | |
| warned = False # Once per utterance | |
| ret = '' | |
| for u in s: | |
| if ord(u) < 127: | |
| a = u | |
| elif homoglyphs: | |
| a = _homoglyphs.get(u, _replacements.get(u, None)) | |
| else: | |
| a = _replacements.get(u, _homoglyphs.get(u, None)) | |
| if a is None: | |
| if not warned: | |
| warnings.warn(f'Unexpected character {u}: ' | |
| 'please revise your text cleaning rules.', | |
| stacklevel=10**6) | |
| warned = True | |
| else: | |
| ret += a | |
| return ret | |