Spaces:
Sleeping
Sleeping
| """ | |
| Evaluator module. | |
| Provides functions to evaluate a given model on a dataset sample using the Faster Whisper model, | |
| and generate HTML visualization blocks of the word alignment. | |
| """ | |
| import concurrent.futures | |
| import gc | |
| import io | |
| import queue | |
| import threading | |
| from typing import Dict, Generator, List | |
| import soundfile as sf | |
| from hebrew import Hebrew | |
| from tqdm import tqdm | |
| from transformers.models.whisper.english_normalizer import BasicTextNormalizer | |
| from visual_eval.visualization import render_visualize_jiwer_result_html | |
class HebrewTextNormalizer(BasicTextNormalizer):
    """Text normalizer that applies Hebrew-specific cleanup before the base rules.

    On top of ``BasicTextNormalizer`` this class strips niqqud, deletes
    invisible Unicode formatting characters, and deletes ASCII quote
    characters. The input is then handed to the parent normalizer.
    """

    def __init__(self, *args, **kwargs):
        """Set up the base normalizer and pre-build the deletion tables.

        All positional and keyword arguments are forwarded unchanged to
        ``BasicTextNormalizer.__init__``.
        """
        super().__init__(*args, **kwargs)

        # Invisible formatting characters that are deleted from the text.
        invisible_chars = "".join(
            (
                "\u061c",  # Arabic letter mark
                "\u200b\u200c\u200d",  # Zero-width space, non-joiner, joiner
                "\u200e\u200f",  # LTR and RTL marks
                "\u202a\u202b\u202c\u202d\u202e",  # LTR/RTL embedding, pop, override
                "\u2066\u2067\u2068\u2069",  # Isolate controls
                "\ufeff",  # Zero-width no-break space
            )
        )

        # The three-argument form of str.maketrans maps every character of
        # its last argument to None, i.e. str.translate deletes it.
        self.superfluous_hebrew_unicode_symbols_translator = str.maketrans(
            "", "", invisible_chars
        )
        self.quotes_translator = str.maketrans("", "", "\"'")

    def __remove_niqqud(self, text: str) -> str:
        """Return ``text`` with niqqud removed via ``Hebrew.no_niqqud()``."""
        stripped = Hebrew(text).no_niqqud()
        return stripped.string

    def __remove_superfluous_hebrew_unicode_symbols(self, text: str) -> str:
        """Return ``text`` without the invisible formatting characters."""
        table = self.superfluous_hebrew_unicode_symbols_translator
        return text.translate(table)

    def __remove_quotes(self, text: str) -> str:
        """Return ``text`` with every ``"`` and ``'`` character deleted."""
        table = self.quotes_translator
        return text.translate(table)

    def __call__(self, text):
        """Normalize ``text`` and return the result.

        The Hebrew-specific steps run first, in a fixed order (niqqud,
        invisible formatting characters, quotes); the parent class's
        normalization is applied last.
        """
        without_niqqud = self.__remove_niqqud(text)
        without_controls = self.__remove_superfluous_hebrew_unicode_symbols(
            without_niqqud
        )
        without_quotes = self.__remove_quotes(without_controls)
        return super().__call__(without_quotes)