"""
Add diacritics and phonemes to the sentences in saspeech_male_unvocalized.csv.

Setup:
uv pip install phonikud-onnx dicta-onnx pandas tqdm git+https://github.com/thewh1teagle/phonikud
wget https://huggingface.co/thewh1teagle/phonikud-onnx/resolve/main/phonikud-1.0.int8.onnx
wget https://github.com/thewh1teagle/dicta-onnx/releases/download/model-files-v1.0/dicta-1.0.int8.onnx

Reads  saspeech_male_unvocalized.csv  (no header, '|'-separated: k|v)
Writes saspeech_male_dicta.csv        (no header, '|'-separated: k|dicta_vocalized)
Writes saspeech_male_phonikud.csv     (no header, ','-separated:
                                       k,phonikud_vocalized,phonikud_phonemes)
"""
import csv

import pandas as pd
from tqdm import tqdm
from dicta_onnx import Dicta
from phonikud_onnx import Phonikud
import phonikud


INPUT_CSV = 'saspeech_male_unvocalized.csv'
DICTA_CSV = 'saspeech_male_dicta.csv'
PHONIKUD_CSV = 'saspeech_male_phonikud.csv'

# Both outputs are written headerless and unquoted. The escape character is
# required with QUOTE_NONE: without it the csv writer raises
# "need to escape, but no escapechar set" as soon as a field contains the
# separator (e.g. a comma inside a sentence written to the comma-separated
# file). Fields that contain neither the separator nor a backslash are written
# exactly as before.
WRITE_OPTIONS = dict(
    index=False,
    header=False,
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
)

# Registers `progress_apply` on pandas objects so each pass shows a progress bar.
tqdm.pandas()

# Load the sentences. Column 'v' is the text handed to the models; 'k' is
# carried through unchanged to both outputs (presumably an utterance id —
# confirm against the dataset).
# Quoting is disabled so a '"' inside or at the start of a sentence is kept
# literally, mirroring how the outputs are written. dtype=str together with
# keep_default_na=False keeps every cell a plain string (no numeric coercion,
# no NaN for empty or "NA"-like cells).
df = pd.read_csv(
    INPUT_CSV,
    sep='|',
    names=['k', 'v'],
    dtype=str,
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
)

# Pass 1: Dicta diacritization.
# NOTE(review): mark_matres_lectionis='' presumably means "do not insert a
# marker character" — confirm against the dicta-onnx API.
dicta = Dicta('./dicta-1.0.int8.onnx')
df['dicta_vocalized'] = df['v'].progress_apply(
    lambda sentence: dicta.add_diacritics(sentence, mark_matres_lectionis='')
)

df[['k', 'dicta_vocalized']].to_csv(DICTA_CSV, sep='|', **WRITE_OPTIONS)

# Pass 2: Phonikud diacritization of the same raw sentences, then phonemization
# of the Phonikud-vocalized text.
phonikud_model = Phonikud('./phonikud-1.0.int8.onnx')
df['phonikud_vocalized'] = df['v'].progress_apply(phonikud_model.add_diacritics)
df['phonikud_phonemes'] = df['phonikud_vocalized'].progress_apply(phonikud.phonemize)

# NOTE(review): this file is comma-separated, unlike the input and the Dicta
# output; the separator is kept as in the original script — confirm that is
# what downstream readers expect.
df[['k', 'phonikud_vocalized', 'phonikud_phonemes']].to_csv(
    PHONIKUD_CSV, sep=',', **WRITE_OPTIONS
)