# phonikud-experiments/comparison/sentences/create_vocalized.py
# thewh1teagle
# add stts2, sort tables
# 26bbc8b unverified
"""
uv pip install phonikud-onnx dicta-onnx pandas git+https://github.com/thewh1teagle/phonikud
wget https://huggingface.co/thewh1teagle/phonikud-onnx/resolve/main/phonikud-1.0.int8.onnx
wget https://github.com/thewh1teagle/dicta-onnx/releases/download/model-files-v1.0/dicta-1.0.int8.onnx
"""
import csv
import pandas as pd
from tqdm import tqdm
from dicta_onnx import Dicta
from phonikud_onnx import Phonikud
import phonikud
# Enable `progress_apply` on pandas objects so each pass below shows a progress bar.
tqdm.pandas()

# Load unvocalized data once: pipe-separated rows, columns named k (key) and v (sentence).
# The outputs below are written with QUOTE_NONE, so the input is read the same way;
# with default quoting a sentence that starts with '"' would be parsed as a quoted
# field and lose its quotes. dtype=str keeps keys exactly as written (no numeric
# coercion of digit-only keys).
df = pd.read_csv(
    'saspeech_male_unvocalized.csv',
    sep='|',
    names=['k', 'v'],
    dtype=str,
    quoting=csv.QUOTE_NONE,
)

# Initialize Dicta and apply diacritics.
# NOTE(review): mark_matres_lectionis='' presumably suppresses the matres lectionis
# marker in the output -- confirm against the dicta-onnx documentation.
dicta = Dicta('./dicta-1.0.int8.onnx')
df['dicta_vocalized'] = df['v'].progress_apply(
    lambda sentence: dicta.add_diacritics(sentence, mark_matres_lectionis='')
)

# Save Dicta output in the same unquoted, pipe-separated layout as the input.
df[['k', 'dicta_vocalized']].to_csv(
    'saspeech_male_dicta.csv',
    sep='|',
    index=False,
    header=False,
    quoting=csv.QUOTE_NONE,
)

# Initialize Phonikud and apply diacritics on original text (not the Dicta vocalized)
phonikud_model = Phonikud('./phonikud-1.0.int8.onnx')
df['phonikud_vocalized'] = df['v'].progress_apply(phonikud_model.add_diacritics)
# Phonemes are derived from Phonikud's own vocalized text.
df['phonikud_phonemes'] = df['phonikud_vocalized'].progress_apply(phonikud.phonemize)

# Save Phonikud output.
# NOTE(review): this file is comma-separated, unlike the '|' files above. With
# QUOTE_NONE and no escapechar, the csv writer raises if any field contains a comma.
# The delimiter is left unchanged because downstream readers may expect ','.
df[['k', 'phonikud_vocalized', 'phonikud_phonemes']].to_csv(
    'saspeech_male_phonikud.csv',
    sep=',',
    index=False,
    header=False,
    quoting=csv.QUOTE_NONE,
)