"""
Add diacritics to a pipe-separated study file with three ONNX models.

Reads ``study_unvocalized.csv`` (headerless ``key|sentence`` rows), runs every
sentence through Dicta, Phonikud and Nakdimon, phonemizes the Phonikud output,
and writes one headerless CSV per model into the working directory:

* ``study_dicta.csv``     -- ``key|dicta_vocalized``
* ``study_nakdimon.csv``  -- ``key|nakdimon_vocalized``
* ``study_phonikud.csv``  -- ``key,phonikud_vocalized,phonikud_phonemes``

Setup:

uv venv -p3.11
uv pip install phonikud-onnx dicta-onnx nakdimon-onnx pandas git+https://github.com/thewh1teagle/phonikud
wget https://huggingface.co/thewh1teagle/phonikud-onnx/resolve/main/phonikud-1.0.int8.onnx
wget https://github.com/thewh1teagle/dicta-onnx/releases/download/model-files-v1.0/dicta-1.0.int8.onnx
wget https://github.com/thewh1teagle/nakdimon-onnx/releases/download/v0.1.0/nakdimon.onnx

NOTE(review): tqdm is imported below but is not listed in the install command;
install it explicitly if it is not pulled in as a transitive dependency.
"""

import csv

import pandas as pd
from tqdm import tqdm
from dicta_onnx import Dicta
from phonikud_onnx import Phonikud
from nakdimon_onnx import Nakdimon
import phonikud

# Register Series.progress_apply so every model pass shows a progress bar.
tqdm.pandas()

# Read both columns as plain strings. Without dtype=str pandas infers types,
# so a key such as "0001" would be written back as "1" and an all-digit
# sentence would reach the models as an int. keep_default_na=False stops
# pandas from turning empty fields or literal text such as "NA" / "null"
# into float NaN.
df = pd.read_csv(
    'study_unvocalized.csv',
    sep='|',
    names=['k', 'v'],
    dtype=str,
    keep_default_na=False,
)

# Load the three models from the files fetched by the wget commands above.
nakdimon = Nakdimon("nakdimon.onnx")
dicta = Dicta('./dicta-1.0.int8.onnx')
phonikud_model = Phonikud('./phonikud-1.0.int8.onnx')

# Dicta: pass an empty matres-lectionis marker for every sentence.
df['dicta_vocalized'] = df['v'].progress_apply(
    lambda sentence: dicta.add_diacritics(sentence, mark_matres_lectionis='')
)

# Phonikud: vocalize first, then phonemize the vocalized text.
df['phonikud_vocalized'] = df['v'].progress_apply(phonikud_model.add_diacritics)
df['phonikud_phonemes'] = df['phonikud_vocalized'].progress_apply(phonikud.phonemize)

# Nakdimon.
df['nakdimon_vocalized'] = df['v'].progress_apply(nakdimon.compute)

# One headerless output file per model, keyed by the original 'k' column.
df[['k', 'dicta_vocalized']].to_csv('study_dicta.csv', sep='|', index=False, header=False)
df[['k', 'nakdimon_vocalized']].to_csv('study_nakdimon.csv', sep='|', index=False, header=False)
# The Phonikud file is comma-separated, unlike the other two.
# NOTE(review): presumably because Phonikud output can itself contain '|' -- confirm.
df[['k', 'phonikud_vocalized', 'phonikud_phonemes']].to_csv('study_phonikud.csv', sep=',', index=False, header=False)