# TestTranslator/scripts/compare_whisper.py
# (repo header: yujuanqin — "update scripts and test_data", commit e4406a3)
from pywhispercpp.model import Model
from pathlib import Path
import time
import csv
from silero_vad.utils_vad import languages
def save_csv(file_path, rows):
    """Write *rows* to *file_path* as a UTF-8 CSV file.

    Args:
        file_path: Destination path (str or Path).
        rows: Iterable of row sequences, handed to ``csv.writer.writerows``.
    """
    # newline="" is required by the csv module: without it the writer's
    # "\r\n" row terminators are translated again on Windows, producing
    # a blank line after every data row.
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(rows)
    print(f"write csv to {file_path}")
def run_audios_after_vad(models_dir, audio_dir, model_name,
                         langs=("es", "fr", "hi", "it", "ja", "pt"),
                         out_csv="csv/compare_whisper_intel.csv"):
    """Transcribe per-language WAV files and compare against Intel reference text.

    For each language folder under *audio_dir*, transcribes every ``*.wav``
    with pywhispercpp and reads the matching ``*.txt`` (Intel reference
    transcript) stored next to it, then writes a comparison CSV.

    Args:
        models_dir: Directory containing the whisper.cpp model files.
        audio_dir: Path whose per-language subfolders hold the .wav files.
        model_name: whisper.cpp model name to load.
        langs: Language codes (subfolder names under *audio_dir*) to process.
        out_csv: Destination path for the comparison CSV.
    """
    t0 = time.time()
    model = Model(
        model=model_name,
        models_dir=models_dir,
        print_realtime=False,
        print_progress=False,
        print_timestamps=False,
        translate=False,
        # beam_search=1,
        temperature=0.,
        no_context=True  # do not carry decoding context between files
    )
    print("load model time: ", time.time()-t0)
    rows = [["lang", "file_name", "inference_time", "python_res", "intel_res"]]
    for lang in langs:
        print("*" * 10, lang, "*"*10)
        for audio in sorted((audio_dir / lang).glob("*.wav")):
            print("Audio name:", audio.name)
            t1 = time.time()
            output = model.transcribe(str(audio), language=lang)
            t = time.time() - t1
            print("Inference time:", t)
            text = " ".join(seg.text for seg in output)
            print("Text from Python:", text)
            try:
                # Intel reference transcript sits next to the wav as .txt.
                with open(audio.with_suffix(".txt"), encoding="utf-8") as f:
                    intel_text = f.read().strip()
            except (OSError, UnicodeDecodeError) as e:
                # Best-effort: a missing/unreadable reference just leaves
                # the comparison column empty rather than aborting the run.
                intel_text = ""
                print(f"Error reading Intel text for {audio.name}: {e}")
            print("Text from Intel :", intel_text)
            rows.append([lang, audio.name, t, text, intel_text])
    save_csv(out_csv, rows)
def run_long_audios(models_dir, audios_list, model_name,
                    known_langs=("es", "fr", "hi", "it", "ja", "pt"),
                    out_csv="csv/compare_whisper.csv"):
    """Transcribe every audio file listed (one path per line) in *audios_list*.

    The language is taken from the file-name prefix before the first ``-``;
    any prefix not in *known_langs* falls back to English. Blank lines in the
    list file become blank separator rows in the output CSV.

    Args:
        models_dir: Directory containing the whisper.cpp model files.
        audios_list: Path to a text file with one audio file path per line.
        model_name: whisper.cpp model name to load.
        known_langs: Language codes recognized from the file-name prefix.
        out_csv: Destination path for the results CSV.
    """
    t0 = time.time()
    model = Model(
        model=model_name,
        models_dir=models_dir,
        print_realtime=False,
        print_progress=False,
        print_timestamps=False,
        translate=False,
        # beam_search=1,
        temperature=0.,
        no_context=True  # do not carry decoding context between files
    )
    print("load model time: ", time.time() - t0)
    rows = [["file_name", "inference_time", "res_text"]]
    for audio in audios_list.read_text().splitlines():
        if not audio:
            # blank line in the list -> blank separator row in the CSV
            rows.append([])
            continue
        lang = Path(audio).name.split('-')[0]
        if lang not in known_langs:
            lang = "en"  # default when the prefix is not a known language code
        print(f"Audio file: {audio}, lang: {lang}")
        t1 = time.time()
        output = model.transcribe(str(audio), language=lang)
        t = time.time() - t1
        print("Inference time:", t)
        text = " ".join(seg.text for seg in output)
        print("Text:", text)
        rows.append([audio, t, text])
    save_csv(out_csv, rows)
def main():
    """Entry point: run the long-audio comparison with hard-coded local paths."""
    models_dir = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models")
    # Alternative models kept for quick switching:
    # model_name = "large-v3-turbo-q5_0"
    # model_name = "small-q8_0"
    model_name = "large-v3-turbo-q8_0"
    # audios_after_vad = Path("/Users/jeqin/work/test/test_yoyotranslator/audios_after_vad/audio2-with-noise")
    audios_list = Path("/Users/jeqin/work/code/TestTranslator/scripts/audios.txt")
    # run_audios_after_vad(models_dir, audios_after_vad, model_name)
    run_long_audios(models_dir, audios_list, model_name)


if __name__ == '__main__':
    main()