|
|
from pywhispercpp.model import Model |
|
|
from pathlib import Path |
|
|
import time |
|
|
import csv |
|
|
|
|
|
from silero_vad.utils_vad import languages |
|
|
|
|
|
|
|
|
def save_csv(file_path, rows):
    """Write *rows* to a CSV file at *file_path*, creating parent directories.

    Args:
        file_path: Destination path (str or Path) of the CSV file.
        rows: Iterable of row sequences, passed straight to
            ``csv.writer.writerows``.
    """
    path = Path(file_path)
    # The script writes to relative paths like "csv/..."; make sure the
    # target directory exists instead of crashing with FileNotFoundError.
    path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" is required by the csv module, otherwise extra blank
    # lines appear between rows on Windows.
    with open(path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(rows)
    print(f"write csv to {path}")
|
|
|
|
|
def run_audios_after_vad(models_dir, audio_dir, model_name,
                         langs=("es", "fr", "hi", "it", "ja", "pt"),
                         out_csv="csv/compare_whisper_intel.csv"):
    """Transcribe per-language wav files and compare against reference texts.

    For each language folder under *audio_dir*, every ``*.wav`` file is
    transcribed with the whisper.cpp model, timed, and paired with a
    reference transcript expected next to the wav (same stem, ``.txt``
    suffix). Results are written to *out_csv*.

    Args:
        models_dir: Directory holding the whisper.cpp model files.
        audio_dir: Path with one sub-folder per language code.
        model_name: Model file name, e.g. ``"large-v3-turbo-q8_0"``.
        langs: Language codes (sub-folder names) to process.
        out_csv: Output CSV path for the comparison table.
    """
    t0 = time.time()
    model = Model(
        model=model_name,
        models_dir=models_dir,
        print_realtime=False,
        print_progress=False,
        print_timestamps=False,
        translate=False,
        # temperature=0 keeps decoding greedy/deterministic between runs.
        temperature=0.,
        no_context=True,
    )
    print("load model time: ", time.time() - t0)

    rows = [["lang", "file_name", "inference_time", "python_res", "intel_res"]]
    for lang in langs:
        print("*" * 10, lang, "*" * 10)
        # sorted() accepts any iterable directly; no need to build a list first.
        for audio in sorted((audio_dir / lang).glob("*.wav")):
            print("Audio name:", audio.name)
            t1 = time.time()
            output = model.transcribe(str(audio), language=lang)
            t = time.time() - t1
            print("Inference time:", t)

            text = " ".join(seg.text for seg in output)
            print("Text from Python:", text)
            # The reference transcript may be missing; tolerate that and
            # record an empty string. OSError covers the realistic failures
            # (missing file, permissions) without hiding programming errors.
            try:
                with open(audio.with_suffix(".txt"), encoding="utf-8") as f:
                    intel_text = f.read().strip()
            except OSError as e:
                intel_text = ""
                print(f"Error reading Intel text for {audio.name}: {e}")
            print("Text from Intel :", intel_text)
            rows.append([lang, audio.name, t, text, intel_text])
    save_csv(out_csv, rows)
|
|
|
|
|
def run_long_audios(models_dir, audios_list, model_name,
                    known_langs=("es", "fr", "hi", "it", "ja", "pt"),
                    out_csv="csv/compare_whisper.csv"):
    """Transcribe a list of (long) audio files and write results to CSV.

    Args:
        models_dir: Directory holding the whisper.cpp model files.
        audios_list: Path to a text file with one audio file path per line;
            blank lines are preserved as blank rows in the output CSV.
        model_name: Model file name, e.g. ``"large-v3-turbo-q8_0"``.
        known_langs: Language codes recognized from the file-name prefix;
            anything else falls back to ``"en"``.
        out_csv: Output CSV path.
    """
    t0 = time.time()
    model = Model(
        model=model_name,
        models_dir=models_dir,
        print_realtime=False,
        print_progress=False,
        print_timestamps=False,
        translate=False,
        # temperature=0 keeps decoding greedy/deterministic between runs.
        temperature=0.,
        no_context=True,
    )
    print("load model time: ", time.time() - t0)

    rows = [["file_name", "inference_time", "res_text"]]
    # Be explicit about the encoding instead of relying on the locale default.
    audios = audios_list.read_text(encoding="utf-8").splitlines()
    for audio in audios:
        if not audio:
            # Keep blank lines from the list file as blank rows for readability.
            rows.append([])
            continue
        # The language code is encoded as the file-name prefix before '-',
        # e.g. "fr-interview.wav" -> "fr".
        lang = Path(audio).name.split('-')[0]
        if lang not in known_langs:
            lang = "en"
        print(f"Audio file: {audio}, lang: {lang}")
        t1 = time.time()
        output = model.transcribe(str(audio), language=lang)
        t = time.time() - t1
        print("Inference time:", t)

        text = " ".join(seg.text for seg in output)
        print("Text:", text)
        rows.append([audio, t, text])
    save_csv(out_csv, rows)
|
|
if __name__ == '__main__':
    # Local whisper.cpp model store and the quantized variant to benchmark.
    models_dir = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models")
    model_name = "large-v3-turbo-q8_0"

    # Text file listing one audio path per line (blank lines allowed).
    audios_list = Path("/Users/jeqin/work/code/TestTranslator/scripts/audios.txt")

    run_long_audios(models_dir, audios_list, model_name)