from pywhispercpp.model import Model
from pathlib import Path
import time
import csv
from silero_vad.utils_vad import languages  # NOTE(review): `languages` looks unused here — confirm before removing

# Languages recognized from a file-name prefix "<lang>-..."; anything else
# falls back to English in run_long_audios().
SUPPORTED_LANGS = ["es", "fr", "hi", "it", "ja", "pt"]


def save_csv(file_path, rows):
    """Write *rows* (a list of lists) to *file_path* as UTF-8 CSV.

    Creates the parent directory if needed (the hard-coded "csv/" output
    folder may not exist on a fresh checkout).
    """
    path = Path(file_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" is required by the csv module so the writer controls line
    # endings itself (otherwise every row is followed by a blank line on Windows).
    with open(path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(rows)
    print(f"write csv to {file_path}")


def _load_model(models_dir, model_name):
    """Load a whisper.cpp model with the transcription settings shared by
    both benchmark runners, printing how long the load took."""
    t0 = time.time()
    model = Model(
        model=model_name,
        models_dir=models_dir,
        print_realtime=False,
        print_progress=False,
        print_timestamps=False,
        translate=False,
        # beam_search=1,
        temperature=0.,
        no_context=True
    )
    print("load model time: ", time.time() - t0)
    return model


def run_audios_after_vad(models_dir, audio_dir, model_name):
    """Transcribe every .wav under audio_dir/<lang>/ for each supported
    language and compare the result with the reference transcript stored in
    the .txt file next to each wav; write the comparison to a CSV.

    :param models_dir: directory containing whisper.cpp model files
    :param audio_dir: Path whose per-language subdirectories hold the wavs
    :param model_name: whisper.cpp model to load (e.g. "large-v3-turbo-q8_0")
    """
    model = _load_model(models_dir, model_name)
    rows = [["lang", "file_name", "inference_time", "python_res", "intel_res"]]
    for lang in SUPPORTED_LANGS:
        print("*" * 10, lang, "*" * 10)
        for audio in sorted((audio_dir / lang).glob("*.wav")):
            print("Audio name:", audio.name)
            t1 = time.time()
            output = model.transcribe(str(audio), language=lang)
            t = time.time() - t1
            print("Inference time:", t)
            # print(output)
            text = " ".join(seg.text for seg in output)
            print("Text from Python:", text)
            # Reference transcript is best-effort: a missing/unreadable .txt
            # is recorded as an empty string rather than aborting the run.
            try:
                intel_text = audio.with_suffix(".txt").read_text(encoding="utf-8").strip()
            except Exception as e:
                intel_text = ""
                print(f"Error reading Intel text for {audio.name}: {e}")
            print("Text from Intel :", intel_text)
            rows.append([lang, audio.name, t, text, intel_text])
    save_csv("csv/compare_whisper_intel.csv", rows)


def run_long_audios(models_dir, audios_list, model_name):
    """Transcribe every audio file listed (one absolute path per line) in
    *audios_list* and write timings plus transcripts to a CSV.

    The language is taken from the file-name prefix "<lang>-..."; prefixes
    outside SUPPORTED_LANGS fall back to "en". Blank lines in the list are
    preserved as empty CSV rows (visual separators in the output).

    :param models_dir: directory containing whisper.cpp model files
    :param audios_list: Path to the text file listing audio paths
    :param model_name: whisper.cpp model to load
    """
    model = _load_model(models_dir, model_name)
    rows = [["file_name", "inference_time", "res_text"]]
    for audio in audios_list.read_text().splitlines():
        if not audio:
            rows.append([])
            continue
        lang = Path(audio).name.split('-')[0]
        if lang not in SUPPORTED_LANGS:
            lang = "en"
        # BUGFIX: this f-string was broken across a physical newline in the
        # original source, which is a syntax error; emit it on one line.
        print(f"Audio file: {audio}, lang: {lang}")
        t1 = time.time()
        output = model.transcribe(str(audio), language=lang)
        t = time.time() - t1
        print("Inference time:", t)
        # print(output)
        text = " ".join(seg.text for seg in output)
        print("Text:", text)
        rows.append([audio, t, text])
    save_csv("csv/compare_whisper.csv", rows)


if __name__ == '__main__':
    models_dir = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models")
    # model_name = "large-v3-turbo-q5_0"
    model_name = "large-v3-turbo-q8_0"
    # model_name = "small-q8_0"
    # audios_after_vad = Path("/Users/jeqin/work/test/test_yoyotranslator/audios_after_vad/audio2-with-noise")
    audios_list = Path("/Users/jeqin/work/code/TestTranslator/scripts/audios.txt")
    # run_audios_after_vad(models_dir, audios_after_vad, model_name)
    run_long_audios(models_dir, audios_list, model_name)