update scripts and test_data
This view is limited to 50 files because the commit contains too many changes.
- environment.py +1 -1
- scripts/audios.txt +70 -0
- scripts/compare_whisper.py +92 -0
- scripts/export_onnx.py +18 -0
- scripts/infer_finetuned_whisper.py +157 -0
- scripts/run_funasr.py +50 -0
- scripts/run_funasr_c.py +39 -0
- scripts/run_kokoro.py +54 -0
- scripts/run_kokoro_sample.py +65 -0
- scripts/run_quant.py +51 -0
- scripts/run_whisper.py +39 -20
- scripts/split_audio.py +35 -0
- temp.py +4 -0
- tests/test_accuracy_and_delay.py +2 -2
- tests/test_data/test_audios.zip +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-chaos-part2-0.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-chaos-part2-10.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-chaos-part2-20.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-chaos-part2-30.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-computer_sicence-part1-0.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-computer_sicence-part1-10.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-computer_sicence-part1-20.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-computer_sicence-part1-30.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-generative_ai-part1-0.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-generative_ai-part1-10.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-generative_ai-part1-20.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-generative_ai-part1-30.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-internet-part20-0.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-internet-part20-10.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-internet-part20-20.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-internet-part20-30.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-legalsystem-part1-0.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-legalsystem-part1-10.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-legalsystem-part1-20.wav +3 -0
- tests/test_data/test_audios/10s-en-ac1-16k/English-legalsystem-part1-30.wav +3 -0
- tests/test_data/test_audios/10s-mix/qiaodan-part1-0.wav +3 -0
- tests/test_data/test_audios/10s-mix/qiaodan-part1-10.wav +3 -0
- tests/test_data/test_audios/10s-mix/qiaodan-part1-20.wav +3 -0
- tests/test_data/test_audios/10s-mix/qiaodan-part1-30.wav +3 -0
- tests/test_data/test_audios/10s-mix/qiaodan-part2-0.wav +3 -0
- tests/test_data/test_audios/10s-mix/qiaodan-part2-10.wav +3 -0
- tests/test_data/test_audios/10s-mix/qiaodan-part2-20.wav +3 -0
- tests/test_data/test_audios/10s-mix/qiaodan-part2-30.wav +3 -0
- tests/test_data/test_audios/10s-mix/randomforest-part1-0.wav +3 -0
- tests/test_data/test_audios/10s-mix/randomforest-part1-10.wav +3 -0
- tests/test_data/test_audios/10s-mix/randomforest-part1-20.wav +3 -0
- tests/test_data/test_audios/10s-mix/randomforest-part1-30.wav +3 -0
- tests/test_data/test_audios/10s-mix/zhanghuailong-part1-0.wav +3 -0
- tests/test_data/test_audios/10s-mix/zhanghuailong-part1-10.wav +3 -0
- tests/test_data/test_audios/10s-mix/zhanghuailong-part1-20.wav +3 -0

environment.py
CHANGED
@@ -26,4 +26,4 @@ class RunType(Enum):
     code = 0
     electron = 1
     dev = 2
-RUN_TYPE = RunType.
+RUN_TYPE = RunType.electron  # electron or web

scripts/audios.txt
ADDED
/Users/jeqin/work/code/TestTranslator/tests/test_data/test_audios/5s-en-ac1-16k/English-chaos-part2-0.wav
/Users/jeqin/work/code/TestTranslator/tests/test_data/test_audios/5s-en-ac1-16k/English-chaos-part2-5.wav
/Users/jeqin/work/code/TestTranslator/tests/test_data/test_audios/5s-en-ac1-16k/English-chaos-part2-10.wav
/Users/jeqin/work/code/TestTranslator/tests/test_data/test_audios/5s-en-ac1-16k/English-chaos-part2-15.wav
/Users/jeqin/work/code/TestTranslator/tests/test_data/test_audios/10s-en-ac1-16k/English-chaos-part2-0.wav
/Users/jeqin/work/code/TestTranslator/tests/test_data/test_audios/10s-en-ac1-16k/English-chaos-part2-10.wav
/Users/jeqin/work/code/TestTranslator/tests/test_data/test_audios/10s-en-ac1-16k/English-chaos-part2-20.wav
/Users/jeqin/work/code/TestTranslator/tests/test_data/test_audios/10s-en-ac1-16k/English-chaos-part2-30.wav
/Users/jeqin/work/code/TestTranslator/tests/test_data/test_audios/English-chaos-part2.wav

/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/es-1-0.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/es-1-5.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/es-1-10.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/es-1-15.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/es-1-0.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/es-1-10.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/es-1-20.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/es-1-30.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/es-1.wav

/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/fr-1-0.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/fr-1-5.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/fr-1-10.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/fr-1-15.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/fr-1-0.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/fr-1-10.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/fr-1-20.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/fr-1-30.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/fr-1.wav

/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/hi-2-0.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/hi-2-5.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/hi-2-10.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/hi-2-15.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/hi-2-0.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/hi-2-10.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/hi-2-20.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/hi-2-30.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/hi-2.wav

/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/it-1-0.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/it-1-5.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/it-1-10.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/it-1-15.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/it-1-0.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/it-1-10.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/it-1-20.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/it-1-30.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/it-1.wav

/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/ja-1-0.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/ja-1-5.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/ja-1-10.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/ja-1-15.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/ja-1-0.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/ja-1-10.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/ja-1-20.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/ja-1-30.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/ja-1.wav

/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/pt-1-0.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/pt-1-5.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/pt-1-10.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/5s/pt-1-15.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/pt-1-0.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/pt-1-10.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/pt-1-20.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/10s/pt-1-30.wav
/Users/jeqin/work/test/test_yoyotranslator/test_videos/audios/pt-1.wav
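
scripts/audios.txt groups clips by source audio: the 5 s and 10 s segments plus the full-length file for one language, then a blank line before the next language. A minimal sketch (assuming the list is read from scripts/audios.txt) of parsing those blank-line-separated groups; run_long_audios in scripts/compare_whisper.py below consumes the same list line by line:

from pathlib import Path

# split the list into blank-line-separated groups, one group per source audio/language
audios_list = Path("scripts/audios.txt")
groups, current = [], []
for line in audios_list.read_text().splitlines():
    if line.strip():
        current.append(line.strip())
    elif current:
        groups.append(current)
        current = []
if current:
    groups.append(current)

for group in groups:
    # the last entry in each group is the full-length source file
    print(f"{Path(group[-1]).name}: {len(group)} clips")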

scripts/compare_whisper.py
ADDED
from pywhispercpp.model import Model
from pathlib import Path
import time
import csv

from silero_vad.utils_vad import languages


def save_csv(file_path, rows):
    with open(file_path, "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(rows)
    print(f"write csv to {file_path}")

def run_audios_after_vad(models_dir, audio_dir, model_name):
    t0 = time.time()
    model = Model(
        model=model_name,
        models_dir=models_dir,
        print_realtime=False,
        print_progress=False,
        print_timestamps=False,
        translate=False,
        # beam_search=1,
        temperature=0.,
        no_context=True
    )
    print("load model time: ", time.time()-t0)
    rows = [["lang", "file_name", "inference_time", "python_res", "intel_res"]]
    for lang in ["es", "fr", "hi", "it", "ja", "pt"]:
        print("*" * 10, lang, "*" * 10)
        for audio in sorted(list((audio_dir/lang).glob("*.wav"))):
            print("Audio name:", audio.name)
            t1 = time.time()
            output = model.transcribe(str(audio), language=lang)
            t = time.time() - t1
            print("Inference time:", t)
            # print(output)
            text = " ".join([a.text for a in output])
            print("Text from Python:", text)
            try:
                with open(audio.with_suffix(".txt"), encoding="utf-8") as f:
                    intel_text = f.read().strip()
            except Exception as e:
                intel_text = ""
                print(f"Error reading Intel text for {audio.name}: {e}")
            print("Text from Intel :", intel_text)
            rows.append([lang, audio.name, t, text, intel_text])
    save_csv("csv/compare_whisper_intel.csv", rows)

def run_long_audios(models_dir, audios_list, model_name):
    t0 = time.time()
    model = Model(
        model=model_name,
        models_dir=models_dir,
        print_realtime=False,
        print_progress=False,
        print_timestamps=False,
        translate=False,
        # beam_search=1,
        temperature=0.,
        no_context=True
    )
    print("load model time: ", time.time() - t0)
    rows = [["file_name", "inference_time", "res_text"]]
    audios = audios_list.read_text().splitlines()
    for audio in audios:
        if not audio:
            rows.append([])
            continue
        lang = Path(audio).name.split('-')[0]
        if lang not in ["es", "fr", "hi", "it", "ja", "pt"]:
            lang = "en"
        print(f"Audio file: {audio}, lang: {lang}")
        t1 = time.time()
        output = model.transcribe(str(audio), language=lang)
        t = time.time() - t1
        print("Inference time:", t)
        # print(output)
        text = " ".join([a.text for a in output])
        print("Text:", text)
        rows.append([audio, t, text])
    save_csv("csv/compare_whisper.csv", rows)

if __name__ == '__main__':
    models_dir = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models")
    # model_name = "large-v3-turbo-q5_0"
    model_name = "large-v3-turbo-q8_0"
    # model_name = "small-q8_0"
    # audios_after_vad = Path("/Users/jeqin/work/test/test_yoyotranslator/audios_after_vad/audio2-with-noise")
    audios_list = Path("/Users/jeqin/work/code/TestTranslator/scripts/audios.txt")
    # run_audios_after_vad(models_dir, audios_after_vad, model_name)
    run_long_audios(models_dir, audios_list, model_name)
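
Note that run_audios_after_vad expects a reference transcript next to each clip with the same stem (it opens audio.with_suffix(".txt") and records it as the Intel result). A minimal sketch, with hypothetical paths, of laying out one clip/reference pair:

from pathlib import Path

# hypothetical layout: audios_after_vad/<lang>/<clip>.wav plus a sibling .txt reference
clip = Path("audios_after_vad/audio2-with-noise/es/clip-0.wav")
clip.parent.mkdir(parents=True, exist_ok=True)
clip.with_suffix(".txt").write_text("reference transcript for clip-0", encoding="utf-8")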

scripts/export_onnx.py
ADDED
from funasr import AutoModel

model_dir = "/Users/moyoyo/code/Translator/moyoyo_asr_models"
asr_model_path = model_dir + '/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
vad_model_path = model_dir + '/speech_fsmn_vad_zh-cn-16k-common-pytorch'
punc_model_path = model_dir + '/punc_ct-transformer_cn-en-common-vocab471067-large'

model = AutoModel(model=asr_model_path)
model_dir = model.export(type="onnx", quantize=True, disable_update=True)
print(model_dir)

model = AutoModel(model=vad_model_path)
model_dir = model.export(type="onnx", quantize=True, disable_update=True)
print(model_dir)

model = AutoModel(model=punc_model_path)
model_dir = model.export(type="onnx", quantize=True, disable_update=True)
print(model_dir)

scripts/infer_finetuned_whisper.py
ADDED
import argparse
import os
import time
from pathlib import Path
import csv

import numpy as np
import torch
import librosa
from transformers import WhisperForConditionalGeneration, WhisperProcessor

def save_csv(file_path, rows):
    with open(file_path, "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(rows)
    print(f"write csv to {file_path}")


def load_audio(audio_path: str, sr: int = 16000):
    # load the audio as 16 kHz mono float32 numpy
    audio, _ = librosa.load(audio_path, sr=sr, mono=True)
    return audio


def transcribe_file(
    audio_path: str,
    model,
    processor,
    language: str = "Chinese",
    task: str = "transcribe",
    timestamps: bool = False,
    max_new_tokens: int = 255,
):
    # prepare input features
    audio = load_audio(audio_path, sr=16000)
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

    # move to the model's device
    device = next(model.parameters()).device
    input_features = inputs["input_features"].to(device)

    # generate
    with torch.inference_mode(), torch.autocast(device_type="cuda", enabled=(device.type == "cuda")):
        generated_ids = model.generate(
            input_features=input_features,
            max_new_tokens=max_new_tokens,
            return_timestamps=timestamps,  # only supported by some versions; ignored otherwise
        )

    # decode
    text = processor.tokenizer.batch_decode(generated_ids.cpu().numpy(), skip_special_tokens=True)
    return text[0]


def main():
    parser = argparse.ArgumentParser("Simple Whisper Inference")
    parser.add_argument("--model_path", type=str, default="whisper-large-v3-turbo-finetune",
                        help="local merged model path or HF model name")
    parser.add_argument("--input", type=str, required=True,
                        help="audio file path, or a directory (all audio files inside are processed)")
    parser.add_argument("--language", type=str, default="Chinese",
                        help="language (e.g. Chinese / English / zh / en)")
    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"],
                        help="task: transcribe or translate")
    parser.add_argument("--timestamps", action="store_true", help="return timestamps (if the model and version support it)")
    parser.add_argument("--local_files_only", action="store_true", help="load local files only, no network access")
    parser.add_argument("--batch_exts", type=str, default=".wav,.mp3,.flac,.m4a",
                        help="when --input is a directory, process files with these comma-separated suffixes")
    args = parser.parse_args()

    # load processor & model
    processor = WhisperProcessor.from_pretrained(
        args.model_path,
        language=args.language,
        task=args.task,
        no_timestamps=not args.timestamps,
        local_files_only=args.local_files_only,
    )
    model = WhisperForConditionalGeneration.from_pretrained(
        args.model_path,
        device_map="auto",
        local_files_only=args.local_files_only,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )

    model.generation_config.language = args.language.lower()
    model.generation_config.forced_decoder_ids = None
    model.eval()

    path = Path(args.input)
    if path.is_file():
        text = transcribe_file(
            str(path), model, processor,
            language=args.language, task=args.task, timestamps=args.timestamps
        )
        print(f"{path.name} -> {text}")
    else:
        # batch over a directory
        exts = {e.strip().lower() for e in args.batch_exts.split(",")}
        files = [p for p in path.rglob("*") if p.suffix.lower() in exts]
        if not files:
            print("No processable audio files found in the directory.")
            return
        for p in sorted(files):
            try:
                t0 = time.time()
                text = transcribe_file(
                    str(p), model, processor,
                    language=args.language, task=args.task, timestamps=args.timestamps
                )
                t1 = time.time()
                print(f"{p.name} -> {text}; time cost: {t1-t0}")
            except Exception as e:
                print(f"{p.name} -> failed: {e}")

def run():
    model_path = "/Users/jeqin/Downloads/whisper-large-v3-turbo-finetune-0901"
    lang = "en"
    t0 = time.time()
    processor = WhisperProcessor.from_pretrained(
        model_path,
        language=lang,
        task="transcribe",
        no_timestamps=True,
        local_files_only=True,
    )
    model = WhisperForConditionalGeneration.from_pretrained(
        model_path,
        device_map="mps",
        local_files_only=True,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )

    model.generation_config.language = lang.lower()
    model.generation_config.forced_decoder_ids = None
    model.eval()

    print("load model time: ", time.time() - t0)
    audios = Path("/Users/jeqin/work/code/TestTranslator/tests/test_data/test_audios/")
    rows = [["file_name", "inference_time", "inference_result"]]
    for audio in sorted(audios.glob("*en-ac1-16k/*.wav")):  # *s/randomforest*.wav
        try:
            t0 = time.time()
            text = transcribe_file(
                str(audio), model, processor
            )
            t = time.time()-t0
            print(f"{audio.name} -> {text}; time cost: {t}")
            rows.append([f"{audio.parent.name}/{audio.name}", t, text])
        except Exception as e:
            print(f"{audio.name} -> failed: {e}")
    save_csv("csv/fine-tune_whisper-0901.csv", rows)

if __name__ == "__main__":
    # main()
    run()
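
As committed, the __main__ block calls run() with hard-coded local paths, while the argparse entry point main() remains available. A minimal sketch, with hypothetical checkpoint and input paths, of driving main() from Python:

import subprocess

# hypothetical paths; --input may be a single audio file or a directory
subprocess.run([
    "python", "scripts/infer_finetuned_whisper.py",
    "--model_path", "whisper-large-v3-turbo-finetune",
    "--input", "tests/test_data/test_audios/10s-en-ac1-16k",
    "--language", "English",
    "--local_files_only",
], check=True)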

scripts/run_funasr.py
ADDED
from funasr import AutoModel
from pathlib import Path
import time
import csv

def save_csv(file_path, rows):
    with open(file_path, "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(rows)
    print(f"write csv to {file_path}")

def main():
    model_dir = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models")

    asr_model_path = model_dir / 'speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
    vad_model_path = model_dir / 'speech_fsmn_vad_zh-cn-16k-common-pytorch'
    punc_model_path = model_dir / 'punc_ct-transformer_cn-en-common-vocab471067-large'
    t0 = time.time()
    model = AutoModel(
        model=asr_model_path.as_posix(),
        vad_model=vad_model_path.as_posix(),
        punc_model=punc_model_path.as_posix(),
        log_level="ERROR",
        disable_update=True
    )
    t1 = time.time()
    print("load model: ", t1 - t0)
    audios = Path("/Users/jeqin/work/code/TestTranslator/tests/test_data/test_audios/")
    rows = [["file_name", "inference_time", "inference_result"]]
    for audio in sorted(audios.glob("*ac1-16k/Chinese*")):
        print(audio)
        t1 = time.time()
        try:
            result = model.generate(input=str(audio), disable_pbar=True,
                                    hotword="")
        except Exception as e:
            print(audio)
            print(e)
            continue  # skip files that fail, otherwise result below would be undefined
        t2 = time.time()
        t = t2-t1
        print("inference time:", t)
        text = result[0]["text"]
        print("inference result", text)
        rows.append([f"{audio.parent.name}/{audio.name}", t, text])
    save_csv(f"csv/funasr.csv", rows)


if __name__ == '__main__':
    main()

scripts/run_funasr_c.py
ADDED
import sys
import time
from pathlib import Path
import csv

sys.path.append('/Users/jeqin/work/code/funasr_wrapper/build')  # add the compiled module path
sys.path.append('/Users/jeqin/work/code/funasr_wrapper/build/src')  # add the compiled module path
import funasr_py

def save_csv(file_path, rows):
    with open(file_path, "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(rows)
    print(f"write csv to {file_path}")

def main():
    t0 = time.time()
    config_file = "/Users/jeqin/work/code/funasr_wrapper/testpy/config.json"
    asr = funasr_py.FunasrEasy(config_file)
    # initialize the model
    asr.init()
    t1 = time.time()
    print("Initializing model: ", t1-t0)
    audios = Path("/Users/jeqin/work/code/TestTranslator/tests/test_data/test_audios/")
    rows = [["file_name", "inference_time", "inference_result"]]
    for audio in sorted(audios.glob("*s-ac1/Chinese*")):
        print(audio)
        t1 = time.time()
        result = asr.infer(str(audio))
        text = asr.get_text(result)
        asr.free_result(result)
        t = time.time() - t1
        print("inference time:", t)
        print(text)
        rows.append([f"{audio.parent.name}/{audio.name}", t, text])
    save_csv("csv/funasr_c.csv", rows)

if __name__ == '__main__':
    main()

scripts/run_kokoro.py
ADDED
from pathlib import Path
from time import time
import os

import soundfile as sf
from misaki import zh
import onnxruntime

from kokoro_onnx import Kokoro

# providers = onnxruntime.get_available_providers()
# print(f"Available onnx runtime providers: {providers}")

def create_session(model_path):
    # See list of providers https://github.com/microsoft/onnxruntime/issues/22101#issuecomment-2357667377
    providers = onnxruntime.get_available_providers()
    providers = providers[1:2]
    print(f"Available onnx runtime providers: {providers}")

    # See session options https://onnxruntime.ai/docs/performance/tune-performance/threading.html#thread-management
    sess_options = onnxruntime.SessionOptions()
    cpu_count = os.cpu_count() // 2
    print(f"Setting threads to CPU cores count: {cpu_count}")
    # sess_options.intra_op_num_threads = cpu_count
    session = onnxruntime.InferenceSession(
        model_path, providers=providers, sess_options=sess_options
    )
    return session

model_folder = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models/kokoro")
model_path = str(model_folder/"kokoro-quant.onnx")
voice_model_path = str(model_folder/"voices-v1.0.bin")
vocab_config = str(model_folder/"zh_config.json")

texts = [
    "千里之行,始于足下。",
    "我想听你唱首歌",
    "窗前明月光,疑是地上霜。举头望明月,低头思故乡。"
]
voice = "zf_xiaoyi"
session = create_session(model_path)
model = Kokoro.from_session(session, voice_model_path, vocab_config=vocab_config)
g2p = zh.ZHG2P()
for i in range(5):
    for index, text in enumerate(texts):
        phonemes, _ = g2p(text)
        start = time()
        samples, sample_rate = model.create(phonemes, voice=voice, speed=1.0, is_phonemes=True)
        end = time()
        time_cost = end - start
        print(f"time cost: {time_cost} for text: {text}")
        sf.write(f"audio_{index}.wav", samples, sample_rate)
        print(f"Created audio_{index}.wav")
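
create_session() above keeps providers[1:2], i.e. whichever execution provider happens to be listed second by onnxruntime on the current machine. A sketch of pinning the provider explicitly instead (CPUExecutionProvider is always available; the model path mirrors the one used above):

import onnxruntime

model_path = "/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models/kokoro/kokoro-quant.onnx"
sess_options = onnxruntime.SessionOptions()
session = onnxruntime.InferenceSession(
    model_path,
    providers=["CPUExecutionProvider"],  # pin a known provider rather than relying on list order
    sess_options=sess_options,
)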

scripts/run_kokoro_sample.py
ADDED
from time import time
import soundfile as sf
from misaki import en, espeak, zh
from kokoro_onnx import Kokoro


def run_en():
    # Misaki G2P with espeak-ng fallback
    fallback = espeak.EspeakFallback(british=False)
    g2p = en.G2P(trf=False, british=False, fallback=fallback)

    models = "/Users/jeqin/work/code/TestTranslator/scripts/kokoro_models/"
    # Kokoro
    kokoro = Kokoro(f"{models}kokoro-v1.0.onnx", f"{models}voices-v1.0.bin")

    texts = [
        "[Misaki](/misˈɑki/) is a G2P engine designed for [Kokoro](/kˈOkəɹO/) models.",
        "For example, the geology and terrain along the railway line.",
        " When choosing solid-state drives, we sometimes see reviews or videos discussing whether a particular solid-state drive has a caching scheme or an uncaching scheme in the performance testing section."
    ]
    for index, text in enumerate(texts):
        # Phonemize
        # text = "[Misaki](/misˈɑki/) is a G2P engine designed for [Kokoro](/kˈOkəɹO/) models."
        phonemes, _ = g2p(text)

        # Create
        start = time()
        samples, sample_rate = kokoro.create(phonemes, "af_heart", is_phonemes=True)
        end = time()
        time_cost = end - start
        print(f"time cost: {time_cost} for text: {text}")
        # Save
        sf.write(f"audio{index}.wav", samples, sample_rate)
        print(f"Created audio{index}.wav")

def run_zh():
    # Misaki G2P with espeak-ng fallback
    # fallback = espeak.EspeakFallback(british=False)
    g2p = zh.ZHG2P()

    models = "/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models/kokoro"
    # Kokoro
    kokoro = Kokoro(f"{models}/kokoro-quant.onnx", f"{models}/voices-v1.0.bin", vocab_config=f"{models}/zh_config.json")

    texts = [
        "千里之行,始于足下。",
        "我想听你唱首歌",
        "窗前明月光,疑是地上霜。举头望明月,低头思故乡。"
    ]
    for index, text in enumerate(texts):
        phonemes, _ = g2p(text)

        # Create
        start = time()
        samples, sample_rate = kokoro.create(phonemes, "zf_xiaoyi", is_phonemes=True, speed=1.0)
        end = time()
        time_cost = end - start
        print(f"time cost: {time_cost} for text: {text}")
        # Save
        sf.write(f"audio{index}.wav", samples, sample_rate)
        print(f"Created audio{index}.wav")

if __name__ == '__main__':
    run_zh()

scripts/run_quant.py
ADDED
from pathlib import Path
import time
import csv
from funasr_onnx import SeacoParaformer, CT_Transformer, Fsmn_vad

def save_csv(file_path, rows):
    with open(file_path, "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(rows)
    print(f"write csv to {file_path}")

def main():
    model_dir = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models")

    asr_model_path = model_dir / 'speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
    vad_model_path = model_dir / 'speech_fsmn_vad_zh-cn-16k-common-pytorch'
    punc_model_path = model_dir / 'punc_ct-transformer_cn-en-common-vocab471067-large'
    t0 = time.time()
    quantize = True
    vad_model = Fsmn_vad(vad_model_path, quantize=quantize)
    asr_model = SeacoParaformer(asr_model_path, quantize=quantize)
    punc_model = CT_Transformer(punc_model_path, quantize=quantize)
    t1 = time.time()
    print("load model time:", t1 - t0)
    audios = Path("/Users/jeqin/work/code/TestTranslator/tests/test_data/test_audios/")
    rows = [["file_name", "inference_time", "inference_result"]]
    for audio in sorted(audios.glob("*s/randomforest*.wav")):
        t1 = time.time()
        vad_res = vad_model(str(audio))
        t2 = time.time()
        print("vad time:", t2-t1)
        asr_res = asr_model(str(audio), hotwords="")
        asr_text = asr_res[0]["preds"]
        t3 = time.time()
        print("asr time:", t3-t2)
        print("asr text:", asr_text)
        result = punc_model(asr_text)
        text = result[0]
        t4 = time.time()
        print("punc time:", t4-t3)
        print("punc text:", text)
        # print(text)
        # vad_res = vad_model(str(audio))
        # t5 = time.time()
        # print("vad time:", t5 - t4)
        t = t4-t1
        print("inference:", t)
        rows.append([f"{audio.parent.name}/{audio.name}", t, text])
    file_name = "csv/quant.csv" if quantize else "run_onnx.csv"
    save_csv(file_name, rows)

if __name__ == '__main__':
    main()

scripts/run_whisper.py
CHANGED
@@ -1,26 +1,45 @@
from pywhispercpp.model import Model
from pathlib import Path
import time
import csv

from silero_vad.utils_vad import languages


def save_csv(file_path, rows):
    with open(file_path, "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(rows)
    print(f"write csv to {file_path}")

def main():
    models_dir = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models")
    whisper_model = 'large-v3-turbo-q5_0'
    t0 = time.time()
    model = Model(
        model=whisper_model,
        models_dir=models_dir,
        print_realtime=False,
        print_progress=False,
        print_timestamps=False,
        translate=False,
        # beam_search=1,
        temperature=0.,
        no_context=True
    )
    print("load model time: ", time.time()-t0)
    audios = Path("/Users/jeqin/work/code/TestTranslator/tests/test_data/test_audios/")
    rows = [["file_name", "inference_time", "inference_result"]]
    for audio in sorted(audios.glob("*-mix/randomforest*.wav")):
        print(audio)
        t1 = time.time()
        output = model.transcribe(str(audio), language="zh")  # initial_prompt="这是一段中文的会议内容。"
        t = time.time() - t1
        print("inference time:", t)
        text = " ".join([a.text for a in output])
        print(text)
        rows.append([f"{audio.parent.name}/{audio.name}", t, text])
    # save_csv("csv/whisper.csv", rows)

if __name__ == '__main__':
    main()

scripts/split_audio.py
ADDED
from pathlib import Path
import subprocess
from subprocess import CompletedProcess


def cmd(command: str, check=True, capture_output=False) -> CompletedProcess:
    print(command)
    if capture_output:
        ret = subprocess.run(command, shell=True, check=check, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                             universal_newlines=True)
    else:
        ret = subprocess.run(command, shell=True, check=check)
    print(ret.stdout)
    return ret


current = Path("/Users/jeqin/work/code/TestTranslator/tests/test_data/test_audios")
audios_5s = current/"5s"
audios_10s = current/"10s"
if not audios_5s.exists():
    audios_5s.mkdir(parents=True, exist_ok=True)
if not audios_10s.exists():
    audios_10s.mkdir(parents=True, exist_ok=True)
for f in sorted(current.glob("randomforest*.wav")):
    file_name = f.name
    print(file_name)
    for i in [0, 5, 10, 15]:
        new_name = f"{f.stem}-{i}.wav"
        # -ac 1 -ar 16000
        command = f"ffmpeg -i {f} -ss 00:00:{str(i).zfill(2)} -ac 1 -ar 16000 -t 00:00:05 {audios_5s/new_name}"
        cmd(command)
    for i in [0, 10, 20, 30]:
        new_name = f"{f.stem}-{i}.wav"
        command = f"ffmpeg -i {f} -ss 00:00:{str(i).zfill(2)} -ac 1 -ar 16000 -t 00:00:10 {audios_10s/new_name}"
        cmd(command)
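
The cmd() helper above interpolates paths into a single shell string, which breaks on file names containing spaces; a sketch of the same ffmpeg cut built with list arguments and no shell:

import subprocess
from pathlib import Path

def cut(src: Path, dst: Path, start_s: int, dur_s: int) -> None:
    # same flags as above: seek, downmix to mono, resample to 16 kHz, fixed duration
    subprocess.run([
        "ffmpeg", "-i", str(src),
        "-ss", f"00:00:{start_s:02d}",
        "-ac", "1", "-ar", "16000",
        "-t", f"00:00:{dur_s:02d}",
        str(dst),
    ], check=True)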

temp.py
ADDED
text = """
{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}
"""
print(text)

tests/test_accuracy_and_delay.py
CHANGED
@@ -14,7 +14,7 @@ def test_accuracy_and_delay_zh2en(app, log_file, page: TranslatorPage,
                                    audio:Path,):
     page.start_zh2en()
     translation_lang = "zh2en"
-    time.sleep(
+    time.sleep(3)
     audio_length = get_length(audio)
     play_audio(audio)
     web_records = page.get_current_node_text(duration=audio_length)
@@ -38,7 +38,7 @@ def test_accuracy_and_delay_en2zh(app, log_file, page: TranslatorPage,
                                    audio:Path):
     page.start_en2zh()
     translation_lang = "en2zh"
-    time.sleep(
+    time.sleep(3)
     audio_length = get_length(audio)
     play_audio(audio)
     web_records = page.get_current_node_text(duration=audio_length)

tests/test_data/test_audios.zip
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:4b3832d0c066ab144e2cda7e37df5144922dbceb0ae2605134eada3c866b0d43
size 83025760

tests/test_data/test_audios/10s-en-ac1-16k/English-chaos-part2-0.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:111f098aa42c139e19c795fc65b14d3b1435a29d75d208592c59e98f5e43144a
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-chaos-part2-10.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:836b29596a0c1609aa91d6d48bc3fd7c73ebda89656744d5ba5691168bebc8a7
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-chaos-part2-20.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:8a2f7a0f2e768846ad361672b402bb243c30c22631286908f58a8ffb9d4361ad
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-chaos-part2-30.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:ef83616e756e449f0307c93b97b8d260bc4c68e213dc878e9f0ca4a46e2a69b7
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-computer_sicence-part1-0.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:dbf8ee215f2e447dff00e5d3cfee257a2945f1689c5af2fd995729f02315802d
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-computer_sicence-part1-10.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:e27e550cde48277d1239cb9d9ee40749c520c2f4d5824bfb2fb46b29a8db2fc8
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-computer_sicence-part1-20.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:69bedb0d844819919c44f5280aeb8ce20d3eee30099565bfff926aa883702a3c
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-computer_sicence-part1-30.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:606f1b5ad35f4edeaf274fd7e54c1b32cf22e905feee795c207de9d837f9031a
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-generative_ai-part1-0.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:171b9e830af34320b9049564390b36305fd98168a82a45d3bb93f24acb2ede29
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-generative_ai-part1-10.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:7077bd545e60a791a4f5bef34e6d52a0c580ea7f3cb767bc2808442836347ec5
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-generative_ai-part1-20.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:a85b17f66cc09d5cbdd55f17e52bc376db9d7c3668a1613b74cc60c146a8b2aa
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-generative_ai-part1-30.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:b25e9278c64ba162139f4785a0419435112653d3c7bd66bb7d6a35e7c20bd12b
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-internet-part20-0.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:2661796edf3667289ebb1772c3a3fb3d120ae7fb2e96c08899a5261b817fef49
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-internet-part20-10.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:100272f678ee41dc71c35e1b705bfbf3aef69650562539a1390f87a8ec21a926
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-internet-part20-20.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:5110fd6774eab81a40c6c11fe5b08cb941588aa3fec0a00aa6bc951907750dec
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-internet-part20-30.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:ace6cb64e830ab31892ea4ca072051b5993bbe393ac63bb887157b6c6808bbf6
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-legalsystem-part1-0.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:2b6d1ba99abc344ec4d31a3a6e34af5ab81dda139512f306bba94c5c52b71edc
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-legalsystem-part1-10.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:48e5aff409e3b83ae387dd6fd9c06f131116191161e2a844ba69febf1e8cbf3f
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-legalsystem-part1-20.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:bfea77a95b17e3dace6dd5504bb5d618597619e1e64ecc91e52748e354331170
size 320078

tests/test_data/test_audios/10s-en-ac1-16k/English-legalsystem-part1-30.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:0111f1d6d65692dce0b3200de36af5fa27b086401e462123baddc11f45fc8ef6
size 320078

tests/test_data/test_audios/10s-mix/qiaodan-part1-0.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:464420cd14b00b08d4240fb5ecc19b2aa053ebbf534cb0aadb8f80f7bf0da668
size 320078

tests/test_data/test_audios/10s-mix/qiaodan-part1-10.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:5dbd856cf2a9543f3c133483708a6354a9ee06718b496bce03c29a981a56f8fc
size 320078

tests/test_data/test_audios/10s-mix/qiaodan-part1-20.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:59c1d0e112e1fafe9ec26a02d786451c890636f83f1dc63dc36fdcec4c9526e4
size 320078

tests/test_data/test_audios/10s-mix/qiaodan-part1-30.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:de45323f5741aac05143b6b995568c8e10e0bad5eadde2df9d67e0770ecc57b8
size 320078

tests/test_data/test_audios/10s-mix/qiaodan-part2-0.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:648770939935801a613bc36917df034811ae70012c72d399582b54ff54d5cae0
size 320078

tests/test_data/test_audios/10s-mix/qiaodan-part2-10.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:f9b3f117ccbb2bc9bab9355a3174b3fe9b39a6786ad37c0c20d577b643031aff
size 320078

tests/test_data/test_audios/10s-mix/qiaodan-part2-20.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:c918e58777b7d72307089963690a72ee1e67cf733f328fa93a385c204e0e4b02
size 320078

tests/test_data/test_audios/10s-mix/qiaodan-part2-30.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:85891cc64cb79a385d986915dfcfa4fbc3e0ddf12870c7beafcc76c1e012bf78
size 320078

tests/test_data/test_audios/10s-mix/randomforest-part1-0.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:802125c360c22476bad1aaabb6c5210d21460ed69884f21a3cb318ea00377345
size 320078

tests/test_data/test_audios/10s-mix/randomforest-part1-10.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:1605d02574c5a50a91bda5c397079eb6f503311012ed9f15e9b9f90ee7c5f30e
size 320078

tests/test_data/test_audios/10s-mix/randomforest-part1-20.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:777b2c46d021c3c10fd0eafce8d6a17bddfc0867119d4537daaff945cf839462
size 320078

tests/test_data/test_audios/10s-mix/randomforest-part1-30.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:d9da19f35aab393949a408855ea7970f55ae50f13ceaa0f87e576cb1270cc019
size 320078

tests/test_data/test_audios/10s-mix/zhanghuailong-part1-0.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:6799ca07ab7c8db1ebb97244bc89022b281464dc864de0267b52192db8a3e107
size 320078

tests/test_data/test_audios/10s-mix/zhanghuailong-part1-10.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:8709c003dd26f01d0a140ed3c31e0dd84801984b7a50a36b0239e6b9590538f0
size 320078

tests/test_data/test_audios/10s-mix/zhanghuailong-part1-20.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:c15a860a7c6994862244e49cb1fd2e19ac633e1b2c2abddedb10bed480968ed0
size 320078