yujuanqin committed
Commit db0d138 · Parent(s): e5b5f3e

add asr test

Files changed (45)
  1. lib/asr_models/__init__.py +14 -0
  2. lib/asr_models/base_model.py +25 -0
  3. lib/asr_models/evaluator.py +109 -0
  4. lib/asr_models/funasr_nano.py +63 -0
  5. lib/asr_models/funasr_quant.py +41 -0
  6. lib/asr_models/model.py +701 -0
  7. lib/asr_models/whisper.py +44 -0
  8. lib/asr_models/whisper_finetuned.py +65 -0
  9. reports/asr_result_funasr_mlt_nano_librispeech_clean.json +0 -0
  10. reports/asr_result_funasr_mlt_nano_recording.json +597 -0
  11. reports/asr_result_funasr_mlt_nano_wenet_net.json +0 -0
  12. reports/asr_result_funasr_nano_librispeech_clean.json +0 -0
  13. reports/asr_result_funasr_nano_recording.json +597 -0
  14. reports/asr_result_funasr_nano_wenet_net.json +0 -0
  15. reports/asr_result_funasr_quant_librispeech_clean.json +0 -0
  16. reports/asr_result_funasr_quant_recording.json +597 -0
  17. reports/asr_result_funasr_quant_wenet_net.json +0 -0
  18. reports/asr_result_whisper_finetuned_librispeech_clean.json +0 -0
  19. reports/asr_result_whisper_finetuned_recording.json +597 -0
  20. reports/asr_result_whisper_finetuned_wenet_net.json +0 -0
  21. reports/asr_result_whisper_librispeech_clean.json +0 -0
  22. reports/asr_result_whisper_recording.json +597 -0
  23. reports/asr_result_whisper_wenet_net.json +0 -0
  24. scripts/asr_utils.py +6 -4
  25. scripts/batch_run_asr.py +11 -0
  26. scripts/caculate_cer.py +2 -2
  27. scripts/csv/fine-tune_whisper.csv +0 -2
  28. scripts/csv/funasr_quant.csv +0 -86
  29. scripts/csv/whisper.csv +0 -86
  30. scripts/export_onnx.py +18 -13
  31. scripts/model.py +696 -0
  32. scripts/run_funasr.py +2 -2
  33. scripts/run_funasr_mlt_nano.py +160 -0
  34. scripts/run_funasr_nano.py +161 -0
  35. scripts/run_funasr_quant.py +1 -1
  36. scripts/run_whisper.py +1 -2
  37. scripts/run_whisper_finetuned.py +1 -1
  38. scripts/vad.py +12 -0
  39. scripts/wenet_utils.py +63 -0
  40. test_data/{dataset → AIShell/dataset}/dataset.txt +0 -0
  41. test_data/__init__.py +9 -0
  42. test_data/audios.py +44 -33
  43. tests/test_asr/__init__.py +0 -0
  44. tests/test_asr/conftest.py +59 -0
  45. tests/test_asr/test_asr.py +102 -0
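Note: the new test suite lives under tests/test_asr (items 43-45 above). A minimal sketch of invoking it programmatically, assuming pytest is installed; this example is editorial and not part of the commit:

    import pytest

    # Equivalent to running `pytest tests/test_asr -v` from the repo root.
    exit_code = pytest.main(["tests/test_asr", "-v"])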
lib/asr_models/__init__.py ADDED
@@ -0,0 +1,14 @@
+ from .base_model import ModelName
+ from .whisper import Whisper
+ from .whisper_finetuned import WhisperFinetuned
+ from .funasr_nano import FunasrNano, FunasrMLTNano
+ from .funasr_quant import FunasrQuant
+
+ __all__ = [
+     "ModelName",
+     "Whisper",
+     "WhisperFinetuned",
+     "FunasrNano",
+     "FunasrMLTNano",
+     "FunasrQuant",
+ ]
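Note: a minimal sketch of selecting a backend through these exports; the MODEL_REGISTRY mapping and build_model helper are hypothetical, only the class and enum names come from this commit:

    from lib.asr_models import (
        ModelName, Whisper, WhisperFinetuned, FunasrNano, FunasrMLTNano, FunasrQuant,
    )

    # Hypothetical registry from enum members to the exported model classes.
    MODEL_REGISTRY = {
        ModelName.WHISPER: Whisper,
        ModelName.WHISPER_FINETUNED: WhisperFinetuned,
        ModelName.FUNASR_NANO: FunasrNano,
        ModelName.FUNASR_MLT_NANO: FunasrMLTNano,
        ModelName.FUNASR_QUANT: FunasrQuant,
    }

    def build_model(name: ModelName, device: str = "cpu"):
        """Instantiate (but do not load) the backend registered under `name`."""
        return MODEL_REGISTRY[name](device=device)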
lib/asr_models/base_model.py ADDED
@@ -0,0 +1,25 @@
+ from abc import ABC, abstractmethod
+ from enum import Enum
+
+
+ class AbstractASRModel(ABC):
+     def __init__(self, device='cpu'):
+         self.device = device
+         self.name = "AbstractASRModel"
+
+     @abstractmethod
+     def load(self, model_dir, language):
+         raise NotImplementedError
+
+     @abstractmethod
+     def transcribe(self, wav, language):
+         raise NotImplementedError
+
+
+ class ModelName(Enum):
+     WHISPER = "whisper"
+     WHISPER_FINETUNED = "whisper_finetuned"
+     FUNASR_NANO = "funasr_nano"
+     FUNASR_MLT_NANO = "funasr_mlt_nano"
+     FUNASR_QUANT = "funasr_quant"
+
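Note: judging by the concrete subclasses added in this commit, transcribe returns a (text, inference_seconds) tuple. A minimal sketch of the contract with a toy backend; the EchoModel class is illustrative, not part of the commit:

    import time
    from lib.asr_models.base_model import AbstractASRModel

    class EchoModel(AbstractASRModel):
        """Toy backend that satisfies the AbstractASRModel interface."""

        def load(self, model_dir, language):
            self.model_dir = model_dir  # a real backend would load weights here

        def transcribe(self, wav, language):
            start = time.perf_counter()
            text = f"<no speech recognized in {wav}>"  # placeholder transcription
            return text, time.perf_counter() - start

    model = EchoModel(device="cpu")
    model.load("/tmp/models", language="zh")
    print(model.transcribe("1.wav", language="zh"))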
lib/asr_models/evaluator.py ADDED
@@ -0,0 +1,109 @@
+ import re
+ from jiwer import wer, cer
+ import cn2an
+ from whisper_normalizer.english import EnglishTextNormalizer
+ from whisper_normalizer.basic import BasicTextNormalizer
+
+
+ class ASREvaluator:
+     def __init__(self):
+         # Official English normalizer: handles spelling, abbreviations, punctuation
+         self.en_normalizer = EnglishTextNormalizer()
+         # Official basic normalizer: mainly for non-English; handles punctuation, casing, extra spaces
+         self.zh_normalizer = BasicTextNormalizer()
+
+     def clean_zh_text(self, text):
+         """Chinese-specific normalization."""
+         # print("text before clean:", text)
+         if re.search(r"\d", text):
+             text = cn2an.transform(text, "an2cn")
+         # 1. Basic normalization (strip punctuation, traditional-to-simplified, etc.)
+         text = self.zh_normalizer(text)
+         # 2. Remove all whitespace (so spaces already present in the text do not interfere)
+         text = re.sub(r'\s+', '', text)
+         # print("text after clean:", text)
+         return text
+
+     def clean_en_text(self, text):
+         """English normalization."""
+         # print("text before clean:", text)
+         text = self.en_normalizer(text)
+         # print("text after clean:", text)
+         return text
+
+     def compute_en_wer(self, data):
+         """Compute the English word error rate (WER).
+         data = [{
+             "index": 1,
+             "audio_path": "xxx.wav",
+             "reference": "text",
+             "inference_time": 0.123,
+             "predicts": "test"
+         }]
+         """
+         refs = []
+         preds = []
+         for item in data:
+             ref_clean = self.clean_en_text(item["reference"])
+             pred_clean = self.clean_en_text(item["predicts"])
+             if ref_clean.strip():  # skip empty reference texts
+                 refs.append(ref_clean)
+                 preds.append(pred_clean)
+         score = wer(refs, preds)
+         return score
+
+     def compute_zh_cer(self, data):
+         """Compute the Chinese character error rate (CER).
+         data = [{
+             "index": 1,
+             "audio_path": "xxx.wav",
+             "reference": "text",
+             "inference_time": 0.123,
+             "predicts": "test"
+         }]
+         """
+         # Note: for Chinese, splitting sentences into characters and computing WER is equivalent to CER.
+         refs = []
+         preds = []
+         for item in data:
+             ref_clean = self.clean_zh_text(item["reference"])
+             pred_clean = self.clean_zh_text(item["predicts"])
+             if ref_clean.strip():  # skip empty reference texts
+                 refs.append(ref_clean)
+                 preds.append(pred_clean)
+         score = cer(refs, preds)
+         return score
+
+
+ def compute(model_name, data_name, results, language):
+     evaluator = ASREvaluator()
+     if language == "zh":
+         cer = evaluator.compute_zh_cer(results)
+         res = f"Model: {model_name}, Dataset: {data_name}, data {len(results)}, CER: {cer:.2%}"
+         print(res)
+         return res
+     else:
+         wer = evaluator.compute_en_wer(results)
+         res = f"Model: {model_name}, Dataset: {data_name}, data {len(results)}, WER: {wer:.2%}"
+         print(res)
+         return res
+
+
+ # ================= Usage example =================
+ if __name__ == "__main__":
+     import json
+     from pathlib import Path
+
+     evaluator = ASREvaluator()
+     print(evaluator.clean_zh_text("相比sota模型,我们的方法在小样本场景下召回率高出十二个百分点,且参数量仅为其三分之一。"))
+     # result_file = Path("/Users/jeqin/work/code/TestTranslator/reports/whisper_libri_en.json")
+     # with open(result_file, "r", encoding="utf-8") as f:
+     #     data = json.load(f)
+     # en_wer = evaluator.compute_en_wer(data)
+     # print(f"{result_file.name} WER: {en_wer:.2%}")
+
+     reports = Path("/Users/jeqin/work/code/TestTranslator/reports")
+     for file in reports.glob("*wenet_net.json"):
+         with open(file) as f:
+             data = json.load(f)
+         compute(model_name=file.name, data_name="wenet_net", results=data, language="zh")
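Note: a minimal sketch of driving the evaluator from one of the report files added in this commit, assuming the schema shown in the docstrings above (a list of objects with "reference" and "predicts" fields):

    import json
    from lib.asr_models.evaluator import compute

    # Any of the reports/asr_result_*.json files below has this shape.
    with open("reports/asr_result_whisper_recording.json", encoding="utf-8") as f:
        results = json.load(f)

    # Prints and returns a summary line such as
    # "Model: ..., Dataset: recording, data 85, CER: ...%".
    compute(model_name="whisper", data_name="recording", results=results, language="zh")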
lib/asr_models/funasr_nano.py ADDED
@@ -0,0 +1,63 @@
+ from funasr import AutoModel
+
+ from lib.asr_models.base_model import AbstractASRModel, ModelName
+ from lib.utils import Timer
+ from environment import PROJECT_DIR
+
+ MODEL_DIR = "/Users/jeqin/work/code/Fun-ASR-Nano-2512"
+ MODEL_MLT_DIR = "/Users/jeqin/work/code/Fun-ASR-MLT-Nano-2512"
+ language_map = {
+     "en": "英文",
+     "zh": "中文",
+ }
+ class FunasrNano(AbstractASRModel):
+     def __init__(self, device='mps'):
+         super().__init__(device)
+         self.name = ModelName.FUNASR_NANO
+
+     def load(self, model_dir=MODEL_DIR, language=""):
+         with Timer("Loading Fun-ASR-Nano model"):
+             self.model = AutoModel(
+                 model=str(model_dir),
+                 trust_remote_code=True,
+                 remote_code=str(PROJECT_DIR / "scripts" / "model.py"),
+                 device=self.device,
+                 disable_update=True,
+             )
+
+     def transcribe(self, wav, language="zh"):
+         language = language_map.get(language)
+         with Timer("Transcribing audio") as t:
+             res = self.model.generate(input=[str(wav)], cache={}, batch_size=1, language=language, itn=True)
+             # res = self.model.generate(
+             #     input=[str(wav)],
+             #     cache={},
+             #     # batch_size=1,
+             #     hotwords=["开放时间", "llama", "decode"],
+             #     # Valid `language` values: 中文, 英文, 日文 for Fun-ASR-Nano-2512;
+             #     # 中文, 英文, 粤语, 日文, 韩文, 越南语, 印尼语, 泰语, 马来语, 菲律宾语, 阿拉伯语,
+             #     # 印地语, 保加利亚语, 克罗地亚语, 捷克语, 丹麦语, 荷兰语, 爱沙尼亚语, 芬兰语, 希腊语,
+             #     # 匈牙利语, 爱尔兰语, 拉脱维亚语, 立陶宛语, 马耳他语, 波兰语, 葡萄牙语, 罗马尼亚语,
+             #     # 斯洛伐克语, 斯洛文尼亚语, 瑞典语 for Fun-ASR-MLT-Nano-2512
+             #     language=language,
+             #     itn=True,  # or False
+             # )
+         text = res[0]["text"]
+         return text, t.duration
+
+
+ class FunasrMLTNano(FunasrNano):
+     def __init__(self, device='mps'):
+         super().__init__(device)
+         self.name = ModelName.FUNASR_MLT_NANO
+
+     def load(self, model_dir=MODEL_MLT_DIR, language=""):
+         super().load(model_dir, language)
+
+
+ if __name__ == "__main__":
+     model = FunasrMLTNano()
+     model.load()
+     text, cost = model.transcribe('../../test_data/recordings/1.wav', language="zh")
+     print("inference time: ", cost)
+     print(text)
lib/asr_models/funasr_quant.py ADDED
@@ -0,0 +1,41 @@
+ from pathlib import Path
+ from funasr_onnx import SeacoParaformer, CT_Transformer, Fsmn_vad
+
+ from lib.utils import Timer
+ from lib.asr_models.base_model import AbstractASRModel, ModelName
+
+
+ MODEL_DIR = "/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models"
+
+ class FunasrQuant(AbstractASRModel):
+     def __init__(self, device='mps'):
+         super().__init__(device=device)
+         self.name = ModelName.FUNASR_QUANT
+
+     def load(self, model_dir=MODEL_DIR, language=""):
+         quantize = True
+         with Timer("Loading Fun-ASR-Quant model"):
+             model_dir = Path(model_dir)
+             asr_model_path = model_dir / 'speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
+             vad_model_path = model_dir / 'speech_fsmn_vad_zh-cn-16k-common-pytorch'
+             punc_model_path = model_dir / 'punc_ct-transformer_cn-en-common-vocab471067-large'
+             self.vad_model = Fsmn_vad(vad_model_path, quantize=quantize)
+             self.asr_model = SeacoParaformer(asr_model_path, quantize=quantize)
+             self.punc_model = CT_Transformer(punc_model_path, quantize=quantize)
+
+     def transcribe(self, wav, language="zh"):
+         with Timer("Transcribing audio") as t:
+             asr_res = self.asr_model(str(wav), hotwords="", language=language)
+             text = ""
+             if len(asr_res) > 0:
+                 asr_text = asr_res[0]["preds"]
+                 result = self.punc_model(asr_text)
+                 text = result[0]
+         return text, t.duration
+
+ if __name__ == "__main__":
+     model = FunasrQuant(device='mps')
+     model.load()
+     text, cost = model.transcribe('../../test_data/recordings/1.wav', language="en")
+     print("inference time: ", cost)
+     print(text)
lib/asr_models/model.py ADDED
@@ -0,0 +1,701 @@
+ import json
+ import logging
+ import os
+ import random
+ import re
+ import string
+ import time
+ import traceback
+
+ import torch
+ import torch.nn as nn
+ from funasr import AutoModel
+ from funasr.metrics.compute_acc import compute_accuracy
+ from funasr.register import tables
+ from funasr.train_utils.device_funcs import force_gatherable, to_device
+ from funasr.utils.datadir_writer import DatadirWriter
+ from funasr.utils.load_utils import extract_fbank, load_audio_text_image_video
+ from transformers import AutoConfig, AutoModelForCausalLM
+
+ dtype_map = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}
+
+
+ @tables.register("model_classes", "FunASRNano")
+ class FunASRNano(nn.Module):
+     def __init__(
+         self,
+         audio_encoder: str = None,
+         audio_encoder_conf: dict = None,
+         audio_adaptor: str = None,
+         audio_adaptor_conf: dict = None,
+         llm: str = None,
+         llm_conf: dict = None,
+         input_size: int = 80,
+         length_normalized_loss: bool = False,
+         **kwargs,
+     ):
+         super().__init__()
+
+         # audio encoder
+         hub = audio_encoder_conf.get("hub", None)
+         self.audio_encoder_activation_checkpoint = audio_encoder_conf.get(
+             "activation_checkpoint", False
+         )
+         if hub == "ms":
+             model = AutoModel(model=audio_encoder, model_revision="master")
+             audio_encoder_output_size = (
+                 model.model.encoder_output_size
+                 if hasattr(model.model, "encoder_output_size")
+                 else -1
+             )
+             audio_encoder = (
+                 model.model.model.encoder
+                 if hasattr(model.model, "model")
+                 else model.model.encoder
+             )
+         else:
+             encoder_class = tables.encoder_classes.get(audio_encoder)
+             audio_encoder = encoder_class(input_size=input_size, **audio_encoder_conf)
+             audio_encoder_output_size = audio_encoder.output_size()
+         freeze = audio_encoder_conf.get("freeze", True)
+         freeze_layer_num = int(audio_encoder_conf.get("freeze_layer_num", -1))
+
+         if freeze:
+             for name, param in audio_encoder.named_parameters():
+                 param.requires_grad = False
+             audio_encoder.eval()
+         self.audio_encoder = audio_encoder
+         # llm
+         self.llm = None
+         init_param_path = llm_conf.get("init_param_path", None)
+         llm_dim = None
+
+         llm_load_kwargs = llm_conf.get("load_kwargs", {})
+         config = AutoConfig.from_pretrained(init_param_path)
+         model = AutoModelForCausalLM.from_config(config, **llm_load_kwargs)
+
+         freeze = llm_conf.get("freeze", True)
+         if freeze:
+             for name, param in model.named_parameters():
+                 param.requires_grad = False
+             model.eval()
+         logging.info(f"use_lora: {llm_conf.get('use_lora', False)}")
+         if llm_conf.get("use_lora", False):
+             from omegaconf import DictConfig, OmegaConf
+
+             lora_conf = llm_conf.get("lora_conf", {})
+             if isinstance(lora_conf, (OmegaConf, DictConfig)):
+                 lora_conf = OmegaConf.to_container(lora_conf, resolve=True)
+             from peft import LoraConfig, PeftModel, get_peft_model
+
+             lora_init_param_path = lora_conf.get("init_param_path", None)
+             if lora_init_param_path is not None:
+                 logging.info(f"lora_init_param_path: {lora_init_param_path}")
+                 model = PeftModel.from_pretrained(model, lora_init_param_path)
+                 for name, param in model.named_parameters():
+                     if not lora_conf.get("freeze_lora", False):
+                         if "lora_" in name:
+                             param.requires_grad = True
+             else:
+                 peft_config = LoraConfig(**lora_conf)
+                 model = get_peft_model(model, peft_config)
+             model.print_trainable_parameters()
+
+         if llm_conf.get("activation_checkpoint", False):
+             model.gradient_checkpointing_enable()
+
+         self.llm_dtype = llm_conf.get("llm_dtype", "fp32")
+         self.llm = model.to(dtype_map[self.llm_dtype])
+         llm_dim = model.get_input_embeddings().weight.shape[-1]
+
+         # adaptor
+         adaptor_class = tables.adaptor_classes.get(audio_adaptor)
+         if audio_encoder_output_size > 0:
+             audio_adaptor_conf["encoder_dim"] = audio_encoder_output_size
+         audio_adaptor_conf["llm_dim"] = (
+             llm_dim if llm_dim is not None else audio_adaptor_conf["llm_dim"]
+         )
+         audio_adaptor = adaptor_class(**audio_adaptor_conf)
+         freeze = audio_adaptor_conf.get("freeze", False)
+         if freeze:
+             for name, param in audio_adaptor.named_parameters():
+                 param.requires_grad = False
+             audio_adaptor.eval()
+         self.audio_adaptor = audio_adaptor
+
+         self.length_normalized_loss = length_normalized_loss
+         self.feat_permute = audio_encoder_conf.get("feat_permute", True)
+         rank = int(os.environ.get("RANK", 0))
+         logging.info(f"rank: {rank}, model is built.")
+
+     def forward(
+         self,
+         speech: torch.Tensor = None,
+         speech_lengths: torch.Tensor = None,
+         input_ids: torch.Tensor = None,
+         attention_mask: torch.Tensor = None,
+         labels_ids: torch.Tensor = None,
+         fbank_beg: torch.Tensor = None,
+         fbank_mask: torch.Tensor = None,
+         **kwargs,
+     ):
+         batch_size, token_num = input_ids.shape
+         stats = {}
+         input_ids[input_ids < 0] = 0
+         inputs_embeds = self.llm.model.get_input_embeddings()(input_ids)
+         if speech is not None:
+             if len(speech_lengths.size()) > 1:
+                 speech_lengths = speech_lengths[:, 0]
+             batch_size_speech, frames, _ = speech.shape
+
+             # audio encoder
+             if self.audio_encoder_activation_checkpoint:
+                 from torch.utils.checkpoint import checkpoint
+
+                 encoder_out, encoder_out_lens = checkpoint(
+                     self.encode, speech, speech_lengths, use_reentrant=False
+                 )
+             else:
+                 encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+
+             # audio_adaptor
+             encoder_out, encoder_out_lens = self.audio_adaptor(
+                 encoder_out, encoder_out_lens
+             )
+
+             batch_size, token_num, dims = inputs_embeds.shape
+             fake_token_len = kwargs.get("fake_token_len")
+             fake_token_len[fake_token_len < 0] = 0
+             fbank_beg[fbank_beg < 0] = 0
+
+             speech_idx = 0
+             for batch_idx in range(batch_size):
+                 for turn_id in range(fbank_beg.shape[1]):
+                     fbank_beg_idx = fbank_beg[batch_idx, turn_id].item()
+                     if fbank_beg_idx > 0:
+                         speech_token_len = fake_token_len[batch_idx, turn_id]
+                         speech_token = encoder_out[speech_idx, :speech_token_len, :]
+
+                         try:
+                             inputs_embeds[
+                                 batch_idx,
+                                 fbank_beg_idx : fbank_beg_idx + speech_token_len,
+                                 :,
+                             ] = speech_token
+                         except Exception as e:
+                             logging.error(f"{str(e)}, {traceback.format_exc()}")
+                             logging.info(
+                                 f"batch_idx: {batch_idx}, inputs_embeds: {inputs_embeds.shape}, fbank_beg_idx: {fbank_beg_idx}, speech_token_len: {speech_token_len}, encoder_out: {encoder_out.shape}, encoder_out_lens: {encoder_out_lens}, fake_token_len: {fake_token_len}, speech_lengths: {speech_lengths}"
+                             )
+                             speech_token_len = encoder_out_lens[speech_idx].item()
+                             speech_token = encoder_out[speech_idx, :speech_token_len, :]
+                             inputs_embeds[
+                                 batch_idx,
+                                 fbank_beg_idx : fbank_beg_idx + speech_token_len,
+                                 :,
+                             ] = speech_token
+
+                         speech_idx += 1
+
+             stats["batch_size_speech"] = batch_size_speech
+             stats["batch_size_x_frames"] = frames * batch_size_speech
+             stats["batch_size_real_frames"] = speech_lengths.sum().item()
+             stats["padding_frames"] = (
+                 stats["batch_size_x_frames"] - stats["batch_size_real_frames"]
+             )
+
+         device_type = next(self.parameters()).device.type
+         with torch.autocast(
+             device_type=device_type if device_type in ["cuda", "mps"] else "cpu",
+             enabled=True if self.llm_dtype != "fp32" else False,
+             dtype=dtype_map[self.llm_dtype],
+         ):
+             labels_ids[labels_ids == -1] = -100
+             attention_mask[attention_mask < 0] = 0
+             model_outputs = self.llm(
+                 inputs_embeds=inputs_embeds.to(dtype_map[self.llm_dtype]),
+                 attention_mask=attention_mask,
+                 labels=labels_ids,
+             )
+             loss = model_outputs.loss
+
+             with torch.no_grad():
+                 preds = torch.argmax(model_outputs.logits, -1)
+                 acc_att = compute_accuracy(
+                     preds[:, :-1], labels_ids[:, 1:], ignore_label=-100
+                 )
+                 stats["acc"] = acc_att
+
+         stats["loss"] = torch.clone(loss.detach())
+         stats["batch_size"] = batch_size
+
+         stats["batch_size_x_tokens"] = token_num * batch_size
+         stats["batch_size_real_tokens"] = attention_mask.sum().item()
+         stats["padding_tokens"] = (
+             stats["batch_size_x_tokens"] - stats["batch_size_real_tokens"]
+         )
+
+         dialog_turns = (fbank_beg > 0).sum(-1)
+         dialog_turns_max = torch.max(dialog_turns).int().item()
+         dialog_turns_avg = dialog_turns.sum().item() / batch_size
+         stats["dialog_turns_max"] = dialog_turns_max
+         stats["dialog_turns_avg"] = dialog_turns_avg
+
+         # force_gatherable: to-device and to-tensor if scalar for DataParallel
+         if self.length_normalized_loss:
+             batch_size = int((labels_ids > 0 + 1).sum())
+         loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
+         return loss, stats, weight
+
+     def forward_export(self, speech, speech_lengths, **kwargs):
+         x, olens = self.audio_encoder(speech, speech_lengths)
+         encoder_out, encoder_out_lens = self.audio_adaptor(x, olens)
+         return encoder_out, encoder_out_lens
+
+     def encode(self, speech, speech_lengths):
+         # audio encoder
+         if self.feat_permute:
+             encoder_out, encoder_out_lens = self.audio_encoder(
+                 speech.permute(0, 2, 1), speech_lengths
+             )
+         else:
+             encoder_out, encoder_out_lens = self.audio_encoder(speech, speech_lengths)
+
+         return encoder_out, encoder_out_lens
+
+     def data_template(self, data):
+         system, user, assistant = [], [], []
+         for i, item in enumerate(data):
+             role = item["role"]
+             content = item["content"]
+             if role == "system":
+                 system.append(content)
+             elif role == "user":
+                 if "audio" in item:
+                     audio = item["audio"]
+                     content = [content, audio]
+                 user.append(content)
+             elif role == "assistant":
+                 assistant.append(content)
+
+         system = system * len(user)
+
+         contents = {
+             "system": system,
+             "user": user,
+             "assistant": assistant,
+         }
+
+         return contents
+
+     def data_load_speech(
+         self, contents: dict, tokenizer, frontend, meta_data={}, **kwargs
+     ):
+         system = contents["system"]
+         user = contents["user"]
+         assistant = contents["assistant"]
+         pattern = re.compile(r"(<\|startofspeech\|>.*?<\|endofspeech\|>)")
+         do_think = True
+         sys_prompt = True
+         if "dataset_conf" in kwargs:
+             do_think = kwargs["dataset_conf"].get("do_think", True)
+             sys_prompt = kwargs["dataset_conf"].get("sys_prompt", True)
+
+         input_ids, labels, fbank, fbank_lens, fbank_mask, fbank_beg, fake_token_len = (
+             [],
+             [],
+             [],
+             [],
+             [],
+             [],
+             [],
+         )
+         input_source_ids = []
+         for i, (system_prompt, user_prompt, target_out) in enumerate(
+             zip(system, user, assistant)
+         ):
+             if i >= kwargs.get("multiturn_num_max", 5):
+                 break
+             if len(input_ids) > kwargs.get("max_token_length", 1500):
+                 break
+             if isinstance(user_prompt, (list, tuple)):
+                 user_prompt, audio = user_prompt
+             if i == 0:
+                 if kwargs.get("infer_with_assistant_input", False):
+                     source_input = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}"
+                     if not sys_prompt:
+                         source_input = f"<|im_start|>user\n{user_prompt}"
+                 else:
+                     source_input = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
+                     if not sys_prompt:
+                         source_input = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
+             else:
+                 if kwargs.get("infer_with_assistant_input", False):
+                     source_input = f"<|im_start|>user\n{user_prompt}"
+                 else:
+                     source_input = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
+             if not do_think:
+                 source_input += "<think>\n\n</think>\n\n"
+
+             splits = pattern.split(source_input)
+             source_ids = []
+             fbank_mask_i = []
+             fake_token_len_i = 0
+             fbank_beg_i = -1
+             speech, speech_lengths = [], []
+             for k, sub_str in enumerate(splits):
+                 if not sub_str.startswith("<|startofspeech|>"):
+                     sub_token = tokenizer.encode(sub_str)
+                     source_ids += sub_token
+                     fbank_mask_i += [0] * len(sub_token)
+                 else:
+                     sub_str = sub_str.replace("<|startofspeech|>", "").replace(
+                         "<|endofspeech|>", ""
+                     )
+                     if sub_str.startswith("!"):
+                         sub_str = sub_str[1:]
+                         if sub_str.startswith("!"):  # !!: audio sample point
+                             sub_str = audio
+                         try:
+                             time1 = time.perf_counter()
+                             data_src = load_audio_text_image_video(
+                                 sub_str, fs=frontend.fs, **kwargs
+                             )
+                             time2 = time.perf_counter()
+                             meta_data["load_data"] = f"{time2 - time1:0.3f}"
+                         except Exception as e:
+                             logging.error(
+                                 f"Loading wav failed! {str(e)}, {traceback.format_exc()}"
+                             )
+
+                         speech, speech_lengths = extract_fbank(
+                             data_src,
+                             data_type=kwargs.get("data_type", "sound"),
+                             frontend=frontend,
+                             is_final=True,
+                         )  # speech: [b, T, d]
+
+                         time3 = time.perf_counter()
+                         meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
+                         meta_data["batch_data_time"] = (
+                             speech_lengths.sum().item()
+                             * frontend.frame_shift
+                             * frontend.lfr_n
+                             / 1000
+                         )
+
+                         if self.feat_permute:
+                             speech = speech.permute(0, 2, 1)
+
+                         olens = 1 + (speech_lengths[0].item() - 3 + 2 * 1) // 2
+                         olens = 1 + (olens - 3 + 2 * 1) // 2
+                         fake_token_len_i = (olens - 1) // 2 + 1
+                         fake_token = [0] * fake_token_len_i
+                         fbank_beg_i = len(source_ids)
+                         source_ids += fake_token
+                         fbank_mask_i += [1] * len(fake_token)
+
+             fbank_beg += [fbank_beg_i + len(input_ids)]
+             fake_token_len += [fake_token_len_i]
+             source_mask = [-100] * len(source_ids)
+             target_out = f"{target_out}<|im_end|>"
+             target_ids = tokenizer.encode(target_out)
+             input_source_ids = input_ids + source_ids
+             input_ids += source_ids + target_ids
+             labels += source_mask + target_ids
+             fbank_mask += fbank_mask_i
+             if len(speech) > 0:
+                 fbank.append(speech[0, :, :])
+                 fbank_lens.append(speech_lengths)
+
+         input_ids = torch.tensor(
+             input_ids, dtype=torch.int64
+         )  # [: self.max_token_length]
+         attention_mask = torch.tensor([1] * len(input_ids), dtype=torch.int32)
+         labels = torch.tensor(labels, dtype=torch.int64)  # [: self.max_token_length]
+
+         fbank_mask = torch.tensor(fbank_mask, dtype=torch.float32)
+         fbank_beg = torch.tensor(fbank_beg, dtype=torch.int32)
+         fake_token_len = torch.tensor(fake_token_len, dtype=torch.int32)
+         source_ids = torch.tensor(input_source_ids, dtype=torch.int64)
+         target_ids = torch.tensor(target_ids, dtype=torch.int64)
+
+         if len(fbank) > 0:
+             speech = torch.nn.utils.rnn.pad_sequence(
+                 fbank, batch_first=True, padding_value=0.0
+             )
+             speech_lengths = torch.nn.utils.rnn.pad_sequence(
+                 fbank_lens, batch_first=True, padding_value=-1
+             )
+         else:
+             speech = []
+             speech_lengths = []
+         output = {
+             "speech": speech,
+             "speech_lengths": speech_lengths,
+             "fbank_mask": fbank_mask[None, :],
+             "fbank_beg": fbank_beg[None,],
+             "fake_token_len": fake_token_len[None, :],
+             "input_ids": input_ids[None,],
+             "attention_mask": attention_mask[None,],
+             "labels_ids": labels,
+             "source_ids": source_ids[None, :],
+             "target_ids": target_ids[None, :],
+         }
+
+         return output
+
+     def inference_prepare(
+         self,
+         data_in,
+         data_lengths=None,
+         key: list = None,
+         tokenizer=None,
+         frontend=None,
+         **kwargs,
+     ):
+         meta_data = {}
+
+         if kwargs.get("batch_size", 1) > 1:
+             raise NotImplementedError("batch decoding is not implemented")
+
+         contents = self.data_template(data_in[0])
+         output = self.data_load_speech(
+             contents, tokenizer, frontend, meta_data=meta_data, **kwargs
+         )
+         batch = to_device(output, kwargs["device"])
+
+         # audio encoder
+         speech = batch["speech"]
+
+         if len(speech) > 0:
+             if "audio_embedding" in kwargs and "audio_embedding_lens" in kwargs:
+                 encoder_out = kwargs["audio_embedding"]
+                 encoder_out_lens = kwargs["audio_embedding_lens"]
+             else:
+                 speech_lengths = batch["speech_lengths"][:, 0]
+                 # fp16
+                 if kwargs.get("fp16", False):
+                     speech = speech.to(torch.float16)
+                 elif kwargs.get("bf16", False):
+                     speech = speech.to(torch.bfloat16)
+                 # audio encoder
+                 encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+
+                 # audio_adaptor
+                 encoder_out, encoder_out_lens = self.audio_adaptor(
+                     encoder_out, encoder_out_lens
+                 )
+             meta_data["audio_adaptor_out"] = encoder_out
+             meta_data["audio_adaptor_out_lens"] = encoder_out_lens
+
+         input_ids = batch["input_ids"]
+         source_ids = batch["source_ids"]
+         fbank_beg = batch["fbank_beg"]
+         fake_token_len = batch["fake_token_len"]
+
+         if not kwargs.get("tearchforing", False):
+             input_ids = source_ids
+
+         input_ids[input_ids < 0] = 0
+         inputs_embeds = self.llm.model.get_input_embeddings()(input_ids)
+
+         batch_size, token_num, dims = inputs_embeds.shape
+
+         fake_token_len[fake_token_len < 0] = 0
+         fbank_beg[fbank_beg < 0] = 0
+
+         speech_idx = 0
+         for batch_idx in range(batch_size):
+             for turn_id in range(fbank_beg.shape[1]):
+                 fbank_beg_idx = fbank_beg[batch_idx, turn_id].item()
+                 if fbank_beg_idx > 0:
+                     speech_token_len = fake_token_len[batch_idx, turn_id]
+                     speech_token = encoder_out[speech_idx, :speech_token_len, :]
+
+                     try:
+                         inputs_embeds[
+                             batch_idx,
+                             fbank_beg_idx : fbank_beg_idx + speech_token_len,
+                             :,
+                         ] = speech_token
+                     except Exception as e:
+                         #
+                         logging.error(f"{str(e)}, {traceback.format_exc()}")
+                         logging.info(
+                             f"batch_idx: {batch_idx}, inputs_embeds: {inputs_embeds.shape}, fbank_beg_idx: {fbank_beg_idx}, speech_token_len: {speech_token_len}, encoder_out: {encoder_out.shape}, encoder_out_lens: {encoder_out_lens}, fake_token_len: {fake_token_len}, speech_lengths: {speech_lengths}"
+                         )
+                         speech_token_len = encoder_out_lens[speech_idx].item()
+                         speech_token = encoder_out[speech_idx, :speech_token_len, :]
+                         inputs_embeds[
+                             batch_idx,
+                             fbank_beg_idx : fbank_beg_idx + speech_token_len,
+                             :,
+                         ] = speech_token
+
+                     speech_idx += 1
+         return inputs_embeds, contents, batch, source_ids, meta_data
+
+     def inference(
+         self,
+         data_in,
+         data_lengths=None,
+         key: list = None,
+         tokenizer=None,
+         frontend=None,
+         **kwargs,
+     ):
+         hotwords = kwargs.get("hotwords", [])
+         if len(hotwords) > 0:
+             hotwords = ", ".join(hotwords)
+             prompt = f"请结合上下文信息,更加准确地完成语音转写任务。如果没有相关信息,我们会留空。\n\n\n**上下文信息:**\n\n\n"
+             prompt += f"热词列表:[{hotwords}]\n"
+         else:
+             prompt = ""
+         language = kwargs.get("language", None)
+         if language is None:
+             prompt += "语音转写"
+         else:
+             prompt += f"语音转写成{language}"
+         itn = kwargs.get("itn", True)
+         if not itn:
+             prompt += ",不进行文本规整"
+         prompt += ":"
+
+         new_data_in = []
+         for data in data_in:
+             if isinstance(data, str):
+                 new_data_in.append(
+                     [
+                         {"role": "system", "content": "You are a helpful assistant."},
+                         {
+                             "role": "user",
+                             "content": f"{prompt}<|startofspeech|>!{data}<|endofspeech|>",
+                         },
+                         {"role": "assistant", "content": "null"},
+                     ]
+                 )
+             elif isinstance(data, torch.Tensor):
+                 new_data_in.append(
+                     [
+                         {"role": "system", "content": "You are a helpful assistant."},
+                         {
+                             "role": "user",
+                             "content": f"{prompt}<|startofspeech|>!!<|endofspeech|>",
+                             "audio": data,
+                         },
+                         {"role": "assistant", "content": "null"},
+                     ]
+                 )
+         data_in = new_data_in
+
+         if key is None:
+             key = []
+             for _ in data_in:
+                 chars = string.ascii_letters + string.digits
+                 key.append(
+                     "rand_key_" + "".join(random.choice(chars) for _ in range(13))
+                 )
+
+         return self.inference_llm(
+             data_in,
+             data_lengths=data_lengths,
+             key=key,
+             tokenizer=tokenizer,
+             frontend=frontend,
+             **kwargs,
+         )
+
+     def inference_llm(
+         self,
+         data_in,
+         data_lengths=None,
+         key: list = None,
+         tokenizer=None,
+         frontend=None,
+         **kwargs,
+     ):
+         inputs_embeds, contents, batch, source_ids, meta_data = self.inference_prepare(
+             data_in, data_lengths, key, tokenizer, frontend, **kwargs
+         )
+         llm_dtype = kwargs.get("llm_dtype", "fp32")
+         if llm_dtype == "fp32":
+             llm_dtype = "fp16" if kwargs.get("fp16", False) else llm_dtype
+             llm_dtype = "bf16" if kwargs.get("bf16", False) else llm_dtype
+
+         device_type = torch.device(kwargs.get("device", "cuda")).type
+         with torch.autocast(
+             device_type=device_type if device_type in ["cuda", "mps"] else "cpu",
+             enabled=True if llm_dtype != "fp32" else False,
+             dtype=dtype_map[llm_dtype]
+         ):
+             label = contents["assistant"][-1]
+             self.llm = self.llm.to(dtype_map[llm_dtype])
+             inputs_embeds = inputs_embeds.to(dtype_map[llm_dtype])
+             llm_kwargs = kwargs.get("llm_kwargs", {})
+             if not kwargs.get("tearchforing", False):
+                 generated_ids = self.llm.generate(
+                     inputs_embeds=inputs_embeds,
+                     max_new_tokens=kwargs.get("max_length", 512),
+                     **llm_kwargs,
+                 )
+
+                 response = tokenizer.batch_decode(
+                     generated_ids,
+                     skip_special_tokens=kwargs.get("skip_special_tokens", True),
+                 )[0]
+
+                 loss = None
+             else:
+                 labels_ids = batch["labels_ids"]
+                 labels_ids[labels_ids == -1] = -100
+                 attention_mask = batch.get("attention_mask", None)
+                 model_outputs = self.llm(
+                     inputs_embeds=inputs_embeds,
+                     attention_mask=attention_mask,
+                     labels=labels_ids,
+                     **llm_kwargs,
+                 )
+
+                 preds = torch.argmax(model_outputs.logits, -1)[:, source_ids.shape[1] :]
+                 response = tokenizer.batch_decode(
+                     preds,
+                     add_special_tokens=False,
+                     skip_special_tokens=kwargs.get("skip_special_tokens", True),
+                 )[0]
+                 loss = model_outputs.loss.item()
+
+         ibest_writer = None
+         if kwargs.get("output_dir") is not None:
+             if not hasattr(self, "writer"):
+                 self.writer = DatadirWriter(kwargs.get("output_dir"))
+             ibest_writer = self.writer[f"{0 + 1}best_recog"]
+
+         results = []
+         response_clean = re.sub(r"[^\w\s\u3000\u4e00-\u9fff]+", "", response)
+         result_i = {
+             "key": key[0],
+             "text": re.sub(r'\s+', ' ', response.replace("/sil", " ")),
+             "text_tn": response_clean,
+             "label": label,
+         }
+         if loss is not None:
+             result_i["loss"] = loss
+         results.append(result_i)
+
+         if ibest_writer is not None:
+             ibest_writer["text"][key[0]] = response.replace("\n", " ")
+             ibest_writer["label"][key[0]] = label.replace("\n", " ")
+             ibest_writer["text_tn"][key[0]] = response_clean
+
+         return results, meta_data
+
+     @staticmethod
+     def from_pretrained(model: str = None, **kwargs):
+         from funasr import AutoModel
+
+         model, kwargs = AutoModel.build_model(
+             model=model, trust_remote_code=True, **kwargs
+         )
+
+         return model, kwargs
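Note: inference() above shows the chat-style input the model consumes: audio is spliced between <|startofspeech|>! and <|endofspeech|> markers, and data_template() splits the messages by role. A sketch of one such conversation, using only names from the code above (the wav path is hypothetical):

    # One conversation as consumed by FunASRNano.inference / data_template.
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            # "!" prefixes a wav path; "!!" marks an in-memory audio tensor
            # passed via the extra "audio" key instead.
            "content": "语音转写:<|startofspeech|>!test_data/recordings/1.wav<|endofspeech|>",
        },
        {"role": "assistant", "content": "null"},
    ]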
lib/asr_models/whisper.py ADDED
@@ -0,0 +1,44 @@
+ from pywhispercpp.model import Model
+
+ from lib.utils import Timer
+ from lib.asr_models.base_model import AbstractASRModel, ModelName
+
+
+ MODEL_DIR = "/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models"
+
+ class Whisper(AbstractASRModel):
+     def __init__(self, device='mps'):
+         super().__init__(device)
+         self.name = ModelName.WHISPER
+
+     def load(self, model_dir=MODEL_DIR, language="en", whisper_model='large-v3-turbo-q5_0'):
+         with Timer("Loading Whisper model"):
+             self.model = Model(
+                 model=whisper_model,
+                 models_dir=model_dir,
+                 print_realtime=False,
+                 print_progress=False,
+                 print_timestamps=False,
+                 translate=False,
+                 # beam_search=1,
+                 temperature=0.,
+                 no_context=True
+             )
+
+     def transcribe(self, wav, language="en"):
+         with Timer("Transcribing audio") as t:
+             if language == "zh":
+                 init_prompt = "以下是普通话句子,这是一段会议内容。"
+                 output = self.model.transcribe(str(wav), language=language, initial_prompt=init_prompt)
+             else:
+                 output = self.model.transcribe(str(wav), language=language)
+             text = " ".join([a.text for a in output])
+         return text, t.duration
+
+ if __name__ == "__main__":
+     model = Whisper()
+     model.load()
+     text, cost = model.transcribe('../../test_data/recordings/1.wav', language="zh")
+     print("inference time: ", cost)
+     print(text)
+
lib/asr_models/whisper_finetuned.py ADDED
@@ -0,0 +1,65 @@
+ import torch
+ import librosa
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor
+
+ from lib.utils import Timer
+ from lib.asr_models.base_model import AbstractASRModel, ModelName
+
+
+ MODEL_DIR = "/Users/jeqin/Downloads/whisper-large-v3-turbo-finetune_1219"
+
+ class WhisperFinetuned(AbstractASRModel):
+     def __init__(self, device='mps'):
+         super().__init__(device)
+         self.name = ModelName.WHISPER_FINETUNED
+
+     def load(self, model_dir=MODEL_DIR, language="zh"):
+         with Timer("Loading Whisper Finetuned model"):
+             processor = WhisperProcessor.from_pretrained(
+                 model_dir,
+                 language=language,
+                 task="transcribe",
+                 no_timestamps=True,
+                 local_files_only=True,
+             )
+             model = WhisperForConditionalGeneration.from_pretrained(
+                 model_dir,
+                 device_map=self.device,
+                 local_files_only=True,
+                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+             )
+
+             model.generation_config.language = language.lower()
+             model.generation_config.forced_decoder_ids = None
+             model.eval()
+             self.processor = processor
+             self.model = model
+
+     def transcribe(self, wav, language=""):
+         with Timer("Transcribing audio") as t:
+             sr = 16000
+             audio, _ = librosa.load(wav, sr=sr, mono=True)
+             inputs = self.processor(audio, sampling_rate=sr, return_tensors="pt")
+
+             # Move inputs to the model's device
+             device = next(self.model.parameters()).device
+             input_features = inputs["input_features"].to(device)
+
+             # Generate
+             with torch.inference_mode(), torch.autocast(device_type="cuda", enabled=(device.type == "cuda")):
+                 generated_ids = self.model.generate(
+                     input_features=input_features,
+                     max_new_tokens=255,
+                     return_timestamps=False,  # only supported by some versions; ignored when unsupported
+                 )
+
+             # Decode
+             text = self.processor.tokenizer.batch_decode(generated_ids.cpu().numpy(), skip_special_tokens=True)
+         return text[0], t.duration
+
+ if __name__ == "__main__":
+     model = WhisperFinetuned(device='mps')
+     model.load(language="en")
+     text, cost = model.transcribe('../../test_data/recordings/1.wav', language="zh")
+     print("inference time: ", cost)
+     print(text)
reports/asr_result_funasr_mlt_nano_librispeech_clean.json ADDED
The diff for this file is too large to render. See raw diff
 
reports/asr_result_funasr_mlt_nano_recording.json ADDED
@@ -0,0 +1,597 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "index": 1,
4
+ "audio_path": "1.wav",
5
+ "reference": "您这车开得真够稳的,蜗牛都超车了。",
6
+ "inference_time": 1.986,
7
+ "predicts": "您这车开得真够稳的,蜗牛都超车了。"
8
+ },
9
+ {
10
+ "index": 2,
11
+ "audio_path": "2.wav",
12
+ "reference": "你可真是个天才,把这么简单的事情都搞砸了。",
13
+ "inference_time": 1.287,
14
+ "predicts": "你可真是个天才,把这么简单的事情都搞砸了。"
15
+ },
16
+ {
17
+ "index": 3,
18
+ "audio_path": "3.wav",
19
+ "reference": "因网络问题,远程讲者音频中断,我们切到备用方案共享幻灯片。",
20
+ "inference_time": 1.827,
21
+ "predicts": "因网络问题,远程讲者音频中断,我们切到备用方案,共享幻灯片。"
22
+ },
23
+ {
24
+ "index": 4,
25
+ "audio_path": "4.wav",
26
+ "reference": "这就好比‘用大炮打蚊子’,资源分配严重不均衡,需要细粒度调度。",
27
+ "inference_time": 1.767,
28
+ "predicts": "这就好比用大炮打蚊子,资源分配严重不均衡,需要细力度调度。"
29
+ },
30
+ {
31
+ "index": 5,
32
+ "audio_path": "5.wav",
33
+ "reference": "言归正传,我们来讨论一下核心问题。",
34
+ "inference_time": 1.402,
35
+ "predicts": "言归正传,我们来讨论一下核心问题。"
36
+ },
37
+ {
38
+ "index": 6,
39
+ "audio_path": "6.wav",
40
+ "reference": "这有点超出了我们今天讨论的范围。",
41
+ "inference_time": 1.071,
42
+ "predicts": "这有点超出了我们今天讨论的范围。"
43
+ },
44
+ {
45
+ "index": 7,
46
+ "audio_path": "7.wav",
47
+ "reference": "他的年收入约为一百万人民币。",
48
+ "inference_time": 0.823,
49
+ "predicts": "他的年收入约为一百万人民币。"
50
+ },
51
+ {
52
+ "index": 8,
53
+ "audio_path": "8.wav",
54
+ "reference": "下一位演讲者是来自斯坦福大学计算机科学系的张明教授",
55
+ "inference_time": 1.44,
56
+ "predicts": "下一位演讲者是来自斯坦福大学计算机科学系的张明教授。"
57
+ },
58
+ {
59
+ "index": 9,
60
+ "audio_path": "9.wav",
61
+ "reference": "请各位将手机调至静音模式,演讲结束后有十五分钟提问时间",
62
+ "inference_time": 1.496,
63
+ "predicts": "请各位将手机调至静音模式。演讲结束后有十五分钟提问时间。"
64
+ },
65
+ {
66
+ "index": 10,
67
+ "audio_path": "10.wav",
68
+ "reference": "茶歇将在十点三十分开始,地点在二楼休息区",
69
+ "inference_time": 1.329,
70
+ "predicts": "茶歇将在十点三十分开始,地点在二楼休息区。"
71
+ },
72
+ {
73
+ "index": 11,
74
+ "audio_path": "11.wav",
75
+ "reference": "本合同自双方签字盖章之日起生效。",
76
+ "inference_time": 0.899,
77
+ "predicts": "本合同自双方签字盖章之日起生效。"
78
+ },
79
+ {
80
+ "index": 12,
81
+ "audio_path": "12.wav",
82
+ "reference": "政府正在采取措施刺激经济增长和创造就业机会。",
83
+ "inference_time": 1.065,
84
+ "predicts": "政府正在采取措施,刺激经济增长和创造就业机会。"
85
+ },
86
+ {
87
+ "index": 13,
88
+ "audio_path": "13.wav",
89
+ "reference": "今天中午我们点外卖吧,我想吃宫保鸡丁和麻婆豆腐。",
90
+ "inference_time": 1.328,
91
+ "predicts": "今天中午我们点外卖吧,我想吃宫保鸡丁和麻婆豆腐。"
92
+ },
93
+ {
94
+ "index": 14,
95
+ "audio_path": "14.wav",
96
+ "reference": "医生建议我多吃粗粮,比如燕麦和紫薯,少吃高油高糖的油炸食品,像炸鸡和甜甜圈。",
97
+ "inference_time": 2.831,
98
+ "predicts": "医生建议我多吃粗粮,比如燕麦和紫薯,少吃高油高糖的油炸食品,像炸鸡和甜甜圈。"
99
+ },
100
+ {
101
+ "index": 15,
102
+ "audio_path": "15.wav",
103
+ "reference": "本方案采用异构计算架构,结合GPU与FPGA加速张量运算。",
104
+ "inference_time": 1.598,
105
+ "predicts": "本方案采用异构计算架构,结合GPU与FPGA加速张量运算。"
106
+ },
107
+ {
108
+ "index": 16,
109
+ "audio_path": "16.wav",
110
+ "reference": "推理延迟稳定在5毫秒以内,吞吐量提升40%,功耗降低1.8瓦。",
111
+ "inference_time": 1.77,
112
+ "predicts": "推力延迟稳定在五毫秒以内吞吐量提升百分之四十功耗降低一点八瓦。"
113
+ },
114
+ {
115
+ "index": 17,
116
+ "audio_path": "17.wav",
117
+ "reference": "相比SOTA模型,我们的方法在小样本场景下召回率高出12个百分点,且参数量仅为其三分之一。",
118
+ "inference_time": 3.28,
119
+ "predicts": "相比sota模型我们的方法在小样本场景下召回率高出十二个百分点且参数量仅为其三分之一。"
120
+ },
121
+ {
122
+ "index": 18,
123
+ "audio_path": "18.wav",
124
+ "reference": "当节点并发数超过10,000时,内存带宽会成瓶颈,导致尾延迟急剧上升。",
125
+ "inference_time": 1.845,
126
+ "predicts": "当节点并发数超过一万时,内存带宽会呈瓶颈,导致伪延迟急剧上升。"
127
+ },
128
+ {
129
+ "index": 19,
130
+ "audio_path": "19.wav",
131
+ "reference": "请看图3:蓝色柱是基线模型,红色线���我们的优化结果,交叉点表明在第15轮迭代后优势显著。",
132
+ "inference_time": 2.927,
133
+ "predicts": "请看图三,蓝色柱是基线模型,红色线是我们的优化结果。交叉点表明在第十五轮迭代后,优势显著。"
134
+ },
135
+ {
136
+ "index": 20,
137
+ "audio_path": "20.wav",
138
+ "reference": "您提到模型泛化性提升,是否在跨模态数据上验证过?量化指标是多少?",
139
+ "inference_time": 1.897,
140
+ "predicts": "您提到模型泛化性提升,是否在跨模态数据上验证过量化指标是多少?"
141
+ },
142
+ {
143
+ "index": 21,
144
+ "audio_path": "21.wav",
145
+ "reference": "容我追问一点:如果输入数据存在对抗样本,您方案的鲁棒性如何保证?失效概率有测试吗?",
146
+ "inference_time": 2.492,
147
+ "predicts": "容我追问一点,如果输入数据存在对抗样本,您方案的鲁棒性如何保证?失效概率有测试吗?"
148
+ },
149
+ {
150
+ "index": 22,
151
+ "audio_path": "22.wav",
152
+ "reference": "抱歉打断,您说的‘动态剪枝’是指训练中还是推理中?阈值是自适应的吗?",
153
+ "inference_time": 2.074,
154
+ "predicts": "抱歉打断您说的动态减脂,是指训练中还是推理中?阈值是自适应的吗?"
155
+ },
156
+ {
157
+ "index": 23,
158
+ "audio_path": "23.wav",
159
+ "reference": "成本效益是否可行?部署这种定制芯片需要流片费用,中小客户怎么承担?",
160
+ "inference_time": 1.692,
161
+ "predicts": "成本效益是否可行?部署这种定制芯片需要流片费用,中小客户怎么承担?"
162
+ },
163
+ {
164
+ "index": 24,
165
+ "audio_path": "24.wav",
166
+ "reference": "我补充一个角度:隐私计算领域也在用类似思路,是否可能跨领域合作?",
167
+ "inference_time": 1.655,
168
+ "predicts": "我补充一个角度,隐私计算领域也在用类似思路,是否可能跨领域合作?"
169
+ },
170
+ {
171
+ "index": 25,
172
+ "audio_path": "25.wav",
173
+ "reference": "端到端优化流水线涵盖数据清洗特征工程模型压缩部署监控。",
174
+ "inference_time": 1.754,
175
+ "predicts": "端到端优化流水线,涵盖数据清洗、特征工程、模型压缩、部署监控。"
176
+ },
177
+ {
178
+ "index": 26,
179
+ "audio_path": "26.wav",
180
+ "reference": "请检查代码库中的依赖项冲突问题。",
181
+ "inference_time": 0.996,
182
+ "predicts": "请检查代码库中的依赖项冲突问题。"
183
+ },
184
+ {
185
+ "index": 27,
186
+ "audio_path": "27.wav",
187
+ "reference": "这个电路板集成了微控制器、传感器和无线通信模块。",
188
+ "inference_time": 1.264,
189
+ "predicts": "这个电路板集成了微控制器、传感器和无线通信模块。"
190
+ },
191
+ {
192
+ "index": 28,
193
+ "audio_path": "28.wav",
194
+ "reference": "区块链技术的核心在于去中心化和加密安全性。",
195
+ "inference_time": 1.034,
196
+ "predicts": "区块链技术的核心在于去中心化和加密安全性。"
197
+ },
198
+ {
199
+ "index": 29,
200
+ "audio_path": "29.wav",
201
+ "reference": "需符合等保三级要求,数据出境要走安全评估。",
202
+ "inference_time": 1.299,
203
+ "predicts": "需符合等保三级要求,数据出境要走安全评估。"
204
+ },
205
+ {
206
+ "index": 30,
207
+ "audio_path": "30.wav",
208
+ "reference": "最新数据显示,消费者信心指数有所回升。",
209
+ "inference_time": 0.893,
210
+ "predicts": "最新数据显示,消费者信心指数有所回升。"
211
+ },
212
+ {
213
+ "index": 31,
214
+ "audio_path": "31.wav",
215
+ "reference": "专家预测,新能源汽车市场将迎来爆发式增长。",
216
+ "inference_time": 1.025,
217
+ "predicts": "专家预测,新能源汽车市场将迎来爆发式增长。"
218
+ },
219
+ {
220
+ "index": 32,
221
+ "audio_path": "32.wav",
222
+ "reference": "多模态大模型能够同时处理文本、图像和音频信号,实现真正的跨模态理解",
223
+ "inference_time": 1.907,
224
+ "predicts": "多模态大模型能够同时处理文本、图像和音频信号,实现真正的跨模态理解。"
225
+ },
226
+ {
227
+ "index": 33,
228
+ "audio_path": "33.wav",
229
+ "reference": "联邦学习可以在保护用户隐私的前提下,实现分布式模型的协同训练",
230
+ "inference_time": 1.329,
231
+ "predicts": "联邦学习可以在保护用户隐私的前提下,实现分布式模型的协同训练。"
232
+ },
233
+ {
234
+ "index": 34,
235
+ "audio_path": "34.wav",
236
+ "reference": "我们研发的五十量子比特处理器在特定算法上展现出了量子优越性",
237
+ "inference_time": 1.319,
238
+ "predicts": "我们研发的五十量子比特处理器,在特定算法上展现出了量子优越性。"
239
+ },
240
+ {
241
+ "index": 35,
242
+ "audio_path": "35.wav",
243
+ "reference": "变分量子本征值求解器在化学模拟中显示出巨大潜力",
244
+ "inference_time": 1.346,
245
+ "predicts": "变分量子本征值求解器在化学模拟中显示出巨大潜力。"
246
+ },
247
+ {
248
+ "index": 36,
249
+ "audio_path": "36.wav",
250
+ "reference": "边缘AI推理的延迟已经可以控制在十毫秒以内,满足实时应用需求",
251
+ "inference_time": 1.493,
252
+ "predicts": "边缘ai推理的延迟已经可以控制在十毫秒以内满足实时应用需求。"
253
+ },
254
+ {
255
+ "index": 37,
256
+ "audio_path": "37.wav",
257
+ "reference": "我们设计的新型神经网络压缩算法在保持精度的同时将模型大小减少了百分之七十",
258
+ "inference_time": 1.712,
259
+ "predicts": "我们设计的新型神经网络压缩算法,在保持精度的同时,将模型大小减少了百分之七十。"
260
+ },
261
+ {
262
+ "index": 38,
263
+ "audio_path": "38.wav",
264
+ "reference": "损失函数结合了交叉熵损失和对比损失,权重系数设置为零点三和零点七",
265
+ "inference_time": 1.812,
266
+ "predicts": "损失函数结合了交叉熵损失和对比损失权重系数设置为零点三和零点七。"
267
+ },
268
+ {
269
+ "index": 39,
270
+ "audio_path": "39.wav",
271
+ "reference": "在五个标准数据集上的实验结果表明,我们的方法平均比现有最佳方法提升了二点三个百分点",
272
+ "inference_time": 2.119,
273
+ "predicts": "在五个标准数据集上的实验结果表明,我们的方法平均比现有最佳方法提升了2.3个百分点。"
274
+ },
275
+ {
276
+ "index": 40,
277
+ "audio_path": "40.wav",
278
+ "reference": "消融实验证实了每个模块的有效性,移除注意力机制会导致性能下降百分之四点六",
279
+ "inference_time": 1.759,
280
+ "predicts": "消融实验证实了每个模块的有效性,移除注意力机制会导致性能下降百分之四点六。"
281
+ },
282
+ {
283
+ "index": 41,
284
+ "audio_path": "41.wav",
285
+ "reference": "模型的参数量为一点二亿,在八张A100显卡上训练了七十二小时",
286
+ "inference_time": 1.869,
287
+ "predicts": "模型的参数量为一点二亿在八张a一百显卡上训练了七十二小时。"
288
+ },
289
+ {
290
+ "index": 42,
291
+ "audio_path": "42.wav",
292
+ "reference": "总之,搞定了数据倾斜,性能就上去了。",
293
+ "inference_time": 0.974,
294
+ "predicts": "总之,搞定的数据倾斜性能就上去了。"
295
+ },
296
+ {
297
+ "index": 43,
298
+ "audio_path": "43.wav",
299
+ "reference": "openai-whisper 需要 ffmpeg 的环境,ffmpeg 是一个开源的跨平台音视频处理工具和框架,可以用来录制、转换和流式传输音视频内容 。",
300
+ "inference_time": 3.157,
301
+ "predicts": "OpenAI Whisper需要FMPEG的环境。FMPEG是一个开源的跨平台音视频处理工具和框架,可以用来录制、转换和流式传输音视频内容。"
302
+ },
303
+ {
304
+ "index": 44,
305
+ "audio_path": "44.wav",
306
+ "reference": "采用 Transformer 序列到序列模型可以实现针对不同的语言处理任务。",
307
+ "inference_time": 1.411,
308
+ "predicts": "采用Transformer序列到序列模型,可以实现针对不同的语言处理任务。"
309
+ },
310
+ {
311
+ "index": 45,
312
+ "audio_path": "45.wav",
313
+ "reference": "Transformer架构在自然语言处理中的成功应用已经彻底改变了预训练模型的范式",
314
+ "inference_time": 1.734,
315
+ "predicts": "Transformer架构在自然语言处理中的成功应用,已经彻底改变了预训练模型的范式。"
316
+ },
317
+ {
318
+ "index": 46,
319
+ "audio_path": "46.wav",
320
+ "reference": "请大家注意,Workshop材料已经上传至会议系统,代码仓库链接在附录页。",
321
+ "inference_time": 1.805,
322
+ "predicts": "请大家注意,workshop材料已经上传至会议系统,代码仓库链接在附录页。"
323
+ },
324
+ {
325
+ "index": 47,
326
+ "audio_path": "47.wav",
327
+ "reference": "别造轮子了!直接调用yolov5开源库的预训练权重,快速迭代才是王道。",
328
+ "inference_time": 2.185,
329
+ "predicts": "别造轮子了,直接调用YOLO V五开源库的预训练权重,快速迭代才是王道。"
330
+ },
331
+ {
332
+ "index": 48,
333
+ "audio_path": "48.wav",
334
+ "reference": "请确保您的设备已连接到5GHz Wi-Fi频段以获得最佳性能。",
335
+ "inference_time": 1.75,
336
+ "predicts": "请确保您的设备已连接到五G赫兹WiFi频段,以获得最佳性能。"
337
+ },
338
+ {
339
+ "index": 49,
340
+ "audio_path": "49.wav",
341
+ "reference": "我们提出了一种基于对比学习的自监督方法,在ImageNet数据集上达到了百分之九十二点五的准确率",
342
+ "inference_time": 2.412,
343
+ "predicts": "我们提出了一种基于对比学习的自监督方法,在Imogenet数据集上达到了92.5%的准确率。"
344
+ },
345
+ {
346
+ "index": 50,
347
+ "audio_path": "50.wav",
348
+ "reference": "GPT-4在代码生成和多步推理任务上展现出令人印象深刻的能力",
349
+ "inference_time": 1.515,
350
+ "predicts": "GPT-4在代码生成和多步推理任务上展现出令人印象深刻的能力。"
351
+ },
352
+ {
353
+ "index": 51,
354
+ "audio_path": "51.wav",
355
+ "reference": "若在高并发场景下未启用我们的动态缓存机制,即使用RDMA网络,延迟也可能因CPU调度争抢而恶化。",
356
+ "inference_time": 3.067,
357
+ "predicts": "若在高并发场景下未启用我们的动态缓存机制即使用rdma网络��迟也可能因CPU调度增强而恶化。"
358
+ },
359
+ {
360
+ "index": 52,
361
+ "audio_path": "52.wav",
362
+ "reference": "准确说,峰值算力是200 TOPS,不是刚才说的150。",
363
+ "inference_time": 1.718,
364
+ "predicts": "准确说,峰值算力是两百tops,不是刚才说的一百五。"
365
+ },
366
+ {
367
+ "index": 53,
368
+ "audio_path": "53.wav",
369
+ "reference": "秦始皇嬴政书同文,车同轨,奠定了中国大一统的基础。",
370
+ "inference_time": 1.871,
371
+ "predicts": "秦始皇嬴政书同文车同轨奠定了中国大一统的基础。"
372
+ },
373
+ {
374
+ "index": 54,
375
+ "audio_path": "54.wav",
376
+ "reference": "诸葛亮在《出师表》中写道‘鞠躬尽瘁,死而后已’,成为后世臣子的楷模。",
377
+ "inference_time": 2.357,
378
+ "predicts": "诸葛亮在《出师表》中写道:“鞠躬尽瘁,死而后已,成为后世臣子的楷模。”"
379
+ },
380
+ {
381
+ "index": 55,
382
+ "audio_path": "55.wav",
383
+ "reference": "李白的‘举头望明月,低头思故乡’是连三岁孩童都能背诵的诗句。",
384
+ "inference_time": 2.307,
385
+ "predicts": "李白的举头望明月,低头思故乡,是连三岁孩童都能背诵的诗句。"
386
+ },
387
+ {
388
+ "index": 56,
389
+ "audio_path": "56.wav",
390
+ "reference": "孔子曾说‘己所不欲,勿施于人’,这简单的八个字构成了儒家伦理的基石。",
391
+ "inference_time": 2.008,
392
+ "predicts": "孔子曾说:“己所不欲,勿施于人。”这简单的八个字,构成了儒家伦理的基石。"
393
+ },
394
+ {
395
+ "index": 57,
396
+ "audio_path": "57.wav",
397
+ "reference": "王羲之被后人尊为‘书圣’,其代表作《兰亭集序》被誉为天下第一行书。",
398
+ "inference_time": 2.169,
399
+ "predicts": "王羲之被后人尊为书圣,其代表作《兰亭集序》被誉为天下第一行书。"
400
+ },
401
+ {
402
+ "index": 58,
403
+ "audio_path": "58.wav",
404
+ "reference": "我们要学习庖丁解牛的精神,掌握事物的客观规律,才能游刃有余。",
405
+ "inference_time": 1.714,
406
+ "predicts": "我们要学习庖丁解牛的精神,掌握事物的客观规律,才能游刃有余。"
407
+ },
408
+ {
409
+ "index": 59,
410
+ "audio_path": "59.wav",
411
+ "reference": "项羽在鸿门宴上优柔寡断,放走了刘邦,最终兵败乌江自刎。",
412
+ "inference_time": 2.041,
413
+ "predicts": "项羽在鸿门宴上优柔寡断,放走了刘邦,最终兵败乌江自刎。"
414
+ },
415
+ {
416
+ "index": 60,
417
+ "audio_path": "60.wav",
418
+ "reference": "在杭州西湖畔,人们总会想起苏轼治理西湖、修筑苏堤的往事。",
419
+ "inference_time": 1.631,
420
+ "predicts": "在杭州西湖畔,人们总会想起苏轼治理西湖、修筑苏堤的往事。"
421
+ },
422
+ {
423
+ "index": 61,
424
+ "audio_path": "61.wav",
425
+ "reference": "我计划去埃菲尔铁塔和卢浮宫参观。",
426
+ "inference_time": 1.224,
427
+ "predicts": "我计划去埃菲尔铁塔和卢浮宫参观。"
428
+ },
429
+ {
430
+ "index": 62,
431
+ "audio_path": "62.wav",
432
+ "reference": "莎士比亚的戏剧深刻地探讨了人性的复杂性。",
433
+ "inference_time": 1.266,
434
+ "predicts": "莎士比亚的戏剧深刻地探讨了人性的复杂性。"
435
+ },
436
+ {
437
+ "index": 63,
438
+ "audio_path": "63.wav",
439
+ "reference": "这个消息让他丈二和尚摸不着头脑。",
440
+ "inference_time": 1.143,
441
+ "predicts": "这个消息让他丈二和尚摸不着头脑。"
442
+ },
443
+ {
444
+ "index": 64,
445
+ "audio_path": "64.wav",
446
+ "reference": "画龙点睛之笔,让整个设计焕然一新。",
447
+ "inference_time": 1.302,
448
+ "predicts": "画龙点睛之笔,让整个设计焕然一新。"
449
+ },
450
+ {
451
+ "index": 65,
452
+ "audio_path": "65.wav",
453
+ "reference": "这是卡脖子技术,必须自主研发!",
454
+ "inference_time": 0.885,
455
+ "predicts": "这是卡脖子技术,必须自主研发。"
456
+ },
457
+ {
458
+ "index": 66,
459
+ "audio_path": "66.wav",
460
+ "reference": "这幅画作以其独特的色彩运用和构图技巧而闻名。",
461
+ "inference_time": 1.275,
462
+ "predicts": "这幅画作以其独特的色彩运用和构图技巧而闻名。"
463
+ },
464
+ {
465
+ "index": 67,
466
+ "audio_path": "67.wav",
467
+ "reference": "患者主诉间歇性胸痛,放射至左臂。",
468
+ "inference_time": 1.211,
469
+ "predicts": "患者主诉间歇性胸痛,放射至左臂。"
470
+ },
471
+ {
472
+ "index": 68,
473
+ "audio_path": "68.wav",
474
+ "reference": "患者有冠状动脉粥样硬化性心脏病病史十年,慢性阻塞性肺疾病病史五年",
475
+ "inference_time": 2.265,
476
+ "predicts": "患者有冠状动脉粥样硬化性心脏病病史十年慢性阻塞性肺疾病病史五年"
477
+ },
478
+ {
479
+ "index": 69,
480
+ "audio_path": "69.wav",
481
+ "reference": "建议行冠状动脉造影检查,必要时植入支架",
482
+ "inference_time": 1.405,
483
+ "predicts": "建议行冠状动脉造影检查,必要时植入支架。"
484
+ },
485
+ {
486
+ "index": 70,
487
+ "audio_path": "70.wav",
488
+ "reference": "患者需低盐低脂糖尿病饮食,监测血压、血糖变化",
489
+ "inference_time": 1.402,
490
+ "predicts": "患者需低盐低脂糖尿病饮食,监测血压、血糖变化。"
491
+ },
492
+ {
493
+ "index": 71,
494
+ "audio_path": "71.wav",
495
+ "reference": "胸部CT平扫显示:双肺散在磨玻璃样密度影,以胸膜下分布为主",
496
+ "inference_time": 2.044,
497
+ "predicts": "胸部c t平扫显示双肺散在磨玻璃样密度影以胸膜下分布为主。"
498
+ },
499
+ {
500
+ "index": 72,
501
+ "audio_path": "72.wav",
502
+ "reference": "知识产权归属约定:乙方在履行本合同过程中所产生的全部智力成果,其知识产权均归甲方所有",
503
+ "inference_time": 2.064,
504
+ "predicts": "知识产权归属约定:乙方在履行本合同过程中所产生的全部智力成果,其知识产权归甲方所有。"
505
+ },
506
+ {
507
+ "index": 73,
508
+ "audio_path": "73.wav",
509
+ "reference": "双方因履行本合同发生争议的,应首先通过友好协商解决;协商不成的,任何一方均有权向有管辖权的人民法院提起诉讼",
510
+ "inference_time": 2.892,
511
+ "predicts": "双方因履行本合同发生争议的,应首先通过友好协商解决;协商不成的,任何一方均有权向有管辖权的人民法院提起诉讼。"
512
+ },
513
+ {
514
+ "index": 74,
515
+ "audio_path": "74.wav",
516
+ "reference": "被告在法定期限内未提交答辩状,亦未到庭参加诉讼,本院依法缺席审理",
517
+ "inference_time": 2.157,
518
+ "predicts": "被告在法定期限内未提交答辩状,亦未到庭参加诉讼,本院依法缺席审理。"
519
+ },
520
+ {
521
+ "index": 75,
522
+ "audio_path": "75.wav",
523
+ "reference": "原告向本院提出诉讼请求:一、判令被告支付货款人民币五十万八千元及逾期付款利息;二、判令被告承担本案的诉讼费用",
524
+ "inference_time": 3.538,
525
+ "predicts": "原告向本院提出诉讼请求:一、判令被告支付货款人民币50万8000元及逾期付款利息;二、判令被告承担本案的诉讼费用。"
526
+ },
527
+ {
528
+ "index": 76,
529
+ "audio_path": "76.wav",
530
+ "reference": "被执行人未按执行通知履行法律文书确定的义务,人民法院有权查封、扣押、冻结、拍卖被执行人的财产",
531
+ "inference_time": 2.759,
532
+ "predicts": "被执行人未按执行通知履行法律文书确定的义务,人民法院有权查封、扣押、冻结、拍卖被执行人的财产。"
533
+ },
534
+ {
535
+ "index": 77,
536
+ "audio_path": "77.wav",
537
+ "reference": "这个视频太上头了!",
538
+ "inference_time": 1.279,
539
+ "predicts": "这个视频太上头了。"
540
+ },
541
+ {
542
+ "index": 78,
543
+ "audio_path": "78.wav",
544
+ "reference": "他真是个社恐。",
545
+ "inference_time": 0.617,
546
+ "predicts": "他真是个社恐。"
547
+ },
548
+ {
549
+ "index": 79,
550
+ "audio_path": "79.wav",
551
+ "reference": "简直了,这躺平的状态也太佛系了吧。",
552
+ "inference_time": 1.111,
553
+ "predicts": "简直了,这躺平的状态也太佛系了吧。"
554
+ },
555
+ {
556
+ "index": 80,
557
+ "audio_path": "80.wav",
558
+ "reference": "这个瓜有点大,我得去吃瓜了。",
559
+ "inference_time": 0.956,
560
+ "predicts": "这个瓜有点大,我得去吃瓜了。"
561
+ },
562
+ {
563
+ "index": 81,
564
+ "audio_path": "81.wav",
565
+ "reference": "别内卷了,咱们还是多交流交流吧。",
566
+ "inference_time": 0.947,
567
+ "predicts": "别内卷了,咱们还是多交流交流吧。"
568
+ },
569
+ {
570
+ "index": 82,
571
+ "audio_path": "82.wav",
572
+ "reference": "第二个公司我们成立了中国黄页,在中国黄页的创业经验中有很多的经验,也是可以在这儿跟大家进行分享的",
573
+ "inference_time": 2.36,
574
+ "predicts": "第二个公司,我们成立了中国黄页。在中国黄页的创业经验中,有很多的经验也是可以在这儿跟大家进行分享的。"
575
+ },
576
+ {
577
+ "index": 83,
578
+ "audio_path": "83.wav",
579
+ "reference": "对于大部分在接触微积分之前,主要的学习经验就是刷题,甚至是连题也不刷的同学们来说。",
580
+ "inference_time": 1.998,
581
+ "predicts": "对于大部分在接触微积分之前,主要的学习经验就是刷题,甚至是连题也不刷的同学们来说。"
582
+ },
583
+ {
584
+ "index": 84,
585
+ "audio_path": "84.wav",
586
+ "reference": "说了两个小时没人听懂我在说什么,最后二十三个人反对一个人同意,这一个人就说马云你这样做,你就试试看,不行的话,赶紧逃回来,还来得及。",
587
+ "inference_time": 3.189,
588
+ "predicts": "说了两个小时,没人听懂我在说什么。最后二十三个人反对,一个人同意。这一个人就说:马云,你这样做你就试试看,不行的话赶紧逃回来,还来得及。"
589
+ },
590
+ {
591
+ "index": 85,
592
+ "audio_path": "85.wav",
593
+ "reference": "晚上想想是热血沸腾,真好,第二天早上骑个自行车又上班去了,对吧?",
594
+ "inference_time": 1.703,
595
+ "predicts": "晚上想想是热血沸腾,真好。第二天早上骑个自行车又上班去了,对吧?"
596
+ }
597
+ ]
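Each entry in these report files pairs a ground-truth `reference` with the model's `predicts` and records a per-utterance `inference_time` in seconds. As a minimal sketch of how such a report could be consumed, the snippet below computes a corpus-level character error rate with a plain Levenshtein distance. The file path is an illustrative assumption, and no text normalization is applied, so punctuation mismatches count as errors; real CER tooling typically normalizes the text first.

```python
import json

def levenshtein(a: str, b: str) -> int:
    # Classic dynamic-programming edit distance over characters.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                  # deletion
                           cur[j - 1] + 1,               # insertion
                           prev[j - 1] + (ca != cb)))    # substitution
        prev = cur
    return prev[-1]

# Path is an assumption for illustration; any of the report files above works.
with open("reports/asr_result_funasr_mlt_nano_recording.json", encoding="utf-8") as f:
    entries = json.load(f)

total_edits = total_chars = 0
for e in entries:
    total_edits += levenshtein(e["reference"], e["predicts"])
    total_chars += len(e["reference"])

print(f"corpus CER: {total_edits / total_chars:.4f}")
print(f"mean inference time: {sum(e['inference_time'] for e in entries) / len(entries):.3f}s")
```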
reports/asr_result_funasr_mlt_nano_wenet_net.json ADDED
The diff for this file is too large to render. See raw diff
 
reports/asr_result_funasr_nano_librispeech_clean.json ADDED
The diff for this file is too large to render. See raw diff
 
reports/asr_result_funasr_nano_recording.json ADDED
@@ -0,0 +1,597 @@
1
+ [
2
+ {
3
+ "index": 1,
4
+ "audio_path": "1.wav",
5
+ "reference": "您这车开得真够稳的,蜗牛都超车了。",
6
+ "inference_time": 2.332,
7
+ "predicts": "您这车开的真够稳的,蜗牛都超车了。"
8
+ },
9
+ {
10
+ "index": 2,
11
+ "audio_path": "2.wav",
12
+ "reference": "你可真是个天才,把这么简单的事情都搞砸了。",
13
+ "inference_time": 1.347,
14
+ "predicts": "你可真是个天才,把这么简单的事情都搞砸了。"
15
+ },
16
+ {
17
+ "index": 3,
18
+ "audio_path": "3.wav",
19
+ "reference": "因网络问题,远程讲者音频中断,我们切到备用方案共享幻灯片。",
20
+ "inference_time": 2.163,
21
+ "predicts": "因网络问题,远程讲者音频中断,我们切到备用方案,共享幻灯片。"
22
+ },
23
+ {
24
+ "index": 4,
25
+ "audio_path": "4.wav",
26
+ "reference": "这就好比‘用大炮打蚊子’,资源分配严重不均衡,需要细粒度调度。",
27
+ "inference_time": 1.994,
28
+ "predicts": "这就好比用大炮打蚊子,资源分配严重不均衡,需要细力度调度。"
29
+ },
30
+ {
31
+ "index": 5,
32
+ "audio_path": "5.wav",
33
+ "reference": "言归正传,我们来讨论一下核心问题。",
34
+ "inference_time": 1.192,
35
+ "predicts": "言归正传,我们来讨论一下核心问题。"
36
+ },
37
+ {
38
+ "index": 6,
39
+ "audio_path": "6.wav",
40
+ "reference": "这有点超出了我们今天讨论的范围。",
41
+ "inference_time": 1.219,
42
+ "predicts": "这有点超出了我们今天讨论的范围"
43
+ },
44
+ {
45
+ "index": 7,
46
+ "audio_path": "7.wav",
47
+ "reference": "他的年收入约为一百万人民币。",
48
+ "inference_time": 0.959,
49
+ "predicts": "他的年收入约为一百万人民币。"
50
+ },
51
+ {
52
+ "index": 8,
53
+ "audio_path": "8.wav",
54
+ "reference": "下一位演讲者是来自斯坦福大学计算机科学系的张明教授",
55
+ "inference_time": 1.657,
56
+ "predicts": "下一位演讲者是来自斯坦福大学计算机科学系的张鸣教授。"
57
+ },
58
+ {
59
+ "index": 9,
60
+ "audio_path": "9.wav",
61
+ "reference": "请各位将手机调至静音模式,演讲结束后有十五分钟提问时间",
62
+ "inference_time": 1.675,
63
+ "predicts": "请各位将手机调至静音模式。演讲结束后有十五分钟提问时间。"
64
+ },
65
+ {
66
+ "index": 10,
67
+ "audio_path": "10.wav",
68
+ "reference": "茶歇将在十点三十分开始,地点在二楼休息区",
69
+ "inference_time": 1.793,
70
+ "predicts": "茶歇将在十点三十分开始,地点在二楼休息区。"
71
+ },
72
+ {
73
+ "index": 11,
74
+ "audio_path": "11.wav",
75
+ "reference": "本合同自双方签字盖章之日起生效。",
76
+ "inference_time": 1.078,
77
+ "predicts": "本合同自双方签字盖章之日起生效。"
78
+ },
79
+ {
80
+ "index": 12,
81
+ "audio_path": "12.wav",
82
+ "reference": "政府正在采取措施刺激经济增长和创造就业机会。",
83
+ "inference_time": 1.36,
84
+ "predicts": "政府正在采取措施刺激经济增长和创造就业机会。"
85
+ },
86
+ {
87
+ "index": 13,
88
+ "audio_path": "13.wav",
89
+ "reference": "今天中午我们点外卖吧,我想吃宫保鸡丁和麻婆豆腐。",
90
+ "inference_time": 1.199,
91
+ "predicts": "今天中午我们点外卖吧我想吃宫保鸡丁和麻婆豆腐"
92
+ },
93
+ {
94
+ "index": 14,
95
+ "audio_path": "14.wav",
96
+ "reference": "医生建议我多吃粗粮,比如燕麦和紫薯,少吃高油高糖的油炸食品,像炸鸡和甜甜圈。",
97
+ "inference_time": 3.541,
98
+ "predicts": "医生建议我多吃粗粮,比如燕麦和紫薯,少吃高油高糖的油炸食品,像炸鸡和甜甜圈。"
99
+ },
100
+ {
101
+ "index": 15,
102
+ "audio_path": "15.wav",
103
+ "reference": "本方案采用异构计算架构,结合GPU与FPGA加速张量运算。",
104
+ "inference_time": 1.897,
105
+ "predicts": "本方案采用异构计算架构,结合GPU与FPGA,加速张量运算。"
106
+ },
107
+ {
108
+ "index": 16,
109
+ "audio_path": "16.wav",
110
+ "reference": "推理延迟稳定在5毫秒以内,吞吐量提升40%,功耗降低1.8瓦。",
111
+ "inference_time": 2.648,
112
+ "predicts": "推力延迟稳定在五毫秒以内,吞吐量提升百分之四十,功耗降低一点八瓦。"
113
+ },
114
+ {
115
+ "index": 17,
116
+ "audio_path": "17.wav",
117
+ "reference": "相比SOTA模型,我们的方法在小样本场景下召回率高出12个百分点,且参数量仅为其三分之一。",
118
+ "inference_time": 3.272,
119
+ "predicts": "相比SOTA模型,我们的方法在小样本场景下召回率高出十二个百分点,且参数量仅为其三分之一。"
120
+ },
121
+ {
122
+ "index": 18,
123
+ "audio_path": "18.wav",
124
+ "reference": "当节点并发数超过10,000时,内存带宽会成瓶颈,导致尾延迟急剧上升。",
125
+ "inference_time": 2.284,
126
+ "predicts": "当节点并发数超过一万时,内存带宽会呈瓶颈,导致伪延迟急剧上升。"
127
+ },
128
+ {
129
+ "index": 19,
130
+ "audio_path": "19.wav",
131
+ "reference": "请看图3:蓝色柱是基线模型,红色线是我们的优化结果,交叉点表明在第15轮迭代后优势显著。",
132
+ "inference_time": 2.934,
133
+ "predicts": "请看图三,蓝色柱是基线模型,红色线是我们的优化结果。交叉点表明在第十五轮迭代后优势显著。"
134
+ },
135
+ {
136
+ "index": 20,
137
+ "audio_path": "20.wav",
138
+ "reference": "您提到模型泛化性提升,是否在跨模态数据上验证过?量化指标是多少?",
139
+ "inference_time": 2.185,
140
+ "predicts": "您提到模型泛化性提升是否在跨模态数据上验证过?量化指标是多少?"
141
+ },
142
+ {
143
+ "index": 21,
144
+ "audio_path": "21.wav",
145
+ "reference": "容我追问一点:如果输入数据存在对抗样本,您方案的鲁棒性如何保证?失效概率有测试吗?",
146
+ "inference_time": 2.658,
147
+ "predicts": "容我追问一点,如果输入数据存在对抗样本,您方案的鲁棒性如何保证?失效概率有测试吗?"
148
+ },
149
+ {
150
+ "index": 22,
151
+ "audio_path": "22.wav",
152
+ "reference": "抱歉打断,您说的‘动态剪枝’是指训练中还是推理中?阈值是自适应的吗?",
153
+ "inference_time": 2.711,
154
+ "predicts": "抱歉打断您说的动态减脂,是指训练中还是推理中?阈值是自适应的吗?"
155
+ },
156
+ {
157
+ "index": 23,
158
+ "audio_path": "23.wav",
159
+ "reference": "成本效益是否可行?部署这种定制芯片需要流片费用,中小客户怎么承担?",
160
+ "inference_time": 2.103,
161
+ "predicts": "成本效益是否可行?部署这种定制芯片需要流片费用,中小客户怎么承担?"
162
+ },
163
+ {
164
+ "index": 24,
165
+ "audio_path": "24.wav",
166
+ "reference": "我补充一个角度:隐私计算领域也在用类似思路,是否可能跨领域合作?",
167
+ "inference_time": 2.078,
168
+ "predicts": "我补充一个角度,隐私计算领域也在用类似思路,是否可能跨领域合作?"
169
+ },
170
+ {
171
+ "index": 25,
172
+ "audio_path": "25.wav",
173
+ "reference": "端到端优化流水线涵盖数据清洗特征工程模型压缩部署监控。",
174
+ "inference_time": 2.091,
175
+ "predicts": "端到端优化流水线,涵盖数据清洗、特征工程、模型压缩、部署监控。"
176
+ },
177
+ {
178
+ "index": 26,
179
+ "audio_path": "26.wav",
180
+ "reference": "请检查代码库中的依赖项冲突问题。",
181
+ "inference_time": 1.719,
182
+ "predicts": "请检查代码库中的依赖项冲突问题。"
183
+ },
184
+ {
185
+ "index": 27,
186
+ "audio_path": "27.wav",
187
+ "reference": "这个电路板集成了微控制器、传感器和无线通信模块。",
188
+ "inference_time": 1.929,
189
+ "predicts": "这个电路板集成了微控制器、传感器和无线通信模块。"
190
+ },
191
+ {
192
+ "index": 28,
193
+ "audio_path": "28.wav",
194
+ "reference": "区块链技术的核心在于去中心化和加密安全性。",
195
+ "inference_time": 1.417,
196
+ "predicts": "区块链技术的核心在于去中心化和加密安全性。"
197
+ },
198
+ {
199
+ "index": 29,
200
+ "audio_path": "29.wav",
201
+ "reference": "需符合等保三级要求,数据出境要走安全评估。",
202
+ "inference_time": 1.665,
203
+ "predicts": "需符合等保三级要求,数据出境要走安全评估。"
204
+ },
205
+ {
206
+ "index": 30,
207
+ "audio_path": "30.wav",
208
+ "reference": "最新数据显示,消费者信心指数有所回升。",
209
+ "inference_time": 1.292,
210
+ "predicts": "最新数据显示,消费者信心指数有所回升。"
211
+ },
212
+ {
213
+ "index": 31,
214
+ "audio_path": "31.wav",
215
+ "reference": "专家预测,新能源汽车市场将迎来爆发式增长。",
216
+ "inference_time": 1.321,
217
+ "predicts": "专家预测,新能源汽车市场将迎来爆发式增长。"
218
+ },
219
+ {
220
+ "index": 32,
221
+ "audio_path": "32.wav",
222
+ "reference": "多模态大模型能够同时处理文本、图像和音频信号,实现真正的跨模态理解",
223
+ "inference_time": 2.159,
224
+ "predicts": "多模态大模型能够同时处理文本、图像和音频信号,实现真正的跨模态理解。"
225
+ },
226
+ {
227
+ "index": 33,
228
+ "audio_path": "33.wav",
229
+ "reference": "联邦学习可以在保护用户隐私的前提下,实现分布式模型的协同训练",
230
+ "inference_time": 1.743,
231
+ "predicts": "联邦学习可以在保护用户隐私的前提下,实现分布式模型的协同训练。"
232
+ },
233
+ {
234
+ "index": 34,
235
+ "audio_path": "34.wav",
236
+ "reference": "我们研发的五十量子比特处理器在特定算法上展现出了量子优越性",
237
+ "inference_time": 1.381,
238
+ "predicts": "我们研发的五十量子比特处理器在特定算法上展现出了量子优越性。"
239
+ },
240
+ {
241
+ "index": 35,
242
+ "audio_path": "35.wav",
243
+ "reference": "变分量子本征值求解器在化学模拟中显示出巨大潜力",
244
+ "inference_time": 1.765,
245
+ "predicts": "变分量子本征值求解器在化学模拟中显示出巨大潜力。"
246
+ },
247
+ {
248
+ "index": 36,
249
+ "audio_path": "36.wav",
250
+ "reference": "边缘AI推理的延迟已经可以控制在十毫秒以内,满足实时应用需求",
251
+ "inference_time": 1.553,
252
+ "predicts": "边缘AI推理的延迟已经可以控制在10毫秒以内,满足实时应用需求。"
253
+ },
254
+ {
255
+ "index": 37,
256
+ "audio_path": "37.wav",
257
+ "reference": "我们设计的新型神经网络压缩算法在保持精度的同时将模型大小减少了百分之七十",
258
+ "inference_time": 2.034,
259
+ "predicts": "我们设计的新型神经网络压缩算法,在保持精度的同时,将模型大小减少了百分之七十。"
260
+ },
261
+ {
262
+ "index": 38,
263
+ "audio_path": "38.wav",
264
+ "reference": "损失函数结合了交叉熵损失和对比损失,权重系数设置为零点三和零点七",
265
+ "inference_time": 1.855,
266
+ "predicts": "损失函数结合了交叉熵损失和对比损失,权重系数设置为0.3和0.7。"
267
+ },
268
+ {
269
+ "index": 39,
270
+ "audio_path": "39.wav",
271
+ "reference": "在五个标准数据集上的实验结果表明,我们的方法平均比现有最佳方法提升了二点三个百分点",
272
+ "inference_time": 2.457,
273
+ "predicts": "在五个标准数据集上的实验结果表明,我们的方法平均比现有最佳方法提升了2.3个百分点。"
274
+ },
275
+ {
276
+ "index": 40,
277
+ "audio_path": "40.wav",
278
+ "reference": "消融实验证实了每个模块的有效性,移除注意力机制会导致性能下降百分之四点六",
279
+ "inference_time": 1.82,
280
+ "predicts": "消融实验证实了每个模块的有效性,移除注意力机制会导致性能下降百分之四点六。"
281
+ },
282
+ {
283
+ "index": 41,
284
+ "audio_path": "41.wav",
285
+ "reference": "模型的参数量为一点二亿,在八张A100显卡上训练了七十二小时",
286
+ "inference_time": 2.325,
287
+ "predicts": "模型的参数量为1.2亿,在八张A100显卡上训练了72小时。"
288
+ },
289
+ {
290
+ "index": 42,
291
+ "audio_path": "42.wav",
292
+ "reference": "总之,搞定了数据倾斜,性能就上去了。",
293
+ "inference_time": 1.389,
294
+ "predicts": "总之,搞定了数据倾斜性能就上去了。"
295
+ },
296
+ {
297
+ "index": 43,
298
+ "audio_path": "43.wav",
299
+ "reference": "openai-whisper 需要 ffmpeg 的环境,ffmpeg 是一个开源的跨平台音视频处理工具和框架,可以用来录制、转换和流式传输音视频内容 。",
300
+ "inference_time": 5.131,
301
+ "predicts": "OpenAI Whisper需要Ffmpeg的环境。Ffmpeg是一个开源的跨平台音视频处理工具和框架,可以用来录制、转换和流式传输音视频内容。"
302
+ },
303
+ {
304
+ "index": 44,
305
+ "audio_path": "44.wav",
306
+ "reference": "采用 Transformer 序列到序列模型可以实现针对不同的语言处理任务。",
307
+ "inference_time": 1.981,
308
+ "predicts": "采用Transformer序列到序列模型,可以实现针对不同的语言处理任务。"
309
+ },
310
+ {
311
+ "index": 45,
312
+ "audio_path": "45.wav",
313
+ "reference": "Transformer架构在自然语言处理中的成功应用已经彻底改变了预训练模型的范式",
314
+ "inference_time": 1.817,
315
+ "predicts": "Transformer 架构在自然语言处理中的成功应用,已经彻底改变了预训练模型的范式。"
316
+ },
317
+ {
318
+ "index": 46,
319
+ "audio_path": "46.wav",
320
+ "reference": "请大家注意,Workshop材料已经上传至会议系统,代码仓库链接在附录页。",
321
+ "inference_time": 2.385,
322
+ "predicts": "请大家注意,Workshop材料已经上传至会议系统,代码仓库链接在附录页。"
323
+ },
324
+ {
325
+ "index": 47,
326
+ "audio_path": "47.wav",
327
+ "reference": "别造轮子了!直接调用yolov5开源库的预训练权重,快速迭代才是王道。",
328
+ "inference_time": 2.731,
329
+ "predicts": "别造轮子了,直接调用YOLO V五开源库的预训练权重,快速迭代才是王道。"
330
+ },
331
+ {
332
+ "index": 48,
333
+ "audio_path": "48.wav",
334
+ "reference": "请确保您的设备已连接到5GHz Wi-Fi频段以获得最佳性能。",
335
+ "inference_time": 2.224,
336
+ "predicts": "请确保您的设备已连接到5G赫兹WiFi频段,以获得最佳性能。"
337
+ },
338
+ {
339
+ "index": 49,
340
+ "audio_path": "49.wav",
341
+ "reference": "我们提出了一种基于对比学习的自监督方法,在ImageNet数据集上达到了百分之九十二点五的准确率",
342
+ "inference_time": 2.67,
343
+ "predicts": "我们提出了一种基于对比学习的自监督方法,在ImageNet数据集上达到了百分之九十二点五的准确率。"
344
+ },
345
+ {
346
+ "index": 50,
347
+ "audio_path": "50.wav",
348
+ "reference": "GPT-4在代码生成和多步推理任务上展现出令人印象深刻的能力",
349
+ "inference_time": 1.423,
350
+ "predicts": "GPT-四在代码生成和多步推理任务上展现出令人印象深刻的能力。"
351
+ },
352
+ {
353
+ "index": 51,
354
+ "audio_path": "51.wav",
355
+ "reference": "若在高并发场景下未启用我们的动态缓存机制,即使用RDMA网络,延迟也可能因CPU调度争抢而恶化。",
356
+ "inference_time": 4.261,
357
+ "predicts": "若在高并发场景下未启用我们的动态缓存机制,即使用RDMA网络,延迟也可能因CPU调度增强而恶化。"
358
+ },
359
+ {
360
+ "index": 52,
361
+ "audio_path": "52.wav",
362
+ "reference": "准确说,峰值算力是200 TOPS,不是刚才说的150。",
363
+ "inference_time": 1.628,
364
+ "predicts": "准确说,峰值算力是200TOPS,不是刚才说的一百五。"
365
+ },
366
+ {
367
+ "index": 53,
368
+ "audio_path": "53.wav",
369
+ "reference": "秦始皇嬴政书同文,车同轨,奠定了中国大一统的基础。",
370
+ "inference_time": 2.224,
371
+ "predicts": "秦始皇嬴政书同文、车同轨,奠定了中国大一统的基础。"
372
+ },
373
+ {
374
+ "index": 54,
375
+ "audio_path": "54.wav",
376
+ "reference": "诸葛亮在《出师表》中写道‘鞠躬尽瘁,死而后已’,成为后世臣子的楷模。",
377
+ "inference_time": 2.053,
378
+ "predicts": "诸葛亮在出师表中写道:鞠躬尽瘁,死而后已,成为后世臣子的楷模。"
379
+ },
380
+ {
381
+ "index": 55,
382
+ "audio_path": "55.wav",
383
+ "reference": "李白的‘举头望明月,低头思故乡’是连三岁孩童都能背诵的诗句。",
384
+ "inference_time": 2.022,
385
+ "predicts": "李白的举头望明月,低头思故乡,是连三岁孩童都能背诵的诗句。"
386
+ },
387
+ {
388
+ "index": 56,
389
+ "audio_path": "56.wav",
390
+ "reference": "孔子曾说‘己所不欲,勿施于人’,这简单的八个字构成了儒家伦理的基石。",
391
+ "inference_time": 2.065,
392
+ "predicts": "孔子曾说:己所不欲,勿施于人。这简单的八个字,构成了儒家伦理的基石。"
393
+ },
394
+ {
395
+ "index": 57,
396
+ "audio_path": "57.wav",
397
+ "reference": "王羲之被后人尊为‘书圣’,其代表作《兰亭集序》被誉为天下第一行书。",
398
+ "inference_time": 2.753,
399
+ "predicts": "王羲之被后人尊为书圣,其代表作《兰亭集序》被誉为天下第一行书。"
400
+ },
401
+ {
402
+ "index": 58,
403
+ "audio_path": "58.wav",
404
+ "reference": "我们要学习庖丁解牛的精神,掌握事物的客观规律,才能游刃有余。",
405
+ "inference_time": 1.711,
406
+ "predicts": "我们要学习庖丁解牛的精神,掌握事物的客观规律,才能游刃有余。"
407
+ },
408
+ {
409
+ "index": 59,
410
+ "audio_path": "59.wav",
411
+ "reference": "项羽在鸿门宴上优柔寡断,放走了刘邦,最终兵败乌江自刎。",
412
+ "inference_time": 2.597,
413
+ "predicts": "项羽在鸿门宴上优柔寡断,放走了刘邦,最终兵败乌江自刎。"
414
+ },
415
+ {
416
+ "index": 60,
417
+ "audio_path": "60.wav",
418
+ "reference": "在杭州西湖畔,人们总会想起苏轼治理西湖、修筑苏堤的往事。",
419
+ "inference_time": 1.618,
420
+ "predicts": "在杭州西湖畔,人们总会想起苏轼治理西湖、修筑苏堤的往事。"
421
+ },
422
+ {
423
+ "index": 61,
424
+ "audio_path": "61.wav",
425
+ "reference": "我计划去埃菲尔铁塔和卢浮宫参观。",
426
+ "inference_time": 1.86,
427
+ "predicts": "我计划去埃菲尔铁塔和卢浮宫参观。"
428
+ },
429
+ {
430
+ "index": 62,
431
+ "audio_path": "62.wav",
432
+ "reference": "莎士比亚的戏剧深刻地探讨了人性的复杂性。",
433
+ "inference_time": 1.824,
434
+ "predicts": "莎士比亚的戏剧深刻地探讨了人性的复杂性。"
435
+ },
436
+ {
437
+ "index": 63,
438
+ "audio_path": "63.wav",
439
+ "reference": "这个消息让他丈二和尚摸不着头脑。",
440
+ "inference_time": 1.786,
441
+ "predicts": "这个消息让他丈二和尚摸不着头脑。"
442
+ },
443
+ {
444
+ "index": 64,
445
+ "audio_path": "64.wav",
446
+ "reference": "画龙点睛之笔,让整个设计焕然一新。",
447
+ "inference_time": 1.284,
448
+ "predicts": "画龙点睛之笔,让整个设计焕然一新。"
449
+ },
450
+ {
451
+ "index": 65,
452
+ "audio_path": "65.wav",
453
+ "reference": "这是卡脖子技术,必须自主研发!",
454
+ "inference_time": 1.479,
455
+ "predicts": "这是卡脖子技术,必须自主研发。"
456
+ },
457
+ {
458
+ "index": 66,
459
+ "audio_path": "66.wav",
460
+ "reference": "这幅画作以其独特的色彩运用和构图技巧而闻名。",
461
+ "inference_time": 1.204,
462
+ "predicts": "这幅画作以其独特的色彩运用和构图技巧而闻名。"
463
+ },
464
+ {
465
+ "index": 67,
466
+ "audio_path": "67.wav",
467
+ "reference": "患者主诉间歇性胸痛,放射至左臂。",
468
+ "inference_time": 1.191,
469
+ "predicts": "患者主诉间歇性胸痛,放射至左臂。"
470
+ },
471
+ {
472
+ "index": 68,
473
+ "audio_path": "68.wav",
474
+ "reference": "患者有冠状动脉粥样硬化性心脏病病史十年,慢性阻塞性肺疾病病史五年",
475
+ "inference_time": 4.634,
476
+ "predicts": "患者有冠状动脉粥样硬化性心脏病病史,十年慢性阻塞性肺疾病病史五年。"
477
+ },
478
+ {
479
+ "index": 69,
480
+ "audio_path": "69.wav",
481
+ "reference": "建议行冠状动脉造影检查,必要时植入支架",
482
+ "inference_time": 1.993,
483
+ "predicts": "建议行冠状动脉造影检查,必要时植入支架。"
484
+ },
485
+ {
486
+ "index": 70,
487
+ "audio_path": "70.wav",
488
+ "reference": "患者需低盐低脂糖尿病饮食,监测血压、血糖变化",
489
+ "inference_time": 1.945,
490
+ "predicts": "患者需低盐低脂糖尿病饮食,监测血压、血糖变化。"
491
+ },
492
+ {
493
+ "index": 71,
494
+ "audio_path": "71.wav",
495
+ "reference": "胸部CT平扫显示:双肺散在磨玻璃样密度影,以胸膜下分布为主",
496
+ "inference_time": 2.713,
497
+ "predicts": "胸部CT平扫显示双肺散在膜玻璃样密度影,以胸膜下分布为主。"
498
+ },
499
+ {
500
+ "index": 72,
501
+ "audio_path": "72.wav",
502
+ "reference": "知识产权归属约定:乙方在履行本合同过程中所产生的全部智力成果,其知识产权均归甲方所有",
503
+ "inference_time": 2.421,
504
+ "predicts": "知识产权归属约定,乙方在履行本合同过程中所产生的全部智力成果,其知识产权归甲方所有。"
505
+ },
506
+ {
507
+ "index": 73,
508
+ "audio_path": "73.wav",
509
+ "reference": "双方因履行本合同发生争议的,应首先通过友好协商解决;协商不成的,任何一方均有权向有管辖权的人民法院提起诉讼",
510
+ "inference_time": 4.377,
511
+ "predicts": "双方因履行本合同发生争议的,应首先通过友好协商解决。协商不成的,任何一方均有权向有管辖权的人民法院提起诉讼。"
512
+ },
513
+ {
514
+ "index": 74,
515
+ "audio_path": "74.wav",
516
+ "reference": "被告在法定期限内未提交答辩状,亦未到庭参加诉讼,本院依法缺席审理",
517
+ "inference_time": 2.65,
518
+ "predicts": "被告在法定期限内未提交答辩状,亦未到庭参加诉讼。本院依法缺席审理。"
519
+ },
520
+ {
521
+ "index": 75,
522
+ "audio_path": "75.wav",
523
+ "reference": "原告向本院提出诉讼请求:一、判令被告支付货款人民币五十万八千元及逾期付款利息;二、判令被告承担本案的诉讼费用",
524
+ "inference_time": 5.014,
525
+ "predicts": "原告向本院提出诉讼请求:一、判令被告支付货款人民币五十万八千元及逾期付款利息;二、判令被告承担本案的诉讼费用。"
526
+ },
527
+ {
528
+ "index": 76,
529
+ "audio_path": "76.wav",
530
+ "reference": "被执行人未按执行通知履行法律文书确定的义务,人民法院有权查封、扣押、冻结、拍卖被执行人的财产",
531
+ "inference_time": 4.054,
532
+ "predicts": "被执行人未按执行通知履行法律文书确定的义务,人民法院有权查封、扣押、冻结、拍卖被执行人的财产。"
533
+ },
534
+ {
535
+ "index": 77,
536
+ "audio_path": "77.wav",
537
+ "reference": "这个视频太上头了!",
538
+ "inference_time": 1.682,
539
+ "predicts": "这个视频太上头了"
540
+ },
541
+ {
542
+ "index": 78,
543
+ "audio_path": "78.wav",
544
+ "reference": "他真是个社恐。",
545
+ "inference_time": 1.304,
546
+ "predicts": "他真是个社恐。"
547
+ },
548
+ {
549
+ "index": 79,
550
+ "audio_path": "79.wav",
551
+ "reference": "简直了,这躺平的状态也太佛系了吧。",
552
+ "inference_time": 1.747,
553
+ "predicts": "简直了,这躺平的状态也太佛系了吧。"
554
+ },
555
+ {
556
+ "index": 80,
557
+ "audio_path": "80.wav",
558
+ "reference": "这个瓜有点大,我得去吃瓜了。",
559
+ "inference_time": 1.032,
560
+ "predicts": "这个瓜有点大,我得去吃瓜了。"
561
+ },
562
+ {
563
+ "index": 81,
564
+ "audio_path": "81.wav",
565
+ "reference": "别内卷了,咱们还是多交流交流吧。",
566
+ "inference_time": 0.895,
567
+ "predicts": "别内卷了咱们还是多交流交流吧"
568
+ },
569
+ {
570
+ "index": 82,
571
+ "audio_path": "82.wav",
572
+ "reference": "第二个公司我们成立了中国黄页,在中国黄页的创业经验中有很多的经验,也是可以在这儿跟大家进行分享的",
573
+ "inference_time": 3.008,
574
+ "predicts": "第二个公司我们成立了中国黄页,在中国黄页的创业经验中有很多的经验,也是可以在这儿跟大家进行分享的。"
575
+ },
576
+ {
577
+ "index": 83,
578
+ "audio_path": "83.wav",
579
+ "reference": "对于大部分在接触微积分之前,主要的学习经验就是刷题,甚至是连题也不刷的同学们来说。",
580
+ "inference_time": 2.598,
581
+ "predicts": "对于大部分在接触微积分之前,主要的学习经验就是刷题,甚至是连题也不刷的同学们来说。"
582
+ },
583
+ {
584
+ "index": 84,
585
+ "audio_path": "84.wav",
586
+ "reference": "说了两个小时没人听懂我在说什么,最后二十三个人反对一个人同意,这一个人就说马云你这样做,你就试试看,不行的话,赶紧逃回来,还来得及。",
587
+ "inference_time": 3.818,
588
+ "predicts": "说了两个小时,没人听懂我在说什么。最后二十三个人反对,一个人同意。这一个人就说:马云,你这样做你就试试看,不行的话赶紧逃回来,还来得及。"
589
+ },
590
+ {
591
+ "index": 85,
592
+ "audio_path": "85.wav",
593
+ "reference": "晚上想想是热血沸腾,真好,第二天早上骑个自行车又上班去了,对吧?",
594
+ "inference_time": 1.465,
595
+ "predicts": "晚上想想是热血沸腾真好第二天早上骑个自行车又上班去了对吧"
596
+ }
597
+ ]
reports/asr_result_funasr_nano_wenet_net.json ADDED
The diff for this file is too large to render. See raw diff
 
reports/asr_result_funasr_quant_librispeech_clean.json ADDED
The diff for this file is too large to render. See raw diff
 
reports/asr_result_funasr_quant_recording.json ADDED
@@ -0,0 +1,597 @@
1
+ [
2
+ {
3
+ "index": 1,
4
+ "audio_path": "1.wav",
5
+ "reference": "您这车开得真够稳的,蜗牛都超车了。",
6
+ "inference_time": 0.328,
7
+ "predicts": "您这车开的真够稳的,蜗牛都超车了。"
8
+ },
9
+ {
10
+ "index": 2,
11
+ "audio_path": "2.wav",
12
+ "reference": "你可真是个天才,把这么简单的事情都搞砸了。",
13
+ "inference_time": 0.272,
14
+ "predicts": "你可真是个天才,把这么简单的事情都搞砸了。"
15
+ },
16
+ {
17
+ "index": 3,
18
+ "audio_path": "3.wav",
19
+ "reference": "因网络问题,远程讲者音频中断,我们切到备用方案共享幻灯片。",
20
+ "inference_time": 0.415,
21
+ "predicts": "因网络问题,远程讲者音频中断,我们切到备用方案,共享幻灯片。"
22
+ },
23
+ {
24
+ "index": 4,
25
+ "audio_path": "4.wav",
26
+ "reference": "这就好比‘用大炮打蚊子’,资源分配严重不均衡,需要细粒度调度。",
27
+ "inference_time": 0.387,
28
+ "predicts": "这就好比用大炮打蚊子,资源分配严重不均衡,需要细腻度调度。"
29
+ },
30
+ {
31
+ "index": 5,
32
+ "audio_path": "5.wav",
33
+ "reference": "言归正传,我们来讨论一下核心问题。",
34
+ "inference_time": 0.258,
35
+ "predicts": "言归正传,我们来讨论一下核心问题。"
36
+ },
37
+ {
38
+ "index": 6,
39
+ "audio_path": "6.wav",
40
+ "reference": "这有点超出了我们今天讨论的范围。",
41
+ "inference_time": 0.222,
42
+ "predicts": "这有点超出了我们今天讨论的范围。"
43
+ },
44
+ {
45
+ "index": 7,
46
+ "audio_path": "7.wav",
47
+ "reference": "他的年收入约为一百万人民币。",
48
+ "inference_time": 0.223,
49
+ "predicts": "他的年收入约为一百万人民币。"
50
+ },
51
+ {
52
+ "index": 8,
53
+ "audio_path": "8.wav",
54
+ "reference": "下一位演讲者是来自斯坦福大学计算机科学系的张明教授",
55
+ "inference_time": 0.387,
56
+ "predicts": "下一位演讲者是来自斯坦福大学计算机科学系的张明教授。"
57
+ },
58
+ {
59
+ "index": 9,
60
+ "audio_path": "9.wav",
61
+ "reference": "请各位将手机调至静音模式,演讲结束后有十五分钟提问时间",
62
+ "inference_time": 0.407,
63
+ "predicts": "请各位将手机调至静音模式,演讲结束后有十五分钟提问时间。"
64
+ },
65
+ {
66
+ "index": 10,
67
+ "audio_path": "10.wav",
68
+ "reference": "茶歇将在十点三十分开始,地点在二楼休息区",
69
+ "inference_time": 0.316,
70
+ "predicts": "茶歇将在十点三十分开始,地点在二楼休息区。"
71
+ },
72
+ {
73
+ "index": 11,
74
+ "audio_path": "11.wav",
75
+ "reference": "本合同自双方签字盖章之日起生效。",
76
+ "inference_time": 0.273,
77
+ "predicts": "本合同自双方签字盖章之日起生效。"
78
+ },
79
+ {
80
+ "index": 12,
81
+ "audio_path": "12.wav",
82
+ "reference": "政府正在采取措施刺激经济增长和创造就业机会。",
83
+ "inference_time": 0.346,
84
+ "predicts": "政府正在采取措施刺激经济增长和创造就业机会。"
85
+ },
86
+ {
87
+ "index": 13,
88
+ "audio_path": "13.wav",
89
+ "reference": "今天中午我们点外卖吧,我想吃宫保鸡丁和麻婆豆腐。",
90
+ "inference_time": 0.344,
91
+ "predicts": "今天中午我们点外卖吧,我想吃宫保鸡丁和麻婆豆腐。"
92
+ },
93
+ {
94
+ "index": 14,
95
+ "audio_path": "14.wav",
96
+ "reference": "医生建议我多吃粗粮,比如燕麦和紫薯,少吃高油高糖的油炸食品,像炸鸡和甜甜圈。",
97
+ "inference_time": 0.558,
98
+ "predicts": "医生建议我多吃粗粮,比如燕麦和紫薯,少吃高油、高糖的油炸食品,像炸鸡和甜甜圈。"
99
+ },
100
+ {
101
+ "index": 15,
102
+ "audio_path": "15.wav",
103
+ "reference": "本方案采用异构计算架构,结合GPU与FPGA加速张量运算。",
104
+ "inference_time": 0.419,
105
+ "predicts": "本方案采用异构计算架构,结合gpu与fpga加速张量运算。"
106
+ },
107
+ {
108
+ "index": 16,
109
+ "audio_path": "16.wav",
110
+ "reference": "推理延迟稳定在5毫秒以内,吞吐量提升40%,功耗降低1.8瓦。",
111
+ "inference_time": 0.467,
112
+ "predicts": "推理延迟稳定在五毫秒以内,吞吐量提升百分之四十,功耗降低一点八瓦。"
113
+ },
114
+ {
115
+ "index": 17,
116
+ "audio_path": "17.wav",
117
+ "reference": "相比SOTA模型,我们的方法在小样本场景下召回率高出12个百分点,且参数量仅为其三分之一。",
118
+ "inference_time": 0.652,
119
+ "predicts": "相比sota模型,我们的方法在小样本场景下召回率高出十二个百分点,且参数量仅为其三分之一。"
120
+ },
121
+ {
122
+ "index": 18,
123
+ "audio_path": "18.wav",
124
+ "reference": "当节点并发数超过10,000时,内存带宽会成瓶颈,导致尾延迟急剧上升。",
125
+ "inference_time": 0.468,
126
+ "predicts": "当节点并发数超过一万时,内存带宽会成瓶颈,导致伪延迟急剧上升。"
127
+ },
128
+ {
129
+ "index": 19,
130
+ "audio_path": "19.wav",
131
+ "reference": "请看图3:蓝色柱是基线模型,红色线是我们的优化结果,交叉点表明在第15轮迭代后优势显著。",
132
+ "inference_time": 0.62,
133
+ "predicts": "请看图三,蓝色柱是基线模型,红色线是我们的优化。结果交叉点表明,在第十五轮迭代后,优势显著。"
134
+ },
135
+ {
136
+ "index": 20,
137
+ "audio_path": "20.wav",
138
+ "reference": "您提到模型泛化性提升,是否在跨模态数据上验证过?量化指标是多少?",
139
+ "inference_time": 0.464,
140
+ "predicts": "您提到模型泛化性提升是否在跨模态数据上验证过量化指标是多少?"
141
+ },
142
+ {
143
+ "index": 21,
144
+ "audio_path": "21.wav",
145
+ "reference": "容我追问一点:如果输入数据存在对抗样本,您方案的鲁棒性如何保证?失效概率有测试吗?",
146
+ "inference_time": 0.559,
147
+ "predicts": "容我追问一点,如果输入数据存在对抗样本,您方案的鲁棒性如何保证失效概率有测试吗?"
148
+ },
149
+ {
150
+ "index": 22,
151
+ "audio_path": "22.wav",
152
+ "reference": "抱歉打断,您说的‘动态剪枝’是指训练中还是推理中?阈值是自适应的吗?",
153
+ "inference_time": 0.463,
154
+ "predicts": "抱歉打断您说的动态减脂,是指训练中还是推理中阈值是自适应的吗?"
155
+ },
156
+ {
157
+ "index": 23,
158
+ "audio_path": "23.wav",
159
+ "reference": "成本效益是否可行?部署这种定制芯片需要流片费用,中小客户怎么承担?",
160
+ "inference_time": 0.462,
161
+ "predicts": "成本效益是否可行?部署这种定制芯片需要流片费用,中小客户怎么承担?"
162
+ },
163
+ {
164
+ "index": 24,
165
+ "audio_path": "24.wav",
166
+ "reference": "我补充一个角度:隐私计算领域也在用类似思路,是否可能跨领域合作?",
167
+ "inference_time": 0.496,
168
+ "predicts": "我补充一个角度,隐私计算领域也在用,类似思路是否可能跨领域合作。"
169
+ },
170
+ {
171
+ "index": 25,
172
+ "audio_path": "25.wav",
173
+ "reference": "端到端优化流水线涵盖数据清洗特征工程模型压缩部署监控。",
174
+ "inference_time": 0.449,
175
+ "predicts": "端到端,优化流水线,涵盖数据清洗特征、工程模型,压缩部署监控。"
176
+ },
177
+ {
178
+ "index": 26,
179
+ "audio_path": "26.wav",
180
+ "reference": "请检查代码库中的依赖项冲突问题。",
181
+ "inference_time": 0.266,
182
+ "predicts": "请检查代码库中的依赖项冲突问题。"
183
+ },
184
+ {
185
+ "index": 27,
186
+ "audio_path": "27.wav",
187
+ "reference": "这个电路板集成了微控制器、传感器和无线通信模块。",
188
+ "inference_time": 0.316,
189
+ "predicts": "这个电路板集成了微控制器传感器和无线通信模块。"
190
+ },
191
+ {
192
+ "index": 28,
193
+ "audio_path": "28.wav",
194
+ "reference": "区块链技术的核心在于去中心化和加密安全性。",
195
+ "inference_time": 0.319,
196
+ "predicts": "区块链技术的核心在于去中心化和加密安全性。"
197
+ },
198
+ {
199
+ "index": 29,
200
+ "audio_path": "29.wav",
201
+ "reference": "需符合等保三级要求,数据出境要走安全评估。",
202
+ "inference_time": 0.338,
203
+ "predicts": "需符合等保三级要求,数据出境要走安全评估。"
204
+ },
205
+ {
206
+ "index": 30,
207
+ "audio_path": "30.wav",
208
+ "reference": "最新数据显示,消费者信心指数有所回升。",
209
+ "inference_time": 0.289,
210
+ "predicts": "最新数据显示,消费者信心指数有所回升。"
211
+ },
212
+ {
213
+ "index": 31,
214
+ "audio_path": "31.wav",
215
+ "reference": "专家预测,新能源汽车市场将迎来爆发式增长。",
216
+ "inference_time": 0.326,
217
+ "predicts": "专家预测,新能源汽车市场将迎来爆发式增长。"
218
+ },
219
+ {
220
+ "index": 32,
221
+ "audio_path": "32.wav",
222
+ "reference": "多模态大模型能够同时处理文本、图像和音频信号,实现真正的跨模态理解",
223
+ "inference_time": 0.452,
224
+ "predicts": "多模态大模型能够同时处理文本图像和音频信号,实现真正的跨模态理解。"
225
+ },
226
+ {
227
+ "index": 33,
228
+ "audio_path": "33.wav",
229
+ "reference": "联邦学习可以在保护用户隐私的前提下,实现分布式模型的协同训练",
230
+ "inference_time": 0.421,
231
+ "predicts": "联邦学习可以在保护用户隐私的前提下,实现分布式模型的协同训练。"
232
+ },
233
+ {
234
+ "index": 34,
235
+ "audio_path": "34.wav",
236
+ "reference": "我们研发的五十量子比特处理器在特定算法上展现出了量子优越性",
237
+ "inference_time": 0.413,
238
+ "predicts": "我们研发的五十量子比特处理器在特定算法上展现出了量子优越性。"
239
+ },
240
+ {
241
+ "index": 35,
242
+ "audio_path": "35.wav",
243
+ "reference": "变分量子本征值求解器在化学模拟中显示出巨大潜力",
244
+ "inference_time": 0.429,
245
+ "predicts": "变分量子本增值求解器在化学模拟中显示出巨大潜力。"
246
+ },
247
+ {
248
+ "index": 36,
249
+ "audio_path": "36.wav",
250
+ "reference": "边缘AI推理的延迟已经可以控制在十毫秒以内,满���实时应用需求",
251
+ "inference_time": 0.456,
252
+ "predicts": "边缘ai推理的延迟已经可以控制在十毫秒以内,满足实时应用需求。"
253
+ },
254
+ {
255
+ "index": 37,
256
+ "audio_path": "37.wav",
257
+ "reference": "我们设计的新型神经网络压缩算法在保持精度的同时将模型大小减少了百分之七十",
258
+ "inference_time": 0.508,
259
+ "predicts": "我们设计的新型神经网络压缩算法,在保持精度的同时,将模型大小减少了百分之七十。"
260
+ },
261
+ {
262
+ "index": 38,
263
+ "audio_path": "38.wav",
264
+ "reference": "损失函数结合了交叉熵损失和对比损失,权重系数设置为零点三和零点七",
265
+ "inference_time": 0.458,
266
+ "predicts": "损失函数结合了交叉熵损失和对比,损失,权重系数设置为零点三和零点七。"
267
+ },
268
+ {
269
+ "index": 39,
270
+ "audio_path": "39.wav",
271
+ "reference": "在五个标准数据集上的实验结果表明,我们的方法平均比现有最佳方法提升了二点三个百分点",
272
+ "inference_time": 0.581,
273
+ "predicts": "在五个标准数据集上的实验结果表明,我们的方法平均比现有最佳方法提升了二点三个百分点。"
274
+ },
275
+ {
276
+ "index": 40,
277
+ "audio_path": "40.wav",
278
+ "reference": "消融实验证实了每个模块的有效性,移除注意力机制会导致性能下降百分之四点六",
279
+ "inference_time": 0.468,
280
+ "predicts": "消融实验证实了每个模块的有效性,移除注意力机制会导致性能下降百分之四点六。"
281
+ },
282
+ {
283
+ "index": 41,
284
+ "audio_path": "41.wav",
285
+ "reference": "模型的参数量为一点二亿,在八张A100显卡上训练了七十二小时",
286
+ "inference_time": 0.437,
287
+ "predicts": "模型的参数量为一点二亿,在八张a一百显卡上训练了七十二小时。"
288
+ },
289
+ {
290
+ "index": 42,
291
+ "audio_path": "42.wav",
292
+ "reference": "总之,搞定了数据倾斜,性能就上去了。",
293
+ "inference_time": 0.275,
294
+ "predicts": "总之,搞定了数据倾斜性能就上去了。"
295
+ },
296
+ {
297
+ "index": 43,
298
+ "audio_path": "43.wav",
299
+ "reference": "openai-whisper 需要 ffmpeg 的环境,ffmpeg 是一个开源的跨平台音视频处理工具和框架,可以用来录制、转换和流式传输音视频内容 。",
300
+ "inference_time": 0.932,
301
+ "predicts": "open ai whisper需要fm pag的环境。fm pag是一个开源的跨平台音视频,处理工具和框架,可以用来录制转换和流式传输音视频内容。"
302
+ },
303
+ {
304
+ "index": 44,
305
+ "audio_path": "44.wav",
306
+ "reference": "采用 Transformer 序列到序列模型可以实现针对不同的语言处理任务。",
307
+ "inference_time": 0.406,
308
+ "predicts": "采用transformer序列到序列模型,可以实现针对不同的语言处理任务。"
309
+ },
310
+ {
311
+ "index": 45,
312
+ "audio_path": "45.wav",
313
+ "reference": "Transformer架构在自然语言处理中的成功应用已经彻底改变了预训练模型的范式",
314
+ "inference_time": 0.503,
315
+ "predicts": "transformer架构在自然语言处理中的成功应用,已经彻底改变了预训练模型的范式。"
316
+ },
317
+ {
318
+ "index": 46,
319
+ "audio_path": "46.wav",
320
+ "reference": "请大家注意,Workshop材料已经上传至会议系统,代码仓库链接在附录页。",
321
+ "inference_time": 0.476,
322
+ "predicts": "请大家注意,workshop材料已经上传至会议系统,代码仓库链接在附录页。"
323
+ },
324
+ {
325
+ "index": 47,
326
+ "audio_path": "47.wav",
327
+ "reference": "别造轮子了!直接调用yolov5开源库的预训练权重,快速迭代才是王道。",
328
+ "inference_time": 0.482,
329
+ "predicts": "别造轮子了,直接调用乐uv五开源库的预训练权重,快速迭代才是王道。"
330
+ },
331
+ {
332
+ "index": 48,
333
+ "audio_path": "48.wav",
334
+ "reference": "请确保您的设备已连接到5GHz Wi-Fi频段以获得最佳性能。",
335
+ "inference_time": 0.405,
336
+ "predicts": "请确保您的设备已连接到五g赫兹、wifi频段,以获得最佳性能。"
337
+ },
338
+ {
339
+ "index": 49,
340
+ "audio_path": "49.wav",
341
+ "reference": "我们提出了一种基于对比学习的自监督方法,在ImageNet数据集上达到了百分之九十二点五的准确率",
342
+ "inference_time": 0.598,
343
+ "predicts": "我们提出了一种基于对比学习的自监督方法,在imaginate数据集上达到了百分之九十二点五的准确率。"
344
+ },
345
+ {
346
+ "index": 50,
347
+ "audio_path": "50.wav",
348
+ "reference": "GPT-4在代码生成和多步推理任务上展现出令人印象深刻的能力",
349
+ "inference_time": 0.433,
350
+ "predicts": "gbd four在代码生成和多步推理任务上展现出令人印象深刻的伦理。"
351
+ },
352
+ {
353
+ "index": 51,
354
+ "audio_path": "51.wav",
355
+ "reference": "若在高并发场景下未启用我们的动态缓存机制,即使用RDMA网络,延迟也可能因CPU调度争抢而恶化。",
356
+ "inference_time": 0.711,
357
+ "predicts": "若在高并发场景下未启用我们的动态缓存机制,即使用rdma网络延迟,也可能因cpu调度增强而恶化。"
358
+ },
359
+ {
360
+ "index": 52,
361
+ "audio_path": "52.wav",
362
+ "reference": "准确说,峰值算力是200 TOPS,不是刚才说的150。",
363
+ "inference_time": 0.371,
364
+ "predicts": "准确说峰值算力是两百tops,不是刚才说的一百五。"
365
+ },
366
+ {
367
+ "index": 53,
368
+ "audio_path": "53.wav",
369
+ "reference": "秦始皇嬴政书同文,车同轨,奠定了中国大一统的基础。",
370
+ "inference_time": 0.37,
371
+ "predicts": "秦始皇嬴政书同文,车同轨,奠定了中国大一统的基础。"
372
+ },
373
+ {
374
+ "index": 54,
375
+ "audio_path": "54.wav",
376
+ "reference": "诸葛亮在《出师表》中写道‘鞠躬尽瘁,死而后已’,成为后世臣子的楷模。",
377
+ "inference_time": 0.408,
378
+ "predicts": "诸葛亮在出师表中写道,鞠躬尽瘁死而后已成为后世臣子的楷模。"
379
+ },
380
+ {
381
+ "index": 55,
382
+ "audio_path": "55.wav",
383
+ "reference": "李白的‘举头望明月,低头思故乡’是连三岁孩童都能背诵的诗句。",
384
+ "inference_time": 0.4,
385
+ "predicts": "李白的举头望明月,低头思故乡,是连三岁孩童都能背诵的诗句。"
386
+ },
387
+ {
388
+ "index": 56,
389
+ "audio_path": "56.wav",
390
+ "reference": "孔子曾说‘己所不欲,勿施于人’,这简单的八个字构成了儒家伦理的基石。",
391
+ "inference_time": 0.515,
392
+ "predicts": "孔子曾说,己所不欲勿施于人,这简单的八个字,构成了儒家伦理的基石。"
393
+ },
394
+ {
395
+ "index": 57,
396
+ "audio_path": "57.wav",
397
+ "reference": "王羲之被后人尊为‘书圣’,其代表作《兰亭集序》被誉为天下第一行书。",
398
+ "inference_time": 0.499,
399
+ "predicts": "王羲之被后人尊为书圣,其代表作兰亭集序被誉为天下第一行书。"
400
+ },
401
+ {
402
+ "index": 58,
403
+ "audio_path": "58.wav",
404
+ "reference": "我们要学习庖丁解牛的精神,掌握事物的客观规律,才能游刃有余。",
405
+ "inference_time": 0.454,
406
+ "predicts": "我们要学习庖丁解牛的精神,掌握事物的客观规律才能游刃有余。"
407
+ },
408
+ {
409
+ "index": 59,
410
+ "audio_path": "59.wav",
411
+ "reference": "项羽在鸿门宴上优柔寡断,放走了刘邦,最终兵败乌江自刎。",
412
+ "inference_time": 0.46,
413
+ "predicts": "项羽在鸿门宴上优柔寡断,放走了刘邦,最终兵败乌江自刎。"
414
+ },
415
+ {
416
+ "index": 60,
417
+ "audio_path": "60.wav",
418
+ "reference": "在杭州西湖畔,人们总会想起苏轼治理西湖、修筑苏堤的往事。",
419
+ "inference_time": 0.438,
420
+ "predicts": "在杭州,西湖畔,人们总会想起苏轼治理西湖,修筑苏堤的往事。"
421
+ },
422
+ {
423
+ "index": 61,
424
+ "audio_path": "61.wav",
425
+ "reference": "我计划去埃菲尔铁塔和卢浮宫参观。",
426
+ "inference_time": 0.255,
427
+ "predicts": "我计划去埃菲尔铁塔和卢浮宫参观。"
428
+ },
429
+ {
430
+ "index": 62,
431
+ "audio_path": "62.wav",
432
+ "reference": "莎士比亚的戏剧深刻地探讨了人性的复杂性。",
433
+ "inference_time": 0.304,
434
+ "predicts": "莎士比亚的戏剧深刻地探讨了人性的复杂性。"
435
+ },
436
+ {
437
+ "index": 63,
438
+ "audio_path": "63.wav",
439
+ "reference": "这个消息让他丈二和尚摸不着头脑。",
440
+ "inference_time": 0.22,
441
+ "predicts": "这个消息让他丈二和尚摸不着头脑。"
442
+ },
443
+ {
444
+ "index": 64,
445
+ "audio_path": "64.wav",
446
+ "reference": "画龙点睛之笔,让整个设计焕然一新。",
447
+ "inference_time": 0.257,
448
+ "predicts": "画龙点睛之笔,让整个设计焕然一新。"
449
+ },
450
+ {
451
+ "index": 65,
452
+ "audio_path": "65.wav",
453
+ "reference": "这是卡脖子技术,必须自主研发!",
454
+ "inference_time": 0.238,
455
+ "predicts": "这是卡脖子技术,必须自主研发。"
456
+ },
457
+ {
458
+ "index": 66,
459
+ "audio_path": "66.wav",
460
+ "reference": "这幅画作以其独特的色彩运用和构图技巧而闻名。",
461
+ "inference_time": 0.312,
462
+ "predicts": "这幅画作以其独特的色彩运用和构图技巧而闻名。"
463
+ },
464
+ {
465
+ "index": 67,
466
+ "audio_path": "67.wav",
467
+ "reference": "患者主诉间歇性胸痛,放射至左臂。",
468
+ "inference_time": 0.254,
469
+ "predicts": "患者主诉间歇性,胸痛放射至左臂。"
470
+ },
471
+ {
472
+ "index": 68,
473
+ "audio_path": "68.wav",
474
+ "reference": "患者有冠状动脉粥样硬化性心脏病病史十年,慢性阻塞性肺疾病病史五年",
475
+ "inference_time": 0.597,
476
+ "predicts": "患者有冠状动脉粥样硬化性心脏病病史,十年慢性阻塞性肺疾病病史,五年。"
477
+ },
478
+ {
479
+ "index": 69,
480
+ "audio_path": "69.wav",
481
+ "reference": "建议行冠状动脉造影检查,必要时植入支架",
482
+ "inference_time": 0.347,
483
+ "predicts": "建议行冠状动脉、造影检查,必要时植入支架。"
484
+ },
485
+ {
486
+ "index": 70,
487
+ "audio_path": "70.wav",
488
+ "reference": "患者需低盐低脂糖尿病饮食,监测血压、血糖变化",
489
+ "inference_time": 0.41,
490
+ "predicts": "患者需低盐、低脂糖尿病饮食监测、血压、血糖变化。"
491
+ },
492
+ {
493
+ "index": 71,
494
+ "audio_path": "71.wav",
495
+ "reference": "胸部CT平扫显示:双肺散在磨玻璃样密度影,以胸膜下分布为主",
496
+ "inference_time": 0.529,
497
+ "predicts": "胸部ct平扫显示,双肺散在膜玻璃样密度影以胸膜下分布为主。"
498
+ },
499
+ {
500
+ "index": 72,
501
+ "audio_path": "72.wav",
502
+ "reference": "知识产权归属约定:乙方在履行本合同过程中所产生的全部智力成果,其知识产权均归甲方所有",
503
+ "inference_time": 0.633,
504
+ "predicts": "知识产权归属约定,乙方在履行本合同过程中所产生的全部智力成果,其知识产权归甲方所有。"
505
+ },
506
+ {
507
+ "index": 73,
508
+ "audio_path": "73.wav",
509
+ "reference": "双方因履行本合同发生争议的,应首先通过友好协商解决;协商不成的,任何一方均有权向有管辖权的人民法院提起诉讼",
510
+ "inference_time": 0.754,
511
+ "predicts": "双方应履行本合同发生争议的应首先通过友好协商解决,协商不成的,任何一方均有权向有管辖权的人民法院提起诉讼。"
512
+ },
513
+ {
514
+ "index": 74,
515
+ "audio_path": "74.wav",
516
+ "reference": "被告在法定期限内未提交答辩状,亦未到庭参加诉讼,本院依法缺席审理",
517
+ "inference_time": 0.542,
518
+ "predicts": "被告在法定期限内未提交答辩状,亦未到庭参加诉讼,本院依法缺席审理。"
519
+ },
520
+ {
521
+ "index": 75,
522
+ "audio_path": "75.wav",
523
+ "reference": "原告向本院提出诉讼请求:一、判令被告支付货款人民币五十万八千元及逾期付款利息;二、判令被告承担本案的诉讼费用",
524
+ "inference_time": 0.838,
525
+ "predicts": "原告向本院提出诉讼请求。一、判令被告支付货款人民币五十万八千元及逾期付款利息。二、判令被告承担本案的诉讼费用。"
526
+ },
527
+ {
528
+ "index": 76,
529
+ "audio_path": "76.wav",
530
+ "reference": "被执行人未按执行通知履行法律文书确定的义务,人民法院有权查封、扣押、冻结、拍卖被执行人的财产",
531
+ "inference_time": 0.677,
532
+ "predicts": "被执行人未按执行通知履行法律文书确定的义务,人民法院有权查封、扣押、冻结、拍卖被执行人的财产。"
533
+ },
534
+ {
535
+ "index": 77,
536
+ "audio_path": "77.wav",
537
+ "reference": "这个视频太上头了!",
538
+ "inference_time": 0.167,
539
+ "predicts": "这个视频太上头了。"
540
+ },
541
+ {
542
+ "index": 78,
543
+ "audio_path": "78.wav",
544
+ "reference": "他真是个社恐。",
545
+ "inference_time": 0.171,
546
+ "predicts": "他真是个社恐。"
547
+ },
548
+ {
549
+ "index": 79,
550
+ "audio_path": "79.wav",
551
+ "reference": "简直了,这躺平的状态也太佛系了吧。",
552
+ "inference_time": 0.253,
553
+ "predicts": "简直了,这躺平的状态也太佛系了吧。"
554
+ },
555
+ {
556
+ "index": 80,
557
+ "audio_path": "80.wav",
558
+ "reference": "这个瓜有点大,我得去吃瓜了。",
559
+ "inference_time": 0.245,
560
+ "predicts": "这个瓜有点大我得去吃瓜了。"
561
+ },
562
+ {
563
+ "index": 81,
564
+ "audio_path": "81.wav",
565
+ "reference": "别内卷了,咱们还是多交流交流吧。",
566
+ "inference_time": 0.24,
567
+ "predicts": "别内卷了,咱们还是多交流交流吧。"
568
+ },
569
+ {
570
+ "index": 82,
571
+ "audio_path": "82.wav",
572
+ "reference": "第二个公司我们成立了中国黄页,在中国黄页的创业经验中有很多的经验,也是可以在这儿跟大家进行分享的",
573
+ "inference_time": 0.592,
574
+ "predicts": "第二个公司,我们成立了中国黄页。在中国黄页的创业经验中有很多的经验,也是可以在这跟大家进行分享的。"
575
+ },
576
+ {
577
+ "index": 83,
578
+ "audio_path": "83.wav",
579
+ "reference": "对于大部分在接触微积分之前,主要的学习经验就是刷题,甚至是连题也不刷的同学们来说。",
580
+ "inference_time": 0.553,
581
+ "predicts": "对于大部分在接触微积分之前,主要的学习经验就是刷题,甚至是连题也不刷的同学们来说。"
582
+ },
583
+ {
584
+ "index": 84,
585
+ "audio_path": "84.wav",
586
+ "reference": "说了两个小时没人听懂我在说什么,最后二十三个人反对一个人同意,这一个人就说马云你这样做,你就试试看,不行的话,赶紧逃回来,还来得及。",
587
+ "inference_time": 0.78,
588
+ "predicts": "说了两个小时没人听懂,我在说什么,最后二十三个人反对一个人同意这一个人就说马云你这样做,你就试试看,不行的话,赶紧逃回来,还来得及。"
589
+ },
590
+ {
591
+ "index": 85,
592
+ "audio_path": "85.wav",
593
+ "reference": "晚上想想是热血沸腾,真好,第二天早上骑个自行车又上班去了,对吧?",
594
+ "inference_time": 0.43,
595
+ "predicts": "晚上想想是热血沸腾真好,第二天早上骑个自行车又上班去了,对吧?"
596
+ }
597
+ ]
reports/asr_result_funasr_quant_wenet_net.json ADDED
The diff for this file is too large to render. See raw diff
 
reports/asr_result_whisper_finetuned_librispeech_clean.json ADDED
The diff for this file is too large to render. See raw diff
 
reports/asr_result_whisper_finetuned_recording.json ADDED
@@ -0,0 +1,597 @@
1
+ [
2
+ {
3
+ "index": 1,
4
+ "audio_path": "1.wav",
5
+ "reference": "您这车开得真够稳的,蜗牛都超车了。",
6
+ "inference_time": 3.399,
7
+ "predicts": "您这车开得真够稳的蜗牛都超车了"
8
+ },
9
+ {
10
+ "index": 2,
11
+ "audio_path": "2.wav",
12
+ "reference": "你可真是个天才,把这么简单的事情都搞砸了。",
13
+ "inference_time": 1.123,
14
+ "predicts": "你可真是个天才把这么简单的事情都搞砸了"
15
+ },
16
+ {
17
+ "index": 3,
18
+ "audio_path": "3.wav",
19
+ "reference": "因网络问题,远程讲者音频中断,我们切到备用方案共享幻灯片。",
20
+ "inference_time": 1.277,
21
+ "predicts": "因网络问题远程讲着音频中断我们切到备用方案共享幻灯片"
22
+ },
23
+ {
24
+ "index": 4,
25
+ "audio_path": "4.wav",
26
+ "reference": "这就好比‘用大炮打蚊子’,资源分配严重不均衡,需要细粒度调度。",
27
+ "inference_time": 1.309,
28
+ "predicts": "这就好比用大炮打蚊子资源分配严重不均衡需要细腻度调度"
29
+ },
30
+ {
31
+ "index": 5,
32
+ "audio_path": "5.wav",
33
+ "reference": "言归正传,我们来讨论一下核心问题。",
34
+ "inference_time": 1.096,
35
+ "predicts": "言归正传我们来讨论一下核心问题"
36
+ },
37
+ {
38
+ "index": 6,
39
+ "audio_path": "6.wav",
40
+ "reference": "这有点超出了我们今天讨论的范围。",
41
+ "inference_time": 1.08,
42
+ "predicts": "这有点超出了我们今天讨论的范围"
43
+ },
44
+ {
45
+ "index": 7,
46
+ "audio_path": "7.wav",
47
+ "reference": "他的年收入约为一百万人民币。",
48
+ "inference_time": 1.051,
49
+ "predicts": "它的年收入约为一百万人民币"
50
+ },
51
+ {
52
+ "index": 8,
53
+ "audio_path": "8.wav",
54
+ "reference": "下一位演讲者是来自斯坦福大学计算机科学系的张明教授",
55
+ "inference_time": 1.179,
56
+ "predicts": "下一位演讲者是来自斯坦福大学计算机科学系的张明教授"
57
+ },
58
+ {
59
+ "index": 9,
60
+ "audio_path": "9.wav",
61
+ "reference": "请各位将手机调至静音模式,演讲结束后有十五分钟提问时间",
62
+ "inference_time": 1.201,
63
+ "predicts": "请各位将手机调至静音模式演讲结束后有十五分钟提问时间"
64
+ },
65
+ {
66
+ "index": 10,
67
+ "audio_path": "10.wav",
68
+ "reference": "茶歇将在十点三十分开始,地点在二楼休息区",
69
+ "inference_time": 1.165,
70
+ "predicts": "茶蝎将在十点三十分开始地联在二楼休息区"
71
+ },
72
+ {
73
+ "index": 11,
74
+ "audio_path": "11.wav",
75
+ "reference": "本合同自双方签字盖章之日起生效。",
76
+ "inference_time": 1.118,
77
+ "predicts": "本合同自双方签字盖章之日起盛效"
78
+ },
79
+ {
80
+ "index": 12,
81
+ "audio_path": "12.wav",
82
+ "reference": "政府正在采取措施刺激经济增长和创造就业机会。",
83
+ "inference_time": 1.219,
84
+ "predicts": "政府正在采取措施刺激经济增长和创造就业机会"
85
+ },
86
+ {
87
+ "index": 13,
88
+ "audio_path": "13.wav",
89
+ "reference": "今天中午我们点外卖吧,我想吃宫保鸡丁和麻婆豆腐。",
90
+ "inference_time": 1.172,
91
+ "predicts": "今天中午我们点外卖吧我想吃宫保鸡丁和麻婆豆腐"
92
+ },
93
+ {
94
+ "index": 14,
95
+ "audio_path": "14.wav",
96
+ "reference": "医生建议我多吃粗粮,比如燕麦和紫薯,少吃高油高糖的油炸食品,像炸鸡和甜甜圈。",
97
+ "inference_time": 1.463,
98
+ "predicts": "医生建议我多吃粗粮比如燕麦和紫薯少吃高油高糖的油炸食品像炸鸡和甜甜圈"
99
+ },
100
+ {
101
+ "index": 15,
102
+ "audio_path": "15.wav",
103
+ "reference": "本方案采用异构计算架构,结合GPU与FPGA加速张量运算。",
104
+ "inference_time": 1.228,
105
+ "predicts": "本方案采用易构计算架构结合GPU与加速张量运算"
106
+ },
107
+ {
108
+ "index": 16,
109
+ "audio_path": "16.wav",
110
+ "reference": "推理延迟稳定在5毫秒以内,吞吐量提升40%,功耗降低1.8瓦。",
111
+ "inference_time": 1.346,
112
+ "predicts": "推拟延迟稳定在五毫秒以内吞吐量提升百分之四十功耗降低一点八瓦"
113
+ },
114
+ {
115
+ "index": 17,
116
+ "audio_path": "17.wav",
117
+ "reference": "相比SOTA模型,我们的方法在小样本场景下召回率高出12个百分点,且参数量仅为其三分之一。",
118
+ "inference_time": 1.366,
119
+ "predicts": "相比模型我们的方法在小样本场景下召回率高出十二个百分点且参数量仅为其三分之一"
120
+ },
121
+ {
122
+ "index": 18,
123
+ "audio_path": "18.wav",
124
+ "reference": "当节点并发数超过10,000时,内存带宽会成瓶颈,导致尾延迟急剧上升。",
125
+ "inference_time": 1.371,
126
+ "predicts": "当结点并发数超过一万时内存带宽会呈瓶颈导致尾延尺急剧上升"
127
+ },
128
+ {
129
+ "index": 19,
130
+ "audio_path": "19.wav",
131
+ "reference": "请看图3:蓝色柱是基线模型,红色线是我们的优化结果,交叉点表明在第15轮迭代后优势显著。",
132
+ "inference_time": 1.519,
133
+ "predicts": "请看图三蓝色柱是基线模型红色线是我们的优化结果交叉点表明在第十五轮迭代后优势显著"
134
+ },
135
+ {
136
+ "index": 20,
137
+ "audio_path": "20.wav",
138
+ "reference": "您提到模型泛化性提升,是否在跨模态数据上验证过?量化指标是多少?",
139
+ "inference_time": 1.273,
140
+ "predicts": "您提到模型泛化性提升是否在跨模态数据上验证过量化指标是多少"
141
+ },
142
+ {
143
+ "index": 21,
144
+ "audio_path": "21.wav",
145
+ "reference": "容我追问一点:如果输入数据存在对抗样本,您方案的鲁棒性如何保证?失效概率有测试吗?",
146
+ "inference_time": 1.381,
147
+ "predicts": "容我追问一点如果输入数据存在对抗样本您方案的鲁棒性如何保证失效概率有测试吗"
148
+ },
149
+ {
150
+ "index": 22,
151
+ "audio_path": "22.wav",
152
+ "reference": "抱歉打断,您说的‘动态剪枝’是指训练中还是推理中?阈值是自适应的吗?",
153
+ "inference_time": 1.302,
154
+ "predicts": "抱歉打断您说的动态简直是指训练中还是推理中预值是自适应的吗"
155
+ },
156
+ {
157
+ "index": 23,
158
+ "audio_path": "23.wav",
159
+ "reference": "成本效益是否可行?部署这种定制芯片需要流片费用,中小客户怎么承担?",
160
+ "inference_time": 1.299,
161
+ "predicts": "成本效益是否可行部署这种定制芯片需要留片费用中小客户怎么承担"
162
+ },
163
+ {
164
+ "index": 24,
165
+ "audio_path": "24.wav",
166
+ "reference": "我补充一个角度:隐私计算领域也在用类似思路,是否可能跨领域合作?",
167
+ "inference_time": 1.322,
168
+ "predicts": "我补充一个角度隐私计算领域也在用类似思路是否可能跨领域合作"
169
+ },
170
+ {
171
+ "index": 25,
172
+ "audio_path": "25.wav",
173
+ "reference": "端到端优化流水线涵盖数据清洗特征工程模型压缩部署监控。",
174
+ "inference_time": 1.318,
175
+ "predicts": "端到端优化流水线涵盖数据清洗特征工程模型压缩部署监控"
176
+ },
177
+ {
178
+ "index": 26,
179
+ "audio_path": "26.wav",
180
+ "reference": "请检查代码库中的依赖项冲突问题。",
181
+ "inference_time": 1.123,
182
+ "predicts": "请检查代码库中的依赖项冲突问题"
183
+ },
184
+ {
185
+ "index": 27,
186
+ "audio_path": "27.wav",
187
+ "reference": "这个电路板集成了微控制器、传感器和无线通信模块。",
188
+ "inference_time": 1.131,
189
+ "predicts": "这个电路板集成了微控制器传感器和无线通信模块"
190
+ },
191
+ {
192
+ "index": 28,
193
+ "audio_path": "28.wav",
194
+ "reference": "区块链技术的核心在于去中心化和加密安全性。",
195
+ "inference_time": 1.159,
196
+ "predicts": "区困链技术的核心在于去中心化和加密安全性"
197
+ },
198
+ {
199
+ "index": 29,
200
+ "audio_path": "29.wav",
201
+ "reference": "需符合等保三级要求,数据出境要走安全评估。",
202
+ "inference_time": 1.146,
203
+ "predicts": "需符合等保三级要求数据出境要走安全评估"
204
+ },
205
+ {
206
+ "index": 30,
207
+ "audio_path": "30.wav",
208
+ "reference": "最新数据显示,消费者信心指数有所回升。",
209
+ "inference_time": 1.105,
210
+ "predicts": "最新数据显示消费者信心指数有所回升"
211
+ },
212
+ {
213
+ "index": 31,
214
+ "audio_path": "31.wav",
215
+ "reference": "专家预测,新能源汽车市场将迎来爆发式增长。",
216
+ "inference_time": 1.154,
217
+ "predicts": "专家预测新能源汽车市场将迎来爆发式增长"
218
+ },
219
+ {
220
+ "index": 32,
221
+ "audio_path": "32.wav",
222
+ "reference": "多模态大模型能够同时处理文本、图像和音频信号,实现真正的跨模态理解",
223
+ "inference_time": 1.299,
224
+ "predicts": "多模态大模型能够同时处理文本图像和音频信号实现真正的跨模态理解"
225
+ },
226
+ {
227
+ "index": 33,
228
+ "audio_path": "33.wav",
229
+ "reference": "联邦学习可以在保护用户隐私的前提下,实现分布式模型的协同训练",
230
+ "inference_time": 1.331,
231
+ "predicts": "联邦学习可以在保护用户隐私的前提下实现分布式模型的协同训练"
232
+ },
233
+ {
234
+ "index": 34,
235
+ "audio_path": "34.wav",
236
+ "reference": "我们研发的五十量子比特处理器在特定算法上展现出了量子优越性",
237
+ "inference_time": 1.242,
238
+ "predicts": "我们研发的五十量子比特处理器在特定算法上展现出了量子优越性"
239
+ },
240
+ {
241
+ "index": 35,
242
+ "audio_path": "35.wav",
243
+ "reference": "变分量子本征值求解器在化学模拟中显示出巨大潜力",
244
+ "inference_time": 1.217,
245
+ "predicts": "变分量子本真值求解器在化学模拟中显示出巨大潜力"
246
+ },
247
+ {
248
+ "index": 36,
249
+ "audio_path": "36.wav",
250
+ "reference": "边缘AI推理的延迟已经可以控制在十毫秒以内,满足实时应用需求",
251
+ "inference_time": 1.229,
252
+ "predicts": "边缘AI推理的延迟已经可以控制在十毫秒以内满足实时应用需求"
253
+ },
254
+ {
255
+ "index": 37,
256
+ "audio_path": "37.wav",
257
+ "reference": "我们设计的新型神经网络压缩算法在保持精度的同时将模型大小减少了百分之七十",
258
+ "inference_time": 1.33,
259
+ "predicts": "我们设计的新型神经网诺压缩算法在保持精度的同时将模型大小减少了七十"
260
+ },
261
+ {
262
+ "index": 38,
263
+ "audio_path": "38.wav",
264
+ "reference": "损失函数结合了交叉熵损失和对比损失,权重系数设置为零点三和零点七",
265
+ "inference_time": 1.357,
266
+ "predicts": "损失函数结合了交叉伤损失和对比损失权重系数设置为零点三和零点七"
267
+ },
268
+ {
269
+ "index": 39,
270
+ "audio_path": "39.wav",
271
+ "reference": "在五个标准数据集上的实验结果表明,我们的方法平均比现有最佳方法提升了二点三个百分点",
272
+ "inference_time": 1.425,
273
+ "predicts": "在五个标准数据级上的实验结果表明我们的方法平均比现有最佳方法提升了二点三个百分点"
274
+ },
275
+ {
276
+ "index": 40,
277
+ "audio_path": "40.wav",
278
+ "reference": "消融实验证实了每个模块的有效性,移除注意力机制会导致性能下降百分之四点六",
279
+ "inference_time": 1.335,
280
+ "predicts": "消融实验证实了每个模块的有效性移除注意力机制会导致性能下降百分之四点六"
281
+ },
282
+ {
283
+ "index": 41,
284
+ "audio_path": "41.wav",
285
+ "reference": "模型的参数量为一点二亿,在八张A100显卡上训练了七十二小时",
286
+ "inference_time": 1.267,
287
+ "predicts": "模型的参数量为一点二亿在八张A一百显卡上训练了七十二小时"
288
+ },
289
+ {
290
+ "index": 42,
291
+ "audio_path": "42.wav",
292
+ "reference": "总之,搞定了数据倾斜,性能就上去了。",
293
+ "inference_time": 1.109,
294
+ "predicts": "总之搞定了数据倾斜性能就上去了"
295
+ },
296
+ {
297
+ "index": 43,
298
+ "audio_path": "43.wav",
299
+ "reference": "openai-whisper 需要 ffmpeg 的环境,ffmpeg 是一个开源的跨平台音视频处理工具和框架,可以用来录制、转换和流式传输音视频内容 。",
300
+ "inference_time": 1.583,
301
+ "predicts": "欧本需要配格的环境配格是一个开源的跨平台音视频处理工具和框架可以用来录制转换和流逝传输音视频内容"
302
+ },
303
+ {
304
+ "index": 44,
305
+ "audio_path": "44.wav",
306
+ "reference": "采用 Transformer 序列到序列模型可以实现针对不同的语言处理任务。",
307
+ "inference_time": 1.277,
308
+ "predicts": "采用参设计计到计计模型可以实现针对不同的语言处理任务"
309
+ },
310
+ {
311
+ "index": 45,
312
+ "audio_path": "45.wav",
313
+ "reference": "Transformer架构在自然语言处理中的成功应用已经彻底改变了预训练模型的范式",
314
+ "inference_time": 1.348,
315
+ "predicts": "架架构在自然语言处理中的成功应用已经彻底改变了预训练模型的范式"
316
+ },
317
+ {
318
+ "index": 46,
319
+ "audio_path": "46.wav",
320
+ "reference": "请大家注意,Workshop材料已经上传至会议系统,代码仓库链接在附录页。",
321
+ "inference_time": 1.282,
322
+ "predicts": "请大家注意材料已经上传至会议系统代码仓库链接在附录页"
323
+ },
324
+ {
325
+ "index": 47,
326
+ "audio_path": "47.wav",
327
+ "reference": "别造轮子了!直接调用yolov5开源库的预训练权重,快速迭代才是王道。",
328
+ "inference_time": 1.353,
329
+ "predicts": "别造轮子了直接调用优罗V五开元库的预训练权种快速迭带才是王道"
330
+ },
331
+ {
332
+ "index": 48,
333
+ "audio_path": "48.wav",
334
+ "reference": "请确保您的设备已连接到5GHz Wi-Fi频段以获得最佳性能。",
335
+ "inference_time": 1.284,
336
+ "predicts": "请确保您的设备已连接到五G赫兹WiFi频段易获得最佳性能"
337
+ },
338
+ {
339
+ "index": 49,
340
+ "audio_path": "49.wav",
341
+ "reference": "我们提出了一种基于对比学习的自监督方法,在ImageNet数据集上达到了百分之九十二点五的准确率",
342
+ "inference_time": 1.413,
343
+ "predicts": "我们提出了一种基于对比学习的自监督方法在ImageNet数据集上达到了九百二十五的转确率"
344
+ },
345
+ {
346
+ "index": 50,
347
+ "audio_path": "50.wav",
348
+ "reference": "GPT-4在代码生成和多步推理任务上展现出令人印象深刻的能力",
349
+ "inference_time": 1.238,
350
+ "predicts": "基比利富在代码生成和多部推理任务上展现出令人印象深刻的伦理"
351
+ },
352
+ {
353
+ "index": 51,
354
+ "audio_path": "51.wav",
355
+ "reference": "若在高并发场景下未启用我们的动态缓存机制,即使用RDMA网络,延迟也可能因CPU调度争抢而恶化。",
356
+ "inference_time": 1.513,
357
+ "predicts": "若在高并发场景下为启用我们的动态缓存机制即使用DM网络延迟也可能因CPU调度争抢而恶化"
358
+ },
359
+ {
360
+ "index": 52,
361
+ "audio_path": "52.wav",
362
+ "reference": "准确说,峰值算力是200 TOPS,不是刚才说的150。",
363
+ "inference_time": 1.145,
364
+ "predicts": "准确说峰值算力是两百特不是刚才说的一百五"
365
+ },
366
+ {
367
+ "index": 53,
368
+ "audio_path": "53.wav",
369
+ "reference": "秦始皇嬴政书同文,车同轨,奠定了中国大一统的基础。",
370
+ "inference_time": 1.251,
371
+ "predicts": "秦始皇赢政书同文车同轨奠定了中国大一统的基础"
372
+ },
373
+ {
374
+ "index": 54,
375
+ "audio_path": "54.wav",
376
+ "reference": "诸葛亮在《出师表》中写道‘鞠躬尽瘁,死而后已’,成为后世臣子的楷模。",
377
+ "inference_time": 1.32,
378
+ "predicts": "诸葛亮在出师表中写道鞠躬尽瘁死而后已成为后世臣子的楷模"
379
+ },
380
+ {
381
+ "index": 55,
382
+ "audio_path": "55.wav",
383
+ "reference": "李白的‘举头望明月,低头思故乡’是连三岁孩童都能背诵的诗句。",
384
+ "inference_time": 1.285,
385
+ "predicts": "李白的举头望明月低头思故乡是连三岁孩童都能背诵的诗句"
386
+ },
387
+ {
388
+ "index": 56,
389
+ "audio_path": "56.wav",
390
+ "reference": "孔子曾说‘己所不欲,勿施于人’,这简单的八个字构成了儒家伦理的基石。",
391
+ "inference_time": 1.337,
392
+ "predicts": "孔子曾说己所不欲勿施于人这简单的八个字构成了儒家伦理的基石"
393
+ },
394
+ {
395
+ "index": 57,
396
+ "audio_path": "57.wav",
397
+ "reference": "王羲之被后人尊为‘书圣’,其代表作《兰亭集序》被誉为天下第一行书。",
398
+ "inference_time": 1.285,
399
+ "predicts": "王羲之背后仁尊为书圣其代表作南庭集序被誉为天下第一行书"
400
+ },
401
+ {
402
+ "index": 58,
403
+ "audio_path": "58.wav",
404
+ "reference": "我们要学习庖丁解牛的精神,掌握事物的客观规律,才能游刃有余。",
405
+ "inference_time": 1.292,
406
+ "predicts": "我们要学习刨丁解牛的精神掌握事物的客观规律才能游刃有余"
407
+ },
408
+ {
409
+ "index": 59,
410
+ "audio_path": "59.wav",
411
+ "reference": "项羽在鸿门宴上优柔寡断,放走了刘邦,最终兵败乌江自刎。",
412
+ "inference_time": 1.338,
413
+ "predicts": "项羽在鸿门宴上优柔寡断放走了刘邦最终兵败乌江自吻"
414
+ },
415
+ {
416
+ "index": 60,
417
+ "audio_path": "60.wav",
418
+ "reference": "在杭州西湖畔,人们总会想起苏轼治理西湖、修筑苏堤的往事。",
419
+ "inference_time": 1.284,
420
+ "predicts": "在杭州西湖畔人们总会想起苏轼治理西湖修筑苏堤的往事"
421
+ },
422
+ {
423
+ "index": 61,
424
+ "audio_path": "61.wav",
425
+ "reference": "我计划去埃菲尔铁塔和卢浮宫参观。",
426
+ "inference_time": 1.166,
427
+ "predicts": "我计划去爱菲尔铁塔和卢浮宫参观"
428
+ },
429
+ {
430
+ "index": 62,
431
+ "audio_path": "62.wav",
432
+ "reference": "莎士比亚的戏剧深刻地探讨了人性的复杂性。",
433
+ "inference_time": 1.181,
434
+ "predicts": "莎士比亚的戏剧深刻地探讨了人性的复杂性"
435
+ },
436
+ {
437
+ "index": 63,
438
+ "audio_path": "63.wav",
439
+ "reference": "这个消息让他丈二和尚摸不着头脑。",
440
+ "inference_time": 1.074,
441
+ "predicts": "这个消息让他张儿和尚摸不着头脑"
442
+ },
443
+ {
444
+ "index": 64,
445
+ "audio_path": "64.wav",
446
+ "reference": "画龙点睛之笔,让整个设计焕然一新。",
447
+ "inference_time": 1.112,
448
+ "predicts": "画龙点睛之笔让整个设计焕然一新"
449
+ },
450
+ {
451
+ "index": 65,
452
+ "audio_path": "65.wav",
453
+ "reference": "这是卡脖子技术,必须自主研发!",
454
+ "inference_time": 1.081,
455
+ "predicts": "这是卡帛子技术必须自主研发"
456
+ },
457
+ {
458
+ "index": 66,
459
+ "audio_path": "66.wav",
460
+ "reference": "这幅画作以其独特的色彩运用和构图技巧而闻名。",
461
+ "inference_time": 1.188,
462
+ "predicts": "这幅画作以其独特的色彩运用和构图技巧而闻名"
463
+ },
464
+ {
465
+ "index": 67,
466
+ "audio_path": "67.wav",
467
+ "reference": "患者主诉间歇性胸痛,放射至左臂。",
468
+ "inference_time": 1.108,
469
+ "predicts": "患者主速间歇性胸痛放射至左臂"
470
+ },
471
+ {
472
+ "index": 68,
473
+ "audio_path": "68.wav",
474
+ "reference": "患者有冠状动脉粥样硬化性心脏病病史十年,慢性阻塞性肺疾病病史五年",
475
+ "inference_time": 1.365,
476
+ "predicts": "患者有冠状动脉周样硬化性心脏病病史十年慢性阻塞性肺疾病病史五年"
477
+ },
478
+ {
479
+ "index": 69,
480
+ "audio_path": "69.wav",
481
+ "reference": "建议行冠状动脉造影检查,必要时植入支架",
482
+ "inference_time": 1.168,
483
+ "predicts": "建议行冠状动脉噪影检查必要时植入支降"
484
+ },
485
+ {
486
+ "index": 70,
487
+ "audio_path": "70.wav",
488
+ "reference": "患者需低盐低脂糖尿病饮食,监测血压、血糖变化",
489
+ "inference_time": 1.22,
490
+ "predicts": "患者需低盐低脂糖尿病饮食监测血压血糖变化"
491
+ },
492
+ {
493
+ "index": 71,
494
+ "audio_path": "71.wav",
495
+ "reference": "胸部CT平扫显示:双肺散在磨玻璃样密度影,以胸膜下分布为主",
496
+ "inference_time": 1.351,
497
+ "predicts": "胸部CT频扫显示双肺散在膜玻璃一样密度硬以胸膜下分布为主"
498
+ },
499
+ {
500
+ "index": 72,
501
+ "audio_path": "72.wav",
502
+ "reference": "知识产权归属约定:乙方在履行本合同过程中所产生的全部智力成果,其知识产权均归甲方所有",
503
+ "inference_time": 1.501,
504
+ "predicts": "知识产权归属约定一方在履行本合同过程中所产生的全部致内成果其知识产权归甲方所有"
505
+ },
506
+ {
507
+ "index": 73,
508
+ "audio_path": "73.wav",
509
+ "reference": "双方因履行本合同发生争议的,应首先通过友好协商解决;协商不成的,任何一方均有权向有管辖权的人民法院提起诉讼",
510
+ "inference_time": 1.608,
511
+ "predicts": "双方应履行本合同发生争议的应首先通过友好协商解决协商不成的任何一方均有权向有管辖权的人民法院提起诉讼"
512
+ },
513
+ {
514
+ "index": 74,
515
+ "audio_path": "74.wav",
516
+ "reference": "被告在法定期限内未提交答辩状,亦未到庭参加诉讼,本院依法缺席审理",
517
+ "inference_time": 1.326,
518
+ "predicts": "被告在法定期限内未提交答辩状意未到庭参加诉讼本院依法缺席审理"
519
+ },
520
+ {
521
+ "index": 75,
522
+ "audio_path": "75.wav",
523
+ "reference": "原告向本院提出诉讼请求:一、判令被告支付货款人民币五十万八千元及逾期付款利息;二、判令被告承担本案的诉讼费用",
524
+ "inference_time": 1.591,
525
+ "predicts": "原告向本院提出诉讼请求一判令被告支付货款人民币五十万八千元及逾期付款利息二判令被告承担本案的诉讼费用"
526
+ },
527
+ {
528
+ "index": 76,
529
+ "audio_path": "76.wav",
530
+ "reference": "被执行人未按执行通知履行法律文书确定的义务,人民法院有权查封、扣押、冻结、拍卖被执行人的财产",
531
+ "inference_time": 1.568,
532
+ "predicts": "被执行人未按执行通知履行法律文书确定的义务人民法院有权查封扣押冻结拍卖被执行人的财产"
533
+ },
534
+ {
535
+ "index": 77,
536
+ "audio_path": "77.wav",
537
+ "reference": "这个视频太上头了!",
538
+ "inference_time": 0.988,
539
+ "predicts": "这个视频太上头了"
540
+ },
541
+ {
542
+ "index": 78,
543
+ "audio_path": "78.wav",
544
+ "reference": "他真是个社恐。",
545
+ "inference_time": 0.951,
546
+ "predicts": "他真是个社恐"
547
+ },
548
+ {
549
+ "index": 79,
550
+ "audio_path": "79.wav",
551
+ "reference": "简直了,这躺平的状态也太佛系了吧。",
552
+ "inference_time": 1.083,
553
+ "predicts": "简直了这躺平的状态也太佛系了吧"
554
+ },
555
+ {
556
+ "index": 80,
557
+ "audio_path": "80.wav",
558
+ "reference": "这个瓜有点大,我得去吃瓜了。",
559
+ "inference_time": 1.029,
560
+ "predicts": "这个瓜有点大我得去吃瓜了"
561
+ },
562
+ {
563
+ "index": 81,
564
+ "audio_path": "81.wav",
565
+ "reference": "别内卷了,咱们还是多交流交流吧。",
566
+ "inference_time": 1.049,
567
+ "predicts": "别内卷了咱们还是多交流交流吧"
568
+ },
569
+ {
570
+ "index": 82,
571
+ "audio_path": "82.wav",
572
+ "reference": "第二个公司我们成立了中国黄页,在中国黄页的创业经验中有很多的经验,也是可以在这儿跟大家进行分享的",
573
+ "inference_time": 1.327,
574
+ "predicts": "第二个公司我们成立了中国房业在中国房业的创业经验中有很多的经验也是可以在这儿跟大家进行分享的"
575
+ },
576
+ {
577
+ "index": 83,
578
+ "audio_path": "83.wav",
579
+ "reference": "对于大部分在接触微积分之前,主要的学习经验就是刷题,甚至是连题也不刷的同学们来说。",
580
+ "inference_time": 1.359,
581
+ "predicts": "对于大部分在接触微积分之前主要的学习经验就是刷题甚至是连题也不刷的同学们来说"
582
+ },
583
+ {
584
+ "index": 84,
585
+ "audio_path": "84.wav",
586
+ "reference": "说了两个小时没人听懂我在说什么,最后二十三个人反对一个人同意,这一个人就说马云你这样做,你就试试看,不行的话,赶紧逃回来,还来得及。",
587
+ "inference_time": 1.588,
588
+ "predicts": "说了两个小时没人听懂我在说什么最后二十三个人反对一个人同意这一个人就说马云你这样做你就试试看不行的话赶紧逃回来还来得及"
589
+ },
590
+ {
591
+ "index": 85,
592
+ "audio_path": "85.wav",
593
+ "reference": "晚上想想是热血沸腾,真好,第二天早上骑个自行车又上班去了,对吧?",
594
+ "inference_time": 1.237,
595
+ "predicts": "晚上想想是热血沸腾真好第二天早上骑个自行车又上班去了对吧"
596
+ }
597
+ ]
reports/asr_result_whisper_finetuned_wenet_net.json ADDED
The diff for this file is too large to render. See raw diff
 
reports/asr_result_whisper_librispeech_clean.json ADDED
The diff for this file is too large to render. See raw diff
 
reports/asr_result_whisper_recording.json ADDED
@@ -0,0 +1,597 @@
1
+ [
2
+ {
3
+ "index": 1,
4
+ "audio_path": "1.wav",
5
+ "reference": "您这车开得真够稳的,蜗牛都超车了。",
6
+ "inference_time": 0.955,
7
+ "predicts": "您这车开得真够稳的,蜗牛都超车了。"
8
+ },
9
+ {
10
+ "index": 2,
11
+ "audio_path": "2.wav",
12
+ "reference": "你可真是个天才,把这么简单的事情都搞砸了。",
13
+ "inference_time": 0.777,
14
+ "predicts": "你可真是个天才,把这么简单的事情都搞砸了。"
15
+ },
16
+ {
17
+ "index": 3,
18
+ "audio_path": "3.wav",
19
+ "reference": "因网络问题,远程讲者音频中断,我们切到备用方案共享幻灯片。",
20
+ "inference_time": 0.831,
21
+ "predicts": "因网络问题,远程讲着音频中断,我们切到备用方案,共享幻灯片。"
22
+ },
23
+ {
24
+ "index": 4,
25
+ "audio_path": "4.wav",
26
+ "reference": "这就好比‘用大炮打蚊子’,资源分配严重不均衡,需要细粒度调度。",
27
+ "inference_time": 0.834,
28
+ "predicts": "这就好比用大炮打蚊子。资源分配严重不均衡,需要细腻度调度。"
29
+ },
30
+ {
31
+ "index": 5,
32
+ "audio_path": "5.wav",
33
+ "reference": "言归正传,我们来讨论一下核心问题。",
34
+ "inference_time": 0.768,
35
+ "predicts": "言归正传,我们来讨论一下核心问题。"
36
+ },
37
+ {
38
+ "index": 6,
39
+ "audio_path": "6.wav",
40
+ "reference": "这有点超出了我们今天讨论的范围。",
41
+ "inference_time": 0.771,
42
+ "predicts": "这有点超出了我们今天讨论的范围。"
43
+ },
44
+ {
45
+ "index": 7,
46
+ "audio_path": "7.wav",
47
+ "reference": "他的年收入约为一百万人民币。",
48
+ "inference_time": 0.731,
49
+ "predicts": "他的年收入约为100万人民币。"
50
+ },
51
+ {
52
+ "index": 8,
53
+ "audio_path": "8.wav",
54
+ "reference": "下一位演讲者是来自斯坦福大学计算机科学系的张明教授",
55
+ "inference_time": 0.778,
56
+ "predicts": "下一位演讲者是来自斯坦福大学计算机科学系的张明教授。"
57
+ },
58
+ {
59
+ "index": 9,
60
+ "audio_path": "9.wav",
61
+ "reference": "请各位将手机调至静音模式,演讲结束后有十五分钟提问时间",
62
+ "inference_time": 0.799,
63
+ "predicts": "请各位将手机调至静音模式。演讲结束后有15分钟提问时间。"
64
+ },
65
+ {
66
+ "index": 10,
67
+ "audio_path": "10.wav",
68
+ "reference": "茶歇将在十点三十分开始,地点在二楼休息区",
69
+ "inference_time": 0.784,
70
+ "predicts": "茶歇将在10:30分开始,地联在二楼休息区。"
71
+ },
72
+ {
73
+ "index": 11,
74
+ "audio_path": "11.wav",
75
+ "reference": "本合同自双方签字盖章之日起生效。",
76
+ "inference_time": 0.766,
77
+ "predicts": "本合同自双方签字盖章之日起生效。"
78
+ },
79
+ {
80
+ "index": 12,
81
+ "audio_path": "12.wav",
82
+ "reference": "政府正在采取措施刺激经济增长和创造就业机会。",
83
+ "inference_time": 0.816,
84
+ "predicts": "政府正在采取措施刺激经济增长和创造就业机会。"
85
+ },
86
+ {
87
+ "index": 13,
88
+ "audio_path": "13.wav",
89
+ "reference": "今天中午我们点外卖吧,我想吃宫保鸡丁和麻婆豆腐。",
90
+ "inference_time": 0.79,
91
+ "predicts": "今天中午我们点外卖吧!我想吃公保鸡丁和麻婆豆腐。"
92
+ },
93
+ {
94
+ "index": 14,
95
+ "audio_path": "14.wav",
96
+ "reference": "医生建议我多吃粗粮,比如燕麦和紫薯,少吃高油高糖的油炸食品,像炸鸡和甜甜圈。",
97
+ "inference_time": 0.849,
98
+ "predicts": "医生建议我多吃粗粮,比如燕麦和紫薯,少吃高油高糖的油炸食品,像炸鸡和甜甜圈。"
99
+ },
100
+ {
101
+ "index": 15,
102
+ "audio_path": "15.wav",
103
+ "reference": "本方案采用异构计算架构,结合GPU与FPGA加速张量运算。",
104
+ "inference_time": 0.804,
105
+ "predicts": "本方案采用易购计算架构,结合GPU与FPGA加速张量运算。"
106
+ },
107
+ {
108
+ "index": 16,
109
+ "audio_path": "16.wav",
110
+ "reference": "推理延迟稳定在5毫秒以内,吞吐量提升40%,功耗降低1.8瓦。",
111
+ "inference_time": 0.83,
112
+ "predicts": "推拟延迟稳定在5毫秒以内,吞吐量提升40%,功耗降低1.8瓦。"
113
+ },
114
+ {
115
+ "index": 17,
116
+ "audio_path": "17.wav",
117
+ "reference": "相比SOTA模型,我们的方法在小样本场景下召回率高出12个百分点,且参数量仅为其三分之一。",
118
+ "inference_time": 0.858,
119
+ "predicts": "相比SOTA模型,我们的方法在想样本场景下召回率高出12个百分点,且参数量仅为其三分之一。"
120
+ },
121
+ {
122
+ "index": 18,
123
+ "audio_path": "18.wav",
124
+ "reference": "当节点并发数超过10,000时,内存带宽会成瓶颈,导致尾延迟急剧上升。",
125
+ "inference_time": 0.848,
126
+ "predicts": "当节点并发数超过1万时,内存带宽会成瓶颈,导致尾延尺急剧上升。"
127
+ },
128
+ {
129
+ "index": 19,
130
+ "audio_path": "19.wav",
131
+ "reference": "请看图3:蓝色柱是基线模型,红色线是我们的优化结果,交叉点表明在第15轮迭代后优势���著。",
132
+ "inference_time": 0.895,
133
+ "predicts": "请看图3,蓝色柱是基线模型,红色线是我们的优化结果。交叉点表明,在第15轮迭代后,优势显著。"
134
+ },
135
+ {
136
+ "index": 20,
137
+ "audio_path": "20.wav",
138
+ "reference": "您提到模型泛化性提升,是否在跨模态数据上验证过?量化指标是多少?",
139
+ "inference_time": 0.808,
140
+ "predicts": "您提到模型泛化性提升,是否在跨模态数据上验证过,量化指标是多少。"
141
+ },
142
+ {
143
+ "index": 21,
144
+ "audio_path": "21.wav",
145
+ "reference": "容我追问一点:如果输入数据存在对抗样本,您方案的鲁棒性如何保证?失效概率有测试吗?",
146
+ "inference_time": 0.843,
147
+ "predicts": "容我追问一点,如果输入数据存在对抗样本,您方案的鲁棒性如何保证?失效概率有测试吗?"
148
+ },
149
+ {
150
+ "index": 22,
151
+ "audio_path": "22.wav",
152
+ "reference": "抱歉打断,您说的‘动态剪枝’是指训练中还是推理中?阈值是自适应的吗?",
153
+ "inference_time": 0.839,
154
+ "predicts": "抱歉打断,您说的动态简直,是指训练中还是推理中,预值是自适应的吗?"
155
+ },
156
+ {
157
+ "index": 23,
158
+ "audio_path": "23.wav",
159
+ "reference": "成本效益是否可行?部署这种定制芯片需要流片费用,中小客户怎么承担?",
160
+ "inference_time": 0.835,
161
+ "predicts": "成本效益是否可行?部署这种定制芯片,需要牛片费用,中小客户怎么承担?"
162
+ },
163
+ {
164
+ "index": 24,
165
+ "audio_path": "24.wav",
166
+ "reference": "我补充一个角度:隐私计算领域也在用类似思路,是否可能跨领域合作?",
167
+ "inference_time": 0.843,
168
+ "predicts": "我补充一个角度,隐私计算领域也在用nace思路,是否可能跨领域合作。"
169
+ },
170
+ {
171
+ "index": 25,
172
+ "audio_path": "25.wav",
173
+ "reference": "端到端优化流水线涵盖数据清洗特征工程模型压缩部署监控。",
174
+ "inference_time": 0.841,
175
+ "predicts": "端到端优化流水线,涵盖数据清洗特征工程,模型压缩部署监控。"
176
+ },
177
+ {
178
+ "index": 26,
179
+ "audio_path": "26.wav",
180
+ "reference": "请检查代码库中的依赖项冲突问题。",
181
+ "inference_time": 0.764,
182
+ "predicts": "请检查代码库中的依赖向冲突问题。"
183
+ },
184
+ {
185
+ "index": 27,
186
+ "audio_path": "27.wav",
187
+ "reference": "这个电路板集成了微控制器、传感器和无线通信模块。",
188
+ "inference_time": 0.763,
189
+ "predicts": "这个电路板集成了微控制器、传感器和无线通信模块。"
190
+ },
191
+ {
192
+ "index": 28,
193
+ "audio_path": "28.wav",
194
+ "reference": "区块链技术的核心在于去中心化和加密安全性。",
195
+ "inference_time": 0.785,
196
+ "predicts": "区块链技术的核心在于去中心化和加密安全性。"
197
+ },
198
+ {
199
+ "index": 29,
200
+ "audio_path": "29.wav",
201
+ "reference": "需符合等保三级要求,数据出境要走安全评估。",
202
+ "inference_time": 0.787,
203
+ "predicts": "需符合等保三级要求,数据出境要走安全评估。"
204
+ },
205
+ {
206
+ "index": 30,
207
+ "audio_path": "30.wav",
208
+ "reference": "最新数据显示,消费者信心指数有所回升。",
209
+ "inference_time": 0.772,
210
+ "predicts": "最新数据显示,消费者信心指数有所回升。"
211
+ },
212
+ {
213
+ "index": 31,
214
+ "audio_path": "31.wav",
215
+ "reference": "专家预测,新能源汽车市场将迎来爆发式增长。",
216
+ "inference_time": 0.792,
217
+ "predicts": "专家预测,新能源汽车市场将引来爆发式增长。"
218
+ },
219
+ {
220
+ "index": 32,
221
+ "audio_path": "32.wav",
222
+ "reference": "多模态大模型能够同时处理文本、图像和音频信号,实现真正的跨模态理解",
223
+ "inference_time": 0.829,
224
+ "predicts": "多模态大模型能够同时处理文本、图像和音频信号,实现真正的跨模态理解。"
225
+ },
226
+ {
227
+ "index": 33,
228
+ "audio_path": "33.wav",
229
+ "reference": "联邦学习可以在保护用户隐私的前提下,实现分布式模型的协同训练",
230
+ "inference_time": 0.819,
231
+ "predicts": "联邦学习可以在保护用户隐私的前提下,实现分布式模型的协同训练。"
232
+ },
233
+ {
234
+ "index": 34,
235
+ "audio_path": "34.wav",
236
+ "reference": "我们研发的五十量子比特处理器在特定算法上展现出了量子优越性",
237
+ "inference_time": 0.79,
238
+ "predicts": "我们研发的50量子比特处理器在特定算法上展现出了量子优越性。"
239
+ },
240
+ {
241
+ "index": 35,
242
+ "audio_path": "35.wav",
243
+ "reference": "变分量子本征值求解器在化学模拟中显示出巨大潜力",
244
+ "inference_time": 0.8,
245
+ "predicts": "变分量子本真值求解器在化学模拟中显示出巨大潜力。"
246
+ },
247
+ {
248
+ "index": 36,
249
+ "audio_path": "36.wav",
250
+ "reference": "边缘AI推理的延迟已经可以控制在十毫秒以内,满足实时应用需求",
251
+ "inference_time": 0.804,
252
+ "predicts": "边缘AI-20的延迟已经可以控制在10ms以内,满足实���应用需求。"
253
+ },
254
+ {
255
+ "index": 37,
256
+ "audio_path": "37.wav",
257
+ "reference": "我们设计的新型神经网络压缩算法在保持精度的同时将模型大小减少了百分之七十",
258
+ "inference_time": 0.837,
259
+ "predicts": "我们设计的新型神经网诺压缩算法在保持精度的同时将模型大小减少了70%。"
260
+ },
261
+ {
262
+ "index": 38,
263
+ "audio_path": "38.wav",
264
+ "reference": "损失函数结合了交叉熵损失和对比损失,权重系数设置为零点三和零点七",
265
+ "inference_time": 0.842,
266
+ "predicts": "损失函数结合了交叉伤损失和对比损失。权重系数设置为0.3和0.7。"
267
+ },
268
+ {
269
+ "index": 39,
270
+ "audio_path": "39.wav",
271
+ "reference": "在五个标准数据集上的实验结果表明,我们的方法平均比现有最佳方法提升了二点三个百分点",
272
+ "inference_time": 0.874,
273
+ "predicts": "在五个标准数据级上的实验结果表明,我们的方法平均比现有最佳方法提升了2.3个百分点。"
274
+ },
275
+ {
276
+ "index": 40,
277
+ "audio_path": "40.wav",
278
+ "reference": "消融实验证实了每个模块的有效性,移除注意力机制会导致性能下降百分之四点六",
279
+ "inference_time": 0.812,
280
+ "predicts": "消融实验证实了每个模块的有效性。移除注意力机制会导致性能下降4.6%。"
281
+ },
282
+ {
283
+ "index": 41,
284
+ "audio_path": "41.wav",
285
+ "reference": "模型的参数量为一点二亿,在八张A100显卡上训练了七十二小时",
286
+ "inference_time": 0.801,
287
+ "predicts": "模型的参数量为1.2亿,在8张A100显卡上训练了72小时。"
288
+ },
289
+ {
290
+ "index": 42,
291
+ "audio_path": "42.wav",
292
+ "reference": "总之,搞定了数据倾斜,性能就上去了。",
293
+ "inference_time": 0.766,
294
+ "predicts": "总之搞定了数据倾斜,性能就上去了。"
295
+ },
296
+ {
297
+ "index": 43,
298
+ "audio_path": "43.wav",
299
+ "reference": "openai-whisper 需要 ffmpeg 的环境,ffmpeg 是一个开源的跨平台音视频处理工具和框架,可以用来录制、转换和流式传输音视频内容 。",
300
+ "inference_time": 0.922,
301
+ "predicts": "OpenAI Whisper需要FFMPEG的环境。FFMPEG是一个开源的跨平台音视频处理工具和框架,可以用来录制、转换和流逝传输音视频内容。"
302
+ },
303
+ {
304
+ "index": 44,
305
+ "audio_path": "44.wav",
306
+ "reference": "采用 Transformer 序列到序列模型可以实现针对不同的语言处理任务。",
307
+ "inference_time": 0.829,
308
+ "predicts": "采用Transformer训练到训练模型,可以实现针对不同的语言处理任务。"
309
+ },
310
+ {
311
+ "index": 45,
312
+ "audio_path": "45.wav",
313
+ "reference": "Transformer架构在自然语言处理中的成功应用已经彻底改变了预训练模型的范式",
314
+ "inference_time": 0.854,
315
+ "predicts": "Transformer架构在自然语言处理中的成功应用,已经彻底改变了预训练模型的范式。"
316
+ },
317
+ {
318
+ "index": 46,
319
+ "audio_path": "46.wav",
320
+ "reference": "请大家注意,Workshop材料已经上传至会议系统,代码仓库链接在附录页。",
321
+ "inference_time": 0.82,
322
+ "predicts": "请大家注意,Workshop材料已经上传至会议系统,代码仓库链接在附录页。"
323
+ },
324
+ {
325
+ "index": 47,
326
+ "audio_path": "47.wav",
327
+ "reference": "别造轮子了!直接调用yolov5开源库的预训练权重,快速迭代才是王道。",
328
+ "inference_time": 0.817,
329
+ "predicts": "别造轮子了,直接调用YOLO V5开原库的预训练权种,快速迭带才是王道。"
330
+ },
331
+ {
332
+ "index": 48,
333
+ "audio_path": "48.wav",
334
+ "reference": "请确保您的设备已连接到5GHz Wi-Fi频段以获得最佳性能。",
335
+ "inference_time": 0.813,
336
+ "predicts": "请确保您的设备已连接到5GHz Wifi频段,以获得最佳性能。"
337
+ },
338
+ {
339
+ "index": 49,
340
+ "audio_path": "49.wav",
341
+ "reference": "我们提出了一种基于对比学习的自监督方法,在ImageNet数据集上达到了百分之九十二点五的准确率",
342
+ "inference_time": 0.854,
343
+ "predicts": "我们提出了一种基于对比学习的自监督方法,在ImageNet数据集上达到了92.5%的转确率。"
344
+ },
345
+ {
346
+ "index": 50,
347
+ "audio_path": "50.wav",
348
+ "reference": "GPT-4在代码生成和多步推理任务上展现出令人印象深刻的能力",
349
+ "inference_time": 0.818,
350
+ "predicts": "GPT4在代码生成和多部推理任务上展现出令人印象深刻的伦理。"
351
+ },
352
+ {
353
+ "index": 51,
354
+ "audio_path": "51.wav",
355
+ "reference": "若在高并发场景下未启用我们的动态缓存机制,即使用RDMA网络,延迟也可能因CPU调度争抢而恶化。",
356
+ "inference_time": 0.896,
357
+ "predicts": "若在高病发场景下,为启用我们的动态缓存机制,即使用RDMA网络,延迟也可能因CPU调度增强而恶化。"
358
+ },
359
+ {
360
+ "index": 52,
361
+ "audio_path": "52.wav",
362
+ "reference": "准确说,峰值算力是200 TOPS,不是刚才说的150。",
363
+ "inference_time": 0.778,
364
+ "predicts": "准确说峰值算力是200TOPS,不是刚才说的150。"
365
+ },
366
+ {
367
+ "index": 53,
368
+ "audio_path": "53.wav",
369
+ "reference": "秦始皇嬴政书同文,车同轨,奠定了中国大一统的基础。",
370
+ "inference_time": 0.784,
371
+ "predicts": "秦始皇嬴政,书同文,车同轨,奠定了中国大一统的基础。"
372
+ },
373
+ {
374
+ "index": 54,
375
+ "audio_path": "54.wav",
376
+ "reference": "诸葛亮在《出师表》中写道‘鞠躬尽瘁,死而后已’,成为后世臣子的楷模。",
377
+ "inference_time": 0.851,
378
+ "predicts": "诸葛娘在《出师表》中写道:鞠躬尽瘁,死而后已,成为后世臣子的楷模。"
379
+ },
380
+ {
381
+ "index": 55,
382
+ "audio_path": "55.wav",
383
+ "reference": "李白的‘举头望明月,低头思故乡’是连三岁孩童都能背诵的诗句。",
384
+ "inference_time": 0.828,
385
+ "predicts": "李白的《举头望明月,低头思故乡》是连三岁孩童都能背诵的诗句。"
386
+ },
387
+ {
388
+ "index": 56,
389
+ "audio_path": "56.wav",
390
+ "reference": "孔子曾说‘己所不欲,勿施于人’,这简单的八个字构成了儒家伦理的基石。",
391
+ "inference_time": 0.85,
392
+ "predicts": "孔子曾说:己所不欲,勿施于人。这简单的八个字,构成了儒家伦理的基石。"
393
+ },
394
+ {
395
+ "index": 57,
396
+ "audio_path": "57.wav",
397
+ "reference": "王羲之被后人尊为‘书圣’,其代表作《兰亭集序》被誉为天下第一行书。",
398
+ "inference_time": 0.845,
399
+ "predicts": "王羲之背后仁尊为书圣,其代表作《南庭集序》,被誉为天下第一行书。"
400
+ },
401
+ {
402
+ "index": 58,
403
+ "audio_path": "58.wav",
404
+ "reference": "我们要学习庖丁解牛的精神,掌握事物的客观规律,才能游刃有余。",
405
+ "inference_time": 0.841,
406
+ "predicts": "我们要学习刨丁解牛的精神,掌握事物的客观规律才能游刃有余。"
407
+ },
408
+ {
409
+ "index": 59,
410
+ "audio_path": "59.wav",
411
+ "reference": "项羽在鸿门宴上优柔寡断,放走了刘邦,最终兵败乌江自刎。",
412
+ "inference_time": 0.818,
413
+ "predicts": "向羽在鸿门宴上优柔寡断,放走了刘邦,最终兵败乌江自吻。"
414
+ },
415
+ {
416
+ "index": 60,
417
+ "audio_path": "60.wav",
418
+ "reference": "在杭州西湖畔,人们总会想起苏轼治理西湖、修筑苏堤的往事。",
419
+ "inference_time": 0.812,
420
+ "predicts": "在杭州西湖畔,人们总会想起苏氏治理西湖修筑苏迪的往事。"
421
+ },
422
+ {
423
+ "index": 61,
424
+ "audio_path": "61.wav",
425
+ "reference": "我计划去埃菲尔铁塔和卢浮宫参观。",
426
+ "inference_time": 0.798,
427
+ "predicts": "我计划去爱菲尔铁塔和卢浮宫参观。"
428
+ },
429
+ {
430
+ "index": 62,
431
+ "audio_path": "62.wav",
432
+ "reference": "莎士比亚的戏剧深刻地探讨了人性的复杂性。",
433
+ "inference_time": 0.801,
434
+ "predicts": "莎士比亚的戏剧深刻地探讨了人性的复杂性。"
435
+ },
436
+ {
437
+ "index": 63,
438
+ "audio_path": "63.wav",
439
+ "reference": "这个消息让他丈二和尚摸不着头脑。",
440
+ "inference_time": 0.769,
441
+ "predicts": "这个消息让他仗而合上,摸不着头脑。"
442
+ },
443
+ {
444
+ "index": 64,
445
+ "audio_path": "64.wav",
446
+ "reference": "画龙点睛之笔,让整个设计焕然一新。",
447
+ "inference_time": 0.783,
448
+ "predicts": "画龙点睛之笔,让整个设计焕然一新。"
449
+ },
450
+ {
451
+ "index": 65,
452
+ "audio_path": "65.wav",
453
+ "reference": "这是卡脖子技术,必须自主研发!",
454
+ "inference_time": 0.769,
455
+ "predicts": "这是卡博子技术,必须自主研发。"
456
+ },
457
+ {
458
+ "index": 66,
459
+ "audio_path": "66.wav",
460
+ "reference": "这幅画作以其独特的色彩运用和构图技巧而闻名。",
461
+ "inference_time": 0.779,
462
+ "predicts": "这幅画作以其独特的色彩运用和构图技巧而闻名。"
463
+ },
464
+ {
465
+ "index": 67,
466
+ "audio_path": "67.wav",
467
+ "reference": "患者主诉间歇性胸痛,放射至左臂。",
468
+ "inference_time": 0.764,
469
+ "predicts": "患者主宿间歇性胸痛,放射至左臂。"
470
+ },
471
+ {
472
+ "index": 68,
473
+ "audio_path": "68.wav",
474
+ "reference": "患者有冠状动脉粥样硬化性心脏病病史十年,慢性阻塞性肺疾病病史五年",
475
+ "inference_time": 0.843,
476
+ "predicts": "患者有冠状动脉周样硬化性心脏病病死10年,慢性阻塞性肺疾病病死5年。"
477
+ },
478
+ {
479
+ "index": 69,
480
+ "audio_path": "69.wav",
481
+ "reference": "建议行冠状动脉造影检查,必要时植入支架",
482
+ "inference_time": 0.801,
483
+ "predicts": "建议行冠状动脉造影检查,必要使植入支架。"
484
+ },
485
+ {
486
+ "index": 70,
487
+ "audio_path": "70.wav",
488
+ "reference": "患者需低盐低脂糖尿病饮食,监测血压、血糖变化",
489
+ "inference_time": 0.825,
490
+ "predicts": "患者需低盐、低脂糖尿病饮食, 监测血压、血糖变化。"
491
+ },
492
+ {
493
+ "index": 71,
494
+ "audio_path": "71.wav",
495
+ "reference": "胸部CT平扫显示:双肺散在磨玻璃样密度影,��胸膜下分布为主",
496
+ "inference_time": 0.833,
497
+ "predicts": "胸部CT频扫显示,双肺散在膜玻璃样密度影,以胸膜下分布为主。"
498
+ },
499
+ {
500
+ "index": 72,
501
+ "audio_path": "72.wav",
502
+ "reference": "知识产权归属约定:乙方在履行本合同过程中所产生的全部智力成果,其知识产权均归甲方所有",
503
+ "inference_time": 0.878,
504
+ "predicts": "知识产权归属约定,以方在履行本合同过程中所产生的全部致力成果,其知识产权归甲方所有。"
505
+ },
506
+ {
507
+ "index": 73,
508
+ "audio_path": "73.wav",
509
+ "reference": "双方因履行本合同发生争议的,应首先通过友好协商解决;协商不成的,任何一方均有权向有管辖权的人民法院提起诉讼",
510
+ "inference_time": 0.897,
511
+ "predicts": "双方应履行本合同发生争议的,应首先通过友好协商解决,协商不成的,任何一方均有权向有管辖权的人民法院提起诉讼。"
512
+ },
513
+ {
514
+ "index": 74,
515
+ "audio_path": "74.wav",
516
+ "reference": "被告在法定期限内未提交答辩状,亦未到庭参加诉讼,本院依法缺席审理",
517
+ "inference_time": 0.856,
518
+ "predicts": "被告在法定期限内为提交答编状,亦为道庭参加诉讼,本院依法缺席审理。"
519
+ },
520
+ {
521
+ "index": 75,
522
+ "audio_path": "75.wav",
523
+ "reference": "原告向本院提出诉讼请求:一、判令被告支付货款人民币五十万八千元及逾期付款利息;二、判令被告承担本案的诉讼费用",
524
+ "inference_time": 0.931,
525
+ "predicts": "原告向本愿提出诉讼请求: 1.判令被告支付货款人民币50万8千元及逾期付款利息。 2.判令被告承担本案的诉讼费用。"
526
+ },
527
+ {
528
+ "index": 76,
529
+ "audio_path": "76.wav",
530
+ "reference": "被执行人未按执行通知履行法律文书确定的义务,人民法院有权查封、扣押、冻结、拍卖被执行人的财产",
531
+ "inference_time": 0.915,
532
+ "predicts": "被执行人未按执行通知履行法律文书确定的义务。人民法院有权查封扣押、冻结、拍卖被执行人的财产。"
533
+ },
534
+ {
535
+ "index": 77,
536
+ "audio_path": "77.wav",
537
+ "reference": "这个视频太上头了!",
538
+ "inference_time": 0.716,
539
+ "predicts": "这个视频太上头了。"
540
+ },
541
+ {
542
+ "index": 78,
543
+ "audio_path": "78.wav",
544
+ "reference": "他真是个社恐。",
545
+ "inference_time": 0.721,
546
+ "predicts": "他真是个社恐。"
547
+ },
548
+ {
549
+ "index": 79,
550
+ "audio_path": "79.wav",
551
+ "reference": "简直了,这躺平的状态也太佛系了吧。",
552
+ "inference_time": 0.746,
553
+ "predicts": "简直了,这躺平的状态也太佛系了吧!"
554
+ },
555
+ {
556
+ "index": 80,
557
+ "audio_path": "80.wav",
558
+ "reference": "这个瓜有点大,我得去吃瓜了。",
559
+ "inference_time": 0.75,
560
+ "predicts": "这个瓜有点大,我得去吃瓜了。"
561
+ },
562
+ {
563
+ "index": 81,
564
+ "audio_path": "81.wav",
565
+ "reference": "别内卷了,咱们还是多交流交流吧。",
566
+ "inference_time": 0.762,
567
+ "predicts": "别内卷了,咱们还是多交流交流吧。"
568
+ },
569
+ {
570
+ "index": 82,
571
+ "audio_path": "82.wav",
572
+ "reference": "第二个公司我们成立了中国黄页,在中国黄页的创业经验中有很多的经验,也是可以在这儿跟大家进行分享的",
573
+ "inference_time": 0.868,
574
+ "predicts": "第二个公司我们成立了中国黄业。在中国黄业的创业经验中,有很多的经验,也是可以在这儿跟大家进行分享的。"
575
+ },
576
+ {
577
+ "index": 83,
578
+ "audio_path": "83.wav",
579
+ "reference": "对于大部分在接触微积分之前,主要的学习经验就是刷题,甚至是连题也不刷的同学们来说。",
580
+ "inference_time": 0.855,
581
+ "predicts": "对于大部分在接触微积分之前,主要的学习经验就是刷题,甚至是连题也不刷的同学们来说。"
582
+ },
583
+ {
584
+ "index": 84,
585
+ "audio_path": "84.wav",
586
+ "reference": "说了两个小时没人听懂我在说什么,最后二十三个人反对一个人同意,这一个人就说马云你这样做,你就试试看,不行的话,赶紧逃回来,还来得及。",
587
+ "inference_time": 0.912,
588
+ "predicts": "说了两个小时,没人听懂我在说什么,最后23个人反对,一个人同意。这一个人就说:马云你这样做你就试试看,不行的话赶紧逃回来还来得及。"
589
+ },
590
+ {
591
+ "index": 85,
592
+ "audio_path": "85.wav",
593
+ "reference": "晚上想想是热血沸腾,真好,第二天早上骑个自行车又上班去了,对吧?",
594
+ "inference_time": 0.8,
595
+ "predicts": "晚上想想是热血沸腾。真好,第二天早上骑个自行车又上班去了,对吧?"
596
+ }
597
+ ]
reports/asr_result_whisper_wenet_net.json ADDED
The diff for this file is too large to render. See raw diff
 
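Every asr_result_*.json report above shares one flat schema per utterance: index, audio_path, reference, inference_time, predicts. A minimal sketch for summarising a report; the field names come from this commit, while the summary logic itself is illustrative:

import json

def summarize(report_path):
    # Load one report written by the scripts/run_*.py runners.
    with open(report_path, encoding="utf-8") as f:
        items = json.load(f)
    avg_time = sum(item["inference_time"] for item in items) / len(items)
    exact = sum(item["predicts"] == item["reference"] for item in items)
    print(f"{report_path}: {len(items)} utterances, "
          f"avg inference {avg_time:.3f}s, {exact} exact matches")

summarize("reports/asr_result_whisper_recording.json")
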
scripts/asr_utils.py CHANGED
@@ -10,6 +10,7 @@ from subprocess import CompletedProcess
10
  def add_text_index():
11
  text_file = '../test_data/recordings/text/test_asr_zh.txt'
12
  index = 1
 
13
  with open(text_file, encoding='utf-8') as f:
14
  for line in f:
15
  line = line.strip()
@@ -19,9 +20,10 @@ def add_text_index():
19
  if line.startswith('#'):
20
  # print(line)
21
  continue
22
- line = f"{index}. {line}"
23
- print(line)
24
  index += 1
 
 
25
 
26
  def get_lines_with_index(filepath):
27
  with open(filepath, encoding='utf-8') as f:
@@ -83,6 +85,6 @@ def get_origin_text_dict():
83
 
84
 
85
  if __name__ == '__main__':
86
- # add_text_index()
87
- print_text_and_audio_length()
88
  # pass
 
10
  def add_text_index():
11
  text_file = '../test_data/recordings/text/test_asr_zh.txt'
12
  index = 1
13
+ text_dict = {}
14
  with open(text_file, encoding='utf-8') as f:
15
  for line in f:
16
  line = line.strip()
 
20
  if line.startswith('#'):
21
  # print(line)
22
  continue
23
+ text_dict[f"{index}.wav"] = line
 
24
  index += 1
25
+ with open('../test_data/recordings/data.json', "w", encoding="utf-8") as f:
26
+ json.dump(text_dict, f, ensure_ascii=False, indent=2)
27
 
28
  def get_lines_with_index(filepath):
29
  with open(filepath, encoding='utf-8') as f:
 
85
 
86
 
87
  if __name__ == '__main__':
88
+ add_text_index()
89
+ # print_text_and_audio_length()
90
  # pass
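
The reworked add_text_index above writes test_data/recordings/data.json, mapping "<index>.wav" to its reference transcript. A hedged sketch for consuming that file; the audio directory layout is an assumption, not confirmed by this diff:

import json
import os

def load_references(data_json="../test_data/recordings/data.json",
                    audio_dir="../test_data/recordings"):
    # data.json: {"1.wav": "<reference>", ...} as produced by add_text_index.
    with open(data_json, encoding="utf-8") as f:
        text_dict = json.load(f)
    for wav_name, reference in text_dict.items():
        # audio_dir is an assumption about where the .wav files live.
        yield os.path.join(audio_dir, wav_name), reference
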
scripts/batch_run_asr.py ADDED
@@ -0,0 +1,11 @@
1
+ from lib.utils import cmd
2
+
3
+ if __name__ == '__main__':
4
+ commands = [
5
+ "/Users/jeqin/miniconda3/envs/translator/bin/python /Users/jeqin/work/code/TestTranslator/lib/asr_models/funasr_nano.py",
6
+ "/Users/jeqin/miniconda3/envs/translator/bin/python /Users/jeqin/work/code/TestTranslator/lib/asr_models/funasr_quant.py",
7
+ "/Users/jeqin/miniconda3/envs/translator/bin/python /Users/jeqin/work/code/TestTranslator/lib/asr_models/whisper.py",
8
+ "/Users/jeqin/miniconda3/envs/translator/bin/python /Users/jeqin/work/code/TestTranslator/lib/asr_models/whisper_finetuned.py",
9
+ ]
10
+ for c in commands:
11
+ cmd(c)
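
batch_run_asr.py leans on lib.utils.cmd, which this commit does not show. A plausible stand-in, purely an assumption about its behaviour, is a thin subprocess wrapper:

import shlex
import subprocess

def cmd(command: str) -> subprocess.CompletedProcess:
    # Hypothetical stand-in for lib.utils.cmd: run the command in the
    # foreground and hand back the CompletedProcess for inspection.
    print(f"$ {command}")
    return subprocess.run(shlex.split(command), check=False)
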
scripts/caculate_cer.py CHANGED
@@ -27,14 +27,14 @@ def calculate_distance(reference: str, hypothesis: str):
27
 
28
  if __name__ == '__main__':
29
  import cn2an
30
- results_list = json.load(open("csv/funasr_wenet_results.json", encoding="utf-8"))
31
  count = 0
32
  distance_sum = 0
33
  reference_sum = 0
34
  for item in results_list:
35
  count += 1
36
  reference = item["reference"]
37
- hypothesis = item["inference_result"]
38
  # # If it's whisper, use cn2an to convert digits to Chinese numerals
39
  # if re.search(r"\d", hypothesis):
40
  # hypothesis = cn2an.transform(hypothesis, "an2cn")
 
27
 
28
  if __name__ == '__main__':
29
  import cn2an
30
+ results_list = json.load(open("../reports/asr_result_funasr_mlt_nano_wenet_net.json", encoding="utf-8"))
31
  count = 0
32
  distance_sum = 0
33
  reference_sum = 0
34
  for item in results_list:
35
  count += 1
36
  reference = item["reference"]
37
+ hypothesis = item["predicts"]
38
  # # If it's whisper, use cn2an to convert digits to Chinese numerals
39
  # if re.search(r"\d", hypothesis):
40
  # hypothesis = cn2an.transform(hypothesis, "an2cn")
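
The corpus-level CER computed by this script is total edit distance divided by total reference length (distance_sum / reference_sum above). For reference, a self-contained character-level Levenshtein distance of the kind calculate_distance presumably wraps; illustrative, not the repo's exact implementation:

def levenshtein(ref: str, hyp: str) -> int:
    # Dynamic-programming edit distance over characters.
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        cur = [i]
        for j, h in enumerate(hyp, 1):
            cur.append(min(prev[j] + 1,              # deletion
                           cur[j - 1] + 1,           # insertion
                           prev[j - 1] + (r != h)))  # substitution
        prev = cur
    return prev[-1]

# cer = distance_sum / reference_sum, accumulated over all report items.
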
scripts/csv/fine-tune_whisper.csv DELETED
@@ -1,2 +0,0 @@
1
- file_name,time,inference_result
2
- 49.wav,4.025,我们提出了一种基于对比学习的自监督方法在ImageNet数据集上达到了九百二十五的转确率,6,0.128,我们提出了一种基于对比学习的自监督方法在imagenet数据集上达到了{+九+}百[-分之九-]{+二+}十[-二点-]五的[-准-]{+转+}确率
scripts/csv/funasr_quant.csv DELETED
@@ -1,86 +0,0 @@
1
- file_name,time,inference_result
2
- 1.wav,1.93,您这车开的真够稳的,蜗牛都超车了。,1,0.067,您这车开[-得-]{+的+}真够稳的蜗牛都超车了
3
- 2.wav,0.302,你可真是个天才,把这么简单的事情都搞砸了。,0,0.0,你可真是个天才把这么简单的事情都搞砸了
4
- 3.wav,0.454,因网络问题,远程讲者音频中断,我们切到备用方案,共享幻灯片。,0,0.0,因网络问题远程讲者音频中断我们切到备用方案共享幻灯片
5
- 4.wav,0.419,这就好比用大炮打蚊子,资源分配严重不均衡,需要细腻度调度。,1,0.038,这就好比用大炮打蚊子资源分配严重不均衡需要细[-粒-]{+腻+}度调度
6
- 5.wav,0.28,言归正传,我们来讨论一下核心问题。,0,0.0,言归正传我们来讨论一下核心问题
7
- 6.wav,0.24,这有点超出了我们今天讨论的范围。,0,0.0,这有点超出了我们今天讨论的范围
8
- 7.wav,0.248,他的年收入约为一百万人民币。,0,0.0,他的年收入约为一百万人民币
9
- 8.wav,0.435,下一位演讲者是来自斯坦福大学计算机科学系的张明教授。,0,0.0,下一位演讲者是来自斯坦福大学计算机科学系的张明教授
10
- 9.wav,0.442,请各位将手机调至静音模式,演讲结束后有十五分钟提问时间。,0,0.0,请各位将手机调至静音模式演讲结束后有十五分钟提问时间
11
- 10.wav,0.339,茶歇将在十点三十分开始,地点在二楼休息区。,0,0.0,茶歇将在十点三十分开始地点在二楼休息区
12
- 11.wav,0.293,本合同自双方签字盖章之日起生效。,0,0.0,本合同自双方签字盖章之日起生效
13
- 12.wav,0.366,政府正在采取措施刺激经济增长和创造就业机会。,0,0.0,政府正在采取措施刺激经济增长和创造就业机会
14
- 13.wav,0.369,今天中午我们点外卖吧,我想吃宫保鸡丁和麻婆豆腐。,0,0.0,今天中午我们点外卖吧我想吃宫保鸡丁和麻婆豆腐
15
- 14.wav,0.588,医生建议我多吃粗粮,比如燕麦和紫薯,少吃高油、高糖的油炸食品,像炸鸡和甜甜圈。,0,0.0,医生建议我多吃粗粮比如燕麦和紫薯少吃高油高糖的油炸食品像炸鸡和甜甜圈
16
- 15.wav,0.453,本方案采用异构计算架构,结合gpu与fpga加速张量运算。,0,0.0,本方案采用异构计算架构结合gpu与fpga加速张量运算
17
- 16.wav,0.497,推理延迟稳定在五毫秒以内,吞吐量提升百分之四十,功耗降低一点八瓦。,0,0.0,推理延迟稳定在五毫秒以内吞吐量提升百分之四十功耗降低一点八瓦
18
- 17.wav,0.696,相比sota模型,我们的方法在小样本场景下召回率高出十二个百分点,且参数量仅为其三分之一。,0,0.0,相比sota模型我们的方法在小样本场景下召回率高出十二个百分点且参数量仅为其三分之一
19
- 18.wav,0.508,当节点并发数超过一万时,内存带宽会成瓶颈,导致伪延迟急剧上升。,1,0.036,当节点并发数超过一万时内存带宽会成瓶颈导致[-尾-]{+伪+}延迟急剧上升
20
- 19.wav,0.667,请看图三,蓝色柱是基线模型,红色线是我们的优化。结果交叉点表明,在第十五轮迭代后,优势显著。,0,0.0,请看图三蓝色柱是基线模型红色线是我们的优化结果交叉点表明在第十五轮迭代后优势显著
21
- 20.wav,0.499,您提到模型泛化性提升是否在跨模态数据上验证过量化指标是多少?,0,0.0,您提到模型泛化性提升是否在跨模态数据上验证过量化指标是多少
22
- 21.wav,0.578,容我追问一点,如果输入数据存在对抗样本,您方案的鲁棒性如何保证失效概率有测试吗?,0,0.0,容我追问一点如果输入数据存在对抗样本您方案的鲁棒性如何保证失效概率有测试吗
23
- 22.wav,0.495,抱歉打断您说的动态减脂,是指训练中还是推理中阈值是自适应的吗?,2,0.069,抱歉打断您说的动态[-剪枝-]{+减脂+}是指训练中还是推理中阈值是自适应的吗
24
- 23.wav,0.503,成本效益是否可行?部署这种定制芯片需要流片费用,中小客户怎么承担?,0,0.0,成本效益是否可行部署这种定制芯片需要流片费用中小客户怎么承担
25
- 24.wav,0.546,我补充一个角度,隐私计算领域也在用,类似思路是否可能跨领域合作。,0,0.0,我补充一个角度隐私计算领域也在用类似思路是否可能跨领域合作
26
- 25.wav,0.485,端到端,优化流水线,涵盖数据清洗特征、工程模型,压缩部署监控。,0,0.0,端到端优化流水线涵盖数据清洗特征工程模型压缩部署监控
27
- 26.wav,0.292,请检查代码库中的依赖项冲突问题。,0,0.0,请检查代码库中的依赖项冲突问题
28
- 27.wav,0.345,这个电路板集成了微控制器传感器和无线通信模块。,0,0.0,这个电路板集成了微控制器传感器和无线通信模块
29
- 28.wav,0.342,区块链技术的核心在于去中心化和加密安全性。,0,0.0,区块链技术的核心在于去中心化和加密安全性
30
- 29.wav,0.362,需符合等保三级要求,数据出境要走安全评估。,0,0.0,需符合等保三级要求数据出境要走安全评估
31
- 30.wav,0.313,最新数据显示,消费者信心指数有所回升。,0,0.0,最新数据显示消费者信心指数有所回升
32
- 31.wav,0.349,专家预测,新能源汽车市场将迎来爆发式增长。,0,0.0,专家预测新能源汽车市场将迎来爆发式增长
33
- 32.wav,0.482,多模态大模型能够同时处理文本图像和音频信号,实现真正的跨模态理解。,0,0.0,多模态大模型能够同时处理文本图像和音频信号实现真正的跨模态理解
34
- 33.wav,0.467,联邦学习可以在保护用户隐私的前提下,实现分布式模型的协同训练。,0,0.0,联邦学习可以在保护用户隐私的前提下实现分布式模型的协同训练
35
- 34.wav,0.455,我们研发的五十量子比特处理器在特定算法上展现出了量子优越性。,0,0.0,我们研发的五十量子比特处理器在特定算法上展现出了量子优越性
36
- 35.wav,0.449,变分量子本增值求解器在化学模拟中显示出巨大潜力。,1,0.043,变分量子本[-征-]{+增+}值求解器在化学模拟中显示出巨大潜力
37
- 36.wav,0.492,边缘ai推理的延迟已经可以控制在十毫秒以内,满足实时应用需求。,0,0.0,边缘ai推理的延迟已经可以控制在十毫秒以内满足实时应用需求
38
- 37.wav,0.55,我们设计的新型神经网络压缩算法,在保持精度的同时,将模型大小减少了百分之七十。,0,0.0,我们设计的新型神经网络压缩算法在保持精度的同时将模型大小减少了百分之七十
39
- 38.wav,0.511,损失函数结合了交叉熵损失和对比,损失,权重系数设置为零点三和零点七。,0,0.0,损失函数结合了交叉熵损失和对比损失权重系数设置为零点三和零点七
40
- 39.wav,0.648,在五个标准数据集上的实验结果表明,我们的方法平均比现有最佳方法提升了二点三个百分点。,0,0.0,在五个标准数据集上的实验结果表明我们的方法平均比现有最佳方法提升了二点三个百分点
41
- 40.wav,0.53,消融实验证实了每个模块的有效性,移除注意力机制会导致性能下降百分之四点六。,0,0.0,消融实验证实了每个模块的有效性移除注意力机制会导致性能下降百分之四点六
42
- 41.wav,0.496,模型的参数量为一点二亿,在八张a一百显卡上训练了七十二小时。,0,0.0,模型的参数量为一点二亿在八张a一百显卡上训练了七十二小时
43
- 42.wav,0.311,总之,搞定了数据倾斜性能就上去了。,0,0.0,总之搞定了数据倾斜性能就上去了
44
- 43.wav,0.994,open ai whisper需要fm pag的环境。fm pag是一个开源的跨平台音视频,处理工具和框架,可以用来录制转换和流式传输音视频内容。,5,0.074,openai[---]whisper需要f[-f-]mp[-e-]{+a+}g的环境f[-f-]mp[-e-]{+a+}g是一个开源的跨平台音视频处理工具和框架可以用来录制转换和流式传输音视频内容
45
- 44.wav,0.444,采用transformer序列到序列模型,可以实现针对不同的语言处理任务。,0,0.0,采用transformer序列到序列模型可以实现针对不同的语言处理任务
46
- 45.wav,0.535,transformer架构在自然语言处理中的成功应用,已经彻底改变了预训练模型的范式。,0,0.0,transformer架构在自然语言处理中的成功应用已经彻底改变了预训练模型的范式
47
- 46.wav,0.511,请大家注意,workshop材料已经上传至会议系统,代码仓库链接在附录页。,0,0.0,请大家注意workshop材料已经上传至会议系统代码仓库链接在附录页
48
- 47.wav,0.524,别造轮子了,直接调用乐uv五开源库的预训练权重,快速迭代才是王道。,4,0.125,别造轮子了直接调用[-yolo-]{+乐u+}v五开源库的预训练权重快速迭代才是王道
49
- 48.wav,0.43,请确保您的设备已连接到五g赫兹、wifi频段,以获得最佳性能。,3,0.103,请确保您的设备已连接到五g[-hz-]{+赫兹+}wi[---]fi频段以获得最佳性能
50
- 49.wav,0.593,我们提出了一种基于对比学习的自监督方法,在imaginate数据集上达到了百分之九十二点五的准确率。,3,0.064,我们提出了一种基于对比学习的自监督方法在imag{+inat+}e[-net-]数据集上达到了百分之九十二点五的准确率
51
- 50.wav,0.478,gbd four在代码生成和多步推理任务上展现出令人印象深刻的伦理。,8,0.267,g[-pt负四-]{+bdfour+}在代码生成和多步推理任务上展现出令人印象深刻的[-能力-]{+伦理+}
52
- 51.wav,0.783,若在高并发场景下未启用我们的动态缓存机制,即使用rdma网络延迟,也可能因cpu调度增强而恶化。,2,0.044,若在高并发场景下未启用我们的动态缓存机制即使用rdma网络延迟也可能因cpu调度[-争抢-]{+增强+}而恶化
53
- 52.wav,0.402,准确说峰值算力是两百tops,不是刚才说的一百五。,2,0.083,准确说峰值算力是[-二-]{+两+}百tops不是刚才说的一百五[-十-]
54
- 53.wav,0.393,秦始皇嬴政书同文,车同轨,奠定了中国大一统的基础。,0,0.0,秦始皇嬴政书同文车同轨奠定了中国大一统的基础
55
- 54.wav,0.438,诸葛亮在出师表中写道,鞠躬尽瘁死而后已成为后世臣子的楷模。,0,0.0,诸葛亮在出师表中写道鞠躬尽瘁死而后已成为后世臣子的楷模
56
- 55.wav,0.43,李白的举头望明月,低头思故乡,是连三岁孩童都能背诵的诗句。,0,0.0,李白的举头望明月低头思故乡是连三岁孩童都能背诵的诗句
57
- 56.wav,0.488,孔子曾说,己所不欲勿施于人,这简单的八个字,构成了儒家伦理的基石。,0,0.0,孔子曾说己所不欲勿施于人这简单的八个字构成了儒家伦理的基石
58
- 57.wav,0.464,王羲之被后人尊为书圣,其代表作兰亭集序被誉为天下第一行书。,0,0.0,王羲之被后人尊为书圣其代表作兰亭集序被誉为天下第一行书
59
- 58.wav,0.423,我们要学习庖丁解牛的精神,掌握事物的客观规律才能游刃有余。,0,0.0,我们要学习庖丁解牛的精神掌握事物的客观规律才能游刃有余
60
- 59.wav,0.428,项羽在鸿门宴上优柔寡断,放走了刘邦,最终兵败乌江自刎。,0,0.0,项羽在鸿门宴上优柔寡断放走了刘邦最终兵败乌江自刎
61
- 60.wav,0.471,在杭州,西湖畔,人们总会想起苏轼治理西湖,修筑苏堤的往事。,0,0.0,在杭州西湖畔人们总会想起苏轼治理西湖修筑苏堤的往事
62
- 61.wav,0.279,我计划去埃菲尔铁塔和卢浮宫参观。,0,0.0,我计划去埃菲尔铁塔和卢浮宫参观
63
- 62.wav,0.342,莎士比亚的戏剧深刻地探讨了人性的复杂性。,0,0.0,莎士比亚的戏剧深刻地探讨了人性的复杂性
64
- 63.wav,0.241,这个消息让他丈二和尚摸不着头脑。,0,0.0,这个消息让他丈二和尚摸不着头脑
65
- 64.wav,0.325,画龙点睛之笔,让整个设计焕然一新。,0,0.0,画龙点睛之笔让整个设计焕然一新
66
- 65.wav,0.264,这是卡脖子技术,必须自主研发。,0,0.0,这是卡脖子技术必须自主研发
67
- 66.wav,0.337,这幅画作以其独特的色彩运用和构图技巧而闻名。,0,0.0,这幅画作以其独特的色彩运用和构图技巧而闻名
68
- 67.wav,0.277,患者主诉间歇性,胸痛放射至左臂。,0,0.0,患者主诉间歇性胸痛放射至左臂
69
- 68.wav,0.626,患者有冠状动脉粥样硬化性心脏病病史,十年慢性阻塞性肺疾病病史,五年。,0,0.0,患者有冠状动脉粥样硬化性心脏病病史十年慢性阻塞性肺疾病病史五年
70
- 69.wav,0.38,建议行冠状动脉、造影检查,必要时植入支架。,0,0.0,建议行冠状动脉造影检查必要时植入支架
71
- 70.wav,0.44,患者需低盐、低脂糖尿病饮食监测、血压、血糖变化。,0,0.0,患者需低盐低脂糖尿病饮食监测血压血糖变化
72
- 71.wav,0.553,胸部ct平扫显示,双肺散在膜玻璃样密度影以胸膜下分布为主。,1,0.037,胸部ct平扫显示双肺散在[-磨-]{+膜+}玻璃样密度影以胸膜下分布为主
73
- 72.wav,0.662,知识产权归属约定,乙方在履行本合同过程中所产生的全部智力成果,其知识产权归甲方所有。,1,0.025,知识产权归属约定乙方在履行本合同过程中所产生的全部智力成果其知识产权[-均-]归甲方所有
74
- 73.wav,0.792,双方应履行本合同发生争议的应首先通过友好协商解决,协商不成的,任何一方均有权向有管辖权的人民法院提起诉讼。,1,0.02,双方[-因-]{+应+}履行本合同发生争议的应首先通过友好协商解决协商不成的任何一方均有权向有管辖权的人民法院提起诉讼
75
- 74.wav,0.583,被告在法定期限内未提交答辩状,亦未到庭参加诉讼,本院依法缺席审理。,0,0.0,被告在法定期限内未提交答辩状亦未到庭参加诉讼本院依法缺席审理
76
- 75.wav,0.882,原告向本院提出诉讼请求。一、判令被告支付货款人民币五十万八千元及逾期付款利息。二、判令被告承担本案的诉讼费用。,0,0.0,原告向本院提出诉讼请求一判令被告支付货款人民币五十万八千元及逾期付款利息二判令被告承担本案的诉讼费用
77
- 76.wav,0.71,被执行人未按执行通知履行法律文书确定的义务,人民法院有权查封、扣押、冻结、拍卖被执行人的财产。,0,0.0,被执行人未按执行通知履行法律文书确定的义务人民法院有权查封扣押冻结拍卖被执行人的财产
78
- 77.wav,0.179,这个视频太上头了。,0,0.0,这个视频太上头了
79
- 78.wav,0.182,他真是个社恐。,0,0.0,他真是个社恐
80
- 79.wav,0.267,简直了,这躺平的状态也太佛系了吧。,0,0.0,简直了这躺平的状态也太佛系了吧
81
- 80.wav,0.263,这个瓜有点大我得去吃瓜了。,0,0.0,这个瓜有点大我得去吃瓜了
82
- 81.wav,0.261,别内卷了,咱们还是多交流交流吧。,0,0.0,别内卷了咱们还是多交流交流吧
83
- 82.wav,0.633,第二个公司,我们成立了中国黄页。在中国黄页的创业经验中有很多的经验,也是可以在这跟大家进行分享的。,1,0.022,第二个公司我们成立了中国黄页在中国黄页的创业经验中有很多的经验也是可以在这[-儿-]跟大家进行分享的
84
- 83.wav,0.582,对于大部分在接触微积分之前,主要的学习经验就是刷题,甚至是连题也不刷的同学们来说。,0,0.0,对于大部分在接触微积分之前主要的学习经验就是刷题甚至是连题也不刷的同学们来说
85
- 84.wav,0.81,说了两个小时没人听懂,我在说什么,最后二十三个人反对一个人同意这一个人就说马云你这样做,你就试试看,不行的话,赶紧逃回来,还来得及。,0,0.0,说了两个小时没人听懂我在说什么最后二十三个人反对一个人同意这一个人就说马云你这样做你就试试看不行的话赶紧逃回来还来得及
86
- 85.wav,0.46,晚上想想是热血沸腾真好,第二天早上骑个自行车又上班去了,对吧?,0,0.0,晚上想想是热血沸腾真好第二天早上骑个自行车又上班去了对吧
scripts/csv/whisper.csv DELETED
@@ -1,86 +0,0 @@
1
- file_name,time,inference_result
2
- 1.wav,1.032,"您这车开得真够稳的,蜗牛都超车了。",0,0.0,您这车开得真够稳的蜗牛都超车了
3
- 2.wav,0.783,"你可真是个天才,把这么简单的事情都搞砸了。",0,0.0,你可真是个天才把这么简单的事情都搞砸了
4
- 3.wav,0.827,"因网络问题,远程讲着音频中断,我们切到备用方案,共享幻灯片。",1,0.038,因网络问题远程讲[-者-]{+着+}音频中断我们切到备用方案共享幻灯片
5
- 4.wav,0.831,"这就好比用大炮打蚊子。资源分配严重不均衡,需要细腻度调度。",1,0.038,这就好比用大炮打蚊子资源分配严重不均衡需要细[-粒-]{+腻+}度调度
6
- 5.wav,0.765,"言归正传,我们来讨论一下核心问题。",0,0.0,言归正传我们来讨论一下核心问题
7
- 6.wav,0.774,这有点超出了我们今天讨论的范围。,0,0.0,这有点超出了我们今天讨论的范围
8
- 7.wav,0.756,他的年收入约为100万人民币。,3,0.231,他的年收入约为[-一百-]{+100+}万人民币
9
- 8.wav,0.81,下一位演讲者是来自斯坦福大学计算机科学系的张明教授。,0,0.0,下一位演讲者是来自斯坦福大学计算机科学系的张明教授
10
- 9.wav,0.797,请各位将手机调至静音模式。演讲结束后有15分钟提问时间。,2,0.077,请各位将手机调至静音模式演讲结束后有[-十五-]{+15+}分钟提问时间
11
- 10.wav,0.786,"茶歇将在10:30分开始,地联在二楼休息区。",5,0.263,茶歇将在[-十点三十-]{+1030+}分开始地[-点-]{+联+}在二楼休息区
12
- 11.wav,0.781,本合同自双方签字盖章之日起生效。,0,0.0,本合同自双方签字盖章之日起生效
13
- 12.wav,0.811,政府正在采取措施刺激经济增长和创造就业机会。,0,0.0,政府正在采取措施刺激经济增长和创造就业机会
14
- 13.wav,0.8,今天中午我们点外卖吧!我想吃公保鸡丁和麻婆豆腐。,1,0.045,今天中午我们点外卖吧我想吃[-宫-]{+公+}保鸡丁和麻婆豆腐
15
- 14.wav,0.883,"医生建议我多吃粗粮,比如燕麦和紫薯,少吃高油高糖的油炸食品,像炸鸡和甜甜圈。",0,0.0,医生建议我多吃粗粮比如燕麦和紫薯少吃高油高糖的油炸食品像炸鸡和甜甜圈
16
- 15.wav,0.818,"本方案采用易购计算架构,结合GPU与FPGA加速张量运算。",2,0.074,本方案采用[-异构-]{+易购+}计算架构结合gpu与fpga加速张量运算
17
- 16.wav,0.838,"推拟延迟稳定在5毫秒以内,吞吐量提升40%,功耗降低1.8瓦。",1,0.036,推[-理-]{+拟+}延迟稳定在5毫秒以内吞吐量提升40%功耗降低1.8瓦
18
- 17.wav,0.86,"相比SOTA模型,我们的方法在想样本场景下召回率高出12个百分点,且参数量仅为其三分之一。",1,0.024,相比sota模型我们的方法在[-小-]{+想+}样本场景下召回率高出12个百分点且参数量仅为其三分之一
19
- 18.wav,0.841,"当节点并发数超过1万时,内存带宽会成瓶颈,导致尾延尺急剧上升。",5,0.161,当节点并发数超过1[-0000-]{+万+}时内存带宽会成瓶颈导致尾延[-迟-]{+尺+}急剧上升
20
- 19.wav,0.906,"请看图3,蓝色柱是基线模型,红色线是我们的优化结果。交叉点表明,在第15轮迭代后,优势显著。",0,0.0,请看图3蓝色柱是基线模型红色线是我们的优化结果交叉点表明在第15轮迭代后优势显著
21
- 20.wav,0.84,"您提到模型泛化性提升,是否在跨模态数据上验证过,量化指标是多少。",0,0.0,您提到模型泛化性提升是否在跨模态数据上验证过量化指标是多少
22
- 21.wav,0.862,"容我追问一点,如果输入数据存在对抗样本,您方案的鲁棒性如何保证?失效概率有测试吗?",0,0.0,容我追问一点如果输入数据存在对抗样本您方案的鲁棒性如何保证失效概率有测试吗
23
- 22.wav,0.845,"抱歉打断,您说的动态简直,是指训练中还是推理中,预值是自适应的吗?",3,0.103,抱歉打断您说的动态[-剪枝-]{+简直+}是指训练中还是推理中[-阈-]{+预+}值是自适应的吗
24
- 23.wav,0.837,"成本效益是否可行?部署这种定制芯片,需要牛片费用,中小客户怎么承担?",1,0.033,成本效益是否可行部署这种定制芯片需要[-流-]{+牛+}片费用中小客户怎么承担
25
- 24.wav,0.841,"我补充一个角度,隐私计算领域也在用nace思路,是否可能跨领域合作。",4,0.138,我补充一个角度隐私计算领域也在用[-类似-]{+nace+}思路是否可能跨领域合作
26
- 25.wav,0.853,"端到端优化流水线,涵盖数据清洗特征工程,模型压缩部署监控。",0,0.0,端到端优化流水线涵盖数据清洗特征工程模型压缩部署监控
27
- 26.wav,0.783,请检查代码库中的依赖向冲突问题。,1,0.067,请检查代码库中的依赖[-项-]{+向+}冲突问题
28
- 27.wav,0.79,这个电路板集成了微控制器、传感器和无线通信模块。,0,0.0,这个电路板集成了微控制器传感器和无线通信模块
29
- 28.wav,0.784,区块链技术的核心在于去中心化和加密安全性。,0,0.0,区块链技术的核心在于去中心化和加密安全性
30
- 29.wav,0.786,"需符合等保三级要求,数据出境要走安全评估。",0,0.0,需符合等保三级要求数据出境要走安全评估
31
- 30.wav,0.78,"最新数据显示,消费者信心指数有所回升。",0,0.0,最新数据显示消费者信心指数有所回升
32
- 31.wav,0.795,"专家预测,新能源汽车市场将引来爆发式增长。",1,0.053,专家预测新能源汽车市场将[-迎-]{+引+}来爆发式增长
33
- 32.wav,0.841,"多模态大模型能够同时处理文本、图像和音频信号,实现真正的跨模态理解。",0,0.0,多模态大模型能够同时处理文本图像和音频信号实现真正的跨模态理解
34
- 33.wav,0.842,"联邦学习可以在保护用户隐私的前提下,实现分布式模型的协同训练。",0,0.0,联邦学习可以在保护用户隐私的前提下实现分布式模型的协同训练
35
- 34.wav,0.807,我们研发的50量子比特处理器在特定算法上展现出了量子优越性。,2,0.069,我们研发的[-五十-]{+50+}量子比特处理器在特定算法上展现出了量子优越性
36
- 35.wav,0.805,变分量子本真值求解器在化学模拟中显示出巨大潜力。,1,0.043,变分量子本[-征-]{+真+}值求解器在化学模拟中显示出巨大潜力
37
- 36.wav,0.811,"边缘AI-20的延迟已经可以控制在10ms以内,满足实时应用需求。",7,0.241,边缘ai[-推理-]{+-20+}的延迟已经可以控制在[-十毫秒-]{+10ms+}以内满足实时应用需求
38
- 37.wav,0.836,我们设计的新型神经网诺压缩算法在保持精度的同时将模型大小减少了70%。,6,0.167,我们设计的新型神经网[-络-]{+诺+}压缩算法在保持精度的同时将模型大小减少了[-百分之七十-]{+70%+}
39
- 38.wav,0.844,损失函数结合了交叉伤损失和对比损失。权重系数设置为0.3和0.7。,7,0.226,损失函数结合了交叉[-熵-]{+伤+}损失和对比损失权重系数设置为[-零点三-]{+0.3+}和[-零点七-]{+0.7+}
40
- 39.wav,0.872,"在五个标准数据级上的实验结果表明,我们的方法平均比现有最佳方法提升了2.3个百分点。",4,0.1,在五个标准数据[-集-]{+级+}上的实验结果表明我们的方法平均比现有最佳方法提升了[-二点三-]{+2.3+}个百分点
41
- 40.wav,0.838,消融实验证实了每个模块的有效性。移除注意力机制会导致性能下降4.6%。,6,0.171,消融实验证实了每个模块的有效性移除注意力机制会导致性能下降[-百分之四点六-]{+4.6%+}
42
- 41.wav,0.814,"模型的参数量为1.2亿,在8张A100显卡上训练了72小时。",7,0.241,模型的参数量为[-一点二-]{+1.2+}亿在[-八-]{+8+}张a100显卡上训练了[-七十二-]{+72+}小时
43
- 42.wav,0.775,"总之搞定了数据倾斜,性能就上去了。",0,0.0,总之搞定了数据倾斜性能就上去了
44
- 43.wav,0.925,"OpenAI Whisper需要FFMPEG的环境。FFMPEG是一个开源的跨平台音视频处理工具和框架,可以用来录制、转换和流逝传输音视频内容。",2,0.029,openai[---]whisper需要ffmpeg的环境ffmpeg是一个开源的跨平台音视频处理工具和框架可以用来录制转换和流[-式-]{+逝+}传输音视频内容
45
- 44.wav,0.833,"采用Transformer训练到训练模型,可以实现针对不同的语言处理任务。",4,0.114,采用transformer[-序列-]{+训练+}到[-序列-]{+训练+}模型可以实现针对不同的语言处理任务
46
- 45.wav,0.864,"Transformer架构在自然语言处理中的成功应用,已经彻底改变了预训练模型的范式。",0,0.0,transformer架构在自然语言处理中的成功应用已经彻底改变了预训练模型的范式
47
- 46.wav,0.849,"请大家注意,Workshop材料已经上传至会议系统,代码仓库链接在附录页。",0,0.0,请大家注意workshop材料已经上传至会议系统代码仓库链接在附录页
48
- 47.wav,0.852,"别造轮子了,直接调用YOLO V5开原库的预训练权种,快速迭带才是王道。",3,0.094,别造轮子了直接调用yolov5开[-源-]{+原+}库的预训练权[-重-]{+种+}快速迭[-代-]{+带+}才是王道
49
- 48.wav,0.817,"请确保您的设备已连接到5GHz Wifi频段,以获得最佳性能。",1,0.034,请确保您的设备已连接到5ghzwi[---]fi频段以获得最佳性能
50
- 49.wav,0.844,"我们提出了一种基于对比学习的自监督方法,在ImageNet数据集上达到了92.5%的转确率。",9,0.191,我们提出了一种基于对比学习的自监督方法在imagenet数据集上达到了[-百分之九十二点五-]{+92.5%+}的[-准-]{+转+}确率
51
- 50.wav,0.823,GPT4在代码生成和多部推理任务上展现出令人印象深刻的伦理。,4,0.133,gpt[---]4在代码生成和多[-步-]{+部+}推理任务上展现出令人印象深刻的[-能力-]{+伦理+}
52
- 51.wav,0.904,"若在高病发场景下,为启用我们的动态缓存机制,即使用RDMA网络,延迟也可能因CPU调度增强而恶化。",4,0.089,若在高[-并-]{+病+}发场景下[-未-]{+为+}启用我们的动态缓存机制即使用rdma网络延迟也可能因cpu调度[-争抢-]{+增强+}而恶化
53
- 52.wav,0.793,"准确说峰值算力是200TOPS,不是刚才说的150。",0,0.0,准确说峰值算力是200tops不是刚才说的150
54
- 53.wav,0.823,"秦始皇嬴政,书同文,车同轨,奠定了中国大一统的基础。",0,0.0,秦始皇嬴政书同文车同轨奠定了中国大一统的基础
55
- 54.wav,0.853,"诸葛娘在《出师表》中写道:鞠躬尽瘁,死而后已,成为后世臣子的楷模。",1,0.037,诸葛[-亮-]{+娘+}在出师表中写道鞠躬尽瘁死而后已成为后世臣子的楷模
56
- 55.wav,0.83,"李白的《举头望明月,低头思故乡》是连三岁孩童都能背诵的诗句。",0,0.0,李白的举头望明月低头思故乡是连三岁孩童都能背诵的诗句
57
- 56.wav,0.843,"孔子曾说:己所不欲,勿施于人。这简单的八个字,构成了儒家伦理的基石。",0,0.0,孔子曾说己所不欲勿施于人这简单的八个字构成了儒家伦理的基石
58
- 57.wav,0.849,"王羲之背后仁尊为书圣,其代表作《南庭集序》,被誉为天下第一行书。",4,0.148,王羲之[-被-]{+背+}后[-人-]{+仁+}尊为书圣其代表作[-兰亭-]{+南庭+}集序被誉为天下第一行书
59
- 58.wav,0.84,"我们要学习刨丁解牛的精神,掌握事物的客观规律才能游刃有余。",1,0.037,我们要学习[-庖-]{+刨+}丁解牛的精神掌握事物的客观规律才能游刃有余
60
- 59.wav,0.85,"向羽在鸿门宴上优柔寡断,放走了刘邦,最终兵败乌江自吻。",2,0.083,[-项-]{+向+}羽在鸿门宴上优柔寡断放走了刘邦最终兵败乌江自[-刎-]{+吻+}
61
- 60.wav,0.832,"在杭州西湖畔,人们总会想起苏氏治理西湖修筑苏迪的往事。",2,0.08,在杭州西湖畔人们总会想起苏[-轼-]{+氏+}治理西湖修筑苏[-堤-]{+迪+}的往事
62
- 61.wav,0.783,我计划去爱菲尔铁塔和卢浮宫参观。,1,0.067,我计划去[-埃-]{+爱+}菲尔铁塔和卢浮宫参观
63
- 62.wav,0.804,莎士比亚的戏剧深刻地探讨了人性的复杂性。,0,0.0,莎士比亚的戏剧深刻地探讨了人性的复杂性
64
- 63.wav,0.777,"这个消息让他仗而合上,摸不着头脑。",4,0.267,这个消息让他[-丈二和尚-]{+仗而合上+}摸不着头脑
65
- 64.wav,0.792,"画龙点睛之笔,让整个设计焕然一新。",0,0.0,画龙点睛之笔让整个设计焕然一新
66
- 65.wav,0.775,"这是卡博子技术,必须自主研发。",1,0.077,这是卡[-脖-]{+博+}子技术必须自主研发
67
- 66.wav,0.807,这幅画作以其独特的色彩运用和构图技巧而闻名。,0,0.0,这幅画作以其独特的色彩运用和构图技巧而闻名
68
- 67.wav,0.768,"患者主宿间歇性胸痛,放射至左臂。",1,0.071,患者主[-诉-]{+宿+}间歇性胸痛放射至左臂
69
- 68.wav,0.86,"患者有冠状动脉周样硬化性心脏病病死10年,慢性阻塞性肺疾病病死5年。",6,0.194,患者有冠状动脉[-粥-]{+周+}样硬化性心脏病病[-史十-]{+死10+}年慢性阻塞性肺疾病病[-史五-]{+死5+}年
70
- 69.wav,0.794,"建议行冠状动脉造影检查,必要使植入支架。",1,0.056,建议行冠状动脉造影检查必要[-时-]{+使+}植入支架
71
- 70.wav,0.833,"患者需低盐、低脂糖尿病饮食, 监测血压、血糖变化。",0,0.0,患者需低盐低脂糖尿病饮食监测血压血糖变化
72
- 71.wav,0.851,"胸部CT频扫显示,双肺散在膜玻璃样密度影,以胸膜下分布为主。",2,0.074,胸部ct[-平-]{+频+}扫显示双肺散在[-磨-]{+膜+}玻璃样密度影以胸膜下分布为主
73
- 72.wav,0.892,"知识产权归属约定,以方在履行本合同过程中所产生的全部致力成果,其知识产权归甲方所有。",3,0.075,知识产权归属约定[-乙-]{+以+}方在履行本合同过程中所产生的全部[-智-]{+致+}力成果其知识产权[-均-]归甲方所有
74
- 73.wav,0.918,"双方应履行本合同发生争议的,应首先通过友好协商解决,协商不成的,任何一方均有权向有管辖权的人民法院提起诉讼。",1,0.02,双方[-因-]{+应+}履行本合同发生争议的应首先通过友好协商解决协商不成的任何一方均有权向有管辖权的人民法院提起诉讼
75
- 74.wav,0.849,"被告在法定期限内为提交答编状,亦为道庭参加诉讼,本院依法缺席审理。",4,0.133,被告在法定期限内[-未-]{+为+}提交答[-辩-]{+编+}状亦[-未到-]{+为道+}庭参加诉讼本院依法缺席审理
76
- 75.wav,0.936,原告向本愿提出诉讼请求: 1.判令被告支付货款人民币50万8千元及逾期付款利息。 2.判令被告承担本案的诉讼费用。,8,0.16,原告向本[-院-]{+愿+}提出诉讼请求[-一-]{+1.+}判令被告支付货款人民币[-五十-]{+50+}万[-八-]{+8+}千元及逾期付款利息[-二-]{+2.+}判令被告承担本案的诉讼费用
77
- 76.wav,0.913,被执行人未按执行通知履行法律文书确定的义务。人民法院有权查封扣押、冻结、拍卖被执行人的财产。,0,0.0,被执行人未按执行通知履行法律文书确定的义务人民法院有权查封扣押冻结拍卖被执行人的财产
78
- 77.wav,0.739,这个视频太上头了。,0,0.0,这个视频太上头了
79
- 78.wav,0.737,他真是个社恐。,0,0.0,他真是个社恐
80
- 79.wav,0.774,"简直了,这躺平的状态也太佛系了吧!",0,0.0,简直了这躺平的状态也太佛系了吧
81
- 80.wav,0.746,"这个瓜有点大,我得去吃瓜了。",0,0.0,这个瓜有点大我得去吃瓜了
82
- 81.wav,0.763,"别内卷了,咱们还是多交流交流吧。",0,0.0,别内卷了咱们还是多交流交流吧
83
- 82.wav,0.873,"第二个公司我们成立了中国黄业。在中国黄业的创业经验中,有很多的经验,也是可以在这儿跟大家进行分享的。",2,0.043,第二个公司我们成立了中国黄[-页-]{+业+}在中国黄[-页-]{+业+}的创业经验中有很多的经验也是可以在这儿跟大家进行分享的
84
- 83.wav,0.865,"对于大部分在接触微积分之前,主要的学习经验就是刷题,甚至是连题也不刷的同学们来说。",0,0.0,对于大部分在接触微积分之前主要的学习经验就是刷题甚至是连题也不刷的同学们来说
85
- 84.wav,0.922,"说了两个小时,没人听懂我在说什么,最后23个人反对,一个人同意。这一个人就说:马云你这样做你就试试看,不行的话赶紧逃回来还来得及。",3,0.051,说了两个小时没人听懂我在说什么最后[-二十三-]{+23+}个人反对一个人同意这一个人就说马云你这样做你就试试看不行的话赶紧逃回来还来得及
86
- 85.wav,0.824,"晚上想想是热血沸腾。真好,第二天早上骑个自行车又上班去了,对吧?",0,0.0,晚上想想是热血沸腾真好第二天早上骑个自行车又上班去了对吧
scripts/export_onnx.py CHANGED
@@ -1,18 +1,23 @@
1
  from funasr import AutoModel
2
 
3
- model_dir = "/Users/moyoyo/code/Translator/moyoyo_asr_models"
4
- asr_model_path = model_dir + '/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
5
- vad_model_path = model_dir + '/speech_fsmn_vad_zh-cn-16k-common-pytorch'
6
- punc_model_path = model_dir + '/punc_ct-transformer_cn-en-common-vocab471067-large'
7
 
8
- model = AutoModel(model=asr_model_path)
9
- model_dir = model.export(type="onnx", quantize=True, disable_update=True)
10
- print(model_dir)
11
-
12
- model = AutoModel(model=vad_model_path)
13
- model_dir = model.export(type="onnx", quantize=True, disable_update=True)
14
- print(model_dir)
15
 
16
- model = AutoModel(model=punc_model_path)
 
17
  model_dir = model.export(type="onnx", quantize=True, disable_update=True)
18
- print(model_dir)
 
1
  from funasr import AutoModel
2
 
3
+ # model_dir = "/Users/moyoyo/code/Translator/moyoyo_asr_models"
4
+ # asr_model_path = model_dir + '/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
5
+ # vad_model_path = model_dir + '/speech_fsmn_vad_zh-cn-16k-common-pytorch'
6
+ # punc_model_path = model_dir + '/punc_ct-transformer_cn-en-common-vocab471067-large'
7
+ #
8
+ # model = AutoModel(model=asr_model_path)
9
+ # model_dir = model.export(type="onnx", quantize=True, disable_update=True)
10
+ # print(model_dir)
11
+ #
12
+ # model = AutoModel(model=vad_model_path)
13
+ # model_dir = model.export(type="onnx", quantize=True, disable_update=True)
14
+ # print(model_dir)
15
+ #
16
+ # model = AutoModel(model=punc_model_path)
17
+ # model_dir = model.export(type="onnx", quantize=True, disable_update=True)
18
+ # print(model_dir)
19
 
20
 
21
+ model_dir = "/Users/jeqin/work/code/Fun-ASR-Nano-2512"
22
+ model = AutoModel(model=model_dir, trust_remote_code=True, remote_code="./model.py")
23
  model_dir = model.export(type="onnx", quantize=True, disable_update=True)
 
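For Fun-ASR-Nano, the export call writes the ONNX graph(s) into the model directory and returns that path. A quick sanity check of the quantized graph with onnxruntime; the model_quant.onnx file name follows FunASR's usual convention and is an assumption here:

import os
import onnxruntime as ort

model_dir = "/Users/jeqin/work/code/Fun-ASR-Nano-2512"
onnx_path = os.path.join(model_dir, "model_quant.onnx")  # assumed file name

session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
for inp in session.get_inputs():
    # Expect speech features and their lengths as graph inputs.
    print(inp.name, inp.shape, inp.type)
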
scripts/model.py ADDED
@@ -0,0 +1,696 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import random
5
+ import re
6
+ import string
7
+ import time
8
+ import traceback
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from funasr import AutoModel
13
+ from funasr.metrics.compute_acc import compute_accuracy
14
+ from funasr.register import tables
15
+ from funasr.train_utils.device_funcs import force_gatherable, to_device
16
+ from funasr.utils.datadir_writer import DatadirWriter
17
+ from funasr.utils.load_utils import extract_fbank, load_audio_text_image_video
18
+ from transformers import AutoConfig, AutoModelForCausalLM
19
+
20
+ dtype_map = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}
21
+
22
+
23
+ @tables.register("model_classes", "FunASRNano")
24
+ class FunASRNano(nn.Module):
25
+ def __init__(
26
+ self,
27
+ audio_encoder: str = None,
28
+ audio_encoder_conf: dict = None,
29
+ audio_adaptor: str = None,
30
+ audio_adaptor_conf: dict = None,
31
+ llm: str = None,
32
+ llm_conf: dict = None,
33
+ input_size: int = 80,
34
+ length_normalized_loss: bool = False,
35
+ **kwargs,
36
+ ):
37
+ super().__init__()
38
+
39
+ # audio encoder
40
+ hub = audio_encoder_conf.get("hub", None)
41
+ self.audio_encoder_activation_checkpoint = audio_encoder_conf.get(
42
+ "activation_checkpoint", False
43
+ )
44
+ if hub == "ms":
45
+ model = AutoModel(model=audio_encoder, model_revision="master")
46
+ audio_encoder_output_size = (
47
+ model.model.encoder_output_size
48
+ if hasattr(model.model, "encoder_output_size")
49
+ else -1
50
+ )
51
+ audio_encoder = (
52
+ model.model.model.encoder
53
+ if hasattr(model.model, "model")
54
+ else model.model.encoder
55
+ )
56
+ else:
57
+ encoder_class = tables.encoder_classes.get(audio_encoder)
58
+ audio_encoder = encoder_class(input_size=input_size, **audio_encoder_conf)
59
+ audio_encoder_output_size = audio_encoder.output_size()
60
+ freeze = audio_encoder_conf.get("freeze", True)
61
+ freeze_layer_num = int(audio_encoder_conf.get("freeze_layer_num", -1))
62
+
63
+ if freeze:
64
+ for name, param in audio_encoder.named_parameters():
65
+ param.requires_grad = False
66
+ audio_encoder.eval()
67
+ self.audio_encoder = audio_encoder
68
+ # llm
69
+ self.llm = None
70
+ init_param_path = llm_conf.get("init_param_path", None)
71
+ llm_dim = None
72
+
73
+ llm_load_kwargs = llm_conf.get("load_kwargs", {})
74
+ config = AutoConfig.from_pretrained(init_param_path)
75
+ model = AutoModelForCausalLM.from_config(config, **llm_load_kwargs)
76
+
77
+ freeze = llm_conf.get("freeze", True)
78
+ if freeze:
79
+ for name, param in model.named_parameters():
80
+ param.requires_grad = False
81
+ model.eval()
82
+ logging.info(f"use_lora: {llm_conf.get('use_lora', False)}")
83
+ if llm_conf.get("use_lora", False):
84
+ from omegaconf import DictConfig, OmegaConf
85
+
86
+ lora_conf = llm_conf.get("lora_conf", {})
87
+ if isinstance(lora_conf, (OmegaConf, DictConfig)):
88
+ lora_conf = OmegaConf.to_container(lora_conf, resolve=True)
89
+ from peft import LoraConfig, PeftModel, get_peft_model
90
+
91
+ lora_init_param_path = lora_conf.get("init_param_path", None)
92
+ if lora_init_param_path is not None:
93
+ logging.info(f"lora_init_param_path: {lora_init_param_path}")
94
+ model = PeftModel.from_pretrained(model, lora_init_param_path)
95
+ for name, param in model.named_parameters():
96
+ if not lora_conf.get("freeze_lora", False):
97
+ if "lora_" in name:
98
+ param.requires_grad = True
99
+ else:
100
+ peft_config = LoraConfig(**lora_conf)
101
+ model = get_peft_model(model, peft_config)
102
+ model.print_trainable_parameters()
103
+
104
+ if llm_conf.get("activation_checkpoint", False):
105
+ model.gradient_checkpointing_enable()
106
+
107
+ self.llm_dtype = llm_conf.get("llm_dtype", "fp32")
108
+ self.llm = model.to(dtype_map[self.llm_dtype])
109
+ llm_dim = model.get_input_embeddings().weight.shape[-1]
110
+
111
+ # adaptor
112
+ adaptor_class = tables.adaptor_classes.get(audio_adaptor)
113
+ if audio_encoder_output_size > 0:
114
+ audio_adaptor_conf["encoder_dim"] = audio_encoder_output_size
115
+ audio_adaptor_conf["llm_dim"] = (
116
+ llm_dim if llm_dim is not None else audio_adaptor_conf["llm_dim"]
117
+ )
118
+ audio_adaptor = adaptor_class(**audio_adaptor_conf)
119
+ freeze = audio_adaptor_conf.get("freeze", False)
120
+ if freeze:
121
+ for name, param in audio_adaptor.named_parameters():
122
+ param.requires_grad = False
123
+ audio_adaptor.eval()
124
+ self.audio_adaptor = audio_adaptor
125
+ self.use_low_frame_rate = audio_adaptor_conf.get("use_low_frame_rate", False)
126
+
127
+ self.length_normalized_loss = length_normalized_loss
128
+ rank = int(os.environ.get("RANK", 0))
129
+ logging.info(f"rank: {rank}, model is builded.")
130
+
131
+ def forward(
132
+ self,
133
+ speech: torch.Tensor = None,
134
+ speech_lengths: torch.Tensor = None,
135
+ input_ids: torch.Tensor = None,
136
+ attention_mask: torch.Tensor = None,
137
+ labels_ids: torch.Tensor = None,
138
+ fbank_beg: torch.Tensor = None,
139
+ fbank_mask: torch.Tensor = None,
140
+ **kwargs,
141
+ ):
142
+ batch_size, token_num = input_ids.shape
143
+ stats = {}
144
+ input_ids[input_ids < 0] = 0
145
+ inputs_embeds = self.llm.model.get_input_embeddings()(input_ids)
146
+ if speech is not None:
147
+ if len(speech_lengths.size()) > 1:
148
+ speech_lengths = speech_lengths[:, 0]
149
+ batch_size_speech, frames, _ = speech.shape
150
+
151
+ # audio encoder
152
+ if self.audio_encoder_activation_checkpoint:
153
+ from torch.utils.checkpoint import checkpoint
154
+
155
+ encoder_out, encoder_out_lens = checkpoint(
156
+ self.encode, speech, speech_lengths, use_reentrant=False
157
+ )
158
+ else:
159
+ encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
160
+
161
+ # audio_adaptor
162
+ encoder_out, encoder_out_lens = self.audio_adaptor(
163
+ encoder_out, encoder_out_lens
164
+ )
165
+
166
+ batch_size, token_num, dims = inputs_embeds.shape
167
+ fake_token_len = kwargs.get("fake_token_len")
168
+ fake_token_len[fake_token_len < 0] = 0
169
+ fbank_beg[fbank_beg < 0] = 0
170
+
171
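+ # Splice the audio features into the text embedding sequence in place:
+ # fbank_beg marks where each turn's placeholder tokens start, and
+ # fake_token_len of them are overwritten with the audio-adaptor outputs.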
+ speech_idx = 0
172
+ for batch_idx in range(batch_size):
173
+ for turn_id in range(fbank_beg.shape[1]):
174
+ fbank_beg_idx = fbank_beg[batch_idx, turn_id].item()
175
+ if fbank_beg_idx > 0:
176
+ speech_token_len = fake_token_len[batch_idx, turn_id]
177
+ speech_token = encoder_out[speech_idx, :speech_token_len, :]
178
+
179
+ try:
180
+ inputs_embeds[
181
+ batch_idx,
182
+ fbank_beg_idx : fbank_beg_idx + speech_token_len,
183
+ :,
184
+ ] = speech_token
185
+ except Exception as e:
186
+ logging.error(f"{str(e)}, {traceback.format_exc()}")
187
+ logging.info(
188
+ f"batch_idx: {batch_idx}, inputs_embeds: {inputs_embeds.shape}, fbank_beg_idx: {fbank_beg_idx}, speech_token_len: {speech_token_len}, encoder_out: {encoder_out.shape}, encoder_out_lens: {encoder_out_lens}, fake_token_len: {fake_token_len}, speech_lengths: {speech_lengths}"
189
+ )
190
+ speech_token_len = encoder_out_lens[speech_idx].item()
191
+ speech_token = encoder_out[speech_idx, :speech_token_len, :]
192
+ inputs_embeds[
193
+ batch_idx,
194
+ fbank_beg_idx : fbank_beg_idx + speech_token_len,
195
+ :,
196
+ ] = speech_token
197
+
198
+ speech_idx += 1
199
+
200
+ stats["batch_size_speech"] = batch_size_speech
201
+ stats["batch_size_x_frames"] = frames * batch_size_speech
202
+ stats["batch_size_real_frames"] = speech_lengths.sum().item()
203
+ stats["padding_frames"] = (
204
+ stats["batch_size_x_frames"] - stats["batch_size_real_frames"]
205
+ )
206
+
207
+ device_type = next(self.parameters()).device.type
208
+ with torch.autocast(
209
+ device_type=device_type if device_type in ["cuda", "mps"] else "cpu",
210
+ enabled=True if self.llm_dtype != "fp32" else False,
211
+ dtype=dtype_map[self.llm_dtype],
212
+ ):
213
+ labels_ids[labels_ids == -1] = -100
214
+ attention_mask[attention_mask < 0] = 0
215
+ model_outputs = self.llm(
216
+ inputs_embeds=inputs_embeds.to(dtype_map[self.llm_dtype]),
217
+ attention_mask=attention_mask,
218
+ labels=labels_ids,
219
+ )
220
+ loss = model_outputs.loss
221
+
222
+ with torch.no_grad():
223
+ preds = torch.argmax(model_outputs.logits, -1)
224
+ acc_att = compute_accuracy(
225
+ preds[:, :-1], labels_ids[:, 1:], ignore_label=-100
226
+ )
227
+ stats["acc"] = acc_att
228
+
229
+ stats["loss"] = torch.clone(loss.detach())
230
+ stats["batch_size"] = batch_size
231
+
232
+ stats["batch_size_x_tokens"] = token_num * batch_size
233
+ stats["batch_size_real_tokens"] = attention_mask.sum().item()
234
+ stats["padding_tokens"] = (
235
+ stats["batch_size_x_tokens"] - stats["batch_size_real_tokens"]
236
+ )
237
+
238
+ dialog_turns = (fbank_beg > 0).sum(-1)
239
+ dialog_turns_max = torch.max(dialog_turns).int().item()
240
+ dialog_turns_avg = dialog_turns.sum().item() / batch_size
241
+ stats["dialog_turns_max"] = dialog_turns_max
242
+ stats["dialog_turns_avg"] = dialog_turns_avg
243
+
244
+ # force_gatherable: to-device and to-tensor if scalar for DataParallel
245
+ if self.length_normalized_loss:
246
+ batch_size = int((labels_ids > 0 + 1).sum())
247
+ loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
248
+ return loss, stats, weight
249
+
250
+ def forward_export(self, speech, speech_lengths, **kwargs):
251
+ x, olens = self.audio_encoder(speech, speech_lengths)
252
+ encoder_out, encoder_out_lens = self.audio_adaptor(x, olens)
253
+ return encoder_out, encoder_out_lens
254
+
255
+ def encode(self, speech, speech_lengths):
256
+ # audio encoder
257
+ encoder_out, encoder_out_lens = self.audio_encoder(speech, speech_lengths)
258
+
259
+ return encoder_out, encoder_out_lens
260
+
261
+ def data_template(self, data):
262
+ system, user, assistant = [], [], []
263
+ for i, item in enumerate(data):
264
+ role = item["role"]
265
+ content = item["content"]
266
+ if role == "system":
267
+ system.append(content)
268
+ elif role == "user":
269
+ if "audio" in item:
270
+ audio = item["audio"]
271
+ content = [content, audio]
272
+ user.append(content)
273
+ elif role == "assistant":
274
+ assistant.append(content)
275
+
276
+ system = system * len(user)
277
+
278
+ contents = {
279
+ "system": system,
280
+ "user": user,
281
+ "assistant": assistant,
282
+ }
283
+
284
+ return contents
285
+
286
+ def data_load_speech(
287
+ self, contents: dict, tokenizer, frontend, meta_data={}, **kwargs
288
+ ):
289
+ system = contents["system"]
290
+ user = contents["user"]
291
+ assistant = contents["assistant"]
292
+ pattern = re.compile(r"(<\|startofspeech\|>.*?<\|endofspeech\|>)")
293
+ do_think = True
294
+ sys_prompt = True
295
+ if "dataset_conf" in kwargs:
296
+ do_think = kwargs["dataset_conf"].get("do_think", True)
297
+ sys_prompt = kwargs["dataset_conf"].get("sys_prompt", True)
298
+
299
+ input_ids, labels, fbank, fbank_lens, fbank_mask, fbank_beg, fake_token_len = (
300
+ [],
301
+ [],
302
+ [],
303
+ [],
304
+ [],
305
+ [],
306
+ [],
307
+ )
308
+ input_source_ids = []
309
+ for i, (system_prompt, user_prompt, target_out) in enumerate(
310
+ zip(system, user, assistant)
311
+ ):
312
+ if i >= kwargs.get("multiturn_num_max", 5):
313
+ break
314
+ if len(input_ids) > kwargs.get("max_token_length", 1500):
315
+ break
316
+ if isinstance(user_prompt, (list, tuple)):
317
+ user_prompt, audio = user_prompt
318
+ if i == 0:
319
+ if kwargs.get("infer_with_assistant_input", False):
320
+ source_input = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}"
321
+ if not sys_prompt:
322
+ source_input = f"<|im_start|>user\n{user_prompt}"
323
+ else:
324
+ source_input = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
325
+ if not sys_prompt:
326
+ source_input = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
327
+ else:
328
+ if kwargs.get("infer_with_assistant_input", False):
329
+ source_input = f"<|im_start|>user\n{user_prompt}"
330
+ else:
331
+ source_input = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
332
+ if not do_think:
333
+ source_input += "<think>\n\n</think>\n\n"
334
+
335
+ splits = pattern.split(source_input)
336
+ source_ids = []
337
+ fbank_mask_i = []
338
+ fake_token_len_i = 0
339
+ fbank_beg_i = -1
340
+ speech, speech_lengths = [], []
341
+ for k, sub_str in enumerate(splits):
342
+ if not sub_str.startswith("<|startofspeech|>"):
343
+ sub_token = tokenizer.encode(sub_str)
344
+ source_ids += sub_token
345
+ fbank_mask_i += [0] * len(sub_token)
346
+ else:
347
+ sub_str = sub_str.replace("<|startofspeech|>", "").replace(
348
+ "<|endofspeech|>", ""
349
+ )
350
+ if sub_str.startswith("!"):
351
+ sub_str = sub_str[1:]
352
+ if sub_str.startswith("!"): # !!: audio sample point
353
+ sub_str = audio
354
+ try:
355
+ time1 = time.perf_counter()
356
+ data_src = load_audio_text_image_video(
357
+ sub_str, fs=frontend.fs, **kwargs
358
+ )
359
+ time2 = time.perf_counter()
360
+ meta_data["load_data"] = f"{time2 - time1:0.3f}"
361
+ except Exception as e:
362
+ logging.error(
363
+ f"Loading wav failed! {str(e)}, {traceback.format_exc()}"
364
+ )
365
+
366
+ speech, speech_lengths = extract_fbank(
367
+ data_src,
368
+ data_type=kwargs.get("data_type", "sound"),
369
+ frontend=frontend,
370
+ is_final=True,
371
+ ) # speech: [b, T, d]
372
+
373
+ time3 = time.perf_counter()
374
+ meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
375
+ meta_data["batch_data_time"] = (
376
+ speech_lengths.sum().item()
377
+ * frontend.frame_shift
378
+ * frontend.lfr_n
379
+ / 1000
380
+ )
381
+
382
+ if self.use_low_frame_rate:
383
+ olens = 1 + (speech_lengths[0].item() - 3 + 2 * 1) // 2
384
+ olens = 1 + (olens - 3 + 2 * 1) // 2
385
+ fake_token_len_i = (olens - 1) // 2 + 1
386
+ else:
387
+ fake_token_len_i = speech_lengths[0].item()
388
+ fake_token = [0] * fake_token_len_i
389
+ fbank_beg_i = len(source_ids)
390
+ source_ids += fake_token
391
+ fbank_mask_i += [1] * len(fake_token)
392
+
393
+ fbank_beg += [fbank_beg_i + len(input_ids)]
394
+ fake_token_len += [fake_token_len_i]
395
+ source_mask = [-100] * len(source_ids)
396
+ target_out = f"{target_out}<|im_end|>"
397
+ target_ids = tokenizer.encode(target_out)
398
+ input_source_ids = input_ids + source_ids
399
+ input_ids += source_ids + target_ids
400
+ labels += source_mask + target_ids
401
+ fbank_mask += fbank_mask_i
402
+ if len(speech) > 0:
403
+ fbank.append(speech[0, :, :])
404
+ fbank_lens.append(speech_lengths)
405
+
406
+ input_ids = torch.tensor(
407
+ input_ids, dtype=torch.int64
408
+ ) # [: self.max_token_length]
409
+ attention_mask = torch.tensor([1] * len(input_ids), dtype=torch.int32)
410
+ labels = torch.tensor(labels, dtype=torch.int64) # [: self.max_token_length]
411
+
412
+ fbank_mask = torch.tensor(fbank_mask, dtype=torch.float32)
413
+ fbank_beg = torch.tensor(fbank_beg, dtype=torch.int32)
414
+ fake_token_len = torch.tensor(fake_token_len, dtype=torch.int32)
415
+ source_ids = torch.tensor(input_source_ids, dtype=torch.int64)
416
+ target_ids = torch.tensor(target_ids, dtype=torch.int64)
417
+
418
+ if len(fbank) > 0:
419
+ speech = torch.nn.utils.rnn.pad_sequence(
420
+ fbank, batch_first=True, padding_value=0.0
421
+ )
422
+ speech_lengths = torch.nn.utils.rnn.pad_sequence(
423
+ fbank_lens, batch_first=True, padding_value=-1
424
+ )
425
+ else:
426
+ speech = []
427
+ speech_lengths = []
428
+ output = {
429
+ "speech": speech,
430
+ "speech_lengths": speech_lengths,
431
+ "fbank_mask": fbank_mask[None, :],
432
+ "fbank_beg": fbank_beg[None,],
433
+ "fake_token_len": fake_token_len[None, :],
434
+ "input_ids": input_ids[None,],
435
+ "attention_mask": attention_mask[None,],
436
+ "labels_ids": labels,
437
+ "source_ids": source_ids[None, :],
438
+ "target_ids": target_ids[None, :],
439
+ }
440
+
441
+ return output
442
+
443
+ def inference_prepare(
444
+ self,
445
+ data_in,
446
+ data_lengths=None,
447
+ key: list = None,
448
+ tokenizer=None,
449
+ frontend=None,
450
+ **kwargs,
451
+ ):
452
+ meta_data = {}
453
+
454
+ if kwargs.get("batch_size", 1) > 1:
455
+ raise NotImplementedError("batch decoding is not implemented")
456
+
457
+ contents = self.data_template(data_in[0])
458
+ output = self.data_load_speech(
459
+ contents, tokenizer, frontend, meta_data=meta_data, **kwargs
460
+ )
461
+ batch = to_device(output, kwargs["device"])
462
+
463
+ # audio encoder
464
+ speech = batch["speech"]
465
+
466
+ if len(speech) > 0:
467
+ if "audio_embedding" in kwargs and "audio_embedding_lens" in kwargs:
468
+ encoder_out = kwargs["audio_embedding"]
469
+ encoder_out_lens = kwargs["audio_embedding_lens"]
470
+ else:
471
+ speech_lengths = batch["speech_lengths"][:, 0]
472
+ # fp16
473
+ if kwargs.get("fp16", False):
474
+ speech = speech.to(torch.float16)
475
+ elif kwargs.get("bf16", False):
476
+ speech = speech.to(torch.bfloat16)
477
+ # audio encoder
478
+ encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
479
+
480
+ # audio_adaptor
481
+ encoder_out, encoder_out_lens = self.audio_adaptor(
482
+ encoder_out, encoder_out_lens
483
+ )
484
+ meta_data["audio_adaptor_out"] = encoder_out
485
+ meta_data["audio_adaptor_out_lens"] = encoder_out_lens
486
+
487
+ input_ids = batch["input_ids"]
488
+ source_ids = batch["source_ids"]
489
+ fbank_beg = batch["fbank_beg"]
490
+ fake_token_len = batch["fake_token_len"]
491
+
492
+ if not kwargs.get("tearchforing", False):
493
+ input_ids = source_ids
494
+
495
+ input_ids[input_ids < 0] = 0
496
+ inputs_embeds = self.llm.model.get_input_embeddings()(input_ids)
497
+
498
+ batch_size, token_num, dims = inputs_embeds.shape
499
+
500
+ fake_token_len[fake_token_len < 0] = 0
501
+ fbank_beg[fbank_beg < 0] = 0
502
+
503
+ speech_idx = 0
504
+ for batch_idx in range(batch_size):
505
+ for turn_id in range(fbank_beg.shape[1]):
506
+ fbank_beg_idx = fbank_beg[batch_idx, turn_id].item()
507
+ if fbank_beg_idx > 0:
508
+ speech_token_len = fake_token_len[batch_idx, turn_id]
509
+ speech_token = encoder_out[speech_idx, :speech_token_len, :]
510
+
511
+ try:
512
+ inputs_embeds[
513
+ batch_idx,
514
+ fbank_beg_idx : fbank_beg_idx + speech_token_len,
515
+ :,
516
+ ] = speech_token
517
+ except Exception as e:
518
+ #
519
+ logging.error(f"{str(e)}, {traceback.format_exc()}")
520
+ logging.info(
521
+ f"batch_idx: {batch_idx}, inputs_embeds: {inputs_embeds.shape}, fbank_beg_idx: {fbank_beg_idx}, speech_token_len: {speech_token_len}, encoder_out: {encoder_out.shape}, encoder_out_lens: {encoder_out_lens}, fake_token_len: {fake_token_len}, speech_lengths: {speech_lengths}"
522
+ )
523
+ speech_token_len = encoder_out_lens[speech_idx].item()
524
+ speech_token = encoder_out[speech_idx, :speech_token_len, :]
525
+ inputs_embeds[
526
+ batch_idx,
527
+ fbank_beg_idx : fbank_beg_idx + speech_token_len,
528
+ :,
529
+ ] = speech_token
530
+
531
+ speech_idx += 1
532
+ return inputs_embeds, contents, batch, source_ids, meta_data
533
+
534
+ def inference(
535
+ self,
536
+ data_in,
537
+ data_lengths=None,
538
+ key: list = None,
539
+ tokenizer=None,
540
+ frontend=None,
541
+ **kwargs,
542
+ ):
543
+ hotwords = kwargs.get("hotwords", [])
544
+ if len(hotwords) > 0:
545
+ hotwords = ", ".join(hotwords)
546
+ prompt = f"请结合上下文信息,更加准确地完成语音转写任务。如果没有相关信息,我们会留空。\n\n\n**上下文信息:**\n\n\n"
547
+ prompt += f"热词列表:[{hotwords}]\n"
548
+ else:
549
+ prompt = ""
550
+ language = kwargs.get("language", None)
551
+ if language is None:
552
+ prompt += "语音转写"
553
+ else:
554
+ prompt += f"语音转写成{language}"
555
+ itn = kwargs.get("itn", True)
556
+ if not itn:
557
+ prompt += ",不进行文本规整"
558
+ prompt += ":"
559
+
560
+ new_data_in = []
561
+ for data in data_in:
562
+ if isinstance(data, str):
563
+ new_data_in.append(
564
+ [
565
+ {"role": "system", "content": "You are a helpful assistant."},
566
+ {
567
+ "role": "user",
568
+ "content": f"{prompt}<|startofspeech|>!{data}<|endofspeech|>",
569
+ },
570
+ {"role": "assistant", "content": "null"},
571
+ ]
572
+ )
573
+ elif isinstance(data, torch.Tensor):
574
+ new_data_in.append(
575
+ [
576
+ {"role": "system", "content": "You are a helpful assistant."},
577
+ {
578
+ "role": "user",
579
+ "content": f"{prompt}<|startofspeech|>!!<|endofspeech|>",
580
+ "audio": data,
581
+ },
582
+ {"role": "assistant", "content": "null"},
583
+ ]
584
+ )
585
+ data_in = new_data_in
586
+
587
+ if key is None:
588
+ key = []
589
+ for _ in data_in:
590
+ chars = string.ascii_letters + string.digits
591
+ key.append(
592
+ "rand_key_" + "".join(random.choice(chars) for _ in range(13))
593
+ )
594
+
595
+ return self.inference_llm(
596
+ data_in,
597
+ data_lengths=data_lengths,
598
+ key=key,
599
+ tokenizer=tokenizer,
600
+ frontend=frontend,
601
+ **kwargs,
602
+ )
603
+
604
+ def inference_llm(
605
+ self,
606
+ data_in,
607
+ data_lengths=None,
608
+ key: list = None,
609
+ tokenizer=None,
610
+ frontend=None,
611
+ **kwargs,
612
+ ):
613
+ inputs_embeds, contents, batch, source_ids, meta_data = self.inference_prepare(
614
+ data_in, data_lengths, key, tokenizer, frontend, **kwargs
615
+ )
616
+ llm_dtype = kwargs.get("llm_dtype", "fp32")
617
+ if llm_dtype == "fp32":
618
+ llm_dtype = "fp16" if kwargs.get("fp16", False) else llm_dtype
619
+ llm_dtype = "bf16" if kwargs.get("bf16", False) else llm_dtype
620
+
621
+ device_type = torch.device(kwargs.get("device", "cuda")).type
622
+ with torch.autocast(
623
+ device_type=device_type if device_type in ["cuda", "mps"] else "cpu",
624
+ enabled=True if llm_dtype != "fp32" else False,
625
+ dtype=dtype_map[llm_dtype]
626
+ ):
627
+ label = contents["assistant"][-1]
628
+ self.llm = self.llm.to(dtype_map[llm_dtype])
629
+ inputs_embeds = inputs_embeds.to(dtype_map[llm_dtype])
630
+ llm_kwargs = kwargs.get("llm_kwargs", {})
631
+ if not kwargs.get("teachforing", False):
632
+ generated_ids = self.llm.generate(
633
+ inputs_embeds=inputs_embeds,
634
+ max_new_tokens=kwargs.get("max_length", 512),
635
+ **llm_kwargs,
636
+ )
637
+
638
+ response = tokenizer.batch_decode(
639
+ generated_ids,
640
+ skip_special_tokens=kwargs.get("skip_special_tokens", True),
641
+ )[0]
642
+
643
+ loss = None
644
+ else:
645
+ labels_ids = batch["labels_ids"]
646
+ labels_ids[labels_ids == -1] = -100
647
+ attention_mask = batch.get("attention_mask", None)
648
+ model_outputs = self.llm(
649
+ inputs_embeds=inputs_embeds,
650
+ attention_mask=attention_mask,
651
+ labels=labels_ids,
652
+ **llm_kwargs,
653
+ )
654
+
655
+ preds = torch.argmax(model_outputs.logits, -1)[:, source_ids.shape[1] :]
656
+ response = tokenizer.batch_decode(
657
+ preds,
658
+ add_special_tokens=False,
659
+ skip_special_tokens=kwargs.get("skip_special_tokens", True),
660
+ )[0]
661
+ loss = model_outputs.loss.item()
662
+
663
+ ibest_writer = None
664
+ if kwargs.get("output_dir") is not None:
665
+ if not hasattr(self, "writer"):
666
+ self.writer = DatadirWriter(kwargs.get("output_dir"))
667
+ ibest_writer = self.writer[f"{0 + 1}best_recog"]
668
+
669
+ results = []
670
+ response_clean = re.sub(r"[^\w\s\u3000\u4e00-\u9fff]+", "", response)
671
+ result_i = {
672
+ "key": key[0],
673
+ "text": re.sub(r'\s+', ' ', response.replace("/sil", " ")),
674
+ "text_tn": response_clean,
675
+ "label": label,
676
+ }
677
+ if loss is not None:
678
+ result_i["loss"] = loss
679
+ results.append(result_i)
680
+
681
+ if ibest_writer is not None:
682
+ ibest_writer["text"][key[0]] = response.replace("\n", " ")
683
+ ibest_writer["label"][key[0]] = label.replace("\n", " ")
684
+ ibest_writer["text_tn"][key[0]] = response_clean
685
+
686
+ return results, meta_data
687
+
688
+ @staticmethod
689
+ def from_pretrained(model: str = None, **kwargs):
690
+ from funasr import AutoModel
691
+
692
+ model, kwargs = AutoModel.build_model(
693
+ model=model, trust_remote_code=True, **kwargs
694
+ )
695
+
696
+ return model, kwargs
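
For reference, `inference` above wraps each input into a fixed chat template before delegating to `inference_llm`. A minimal sketch of the message list it assembles for one file-path input (the path is a placeholder; the hotword/language/itn options are folded into the user prompt exactly as the method builds it):

    # Sketch of the per-utterance message list built by inference().
    # A single "!" before the path tells data_load_speech() to load a file;
    # "!!" means raw samples are passed in the "audio" field instead.
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "语音转写成中文:<|startofspeech|>!/path/to/utt.wav<|endofspeech|>",
        },
        {"role": "assistant", "content": "null"},
    ]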
scripts/run_funasr.py CHANGED
@@ -25,9 +25,9 @@ def main():
     )
     t1 = time.time()
     print("load model: ", t1 - t0)
-    audios = Path("/test_data/audio_clips/")
+    audios = Path("../test_data/recordings/")
     rows = [["file_name", "inference_time", "inference_result"]]
-    for audio in sorted(audios.glob("*mix/*")):
+    for audio in sorted(audios.glob("*.wav"), key=lambda x: int(x.stem)):
         print(audio)
         t1 = time.time()
         try:
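
The new glob sorts by `int(x.stem)`, which assumes every recording is named like `1.wav`, `2.wav`, and so on. If stray non-numeric files can land in the folder, a tolerant key such as this hypothetical helper avoids a `ValueError`:

    # Hypothetical sort key: numeric stems in numeric order first,
    # anything else lexically after them.
    def numeric_first(p):
        return (0, int(p.stem), "") if p.stem.isdigit() else (1, 0, p.stem)

    # usage: sorted(audios.glob("*.wav"), key=numeric_first)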
scripts/run_funasr_mlt_nano.py ADDED
@@ -0,0 +1,160 @@
+from pathlib import Path
+import time
+import csv
+from funasr import AutoModel
+
+
+def main():
+    device = "mps"
+    model_dir = "/Users/jeqin/work/code/Fun-ASR-Nano-2512"
+    model = AutoModel(
+        model=model_dir,
+        trust_remote_code=True,
+        remote_code="./model.py",
+        device=device,
+    )
+
+    wav_path = "/Users/jeqin/work/code/TestTranslator/test_data/audio_clips/zhengyaowei-part1.mp3"
+    res = model.generate(
+        input=[wav_path],
+        cache={},
+        batch_size=1,
+        # hotwords=["开放时间"],
+        # Chinese, English, Japanese for Fun-ASR-Nano-2512
+        # Chinese, English, Cantonese, Japanese, Korean, Vietnamese, Indonesian, Thai, Malay, Filipino, Arabic,
+        # Hindi, Bulgarian, Croatian, Czech, Danish, Dutch, Estonian, Finnish, Greek,
+        # Hungarian, Irish, Latvian, Lithuanian, Maltese, Polish, Portuguese, Romanian,
+        # Slovak, Slovenian, Swedish for Fun-ASR-MLT-Nano-2512
+        language="中文",
+        itn=True,  # or False
+    )
+    text = res[0]["text"]
+    print(text)
+    text = model.generate(input=[wav_path],
+                          cache={},
+                          batch_size=1,
+                          # hotwords=["开放时间"],
+                          # language="中文",
+                          itn=True,  # or False
+                          )[0]["text"]
+    print(text)
+    text = model.generate(input=[wav_path],
+                          cache={},
+                          batch_size=1,
+                          hotwords=["头数", "llama", "decode", "query"],
+                          # language="中文",
+                          itn=True,  # or False
+                          )[0]["text"]
+    print(text)
+
+    # model = AutoModel(
+    #     model=model_dir,
+    #     trust_remote_code=True,
+    #     vad_model="fsmn-vad",
+    #     vad_kwargs={"max_single_segment_time": 30000},
+    #     remote_code="./model.py",
+    #     device=device,
+    # )
+    # res = model.generate(input=[wav_path], cache={}, batch_size=1)
+    # text = res[0]["text"]
+    # print(text)
+
+def save_csv(file_path, rows):
+    with open(file_path, "w", encoding="utf-8") as f:
+        writer = csv.writer(f)
+        writer.writerows(rows)
+    print(f"write csv to {file_path}")
+
+def load_model():
+    device = "mps"
+    s = time.time()
+    model_dir = "/Users/jeqin/work/code/Fun-ASR-MLT-Nano-2512"
+    model = AutoModel(
+        model=model_dir,
+        trust_remote_code=True,
+        remote_code="./model.py",
+        device=device,
+        disable_update=True,
+    )
+    print("load model cost:", time.time() - s)
+    return model
+
+def inference(model, wav_path):
+    t1 = time.time()
+    # res = model.generate(input=[wav_path], cache={}, batch_size=1)
+    res = model.generate(
+        input=[str(wav_path)],
+        cache={},
+        # batch_size=1,
+        hotwords=["开放时间", "llama", "decode"],
+        # Chinese, English, Japanese for Fun-ASR-Nano-2512
+        # Chinese, English, Cantonese, Japanese, Korean, Vietnamese, Indonesian, Thai, Malay, Filipino, Arabic,
+        # Hindi, Bulgarian, Croatian, Czech, Danish, Dutch, Estonian, Finnish, Greek,
+        # Hungarian, Irish, Latvian, Lithuanian, Maltese, Polish, Portuguese, Romanian,
+        # Slovak, Slovenian, Swedish for Fun-ASR-MLT-Nano-2512
+        language="中文",
+        itn=True,  # or False
+    )
+    text = res[0]["text"]
+    return text, time.time() - t1
+
+def run_audio_clips():
+    model = load_model()
+    audios = Path("/Users/jeqin/work/code/TestTranslator/test_data/audio_clips/10s-mix")
+    rows = [["file_name", "time", "inference_result"]]
+    for audio in sorted(audios.glob("*.wav")):
+        print(audio)
+        text, cost = inference(model, audio)
+        print("inference cost: ", cost)
+        print(text)
+        rows.append([audio.name, round(cost, 3), text])  # f"{audio.parent.name}/{audio.name}"
+    file_name = "csv/funasr_nano.csv"
+    # save_csv(file_name, rows)
+
+
+def run_recordings():
+    from scripts.asr_utils import get_origin_text_dict, get_text_distance
+    model = load_model()
+    audios = Path("/Users/jeqin/work/code/TestTranslator/test_data/recordings/")
+    rows = [["file_name", "time", "inference_result"]]
+    original = get_origin_text_dict()
+    for audio in sorted(audios.glob("*.wav"), key=lambda x: int(x.stem)):
+        print(audio)
+        text, cost = inference(model, audio)
+        print("inference cost: ", cost)
+        print(text)
+        d, nd, diff = get_text_distance(original[audio.stem], text)
+        rows.append([audio.name, round(cost, 3), text, d, diff])  # f"{audio.parent.name}/{audio.name}"
+    file_name = "csv/funasr_mlt_nano_recordings.csv"
+    save_csv(file_name, rows)
+
+def run_test_wenet():
+    from test_data.audios import read_wenet
+    model = load_model()
+    result_list = []
+    count = 0
+    for audio, sentence in read_wenet(count_limit=5000):
+        count += 1
+        print(f"processing {count}: {audio}")
+        text, cost = inference(model, audio)
+        print("inference time:", cost)
+        result_list.append({
+            "index": count,
+            "audio_path": audio.name,
+            "reference": sentence,
+            # "duration": duration,
+            "inference_time": round(cost, 3),
+            "inference_result": text
+        })
+        print("inference cost: ", cost)
+        print(text)
+
+    import json
+    with open("csv/funasr_mlt_nano_wenet.json", "w", encoding="utf-8") as f:
+        json.dump(result_list, f, ensure_ascii=False, indent=2)
+
+if __name__ == "__main__":
+    # main()
+    run_recordings()
+    # run_audio_clips()
+    # run_test_wenet()
scripts/run_funasr_nano.py ADDED
@@ -0,0 +1,161 @@
+from pathlib import Path
+import time
+import csv
+from funasr import AutoModel
+
+
+def main():
+    device = "mps"
+    model_dir = "/Users/jeqin/work/code/Fun-ASR-Nano-2512"
+    model = AutoModel(
+        model=model_dir,
+        trust_remote_code=True,
+        remote_code="./model.py",
+        device=device,
+    )
+
+    wav_path = "/Users/jeqin/work/code/TestTranslator/test_data/audio_clips/zhengyaowei-part1.mp3"
+    res = model.generate(
+        input=[wav_path],
+        cache={},
+        batch_size=1,
+        # hotwords=["开放时间"],
+        # Chinese, English, Japanese for Fun-ASR-Nano-2512
+        # Chinese, English, Cantonese, Japanese, Korean, Vietnamese, Indonesian, Thai, Malay, Filipino, Arabic,
+        # Hindi, Bulgarian, Croatian, Czech, Danish, Dutch, Estonian, Finnish, Greek,
+        # Hungarian, Irish, Latvian, Lithuanian, Maltese, Polish, Portuguese, Romanian,
+        # Slovak, Slovenian, Swedish for Fun-ASR-MLT-Nano-2512
+        language="中文",
+        itn=True,  # or False
+    )
+    text = res[0]["text"]
+    print(text)
+    text = model.generate(input=[wav_path],
+                          cache={},
+                          batch_size=1,
+                          # hotwords=["开放时间"],
+                          # language="中文",
+                          itn=True,  # or False
+                          )[0]["text"]
+    print(text)
+    text = model.generate(input=[wav_path],
+                          cache={},
+                          batch_size=1,
+                          hotwords=["头数", "llama", "decode", "query"],
+                          # language="中文",
+                          itn=True,  # or False
+                          )[0]["text"]
+    print(text)
+
+    # model = AutoModel(
+    #     model=model_dir,
+    #     trust_remote_code=True,
+    #     vad_model="fsmn-vad",
+    #     vad_kwargs={"max_single_segment_time": 30000},
+    #     remote_code="./model.py",
+    #     device=device,
+    # )
+    # res = model.generate(input=[wav_path], cache={}, batch_size=1)
+    # text = res[0]["text"]
+    # print(text)
+
+def save_csv(file_path, rows):
+    with open(file_path, "w", encoding="utf-8") as f:
+        writer = csv.writer(f)
+        writer.writerows(rows)
+    print(f"write csv to {file_path}")
+
+def load_model():
+    device = "mps"
+    s = time.time()
+    # model_dir = "/Users/jeqin/work/code/Fun-ASR-Nano-2512"
+    model_dir = "/Users/jeqin/work/code/Fun-ASR-MLT-Nano-2512"
+    model = AutoModel(
+        model=model_dir,
+        trust_remote_code=True,
+        remote_code="./model.py",
+        device=device,
+        disable_update=True,
+    )
+    print("load model cost:", time.time() - s)
+    return model
+
+def inference(model, wav_path):
+    t1 = time.time()
+    res = model.generate(input=[str(wav_path)], cache={}, batch_size=1)
+    # res = model.generate(
+    #     input=[str(wav_path)],
+    #     cache={},
+    #     # batch_size=1,
+    #     hotwords=["开放时间", "llama", "decode"],
+    #     # Chinese, English, Japanese for Fun-ASR-Nano-2512
+    #     # Chinese, English, Cantonese, Japanese, Korean, Vietnamese, Indonesian, Thai, Malay, Filipino, Arabic,
+    #     # Hindi, Bulgarian, Croatian, Czech, Danish, Dutch, Estonian, Finnish, Greek, Hungarian, Irish, Latvian,
+    #     # Lithuanian, Maltese, Polish, Portuguese, Romanian, Slovak, Slovenian, Swedish for Fun-ASR-MLT-Nano-2512
+    #     language="中文",
+    #     itn=True,  # or False
+    # )
+    text = res[0]["text"]
+    return text, time.time() - t1
+
+def run_audio_clips():
+    model = load_model()
+    audios = Path("/Users/jeqin/work/code/TestTranslator/test_data/audio_clips/10s-mix")
+    rows = [["file_name", "time", "inference_result"]]
+    for audio in sorted(audios.glob("*.wav")):
+        print(audio)
+        text, cost = inference(model, audio)
+        print("inference cost: ", cost)
+        print(text)
+        rows.append([audio.name, round(cost, 3), text])  # f"{audio.parent.name}/{audio.name}"
+    file_name = "csv/funasr_nano.csv"
+    # save_csv(file_name, rows)
+
+
+def run_recordings():
+    from scripts.asr_utils import get_origin_text_dict, get_text_distance
+    model = load_model()
+    audios = Path("/Users/jeqin/work/code/TestTranslator/test_data/recordings/")
+    rows = [["file_name", "time", "inference_result"]]
+    original = get_origin_text_dict()
+    for audio in sorted(audios.glob("*.wav"), key=lambda x: int(x.stem)):
+        print("processing: ", audio)
+        text, cost = inference(model, audio)
+        print("inference cost: ", cost)
+        print(text)
+        d, nd, diff = get_text_distance(original[audio.stem], text)
+        rows.append([audio.name, round(cost, 3), text, d, diff])  # f"{audio.parent.name}/{audio.name}"
+    file_name = "csv/funasr_nano.csv"
+    save_csv(file_name, rows)
+
+def run_test_wenet():
+    from test_data.audios import read_wenet
+    model = load_model()
+    result_list = []
+    count = 0
+    for audio, sentence in read_wenet(count_limit=5000):
+        count += 1
+        print(f"processing {count}: {audio}")
+        text, cost = inference(model, audio)
+        print("inference time:", cost)
+        result_list.append({
+            "index": count,
+            "audio_path": audio.name,
+            "reference": sentence,
+            # "duration": duration,
+            "inference_time": round(cost, 3),
+            "inference_result": text
+        })
+        print("inference cost: ", cost)
+        print(text)
+
+    import json
+    with open("csv/funasr_nano_wenet.json", "w", encoding="utf-8") as f:
+        json.dump(result_list, f, ensure_ascii=False, indent=2)
+
+if __name__ == "__main__":
+    # main()
+    run_recordings()
+    # run_audio_clips()
+    # run_test_wenet()
scripts/run_funasr_quant.py CHANGED
@@ -72,7 +72,7 @@ def run_test_dataset():
     from test_data.audios import read_dataset
     quantize = True
     vad_model, asr_model, punc_model = load_model(quantize)
-    test_data = Path("../test_data/dataset/dataset.txt")
+    test_data = Path("../test_data/AIShell/dataset/dataset.txt")
    audio_parent = Path("../test_data/")
     rows = [["file_name", "time", "inference_result"]]
     result_list = []
scripts/run_whisper.py CHANGED
@@ -3,7 +3,6 @@ from pathlib import Path
 import time
 import csv
 
-from silero_vad.utils_vad import languages
 from scripts.asr_utils import get_origin_text_dict, get_text_distance
 
 def save_csv(file_path, rows):
@@ -67,7 +66,7 @@ def run_test_audios():
 def run_test_dataset():
     from test_data.audios import read_dataset
     model = load_model()
-    test_data = Path("../test_data/dataset/dataset.txt")
+    test_data = Path("../test_data/AIShell/dataset/dataset.txt")
     audio_parent = Path("../test_data/")
     rows = [["file_name", "time", "inference_result"]]
     result_list = []
scripts/run_whisper_finetuned.py CHANGED
@@ -181,7 +181,7 @@ def run_recordings():
 def run_test_dataset():
     from test_data.audios import read_dataset
     model, processor = load_model()
-    test_data = Path("../test_data/dataset/dataset.txt")
+    test_data = Path("../test_data/AIShell/dataset/dataset.txt")
     audio_parent = Path("../test_data/")
     rows = [["file_name", "time", "inference_result"]]
     result_list = []
scripts/vad.py ADDED
@@ -0,0 +1,12 @@
+from pathlib import Path
+
+from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
+model = load_silero_vad()
+audio = "/Users/jeqin/Downloads/daily_20251117.wav"
+wav = read_audio(audio)
+speech_timestamps = get_speech_timestamps(
+    wav,
+    model,
+    return_seconds=True,  # Return speech timestamps in seconds (default is samples)
+)
+print(speech_timestamps)
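
The script stops at printing the timestamps. If the goal is to feed only the detected speech into ASR, recent silero-vad releases also export `collect_chunks` and `save_audio`; a minimal follow-up sketch, assuming those exports are available:

    # Keep only the detected speech and write it out.
    # collect_chunks expects sample-index timestamps, so call
    # get_speech_timestamps without return_seconds here.
    from silero_vad import collect_chunks, save_audio

    sample_ts = get_speech_timestamps(wav, model)
    save_audio("speech_only.wav", collect_chunks(sample_ts, wav), sampling_rate=16000)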
scripts/wenet_utils.py ADDED
@@ -0,0 +1,63 @@
+import json
+from pathlib import Path
+from lib.utils import cmd
+from environment import TEST_DATA
+
+def read_json():
+
+    test_net = {"audios": []}
+    test_meeting = {"audios": []}
+    dev = {"audios": []}
+    small = {"audios": []}
+    with open('/Users/jeqin/work/code/TestTranslator/test_data/wenet/WenetSpeech.json', 'r') as f:
+        data = json.load(f)
+    for audio in data["audios"]:
+        # print(audio["path"], audio["duration"])
+        for seg in audio["segments"]:
+            # if "TEST_NET" in seg["subsets"]:
+            #     test_net["audios"].append(audio)
+            #     break
+            # if "TEST_MEETING" in seg["subsets"]:
+            #     test_meeting["audios"].append(audio)
+            #     break
+            # if "DEV" in seg["subsets"]:
+            #     dev["audios"].append(audio)
+            #     break
+            if "S" in seg["subsets"]:
+                small["audios"].append(audio)
+                continue
+
+    # with open('/Users/jeqin/work/code/TestTranslator/test_data/wenet/WenetSpeech_TEST_NET.json', 'w') as f:
+    #     json.dump(test_net, f, indent=4)
+    # with open('/Users/jeqin/work/code/TestTranslator/test_data/wenet/wenetWenetSpeech_TEST_MEETING.json', 'w') as f:
+    #     json.dump(test_meeting, f, indent=4)
+    # with open('/Users/jeqin/work/code/TestTranslator/test_data/wenet/wenetWenetSpeech_DEV.json', 'w') as f:
+    #     json.dump(dev, f, indent=4)
+    with open('/Users/jeqin/work/code/TestTranslator/test_data/wenet/wenetWenetSpeech_SMALL.json', 'w') as f:
+        json.dump(small, f, indent=4)
+
+def move_wenet(folder: Path=TEST_DATA/"wenet", json_file="WenetSpeech_TEST_NET.json", count_limit=None):
+    """Inspect a wenet subset json: print each audio path and a running segment count
+    (fetching the audio files over scp is left commented out below)."""
+    count = 0
+    with open(folder/json_file, encoding="utf-8") as f:
+        data = json.load(f)
+    audios = data["audios"]
+    print(f"Total {len(audios)} samples in {json_file}")
+    segment_sum = 0
+    for a in audios:
+        print(a["path"])
+        segs = len(a["segments"])
+        if segs < 100:
+            segment_sum += segs
+        print("segments number:", segment_sum)
+        # remote_path = f"/home/ubuntu/data_1/yujuan/dataset_untar/{a['path']}"
+        # local_dir = Path(f"/Users/jeqin/work/code/TestTranslator/test_data/wenet/{a['path']}").parent
+        # if not local_dir.exists():
+        #     local_dir.mkdir(parents=True, exist_ok=True)
+        # command = f"scp ubuntu@192.168.110.49:{remote_path} {local_dir}/"
+        # cmd(command)
+
+if __name__ == "__main__":
+    move_wenet()
+
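
For orientation, a WenetSpeech.json record consumed above looks roughly like this; the key names are taken from what `read_json()` and `read_wenet()` access, while the values are illustrative:

    # Illustrative WenetSpeech.json record (values made up; keys match the readers).
    audio_record = {
        "path": "audio/train/podcast/B00000/X0000000000.opus",
        "duration": 1234.5,
        "segments": [
            {
                "sid": "X0000000000_S00000",
                "subsets": ["L", "S", "TEST_NET"],
                "begin_time": 0.0,
                "end_time": 7.2,
                "text": "...",
            },
        ],
    }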
test_data/{dataset → AIShell/dataset}/dataset.txt RENAMED
File without changes
test_data/__init__.py CHANGED
@@ -0,0 +1,9 @@
+from enum import Enum
+
+class DataName(Enum):
+    WENET_NET = "wenet_net"
+    WENET_MEETING = "wenet_meeting"
+    LIBRISPEECH_CLEAN = "librispeech_clean"
+    LIBRISPEECH_OTHER = "librispeech_other"
+    AISHELL2 = "aishell2"
+    RECORDING = "recording"
test_data/audios.py CHANGED
@@ -7,12 +7,24 @@ from lib.utils import cmd
 from environment import TEST_DATA
 
 
-def read_recording(folder: Path=Path("./recordings"), count_limit=None):
-    pass
+def read_recording(folder: Path=TEST_DATA/"recordings", count_limit=None):
+    """Read the recordings folder; yield audio path and text.
+    """
+    data_file = folder / 'data.json'
+    with open(data_file, encoding='utf-8') as f:
+        data = json.load(f)
+    count = 0
+    for filename, text in data.items():
+        count += 1
+        if count_limit and count > count_limit:
+            break
+        wav_path = folder / filename
+        yield wav_path, text
+
 
-def read_dataset(file: Path=Path("dataset/dataset.txt"), count_limit=None):
+def read_dataset(folder: Path= TEST_DATA / "AIShell", count_limit=None):
     """line sample: {"audio": {"path": "dataset/audio/data_aishell/wav/test/S0916/BAC009S0916W0158.wav"}, "sentence": "顾客体验的核心是真善美", "duration": 3.22, "sentences": [{"start": 0, "end": 3.22, "text": "顾客体验的核心是真善美"}]}"""
-    with open(file) as f:
+    with open(folder / "dataset/dataset.txt") as f:
         lines =f.readlines()
     count = 0
     for line in lines:
@@ -24,10 +36,10 @@ def read_dataset(file: Path=Path("dataset/dataset.txt"), count_limit=None):
             continue
         data = json.loads(line)
 
-        yield data["audio"]["path"], data["sentence"], data["duration"]
+        yield folder / data["audio"]["path"], data["sentence"]
 
 def read_emilia(folder: Path=TEST_DATA/"ZH-B000000", count_limit=None):
-    """Read the emilia dataset; yield audio path, text and duration.
+    """Read the emilia dataset; yield audio path and text.
    json file sample:
     {"id": "ZH_B00000_S00110_W000000", "wav": "ZH_B00000/ZH_B00000_S00110/mp3/ZH_B00000_S00110_W000000.mp3", "text": "\u628a\u63e1\u6700\u524d\u6cbf\u7684\u91d1\u878d\u9886\u57df\u548c\u533a\u5757\u94fe\u6700\u65b0\u8d44\u8baf\u3002\u6211\u4eec\u4e00\u8d77\u6765\u4e86\u89e3\u4e00\u4e0b\u4eca\u5929\u5e02\u573a\u4e0a\u6709\u53d1\u751f\u54ea\u4e9b\u91cd\u8981\u4e8b\u4ef6\u3002", "duration": 7.963, "speaker": "ZH_B00000_S00110", "language": "zh", "dnsmos": 3.3808}"""
     count = 0
@@ -44,24 +56,10 @@ def read_emilia(folder: Path=TEST_DATA/"ZH-B000000", count_limit=None):
         mp3_path = folder / f'{json_file.stem}.mp3'
         command=f"ffmpeg -i {mp3_path} -ac 1 -ar 16000 {wav_path}"
         cmd(command)
-        yield wav_path, text, duration
-
-def read_st(folder: Path=TEST_DATA/"ST-CMDS-20170001_1-OS", count_limit=None):
-    """Read the st dataset; yield audio path, text and duration.
-    """
-    count = 0
-    for wav in sorted(folder.glob("*.wav")):
-        count += 1
-        if count_limit and count > count_limit:
-            break
-        txt = wav.with_suffix(".txt")
-        with open(txt, encoding="utf-8") as f:
-            text = f.read()
-
-        yield wav, text
+        yield wav_path, text
 
 def read_wenet(folder: Path=TEST_DATA/"wenet", json_file="WenetSpeech_TEST_NET.json", count_limit=None):
-    """Read the wenet dataset; yield audio path, text and duration.
+    """Read the wenet dataset; yield audio path and text.
     """
     count = 0
     with open(folder/json_file, encoding="utf-8") as f:
@@ -73,7 +71,7 @@ def read_wenet(folder: Path=TEST_DATA/"wenet", json_file="WenetSpeech_TEST_NET.j
             continue
         for seg in a["segments"]:
             if count > count_limit:
-                break
+                return
             seg_file = audio_file.parent / (seg["sid"]+".wav")
             if not seg_file.exists():
                 command = f"ffmpeg -i {audio_file} -ar 16000 -ac 1 -ss {seg['begin_time']} -to {seg['end_time']} {seg_file}"
@@ -81,16 +79,29 @@ def read_wenet(folder: Path=TEST_DATA/"wenet", json_file="WenetSpeech_TEST_NET.j
             count +=1
             yield seg_file, seg["text"]
 
-
-    # for wav in sorted(folder.glob("*.wav")):
-    #     count += 1
-    #     if count_limit and count > count_limit:
-    #         break
-    #     txt = wav.with_suffix(".txt")
-    #     with open(txt, encoding="utf-8") as f:
-    #         text = f.read()
-
-    #     yield wav, text
+def read_libri(folder: Path=TEST_DATA/"LibriSpeech/test-clean", count_limit=None):
+    """Read the LibriSpeech dataset; yield audio path and text.
+    """
+    count = 0
+    for trans_file in sorted(folder.rglob("*trans.txt")):
+        with open(trans_file, encoding="utf-8") as f:
+            lines = f.readlines()
+        for line in lines:
+            if count_limit and count >= count_limit:
+                return
+            parts = line.strip().split(" ", 1)
+            if len(parts) != 2:
+                print("Invalid line:", line)
+                continue
+            file_id, text = parts
+            # speaker_id = file_id.split("-")[0]
+            flac_path = trans_file.parent / (file_id + ".flac")
+            wav_path = flac_path.with_suffix(".wav")
+            if not wav_path.exists():
+                command = f"ffmpeg -i {flac_path} -ar 16000 -ac 1 {wav_path}"
+                cmd(command)
+            count += 1
+            yield wav_path, text
 
 if __name__ == '__main__':
     read_wenet()
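
After this change the readers share one generator contract: each yields an (audio path, reference text) pair, so any of them can drive the same ASR loop. A minimal usage sketch:

    # Example: iterate a reader and print the first few pairs.
    from test_data.audios import read_libri

    for wav_path, text in read_libri(count_limit=3):
        print(wav_path.name, "->", text)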
tests/test_asr/__init__.py ADDED
File without changes
tests/test_asr/conftest.py ADDED
@@ -0,0 +1,59 @@
+import time
+import pytest
+
+from lib.asr_models.base_model import ModelName
+from test_data import DataName
+
+def pytest_addoption(parser):
+    # register the command-line options
+    parser.addoption(
+        "--model",
+        action="store",
+        required=True,
+        nargs="+",
+        choices=["whisper", "whisper_finetuned", "funasr_nano", "funasr_mlt_nano", "funasr_quant", "all"],
+        help="Models to run"
+    )
+    parser.addoption(
+        "--dataset",
+        action="store",
+        required=True,
+        nargs="+",
+        choices=["wenet_net", "wenet_meeting", "librispeech_clean", "librispeech_other", "aishell2", "recording"],
+        help="Datasets to use"
+    )
+
+def pytest_generate_tests(metafunc):
+    # metafunc exposes the test function being collected
+
+    # dynamically parametrize 'model'
+    if "model" in metafunc.fixturenames:
+        model_list = metafunc.config.getoption("model")
+        if "all" in model_list:
+            model_list = [m.value for m in ModelName]
+        metafunc.parametrize("model", model_list)
+
+    # dynamically parametrize 'dataset'
+    if "dataset" in metafunc.fixturenames:
+        dataset_list = metafunc.config.getoption("dataset")
+        metafunc.parametrize("dataset", dataset_list)
+
+@pytest.fixture(scope="module")
+def summary(request):
+    # print the aggregated results after all cases have run
+    summary = []
+    yield summary
+    print("\n\n")
+    print("*"*50)
+    [print(s) for s in summary]
+    print("*" * 50)
+
+@pytest.fixture(scope="function", autouse=True)
+def print_log(request):
+    model_name = request.node.callspec.params.get("model")
+    dataset_name = request.node.callspec.params.get("dataset")
+    print("\n", "#"*30, model_name, "-", dataset_name, "start", "#"*30, "\n")
+    yield
+    print("\n", "#" * 30, model_name, "-", dataset_name, "end", "#" * 30, "\n")
+    time.sleep(30)  # cool-down between cases
+
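
With these hooks, every selected (model, dataset) pair becomes its own parametrized test case; for example:

    pytest tests/test_asr -s --model whisper funasr_quant --dataset recording librispeech_clean

collects four cases, prints the start/end banner around each, and pauses 30 seconds between cases (presumably so one model is released before the next loads).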
tests/test_asr/test_asr.py ADDED
@@ -0,0 +1,102 @@
+
+import csv
+import json
+import pytest
+
+from lib.asr_models import ModelName
+from lib.asr_models import Whisper, WhisperFinetuned, FunasrQuant, FunasrNano, FunasrMLTNano
+from lib.asr_models.evaluator import compute
+from lib.utils import run_textdistance, clean_text_for_comparison_zh, highlight_diff
+
+from test_data import DataName
+from test_data.audios import read_libri, read_wenet, read_dataset, read_recording
+from environment import REPORTS_DIR
+
+"""
+example:
+    pytest -s --dataset recording --model all
+    pytest -s --dataset librispeech_clean --model whisper funasr_quant
+"""
+
+def get_model_and_dataset(model, dataset):
+    model_name_map = {
+        ModelName.WHISPER.value: Whisper,
+        ModelName.WHISPER_FINETUNED.value: WhisperFinetuned,
+        ModelName.FUNASR_QUANT.value: FunasrQuant,
+        ModelName.FUNASR_NANO.value: FunasrNano,
+        ModelName.FUNASR_MLT_NANO.value: FunasrMLTNano
+    }
+    data_name_map = {
+        DataName.LIBRISPEECH_CLEAN.value: read_libri,
+        DataName.LIBRISPEECH_OTHER.value: read_libri,
+        DataName.WENET_NET.value: read_wenet,
+        DataName.WENET_MEETING.value: read_wenet,
+        DataName.AISHELL2.value: read_dataset,
+        DataName.RECORDING.value: read_recording,
+    }
+    language_map = {
+        DataName.LIBRISPEECH_CLEAN.value: "en",
+        DataName.LIBRISPEECH_OTHER.value: "en",
+        DataName.WENET_NET.value: "zh",
+        DataName.WENET_MEETING.value: "zh",
+        DataName.AISHELL2.value: "zh",
+        DataName.RECORDING.value: "zh",
+    }
+
+    return model_name_map.get(model), data_name_map.get(dataset), language_map.get(dataset)
+
+
+def save_result(model_name, data_name, results):
+    if data_name == DataName.RECORDING.value:
+        # for recordings, additionally save the results as csv
+        file = REPORTS_DIR / f"recording_{model_name}.csv"
+        with open(file, "w", newline='', encoding="utf-8") as csvfile:
+            writer = csv.writer(csvfile)
+            writer.writerow(['file', 'time_cost', 'predicts', "distance", "diff"])
+            for result in results:
+                ref = result['reference']
+                pred = result['predicts']
+                ref_clean = clean_text_for_comparison_zh(ref)
+                pred_clean = clean_text_for_comparison_zh(pred)
+                d, nd = run_textdistance(ref_clean, pred_clean)
+                diff = highlight_diff(ref_clean, pred_clean, spliter="")
+                writer.writerow([result['audio_path'], result['inference_time'], pred, d, diff])
+
+    # every dataset's results are also saved as json
+    file = REPORTS_DIR / f"{data_name}_{model_name}.json"
+    with open(file, "w", encoding="utf-8") as f:
+        json.dump(results, f, ensure_ascii=False, indent=2)
+    print(f"ASR results saved to {file}")
+
+
+def test_asr_evaluation(model, dataset, summary):
+    ASRModelClass, audio_reader, language = get_model_and_dataset(model, dataset)
+    if ASRModelClass is None or audio_reader is None:
+        pytest.skip(f"Model {model} or Dataset {dataset} not implemented yet.")
+    asr_model = ASRModelClass(device='mps')
+    asr_model.load(language=language)
+
+    result_list = []
+    count = 0
+    try:
+        for audio, sentence in audio_reader(count_limit=3000):
+            count += 1
+            print(f"processing {count}: {audio}")
+            text, cost = asr_model.transcribe(audio, language=language)
+            print("inference time: ", cost)
+            print(text)
+            result_list.append({
+                "index": count,
+                "audio_path": audio.name,
+                "reference": sentence,
+                "inference_time": round(cost, 3),
+                "predicts": text
+            })
+    except Exception as e:
+        pytest.fail(f"Error during ASR test: {e}")
+    except KeyboardInterrupt:
+        pytest.fail("Test interrupted by user.")
+
+    save_result(model, dataset, result_list)
+    res = compute(model, dataset, result_list, language)
+    summary.append(res)
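
The body of `evaluator.compute` is not shown in this hunk. Under the assumption that it reduces the result list to one printable summary line per (model, dataset) run, a CER/WER-style sketch could look like the following; `jiwer` and the exact return format are assumptions here, not the repo's implementation:

    # Hypothetical sketch of a compute(model, dataset, results, language) summary;
    # the real lib.asr_models.evaluator.compute may differ.
    import jiwer  # assumed dependency

    def compute_sketch(model, dataset, results, language):
        refs = [r["reference"] for r in results]
        hyps = [r["predicts"] for r in results]
        if language == "zh":
            # character-level: space out characters so jiwer's WER acts as CER
            refs = [" ".join(t.replace(" ", "")) for t in refs]
            hyps = [" ".join(t.replace(" ", "")) for t in hyps]
        err = jiwer.wer(refs, hyps)
        avg_t = sum(r["inference_time"] for r in results) / max(len(results), 1)
        return f"{model} on {dataset}: ER={err:.3f}, avg_time={avg_t:.3f}s over {len(results)} clips"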