Gluttony10 commited on
Commit
3762e33
·
verified ·
1 Parent(s): f0dd4ee

Upload 65 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +6 -0
  2. LAM_audio2exp/pretrained_models/LAM_audio2exp_streaming2.tar +3 -0
  3. face-parse-bisent/.cache/huggingface/.gitignore +1 -0
  4. face-parse-bisent/.cache/huggingface/download/79999_iter.pth.metadata +3 -0
  5. face-parse-bisent/.cache/huggingface/download/resnet18-5c106cde.pth.metadata +3 -0
  6. face-parse-bisent/79999_iter.pth +3 -0
  7. face-parse-bisent/resnet18-5c106cde.pth +3 -0
  8. iic/SenseVoiceSmall/.mdl +0 -0
  9. iic/SenseVoiceSmall/.msc +0 -0
  10. iic/SenseVoiceSmall/.mv +1 -0
  11. iic/SenseVoiceSmall/README.md +219 -0
  12. iic/SenseVoiceSmall/am.mvn +8 -0
  13. iic/SenseVoiceSmall/chn_jpn_yue_eng_ko_spectok.bpe.model +3 -0
  14. iic/SenseVoiceSmall/config.yaml +97 -0
  15. iic/SenseVoiceSmall/configuration.json +14 -0
  16. iic/SenseVoiceSmall/example/.DS_Store +0 -0
  17. iic/SenseVoiceSmall/example/en.mp3 +0 -0
  18. iic/SenseVoiceSmall/example/ja.mp3 +0 -0
  19. iic/SenseVoiceSmall/example/ko.mp3 +0 -0
  20. iic/SenseVoiceSmall/example/yue.mp3 +0 -0
  21. iic/SenseVoiceSmall/example/zh.mp3 +0 -0
  22. iic/SenseVoiceSmall/fig/aed_figure.png +3 -0
  23. iic/SenseVoiceSmall/fig/asr_results.png +3 -0
  24. iic/SenseVoiceSmall/fig/inference.png +3 -0
  25. iic/SenseVoiceSmall/fig/sensevoice.png +3 -0
  26. iic/SenseVoiceSmall/fig/ser_figure.png +3 -0
  27. iic/SenseVoiceSmall/fig/ser_table.png +3 -0
  28. iic/SenseVoiceSmall/model.pt +3 -0
  29. iic/SenseVoiceSmall/tokens.json +0 -0
  30. musetalk/.cache/huggingface/.gitignore +1 -0
  31. musetalk/.cache/huggingface/download/.gitattributes.metadata +3 -0
  32. musetalk/.cache/huggingface/download/README.md.metadata +3 -0
  33. musetalk/.cache/huggingface/download/musetalk/musetalk.json.metadata +3 -0
  34. musetalk/.cache/huggingface/download/musetalk/pytorch_model.bin.metadata +3 -0
  35. musetalk/.cache/huggingface/download/musetalkV15/musetalk.json.metadata +3 -0
  36. musetalk/.cache/huggingface/download/musetalkV15/unet.pth.metadata +3 -0
  37. musetalk/.gitattributes +35 -0
  38. musetalk/README.md +259 -0
  39. musetalk/dwpose/.cache/huggingface/.gitignore +1 -0
  40. musetalk/dwpose/.cache/huggingface/download/dw-ll_ucoco_384.pth.metadata +3 -0
  41. musetalk/dwpose/dw-ll_ucoco_384.pth +3 -0
  42. musetalk/musetalk/musetalk.json +36 -0
  43. musetalk/musetalk/pytorch_model.bin +3 -0
  44. musetalk/musetalkV15/musetalk.json +36 -0
  45. musetalk/musetalkV15/unet.pth +3 -0
  46. musetalk/syncnet/.cache/huggingface/.gitignore +1 -0
  47. musetalk/syncnet/.cache/huggingface/download/latentsync_syncnet.pt.metadata +3 -0
  48. musetalk/syncnet/latentsync_syncnet.pt +3 -0
  49. musetalk/whisper/.cache/huggingface/.gitignore +1 -0
  50. musetalk/whisper/.cache/huggingface/download/config.json.metadata +3 -0
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ iic/SenseVoiceSmall/fig/aed_figure.png filter=lfs diff=lfs merge=lfs -text
37
+ iic/SenseVoiceSmall/fig/asr_results.png filter=lfs diff=lfs merge=lfs -text
38
+ iic/SenseVoiceSmall/fig/inference.png filter=lfs diff=lfs merge=lfs -text
39
+ iic/SenseVoiceSmall/fig/sensevoice.png filter=lfs diff=lfs merge=lfs -text
40
+ iic/SenseVoiceSmall/fig/ser_figure.png filter=lfs diff=lfs merge=lfs -text
41
+ iic/SenseVoiceSmall/fig/ser_table.png filter=lfs diff=lfs merge=lfs -text
LAM_audio2exp/pretrained_models/LAM_audio2exp_streaming2.tar ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38084d471966381f2e52d9f59e5654c9b8cde6fa36d4c6f18fc02cbaf593d157
3
+ size 408538564
face-parse-bisent/.cache/huggingface/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *
face-parse-bisent/.cache/huggingface/download/79999_iter.pth.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 0073b233a5a3c4b1377d4dbf49245017938a72b5
2
+ 468e13ca13a9b43cc0881a9f99083a430e9c0a38abd935431d1c28ee94b26567
3
+ 1750088240.4747846
face-parse-bisent/.cache/huggingface/download/resnet18-5c106cde.pth.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 0073b233a5a3c4b1377d4dbf49245017938a72b5
2
+ 5c106cde386e87d4033832f2996f5493238eda96ccf559d1d62760c4de0613f8
3
+ 1750088239.4089577
face-parse-bisent/79999_iter.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:468e13ca13a9b43cc0881a9f99083a430e9c0a38abd935431d1c28ee94b26567
3
+ size 53289463
face-parse-bisent/resnet18-5c106cde.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c106cde386e87d4033832f2996f5493238eda96ccf559d1d62760c4de0613f8
3
+ size 46827520
iic/SenseVoiceSmall/.mdl ADDED
Binary file (42 Bytes). View file
 
iic/SenseVoiceSmall/.msc ADDED
Binary file (1.35 kB). View file
 
iic/SenseVoiceSmall/.mv ADDED
@@ -0,0 +1 @@
 
 
1
+ Revision:master,CreatedAt:1727321787
iic/SenseVoiceSmall/README.md ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ frameworks:
3
+ - Pytorch
4
+ license: Apache License 2.0
5
+ tasks:
6
+ - auto-speech-recognition
7
+
8
+ #model-type:
9
+ ##如 gpt、phi、llama、chatglm、baichuan 等
10
+ #- gpt
11
+
12
+ #domain:
13
+ ##如 nlp、cv、audio、multi-modal
14
+ #- nlp
15
+
16
+ #language:
17
+ ##语言代码列表 https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa
18
+ #- cn
19
+
20
+ #metrics:
21
+ ##如 CIDEr、Blue、ROUGE 等
22
+ #- CIDEr
23
+
24
+ #tags:
25
+ ##各种自定义,包括 pretrained、fine-tuned、instruction-tuned、RL-tuned 等训练方法和其他
26
+ #- pretrained
27
+
28
+ #tools:
29
+ ##如 vllm、fastchat、llamacpp、AdaSeq 等
30
+ #- vllm
31
+ ---
32
+
33
+ # Highlights
34
+ **SenseVoice**专注于高精度多语言语音识别、情感辨识和音频事件检测
35
+ - **多语言识别:** 采用超过40万小时数据训练,支持超过50种语言,识别效果上优于Whisper模型。
36
+ - **富文本识别:**
37
+ - 具备优秀的情感识别,能够在测试数据上达到和超过目前最佳情感识别模型的效果。
38
+ - 支持声音事件检测能力,支持音乐、掌声、笑声、哭声、咳嗽、喷嚏等多种常见人机交互事件进行检测。
39
+ - **高效推理:** SenseVoice-Small模型采用非自回归端到端框架,推理延迟极低,10s音频推理仅耗时70ms,15倍优于Whisper-Large。
40
+ - **微调定制:** 具备便捷的微调脚本与策略,方便用户根据业务场景修复长尾样本问题。
41
+ - **服务部署:** 具有完整的服务部署链路,支持多并发请求,支持客户端语言有,python、c++、html、java与c#等。
42
+
43
+
44
+ ## <strong>[SenseVoice开源项目介绍](https://github.com/FunAudioLLM/SenseVoice)</strong>
45
+ <strong>[SenseVoice](https://github.com/FunAudioLLM/SenseVoice)</strong>开源模型是多语言音频理解模型,具有包括语音识别、语种识别、语音情感识别,声学事件检测能力。
46
+
47
+ [**github仓库**](https://github.com/FunAudioLLM/SenseVoice)
48
+ | [**最新动态**](https://github.com/FunAudioLLM/SenseVoice/blob/main/README_zh.md#%E6%9C%80%E6%96%B0%E5%8A%A8%E6%80%81)
49
+ | [**环境安装**](https://github.com/FunAudioLLM/SenseVoice/blob/main/README_zh.md#%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85)
50
+
51
+ # 模型结构图
52
+ SenseVoice多语言音频理解模型,支持语音识别、语种识别、语音情感识别、声学事件检测、逆文本正则化等能力,采用工业级数十万小时的标注音频进行模型训练,保证了模型的通用识别效果。模型可以被应用于中文、粤语、英语、日语、韩语音频识别,并输出带有情感和事件的富文本转写结果。
53
+
54
+ <p align="center">
55
+ <img src="fig/sensevoice.png" alt="SenseVoice模型结构" width="1500" />
56
+ </p>
57
+
58
+ SenseVoice-Small是基于非自回归端到端框架模型,为了指定任务,我们在语音特征前添加四个嵌入作为输入传递给编码器:
59
+ - LID:用于预测音频语种标签。
60
+ - SER:用于预测音频情感标签。
61
+ - AED:用于预测音频包含的事件标签。
62
+ - ITN:用于指定识别输出文本是否进行逆文本正则化。
63
+
64
+
65
+ # 依赖环境
66
+
67
+ 推理之前,请务必更新funasr与modelscope版本
68
+
69
+ ```shell
70
+ pip install -U funasr modelscope
71
+ ```
72
+
73
+ # 用法
74
+
75
+
76
+ ## 推理
77
+
78
+ ### modelscope pipeline推理
79
+ ```python
80
+ from modelscope.pipelines import pipeline
81
+ from modelscope.utils.constant import Tasks
82
+
83
+ inference_pipeline = pipeline(
84
+ task=Tasks.auto_speech_recognition,
85
+ model='iic/SenseVoiceSmall',
86
+ model_revision="master",
87
+ device="cuda:0",)
88
+
89
+ rec_result = inference_pipeline('https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
90
+ print(rec_result)
91
+ ```
92
+
93
+ ### 使用funasr推理
94
+
95
+ 支持任意格式音频输入,支持任意时长输入
96
+
97
+ ```python
98
+ from funasr import AutoModel
99
+ from funasr.utils.postprocess_utils import rich_transcription_postprocess
100
+
101
+ model_dir = "iic/SenseVoiceSmall"
102
+
103
+
104
+ model = AutoModel(
105
+ model=model_dir,
106
+ trust_remote_code=True,
107
+ remote_code="./model.py",
108
+ vad_model="fsmn-vad",
109
+ vad_kwargs={"max_single_segment_time": 30000},
110
+ device="cuda:0",
111
+ )
112
+
113
+ # en
114
+ res = model.generate(
115
+ input=f"{model.model_path}/example/en.mp3",
116
+ cache={},
117
+ language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
118
+ use_itn=True,
119
+ batch_size_s=60,
120
+ merge_vad=True, #
121
+ merge_length_s=15,
122
+ )
123
+ text = rich_transcription_postprocess(res[0]["text"])
124
+ print(text)
125
+ ```
126
+ 参数说明:
127
+ - `model_dir`:模型名称,或本地磁盘中的模型路径。
128
+ - `trust_remote_code`:
129
+ - `True`表示model代码实现从`remote_code`处加载,`remote_code`指定`model`具体代码的位置(例如,当前目录下的`model.py`),支持绝对路径与相对路径,以及网络url。
130
+ - `False`表示,model代码实现为 [FunASR](https://github.com/modelscope/FunASR) 内部集成版本,此时修改当前目录下的`model.py`不会生效,因为加载的是funasr内部版本,模型代码[点击查看](https://github.com/modelscope/FunASR/tree/main/funasr/models/sense_voice)。
131
+ - `vad_model`:表示开启VAD,VAD的作用是将长音频切割成短音频,此时推理耗时包括了VAD与SenseVoice总耗时,为链路耗时,如果需要单独测试SenseVoice模型耗时,可以关闭VAD模型。
132
+ - `vad_kwargs`:表示VAD模型配置,`max_single_segment_time`: 表示`vad_model`最大切割音频时长, 单位是毫秒ms。
133
+ - `use_itn`:输出结果中是否包含标点与逆文本正则化。
134
+ - `batch_size_s` 表示采用动态batch,batch中总音频时长,单位为秒s。
135
+ - `merge_vad`:是否将 vad 模型切割的短音频碎片合成,合并后长度为`merge_length_s`,单位为秒s。
136
+ - `ban_emo_unk`:禁用emo_unk标签,禁用后所有的句子都会被赋与情感标签。默认`False`
137
+
138
+ ```python
139
+ model = AutoModel(model=model_dir, trust_remote_code=True, device="cuda:0")
140
+
141
+ res = model.generate(
142
+ input=f"{model.model_path}/example/en.mp3",
143
+ cache={},
144
+ language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
145
+ use_itn=True,
146
+ batch_size=64,
147
+ )
148
+ ```
149
+
150
+ 更多详细用法,请参考 [文档](https://github.com/modelscope/FunASR/blob/main/docs/tutorial/README.md)
151
+
152
+
153
+
154
+ ## 模型下载
155
+ 上面代码会自动下载模型,如果您需要离线下载好模型,可以通过下面代码,手动下载,之后指定模型本地路径即可。
156
+
157
+ SDK下载
158
+ ```bash
159
+ #安装ModelScope
160
+ pip install modelscope
161
+ ```
162
+ ```python
163
+ #SDK模型下载
164
+ from modelscope import snapshot_download
165
+ model_dir = snapshot_download('iic/SenseVoiceSmall')
166
+ ```
167
+ Git下载
168
+ ```
169
+ #Git模型下载
170
+ git clone https://www.modelscope.cn/iic/SenseVoiceSmall.git
171
+ ```
172
+
173
+ ## 服务部署
174
+
175
+ Undo
176
+
177
+ # Performance
178
+
179
+ ## 语音识别效果
180
+ 我们在开源基准数据集(包括 AISHELL-1、AISHELL-2、Wenetspeech、Librispeech和Common Voice)上比较了SenseVoice与Whisper的多语言语音识别性能和推理效率。在中文和粤语识别效果上,SenseVoice-Small模型具有明显的效果优势。
181
+
182
+ <p align="center">
183
+ <img src="fig/asr_results.png" alt="SenseVoice模型在开源测试集上的表现" width="2500" />
184
+ </p>
185
+
186
+
187
+
188
+ ## 情感识别效果
189
+ 由于目前缺乏被广泛使用的情感识别测试指标和方法,我们在多个测试集的多种指标进行测试,并与近年来Benchmark上的多个结果进行了全面的对比。所选取的测试集同时包含中文/英文两种语言以及表演、影视剧、自然对话等多种风格的数据,在不进行目标数据微调的前提下,SenseVoice能够在测试数据上达到和超过目前最佳情感识别模型的效果。
190
+
191
+ <p align="center">
192
+ <img src="fig/ser_table.png" alt="SenseVoice模型SER效果1" width="1500" />
193
+ </p>
194
+
195
+ 同时,我们还在测试集上对多个开源情感识别模型进行对比,结果表明,SenseVoice-Large模型可以在几乎所有数据上都达到了最佳效果,而SenseVoice-Small模型同样可以在多数数据集上取得超越其他开源模型的效果。
196
+
197
+ <p align="center">
198
+ <img src="fig/ser_figure.png" alt="SenseVoice模型SER效果2" width="500" />
199
+ </p>
200
+
201
+ ## 事件检测效果
202
+
203
+ 尽管SenseVoice只在语音数据上进行训练,它仍然可以作为事件检测模型进行单独使用。我们在环境音分类ESC-50数据集上与目前业内广泛使用的BEATS与PANN模型的效果进行了对比。SenseVoice模型能够在这些任务上取得较好的效果,但受限于训练数据与训练方式,其事件分类效果专业的事件检测模型相比仍然有一定的差距。
204
+
205
+ <p align="center">
206
+ <img src="fig/aed_figure.png" alt="SenseVoice模型AED效果" width="500" />
207
+ </p>
208
+
209
+
210
+
211
+ ## 推理效率
212
+ SenseVoice-Small模型采用非自回归端到端架构,推理延迟极低。在参数量与Whisper-Small模型相当的情况下,比Whisper-Small模型推理速度快7倍,比Whisper-Large模型快17倍。同时SenseVoice-small模型在音频时长增加的情况下,推理耗时也无明显增加。
213
+
214
+
215
+ <p align="center">
216
+ <img src="fig/inference.png" alt="SenseVoice模型的推理效率" width="1500" />
217
+ </p>
218
+
219
+ <p style="color: lightgrey;">如果您是本模型的贡献者,我们邀请您根据<a href="https://modelscope.cn/docs/ModelScope%E6%A8%A1%E5%9E%8B%E6%8E%A5%E5%85%A5%E6%B5%81%E7%A8%8B%E6%A6%82%E8%A7%88" style="color: lightgrey; text-decoration: underline;">模型贡献文档</a>,及时完善模型卡片内容。</p>
iic/SenseVoiceSmall/am.mvn ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <Nnet>
2
+ <Splice> 560 560
3
+ [ 0 ]
4
+ <AddShift> 560 560
5
+ <LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 
-13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 
-8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
6
+ <Rescale> 560 560
7
+ <LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 
0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 
0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
8
+ </Nnet>
iic/SenseVoiceSmall/chn_jpn_yue_eng_ko_spectok.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa87f86064c3730d799ddf7af3c04659151102cba548bce325cf06ba4da4e6a8
3
+ size 377341
iic/SenseVoiceSmall/config.yaml ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ encoder: SenseVoiceEncoderSmall
2
+ encoder_conf:
3
+ output_size: 512
4
+ attention_heads: 4
5
+ linear_units: 2048
6
+ num_blocks: 50
7
+ tp_blocks: 20
8
+ dropout_rate: 0.1
9
+ positional_dropout_rate: 0.1
10
+ attention_dropout_rate: 0.1
11
+ input_layer: pe
12
+ pos_enc_class: SinusoidalPositionEncoder
13
+ normalize_before: true
14
+ kernel_size: 11
15
+ sanm_shfit: 0
16
+ selfattention_layer_type: sanm
17
+
18
+
19
+ model: SenseVoiceSmall
20
+ model_conf:
21
+ length_normalized_loss: true
22
+ sos: 1
23
+ eos: 2
24
+ ignore_id: -1
25
+
26
+ tokenizer: SentencepiecesTokenizer
27
+ tokenizer_conf:
28
+ bpemodel: null
29
+ unk_symbol: <unk>
30
+ split_with_space: true
31
+
32
+ frontend: WavFrontend
33
+ frontend_conf:
34
+ fs: 16000
35
+ window: hamming
36
+ n_mels: 80
37
+ frame_length: 25
38
+ frame_shift: 10
39
+ lfr_m: 7
40
+ lfr_n: 6
41
+ cmvn_file: null
42
+
43
+
44
+ dataset: SenseVoiceCTCDataset
45
+ dataset_conf:
46
+ index_ds: IndexDSJsonl
47
+ batch_sampler: EspnetStyleBatchSampler
48
+ data_split_num: 32
49
+ batch_type: token
50
+ batch_size: 14000
51
+ max_token_length: 2000
52
+ min_token_length: 60
53
+ max_source_length: 2000
54
+ min_source_length: 60
55
+ max_target_length: 200
56
+ min_target_length: 0
57
+ shuffle: true
58
+ num_workers: 4
59
+ sos: ${model_conf.sos}
60
+ eos: ${model_conf.eos}
61
+ IndexDSJsonl: IndexDSJsonl
62
+ retry: 20
63
+
64
+ train_conf:
65
+ accum_grad: 1
66
+ grad_clip: 5
67
+ max_epoch: 20
68
+ keep_nbest_models: 10
69
+ avg_nbest_model: 10
70
+ log_interval: 100
71
+ resume: true
72
+ validate_interval: 10000
73
+ save_checkpoint_interval: 10000
74
+
75
+ optim: adamw
76
+ optim_conf:
77
+ lr: 0.00002
78
+ scheduler: warmuplr
79
+ scheduler_conf:
80
+ warmup_steps: 25000
81
+
82
+ specaug: SpecAugLFR
83
+ specaug_conf:
84
+ apply_time_warp: false
85
+ time_warp_window: 5
86
+ time_warp_mode: bicubic
87
+ apply_freq_mask: true
88
+ freq_mask_width_range:
89
+ - 0
90
+ - 30
91
+ lfr_rate: 6
92
+ num_freq_mask: 1
93
+ apply_time_mask: true
94
+ time_mask_width_range:
95
+ - 0
96
+ - 12
97
+ num_time_mask: 1
iic/SenseVoiceSmall/configuration.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "framework": "pytorch",
3
+ "task" : "auto-speech-recognition",
4
+ "model": {"type" : "funasr"},
5
+ "pipeline": {"type":"funasr-pipeline"},
6
+ "model_name_in_hub": {
7
+ "ms":"",
8
+ "hf":""},
9
+ "file_path_metas": {
10
+ "init_param":"model.pt",
11
+ "config":"config.yaml",
12
+ "tokenizer_conf": {"bpemodel": "chn_jpn_yue_eng_ko_spectok.bpe.model"},
13
+ "frontend_conf":{"cmvn_file": "am.mvn"}}
14
+ }
iic/SenseVoiceSmall/example/.DS_Store ADDED
Binary file (6.15 kB). View file
 
iic/SenseVoiceSmall/example/en.mp3 ADDED
Binary file (57.4 kB). View file
 
iic/SenseVoiceSmall/example/ja.mp3 ADDED
Binary file (57.8 kB). View file
 
iic/SenseVoiceSmall/example/ko.mp3 ADDED
Binary file (27.9 kB). View file
 
iic/SenseVoiceSmall/example/yue.mp3 ADDED
Binary file (31.2 kB). View file
 
iic/SenseVoiceSmall/example/zh.mp3 ADDED
Binary file (45 kB). View file
 
iic/SenseVoiceSmall/fig/aed_figure.png ADDED

Git LFS Details

  • SHA256: 643a2705d9b2ac1fe95cc4f1ca7df23c0f030e256e351541287391128665b5b8
  • Pointer size: 131 Bytes
  • Size of remote file: 119 kB
iic/SenseVoiceSmall/fig/asr_results.png ADDED

Git LFS Details

  • SHA256: fda4390934fc6d309cbac7f22053580cbedd8a608ad52e3cfac0fb2947a8c4fe
  • Pointer size: 131 Bytes
  • Size of remote file: 244 kB
iic/SenseVoiceSmall/fig/inference.png ADDED

Git LFS Details

  • SHA256: 0bcde7dd81dcb5b1198cb7fd210fa05a6861512b844027fe8508c8f6a997352e
  • Pointer size: 131 Bytes
  • Size of remote file: 958 kB
iic/SenseVoiceSmall/fig/sensevoice.png ADDED

Git LFS Details

  • SHA256: e570d2fad030bb48b0094a3085791d3a6aa1be338f0a003736a11e58e5034afc
  • Pointer size: 131 Bytes
  • Size of remote file: 901 kB
iic/SenseVoiceSmall/fig/ser_figure.png ADDED

Git LFS Details

  • SHA256: eaf050fa5d775a3b84031d8eb5a200699312d7e9a7d769550643cf02edbdb13b
  • Pointer size: 131 Bytes
  • Size of remote file: 199 kB
iic/SenseVoiceSmall/fig/ser_table.png ADDED

Git LFS Details

  • SHA256: a0a8ae6dcc63b89a7d9160461b7c78333d0c8c0cdcf7bd0311d59a3b36d26370
  • Pointer size: 131 Bytes
  • Size of remote file: 326 kB
iic/SenseVoiceSmall/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:833ca2dcfdf8ec91bd4f31cfac36d6124e0c459074d5e909aec9cabe6204a3ea
3
+ size 936291369
iic/SenseVoiceSmall/tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
musetalk/.cache/huggingface/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *
musetalk/.cache/huggingface/download/.gitattributes.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 3ef28bc5cff08c90ad8178a25f1b570cd800170f
2
+ a6344aac8c09253b3b630fb776ae94478aa0275b
3
+ 1750086024.9018652
musetalk/.cache/huggingface/download/README.md.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 3ef28bc5cff08c90ad8178a25f1b570cd800170f
2
+ 74e11fb4b681253f7fe73d9c4b80ec0021949213
3
+ 1750086024.779342
musetalk/.cache/huggingface/download/musetalk/musetalk.json.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 3ef28bc5cff08c90ad8178a25f1b570cd800170f
2
+ b822db87e503a283fbbee73617f89dcd294cb91c
3
+ 1750086025.6143801
musetalk/.cache/huggingface/download/musetalk/pytorch_model.bin.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 3ef28bc5cff08c90ad8178a25f1b570cd800170f
2
+ 0ee7d5ea03ea75d8dca50ea7a76df791e90633687a135c4a69393abfc0475ffe
3
+ 1750086527.7114985
musetalk/.cache/huggingface/download/musetalkV15/musetalk.json.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 3ef28bc5cff08c90ad8178a25f1b570cd800170f
2
+ b822db87e503a283fbbee73617f89dcd294cb91c
3
+ 1750086025.06287
musetalk/.cache/huggingface/download/musetalkV15/unet.pth.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 3ef28bc5cff08c90ad8178a25f1b570cd800170f
2
+ 7ebf6c98c181e20838e4c0054e96e944ac60d5d692cc01db42839fe11b787007
3
+ 1750086542.267899
musetalk/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
musetalk/README.md ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: creativeml-openrail-m
3
+ language:
4
+ - en
5
+ ---
6
+ # MuseTalk
7
+
8
+ MuseTalk: Real-Time High Quality Lip Synchronization with Latent Space Inpainting
9
+ </br>
10
+ Yue Zhang <sup>\*</sup>,
11
+ Minhao Liu<sup>\*</sup>,
12
+ Zhaokang Chen,
13
+ Bin Wu<sup>†</sup>,
14
+ Yingjie He,
15
+ Chao Zhan,
16
+ Wenjiang Zhou
17
+ (<sup>*</sup>Equal Contribution, <sup>†</sup>Corresponding Author, benbinwu@tencent.com)
18
+
19
+ **[github](https://github.com/TMElyralab/MuseTalk)** **[huggingface](https://huggingface.co/TMElyralab/MuseTalk)** **Project (coming soon)** **Technical report (coming soon)**
20
+
21
+ We introduce `MuseTalk`, a **real-time high quality** lip-syncing model (30fps+ on an NVIDIA Tesla V100). MuseTalk can be applied with input videos, e.g., generated by [MuseV](https://github.com/TMElyralab/MuseV), as a complete virtual human solution.
22
+
23
+ # Overview
24
+ `MuseTalk` is a real-time high quality audio-driven lip-syncing model trained in the latent space of `ft-mse-vae`, which
25
+
26
+ 1. modifies an unseen face according to the input audio, with a size of face region of `256 x 256`.
27
+ 1. supports audio in various languages, such as Chinese, English, and Japanese.
28
+ 1. supports real-time inference with 30fps+ on an NVIDIA Tesla V100.
29
+ 1. supports modification of the center point of the face region proposed, which **SIGNIFICANTLY** affects generation results.
30
+ 1. checkpoint available trained on the HDTF dataset.
31
+ 1. training codes (coming soon).
32
+
33
+ # News
34
+ - [04/02/2024] Released MuseTalk project and pretrained models.
35
+
36
+ ## Model
37
+ ![Model Structure](assets/figs/musetalk_arc.jpg)
38
+ MuseTalk was trained in latent spaces, where the images were encoded by a frozen VAE. The audio was encoded by a frozen `whisper-tiny` model. The architecture of the generation network was borrowed from the UNet of the `stable-diffusion-v1-4`, where the audio embeddings were fused to the image embeddings by cross-attention.
39
+
40
+ ## Cases
41
+ ### MuseV + MuseTalk make human photos alive!
42
+ <table class="center">
43
+ <tr style="font-weight: bolder;text-align:center;">
44
+ <td width="33%">Image</td>
45
+ <td width="33%">MuseV</td>
46
+ <td width="33%">+MuseTalk</td>
47
+ </tr>
48
+ <tr>
49
+ <td>
50
+ <img src=assets/demo/musk/musk.png width="95%">
51
+ </td>
52
+ <td >
53
+ <video src=assets/demo/yongen/yongen_musev.mp4 controls preload></video>
54
+ </td>
55
+ <td >
56
+ <video src=assets/demo/yongen/yongen_musetalk.mp4 controls preload></video>
57
+ </td>
58
+ </tr>
59
+ <tr>
60
+ <td>
61
+ <img src=assets/demo/yongen/yongen.jpeg width="95%">
62
+ </td>
63
+ <td >
64
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/57ef9dee-a9fd-4dc8-839b-3fbbbf0ff3f4 controls preload></video>
65
+ </td>
66
+ <td >
67
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/94d8dcba-1bcd-4b54-9d1d-8b6fc53228f0 controls preload></video>
68
+ </td>
69
+ </tr>
70
+ <tr>
71
+ <td>
72
+ <img src=assets/demo/monalisa/monalisa.png width="95%">
73
+ </td>
74
+ <td >
75
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/1568f604-a34f-4526-a13a-7d282aa2e773 controls preload></video>
76
+ </td>
77
+ <td >
78
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/a40784fc-a885-4c1f-9b7e-8f87b7caf4e0 controls preload></video>
79
+ </td>
80
+ </tr>
81
+ <tr>
82
+ <td>
83
+ <img src=assets/demo/sun1/sun.png width="95%">
84
+ </td>
85
+ <td >
86
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/37a3a666-7b90-4244-8d3a-058cb0e44107 controls preload></video>
87
+ </td>
88
+ <td >
89
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/172f4ff1-d432-45bd-a5a7-a07dec33a26b controls preload></video>
90
+ </td>
91
+ </tr>
92
+ <tr>
93
+ <td>
94
+ <img src=assets/demo/sun2/sun.png width="95%">
95
+ </td>
96
+ <td >
97
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/37a3a666-7b90-4244-8d3a-058cb0e44107 controls preload></video>
98
+ </td>
99
+ <td >
100
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/85a6873d-a028-4cce-af2b-6c59a1f2971d controls preload></video>
101
+ </td>
102
+ </tr>
103
+ </table >
104
+
105
+ * The character of the last two rows, `Xinying Sun`, is a supermodel KOL. You can follow her on [douyin](https://www.douyin.com/user/MS4wLjABAAAAWDThbMPN_6Xmm_JgXexbOii1K-httbu2APdG8DvDyM8).
106
+
107
+ ## Video dubbing
108
+ <table class="center">
109
+ <tr style="font-weight: bolder;text-align:center;">
110
+ <td width="70%">MuseTalk</td>
111
+ <td width="30%">Original videos</td>
112
+ </tr>
113
+ <tr>
114
+ <td>
115
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/4d7c5fa1-3550-4d52-8ed2-52f158150f24 controls preload></video>
116
+ </td>
117
+ <td>
118
+ <a href="//www.bilibili.com/video/BV1wT411b7HU">Link</a>
119
+ <href src=""></href>
120
+ </td>
121
+ </tr>
122
+ </table>
123
+
124
+ * For video dubbing, we applied a self-developed tool which can detect the talking person.
125
+
126
+
127
+ # TODO:
128
+ - [x] trained models and inference codes.
129
+ - [ ] technical report.
130
+ - [ ] training codes.
131
+ - [ ] online UI.
132
+ - [ ] a better model (may take longer).
133
+
134
+
135
+ # Getting Started
136
+ We provide a detailed tutorial about the installation and the basic usage of MuseTalk for new users:
137
+ ## Installation
138
+ To prepare the Python environment and install additional packages such as opencv, diffusers, mmcv, etc., please follow the steps below:
139
+ ### Build environment
140
+
141
+ We recommend a python version >=3.10 and cuda version =11.7. Then build environment as follows:
142
+
143
+ ```shell
144
+ pip install -r requirements.txt
145
+ ```
146
+ ### whisper
147
+ install whisper to extract audio feature (only encoder)
148
+ ```
149
+ pip install --editable ./musetalk/whisper
150
+ ```
151
+
152
+ ### mmlab packages
153
+ ```bash
154
+ pip install --no-cache-dir -U openmim
155
+ mim install mmengine
156
+ mim install "mmcv>=2.0.1"
157
+ mim install "mmdet>=3.1.0"
158
+ mim install "mmpose>=1.1.0"
159
+ ```
160
+
161
+ ### Download ffmpeg-static
162
+ Download ffmpeg-static and set the environment variable:
163
+ ```
164
+ export FFMPEG_PATH=/path/to/ffmpeg
165
+ ```
166
+ for example:
167
+ ```
168
+ export FFMPEG_PATH=/musetalk/ffmpeg-4.4-amd64-static
169
+ ```
170
+ ### Download weights
171
+ You can download weights manually as follows:
172
+
173
+ 1. Download our trained [weights](https://huggingface.co/TMElyralab/MuseTalk).
174
+
175
+ 2. Download the weights of other components:
176
+ - [sd-vae-ft-mse](https://huggingface.co/stabilityai/sd-vae-ft-mse)
177
+ - [whisper](https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt)
178
+ - [dwpose](https://huggingface.co/yzd-v/DWPose/tree/main)
179
+ - [face-parse-bisent](https://github.com/zllrunning/face-parsing.PyTorch)
180
+ - [resnet18](https://download.pytorch.org/models/resnet18-5c106cde.pth)
181
+
182
+
183
+ Finally, these weights should be organized in `models` as follows:
184
+ ```
185
+ ./models/
186
+ ├── musetalk
187
+ │ └── musetalk.json
188
+ │ └── pytorch_model.bin
189
+ ├── dwpose
190
+ │ └── dw-ll_ucoco_384.pth
191
+ ├── face-parse-bisent
192
+ │ ├── 79999_iter.pth
193
+ │ └── resnet18-5c106cde.pth
194
+ ├── sd-vae-ft-mse
195
+ │ ├── config.json
196
+ │ └── diffusion_pytorch_model.bin
197
+ └── whisper
198
+ └── tiny.pt
199
+ ```
200
+ ## Quickstart
201
+
202
+ ### Inference
203
+ Here, we provide the inference script.
204
+ ```
205
+ python -m scripts.inference --inference_config configs/inference/test.yaml
206
+ ```
207
+ configs/inference/test.yaml is the path to the inference configuration file, including video_path and audio_path.
208
+ The video_path should be either a video file or a directory of images.
209
+
210
+ #### Use of bbox_shift to have adjustable results
211
+ :mag_right: We have found that upper-bound of the mask has an important impact on mouth openness. Thus, to control the mask region, we suggest using the `bbox_shift` parameter. Positive values (moving towards the lower half) increase mouth openness, while negative values (moving towards the upper half) decrease mouth openness.
212
+
213
+ You can start by running with the default configuration to obtain the adjustable value range, and then re-run the script within this range.
214
+
215
+ For example, in the case of `Xinying Sun`, after running the default configuration, it shows that the adjustable value range is [-9, 9]. Then, to decrease the mouth openness, we set the value to be `-7`.
216
+ ```
217
+ python -m scripts.inference --inference_config configs/inference/test.yaml --bbox_shift -7
218
+ ```
219
+ :pushpin: More technical details can be found in [bbox_shift](assets/BBOX_SHIFT.md).
220
+
221
+ #### Combining MuseV and MuseTalk
222
+
223
+ As a complete solution to virtual human generation, you are suggested to first apply [MuseV](https://github.com/TMElyralab/MuseV) to generate a video (text-to-video, image-to-video or pose-to-video) by referring [this](https://github.com/TMElyralab/MuseV?tab=readme-ov-file#text2video). Then, you can use `MuseTalk` to generate a lip-sync video by referring [this](https://github.com/TMElyralab/MuseTalk?tab=readme-ov-file#inference).
224
+
225
+ # Note
226
+
227
+ If you want to launch online video chats, you are suggested to generate videos using MuseV and apply necessary pre-processing such as face detection in advance. During online chatting, only UNet and the VAE decoder are involved, which makes MuseTalk real-time.
228
+
229
+
230
+ # Acknowledgement
231
+ 1. We thank open-source components like [whisper](https://github.com/isaacOnline/whisper/tree/extract-embeddings), [dwpose](https://github.com/IDEA-Research/DWPose), [face-alignment](https://github.com/1adrianb/face-alignment), [face-parsing](https://github.com/zllrunning/face-parsing.PyTorch), [S3FD](https://github.com/yxlijun/S3FD.pytorch).
232
+ 1. MuseTalk has referred much to [diffusers](https://github.com/huggingface/diffusers).
233
+ 1. MuseTalk has been built on `HDTF` datasets.
234
+
235
+ Thanks for open-sourcing!
236
+
237
+ # Limitations
238
+ - Resolution: Though MuseTalk uses a face region size of 256 x 256, which makes it better than other open-source methods, it has not yet reached the theoretical resolution bound. We will continue to deal with this problem.
239
+ If you need higher resolution, you could apply super resolution models such as [GFPGAN](https://github.com/TencentARC/GFPGAN) in combination with MuseTalk.
240
+
241
+ - Identity preservation: Some details of the original face are not well preserved, such as mustache, lip shape and color.
242
+
243
+ - Jitter: There exists some jitter as the current pipeline adopts single-frame generation.
244
+
245
+ # Citation
246
+ ```bib
247
+ @article{musetalk,
248
+ title={MuseTalk: Real-Time High Quality Lip Synchronization with Latent Space Inpainting},
249
+ author={Zhang, Yue and Liu, Minhao and Chen, Zhaokang and Wu, Bin and He, Yingjie and Zhan, Chao and Zhou, Wenjiang},
250
+ journal={arxiv},
251
+ year={2024}
252
+ }
253
+ ```
254
+ # Disclaimer/License
255
+ 1. `code`: The code of MuseTalk is released under the MIT License. There is no limitation for both academic and commercial usage.
256
+ 1. `model`: The trained models are available for any purpose, even commercially.
257
+ 1. `other opensource model`: Other open-source models used must comply with their licenses, such as `whisper`, `ft-mse-vae`, `dwpose`, `S3FD`, etc.
258
+ 1. The test data are collected from the internet and are available for non-commercial research purposes only.
259
+ 1. `AIGC`: This project strives to impact the domain of AI-driven video generation positively. Users are granted the freedom to create videos using this tool, but they are expected to comply with local laws and utilize it responsibly. The developers do not assume any responsibility for potential misuse by users.
musetalk/dwpose/.cache/huggingface/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *
musetalk/dwpose/.cache/huggingface/download/dw-ll_ucoco_384.pth.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 1a7144101628d69ee7a3768d1ee3a094070dc388
2
+ 0d9408b13cd863c4e95a149dd31232f88f2a12aa6cf8964ed74d7d97748c7a07
3
+ 1750087247.9831874
musetalk/dwpose/dw-ll_ucoco_384.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d9408b13cd863c4e95a149dd31232f88f2a12aa6cf8964ed74d7d97748c7a07
3
+ size 406878486
musetalk/musetalk/musetalk.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.6.0.dev0",
4
+ "act_fn": "silu",
5
+ "attention_head_dim": 8,
6
+ "block_out_channels": [
7
+ 320,
8
+ 640,
9
+ 1280,
10
+ 1280
11
+ ],
12
+ "center_input_sample": false,
13
+ "cross_attention_dim": 384,
14
+ "down_block_types": [
15
+ "CrossAttnDownBlock2D",
16
+ "CrossAttnDownBlock2D",
17
+ "CrossAttnDownBlock2D",
18
+ "DownBlock2D"
19
+ ],
20
+ "downsample_padding": 1,
21
+ "flip_sin_to_cos": true,
22
+ "freq_shift": 0,
23
+ "in_channels": 8,
24
+ "layers_per_block": 2,
25
+ "mid_block_scale_factor": 1,
26
+ "norm_eps": 1e-05,
27
+ "norm_num_groups": 32,
28
+ "out_channels": 4,
29
+ "sample_size": 64,
30
+ "up_block_types": [
31
+ "UpBlock2D",
32
+ "CrossAttnUpBlock2D",
33
+ "CrossAttnUpBlock2D",
34
+ "CrossAttnUpBlock2D"
35
+ ]
36
+ }
musetalk/musetalk/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ee7d5ea03ea75d8dca50ea7a76df791e90633687a135c4a69393abfc0475ffe
3
+ size 3400076549
musetalk/musetalkV15/musetalk.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.6.0.dev0",
4
+ "act_fn": "silu",
5
+ "attention_head_dim": 8,
6
+ "block_out_channels": [
7
+ 320,
8
+ 640,
9
+ 1280,
10
+ 1280
11
+ ],
12
+ "center_input_sample": false,
13
+ "cross_attention_dim": 384,
14
+ "down_block_types": [
15
+ "CrossAttnDownBlock2D",
16
+ "CrossAttnDownBlock2D",
17
+ "CrossAttnDownBlock2D",
18
+ "DownBlock2D"
19
+ ],
20
+ "downsample_padding": 1,
21
+ "flip_sin_to_cos": true,
22
+ "freq_shift": 0,
23
+ "in_channels": 8,
24
+ "layers_per_block": 2,
25
+ "mid_block_scale_factor": 1,
26
+ "norm_eps": 1e-05,
27
+ "norm_num_groups": 32,
28
+ "out_channels": 4,
29
+ "sample_size": 64,
30
+ "up_block_types": [
31
+ "UpBlock2D",
32
+ "CrossAttnUpBlock2D",
33
+ "CrossAttnUpBlock2D",
34
+ "CrossAttnUpBlock2D"
35
+ ]
36
+ }
musetalk/musetalkV15/unet.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ebf6c98c181e20838e4c0054e96e944ac60d5d692cc01db42839fe11b787007
3
+ size 3400074924
musetalk/syncnet/.cache/huggingface/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *
musetalk/syncnet/.cache/huggingface/download/latentsync_syncnet.pt.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 405eda8eab9f65c1a6e0c292a5dee5a08089e2ae
2
+ 38fa63bad3ed2332f647c40a5dc616cb0e233db8579f698f62af4c41965c4da5
3
+ 1750087764.185959
musetalk/syncnet/latentsync_syncnet.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38fa63bad3ed2332f647c40a5dc616cb0e233db8579f698f62af4c41965c4da5
3
+ size 1488019828
musetalk/whisper/.cache/huggingface/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *
musetalk/whisper/.cache/huggingface/download/config.json.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 169d4a4341b33bc18d8881c4b69c2e104e1cc0af
2
+ 417aa9de49a132dd3eb6a56d3be2718b15f08917
3
+ 1750087057.2418575