Upload 65 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +6 -0
- LAM_audio2exp/pretrained_models/LAM_audio2exp_streaming2.tar +3 -0
- face-parse-bisent/.cache/huggingface/.gitignore +1 -0
- face-parse-bisent/.cache/huggingface/download/79999_iter.pth.metadata +3 -0
- face-parse-bisent/.cache/huggingface/download/resnet18-5c106cde.pth.metadata +3 -0
- face-parse-bisent/79999_iter.pth +3 -0
- face-parse-bisent/resnet18-5c106cde.pth +3 -0
- iic/SenseVoiceSmall/.mdl +0 -0
- iic/SenseVoiceSmall/.msc +0 -0
- iic/SenseVoiceSmall/.mv +1 -0
- iic/SenseVoiceSmall/README.md +219 -0
- iic/SenseVoiceSmall/am.mvn +8 -0
- iic/SenseVoiceSmall/chn_jpn_yue_eng_ko_spectok.bpe.model +3 -0
- iic/SenseVoiceSmall/config.yaml +97 -0
- iic/SenseVoiceSmall/configuration.json +14 -0
- iic/SenseVoiceSmall/example/.DS_Store +0 -0
- iic/SenseVoiceSmall/example/en.mp3 +0 -0
- iic/SenseVoiceSmall/example/ja.mp3 +0 -0
- iic/SenseVoiceSmall/example/ko.mp3 +0 -0
- iic/SenseVoiceSmall/example/yue.mp3 +0 -0
- iic/SenseVoiceSmall/example/zh.mp3 +0 -0
- iic/SenseVoiceSmall/fig/aed_figure.png +3 -0
- iic/SenseVoiceSmall/fig/asr_results.png +3 -0
- iic/SenseVoiceSmall/fig/inference.png +3 -0
- iic/SenseVoiceSmall/fig/sensevoice.png +3 -0
- iic/SenseVoiceSmall/fig/ser_figure.png +3 -0
- iic/SenseVoiceSmall/fig/ser_table.png +3 -0
- iic/SenseVoiceSmall/model.pt +3 -0
- iic/SenseVoiceSmall/tokens.json +0 -0
- musetalk/.cache/huggingface/.gitignore +1 -0
- musetalk/.cache/huggingface/download/.gitattributes.metadata +3 -0
- musetalk/.cache/huggingface/download/README.md.metadata +3 -0
- musetalk/.cache/huggingface/download/musetalk/musetalk.json.metadata +3 -0
- musetalk/.cache/huggingface/download/musetalk/pytorch_model.bin.metadata +3 -0
- musetalk/.cache/huggingface/download/musetalkV15/musetalk.json.metadata +3 -0
- musetalk/.cache/huggingface/download/musetalkV15/unet.pth.metadata +3 -0
- musetalk/.gitattributes +35 -0
- musetalk/README.md +259 -0
- musetalk/dwpose/.cache/huggingface/.gitignore +1 -0
- musetalk/dwpose/.cache/huggingface/download/dw-ll_ucoco_384.pth.metadata +3 -0
- musetalk/dwpose/dw-ll_ucoco_384.pth +3 -0
- musetalk/musetalk/musetalk.json +36 -0
- musetalk/musetalk/pytorch_model.bin +3 -0
- musetalk/musetalkV15/musetalk.json +36 -0
- musetalk/musetalkV15/unet.pth +3 -0
- musetalk/syncnet/.cache/huggingface/.gitignore +1 -0
- musetalk/syncnet/.cache/huggingface/download/latentsync_syncnet.pt.metadata +3 -0
- musetalk/syncnet/latentsync_syncnet.pt +3 -0
- musetalk/whisper/.cache/huggingface/.gitignore +1 -0
- musetalk/whisper/.cache/huggingface/download/config.json.metadata +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
iic/SenseVoiceSmall/fig/aed_figure.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
iic/SenseVoiceSmall/fig/asr_results.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
iic/SenseVoiceSmall/fig/inference.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
iic/SenseVoiceSmall/fig/sensevoice.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
iic/SenseVoiceSmall/fig/ser_figure.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
iic/SenseVoiceSmall/fig/ser_table.png filter=lfs diff=lfs merge=lfs -text
|
LAM_audio2exp/pretrained_models/LAM_audio2exp_streaming2.tar
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38084d471966381f2e52d9f59e5654c9b8cde6fa36d4c6f18fc02cbaf593d157
|
| 3 |
+
size 408538564
|
face-parse-bisent/.cache/huggingface/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*
|
face-parse-bisent/.cache/huggingface/download/79999_iter.pth.metadata
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0073b233a5a3c4b1377d4dbf49245017938a72b5
|
| 2 |
+
468e13ca13a9b43cc0881a9f99083a430e9c0a38abd935431d1c28ee94b26567
|
| 3 |
+
1750088240.4747846
|
face-parse-bisent/.cache/huggingface/download/resnet18-5c106cde.pth.metadata
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0073b233a5a3c4b1377d4dbf49245017938a72b5
|
| 2 |
+
5c106cde386e87d4033832f2996f5493238eda96ccf559d1d62760c4de0613f8
|
| 3 |
+
1750088239.4089577
|
face-parse-bisent/79999_iter.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:468e13ca13a9b43cc0881a9f99083a430e9c0a38abd935431d1c28ee94b26567
|
| 3 |
+
size 53289463
|
face-parse-bisent/resnet18-5c106cde.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5c106cde386e87d4033832f2996f5493238eda96ccf559d1d62760c4de0613f8
|
| 3 |
+
size 46827520
|
iic/SenseVoiceSmall/.mdl
ADDED
|
Binary file (42 Bytes). View file
|
|
|
iic/SenseVoiceSmall/.msc
ADDED
|
Binary file (1.35 kB). View file
|
|
|
iic/SenseVoiceSmall/.mv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Revision:master,CreatedAt:1727321787
|
iic/SenseVoiceSmall/README.md
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
frameworks:
|
| 3 |
+
- Pytorch
|
| 4 |
+
license: Apache License 2.0
|
| 5 |
+
tasks:
|
| 6 |
+
- auto-speech-recognition
|
| 7 |
+
|
| 8 |
+
#model-type:
|
| 9 |
+
##如 gpt、phi、llama、chatglm、baichuan 等
|
| 10 |
+
#- gpt
|
| 11 |
+
|
| 12 |
+
#domain:
|
| 13 |
+
##如 nlp、cv、audio、multi-modal
|
| 14 |
+
#- nlp
|
| 15 |
+
|
| 16 |
+
#language:
|
| 17 |
+
##语言代码列表 https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa
|
| 18 |
+
#- cn
|
| 19 |
+
|
| 20 |
+
#metrics:
|
| 21 |
+
##如 CIDEr、Blue、ROUGE 等
|
| 22 |
+
#- CIDEr
|
| 23 |
+
|
| 24 |
+
#tags:
|
| 25 |
+
##各种自定义,包括 pretrained、fine-tuned、instruction-tuned、RL-tuned 等训练方法和其他
|
| 26 |
+
#- pretrained
|
| 27 |
+
|
| 28 |
+
#tools:
|
| 29 |
+
##如 vllm、fastchat、llamacpp、AdaSeq 等
|
| 30 |
+
#- vllm
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
# Highlights
|
| 34 |
+
**SenseVoice**专注于高精度多语言语音识别、情感辨识和音频事件检测
|
| 35 |
+
- **多语言识别:** 采用超过40万小时数据训练,支持超过50种语言,识别效果上优于Whisper模型。
|
| 36 |
+
- **富文本识别:**
|
| 37 |
+
- 具备优秀的情感识别,能够在测试数据上达到和超过目前最佳情感识别模型的效果。
|
| 38 |
+
- 支持声音事件检测能力,支持音乐、掌声、笑声、哭声、咳嗽、喷嚏等多种常见人机交互事件进行检测。
|
| 39 |
+
- **高效推理:** SenseVoice-Small模型采用非自回归端到端框架,推理延迟极低,10s音频推理仅耗时70ms,15倍优于Whisper-Large。
|
| 40 |
+
- **微调定制:** 具备便捷的微调脚本与策略,方便用户根据业务场景修复长尾样本问题。
|
| 41 |
+
- **服务部署:** 具有完整的服务部署链路,支持多并发请求,支持客户端语言有,python、c++、html、java与c#等。
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
## <strong>[SenseVoice开源项目介绍](https://github.com/FunAudioLLM/SenseVoice)</strong>
|
| 45 |
+
<strong>[SenseVoice](https://github.com/FunAudioLLM/SenseVoice)</strong>开源模型是多语言音频理解模型,具有包括语音识别、语种识别、语音情感识别,声学事件检测能力。
|
| 46 |
+
|
| 47 |
+
[**github仓库**](https://github.com/FunAudioLLM/SenseVoice)
|
| 48 |
+
| [**最新动态**](https://github.com/FunAudioLLM/SenseVoice/blob/main/README_zh.md#%E6%9C%80%E6%96%B0%E5%8A%A8%E6%80%81)
|
| 49 |
+
| [**环境安装**](https://github.com/FunAudioLLM/SenseVoice/blob/main/README_zh.md#%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85)
|
| 50 |
+
|
| 51 |
+
# 模型结构图
|
| 52 |
+
SenseVoice多语言音频理解模型,支持语音识别、语种识别、语音情感识别、声学事件检测、逆文本正则化等能力,采用工业级数十万小时的标注音频进行模型训练,保证了模型的通用识别效果。模型可以被应用于中文、粤语、英语、日语、韩语音频识别,并输出带有情感和事件的富文本转写结果。
|
| 53 |
+
|
| 54 |
+
<p align="center">
|
| 55 |
+
<img src="fig/sensevoice.png" alt="SenseVoice模型结构" width="1500" />
|
| 56 |
+
</p>
|
| 57 |
+
|
| 58 |
+
SenseVoice-Small是基于非自回归端到端框架模型,为了指定任务,我们在语音特征前添加四个嵌入作为输入传递给编码器:
|
| 59 |
+
- LID:用于预测音频语种标签。
|
| 60 |
+
- SER:用于预测音频情感标签。
|
| 61 |
+
- AED:用于预测音频包含的事件标签。
|
| 62 |
+
- ITN:用于指定识别输出文本是否进行逆文本正则化。
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# 依赖环境
|
| 66 |
+
|
| 67 |
+
推理之前,请务必更新funasr与modelscope版本
|
| 68 |
+
|
| 69 |
+
```shell
|
| 70 |
+
pip install -U funasr modelscope
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
# 用法
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
## 推理
|
| 77 |
+
|
| 78 |
+
### modelscope pipeline推理
|
| 79 |
+
```python
|
| 80 |
+
from modelscope.pipelines import pipeline
|
| 81 |
+
from modelscope.utils.constant import Tasks
|
| 82 |
+
|
| 83 |
+
inference_pipeline = pipeline(
|
| 84 |
+
task=Tasks.auto_speech_recognition,
|
| 85 |
+
model='iic/SenseVoiceSmall',
|
| 86 |
+
model_revision="master",
|
| 87 |
+
device="cuda:0",)
|
| 88 |
+
|
| 89 |
+
rec_result = inference_pipeline('https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
|
| 90 |
+
print(rec_result)
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
### 使用funasr推理
|
| 94 |
+
|
| 95 |
+
支持任意格式音频输入,支持任意时长输入
|
| 96 |
+
|
| 97 |
+
```python
|
| 98 |
+
from funasr import AutoModel
|
| 99 |
+
from funasr.utils.postprocess_utils import rich_transcription_postprocess
|
| 100 |
+
|
| 101 |
+
model_dir = "iic/SenseVoiceSmall"
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
model = AutoModel(
|
| 105 |
+
model=model_dir,
|
| 106 |
+
trust_remote_code=True,
|
| 107 |
+
remote_code="./model.py",
|
| 108 |
+
vad_model="fsmn-vad",
|
| 109 |
+
vad_kwargs={"max_single_segment_time": 30000},
|
| 110 |
+
device="cuda:0",
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
# en
|
| 114 |
+
res = model.generate(
|
| 115 |
+
input=f"{model.model_path}/example/en.mp3",
|
| 116 |
+
cache={},
|
| 117 |
+
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
|
| 118 |
+
use_itn=True,
|
| 119 |
+
batch_size_s=60,
|
| 120 |
+
merge_vad=True, #
|
| 121 |
+
merge_length_s=15,
|
| 122 |
+
)
|
| 123 |
+
text = rich_transcription_postprocess(res[0]["text"])
|
| 124 |
+
print(text)
|
| 125 |
+
```
|
| 126 |
+
参数说明:
|
| 127 |
+
- `model_dir`:模型名称,或本地磁盘中的模型路径。
|
| 128 |
+
- `trust_remote_code`:
|
| 129 |
+
- `True`表示model代码实现从`remote_code`处加载,`remote_code`指定`model`具体代码的位置(例如,当前目录下的`model.py`),支持绝对路径与相对路径,以及网络url。
|
| 130 |
+
- `False`表示,model代码实现为 [FunASR](https://github.com/modelscope/FunASR) 内部集成版本,此时修改当前目录下的`model.py`不会生效,因为加载的是funasr内部版本,模型代码[点击查看](https://github.com/modelscope/FunASR/tree/main/funasr/models/sense_voice)。
|
| 131 |
+
- `vad_model`:表示开启VAD,VAD的作用是将长音频切割成短音频,此时推理耗时包括了VAD与SenseVoice总耗时,为链路耗时,如果需要单独测试SenseVoice模型耗时,可以关闭VAD模型。
|
| 132 |
+
- `vad_kwargs`:表示VAD模型配置,`max_single_segment_time`: 表示`vad_model`最大切割音频时长, 单位是毫秒ms。
|
| 133 |
+
- `use_itn`:输出结果中是否包含标点与逆文本正则化。
|
| 134 |
+
- `batch_size_s` 表示采用动态batch,batch中总音频时长,单位为秒s。
|
| 135 |
+
- `merge_vad`:是否将 vad 模型切割的短音频碎片合成,合并后长度为`merge_length_s`,单位为秒s。
|
| 136 |
+
- `ban_emo_unk`:禁用emo_unk标签,禁用后所有的句子都会被赋与情感标签。默认`False`
|
| 137 |
+
|
| 138 |
+
```python
|
| 139 |
+
model = AutoModel(model=model_dir, trust_remote_code=True, device="cuda:0")
|
| 140 |
+
|
| 141 |
+
res = model.generate(
|
| 142 |
+
input=f"{model.model_path}/example/en.mp3",
|
| 143 |
+
cache={},
|
| 144 |
+
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
|
| 145 |
+
use_itn=True,
|
| 146 |
+
batch_size=64,
|
| 147 |
+
)
|
| 148 |
+
```
|
| 149 |
+
|
| 150 |
+
更多详细用法,请参考 [文档](https://github.com/modelscope/FunASR/blob/main/docs/tutorial/README.md)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
## 模型下载
|
| 155 |
+
上面代码会自动下载模型,如果您需要离线下载好模型,可以通过下面代码,手动下载,之后指定模型本地路径即可。
|
| 156 |
+
|
| 157 |
+
SDK下载
|
| 158 |
+
```bash
|
| 159 |
+
#安装ModelScope
|
| 160 |
+
pip install modelscope
|
| 161 |
+
```
|
| 162 |
+
```python
|
| 163 |
+
#SDK模型下载
|
| 164 |
+
from modelscope import snapshot_download
|
| 165 |
+
model_dir = snapshot_download('iic/SenseVoiceSmall')
|
| 166 |
+
```
|
| 167 |
+
Git下载
|
| 168 |
+
```
|
| 169 |
+
#Git模型下载
|
| 170 |
+
git clone https://www.modelscope.cn/iic/SenseVoiceSmall.git
|
| 171 |
+
```
|
| 172 |
+
|
| 173 |
+
## 服务部署
|
| 174 |
+
|
| 175 |
+
Undo
|
| 176 |
+
|
| 177 |
+
# Performance
|
| 178 |
+
|
| 179 |
+
## 语音识别效果
|
| 180 |
+
我们在开源基准数据集(包括 AISHELL-1、AISHELL-2、Wenetspeech、Librispeech和Common Voice)上比较了SenseVoice与Whisper的多语言语音识别性能和推理效率。在中文和粤语识别效果上,SenseVoice-Small模型具有明显的效果优势。
|
| 181 |
+
|
| 182 |
+
<p align="center">
|
| 183 |
+
<img src="fig/asr_results.png" alt="SenseVoice模型在开源测试集上的表现" width="2500" />
|
| 184 |
+
</p>
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
## 情感识别效果
|
| 189 |
+
由于目前缺乏被广泛使用的情感识别测试指标和方法,我们在多个测试集的多种指标进行测试,并与近年来Benchmark上的多个结果进行了全面的对比。所选取的测试集同时包含中文/英文两种语言以及表演、影视剧、自然对话等多种风格的数据,在不进行目标数据微调的前提下,SenseVoice能够在测试数据上达到和超过目前最佳情感识别模型的效果。
|
| 190 |
+
|
| 191 |
+
<p align="center">
|
| 192 |
+
<img src="fig/ser_table.png" alt="SenseVoice模型SER效果1" width="1500" />
|
| 193 |
+
</p>
|
| 194 |
+
|
| 195 |
+
同时,我们还在测试集上对多个开源情感识别模型进行对比,结果表明,SenseVoice-Large模型可以在几乎所有数据上都达到了最佳效果,而SenseVoice-Small模型同样可以在多数数据集上取得超越其他开源模型的效果。
|
| 196 |
+
|
| 197 |
+
<p align="center">
|
| 198 |
+
<img src="fig/ser_figure.png" alt="SenseVoice模型SER效果2" width="500" />
|
| 199 |
+
</p>
|
| 200 |
+
|
| 201 |
+
## 事件检测效果
|
| 202 |
+
|
| 203 |
+
尽管SenseVoice只在语音数据上进行训练,它仍然可以作为事件检测模型进行单独使用。我们在环境音分类ESC-50数据集上与目前业内广泛使用的BEATS与PANN模型的效果进行了对比。SenseVoice模型能够在这些任务上取得较好的效果,但受限于训练数据与训练方式,其事件分类效果专业的事件检测模型相比仍然有一定的差距。
|
| 204 |
+
|
| 205 |
+
<p align="center">
|
| 206 |
+
<img src="fig/aed_figure.png" alt="SenseVoice模型AED效果" width="500" />
|
| 207 |
+
</p>
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
## 推理效率
|
| 212 |
+
SenseVoice-Small模型采用非自回归端到端架构,推理延迟极低。在参数量与Whisper-Small模型相当的情况下,比Whisper-Small模型推理速度快7倍,比Whisper-Large模型快17倍。同时SenseVoice-small模型在音频时长增加的情况下,推理耗时也无明显增加。
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
<p align="center">
|
| 216 |
+
<img src="fig/inference.png" alt="SenseVoice模型的推理效率" width="1500" />
|
| 217 |
+
</p>
|
| 218 |
+
|
| 219 |
+
<p style="color: lightgrey;">如果您是本模型的贡献者,我们邀请您根据<a href="https://modelscope.cn/docs/ModelScope%E6%A8%A1%E5%9E%8B%E6%8E%A5%E5%85%A5%E6%B5%81%E7%A8%8B%E6%A6%82%E8%A7%88" style="color: lightgrey; text-decoration: underline;">模型贡献文档</a>,及时完善模型卡片内容。</p>
|
iic/SenseVoiceSmall/am.mvn
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<Nnet>
|
| 2 |
+
<Splice> 560 560
|
| 3 |
+
[ 0 ]
|
| 4 |
+
<AddShift> 560 560
|
| 5 |
+
<LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 
-13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 
-8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
|
| 6 |
+
<Rescale> 560 560
|
| 7 |
+
<LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 
0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 
0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
|
| 8 |
+
</Nnet>
|
iic/SenseVoiceSmall/chn_jpn_yue_eng_ko_spectok.bpe.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa87f86064c3730d799ddf7af3c04659151102cba548bce325cf06ba4da4e6a8
|
| 3 |
+
size 377341
|
iic/SenseVoiceSmall/config.yaml
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
encoder: SenseVoiceEncoderSmall
|
| 2 |
+
encoder_conf:
|
| 3 |
+
output_size: 512
|
| 4 |
+
attention_heads: 4
|
| 5 |
+
linear_units: 2048
|
| 6 |
+
num_blocks: 50
|
| 7 |
+
tp_blocks: 20
|
| 8 |
+
dropout_rate: 0.1
|
| 9 |
+
positional_dropout_rate: 0.1
|
| 10 |
+
attention_dropout_rate: 0.1
|
| 11 |
+
input_layer: pe
|
| 12 |
+
pos_enc_class: SinusoidalPositionEncoder
|
| 13 |
+
normalize_before: true
|
| 14 |
+
kernel_size: 11
|
| 15 |
+
sanm_shfit: 0
|
| 16 |
+
selfattention_layer_type: sanm
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
model: SenseVoiceSmall
|
| 20 |
+
model_conf:
|
| 21 |
+
length_normalized_loss: true
|
| 22 |
+
sos: 1
|
| 23 |
+
eos: 2
|
| 24 |
+
ignore_id: -1
|
| 25 |
+
|
| 26 |
+
tokenizer: SentencepiecesTokenizer
|
| 27 |
+
tokenizer_conf:
|
| 28 |
+
bpemodel: null
|
| 29 |
+
unk_symbol: <unk>
|
| 30 |
+
split_with_space: true
|
| 31 |
+
|
| 32 |
+
frontend: WavFrontend
|
| 33 |
+
frontend_conf:
|
| 34 |
+
fs: 16000
|
| 35 |
+
window: hamming
|
| 36 |
+
n_mels: 80
|
| 37 |
+
frame_length: 25
|
| 38 |
+
frame_shift: 10
|
| 39 |
+
lfr_m: 7
|
| 40 |
+
lfr_n: 6
|
| 41 |
+
cmvn_file: null
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
dataset: SenseVoiceCTCDataset
|
| 45 |
+
dataset_conf:
|
| 46 |
+
index_ds: IndexDSJsonl
|
| 47 |
+
batch_sampler: EspnetStyleBatchSampler
|
| 48 |
+
data_split_num: 32
|
| 49 |
+
batch_type: token
|
| 50 |
+
batch_size: 14000
|
| 51 |
+
max_token_length: 2000
|
| 52 |
+
min_token_length: 60
|
| 53 |
+
max_source_length: 2000
|
| 54 |
+
min_source_length: 60
|
| 55 |
+
max_target_length: 200
|
| 56 |
+
min_target_length: 0
|
| 57 |
+
shuffle: true
|
| 58 |
+
num_workers: 4
|
| 59 |
+
sos: ${model_conf.sos}
|
| 60 |
+
eos: ${model_conf.eos}
|
| 61 |
+
IndexDSJsonl: IndexDSJsonl
|
| 62 |
+
retry: 20
|
| 63 |
+
|
| 64 |
+
train_conf:
|
| 65 |
+
accum_grad: 1
|
| 66 |
+
grad_clip: 5
|
| 67 |
+
max_epoch: 20
|
| 68 |
+
keep_nbest_models: 10
|
| 69 |
+
avg_nbest_model: 10
|
| 70 |
+
log_interval: 100
|
| 71 |
+
resume: true
|
| 72 |
+
validate_interval: 10000
|
| 73 |
+
save_checkpoint_interval: 10000
|
| 74 |
+
|
| 75 |
+
optim: adamw
|
| 76 |
+
optim_conf:
|
| 77 |
+
lr: 0.00002
|
| 78 |
+
scheduler: warmuplr
|
| 79 |
+
scheduler_conf:
|
| 80 |
+
warmup_steps: 25000
|
| 81 |
+
|
| 82 |
+
specaug: SpecAugLFR
|
| 83 |
+
specaug_conf:
|
| 84 |
+
apply_time_warp: false
|
| 85 |
+
time_warp_window: 5
|
| 86 |
+
time_warp_mode: bicubic
|
| 87 |
+
apply_freq_mask: true
|
| 88 |
+
freq_mask_width_range:
|
| 89 |
+
- 0
|
| 90 |
+
- 30
|
| 91 |
+
lfr_rate: 6
|
| 92 |
+
num_freq_mask: 1
|
| 93 |
+
apply_time_mask: true
|
| 94 |
+
time_mask_width_range:
|
| 95 |
+
- 0
|
| 96 |
+
- 12
|
| 97 |
+
num_time_mask: 1
|
iic/SenseVoiceSmall/configuration.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"framework": "pytorch",
|
| 3 |
+
"task" : "auto-speech-recognition",
|
| 4 |
+
"model": {"type" : "funasr"},
|
| 5 |
+
"pipeline": {"type":"funasr-pipeline"},
|
| 6 |
+
"model_name_in_hub": {
|
| 7 |
+
"ms":"",
|
| 8 |
+
"hf":""},
|
| 9 |
+
"file_path_metas": {
|
| 10 |
+
"init_param":"model.pt",
|
| 11 |
+
"config":"config.yaml",
|
| 12 |
+
"tokenizer_conf": {"bpemodel": "chn_jpn_yue_eng_ko_spectok.bpe.model"},
|
| 13 |
+
"frontend_conf":{"cmvn_file": "am.mvn"}}
|
| 14 |
+
}
|
iic/SenseVoiceSmall/example/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
iic/SenseVoiceSmall/example/en.mp3
ADDED
|
Binary file (57.4 kB). View file
|
|
|
iic/SenseVoiceSmall/example/ja.mp3
ADDED
|
Binary file (57.8 kB). View file
|
|
|
iic/SenseVoiceSmall/example/ko.mp3
ADDED
|
Binary file (27.9 kB). View file
|
|
|
iic/SenseVoiceSmall/example/yue.mp3
ADDED
|
Binary file (31.2 kB). View file
|
|
|
iic/SenseVoiceSmall/example/zh.mp3
ADDED
|
Binary file (45 kB). View file
|
|
|
iic/SenseVoiceSmall/fig/aed_figure.png
ADDED
|
|
Git LFS Details
|
iic/SenseVoiceSmall/fig/asr_results.png
ADDED
|
|
Git LFS Details
|
iic/SenseVoiceSmall/fig/inference.png
ADDED
|
|
Git LFS Details
|
iic/SenseVoiceSmall/fig/sensevoice.png
ADDED
|
|
Git LFS Details
|
iic/SenseVoiceSmall/fig/ser_figure.png
ADDED
|
|
Git LFS Details
|
iic/SenseVoiceSmall/fig/ser_table.png
ADDED
|
|
Git LFS Details
|
iic/SenseVoiceSmall/model.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:833ca2dcfdf8ec91bd4f31cfac36d6124e0c459074d5e909aec9cabe6204a3ea
|
| 3 |
+
size 936291369
|
iic/SenseVoiceSmall/tokens.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
musetalk/.cache/huggingface/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*
|
musetalk/.cache/huggingface/download/.gitattributes.metadata
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
3ef28bc5cff08c90ad8178a25f1b570cd800170f
|
| 2 |
+
a6344aac8c09253b3b630fb776ae94478aa0275b
|
| 3 |
+
1750086024.9018652
|
musetalk/.cache/huggingface/download/README.md.metadata
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
3ef28bc5cff08c90ad8178a25f1b570cd800170f
|
| 2 |
+
74e11fb4b681253f7fe73d9c4b80ec0021949213
|
| 3 |
+
1750086024.779342
|
musetalk/.cache/huggingface/download/musetalk/musetalk.json.metadata
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
3ef28bc5cff08c90ad8178a25f1b570cd800170f
|
| 2 |
+
b822db87e503a283fbbee73617f89dcd294cb91c
|
| 3 |
+
1750086025.6143801
|
musetalk/.cache/huggingface/download/musetalk/pytorch_model.bin.metadata
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
3ef28bc5cff08c90ad8178a25f1b570cd800170f
|
| 2 |
+
0ee7d5ea03ea75d8dca50ea7a76df791e90633687a135c4a69393abfc0475ffe
|
| 3 |
+
1750086527.7114985
|
musetalk/.cache/huggingface/download/musetalkV15/musetalk.json.metadata
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
3ef28bc5cff08c90ad8178a25f1b570cd800170f
|
| 2 |
+
b822db87e503a283fbbee73617f89dcd294cb91c
|
| 3 |
+
1750086025.06287
|
musetalk/.cache/huggingface/download/musetalkV15/unet.pth.metadata
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
3ef28bc5cff08c90ad8178a25f1b570cd800170f
|
| 2 |
+
7ebf6c98c181e20838e4c0054e96e944ac60d5d692cc01db42839fe11b787007
|
| 3 |
+
1750086542.267899
|
musetalk/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
musetalk/README.md
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: creativeml-openrail-m
|
| 3 |
+
language:
|
| 4 |
+
- en
|
| 5 |
+
---
|
| 6 |
+
# MuseTalk
|
| 7 |
+
|
| 8 |
+
MuseTalk: Real-Time High Quality Lip Synchronization with Latent Space Inpainting
|
| 9 |
+
</br>
|
| 10 |
+
Yue Zhang <sup>\*</sup>,
|
| 11 |
+
Minhao Liu<sup>\*</sup>,
|
| 12 |
+
Zhaokang Chen,
|
| 13 |
+
Bin Wu<sup>†</sup>,
|
| 14 |
+
Yingjie He,
|
| 15 |
+
Chao Zhan,
|
| 16 |
+
Wenjiang Zhou
|
| 17 |
+
(<sup>*</sup>Equal Contribution, <sup>†</sup>Corresponding Author, benbinwu@tencent.com)
|
| 18 |
+
|
| 19 |
+
**[github](https://github.com/TMElyralab/MuseTalk)** **[huggingface](https://huggingface.co/TMElyralab/MuseTalk)** **Project(coming soon)** **Technical report (coming soon)**
|
| 20 |
+
|
| 21 |
+
We introduce `MuseTalk`, a **real-time high quality** lip-syncing model (30fps+ on an NVIDIA Tesla V100). MuseTalk can be applied with input videos, e.g., generated by [MuseV](https://github.com/TMElyralab/MuseV), as a complete virtual human solution.
|
| 22 |
+
|
| 23 |
+
# Overview
|
| 24 |
+
`MuseTalk` is a real-time high quality audio-driven lip-syncing model trained in the latent space of `ft-mse-vae`, which
|
| 25 |
+
|
| 26 |
+
1. modifies an unseen face according to the input audio, with a size of face region of `256 x 256`.
|
| 27 |
+
1. supports audio in various languages, such as Chinese, English, and Japanese.
|
| 28 |
+
1. supports real-time inference with 30fps+ on an NVIDIA Tesla V100.
|
| 29 |
+
1. supports modification of the center point of the face region proposes, which **SIGNIFICANTLY** affects generation results.
|
| 30 |
+
1. checkpoint available trained on the HDTF dataset.
|
| 31 |
+
1. training codes (coming soon).
|
| 32 |
+
|
| 33 |
+
# News
|
| 34 |
+
- [04/02/2024] Released MuseTalk project and pretrained models.
|
| 35 |
+
|
| 36 |
+
## Model
|
| 37 |
+

|
| 38 |
+
MuseTalk was trained in latent spaces, where the images were encoded by a frozen VAE. The audio was encoded by a frozen `whisper-tiny` model. The architecture of the generation network was borrowed from the UNet of the `stable-diffusion-v1-4`, where the audio embeddings were fused to the image embeddings by cross-attention.
|
| 39 |
+
|
| 40 |
+
## Cases
|
| 41 |
+
### MuseV + MuseTalk make human photos alive!
|
| 42 |
+
<table class="center">
|
| 43 |
+
<tr style="font-weight: bolder;text-align:center;">
|
| 44 |
+
<td width="33%">Image</td>
|
| 45 |
+
<td width="33%">MuseV</td>
|
| 46 |
+
<td width="33%">+MuseTalk</td>
|
| 47 |
+
</tr>
|
| 48 |
+
<tr>
|
| 49 |
+
<td>
|
| 50 |
+
<img src=assets/demo/musk/musk.png width="95%">
|
| 51 |
+
</td>
|
| 52 |
+
<td >
|
| 53 |
+
<video src=assets/demo/yongen/yongen_musev.mp4 controls preload></video>
|
| 54 |
+
</td>
|
| 55 |
+
<td >
|
| 56 |
+
<video src=assets/demo/yongen/yongen_musetalk.mp4 controls preload></video>
|
| 57 |
+
</td>
|
| 58 |
+
</tr>
|
| 59 |
+
<tr>
|
| 60 |
+
<td>
|
| 61 |
+
<img src=assets/demo/yongen/yongen.jpeg width="95%">
|
| 62 |
+
</td>
|
| 63 |
+
<td >
|
| 64 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/57ef9dee-a9fd-4dc8-839b-3fbbbf0ff3f4 controls preload></video>
|
| 65 |
+
</td>
|
| 66 |
+
<td >
|
| 67 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/94d8dcba-1bcd-4b54-9d1d-8b6fc53228f0 controls preload></video>
|
| 68 |
+
</td>
|
| 69 |
+
</tr>
|
| 70 |
+
<tr>
|
| 71 |
+
<td>
|
| 72 |
+
<img src=assets/demo/monalisa/monalisa.png width="95%">
|
| 73 |
+
</td>
|
| 74 |
+
<td >
|
| 75 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/1568f604-a34f-4526-a13a-7d282aa2e773 controls preload></video>
|
| 76 |
+
</td>
|
| 77 |
+
<td >
|
| 78 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/a40784fc-a885-4c1f-9b7e-8f87b7caf4e0 controls preload></video>
|
| 79 |
+
</td>
|
| 80 |
+
</tr>
|
| 81 |
+
<tr>
|
| 82 |
+
<td>
|
| 83 |
+
<img src=assets/demo/sun1/sun.png width="95%">
|
| 84 |
+
</td>
|
| 85 |
+
<td >
|
| 86 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/37a3a666-7b90-4244-8d3a-058cb0e44107 controls preload></video>
|
| 87 |
+
</td>
|
| 88 |
+
<td >
|
| 89 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/172f4ff1-d432-45bd-a5a7-a07dec33a26b controls preload></video>
|
| 90 |
+
</td>
|
| 91 |
+
</tr>
|
| 92 |
+
<tr>
|
| 93 |
+
<td>
|
| 94 |
+
<img src=assets/demo/sun2/sun.png width="95%">
|
| 95 |
+
</td>
|
| 96 |
+
<td >
|
| 97 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/37a3a666-7b90-4244-8d3a-058cb0e44107 controls preload></video>
|
| 98 |
+
</td>
|
| 99 |
+
<td >
|
| 100 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/85a6873d-a028-4cce-af2b-6c59a1f2971d controls preload></video>
|
| 101 |
+
</td>
|
| 102 |
+
</tr>
|
| 103 |
+
</table >
|
| 104 |
+
|
| 105 |
+
* The character of the last two rows, `Xinying Sun`, is a supermodel KOL. You can follow her on [douyin](https://www.douyin.com/user/MS4wLjABAAAAWDThbMPN_6Xmm_JgXexbOii1K-httbu2APdG8DvDyM8).
|
| 106 |
+
|
| 107 |
+
## Video dubbing
|
| 108 |
+
<table class="center">
|
| 109 |
+
<tr style="font-weight: bolder;text-align:center;">
|
| 110 |
+
<td width="70%">MuseTalk</td>
|
| 111 |
+
<td width="30%">Original videos</td>
|
| 112 |
+
</tr>
|
| 113 |
+
<tr>
|
| 114 |
+
<td>
|
| 115 |
+
<video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/4d7c5fa1-3550-4d52-8ed2-52f158150f24 controls preload></video>
|
| 116 |
+
</td>
|
| 117 |
+
<td>
|
| 118 |
+
<a href="//www.bilibili.com/video/BV1wT411b7HU">Link</a>
|
| 119 |
+
<href src=""></href>
|
| 120 |
+
</td>
|
| 121 |
+
</tr>
|
| 122 |
+
</table>
|
| 123 |
+
|
| 124 |
+
* For video dubbing, we applied a self-developed tool which can detect the talking person.
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# TODO:
|
| 128 |
+
- [x] trained models and inference codes.
|
| 129 |
+
- [ ] technical report.
|
| 130 |
+
- [ ] training codes.
|
| 131 |
+
- [ ] online UI.
|
| 132 |
+
- [ ] a better model (may take longer).
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
# Getting Started
|
| 136 |
+
We provide a detailed tutorial about the installation and the basic usage of MuseTalk for new users:
|
| 137 |
+
## Installation
|
| 138 |
+
To prepare the Python environment and install additional packages such as opencv, diffusers, mmcv, etc., please follow the steps below:
|
| 139 |
+
### Build environment
|
| 140 |
+
|
| 141 |
+
We recommend a python version >=3.10 and cuda version =11.7. Then build environment as follows:
|
| 142 |
+
|
| 143 |
+
```shell
|
| 144 |
+
pip install -r requirements.txt
|
| 145 |
+
```
|
| 146 |
+
### whisper
|
| 147 |
+
install whisper to extract audio feature (only encoder)
|
| 148 |
+
```
|
| 149 |
+
pip install --editable ./musetalk/whisper
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
### mmlab packages
|
| 153 |
+
```bash
|
| 154 |
+
pip install --no-cache-dir -U openmim
|
| 155 |
+
mim install mmengine
|
| 156 |
+
mim install "mmcv>=2.0.1"
|
| 157 |
+
mim install "mmdet>=3.1.0"
|
| 158 |
+
mim install "mmpose>=1.1.0"
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
### Download ffmpeg-static
|
| 162 |
+
Download the ffmpeg-static and
|
| 163 |
+
```
|
| 164 |
+
export FFMPEG_PATH=/path/to/ffmpeg
|
| 165 |
+
```
|
| 166 |
+
for example:
|
| 167 |
+
```
|
| 168 |
+
export FFMPEG_PATH=/musetalk/ffmpeg-4.4-amd64-static
|
| 169 |
+
```
|
| 170 |
+
### Download weights
|
| 171 |
+
You can download weights manually as follows:
|
| 172 |
+
|
| 173 |
+
1. Download our trained [weights](https://huggingface.co/TMElyralab/MuseTalk).
|
| 174 |
+
|
| 175 |
+
2. Download the weights of other components:
|
| 176 |
+
- [sd-vae-ft-mse](https://huggingface.co/stabilityai/sd-vae-ft-mse)
|
| 177 |
+
- [whisper](https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt)
|
| 178 |
+
- [dwpose](https://huggingface.co/yzd-v/DWPose/tree/main)
|
| 179 |
+
- [face-parse-bisent](https://github.com/zllrunning/face-parsing.PyTorch)
|
| 180 |
+
- [resnet18](https://download.pytorch.org/models/resnet18-5c106cde.pth)
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
Finally, these weights should be organized in `models` as follows:
|
| 184 |
+
```
|
| 185 |
+
./models/
|
| 186 |
+
├── musetalk
|
| 187 |
+
│ └── musetalk.json
|
| 188 |
+
│ └── pytorch_model.bin
|
| 189 |
+
├── dwpose
|
| 190 |
+
│ └── dw-ll_ucoco_384.pth
|
| 191 |
+
├── face-parse-bisent
|
| 192 |
+
│ ├── 79999_iter.pth
|
| 193 |
+
│ └── resnet18-5c106cde.pth
|
| 194 |
+
├── sd-vae-ft-mse
|
| 195 |
+
│ ├── config.json
|
| 196 |
+
│ └── diffusion_pytorch_model.bin
|
| 197 |
+
└── whisper
|
| 198 |
+
└── tiny.pt
|
| 199 |
+
```
|
| 200 |
+
## Quickstart
|
| 201 |
+
|
| 202 |
+
### Inference
|
| 203 |
+
Here, we provide the inference script.
|
| 204 |
+
```
|
| 205 |
+
python -m scripts.inference --inference_config configs/inference/test.yaml
|
| 206 |
+
```
|
| 207 |
+
configs/inference/test.yaml is the path to the inference configuration file, including video_path and audio_path.
|
| 208 |
+
The video_path should be either a video file or a directory of images.
|
| 209 |
+
|
| 210 |
+
#### Use of bbox_shift to have adjustable results
|
| 211 |
+
:mag_right: We have found that upper-bound of the mask has an important impact on mouth openness. Thus, to control the mask region, we suggest using the `bbox_shift` parameter. Positive values (moving towards the lower half) increase mouth openness, while negative values (moving towards the upper half) decrease mouth openness.
|
| 212 |
+
|
| 213 |
+
You can start by running with the default configuration to obtain the adjustable value range, and then re-run the script within this range.
|
| 214 |
+
|
| 215 |
+
For example, in the case of `Xinying Sun`, after running the default configuration, it shows that the adjustable value range is [-9, 9]. Then, to decrease the mouth openness, we set the value to be `-7`.
|
| 216 |
+
```
|
| 217 |
+
python -m scripts.inference --inference_config configs/inference/test.yaml --bbox_shift -7
|
| 218 |
+
```
|
| 219 |
+
:pushpin: More technical details can be found in [bbox_shift](assets/BBOX_SHIFT.md).
|
| 220 |
+
|
| 221 |
+
#### Combining MuseV and MuseTalk
|
| 222 |
+
|
| 223 |
+
As a complete solution to virtual human generation, you are suggested to first apply [MuseV](https://github.com/TMElyralab/MuseV) to generate a video (text-to-video, image-to-video or pose-to-video) by referring [this](https://github.com/TMElyralab/MuseV?tab=readme-ov-file#text2video). Then, you can use `MuseTalk` to generate a lip-sync video by referring [this](https://github.com/TMElyralab/MuseTalk?tab=readme-ov-file#inference).
|
| 224 |
+
|
| 225 |
+
# Note
|
| 226 |
+
|
| 227 |
+
If you want to launch online video chats, you are suggested to generate videos using MuseV and apply necessary pre-processing such as face detection in advance. During online chatting, only UNet and the VAE decoder are involved, which makes MuseTalk real-time.
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
# Acknowledgement
|
| 231 |
+
1. We thank open-source components like [whisper](https://github.com/isaacOnline/whisper/tree/extract-embeddings), [dwpose](https://github.com/IDEA-Research/DWPose), [face-alignment](https://github.com/1adrianb/face-alignment), [face-parsing](https://github.com/zllrunning/face-parsing.PyTorch), [S3FD](https://github.com/yxlijun/S3FD.pytorch).
|
| 232 |
+
1. MuseTalk has referred much to [diffusers](https://github.com/huggingface/diffusers).
|
| 233 |
+
1. MuseTalk has been built on `HDTF` datasets.
|
| 234 |
+
|
| 235 |
+
Thanks for open-sourcing!
|
| 236 |
+
|
| 237 |
+
# Limitations
|
| 238 |
+
- Resolution: Though MuseTalk uses a face region size of 256 x 256, which makes it better than other open-source methods, it has not yet reached the theoretical resolution bound. We will continue to deal with this problem.
|
| 239 |
+
If you need higher resolution, you could apply super resolution models such as [GFPGAN](https://github.com/TencentARC/GFPGAN) in combination with MuseTalk.
|
| 240 |
+
|
| 241 |
+
- Identity preservation: Some details of the original face are not well preserved, such as mustache, lip shape and color.
|
| 242 |
+
|
| 243 |
+
- Jitter: There exists some jitter as the current pipeline adopts single-frame generation.
|
| 244 |
+
|
| 245 |
+
# Citation
|
| 246 |
+
```bib
|
| 247 |
+
@article{musetalk,
|
| 248 |
+
title={MuseTalk: Real-Time High Quality Lip Synchronization with Latent Space Inpainting},
|
| 249 |
+
author={Zhang, Yue and Liu, Minhao and Chen, Zhaokang and Wu, Bin and He, Yingjie and Zhan, Chao and Zhou, Wenjiang},
|
| 250 |
+
journal={arxiv},
|
| 251 |
+
year={2024}
|
| 252 |
+
}
|
| 253 |
+
```
|
| 254 |
+
# Disclaimer/License
|
| 255 |
+
1. `code`: The code of MuseTalk is released under the MIT License. There is no limitation for both academic and commercial usage.
|
| 256 |
+
1. `model`: The trained model are available for any purpose, even commercially.
|
| 257 |
+
1. `other opensource model`: Other open-source models used must comply with their license, such as `whisper`, `ft-mse-vae`, `dwpose`, `S3FD`, etc..
|
| 258 |
+
1. The test data are collected from the internet, and are available for non-commercial research purposes only.
|
| 259 |
+
1. `AIGC`: This project strives to impact the domain of AI-driven video generation positively. Users are granted the freedom to create videos using this tool, but they are expected to comply with local laws and utilize it responsibly. The developers do not assume any responsibility for potential misuse by users.
|
musetalk/dwpose/.cache/huggingface/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*
|
musetalk/dwpose/.cache/huggingface/download/dw-ll_ucoco_384.pth.metadata
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
1a7144101628d69ee7a3768d1ee3a094070dc388
|
| 2 |
+
0d9408b13cd863c4e95a149dd31232f88f2a12aa6cf8964ed74d7d97748c7a07
|
| 3 |
+
1750087247.9831874
|
musetalk/dwpose/dw-ll_ucoco_384.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0d9408b13cd863c4e95a149dd31232f88f2a12aa6cf8964ed74d7d97748c7a07
|
| 3 |
+
size 406878486
|
musetalk/musetalk/musetalk.json
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "UNet2DConditionModel",
|
| 3 |
+
"_diffusers_version": "0.6.0.dev0",
|
| 4 |
+
"act_fn": "silu",
|
| 5 |
+
"attention_head_dim": 8,
|
| 6 |
+
"block_out_channels": [
|
| 7 |
+
320,
|
| 8 |
+
640,
|
| 9 |
+
1280,
|
| 10 |
+
1280
|
| 11 |
+
],
|
| 12 |
+
"center_input_sample": false,
|
| 13 |
+
"cross_attention_dim": 384,
|
| 14 |
+
"down_block_types": [
|
| 15 |
+
"CrossAttnDownBlock2D",
|
| 16 |
+
"CrossAttnDownBlock2D",
|
| 17 |
+
"CrossAttnDownBlock2D",
|
| 18 |
+
"DownBlock2D"
|
| 19 |
+
],
|
| 20 |
+
"downsample_padding": 1,
|
| 21 |
+
"flip_sin_to_cos": true,
|
| 22 |
+
"freq_shift": 0,
|
| 23 |
+
"in_channels": 8,
|
| 24 |
+
"layers_per_block": 2,
|
| 25 |
+
"mid_block_scale_factor": 1,
|
| 26 |
+
"norm_eps": 1e-05,
|
| 27 |
+
"norm_num_groups": 32,
|
| 28 |
+
"out_channels": 4,
|
| 29 |
+
"sample_size": 64,
|
| 30 |
+
"up_block_types": [
|
| 31 |
+
"UpBlock2D",
|
| 32 |
+
"CrossAttnUpBlock2D",
|
| 33 |
+
"CrossAttnUpBlock2D",
|
| 34 |
+
"CrossAttnUpBlock2D"
|
| 35 |
+
]
|
| 36 |
+
}
|
musetalk/musetalk/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0ee7d5ea03ea75d8dca50ea7a76df791e90633687a135c4a69393abfc0475ffe
|
| 3 |
+
size 3400076549
|
musetalk/musetalkV15/musetalk.json
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "UNet2DConditionModel",
|
| 3 |
+
"_diffusers_version": "0.6.0.dev0",
|
| 4 |
+
"act_fn": "silu",
|
| 5 |
+
"attention_head_dim": 8,
|
| 6 |
+
"block_out_channels": [
|
| 7 |
+
320,
|
| 8 |
+
640,
|
| 9 |
+
1280,
|
| 10 |
+
1280
|
| 11 |
+
],
|
| 12 |
+
"center_input_sample": false,
|
| 13 |
+
"cross_attention_dim": 384,
|
| 14 |
+
"down_block_types": [
|
| 15 |
+
"CrossAttnDownBlock2D",
|
| 16 |
+
"CrossAttnDownBlock2D",
|
| 17 |
+
"CrossAttnDownBlock2D",
|
| 18 |
+
"DownBlock2D"
|
| 19 |
+
],
|
| 20 |
+
"downsample_padding": 1,
|
| 21 |
+
"flip_sin_to_cos": true,
|
| 22 |
+
"freq_shift": 0,
|
| 23 |
+
"in_channels": 8,
|
| 24 |
+
"layers_per_block": 2,
|
| 25 |
+
"mid_block_scale_factor": 1,
|
| 26 |
+
"norm_eps": 1e-05,
|
| 27 |
+
"norm_num_groups": 32,
|
| 28 |
+
"out_channels": 4,
|
| 29 |
+
"sample_size": 64,
|
| 30 |
+
"up_block_types": [
|
| 31 |
+
"UpBlock2D",
|
| 32 |
+
"CrossAttnUpBlock2D",
|
| 33 |
+
"CrossAttnUpBlock2D",
|
| 34 |
+
"CrossAttnUpBlock2D"
|
| 35 |
+
]
|
| 36 |
+
}
|
musetalk/musetalkV15/unet.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ebf6c98c181e20838e4c0054e96e944ac60d5d692cc01db42839fe11b787007
|
| 3 |
+
size 3400074924
|
musetalk/syncnet/.cache/huggingface/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*
|
musetalk/syncnet/.cache/huggingface/download/latentsync_syncnet.pt.metadata
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
405eda8eab9f65c1a6e0c292a5dee5a08089e2ae
|
| 2 |
+
38fa63bad3ed2332f647c40a5dc616cb0e233db8579f698f62af4c41965c4da5
|
| 3 |
+
1750087764.185959
|
musetalk/syncnet/latentsync_syncnet.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38fa63bad3ed2332f647c40a5dc616cb0e233db8579f698f62af4c41965c4da5
|
| 3 |
+
size 1488019828
|
musetalk/whisper/.cache/huggingface/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*
|
musetalk/whisper/.cache/huggingface/download/config.json.metadata
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
169d4a4341b33bc18d8881c4b69c2e104e1cc0af
|
| 2 |
+
417aa9de49a132dd3eb6a56d3be2718b15f08917
|
| 3 |
+
1750087057.2418575
|