Spaces:
Running
Running
fix: 本地 GUI 优化
Browse files- docs/音源格式规范.md +293 -0
- src/gui_old.py +133 -42
- src/mfa_runner.py +14 -3
docs/音源格式规范.md
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 人力V助手 音源格式规范
|
| 2 |
+
|
| 3 |
+
本文档描述 `bank/` 目录下音源的数据格式,便于其他软件联动调用或生成兼容数据。
|
| 4 |
+
|
| 5 |
+
## 目录结构
|
| 6 |
+
|
| 7 |
+
```
|
| 8 |
+
bank/
|
| 9 |
+
└── {音源名称}/
|
| 10 |
+
├── meta.json # 音源元数据
|
| 11 |
+
├── slices/ # 音频切片目录
|
| 12 |
+
│ ├── {name}_0000.wav
|
| 13 |
+
│ ├── {name}_0000.lab
|
| 14 |
+
│ ├── {name}_0001.wav
|
| 15 |
+
│ ├── {name}_0001.lab
|
| 16 |
+
│ └── ...
|
| 17 |
+
└── textgrid/ # MFA 对齐结果目录
|
| 18 |
+
├── {name}_0000.TextGrid
|
| 19 |
+
├── {name}_0001.TextGrid
|
| 20 |
+
├── ...
|
| 21 |
+
└── alignment_analysis.csv # 对齐分析报告(可选)
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
## 文件格式详解
|
| 25 |
+
|
| 26 |
+
### 1. meta.json - 音源元数据
|
| 27 |
+
|
| 28 |
+
```json
|
| 29 |
+
{
|
| 30 |
+
"source_name": "音源名称",
|
| 31 |
+
"created_at": "2026-01-31T18:37:05.045252",
|
| 32 |
+
"updated_at": "2026-01-31T18:37:50.838075",
|
| 33 |
+
"whisper_model": "openai/whisper-medium",
|
| 34 |
+
"mfa_dict": "mandarin_china_mfa.dict",
|
| 35 |
+
"mfa_acoustic": "mandarin_mfa.zip",
|
| 36 |
+
"language": "chinese",
|
| 37 |
+
"single_speaker": true,
|
| 38 |
+
"slice_count": 13,
|
| 39 |
+
"textgrid_count": 13
|
| 40 |
+
}
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
| 字段 | 类型 | 说明 |
|
| 44 |
+
|------|------|------|
|
| 45 |
+
| source_name | string | 音源名称 |
|
| 46 |
+
| created_at | string | 创建时间(ISO 8601 格式) |
|
| 47 |
+
| updated_at | string | 最后更新时间(ISO 8601 格式) |
|
| 48 |
+
| whisper_model | string | 使用的 Whisper 模型 |
|
| 49 |
+
| mfa_dict | string | MFA 字典文件名 |
|
| 50 |
+
| mfa_acoustic | string | MFA 声学模型文件名 |
|
| 51 |
+
| language | string | 语言代码:`chinese` / `japanese` |
|
| 52 |
+
| single_speaker | boolean | 是否单说话人 |
|
| 53 |
+
| slice_count | integer | 切片数量 |
|
| 54 |
+
| textgrid_count | integer | TextGrid 文件数量 |
|
| 55 |
+
|
| 56 |
+
### 2. .wav 文件 - 音频切片
|
| 57 |
+
|
| 58 |
+
- 格式:WAV(PCM)
|
| 59 |
+
- 采样率:保持原始采样率(通常 44100Hz 或 48000Hz)
|
| 60 |
+
- 位深:16-bit
|
| 61 |
+
- 声道:单声道(mono)
|
| 62 |
+
|
| 63 |
+
文件命名规则:`{原始文件名}_{4位序号}.wav`
|
| 64 |
+
|
| 65 |
+
示例:`0102Trial08_AnAn010_0000.wav`
|
| 66 |
+
|
| 67 |
+
### 3. .lab 文件 - 文本标注
|
| 68 |
+
|
| 69 |
+
纯文本文件,包含对应 .wav 文件的转录文本。
|
| 70 |
+
|
| 71 |
+
- 编码:UTF-8
|
| 72 |
+
- 内容:单行文本,无换行符
|
| 73 |
+
|
| 74 |
+
示例内容:
|
| 75 |
+
```
|
| 76 |
+
你能說你能這樣假冒跟你打個比方啊你全家裡面有比如說四口人六口人也好對吧
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
### 4. .TextGrid 文件 - 时间对齐
|
| 80 |
+
|
| 81 |
+
Praat TextGrid 格式,由 MFA(Montreal Forced Aligner)生成。
|
| 82 |
+
|
| 83 |
+
#### 文件结构
|
| 84 |
+
|
| 85 |
+
```
|
| 86 |
+
File type = "ooTextFile"
|
| 87 |
+
Object class = "TextGrid"
|
| 88 |
+
|
| 89 |
+
xmin = 0
|
| 90 |
+
xmax = {音频时长(秒)}
|
| 91 |
+
tiers? <exists>
|
| 92 |
+
size = 2
|
| 93 |
+
item []:
|
| 94 |
+
item [1]:
|
| 95 |
+
class = "IntervalTier"
|
| 96 |
+
name = "words" # 词层
|
| 97 |
+
...
|
| 98 |
+
item [2]:
|
| 99 |
+
class = "IntervalTier"
|
| 100 |
+
name = "phones" # 音素层
|
| 101 |
+
...
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
#### words 层(词层)
|
| 105 |
+
|
| 106 |
+
按词/字切分的时间边界。
|
| 107 |
+
|
| 108 |
+
```
|
| 109 |
+
intervals [N]:
|
| 110 |
+
xmin = 0.03 # 开始时间(秒)
|
| 111 |
+
xmax = 0.14 # 结束时间(秒)
|
| 112 |
+
text = "你" # 文本内容
|
| 113 |
+
```
|
| 114 |
+
|
| 115 |
+
特殊标记:
|
| 116 |
+
- `""` - 静音/空白
|
| 117 |
+
- `<unk>` - 未识别词
|
| 118 |
+
- `spn` - 非语音噪声
|
| 119 |
+
|
| 120 |
+
#### phones 层(音素层)
|
| 121 |
+
|
| 122 |
+
IPA 音素级别的时间边界,包含声调标记。
|
| 123 |
+
|
| 124 |
+
```
|
| 125 |
+
intervals [N]:
|
| 126 |
+
xmin = 0.03
|
| 127 |
+
xmax = 0.08
|
| 128 |
+
text = "ɲ" # 辅音
|
| 129 |
+
|
| 130 |
+
intervals [N+1]:
|
| 131 |
+
xmin = 0.08
|
| 132 |
+
xmax = 0.14
|
| 133 |
+
text = "i˨˩˦" # 元音 + 声调
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
##### 中文音素(IPA)
|
| 137 |
+
|
| 138 |
+
辅音:
|
| 139 |
+
```
|
| 140 |
+
p, pʰ, b, m, f
|
| 141 |
+
t, tʰ, d, n, l
|
| 142 |
+
k, kʰ, ɡ, ŋ, x, h
|
| 143 |
+
tɕ, tɕʰ, ɕ
|
| 144 |
+
ts, tsʰ, s
|
| 145 |
+
ʈʂ, ʈʂʰ, ʂ, ʐ
|
| 146 |
+
ɲ, j, w, ɥ
|
| 147 |
+
ʔ (喉塞音)
|
| 148 |
+
```
|
| 149 |
+
|
| 150 |
+
元音:
|
| 151 |
+
```
|
| 152 |
+
a, o, e, i, u, y
|
| 153 |
+
ə, ɛ, ɔ, ɤ, ɨ, ʅ
|
| 154 |
+
aw, ej, ow (复合元音)
|
| 155 |
+
z̩ (舌尖元音)
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
声调标记(附加在元音后):
|
| 159 |
+
```
|
| 160 |
+
˥ - 阴平(1声)
|
| 161 |
+
˧˥ - 阳平(2声)
|
| 162 |
+
˨˩˦ - 上声(3声)
|
| 163 |
+
˥˩ - 去声(4声)
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
##### 日语音素(IPA)
|
| 167 |
+
|
| 168 |
+
辅音:
|
| 169 |
+
```
|
| 170 |
+
p, b, m, ɸ
|
| 171 |
+
t, d, n, s, z, ɾ
|
| 172 |
+
k, ɡ, h
|
| 173 |
+
tɕ, dʑ, ɕ, ʑ
|
| 174 |
+
ts, dz
|
| 175 |
+
ɲ, j, w
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
元音:
|
| 179 |
+
```
|
| 180 |
+
a, i, ɯ, e, o
|
| 181 |
+
aː, iː, ɯː, eː, oː (长元音)
|
| 182 |
+
```
|
| 183 |
+
|
| 184 |
+
##### 特殊标记
|
| 185 |
+
|
| 186 |
+
- `""` - 静音
|
| 187 |
+
- `spn` - 非语音噪声(spoken noise)
|
| 188 |
+
- `SP` / `AP` - 停顿
|
| 189 |
+
|
| 190 |
+
## 导出格式
|
| 191 |
+
|
| 192 |
+
### 简单单字导出
|
| 193 |
+
|
| 194 |
+
导出到 `export/{音源名称}/simple_export/`
|
| 195 |
+
|
| 196 |
+
按拼音/罗马音分类的单字音频文件:
|
| 197 |
+
```
|
| 198 |
+
export/{音源名称}/simple_export/
|
| 199 |
+
├── ba.wav
|
| 200 |
+
├── ba1.wav
|
| 201 |
+
├── ba2.wav
|
| 202 |
+
├── ni.wav
|
| 203 |
+
└── ...
|
| 204 |
+
```
|
| 205 |
+
|
| 206 |
+
### UTAU oto.ini 导出
|
| 207 |
+
|
| 208 |
+
导出到 `export/{音源名称}/utau_oto/`
|
| 209 |
+
|
| 210 |
+
```
|
| 211 |
+
export/{音源名称}/utau_oto/
|
| 212 |
+
├── oto.ini # UTAU 配置文件
|
| 213 |
+
├── character.txt # 角色信息
|
| 214 |
+
├── {name}_0000.wav # 音频文件
|
| 215 |
+
├── {name}_0001.wav
|
| 216 |
+
└── ...
|
| 217 |
+
```
|
| 218 |
+
|
| 219 |
+
#### oto.ini 格式
|
| 220 |
+
|
| 221 |
+
```
|
| 222 |
+
{wav文件名}={别名},{offset},{consonant},{cutoff},{preutterance},{overlap}
|
| 223 |
+
```
|
| 224 |
+
|
| 225 |
+
示例:
|
| 226 |
+
```
|
| 227 |
+
test_0000.wav=ni,30,50,-110,50,15
|
| 228 |
+
test_0000.wav=neng,140,60,-140,60,18
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
| 参数 | 说明 |
|
| 232 |
+
|------|------|
|
| 233 |
+
| wav文件名 | 音频文件名 |
|
| 234 |
+
| 别名 | 音素别名(拼音/平假名) |
|
| 235 |
+
| offset | 从音频开头跳过的毫秒数 |
|
| 236 |
+
| consonant | 不被拉伸的区域长度(ms) |
|
| 237 |
+
| cutoff | 负值;其绝对值表示从 offset 起算的音素总时长(ms) |
|
| 238 |
+
| preutterance | 先行发声(ms) |
|
| 239 |
+
| overlap | 与前一音符的交叉淡化区域(ms) |
|
| 240 |
+
|
| 241 |
+
## 编程接口示例
|
| 242 |
+
|
| 243 |
+
### Python 读取 meta.json
|
| 244 |
+
|
| 245 |
+
```python
|
| 246 |
+
import json
|
| 247 |
+
|
| 248 |
+
def load_source_meta(bank_dir: str, source_name: str) -> dict:
|
| 249 |
+
meta_path = f"{bank_dir}/{source_name}/meta.json"
|
| 250 |
+
with open(meta_path, 'r', encoding='utf-8') as f:
|
| 251 |
+
return json.load(f)
|
| 252 |
+
```
|
| 253 |
+
|
| 254 |
+
### Python 读取 TextGrid
|
| 255 |
+
|
| 256 |
+
```python
|
| 257 |
+
import textgrid
|
| 258 |
+
|
| 259 |
+
def load_textgrid(tg_path: str):
|
| 260 |
+
tg = textgrid.TextGrid.fromFile(tg_path)
|
| 261 |
+
words_tier = tg[0] # words 层
|
| 262 |
+
phones_tier = tg[1] # phones 层
|
| 263 |
+
|
| 264 |
+
for interval in phones_tier:
|
| 265 |
+
phone = interval.mark.strip()
|
| 266 |
+
start = interval.minTime
|
| 267 |
+
end = interval.maxTime
|
| 268 |
+
print(f"{phone}: {start:.3f} - {end:.3f}")
|
| 269 |
+
```
|
| 270 |
+
|
| 271 |
+
### Python 遍历音源切片
|
| 272 |
+
|
| 273 |
+
```python
|
| 274 |
+
import os
|
| 275 |
+
import glob
|
| 276 |
+
|
| 277 |
+
def iter_slices(bank_dir: str, source_name: str):
|
| 278 |
+
slices_dir = f"{bank_dir}/{source_name}/slices"
|
| 279 |
+
for wav_path in glob.glob(f"{slices_dir}/*.wav"):
|
| 280 |
+
lab_path = wav_path.replace('.wav', '.lab')
|
| 281 |
+
basename = os.path.basename(wav_path).replace('.wav', '')
|
| 282 |
+
|
| 283 |
+
with open(lab_path, 'r', encoding='utf-8') as f:
|
| 284 |
+
text = f.read().strip()
|
| 285 |
+
|
| 286 |
+
yield basename, wav_path, text
|
| 287 |
+
```
|
| 288 |
+
|
| 289 |
+
## 版本历史
|
| 290 |
+
|
| 291 |
+
| 版本 | 日期 | 说明 |
|
| 292 |
+
|------|------|------|
|
| 293 |
+
| 1.0 | 2026-02-02 | 初始版本 |
|
src/gui_old.py
CHANGED
|
@@ -94,80 +94,99 @@ class ModelDownloadFrame(ctk.CTkFrame):
|
|
| 94 |
self._setup_ui()
|
| 95 |
|
| 96 |
def _setup_ui(self):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
# Whisper 模型区域
|
| 98 |
ctk.CTkLabel(self, text="Whisper 语音识别模型", font=ctk.CTkFont(size=14, weight="bold")).grid(
|
| 99 |
-
row=
|
| 100 |
)
|
|
|
|
| 101 |
|
| 102 |
-
ctk.CTkLabel(self, text="模型版本:").grid(row=
|
| 103 |
self.whisper_model_var = ctk.StringVar(value=self.config.get("whisper_model"))
|
| 104 |
ctk.CTkComboBox(
|
| 105 |
self, values=list(ConfigManager.WHISPER_MODELS.keys()),
|
| 106 |
variable=self.whisper_model_var, width=200,
|
| 107 |
command=self._on_model_change
|
| 108 |
-
).grid(row=
|
| 109 |
|
| 110 |
self.model_desc_label = ctk.CTkLabel(self, text=self._get_model_desc(), text_color="gray")
|
| 111 |
-
self.model_desc_label.grid(row=
|
|
|
|
| 112 |
|
| 113 |
-
ctk.CTkLabel(self, text="模型目录:").grid(row=
|
| 114 |
self.models_dir_var = ctk.StringVar(value=self.config.get("models_dir"))
|
| 115 |
-
ctk.CTkEntry(self, textvariable=self.models_dir_var, width=320).grid(row=
|
| 116 |
-
ctk.CTkButton(self, text="浏览", width=60, command=self._browse_models_dir).grid(row=
|
|
|
|
| 117 |
|
| 118 |
-
|
| 119 |
-
self.whisper_status = ctk.CTkLabel(self, text="
|
| 120 |
-
self.whisper_status.grid(row=
|
| 121 |
self.whisper_btn = ctk.CTkButton(self, text="下载 / 加载模型", command=self._download_whisper, width=140)
|
| 122 |
-
self.whisper_btn.grid(row=
|
|
|
|
| 123 |
|
| 124 |
self.progress_label = ctk.CTkLabel(self, text="", text_color="gray")
|
| 125 |
-
self.progress_label.grid(row=
|
|
|
|
| 126 |
|
| 127 |
# Silero VAD 区域
|
| 128 |
ctk.CTkLabel(self, text="Silero VAD 模型", font=ctk.CTkFont(size=14, weight="bold")).grid(
|
| 129 |
-
row=
|
| 130 |
)
|
|
|
|
| 131 |
ctk.CTkLabel(self, text="用于语音活动检测和音频切片", text_color="gray").grid(
|
| 132 |
-
row=
|
| 133 |
)
|
|
|
|
| 134 |
|
| 135 |
-
|
| 136 |
-
self.vad_status = ctk.CTkLabel(self, text="
|
| 137 |
-
self.vad_status.grid(row=
|
| 138 |
self.vad_btn = ctk.CTkButton(self, text="下载模型", command=self._download_vad, width=140)
|
| 139 |
-
self.vad_btn.grid(row=
|
|
|
|
| 140 |
|
| 141 |
# MFA 模型区域
|
| 142 |
ctk.CTkLabel(self, text="MFA 声学模型", font=ctk.CTkFont(size=14, weight="bold")).grid(
|
| 143 |
-
row=
|
| 144 |
)
|
|
|
|
| 145 |
ctk.CTkLabel(self, text="Montreal Forced Aligner 模型,用于语音对齐", text_color="gray").grid(
|
| 146 |
-
row=
|
| 147 |
)
|
|
|
|
| 148 |
|
| 149 |
-
ctk.CTkLabel(self, text="模型目录:").grid(row=
|
| 150 |
self.mfa_dir_var = ctk.StringVar(value=self.config.get("mfa_dir"))
|
| 151 |
-
ctk.CTkEntry(self, textvariable=self.mfa_dir_var, width=320).grid(row=
|
| 152 |
-
ctk.CTkButton(self, text="浏览", width=60, command=self._browse_mfa_dir).grid(row=
|
|
|
|
| 153 |
|
| 154 |
-
ctk.CTkLabel(self, text="选择语言:").grid(row=
|
| 155 |
self.mfa_lang_var = ctk.StringVar(value="mandarin")
|
| 156 |
ctk.CTkComboBox(
|
| 157 |
self, values=["mandarin", "japanese"],
|
| 158 |
variable=self.mfa_lang_var, width=200,
|
| 159 |
command=self._on_mfa_lang_change
|
| 160 |
-
).grid(row=
|
| 161 |
self.mfa_lang_desc = ctk.CTkLabel(self, text="中文 (普通话)", text_color="gray")
|
| 162 |
-
self.mfa_lang_desc.grid(row=
|
|
|
|
| 163 |
|
| 164 |
-
|
| 165 |
-
self.mfa_status = ctk.CTkLabel(self, text="
|
| 166 |
-
self.mfa_status.grid(row=
|
| 167 |
self.mfa_download_btn = ctk.CTkButton(self, text="下载模型", command=self._download_mfa_models, width=140)
|
| 168 |
-
self.mfa_download_btn.grid(row=
|
| 169 |
-
|
| 170 |
-
self._check_vad_status()
|
| 171 |
|
| 172 |
def _get_model_desc(self):
|
| 173 |
info = ConfigManager.WHISPER_MODELS.get(self.whisper_model_var.get(), {})
|
|
@@ -176,7 +195,8 @@ class ModelDownloadFrame(ctk.CTkFrame):
|
|
| 176 |
def _on_model_change(self, choice):
|
| 177 |
self.model_desc_label.configure(text=self._get_model_desc())
|
| 178 |
self.config.set("whisper_model", choice)
|
| 179 |
-
|
|
|
|
| 180 |
self.whisper_pipe = None
|
| 181 |
|
| 182 |
def _browse_models_dir(self):
|
|
@@ -195,13 +215,6 @@ class ModelDownloadFrame(ctk.CTkFrame):
|
|
| 195 |
from src.mfa_model_downloader import get_available_languages
|
| 196 |
self.mfa_lang_desc.configure(text=get_available_languages().get(choice, ""))
|
| 197 |
|
| 198 |
-
def _check_vad_status(self):
|
| 199 |
-
from src.silero_vad_downloader import is_vad_model_downloaded
|
| 200 |
-
if is_vad_model_downloaded(self.config.get("models_dir")):
|
| 201 |
-
self.vad_status.configure(text="✅ 已下载", text_color="green")
|
| 202 |
-
else:
|
| 203 |
-
self.vad_status.configure(text="⏳ 未下载", text_color="gray")
|
| 204 |
-
|
| 205 |
def _download_vad(self):
|
| 206 |
if self._download_thread and self._download_thread.is_alive():
|
| 207 |
return
|
|
@@ -412,8 +425,9 @@ class MakeVoiceBankFrame(ctk.CTkFrame):
|
|
| 412 |
ctk.CTkLabel(self.scroll_frame, text="转录语言:").grid(row=row, column=0, padx=10, pady=5, sticky="w")
|
| 413 |
self.language_var = ctk.StringVar(value="chinese")
|
| 414 |
ctk.CTkComboBox(
|
| 415 |
-
self.scroll_frame, values=["chinese", "japanese"
|
| 416 |
-
variable=self.language_var, width=150
|
|
|
|
| 417 |
).grid(row=row, column=1, padx=5, pady=5, sticky="w")
|
| 418 |
row += 1
|
| 419 |
|
|
@@ -529,6 +543,44 @@ class MakeVoiceBankFrame(ctk.CTkFrame):
|
|
| 529 |
else:
|
| 530 |
self.acoustic_combo.configure(values=["(未找到声学模型)"])
|
| 531 |
self.acoustic_combo.set("(未找到声学模型)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
|
| 533 |
def _browse_input_file(self):
|
| 534 |
path = filedialog.askopenfilename(
|
|
@@ -970,6 +1022,22 @@ class ExportVoiceBankFrame(ctk.CTkFrame):
|
|
| 970 |
# 动态生成插件卡片
|
| 971 |
self._create_plugin_cards()
|
| 972 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 973 |
# 配置行列权重
|
| 974 |
self.grid_columnconfigure(1, weight=1)
|
| 975 |
self.grid_rowconfigure(5, weight=1)
|
|
@@ -1031,6 +1099,29 @@ class ExportVoiceBankFrame(ctk.CTkFrame):
|
|
| 1031 |
bank_dir = self.config.get("bank_dir", "bank")
|
| 1032 |
ExportSettingsDialog(self, plugin, voice_bank, bank_dir, self.log_callback)
|
| 1033 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1034 |
def _refresh_voice_banks(self):
|
| 1035 |
"""刷新音源列表"""
|
| 1036 |
bank_dir = self.config.get("bank_dir", "bank")
|
|
|
|
| 94 |
self._setup_ui()
|
| 95 |
|
| 96 |
def _setup_ui(self):
|
| 97 |
+
row = 0
|
| 98 |
+
|
| 99 |
+
# 便携版提示
|
| 100 |
+
ctk.CTkLabel(
|
| 101 |
+
self, text="💡 便携版已附带除 whisper-medium 以外的所有模型",
|
| 102 |
+
font=ctk.CTkFont(size=12), text_color="#4a9a6a"
|
| 103 |
+
).grid(row=row, column=0, columnspan=3, padx=10, pady=(10, 15), sticky="w")
|
| 104 |
+
row += 1
|
| 105 |
+
|
| 106 |
# Whisper 模型区域
|
| 107 |
ctk.CTkLabel(self, text="Whisper 语音识别模型", font=ctk.CTkFont(size=14, weight="bold")).grid(
|
| 108 |
+
row=row, column=0, columnspan=3, padx=10, pady=(10, 5), sticky="w"
|
| 109 |
)
|
| 110 |
+
row += 1
|
| 111 |
|
| 112 |
+
ctk.CTkLabel(self, text="模型版本:").grid(row=row, column=0, padx=10, pady=5, sticky="w")
|
| 113 |
self.whisper_model_var = ctk.StringVar(value=self.config.get("whisper_model"))
|
| 114 |
ctk.CTkComboBox(
|
| 115 |
self, values=list(ConfigManager.WHISPER_MODELS.keys()),
|
| 116 |
variable=self.whisper_model_var, width=200,
|
| 117 |
command=self._on_model_change
|
| 118 |
+
).grid(row=row, column=1, padx=5, pady=5, sticky="w")
|
| 119 |
|
| 120 |
self.model_desc_label = ctk.CTkLabel(self, text=self._get_model_desc(), text_color="gray")
|
| 121 |
+
self.model_desc_label.grid(row=row, column=2, padx=10, pady=5, sticky="w")
|
| 122 |
+
row += 1
|
| 123 |
|
| 124 |
+
ctk.CTkLabel(self, text="模型目录:").grid(row=row, column=0, padx=10, pady=5, sticky="w")
|
| 125 |
self.models_dir_var = ctk.StringVar(value=self.config.get("models_dir"))
|
| 126 |
+
ctk.CTkEntry(self, textvariable=self.models_dir_var, width=320).grid(row=row, column=1, padx=5, pady=5, sticky="w")
|
| 127 |
+
ctk.CTkButton(self, text="浏览", width=60, command=self._browse_models_dir).grid(row=row, column=2, padx=5, pady=5, sticky="w")
|
| 128 |
+
row += 1
|
| 129 |
|
| 130 |
+
# Whisper 状态(初始隐藏)
|
| 131 |
+
self.whisper_status = ctk.CTkLabel(self, text="", text_color="gray")
|
| 132 |
+
self.whisper_status.grid(row=row, column=1, padx=5, pady=5, sticky="w")
|
| 133 |
self.whisper_btn = ctk.CTkButton(self, text="下载 / 加载模型", command=self._download_whisper, width=140)
|
| 134 |
+
self.whisper_btn.grid(row=row, column=2, padx=5, pady=5, sticky="w")
|
| 135 |
+
row += 1
|
| 136 |
|
| 137 |
self.progress_label = ctk.CTkLabel(self, text="", text_color="gray")
|
| 138 |
+
self.progress_label.grid(row=row, column=0, columnspan=3, padx=10, pady=5, sticky="w")
|
| 139 |
+
row += 1
|
| 140 |
|
| 141 |
# Silero VAD 区域
|
| 142 |
ctk.CTkLabel(self, text="Silero VAD 模型", font=ctk.CTkFont(size=14, weight="bold")).grid(
|
| 143 |
+
row=row, column=0, columnspan=3, padx=10, pady=(20, 5), sticky="w"
|
| 144 |
)
|
| 145 |
+
row += 1
|
| 146 |
ctk.CTkLabel(self, text="用于语音活动检测和音频切片", text_color="gray").grid(
|
| 147 |
+
row=row, column=0, columnspan=3, padx=10, pady=(0, 10), sticky="w"
|
| 148 |
)
|
| 149 |
+
row += 1
|
| 150 |
|
| 151 |
+
# VAD 状态(初始隐藏)
|
| 152 |
+
self.vad_status = ctk.CTkLabel(self, text="", text_color="gray")
|
| 153 |
+
self.vad_status.grid(row=row, column=1, padx=5, pady=5, sticky="w")
|
| 154 |
self.vad_btn = ctk.CTkButton(self, text="下载模型", command=self._download_vad, width=140)
|
| 155 |
+
self.vad_btn.grid(row=row, column=2, padx=5, pady=5, sticky="w")
|
| 156 |
+
row += 1
|
| 157 |
|
| 158 |
# MFA 模型区域
|
| 159 |
ctk.CTkLabel(self, text="MFA 声学模型", font=ctk.CTkFont(size=14, weight="bold")).grid(
|
| 160 |
+
row=row, column=0, columnspan=3, padx=10, pady=(20, 5), sticky="w"
|
| 161 |
)
|
| 162 |
+
row += 1
|
| 163 |
ctk.CTkLabel(self, text="Montreal Forced Aligner 模型,用于语音对齐", text_color="gray").grid(
|
| 164 |
+
row=row, column=0, columnspan=3, padx=10, pady=(0, 10), sticky="w"
|
| 165 |
)
|
| 166 |
+
row += 1
|
| 167 |
|
| 168 |
+
ctk.CTkLabel(self, text="模型目录:").grid(row=row, column=0, padx=10, pady=5, sticky="w")
|
| 169 |
self.mfa_dir_var = ctk.StringVar(value=self.config.get("mfa_dir"))
|
| 170 |
+
ctk.CTkEntry(self, textvariable=self.mfa_dir_var, width=320).grid(row=row, column=1, padx=5, pady=5, sticky="w")
|
| 171 |
+
ctk.CTkButton(self, text="浏览", width=60, command=self._browse_mfa_dir).grid(row=row, column=2, padx=5, pady=5)
|
| 172 |
+
row += 1
|
| 173 |
|
| 174 |
+
ctk.CTkLabel(self, text="选择语言:").grid(row=row, column=0, padx=10, pady=5, sticky="w")
|
| 175 |
self.mfa_lang_var = ctk.StringVar(value="mandarin")
|
| 176 |
ctk.CTkComboBox(
|
| 177 |
self, values=["mandarin", "japanese"],
|
| 178 |
variable=self.mfa_lang_var, width=200,
|
| 179 |
command=self._on_mfa_lang_change
|
| 180 |
+
).grid(row=row, column=1, padx=5, pady=5, sticky="w")
|
| 181 |
self.mfa_lang_desc = ctk.CTkLabel(self, text="中文 (普通话)", text_color="gray")
|
| 182 |
+
self.mfa_lang_desc.grid(row=row, column=2, padx=5, pady=5, sticky="w")
|
| 183 |
+
row += 1
|
| 184 |
|
| 185 |
+
# MFA 状态(初始隐藏)
|
| 186 |
+
self.mfa_status = ctk.CTkLabel(self, text="", text_color="gray")
|
| 187 |
+
self.mfa_status.grid(row=row, column=1, padx=5, pady=5, sticky="w")
|
| 188 |
self.mfa_download_btn = ctk.CTkButton(self, text="下载模型", command=self._download_mfa_models, width=140)
|
| 189 |
+
self.mfa_download_btn.grid(row=row, column=2, padx=5, pady=5, sticky="w")
|
|
|
|
|
|
|
| 190 |
|
| 191 |
def _get_model_desc(self):
|
| 192 |
info = ConfigManager.WHISPER_MODELS.get(self.whisper_model_var.get(), {})
|
|
|
|
| 195 |
def _on_model_change(self, choice):
|
| 196 |
self.model_desc_label.configure(text=self._get_model_desc())
|
| 197 |
self.config.set("whisper_model", choice)
|
| 198 |
+
# 切换模型时清空状态显示,重置 pipeline
|
| 199 |
+
self.whisper_status.configure(text="")
|
| 200 |
self.whisper_pipe = None
|
| 201 |
|
| 202 |
def _browse_models_dir(self):
|
|
|
|
| 215 |
from src.mfa_model_downloader import get_available_languages
|
| 216 |
self.mfa_lang_desc.configure(text=get_available_languages().get(choice, ""))
|
| 217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
def _download_vad(self):
|
| 219 |
if self._download_thread and self._download_thread.is_alive():
|
| 220 |
return
|
|
|
|
| 425 |
ctk.CTkLabel(self.scroll_frame, text="转录语言:").grid(row=row, column=0, padx=10, pady=5, sticky="w")
|
| 426 |
self.language_var = ctk.StringVar(value="chinese")
|
| 427 |
ctk.CTkComboBox(
|
| 428 |
+
self.scroll_frame, values=["chinese", "japanese"],
|
| 429 |
+
variable=self.language_var, width=150,
|
| 430 |
+
command=self._on_language_change
|
| 431 |
).grid(row=row, column=1, padx=5, pady=5, sticky="w")
|
| 432 |
row += 1
|
| 433 |
|
|
|
|
| 543 |
else:
|
| 544 |
self.acoustic_combo.configure(values=["(未找到声学模型)"])
|
| 545 |
self.acoustic_combo.set("(未找到声学模型)")
|
| 546 |
+
|
| 547 |
+
# 根据当前语言自动选择对应模型
|
| 548 |
+
self._auto_select_mfa_models()
|
| 549 |
+
|
| 550 |
+
def _on_language_change(self, choice):
|
| 551 |
+
"""语言选择变化时自动选择对应的 MFA 模型和字典"""
|
| 552 |
+
self._auto_select_mfa_models()
|
| 553 |
+
|
| 554 |
+
def _auto_select_mfa_models(self):
|
| 555 |
+
"""根据当前语言自动选择对应的 MFA 模型和字典"""
|
| 556 |
+
language = self.language_var.get()
|
| 557 |
+
|
| 558 |
+
# 语言到 MFA 模型关键字的映射
|
| 559 |
+
lang_to_mfa = {
|
| 560 |
+
"chinese": "mandarin",
|
| 561 |
+
"japanese": "japanese",
|
| 562 |
+
"english": "english"
|
| 563 |
+
}
|
| 564 |
+
|
| 565 |
+
mfa_keyword = lang_to_mfa.get(language, "")
|
| 566 |
+
if not mfa_keyword:
|
| 567 |
+
return
|
| 568 |
+
|
| 569 |
+
# 自动选择字典
|
| 570 |
+
dict_values = self.dict_combo.cget("values")
|
| 571 |
+
if dict_values and not dict_values[0].startswith("("):
|
| 572 |
+
for dict_file in dict_values:
|
| 573 |
+
if mfa_keyword in dict_file.lower():
|
| 574 |
+
self.dict_combo.set(dict_file)
|
| 575 |
+
break
|
| 576 |
+
|
| 577 |
+
# 自动选择声学模型
|
| 578 |
+
acoustic_values = self.acoustic_combo.cget("values")
|
| 579 |
+
if acoustic_values and not acoustic_values[0].startswith("("):
|
| 580 |
+
for acoustic_file in acoustic_values:
|
| 581 |
+
if mfa_keyword in acoustic_file.lower():
|
| 582 |
+
self.acoustic_combo.set(acoustic_file)
|
| 583 |
+
break
|
| 584 |
|
| 585 |
def _browse_input_file(self):
|
| 586 |
path = filedialog.askopenfilename(
|
|
|
|
| 1022 |
# 动态生成插件卡片
|
| 1023 |
self._create_plugin_cards()
|
| 1024 |
|
| 1025 |
+
# 底部按钮区域
|
| 1026 |
+
btn_frame = ctk.CTkFrame(self, fg_color="transparent")
|
| 1027 |
+
btn_frame.grid(row=6, column=0, columnspan=3, padx=10, pady=(5, 10), sticky="ew")
|
| 1028 |
+
|
| 1029 |
+
ctk.CTkButton(
|
| 1030 |
+
btn_frame, text="📂 打开导出文件夹", width=140,
|
| 1031 |
+
command=self._open_export_folder,
|
| 1032 |
+
fg_color="#5a6a7a", hover_color="#4a5a6a"
|
| 1033 |
+
).pack(side="left", padx=5)
|
| 1034 |
+
|
| 1035 |
+
ctk.CTkButton(
|
| 1036 |
+
btn_frame, text="🔌 前往导出插件仓库", width=160,
|
| 1037 |
+
command=self._open_plugin_repo,
|
| 1038 |
+
fg_color="#5a6a7a", hover_color="#4a5a6a"
|
| 1039 |
+
).pack(side="left", padx=5)
|
| 1040 |
+
|
| 1041 |
# 配置行列权重
|
| 1042 |
self.grid_columnconfigure(1, weight=1)
|
| 1043 |
self.grid_rowconfigure(5, weight=1)
|
|
|
|
| 1099 |
bank_dir = self.config.get("bank_dir", "bank")
|
| 1100 |
ExportSettingsDialog(self, plugin, voice_bank, bank_dir, self.log_callback)
|
| 1101 |
|
| 1102 |
+
def _open_export_folder(self):
|
| 1103 |
+
"""打开导出文件夹"""
|
| 1104 |
+
import subprocess
|
| 1105 |
+
export_dir = os.path.join(
|
| 1106 |
+
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
|
| 1107 |
+
"export"
|
| 1108 |
+
)
|
| 1109 |
+
if not os.path.exists(export_dir):
|
| 1110 |
+
os.makedirs(export_dir, exist_ok=True)
|
| 1111 |
+
|
| 1112 |
+
# Windows 使用 explorer 打开文件夹
|
| 1113 |
+
if sys.platform == "win32":
|
| 1114 |
+
os.startfile(export_dir)
|
| 1115 |
+
elif sys.platform == "darwin":
|
| 1116 |
+
subprocess.run(["open", export_dir])
|
| 1117 |
+
else:
|
| 1118 |
+
subprocess.run(["xdg-open", export_dir])
|
| 1119 |
+
|
| 1120 |
+
def _open_plugin_repo(self):
|
| 1121 |
+
"""打开导出插件仓库"""
|
| 1122 |
+
import webbrowser
|
| 1123 |
+
webbrowser.open("https://github.com/TNOTawa/JinrikiHelper-Plugin")
|
| 1124 |
+
|
| 1125 |
def _refresh_voice_banks(self):
|
| 1126 |
"""刷新音源列表"""
|
| 1127 |
bank_dir = self.config.get("bank_dir", "bank")
|
src/mfa_runner.py
CHANGED
|
@@ -217,6 +217,7 @@ def run_mfa_alignment(
|
|
| 217 |
temp_dir: Optional[str] = None,
|
| 218 |
single_speaker: bool = True,
|
| 219 |
clean: bool = True,
|
|
|
|
| 220 |
progress_callback: Optional[Callable[[str], None]] = None
|
| 221 |
) -> tuple[bool, str]:
|
| 222 |
"""
|
|
@@ -230,6 +231,7 @@ def run_mfa_alignment(
|
|
| 230 |
temp_dir: 临时目录,默认使用 mfa_temp(云端会自动创建独立目录)
|
| 231 |
single_speaker: 是否为单说话人模式
|
| 232 |
clean: 是否清理旧缓存
|
|
|
|
| 233 |
progress_callback: 进度回调函数
|
| 234 |
|
| 235 |
返回:
|
|
@@ -288,9 +290,17 @@ def run_mfa_alignment(
|
|
| 288 |
"--temp_directory", str(temp_dir),
|
| 289 |
]
|
| 290 |
|
| 291 |
-
#
|
| 292 |
-
|
| 293 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
|
| 295 |
if clean:
|
| 296 |
cmd.append("--clean")
|
|
@@ -299,6 +309,7 @@ def run_mfa_alignment(
|
|
| 299 |
|
| 300 |
log(f"正在启动 MFA 对齐引擎...")
|
| 301 |
log(f"运行平台: {'Windows (外挂模式)' if IS_WINDOWS else 'Linux (系统安装)'}")
|
|
|
|
| 302 |
log(f"输入目录: {corpus_dir}")
|
| 303 |
log(f"输出目录: {output_dir}")
|
| 304 |
|
|
|
|
| 217 |
temp_dir: Optional[str] = None,
|
| 218 |
single_speaker: bool = True,
|
| 219 |
clean: bool = True,
|
| 220 |
+
num_jobs: Optional[int] = None,
|
| 221 |
progress_callback: Optional[Callable[[str], None]] = None
|
| 222 |
) -> tuple[bool, str]:
|
| 223 |
"""
|
|
|
|
| 231 |
temp_dir: 临时目录,默认使用 mfa_temp(云端会自动创建独立目录)
|
| 232 |
single_speaker: 是否为单说话人模式
|
| 233 |
clean: 是否清理旧缓存
|
| 234 |
+
num_jobs: 并行进程数,默认使用 CPU 核心数
|
| 235 |
progress_callback: 进度回调函数
|
| 236 |
|
| 237 |
返回:
|
|
|
|
| 290 |
"--temp_directory", str(temp_dir),
|
| 291 |
]
|
| 292 |
|
| 293 |
+
# 设置并行进程数(默认使用 CPU 核心数,最少 1 个)
|
| 294 |
+
import multiprocessing
|
| 295 |
+
if num_jobs is None:
|
| 296 |
+
num_jobs = max(1, multiprocessing.cpu_count())
|
| 297 |
+
cmd.extend(["--num_jobs", str(num_jobs)])
|
| 298 |
+
|
| 299 |
+
# Windows 外挂模式:启用多进程可能有兼容性问题,但可以尝试
|
| 300 |
+
# 如果遇到问题,用户可以通过设置 num_jobs=1 来禁用
|
| 301 |
+
# 注释掉原来的禁用逻辑,让 Windows 也能使用多进程
|
| 302 |
+
# if IS_WINDOWS:
|
| 303 |
+
# cmd.extend(["--use_mp", "false"])
|
| 304 |
|
| 305 |
if clean:
|
| 306 |
cmd.append("--clean")
|
|
|
|
| 309 |
|
| 310 |
log(f"正在启动 MFA 对齐引擎...")
|
| 311 |
log(f"运行平台: {'Windows (外挂模式)' if IS_WINDOWS else 'Linux (系统安装)'}")
|
| 312 |
+
log(f"并行进程数: {num_jobs}")
|
| 313 |
log(f"输入目录: {corpus_dir}")
|
| 314 |
log(f"输出目录: {output_dir}")
|
| 315 |
|