Spaces:
Running
Running
feat: UTAU 导出插件新增自动拼字和模糊拼字
Browse files- docs/流程文档_AI用.md +10 -0
- src/export_plugins/utau_oto_export.py +747 -2
docs/流程文档_AI用.md
CHANGED
|
@@ -144,6 +144,16 @@
|
|
| 144 |
│ │ 4. 生成 oto.ini 配置文件 │ │
|
| 145 |
│ │ 5. 生成 character.txt(支持自定义角色名) │ │
|
| 146 |
│ │ 6. 自动检测文件名编码兼容性,不合法时转拼音 │ │
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
│ └─────────────────────────────────────────────────────────────────────┘ │
|
| 148 |
│ │
|
| 149 |
│ 输出: export/[音源名称]/simple_export/ │
|
|
|
|
| 144 |
│ │ 4. 生成 oto.ini 配置文件 │ │
|
| 145 |
│ │ 5. 生成 character.txt(支持自定义角色名) │ │
|
| 146 |
│ │ 6. 自动检测文件名编码兼容性,不合法时转拼音 │ │
|
| 147 |
+
│ │ 7. 自动拼字功能(可选): │ │
|
| 148 |
+
│ │ • 收集已有的高质量辅音和元音片段 │ │
|
| 149 |
+
│ │ • 排列组合生成缺失的音素组合 │ │
|
| 150 |
+
│ │ • 交叉淡化拼接音频并保存 │ │
|
| 151 |
+
│ │ • 自动生成对应的 oto 配置条目 │ │
|
| 152 |
+
│ │ 8. 模糊拼字功能(可选,仅中文): │ │
|
| 153 |
+
│ │ • 在自动拼字基础上,用近似音素替代缺失音素 │ │
|
| 154 |
+
│ │ • 声母近似组: sh↔s, zh↔z, ch↔c, l↔n↔r, f↔h │ │
|
| 155 |
+
│ │ • 韵母近似组: an↔ang, en↔eng↔ong, in↔ing, ian↔iang, uan↔uang │ │
|
| 156 |
+
│ │ • 同组内音素互为替代,按组内顺序优先匹配 │ │
|
| 157 |
│ └─────────────────────────────────────────────────────────────────────┘ │
|
| 158 |
│ │
|
| 159 |
│ 输出: export/[音源名称]/simple_export/ │
|
src/export_plugins/utau_oto_export.py
CHANGED
|
@@ -68,6 +68,26 @@ JAPANESE_VOWELS = {
|
|
| 68 |
# 跳过的标记
|
| 69 |
SKIP_MARKS = {'', 'SP', 'AP', '<unk>', 'spn', 'sil'}
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
def is_consonant(phone: str, language: str) -> bool:
|
| 73 |
"""判断音素是否为辅音"""
|
|
@@ -293,7 +313,25 @@ class UTAUOtoExportPlugin(ExportPlugin):
|
|
| 293 |
label="自动拼字",
|
| 294 |
option_type=OptionType.SWITCH,
|
| 295 |
default=False,
|
| 296 |
-
description="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
),
|
| 298 |
PluginOption(
|
| 299 |
key="encoding",
|
|
@@ -332,6 +370,9 @@ class UTAUOtoExportPlugin(ExportPlugin):
|
|
| 332 |
overlap_ratio = float(options.get("overlap_ratio", 0.3))
|
| 333 |
encoding = options.get("encoding", "utf-8")
|
| 334 |
character_name = options.get("character_name", "").strip()
|
|
|
|
|
|
|
|
|
|
| 335 |
use_hiragana = (alias_style == "hiragana") and language in ('japanese', 'ja', 'jp')
|
| 336 |
|
| 337 |
# 使用基类方法解析质量评估维度
|
|
@@ -365,6 +406,28 @@ class UTAUOtoExportPlugin(ExportPlugin):
|
|
| 365 |
)
|
| 366 |
self._log(f"筛选后保留 {len(filtered_entries)} 条配置,涉及 {len(used_wavs)} 个音频文件")
|
| 367 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
# 步骤3: 复制音频文件(自动检测文件名是否需要转拼音)
|
| 369 |
self._log("\n【复制音频文件】")
|
| 370 |
copied, filename_map = self._copy_wav_files(
|
|
@@ -388,7 +451,10 @@ class UTAUOtoExportPlugin(ExportPlugin):
|
|
| 388 |
|
| 389 |
# 统计别名数量
|
| 390 |
unique_aliases = set(e["alias"] for e in filtered_entries)
|
| 391 |
-
|
|
|
|
|
|
|
|
|
|
| 392 |
|
| 393 |
except Exception as e:
|
| 394 |
logger.error(f"UTAU oto.ini 导出失败: {e}", exc_info=True)
|
|
@@ -901,3 +967,682 @@ class UTAUOtoExportPlugin(ExportPlugin):
|
|
| 901 |
|
| 902 |
with open(output_path, 'w', encoding=encoding) as f:
|
| 903 |
f.write(f"name={name_to_write}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
# 跳过的标记
|
| 69 |
SKIP_MARKS = {'', 'SP', 'AP', '<unk>', 'spn', 'sil'}
|
| 70 |
|
| 71 |
+
# ==================== Fuzzy-syllable approximate phoneme tables ====================

# Approximate initial (consonant) groups. Phonemes within one group may
# substitute for each other; members are listed in matching-priority order.
FUZZY_CONSONANT_GROUPS = [
    ('sh', 's'),      # retroflex / dental sibilant
    ('zh', 'z'),      # retroflex / dental sibilant
    ('ch', 'c'),      # retroflex / dental sibilant
    ('l', 'n', 'r'),  # lateral / nasal / retroflex
    ('f', 'h'),       # labiodental / glottal
]

# Approximate final (vowel) groups. Same substitution rule and ordering
# semantics as the consonant groups above.
FUZZY_VOWEL_GROUPS = [
    ('an', 'ang'),         # front nasal / back nasal
    ('en', 'eng', 'ong'),  # front nasal / back nasal / rounded back nasal
    ('in', 'ing'),         # front nasal / back nasal
    ('ian', 'iang'),       # front nasal / back nasal
    ('uan', 'uang'),       # front nasal / back nasal
]
|
| 90 |
+
|
| 91 |
|
| 92 |
def is_consonant(phone: str, language: str) -> bool:
|
| 93 |
"""判断音素是否为辅音"""
|
|
|
|
| 313 |
label="自动拼字",
|
| 314 |
option_type=OptionType.SWITCH,
|
| 315 |
default=False,
|
| 316 |
+
description="用已有的高质量音素拼接生成缺失的音素组合"
|
| 317 |
+
),
|
| 318 |
+
PluginOption(
|
| 319 |
+
key="crossfade_ms",
|
| 320 |
+
label="拼接淡入淡出时长(ms)",
|
| 321 |
+
option_type=OptionType.NUMBER,
|
| 322 |
+
default=10,
|
| 323 |
+
min_value=5,
|
| 324 |
+
max_value=50,
|
| 325 |
+
description="自动拼字时辅音与元音之间的交叉淡化时长",
|
| 326 |
+
visible_when={"auto_phoneme_combine": True}
|
| 327 |
+
),
|
| 328 |
+
PluginOption(
|
| 329 |
+
key="fuzzy_phoneme",
|
| 330 |
+
label="模糊拼字",
|
| 331 |
+
option_type=OptionType.SWITCH,
|
| 332 |
+
default=False,
|
| 333 |
+
description="用近似声母/韵母替代缺失音素(如 sh↔s, an↔ang),仅中文有效",
|
| 334 |
+
visible_when={"auto_phoneme_combine": True}
|
| 335 |
),
|
| 336 |
PluginOption(
|
| 337 |
key="encoding",
|
|
|
|
| 370 |
overlap_ratio = float(options.get("overlap_ratio", 0.3))
|
| 371 |
encoding = options.get("encoding", "utf-8")
|
| 372 |
character_name = options.get("character_name", "").strip()
|
| 373 |
+
auto_phoneme_combine = options.get("auto_phoneme_combine", False)
|
| 374 |
+
crossfade_ms = int(options.get("crossfade_ms", 10))
|
| 375 |
+
fuzzy_phoneme = options.get("fuzzy_phoneme", False)
|
| 376 |
use_hiragana = (alias_style == "hiragana") and language in ('japanese', 'ja', 'jp')
|
| 377 |
|
| 378 |
# 使用基类方法解析质量评估维度
|
|
|
|
| 406 |
)
|
| 407 |
self._log(f"筛选后保留 {len(filtered_entries)} 条配置,涉及 {len(used_wavs)} 个音频文件")
|
| 408 |
|
| 409 |
+
# 步骤2.5: 自动拼字(如果启用)
|
| 410 |
+
combined_count = 0
|
| 411 |
+
if auto_phoneme_combine:
|
| 412 |
+
self._log("\n【自动拼字】")
|
| 413 |
+
combined_entries, combined_wavs = self._auto_combine_phonemes(
|
| 414 |
+
oto_entries,
|
| 415 |
+
filtered_entries,
|
| 416 |
+
paths["slices_dir"],
|
| 417 |
+
export_dir,
|
| 418 |
+
language,
|
| 419 |
+
use_hiragana,
|
| 420 |
+
overlap_ratio,
|
| 421 |
+
crossfade_ms,
|
| 422 |
+
first_naming_rule,
|
| 423 |
+
fuzzy_phoneme
|
| 424 |
+
)
|
| 425 |
+
if combined_entries:
|
| 426 |
+
filtered_entries.extend(combined_entries)
|
| 427 |
+
used_wavs.update(combined_wavs)
|
| 428 |
+
combined_count = len(combined_entries)
|
| 429 |
+
self._log(f"拼接生成 {combined_count} 条新配置")
|
| 430 |
+
|
| 431 |
# 步骤3: 复制音频文件(自动检测文件名是否需要转拼音)
|
| 432 |
self._log("\n【复制音频文件】")
|
| 433 |
copied, filename_map = self._copy_wav_files(
|
|
|
|
| 451 |
|
| 452 |
# 统计别名数量
|
| 453 |
unique_aliases = set(e["alias"] for e in filtered_entries)
|
| 454 |
+
result_msg = f"导出完成: {export_dir}\n{len(unique_aliases)} 个别名,{len(filtered_entries)} 条配置,{copied} 个音频"
|
| 455 |
+
if combined_count > 0:
|
| 456 |
+
result_msg += f"\n(其中 {combined_count} 条为自动拼接生成)"
|
| 457 |
+
return True, result_msg
|
| 458 |
|
| 459 |
except Exception as e:
|
| 460 |
logger.error(f"UTAU oto.ini 导出失败: {e}", exc_info=True)
|
|
|
|
| 967 |
|
| 968 |
with open(output_path, 'w', encoding=encoding) as f:
|
| 969 |
f.write(f"name={name_to_write}")
|
| 970 |
+
|
| 971 |
+
# ==================== 自动拼字功能 ====================
|
| 972 |
+
|
| 973 |
+
def _auto_combine_phonemes(
    self,
    all_entries: List[Dict],
    filtered_entries: List[Dict],
    slices_dir: str,
    export_dir: str,
    language: str,
    use_hiragana: bool,
    overlap_ratio: float,
    crossfade_ms: int,
    first_naming_rule: str,
    fuzzy_phoneme: bool = False
) -> Tuple[List[Dict], set]:
    """
    Auto syllable synthesis: splice existing phoneme segments to generate
    missing consonant+vowel combinations.

    Fix: removed the unused local ``import numpy as np`` and
    ``import soundfile as sf`` — all audio work is delegated to
    ``_combine_and_save`` / ``_crossfade_concat``, which import what
    they need themselves.

    Args:
        all_entries: all original oto entries (source of phoneme segments)
        filtered_entries: already-filtered entries (define the existing aliases)
        slices_dir: directory holding the sliced wav files
        export_dir: export directory for the spliced wavs
        language: language code (fuzzy mode only applies to Chinese)
        use_hiragana: whether aliases are rendered as hiragana
        overlap_ratio: overlap ratio used when building oto parameters
        crossfade_ms: consonant/vowel crossfade duration in milliseconds
        first_naming_rule: naming rule applied to the first sample of an alias
        fuzzy_phoneme: enable approximate-phoneme substitution (Chinese only)

    Returns:
        (list of newly generated oto entries, set of newly generated wav names)
    """
    # Step 1: collect aliases that already exist in the filtered output.
    existing_aliases = set()
    for entry in filtered_entries:
        alias = entry.get("alias", "")
        if alias:
            existing_aliases.add(alias)

    self._log(f"已有 {len(existing_aliases)} 个别名")

    # Step 2: pick the best consonant and vowel segment per phoneme.
    consonant_segments, vowel_segments = self._collect_phoneme_segments(
        all_entries, slices_dir, language
    )

    self._log(f"收集到 {len(consonant_segments)} 个辅音, {len(vowel_segments)} 个元音")

    if not consonant_segments or not vowel_segments:
        self._log("音素不足,跳过自动拼字")
        return [], set()

    # Step 3: generate candidate combinations and drop existing ones.
    # Fuzzy substitution is only meaningful for Chinese pinyin.
    enable_fuzzy = fuzzy_phoneme and language in ('chinese', 'zh', 'mandarin')
    candidates = self._generate_candidates(
        consonant_segments, vowel_segments,
        existing_aliases, language, use_hiragana,
        enable_fuzzy
    )

    if not candidates:
        self._log("无缺失的有效组合")
        return [], set()

    self._log(f"发现 {len(candidates)} 个缺失组合,开始拼接...")

    # Step 4: splice and save audio for every candidate; individual
    # failures are logged and counted but do not abort the batch.
    new_entries = []
    new_wavs = set()
    success_count = 0
    fail_count = 0

    for candidate in candidates:
        try:
            entry, wav_name = self._combine_and_save(
                candidate,
                slices_dir,
                export_dir,
                overlap_ratio,
                crossfade_ms,
                first_naming_rule
            )
            if entry:
                new_entries.append(entry)
                new_wavs.add(wav_name)
                success_count += 1
        except Exception as e:
            logger.warning(f"拼接失败 {candidate['alias']}: {e}")
            fail_count += 1

    if fail_count > 0:
        self._log(f"拼接完成: 成功 {success_count}, 失败 {fail_count}")

    return new_entries, new_wavs
|
| 1071 |
+
|
| 1072 |
+
def _collect_phoneme_segments(
    self,
    entries: List[Dict],
    slices_dir: str,
    language: str
) -> Tuple[Dict[str, Dict], Dict[str, Dict]]:
    """
    Collect consonant and vowel segment descriptors from oto entries.

    Fix: removed the unused local ``import soundfile as sf`` — this method
    only reads entry dicts and checks file existence; no audio is loaded.

    Args:
        entries: oto entries to mine for phoneme material
        slices_dir: directory containing the referenced wav files
        language: language code, forwarded to the alias splitter

    Returns:
        (consonant dict, vowel dict); each maps a phoneme to one
        descriptor {wav_path, offset_ms, duration_ms, quality_score, ipa}
    """
    consonant_segments: Dict[str, List[Dict]] = defaultdict(list)
    vowel_segments: Dict[str, List[Dict]] = defaultdict(list)

    for entry in entries:
        wav_name = entry.get("wav_name", "")
        wav_path = os.path.join(slices_dir, wav_name)

        # Entries whose audio is missing on disk cannot contribute material.
        if not os.path.exists(wav_path):
            continue

        # Original entries may not carry IPA info, so the alias is split
        # back into consonant/vowel parts (simplified reconstruction).
        alias = entry.get("alias", "")
        offset = entry.get("offset", 0)
        consonant_dur = entry.get("consonant", 0)
        segment_dur = entry.get("segment_duration", 0)
        quality = entry.get("quality_score", 0.5)

        c_part, v_part = self._split_alias_to_cv(alias, language)

        if c_part:
            consonant_segments[c_part].append({
                "wav_path": wav_path,
                "offset_ms": offset,
                "duration_ms": consonant_dur,
                "quality_score": quality,
                "ipa": c_part
            })

        if v_part:
            # The vowel starts where the consonant ends.
            v_offset = offset + consonant_dur
            v_duration = segment_dur - consonant_dur
            if v_duration > 0:
                vowel_segments[v_part].append({
                    "wav_path": wav_path,
                    "offset_ms": v_offset,
                    "duration_ms": v_duration,
                    "quality_score": quality,
                    "ipa": v_part
                })

    # Reduce to one best segment per phoneme:
    #   consonants: duration closest to the median among the top-quality five
    #               (neither too long nor too short)
    #   vowels: the longest among the top-quality five
    #           (short vowels force UTAU to over-stretch)
    best_consonants = {}
    for ipa, segments in consonant_segments.items():
        if segments:
            best_consonants[ipa] = self._select_best_consonant(segments)

    best_vowels = {}
    for ipa, segments in vowel_segments.items():
        if segments:
            best_vowels[ipa] = self._select_best_vowel(segments)

    return best_consonants, best_vowels
|
| 1145 |
+
|
| 1146 |
+
def _select_best_consonant(self, segments: List[Dict]) -> Dict:
    """
    Pick the best consonant segment.

    Strategy: among the five highest-quality segments, take the one whose
    duration is closest to their median — a consonant should be neither
    too long nor too short.

    Args:
        segments: candidate descriptors with ``quality_score`` and
            ``duration_ms`` keys (must be non-empty)

    Returns:
        The chosen segment descriptor.
    """
    # Shortlist: top five by quality.
    top = sorted(segments, key=lambda seg: seg["quality_score"], reverse=True)[:5]

    if len(top) == 1:
        return top[0]

    # Median duration of the shortlisted candidates.
    ordered = sorted(seg["duration_ms"] for seg in top)
    median = ordered[len(ordered) // 2]

    # Closest to the median wins.
    return min(top, key=lambda seg: abs(seg["duration_ms"] - median))
|
| 1168 |
+
|
| 1169 |
+
def _select_best_vowel(self, segments: List[Dict]) -> Dict:
    """
    Pick the best vowel segment.

    Strategy: among the five highest-quality segments, take the longest
    one — a vowel that is too short forces UTAU to over-stretch it.

    Args:
        segments: candidate descriptors with ``quality_score`` and
            ``duration_ms`` keys (must be non-empty)

    Returns:
        The chosen segment descriptor.
    """
    # Shortlist: top five by quality, then prefer the longest.
    top = sorted(segments, key=lambda seg: seg["quality_score"], reverse=True)[:5]
    return max(top, key=lambda seg: seg["duration_ms"])
|
| 1183 |
+
|
| 1184 |
+
def _split_alias_to_cv(
    self,
    alias: str,
    language: str
) -> Tuple[Optional[str], Optional[str]]:
    """
    Split an alias into its consonant (onset) and vowel parts.

    Args:
        alias: alias text (pinyin, romaji or hiragana)
        language: language code selecting the onset inventory

    Returns:
        (consonant part, vowel part) — always in romaji form; either
        element may be None when that part is absent.
    """
    if not alias:
        return None, None

    # Hiragana aliases are normalized to romaji first; anything the
    # converter cannot handle is lower-cased and used as-is.
    romaji = self._hiragana_to_romaji(alias)
    if romaji is None:
        romaji = alias.lower()

    if language in ('chinese', 'zh', 'mandarin'):
        # Pinyin initials — multi-character ones first so they match greedily.
        onsets = (
            'zh', 'ch', 'sh', 'ng',
            'b', 'p', 'm', 'f',
            'd', 't', 'n', 'l',
            'g', 'k', 'h',
            'j', 'q', 'x',
            'z', 'c', 's', 'r',
            'y', 'w',
        )
    else:
        # Japanese romaji onsets, digraphs first for the same reason.
        onsets = (
            'ch', 'sh', 'ts', 'ny',
            'ky', 'gy', 'py', 'by', 'my', 'ry', 'hy',
            'k', 'g', 's', 'z', 't', 'd', 'n', 'h', 'b', 'p', 'm', 'r', 'w', 'y', 'f', 'j',
        )

    # First matching onset wins; whatever remains is the vowel part.
    for onset in onsets:
        if romaji.startswith(onset):
            rest = romaji[len(onset):]
            return (onset, rest) if rest else (onset, None)

    # No onset matched: the whole alias is treated as a vowel.
    return None, romaji
|
| 1237 |
+
|
| 1238 |
+
def _hiragana_to_romaji(self, text: str) -> Optional[str]:
    """
    Convert a hiragana alias to romaji.

    Args:
        text: hiragana text; a trailing numeric suffix (sample index)
            is ignored

    Returns:
        The romaji reading; pure-ASCII input is returned lower-cased
        (assumed to already be romaji); None when no conversion exists.
    """
    # Hiragana -> romaji table (inverse of ROMAJI_TO_HIRAGANA).
    table = {
        # plain vowels
        'あ': 'a', 'い': 'i', 'う': 'u', 'え': 'e', 'お': 'o',
        # k-row
        'か': 'ka', 'き': 'ki', 'く': 'ku', 'け': 'ke', 'こ': 'ko',
        # s-row
        'さ': 'sa', 'し': 'shi', 'す': 'su', 'せ': 'se', 'そ': 'so',
        # t-row
        'た': 'ta', 'ち': 'chi', 'つ': 'tsu', 'て': 'te', 'と': 'to',
        # n-row
        'な': 'na', 'に': 'ni', 'ぬ': 'nu', 'ね': 'ne', 'の': 'no',
        # h-row
        'は': 'ha', 'ひ': 'hi', 'ふ': 'fu', 'へ': 'he', 'ほ': 'ho',
        # m-row
        'ま': 'ma', 'み': 'mi', 'む': 'mu', 'め': 'me', 'も': 'mo',
        # y-row
        'や': 'ya', 'ゆ': 'yu', 'よ': 'yo',
        # r-row
        'ら': 'ra', 'り': 'ri', 'る': 'ru', 'れ': 're', 'ろ': 'ro',
        # w-row and syllabic n
        'わ': 'wa', 'を': 'wo', 'ん': 'n',
        # g-row
        'が': 'ga', 'ぎ': 'gi', 'ぐ': 'gu', 'げ': 'ge', 'ご': 'go',
        # z-row
        'ざ': 'za', 'じ': 'ji', 'ず': 'zu', 'ぜ': 'ze', 'ぞ': 'zo',
        # d-row
        'だ': 'da', 'ぢ': 'di', 'づ': 'du', 'で': 'de', 'ど': 'do',
        # b-row
        'ば': 'ba', 'び': 'bi', 'ぶ': 'bu', 'べ': 'be', 'ぼ': 'bo',
        # p-row
        'ぱ': 'pa', 'ぴ': 'pi', 'ぷ': 'pu', 'ぺ': 'pe', 'ぽ': 'po',
        # youon (contracted sounds)
        'きゃ': 'kya', 'きゅ': 'kyu', 'きょ': 'kyo',
        'しゃ': 'sha', 'しゅ': 'shu', 'しょ': 'sho',
        'ちゃ': 'cha', 'ちゅ': 'chu', 'ちょ': 'cho',
        'にゃ': 'nya', 'にゅ': 'nyu', 'にょ': 'nyo',
        'ひゃ': 'hya', 'ひゅ': 'hyu', 'ひょ': 'hyo',
        'みゃ': 'mya', 'みゅ': 'myu', 'みょ': 'myo',
        'りゃ': 'rya', 'りゅ': 'ryu', 'りょ': 'ryo',
        'ぎゃ': 'gya', 'ぎゅ': 'gyu', 'ぎょ': 'gyo',
        'じゃ': 'ja', 'じゅ': 'ju', 'じょ': 'jo',
        'びゃ': 'bya', 'びゅ': 'byu', 'びょ': 'byo',
        'ぴゃ': 'pya', 'ぴゅ': 'pyu', 'ぴょ': 'pyo',
    }

    # Drop any trailing digits (sample-index suffixes like "か2").
    stem = text.rstrip('0123456789')

    romaji = table.get(stem)
    if romaji is not None:
        return romaji

    # Pure-ASCII input is assumed to already be romaji.
    return stem.lower() if stem.isascii() else None
|
| 1306 |
+
|
| 1307 |
+
def _generate_candidates(
    self,
    consonants: Dict[str, Dict],
    vowels: Dict[str, Dict],
    existing_aliases: set,
    language: str,
    use_hiragana: bool,
    fuzzy_phoneme: bool = False
) -> List[Dict]:
    """
    Build the list of missing consonant+vowel candidate combinations.

    Args:
        consonants: available consonant descriptors keyed by phoneme
        vowels: available vowel descriptors keyed by phoneme
        existing_aliases: aliases already present in the export
        language: language code
        use_hiragana: render final aliases as hiragana when possible
        fuzzy_phoneme: also add approximate-phoneme candidates (Chinese)

    Returns:
        Candidate dicts: {alias, base_alias, consonant_info, vowel_info}.
    """
    candidates = []

    # Vowels accepted for the target language (crude sanity filter).
    if language in ('chinese', 'zh', 'mandarin'):
        valid_vowels = {'a', 'o', 'e', 'i', 'u', 'v', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang', 'eng', 'ong', 'er'}
    else:
        valid_vowels = {'a', 'i', 'u', 'e', 'o'}

    # Phoneme inventories, reused by the fuzzy pass below.
    available_consonants = set(consonants)
    available_vowels = set(vowels)

    # Cross product: every consonant with every vowel.
    for c_alias, c_info in consonants.items():
        # Non-ASCII phonemes (e.g. raw hiragana) cannot be combined here.
        if not c_alias.isascii():
            continue
        c_romaji = c_alias.lower()

        for v_alias, v_info in vowels.items():
            if not v_alias.isascii():
                continue
            v_romaji = v_alias.lower()

            combined_romaji = c_romaji + v_romaji

            # Reject clearly implausible vowel parts.
            if v_romaji not in valid_vowels and len(v_romaji) > 2:
                continue

            # Render the final alias in the requested style.
            if use_hiragana:
                final_alias = ROMAJI_TO_HIRAGANA.get(combined_romaji)
                if final_alias is None:
                    # No hiragana spelling exists for this combination.
                    continue
            else:
                final_alias = combined_romaji

            # Skip combinations already covered under either spelling.
            if final_alias in existing_aliases:
                continue
            if combined_romaji in existing_aliases:
                continue

            candidates.append({
                "alias": final_alias,
                "base_alias": combined_romaji,  # always romaji
                "consonant_info": c_info,
                "vowel_info": v_info
            })

    # Fuzzy mode appends candidates built from approximate phonemes
    # (Chinese only).
    if fuzzy_phoneme and language in ('chinese', 'zh', 'mandarin'):
        candidates.extend(self._generate_fuzzy_candidates(
            consonants, vowels,
            available_consonants, available_vowels,
            existing_aliases, candidates
        ))

    return candidates
|
| 1393 |
+
|
| 1394 |
+
def _find_fuzzy_substitute(
    self,
    phoneme: str,
    available_phonemes: set,
    groups: List[Tuple[str, ...]]
) -> Optional[str]:
    """
    Look up an approximate substitute for a phoneme.

    Args:
        phoneme: the phoneme we want
        available_phonemes: phonemes we actually have material for
        groups: approximation groups; members of one group substitute
            for each other, in listed priority order

    Returns:
        A usable phoneme (possibly the input itself), or None when the
        phoneme has no available substitute.
    """
    # The exact phoneme is always preferred when available.
    if phoneme in available_phonemes:
        return phoneme

    # Only the first group containing the phoneme is consulted.
    group = next((g for g in groups if phoneme in g), None)
    if group is None:
        return None

    # Try the group's members in priority order.
    return next(
        (candidate for candidate in group
         if candidate != phoneme and candidate in available_phonemes),
        None
    )
|
| 1426 |
+
|
| 1427 |
+
def _generate_fuzzy_candidates(
    self,
    consonants: Dict[str, Dict],
    vowels: Dict[str, Dict],
    available_consonants: set,
    available_vowels: set,
    existing_aliases: set,
    normal_candidates: List[Dict]
) -> List[Dict]:
    """
    Build fuzzy-syllable candidates.

    Sweeps the full pinyin syllable space and, for syllables still not
    covered, substitutes approximate initials/finals for unavailable
    phonemes.
    """
    fuzzy_candidates = []

    # Everything already produced: normal candidates plus existing aliases.
    seen = {c["base_alias"] for c in normal_candidates}
    seen |= existing_aliases

    # Complete pinyin initial inventory.
    all_consonants = ['b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h',
                      'j', 'q', 'x', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's', 'y', 'w']

    # Complete pinyin final inventory.
    all_vowels = ['a', 'o', 'e', 'i', 'u', 'v', 'ai', 'ei', 'ao', 'ou',
                  'an', 'en', 'ang', 'eng', 'ong', 'in', 'ing', 'ian', 'iang',
                  'uan', 'uang', 'un', 'ia', 'ie', 'iu', 'iao', 'ua', 'uo', 'ui', 'uai']

    fuzzy_count = 0

    for target_c in all_consonants:
        for target_v in all_vowels:
            target_alias = target_c + target_v

            # Skip syllables already present or already generated.
            if target_alias in seen:
                continue

            # Resolve the initial: exact when available, else approximate.
            if target_c in available_consonants:
                actual_c = target_c
            else:
                actual_c = self._find_fuzzy_substitute(
                    target_c, available_consonants, FUZZY_CONSONANT_GROUPS
                )

            # Resolve the final the same way.
            if target_v in available_vowels:
                actual_v = target_v
            else:
                actual_v = self._find_fuzzy_substitute(
                    target_v, available_vowels, FUZZY_VOWEL_GROUPS
                )

            # No material for one of the parts — give up on this syllable.
            if actual_c is None or actual_v is None:
                continue

            # If nothing was substituted, the normal pass already covered it.
            if actual_c == target_c and actual_v == target_v:
                continue

            c_info = consonants.get(actual_c)
            v_info = vowels.get(actual_v)
            if c_info is None or v_info is None:
                continue

            fuzzy_candidates.append({
                "alias": target_alias,
                "base_alias": target_alias,
                "consonant_info": c_info,
                "vowel_info": v_info,
                "is_fuzzy": True,
                "fuzzy_from": f"{actual_c}+{actual_v}"
            })
            seen.add(target_alias)
            fuzzy_count += 1

    if fuzzy_count > 0:
        self._log(f"模糊拼字生成 {fuzzy_count} 个额外候选")

    return fuzzy_candidates
|
| 1512 |
+
|
| 1513 |
+
def _combine_and_save(
    self,
    candidate: Dict,
    slices_dir: str,
    export_dir: str,
    overlap_ratio: float,
    crossfade_ms: int,
    first_naming_rule: str
) -> Tuple[Optional[Dict], Optional[str]]:
    """
    Splice the candidate's consonant and vowel audio and save the result.

    Fix: removed the unused local ``import numpy as np`` — all numpy work
    happens inside ``_crossfade_concat``.

    Args:
        candidate: candidate info (alias + consonant/vowel descriptors)
        slices_dir: slice directory (not read here; kept for interface parity)
        export_dir: directory the spliced wav is written to
        overlap_ratio: ratio of the consonant length used as oto overlap
        crossfade_ms: crossfade duration between consonant and vowel
        first_naming_rule: naming rule applied to the alias

    Returns:
        (oto entry, wav file name) on success, (None, None) on failure.
    """
    import soundfile as sf

    c_info = candidate["consonant_info"]
    v_info = candidate["vowel_info"]
    alias = candidate["alias"]

    # Load the consonant slice and mix down to mono if needed.
    c_audio, c_sr = sf.read(c_info["wav_path"])
    if len(c_audio.shape) > 1:
        c_audio = c_audio.mean(axis=1)

    c_start = int(c_info["offset_ms"] / 1000 * c_sr)
    c_duration = int(c_info["duration_ms"] / 1000 * c_sr)
    c_segment = c_audio[c_start:c_start + c_duration]

    # Load the vowel slice and mix down to mono if needed.
    v_audio, v_sr = sf.read(v_info["wav_path"])
    if len(v_audio.shape) > 1:
        v_audio = v_audio.mean(axis=1)

    v_start = int(v_info["offset_ms"] / 1000 * v_sr)
    v_duration = int(v_info["duration_ms"] / 1000 * v_sr)
    v_segment = v_audio[v_start:v_start + v_duration]

    # Both slices must share one sample rate to be spliced directly.
    if c_sr != v_sr:
        logger.warning(f"采样率不一致: {c_sr} vs {v_sr},跳过")
        return None, None

    sr = c_sr

    # Empty segments cannot be spliced.
    if len(c_segment) == 0 or len(v_segment) == 0:
        return None, None

    # Crossfade length, clamped so neither segment is consumed entirely.
    crossfade_samples = int(crossfade_ms / 1000 * sr)
    crossfade_samples = min(crossfade_samples, len(c_segment) // 2, len(v_segment) // 2)

    if crossfade_samples < 1:
        crossfade_samples = 1

    combined = self._crossfade_concat(c_segment, v_segment, crossfade_samples)

    # "C" prefix marks Combined (spliced) samples.
    wav_name = f"C{candidate['alias']}.wav"
    wav_path = os.path.join(export_dir, wav_name)

    sf.write(wav_path, combined, sr)

    # Derive oto timing from the consonant length and the total length.
    c_duration_ms = c_info["duration_ms"]
    total_duration_ms = len(combined) / sr * 1000

    # Treat the spliced sample as the first sample for naming purposes.
    final_alias = self.apply_naming_rule(first_naming_rule, alias, 0) if first_naming_rule else alias

    entry = {
        "wav_name": wav_name,
        "alias": final_alias,
        "offset": 0,
        "consonant": round(c_duration_ms, 1),
        "cutoff": round(-total_duration_ms, 1),
        "preutterance": round(c_duration_ms, 1),
        "overlap": round(c_duration_ms * overlap_ratio, 1),
        "segment_duration": total_duration_ms,
        "is_combined": True  # marks a spliced entry
    }

    return entry, wav_name
|
| 1608 |
+
|
| 1609 |
+
def _crossfade_concat(
    self,
    audio1: 'np.ndarray',
    audio2: 'np.ndarray',
    crossfade_samples: int
) -> 'np.ndarray':
    """
    Concatenate two audio buffers with a linear crossfade.

    Args:
        audio1: leading audio
        audio2: trailing audio
        crossfade_samples: number of samples to overlap; <= 0 means a
            plain concatenation

    Returns:
        The joined audio buffer.
    """
    import numpy as np

    if crossfade_samples <= 0:
        return np.concatenate([audio1, audio2])

    # Never overlap more than either buffer actually holds.
    n = min(crossfade_samples, len(audio1), len(audio2))

    # Linear ramps: audio1 fades out while audio2 fades in.
    ramp_down = np.linspace(1.0, 0.0, n)
    ramp_up = np.linspace(0.0, 1.0, n)

    # Blend the overlapping region and stitch the three parts together.
    blended = audio1[-n:] * ramp_down + audio2[:n] * ramp_up
    return np.concatenate([audio1[:-n], blended, audio2[n:]])
|