Spaces:
Running
Running
File size: 4,849 Bytes
#!/usr/bin/env python3
"""
准备RVC v2训练数据 - 简化版
使用snapshot_download一次性下载整个Dataset
"""
import os
from pathlib import Path
from huggingface_hub import snapshot_download
import subprocess
import json
from tqdm import tqdm
# Configuration: dataset to fetch and where its files and metadata are stored.
DATASET_ID = "ayf3/numberblocks-audio"
OUTPUT_DIR = Path("data/training_data")
AUDIO_DIR = OUTPUT_DIR.joinpath("audio")
METADATA_FILE = OUTPUT_DIR.joinpath("metadata.json")
# Hugging Face token: read from HUGGINGFACE_TOKEN, falling back to HF_TOKEN;
# None when neither environment variable is set.
HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", os.environ.get("HF_TOKEN"))
def create_directories():
    """Create the directories this script writes into.

    Ensures both AUDIO_DIR and OUTPUT_DIR exist, creating missing parents and
    tolerating directories that are already present, then prints a
    confirmation line.
    """
    for directory in (AUDIO_DIR, OUTPUT_DIR):
        directory.mkdir(parents=True, exist_ok=True)
    print(f"✅ 目录创建完成: {AUDIO_DIR}")
def download_audio_files():
    """Download the dataset repository DATASET_ID into AUDIO_DIR.

    Returns:
        True when the snapshot download completed, False when any exception
        was raised (the error is printed instead of being propagated).
    """
    print("📥 开始下载音频文件...")
    print(f"📦 Dataset: {DATASET_ID}")
    download_options = {
        "repo_id": DATASET_ID,
        "repo_type": "dataset",
        "token": HF_TOKEN,
        "local_dir": str(AUDIO_DIR),
        "local_dir_use_symlinks": False,
    }
    try:
        # One snapshot_download call fetches the entire repository.
        snapshot_download(**download_options)
        print("✅ 下载完成")
        return True
    except Exception as e:
        print(f"❌ 下载失败: {e}")
        return False
def _probe_audio_file(audio_file):
    """Return (duration_seconds, sample_rate, channels) for one file via ffprobe.

    Raises:
        ValueError: ffprobe reported no audio stream for the file.
        Other exceptions from subprocess/json/key lookups propagate to the
        caller, which treats any failure as "skip this file".
    """
    cmd = [
        "ffprobe",
        "-v", "error",
        # Only report the first *audio* stream: files with embedded cover art
        # expose it as a video stream that has no sample_rate/channels.
        "-select_streams", "a:0",
        "-show_entries", "format=duration:stream=sample_rate,channels",
        "-of", "json",
        str(audio_file),
    ]
    result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=10)
    info = json.loads(result.stdout)
    streams = info.get("streams") or []
    if not streams:
        raise ValueError("no audio stream found")
    stream = streams[0]
    return (
        float(info["format"]["duration"]),
        int(stream["sample_rate"]),
        int(stream["channels"]),
    )


def analyze_audio_files():
    """Probe every local audio file and write a summary to METADATA_FILE.

    Looks for ``*.wav``, ``*.mp3`` and ``*.m4a`` files directly inside
    AUDIO_DIR (the search is not recursive), reads duration, sample rate and
    channel count of each with ffprobe, and stores the per-file records plus
    totals as JSON in METADATA_FILE.

    Files that cannot be probed are reported and skipped; they do not count
    towards the totals.

    Returns:
        The list of per-file metadata dicts (possibly empty when every probe
        failed), or None when no audio file was found.
    """
    print("\n🔍 分析音频文件...")
    # Sorted so the order of records in the metadata file is reproducible.
    # NOTE(review): files in subdirectories of AUDIO_DIR are not picked up;
    # confirm the dataset keeps its audio at the repository root.
    audio_files = sorted(
        path
        for pattern in ("*.wav", "*.mp3", "*.m4a")
        for path in AUDIO_DIR.glob(pattern)
    )
    total = len(audio_files)
    print(f"📊 找到 {total} 个本地音频文件")
    if not audio_files:
        print("❌ 没有找到音频文件")
        return None

    metadata = []
    total_duration = 0
    print("\n处理中...")
    for i, audio_file in enumerate(audio_files, 1):
        try:
            duration, sample_rate, channels = _probe_audio_file(audio_file)
            size = audio_file.stat().st_size
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f" ❌ [{i}/{total}] 分析失败: {audio_file.name}, 错误: {e}")
            continue

        # Accumulate only after everything succeeded, so the totals always
        # match the records that are actually stored.
        total_duration += duration
        metadata.append({
            "filename": audio_file.name,
            "duration": duration,
            "sample_rate": sample_rate,
            "channels": channels,
            "size": size,
        })
        # Keep the log short: show the first ten files and the last one.
        if i <= 10 or i == total:
            print(f" [{i:3d}/{total}] {audio_file.name}: {duration:6.2f}s, {sample_rate}Hz, {channels}ch")

    # Save the metadata; make sure the target directory exists even when this
    # function is called without create_directories() having run first.
    METADATA_FILE.parent.mkdir(parents=True, exist_ok=True)
    summary = {
        "total_files": len(metadata),
        "total_duration": total_duration,
        "total_duration_hours": round(total_duration / 3600, 2),
        "files": metadata,
    }
    with open(METADATA_FILE, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)

    print("\n✅ 分析完成:")
    print(f" - 文件数: {len(metadata)}")
    print(f" - 总时长: {total_duration / 3600:.2f} 小时 ({total_duration / 60:.1f} 分钟)")
    print(f" - 元数据保存: {METADATA_FILE}")
    return metadata
def main():
    """Run the pipeline: create directories, download if needed, analyze."""
    banner = "=" * 60
    print(banner)
    print("🎤 准备RVC v2训练数据(简化版)")
    print(banner)

    # Step 1: make sure the output directories exist.
    create_directories()

    # Step 2: download only when no audio file is present locally yet.
    existing = [
        path
        for pattern in ("*.wav", "*.mp3", "*.m4a")
        for path in AUDIO_DIR.glob(pattern)
    ]
    if existing:
        print(f"📂 本地已有 {len(existing)} 个音频文件,跳过下载")
    elif not download_audio_files():
        print("❌ 下载失败,退出")
        return

    # Step 3: analyze the audio files and report the overall outcome.
    if analyze_audio_files():
        outcome = "✅ 数据准备完成!"
    else:
        outcome = "❌ 数据准备失败"
    print("\n" + banner)
    print(outcome)
    print(banner)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|