Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| 准备RVC v2训练数据 - 简化版 | |
| 使用snapshot_download一次性下载整个Dataset | |
| """ | |
| import os | |
| from pathlib import Path | |
| from huggingface_hub import snapshot_download | |
| import subprocess | |
| import json | |
| from tqdm import tqdm | |
| # 配置 | |
| DATASET_ID = "ayf3/numberblocks-audio" | |
| OUTPUT_DIR = Path("data/training_data") | |
| AUDIO_DIR = OUTPUT_DIR / "audio" | |
| METADATA_FILE = OUTPUT_DIR / "metadata.json" | |
| # HuggingFace Token - 从环境变量或缓存读取 | |
| HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", os.environ.get("HF_TOKEN", None)) | |
| def create_directories(): | |
| """创建必要的目录""" | |
| AUDIO_DIR.mkdir(parents=True, exist_ok=True) | |
| OUTPUT_DIR.mkdir(parents=True, exist_ok=True) | |
| print(f"✅ 目录创建完成: {AUDIO_DIR}") | |
| def download_audio_files(): | |
| """从HuggingFace Dataset下载所有音频文件""" | |
| print(f"📥 开始下载音频文件...") | |
| print(f"📦 Dataset: {DATASET_ID}") | |
| try: | |
| # 使用snapshot_download一次性下载整个repo | |
| snapshot_download( | |
| repo_id=DATASET_ID, | |
| repo_type="dataset", | |
| token=HF_TOKEN, | |
| local_dir=str(AUDIO_DIR), | |
| local_dir_use_symlinks=False | |
| ) | |
| print(f"✅ 下载完成") | |
| except Exception as e: | |
| print(f"❌ 下载失败: {e}") | |
| return False | |
| return True | |
| def analyze_audio_files(): | |
| """分析音频文件(时长、采样率、质量)""" | |
| print(f"\n🔍 分析音频文件...") | |
| audio_files = list(AUDIO_DIR.glob("*.wav")) + list(AUDIO_DIR.glob("*.mp3")) + list(AUDIO_DIR.glob("*.m4a")) | |
| print(f"📊 找到 {len(audio_files)} 个本地音频文件") | |
| if len(audio_files) == 0: | |
| print("❌ 没有找到音频文件") | |
| return None | |
| metadata = [] | |
| total_duration = 0 | |
| print(f"\n处理中...") | |
| for i, audio_file in enumerate(audio_files, 1): | |
| try: | |
| # 使用ffprobe获取音频信息 | |
| cmd = [ | |
| "ffprobe", | |
| "-v", "error", | |
| "-show_entries", "format=duration", | |
| "-show_entries", "stream=sample_rate,channels", | |
| "-of", "json", | |
| str(audio_file) | |
| ] | |
| result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=10) | |
| info = json.loads(result.stdout) | |
| duration = float(info["format"]["duration"]) | |
| sample_rate = int(info["streams"][0]["sample_rate"]) | |
| channels = int(info["streams"][0]["channels"]) | |
| total_duration += duration | |
| file_metadata = { | |
| "filename": audio_file.name, | |
| "duration": duration, | |
| "sample_rate": sample_rate, | |
| "channels": channels, | |
| "size": audio_file.stat().st_size | |
| } | |
| metadata.append(file_metadata) | |
| if i <= 10 or i == len(audio_files): | |
| print(f" [{i:3d}/{len(audio_files)}] {audio_file.name}: {duration:6.2f}s, {sample_rate}Hz, {channels}ch") | |
| except Exception as e: | |
| print(f" ❌ [{i}/{len(audio_files)}] 分析失败: {audio_file.name}, 错误: {e}") | |
| # 保存元数据 | |
| with open(METADATA_FILE, 'w', encoding='utf-8') as f: | |
| json.dump({ | |
| "total_files": len(metadata), | |
| "total_duration": total_duration, | |
| "total_duration_hours": round(total_duration / 3600, 2), | |
| "files": metadata | |
| }, f, indent=2, ensure_ascii=False) | |
| print(f"\n✅ 分析完成:") | |
| print(f" - 文件数: {len(metadata)}") | |
| print(f" - 总时长: {total_duration / 3600:.2f} 小时 ({total_duration / 60:.1f} 分钟)") | |
| print(f" - 元数据保存: {METADATA_FILE}") | |
| return metadata | |
| def main(): | |
| """主函数""" | |
| print("=" * 60) | |
| print("🎤 准备RVC v2训练数据(简化版)") | |
| print("=" * 60) | |
| # 步骤1: 创建目录 | |
| create_directories() | |
| # 步骤2: 下载音频文件(如果本地已有文件,跳过) | |
| audio_files = list(AUDIO_DIR.glob("*.wav")) + list(AUDIO_DIR.glob("*.mp3")) + list(AUDIO_DIR.glob("*.m4a")) | |
| if len(audio_files) > 0: | |
| print(f"📂 本地已有 {len(audio_files)} 个音频文件,跳过下载") | |
| else: | |
| success = download_audio_files() | |
| if not success: | |
| print("❌ 下载失败,退出") | |
| return | |
| # 步骤3: 分析音频文件 | |
| metadata = analyze_audio_files() | |
| if metadata: | |
| print("\n" + "=" * 60) | |
| print("✅ 数据准备完成!") | |
| print("=" * 60) | |
| else: | |
| print("\n" + "=" * 60) | |
| print("❌ 数据准备失败") | |
| print("=" * 60) | |
| if __name__ == "__main__": | |
| main() | |