File size: 4,849 Bytes
bcaa58c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python3
"""
准备RVC v2训练数据 - 简化版
使用snapshot_download一次性下载整个Dataset
"""

import os
from pathlib import Path
from huggingface_hub import snapshot_download
import subprocess
import json
from tqdm import tqdm

# Configuration
DATASET_ID = "ayf3/numberblocks-audio"  # dataset repo id passed to snapshot_download
OUTPUT_DIR = Path("data/training_data")  # output root, relative to the working directory
AUDIO_DIR = OUTPUT_DIR / "audio"  # download target and the directory scanned for audio
METADATA_FILE = OUTPUT_DIR / "metadata.json"  # JSON summary written by analyze_audio_files

# HuggingFace token: read from HUGGINGFACE_TOKEN, falling back to HF_TOKEN; None when
# neither is set. NOTE(review): the original comment says the token may also come from
# the local cache -- presumably the library does that when given None; confirm.
HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", os.environ.get("HF_TOKEN", None))

def create_directories():
    """Ensure the audio and output directories exist and report the audio path.

    Missing parent directories are created as needed; directories that
    already exist are left untouched.
    """
    for directory in (AUDIO_DIR, OUTPUT_DIR):
        directory.mkdir(parents=True, exist_ok=True)
    print(f"✅ 目录创建完成: {AUDIO_DIR}")

def download_audio_files():
    """Download the whole HuggingFace dataset repository into AUDIO_DIR.

    Returns:
        True when the snapshot download finished without raising,
        False when any exception occurred (the error is printed).
    """
    print("📥 开始下载音频文件...")
    print(f"📦 Dataset: {DATASET_ID}")

    # Fetch the complete repository in one snapshot_download call.
    download_options = {
        "repo_id": DATASET_ID,
        "repo_type": "dataset",
        "token": HF_TOKEN,
        "local_dir": str(AUDIO_DIR),
        "local_dir_use_symlinks": False,
    }

    try:
        snapshot_download(**download_options)
        print("✅ 下载完成")
    except Exception as error:
        print(f"❌ 下载失败: {error}")
        return False

    return True

def _probe_audio(audio_file):
    """Return ``(duration, sample_rate, channels)`` of *audio_file* using ffprobe.

    Raises whatever the underlying steps raise: ``subprocess`` errors when
    ffprobe fails or times out, ``OSError`` when ffprobe cannot be started,
    and ``KeyError``/``IndexError``/``ValueError`` when the JSON output lacks
    the expected fields (for example, a file without any audio stream).
    """
    cmd = [
        "ffprobe",
        "-v", "error",
        # Only report the first *audio* stream; without this, stream 0 may be
        # an embedded cover-art video stream lacking sample_rate/channels.
        "-select_streams", "a:0",
        # Request both sections in one specifier (sections separated by ':').
        "-show_entries", "format=duration:stream=sample_rate,channels",
        "-of", "json",
        str(audio_file),
    ]
    result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=10)
    info = json.loads(result.stdout)

    stream = info["streams"][0]
    duration = float(info["format"]["duration"])
    return duration, int(stream["sample_rate"]), int(stream["channels"])


def analyze_audio_files():
    """Analyze the local audio files (duration, sample rate, channels, size).

    Scans AUDIO_DIR (top level only) for ``*.wav``, ``*.mp3`` and ``*.m4a``
    files, probes each one with ffprobe, and writes a JSON summary to
    METADATA_FILE. Files that cannot be analyzed are reported and skipped.

    Returns:
        None when no audio file is found; otherwise the list of per-file
        metadata dicts (keys: filename, duration, sample_rate, channels,
        size). The list may be empty if every file failed to analyze.
    """
    print("\n🔍 分析音频文件...")

    audio_files = [
        path
        for pattern in ("*.wav", "*.mp3", "*.m4a")
        for path in AUDIO_DIR.glob(pattern)
    ]
    file_count = len(audio_files)
    print(f"📊 找到 {file_count} 个本地音频文件")

    if not audio_files:
        print("❌ 没有找到音频文件")
        return None

    metadata = []
    total_duration = 0

    print("\n处理中...")
    for i, audio_file in enumerate(audio_files, 1):
        try:
            duration, sample_rate, channels = _probe_audio(audio_file)
            file_metadata = {
                "filename": audio_file.name,
                "duration": duration,
                "sample_rate": sample_rate,
                "channels": channels,
                "size": audio_file.stat().st_size,
            }
        except (
            subprocess.SubprocessError,
            OSError,
            ValueError,
            KeyError,
            IndexError,
            TypeError,
        ) as e:
            # Best effort: report the failure and continue with the next file.
            print(f"  ❌ [{i}/{file_count}] 分析失败: {audio_file.name}, 错误: {e}")
            continue

        # Accumulate only after the entry is complete so that the total
        # duration always matches the files recorded in the metadata.
        metadata.append(file_metadata)
        total_duration += duration

        # Keep the console output short: first ten files plus the last one.
        if i <= 10 or i == file_count:
            print(f"  [{i:3d}/{file_count}] {audio_file.name}: {duration:6.2f}s, {sample_rate}Hz, {channels}ch")

    # Persist the summary.
    with open(METADATA_FILE, 'w', encoding='utf-8') as f:
        json.dump({
            "total_files": len(metadata),
            "total_duration": total_duration,
            "total_duration_hours": round(total_duration / 3600, 2),
            "files": metadata
        }, f, indent=2, ensure_ascii=False)

    print("\n✅ 分析完成:")
    print(f"  - 文件数: {len(metadata)}")
    print(f"  - 总时长: {total_duration / 3600:.2f} 小时 ({total_duration / 60:.1f} 分钟)")
    print(f"  - 元数据保存: {METADATA_FILE}")

    return metadata

def main():
    """Run the pipeline: create directories, download if needed, analyze.

    The download step is skipped when AUDIO_DIR already contains at least
    one wav/mp3/m4a file at its top level. If the download fails the
    function returns early without analyzing.
    """
    banner = "=" * 60
    print(banner)
    print("🎤 准备RVC v2训练数据(简化版)")
    print(banner)

    # Step 1: make sure the output directories exist.
    create_directories()

    # Step 2: download only when no audio is present locally.
    existing = [
        path
        for pattern in ("*.wav", "*.mp3", "*.m4a")
        for path in AUDIO_DIR.glob(pattern)
    ]
    if existing:
        print(f"📂 本地已有 {len(existing)} 个音频文件,跳过下载")
    elif not download_audio_files():
        print("❌ 下载失败,退出")
        return

    # Step 3: analyze the files and report the overall outcome.
    outcome = "✅ 数据准备完成!" if analyze_audio_files() else "❌ 数据准备失败"
    print("\n" + banner)
    print(outcome)
    print(banner)

if __name__ == "__main__":
    main()