File size: 3,314 Bytes
3b47bbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import json
import torchaudio
from tqdm import tqdm
import os
import sys
from collections import defaultdict

def validate_jsonl_audios(jsonl_path):
    """Check every audio file referenced by a JSONL manifest and print a report.

    Each line of ``jsonl_path`` is parsed as JSON. For every path listed under
    the record's ``audios`` key the file is checked for existence, non-zero
    size, loadability through ``torchaudio.load``, a non-empty waveform and a
    sample rate taken from a fixed allow-list. Per-category counters are
    printed to stdout. When at least one problem was found, the individual
    messages are written to ``<jsonl_path without extension>_audio_errors.log``.

    Args:
        jsonl_path: Path of the JSONL file to validate (read as UTF-8).

    Returns:
        None. Results are reported on stdout and in the optional log file.

    Raises:
        OSError: If ``jsonl_path`` cannot be opened or the log cannot be written.
    """
    # Sample rates considered normal; built once instead of once per file.
    expected_rates = frozenset((8000, 16000, 22050, 44100, 48000))

    stats = defaultdict(int)
    error_log = []
    valid_samples = 0

    # First pass: count the lines so the progress bar knows its total.
    # The encoding is explicit because the platform default is not always
    # UTF-8 and manifests commonly contain non-ASCII paths.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)

    # Second pass: the actual validation.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        progress = tqdm(f, total=total_lines, desc="验证进度", unit="line")
        for line_no, line in enumerate(progress, start=1):
            # Only the JSON parse is guarded by this handler.
            try:
                data = json.loads(line.strip())
            except json.JSONDecodeError:
                stats['invalid_json'] += 1
                error_log.append(f"[行{line_no}] 无效JSON格式")
                continue

            # A line holding valid JSON that is not an object (number, list,
            # string, null) has no 'audios' field; count it as such instead
            # of crashing on the key lookup.
            audios = data.get('audios') if isinstance(data, dict) else None
            if not audios:
                stats['no_audio_field'] += 1
                continue

            # A bare string would otherwise be iterated character by
            # character; treat it as a single path.
            if isinstance(audios, str):
                audios = [audios]

            sample_ok = True
            for audio_path in audios:
                # Existence check. Non-string entries (None, numbers, ...) can
                # never name a file and must not reach os.path.exists, where
                # an int would be interpreted as a file descriptor.
                if not isinstance(audio_path, str) or not os.path.exists(audio_path):
                    stats['missing'] += 1
                    error_log.append(f"[行{line_no}] 缺失文件: {audio_path}")
                    sample_ok = False
                    continue

                # Size check.
                if os.path.getsize(audio_path) == 0:
                    stats['zero_size'] += 1
                    error_log.append(f"[行{line_no}] 空文件: {audio_path}")
                    sample_ok = False
                    continue

                # Content check. The decoder may raise many different
                # exception types, so any failure is recorded as a corrupted
                # file rather than aborting the whole run.
                try:
                    waveform, sr = torchaudio.load(audio_path)
                except Exception as e:
                    stats['corrupted'] += 1
                    error_type = str(e).split('(')[0]
                    error_log.append(f"[行{line_no}] 损坏文件({error_type}): {audio_path}")
                    sample_ok = False
                    continue

                if waveform.numel() == 0:
                    stats['empty_audio'] += 1
                    error_log.append(f"[行{line_no}] 空音频: {audio_path}")
                    sample_ok = False
                elif sr not in expected_rates:
                    stats['abnormal_sr'] += 1
                    error_log.append(f"[行{line_no}] 异常采样率({sr}Hz): {audio_path}")
                    sample_ok = False
                else:
                    stats['valid'] += 1

            # A sample is valid only when every one of its audio files passed;
            # counting it unconditionally would overstate the usable data.
            if sample_ok:
                valid_samples += 1

    # Print the summary report.
    print("\n===== 验证报告 =====")
    print(f"总行数: {total_lines}")
    print(f"有效样本: {valid_samples}")
    print("--- 问题统计 ---")
    for k, v in sorted(stats.items()):
        print(f"{k}: {v}")

    # Persist the detailed messages next to the input file. UTF-8 is forced
    # because the messages contain non-ASCII text that a legacy locale
    # encoding may be unable to represent.
    if error_log:
        log_file = f"{os.path.splitext(jsonl_path)[0]}_audio_errors.log"
        with open(log_file, 'w', encoding='utf-8') as f:
            f.write("\n".join(error_log))
        print(f"\n发现 {len(error_log)} 个问题,已保存到 {log_file}")

if __name__ == "__main__":
    # Script entry point: expects exactly one argument, the JSONL file path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("使用方法: python validate_audio_jsonl.py <input.jsonl>")
        sys.exit(1)

    input_path = cli_args[0]

    # Bail out early with a readable message when the path does not exist.
    if not os.path.exists(input_path):
        print(f"错误: 文件 {input_path} 不存在")
        sys.exit(1)

    validate_jsonl_audios(input_path)