"""Validate the integrity of every audio file referenced by a JSONL manifest.

Each non-blank line of the manifest is expected to be a JSON object whose
``audios`` field lists audio file paths. Every path is checked for existence,
non-zero size, decodability and a common sample rate. A summary is printed
and, if anything went wrong, a log file is written next to the manifest.
"""

import json
import os
import sys
from collections import defaultdict

import torchaudio
from tqdm import tqdm

# Sample rates (Hz) treated as normal; any other rate is reported as abnormal.
_EXPECTED_SAMPLE_RATES = frozenset({8000, 16000, 22050, 44100, 48000})


def _check_audio_file(audio_path):
    """Run every integrity check on a single audio path.

    Args:
        audio_path: One entry taken from a record's ``audios`` field.

    Returns:
        A ``(status, problem)`` tuple. ``status`` is the statistics key to
        increment (``'valid'`` when all checks pass). ``problem`` is the text
        to put in the error log, or ``None`` when the file is valid.
    """
    # Non-string entries would crash os.path.exists (None) or be treated as
    # file descriptors (int), so reject them up front.
    if not isinstance(audio_path, str):
        return 'invalid_path', f"无效路径: {audio_path!r}"

    if not os.path.exists(audio_path):
        return 'missing', f"缺失文件: {audio_path}"

    if os.path.getsize(audio_path) == 0:
        return 'zero_size', f"空文件: {audio_path}"

    # Decoding is the only step expected to fail on damaged files; any
    # exception raised by the backend is reported as corruption.
    try:
        waveform, sr = torchaudio.load(audio_path)
    except Exception as e:
        # Keep only the part of the message before the first "(" so the log
        # stays short; fall back to the class name when the message is empty.
        error_type = str(e).split('(')[0] or type(e).__name__
        return 'corrupted', f"损坏文件({error_type}): {audio_path}"

    if waveform.numel() == 0:
        return 'empty_audio', f"空音频: {audio_path}"
    if sr not in _EXPECTED_SAMPLE_RATES:
        return 'abnormal_sr', f"异常采样率({sr}Hz): {audio_path}"
    return 'valid', None


def validate_jsonl_audios(jsonl_path):
    """Validate all audio files referenced in a JSONL file.

    Prints a report to stdout. When problems are found, writes them to
    ``<jsonl_path without extension>_audio_errors.log`` (UTF-8).

    Args:
        jsonl_path: Path to a UTF-8 encoded JSONL file. Each non-blank line
            should be a JSON object with an ``audios`` field holding a list
            of audio paths (a single string path is also accepted).

    Raises:
        OSError: If the JSONL file cannot be read or the log cannot be written.
        UnicodeDecodeError: If the JSONL file is not valid UTF-8.
    """
    stats = defaultdict(int)
    error_log = []
    valid_samples = 0

    # First pass: count lines so the progress bar knows its total.
    # Explicit UTF-8 so behavior does not depend on the platform locale.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)

    # Second pass: the actual validation.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        progress = tqdm(f, total=total_lines, desc="验证进度", unit="line")
        for line_num, line in enumerate(progress, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not records.
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                stats['invalid_json'] += 1
                error_log.append(f"[行{line_num}] 无效JSON格式")
                continue

            # Valid JSON that is not an object (number, list, string, null)
            # cannot carry an "audios" field; report it instead of crashing.
            if not isinstance(data, dict):
                stats['invalid_record'] += 1
                error_log.append(f"[行{line_num}] 无效记录(非JSON对象)")
                continue

            audios = data.get('audios')
            if not audios:
                stats['no_audio_field'] += 1
                continue

            if isinstance(audios, str):
                # A bare string would otherwise be iterated per character.
                audios = [audios]
            elif not isinstance(audios, list):
                stats['invalid_record'] += 1
                error_log.append(f"[行{line_num}] 无效记录(audios 不是列表)")
                continue

            # A sample counts as valid only if every one of its audio files
            # passes all checks.
            sample_ok = True
            for audio_path in audios:
                status, problem = _check_audio_file(audio_path)
                stats[status] += 1
                if problem is not None:
                    sample_ok = False
                    error_log.append(f"[行{line_num}] {problem}")
            if sample_ok:
                valid_samples += 1

    # Print the summary report.
    print("\n===== 验证报告 =====")
    print(f"总行数: {total_lines}")
    print(f"有效样本: {valid_samples}")
    print("--- 问题统计 ---")
    for k, v in sorted(stats.items()):
        print(f"{k}: {v}")

    # Save the error log next to the input file.
    if error_log:
        log_file = f"{os.path.splitext(jsonl_path)[0]}_audio_errors.log"
        # The messages contain non-ASCII text, so the encoding must be explicit.
        with open(log_file, 'w', encoding='utf-8') as f:
            f.write("\n".join(error_log))
        print(f"\n发现 {len(error_log)} 个问题,已保存到 {log_file}")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("使用方法: python validate_audio_jsonl.py <jsonl_path>")
        sys.exit(1)

    if not os.path.exists(sys.argv[1]):
        print(f"错误: 文件 {sys.argv[1]} 不存在")
        sys.exit(1)

    validate_jsonl_audios(sys.argv[1])