"""Sanity-check a JSONL dataset and the audio files it references.

Each line of the JSONL file is parsed as one JSON record. For every record
the ``messages`` / ``solution`` fields are inspected and every path listed
under ``audios`` is checked for existence and loadability. All findings are
printed to stdout; nothing is modified on disk.
"""
import json
import os
import subprocess
import sys

try:
    import torchaudio
except (ImportError, OSError):
    # Kept optional so the JSON-only checks remain usable on machines where
    # torchaudio is missing or its native libraries fail to load. Audio
    # checks then report a load failure instead of the script dying at import.
    torchaudio = None

# Dataset checked when no path is supplied to main() or on the command line.
DEFAULT_JSONL_PATH = "dataset_10k_train.jsonl"
# Message contents longer than this many characters trigger a warning.
MAX_CONTENT_CHARS = 2000
# Control characters (code point < 32) that are considered harmless.
_ALLOWED_CONTROL_CHARS = "\n\r\t"


def check_audio_file(audio_path):
    """Check that *audio_path* exists and can be loaded by torchaudio.

    On success the sample rate, duration and channel count are printed, and
    ``sox --i`` is invoked on a best-effort basis for extra details.

    Args:
        audio_path: Filesystem path of the audio file.

    Returns:
        ``True`` if the file exists and torchaudio loaded it, ``False``
        otherwise. A failing ``sox`` call only produces a warning and does
        not affect the result.
    """
    # os.path.exists() raises TypeError for None and treats ints as file
    # descriptors, so reject anything that is not path-like up front.
    if not isinstance(audio_path, (str, bytes, os.PathLike)):
        print(f"[ERROR] 音频路径不是字符串: {audio_path!r}")
        return False

    if not os.path.exists(audio_path):
        print(f"[ERROR] 音频文件不存在: {audio_path}")
        return False

    if torchaudio is None:
        print(f"[ERROR] torchaudio加载失败: {audio_path}, 错误: torchaudio 未安装")
        return False

    # Try to decode the file with torchaudio.
    try:
        waveform, sr = torchaudio.load(audio_path)
        print(f"[OK] torchaudio加载成功: {audio_path}")
        print(f" 采样率: {sr}, 时长: {waveform.shape[1]/sr:.2f}s, 通道数: {waveform.shape[0]}")
    except Exception as e:
        # Deliberately broad: this is a validation boundary and any failure
        # while decoding or inspecting the file means the file is unusable.
        print(f"[ERROR] torchaudio加载失败: {audio_path}, 错误: {e}")
        return False

    # Ask sox for detailed file information (best effort only).
    try:
        raw_info = subprocess.check_output(
            ["sox", "--i", audio_path], stderr=subprocess.STDOUT
        )
        # errors="replace" keeps non-UTF-8 tool output from raising.
        sox_info = raw_info.decode(errors="replace")
        print(f" sox信息:\n{sox_info}")
    except (OSError, subprocess.SubprocessError) as e:
        # OSError covers a missing sox binary; SubprocessError covers a
        # non-zero exit status.
        print(f" [WARN] sox信息获取失败: {e}")

    return True


def check_json_fields(obj):
    """Validate the ``messages`` and ``solution`` fields of one record.

    Reports, on stdout: non-string message contents, contents longer than
    ``MAX_CONTENT_CHARS`` characters, contents containing control characters
    other than newline / carriage return / tab, and a missing ``solution``
    key. Structurally invalid input (non-dict record, non-list ``messages``,
    non-dict message) is reported as an error instead of raising.

    Args:
        obj: The decoded JSON value of one JSONL line.

    Returns:
        Always ``True``; problems are only reported via printed messages.
    """
    if not isinstance(obj, dict):
        print("[ERROR] 顶层JSON不是对象")
        return True

    # Check the messages field.
    messages = obj.get("messages", [])
    if not isinstance(messages, list):
        print("[ERROR] messages 不是列表")
        messages = []

    for i, msg in enumerate(messages):
        if not isinstance(msg, dict):
            print(f"[ERROR] messages[{i}] 不是对象")
            continue

        content = msg.get("content", "")
        if not isinstance(content, str):
            print(f"[ERROR] messages[{i}].content 不是字符串")
            # The remaining checks assume a string; len()/ord() would raise
            # or give meaningless results on other types.
            continue

        if len(content) > MAX_CONTENT_CHARS:
            print(f"[WARN] messages[{i}].content 超长: {len(content)} 字符")
        if any(ord(c) < 32 and c not in _ALLOWED_CONTROL_CHARS for c in content):
            print(f"[WARN] messages[{i}].content 含有不可见字符")

    # Check the solution field.
    if "solution" not in obj:
        print("[WARN] 缺少 solution 字段")
    return True


def main(jsonl_path=DEFAULT_JSONL_PATH):
    """Run all checks over every record of a JSONL file.

    Blank lines are skipped. Lines that fail to parse, or that do not decode
    to a JSON object, are reported and skipped. Record numbers in the output
    are 1-based line numbers of the file.

    Args:
        jsonl_path: Path of the JSONL file to check. Defaults to
            ``DEFAULT_JSONL_PATH``.

    Raises:
        OSError: If *jsonl_path* cannot be opened.
    """
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                # A blank line (e.g. trailing newline) is not a record.
                continue

            print(f"\n==== 检查第 {line_no} 条数据 ====")
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"[ERROR] JSON解析失败: {e}")
                continue

            check_json_fields(obj)
            if not isinstance(obj, dict):
                # Already reported by check_json_fields; there is no
                # "audios" field to look at.
                continue

            # "audios": null is treated like a missing field.
            audios = obj.get("audios") or []
            if isinstance(audios, str):
                # Tolerate a single path given as a bare string; iterating
                # it directly would check every character as a path.
                audios = [audios]
            if not isinstance(audios, list):
                print("[ERROR] audios 不是列表")
                continue

            for audio_path in audios:
                check_audio_file(audio_path)

    print("==== 检查结束 ====")


if __name__ == "__main__":
    # An optional first command-line argument overrides the default path.
    if len(sys.argv) > 1:
        main(sys.argv[1])
    else:
        main()