import json
import os
from collections import Counter
from pathlib import Path


def collect_unique_audio_paths(json_file_path):
    """Collect the set of unique audio paths from a JSONL file.

    Each line is expected to be a JSON object; when it contains a
    non-empty ``audios`` list, every path in it is added to the result.
    Malformed JSON lines are reported and skipped rather than aborting
    the whole scan.

    Args:
        json_file_path: path to the JSONL file to scan.

    Returns:
        set[str]: deduplicated audio paths found across all lines.
    """
    audio_set = set()
    with open(json_file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                # Narrowed from a blanket Exception: only a JSON parse
                # error is expected here; keep the try body minimal.
                print(f"第 {line_num} 行处理错误: {e}")
                continue
            if isinstance(data, dict) and data.get('audios'):
                audio_set.update(data['audios'])
    return audio_set


def extract_first_subfolder_after_data(audio_path):
    """Return the first path component after ``wavrewardDataset``.

    e.g. ``/.../wavrewardDataset/output_xxx/yyy/file.wav`` -> ``output_xxx``.
    (The original docstring claimed the marker was ``data/``; the code
    has always keyed on ``wavrewardDataset`` — docstring fixed to match.)

    Args:
        audio_path: an audio file path (string or os.PathLike).

    Returns:
        The component right after ``wavrewardDataset``; ``"unknown"`` when
        the marker is absent or is the last component; ``"error"`` when
        the value cannot be interpreted as a path at all.
    """
    try:
        parts = Path(audio_path).parts
    except (TypeError, ValueError) as e:
        # Path() rejects non-string/non-PathLike input (e.g. None or int
        # entries that slipped into the JSON) — keep the "error" sentinel.
        print(f"路径解析错误: {audio_path}, 错误: {e}")
        return "error"
    if "wavrewardDataset" in parts:
        marker_idx = parts.index("wavrewardDataset")
        if marker_idx + 1 < len(parts):
            return parts[marker_idx + 1]
    return "unknown"


def main():
    """Count unique audio files per first-level folder under wavrewardDataset."""
    json_file = "all_dataset_train_resampled_16000.jsonl"

    if not os.path.exists(json_file):
        print(f"文件 {json_file} 不存在")
        return

    print(f"正在处理文件: {json_file}")
    print("=" * 50)

    # Step 1: collect the deduplicated audio paths.
    unique_audio_paths = collect_unique_audio_paths(json_file)
    print(f"不重复音频文件数: {len(unique_audio_paths)}")

    # Step 2: tally by the first-level subfolder after the dataset root.
    folder_counter = Counter(
        extract_first_subfolder_after_data(p) for p in unique_audio_paths
    )

    print("\n按 data 后一级子文件夹统计(基于去重后的路径):")
    print("-" * 50)
    # most_common() is the idiomatic (and stable) descending-count order.
    for folder, count in folder_counter.most_common():
        print(f"{folder}: {count} 个文件")


if __name__ == "__main__":
    main()