|
|
import json |
|
|
import os |
|
|
from collections import Counter |
|
|
from pathlib import Path |
|
|
|
|
|
def collect_unique_audio_paths(json_file_path):
    """Collect the set of unique audio paths from a JSONL file.

    Each line is expected to be a JSON object; when it carries a
    non-empty ``audios`` list, every path in it is added to the result.
    Blank lines are skipped; malformed lines are reported and ignored.

    Args:
        json_file_path: Path to the JSONL file to scan.

    Returns:
        set: All distinct audio paths found in the file.
    """
    audio_set = set()
    with open(json_file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
                if isinstance(data, dict) and data.get('audios'):
                    # update() mirrors the original per-element add loop.
                    audio_set.update(data['audios'])
            # Narrowed from bare ``except Exception``: JSONDecodeError
            # covers malformed JSON, TypeError covers a non-iterable
            # ``audios`` value — the realistic per-line failures.
            except (json.JSONDecodeError, TypeError) as e:
                print(f"第 {line_num} 行处理错误: {e}")
    return audio_set
|
|
|
|
|
def extract_first_subfolder_after_data(audio_path):
    """Return the path component immediately after ``wavrewardDataset``.

    For example:
        /.../wavrewardDataset/output_xxx/yyy/file.wav -> 'output_xxx'

    NOTE(review): the original docstring claimed the anchor was 'data/',
    but the implementation keys on the 'wavrewardDataset' component; the
    documentation now matches the code.

    Args:
        audio_path: Audio file path (str or os.PathLike).

    Returns:
        str: The subfolder name; 'unknown' when the anchor component is
        missing or is the last component; 'error' when the path cannot
        be parsed at all.
    """
    try:
        parts = Path(audio_path).parts
        if "wavrewardDataset" in parts:
            data_idx = parts.index("wavrewardDataset")
            if data_idx + 1 < len(parts):
                return parts[data_idx + 1]
        return "unknown"
    except Exception as e:
        # Defensive: Path() raises TypeError on non-path-like input.
        print(f"路径解析错误: {audio_path}, 错误: {e}")
        return "error"
|
|
|
|
|
def main(json_file="all_dataset_train_resampled_16000.jsonl"):
    """Summarise a JSONL dataset manifest on stdout.

    Prints the number of distinct audio paths and a per-subfolder
    histogram (first path component after 'wavrewardDataset'), in
    descending count order.

    Args:
        json_file: Path to the JSONL manifest. Defaults to the original
            hard-coded filename, so existing callers are unaffected.
    """
    if not os.path.exists(json_file):
        print(f"文件 {json_file} 不存在")
        return

    print(f"正在处理文件: {json_file}")
    print("=" * 50)

    unique_audio_paths = collect_unique_audio_paths(json_file)
    print(f"不重复音频文件数: {len(unique_audio_paths)}")

    # Bucket each deduplicated path by its first subfolder under the anchor.
    folder_counter = Counter(
        extract_first_subfolder_after_data(p) for p in unique_audio_paths
    )

    print("\n按 data 后一级子文件夹统计(基于去重后的路径):")
    print("-" * 50)
    # most_common() sorts by descending count and is stable for ties,
    # matching the original sorted(..., key=lambda x: -x[1]) ordering.
    for folder, count in folder_counter.most_common():
        print(f"{folder}: {count} 个文件")


if __name__ == "__main__":
    main()
|
|
|