File size: 2,307 Bytes
e791fa3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import json
import os
from collections import Counter
from pathlib import Path

def collect_unique_audio_paths(json_file_path):
    """
    Collect every distinct entry listed under the 'audios' key of a JSONL file.

    Each non-blank line is parsed as one JSON document. Only JSON objects
    with a non-empty 'audios' value contribute. A line that is not valid
    JSON, or whose 'audios' value cannot be iterated or hashed, is reported
    on stdout and the scan continues with the next line.

    Args:
        json_file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A set with the unique values found under 'audios' across all lines.
    """
    audio_set = set()
    with open(json_file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
            except (ValueError, RecursionError) as e:
                # json.JSONDecodeError is a ValueError; RecursionError covers
                # pathologically nested documents. Either way, skip the line.
                print(f"第 {line_num} 行处理错误: {e}")
                continue

            if not isinstance(data, dict):
                continue
            audios = data.get('audios')
            if not audios:
                continue

            if isinstance(audios, str):
                # A bare string is a single path; iterating it directly would
                # add every character as a separate "path".
                audios = [audios]

            try:
                for audio_path in audios:
                    audio_set.add(audio_path)
            except TypeError as e:
                # Non-iterable 'audios' value or an unhashable entry.
                print(f"第 {line_num} 行处理错误: {e}")
    return audio_set

def extract_first_subfolder_after_data(audio_path, marker="wavrewardDataset"):
    """
    Return the path component that directly follows the dataset root folder.

    Example:
        /.../wavrewardDataset/output_xxx/yyy/file.wav -> "output_xxx"

    Args:
        audio_path: Audio file path (string or path-like object).
        marker: Name of the directory component to look for; its first
            occurrence in the path is used. Defaults to "wavrewardDataset".

    Returns:
        The component immediately after ``marker``; "unknown" when ``marker``
        does not occur or is the last component; "error" when ``audio_path``
        cannot be interpreted as a path (a message is printed in that case).
    """
    try:
        parts = Path(audio_path).parts
    except (TypeError, ValueError) as e:
        # Building the Path is the only step that can fail (e.g. None or a
        # non-path object slipped into the data).
        print(f"路径解析错误: {audio_path}, 错误: {e}")
        return "error"

    try:
        next_idx = parts.index(marker) + 1
    except ValueError:
        # Marker component not present in this path.
        return "unknown"

    if next_idx < len(parts):
        return parts[next_idx]
    return "unknown"

def main(json_file="all_dataset_train_resampled_16000.jsonl"):
    """
    Print how many unique audio paths fall under each first-level folder.

    Reads the JSONL file, de-duplicates the audio paths it lists, groups them
    by the value returned from ``extract_first_subfolder_after_data`` and
    prints the groups from most to least frequent.

    Args:
        json_file: Path of the JSONL file to analyse. Defaults to
            "all_dataset_train_resampled_16000.jsonl" in the current
            working directory.

    Returns:
        None. If ``json_file`` does not exist, a message is printed and the
        function returns without further work.
    """
    if not os.path.exists(json_file):
        print(f"文件 {json_file} 不存在")
        return

    print(f"正在处理文件: {json_file}")
    print("=" * 50)

    # Step 1: gather the de-duplicated audio paths.
    unique_audio_paths = collect_unique_audio_paths(json_file)
    print(f"不重复音频文件数: {len(unique_audio_paths)}")

    # Step 2: count paths per first-level folder.
    folder_counter = Counter(
        extract_first_subfolder_after_data(audio_path)
        for audio_path in unique_audio_paths
    )

    print("\n按 data 后一级子文件夹统计(基于去重后的路径):")
    print("-" * 50)
    # most_common() orders by descending count and keeps first-seen order
    # among equal counts.
    for folder, count in folder_counter.most_common():
        print(f"{folder}: {count} 个文件")

# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()