# interactSpeech/.ipynb_checkpoints/count_audios-checkpoint.py
# Uploaded by Student0809 ("Add files using upload-large-folder tool"),
# commit e791fa3 (verified).
import json
import os
from collections import Counter
from pathlib import Path
def collect_unique_audio_paths(json_file_path):
    """Collect every distinct entry found under the ``audios`` key of a JSONL file.

    Each non-blank line is parsed as one JSON document. Lines that are not
    valid JSON are reported on stdout and skipped; lines whose top-level value
    is not a dict, or that have no (or an empty) ``audios`` value, are skipped
    silently.

    Args:
        json_file_path: Path of the JSONL file to read (opened as UTF-8).

    Returns:
        A set containing the unique values listed under ``audios``.
    """
    audio_set = set()
    with open(json_file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            # Only the decode step is expected to fail on malformed input.
            try:
                data = json.loads(line)
            except ValueError as e:  # json.JSONDecodeError subclasses ValueError
                print(f"第 {line_num} 行处理错误: {e}")
                continue

            if not isinstance(data, dict):
                continue
            audios = data.get('audios')
            if not audios:
                continue

            # A bare string is a single path; iterating it would add every
            # character as a separate "path".
            if isinstance(audios, str):
                audios = [audios]

            # Non-iterable values or unhashable entries raise TypeError;
            # report the line and keep going, as a best-effort scan.
            try:
                audio_set.update(audios)
            except TypeError as e:
                print(f"第 {line_num} 行处理错误: {e}")
    return audio_set
def extract_first_subfolder_after_data(audio_path, anchor="wavrewardDataset"):
    """Return the path component that immediately follows ``anchor``.

    Example:
        /.../wavrewardDataset/output_xxx/yyy/file.wav -> "output_xxx"

    Args:
        audio_path: Path of an audio file (string or path-like object).
        anchor: Name of the directory whose first child should be returned.
            Defaults to "wavrewardDataset", the directory name this script
            has always looked for.

    Returns:
        The component right after the first occurrence of ``anchor``;
        "unknown" when ``anchor`` is absent or is the last component;
        "error" when ``audio_path`` cannot be turned into a path.
    """
    # Building the Path is the only step that can fail (e.g. for None).
    try:
        parts = Path(audio_path).parts
    except TypeError as e:
        print(f"路径解析错误: {audio_path}, 错误: {e}")
        return "error"

    if anchor not in parts:
        return "unknown"

    next_idx = parts.index(anchor) + 1
    if next_idx < len(parts):
        return parts[next_idx]
    return "unknown"
def main(json_file="all_dataset_train_resampled_16000.jsonl"):
    """Print how many unique audio paths a JSONL file references, grouped by folder.

    Args:
        json_file: Path of the JSONL file to analyse. Defaults to the dataset
            file this script was originally written for.

    Prints the number of distinct audio paths, then one line per folder name
    (as returned by ``extract_first_subfolder_after_data``) ordered by
    descending file count. Prints a notice and returns early when the file
    does not exist.
    """
    if not os.path.exists(json_file):
        print(f"文件 {json_file} 不存在")
        return

    print(f"正在处理文件: {json_file}")
    print("=" * 50)

    # Step 1: gather the de-duplicated audio paths.
    unique_audio_paths = collect_unique_audio_paths(json_file)
    print(f"不重复音频文件数: {len(unique_audio_paths)}")

    # Step 2: count the unique paths per folder name.
    folder_counter = Counter(
        extract_first_subfolder_after_data(audio_path)
        for audio_path in unique_audio_paths
    )

    print("\n按 data 后一级子文件夹统计(基于去重后的路径):")
    print("-" * 50)
    # most_common() orders by descending count and keeps ties in insertion order.
    for folder, count in folder_counter.most_common():
        print(f"{folder}: {count} 个文件")


if __name__ == "__main__":
    main()