File size: 2,670 Bytes
7bfbdc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import json
import os
import sys
from pathlib import Path

import soundfile as sf

# Datasets to scan, in processing order. "name" selects the path iterator in
# DISPATCH; "subset" (and "split", where present) is passed through to it.
DATA_CONFIG = [
    dict(name="gigaspeech", subset="xl", split="train"),
    dict(name="librispeech", subset="train-clean-360"),
    dict(name="commonvoice", subset="validated"),
]

# Local filesystem root of each corpus, keyed by dataset name.
ROOTS = dict(
    gigaspeech="/home/work/AIDAS/data/audio/GigaSpeech",
    librispeech="/home/work/AIDAS/data/audio/LibriSpeech",
    commonvoice="/home/work/AIDAS/data/audio/commonvoice/cv-corpus-22.0-2025-06-20/en",
)

def iter_gigaspeech(cfg):
    """Yield the audio file path of every row in a GigaSpeech split.

    Args:
        cfg: Mapping with "subset" (the dataset configuration name passed to
            ``load_dataset``) and "split".

    Yields:
        The ``path`` field of each row's ``audio`` entry.
    """
    import datasets

    ds = datasets.load_dataset("speechcolab/gigaspeech", cfg["subset"], split=cfg["split"])
    # Only the path is needed. With decoding left on, every row access would
    # load and decode the full waveform just to read its file name.
    ds = ds.cast_column("audio", datasets.Audio(decode=False))
    for row in ds:
        yield row["audio"]["path"]

def iter_librispeech(cfg, root=None):
    """Yield the expected ``.flac`` path of every utterance listed in a LibriSpeech subset.

    Every ``.txt`` file found below the first directory level of the subset is
    read line by line. The first whitespace-separated token of a line is taken
    as the utterance id, which must have the form ``speaker-chapter-utterance``;
    the audio path is then ``<subset>/<speaker>/<chapter>/<id>.flac``.

    Args:
        cfg: Mapping with "subset", the sub-directory of the corpus root to scan.
        root: Optional corpus root overriding ``ROOTS["librispeech"]``.

    Yields:
        ``pathlib.Path`` objects. The paths are derived from the ids and are
        not checked for existence here.
    """
    base = ROOTS["librispeech"] if root is None else root
    subset_root = Path(base) / cfg["subset"]
    for txt in subset_root.glob("*/**/*.txt"):
        # Only the first token of each line is used, so undecodable bytes
        # elsewhere in a file are replaced instead of aborting the scan.
        with txt.open(encoding="utf-8", errors="replace") as f:
            for line in f:
                # maxsplit=1: the rest of the line is never looked at.
                parts = line.split(maxsplit=1)
                if not parts:
                    continue
                audio_id = parts[0]
                id_fields = audio_id.split("-")
                if len(id_fields) != 3:
                    # Not an utterance id (e.g. a stray non-transcript .txt
                    # file); skip the line rather than crash the whole run.
                    continue
                speaker, chapter, _ = id_fields
                yield subset_root / speaker / chapter / f"{audio_id}.flac"

def iter_commonvoice(cfg, root=None):
    """Yield the clip path of every row in a Common Voice TSV manifest.

    Reads ``<root>/<subset>.tsv`` and yields ``<root>/clips/<path>`` for each
    non-empty value in its ``path`` column.

    Args:
        cfg: Mapping with "subset", the manifest name without the ``.tsv`` suffix.
        root: Optional corpus root overriding ``ROOTS["commonvoice"]``.

    Yields:
        ``pathlib.Path`` objects. The paths are not checked for existence here.
    """
    import csv

    import pandas as pd

    base = Path(ROOTS["commonvoice"] if root is None else root)
    tsv = base / f"{cfg['subset']}.tsv"
    # QUOTE_NONE: the manifest is tab-separated free text. With default quoting,
    # a field that opens with an unbalanced double quote swallows the following
    # rows (or raises a parser error), silently corrupting the file list.
    df = pd.read_csv(
        tsv,
        sep="\t",
        usecols=["path"],
        dtype={"path": str},
        quoting=csv.QUOTE_NONE,
    )
    clips_root = base / "clips"
    # dropna: a row with an empty path cannot be joined onto clips_root.
    for rel in df["path"].dropna():
        yield clips_root / rel

# Dataset name -> generator function that yields that dataset's audio paths.
DISPATCH = dict(
    gigaspeech=iter_gigaspeech,
    librispeech=iter_librispeech,
    commonvoice=iter_commonvoice,
)

def _audio_seconds(audio_path):
    """Return the duration of ``audio_path`` in seconds, or None if it cannot be measured.

    None is returned when the path is None, does not point to an existing
    file, or cannot be opened by ``soundfile``.
    """
    # Guard against an iterator yielding None: os.path.isfile(None) raises.
    if audio_path is None or not os.path.isfile(audio_path):
        return None
    try:
        info = sf.info(str(audio_path))
    except RuntimeError as exc:
        # soundfile raises RuntimeError (or a subclass) for files it cannot
        # open. One bad file should not abort a scan over whole corpora, so
        # report it on stderr and keep stdout reserved for the JSON summary.
        print(f"skipping unreadable audio file {audio_path}: {exc}", file=sys.stderr)
        return None
    return info.frames / info.samplerate


def main():
    """Print a JSON summary of file counts and average durations per dataset.

    For each entry in ``DATA_CONFIG`` the matching iterator from ``DISPATCH``
    supplies audio paths. Paths that are missing on disk or unreadable are
    skipped and do not count towards any total. The summary holds the overall
    file count, the overall average duration in seconds, and one record per
    dataset with its name, subset, split, file count and average duration.
    """
    total_sec = 0.0
    total_files = 0
    per_dataset = []

    for cfg in DATA_CONFIG:
        name = cfg["name"]
        iterator = DISPATCH[name](cfg)
        subset_total = 0.0
        subset_files = 0

        for audio_path in iterator:
            duration = _audio_seconds(audio_path)
            if duration is None:
                continue
            subset_total += duration
            subset_files += 1
            total_sec += duration
            total_files += 1

        per_dataset.append({
            "name": name,
            "subset": cfg.get("subset"),
            "split": cfg.get("split"),
            "num_files": subset_files,
            # Averages default to 0.0 when no file could be measured.
            "avg_seconds": subset_total / subset_files if subset_files else 0.0,
        })

    summary = {
        "total_files": total_files,
        "overall_avg_seconds": total_sec / total_files if total_files else 0.0,
        "per_dataset": per_dataset,
    }
    print(json.dumps(summary, indent=2))

# Run the scan only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()