import argparse import json import os import subprocess import sys from pathlib import Path def run_cmd(cmd, cwd=None): print("\n[CMD]", " ".join(cmd)) r = subprocess.run(cmd, cwd=cwd, shell=False) if r.returncode != 0: raise SystemExit(f"Command failed with code {r.returncode}: {' '.join(cmd)}") def ensure_deps(): # datasets 用来下载数据;soundfile 用来更稳地处理音频;tqdm 只是打印 run_cmd([sys.executable, "-m", "pip", "install", "-U", "pip"]) run_cmd([sys.executable, "-m", "pip", "install", "-U", "datasets", "soundfile", "tqdm"]) def build_manifest_common_voice_zhcn(out_manifest: Path, num_samples: int, split: str, cache_dir: Path | None): from datasets import load_dataset out_manifest.parent.mkdir(parents=True, exist_ok=True) # Common Voice 版本可能会变;这条通常最稳:直接用当前可用版本。 # 如果你发现某个版本不可用,就把这行改成你能 load 成功的版本。 dataset_name = "mozilla-foundation/common_voice_11_0" config_name = "zh-CN" print(f"\n[INFO] Loading dataset: {dataset_name} / {config_name} / split={split} / num={num_samples}") ds = load_dataset( dataset_name, config_name, split=f"{split}[:{num_samples}]", cache_dir=str(cache_dir) if cache_dir else None, trust_remote_code=False, ) # 生成 manifest:写绝对路径,避免 Windows 相对路径找不到文件 count = 0 with out_manifest.open("w", encoding="utf-8") as f: for i, item in enumerate(ds): audio = item.get("audio", None) if not audio or not audio.get("path"): continue audio_path = Path(audio["path"]).resolve() # 绝对路径 if not audio_path.exists(): # 极少数情况下 path 可能还没落盘;跳过 continue ref = item.get("sentence") or item.get("text") or "" if not ref.strip(): continue record = { "utt_id": f"cv_zhcn_{i}", "audio_uri": str(audio_path), "ref_text": ref, "meta": { "dataset": dataset_name, "config": config_name, "split": split, "speaker": item.get("client_id"), "gender": item.get("gender"), "accent": item.get("accent"), "age": item.get("age"), "sample_rate": audio.get("sampling_rate"), }, } f.write(json.dumps(record, ensure_ascii=False) + "\n") count += 1 if count == 0: raise SystemExit("No samples written to manifest. Try another split (e.g., 'train') or increase num_samples.") print(f"[OK] Manifest written: {out_manifest} (samples={count})") def main(): ap = argparse.ArgumentParser() ap.add_argument("--num", type=int, default=20, help="Number of samples to download (default: 20)") ap.add_argument("--split", type=str, default="validation", help="Dataset split: train/validation/test (default: validation)") ap.add_argument("--model_name", type=str, default="small", help="Whisper model size for faster-whisper (default: small)") ap.add_argument("--device", type=str, default="cpu", help="cpu or cuda (default: cpu)") ap.add_argument("--compute_type", type=str, default="int8", help="int8/float16/float32 etc. (default: int8)") ap.add_argument("--cache_dir", type=str, default="", help="Optional HF datasets cache dir") ap.add_argument("--skip_deps", action="store_true", help="Skip installing datasets/soundfile") args = ap.parse_args() project_root = Path(__file__).resolve().parents[1] os.chdir(project_root) if not args.skip_deps: ensure_deps() cache_dir = Path(args.cache_dir).resolve() if args.cache_dir else None manifest_path = project_root / "data" / "manifest.jsonl" build_manifest_common_voice_zhcn( out_manifest=manifest_path, num_samples=args.num, split=args.split, cache_dir=cache_dir, ) # 调用你现有的 pipeline:相当于执行 # python -m pipeline.run_all --manifest data/manifest.jsonl --model_name ... --device ... --compute_type ... run_cmd([ sys.executable, "-m", "pipeline.run_all", "--manifest", str(manifest_path), "--model_name", args.model_name, "--device", args.device, "--compute_type", args.compute_type, ], cwd=str(project_root)) print("\n[DONE] Pipeline finished.") print("You can now run UI:") print(f" {sys.executable} -m ui.app") if __name__ == "__main__": main()