Spaces:
Sleeping
Sleeping
| import argparse | |
| import json | |
| import os | |
| import subprocess | |
| import sys | |
| from pathlib import Path | |
def run_cmd(cmd, cwd=None):
    """Echo a command, run it without a shell, and abort the script on failure.

    Args:
        cmd: Sequence of strings (program followed by its arguments); it is
            handed to ``subprocess.run`` unchanged with ``shell=False``.
        cwd: Optional working directory for the child process.

    Raises:
        SystemExit: If the child exits with a non-zero return code. The
            message contains the return code and the space-joined command.
    """
    printable = " ".join(cmd)
    # Flush before spawning: the child writes directly to the inherited
    # stdout, so when the parent's stdout is block-buffered (pipe, log file)
    # an unflushed "[CMD]" line would show up after the child's output.
    print("\n[CMD]", printable, flush=True)
    result = subprocess.run(cmd, cwd=cwd, shell=False)
    if result.returncode != 0:
        raise SystemExit(f"Command failed with code {result.returncode}: {printable}")
def ensure_deps():
    """Upgrade pip, then install or upgrade the packages this script relies on.

    Both steps invoke ``pip`` through the current interpreter
    (``sys.executable -m pip install -U ...``) via ``run_cmd``.
    """
    pip_install = [sys.executable, "-m", "pip", "install", "-U"]
    # datasets downloads the data; soundfile handles audio more robustly;
    # tqdm is only there for progress output.
    package_groups = (
        ["pip"],
        ["datasets", "soundfile", "tqdm"],
    )
    for packages in package_groups:
        run_cmd(pip_install + packages)
| def build_manifest_common_voice_zhcn(out_manifest: Path, num_samples: int, split: str, cache_dir: Path | None): | |
| from datasets import load_dataset | |
| out_manifest.parent.mkdir(parents=True, exist_ok=True) | |
| # Common Voice 版本可能会变;这条通常最稳:直接用当前可用版本。 | |
| # 如果你发现某个版本不可用,就把这行改成你能 load 成功的版本。 | |
| dataset_name = "mozilla-foundation/common_voice_11_0" | |
| config_name = "zh-CN" | |
| print(f"\n[INFO] Loading dataset: {dataset_name} / {config_name} / split={split} / num={num_samples}") | |
| ds = load_dataset( | |
| dataset_name, | |
| config_name, | |
| split=f"{split}[:{num_samples}]", | |
| cache_dir=str(cache_dir) if cache_dir else None, | |
| trust_remote_code=False, | |
| ) | |
| # 生成 manifest:写绝对路径,避免 Windows 相对路径找不到文件 | |
| count = 0 | |
| with out_manifest.open("w", encoding="utf-8") as f: | |
| for i, item in enumerate(ds): | |
| audio = item.get("audio", None) | |
| if not audio or not audio.get("path"): | |
| continue | |
| audio_path = Path(audio["path"]).resolve() # 绝对路径 | |
| if not audio_path.exists(): | |
| # 极少数情况下 path 可能还没落盘;跳过 | |
| continue | |
| ref = item.get("sentence") or item.get("text") or "" | |
| if not ref.strip(): | |
| continue | |
| record = { | |
| "utt_id": f"cv_zhcn_{i}", | |
| "audio_uri": str(audio_path), | |
| "ref_text": ref, | |
| "meta": { | |
| "dataset": dataset_name, | |
| "config": config_name, | |
| "split": split, | |
| "speaker": item.get("client_id"), | |
| "gender": item.get("gender"), | |
| "accent": item.get("accent"), | |
| "age": item.get("age"), | |
| "sample_rate": audio.get("sampling_rate"), | |
| }, | |
| } | |
| f.write(json.dumps(record, ensure_ascii=False) + "\n") | |
| count += 1 | |
| if count == 0: | |
| raise SystemExit("No samples written to manifest. Try another split (e.g., 'train') or increase num_samples.") | |
| print(f"[OK] Manifest written: {out_manifest} (samples={count})") | |
| def main(): | |
| ap = argparse.ArgumentParser() | |
| ap.add_argument("--num", type=int, default=20, help="Number of samples to download (default: 20)") | |
| ap.add_argument("--split", type=str, default="validation", help="Dataset split: train/validation/test (default: validation)") | |
| ap.add_argument("--model_name", type=str, default="small", help="Whisper model size for faster-whisper (default: small)") | |
| ap.add_argument("--device", type=str, default="cpu", help="cpu or cuda (default: cpu)") | |
| ap.add_argument("--compute_type", type=str, default="int8", help="int8/float16/float32 etc. (default: int8)") | |
| ap.add_argument("--cache_dir", type=str, default="", help="Optional HF datasets cache dir") | |
| ap.add_argument("--skip_deps", action="store_true", help="Skip installing datasets/soundfile") | |
| args = ap.parse_args() | |
| project_root = Path(__file__).resolve().parents[1] | |
| os.chdir(project_root) | |
| if not args.skip_deps: | |
| ensure_deps() | |
| cache_dir = Path(args.cache_dir).resolve() if args.cache_dir else None | |
| manifest_path = project_root / "data" / "manifest.jsonl" | |
| build_manifest_common_voice_zhcn( | |
| out_manifest=manifest_path, | |
| num_samples=args.num, | |
| split=args.split, | |
| cache_dir=cache_dir, | |
| ) | |
| # 调用你现有的 pipeline:相当于执行 | |
| # python -m pipeline.run_all --manifest data/manifest.jsonl --model_name ... --device ... --compute_type ... | |
| run_cmd([ | |
| sys.executable, "-m", "pipeline.run_all", | |
| "--manifest", str(manifest_path), | |
| "--model_name", args.model_name, | |
| "--device", args.device, | |
| "--compute_type", args.compute_type, | |
| ], cwd=str(project_root)) | |
| print("\n[DONE] Pipeline finished.") | |
| print("You can now run UI:") | |
| print(f" {sys.executable} -m ui.app") | |
| if __name__ == "__main__": | |
| main() | |