# ASR_AGENT_ / scripts / run_cv_zhcn_quickstart.py
# unknown
# Add app.py
# 6dedffc
import argparse
import json
import os
import subprocess
import sys
from pathlib import Path
def run_cmd(cmd, cwd=None):
    """Echo a command, run it without a shell, and abort on failure.

    Args:
        cmd: Argument list of strings; the first item is the program to run.
        cwd: Optional working directory for the child process.

    Raises:
        SystemExit: If the child process exits with a non-zero return code.
    """
    rendered = " ".join(cmd)
    print("\n[CMD]", rendered)
    exit_code = subprocess.run(cmd, cwd=cwd, shell=False).returncode
    if exit_code == 0:
        return
    raise SystemExit(f"Command failed with code {exit_code}: {rendered}")
def ensure_deps():
    """Upgrade pip, then install/upgrade the packages this script needs.

    Every step goes through ``run_cmd`` using the current interpreter's pip.
    """
    pip_install = [sys.executable, "-m", "pip", "install", "-U"]
    # datasets downloads the data; soundfile handles audio more robustly;
    # tqdm is only used for progress output.
    for packages in (["pip"], ["datasets", "soundfile", "tqdm"]):
        run_cmd(pip_install + packages)
def build_manifest_common_voice_zhcn(out_manifest: Path, num_samples: int, split: str, cache_dir: Path | None):
from datasets import load_dataset
out_manifest.parent.mkdir(parents=True, exist_ok=True)
# Common Voice 版本可能会变;这条通常最稳:直接用当前可用版本。
# 如果你发现某个版本不可用,就把这行改成你能 load 成功的版本。
dataset_name = "mozilla-foundation/common_voice_11_0"
config_name = "zh-CN"
print(f"\n[INFO] Loading dataset: {dataset_name} / {config_name} / split={split} / num={num_samples}")
ds = load_dataset(
dataset_name,
config_name,
split=f"{split}[:{num_samples}]",
cache_dir=str(cache_dir) if cache_dir else None,
trust_remote_code=False,
)
# 生成 manifest:写绝对路径,避免 Windows 相对路径找不到文件
count = 0
with out_manifest.open("w", encoding="utf-8") as f:
for i, item in enumerate(ds):
audio = item.get("audio", None)
if not audio or not audio.get("path"):
continue
audio_path = Path(audio["path"]).resolve() # 绝对路径
if not audio_path.exists():
# 极少数情况下 path 可能还没落盘;跳过
continue
ref = item.get("sentence") or item.get("text") or ""
if not ref.strip():
continue
record = {
"utt_id": f"cv_zhcn_{i}",
"audio_uri": str(audio_path),
"ref_text": ref,
"meta": {
"dataset": dataset_name,
"config": config_name,
"split": split,
"speaker": item.get("client_id"),
"gender": item.get("gender"),
"accent": item.get("accent"),
"age": item.get("age"),
"sample_rate": audio.get("sampling_rate"),
},
}
f.write(json.dumps(record, ensure_ascii=False) + "\n")
count += 1
if count == 0:
raise SystemExit("No samples written to manifest. Try another split (e.g., 'train') or increase num_samples.")
print(f"[OK] Manifest written: {out_manifest} (samples={count})")
def main():
    """Parse CLI options, build the manifest, and run the ASR pipeline.

    Steps: optionally install dependencies, write ``data/manifest.jsonl``
    under the project root (the parent of the directory containing this
    script), then run ``pipeline.run_all`` as a subprocess with the chosen
    model options.

    Raises:
        SystemExit: On invalid arguments (via argparse) or when any step
            aborts.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--num", type=int, default=20, help="Number of samples to download (default: 20)")
    ap.add_argument("--split", type=str, default="validation", help="Dataset split: train/validation/test (default: validation)")
    ap.add_argument("--model_name", type=str, default="small", help="Whisper model size for faster-whisper (default: small)")
    ap.add_argument("--device", type=str, default="cpu", help="cpu or cuda (default: cpu)")
    ap.add_argument("--compute_type", type=str, default="int8", help="int8/float16/float32 etc. (default: int8)")
    ap.add_argument("--cache_dir", type=str, default="", help="Optional HF datasets cache dir")
    ap.add_argument("--skip_deps", action="store_true", help="Skip installing datasets/soundfile")
    args = ap.parse_args()

    # Fail fast on a bad sample count, before installing packages or
    # downloading any data.
    if args.num < 1:
        ap.error("--num must be a positive integer")

    # Resolve the user-supplied cache dir BEFORE changing directory, so a
    # relative path is interpreted against the directory the user ran the
    # command from rather than against the project root.
    cache_dir = Path(args.cache_dir).resolve() if args.cache_dir else None

    project_root = Path(__file__).resolve().parents[1]
    os.chdir(project_root)

    if not args.skip_deps:
        ensure_deps()

    manifest_path = project_root / "data" / "manifest.jsonl"
    build_manifest_common_voice_zhcn(
        out_manifest=manifest_path,
        num_samples=args.num,
        split=args.split,
        cache_dir=cache_dir,
    )

    # Invoke the existing pipeline; equivalent to running (with the manifest
    # given as an absolute path):
    #   python -m pipeline.run_all --manifest data/manifest.jsonl --model_name ... --device ... --compute_type ...
    run_cmd([
        sys.executable, "-m", "pipeline.run_all",
        "--manifest", str(manifest_path),
        "--model_name", args.model_name,
        "--device", args.device,
        "--compute_type", args.compute_type,
    ], cwd=str(project_root))

    print("\n[DONE] Pipeline finished.")
    print("You can now run UI:")
    print(f"  {sys.executable} -m ui.app")
# Run the quickstart only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()