Spaces:

ZTXRiley
/

ASR_AGENT_

Sleeping

ASR_AGENT_ / scripts /run_cv_zhcn_quickstart.py

unknown

Add app.py

6dedffc about 1 month ago

4.77 kB

	import argparse
	import json
	import os
	import subprocess
	import sys
	from pathlib import Path

	def run_cmd(cmd, cwd=None):
	    """Echo an external command, run it without a shell, and abort on failure.

	    Args:
	        cmd: Argument vector (program followed by its arguments). Items may
	            be strings or path-like objects.
	        cwd: Optional working directory for the child process.

	    Raises:
	        SystemExit: If the command cannot be started (for example the
	            executable does not exist) or if it exits with a non-zero
	            return code.
	    """
	    # Stringify each part so path-like arguments do not break the echo.
	    rendered = " ".join(str(part) for part in cmd)
	    print("\n[CMD]", rendered)
	    try:
	        result = subprocess.run(cmd, cwd=cwd, shell=False)
	    except OSError as exc:
	        # A launch failure should end the script the same way a failing
	        # command does, rather than surfacing as an unrelated traceback.
	        raise SystemExit(f"Command could not be started ({exc}): {rendered}") from exc
	    if result.returncode != 0:
	        raise SystemExit(f"Command failed with code {result.returncode}: {rendered}")

	def ensure_deps():
	    """Upgrade pip, then install or upgrade the packages this script needs.

	    Both steps are executed through ``run_cmd`` using the pip module of the
	    interpreter that is running this script.
	    """
	    pip_upgrade = [sys.executable, "-m", "pip", "install", "-U"]
	    # datasets downloads the data; soundfile handles audio more robustly;
	    # tqdm is only there for progress printing.
	    package_groups = (
	        ["pip"],
	        ["datasets", "soundfile", "tqdm"],
	    )
	    for packages in package_groups:
	        run_cmd(pip_upgrade + packages)

	def _cv_manifest_record(index, item, dataset_name, config_name, split):
	    """Build one manifest record from a dataset row, or return None to skip it."""
	    audio = item.get("audio")
	    if not audio or not audio.get("path"):
	        return None

	    # Store absolute paths so that relative paths do not fail to resolve
	    # on Windows.
	    audio_path = Path(audio["path"]).resolve()
	    if not audio_path.exists():
	        # In rare cases the file may not have been written to disk yet; skip.
	        return None

	    ref = item.get("sentence") or item.get("text") or ""
	    if not ref.strip():
	        return None

	    return {
	        "utt_id": f"cv_zhcn_{index}",
	        "audio_uri": str(audio_path),
	        "ref_text": ref,
	        "meta": {
	            "dataset": dataset_name,
	            "config": config_name,
	            "split": split,
	            "speaker": item.get("client_id"),
	            "gender": item.get("gender"),
	            "accent": item.get("accent"),
	            "age": item.get("age"),
	            "sample_rate": audio.get("sampling_rate"),
	        },
	    }


	def build_manifest_common_voice_zhcn(out_manifest: Path, num_samples: int, split: str, cache_dir: Path | None):
	    """Load a slice of Common Voice zh-CN and write it out as a JSONL manifest.

	    Each usable row becomes one JSON object per line with the keys
	    ``utt_id``, ``audio_uri`` (absolute path), ``ref_text`` and ``meta``.
	    Rows without an audio path, whose audio file is not on disk, or whose
	    transcript is blank are skipped.

	    Args:
	        out_manifest: Destination file; parent directories are created.
	        num_samples: Upper bound of the slice requested, as ``split[:num_samples]``.
	        split: Dataset split name used in the slice expression.
	        cache_dir: Optional cache directory passed to ``load_dataset``;
	            ``None`` leaves the library default in place.

	    Raises:
	        SystemExit: If no row produced a manifest record.
	    """
	    # Imported lazily: the package may only be installed at run time.
	    from datasets import load_dataset

	    out_manifest.parent.mkdir(parents=True, exist_ok=True)

	    # Common Voice versions may change; this one is usually the most stable
	    # choice. If it turns out to be unavailable, change this line to a
	    # version that you can load successfully.
	    dataset_name = "mozilla-foundation/common_voice_11_0"
	    config_name = "zh-CN"

	    print(f"\n[INFO] Loading dataset: {dataset_name} / {config_name} / split={split} / num={num_samples}")
	    ds = load_dataset(
	        dataset_name,
	        config_name,
	        split=f"{split}[:{num_samples}]",
	        cache_dir=str(cache_dir) if cache_dir else None,
	        trust_remote_code=False,
	    )

	    count = 0
	    with out_manifest.open("w", encoding="utf-8") as f:
	        for i, item in enumerate(ds):
	            record = _cv_manifest_record(i, item, dataset_name, config_name, split)
	            if record is None:
	                continue
	            # ensure_ascii=False keeps the Chinese transcript readable.
	            f.write(json.dumps(record, ensure_ascii=False) + "\n")
	            count += 1

	    if count == 0:
	        raise SystemExit("No samples written to manifest. Try another split (e.g., 'train') or increase num_samples.")

	    print(f"[OK] Manifest written: {out_manifest} (samples={count})")

	def main():
	ap = argparse.ArgumentParser()
	ap.add_argument("--num", type=int, default=20, help="Number of samples to download (default: 20)")
	ap.add_argument("--split", type=str, default="validation", help="Dataset split: train/validation/test (default: validation)")
	ap.add_argument("--model_name", type=str, default="small", help="Whisper model size for faster-whisper (default: small)")
	ap.add_argument("--device", type=str, default="cpu", help="cpu or cuda (default: cpu)")
	ap.add_argument("--compute_type", type=str, default="int8", help="int8/float16/float32 etc. (default: int8)")
	ap.add_argument("--cache_dir", type=str, default="", help="Optional HF datasets cache dir")
	ap.add_argument("--skip_deps", action="store_true", help="Skip installing datasets/soundfile")
	args = ap.parse_args()

	project_root = Path(__file__).resolve().parents[1]
	os.chdir(project_root)

	if not args.skip_deps:
	ensure_deps()

	cache_dir = Path(args.cache_dir).resolve() if args.cache_dir else None
	manifest_path = project_root / "data" / "manifest.jsonl"

	build_manifest_common_voice_zhcn(
	out_manifest=manifest_path,
	num_samples=args.num,
	split=args.split,
	cache_dir=cache_dir,
	)

	# 调用你现有的 pipeline：相当于执行
	# python -m pipeline.run_all --manifest data/manifest.jsonl --model_name ... --device ... --compute_type ...
	run_cmd([
	sys.executable, "-m", "pipeline.run_all",
	"--manifest", str(manifest_path),
	"--model_name", args.model_name,
	"--device", args.device,
	"--compute_type", args.compute_type,
	], cwd=str(project_root))

	print("\n[DONE] Pipeline finished.")
	print("You can now run UI:")
	print(f" {sys.executable} -m ui.app")

	# Run the quickstart only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	    main()