import argparse
import json
import os
import subprocess
import sys
from pathlib import Path

def run_cmd(cmd, cwd=None):
    """Echo a command, run it without a shell, and abort on failure.

    Args:
        cmd: Sequence of string arguments; the first item is the program.
        cwd: Optional working directory for the child process.

    Raises:
        SystemExit: If the child process exits with a non-zero return code.
    """
    # Render once so the echoed line and the failure message always agree.
    rendered = " ".join(cmd)
    print("\n[CMD]", rendered)

    completed = subprocess.run(cmd, cwd=cwd, shell=False)
    if completed.returncode == 0:
        return
    raise SystemExit(f"Command failed with code {completed.returncode}: {rendered}")

def ensure_deps():
    """Upgrade pip, then install/upgrade the packages this script relies on.

    Each step runs ``<current interpreter> -m pip install -U ...`` through
    ``run_cmd``, so a failing install aborts the script.
    """
    pip_upgrade = [sys.executable, "-m", "pip", "install", "-U"]
    package_groups = (
        # Upgrade pip itself first.
        ["pip"],
        # datasets downloads the data; soundfile handles audio more robustly;
        # tqdm is only for progress output.
        ["datasets", "soundfile", "tqdm"],
    )
    for packages in package_groups:
        run_cmd(pip_upgrade + packages)

def build_manifest_common_voice_zhcn(out_manifest: Path, num_samples: int, split: str, cache_dir: Path | None):
    from datasets import load_dataset

    out_manifest.parent.mkdir(parents=True, exist_ok=True)

    # Common Voice 版本可能会变；这条通常最稳：直接用当前可用版本。
    # 如果你发现某个版本不可用，就把这行改成你能 load 成功的版本。
    dataset_name = "mozilla-foundation/common_voice_11_0"
    config_name = "zh-CN"

    print(f"\n[INFO] Loading dataset: {dataset_name} / {config_name} / split={split} / num={num_samples}")
    ds = load_dataset(
        dataset_name,
        config_name,
        split=f"{split}[:{num_samples}]",
        cache_dir=str(cache_dir) if cache_dir else None,
        trust_remote_code=False,
    )

    # 生成 manifest：写绝对路径，避免 Windows 相对路径找不到文件
    count = 0
    with out_manifest.open("w", encoding="utf-8") as f:
        for i, item in enumerate(ds):
            audio = item.get("audio", None)
            if not audio or not audio.get("path"):
                continue

            audio_path = Path(audio["path"]).resolve()  # 绝对路径
            if not audio_path.exists():
                # 极少数情况下 path 可能还没落盘；跳过
                continue

            ref = item.get("sentence") or item.get("text") or ""
            if not ref.strip():
                continue

            record = {
                "utt_id": f"cv_zhcn_{i}",
                "audio_uri": str(audio_path),
                "ref_text": ref,
                "meta": {
                    "dataset": dataset_name,
                    "config": config_name,
                    "split": split,
                    "speaker": item.get("client_id"),
                    "gender": item.get("gender"),
                    "accent": item.get("accent"),
                    "age": item.get("age"),
                    "sample_rate": audio.get("sampling_rate"),
                },
            }
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            count += 1

    if count == 0:
        raise SystemExit("No samples written to manifest. Try another split (e.g., 'train') or increase num_samples.")

    print(f"[OK] Manifest written: {out_manifest} (samples={count})")

def main():
    """Command-line entry point: build the manifest, then run the pipeline.

    Steps, in order: parse the arguments, change into the project root (the
    parent of this file's directory), optionally install dependencies, write
    ``data/manifest.jsonl`` and finally run ``pipeline.run_all`` on it with
    the current interpreter.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--num", type=int, default=20, help="Number of samples to download (default: 20)")
    # String-valued options as (flag, default, help), registered in this order.
    text_options = (
        ("--split", "validation", "Dataset split: train/validation/test (default: validation)"),
        ("--model_name", "small", "Whisper model size for faster-whisper (default: small)"),
        ("--device", "cpu", "cpu or cuda (default: cpu)"),
        ("--compute_type", "int8", "int8/float16/float32 etc. (default: int8)"),
        ("--cache_dir", "", "Optional HF datasets cache dir"),
    )
    for flag, default, help_text in text_options:
        parser.add_argument(flag, type=str, default=default, help=help_text)
    parser.add_argument("--skip_deps", action="store_true", help="Skip installing datasets/soundfile")
    options = parser.parse_args()

    project_root = Path(__file__).resolve().parents[1]
    os.chdir(project_root)

    if not options.skip_deps:
        ensure_deps()

    # Resolved after the chdir above, so a relative --cache_dir is taken
    # relative to the project root.
    if options.cache_dir:
        cache_dir = Path(options.cache_dir).resolve()
    else:
        cache_dir = None

    manifest_path = project_root / "data" / "manifest.jsonl"
    build_manifest_common_voice_zhcn(
        out_manifest=manifest_path,
        num_samples=options.num,
        split=options.split,
        cache_dir=cache_dir,
    )

    # Run the existing pipeline; equivalent to:
    # python -m pipeline.run_all --manifest <manifest> --model_name ... --device ... --compute_type ...
    pipeline_cmd = [sys.executable, "-m", "pipeline.run_all"]
    pipeline_cmd += ["--manifest", str(manifest_path)]
    pipeline_cmd += ["--model_name", options.model_name]
    pipeline_cmd += ["--device", options.device]
    pipeline_cmd += ["--compute_type", options.compute_type]
    run_cmd(pipeline_cmd, cwd=str(project_root))

    print("\n[DONE] Pipeline finished.")
    print("You can now run UI:")
    print(f"  {sys.executable} -m ui.app")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()