File size: 2,977 Bytes
5887db6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""
Run Trelis Chorus v1 (GGML) on an audio file. Prints per-speaker transcripts.

Usage:
  python run_chorus.py audio.wav                       # Q4_K model, both speakers
  python run_chorus.py audio.wav --model f16           # use f16 model instead
  python run_chorus.py audio.wav --json out.json       # JSON output
  python run_chorus.py audio.wav --whisper-cli PATH    # custom whisper-cli path

Prerequisites:
  - Patched whisper.cpp built with `--speaker` support (see patches/whisper.cpp.patch)
  - GGML model files in the directory above this script's folder, or wherever
    --models-dir points (ggml-chorus-v1-q4_k.bin etc.)
"""
from __future__ import annotations
import argparse, json, re, subprocess, sys
from pathlib import Path

# Directory that contains this script; CLI defaults are resolved relative to its parent.
HERE = Path(__file__).resolve().parent

# Maps each --model choice to the GGML weights file name it selects.
MODELS = {
    "q4_k": "ggml-chorus-v1-q4_k.bin",
    "f16": "ggml-chorus-v1-f16.bin",
}

# Matches one segment line of the form "[HH:MM:SS.fff --> HH:MM:SS.fff]  text",
# capturing the start timestamp, the end timestamp and the trailing text.
SEG_RE = re.compile(r"^\[(\d\d:\d\d:\d\d\.\d+)\s*-->\s*(\d\d:\d\d:\d\d\.\d+)\]\s+(.*)$")


def ts_to_sec(ts: str) -> float:
    """Convert an ``HH:MM:SS.fff`` timestamp string into seconds.

    The string must contain exactly three ``:``-separated fields; the first
    two are parsed as integers and the last as a float.
    """
    hours, minutes, seconds = ts.split(":")
    # Fold hours into minutes first; the integer part stays exact before the
    # fractional seconds are added.
    whole_minutes = int(hours) * 60 + int(minutes)
    return whole_minutes * 60 + float(seconds)


def transcribe(wav: Path, speaker: int, model_path: Path, cli: Path) -> list[dict]:
    """Run whisper-cli on *wav* for one speaker and parse its segment lines.

    Args:
        wav: Audio file handed to whisper-cli via ``-f``.
        speaker: Value forwarded to the ``--speaker`` flag.
        model_path: GGML model file handed to whisper-cli via ``-m``.
        cli: Path to the whisper-cli executable.

    Returns:
        One dict per stdout line that matches ``SEG_RE``, in output order,
        with ``start`` and ``end`` in seconds and the stripped ``text``.
        Lines that do not match are ignored.

    Raises:
        subprocess.CalledProcessError: whisper-cli exited with a non-zero
            status; its captured stderr is available on the exception.
    """
    cmd = [
        str(cli), "-m", str(model_path), "-f", str(wav),
        "-l", "en", "--speaker", str(speaker), "-nfa", "-np",
    ]
    # Decode explicitly as UTF-8 instead of the locale default, which would
    # mangle non-ASCII text on non-UTF-8 locales. Undecodable bytes are
    # replaced so a single bad byte cannot abort the whole transcription.
    r = subprocess.run(
        cmd,
        capture_output=True, text=True, encoding="utf-8", errors="replace",
        check=True,
    )
    segs = []
    for line in r.stdout.splitlines():
        m = SEG_RE.match(line.strip())
        if m:
            segs.append({"start": ts_to_sec(m.group(1)),
                          "end": ts_to_sec(m.group(2)),
                          "text": m.group(3).strip()})
    return segs


def main():
    """Parse CLI arguments, transcribe each requested speaker, and report.

    Prints every speaker's segments to stdout and, when ``--json`` is given,
    also writes the collected result to that path. Exits through ``sys.exit``
    with a message when the model file, the whisper-cli binary or the audio
    file is missing, or when whisper-cli itself fails.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("audio")
    ap.add_argument("--model", choices=list(MODELS), default="q4_k")
    ap.add_argument("--models-dir", default=str(HERE.parent), help="directory containing ggml-chorus-v1-*.bin")
    ap.add_argument("--whisper-cli", default=str(HERE.parent / "whisper.cpp/build/bin/whisper-cli"))
    ap.add_argument("--speaker", type=int, choices=(0, 1, 2), default=0, help="0=both (default)")
    ap.add_argument("--json", help="write JSON result to this path")
    args = ap.parse_args()

    model_path = Path(args.models_dir) / MODELS[args.model]
    if not model_path.exists():
        sys.exit(f"Model not found: {model_path}")
    cli = Path(args.whisper_cli)
    if not cli.exists():
        sys.exit(f"whisper-cli not found: {cli}")

    wav = Path(args.audio)
    # Fail fast on a missing input, the same way the model and CLI paths are
    # checked. "-" is let through because it is not a filesystem path
    # (whisper-cli may treat it as stdin -- not verified here).
    if args.audio != "-" and not wav.exists():
        sys.exit(f"Audio file not found: {wav}")

    # --speaker 0 is falsy and means "both": run once for speaker 1, once for 2.
    speakers = [args.speaker] if args.speaker else [1, 2]
    result = {"audio": str(wav), "model": args.model, "speakers": {}}
    for sp in speakers:
        try:
            segs = transcribe(wav, sp, model_path, cli)
        except subprocess.CalledProcessError as exc:
            # stderr was captured by transcribe(); surface it here, otherwise
            # the user only sees a traceback with the exit status.
            detail = (exc.stderr or "").strip()
            msg = f"whisper-cli failed for speaker {sp} (exit code {exc.returncode})"
            sys.exit(f"{msg}:\n{detail}" if detail else msg)
        result["speakers"][f"speaker{sp}"] = segs
        print(f"\n--- speaker {sp} ---")
        for s in segs:
            print(f"  [{s['start']:6.2f} --> {s['end']:6.2f}]  {s['text']}")

    if args.json:
        Path(args.json).write_text(json.dumps(result, indent=2))
        print(f"\nWrote {args.json}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()