Spaces:

hetchyy
/

quranic-universal-aligner

Running on Zero

App Files Files Community

hetchyy committed 18 days ago

Commit

e80c3ea

verified ·

1 Parent(s): 343f470

Delete bench_wraparound.py

Browse files

Files changed (1) hide show

bench_wraparound.py +0 -273

bench_wraparound.py DELETED Viewed

@@ -1,273 +0,0 @@
-#!/usr/bin/env python3
-"""
-Benchmark: compare pure-Python align_wraparound vs Cython cy_align_wraparound.
-Loads 50 verses from the repetition test set, runs both implementations,
-verifies results match, and reports timing + speedup factor.
-Usage:
-    python3 bench_wraparound.py
-"""
-import json
-import sys
-import time
-from pathlib import Path
-# ---------------------------------------------------------------------------
-# Paths
-# ---------------------------------------------------------------------------
-SCRIPT_DIR = Path(__file__).parent                       # quranic_universal_aligner/
-REPO_ROOT = SCRIPT_DIR.parent                            # quranic-universal-audio/
-DATA_DIR = REPO_ROOT / "data"
-sys.path.insert(0, str(SCRIPT_DIR))
-# ---------------------------------------------------------------------------
-# Import Python implementation from test harness (without modifying it)
-# ---------------------------------------------------------------------------
-sys.path.insert(0, str(SCRIPT_DIR / "docs" / "repetition_detection"))
-from test_wraparound_dp import (
-    align_wraparound as py_align_wraparound,
-    build_ref_from_phonemizer,
-    load_substitution_costs,
-    COST_SUBSTITUTION, COST_DELETION, COST_INSERTION,
-    WRAP_PENALTY, MAX_WRAPS,
-)
-# ---------------------------------------------------------------------------
-# Import Cython implementation
-# ---------------------------------------------------------------------------
-from src.alignment._dp_core import cy_align_wraparound, init_substitution_matrix
-# ---------------------------------------------------------------------------
-# Setup — defer init_substitution_matrix until all phonemes are registered
-# ---------------------------------------------------------------------------
-SUB_COSTS = load_substitution_costs()
-# NOTE: init_substitution_matrix is called in main() AFTER collecting all
-# unique phonemes from the test data.  This avoids _grow_matrix() being
-# triggered during alignment, which would discard custom sub costs
-# (a known limitation of the current _grow_matrix implementation).
-def load_test_data():
-    """Load the repetition test set used by the benchmark.
-
-    Reads ``repetition_test_set_base.json`` from ``DATA_DIR``.
-
-    Returns:
-        The parsed JSON content of the file.
-
-    Raises:
-        OSError: If the file cannot be opened.
-        json.JSONDecodeError: If the file does not contain valid JSON.
-    """
-    path = DATA_DIR / "repetition_test_set_base.json"
-    # JSON is UTF-8 by specification; do not depend on the locale's default
-    # encoding, which varies between platforms.
-    with open(path, encoding="utf-8") as f:
-        return json.load(f)
-def main():
-    """Benchmark the Python and Cython wraparound aligners and check parity.
-
-    Loads up to ``N`` verses from the repetition test set, builds the
-    reference phoneme sequence for each one, times ``py_align_wraparound``
-    and ``cy_align_wraparound`` on identical inputs, compares the returned
-    tuples field by field, and prints a timing and mismatch summary.  The
-    ``no_subtract`` and ``additive`` scoring modes are then spot-checked on
-    the first prepared verse.
-
-    Returns early with a message when no verse could be prepared, instead
-    of failing on the warmup run or on the per-verse averages.
-    """
-    from itertools import islice
-    N = 50  # number of verses to benchmark
-    tol = 1e-6  # absolute tolerance for cost / norm comparisons
-    rule = "=" * 70
-    print(f"\n{rule}")
-    print("  Wraparound DP Benchmark: Python vs Cython")
-    print(f"  Verses: {N}")
-    print(f"{rule}\n")
-    # Load test data
-    print("Loading test data...", end=" ", flush=True)
-    test_data = load_test_data()
-    print("done.")
-    # Initialize phonemizer (imported lazily, only when the benchmark runs)
-    print("Initializing phonemizer...", end=" ", flush=True)
-    from src.alignment.phonemizer_utils import get_phonemizer
-    pm = get_phonemizer()
-    print("done.\n")
-    # Collect the first N (reciter, verse_key, verse_data) cases, skipping
-    # the "_meta" entry stored alongside the reciters.
-    cases = list(islice(
-        (
-            (reciter, verse_key, verse_data)
-            for reciter, verses in test_data.items()
-            if reciter != "_meta"
-            for verse_key, verse_data in verses.items()
-        ),
-        N,
-    ))
-    # Prepare all inputs first (exclude phonemizer time from benchmark)
-    print(f"Preparing {len(cases)} verse inputs...", end=" ", flush=True)
-    prepared = []
-    all_phonemes = set()
-    for reciter, verse_key, verse_data in cases:
-        surah, ayah = map(int, verse_key.split(":"))
-        P = verse_data["asr_phonemes"].split()
-        try:
-            R, R_phone_to_word, _ = build_ref_from_phonemizer(pm, surah, ayah)
-        except Exception as e:
-            # Deliberate best-effort: report the verse and keep going so one
-            # bad reference does not abort the whole benchmark.
-            print(f"\n  SKIP {reciter}/{verse_key}: {e}")
-            continue
-        if len(R) == 0:
-            continue
-        all_phonemes.update(P)
-        all_phonemes.update(R)
-        prepared.append({
-            "reciter": reciter,
-            "verse_key": verse_key,
-            "P": P,
-            "R": R,
-            "R_phone_to_word": R_phone_to_word,
-            "p_len": len(P),
-            "r_len": len(R),
-            "num_reps": verse_data["num_reps"],
-        })
-    print(f"done. ({len(prepared)} usable)")
-    if not prepared:
-        # Without this guard the warmup would raise IndexError and the
-        # per-verse averages would raise ZeroDivisionError.
-        print("No usable verses; nothing to benchmark.")
-        return
-    n_verses = len(prepared)
-    # Pre-register ALL phonemes in the substitution cost dict so that
-    # _grow_matrix() is never triggered during alignment.  This avoids
-    # a known limitation where _grow_matrix discards custom sub costs.
-    print(f"Initializing substitution matrix ({len(all_phonemes)} phonemes)...", end=" ", flush=True)
-    augmented_costs = dict(SUB_COSTS)
-    for ph in all_phonemes:
-        # Add a self-pair entry so the phoneme gets an ID during init
-        augmented_costs[(ph, ph)] = 0.0
-    init_substitution_matrix(augmented_costs, COST_SUBSTITUTION)
-    print("done.\n")
-    # Keyword arguments shared by every call of both implementations, so the
-    # two sides of the comparison cannot drift apart.
-    common = dict(
-        expected_word=0,
-        prior_weight=0.0,
-        cost_sub=COST_SUBSTITUTION,
-        cost_del=COST_DELETION,
-        cost_ins=COST_INSERTION,
-        wrap_penalty=WRAP_PENALTY,
-        max_wraps=MAX_WRAPS,
-        scoring_mode="subtract",
-        wrap_score_cost=0.01,
-    )
-    # --- Warmup (1 run each) ---
-    print("Warmup run...", end=" ", flush=True)
-    first = prepared[0]
-    py_align_wraparound(first["P"], first["R"], first["R_phone_to_word"], **common)
-    cy_align_wraparound(first["P"], first["R"], first["R_phone_to_word"], **common)
-    print("done.\n")
-    # --- Benchmark Python ---
-    print(f"Running Python align_wraparound on {n_verses} verses...")
-    t0 = time.perf_counter()
-    py_results = [
-        py_align_wraparound(d["P"], d["R"], d["R_phone_to_word"], **common)
-        for d in prepared
-    ]
-    py_total = time.perf_counter() - t0
-    print(f"  Python total: {py_total*1000:.1f} ms ({py_total*1000/n_verses:.1f} ms/verse)\n")
-    # --- Benchmark Cython ---
-    print(f"Running Cython cy_align_wraparound on {n_verses} verses...")
-    t0 = time.perf_counter()
-    cy_results = [
-        cy_align_wraparound(d["P"], d["R"], d["R_phone_to_word"], **common)
-        for d in prepared
-    ]
-    cy_total = time.perf_counter() - t0
-    print(f"  Cython total: {cy_total*1000:.1f} ms ({cy_total*1000/n_verses:.1f} ms/verse)\n")
-    # --- Compare results ---
-    print(rule)
-    print("  Verification: comparing Python vs Cython results")
-    print(f"{rule}\n")
-    mismatches = 0
-    for i, (d, py_r, cy_r) in enumerate(zip(prepared, py_results, cy_results)):
-        # Python returns 7-tuple: (best_j, best_j_start, best_cost, best_norm, n_wraps, max_j, wrap_points)
-        # Cython returns 6-tuple: (best_j, best_j_start, best_cost, best_norm, n_wraps, max_j)
-        py_j, py_js, py_cost, py_norm, py_k, py_mj, py_wp = py_r
-        cy_j, cy_js, cy_cost, cy_norm, cy_k, cy_mj = cy_r
-        # Cost, norm and max_j are only compared when both sides found an end
-        # position.
-        both_found = py_j is not None and cy_j is not None
-        errors = []
-        if py_j != cy_j:
-            errors.append(f"best_j: py={py_j} cy={cy_j}")
-        if py_js != cy_js:
-            errors.append(f"best_j_start: py={py_js} cy={cy_js}")
-        if both_found:
-            if abs(py_cost - cy_cost) > tol:
-                errors.append(f"best_cost: py={py_cost:.6f} cy={cy_cost:.6f}")
-            if abs(py_norm - cy_norm) > tol:
-                errors.append(f"best_norm: py={py_norm:.6f} cy={cy_norm:.6f}")
-        if py_k != cy_k:
-            errors.append(f"n_wraps: py={py_k} cy={cy_k}")
-        # max_j comparison (Python uses max(max_j, j) for end_j; Cython returns raw max_j)
-        if both_found and py_mj != cy_mj:
-            errors.append(f"max_j: py={py_mj} cy={cy_mj}")
-        if errors:
-            mismatches += 1
-            print(f"  MISMATCH [{i}] {d['reciter']}/{d['verse_key']} "
-                  f"(P={d['p_len']}, R={d['r_len']}, reps={d['num_reps']})")
-            for e in errors:
-                print(f"    {e}")
-    # --- Summary ---
-    print(f"\n{rule}")
-    print("  SUMMARY")
-    print(rule)
-    print(f"  Verses benchmarked: {n_verses}")
-    print(f"  Python total:  {py_total*1000:>8.1f} ms  ({py_total*1000/n_verses:>6.1f} ms/verse)")
-    print(f"  Cython total:  {cy_total*1000:>8.1f} ms  ({cy_total*1000/n_verses:>6.1f} ms/verse)")
-    speedup = py_total / cy_total if cy_total > 0 else float('inf')
-    print(f"  Speedup:       {speedup:>8.1f}x")
-    print(f"  Mismatches:    {mismatches}/{n_verses}")
-    if mismatches == 0:
-        print("  Result:        ALL MATCH")
-    else:
-        print(f"  Result:        {mismatches} MISMATCHES FOUND")
-    print(f"{rule}\n")
-    # Also test scoring modes
-    print("Testing scoring modes (no_subtract, additive)...")
-    for mode in ["no_subtract", "additive"]:
-        # Same arguments as the benchmark, with only the scoring mode changed.
-        mode_kwargs = {**common, "scoring_mode": mode}
-        py_r = py_align_wraparound(
-            first["P"], first["R"], first["R_phone_to_word"], **mode_kwargs
-        )
-        cy_r = cy_align_wraparound(
-            first["P"], first["R"], first["R_phone_to_word"], **mode_kwargs
-        )
-        py_j, py_js, py_cost, py_norm, py_k, py_mj, _ = py_r
-        cy_j, cy_js, cy_cost, cy_norm, cy_k, cy_mj = cy_r
-        ok = (py_j == cy_j and py_js == cy_js and py_k == cy_k)
-        if py_j is not None and cy_j is not None:
-            # "<= tol" mirrors the "> tol" mismatch rule used in the
-            # verification pass, so both checks agree at the boundary.
-            ok = ok and abs(py_cost - cy_cost) <= tol and abs(py_norm - cy_norm) <= tol
-        status = "OK" if ok else "MISMATCH"
-        print(f"  {mode}: {status} "
-              f"(py: j={py_j},js={py_js},cost={py_cost:.4f},norm={py_norm:.4f},k={py_k} | "
-              f"cy: j={cy_j},js={cy_js},cost={cy_cost:.4f},norm={cy_norm:.4f},k={cy_k})")
-    print()
-if __name__ == "__main__":
-    main()