"""Compare prompt variants and models on one evocative fixture. Runs (model x baseline/refined) combos and prints each interpretation + films so we can judge whether the refined prompt earns its place as default, and whether 12B is worth the latency over E4B. .venv/bin/python -m scripts.ab_prompt """ import os import time from dotenv import load_dotenv load_dotenv() from scripts.smoke_test import moody_jpeg # noqa: E402 # One layered fixture: warm dusk window + a personal line. IMAGES = [ moody_jpeg((22, 26, 44), (96, 72, 58), blob=(255, 198, 120, 64)), moody_jpeg((12, 14, 18), (44, 52, 70)), ] FRAGMENT = "couldn't sleep again, the streetlight through the curtain" COMBOS = [ ("google/gemma-4-e4b", False), ("google/gemma-4-e4b", True), ("google/gemma-4-12b", False), ("google/gemma-4-12b", True), ] def run_combo(model: str, refined: bool) -> None: os.environ["CHALCHITRA_MODEL"] = model # One direct call per combo (no auto-retry) so baseline stays baseline. from backend.oracle import MAX_TOKENS, _build_content, _extract_json, _validate from backend.prompt import system_prompt from backend.providers import get_provider label = f"{model.split('/')[-1]} | {'refined' if refined else 'baseline'}" print(f"\n{'='*74}\n{label}\n{'-'*74}") content = _build_content(IMAGES, FRAGMENT, []) t0 = time.time() try: raw = get_provider().complete(system_prompt(refined=refined), content, MAX_TOKENS) r = _validate(_extract_json(raw)) except Exception as e: # noqa: BLE001 print(f" ERROR: {e}") return dt = time.time() - t0 print(r["interpretation"]) print("\nfilms: " + " ยท ".join(f"{f['title']} ({f['year']})" for f in r["films"])) print(f"[{dt:.1f}s]") def main() -> None: print(f"fixture fragment: {FRAGMENT!r}") for model, refined in COMBOS: run_combo(model, refined) if __name__ == "__main__": main()