chalchitra / scripts /ab_prompt.py
ajit3259's picture
test: add prompt/model A/B comparison script
b862a8b
Raw
History Blame Contribute Delete
1.97 kB
"""Compare prompt variants and models on one evocative fixture.

Runs every (model x baseline/refined prompt) combination and prints each
interpretation plus its films, so we can judge whether the refined prompt
earns its place as the default, and whether 12B is worth the latency over E4B.

Usage:
    .venv/bin/python -m scripts.ab_prompt
"""
import os
import time
from dotenv import load_dotenv
load_dotenv()
from scripts.smoke_test import moody_jpeg # noqa: E402
# One layered fixture: warm dusk window + a personal line.
# The fixture is two generated images (this list) plus the FRAGMENT text;
# run_combo sends the same list unchanged with every combo.
# NOTE(review): the tuples look like RGB colours and `blob` like a colour plus
# one extra value — confirm against scripts.smoke_test.moody_jpeg.
IMAGES = [
    # First image: the only one that passes the optional `blob` argument.
    moody_jpeg((22, 26, 44), (96, 72, 58), blob=(255, 198, 120, 64)),
    # Second image: two positional tuples only, no `blob`.
    moody_jpeg((12, 14, 18), (44, 52, 70)),
]
# The text half of the fixture, sent alongside IMAGES in every run.
FRAGMENT = "couldn't sleep again, the streetlight through the curtain"

# (model id, refined?) pairs in run order: each model is tried with the
# baseline prompt (False) first and the refined prompt (True) second.
COMBOS = [
    (model_id, use_refined)
    for model_id in ("google/gemma-4-e4b", "google/gemma-4-12b")
    for use_refined in (False, True)
]
def run_combo(model: str, refined: bool) -> None:
    """Run one (model, prompt variant) combination and print its result.

    Sets the ``CHALCHITRA_MODEL`` environment variable to *model*, sends the
    module-level fixture (``IMAGES`` + ``FRAGMENT``) to the provider with the
    requested system prompt, then prints the interpretation, the film list
    and the elapsed seconds. The environment variable is left set afterwards.

    Any exception raised by the provider call or while extracting/validating
    its reply is printed as an ``ERROR`` line and the function returns
    without raising; no timing is printed in that case.

    Args:
        model: Model identifier written to ``CHALCHITRA_MODEL``. Only the
            part after the last ``/`` appears in the printed heading.
        refined: Forwarded as ``system_prompt(refined=...)`` and shown in the
            heading as ``refined`` (true) or ``baseline`` (false).
    """
    os.environ["CHALCHITRA_MODEL"] = model

    # One direct call per combo (no auto-retry) so baseline stays baseline.
    # NOTE(review): these imports run after CHALCHITRA_MODEL is set —
    # presumably so the backend sees the override; confirm before hoisting.
    from backend.oracle import MAX_TOKENS, _build_content, _extract_json, _validate
    from backend.prompt import system_prompt
    from backend.providers import get_provider

    label = f"{model.split('/')[-1]} | {'refined' if refined else 'baseline'}"
    print(f"\n{'='*74}\n{label}\n{'-'*74}")

    content = _build_content(IMAGES, FRAGMENT, [])

    # perf_counter is monotonic, so the reported latency cannot be skewed by
    # wall-clock adjustments (NTP, manual changes) the way time.time() can.
    started = time.perf_counter()
    try:
        raw = get_provider().complete(system_prompt(refined=refined), content, MAX_TOKENS)
        result = _validate(_extract_json(raw))
    except Exception as e:  # noqa: BLE001
        # Deliberately broad: report the failure and skip this combo.
        print(f" ERROR: {e}")
        return
    elapsed = time.perf_counter() - started

    print(result["interpretation"])
    print("\nfilms: " + " · ".join(f"{f['title']} ({f['year']})" for f in result["films"]))
    print(f"[{elapsed:.1f}s]")
def main() -> None:
    """Print the fixture fragment, then run every entry of COMBOS in order."""
    print(f"fixture fragment: {FRAGMENT!r}")
    for entry in COMBOS:
        model_id, use_refined = entry
        run_combo(model_id, use_refined)


# Run the comparison only when executed as a script or via ``python -m``.
if __name__ == "__main__":
    main()