"""Quality evaluation: returns a 0-1 score for a synthesized Question. The full pipeline uses an 11-judge panel; this stub provides a fast deterministic heuristic so agents can self-estimate quality before deciding how aggressively to bid. """ from __future__ import annotations import logging from .schemas import QualityScore, Question from .stub_detector import is_stub, stub_reason logger = logging.getLogger(__name__) _PASS_THRESHOLD = 0.7 _MIN_RESOLUTION_LEN = 30 _MIN_QUESTION_LEN = 12 def score_question(question: Question) -> QualityScore: """Cheap heuristic score in [0, 1]. Counts presence of resolution criteria, question length, and a future end-date. Hard-fails (score=0.0, passed=False) when the question text or resolution criteria match a known LLM-glitch stub placeholder. The pre-W14-FIX-STUB heuristic was length-only, so a stub like ``"Resolves YES if the event occurs by the cutoff."`` (52 chars) sailed through. The explicit stub check makes the gate reject those events instead. """ # ----- Stub short-circuit ------------------------------------------- # # If either field is a known placeholder, we know the upstream LLM # call glitched. Return ``score=0.0`` so the downstream pass gate # rejects the event. We do NOT call this "quality" — the rationale # is explicit so operators can tell stubs apart from genuinely poor # translations. if is_stub(question.question_en) or is_stub(question.resolution_criteria): leaked = stub_reason([question.question_en, question.resolution_criteria]) logger.warning( "quality_eval: stub detected in question (event_id=%s); leaked_phrase=%r", question.event_id, leaked, ) return QualityScore( score=0.0, rationale=f"stub_detected: {leaked}", passed=False, ) # ----- Length / shape heuristic ------------------------------------- # score = 0.0 rationale_parts: list[str] = [] if len(question.question_en) >= _MIN_QUESTION_LEN: score += 0.35 else: rationale_parts.append("question too short") if len(question.resolution_criteria) >= _MIN_RESOLUTION_LEN: score += 0.40 else: rationale_parts.append("resolution criteria too short") if question.end_date_iso and "T" in question.end_date_iso: score += 0.25 else: rationale_parts.append("missing/invalid end_date_iso") score = min(1.0, max(0.0, score)) return QualityScore( score=score, rationale="; ".join(rationale_parts) or "all checks passed", passed=score >= _PASS_THRESHOLD, )