polyglot-alpha / polyglot_alpha /quality_eval.py
licaomeng
deploy: main@8970ffb → HF Spaces (2026-05-27T05:19Z)
88d2f2a
"""Quality evaluation: returns a 0-1 score for a synthesized Question.
The full pipeline uses an 11-judge panel; this stub provides a fast
deterministic heuristic so agents can self-estimate quality before
deciding how aggressively to bid.
"""
from __future__ import annotations
import logging
from .schemas import QualityScore, Question
from .stub_detector import is_stub, stub_reason
# Module-level logger; score_question() uses it to warn when a stub is detected.
logger = logging.getLogger(__name__)
# Minimum score for a question to count as passed (compared with ``>=``).
_PASS_THRESHOLD = 0.7
# Minimum character count of ``resolution_criteria`` to earn its score share.
_MIN_RESOLUTION_LEN = 30
# Minimum character count of ``question_en`` to earn its score share.
_MIN_QUESTION_LEN = 12
def score_question(question: Question) -> QualityScore:
    """Return a cheap heuristic quality score in [0, 1] for *question*.

    Three independent checks each contribute a fixed share of the score:

    * ``question_en`` has at least ``_MIN_QUESTION_LEN`` characters (0.35)
    * ``resolution_criteria`` has at least ``_MIN_RESOLUTION_LEN``
      characters (0.40)
    * ``end_date_iso`` is non-empty and contains a ``"T"`` (0.25); the
      value is not parsed or compared against the current time.

    ``passed`` is True when the total reaches ``_PASS_THRESHOLD``. The
    rationale lists every failed check, or ``"all checks passed"``.

    Hard-fails (score=0.0, passed=False) when ``is_stub`` flags either the
    question text or the resolution criteria. The pre-W14-FIX-STUB
    heuristic was length-only, so a stub such as
    ``"Resolves YES if the event occurs by the cutoff."`` cleared the
    resolution-length minimum and sailed through; the explicit stub check
    rejects those events instead.
    """
    question_text = question.question_en
    criteria_text = question.resolution_criteria

    # Stub short-circuit: a flagged field is reported with score 0.0 and a
    # ``stub_detected`` rationale, so operators can tell stubs apart from
    # genuinely poor translations.
    if any(is_stub(text) for text in (question_text, criteria_text)):
        leaked = stub_reason([question_text, criteria_text])
        logger.warning(
            "quality_eval: stub detected in question (event_id=%s); leaked_phrase=%r",
            question.event_id,
            leaked,
        )
        return QualityScore(
            score=0.0,
            rationale=f"stub_detected: {leaked}",
            passed=False,
        )

    # Length / shape heuristic, expressed as (check ok?, weight, complaint).
    end_date = question.end_date_iso
    checks = (
        (
            len(question_text) >= _MIN_QUESTION_LEN,
            0.35,
            "question too short",
        ),
        (
            len(criteria_text) >= _MIN_RESOLUTION_LEN,
            0.40,
            "resolution criteria too short",
        ),
        (
            bool(end_date) and "T" in end_date,
            0.25,
            "missing/invalid end_date_iso",
        ),
    )

    total = 0.0
    complaints = []
    for ok, weight, complaint in checks:
        if ok:
            # Plain += in declaration order keeps the float sum stable.
            total += weight
        else:
            complaints.append(complaint)

    # Clamp defensively to the documented [0, 1] range.
    total = min(1.0, max(0.0, total))
    rationale = "; ".join(complaints) if complaints else "all checks passed"
    return QualityScore(
        score=total,
        rationale=rationale,
        passed=total >= _PASS_THRESHOLD,
    )