Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| """Candidate selection for the Text2SPARQL repair pipeline. | |
| Selects exactly one best candidate using purely symbolic scoring. | |
| No LLM tie-breaker in v1. | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| from .models import CandidateQuery, ValidationResult | |
| logger = logging.getLogger(__name__) | |
def rank_candidates(
    candidates: list[CandidateQuery],
    validations: list[ValidationResult],
) -> list[tuple[str, float]]:
    """Rank candidates by validation score with deterministic tie-breaking.

    Ranking rules (in order):
        1. Higher validation score
        2. Fewer suspicious flags
        3. Fewer query characters (shorter queries preferred)
        4. Lower generation index

    Validation results are matched to candidates by ``candidate_id``, not by
    list position. A candidate with no matching validation result is logged
    at WARNING level and omitted from the returned ranking.

    Args:
        candidates: List of candidate queries.
        validations: List of validation results, looked up by
            ``candidate_id``.

    Returns:
        List of (candidate_id, score) tuples, sorted best-first.
    """
    # Index validations by candidate id so each candidate is a single lookup.
    val_map = {v.candidate_id: v for v in validations}

    ranked: list[tuple[str, float, int, int, int]] = []
    for cand in candidates:
        val = val_map.get(cand.candidate_id)
        if val is None:
            logger.warning(
                "No validation result for candidate %s", cand.candidate_id
            )
            continue
        ranked.append((
            cand.candidate_id,
            val.score,
            len(val.suspicious_flags),
            len(cand.query),
            cand.generation_index,
        ))

    # Sort: score DESC (negated), then flags ASC, then length ASC, then index ASC
    ranked.sort(key=lambda x: (-x[1], x[2], x[3], x[4]))

    # Only the id and score are exposed; the tie-break fields stay internal.
    return [(cid, score) for cid, score, *_ in ranked]
def select_best_candidate(
    candidates: list[CandidateQuery],
    validations: list[ValidationResult],
) -> CandidateQuery:
    """Pick the top-ranked candidate for deeper inspection.

    Candidates are ordered with ``rank_candidates`` and the first entry of
    that ordering is returned. When the ranking comes back empty, the first
    candidate in the input list is returned instead and a warning is logged.

    Args:
        candidates: List of candidate queries.
        validations: List of validation results.

    Returns:
        The best CandidateQuery.

    Raises:
        ValueError: If no candidates are available.
    """
    if not candidates:
        raise ValueError("No candidates to select from")

    ordered = rank_candidates(candidates, validations)
    if not ordered:
        # Nothing could be ranked; fall back to the first input candidate.
        logger.warning("Ranking produced no results — returning first candidate")
        return candidates[0]

    winner_id, winner_score = ordered[0]

    # Map ids back to candidate objects; a later duplicate id overwrites an
    # earlier one, exactly as a dict comprehension would.
    by_id = {}
    for candidate in candidates:
        by_id[candidate.candidate_id] = candidate
    winner = by_id[winner_id]

    logger.info(
        "Selected candidate %s with score %.2f (out of %d candidates)",
        winner_id, winner_score, len(candidates),
    )

    if len(ordered) > 1:
        runner_id, runner_score = ordered[1]
        logger.debug(
            "Runner-up: %s with score %.2f",
            runner_id, runner_score,
        )

    return winner