File size: 2,843 Bytes
d745844
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""Candidate selection for the Text2SPARQL repair pipeline.

Selects exactly one best candidate using purely symbolic scoring.
No LLM tie-breaker in v1.
"""

from __future__ import annotations

import logging

from .models import CandidateQuery, ValidationResult

logger = logging.getLogger(__name__)


def rank_candidates(
    candidates: list[CandidateQuery],
    validations: list[ValidationResult],
) -> list[tuple[str, float]]:
    """Rank candidates by validation score with tie-breaking.

    Ranking rules (in order):
    1. Higher validation score
    2. Fewer suspicious flags
    3. Fewer query characters (shorter queries preferred)
    4. Lower generation index

    Validation results are matched to candidates by ``candidate_id``, so the
    two lists do not have to be in the same order. A candidate without a
    matching validation result is logged at WARNING level and left out of
    the ranking.

    Args:
        candidates: List of candidate queries.
        validations: List of validation results, keyed by ``candidate_id``.

    Returns:
        List of (candidate_id, score) tuples, sorted best-first. May be
        shorter than ``candidates`` when some have no validation result.
    """
    # Index validations by candidate id; if an id appears more than once,
    # the last validation result for that id is the one used.
    val_map = {v.candidate_id: v for v in validations}

    ranked: list[tuple[str, float, int, int, int]] = []
    for cand in candidates:
        val = val_map.get(cand.candidate_id)
        if val is None:
            logger.warning(
                "No validation result for candidate %s", cand.candidate_id
            )
            continue
        ranked.append((
            cand.candidate_id,
            val.score,
            len(val.suspicious_flags),
            len(cand.query),
            cand.generation_index,
        ))

    # Sort: score DESC (negated), then flags ASC, then length ASC, then index ASC
    ranked.sort(key=lambda x: (-x[1], x[2], x[3], x[4]))

    # Drop the tie-breaker fields; callers only need id and score.
    return [(cid, score) for cid, score, *_ in ranked]


def select_best_candidate(
    candidates: list[CandidateQuery],
    validations: list[ValidationResult],
) -> CandidateQuery:
    """Pick the top-ranked candidate for deeper inspection.

    The candidates are ordered with ``rank_candidates`` and the first entry
    of that ordering is returned. When the ranking comes back empty, the
    first candidate in the input list is returned instead and a warning is
    logged.

    Args:
        candidates: List of candidate queries.
        validations: List of validation results.

    Returns:
        The best CandidateQuery.

    Raises:
        ValueError: If no candidates are available.
    """
    if not candidates:
        raise ValueError("No candidates to select from")

    ordered = rank_candidates(candidates, validations)
    if not ordered:
        # Nothing could be ranked; fall back to the first input candidate.
        logger.warning("Ranking produced no results — returning first candidate")
        return candidates[0]

    # Split the ranking into the winner and whatever follows it.
    (top_id, top_score), *others = ordered

    # Resolve the winning id back to its candidate object (last one wins
    # if the same id occurs more than once).
    by_id = {}
    for candidate in candidates:
        by_id[candidate.candidate_id] = candidate
    winner = by_id[top_id]

    logger.info(
        "Selected candidate %s with score %.2f (out of %d candidates)",
        top_id, top_score, len(candidates),
    )

    if others:
        second_id, second_score = others[0]
        logger.debug("Runner-up: %s with score %.2f", second_id, second_score)

    return winner