"""Phase 3 eval: cross-modal retrieval quality (recall@k, MRR). Design: N held-out (docstring, code) pairs form a closed candidate pool. Each query docstring is ranked against all N code candidates; the paired code is the positive and the other N-1 are distractors. This tests the embedder's ability to bridge natural-language intent → code, without the confound of looking up exact code that is already in the FAISS index. ⚠️ Leakage caveat: CodeSearchNet's func_code_string includes the Python docstring verbatim inside the function body (the triple-quoted string right after `def`). Embedding the raw code therefore lets the embedder trivially find the match via lexical overlap — recall@1 ≈ 0.96 is an artefact, NOT a measure of true code understanding. Call with strip_code_docstrings=True to remove triple-quoted strings and # comments from candidate code before embedding. That number (~0.3-0.5 recall@1) reflects the embedder's actual semantic matching ability. Usage (standalone): python scripts/retrieval_only_eval.py """ from __future__ import annotations import re import sys from pathlib import Path import numpy as np import pandas as pd sys.path.append(str(Path(__file__).resolve().parents[2])) # Matches the first triple-quoted string in a Python function body. _TRIPLE_QUOTE_RE = re.compile(r'"""[\s\S]*?"""|\'\'\'[\s\S]*?\'\'\'') _COMMENT_RE = re.compile(r'#[^\n]*') def _strip_code_docstring(code: str) -> str: """Remove the first triple-quoted docstring and all # comments from Python code.""" code = _TRIPLE_QUOTE_RE.sub('', code, count=1) code = _COMMENT_RE.sub('', code) return code def evaluate_cross_modal( embedder, pairs: pd.DataFrame, k_values: tuple[int, ...] = (1, 5, 10), batch_size: int = 64, strip_code_docstrings: bool = False, ) -> dict: """Cross-modal retrieval eval: docstring queries → code candidates. Args: embedder: SentenceTransformer (or anything with .encode()). pairs: DataFrame with 'docstring' and 'code' columns (N rows). k_values: Recall cut-offs to report. batch_size: Encoding batch size. strip_code_docstrings: If True, remove triple-quoted docstrings and # comments from candidate code before embedding. Use this for a leakage-free signal; see module docstring for why the raw number is inflated. Returns dict with keys mrr, recall@k (for each k), n_pairs, stripped. """ n = len(pairs) candidates = pairs["code"].tolist() if strip_code_docstrings: candidates = [_strip_code_docstring(c) for c in candidates] print(f"[eval] encoding {n} docstrings as queries ...") q_emb = embedder.encode( pairs["docstring"].tolist(), batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True, ).astype("float32") print(f"[eval] encoding {n} code candidates" f"{' (docstrings stripped)' if strip_code_docstrings else ''} ...") c_emb = embedder.encode( candidates, batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True, ).astype("float32") # Cosine similarity matrix (N × N); both sides are already L2-normalised, # so inner product == cosine similarity. sim = q_emb @ c_emb.T # shape (N, N) reciprocal_ranks: list[float] = [] hits: dict[int, int] = {k: 0 for k in k_values} for i in range(n): order = sim[i].argsort()[::-1] rank = int(np.where(order == i)[0][0]) + 1 # 1-indexed reciprocal_ranks.append(1.0 / rank) for k in k_values: if rank <= k: hits[k] += 1 result: dict = { "mrr": round(float(np.mean(reciprocal_ranks)), 4), "n_pairs": n, "stripped": strip_code_docstrings, } for k in k_values: result[f"recall@{k}"] = round(hits[k] / n, 4) return result