Spaces:

MukulRay
/

recon

Sleeping

App Files Files Community

MukulRay commited on Apr 22

Commit

2138877

1 Parent(s): ad651e3

Phase 2.4: add edge reliability scorer — three-signal formula, foundational paper test passes

Browse files

Files changed (1) hide show

src/reliability.py +240 -0

src/reliability.py ADDED Viewed

	@@ -0,0 +1,240 @@

+"""
+Edge Reliability Scoring for RECON v2.
+Computes a three-signal reliability score for each retrieved paper:
+    edge_reliability = (citation_centrality × 0.4)
+                     + (recency_signal       × 0.3)
+                     + (content_coherence    × 0.3)
+Signals:
+- citation_centrality: normalized cited_by_count from OpenAlex (or S2 fallback)
+  High centrality = foundational paper = high reliability regardless of age
+- recency_signal: linear decay max(0, 1 - age/20) — same as RECON v1
+  Now one of three inputs, not the whole score
+- content_coherence: LLM check — does this paper's abstract still represent
+  current scientific understanding? Batched for all papers in one LLM call.
+Dominant signal labels (for explainability in synthesizer output):
+  FOUNDATIONAL: reliability >= 0.70 AND centrality >= 0.6
+  CURRENT:      reliability >= 0.60 AND recency >= 0.7
+  DECLINING:    reliability 0.35–0.60
+  SUPERSEDED:   reliability < 0.35
+"""
+import math
+import logging
+import os
+from dataclasses import dataclass
+from typing import Optional
+import json
+import re
+logger = logging.getLogger(__name__)
+CURRENT_YEAR = 2026
+# Reliability thresholds
+THRESHOLD_FOUNDATIONAL_RELIABILITY = 0.70
+THRESHOLD_FOUNDATIONAL_CENTRALITY = 0.60
+THRESHOLD_CURRENT_RELIABILITY = 0.60
+THRESHOLD_CURRENT_RECENCY = 0.70
+THRESHOLD_DECLINING_LOW = 0.35
+# Signal weights
+W_CENTRALITY = 0.4
+W_RECENCY = 0.3
+W_COHERENCE = 0.3
+@dataclass
+class ReliabilityScore:
+    score: float                  # [0, 1] composite reliability
+    centrality: float             # normalized citation centrality
+    recency: float                # linear decay recency signal
+    coherence: float              # LLM content coherence [0, 1]
+    dominant_signal: str          # FOUNDATIONAL / CURRENT / DECLINING / SUPERSEDED
+    reason: str                   # one-line human-readable explanation
+def _compute_centrality(citation_count: int, doi: str = "") -> float:
+    """
+    Normalized citation centrality.
+    Uses OpenAlex cited_by_count if DOI available, else falls back to S2 count.
+    Formula: min(1.0, log1p(count) / log1p(10000))
+    """
+    from src.openalex_utils import get_citation_centrality
+    return get_citation_centrality(doi=doi, citation_count=citation_count)
+def _compute_recency(year: Optional[int]) -> float:
+    """Linear decay: max(0, 1 - age/20). Age 0 = 1.0, age 20+ = 0.0."""
+    if not year or year <= 0:
+        return 0.0
+    age = CURRENT_YEAR - year
+    return max(0.0, 1.0 - age / 20.0)
+def _compute_coherence_batch(papers: list, query: str) -> list[float]:
+    """
+    LLM batch coherence check for all papers at once.
+    For each paper, asks: does this paper's abstract still represent
+    current scientific understanding on this topic?
+    Returns a list of float scores [0, 1] in the same order as input papers.
+    Falls back to recency-based heuristic if LLM call fails.
+    Batched: one LLM call for all papers, not one per paper.
+    """
+    if not papers:
+        return []
+    # Build batch prompt
+    paper_summaries = []
+    for i, p in enumerate(papers):
+        abstract_snippet = (p.abstract or "")[:300]
+        paper_summaries.append(
+            f"Paper {i+1}: [{p.year}] {p.title}\n"
+            f"Abstract: {abstract_snippet}"
+        )
+    papers_text = "\n\n".join(paper_summaries)
+    system_prompt = """You are a scientific literature analyst assessing whether papers represent current scientific understanding.
+For each paper provided, assign a content_coherence score from 0.0 to 1.0:
+- 1.0: Paper's central claims are still the consensus view, no major challenges
+- 0.7: Paper is foundational and still cited, but some aspects have been refined
+- 0.5: Paper's claims are actively debated; newer work challenges some findings
+- 0.3: Paper's central claims have been substantially superseded by newer work
+- 0.1: Paper is clearly outdated; its claims contradict current consensus
+Respond ONLY with a JSON array of objects, one per paper, in the same order:
+[{"paper_index": 1, "coherence": 0.8, "reason": "one sentence"}, ...]
+Be concise. No other text."""
+    user_prompt = f"""Research query context: {query[:200]}
+Papers to assess:
+{papers_text}
+Return ONLY the JSON array."""
+    try:
+        from langchain_groq import ChatGroq
+        from langchain_core.messages import SystemMessage, HumanMessage
+        llm = ChatGroq(
+            model="llama-3.3-70b-versatile",
+            temperature=0.1,
+            api_key=os.environ.get("GROQ_API_KEY"),
+        )
+        response = llm.invoke([
+            SystemMessage(content=system_prompt),
+            HumanMessage(content=user_prompt),
+        ])
+        raw = response.content.strip()
+        # Extract JSON array
+        match = re.search(r"\[.*\]", raw, re.DOTALL)
+        if match:
+            data = json.loads(match.group())
+            scores = [0.5] * len(papers)  # default
+            for item in data:
+                idx = int(item.get("paper_index", 0)) - 1  # 1-indexed in prompt
+                if 0 <= idx < len(papers):
+                    scores[idx] = float(item.get("coherence", 0.5))
+            return scores
+    except Exception as e:
+        logger.warning(f"Coherence batch LLM call failed: {e}")
+    # Fallback: use recency as coherence proxy
+    return [_compute_recency(p.year) for p in papers]
+def _dominant_signal(score: float, centrality: float, recency: float) -> str:
+    """Classify dominant signal for explainability."""
+    if score >= THRESHOLD_FOUNDATIONAL_RELIABILITY and centrality >= THRESHOLD_FOUNDATIONAL_CENTRALITY:
+        return "FOUNDATIONAL"
+    elif score >= THRESHOLD_CURRENT_RELIABILITY and recency >= THRESHOLD_CURRENT_RECENCY:
+        return "CURRENT"
+    elif score >= THRESHOLD_DECLINING_LOW:
+        return "DECLINING"
+    else:
+        return "SUPERSEDED"
+def _build_reason(dominant: str, centrality: float, recency: float,
+                  coherence: float, year: Optional[int]) -> str:
+    """One-line reason string for the trust summary."""
+    age = (CURRENT_YEAR - year) if year else None
+    age_str = f"{age}yr old" if age is not None else "unknown age"
+    if dominant == "FOUNDATIONAL":
+        return f"High citation centrality ({centrality:.2f}), {age_str} — foundational work still current"
+    elif dominant == "CURRENT":
+        return f"Recent ({age_str}), coherence={coherence:.2f} — aligns with current consensus"
+    elif dominant == "DECLINING":
+        return f"Mixed signals: centrality={centrality:.2f}, recency={recency:.2f}, coherence={coherence:.2f}"
+    else:
+        return f"Low reliability: {age_str}, centrality={centrality:.2f}, coherence={coherence:.2f} — likely superseded"
+def score_papers(papers: list, query: str, use_llm: bool = True) -> dict[str, ReliabilityScore]:
+    """
+    Main entry point. Scores all papers and returns a dict of paper_id -> ReliabilityScore.
+    Args:
+        papers: list of Paper objects
+        query: the original research query (for coherence context)
+        use_llm: if False, skips coherence LLM call (uses recency as fallback)
+                 Set False during eval to save Groq API calls.
+    Returns:
+        dict mapping paper_id -> ReliabilityScore
+    """
+    if not papers:
+        return {}
+    # Step 1: Centrality (OpenAlex DOI lookup if available, else S2 count)
+    centralities = []
+    for p in papers:
+        c = _compute_centrality(
+            citation_count=getattr(p, "citation_count", 0) or 0,
+            doi=getattr(p, "doi", "") or "",
+        )
+        centralities.append(c)
+    # Step 2: Recency
+    recencies = [_compute_recency(getattr(p, "year", None)) for p in papers]
+    # Step 3: Coherence (batched LLM call)
+    if use_llm:
+        coherences = _compute_coherence_batch(papers, query)
+    else:
+        coherences = [_compute_recency(getattr(p, "year", None)) for p in papers]
+    # Step 4: Composite score and labeling
+    results = {}
+    for i, p in enumerate(papers):
+        c = centralities[i]
+        r = recencies[i]
+        co = coherences[i] if i < len(coherences) else r
+        score = W_CENTRALITY * c + W_RECENCY * r + W_COHERENCE * co
+        dominant = _dominant_signal(score, c, r)
+        reason = _build_reason(dominant, c, r, co, getattr(p, "year", None))
+        results[p.paper_id] = ReliabilityScore(
+            score=round(score, 4),
+            centrality=round(c, 4),
+            recency=round(r, 4),
+            coherence=round(co, 4),
+            dominant_signal=dominant,
+            reason=reason,
+        )
+    return results