| """ |
| Generation: call Claude with retrieved chunks, force citations, validate. |
| |
| The retrieval layer (api/hybrid.py) returns either a ranked list of |
| RerankedHit objects or `[]` when the best chunk falls below the refusal |
| threshold. We mirror that contract: |
| |
| - Empty hits → return the canonical refusal string without an API call. |
| Saves money and guarantees identical refusal behavior across paths. |
| - Non-empty → ask Claude to answer from those chunks only, with |
| `[chunk_id]` citations after every factual claim. Post-generation we |
| parse the cited IDs and flag any that don't appear in the retrieved |
| set — that's our hallucination tripwire. |
| |
| Polarity handling lives in the system prompt as defense-in-depth on top of |
| the retrieval-time NegEx filter (`api/negation.py`): if a denied/negated |
| chunk somehow survives RRF + rerank + NegEx, the model is still instructed |
| not to cite it as positive evidence. |
| |
| Single-turn for now; Phase 4 wraps this in a FastAPI endpoint with audit |
| logging. Phase 6 will call it many times from the eval harness — the |
| system prompt is well below Haiku 4.5's 4096-token cache minimum so |
| prompt caching isn't worth wiring up here. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import os |
| import re |
| import time |
| from dataclasses import dataclass |
|
|
| import anthropic |
|
|
| from .hybrid import RerankedHit |
|
|
# Canonical refusal sentence. generate() returns it verbatim when it is given
# no hits, and SYSTEM_PROMPT instructs the model to emit this same sentence.
REFUSAL_STRING = "The provided notes do not contain information to answer this."


# Model name used when neither the `model` argument nor the ANTHROPIC_MODEL
# environment variable supplies one.
DEFAULT_MODEL = "claude-haiku-4-5"
# Default cap on the number of tokens generated for a single answer.
DEFAULT_MAX_TOKENS = 2048


# Captures the digits inside a single-id bracket citation such as "[42]".
_CITATION_RE = re.compile(r"\[(\d+)\]")
|
|
| SYSTEM_PROMPT = """You are a clinical reference assistant for a portfolio RAG demo. |
| You answer questions strictly from the numbered context chunks the user provides — not |
| from outside knowledge — but you may reason about hypothetical scenarios by applying |
| the definitions and criteria in the chunks to the facts the user describes. |
| |
| RULES (follow exactly, in order): |
| |
| 1. Ground every factual claim in the chunks. Definitions, criteria, symptom lists, |
| prevalence figures, treatment options, and any clinical fact you state must be |
| followed by a citation in square brackets giving the chunk id, e.g. |
| "Generalised anxiety disorder is characterised by marked symptoms of anxiety [42]." |
| If multiple chunks support one claim, cite all of them: "[42][57]". Do not import |
| facts from training knowledge that are not supported by a chunk. |
| |
| 2. Hypothetical scenarios are allowed. When the user describes a hypothetical patient |
| (e.g. "a patient presents with X, Y, and Z — what could this be?"), match the |
| described features against the criteria and descriptions in the chunks, and |
| report the conditions whose criteria are consistent with that presentation. |
| Frame the answer as pattern-matching, not diagnosis. Phrases like "these features |
| are consistent with X per [N]", "this presentation could meet criteria for Y [M]", |
| or "the described symptoms overlap with the criteria for Z [K]" are appropriate. |
| Do NOT write "the patient has" or "the diagnosis is" — the user has not supplied |
| a real patient, and this is not a clinical consultation. |
| |
| 3. Differential-style questions. When asked for a differential or "what else could |
| this be?", list every candidate condition supported by the chunks, with the |
| defining feature(s) that would distinguish each, cited. |
| |
| 4. Polarity check before citing. If a chunk states a patient does NOT have, denies, |
| or has no history of a condition, do NOT cite it as evidence FOR that condition. |
| "Negative for X", "ruled out X", "without X", and "denies X" are evidence about |
| absence, not presence. |
| |
| 5. Refuse only when the chunks genuinely do not cover the topic at all. Respond with |
| EXACTLY this string and nothing else: |
| "The provided notes do not contain information to answer this." |
| Use the refusal when no chunk addresses the question domain, or when the chunks |
| discuss only tangential topics. Do NOT refuse merely because the chunks lack the |
| exact phrasing of the user's question — if the chunks contain the criteria or |
| features the question is about, answer with what the chunks support. |
| |
| 6. Hedge where the chunks are thin. If only one or two chunks marginally address the |
| question, say so briefly ("the retrieved material only partially covers this") |
| and give the partial answer with citations, rather than refusing. |
| |
| Output format: prose, complete sentences, citations inline. Match response length |
| to the question. Simple factual questions get a short answer (2-4 sentences). When |
| the user asks for criteria, definitions, a full description, a differential, a |
| symptom list, treatment options, or poses a hypothetical that calls for working |
| through multiple possibilities, give the full answer the chunks support — do not |
| truncate. Use short paragraphs or bulleted lists when that makes long answers |
| easier to scan. Every claim still needs a citation.""" |
|
|
|
|
@dataclass(frozen=True)
class Generation:
    """Result of one generate() call: the answer text plus its citation audit."""

    # Answer text from the model, or REFUSAL_STRING on the refusal path.
    answer: str
    # Chunk ids cited in `answer`, de-duplicated, in order of first appearance.
    cited_ids: list[int]
    # Subset of `cited_ids` that were not among the retrieved chunks.
    invalid_cited_ids: list[int]
    # True when the answer is the canonical refusal (no hits, or model refused).
    refused: bool
    # Model name the request was (or would have been) sent to.
    model: str
    # Wall-clock time of the API call in milliseconds; 0.0 when no call was made.
    latency_ms: float
|
|
|
|
# One bracketed citation group. Matches the instructed form "[42]" and also the
# grouped forms "[42, 57]" / "[42; 57]" that models sometimes emit instead of
# "[42][57]"; without this, grouped ids would bypass the invalid-id check.
_CITATION_GROUP_RE = re.compile(r"\[\s*(\d+(?:\s*[,;]\s*\d+)*)\s*\]")

# Quote characters that may wrap the refusal sentence. SYSTEM_PROMPT shows the
# sentence inside double quotes, so the model can echo them back.
_REFUSAL_WRAPPERS = "\"'`\u201c\u201d\u2018\u2019"


def _extract_cited_ids(answer: str) -> list[int]:
    """Return the chunk ids cited in *answer*, unique, in first-seen order."""
    ids = (
        int(number)
        for group in _CITATION_GROUP_RE.findall(answer)
        for number in re.findall(r"\d+", group)
    )
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(ids))


def _is_refusal(answer: str) -> bool:
    """Return True if *answer* is the refusal sentence, ignoring wrapping quotes."""
    return answer.strip().strip(_REFUSAL_WRAPPERS).strip() == REFUSAL_STRING


def generate(
    query: str,
    reranked_hits: list[RerankedHit],
    *,
    model: str | None = None,
    max_tokens: int = DEFAULT_MAX_TOKENS,
) -> Generation:
    """Call Claude with retrieved chunks; return answer + citation audit.

    Args:
        query: The user's question, placed after the context chunks.
        reranked_hits: Retrieved chunks to answer from. An empty list
            short-circuits to the refusal path without an API call.
        model: Model name. Falls back to the ANTHROPIC_MODEL environment
            variable, then to DEFAULT_MODEL; an empty value at either level
            is treated as unset.
        max_tokens: Upper bound on generated tokens, passed to the API as-is.

    Returns:
        A Generation. `refused` is True when we short-circuited or when the
        model answered with the refusal sentence (optionally wrapped in
        quotes); in both cases `answer` is exactly REFUSAL_STRING.
        `cited_ids` lists the ids found in bracket citations, including
        grouped ones such as "[42, 57]", and `invalid_cited_ids` lists those
        that were not among `reranked_hits`.

    Raises:
        Nothing is caught here: any exception raised by the Anthropic client
        propagates to the caller.
    """
    # `or` (not a .get default) so an empty ANTHROPIC_MODEL does not yield "".
    model = model or os.environ.get("ANTHROPIC_MODEL") or DEFAULT_MODEL

    if not reranked_hits:
        return Generation(
            answer=REFUSAL_STRING,
            cited_ids=[],
            invalid_cited_ids=[],
            refused=True,
            model=model,
            latency_ms=0.0,
        )

    user_msg = _build_user_message(query, reranked_hits)
    client = anthropic.Anthropic()

    # Time only the API round trip, not prompt assembly or post-processing.
    t0 = time.perf_counter()
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        system=SYSTEM_PROMPT,
        messages=[{"role": "user", "content": user_msg}],
    )
    latency_ms = (time.perf_counter() - t0) * 1000

    # Concatenate text blocks only; other content block types are ignored.
    raw_answer = "".join(
        part.text for part in response.content if part.type == "text"
    ).strip()

    # Canonicalise a model-issued refusal so both refusal paths return the
    # identical string.
    refused = _is_refusal(raw_answer)
    answer = REFUSAL_STRING if refused else raw_answer

    # Hallucination tripwire: any cited id that was not retrieved is flagged.
    # NOTE(review): assumes chunk_id values are ints, since cited ids are
    # parsed with int() before the membership test — confirm against hybrid.py.
    retrieved_ids = {ranked.hit.chunk_id for ranked in reranked_hits}
    cited_unique = _extract_cited_ids(answer)
    invalid = [cid for cid in cited_unique if cid not in retrieved_ids]

    return Generation(
        answer=answer,
        cited_ids=cited_unique,
        invalid_cited_ids=invalid,
        refused=refused,
        model=model,
        latency_ms=latency_ms,
    )
|
|
|
|
def _build_user_message(query: str, hits: list[RerankedHit]) -> str:
    """Render the retrieved chunks and the question as one user-turn string.

    Each hit becomes a block of the form::

        [<chunk_id>] (<source_type> / <section> / <title>)
        <chunk_text>

    Blocks are separated by a blank line, preceded by a "CONTEXT CHUNKS:"
    header and followed by "QUESTION: <query>".

    Args:
        query: The user's question, inserted verbatim.
        hits: Retrieved chunks, rendered in the order given.

    Returns:
        The complete user message text.
    """
    blocks = []
    for ranked in hits:
        hit = ranked.hit
        # Join only the provenance parts that are present, so a missing or
        # empty source_type neither raises nor leaves a dangling " / ".
        provenance = " / ".join(
            str(part)
            for part in (hit.source_type, hit.section, hit.title)
            if part
        )
        blocks.append(f"[{hit.chunk_id}] ({provenance})\n{hit.chunk_text}")
    chunks_text = "\n\n".join(blocks)
    return f"CONTEXT CHUNKS:\n\n{chunks_text}\n\nQUESTION: {query}"
|
|