johnnydang88 committed
Commit 4d7bb7b · verified · 1 Parent(s): 3c6a8af

Update app.py

Files changed (1): app.py (+295 -31)
app.py CHANGED
@@ -1,17 +1,25 @@
"""
Cardiology AI Assistant — Alibaba Qwen3-4B-Instruct
Hugging Face ZeroGPU Space
"""

- import os, gc, torch, warnings, pdfplumber
import spaces
- from typing import List
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings
- from sentence_transformers import CrossEncoder
import gradio as gr

warnings.filterwarnings("ignore")
@@ -61,9 +69,6 @@ class MedCPTEmbeddings(Embeddings):

# ══════════════════════════════════════════════════════════════════════════════
# STARTUP
- # FIX 1: Increased chunk_size 512→1024 and overlap 64→128
- # Smaller chunks were splitting multi-point framework definitions (e.g. AF-CARE pillars)
- # across chunk boundaries, making them unretrievable as a unit.
# ══════════════════════════════════════════════════════════════════════════════
print("📂 Loading PDF with pdfplumber...", flush=True)
docs = []
@@ -97,6 +102,12 @@ print("✅ Vector store ready.", flush=True)
print("⚖️ Loading CrossEncoder (CPU)...", flush=True)
reranker = CrossEncoder("BAAI/bge-reranker-base", device="cpu")

print("🚀 Loading Qwen3-4B in float16 (CPU)...", flush=True)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, token=HF_TOKEN, trust_remote_code=True
@@ -111,9 +122,7 @@ model.eval()
print("✅ Qwen3 ready (CPU). GPU borrowed per request via ZeroGPU.", flush=True)

# ══════════════════════════════════════════════════════════════════════════════
- # FIX 2: MULTI-QUERY EXPANSION
- # A single embedding query may miss chunks that use different surface forms.
- # We expand to multiple sub-queries and merge unique results before reranking.
# ══════════════════════════════════════════════════════════════════════════════
QUERY_EXPANSIONS = {
    "AF-CARE": [
@@ -134,7 +143,6 @@ QUERY_EXPANSIONS = {
}

def expand_query(query: str) -> List[str]:
-     """Return a list of sub-queries for retrieval. Falls back to original query."""
    q_lower = query.lower()
    for keyword, expansions in QUERY_EXPANSIONS.items():
        if keyword.lower() in q_lower:
@@ -142,7 +150,6 @@ def expand_query(query: str) -> List[str]:
    return [query]

def retrieve_with_expansion(query: str, k_per_query: int = 10) -> List[Document]:
-     """Run similarity search for each expanded query, deduplicate by page_content."""
    sub_queries = expand_query(query)
    seen, merged = set(), []
    for sq in sub_queries:
@@ -153,13 +160,197 @@ def retrieve_with_expansion(query: str, k_per_query: int = 10) -> List[Document]
            merged.append(doc)
    return merged

- # ══════════════════════════════════════════════════════════════════════════════
- # CPU RERANKER
- # ══════════════════════════════════════════════════════════════════════════════
def rerank_docs(query: str, docs):
    scores = reranker.predict([[query, d.page_content] for d in docs])
    return scores
# ══════════════════════════════════════════════════════════════════════════════
# GPU FUNCTION
# ══════════════════════════════════════════════════════════════════════════════
@@ -167,7 +358,7 @@ def rerank_docs(query: str, docs):
def llm_generate(messages: list) -> str:
    print("🔥 GPU acquired, running generation...", flush=True)
    model.to("cuda")
-     text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    with torch.no_grad():
        generated_ids = model.generate(
@@ -188,10 +379,6 @@ def llm_generate(messages: list) -> str:

# ══════════════════════════════════════════════════════════════════════════════
# RAG PIPELINE
- # FIX 3: top 8 reranked docs (was 4) — richer context for list-heavy answers
- # FIX 4: Stronger system prompt — prevents model from saying info is missing
- # when it IS in context; instructs it to enumerate list-type answers fully.
- # Page numbers now included in context blocks for accurate citation.
# ══════════════════════════════════════════════════════════════════════════════
SYSTEM_PROMPT = (
    "You are a medical expert assistant specialising in cardiology. "
@@ -204,10 +391,18 @@ SYSTEM_PROMPT = (
)

def rag_query_stream(query: str):
-     yield "⏳ **Status:** 🔍 Retrieving relevant documents (multi-query expansion)...\n\n---\n"
    candidates = retrieve_with_expansion(query, k_per_query=10)

-     yield "⏳ **Status:** 📊 Reranking with CrossEncoder (CPU)...\n\n---\n"
    scores = rerank_docs(query, candidates)
    ranked = sorted(zip(scores, candidates), key=lambda x: x[0], reverse=True)
    top_docs = [doc for _, doc in ranked[:8]]
@@ -217,7 +412,11 @@ def rag_query_stream(query: str):
    )
    pages = ", ".join(str(d.metadata.get("page", "?")) for d in top_docs)

-     yield "⏳ **Status:** 🧠 Generating with Qwen3 (ZeroGPU H200)...\n\n---\n"
    messages = [
        {
            "role": "system",
@@ -225,15 +424,25 @@ def rag_query_stream(query: str):
        },
        {"role": "user", "content": query},
    ]
-     answer = llm_generate(messages)
-     yield f"### 🌌 Answer\n\n{answer}\n\n📄 **Source Pages:** {pages}\n"

# ══════════════════════════════════════════════════════════════════════════════
# GRADIO UI
# ══════════════════════════════════════════════════════════════════════════════
def gradio_wrapper(query):
    if not query or not query.strip():
-         yield "⚠️ Please enter a valid question."
        return
    yield from rag_query_stream(query)

@@ -246,22 +455,29 @@ qwen_theme = gr.themes.Soft(
    button_primary_background_fill_hover="*primary_700",
)

- with gr.Blocks(theme=qwen_theme) as demo:
    gr.Markdown("# 🌌 Cardiology AI Assistant (ESC 2024)")
    gr.Markdown("### ⚡ Powered by Alibaba Qwen3-4B · ZeroGPU H200")
    gr.Markdown(
        "Ask questions based on the **2024 ESC Medical Guidelines**. "
-         "Uses RAG with MedCPT embeddings, multi-query expansion, CrossEncoder reranking, and Qwen3-4B generation."
    )
    with gr.Row():
-         with gr.Column():
            input_text = gr.Textbox(
                label="Your Clinical Question",
                placeholder="e.g., What are the four treatment pillars of AF-CARE?",
                lines=3,
            )
-             submit_btn = gr.Button("Analyze Guidelines", variant="primary")
-             output_text = gr.Markdown(label="Assistant Response")
    gr.Examples(
        examples=[
            "What are the four treatment pillars of the AF-CARE framework?",
@@ -270,7 +486,55 @@ with gr.Blocks(theme=qwen_theme) as demo:
            "What is the target LDL-C for very high-risk patients?",
        ],
        inputs=input_text,
    )
-     submit_btn.click(gradio_wrapper, inputs=input_text, outputs=output_text)

demo.queue().launch(server_name="0.0.0.0", server_port=7860)
 
"""
Cardiology AI Assistant — Alibaba Qwen3-4B-Instruct
Hugging Face ZeroGPU Space
+ Includes: BERTScore F1, ROUGE-N, Semantic Similarity, Faithfulness, Answer Relevance, Context Recall
+ Same metric stack as the Llama-3 and Phi-3 versions — all fixes applied:
+ • SentenceTransformer forced to CPU (prevents stale CUDA zero-vector bug)
+ • ROUGE uses precision (overlap / answer_ngrams), not recall vs huge context
+ • Context capped at 60 sentences before embedding (prevents OOM)
+ • Per-metric try/except so one failure never kills the whole panel
"""

+ import os, gc, re, torch, warnings, pdfplumber
+ import numpy as np
import spaces
+ from collections import Counter
+ from typing import List, Dict
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings
+ from sentence_transformers import CrossEncoder, SentenceTransformer
import gradio as gr

warnings.filterwarnings("ignore")
 
# ══════════════════════════════════════════════════════════════════════════════
# STARTUP
# ══════════════════════════════════════════════════════════════════════════════
print("📂 Loading PDF with pdfplumber...", flush=True)
docs = []
 
print("⚖️ Loading CrossEncoder (CPU)...", flush=True)
reranker = CrossEncoder("BAAI/bge-reranker-base", device="cpu")

+ # Explicitly load on CPU — after ZeroGPU releases the GPU, auto-device detection
+ # can latch onto a stale CUDA context and silently return zero vectors.
+ print("📏 Loading metrics SentenceTransformer (CPU)...", flush=True)
+ metrics_st = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
+ print("✅ Metrics encoder ready.", flush=True)
+
print("🚀 Loading Qwen3-4B in float16 (CPU)...", flush=True)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, token=HF_TOKEN, trust_remote_code=True

print("✅ Qwen3 ready (CPU). GPU borrowed per request via ZeroGPU.", flush=True)

# ══════════════════════════════════════════════════════════════════════════════
+ # MULTI-QUERY EXPANSION
# ══════════════════════════════════════════════════════════════════════════════
QUERY_EXPANSIONS = {
    "AF-CARE": [

}

def expand_query(query: str) -> List[str]:
    q_lower = query.lower()
    for keyword, expansions in QUERY_EXPANSIONS.items():
        if keyword.lower() in q_lower:

    return [query]

def retrieve_with_expansion(query: str, k_per_query: int = 10) -> List[Document]:
    sub_queries = expand_query(query)
    seen, merged = set(), []
    for sq in sub_queries:

            merged.append(doc)
    return merged

def rerank_docs(query: str, docs):
    scores = reranker.predict([[query, d.page_content] for d in docs])
    return scores
 
+ # ══════════════════════════════════════════════════════════════════════════════
+ # EVALUATION METRICS
+ # All reference-free — uses retrieved context + query as the reference signal.
+ # Identical implementation to the Llama-3 and Phi-3 versions for consistency.
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ def _sent_tokenize(text: str) -> List[str]:
+     """Lightweight sentence splitter — no NLTK required."""
+     sents = re.split(r'(?<=[.!?])\s+', text.strip())
+     return [s.strip() for s in sents if len(s.strip()) > 10]
+
+ def _encode(texts: List[str]) -> np.ndarray:
+     """
+     Encode on CPU explicitly.
+     After ZeroGPU releases the GPU, SentenceTransformer's auto-device detection
+     can latch onto a stale CUDA context and return zero vectors.
+     Forcing CPU guarantees correct, non-zero embeddings every time.
+     """
+     return metrics_st.encode(
+         texts,
+         normalize_embeddings=True,
+         show_progress_bar=False,
+         device="cpu",
+         convert_to_numpy=True,
+     )
+
+ def _ngrams(tokens: List[str], n: int) -> Counter:
+     return Counter(tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1))
+
+ def rouge_n(hypothesis: str, reference: str, n: int = 1) -> float:
+     """
+     ROUGE-N precision: fraction of answer n-grams that appear in the context.
+     Using precision (not recall) because the context is ~6,000+ tokens — recall
+     of a ~60-token answer against that pool is always ~4% even for correct answers.
+     """
+     hyp_tokens = hypothesis.lower().split()
+     ref_tokens = reference.lower().split()
+     hyp_ng = _ngrams(hyp_tokens, n)
+     ref_ng = _ngrams(ref_tokens, n)
+     overlap = sum((hyp_ng & ref_ng).values())
+     denom = sum(hyp_ng.values())  # precision: denominator = answer n-grams
+     return round(overlap / denom, 4) if denom > 0 else 0.0
+
+ def bertscore_f1(answer: str, context_sents: List[str]) -> float:
+     """
+     Approximate BERTScore F1 via sentence-level embeddings.
+     P = mean max-cosine(answer_sent → any context_sent)
+     R = mean max-cosine(context_sent → any answer_sent)
+     F1 = harmonic mean(P, R)
+     Uses pre-tokenised, capped context sentences to avoid encoding 100+ sentences.
+     """
+     ans_sents = _sent_tokenize(answer)
+     if not ans_sents or not context_sents:
+         return 0.0
+     try:
+         a_embs = _encode(ans_sents)
+         c_embs = _encode(context_sents)
+         sim = a_embs @ c_embs.T
+         P = float(sim.max(axis=1).mean())
+         R = float(sim.max(axis=0).mean())
+         f1 = 2 * P * R / (P + R + 1e-9)
+         return round(max(f1, 0.0), 4)
+     except Exception as e:
+         print(f"⚠️ bertscore_f1 error: {e}", flush=True)
+         return 0.0
+
+ def semantic_similarity(answer: str, query: str) -> float:
+     """Cosine similarity between answer embedding and query embedding."""
+     try:
+         embs = _encode([answer, query])
+         score = float(embs[0] @ embs[1])
+         return round(max(score, 0.0), 4)
+     except Exception as e:
+         print(f"⚠️ semantic_similarity error: {e}", flush=True)
+         return 0.0
+
+ def faithfulness(answer: str, context_sents: List[str], threshold: float = 0.35) -> float:
+     """
+     Fraction of answer sentences whose max cosine-sim to any context sentence ≥ threshold.
+     Threshold = 0.35 (not 0.40) so paraphrased but grounded sentences are counted.
+     """
+     ans_sents = _sent_tokenize(answer)
+     if not ans_sents or not context_sents:
+         return 0.0
+     try:
+         a_embs = _encode(ans_sents)
+         c_embs = _encode(context_sents)
+         sim = a_embs @ c_embs.T
+         max_per_ans = sim.max(axis=1)
+         faithful_count = int((max_per_ans >= threshold).sum())
+         return round(faithful_count / len(ans_sents), 4)
+     except Exception as e:
+         print(f"⚠️ faithfulness error: {e}", flush=True)
+         return 0.0
+
+ def answer_relevance(answer: str, query: str) -> float:
+     """Does the answer actually address what was asked?"""
+     return semantic_similarity(answer, query)
+
+ def context_recall(answer: str, context_sents: List[str], threshold: float = 0.35) -> float:
+     """
+     Fraction of context sentences reflected in the answer.
+     Mirrors RAGAS Context Recall but without ground-truth labels.
+     """
+     ans_sents = _sent_tokenize(answer)
+     if not ans_sents or not context_sents:
+         return 0.0
+     try:
+         a_embs = _encode(ans_sents)
+         c_embs = _encode(context_sents)
+         sim = a_embs @ c_embs.T
+         max_per_ctx = sim.max(axis=0)
+         recalled_count = int((max_per_ctx >= threshold).sum())
+         return round(recalled_count / len(context_sents), 4)
+     except Exception as e:
+         print(f"⚠️ context_recall error: {e}", flush=True)
+         return 0.0
+
+ def compute_all_metrics(query: str, answer: str, context: str) -> Dict[str, float]:
+     """
+     Tokenise context once, cap at 60 sentences (top-ranked chunks come first),
+     then run all embedding-based metrics against that capped list.
+     ROUGE uses the raw context string (pure token overlap, no matrices).
+     """
+     ctx_sents_all = _sent_tokenize(context)
+     ctx_sents = ctx_sents_all[:60]
+     print(f"📏 Metrics: answer={len(_sent_tokenize(answer))} sents, "
+           f"ctx={len(ctx_sents)}/{len(ctx_sents_all)} sents", flush=True)
+     return {
+         "BERTScore F1": bertscore_f1(answer, ctx_sents),
+         "ROUGE-1": rouge_n(answer, context, n=1),
+         "ROUGE-2": rouge_n(answer, context, n=2),
+         "Semantic Similarity": semantic_similarity(answer, query),
+         "Faithfulness": faithfulness(answer, ctx_sents),
+         "Answer Relevance": answer_relevance(answer, query),
+         "Context Recall": context_recall(answer, ctx_sents),
+     }
+
+ # ── Display helpers ───────────────────────────────────────────────────────────
+ _METRIC_DESCRIPTIONS = {
+     "BERTScore F1": "Sentence-level semantic overlap F1 between answer and top context sentences.",
+     "ROUGE-1": "Fraction of answer unigrams found in retrieved context (precision).",
+     "ROUGE-2": "Fraction of answer bigrams found in retrieved context (precision).",
+     "Semantic Similarity": "Cosine similarity between answer and question embeddings.",
+     "Faithfulness": "Fraction of answer sentences semantically supported by the retrieved context.",
+     "Answer Relevance": "How directly the answer addresses the original question.",
+     "Context Recall": "Fraction of top context sentences reflected in the answer.",
+ }
+
+ _THRESHOLDS = {
+     # (warn_below, ok_below, good_above)
+     "BERTScore F1": (0.50, 0.65, 0.80),
+     "ROUGE-1": (0.15, 0.30, 0.45),
+     "ROUGE-2": (0.05, 0.15, 0.25),
+     "Semantic Similarity": (0.40, 0.60, 0.75),
+     "Faithfulness": (0.50, 0.70, 0.85),
+     "Answer Relevance": (0.40, 0.60, 0.75),
+     "Context Recall": (0.15, 0.30, 0.50),
+ }
+
+ def _colour(name: str, value: float) -> str:
+     warn, ok, good = _THRESHOLDS.get(name, (0.3, 0.6, 0.8))
+     if value >= good: return "🟢"
+     if value >= ok: return "🟡"
+     return "🔴"
+
+ def _bar(value: float, width: int = 20) -> str:
+     filled = int(round(value * width))
+     return "█" * filled + "░" * (width - filled)
+
+ def format_metrics_markdown(metrics: Dict[str, float]) -> str:
+     lines = ["## 📊 Evaluation Metrics\n"]
+     lines.append(
+         "> Metrics are **reference-free** and computed against the retrieved context "
+         "and original query — no labelled ground truth required.\n"
+     )
+     lines.append("| Metric | Score | Bar | Status | Notes |")
+     lines.append("|--------|------:|-----|--------|-------|")
+     for name, value in metrics.items():
+         pct = f"{value:.2%}"
+         bar = f"`{_bar(value)}`"
+         icon = _colour(name, value)
+         desc = _METRIC_DESCRIPTIONS.get(name, "")
+         lines.append(f"| **{name}** | {pct} | {bar} | {icon} | {desc} |")
+     lines.append("\n**Colour key:** 🟢 Good · 🟡 Acceptable · 🔴 Needs attention")
+     return "\n".join(lines)
+
# ══════════════════════════════════════════════════════════════════════════════
# GPU FUNCTION
# ══════════════════════════════════════════════════════════════════════════════

def llm_generate(messages: list) -> str:
    print("🔥 GPU acquired, running generation...", flush=True)
    model.to("cuda")
+     text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    with torch.no_grad():
        generated_ids = model.generate(

# ══════════════════════════════════════════════════════════════════════════════
# RAG PIPELINE
# ══════════════════════════════════════════════════════════════════════════════
SYSTEM_PROMPT = (
    "You are a medical expert assistant specialising in cardiology. "

)

def rag_query_stream(query: str):
+     # ── Step 1: retrieval ────────────────────────────────────────────────────
+     yield (
+         "⏳ **Status:** 🔍 Retrieving relevant documents (multi-query expansion)...\n\n---\n",
+         ""
+     )
    candidates = retrieve_with_expansion(query, k_per_query=10)

+     # ── Step 2: rerank ───────────────────────────────────────────────────────
+     yield (
+         "⏳ **Status:** 📊 Reranking with CrossEncoder (CPU)...\n\n---\n",
+         ""
+     )
    scores = rerank_docs(query, candidates)
    ranked = sorted(zip(scores, candidates), key=lambda x: x[0], reverse=True)
    top_docs = [doc for _, doc in ranked[:8]]

    )
    pages = ", ".join(str(d.metadata.get("page", "?")) for d in top_docs)

+     # ── Step 3: generate ─────────────────────────────────────────────────────
+     yield (
+         "⏳ **Status:** 🧠 Generating with Qwen3 (ZeroGPU H200)...\n\n---\n",
+         ""
+     )
    messages = [
        {
            "role": "system",

        },
        {"role": "user", "content": query},
    ]
+     answer = llm_generate(messages)
+     answer_md = f"### 🌌 Answer\n\n{answer}\n\n📄 **Source Pages:** {pages}\n"
+
+     # ── Step 4: metrics ──────────────────────────────────────────────────────
+     yield (
+         answer_md,
+         "⏳ **Status:** 📏 Computing evaluation metrics (CPU)...\n"
+     )
+     metrics = compute_all_metrics(query, answer, context)
+     metrics_md = format_metrics_markdown(metrics)
+
+     yield (answer_md, metrics_md)
# ══════════════════════════════════════════════════════════════════════════════
# GRADIO UI
# ══════════════════════════════════════════════════════════════════════════════
def gradio_wrapper(query):
    if not query or not query.strip():
+         yield "⚠️ Please enter a valid question.", ""
        return
    yield from rag_query_stream(query)
 
    button_primary_background_fill_hover="*primary_700",
)

+ with gr.Blocks(theme=qwen_theme, title="Cardiology AI Assistant") as demo:
+
+     # ── Header ───────────────────────────────────────────────────────────────
    gr.Markdown("# 🌌 Cardiology AI Assistant (ESC 2024)")
    gr.Markdown("### ⚡ Powered by Alibaba Qwen3-4B · ZeroGPU H200")
    gr.Markdown(
        "Ask questions based on the **2024 ESC Medical Guidelines**. "
+         "Uses RAG with MedCPT embeddings, multi-query expansion, CrossEncoder reranking, "
+         "Qwen3-4B generation, and **live evaluation metrics**."
    )
+
+     # ── Input ────────────────────────────────────────────────────────────────
    with gr.Row():
+         with gr.Column(scale=4):
            input_text = gr.Textbox(
                label="Your Clinical Question",
                placeholder="e.g., What are the four treatment pillars of AF-CARE?",
                lines=3,
            )
+         with gr.Column(scale=1, min_width=160):
+             submit_btn = gr.Button("🔍 Analyze Guidelines", variant="primary", size="lg")
+
+     # ── Examples ─────────────────────────────────────────────────────────────
    gr.Examples(
        examples=[
            "What are the four treatment pillars of the AF-CARE framework?",

            "What is the target LDL-C for very high-risk patients?",
        ],
        inputs=input_text,
+         label="Example Questions",
+     )
+
+     gr.Markdown("---")
+
+     # ── Answer output (full width) ────────────────────────────────────────────
+     answer_output = gr.Markdown(
+         label="Assistant Response",
+         value="*Your answer will appear here after submission.*",
+     )
+
+     gr.Markdown("---")
+
+     # ── Metrics output (full width, below answer) ─────────────────────────────
+     metrics_output = gr.Markdown(
+         label="Evaluation Metrics",
+         value="*Metrics will appear here once the answer is generated.*",
+     )
+
+     gr.Markdown("---")
+
+     # ── Metric legend ─────────────────────────────────────────────────────────
+     with gr.Accordion("ℹ️ About the Evaluation Metrics", open=False):
+         gr.Markdown("""
+ ### How each metric is computed
+
+ | Metric | Method | Interpretation |
+ |--------|--------|---------------|
+ | **BERTScore F1** | Sentence-level cosine-sim F1 between answer sentences and top-60 context sentences using `all-MiniLM-L6-v2` (forced CPU) | Measures how semantically similar the answer is to the source context |
+ | **ROUGE-1** | **Precision**: fraction of answer unigrams that appear in the retrieved context | Are the words the model used actually in the retrieved passages? |
+ | **ROUGE-2** | **Precision**: fraction of answer bigrams that appear in the retrieved context | Are the phrases the model used actually in the retrieved passages? |
+ | **Semantic Similarity** | Cosine similarity of full answer ↔ question embeddings | Does the answer embed in the same semantic space as the question? |
+ | **Faithfulness** | Fraction of answer sentences with cosine-sim ≥ 0.35 to any context sentence | Are answer claims grounded in retrieved text? |
+ | **Answer Relevance** | Cosine similarity of answer ↔ question embeddings | How directly does the answer respond to the question? |
+ | **Context Recall** | Fraction of top-60 context sentences with cosine-sim ≥ 0.35 to any answer sentence | How much of the retrieved evidence is used in the answer? |
+
+ > **Why precision for ROUGE?** The retrieved context is ~8,000 tokens; a correct ~60-token answer
+ > has only ~4% unigram *recall* against that pool — even if every word came from the context.
+ > Precision asks the right question: *"Did the model use words that actually appear in the retrieved passages?"*
+
+ > **All metrics are reference-free** — they use the retrieved context and original query as the
+ > reference signal, so no annotated ground-truth is needed.
+ """)
+
+     # ── Wire up ───────────────────────────────────────────────────────────────
+     submit_btn.click(
+         fn=gradio_wrapper,
+         inputs=input_text,
+         outputs=[answer_output, metrics_output],
    )

demo.queue().launch(server_name="0.0.0.0", server_port=7860)
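
The precision-over-recall choice for ROUGE and the embedding-based faithfulness check described in the metrics legend can be sanity-checked with a small standalone sketch. The snippet below is illustrative only and is not part of the commit: it re-declares simplified versions of the committed rouge_n and faithfulness logic under hypothetical names (ngrams, rouge_precision) on toy strings, and assumes sentence-transformers and numpy are installed locally.

# Standalone sketch: reference-free overlap and faithfulness on toy inputs.
import re
import numpy as np
from collections import Counter
from sentence_transformers import SentenceTransformer

def ngrams(tokens, n):
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

def rouge_precision(answer, context, n=1):
    # Precision: what fraction of answer n-grams occur anywhere in the context?
    a, c = answer.lower().split(), context.lower().split()
    overlap = sum((ngrams(a, n) & ngrams(c, n)).values())
    return overlap / max(sum(ngrams(a, n).values()), 1)

context = ("Rate control lowers the ventricular rate in atrial fibrillation. "
           "Beta-blockers are first-line agents for rate control.")
answer = "Beta-blockers are first-line agents for rate control in atrial fibrillation."

# High: nearly every answer word comes from the context.
print("ROUGE-1 precision:", round(rouge_precision(answer, context), 3))
# Much lower: swapping the roles measures how much of the long context the short answer covers,
# which is the recall-style score the docstring argues against.
print("ROUGE-1 recall-style:", round(rouge_precision(context, answer), 3))

# Embedding-based faithfulness: fraction of answer sentences whose best cosine
# similarity against any context sentence clears the 0.35 threshold.
st = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
ctx_sents = re.split(r"(?<=[.!?])\s+", context)
ans_sents = re.split(r"(?<=[.!?])\s+", answer)
a_embs = st.encode(ans_sents, normalize_embeddings=True, convert_to_numpy=True)
c_embs = st.encode(ctx_sents, normalize_embeddings=True, convert_to_numpy=True)
sim = a_embs @ c_embs.T
print("Faithfulness:", float((sim.max(axis=1) >= 0.35).mean()))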