johnnydang88 committed
Commit 4d7bb7b · verified · 1 Parent(s): 3c6a8af

Update app.py

Files changed (1): app.py (+295 -31)
app.py CHANGED
@@ -1,17 +1,25 @@
"""
Cardiology AI Assistant — Alibaba Qwen3-4B-Instruct
Hugging Face ZeroGPU Space
"""

- import os, gc, torch, warnings, pdfplumber
import spaces
- from typing import List
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings
- from sentence_transformers import CrossEncoder
import gradio as gr

warnings.filterwarnings("ignore")
@@ -61,9 +69,6 @@ class MedCPTEmbeddings(Embeddings):

# ══════════════════════════════════════════════════════════════════════════════
# STARTUP
- # FIX 1: Increased chunk_size 512→1024 and overlap 64→128
- # Smaller chunks were splitting multi-point framework definitions (e.g. AF-CARE pillars)
- # across chunk boundaries, making them unretrievable as a unit.
# ══════════════════════════════════════════════════════════════════════════════
print("📂 Loading PDF with pdfplumber...", flush=True)
docs = []
@@ -97,6 +102,12 @@ print("✅ Vector store ready.", flush=True)
print("⚖️ Loading CrossEncoder (CPU)...", flush=True)
reranker = CrossEncoder("BAAI/bge-reranker-base", device="cpu")

print("🚀 Loading Qwen3-4B in float16 (CPU)...", flush=True)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, token=HF_TOKEN, trust_remote_code=True
@@ -111,9 +122,7 @@ model.eval()
print("✅ Qwen3 ready (CPU). GPU borrowed per request via ZeroGPU.", flush=True)

# ══════════════════════════════════════════════════════════════════════════════
- # FIX 2: MULTI-QUERY EXPANSION
- # A single embedding query may miss chunks that use different surface forms.
- # We expand to multiple sub-queries and merge unique results before reranking.
# ══════════════════════════════════════════════════════════════════════════════
QUERY_EXPANSIONS = {
    "AF-CARE": [
@@ -134,7 +143,6 @@ QUERY_EXPANSIONS = {
}

def expand_query(query: str) -> List[str]:
-     """Return a list of sub-queries for retrieval. Falls back to original query."""
    q_lower = query.lower()
    for keyword, expansions in QUERY_EXPANSIONS.items():
        if keyword.lower() in q_lower:
@@ -142,7 +150,6 @@ def expand_query(query: str) -> List[str]:
    return [query]

def retrieve_with_expansion(query: str, k_per_query: int = 10) -> List[Document]:
-     """Run similarity search for each expanded query, deduplicate by page_content."""
    sub_queries = expand_query(query)
    seen, merged = set(), []
    for sq in sub_queries:
@@ -153,13 +160,197 @@ def retrieve_with_expansion(query: str, k_per_query: int = 10) -> List[Document]
            merged.append(doc)
    return merged

- # ══════════════════════════════════════════════════════════════════════════════
- # CPU RERANKER
- # ══════════════════════════════════════════════════════════════════════════════
def rerank_docs(query: str, docs):
    scores = reranker.predict([[query, d.page_content] for d in docs])
    return scores
# ══════════════════════════════════════════════════════════════════════════════
# GPU FUNCTION
# ══════════════════════════════════════════════════════════════════════════════
@@ -167,7 +358,7 @@ def rerank_docs(query: str, docs):
def llm_generate(messages: list) -> str:
    print("🔥 GPU acquired, running generation...", flush=True)
    model.to("cuda")
-     text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    with torch.no_grad():
        generated_ids = model.generate(
@@ -188,10 +379,6 @@ def llm_generate(messages: list) -> str:

# ══════════════════════════════════════════════════════════════════════════════
# RAG PIPELINE
- # FIX 3: top 8 reranked docs (was 4) — richer context for list-heavy answers
- # FIX 4: Stronger system prompt — prevents model from saying info is missing
- # when it IS in context; instructs it to enumerate list-type answers fully.
- # Page numbers now included in context blocks for accurate citation.
# ══════════════════════════════════════════════════════════════════════════════
SYSTEM_PROMPT = (
    "You are a medical expert assistant specialising in cardiology. "
@@ -204,10 +391,18 @@ SYSTEM_PROMPT = (
)

def rag_query_stream(query: str):
-     yield "⏳ **Status:** 🔍 Retrieving relevant documents (multi-query expansion)...\n\n---\n"
    candidates = retrieve_with_expansion(query, k_per_query=10)

-     yield "⏳ **Status:** 📊 Reranking with CrossEncoder (CPU)...\n\n---\n"
    scores = rerank_docs(query, candidates)
    ranked = sorted(zip(scores, candidates), key=lambda x: x[0], reverse=True)
    top_docs = [doc for _, doc in ranked[:8]]
@@ -217,7 +412,11 @@ def rag_query_stream(query: str):
    )
    pages = ", ".join(str(d.metadata.get("page", "?")) for d in top_docs)

-     yield "⏳ **Status:** 🧠 Generating with Qwen3 (ZeroGPU H200)...\n\n---\n"
    messages = [
        {
            "role": "system",
@@ -225,15 +424,25 @@ def rag_query_stream(query: str):
        },
        {"role": "user", "content": query},
    ]
-     answer = llm_generate(messages)
-     yield f"### 🌌 Answer\n\n{answer}\n\n📄 **Source Pages:** {pages}\n"

# ══════════════════════════════════════════════════════════════════════════════
# GRADIO UI
# ══════════════════════════════════════════════════════════════════════════════
def gradio_wrapper(query):
    if not query or not query.strip():
-         yield "⚠️ Please enter a valid question."
        return
    yield from rag_query_stream(query)

@@ -246,22 +455,29 @@ qwen_theme = gr.themes.Soft(
    button_primary_background_fill_hover="*primary_700",
)

- with gr.Blocks(theme=qwen_theme) as demo:
    gr.Markdown("# 🌌 Cardiology AI Assistant (ESC 2024)")
    gr.Markdown("### ⚡ Powered by Alibaba Qwen3-4B · ZeroGPU H200")
    gr.Markdown(
        "Ask questions based on the **2024 ESC Medical Guidelines**. "
-         "Uses RAG with MedCPT embeddings, multi-query expansion, CrossEncoder reranking, and Qwen3-4B generation."
    )
    with gr.Row():
-         with gr.Column():
            input_text = gr.Textbox(
                label="Your Clinical Question",
                placeholder="e.g., What are the four treatment pillars of AF-CARE?",
                lines=3,
            )
-             submit_btn = gr.Button("Analyze Guidelines", variant="primary")
-             output_text = gr.Markdown(label="Assistant Response")
    gr.Examples(
        examples=[
            "What are the four treatment pillars of the AF-CARE framework?",
@@ -270,7 +486,55 @@ with gr.Blocks(theme=qwen_theme) as demo:
            "What is the target LDL-C for very high-risk patients?",
        ],
        inputs=input_text,
    )
-     submit_btn.click(gradio_wrapper, inputs=input_text, outputs=output_text)

demo.queue().launch(server_name="0.0.0.0", server_port=7860)
 
"""
Cardiology AI Assistant — Alibaba Qwen3-4B-Instruct
Hugging Face ZeroGPU Space
+ Includes: BERTScore F1, ROUGE-N, Semantic Similarity, Faithfulness, Answer Relevance, Context Recall
+ Same metric stack as the Llama-3 and Phi-3 versions — all fixes applied:
+ • SentenceTransformer forced to CPU (prevents stale CUDA zero-vector bug)
+ • ROUGE uses precision (overlap / answer_ngrams), not recall vs huge context
+ • Context capped at 60 sentences before embedding (prevents OOM)
+ • Per-metric try/except so one failure never kills the whole panel
"""

+ import os, gc, re, torch, warnings, pdfplumber
+ import numpy as np
import spaces
+ from collections import Counter
+ from typing import List, Dict
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings
+ from sentence_transformers import CrossEncoder, SentenceTransformer
import gradio as gr

warnings.filterwarnings("ignore")
 
# ══════════════════════════════════════════════════════════════════════════════
# STARTUP
# ══════════════════════════════════════════════════════════════════════════════
print("📂 Loading PDF with pdfplumber...", flush=True)
docs = []
 
print("⚖️ Loading CrossEncoder (CPU)...", flush=True)
reranker = CrossEncoder("BAAI/bge-reranker-base", device="cpu")

+ # Explicitly load on CPU — after ZeroGPU releases the GPU, auto-device detection
+ # can latch onto a stale CUDA context and silently return zero vectors.
+ print("📏 Loading metrics SentenceTransformer (CPU)...", flush=True)
+ metrics_st = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
+ print("✅ Metrics encoder ready.", flush=True)
+
print("🚀 Loading Qwen3-4B in float16 (CPU)...", flush=True)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, token=HF_TOKEN, trust_remote_code=True

print("✅ Qwen3 ready (CPU). GPU borrowed per request via ZeroGPU.", flush=True)

# ══════════════════════════════════════════════════════════════════════════════
+ # MULTI-QUERY EXPANSION
# ══════════════════════════════════════════════════════════════════════════════
QUERY_EXPANSIONS = {
    "AF-CARE": [

}

def expand_query(query: str) -> List[str]:
    q_lower = query.lower()
    for keyword, expansions in QUERY_EXPANSIONS.items():
        if keyword.lower() in q_lower:

    return [query]

def retrieve_with_expansion(query: str, k_per_query: int = 10) -> List[Document]:
    sub_queries = expand_query(query)
    seen, merged = set(), []
    for sq in sub_queries:

            merged.append(doc)
    return merged

def rerank_docs(query: str, docs):
    scores = reranker.predict([[query, d.page_content] for d in docs])
    return scores
 
+ # ══════════════════════════════════════════════════════════════════════════════
+ # EVALUATION METRICS
+ # All reference-free — uses retrieved context + query as the reference signal.
+ # Identical implementation to the Llama-3 and Phi-3 versions for consistency.
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ def _sent_tokenize(text: str) -> List[str]:
+     """Lightweight sentence splitter — no NLTK required."""
+     sents = re.split(r'(?<=[.!?])\s+', text.strip())
+     return [s.strip() for s in sents if len(s.strip()) > 10]
+
+ def _encode(texts: List[str]) -> np.ndarray:
+     """
+     Encode on CPU explicitly.
+     After ZeroGPU releases the GPU, SentenceTransformer's auto-device detection
+     can latch onto a stale CUDA context and return zero vectors.
+     Forcing CPU guarantees correct, non-zero embeddings every time.
+     """
+     return metrics_st.encode(
+         texts,
+         normalize_embeddings=True,
+         show_progress_bar=False,
+         device="cpu",
+         convert_to_numpy=True,
+     )
+
+ def _ngrams(tokens: List[str], n: int) -> Counter:
+     return Counter(tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1))
+
+ def rouge_n(hypothesis: str, reference: str, n: int = 1) -> float:
+     """
+     ROUGE-N precision: fraction of answer n-grams that appear in the context.
+     Using precision (not recall) because the context is ~6,000+ tokens — recall
+     of a ~60-token answer against that pool is always ~4% even for correct answers.
+     """
+     hyp_tokens = hypothesis.lower().split()
+     ref_tokens = reference.lower().split()
+     hyp_ng = _ngrams(hyp_tokens, n)
+     ref_ng = _ngrams(ref_tokens, n)
+     overlap = sum((hyp_ng & ref_ng).values())
+     denom = sum(hyp_ng.values())  # precision: denominator = answer n-grams
+     return round(overlap / denom, 4) if denom > 0 else 0.0
+
+ def bertscore_f1(answer: str, context_sents: List[str]) -> float:
+     """
+     Approximate BERTScore F1 via sentence-level embeddings.
+     P = mean max-cosine(answer_sent → any context_sent)
+     R = mean max-cosine(context_sent → any answer_sent)
+     F1 = harmonic mean(P, R)
+     Uses pre-tokenised, capped context sentences to avoid encoding 100+ sentences.
+     """
+     ans_sents = _sent_tokenize(answer)
+     if not ans_sents or not context_sents:
+         return 0.0
+     try:
+         a_embs = _encode(ans_sents)
+         c_embs = _encode(context_sents)
+         sim = a_embs @ c_embs.T
+         P = float(sim.max(axis=1).mean())
+         R = float(sim.max(axis=0).mean())
+         f1 = 2 * P * R / (P + R + 1e-9)
+         return round(max(f1, 0.0), 4)
+     except Exception as e:
+         print(f"⚠️ bertscore_f1 error: {e}", flush=True)
+         return 0.0
+
+ def semantic_similarity(answer: str, query: str) -> float:
+     """Cosine similarity between answer embedding and query embedding."""
+     try:
+         embs = _encode([answer, query])
+         score = float(embs[0] @ embs[1])
+         return round(max(score, 0.0), 4)
+     except Exception as e:
+         print(f"⚠️ semantic_similarity error: {e}", flush=True)
+         return 0.0
+
+ def faithfulness(answer: str, context_sents: List[str], threshold: float = 0.35) -> float:
+     """
+     Fraction of answer sentences whose max cosine-sim to any context sentence ≥ threshold.
+     Threshold = 0.35 (not 0.40) so paraphrased but grounded sentences are counted.
+     """
+     ans_sents = _sent_tokenize(answer)
+     if not ans_sents or not context_sents:
+         return 0.0
+     try:
+         a_embs = _encode(ans_sents)
+         c_embs = _encode(context_sents)
+         sim = a_embs @ c_embs.T
+         max_per_ans = sim.max(axis=1)
+         faithful_count = int((max_per_ans >= threshold).sum())
+         return round(faithful_count / len(ans_sents), 4)
+     except Exception as e:
+         print(f"⚠️ faithfulness error: {e}", flush=True)
+         return 0.0
+
+ def answer_relevance(answer: str, query: str) -> float:
+     """Does the answer actually address what was asked?"""
+     return semantic_similarity(answer, query)
+
+ def context_recall(answer: str, context_sents: List[str], threshold: float = 0.35) -> float:
+     """
+     Fraction of context sentences reflected in the answer.
+     Mirrors RAGAS Context Recall but without ground-truth labels.
+     """
+     ans_sents = _sent_tokenize(answer)
+     if not ans_sents or not context_sents:
+         return 0.0
+     try:
+         a_embs = _encode(ans_sents)
+         c_embs = _encode(context_sents)
+         sim = a_embs @ c_embs.T
+         max_per_ctx = sim.max(axis=0)
+         recalled_count = int((max_per_ctx >= threshold).sum())
+         return round(recalled_count / len(context_sents), 4)
+     except Exception as e:
+         print(f"⚠️ context_recall error: {e}", flush=True)
+         return 0.0
+
+ def compute_all_metrics(query: str, answer: str, context: str) -> Dict[str, float]:
+     """
+     Tokenise context once, cap at 60 sentences (top-ranked chunks come first),
+     then run all embedding-based metrics against that capped list.
+     ROUGE uses the raw context string (pure token overlap, no matrices).
+     """
+     ctx_sents_all = _sent_tokenize(context)
+     ctx_sents = ctx_sents_all[:60]
+     print(f"📏 Metrics: answer={len(_sent_tokenize(answer))} sents, "
+           f"ctx={len(ctx_sents)}/{len(ctx_sents_all)} sents", flush=True)
+     return {
+         "BERTScore F1": bertscore_f1(answer, ctx_sents),
+         "ROUGE-1": rouge_n(answer, context, n=1),
+         "ROUGE-2": rouge_n(answer, context, n=2),
+         "Semantic Similarity": semantic_similarity(answer, query),
+         "Faithfulness": faithfulness(answer, ctx_sents),
+         "Answer Relevance": answer_relevance(answer, query),
+         "Context Recall": context_recall(answer, ctx_sents),
+     }
+
+ # ── Display helpers ───────────────────────────────────────────────────────────
+ _METRIC_DESCRIPTIONS = {
+     "BERTScore F1": "Sentence-level semantic overlap F1 between answer and top context sentences.",
+     "ROUGE-1": "Fraction of answer unigrams found in retrieved context (precision).",
+     "ROUGE-2": "Fraction of answer bigrams found in retrieved context (precision).",
+     "Semantic Similarity": "Cosine similarity between answer and question embeddings.",
+     "Faithfulness": "Fraction of answer sentences semantically supported by the retrieved context.",
+     "Answer Relevance": "How directly the answer addresses the original question.",
+     "Context Recall": "Fraction of top context sentences reflected in the answer.",
+ }
+
+ _THRESHOLDS = {
+     # (warn_below, ok_below, good_above)
+     "BERTScore F1": (0.50, 0.65, 0.80),
+     "ROUGE-1": (0.15, 0.30, 0.45),
+     "ROUGE-2": (0.05, 0.15, 0.25),
+     "Semantic Similarity": (0.40, 0.60, 0.75),
+     "Faithfulness": (0.50, 0.70, 0.85),
+     "Answer Relevance": (0.40, 0.60, 0.75),
+     "Context Recall": (0.15, 0.30, 0.50),
+ }
+
+ def _colour(name: str, value: float) -> str:
+     warn, ok, good = _THRESHOLDS.get(name, (0.3, 0.6, 0.8))
+     if value >= good: return "🟢"
+     if value >= ok: return "🟡"
+     return "🔴"
+
+ def _bar(value: float, width: int = 20) -> str:
+     filled = int(round(value * width))
+     return "█" * filled + "░" * (width - filled)
+
+ def format_metrics_markdown(metrics: Dict[str, float]) -> str:
+     lines = ["## 📊 Evaluation Metrics\n"]
+     lines.append(
+         "> Metrics are **reference-free** and computed against the retrieved context "
+         "and original query — no labelled ground truth required.\n"
+     )
+     lines.append("| Metric | Score | Bar | Status | Notes |")
+     lines.append("|--------|------:|-----|--------|-------|")
+     for name, value in metrics.items():
+         pct = f"{value:.2%}"
+         bar = f"`{_bar(value)}`"
+         icon = _colour(name, value)
+         desc = _METRIC_DESCRIPTIONS.get(name, "")
+         lines.append(f"| **{name}** | {pct} | {bar} | {icon} | {desc} |")
+     lines.append("\n**Colour key:** 🟢 Good · 🟡 Acceptable · 🔴 Needs attention")
+     return "\n".join(lines)
+
# ══════════════════════════════════════════════════════════════════════════════
# GPU FUNCTION
# ══════════════════════════════════════════════════════════════════════════════

def llm_generate(messages: list) -> str:
    print("🔥 GPU acquired, running generation...", flush=True)
    model.to("cuda")
+     text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    with torch.no_grad():
        generated_ids = model.generate(

# ══════════════════════════════════════════════════════════════════════════════
# RAG PIPELINE
# ══════════════════════════════════════════════════════════════════════════════
SYSTEM_PROMPT = (
    "You are a medical expert assistant specialising in cardiology. "

)

def rag_query_stream(query: str):
+     # ── Step 1: retrieval ────────────────────────────────────────────────────
+     yield (
+         "⏳ **Status:** 🔍 Retrieving relevant documents (multi-query expansion)...\n\n---\n",
+         ""
+     )
    candidates = retrieve_with_expansion(query, k_per_query=10)

+     # ── Step 2: rerank ───────────────────────────────────────────────────────
+     yield (
+         "⏳ **Status:** 📊 Reranking with CrossEncoder (CPU)...\n\n---\n",
+         ""
+     )
    scores = rerank_docs(query, candidates)
    ranked = sorted(zip(scores, candidates), key=lambda x: x[0], reverse=True)
    top_docs = [doc for _, doc in ranked[:8]]

    )
    pages = ", ".join(str(d.metadata.get("page", "?")) for d in top_docs)

+     # ── Step 3: generate ─────────────────────────────────────────────────────
+     yield (
+         "⏳ **Status:** 🧠 Generating with Qwen3 (ZeroGPU H200)...\n\n---\n",
+         ""
+     )
    messages = [
        {
            "role": "system",

        },
        {"role": "user", "content": query},
    ]
+     answer = llm_generate(messages)
+     answer_md = f"### 🌌 Answer\n\n{answer}\n\n📄 **Source Pages:** {pages}\n"
+
+     # ── Step 4: metrics ──────────────────────────────────────────────────────
+     yield (
+         answer_md,
+         "⏳ **Status:** 📏 Computing evaluation metrics (CPU)...\n"
+     )
+     metrics = compute_all_metrics(query, answer, context)
+     metrics_md = format_metrics_markdown(metrics)
+
+     yield (answer_md, metrics_md)
# ══════════════════════════════════════════════════════════════════════════════
# GRADIO UI
# ══════════════════════════════════════════════════════════════════════════════
def gradio_wrapper(query):
    if not query or not query.strip():
+         yield "⚠️ Please enter a valid question.", ""
        return
    yield from rag_query_stream(query)
 
    button_primary_background_fill_hover="*primary_700",
)

+ with gr.Blocks(theme=qwen_theme, title="Cardiology AI Assistant") as demo:
+
+     # ── Header ───────────────────────────────────────────────────────────────
    gr.Markdown("# 🌌 Cardiology AI Assistant (ESC 2024)")
    gr.Markdown("### ⚡ Powered by Alibaba Qwen3-4B · ZeroGPU H200")
    gr.Markdown(
        "Ask questions based on the **2024 ESC Medical Guidelines**. "
+         "Uses RAG with MedCPT embeddings, multi-query expansion, CrossEncoder reranking, "
+         "Qwen3-4B generation, and **live evaluation metrics**."
    )
+
+     # ── Input ────────────────────────────────────────────────────────────────
    with gr.Row():
+         with gr.Column(scale=4):
            input_text = gr.Textbox(
                label="Your Clinical Question",
                placeholder="e.g., What are the four treatment pillars of AF-CARE?",
                lines=3,
            )
+         with gr.Column(scale=1, min_width=160):
+             submit_btn = gr.Button("🔍 Analyze Guidelines", variant="primary", size="lg")
+
+     # ── Examples ─────────────────────────────────────────────────────────────
    gr.Examples(
        examples=[
            "What are the four treatment pillars of the AF-CARE framework?",

            "What is the target LDL-C for very high-risk patients?",
        ],
        inputs=input_text,
+         label="Example Questions",
+     )
+
+     gr.Markdown("---")
+
+     # ── Answer output (full width) ────────────────────────────────────────────
+     answer_output = gr.Markdown(
+         label="Assistant Response",
+         value="*Your answer will appear here after submission.*",
+     )
+
+     gr.Markdown("---")
+
+     # ── Metrics output (full width, below answer) ─────────────────────────────
+     metrics_output = gr.Markdown(
+         label="Evaluation Metrics",
+         value="*Metrics will appear here once the answer is generated.*",
+     )
+
+     gr.Markdown("---")
+
+     # ── Metric legend ─────────────────────────────────────────────────────────
+     with gr.Accordion("ℹ️ About the Evaluation Metrics", open=False):
+         gr.Markdown("""
+ ### How each metric is computed
+
+ | Metric | Method | Interpretation |
+ |--------|--------|---------------|
+ | **BERTScore F1** | Sentence-level cosine-sim F1 between answer sentences and top-60 context sentences using `all-MiniLM-L6-v2` (forced CPU) | Measures how semantically similar the answer is to the source context |
+ | **ROUGE-1** | **Precision**: fraction of answer unigrams that appear in the retrieved context | Are the words the model used actually in the retrieved passages? |
+ | **ROUGE-2** | **Precision**: fraction of answer bigrams that appear in the retrieved context | Are the phrases the model used actually in the retrieved passages? |
+ | **Semantic Similarity** | Cosine similarity of full answer ↔ question embeddings | Does the answer embed in the same semantic space as the question? |
+ | **Faithfulness** | Fraction of answer sentences with cosine-sim ≥ 0.35 to any context sentence | Are answer claims grounded in retrieved text? |
+ | **Answer Relevance** | Cosine similarity of answer ↔ question embeddings | How directly does the answer respond to the question? |
+ | **Context Recall** | Fraction of top-60 context sentences with cosine-sim ≥ 0.35 to any answer sentence | How much of the retrieved evidence is used in the answer? |
+
+ > **Why precision for ROUGE?** The retrieved context is ~8,000 tokens; a correct ~60-token answer
+ > has only ~4% unigram *recall* against that pool — even if every word came from the context.
+ > Precision asks the right question: *"Did the model use words that actually appear in the retrieved passages?"*
+
+ > **All metrics are reference-free** — they use the retrieved context and original query as the
+ > reference signal, so no annotated ground-truth is needed.
+ """)
+
+     # ── Wire up ───────────────────────────────────────────────────────────────
+     submit_btn.click(
+         fn=gradio_wrapper,
+         inputs=input_text,
+         outputs=[answer_output, metrics_output],
    )

demo.queue().launch(server_name="0.0.0.0", server_port=7860)
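
The precision-over-recall choice for ROUGE and the embedding-based faithfulness check described in the metrics legend can be sanity-checked with a small standalone sketch. The snippet below is illustrative only and is not part of the commit: it re-declares simplified versions of the committed rouge_n and faithfulness logic under hypothetical names (ngrams, rouge_precision) on toy strings, and assumes sentence-transformers and numpy are installed locally.

# Standalone sketch: reference-free overlap and faithfulness on toy inputs.
import re
import numpy as np
from collections import Counter
from sentence_transformers import SentenceTransformer

def ngrams(tokens, n):
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

def rouge_precision(answer, context, n=1):
    # Precision: what fraction of answer n-grams occur anywhere in the context?
    a, c = answer.lower().split(), context.lower().split()
    overlap = sum((ngrams(a, n) & ngrams(c, n)).values())
    return overlap / max(sum(ngrams(a, n).values()), 1)

context = ("Rate control lowers the ventricular rate in atrial fibrillation. "
           "Beta-blockers are first-line agents for rate control.")
answer = "Beta-blockers are first-line agents for rate control in atrial fibrillation."

# High: nearly every answer word comes from the context.
print("ROUGE-1 precision:", round(rouge_precision(answer, context), 3))
# Much lower: swapping the roles measures how much of the long context the short answer covers,
# which is the recall-style score the docstring argues against.
print("ROUGE-1 recall-style:", round(rouge_precision(context, answer), 3))

# Embedding-based faithfulness: fraction of answer sentences whose best cosine
# similarity against any context sentence clears the 0.35 threshold.
st = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
ctx_sents = re.split(r"(?<=[.!?])\s+", context)
ans_sents = re.split(r"(?<=[.!?])\s+", answer)
a_embs = st.encode(ans_sents, normalize_embeddings=True, convert_to_numpy=True)
c_embs = st.encode(ctx_sents, normalize_embeddings=True, convert_to_numpy=True)
sim = a_embs @ c_embs.T
print("Faithfulness:", float((sim.max(axis=1) >= 0.35).mean()))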