muddasser commited on
Commit
c95a2db
·
verified ·
1 Parent(s): 1531375

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +535 -14
app.py CHANGED
@@ -1,15 +1,536 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
- ### Evaluation Results:
3
-
4
- **Generation Quality:**
5
- | Metric | Value | Status |
6
- |--------|-------|--------|
7
- | Hallucination | 0.0 | Good |
8
- | Relevance | 1.0 | Good |
9
-
10
- **Retrieval Quality (Context vs Query):**
11
- | Metric | Value | Meaning |
12
- |--------|-------|---------|
13
- | Context Similarity | 0.65 | How well context matches query |
14
- | Query Coverage | 80% | % of question words found in context |
15
- | Matched Terms | capital, France | Key terms found |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os
import re
from datetime import datetime
from typing import Dict, List

import docx2txt
import gradio as gr
import numpy as np
import pandas as pd
import pypdf

# Hybrid + Re-ranking imports
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, CrossEncoder
from langchain_text_splitters import RecursiveCharacterTextSplitter
15
+
16
+ # ======================================
17
+ # CONFIG
18
+ # ======================================
19
# Model identifiers handed to SentenceTransformer / CrossEncoder below.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# Passed as chunk_size / chunk_overlap to RecursiveCharacterTextSplitter.
CHUNK_SIZE = 800
CHUNK_OVERLAP = 100
# Candidates fetched per retriever (vector and BM25) before re-ranking.
RETRIEVE_K = 15
# Re-ranked chunks kept as the LLM context.
FINAL_K = 5

# ======================================
# Global Variables
# ======================================
print("Loading embedding and reranker models...")

# NOTE(review): embed_model is loaded here but no code visible in this file
# reads it; kept because other modules may import it - confirm before removing.
embed_model = SentenceTransformer(EMBED_MODEL)
reranker = CrossEncoder(RERANKER_MODEL)

# Track evaluation data
evaluation_log = []  # one dict per answered query, appended by log_query
query_counter = 0  # incremented by log_query, zeroed by reset_logs
current_session_id = datetime.now().strftime("%Y%m%d_%H%M%S")

# For retrieval evaluation (ground truth mapping)
# Looked up by exact query string; the value is treated as the single relevant chunk.
ground_truth_map = {}

print("Models loaded successfully!")
43
+
44
+ # ======================================
45
+ # Retrieval Quality Evaluator
46
+ # ======================================
47
class RetrievalEvaluator:
    """Evaluates retrieval quality: Precision@K, Recall@K, MRR and nDCG@K.

    Relevance is binary and chunks are compared by exact string equality.
    """

    @staticmethod
    def precision_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int = None) -> float:
        """Return the fraction of the top-k positions that hold a relevant chunk.

        The denominator is ``k`` itself, so a result list shorter than ``k``
        is penalised. ``k`` defaults to the number of retrieved chunks; a
        non-positive ``k`` yields 0.0.
        """
        if k is None:
            k = len(retrieved_chunks)
        if k <= 0:
            return 0.0

        relevant_set = set(relevant_chunks)
        hits = sum(1 for chunk in retrieved_chunks[:k] if chunk in relevant_set)
        return hits / k

    @staticmethod
    def recall_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int = None) -> float:
        """Return the fraction of distinct relevant chunks found in the top-k.

        Each relevant chunk is counted once even if it was retrieved several
        times, so the result never exceeds 1.0.
        """
        if k is None:
            k = len(retrieved_chunks)

        relevant_set = set(relevant_chunks)
        if k <= 0 or not relevant_set:
            return 0.0

        found = relevant_set.intersection(retrieved_chunks[:k])
        return len(found) / len(relevant_set)

    @staticmethod
    def mrr(retrieved_chunks: List[str], relevant_chunks: List[str]) -> float:
        """Return the reciprocal rank (1-based) of the first relevant chunk, or 0.0."""
        relevant_set = set(relevant_chunks)

        for rank, chunk in enumerate(retrieved_chunks, start=1):
            if chunk in relevant_set:
                return 1.0 / rank

        return 0.0

    @staticmethod
    def ndcg_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int = None) -> float:
        """Return binary-relevance nDCG over the top-k retrieved chunks.

        A relevant chunk only earns gain at its first occurrence; otherwise
        duplicates could make DCG exceed the ideal DCG.
        """
        if k is None:
            k = len(retrieved_chunks)
        if k <= 0:
            return 0.0

        relevant_set = set(relevant_chunks)

        already_credited = set()
        dcg = 0.0
        for rank, chunk in enumerate(retrieved_chunks[:k], start=1):
            if chunk in relevant_set and chunk not in already_credited:
                already_credited.add(chunk)
                dcg += 1.0 / np.log2(rank + 1)

        # Ideal ranking: every relevant chunk (up to k) packed at the top.
        ideal_relevant = min(len(relevant_set), k)
        idcg = sum(1.0 / np.log2(rank + 1) for rank in range(1, ideal_relevant + 1))

        # float() keeps numpy scalars out of the returned metrics.
        return float(dcg / idcg) if idcg > 0 else 0.0

    def evaluate_retrieval(self, query: str, retrieved_chunks: List[str], relevant_chunks: List[str]) -> Dict:
        """Compute all retrieval metrics for one query.

        Args:
            query: The user query (not used in the computation).
            retrieved_chunks: Retrieved chunks in ranked order.
            relevant_chunks: Ground-truth relevant chunks.

        Returns:
            A dict of metrics rounded to 3 decimals plus a weighted
            ``retrieval_quality_score``. Every value is ``None`` when
            ``relevant_chunks`` is empty.
        """
        if not relevant_chunks:
            return {
                "precision_at_1": None,
                "precision_at_3": None,
                "precision_at_5": None,
                "recall_at_5": None,
                "recall_at_10": None,
                "mrr": None,
                "ndcg_at_5": None,
                "retrieval_quality_score": None,
            }

        metrics = {
            "precision_at_1": round(self.precision_at_k(retrieved_chunks, relevant_chunks, k=1), 3),
            "precision_at_3": round(self.precision_at_k(retrieved_chunks, relevant_chunks, k=3), 3),
            "precision_at_5": round(self.precision_at_k(retrieved_chunks, relevant_chunks, k=5), 3),
            "recall_at_5": round(self.recall_at_k(retrieved_chunks, relevant_chunks, k=5), 3),
            "recall_at_10": round(self.recall_at_k(retrieved_chunks, relevant_chunks, k=10), 3),
            "mrr": round(self.mrr(retrieved_chunks, relevant_chunks), 3),
            "ndcg_at_5": round(self.ndcg_at_k(retrieved_chunks, relevant_chunks, k=5), 3),
        }

        # Weighted blend: precision and recall 30% each, MRR and nDCG 20% each.
        metrics["retrieval_quality_score"] = round(
            (metrics["precision_at_5"] * 0.3 +
             metrics["recall_at_5"] * 0.3 +
             metrics["mrr"] * 0.2 +
             metrics["ndcg_at_5"] * 0.2), 3
        )

        return metrics


retrieval_evaluator = RetrievalEvaluator()
134
+
135
+ # ======================================
136
+ # RAG Evaluator (Hallucination, Relevance, Context Similarity)
137
+ # ======================================
138
class RAGEvaluator:
    """Lexical heuristics for scoring a generated answer and its retrieved context.

    All three metrics lowercase the text and compare sets of word tokens.
    Tokens are runs of word characters, so trailing punctuation
    ("France?" / "France.") no longer prevents a match.
    """

    _WORD_RE = re.compile(r"\w+")

    _BASE_STOPWORDS = frozenset({
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'is', 'are', 'was', 'were',
    })
    # Query-side metrics additionally ignore question words.
    _QUESTION_STOPWORDS = _BASE_STOPWORDS | frozenset({
        'what', 'how', 'why', 'when', 'where', 'be', 'been',
    })

    @staticmethod
    def _tokenize(text: str) -> set:
        """Return the set of lowercase word tokens in ``text``."""
        return set(RAGEvaluator._WORD_RE.findall(text.lower()))

    @staticmethod
    def evaluate_hallucination(answer: str, context: str) -> dict:
        """Hallucination score: % of claims not supported by context.

        The answer is split on '.' and fragments of 10 characters or fewer
        are ignored. A sentence is unsupported when fewer than 30% of its
        non-stopword tokens occur as whole words in the context.

        Returns:
            dict with ``hallucination_score`` (0-1, rounded to 3 decimals),
            ``is_hallucinating`` (score > 0.3) and up to three
            ``potential_hallucinations`` (each truncated to 100 characters).
        """
        answer_sentences = [s.strip() for s in answer.split('.') if len(s.strip()) > 10]
        # Whole-word lookup; substring matching would let "he" match "the".
        context_tokens = RAGEvaluator._tokenize(context)

        unsupported_claims = []
        for sent in answer_sentences:
            content_words = RAGEvaluator._tokenize(sent) - RAGEvaluator._BASE_STOPWORDS
            if not content_words:
                continue
            supported = len(content_words & context_tokens)
            if supported / len(content_words) < 0.3:
                unsupported_claims.append(sent[:100])

        hallucination_score = len(unsupported_claims) / len(answer_sentences) if answer_sentences else 0

        return {
            "hallucination_score": round(hallucination_score, 3),
            "is_hallucinating": hallucination_score > 0.3,
            "potential_hallucinations": unsupported_claims[:3]
        }

    @staticmethod
    def evaluate_relevance(answer: str, query: str) -> dict:
        """Relevance score: word overlap between answer and question.

        Returns:
            dict with ``relevance_score`` (share of non-stopword query tokens
            present in the answer), ``matched_terms`` (sorted, max 10) and
            ``match_percentage``. A query made only of stopwords gets the
            neutral score 0.5.
        """
        query_words = RAGEvaluator._tokenize(query) - RAGEvaluator._QUESTION_STOPWORDS
        answer_words = RAGEvaluator._tokenize(answer) - RAGEvaluator._QUESTION_STOPWORDS

        if not query_words:
            # Same keys as the normal return so callers can index uniformly.
            return {"relevance_score": 0.5, "matched_terms": [], "match_percentage": "50.0%"}

        matched = query_words & answer_words
        relevance = len(matched) / len(query_words)

        return {
            "relevance_score": round(relevance, 3),
            # sorted() keeps the reported terms stable between runs.
            "matched_terms": sorted(matched)[:10],
            "match_percentage": f"{relevance*100:.1f}%"
        }

    @staticmethod
    def evaluate_context_similarity(query: str, context: str) -> dict:
        """Context Similarity: measures how well retrieved context matches query.

        Returns:
            dict with ``context_similarity`` (mean of Jaccard similarity and
            query coverage), ``jaccard_similarity``, ``query_coverage``,
            ``matched_terms`` and ``missing_terms`` (each sorted, max 10).
            A query made only of stopwords gets the neutral similarity 0.5.
        """
        query_clean = RAGEvaluator._tokenize(query) - RAGEvaluator._QUESTION_STOPWORDS
        context_clean = RAGEvaluator._tokenize(context) - RAGEvaluator._QUESTION_STOPWORDS

        if not query_clean:
            # "jaccard_similarity" is included because log_query reads it.
            return {
                "context_similarity": 0.5,
                "jaccard_similarity": 0.0,
                "query_coverage": 0,
                "matched_terms": [],
                "missing_terms": [],
            }

        matched = query_clean & context_clean
        union = len(query_clean | context_clean)
        jaccard_similarity = len(matched) / union if union > 0 else 0
        coverage = len(matched) / len(query_clean)
        context_score = (jaccard_similarity * 0.5 + coverage * 0.5)

        return {
            "context_similarity": round(context_score, 3),
            "jaccard_similarity": round(jaccard_similarity, 3),
            "query_coverage": round(coverage, 3),
            "matched_terms": sorted(matched)[:10],
            "missing_terms": sorted(query_clean - context_clean)[:10]
        }


evaluator = RAGEvaluator()
216
+
217
+ # ======================================
218
+ # Extract text from uploaded file
219
+ # ======================================
220
def extract_text(file):
    """Return the plain text of an uploaded PDF, DOCX or CSV file.

    Args:
        file: The uploaded file. Either an object exposing the on-disk path
            as ``.name`` or the path itself as a string is accepted.
            NOTE(review): which of the two Gradio passes depends on its
            version/configuration - confirm against the deployed version.

    Returns:
        The extracted text. An empty string is returned when no file is
        given, the extension is unsupported, or reading fails. Failures are
        printed instead of being returned, so an error message can never be
        indexed as if it were document content.
    """
    if not file:
        return ""

    path = str(getattr(file, "name", file))
    lowered = path.lower()

    try:
        if lowered.endswith(".pdf"):
            reader = pypdf.PdfReader(path)
            # extract_text() may yield nothing for a page; treat that as empty.
            return "\n".join(page.extract_text() or "" for page in reader.pages)

        if lowered.endswith(".docx"):
            return docx2txt.process(path) or ""

        if lowered.endswith(".csv"):
            return pd.read_csv(path).to_string(index=False)
    except Exception as e:
        # Boundary handler: any parser failure degrades to "no text".
        print(f"Error reading file: {str(e)}")
        return ""

    return ""
240
+
241
+ # ======================================
242
+ # Build Hybrid Index
243
+ # ======================================
244
# Embedding wrappers keyed by model name, so the wrapper is built once per
# process instead of once per question.
_EMBEDDINGS_CACHE = {}


def _get_embeddings(model_name):
    """Return the cached HuggingFaceEmbeddings for ``model_name``, creating it on first use."""
    if model_name not in _EMBEDDINGS_CACHE:
        from langchain_community.embeddings import HuggingFaceEmbeddings

        # NOTE(review): construction presumably loads the model weights,
        # which is why the instance is reused - confirm.
        _EMBEDDINGS_CACHE[model_name] = HuggingFaceEmbeddings(model_name=model_name)
    return _EMBEDDINGS_CACHE[model_name]


def build_hybrid_index(text: str):
    """Split ``text`` into chunks and index them for vector and BM25 search.

    Args:
        text: Full document text.

    Returns:
        ``(vectorstore, bm25, texts)``: a FAISS store and a BM25Okapi index
        built over the same list of non-blank chunks. ``(None, None, None)``
        is returned when the text is empty or yields no non-blank chunk.
    """
    if not text or not text.strip():
        return None, None, None

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )
    texts = [chunk for chunk in splitter.split_text(text) if chunk.strip()]
    if not texts:
        # Neither index should be built from an empty corpus.
        return None, None, None

    from langchain_community.vectorstores import FAISS

    vectorstore = FAISS.from_texts(texts, _get_embeddings(EMBED_MODEL))

    # BM25 uses plain whitespace tokens; hybrid_retrieve splits the query the same way.
    bm25 = BM25Okapi([doc.split() for doc in texts])

    return vectorstore, bm25, texts
265
+
266
+ # ======================================
267
+ # Hybrid Search + Re-ranking
268
+ # ======================================
269
def hybrid_retrieve(query: str, vectorstore, bm25, texts):
    """Retrieve candidates by vector and BM25 search, then re-rank them.

    Args:
        query: The user question.
        vectorstore: Vector index exposing ``similarity_search``.
        bm25: BM25 index exposing ``get_scores``.
        texts: The chunks both indexes were built from, in index order.

    Returns:
        ``(final_docs, candidate_texts)``: the ``FINAL_K`` best chunks by
        re-ranker score, and the de-duplicated candidate pool (vector hits
        first, then BM25-only hits; at most ``2 * RETRIEVE_K`` entries).
        Both lists are empty when an index is missing or nothing is found.
    """
    if not vectorstore or not bm25:
        return [], []

    vector_results = vectorstore.similarity_search(query, k=RETRIEVE_K)
    vector_texts = [doc.page_content for doc in vector_results]

    bm25_scores = bm25.get_scores(query.split())
    bm25_top_idx = np.argsort(bm25_scores)[::-1][:RETRIEVE_K]
    # A score of exactly 0 is taken to mean no query term occurs in the
    # chunk, so it is not a lexical hit. NOTE(review): confirm against
    # rank_bm25's scoring.
    bm25_texts = [texts[i] for i in bm25_top_idx if bm25_scores[i] != 0]

    # Keep the whole union. Truncating it to RETRIEVE_K dropped every
    # BM25-only hit whenever the vector search returned RETRIEVE_K results.
    candidate_texts = list(dict.fromkeys(vector_texts + bm25_texts))

    if not candidate_texts:
        return [], []

    pairs = [[query, cand] for cand in candidate_texts]
    rerank_scores = reranker.predict(pairs)

    best_first = np.argsort(rerank_scores)[::-1]
    final_docs = [candidate_texts[i] for i in best_first[:FINAL_K]]

    return final_docs, candidate_texts
292
+
293
+ # ======================================
294
+ # Generate Answer
295
+ # ======================================
296
def generate_answer(prompt: str):
    """Send ``prompt`` to the Groq chat-completions API and return the reply text.

    The API key is read from the ``GROQ_API_KEY`` environment variable.

    Returns:
        The stripped reply, or a string starting with ``"ERROR:"`` when the
        key is missing or the request fails. An empty string is returned
        when the response carries no text.
    """
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        return "ERROR: GROQ_API_KEY not set"

    from groq import Groq

    try:
        client = Groq(api_key=api_key)
        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": "You are a precise assistant. Answer using only the given context."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=700
        )
    except Exception as e:
        # Report failures with the same "ERROR:" convention as the missing
        # key case, rather than letting the UI handler crash.
        return f"ERROR: LLM request failed: {e}"

    # Guard against an empty choices list or a missing message body.
    content = response.choices[0].message.content if response.choices else None
    return (content or "").strip()
314
+
315
+ # ======================================
316
+ # Logging Function with All Metrics
317
+ # ======================================
318
def log_query(query: str, context: str, answer: str, all_candidates: List[str], metadata: Dict = None):
    """Score one question/answer pair, record it and return the scores.

    The entry is appended to the in-memory ``evaluation_log`` and written as
    one JSON object per line to ``rag_logs_<session id>.json``.

    Args:
        query: The user question.
        context: The retrieved chunks joined into one string.
        answer: The generated answer.
        all_candidates: Candidate chunks scored against ``ground_truth_map``
            when it has an entry for this exact query.
        metadata: Optional extra fields stored with the entry. When it has
            ``num_retrieved_chunks``, that value is logged as the chunk count.

    Returns:
        ``(log_entry, retrieval_metrics, context_sim)``.
    """
    global query_counter

    query_counter += 1
    metadata = metadata or {}

    hallucination = evaluator.evaluate_hallucination(answer, context)
    relevance = evaluator.evaluate_relevance(answer, query)
    context_sim = evaluator.evaluate_context_similarity(query, context)

    if query in ground_truth_map:
        relevant_chunk = ground_truth_map[query]
        retrieval_metrics = retrieval_evaluator.evaluate_retrieval(query, all_candidates, [relevant_chunk])
    else:
        # No ground truth for this query: retrieval metrics are unknown.
        retrieval_metrics = {
            "precision_at_5": None,
            "recall_at_5": None,
            "mrr": None,
            "retrieval_quality_score": None,
        }

    # Prefer the caller's exact count: chunks may themselves contain blank
    # lines, so counting "\n\n" separators is only a fallback.
    if "num_retrieved_chunks" in metadata:
        context_chunks = metadata["num_retrieved_chunks"]
    else:
        context_chunks = context.count("\n\n") + 1

    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "session_id": current_session_id,
        "query_id": query_counter,
        "query": query,
        "context_length": len(context),
        "context_chunks": context_chunks,
        "answer_length": len(answer),
        "hallucination_score": hallucination["hallucination_score"],
        "is_hallucinating": hallucination["is_hallucinating"],
        "relevance_score": relevance["relevance_score"],
        "context_similarity": context_sim["context_similarity"],
        # .get(): the evaluator's stopword-only early return may omit this key.
        "jaccard_similarity": context_sim.get("jaccard_similarity", 0.0),
        "query_coverage": context_sim["query_coverage"],
        "precision_at_5": retrieval_metrics.get("precision_at_5"),
        "recall_at_5": retrieval_metrics.get("recall_at_5"),
        "mrr": retrieval_metrics.get("mrr"),
        "retrieval_quality_score": retrieval_metrics.get("retrieval_quality_score"),
        "metadata": metadata
    }

    evaluation_log.append(log_entry)

    # Append mode: the file accumulates one JSON document per line.
    with open(f"rag_logs_{current_session_id}.json", "a", encoding="utf-8") as f:
        json.dump(log_entry, f)
        f.write("\n")

    return log_entry, retrieval_metrics, context_sim
367
+
368
+ # ======================================
369
+ # Main Function
370
+ # ======================================
371
def answer_question(file, query):
    """Answer ``query`` from the uploaded ``file`` and append an evaluation report.

    Steps: extract text, build the hybrid index, retrieve and re-rank
    chunks, ask the LLM, then log and summarise the evaluation metrics.

    Args:
        file: The uploaded document as delivered by the file input.
        query: The user's question.

    Returns:
        The answer followed by the evaluation summary, or a short message
        when the input is missing, too little text could be extracted, or
        nothing could be retrieved.
    """
    if not file:
        return "Please upload a document first."
    if not query or not query.strip():
        return "Please enter a question."
    query = query.strip()

    text = extract_text(file)
    if len(text.strip()) < 50:
        return "Could not extract enough text from the file."

    vectorstore, bm25, texts = build_hybrid_index(text)
    retrieved_docs, all_candidates = hybrid_retrieve(query, vectorstore, bm25, texts)
    if not retrieved_docs:
        # Do not call the LLM with an empty context.
        return "Could not retrieve any relevant passages from the document."

    context = "\n\n".join(retrieved_docs)

    prompt = (
        "Use ONLY the following context to answer the question accurately.\n"
        "If the context does not contain enough information, say so clearly.\n"
        "Context:\n"
        f"{context}\n"
        f"Question: {query}\n"
        "Answer:"
    )

    answer = generate_answer(prompt)

    log_entry, retrieval_metrics, context_sim = log_query(query, context, answer, all_candidates, {
        "num_retrieved_chunks": len(retrieved_docs),
        "total_context_chars": len(context)
    })

    matched_terms = context_sim['matched_terms']
    matched_display = ', '.join(matched_terms[:5]) if matched_terms else 'None'

    eval_summary = (
        "\n\n---\n"
        "=== Evaluation Results ===\n"
        "\n"
        "Generation Quality:\n"
        f"- Hallucination: {log_entry['hallucination_score']} (Good if < 0.3)\n"
        f"- Relevance: {log_entry['relevance_score']} (Good if > 0.5)\n"
        "\n"
        "Retrieval Quality (Context vs Query):\n"
        f"- Context Similarity: {context_sim['context_similarity']} (Good if > 0.4)\n"
        f"- Query Coverage: {context_sim['query_coverage']*100:.0f}%\n"
        f"- Matched Terms: {matched_display}\n"
    )

    # Precision/recall are only available when ground truth exists for the query.
    if retrieval_metrics.get('precision_at_5') is not None:
        eval_summary += (
            "\nPrecision/Recall:\n"
            f"- Precision@5: {retrieval_metrics.get('precision_at_5', 'N/A')}\n"
            f"- Recall@5: {retrieval_metrics.get('recall_at_5', 'N/A')}\n"
            f"- MRR: {retrieval_metrics.get('mrr', 'N/A')}\n"
        )

    eval_summary += f"\nQuery #{log_entry['query_id']} | Session: {current_session_id}"

    return answer + eval_summary
426
+
427
+ # ======================================
428
+ # Dashboard Functions
429
+ # ======================================
430
def show_summary():
    """Return a text report aggregating every entry in ``evaluation_log``.

    Returns:
        Averages for the generation, retrieval and usage metrics followed by
        the five most recent queries, or a placeholder message when nothing
        has been logged yet.
    """
    if not evaluation_log:
        return "No data yet. Ask some questions first!"

    df = pd.DataFrame(evaluation_log)

    avg_hallucination = df['hallucination_score'].mean()
    avg_relevance = df['relevance_score'].mean()
    avg_context_sim = df['context_similarity'].mean()
    hallucination_rate = (df['is_hallucinating'].sum() / len(df)) * 100

    summary = (
        "\n=== RAG System Performance Summary ===\n"
        "\n"
        f"Session ID: {current_session_id}\n"
        f"Total Queries: {len(df)}\n"
        "\n"
        "Generation Quality:\n"
        f"- Avg Hallucination: {avg_hallucination:.3f}\n"
        f"- Hallucination Rate: {hallucination_rate:.1f}%\n"
        f"- Avg Relevance: {avg_relevance:.3f}\n"
        "\n"
        "Retrieval Quality:\n"
        f"- Avg Context Similarity: {avg_context_sim:.3f}\n"
        "\n"
        "Usage Statistics:\n"
        f"- Avg Context Length: {df['context_length'].mean():.0f} chars\n"
        f"- Avg Answer Length: {df['answer_length'].mean():.0f} chars\n"
        f"- Avg Chunks per Query: {df['context_chunks'].mean():.1f}\n"
        "\n"
        "Recent Queries:\n"
    )

    for _, row in df.tail(5).iterrows():
        query_text = row['query']
        # Only mark the query as truncated when it really was.
        preview = (query_text[:40] + "...") if len(query_text) > 40 else query_text
        # Emitted as list items so each query stays on its own line when the
        # report is shown in a Markdown component.
        summary += (
            f"\n- Q{row['query_id']}: {preview}"
            f" | H:{row['hallucination_score']:.2f}"
            f" | Rel:{row['relevance_score']:.2f}"
            f" | Ctx:{row['context_similarity']:.2f}"
        )

    return summary
467
+
468
def export_data():
    """Write ``evaluation_log`` to ``rag_export_<session id>.csv``.

    Returns:
        The path of the CSV file, or ``None`` when there is nothing to export.
    """
    if not evaluation_log:
        return None

    csv_path = f"rag_export_{current_session_id}.csv"
    pd.DataFrame(evaluation_log).to_csv(csv_path, index=False)
    return csv_path
476
+
477
def reset_logs():
    """Clear the in-memory evaluation log and restart query numbering.

    Only the module-level ``evaluation_log`` and ``query_counter`` are
    reset; the session id and any log file already on disk are untouched.

    Returns:
        A confirmation message for display in the UI.
    """
    global evaluation_log, query_counter
    evaluation_log = []
    query_counter = 0
    return "Logs reset. Starting fresh!"
482
+
483
+ # ======================================
484
+ # Gradio UI
485
+ # ======================================
486
# Two tabs: "Chat" answers questions about an uploaded document, and
# "Analytics" reports on the queries logged so far.
# NOTE(review): the nesting below was reconstructed from the component order;
# confirm the layout against the running app.
with gr.Blocks(title="Hybrid RAG with Evaluation", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Hybrid RAG Chatbot")
    gr.Markdown("Hybrid Search + Re-ranking + Complete RAG Evaluation")

    with gr.Tabs():
        with gr.TabItem("Chat"):
            with gr.Row():
                with gr.Column(scale=1):
                    file_input = gr.File(label="Upload PDF, DOCX or CSV", file_types=[".pdf", ".docx", ".csv"])
                    query_input = gr.Textbox(label="Ask a Question", placeholder="What is this document about?", lines=2)
                    btn = gr.Button("Get Answer", variant="primary")

                with gr.Column(scale=2):
                    output = gr.Textbox(label="Answer", lines=30)

            btn.click(
                fn=answer_question,
                inputs=[file_input, query_input],
                outputs=output
            )

        with gr.TabItem("Analytics"):
            gr.Markdown("## RAG System Analytics Dashboard")

            summary_output = gr.Markdown("No data yet. Ask some questions first!")

            with gr.Row():
                refresh_btn = gr.Button("Refresh Summary", variant="primary")
                export_btn = gr.Button("Export CSV", variant="secondary")
                reset_btn = gr.Button("Reset Logs", variant="stop")

            refresh_btn.click(fn=show_summary, outputs=summary_output)
            reset_btn.click(fn=reset_logs, outputs=summary_output)

            def export_and_show():
                """Export the log to CSV and return a status line for the summary panel."""
                path = export_data()
                return f"Exported to: {path}" if path else "No data to export"

            export_btn.click(fn=export_and_show, outputs=summary_output)

            gr.Markdown(
                "\nMetrics Explained:\n"
                "\n"
                "- Hallucination: Lower is better (< 0.3 = Good)\n"
                "- Relevance: Higher is better (> 0.5 = Good)\n"
                "- Context Similarity: Higher is better (> 0.4 = Good)\n"
                "- Query Coverage: % of question words found in context\n"
            )

if __name__ == "__main__":
    # Listen on all interfaces at port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)