muddasser committed on
Commit
5b3d991
·
verified ·
1 Parent(s): c95a2db

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +185 -111
app.py CHANGED
@@ -5,8 +5,9 @@ import docx2txt
5
  import numpy as np
6
  import os
7
  import json
 
8
  from datetime import datetime
9
- from typing import Dict, List
10
 
11
  # Hybrid + Re-ranking imports
12
  from rank_bm25 import BM25Okapi
@@ -42,90 +43,116 @@ ground_truth_map = {}
42
  print("Models loaded successfully!")
43
 
44
  # ======================================
45
- # Retrieval Quality Evaluator
46
  # ======================================
47
  class RetrievalEvaluator:
48
- """Evaluates retrieval quality: Precision@K, Recall@K, MRR"""
49
 
50
  @staticmethod
51
- def precision_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int = None) -> float:
52
- if k is None:
53
- k = len(retrieved_chunks)
54
-
55
  top_k = retrieved_chunks[:k]
56
  relevant_set = set(relevant_chunks)
57
-
58
  relevant_retrieved = sum(1 for chunk in top_k if chunk in relevant_set)
59
- return relevant_retrieved / k if k > 0 else 0.0
60
 
61
  @staticmethod
62
- def recall_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int = None) -> float:
63
- if k is None:
64
- k = len(retrieved_chunks)
65
-
66
  top_k = retrieved_chunks[:k]
67
  relevant_set = set(relevant_chunks)
68
-
69
  relevant_retrieved = sum(1 for chunk in top_k if chunk in relevant_set)
70
  total_relevant = len(relevant_set)
71
-
72
  return relevant_retrieved / total_relevant if total_relevant > 0 else 0.0
73
 
74
  @staticmethod
75
  def mrr(retrieved_chunks: List[str], relevant_chunks: List[str]) -> float:
 
76
  relevant_set = set(relevant_chunks)
77
-
78
  for i, chunk in enumerate(retrieved_chunks, start=1):
79
  if chunk in relevant_set:
80
  return 1.0 / i
81
-
82
  return 0.0
83
 
84
  @staticmethod
85
- def ndcg_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int = None) -> float:
86
- if k is None:
87
- k = len(retrieved_chunks)
88
-
 
 
 
 
 
89
  relevant_set = set(relevant_chunks)
90
 
 
91
  dcg = 0.0
92
  for i, chunk in enumerate(retrieved_chunks[:k], start=1):
93
  if chunk in relevant_set:
94
  dcg += 1.0 / np.log2(i + 1)
95
 
 
96
  ideal_relevant = min(len(relevant_set), k)
97
  idcg = sum(1.0 / np.log2(i + 1) for i in range(1, ideal_relevant + 1))
98
 
99
  return dcg / idcg if idcg > 0 else 0.0
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  def evaluate_retrieval(self, query: str, retrieved_chunks: List[str], relevant_chunks: List[str]) -> Dict:
 
102
  if not relevant_chunks:
103
  return {
104
- "precision_at_1": None,
105
- "precision_at_3": None,
106
- "precision_at_5": None,
107
- "recall_at_5": None,
108
- "recall_at_10": None,
109
- "mrr": None,
110
- "ndcg_at_5": None,
111
  "retrieval_quality_score": None,
112
  }
113
 
114
  metrics = {
115
- "precision_at_1": round(self.precision_at_k(retrieved_chunks, relevant_chunks, k=1), 3),
116
- "precision_at_3": round(self.precision_at_k(retrieved_chunks, relevant_chunks, k=3), 3),
117
- "precision_at_5": round(self.precision_at_k(retrieved_chunks, relevant_chunks, k=5), 3),
118
- "recall_at_5": round(self.recall_at_k(retrieved_chunks, relevant_chunks, k=5), 3),
119
- "recall_at_10": round(self.recall_at_k(retrieved_chunks, relevant_chunks, k=10), 3),
 
 
 
 
 
 
 
120
  "mrr": round(self.mrr(retrieved_chunks, relevant_chunks), 3),
121
- "ndcg_at_5": round(self.ndcg_at_k(retrieved_chunks, relevant_chunks, k=5), 3),
 
122
  }
123
 
 
124
  metrics["retrieval_quality_score"] = round(
125
- (metrics["precision_at_5"] * 0.3 +
126
- metrics["recall_at_5"] * 0.3 +
127
  metrics["mrr"] * 0.2 +
128
- metrics["ndcg_at_5"] * 0.2), 3
 
129
  )
130
 
131
  return metrics
@@ -133,12 +160,12 @@ class RetrievalEvaluator:
133
  retrieval_evaluator = RetrievalEvaluator()
134
 
135
  # ======================================
136
- # RAG Evaluator (Hallucination, Relevance, Context Similarity)
137
  # ======================================
138
  class RAGEvaluator:
139
  @staticmethod
140
  def evaluate_hallucination(answer: str, context: str) -> dict:
141
- """Hallucination score: % of claims not supported by context"""
142
  answer_sentences = [s.strip() for s in answer.split('.') if len(s.strip()) > 10]
143
  context_lower = context.lower()
144
 
@@ -158,13 +185,14 @@ class RAGEvaluator:
158
 
159
  return {
160
  "hallucination_score": round(hallucination_score, 3),
 
161
  "is_hallucinating": hallucination_score > 0.3,
162
  "potential_hallucinations": unsupported_claims[:3]
163
  }
164
 
165
  @staticmethod
166
- def evaluate_relevance(answer: str, query: str) -> dict:
167
- """Relevance score: word overlap between answer and question"""
168
  stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
169
  'what', 'how', 'why', 'when', 'where', 'is', 'are', 'was', 'were', 'be', 'been'}
170
 
@@ -184,8 +212,8 @@ class RAGEvaluator:
184
  }
185
 
186
  @staticmethod
187
- def evaluate_context_similarity(query: str, context: str) -> dict:
188
- """Context Similarity: measures how well retrieved context matches query"""
189
  query_words = set(query.lower().split())
190
  context_words = set(context.lower().split())
191
 
@@ -206,11 +234,26 @@ class RAGEvaluator:
206
 
207
  return {
208
  "context_similarity": round(context_score, 3),
 
209
  "jaccard_similarity": round(jaccard_similarity, 3),
210
  "query_coverage": round(coverage, 3),
211
  "matched_terms": list(query_clean.intersection(context_clean))[:10],
212
  "missing_terms": list(query_clean - context_clean)[:10]
213
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
  evaluator = RAGEvaluator()
216
 
@@ -270,6 +313,8 @@ def hybrid_retrieve(query: str, vectorstore, bm25, texts):
270
  if not vectorstore or not bm25:
271
  return [], []
272
 
 
 
273
  vector_results = vectorstore.similarity_search(query, k=RETRIEVE_K)
274
  vector_texts = [doc.page_content for doc in vector_results]
275
 
@@ -288,7 +333,9 @@ def hybrid_retrieve(query: str, vectorstore, bm25, texts):
288
  sorted_indices = np.argsort(rerank_scores)[::-1]
289
  final_docs = [candidate_texts[i] for i in sorted_indices[:FINAL_K]]
290
 
291
- return final_docs, candidate_texts
 
 
292
 
293
  # ======================================
294
  # Generate Answer
@@ -296,11 +343,12 @@ def hybrid_retrieve(query: str, vectorstore, bm25, texts):
296
  def generate_answer(prompt: str):
297
  api_key = os.getenv("GROQ_API_KEY")
298
  if not api_key:
299
- return "ERROR: GROQ_API_KEY not set"
300
 
301
  from groq import Groq
302
  client = Groq(api_key=api_key)
303
 
 
304
  response = client.chat.completions.create(
305
  model="llama-3.3-70b-versatile",
306
  messages=[
@@ -310,19 +358,23 @@ def generate_answer(prompt: str):
310
  temperature=0.3,
311
  max_tokens=700
312
  )
313
- return response.choices[0].message.content.strip()
 
 
314
 
315
  # ======================================
316
  # Logging Function with All Metrics
317
  # ======================================
318
- def log_query(query: str, context: str, answer: str, all_candidates: List[str], metadata: Dict = None):
 
319
  global query_counter
320
 
321
  query_counter += 1
322
 
323
  hallucination = evaluator.evaluate_hallucination(answer, context)
324
- relevance = evaluator.evaluate_relevance(answer, query)
325
- context_sim = evaluator.evaluate_context_similarity(query, context)
 
326
 
327
  retrieval_metrics = {}
328
  if query in ground_truth_map:
@@ -330,9 +382,10 @@ def log_query(query: str, context: str, answer: str, all_candidates: List[str],
330
  retrieval_metrics = retrieval_evaluator.evaluate_retrieval(query, all_candidates, [relevant_chunk])
331
  else:
332
  retrieval_metrics = {
333
- "precision_at_5": None,
334
- "recall_at_5": None,
335
- "mrr": None,
 
336
  "retrieval_quality_score": None,
337
  }
338
 
@@ -344,15 +397,27 @@ def log_query(query: str, context: str, answer: str, all_candidates: List[str],
344
  "context_length": len(context),
345
  "context_chunks": context.count("\n\n") + 1,
346
  "answer_length": len(answer),
 
347
  "hallucination_score": hallucination["hallucination_score"],
 
348
  "is_hallucinating": hallucination["is_hallucinating"],
349
  "relevance_score": relevance["relevance_score"],
350
- "context_similarity": context_sim["context_similarity"],
351
- "jaccard_similarity": context_sim["jaccard_similarity"],
352
- "query_coverage": context_sim["query_coverage"],
 
 
 
 
 
 
 
353
  "precision_at_5": retrieval_metrics.get("precision_at_5"),
354
  "recall_at_5": retrieval_metrics.get("recall_at_5"),
 
355
  "mrr": retrieval_metrics.get("mrr"),
 
 
356
  "retrieval_quality_score": retrieval_metrics.get("retrieval_quality_score"),
357
  "metadata": metadata or {}
358
  }
@@ -363,7 +428,7 @@ def log_query(query: str, context: str, answer: str, all_candidates: List[str],
363
  json.dump(log_entry, f)
364
  f.write("\n")
365
 
366
- return log_entry, retrieval_metrics, context_sim
367
 
368
  # ======================================
369
  # Main Function
@@ -379,7 +444,7 @@ def answer_question(file, query):
379
  return "Could not extract enough text from the file."
380
 
381
  vectorstore, bm25, texts = build_hybrid_index(text)
382
- retrieved_docs, all_candidates = hybrid_retrieve(query, vectorstore, bm25, texts)
383
 
384
  context = "\n\n".join(retrieved_docs)
385
 
@@ -390,37 +455,40 @@ Context:
390
  Question: {query}
391
  Answer:"""
392
 
393
- answer = generate_answer(prompt)
394
 
395
- log_entry, retrieval_metrics, context_sim = log_query(query, context, answer, all_candidates, {
 
396
  "num_retrieved_chunks": len(retrieved_docs),
397
  "total_context_chars": len(context)
398
  })
399
 
 
400
  eval_summary = f"""
401
 
402
- ---
403
- === Evaluation Results ===
404
-
405
- Generation Quality:
406
- - Hallucination: {log_entry['hallucination_score']} (Good if < 0.3)
407
- - Relevance: {log_entry['relevance_score']} (Good if > 0.5)
408
 
409
- Retrieval Quality (Context vs Query):
410
- - Context Similarity: {context_sim['context_similarity']} (Good if > 0.4)
411
- - Query Coverage: {context_sim['query_coverage']*100:.0f}%
412
- - Matched Terms: {', '.join(context_sim['matched_terms'][:5]) if context_sim['matched_terms'] else 'None'}
413
- """
414
 
415
- if retrieval_metrics.get('precision_at_5') is not None:
416
- eval_summary += f"""
417
- Precision/Recall:
418
- - Precision@5: {retrieval_metrics.get('precision_at_5', 'N/A')}
419
- - Recall@5: {retrieval_metrics.get('recall_at_5', 'N/A')}
420
- - MRR: {retrieval_metrics.get('mrr', 'N/A')}
 
 
 
 
 
 
 
 
421
  """
422
-
423
- eval_summary += f"\nQuery #{log_entry['query_id']} | Session: {current_session_id}"
424
 
425
  return answer + eval_summary
426
 
@@ -433,35 +501,33 @@ def show_summary():
433
 
434
  df = pd.DataFrame(evaluation_log)
435
 
436
- avg_hallucination = df['hallucination_score'].mean()
437
- avg_relevance = df['relevance_score'].mean()
438
- avg_context_sim = df['context_similarity'].mean()
439
- hallucination_rate = (df['is_hallucinating'].sum() / len(df)) * 100
440
-
441
  summary = f"""
442
- === RAG System Performance Summary ===
443
 
444
- Session ID: {current_session_id}
445
- Total Queries: {len(df)}
446
 
447
- Generation Quality:
448
- - Avg Hallucination: {avg_hallucination:.3f}
449
- - Hallucination Rate: {hallucination_rate:.1f}%
450
- - Avg Relevance: {avg_relevance:.3f}
 
451
 
452
- Retrieval Quality:
453
- - Avg Context Similarity: {avg_context_sim:.3f}
 
 
 
 
454
 
455
- Usage Statistics:
456
- - Avg Context Length: {df['context_length'].mean():.0f} chars
457
- - Avg Answer Length: {df['answer_length'].mean():.0f} chars
458
- - Avg Chunks per Query: {df['context_chunks'].mean():.1f}
459
 
460
- Recent Queries:
461
  """
462
-
463
  for _, row in df.tail(5).iterrows():
464
- summary += f"\nQ{row['query_id']}: {row['query'][:40]}... | H:{row['hallucination_score']:.2f} | Rel:{row['relevance_score']:.2f} | Ctx:{row['context_similarity']:.2f}"
465
 
466
  return summary
467
 
@@ -478,14 +544,14 @@ def reset_logs():
478
  global evaluation_log, query_counter
479
  evaluation_log = []
480
  query_counter = 0
481
- return "Logs reset. Starting fresh!"
482
 
483
  # ======================================
484
  # Gradio UI
485
  # ======================================
486
- with gr.Blocks(title="Hybrid RAG with Evaluation", theme=gr.themes.Soft()) as demo:
487
- gr.Markdown("# Hybrid RAG Chatbot")
488
- gr.Markdown("Hybrid Search + Re-ranking + Complete RAG Evaluation")
489
 
490
  with gr.Tabs():
491
  with gr.TabItem("Chat"):
@@ -496,7 +562,7 @@ with gr.Blocks(title="Hybrid RAG with Evaluation", theme=gr.themes.Soft()) as de
496
  btn = gr.Button("Get Answer", variant="primary")
497
 
498
  with gr.Column(scale=2):
499
- output = gr.Textbox(label="Answer", lines=30)
500
 
501
  btn.click(
502
  fn=answer_question,
@@ -507,7 +573,7 @@ with gr.Blocks(title="Hybrid RAG with Evaluation", theme=gr.themes.Soft()) as de
507
  with gr.TabItem("Analytics"):
508
  gr.Markdown("## RAG System Analytics Dashboard")
509
 
510
- summary_output = gr.Markdown("No data yet. Ask some questions first!")
511
 
512
  with gr.Row():
513
  refresh_btn = gr.Button("Refresh Summary", variant="primary")
@@ -519,17 +585,25 @@ with gr.Blocks(title="Hybrid RAG with Evaluation", theme=gr.themes.Soft()) as de
519
 
520
  def export_and_show():
521
  path = export_data()
522
- return f"Exported to: {path}" if path else "No data to export"
523
 
524
  export_btn.click(fn=export_and_show, outputs=summary_output)
525
 
526
  gr.Markdown("""
527
- Metrics Explained:
528
 
529
- - Hallucination: Lower is better (< 0.3 = Good)
530
- - Relevance: Higher is better (> 0.5 = Good)
531
- - Context Similarity: Higher is better (> 0.4 = Good)
532
- - Query Coverage: % of question words found in context
 
 
 
 
 
 
 
 
533
  """)
534
 
535
  if __name__ == "__main__":
 
5
  import numpy as np
6
  import os
7
  import json
8
+ import time
9
  from datetime import datetime
10
+ from typing import Dict, List, Optional
11
 
12
  # Hybrid + Re-ranking imports
13
  from rank_bm25 import BM25Okapi
 
43
  print("Models loaded successfully!")
44
 
45
  # ======================================
46
+ # Industry-Standard Retrieval Quality Evaluator
47
  # ======================================
48
  class RetrievalEvaluator:
49
+ """Evaluates retrieval quality: Precision@K, Recall@K, MRR, NDCG, Hit Rate"""
50
 
51
  @staticmethod
52
+ def precision_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int) -> float:
53
+ """Precision@K: Of top K retrieved, how many are relevant"""
54
+ if k == 0:
55
+ return 0.0
56
  top_k = retrieved_chunks[:k]
57
  relevant_set = set(relevant_chunks)
 
58
  relevant_retrieved = sum(1 for chunk in top_k if chunk in relevant_set)
59
+ return relevant_retrieved / k
60
 
61
  @staticmethod
62
+ def recall_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int) -> float:
63
+ """Recall@K: Of all relevant chunks, how many are in top K"""
 
 
64
  top_k = retrieved_chunks[:k]
65
  relevant_set = set(relevant_chunks)
 
66
  relevant_retrieved = sum(1 for chunk in top_k if chunk in relevant_set)
67
  total_relevant = len(relevant_set)
 
68
  return relevant_retrieved / total_relevant if total_relevant > 0 else 0.0
69
 
70
  @staticmethod
71
  def mrr(retrieved_chunks: List[str], relevant_chunks: List[str]) -> float:
72
+ """Mean Reciprocal Rank: 1 / position of first relevant chunk"""
73
  relevant_set = set(relevant_chunks)
 
74
  for i, chunk in enumerate(retrieved_chunks, start=1):
75
  if chunk in relevant_set:
76
  return 1.0 / i
 
77
  return 0.0
78
 
79
  @staticmethod
80
+ def hit_rate_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int) -> float:
81
+ """Hit Rate@K: Whether at least one relevant chunk appears in top K"""
82
+ top_k = retrieved_chunks[:k]
83
+ relevant_set = set(relevant_chunks)
84
+ return 1.0 if any(chunk in relevant_set for chunk in top_k) else 0.0
85
+
86
+ @staticmethod
87
+ def ndcg_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int) -> float:
88
+ """NDCG@K: Normalized Discounted Cumulative Gain"""
89
  relevant_set = set(relevant_chunks)
90
 
91
+ # DCG
92
  dcg = 0.0
93
  for i, chunk in enumerate(retrieved_chunks[:k], start=1):
94
  if chunk in relevant_set:
95
  dcg += 1.0 / np.log2(i + 1)
96
 
97
+ # IDCG (ideal DCG)
98
  ideal_relevant = min(len(relevant_set), k)
99
  idcg = sum(1.0 / np.log2(i + 1) for i in range(1, ideal_relevant + 1))
100
 
101
  return dcg / idcg if idcg > 0 else 0.0
102
 
103
+ @staticmethod
104
+ def average_precision(retrieved_chunks: List[str], relevant_chunks: List[str]) -> float:
105
+ """Average Precision: Average of precision at each relevant chunk position"""
106
+ relevant_set = set(relevant_chunks)
107
+ if not relevant_set:
108
+ return 0.0
109
+
110
+ precisions = []
111
+ relevant_found = 0
112
+
113
+ for i, chunk in enumerate(retrieved_chunks, start=1):
114
+ if chunk in relevant_set:
115
+ relevant_found += 1
116
+ precisions.append(relevant_found / i)
117
+
118
+ return sum(precisions) / len(relevant_set) if precisions else 0.0
119
+
120
  def evaluate_retrieval(self, query: str, retrieved_chunks: List[str], relevant_chunks: List[str]) -> Dict:
121
+ """Calculate all retrieval metrics"""
122
  if not relevant_chunks:
123
  return {
124
+ "precision_at_1": None, "precision_at_3": None, "precision_at_5": None,
125
+ "recall_at_5": None, "recall_at_10": None,
126
+ "hit_rate_at_1": None, "hit_rate_at_3": None, "hit_rate_at_5": None,
127
+ "mrr": None, "ndcg_at_5": None, "map_score": None,
 
 
 
128
  "retrieval_quality_score": None,
129
  }
130
 
131
  metrics = {
132
+ # Precision
133
+ "precision_at_1": round(self.precision_at_k(retrieved_chunks, relevant_chunks, 1), 3),
134
+ "precision_at_3": round(self.precision_at_k(retrieved_chunks, relevant_chunks, 3), 3),
135
+ "precision_at_5": round(self.precision_at_k(retrieved_chunks, relevant_chunks, 5), 3),
136
+ # Recall
137
+ "recall_at_5": round(self.recall_at_k(retrieved_chunks, relevant_chunks, 5), 3),
138
+ "recall_at_10": round(self.recall_at_k(retrieved_chunks, relevant_chunks, 10), 3),
139
+ # Hit Rate
140
+ "hit_rate_at_1": round(self.hit_rate_at_k(retrieved_chunks, relevant_chunks, 1), 3),
141
+ "hit_rate_at_3": round(self.hit_rate_at_k(retrieved_chunks, relevant_chunks, 3), 3),
142
+ "hit_rate_at_5": round(self.hit_rate_at_k(retrieved_chunks, relevant_chunks, 5), 3),
143
+ # Ranking metrics
144
  "mrr": round(self.mrr(retrieved_chunks, relevant_chunks), 3),
145
+ "ndcg_at_5": round(self.ndcg_at_k(retrieved_chunks, relevant_chunks, 5), 3),
146
+ "map_score": round(self.average_precision(retrieved_chunks, relevant_chunks), 3),
147
  }
148
 
149
+ # Overall retrieval quality score (weighted average)
150
  metrics["retrieval_quality_score"] = round(
151
+ (metrics["precision_at_5"] * 0.25 +
152
+ metrics["recall_at_5"] * 0.25 +
153
  metrics["mrr"] * 0.2 +
154
+ metrics["ndcg_at_5"] * 0.15 +
155
+ metrics["map_score"] * 0.15), 3
156
  )
157
 
158
  return metrics
 
160
  retrieval_evaluator = RetrievalEvaluator()
161
 
162
  # ======================================
163
+ # Industry-Standard RAG Evaluator
164
  # ======================================
165
  class RAGEvaluator:
166
  @staticmethod
167
  def evaluate_hallucination(answer: str, context: str) -> dict:
168
+ """Faithfulness/Hallucination: % of claims not supported by context"""
169
  answer_sentences = [s.strip() for s in answer.split('.') if len(s.strip()) > 10]
170
  context_lower = context.lower()
171
 
 
185
 
186
  return {
187
  "hallucination_score": round(hallucination_score, 3),
188
+ "faithfulness_score": round(1 - hallucination_score, 3), # Industry standard
189
  "is_hallucinating": hallucination_score > 0.3,
190
  "potential_hallucinations": unsupported_claims[:3]
191
  }
192
 
193
  @staticmethod
194
+ def evaluate_answer_relevance(answer: str, query: str) -> dict:
195
+ """Answer Relevance: How well answer addresses the question"""
196
  stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
197
  'what', 'how', 'why', 'when', 'where', 'is', 'are', 'was', 'were', 'be', 'been'}
198
 
 
212
  }
213
 
214
  @staticmethod
215
+ def evaluate_context_relevance(query: str, context: str) -> dict:
216
+ """Context Relevance: How well retrieved context matches query"""
217
  query_words = set(query.lower().split())
218
  context_words = set(context.lower().split())
219
 
 
234
 
235
  return {
236
  "context_similarity": round(context_score, 3),
237
+ "context_relevance_score": round(context_score, 3), # Industry standard name
238
  "jaccard_similarity": round(jaccard_similarity, 3),
239
  "query_coverage": round(coverage, 3),
240
  "matched_terms": list(query_clean.intersection(context_clean))[:10],
241
  "missing_terms": list(query_clean - context_clean)[:10]
242
  }
243
+
244
+ @staticmethod
245
+ def evaluate_answer_completeness(answer: str, expected_length: int = 50) -> dict:
246
+ """Answer Completeness: Length and structure of answer"""
247
+ words = answer.split()
248
+ sentences = answer.count('.')
249
+
250
+ return {
251
+ "answer_length_words": len(words),
252
+ "answer_length_chars": len(answer),
253
+ "sentence_count": sentences,
254
+ "is_complete": len(words) > expected_length,
255
+ "completeness_score": min(1.0, len(words) / expected_length)
256
+ }
257
 
258
  evaluator = RAGEvaluator()
259
 
 
313
  if not vectorstore or not bm25:
314
  return [], []
315
 
316
+ start_time = time.time()
317
+
318
  vector_results = vectorstore.similarity_search(query, k=RETRIEVE_K)
319
  vector_texts = [doc.page_content for doc in vector_results]
320
 
 
333
  sorted_indices = np.argsort(rerank_scores)[::-1]
334
  final_docs = [candidate_texts[i] for i in sorted_indices[:FINAL_K]]
335
 
336
+ retrieval_time = time.time() - start_time
337
+
338
+ return final_docs, candidate_texts, retrieval_time
339
 
340
  # ======================================
341
  # Generate Answer
 
343
  def generate_answer(prompt: str):
344
  api_key = os.getenv("GROQ_API_KEY")
345
  if not api_key:
346
+ return "ERROR: GROQ_API_KEY not set", 0
347
 
348
  from groq import Groq
349
  client = Groq(api_key=api_key)
350
 
351
+ start_time = time.time()
352
  response = client.chat.completions.create(
353
  model="llama-3.3-70b-versatile",
354
  messages=[
 
358
  temperature=0.3,
359
  max_tokens=700
360
  )
361
+ generation_time = time.time() - start_time
362
+
363
+ return response.choices[0].message.content.strip(), generation_time
364
 
365
  # ======================================
366
  # Logging Function with All Metrics
367
  # ======================================
368
+ def log_query(query: str, context: str, answer: str, all_candidates: List[str],
369
+ retrieval_time: float, generation_time: float, metadata: Dict = None):
370
  global query_counter
371
 
372
  query_counter += 1
373
 
374
  hallucination = evaluator.evaluate_hallucination(answer, context)
375
+ relevance = evaluator.evaluate_answer_relevance(answer, query)
376
+ context_rel = evaluator.evaluate_context_relevance(query, context)
377
+ completeness = evaluator.evaluate_answer_completeness(answer)
378
 
379
  retrieval_metrics = {}
380
  if query in ground_truth_map:
 
382
  retrieval_metrics = retrieval_evaluator.evaluate_retrieval(query, all_candidates, [relevant_chunk])
383
  else:
384
  retrieval_metrics = {
385
+ "precision_at_1": None, "precision_at_3": None, "precision_at_5": None,
386
+ "recall_at_5": None, "recall_at_10": None,
387
+ "hit_rate_at_1": None, "hit_rate_at_3": None, "hit_rate_at_5": None,
388
+ "mrr": None, "ndcg_at_5": None, "map_score": None,
389
  "retrieval_quality_score": None,
390
  }
391
 
 
397
  "context_length": len(context),
398
  "context_chunks": context.count("\n\n") + 1,
399
  "answer_length": len(answer),
400
+ # Generation metrics
401
  "hallucination_score": hallucination["hallucination_score"],
402
+ "faithfulness_score": hallucination["faithfulness_score"],
403
  "is_hallucinating": hallucination["is_hallucinating"],
404
  "relevance_score": relevance["relevance_score"],
405
+ "context_similarity": context_rel["context_similarity"],
406
+ "context_relevance_score": context_rel["context_relevance_score"],
407
+ "query_coverage": context_rel["query_coverage"],
408
+ "answer_completeness": completeness["completeness_score"],
409
+ "answer_word_count": completeness["answer_length_words"],
410
+ # Latency metrics
411
+ "retrieval_time_sec": round(retrieval_time, 3),
412
+ "generation_time_sec": round(generation_time, 3),
413
+ "total_latency_sec": round(retrieval_time + generation_time, 3),
414
+ # Retrieval metrics
415
  "precision_at_5": retrieval_metrics.get("precision_at_5"),
416
  "recall_at_5": retrieval_metrics.get("recall_at_5"),
417
+ "hit_rate_at_5": retrieval_metrics.get("hit_rate_at_5"),
418
  "mrr": retrieval_metrics.get("mrr"),
419
+ "ndcg_at_5": retrieval_metrics.get("ndcg_at_5"),
420
+ "map_score": retrieval_metrics.get("map_score"),
421
  "retrieval_quality_score": retrieval_metrics.get("retrieval_quality_score"),
422
  "metadata": metadata or {}
423
  }
 
428
  json.dump(log_entry, f)
429
  f.write("\n")
430
 
431
+ return log_entry, retrieval_metrics, context_rel
432
 
433
  # ======================================
434
  # Main Function
 
444
  return "Could not extract enough text from the file."
445
 
446
  vectorstore, bm25, texts = build_hybrid_index(text)
447
+ retrieved_docs, all_candidates, retrieval_time = hybrid_retrieve(query, vectorstore, bm25, texts)
448
 
449
  context = "\n\n".join(retrieved_docs)
450
 
 
455
  Question: {query}
456
  Answer:"""
457
 
458
+ answer, generation_time = generate_answer(prompt)
459
 
460
+ log_entry, retrieval_metrics, context_rel = log_query(query, context, answer, all_candidates,
461
+ retrieval_time, generation_time, {
462
  "num_retrieved_chunks": len(retrieved_docs),
463
  "total_context_chars": len(context)
464
  })
465
 
466
+ # Build evaluation summary
467
  eval_summary = f"""
468
 
469
+ === INDUSTRY-STANDARD RAG EVALUATION ===
 
 
 
 
 
470
 
471
+ Generation Quality (RAGAS-style):
472
+ - Faithfulness: {log_entry['faithfulness_score']} (target: > 0.7)
473
+ - Answer Relevance: {log_entry['relevance_score']} (target: > 0.5)
474
+ - Context Relevance: {log_entry['context_relevance_score']} (target: > 0.4)
475
+ - Hallucination: {log_entry['hallucination_score']} (target: < 0.3)
476
 
477
+ Retrieval Quality:
478
+ - Precision@5: {retrieval_metrics.get('precision_at_5', 'N/A')} (target: > 0.6)
479
+ - Recall@5: {retrieval_metrics.get('recall_at_5', 'N/A')} (target: > 0.7)
480
+ - Hit Rate@5: {retrieval_metrics.get('hit_rate_at_5', 'N/A')} (target: > 0.8)
481
+ - MRR: {retrieval_metrics.get('mrr', 'N/A')} (target: > 0.7)
482
+ - NDCG@5: {retrieval_metrics.get('ndcg_at_5', 'N/A')} (target: > 0.7)
483
+ - MAP: {retrieval_metrics.get('map_score', 'N/A')} (target: > 0.6)
484
+
485
+ Performance Metrics:
486
+ - Retrieval Latency: {log_entry['retrieval_time_sec']} sec
487
+ - Generation Latency: {log_entry['generation_time_sec']} sec
488
+ - Total Latency: {log_entry['total_latency_sec']} sec
489
+
490
+ Query #{log_entry['query_id']} | Session: {current_session_id}
491
  """
 
 
492
 
493
  return answer + eval_summary
494
 
 
501
 
502
  df = pd.DataFrame(evaluation_log)
503
 
 
 
 
 
 
504
  summary = f"""
505
+ === RAG SYSTEM PERFORMANCE DASHBOARD ===
506
 
507
+ Session: {current_session_id} | Total Queries: {len(df)}
 
508
 
509
+ GENERATION QUALITY (Industry Standards):
510
+ - Avg Faithfulness: {df['faithfulness_score'].mean():.3f} (target > 0.7)
511
+ - Avg Answer Relevance: {df['relevance_score'].mean():.3f} (target > 0.5)
512
+ - Avg Context Relevance: {df['context_relevance_score'].mean():.3f} (target > 0.4)
513
+ - Hallucination Rate: {(df['is_hallucinating'].sum() / len(df)) * 100:.1f}% (target < 30%)
514
 
515
+ RETRIEVAL QUALITY:
516
+ - Avg Precision@5: {df['precision_at_5'].mean():.3f} (target > 0.6)
517
+ - Avg Recall@5: {df['recall_at_5'].mean():.3f} (target > 0.7)
518
+ - Avg Hit Rate@5: {df['hit_rate_at_5'].mean():.3f} (target > 0.8)
519
+ - Avg MRR: {df['mrr'].mean():.3f} (target > 0.7)
520
+ - Avg NDCG@5: {df['ndcg_at_5'].mean():.3f} (target > 0.7)
521
 
522
+ PERFORMANCE:
523
+ - Avg Retrieval Time: {df['retrieval_time_sec'].mean():.2f} sec
524
+ - Avg Generation Time: {df['generation_time_sec'].mean():.2f} sec
525
+ - Avg Total Latency: {df['total_latency_sec'].mean():.2f} sec
526
 
527
+ RECENT QUERIES:
528
  """
 
529
  for _, row in df.tail(5).iterrows():
530
+ summary += f"\nQ{row['query_id']}: {row['query'][:35]}... | F:{row['faithfulness_score']:.2f} | R:{row['relevance_score']:.2f} | Lat:{row['total_latency_sec']:.1f}s"
531
 
532
  return summary
533
 
 
544
  global evaluation_log, query_counter
545
  evaluation_log = []
546
  query_counter = 0
547
+ return "Logs reset."
548
 
549
  # ======================================
550
  # Gradio UI
551
  # ======================================
552
+ with gr.Blocks(title="Enterprise RAG with Industry Metrics", theme=gr.themes.Soft()) as demo:
553
+ gr.Markdown("# Enterprise RAG Chatbot")
554
+ gr.Markdown("Hybrid Search + Re-ranking + Industry-Standard RAG Evaluation (RAGAS, Precision/Recall, Latency)")
555
 
556
  with gr.Tabs():
557
  with gr.TabItem("Chat"):
 
562
  btn = gr.Button("Get Answer", variant="primary")
563
 
564
  with gr.Column(scale=2):
565
+ output = gr.Textbox(label="Answer", lines=35)
566
 
567
  btn.click(
568
  fn=answer_question,
 
573
  with gr.TabItem("Analytics"):
574
  gr.Markdown("## RAG System Analytics Dashboard")
575
 
576
+ summary_output = gr.Markdown("No data yet.")
577
 
578
  with gr.Row():
579
  refresh_btn = gr.Button("Refresh Summary", variant="primary")
 
585
 
586
  def export_and_show():
587
  path = export_data()
588
+ return f"Exported to: {path}" if path else "No data"
589
 
590
  export_btn.click(fn=export_and_show, outputs=summary_output)
591
 
592
  gr.Markdown("""
593
+ ### Industry-Standard Metrics Explained:
594
 
595
+ | Metric | Category | Target | What It Measures |
596
+ |--------|----------|--------|------------------|
597
+ | Faithfulness | Generation | > 0.7 | Answer grounded in context |
598
+ | Answer Relevance | Generation | > 0.5 | Answer addresses question |
599
+ | Context Relevance | Generation | > 0.4 | Retrieved context matches query |
600
+ | Precision@5 | Retrieval | > 0.6 | Accuracy of top 5 chunks |
601
+ | Recall@5 | Retrieval | > 0.7 | Coverage of relevant chunks |
602
+ | Hit Rate@5 | Retrieval | > 0.8 | At least one relevant chunk in top 5 |
603
+ | MRR | Ranking | > 0.7 | First relevant chunk position |
604
+ | NDCG@5 | Ranking | > 0.7 | Quality of ranking order |
605
+ | MAP | Ranking | > 0.6 | Average precision across all ranks |
606
+ | Latency | Performance | < 5 sec | End-to-end response time |
607
  """)
608
 
609
  if __name__ == "__main__":