Update app.py
app.py
CHANGED
@@ -5,8 +5,9 @@ import docx2txt
 import numpy as np
 import os
 import json
+import time
 from datetime import datetime
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 # Hybrid + Re-ranking imports
 from rank_bm25 import BM25Okapi
@@ -42,90 +43,116 @@ ground_truth_map = {}
 print("Models loaded successfully!")
 
 # ======================================
-# Retrieval Quality Evaluator
+# Industry-Standard Retrieval Quality Evaluator
 # ======================================
 class RetrievalEvaluator:
-    """Evaluates retrieval quality: Precision@K, Recall@K, MRR"""
+    """Evaluates retrieval quality: Precision@K, Recall@K, MRR, NDCG, Hit Rate"""
 
     @staticmethod
-    def precision_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int
+    def precision_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int) -> float:
+        """Precision@K: Of top K retrieved, how many are relevant"""
+        if k == 0:
+            return 0.0
         top_k = retrieved_chunks[:k]
         relevant_set = set(relevant_chunks)
         relevant_retrieved = sum(1 for chunk in top_k if chunk in relevant_set)
+        return relevant_retrieved / k
 
     @staticmethod
-    def recall_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int
-        k = len(retrieved_chunks)
+    def recall_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int) -> float:
+        """Recall@K: Of all relevant chunks, how many are in top K"""
         top_k = retrieved_chunks[:k]
         relevant_set = set(relevant_chunks)
         relevant_retrieved = sum(1 for chunk in top_k if chunk in relevant_set)
         total_relevant = len(relevant_set)
         return relevant_retrieved / total_relevant if total_relevant > 0 else 0.0
 
     @staticmethod
     def mrr(retrieved_chunks: List[str], relevant_chunks: List[str]) -> float:
+        """Mean Reciprocal Rank: 1 / position of first relevant chunk"""
        relevant_set = set(relevant_chunks)
         for i, chunk in enumerate(retrieved_chunks, start=1):
             if chunk in relevant_set:
                 return 1.0 / i
         return 0.0
 
     @staticmethod
+    def hit_rate_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int) -> float:
+        """Hit Rate@K: Whether at least one relevant chunk appears in top K"""
+        top_k = retrieved_chunks[:k]
+        relevant_set = set(relevant_chunks)
+        return 1.0 if any(chunk in relevant_set for chunk in top_k) else 0.0
+
+    @staticmethod
+    def ndcg_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int) -> float:
+        """NDCG@K: Normalized Discounted Cumulative Gain"""
         relevant_set = set(relevant_chunks)
 
+        # DCG
         dcg = 0.0
         for i, chunk in enumerate(retrieved_chunks[:k], start=1):
             if chunk in relevant_set:
                 dcg += 1.0 / np.log2(i + 1)
 
+        # IDCG (ideal DCG)
         ideal_relevant = min(len(relevant_set), k)
         idcg = sum(1.0 / np.log2(i + 1) for i in range(1, ideal_relevant + 1))
 
         return dcg / idcg if idcg > 0 else 0.0
 
+    @staticmethod
+    def average_precision(retrieved_chunks: List[str], relevant_chunks: List[str]) -> float:
+        """Average Precision: Average of precision at each relevant chunk position"""
+        relevant_set = set(relevant_chunks)
+        if not relevant_set:
+            return 0.0
+
+        precisions = []
+        relevant_found = 0
+
+        for i, chunk in enumerate(retrieved_chunks, start=1):
+            if chunk in relevant_set:
+                relevant_found += 1
+                precisions.append(relevant_found / i)
+
+        return sum(precisions) / len(relevant_set) if precisions else 0.0
+
     def evaluate_retrieval(self, query: str, retrieved_chunks: List[str], relevant_chunks: List[str]) -> Dict:
+        """Calculate all retrieval metrics"""
         if not relevant_chunks:
             return {
-                "precision_at_1": None,
-                "recall_at_10": None,
-                "mrr": None,
-                "ndcg_at_5": None,
+                "precision_at_1": None, "precision_at_3": None, "precision_at_5": None,
+                "recall_at_5": None, "recall_at_10": None,
+                "hit_rate_at_1": None, "hit_rate_at_3": None, "hit_rate_at_5": None,
+                "mrr": None, "ndcg_at_5": None, "map_score": None,
                 "retrieval_quality_score": None,
             }
 
         metrics = {
+            # Precision
+            "precision_at_1": round(self.precision_at_k(retrieved_chunks, relevant_chunks, 1), 3),
+            "precision_at_3": round(self.precision_at_k(retrieved_chunks, relevant_chunks, 3), 3),
+            "precision_at_5": round(self.precision_at_k(retrieved_chunks, relevant_chunks, 5), 3),
+            # Recall
+            "recall_at_5": round(self.recall_at_k(retrieved_chunks, relevant_chunks, 5), 3),
+            "recall_at_10": round(self.recall_at_k(retrieved_chunks, relevant_chunks, 10), 3),
+            # Hit Rate
+            "hit_rate_at_1": round(self.hit_rate_at_k(retrieved_chunks, relevant_chunks, 1), 3),
+            "hit_rate_at_3": round(self.hit_rate_at_k(retrieved_chunks, relevant_chunks, 3), 3),
+            "hit_rate_at_5": round(self.hit_rate_at_k(retrieved_chunks, relevant_chunks, 5), 3),
+            # Ranking metrics
             "mrr": round(self.mrr(retrieved_chunks, relevant_chunks), 3),
-            "ndcg_at_5": round(self.ndcg_at_k(retrieved_chunks, relevant_chunks,
+            "ndcg_at_5": round(self.ndcg_at_k(retrieved_chunks, relevant_chunks, 5), 3),
+            "map_score": round(self.average_precision(retrieved_chunks, relevant_chunks), 3),
         }
 
+        # Overall retrieval quality score (weighted average)
         metrics["retrieval_quality_score"] = round(
-            (metrics["precision_at_5"] * 0.
-             metrics["recall_at_5"] * 0.
+            (metrics["precision_at_5"] * 0.25 +
+             metrics["recall_at_5"] * 0.25 +
             metrics["mrr"] * 0.2 +
-             metrics["ndcg_at_5"] * 0.
+             metrics["ndcg_at_5"] * 0.15 +
+             metrics["map_score"] * 0.15), 3
         )
 
         return metrics
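A quick sanity check of the metric definitions added in this hunk — illustrative only, not part of the commit; it re-implements the same formulas on a toy ranking instead of importing app.py:

```python
import numpy as np

# Toy ranking: five retrieved chunk IDs; the relevant ones sit at ranks 2 and 4.
retrieved = ["c1", "c2", "c3", "c4", "c5"]
relevant = {"c2", "c4"}

precision_at_5 = sum(c in relevant for c in retrieved[:5]) / 5                    # 2/5 = 0.4
recall_at_5 = sum(c in relevant for c in retrieved[:5]) / len(relevant)           # 2/2 = 1.0
mrr = next((1.0 / i for i, c in enumerate(retrieved, 1) if c in relevant), 0.0)   # 1/2 = 0.5
hit_rate_at_1 = 1.0 if retrieved[0] in relevant else 0.0                          # 0.0

# NDCG@5: discounted gain of hits at ranks 2 and 4 vs. the ideal ordering (ranks 1 and 2).
dcg = sum(1.0 / np.log2(i + 1) for i, c in enumerate(retrieved[:5], 1) if c in relevant)
idcg = sum(1.0 / np.log2(i + 1) for i in range(1, min(len(relevant), 5) + 1))
ndcg_at_5 = dcg / idcg                                                            # ~0.65

# Average precision: precision at each hit (1/2 and 2/4), averaged over the relevant set.
map_score = (1 / 2 + 2 / 4) / len(relevant)                                       # 0.5

# Weighted overall score, mirroring the 0.25 / 0.25 / 0.2 / 0.15 / 0.15 mix above.
quality = (0.25 * precision_at_5 + 0.25 * recall_at_5 + 0.2 * mrr
           + 0.15 * ndcg_at_5 + 0.15 * map_score)
print(round(quality, 3))                                                          # ~0.62
```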
@@ -133,12 +160,12 @@ class RetrievalEvaluator:
 retrieval_evaluator = RetrievalEvaluator()
 
 # ======================================
-# RAG Evaluator
+# Industry-Standard RAG Evaluator
 # ======================================
 class RAGEvaluator:
     @staticmethod
     def evaluate_hallucination(answer: str, context: str) -> dict:
-        """Hallucination
+        """Faithfulness/Hallucination: % of claims not supported by context"""
         answer_sentences = [s.strip() for s in answer.split('.') if len(s.strip()) > 10]
         context_lower = context.lower()
 
@@ -158,13 +185,14 @@ class RAGEvaluator:
 
         return {
             "hallucination_score": round(hallucination_score, 3),
+            "faithfulness_score": round(1 - hallucination_score, 3),  # Industry standard
             "is_hallucinating": hallucination_score > 0.3,
             "potential_hallucinations": unsupported_claims[:3]
         }
 
     @staticmethod
+    def evaluate_answer_relevance(answer: str, query: str) -> dict:
+        """Answer Relevance: How well answer addresses the question"""
         stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
                      'what', 'how', 'why', 'when', 'where', 'is', 'are', 'was', 'were', 'be', 'been'}
 
@@ -184,8 +212,8 @@ class RAGEvaluator:
         }
 
     @staticmethod
-    """Context
+    def evaluate_context_relevance(query: str, context: str) -> dict:
+        """Context Relevance: How well retrieved context matches query"""
         query_words = set(query.lower().split())
         context_words = set(context.lower().split())
 
@@ -206,11 +234,26 @@ class RAGEvaluator:
 
         return {
             "context_similarity": round(context_score, 3),
+            "context_relevance_score": round(context_score, 3),  # Industry standard name
             "jaccard_similarity": round(jaccard_similarity, 3),
             "query_coverage": round(coverage, 3),
             "matched_terms": list(query_clean.intersection(context_clean))[:10],
             "missing_terms": list(query_clean - context_clean)[:10]
         }
+
+    @staticmethod
+    def evaluate_answer_completeness(answer: str, expected_length: int = 50) -> dict:
+        """Answer Completeness: Length and structure of answer"""
+        words = answer.split()
+        sentences = answer.count('.')
+
+        return {
+            "answer_length_words": len(words),
+            "answer_length_chars": len(answer),
+            "sentence_count": sentences,
+            "is_complete": len(words) > expected_length,
+            "completeness_score": min(1.0, len(words) / expected_length)
+        }
 
 evaluator = RAGEvaluator()
 
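The body of evaluate_context_relevance sits mostly outside this hunk's context window, but the keys it returns (jaccard_similarity, query_coverage, matched/missing terms) point to plain lexical overlap between the lowercased query and context tokens. A minimal sketch of that idea, assuming Jaccard = |q∩c| / |q∪c| and coverage = |q∩c| / |q| — a stand-in, not the commit's exact code:

```python
query = "how does hybrid retrieval work"
context = "hybrid retrieval combines bm25 keyword search with dense vector search"

query_words = set(query.lower().split())
context_words = set(context.lower().split())

overlap = query_words & context_words                                   # {"hybrid", "retrieval"}
jaccard_similarity = len(overlap) / len(query_words | context_words)    # shared terms vs. all terms
query_coverage = len(overlap) / len(query_words)                        # fraction of query terms covered
missing_terms = query_words - context_words                             # query terms absent from context

print(sorted(overlap), round(jaccard_similarity, 3), round(query_coverage, 3), sorted(missing_terms))
```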
@@ -270,6 +313,8 @@ def hybrid_retrieve(query: str, vectorstore, bm25, texts):
     if not vectorstore or not bm25:
         return [], []
 
+    start_time = time.time()
+
     vector_results = vectorstore.similarity_search(query, k=RETRIEVE_K)
     vector_texts = [doc.page_content for doc in vector_results]
 
@@ -288,7 +333,9 @@ def hybrid_retrieve(query: str, vectorstore, bm25, texts):
     sorted_indices = np.argsort(rerank_scores)[::-1]
     final_docs = [candidate_texts[i] for i in sorted_indices[:FINAL_K]]
 
+    retrieval_time = time.time() - start_time
+
+    return final_docs, candidate_texts, retrieval_time
 
 # ======================================
 # Generate Answer
@@ -296,11 +343,12 @@ def hybrid_retrieve(query: str, vectorstore, bm25, texts):
 def generate_answer(prompt: str):
     api_key = os.getenv("GROQ_API_KEY")
     if not api_key:
-        return "ERROR: GROQ_API_KEY not set"
+        return "ERROR: GROQ_API_KEY not set", 0
 
     from groq import Groq
     client = Groq(api_key=api_key)
 
+    start_time = time.time()
     response = client.chat.completions.create(
         model="llama-3.3-70b-versatile",
         messages=[
@@ -310,19 +358,23 @@ def generate_answer(prompt: str):
         temperature=0.3,
         max_tokens=700
     )
+    generation_time = time.time() - start_time
+
+    return response.choices[0].message.content.strip(), generation_time
 
 # ======================================
 # Logging Function with All Metrics
 # ======================================
-def log_query(query: str, context: str, answer: str, all_candidates: List[str],
+def log_query(query: str, context: str, answer: str, all_candidates: List[str],
+              retrieval_time: float, generation_time: float, metadata: Dict = None):
     global query_counter
 
     query_counter += 1
 
     hallucination = evaluator.evaluate_hallucination(answer, context)
-    relevance = evaluator.
+    relevance = evaluator.evaluate_answer_relevance(answer, query)
+    context_rel = evaluator.evaluate_context_relevance(query, context)
+    completeness = evaluator.evaluate_answer_completeness(answer)
 
     retrieval_metrics = {}
     if query in ground_truth_map:
@@ -330,9 +382,10 @@ def log_query(query: str, context: str, answer: str, all_candidates: List[str],
         retrieval_metrics = retrieval_evaluator.evaluate_retrieval(query, all_candidates, [relevant_chunk])
     else:
         retrieval_metrics = {
-            "precision_at_5": None,
-            "recall_at_5": None,
+            "precision_at_1": None, "precision_at_3": None, "precision_at_5": None,
+            "recall_at_5": None, "recall_at_10": None,
+            "hit_rate_at_1": None, "hit_rate_at_3": None, "hit_rate_at_5": None,
+            "mrr": None, "ndcg_at_5": None, "map_score": None,
             "retrieval_quality_score": None,
         }
 
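Note that the retrieval metrics are only computed when the query has an entry in ground_truth_map (declared as an empty dict near the top of the file, per the earlier hunk header); otherwise every retrieval field logs as None and the summary prints N/A. The exact lookup line falls outside this hunk, so the shape of an entry below is an assumption:

```python
# Hypothetical ground-truth entry (illustrative; not part of the commit).
# Keys are the literal query strings; the value is the chunk text treated as the single
# relevant chunk, since evaluate_retrieval() is called with [relevant_chunk].
ground_truth_map["What is the refund policy?"] = (
    "Refunds are issued within 30 days of purchase when the item is returned unused."
)
```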
@@ -344,15 +397,27 @@ def log_query(query: str, context: str, answer: str, all_candidates: List[str],
         "context_length": len(context),
         "context_chunks": context.count("\n\n") + 1,
         "answer_length": len(answer),
+        # Generation metrics
         "hallucination_score": hallucination["hallucination_score"],
+        "faithfulness_score": hallucination["faithfulness_score"],
         "is_hallucinating": hallucination["is_hallucinating"],
         "relevance_score": relevance["relevance_score"],
-        "context_similarity":
-        "query_coverage":
+        "context_similarity": context_rel["context_similarity"],
+        "context_relevance_score": context_rel["context_relevance_score"],
+        "query_coverage": context_rel["query_coverage"],
+        "answer_completeness": completeness["completeness_score"],
+        "answer_word_count": completeness["answer_length_words"],
+        # Latency metrics
+        "retrieval_time_sec": round(retrieval_time, 3),
+        "generation_time_sec": round(generation_time, 3),
+        "total_latency_sec": round(retrieval_time + generation_time, 3),
+        # Retrieval metrics
         "precision_at_5": retrieval_metrics.get("precision_at_5"),
         "recall_at_5": retrieval_metrics.get("recall_at_5"),
+        "hit_rate_at_5": retrieval_metrics.get("hit_rate_at_5"),
         "mrr": retrieval_metrics.get("mrr"),
+        "ndcg_at_5": retrieval_metrics.get("ndcg_at_5"),
+        "map_score": retrieval_metrics.get("map_score"),
         "retrieval_quality_score": retrieval_metrics.get("retrieval_quality_score"),
         "metadata": metadata or {}
     }
@@ -363,7 +428,7 @@ def log_query(query: str, context: str, answer: str, all_candidates: List[str],
         json.dump(log_entry, f)
         f.write("\n")
 
-    return log_entry, retrieval_metrics,
+    return log_entry, retrieval_metrics, context_rel
 
 # ======================================
 # Main Function
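Each call appends one JSON object per line (the open() call and file path sit outside this hunk), so the log is JSON Lines and can be pulled back into a DataFrame for offline analysis. A minimal sketch, assuming a file name such as query_log.jsonl (the actual path is not shown in the diff):

```python
import pandas as pd

# JSON Lines: one log_entry dict per line, as written by json.dump(...) + f.write("\n").
df = pd.read_json("query_log.jsonl", lines=True)  # the path is an assumption
print(df[["faithfulness_score", "relevance_score", "total_latency_sec"]].describe())
```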
@@ -379,7 +444,7 @@ def answer_question(file, query):
         return "Could not extract enough text from the file."
 
     vectorstore, bm25, texts = build_hybrid_index(text)
-    retrieved_docs, all_candidates = hybrid_retrieve(query, vectorstore, bm25, texts)
+    retrieved_docs, all_candidates, retrieval_time = hybrid_retrieve(query, vectorstore, bm25, texts)
 
     context = "\n\n".join(retrieved_docs)
 
@@ -390,37 +455,40 @@ Context:
 Question: {query}
 Answer:"""
 
-    answer = generate_answer(prompt)
+    answer, generation_time = generate_answer(prompt)
 
-    log_entry, retrieval_metrics,
+    log_entry, retrieval_metrics, context_rel = log_query(query, context, answer, all_candidates,
+                                                          retrieval_time, generation_time, {
         "num_retrieved_chunks": len(retrieved_docs),
         "total_context_chars": len(context)
     })
 
+    # Build evaluation summary
     eval_summary = f"""
 
-=== Evaluation Results ===
-
-Generation Quality:
-- Hallucination: {log_entry['hallucination_score']} (Good if < 0.3)
-- Relevance: {log_entry['relevance_score']} (Good if > 0.5)
+=== INDUSTRY-STANDARD RAG EVALUATION ===
+
+Generation Quality (RAGAS-style):
+- Faithfulness: {log_entry['faithfulness_score']} (target: > 0.7)
+- Answer Relevance: {log_entry['relevance_score']} (target: > 0.5)
+- Context Relevance: {log_entry['context_relevance_score']} (target: > 0.4)
+- Hallucination: {log_entry['hallucination_score']} (target: < 0.3)
+
+Retrieval Quality:
+- Precision@5: {retrieval_metrics.get('precision_at_5', 'N/A')} (target: > 0.6)
+- Recall@5: {retrieval_metrics.get('recall_at_5', 'N/A')} (target: > 0.7)
+- Hit Rate@5: {retrieval_metrics.get('hit_rate_at_5', 'N/A')} (target: > 0.8)
+- MRR: {retrieval_metrics.get('mrr', 'N/A')} (target: > 0.7)
+- NDCG@5: {retrieval_metrics.get('ndcg_at_5', 'N/A')} (target: > 0.7)
+- MAP: {retrieval_metrics.get('map_score', 'N/A')} (target: > 0.6)
+
+Performance Metrics:
+- Retrieval Latency: {log_entry['retrieval_time_sec']} sec
+- Generation Latency: {log_entry['generation_time_sec']} sec
+- Total Latency: {log_entry['total_latency_sec']} sec
+
+Query #{log_entry['query_id']} | Session: {current_session_id}
 """
-
-    eval_summary += f"\nQuery #{log_entry['query_id']} | Session: {current_session_id}"
 
     return answer + eval_summary
 
@@ -433,35 +501,33 @@ def show_summary():
 
     df = pd.DataFrame(evaluation_log)
 
-    avg_hallucination = df['hallucination_score'].mean()
-    avg_relevance = df['relevance_score'].mean()
-    avg_context_sim = df['context_similarity'].mean()
-    hallucination_rate = (df['is_hallucinating'].sum() / len(df)) * 100
-
     summary = f"""
-=== RAG
-Total Queries: {len(df)}
+=== RAG SYSTEM PERFORMANCE DASHBOARD ===
+
+Session: {current_session_id} | Total Queries: {len(df)}
+
+GENERATION QUALITY (Industry Standards):
+- Avg Faithfulness: {df['faithfulness_score'].mean():.3f} (target > 0.7)
+- Avg Answer Relevance: {df['relevance_score'].mean():.3f} (target > 0.5)
+- Avg Context Relevance: {df['context_relevance_score'].mean():.3f} (target > 0.4)
+- Hallucination Rate: {(df['is_hallucinating'].sum() / len(df)) * 100:.1f}% (target < 30%)
+
+RETRIEVAL QUALITY:
+- Avg Precision@5: {df['precision_at_5'].mean():.3f} (target > 0.6)
+- Avg Recall@5: {df['recall_at_5'].mean():.3f} (target > 0.7)
+- Avg Hit Rate@5: {df['hit_rate_at_5'].mean():.3f} (target > 0.8)
+- Avg MRR: {df['mrr'].mean():.3f} (target > 0.7)
+- Avg NDCG@5: {df['ndcg_at_5'].mean():.3f} (target > 0.7)
+
+PERFORMANCE:
+- Avg Retrieval Time: {df['retrieval_time_sec'].mean():.2f} sec
+- Avg Generation Time: {df['generation_time_sec'].mean():.2f} sec
+- Avg Total Latency: {df['total_latency_sec'].mean():.2f} sec
+
+RECENT QUERIES:
 """
     for _, row in df.tail(5).iterrows():
-        summary += f"\nQ{row['query_id']}: {row['query'][:
+        summary += f"\nQ{row['query_id']}: {row['query'][:35]}... | F:{row['faithfulness_score']:.2f} | R:{row['relevance_score']:.2f} | Lat:{row['total_latency_sec']:.1f}s"
 
     return summary
 
@@ -478,14 +544,14 @@ def reset_logs():
     global evaluation_log, query_counter
     evaluation_log = []
     query_counter = 0
-    return "Logs reset.
+    return "Logs reset."
 
 # ======================================
 # Gradio UI
 # ======================================
-with gr.Blocks(title="Hybrid RAG with Evaluation", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("#
-    gr.Markdown("Hybrid Search + Re-ranking +
+with gr.Blocks(title="Enterprise RAG with Industry Metrics", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# Enterprise RAG Chatbot")
+    gr.Markdown("Hybrid Search + Re-ranking + Industry-Standard RAG Evaluation (RAGAS, Precision/Recall, Latency)")
 
     with gr.Tabs():
         with gr.TabItem("Chat"):
@@ -496,7 +562,7 @@ with gr.Blocks(title="Hybrid RAG with Evaluation", theme=gr.themes.Soft()) as demo:
                     btn = gr.Button("Get Answer", variant="primary")
 
                 with gr.Column(scale=2):
-                    output = gr.Textbox(label="Answer", lines=
+                    output = gr.Textbox(label="Answer", lines=35)
 
             btn.click(
                 fn=answer_question,
@@ -507,7 +573,7 @@ with gr.Blocks(title="Hybrid RAG with Evaluation", theme=gr.themes.Soft()) as demo:
         with gr.TabItem("Analytics"):
             gr.Markdown("## RAG System Analytics Dashboard")
 
-            summary_output = gr.Markdown("No data yet.
+            summary_output = gr.Markdown("No data yet.")
 
             with gr.Row():
                 refresh_btn = gr.Button("Refresh Summary", variant="primary")
@@ -519,17 +585,25 @@ with gr.Blocks(title="Hybrid RAG with Evaluation", theme=gr.themes.Soft()) as demo:
 
            def export_and_show():
                 path = export_data()
-                return f"Exported to: {path}" if path else "No data
+                return f"Exported to: {path}" if path else "No data"
 
             export_btn.click(fn=export_and_show, outputs=summary_output)
 
             gr.Markdown("""
-Metrics Explained:
+### Industry-Standard Metrics Explained:
 
+| Metric | Category | Target | What It Measures |
+|--------|----------|--------|------------------|
+| Faithfulness | Generation | > 0.7 | Answer grounded in context |
+| Answer Relevance | Generation | > 0.5 | Answer addresses question |
+| Context Relevance | Generation | > 0.4 | Retrieved context matches query |
+| Precision@5 | Retrieval | > 0.6 | Accuracy of top 5 chunks |
+| Recall@5 | Retrieval | > 0.7 | Coverage of relevant chunks |
+| Hit Rate@5 | Retrieval | > 0.8 | At least one relevant chunk in top 5 |
+| MRR | Ranking | > 0.7 | First relevant chunk position |
+| NDCG@5 | Ranking | > 0.7 | Quality of ranking order |
+| MAP | Ranking | > 0.6 | Average precision across all ranks |
+| Latency | Performance | < 5 sec | End-to-end response time |
            """)
 
 if __name__ == "__main__":
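The targets in the table above lend themselves to automated checks. A minimal sketch of such a check, assuming the target values listed there (the helper and its use in a test are hypothetical, not part of the commit):

```python
# Hypothetical regression check against the generation-quality targets from the table above.
TARGETS = {"faithfulness_score": 0.7, "relevance_score": 0.5, "context_relevance_score": 0.4}

def meets_targets(log_entry: dict) -> bool:
    # A query passes only if every tracked score exists and clears its target.
    return all(log_entry.get(key) is not None and log_entry[key] >= target
               for key, target in TARGETS.items())

# Example: assert meets_targets(log_entry) inside a smoke test over a fixed query set.
```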