Update app.py

app.py CHANGED
@@ -1,15 +1,536 @@
(Removed: the previous 15-line stub, a "---" marker followed by 14 blank lines.)

import gradio as gr
import pandas as pd
import pypdf
import docx2txt
import numpy as np
import os
import json
from datetime import datetime
from typing import Dict, List

# Hybrid + Re-ranking imports
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, CrossEncoder
from langchain_text_splitters import RecursiveCharacterTextSplitter

# ======================================
# CONFIG
# ======================================
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
CHUNK_SIZE = 800
CHUNK_OVERLAP = 100
RETRIEVE_K = 15
FINAL_K = 5
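# The retrievers each gather up to RETRIEVE_K candidates; the cross-encoder
# then keeps only the FINAL_K best after re-ranking.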

# ======================================
# Global Variables
# ======================================
print("Loading embedding and reranker models...")

embed_model = SentenceTransformer(EMBED_MODEL)
reranker = CrossEncoder(RERANKER_MODEL)
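# The SentenceTransformer is a bi-encoder (each text is embedded independently,
# so chunks can be indexed once); the CrossEncoder scores (query, passage)
# pairs jointly, which is slower but more accurate, so it is reserved for
# re-ranking a small candidate set.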

# Track evaluation data
evaluation_log = []
query_counter = 0
current_session_id = datetime.now().strftime("%Y%m%d_%H%M%S")

# For retrieval evaluation (ground truth mapping)
ground_truth_map = {}
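# Maps a query string to the exact chunk text known to answer it. Retrieval
# metrics (Precision/Recall/MRR) are only reported for queries present here,
# e.g. (hypothetical entry):
#   ground_truth_map["What is the refund policy?"] = "<exact chunk text>"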

print("Models loaded successfully!")

# ======================================
# Retrieval Quality Evaluator
# ======================================
class RetrievalEvaluator:
    """Evaluates retrieval quality: Precision@K, Recall@K, MRR, nDCG@K"""

    @staticmethod
    def precision_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int = None) -> float:
        if k is None:
            k = len(retrieved_chunks)

        top_k = retrieved_chunks[:k]
        relevant_set = set(relevant_chunks)

        relevant_retrieved = sum(1 for chunk in top_k if chunk in relevant_set)
        return relevant_retrieved / k if k > 0 else 0.0

    @staticmethod
    def recall_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int = None) -> float:
        if k is None:
            k = len(retrieved_chunks)

        top_k = retrieved_chunks[:k]
        relevant_set = set(relevant_chunks)

        relevant_retrieved = sum(1 for chunk in top_k if chunk in relevant_set)
        total_relevant = len(relevant_set)

        return relevant_retrieved / total_relevant if total_relevant > 0 else 0.0

    @staticmethod
    def mrr(retrieved_chunks: List[str], relevant_chunks: List[str]) -> float:
        relevant_set = set(relevant_chunks)

        for i, chunk in enumerate(retrieved_chunks, start=1):
            if chunk in relevant_set:
                return 1.0 / i

        return 0.0
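
    # nDCG@K with binary relevance: DCG = sum over relevant hits of
    # 1 / log2(rank + 1) within the top K, normalized by the ideal ordering
    # (all relevant chunks ranked first).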
    @staticmethod
    def ndcg_at_k(retrieved_chunks: List[str], relevant_chunks: List[str], k: int = None) -> float:
        if k is None:
            k = len(retrieved_chunks)

        relevant_set = set(relevant_chunks)

        dcg = 0.0
        for i, chunk in enumerate(retrieved_chunks[:k], start=1):
            if chunk in relevant_set:
                dcg += 1.0 / np.log2(i + 1)

        ideal_relevant = min(len(relevant_set), k)
        idcg = sum(1.0 / np.log2(i + 1) for i in range(1, ideal_relevant + 1))

        return dcg / idcg if idcg > 0 else 0.0

    def evaluate_retrieval(self, query: str, retrieved_chunks: List[str], relevant_chunks: List[str]) -> Dict:
        if not relevant_chunks:
            return {
                "precision_at_1": None,
                "precision_at_3": None,
                "precision_at_5": None,
                "recall_at_5": None,
                "recall_at_10": None,
                "mrr": None,
                "ndcg_at_5": None,
                "retrieval_quality_score": None,
            }

        metrics = {
            "precision_at_1": round(self.precision_at_k(retrieved_chunks, relevant_chunks, k=1), 3),
            "precision_at_3": round(self.precision_at_k(retrieved_chunks, relevant_chunks, k=3), 3),
            "precision_at_5": round(self.precision_at_k(retrieved_chunks, relevant_chunks, k=5), 3),
            "recall_at_5": round(self.recall_at_k(retrieved_chunks, relevant_chunks, k=5), 3),
            "recall_at_10": round(self.recall_at_k(retrieved_chunks, relevant_chunks, k=10), 3),
            "mrr": round(self.mrr(retrieved_chunks, relevant_chunks), 3),
            "ndcg_at_5": round(self.ndcg_at_k(retrieved_chunks, relevant_chunks, k=5), 3),
        }

        metrics["retrieval_quality_score"] = round(
            (metrics["precision_at_5"] * 0.3 +
             metrics["recall_at_5"] * 0.3 +
             metrics["mrr"] * 0.2 +
             metrics["ndcg_at_5"] * 0.2), 3
        )

        return metrics

retrieval_evaluator = RetrievalEvaluator()
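# Quick sanity check with hypothetical chunks: for retrieved ["a", "b", "c"]
# and relevant ["b"], precision_at_1 = 0.0, mrr = 0.5 and recall_at_5 = 1.0.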

# ======================================
# RAG Evaluator (Hallucination, Relevance, Context Similarity)
# ======================================
class RAGEvaluator:
    @staticmethod
    def evaluate_hallucination(answer: str, context: str) -> dict:
        """Hallucination score: fraction of answer sentences not supported by the context"""
        answer_sentences = [s.strip() for s in answer.split('.') if len(s.strip()) > 10]
        context_lower = context.lower()

        stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were'}

        unsupported_claims = []
        for sent in answer_sentences:
            words = set(sent.lower().split())
            content_words = words - stopwords

            if content_words:
                matches = sum(1 for word in content_words if word in context_lower)
                if matches / len(content_words) < 0.3:
                    unsupported_claims.append(sent[:100])

        hallucination_score = len(unsupported_claims) / len(answer_sentences) if answer_sentences else 0

        return {
            "hallucination_score": round(hallucination_score, 3),
            "is_hallucinating": hallucination_score > 0.3,
            "potential_hallucinations": unsupported_claims[:3]
        }
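
    # A sentence counts as unsupported when fewer than 30% of its content words
    # appear anywhere in the context. This is a cheap lexical proxy, not an
    # entailment check, so treat the score as a heuristic signal.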
    @staticmethod
    def evaluate_relevance(answer: str, query: str) -> dict:
        """Relevance score: word overlap between answer and question"""
        stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
                     'what', 'how', 'why', 'when', 'where', 'is', 'are', 'was', 'were', 'be', 'been'}

        query_words = set(query.lower().split()) - stopwords
        answer_words = set(answer.lower().split()) - stopwords

        if not query_words:
            return {"relevance_score": 0.5, "matched_terms": []}

        matched = query_words.intersection(answer_words)
        relevance = len(matched) / len(query_words)

        return {
            "relevance_score": round(relevance, 3),
            "matched_terms": list(matched)[:10],
            "match_percentage": f"{relevance*100:.1f}%"
        }

    @staticmethod
    def evaluate_context_similarity(query: str, context: str) -> dict:
        """Context similarity: measures how well the retrieved context matches the query"""
        query_words = set(query.lower().split())
        context_words = set(context.lower().split())

        stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
                     'what', 'how', 'why', 'when', 'where', 'is', 'are', 'was', 'were', 'be', 'been'}

        query_clean = query_words - stopwords
        context_clean = context_words - stopwords

        if not query_clean:
            # Include jaccard_similarity here so log_query never hits a
            # KeyError on stopword-only queries.
            return {"context_similarity": 0.5, "jaccard_similarity": 0.0, "query_coverage": 0, "matched_terms": [], "missing_terms": []}

        intersection = len(query_clean.intersection(context_clean))
        union = len(query_clean.union(context_clean))
        jaccard_similarity = intersection / union if union > 0 else 0
        coverage = intersection / len(query_clean)
        context_score = (jaccard_similarity * 0.5 + coverage * 0.5)

        return {
            "context_similarity": round(context_score, 3),
            "jaccard_similarity": round(jaccard_similarity, 3),
            "query_coverage": round(coverage, 3),
            "matched_terms": list(query_clean.intersection(context_clean))[:10],
            "missing_terms": list(query_clean - context_clean)[:10]
        }

evaluator = RAGEvaluator()
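# All three checks above are lexical-overlap heuristics: they need no extra
# model calls, at the cost of missing paraphrases and synonyms.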

# ======================================
# Extract text from uploaded file
# ======================================
def extract_text(file):
    if not file:
        return ""
    filename = file.name.lower()

    try:
        if filename.endswith(".pdf"):
            reader = pypdf.PdfReader(file.name)
            return "\n".join([page.extract_text() or "" for page in reader.pages])

        elif filename.endswith(".docx"):
            return docx2txt.process(file.name)

        elif filename.endswith(".csv"):
            df = pd.read_csv(file.name)
            return df.to_string(index=False)
        else:
            return ""
    except Exception as e:
        return f"Error reading file: {str(e)}"

# ======================================
# Build Hybrid Index
# ======================================
def build_hybrid_index(text: str):
    if not text.strip():
        return None, None, None

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )
    chunks = splitter.split_text(text)
    texts = [chunk for chunk in chunks if chunk.strip()]

    from langchain_community.vectorstores import FAISS
    from langchain_community.embeddings import HuggingFaceEmbeddings

    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
    vectorstore = FAISS.from_texts(texts, embeddings)

    tokenized_corpus = [doc.split() for doc in texts]
    bm25 = BM25Okapi(tokenized_corpus)

    return vectorstore, bm25, texts
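# Two complementary indexes over the same chunks: FAISS for dense semantic
# similarity, BM25 for exact keyword matches that embeddings can miss.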

# ======================================
# Hybrid Search + Re-ranking
# ======================================
def hybrid_retrieve(query: str, vectorstore, bm25, texts):
    if not vectorstore or not bm25:
        return [], []

    vector_results = vectorstore.similarity_search(query, k=RETRIEVE_K)
    vector_texts = [doc.page_content for doc in vector_results]

    bm25_scores = bm25.get_scores(query.split())
    bm25_top_idx = np.argsort(bm25_scores)[::-1][:RETRIEVE_K]
    bm25_texts = [texts[i] for i in bm25_top_idx if i < len(texts)]

    candidate_texts = list(dict.fromkeys(vector_texts + bm25_texts))[:RETRIEVE_K]

    if not candidate_texts:
        return [], []

    pairs = [[query, cand] for cand in candidate_texts]
    rerank_scores = reranker.predict(pairs)

    sorted_indices = np.argsort(rerank_scores)[::-1]
    final_docs = [candidate_texts[i] for i in sorted_indices[:FINAL_K]]

    return final_docs, candidate_texts
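# dict.fromkeys() deduplicates while preserving order (vector hits first), so
# the cross-encoder scores each unique candidate once before the FINAL_K cut.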

# ======================================
# Generate Answer
# ======================================
def generate_answer(prompt: str):
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        return "ERROR: GROQ_API_KEY not set"

    from groq import Groq
    client = Groq(api_key=api_key)

    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
            {"role": "system", "content": "You are a precise assistant. Answer using only the given context."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3,
        max_tokens=700
    )
    return response.choices[0].message.content.strip()
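# GROQ_API_KEY must be set in the environment (on Spaces, as a repository
# secret); the low temperature keeps answers close to the retrieved context.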

# ======================================
# Logging Function with All Metrics
# ======================================
def log_query(query: str, context: str, answer: str, all_candidates: List[str], metadata: Dict = None):
    global query_counter

    query_counter += 1

    hallucination = evaluator.evaluate_hallucination(answer, context)
    relevance = evaluator.evaluate_relevance(answer, query)
    context_sim = evaluator.evaluate_context_similarity(query, context)

    retrieval_metrics = {}
    if query in ground_truth_map:
        relevant_chunk = ground_truth_map[query]
        retrieval_metrics = retrieval_evaluator.evaluate_retrieval(query, all_candidates, [relevant_chunk])
    else:
        retrieval_metrics = {
            "precision_at_5": None,
            "recall_at_5": None,
            "mrr": None,
            "retrieval_quality_score": None,
        }

    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "session_id": current_session_id,
        "query_id": query_counter,
        "query": query,
        "context_length": len(context),
        "context_chunks": context.count("\n\n") + 1,
        "answer_length": len(answer),
        "hallucination_score": hallucination["hallucination_score"],
        "is_hallucinating": hallucination["is_hallucinating"],
        "relevance_score": relevance["relevance_score"],
        "context_similarity": context_sim["context_similarity"],
        "jaccard_similarity": context_sim["jaccard_similarity"],
        "query_coverage": context_sim["query_coverage"],
        "precision_at_5": retrieval_metrics.get("precision_at_5"),
        "recall_at_5": retrieval_metrics.get("recall_at_5"),
        "mrr": retrieval_metrics.get("mrr"),
        "retrieval_quality_score": retrieval_metrics.get("retrieval_quality_score"),
        "metadata": metadata or {}
    }

    evaluation_log.append(log_entry)

    with open(f"rag_logs_{current_session_id}.json", "a") as f:
        json.dump(log_entry, f)
        f.write("\n")

    return log_entry, retrieval_metrics, context_sim
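# Each entry is appended as one JSON object per line (JSON Lines), so the log
# file can be reloaded later with pd.read_json(path, lines=True).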

# ======================================
# Main Function
# ======================================
def answer_question(file, query):
    if not file:
        return "Please upload a document first."
    if not query or not query.strip():
        return "Please enter a question."

    text = extract_text(file)
    if len(text.strip()) < 50:
        return "Could not extract enough text from the file."

    vectorstore, bm25, texts = build_hybrid_index(text)
    retrieved_docs, all_candidates = hybrid_retrieve(query, vectorstore, bm25, texts)

    context = "\n\n".join(retrieved_docs)

    prompt = f"""Use ONLY the following context to answer the question accurately.
If the context does not contain enough information, say so clearly.
Context:
{context}
Question: {query}
Answer:"""

    answer = generate_answer(prompt)

    log_entry, retrieval_metrics, context_sim = log_query(query, context, answer, all_candidates, {
        "num_retrieved_chunks": len(retrieved_docs),
        "total_context_chars": len(context)
    })

    eval_summary = f"""

---
=== Evaluation Results ===

Generation Quality:
- Hallucination: {log_entry['hallucination_score']} (Good if < 0.3)
- Relevance: {log_entry['relevance_score']} (Good if > 0.5)

Retrieval Quality (Context vs Query):
- Context Similarity: {context_sim['context_similarity']} (Good if > 0.4)
- Query Coverage: {context_sim['query_coverage']*100:.0f}%
- Matched Terms: {', '.join(context_sim['matched_terms'][:5]) if context_sim['matched_terms'] else 'None'}
"""

    if retrieval_metrics.get('precision_at_5') is not None:
        eval_summary += f"""
Precision/Recall:
- Precision@5: {retrieval_metrics.get('precision_at_5', 'N/A')}
- Recall@5: {retrieval_metrics.get('recall_at_5', 'N/A')}
- MRR: {retrieval_metrics.get('mrr', 'N/A')}
"""

    eval_summary += f"\nQuery #{log_entry['query_id']} | Session: {current_session_id}"

    return answer + eval_summary
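# Note: the hybrid index is rebuilt from scratch on every question; caching it
# per uploaded file would avoid repeated chunking and embedding work.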

# ======================================
# Dashboard Functions
# ======================================
def show_summary():
    if not evaluation_log:
        return "No data yet. Ask some questions first!"

    df = pd.DataFrame(evaluation_log)

    avg_hallucination = df['hallucination_score'].mean()
    avg_relevance = df['relevance_score'].mean()
    avg_context_sim = df['context_similarity'].mean()
    hallucination_rate = (df['is_hallucinating'].sum() / len(df)) * 100

    summary = f"""
=== RAG System Performance Summary ===

Session ID: {current_session_id}
Total Queries: {len(df)}

Generation Quality:
- Avg Hallucination: {avg_hallucination:.3f}
- Hallucination Rate: {hallucination_rate:.1f}%
- Avg Relevance: {avg_relevance:.3f}

Retrieval Quality:
- Avg Context Similarity: {avg_context_sim:.3f}

Usage Statistics:
- Avg Context Length: {df['context_length'].mean():.0f} chars
- Avg Answer Length: {df['answer_length'].mean():.0f} chars
- Avg Chunks per Query: {df['context_chunks'].mean():.1f}

Recent Queries:
"""

    for _, row in df.tail(5).iterrows():
        summary += f"\nQ{row['query_id']}: {row['query'][:40]}... | H:{row['hallucination_score']:.2f} | Rel:{row['relevance_score']:.2f} | Ctx:{row['context_similarity']:.2f}"

    return summary

def export_data():
    if not evaluation_log:
        return None

    df = pd.DataFrame(evaluation_log)
    csv_path = f"rag_export_{current_session_id}.csv"
    df.to_csv(csv_path, index=False)
    return csv_path

def reset_logs():
    global evaluation_log, query_counter
    evaluation_log = []
    query_counter = 0
    return "Logs reset. Starting fresh!"

# ======================================
# Gradio UI
# ======================================
with gr.Blocks(title="Hybrid RAG with Evaluation", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Hybrid RAG Chatbot")
    gr.Markdown("Hybrid Search + Re-ranking + Complete RAG Evaluation")

    with gr.Tabs():
        with gr.TabItem("Chat"):
            with gr.Row():
                with gr.Column(scale=1):
                    file_input = gr.File(label="Upload PDF, DOCX or CSV", file_types=[".pdf", ".docx", ".csv"])
                    query_input = gr.Textbox(label="Ask a Question", placeholder="What is this document about?", lines=2)
                    btn = gr.Button("Get Answer", variant="primary")

                with gr.Column(scale=2):
                    output = gr.Textbox(label="Answer", lines=30)

            btn.click(
                fn=answer_question,
                inputs=[file_input, query_input],
                outputs=output
            )

        with gr.TabItem("Analytics"):
            gr.Markdown("## RAG System Analytics Dashboard")

            summary_output = gr.Markdown("No data yet. Ask some questions first!")

            with gr.Row():
                refresh_btn = gr.Button("Refresh Summary", variant="primary")
                export_btn = gr.Button("Export CSV", variant="secondary")
                reset_btn = gr.Button("Reset Logs", variant="stop")

            refresh_btn.click(fn=show_summary, outputs=summary_output)
            reset_btn.click(fn=reset_logs, outputs=summary_output)

            def export_and_show():
                path = export_data()
                return f"Exported to: {path}" if path else "No data to export"

            export_btn.click(fn=export_and_show, outputs=summary_output)

            gr.Markdown("""
Metrics Explained:

- Hallucination: Lower is better (< 0.3 = Good)
- Relevance: Higher is better (> 0.5 = Good)
- Context Similarity: Higher is better (> 0.4 = Good)
- Query Coverage: % of question words found in context
""")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
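
Note: judging from the imports, the Space's requirements.txt presumably needs gradio, pandas, pypdf, docx2txt, numpy, rank_bm25, sentence-transformers, langchain-text-splitters, langchain-community, faiss-cpu and groq (package names inferred from the import statements, not confirmed by this diff).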