Update app.py

app.py CHANGED

Original version (removed lines marked with -):

@@ -1,3 +1,5 @@
import os
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -11,79 +13,47 @@ from typing import List
import logging
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-import pickle
from dotenv import load_dotenv

-#
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

-#
class SimpleEmbeddings:
-    """Simple TF-IDF based embeddings as fallback"""
-
    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=384, stop_words='english')
        self.fitted = False
-
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
-        """Embed a list of documents"""
        if not self.fitted:
            self.vectorizer.fit(texts)
            self.fitted = True
-
-        embeddings = self.vectorizer.transform(texts)
-        return embeddings.toarray().tolist()
-
    def embed_query(self, text: str) -> List[float]:
-        """Embed a single query"""
        if not self.fitted:
-            # If not fitted, return zero vector
            return [0.0] * 384
-
-        embedding = self.vectorizer.transform([text])
-        return embedding.toarray()[0].tolist()

-#
class RAGAssistant:
    def __init__(self, groq_api_key: str):
-        """Initialize the RAG Assistant with Groq API key"""
        self.groq_api_key = groq_api_key
-
-        # Initialize embeddings with fallback
        self.embeddings = self._init_embeddings()
-
-        self.text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000,
-            chunk_overlap=200,
-            length_function=len
-        )
-
        self.learning_vectorstore = None
        self.code_vectorstore = None
-
-        self.llm = ChatGroq(
-            groq_api_key=groq_api_key,
-            model_name="llama3-70b-8192",
-            temperature=0.1
-        )
-
        self.learning_persist_dir = "./chroma_learning_db"
        self.code_persist_dir = "./chroma_code_db"
-
        self._init_vector_stores()
-
    def _init_embeddings(self):
        try:
            from langchain_huggingface import HuggingFaceEmbeddings
            print("Trying HuggingFace embeddings...")
-            models_to_try = [
-                "all-MiniLM-L6-v2",
-                "paraphrase-MiniLM-L3-v2",
-                "all-mpnet-base-v2"
-            ]
-            for model_name in models_to_try:
                try:
                    embeddings = HuggingFaceEmbeddings(
                        model_name=model_name,
@@ -95,11 +65,9 @@ class RAGAssistant:
            except Exception as e:
                print(f"Failed to load {model_name}: {e}")
        except ImportError:
-            print("HuggingFace
-
-        print("Using TF-IDF embeddings as fallback...")
        return SimpleEmbeddings()
-
    def _init_vector_stores(self):
        try:
            self.learning_vectorstore = Chroma(
@@ -114,14 +82,16 @@ class RAGAssistant:
            )
        except Exception as e:
            logger.error(f"Error initializing vector stores: {str(e)}")
-
-
    def load_documents(self, files: List[str], assistant_type: str) -> str:
        try:
            documents = []
            for file_path in files:
                try:
-                    if file_path.endswith('.pdf'):
                        loader = PyPDFLoader(file_path)
                    else:
                        loader = TextLoader(file_path, encoding='utf-8')
@@ -129,228 +99,178 @@ class RAGAssistant:
                    documents.extend(docs)
                except Exception as e:
                    print(f"Error loading {file_path}: {e}")
            if not documents:
                return "No documents could be loaded. Please check your files."
            chunks = self.text_splitter.split_documents(documents)
            for chunk in chunks:
                chunk.metadata['assistant_type'] = assistant_type
            if assistant_type == "learning":
                self.learning_vectorstore.add_documents(chunks)
                self.learning_vectorstore.persist()
            elif assistant_type == "code":
                self.code_vectorstore.add_documents(chunks)
                self.code_vectorstore.persist()
            return f"Successfully loaded {len(chunks)} chunks from {len(documents)} documents into {assistant_type} assistant."
        except Exception as e:
            logger.error(f"Error loading documents: {str(e)}")
            return f"Error loading documents: {str(e)}"
-
    def get_learning_tutor_response(self, question: str) -> str:
        try:
            if not self.learning_vectorstore:
                return "Please upload some learning materials first."
            qa_chain = RetrievalQA.from_chain_type(
                llm=self.llm,
                chain_type="stuff",
                retriever=self.learning_vectorstore.as_retriever(search_kwargs={"k": 3}),
                return_source_documents=True
            )
            learning_prompt = f"""
-            You are an AI learning assistant
-            Based on the provided
-
-
-            - Provide clear, educational explanations
-            - Use examples when helpful
-            - Reference specific sources when possible
-            - Adapt to the student's level of understanding
-            - Offer additional practice questions or related concepts when relevant
-            - Maintain an encouraging, supportive tone
-
-            Student's question: {question}
            """
            result = qa_chain({"query": learning_prompt})
            response = result['result']
            if result.get('source_documents'):
                response += "\n\n**Sources:**\n"
                for doc in result['source_documents'][:3]:
                    source = doc.metadata.get('source', 'Unknown')
                    response += f"- {Path(source).name}\n"
            return response
        except Exception as e:
            logger.error(f"Error in learning tutor: {str(e)}")
            return f"Error generating response: {str(e)}"
-
    def get_code_helper_response(self, question: str) -> str:
        try:
            if not self.code_vectorstore:
                return "Please upload some code documentation first."
            qa_chain = RetrievalQA.from_chain_type(
                llm=self.llm,
                chain_type="stuff",
                retriever=self.code_vectorstore.as_retriever(search_kwargs={"k": 3}),
                return_source_documents=True
            )
            code_prompt = f"""
-            You are a
-            Based on the
-
-
-            - Provide practical, actionable guidance
-            - Include relevant code snippets with explanations
-            - Reference specific documentation sections when possible
-            - Highlight important considerations (security, performance, errors)
-            - Suggest related APIs or patterns that might be useful
-            - Use clear, technical language appropriate for developers
-
-            Developer's question: {question}
            """
            result = qa_chain({"query": code_prompt})
            response = result['result']
            if result.get('source_documents'):
-                response += "\n\n**
                for doc in result['source_documents'][:3]:
                    source = doc.metadata.get('source', 'Unknown')
                    response += f"- {Path(source).name}\n"
            return response
        except Exception as e:
            logger.error(f"Error in code helper: {str(e)}")
            return f"Error generating response: {str(e)}"

-#
def create_gradio_interface(assistant: RAGAssistant):
    def upload_learning_files(files):
        if not files:
            return "No files uploaded."
-        file_paths = [f.
        return assistant.load_documents(file_paths, "learning")
-
    def upload_code_files(files):
        if not files:
            return "No files uploaded."
-        file_paths = [f.
        return assistant.load_documents(file_paths, "code")
-
    def learning_chat(message, history):
        if not message.strip():
            return history, ""
        response = assistant.get_learning_tutor_response(message)
        history.append((message, response))
        return history, ""
-
    def code_chat(message, history):
        if not message.strip():
            return history, ""
        response = assistant.get_code_helper_response(message)
        history.append((message, response))
        return history, ""
-
    with gr.Blocks(title="RAG-Based Learning & Code Assistant", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🎓 RAG-Based Learning & Code Assistant")
-        gr.Markdown("Upload
-
        with gr.Tabs():
            with gr.TabItem("📚 Learning Tutor"):
                with gr.Row():
                    with gr.Column(scale=1):
-                        learning_files = gr.File(label="Upload
-                        learning_upload_btn = gr.Button("Upload
                        learning_status = gr.Textbox(label="Upload Status", interactive=False)
                    with gr.Column(scale=2):
-                        learning_chatbot = gr.Chatbot(label="
-                        learning_input = gr.Textbox(label="Ask a question", placeholder="e.g., What is
-                        learning_submit = gr.Button("Ask
                learning_upload_btn.click(upload_learning_files, inputs=[learning_files], outputs=[learning_status])
                learning_submit.click(learning_chat, inputs=[learning_input, learning_chatbot], outputs=[learning_chatbot, learning_input])
                learning_input.submit(learning_chat, inputs=[learning_input, learning_chatbot], outputs=[learning_chatbot, learning_input])
-
            with gr.TabItem("💻 Code Documentation Helper"):
                with gr.Row():
                    with gr.Column(scale=1):
-                        code_files = gr.File(label="Upload
-                        code_upload_btn = gr.Button("Upload
                        code_status = gr.Textbox(label="Upload Status", interactive=False)
                    with gr.Column(scale=2):
-                        code_chatbot = gr.Chatbot(label="Code
-                        code_input = gr.Textbox(label="Ask about
-                        code_submit = gr.Button("Ask
                code_upload_btn.click(upload_code_files, inputs=[code_files], outputs=[code_status])
                code_submit.click(code_chat, inputs=[code_input, code_chatbot], outputs=[code_chatbot, code_input])
                code_input.submit(code_chat, inputs=[code_input, code_chatbot], outputs=[code_chatbot, code_input])
-
        gr.Markdown("---")
-        gr.Markdown("
-
    return demo

-#
-class RetrieverEvaluator:
-    """Evaluation class for computing Recall@k and MRR@k"""
-
-    def __init__(self, retriever, ground_truth: dict, k=3):
-        self.retriever = retriever
-        self.ground_truth = ground_truth
-        self.k = k
-
-    def recall_at_k(self):
-        correct = 0
-        for query, relevant_docs in self.ground_truth.items():
-            results = self.retriever.get_relevant_documents(query)
-            retrieved = [Path(doc.metadata.get("source", "")).name for doc in results]
-            if any(doc in retrieved[:self.k] for doc in relevant_docs):
-                correct += 1
-        recall = correct / len(self.ground_truth)
-        print(f"Recall@{self.k}: {recall:.2f}")
-        return recall
-
-    def mean_reciprocal_rank(self):
-        mrr_total = 0
-        for query, relevant_docs in self.ground_truth.items():
-            results = self.retriever.get_relevant_documents(query)
-            retrieved = [Path(doc.metadata.get("source", "")).name for doc in results]
-            for rank, doc in enumerate(retrieved[:self.k], 1):
-                if doc in relevant_docs:
-                    mrr_total += 1 / rank
-                    break
-        mrr = mrr_total / len(self.ground_truth)
-        print(f"MRR@{self.k}: {mrr:.2f}")
-        return mrr
-
-def evaluate_retriever_example(assistant):
-    """Run example evaluation with mock ground truth"""
-    sample_ground_truth = {
-        "What is machine learning?": ["ml_intro.txt"],
-        "What is API authentication?": ["api_guide.pdf"]
-    }
-    if assistant.learning_vectorstore:
-        retriever = assistant.learning_vectorstore.as_retriever(search_kwargs={"k": 3})
-        evaluator = RetrieverEvaluator(retriever, sample_ground_truth, k=3)
-        recall = evaluator.recall_at_k()
-        mrr = evaluator.mean_reciprocal_rank()
-        return f"Evaluation Results:\nRecall@3: {recall:.2f}\nMRR@3: {mrr:.2f}"
-    return "No documents uploaded for evaluation."
-
-# ---------------------- Entry Point ----------------------
def main():
    load_dotenv()
    groq_api_key = os.getenv("GROQ_API_KEY")
    if not groq_api_key:
-        print("
        return
-
-
-    try:
-        assistant = RAGAssistant(groq_api_key)
-
-        # Optional: Run evaluation after docs are uploaded
-        # print(evaluate_retriever_example(assistant))
-
-        demo = create_gradio_interface(assistant)
-        print("Launching app...")
-        demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
-    except Exception as e:
-        logger.error(f"Error starting application: {str(e)}")
-        print(f"Error: {str(e)}")

if __name__ == "__main__":
    main()
Updated version (added lines marked with +):

+# app.py
+
import os
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
...
import logging
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from dotenv import load_dotenv

+# ----------------- Logger Configuration ------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

+# ----------------- Simple TF-IDF Fallback Embeddings ------------------
class SimpleEmbeddings:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=384, stop_words='english')
        self.fitted = False
+
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        if not self.fitted:
            self.vectorizer.fit(texts)
            self.fitted = True
+        return self.vectorizer.transform(texts).toarray().tolist()
+
    def embed_query(self, text: str) -> List[float]:
        if not self.fitted:
            return [0.0] * 384
+        return self.vectorizer.transform([text]).toarray()[0].tolist()

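A quick way to exercise the TF-IDF fallback above (a reviewer sketch, not part of the commit): once fitted, the vector width follows the fitted vocabulary and can be smaller than max_features=384, while the unfitted embed_query path still returns 384 zeros, so the two dimensions only agree after enough vocabulary has been seen.

    emb = SimpleEmbeddings()
    docs = [
        "Transformers process all tokens in parallel.",
        "Recurrent networks process tokens one step at a time.",
    ]
    doc_vecs = emb.embed_documents(docs)   # first call fits the vectorizer
    query_vec = emb.embed_query("How do transformers work?")
    # 2 documents; document and query vectors share the fitted vocabulary width
    print(len(doc_vecs), len(doc_vecs[0]), len(query_vec))
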
+# ----------------- RAG Assistant Class ------------------
class RAGAssistant:
    def __init__(self, groq_api_key: str):
        self.groq_api_key = groq_api_key
        self.embeddings = self._init_embeddings()
+        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        self.learning_vectorstore = None
        self.code_vectorstore = None
+        self.llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama3-70b-8192", temperature=0.1)
        self.learning_persist_dir = "./chroma_learning_db"
        self.code_persist_dir = "./chroma_code_db"
        self._init_vector_stores()
+
    def _init_embeddings(self):
        try:
            from langchain_huggingface import HuggingFaceEmbeddings
            print("Trying HuggingFace embeddings...")
+            for model_name in ["all-MiniLM-L6-v2", "paraphrase-MiniLM-L3-v2", "all-mpnet-base-v2"]:
                try:
                    embeddings = HuggingFaceEmbeddings(
                        model_name=model_name,
...
                except Exception as e:
                    print(f"Failed to load {model_name}: {e}")
        except ImportError:
+            print("HuggingFace not installed. Using fallback TF-IDF.")
        return SimpleEmbeddings()
+
    def _init_vector_stores(self):
        try:
            self.learning_vectorstore = Chroma(
...
            )
        except Exception as e:
            logger.error(f"Error initializing vector stores: {str(e)}")
+
    def load_documents(self, files: List[str], assistant_type: str) -> str:
        try:
            documents = []
+            print("Files received:", files)
+
            for file_path in files:
+                print(f"Trying to load: {file_path}")
                try:
+                    if file_path.lower().endswith('.pdf'):
                        loader = PyPDFLoader(file_path)
                    else:
                        loader = TextLoader(file_path, encoding='utf-8')
...
                    documents.extend(docs)
                except Exception as e:
                    print(f"Error loading {file_path}: {e}")
+                    continue
+
            if not documents:
                return "No documents could be loaded. Please check your files."
+
            chunks = self.text_splitter.split_documents(documents)
+            print(f"Total chunks created: {len(chunks)}")
+
            for chunk in chunks:
                chunk.metadata['assistant_type'] = assistant_type
+
            if assistant_type == "learning":
                self.learning_vectorstore.add_documents(chunks)
                self.learning_vectorstore.persist()
            elif assistant_type == "code":
                self.code_vectorstore.add_documents(chunks)
                self.code_vectorstore.persist()
+
            return f"Successfully loaded {len(chunks)} chunks from {len(documents)} documents into {assistant_type} assistant."
+
        except Exception as e:
            logger.error(f"Error loading documents: {str(e)}")
            return f"Error loading documents: {str(e)}"
+
    def get_learning_tutor_response(self, question: str) -> str:
        try:
            if not self.learning_vectorstore:
                return "Please upload some learning materials first."
+
            qa_chain = RetrievalQA.from_chain_type(
                llm=self.llm,
                chain_type="stuff",
                retriever=self.learning_vectorstore.as_retriever(search_kwargs={"k": 3}),
                return_source_documents=True
            )
+
            learning_prompt = f"""
+            You are an AI learning assistant helping students understand academic concepts.
+            Based on the provided materials, answer the student's question:
+
+            {question}
            """
+
            result = qa_chain({"query": learning_prompt})
            response = result['result']
+
            if result.get('source_documents'):
                response += "\n\n**Sources:**\n"
                for doc in result['source_documents'][:3]:
                    source = doc.metadata.get('source', 'Unknown')
                    response += f"- {Path(source).name}\n"
+
            return response
+
        except Exception as e:
            logger.error(f"Error in learning tutor: {str(e)}")
            return f"Error generating response: {str(e)}"
+
    def get_code_helper_response(self, question: str) -> str:
        try:
            if not self.code_vectorstore:
                return "Please upload some code documentation first."
+
            qa_chain = RetrievalQA.from_chain_type(
                llm=self.llm,
                chain_type="stuff",
                retriever=self.code_vectorstore.as_retriever(search_kwargs={"k": 3}),
                return_source_documents=True
            )
+
            code_prompt = f"""
+            You are a code documentation assistant helping developers with APIs and codebases.
+            Based on the uploaded documentation, answer this question:
+
+            {question}
            """
+
            result = qa_chain({"query": code_prompt})
            response = result['result']
+
            if result.get('source_documents'):
+                response += "\n\n**Sources:**\n"
                for doc in result['source_documents'][:3]:
                    source = doc.metadata.get('source', 'Unknown')
                    response += f"- {Path(source).name}\n"
+
            return response
+
        except Exception as e:
            logger.error(f"Error in code helper: {str(e)}")
            return f"Error generating response: {str(e)}"

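A compatibility note on the two responder methods above: invoking the chain as qa_chain({"query": ...}) is the legacy LangChain calling convention; recent releases expose chains as Runnables and deprecate the direct call in favor of invoke. If the pinned LangChain version is current, the equivalent call would be:

    # Runnable-style call on newer LangChain releases
    result = qa_chain.invoke({"query": learning_prompt})
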
+# ----------------- Gradio UI Interface ------------------
def create_gradio_interface(assistant: RAGAssistant):
    def upload_learning_files(files):
        if not files:
            return "No files uploaded."
+        file_paths = [f.path for f in files]
        return assistant.load_documents(file_paths, "learning")
+
    def upload_code_files(files):
        if not files:
            return "No files uploaded."
+        file_paths = [f.path for f in files]
        return assistant.load_documents(file_paths, "code")
+
    def learning_chat(message, history):
        if not message.strip():
            return history, ""
        response = assistant.get_learning_tutor_response(message)
        history.append((message, response))
        return history, ""
+
    def code_chat(message, history):
        if not message.strip():
            return history, ""
        response = assistant.get_code_helper_response(message)
        history.append((message, response))
        return history, ""
+
    with gr.Blocks(title="RAG-Based Learning & Code Assistant", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🎓 RAG-Based Learning & Code Assistant")
+        gr.Markdown("Upload documents and get smart, personalized answers.")
+
        with gr.Tabs():
            with gr.TabItem("📚 Learning Tutor"):
+                gr.Markdown("### Upload lecture notes or textbooks below:")
                with gr.Row():
                    with gr.Column(scale=1):
+                        learning_files = gr.File(label="Upload Materials", file_count="multiple", file_types=[".pdf", ".txt", ".md"])
+                        learning_upload_btn = gr.Button("Upload", variant="primary")
                        learning_status = gr.Textbox(label="Upload Status", interactive=False)
                    with gr.Column(scale=2):
+                        learning_chatbot = gr.Chatbot(label="Tutor Chat", height=400)
+                        learning_input = gr.Textbox(label="Ask a question", placeholder="e.g., What is machine learning?")
+                        learning_submit = gr.Button("Ask", variant="primary")
+
                learning_upload_btn.click(upload_learning_files, inputs=[learning_files], outputs=[learning_status])
                learning_submit.click(learning_chat, inputs=[learning_input, learning_chatbot], outputs=[learning_chatbot, learning_input])
                learning_input.submit(learning_chat, inputs=[learning_input, learning_chatbot], outputs=[learning_chatbot, learning_input])
+
            with gr.TabItem("💻 Code Documentation Helper"):
+                gr.Markdown("### Upload code docs or API guides below:")
                with gr.Row():
                    with gr.Column(scale=1):
+                        code_files = gr.File(label="Upload Docs", file_count="multiple", file_types=[".pdf", ".txt", ".md", ".py", ".js", ".json"])
+                        code_upload_btn = gr.Button("Upload", variant="primary")
                        code_status = gr.Textbox(label="Upload Status", interactive=False)
                    with gr.Column(scale=2):
+                        code_chatbot = gr.Chatbot(label="Code Chat", height=400)
+                        code_input = gr.Textbox(label="Ask about the codebase", placeholder="e.g., How does login work?")
+                        code_submit = gr.Button("Ask", variant="primary")
+
                code_upload_btn.click(upload_code_files, inputs=[code_files], outputs=[code_status])
                code_submit.click(code_chat, inputs=[code_input, code_chatbot], outputs=[code_chatbot, code_input])
                code_input.submit(code_chat, inputs=[code_input, code_chatbot], outputs=[code_chatbot, code_input])
+
        gr.Markdown("---")
+        gr.Markdown("Built with ❤️ using LangChain, ChromaDB, and Groq API")
+
    return demo

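One caution on the upload handlers in create_gradio_interface above: what gr.File hands back differs across Gradio versions; uploads can arrive as plain path strings or as tempfile-style objects exposing .name rather than .path, in which case f.path raises AttributeError. A defensive normalizer (a reviewer sketch making no assumption about the Gradio version; _to_paths is a hypothetical helper name):

    def _to_paths(files):
        # Accept plain string paths as well as file-like upload objects.
        paths = []
        for f in files:
            if isinstance(f, str):
                paths.append(f)
            else:
                paths.append(getattr(f, "path", None) or getattr(f, "name", str(f)))
        return paths
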
+# ----------------- Main Function ------------------
def main():
    load_dotenv()
    groq_api_key = os.getenv("GROQ_API_KEY")
    if not groq_api_key:
+        print("Set your GROQ_API_KEY in the .env file or environment.")
        return
+    assistant = RAGAssistant(groq_api_key)
+    demo = create_gradio_interface(assistant)
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, debug=True)

if __name__ == "__main__":
    main()
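To check the updated module headlessly, without the Gradio UI, a minimal smoke test could look like the following (a sketch that assumes the file is saved as app.py, GROQ_API_KEY is set, the LangChain/Chroma dependencies are installed, and a sample.txt sits next to the script; sample.txt is a placeholder name):

    import os
    from app import RAGAssistant

    assistant = RAGAssistant(os.environ["GROQ_API_KEY"])
    print(assistant.load_documents(["sample.txt"], "learning"))
    print(assistant.get_learning_tutor_response("Summarize the uploaded notes."))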