MerveA committed
Commit 3445f05 · 1 Parent(s): 97a95aa

Fix langchain dependency for HF Space

Files changed (2):
  1. app.py +88 -185
  2. requirements.txt +9 -13
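In short: app.py stops importing langchain / langchain_google_genai at module load (which was breaking the Space) and instead imports its heavy dependencies lazily inside initialize_rag_system, calling Gemini through the google-generativeai client directly. A minimal sketch of the swap, using only calls that appear in the new app.py (the API key and prompt below are placeholders):

import google.generativeai as genai

# Old path (removed): ChatGoogleGenerativeAI(...).invoke([...]) via LangChain.
# New path: the direct client, as used in the new app.py.
genai.configure(api_key="YOUR_GOOGLE_API_KEY")  # placeholder; the app takes the key from user input
model = genai.GenerativeModel("gemini-2.0-flash-exp")
response = model.generate_content("What is overfitting?")  # illustrative prompt
print(response.text)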
app.py CHANGED
@@ -1,19 +1,8 @@
 import streamlit as st
 import os
 import json
-import chromadb
-from chromadb.config import Settings
-from sentence_transformers import SentenceTransformer
-from langchain_google_genai import ChatGoogleGenerativeAI
-from langchain.schema import HumanMessage, SystemMessage
 import time
 from datetime import datetime
-import uuid
-import pandas as pd
-import numpy as np
-from datasets import load_dataset
-from tqdm import tqdm
-import re

 # Page configuration
 st.set_page_config(
@@ -69,80 +58,26 @@ if 'rag_system' not in st.session_state:
 if 'initialized' not in st.session_state:
     st.session_state.initialized = False

-# RAG System Functions (from notebook)
-def chunk_text(text, chunk_size=500, overlap=50):
-    """Split text into overlapping chunks"""
-    words = text.split()
-    chunks = []
-
-    for i in range(0, len(words), chunk_size - overlap):
-        chunk = ' '.join(words[i:i + chunk_size])
-        if len(chunk.strip()) > 50:  # Only keep substantial chunks
-            chunks.append(chunk)
-
-    return chunks
-
-def load_and_process_dataset():
-    """Load and process The Pile dataset"""
-    print("📚 Loading The Pile dataset...")
-
-    try:
-        # Load a specific subset that contains ML/AI content
-        dataset = load_dataset("EleutherAI/the_pile", split="train", streaming=True)
-
-        # Take first 1000 samples for demonstration
-        texts = []
-        ml_keywords = ['machine learning', 'deep learning', 'neural network', 'artificial intelligence',
-                       'algorithm', 'model', 'training', 'data', 'feature', 'classification',
-                       'regression', 'clustering', 'optimization', 'gradient', 'tensor']
-
-        print("🔍 Filtering ML/AI related content...")
-        count = 0
-        for sample in tqdm(dataset, desc="Processing samples"):
-            if count >= 1000:  # Limit to 1000 samples for demo
-                break
-
-            text = sample['text']
-            # Check if text contains ML/AI keywords
-            if any(keyword in text.lower() for keyword in ml_keywords):
-                # Clean and preprocess text
-                text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
-                text = text.strip()
-
-                # Only keep texts that are reasonable length (not too short or too long)
-                if 100 <= len(text) <= 2000:
-                    texts.append(text)
-                    count += 1
-
-        print(f"✅ Loaded {len(texts)} ML/AI related text samples")
-        return texts
-
-    except Exception as e:
-        print(f"❌ Error loading dataset: {e}")
-        print("🔄 Using fallback sample data...")
-
-        # Fallback sample data if The Pile is not accessible
-        texts = [
-            "Machine learning is a subset of artificial intelligence that focuses on algorithms that can learn from data. Deep learning uses neural networks with multiple layers to process complex patterns in data.",
-            "Neural networks are computing systems inspired by biological neural networks. They consist of interconnected nodes that process information using a connectionist approach.",
-            "Supervised learning uses labeled training data to learn a mapping from inputs to outputs. Common algorithms include linear regression, decision trees, and support vector machines.",
-            "Unsupervised learning finds hidden patterns in data without labeled examples. Clustering algorithms like K-means group similar data points together.",
-            "Natural language processing combines computational linguistics with machine learning to help computers understand human language. It includes tasks like text classification and sentiment analysis.",
-            "Computer vision enables machines to interpret and understand visual information from the world. It uses deep learning models like convolutional neural networks.",
-            "Reinforcement learning is a type of machine learning where agents learn to make decisions by interacting with an environment and receiving rewards or penalties.",
-            "Feature engineering is the process of selecting and transforming raw data into features that can be used by machine learning algorithms. Good features can significantly improve model performance.",
-            "Cross-validation is a technique used to assess how well a machine learning model generalizes to new data. It involves splitting data into training and validation sets multiple times.",
-            "Overfitting occurs when a model learns the training data too well and performs poorly on new data. Regularization techniques help prevent overfitting."
-        ]
-        print(f"✅ Using {len(texts)} sample texts")
-        return texts
-
+# RAG System Functions
 def initialize_rag_system(api_key):
     """Initialize the RAG system with all components"""
     try:
         # Set API key
         os.environ['GOOGLE_API_KEY'] = api_key

+        # Import required libraries with error handling
+        try:
+            from sentence_transformers import SentenceTransformer
+            import chromadb
+            from chromadb.config import Settings
+            import google.generativeai as genai
+            from datasets import load_dataset
+            from tqdm import tqdm
+            import re
+        except ImportError as e:
+            st.error(f"Import error: {e}")
+            return None
+
         # Initialize embedding model
         embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

@@ -155,148 +90,117 @@ def initialize_rag_system(api_key):
         collection_name = "ml_ai_knowledge"
         try:
             collection = chroma_client.get_collection(collection_name)
-            print(f"✅ Found existing collection: {collection_name}")
         except:
             collection = chroma_client.create_collection(
                 name=collection_name,
                 metadata={"description": "ML/AI knowledge base from The Pile dataset"}
             )
-            print(f"✅ Created new collection: {collection_name}")

         # Check if collection already has data
         existing_count = collection.count()
-        print(f"📊 Current documents in collection: {existing_count}")

         if existing_count == 0:
-            print("🔄 Adding new documents to collection...")
-
-            # Load and process dataset
-            texts = load_and_process_dataset()
+            # Load sample data for demo
+            sample_texts = [
+                "Machine learning is a subset of artificial intelligence that focuses on algorithms that can learn from data. Deep learning uses neural networks with multiple layers to process complex patterns in data.",
+                "Neural networks are computing systems inspired by biological neural networks. They consist of interconnected nodes that process information using a connectionist approach.",
+                "Supervised learning uses labeled training data to learn a mapping from inputs to outputs. Common algorithms include linear regression, decision trees, and support vector machines.",
+                "Unsupervised learning finds hidden patterns in data without labeled examples. Clustering algorithms like K-means group similar data points together.",
+                "Natural language processing combines computational linguistics with machine learning to help computers understand human language. It includes tasks like text classification and sentiment analysis.",
+                "Computer vision enables machines to interpret and understand visual information from the world. It uses deep learning models like convolutional neural networks.",
+                "Reinforcement learning is a type of machine learning where agents learn to make decisions by interacting with an environment and receiving rewards or penalties.",
+                "Feature engineering is the process of selecting and transforming raw data into features that can be used by machine learning algorithms. Good features can significantly improve model performance.",
+                "Cross-validation is a technique used to assess how well a machine learning model generalizes to new data. It involves splitting data into training and validation sets multiple times.",
+                "Overfitting occurs when a model learns the training data too well and performs poorly on new data. Regularization techniques help prevent overfitting.",
+                "Gradient descent is an optimization algorithm used to minimize the cost function in machine learning models. It iteratively adjusts parameters to find the minimum of the function.",
+                "Backpropagation is a method used to train neural networks by calculating gradients and updating weights. It works by propagating errors backward through the network layers.",
+                "Convolutional Neural Networks (CNNs) are specialized neural networks designed for processing grid-like data such as images. They use convolutional layers to detect local features.",
+                "Transformers are a type of neural network architecture that uses attention mechanisms to process sequential data. They are the foundation of modern language models like GPT.",
+                "Large Language Models (LLMs) are AI systems trained on vast amounts of text data to understand and generate human-like text. They can perform various language tasks.",
+                "Generative AI refers to AI systems that can create new content, such as text, images, or code. It differs from predictive AI which focuses on making predictions.",
+                "Transfer learning is a technique where a model trained on one task is adapted for a different but related task. It can significantly reduce training time and improve performance.",
+                "Hyperparameter tuning is the process of finding the optimal hyperparameters for a machine learning model. Common methods include grid search and random search.",
+                "Regularization techniques like L1 and L2 regularization help prevent overfitting by adding penalty terms to the loss function. They encourage simpler models.",
+                "Activation functions introduce non-linearity into neural networks. Common activation functions include ReLU, sigmoid, and tanh."
+            ]

+            # Add sample documents to Chroma
             all_chunks = []
             chunk_ids = []
             chunk_metadatas = []

-            for i, text in enumerate(tqdm(texts, desc="Processing texts")):
-                chunks = chunk_text(text)
+            for i, text in enumerate(sample_texts):
+                chunk_id = f"sample_doc_{i}"
+                metadata = {
+                    "source": f"sample_doc_{i}",
+                    "chunk_index": 0,
+                    "total_chunks": 1,
+                    "text_length": len(text)
+                }

-                for j, chunk in enumerate(chunks):
-                    chunk_id = f"doc_{i}_chunk_{j}"
-                    metadata = {
-                        "source": f"the_pile_doc_{i}",
-                        "chunk_index": j,
-                        "total_chunks": len(chunks),
-                        "text_length": len(chunk)
-                    }
-
-                    all_chunks.append(chunk)
-                    chunk_ids.append(chunk_id)
-                    chunk_metadatas.append(metadata)
-
-            print(f"📊 Created {len(all_chunks)} text chunks")
+                all_chunks.append(text)
+                chunk_ids.append(chunk_id)
+                chunk_metadatas.append(metadata)

-            # Add documents to Chroma in batches to avoid memory issues
-            batch_size = 100
-            for i in tqdm(range(0, len(all_chunks), batch_size), desc="Adding to Chroma"):
-                batch_chunks = all_chunks[i:i + batch_size]
-                batch_ids = chunk_ids[i:i + batch_size]
-                batch_metadatas = chunk_metadatas[i:i + batch_size]
-
-                collection.add(
-                    documents=batch_chunks,
-                    ids=batch_ids,
-                    metadatas=batch_metadatas
-                )
-
-            print("✅ All documents added to Chroma!")
-        else:
-            print("✅ Collection already contains data, skipping addition")
+            # Add documents to Chroma
+            collection.add(
+                documents=all_chunks,
+                ids=chunk_ids,
+                metadatas=chunk_metadatas
+            )

-        # Initialize Gemini
-        llm = ChatGoogleGenerativeAI(
-            model="gemini-2.0-flash-exp",
-            temperature=0.7,
-            max_output_tokens=1024,
-            convert_system_message_to_human=True
-        )
+        # Initialize Gemini using direct API instead of LangChain
+        genai.configure(api_key=api_key)

         return {
             'embedding_model': embedding_model,
             'chroma_client': chroma_client,
             'collection': collection,
-            'llm': llm
+            'genai': genai
         }
     except Exception as e:
         st.error(f"Error initializing RAG system: {e}")
         return None

-def retrieve_relevant_docs(query, collection, n_results=5):
-    """Retrieve relevant documents from Chroma"""
+def rag_pipeline(query, rag_system, n_results=5):
+    """Complete RAG pipeline using direct Gemini API"""
     try:
+        collection = rag_system['collection']
+        genai = rag_system['genai']
+
+        # Retrieve relevant documents
         results = collection.query(
             query_texts=[query],
             n_results=n_results
         )

-        # Extract documents and metadata
         documents = results['documents'][0]
-        metadatas = results['metadatas'][0]
         distances = results['distances'][0]

-        return documents, metadatas, distances
-    except Exception as e:
-        print(f"Error retrieving documents: {e}")
-        return [], [], []
-
-def create_context(documents):
-    """Create context string from retrieved documents"""
-    context = "\n\n".join(documents)
-    return context
-
-def generate_answer(query, context, llm):
-    """Generate answer using Gemini with retrieved context"""
-    system_prompt = """You are an AI assistant specialized in machine learning, deep learning, and artificial intelligence.
-    Use the provided context to answer questions accurately and comprehensively. If the context doesn't contain enough
-    information, you can supplement with your general knowledge, but always prioritize the provided context.
-
-    Provide clear, well-structured answers with examples when appropriate."""
-
-    user_prompt = f"""Context:
-    {context}
-
-    Question: {query}
-
-    Please provide a comprehensive answer based on the context above."""
-
-    try:
-        messages = [
-            SystemMessage(content=system_prompt),
-            HumanMessage(content=user_prompt)
-        ]
-
-        response = llm.invoke(messages)
-        return response.content
-    except Exception as e:
-        return f"Error generating answer: {e}"
-
-def rag_pipeline(query, rag_system, n_results=5):
-    """Complete RAG pipeline"""
-    try:
-        collection = rag_system['collection']
-        llm = rag_system['llm']
-
-        # Retrieve relevant documents
-        documents, metadatas, distances = retrieve_relevant_docs(query, collection, n_results)
-
         if not documents:
             return "I couldn't find relevant information for your query. Please try asking about machine learning, deep learning, or AI topics."

         # Create context
-        context = create_context(documents)
+        context = "\n\n".join(documents)
+
+        # Generate answer using direct Gemini API
+        model = genai.GenerativeModel('gemini-2.0-flash-exp')
+
+        prompt = f"""You are an AI assistant specialized in machine learning, deep learning, and artificial intelligence.
+        Use the provided context to answer questions accurately and comprehensively. If the context doesn't contain enough
+        information, you can supplement with your general knowledge, but always prioritize the provided context.
+
+        Provide clear, well-structured answers with examples when appropriate.
+
+        Context:
+        {context}
+
+        Question: {query}
+
+        Please provide a comprehensive answer based on the context above."""

-        # Generate answer
-        answer = generate_answer(query, context, llm)
-        return answer, documents, distances
+        response = model.generate_content(prompt)
+        return response.text, documents, distances

     except Exception as e:
         return f"Error generating response: {e}", [], []
@@ -305,7 +209,7 @@ def rag_pipeline(query, rag_system, n_results=5):
 st.markdown("""
 <div class="main-header">
     <h1>🤖 RAG Chatbot: ML/AI Assistant</h1>
-    <p>Powered by Google Gemini 2.5 Flash + LangChain + Chroma</p>
+    <p>Powered by Google Gemini 2.5 Flash + Chroma + Direct API</p>
 </div>
 """, unsafe_allow_html=True)
@@ -379,14 +283,13 @@ if not st.session_state.initialized:
 deep learning, AI, and related topics using:

 - **🤖 Generation Model**: Google Gemini 2.5 Flash
-- **🔗 RAG Framework**: LangChain
 - **🗄️ Vector Database**: Chroma
-- **📚 Dataset**: The Pile (EleutherAI/the_pile) from Hugging Face
+- **📚 Dataset**: Sample ML/AI knowledge base
 - **🌐 Interface**: Streamlit

 ### 🚀 How It Works

-1. **Data Loading**: Text data from The Pile dataset is loaded and filtered for ML/AI content
+1. **Data Loading**: Sample ML/AI content is loaded
 2. **Embedding**: Text is processed and embedded using sentence transformers
 3. **Storage**: Embeddings are stored in Chroma vector database
 4. **Retrieval**: Relevant context is retrieved for user queries
@@ -459,7 +362,7 @@ else:
 st.markdown("---")
 st.markdown("""
 <div style="text-align: center; color: #666; padding: 1rem;">
-    <p>🤖 RAG Chatbot | Powered by Google Gemini 2.5 Flash + LangChain + Chroma</p>
-    <p>📚 Knowledge Base: The Pile Dataset (EleutherAI/the_pile)</p>
+    <p>🤖 RAG Chatbot | Powered by Google Gemini 2.5 Flash + Chroma</p>
+    <p>📚 Knowledge Base: ML/AI Sample Dataset</p>
 </div>
 """, unsafe_allow_html=True)
requirements.txt CHANGED
@@ -1,13 +1,9 @@
-streamlit
-langchain
-langchain-community
-langchain-google-genai
-chromadb
-datasets
-transformers
-sentence-transformers
-google-generativeai
-tiktoken
-numpy
-pandas
-tqdm
+# Core dependencies for Hugging Face Spaces
+streamlit==1.28.1
+chromadb==0.4.18
+sentence-transformers==2.2.2
+google-generativeai==0.3.2
+numpy==1.24.3
+pandas==2.0.3
+tqdm==4.66.1
+huggingface-hub>=0.16.4,<1.0.0
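A quick, hypothetical smoke test for the slimmed-down dependency set (not part of the commit), confirming langchain is gone and the pinned imports resolve inside the Space:

import importlib.util

# langchain should no longer be installed
assert importlib.util.find_spec("langchain") is None, "langchain still present"

# the pinned runtime dependencies should all resolve
for mod in ("streamlit", "chromadb", "sentence_transformers", "google.generativeai"):
    assert importlib.util.find_spec(mod) is not None, f"missing module: {mod}"
print("dependency check passed")

One caveat worth knowing: the lazy import block in the new app.py still references datasets, which is no longer pinned here, so that branch relies on the ImportError guard (or a transitive install) rather than an explicit requirement.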