fguryel commited on
Commit
8fc10eb
·
1 Parent(s): b943512
Files changed (4) hide show
  1. .gitattributes +0 -4
  2. README.md +0 -19
  3. app.py +414 -232
  4. app_hf.py +0 -309
.gitattributes CHANGED
@@ -33,8 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
-
37
- # Large database and data files
38
- *.sqlite3 filter=lfs diff=lfs merge=lfs -text
39
- *.json filter=lfs diff=lfs merge=lfs -text
40
  chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
36
  chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -10,29 +10,10 @@ pinned: false
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
13
- ---
14
- title: Scikit-learn Documentation Q&A Bot
15
- emoji: 🤖
16
- colorFrom: blue
17
- colorTo: green
18
- sdk: streamlit
19
- sdk_version: 1.50.0
20
- app_file: app.py
21
- pinned: false
22
- license: mit
23
- ---
24
-
25
  # Scikit-learn Documentation Q&A Bot 🤖
26
 
27
  A Retrieval-Augmented Generation (RAG) chatbot that answers questions about Scikit-learn using the official documentation.
28
 
29
- ## How to Use on Hugging Face Spaces
30
-
31
- 1. **Enter OpenAI API Key**: In the sidebar, enter your OpenAI API key
32
- 2. **Ask Questions**: Type any question about Scikit-learn functionality
33
- 3. **Get Answers**: Receive detailed responses with source documentation links
34
- 4. **Explore**: Use the example questions or browse chat history
35
-
36
  ## Features
37
 
38
  - **🔍 Smart Retrieval**: Searches through 1,249+ documentation chunks using semantic similarity
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
13
  # Scikit-learn Documentation Q&A Bot 🤖
14
 
15
  A Retrieval-Augmented Generation (RAG) chatbot that answers questions about Scikit-learn using the official documentation.
16
 
 
 
 
 
 
 
 
17
  ## Features
18
 
19
  - **🔍 Smart Retrieval**: Searches through 1,249+ documentation chunks using semantic similarity
app.py CHANGED
@@ -1,309 +1,491 @@
 
1
  """
2
- Scikit-learn RAG Chatbot - Hugging Face Spaces Optimized Version
3
- A Retrieval-Augmented Generation chatbot for Scikit-learn documentation.
 
 
 
 
 
 
4
  """
5
 
6
- import streamlit as st
7
  import os
8
- import json
9
  import logging
10
- from typing import List, Dict, Optional, Tuple
11
- import warnings
 
 
 
 
12
 
13
- # Suppress warnings for cleaner output
14
- warnings.filterwarnings("ignore")
15
- logging.getLogger("chromadb").setLevel(logging.ERROR)
16
- logging.getLogger("sentence_transformers").setLevel(logging.ERROR)
17
 
18
- # Try imports with error handling
19
- try:
20
- import chromadb
21
- from sentence_transformers import SentenceTransformer
22
- import openai
23
- DEPENDENCIES_AVAILABLE = True
24
- except ImportError as e:
25
- DEPENDENCIES_AVAILABLE = False
26
- st.error(f"Missing dependencies: {e}")
27
 
28
- class SimpleRAGChatbot:
29
- """Simplified RAG chatbot for HF Spaces deployment"""
 
 
 
 
 
 
30
 
31
- def __init__(self):
32
- self.client = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  self.collection = None
34
- self.model = None
35
  self.openai_client = None
36
- self.initialized = False
37
 
38
- def initialize(self):
39
- """Initialize the RAG system with error handling"""
 
 
 
 
 
40
  try:
41
- if not DEPENDENCIES_AVAILABLE:
42
- return False
43
-
44
- # Initialize embedding model
45
- self.model = SentenceTransformer('all-MiniLM-L6-v2')
46
 
47
- # Try to load existing database
48
- if self._load_existing_database():
49
- self.initialized = True
50
- return True
51
-
52
- # If no database exists, try to rebuild
53
- if self._rebuild_from_chunks():
54
- self.initialized = True
55
- return True
56
-
57
- return False
58
 
59
- except Exception as e:
60
- st.error(f"Initialization error: {str(e)}")
61
- return False
62
-
63
- def _load_existing_database(self) -> bool:
64
- """Try to load existing ChromaDB"""
65
- try:
66
- # Check multiple possible paths
67
- db_paths = ['./chroma_db', './chroma', '.']
68
 
69
- for db_path in db_paths:
70
- try:
71
- if os.path.exists(os.path.join(db_path, 'chroma.sqlite3')) or os.path.exists(db_path):
72
- self.client = chromadb.PersistentClient(path=db_path)
73
- collections = self.client.list_collections()
74
-
75
- if collections:
76
- self.collection = collections[0] # Use first available collection
77
- st.success(f"✅ Loaded database from {db_path} with {self.collection.count()} documents")
78
- return True
79
-
80
- except Exception:
81
- continue
82
-
83
- return False
84
 
85
  except Exception as e:
86
- st.warning(f"Could not load existing database: {str(e)}")
87
- return False
88
 
89
- def _rebuild_from_chunks(self) -> bool:
90
- """Rebuild database from chunks.json if available"""
91
- try:
92
- chunks_file = 'chunks.json'
93
- if not os.path.exists(chunks_file):
94
- st.error("❌ No chunks.json file found. Please upload the required data files.")
95
- return False
96
-
97
- with open(chunks_file, 'r') as f:
98
- chunks = json.load(f)
99
-
100
- if not chunks:
101
- st.error("❌ Chunks file is empty")
102
- return False
103
-
104
- # Create new database
105
- db_path = './chroma_db'
106
- os.makedirs(db_path, exist_ok=True)
107
-
108
- self.client = chromadb.PersistentClient(path=db_path)
109
-
110
- # Create collection
111
- collection_name = "sklearn_docs"
112
- try:
113
- self.collection = self.client.get_collection(collection_name)
114
- except:
115
- self.collection = self.client.create_collection(collection_name)
116
-
117
- # Add chunks in batches
118
- batch_size = 100
119
- total_chunks = len(chunks)
120
-
121
- progress_bar = st.progress(0)
122
- status_text = st.empty()
123
 
124
- for i in range(0, total_chunks, batch_size):
125
- batch = chunks[i:i + batch_size]
126
-
127
- documents = [chunk['content'] for chunk in batch]
128
- metadatas = [{'source': chunk.get('source', 'unknown')} for chunk in batch]
129
- ids = [f"chunk_{i + j}" for j in range(len(batch))]
130
-
131
- self.collection.add(
132
- documents=documents,
133
- metadatas=metadatas,
134
- ids=ids
135
- )
136
-
137
- progress = (i + len(batch)) / total_chunks
138
- progress_bar.progress(progress)
139
- status_text.text(f"Processing chunks: {i + len(batch)}/{total_chunks}")
140
 
141
- progress_bar.empty()
142
- status_text.empty()
143
 
144
- st.success(f" Successfully rebuilt database with {total_chunks} chunks")
145
  return True
146
 
147
  except Exception as e:
148
- st.error(f"Failed to rebuild database: {str(e)}")
 
149
  return False
150
 
151
- def search_documents(self, query: str, n_results: int = 5) -> List[Dict]:
152
- """Search for relevant documents"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  try:
154
- if not self.initialized or not self.collection:
155
- return []
156
-
157
  results = self.collection.query(
158
  query_texts=[query],
159
  n_results=n_results
160
  )
161
 
162
- documents = []
 
 
163
  if results['documents'] and results['documents'][0]:
164
- for i, doc in enumerate(results['documents'][0]):
165
- documents.append({
166
- 'content': doc,
167
- 'source': results['metadatas'][0][i].get('source', 'unknown') if results['metadatas'] else 'unknown'
168
- })
 
 
 
 
 
169
 
170
- return documents
 
171
 
172
  except Exception as e:
173
- st.error(f"Search error: {str(e)}")
 
174
  return []
175
 
176
- def generate_response(self, query: str, context_docs: List[Dict]) -> str:
177
- """Generate response using OpenAI"""
178
- try:
179
- # Check for OpenAI API key
180
- api_key = st.session_state.get('openai_api_key') or os.getenv('OPENAI_API_KEY')
181
-
182
- if not api_key:
183
- return "⚠️ Please provide your OpenAI API key to generate responses."
184
-
185
- if not self.openai_client:
186
- self.openai_client = openai.OpenAI(api_key=api_key)
187
-
188
- # Prepare context
189
- context = "\n\n".join([f"Source: {doc['source']}\nContent: {doc['content']}"
190
- for doc in context_docs])
191
 
192
- if not context.strip():
193
- return "I couldn't find relevant information in the documentation. Please try rephrasing your question."
 
 
 
 
 
 
 
194
 
195
- # Create prompt
196
- prompt = f"""Based on the following Scikit-learn documentation, please answer the user's question accurately and helpfully.
 
 
 
 
 
197
 
198
- Documentation Context:
199
  {context}
200
 
201
- User Question: {query}
 
 
 
 
 
 
 
 
 
202
 
203
- Please provide a clear, accurate answer based on the documentation provided. If the documentation doesn't contain enough information to answer the question completely, please say so."""
204
 
205
- # Generate response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  response = self.openai_client.chat.completions.create(
207
- model="gpt-3.5-turbo",
208
  messages=[
209
- {"role": "system", "content": "You are a helpful assistant that answers questions about Scikit-learn based on provided documentation."},
210
- {"role": "user", "content": prompt}
 
 
 
 
 
 
211
  ],
212
- max_tokens=1000,
213
- temperature=0.3
 
214
  )
215
 
216
- return response.choices[0].message.content
 
 
217
 
218
  except Exception as e:
219
- return f"Error generating response: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
  def main():
222
- """Main Streamlit application"""
 
 
223
  st.set_page_config(
224
- page_title="Scikit-learn RAG Chatbot",
225
  page_icon="🤖",
226
- layout="wide"
 
227
  )
228
 
229
- st.title("🤖 Scikit-learn RAG Chatbot")
230
- st.markdown("Ask questions about Scikit-learn and get answers from the official documentation!")
231
-
232
  # Initialize session state
233
- if 'chatbot' not in st.session_state:
234
- st.session_state.chatbot = SimpleRAGChatbot()
235
- st.session_state.messages = []
236
- st.session_state.initialized = False
237
 
238
- # Initialize the chatbot if not already done
239
- if not st.session_state.initialized:
240
- with st.spinner("Initializing RAG system..."):
241
- success = st.session_state.chatbot.initialize()
242
- st.session_state.initialized = success
243
-
244
- if not success:
245
- st.error("❌ Failed to initialize the system. Please check the data files.")
246
- st.stop()
 
247
 
248
- # Sidebar for API key
249
  with st.sidebar:
250
- st.header("Configuration")
251
 
 
252
  api_key = st.text_input(
253
- "OpenAI API Key",
254
  type="password",
255
- value=st.session_state.get('openai_api_key', ''),
256
- help="Enter your OpenAI API key to enable response generation"
257
  )
258
 
259
- if api_key:
260
- st.session_state.openai_api_key = api_key
261
- st.success("✅ API key configured")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
 
263
  st.markdown("---")
264
- st.markdown("### About")
265
- st.markdown("""
266
- This chatbot uses RAG (Retrieval-Augmented Generation) to answer questions about Scikit-learn.
267
 
268
- - **Data**: Official Scikit-learn documentation
269
- - **Embeddings**: all-MiniLM-L6-v2
270
- - **Vector DB**: ChromaDB
271
- - **LLM**: GPT-3.5-turbo
272
- """)
273
-
274
- # Chat interface
275
- st.header("💬 Chat")
 
 
 
 
 
 
 
 
276
 
277
- # Display chat messages
278
- for message in st.session_state.messages:
279
- with st.chat_message(message["role"]):
280
- st.markdown(message["content"])
281
 
282
- # Chat input
283
- if prompt := st.chat_input("Ask a question about Scikit-learn..."):
284
- # Add user message
285
- st.session_state.messages.append({"role": "user", "content": prompt})
286
- with st.chat_message("user"):
287
- st.markdown(prompt)
288
-
289
- # Generate response
290
- with st.chat_message("assistant"):
291
- with st.spinner("Searching documentation and generating response..."):
292
- # Search for relevant documents
293
- docs = st.session_state.chatbot.search_documents(prompt)
294
-
295
- if docs:
296
- st.markdown("**Found relevant documentation:**")
297
- for i, doc in enumerate(docs[:3], 1):
298
- with st.expander(f"📄 Source {i}: {doc['source']}", expanded=False):
299
- st.markdown(doc['content'][:500] + "..." if len(doc['content']) > 500 else doc['content'])
300
-
301
- # Generate response
302
- response = st.session_state.chatbot.generate_response(prompt, docs)
303
- st.markdown(response)
 
 
 
 
 
 
304
 
305
- # Add assistant message
306
- st.session_state.messages.append({"role": "assistant", "content": response})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
 
308
  if __name__ == "__main__":
309
  main()
 
1
+ #!/usr/bin/env python3
2
  """
3
+ Scikit-learn Documentation Q&A Bot
4
+
5
+ A Retrieval-Augmented Generation (RAG) chatbot built with Streamlit
6
+ that answers questions about Scikit-learn documentation using ChromaDB
7
+ for retrieval and OpenAI for generation.
8
+
9
+ Author: AI Assistant
10
+ Date: September 2025
11
  """
12
 
 
13
  import os
 
14
  import logging
15
+ from typing import List, Dict, Any, Optional, Tuple
16
+ import streamlit as st
17
+ import chromadb
18
+ from chromadb.config import Settings
19
+ from sentence_transformers import SentenceTransformer
20
+ from openai import OpenAI
21
 
 
 
 
 
22
 
23
+ # Configure logging
24
+ logging.basicConfig(level=logging.INFO)
25
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
26
 
27
+
28
+ class RAGChatbot:
29
+ """
30
+ A Retrieval-Augmented Generation chatbot for Scikit-learn documentation.
31
+
32
+ This class handles the complete RAG pipeline: retrieval from ChromaDB,
33
+ augmentation with context, and generation using OpenAI's API.
34
+ """
35
 
36
+ def __init__(
37
+ self,
38
+ db_path: str = './chroma_db',
39
+ collection_name: str = 'sklearn_docs',
40
+ embedding_model_name: str = 'all-MiniLM-L6-v2'
41
+ ):
42
+ """
43
+ Initialize the RAG chatbot.
44
+
45
+ Args:
46
+ db_path (str): Path to ChromaDB database
47
+ collection_name (str): Name of the ChromaDB collection
48
+ embedding_model_name (str): Name of the embedding model
49
+ """
50
+ self.db_path = db_path
51
+ self.collection_name = collection_name
52
+ self.embedding_model_name = embedding_model_name
53
+
54
+ # Initialize components
55
+ self.chroma_client = None
56
  self.collection = None
57
+ self.embedding_model = None
58
  self.openai_client = None
 
59
 
60
+ # Initialize the retrieval system
61
+ self._initialize_retrieval_system()
62
+
63
+ def _initialize_retrieval_system(self) -> None:
64
+ """
65
+ Initialize ChromaDB client and embedding model for retrieval.
66
+ """
67
  try:
68
+ # Initialize ChromaDB client
69
+ self.chroma_client = chromadb.PersistentClient(
70
+ path=self.db_path,
71
+ settings=Settings(anonymized_telemetry=False)
72
+ )
73
 
74
+ # Get collection
75
+ self.collection = self.chroma_client.get_collection(
76
+ name=self.collection_name
77
+ )
 
 
 
 
 
 
 
78
 
79
+ # Load embedding model (same as used for building the database)
80
+ self.embedding_model = SentenceTransformer(self.embedding_model_name)
 
 
 
 
 
 
 
81
 
82
+ logger.info("RAG retrieval system initialized successfully")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  except Exception as e:
85
+ logger.error(f"Failed to initialize retrieval system: {e}")
86
+ raise
87
 
88
+ def set_openai_client(self, api_key: str) -> bool:
89
+ """
90
+ Initialize OpenAI client with API key.
91
+
92
+ Args:
93
+ api_key (str): OpenAI API key
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
+ Returns:
96
+ bool: True if successful, False otherwise
97
+ """
98
+ try:
99
+ self.openai_client = OpenAI(api_key=api_key)
 
 
 
 
 
 
 
 
 
 
 
100
 
101
+ # Test the API key with a simple request
102
+ self.openai_client.models.list()
103
 
104
+ logger.info("OpenAI client initialized successfully")
105
  return True
106
 
107
  except Exception as e:
108
+ logger.error(f"Failed to initialize OpenAI client: {e}")
109
+ st.error(f"Invalid API key or OpenAI connection error: {e}")
110
  return False
111
 
112
+ def retrieve_relevant_chunks(
113
+ self,
114
+ query: str,
115
+ n_results: int = 3,
116
+ min_relevance_score: float = 0.1
117
+ ) -> List[Dict[str, Any]]:
118
+ """
119
+ Retrieve relevant text chunks from the vector database.
120
+
121
+ Args:
122
+ query (str): User question/query
123
+ n_results (int): Number of chunks to retrieve
124
+ min_relevance_score (float): Minimum relevance score threshold
125
+
126
+ Returns:
127
+ List[Dict[str, Any]]: Retrieved chunks with content and metadata
128
+ """
129
  try:
130
+ # Query the collection
 
 
131
  results = self.collection.query(
132
  query_texts=[query],
133
  n_results=n_results
134
  )
135
 
136
+ retrieved_chunks = []
137
+
138
+ # Process results
139
  if results['documents'] and results['documents'][0]:
140
+ for i in range(len(results['documents'][0])):
141
+ chunk_data = {
142
+ 'content': results['documents'][0][i],
143
+ 'metadata': results['metadatas'][0][i],
144
+ 'distance': results['distances'][0][i] if 'distances' in results else None
145
+ }
146
+
147
+ # Filter by relevance score if available
148
+ if chunk_data['distance'] is None or chunk_data['distance'] >= min_relevance_score:
149
+ retrieved_chunks.append(chunk_data)
150
 
151
+ logger.info(f"Retrieved {len(retrieved_chunks)} relevant chunks for query: {query[:50]}...")
152
+ return retrieved_chunks
153
 
154
  except Exception as e:
155
+ logger.error(f"Error retrieving chunks: {e}")
156
+ st.error(f"Error during retrieval: {e}")
157
  return []
158
 
159
+ def create_rag_prompt(
160
+ self,
161
+ user_question: str,
162
+ retrieved_chunks: List[Dict[str, Any]]
163
+ ) -> str:
164
+ """
165
+ Create an augmented prompt for OpenAI with retrieved context.
166
+
167
+ Args:
168
+ user_question (str): Original user question
169
+ retrieved_chunks (List[Dict[str, Any]]): Retrieved relevant chunks
 
 
 
 
170
 
171
+ Returns:
172
+ str: Augmented prompt for OpenAI
173
+ """
174
+ # Build context from retrieved chunks
175
+ context_parts = []
176
+
177
+ for i, chunk in enumerate(retrieved_chunks, 1):
178
+ url = chunk['metadata'].get('url', 'Unknown source')
179
+ content = chunk['content'].strip()
180
 
181
+ context_part = f"--- Context {i} (Source: {url}) ---\n{content}\n"
182
+ context_parts.append(context_part)
183
+
184
+ context = "\n".join(context_parts)
185
+
186
+ # Create the RAG prompt
187
+ rag_prompt = f"""You are an expert AI assistant specializing in Scikit-learn, a popular Python machine learning library. Your task is to answer questions about Scikit-learn based ONLY on the provided context from the official documentation.
188
 
189
+ CONTEXT:
190
  {context}
191
 
192
+ USER QUESTION:
193
+ {user_question}
194
+
195
+ INSTRUCTIONS:
196
+ 1. Answer the question using ONLY the information provided in the context above
197
+ 2. Be accurate, helpful, and specific
198
+ 3. If the context doesn't contain enough information to fully answer the question, say so clearly
199
+ 4. Include relevant code examples if they appear in the context
200
+ 5. Mention specific function names, class names, or parameter names when relevant
201
+ 6. Structure your answer clearly with appropriate formatting
202
 
203
+ ANSWER:"""
204
 
205
+ return rag_prompt
206
+
207
+ def generate_answer(
208
+ self,
209
+ prompt: str,
210
+ model: str = "gpt-3.5-turbo",
211
+ max_tokens: int = 1000,
212
+ temperature: float = 0.1
213
+ ) -> Optional[str]:
214
+ """
215
+ Generate answer using OpenAI API.
216
+
217
+ Args:
218
+ prompt (str): Augmented prompt with context
219
+ model (str): OpenAI model to use
220
+ max_tokens (int): Maximum tokens in response
221
+ temperature (float): Temperature for generation
222
+
223
+ Returns:
224
+ Optional[str]: Generated answer or None if failed
225
+ """
226
+ try:
227
  response = self.openai_client.chat.completions.create(
228
+ model=model,
229
  messages=[
230
+ {
231
+ "role": "system",
232
+ "content": "You are a helpful AI assistant specializing in Scikit-learn documentation. Provide accurate, helpful answers based only on the provided context."
233
+ },
234
+ {
235
+ "role": "user",
236
+ "content": prompt
237
+ }
238
  ],
239
+ max_tokens=max_tokens,
240
+ temperature=temperature,
241
+ top_p=0.9
242
  )
243
 
244
+ answer = response.choices[0].message.content.strip()
245
+ logger.info(f"Generated answer of length: {len(answer)}")
246
+ return answer
247
 
248
  except Exception as e:
249
+ logger.error(f"Error generating answer: {e}")
250
+ st.error(f"Error generating answer: {e}")
251
+ return None
252
+
253
+ def get_answer(
254
+ self,
255
+ user_question: str,
256
+ n_chunks: int = 3,
257
+ model: str = "gpt-3.5-turbo"
258
+ ) -> Tuple[Optional[str], List[str]]:
259
+ """
260
+ Complete RAG pipeline: retrieve, augment, generate.
261
+
262
+ Args:
263
+ user_question (str): User's question
264
+ n_chunks (int): Number of chunks to retrieve
265
+ model (str): OpenAI model to use
266
+
267
+ Returns:
268
+ Tuple[Optional[str], List[str]]: Generated answer and source URLs
269
+ """
270
+ if not self.openai_client:
271
+ st.error("OpenAI client not initialized. Please provide a valid API key.")
272
+ return None, []
273
+
274
+ # Step 1: Retrieve relevant chunks
275
+ with st.spinner("🔍 Searching relevant documentation..."):
276
+ retrieved_chunks = self.retrieve_relevant_chunks(user_question, n_chunks)
277
+
278
+ if not retrieved_chunks:
279
+ return "I couldn't find relevant information in the Scikit-learn documentation to answer your question. Please try rephrasing your question or ask about a different topic.", []
280
+
281
+ # Step 2: Create augmented prompt
282
+ with st.spinner("📝 Preparing context..."):
283
+ rag_prompt = self.create_rag_prompt(user_question, retrieved_chunks)
284
+
285
+ # Step 3: Generate answer
286
+ with st.spinner("🤖 Generating answer..."):
287
+ answer = self.generate_answer(rag_prompt, model)
288
+
289
+ # Extract source URLs
290
+ source_urls = [chunk['metadata'].get('url', 'Unknown') for chunk in retrieved_chunks]
291
+ source_urls = list(dict.fromkeys(source_urls)) # Remove duplicates while preserving order
292
+
293
+ return answer, source_urls
294
+
295
+
296
+ def initialize_session_state():
297
+ """Initialize Streamlit session state variables."""
298
+ if 'chatbot' not in st.session_state:
299
+ try:
300
+ st.session_state.chatbot = RAGChatbot()
301
+ except Exception as e:
302
+ st.error(f"Failed to initialize chatbot: {e}")
303
+ st.stop()
304
+
305
+ if 'openai_initialized' not in st.session_state:
306
+ st.session_state.openai_initialized = False
307
+
308
+ if 'chat_history' not in st.session_state:
309
+ st.session_state.chat_history = []
310
+
311
 
312
  def main():
313
+ """Main Streamlit application."""
314
+
315
+ # Page configuration
316
  st.set_page_config(
317
+ page_title="Scikit-learn Q&A Bot",
318
  page_icon="🤖",
319
+ layout="wide",
320
+ initial_sidebar_state="expanded"
321
  )
322
 
 
 
 
323
  # Initialize session state
324
+ initialize_session_state()
 
 
 
325
 
326
+ # Main title and description
327
+ st.title("🤖 Scikit-learn Documentation Q&A Bot")
328
+ st.markdown("""
329
+ Welcome to the **Scikit-learn Documentation Q&A Bot**! This intelligent assistant can answer your questions about Scikit-learn using the official documentation.
330
+
331
+ **How it works:**
332
+ 1. 🔍 **Retrieval**: Searches through 1,249+ documentation chunks
333
+ 2. 📝 **Augmentation**: Provides relevant context to the AI
334
+ 3. 🤖 **Generation**: Uses OpenAI to generate accurate answers
335
+ """)
336
 
337
+ # Sidebar for API key and settings
338
  with st.sidebar:
339
+ st.header("⚙️ Configuration")
340
 
341
+ # OpenAI API Key input
342
  api_key = st.text_input(
343
+ "🔑 OpenAI API Key",
344
  type="password",
345
+ placeholder="sk-...",
346
+ help="Enter your OpenAI API key to enable the chatbot"
347
  )
348
 
349
+ if api_key and not st.session_state.openai_initialized:
350
+ if st.session_state.chatbot.set_openai_client(api_key):
351
+ st.session_state.openai_initialized = True
352
+ st.success("✅ API key validated!")
353
+ st.rerun()
354
+
355
+ # Model selection
356
+ model = st.selectbox(
357
+ "🧠 AI Model",
358
+ ["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo-preview"],
359
+ index=0,
360
+ help="Choose the OpenAI model for generating answers"
361
+ )
362
+
363
+ # Number of context chunks
364
+ n_chunks = st.slider(
365
+ "📄 Context Chunks",
366
+ min_value=1,
367
+ max_value=5,
368
+ value=3,
369
+ help="Number of relevant documentation chunks to use for context"
370
+ )
371
 
372
  st.markdown("---")
 
 
 
373
 
374
+ # Database info
375
+ st.header("📊 Database Info")
376
+ try:
377
+ collection_count = st.session_state.chatbot.collection.count()
378
+ st.metric("Total Documents", f"{collection_count:,}")
379
+ st.metric("Embedding Model", "all-MiniLM-L6-v2")
380
+ st.metric("Vector Dimensions", "384")
381
+ except:
382
+ st.error("Could not load database info")
383
+
384
+ st.markdown("---")
385
+
386
+ # Clear chat history
387
+ if st.button("🗑️ Clear Chat History"):
388
+ st.session_state.chat_history = []
389
+ st.rerun()
390
 
391
+ # Main chat interface
392
+ col1, col2 = st.columns([2, 1])
 
 
393
 
394
+ with col1:
395
+ st.header("💬 Ask Your Question")
396
+
397
+ # Question input
398
+ default_question = st.session_state.get('selected_question', '')
399
+ user_question = st.text_input(
400
+ "Enter your question about Scikit-learn:",
401
+ value=default_question,
402
+ placeholder="e.g., How do I perform cross-validation in scikit-learn?",
403
+ key="question_input"
404
+ )
405
+
406
+ # Clear selected question after using it
407
+ if 'selected_question' in st.session_state:
408
+ del st.session_state['selected_question']
409
+
410
+ # Submit button
411
+ submit_button = st.button("🚀 Get Answer", type="primary")
412
+
413
+ # Process question
414
+ if submit_button and user_question:
415
+ if not st.session_state.openai_initialized:
416
+ st.error("⚠️ Please enter a valid OpenAI API key in the sidebar first.")
417
+ else:
418
+ # Get answer using RAG
419
+ answer, sources = st.session_state.chatbot.get_answer(
420
+ user_question, n_chunks, model
421
+ )
422
 
423
+ if answer:
424
+ # Add to chat history
425
+ st.session_state.chat_history.append({
426
+ 'question': user_question,
427
+ 'answer': answer,
428
+ 'sources': sources
429
+ })
430
+
431
+ # Clear input
432
+ st.rerun()
433
+
434
+ # Display chat history
435
+ if st.session_state.chat_history:
436
+ st.header("📝 Chat History")
437
+
438
+ for i, chat in enumerate(reversed(st.session_state.chat_history)):
439
+ with st.expander(f"Q: {chat['question'][:60]}...", expanded=(i == 0)):
440
+ st.markdown(f"**Question:** {chat['question']}")
441
+ st.markdown(f"**Answer:**\n{chat['answer']}")
442
+
443
+ if chat['sources']:
444
+ st.markdown("**Sources:**")
445
+ for j, source in enumerate(chat['sources'], 1):
446
+ source_name = source.split('/')[-1] if '/' in source else source
447
+ st.markdown(f"{j}. [{source_name}]({source})")
448
+
449
+ with col2:
450
+ st.header("💡 Example Questions")
451
+
452
+ example_questions = [
453
+ "How do I perform cross-validation in scikit-learn?",
454
+ "What is the difference between Ridge and Lasso regression?",
455
+ "How do I use GridSearchCV for parameter tuning?",
456
+ "What clustering algorithms are available in scikit-learn?",
457
+ "How do I preprocess data using StandardScaler?",
458
+ "What is the difference between classification and regression?",
459
+ "How do I handle missing values in my dataset?",
460
+ "What is feature selection and how do I use it?",
461
+ "How do I visualize decision trees?",
462
+ "What is ensemble learning in scikit-learn?"
463
+ ]
464
+
465
+ for question in example_questions:
466
+ if st.button(question, key=f"example_{hash(question)}"):
467
+ # Use a different approach to set the question
468
+ st.session_state['selected_question'] = question
469
+ st.rerun()
470
+
471
+ st.markdown("---")
472
+
473
+ st.header("ℹ️ Tips")
474
+ st.markdown("""
475
+ **For best results:**
476
+ - Be specific in your questions
477
+ - Ask about scikit-learn functionality
478
+ - Include context when possible
479
+ - Check the sources for verification
480
+
481
+ **The bot can help with:**
482
+ - API usage and parameters
483
+ - Algorithm explanations
484
+ - Code examples
485
+ - Best practices
486
+ - Troubleshooting
487
+ """)
488
+
489
 
490
  if __name__ == "__main__":
491
  main()
app_hf.py CHANGED
@@ -1,309 +0,0 @@
1
"""
Scikit-learn RAG Chatbot - Hugging Face Spaces Optimized Version
A Retrieval-Augmented Generation chatbot for Scikit-learn documentation.
"""

import streamlit as st
import os
import json
import logging
from typing import List, Dict, Optional, Tuple
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")
logging.getLogger("chromadb").setLevel(logging.ERROR)
logging.getLogger("sentence_transformers").setLevel(logging.ERROR)

# Try imports with error handling: the heavy RAG dependencies may be missing
# on a fresh Space, so record availability in a flag instead of crashing at
# import time.
try:
    import chromadb
    from sentence_transformers import SentenceTransformer
    import openai
    DEPENDENCIES_AVAILABLE = True
except ImportError as e:
    DEPENDENCIES_AVAILABLE = False
    # NOTE(review): this st.error call runs at module import time, i.e. before
    # main() calls st.set_page_config(); Streamlit expects set_page_config to
    # be the first Streamlit command — confirm this does not raise on Spaces.
    st.error(f"Missing dependencies: {e}")
27
-
28
class SimpleRAGChatbot:
    """Simplified RAG chatbot for HF Spaces deployment.

    Retrieval is backed by a persistent ChromaDB collection of documentation
    chunks; answers are generated with the OpenAI chat completions API using
    the retrieved chunks as context. The instance is created once (kept in
    Streamlit session state) and lazily set up via initialize().
    """

    def __init__(self):
        # All resources are created lazily in initialize() so construction is
        # cheap and never raises, even when optional dependencies are missing.
        self.client = None          # chromadb.PersistentClient, set on init
        self.collection = None      # active ChromaDB collection
        self.model = None           # SentenceTransformer embedding model
        self.openai_client = None   # cached openai.OpenAI client
        self._openai_key = None     # API key the cached client was built with
        self.initialized = False    # True once a collection is queryable

    def initialize(self):
        """Initialize the RAG system with error handling.

        Tries, in order: (1) loading an existing ChromaDB from disk,
        (2) rebuilding the database from chunks.json.

        Returns:
            bool: True if a document collection is ready for querying.
        """
        try:
            if not DEPENDENCIES_AVAILABLE:
                return False

            # Initialize embedding model.
            # NOTE(review): this model is not referenced by the query path —
            # collection.query(query_texts=...) uses ChromaDB's own embedding
            # function. Loading is kept for backward compatibility; confirm
            # before removing.
            self.model = SentenceTransformer('all-MiniLM-L6-v2')

            # Prefer an existing on-disk database...
            if self._load_existing_database():
                self.initialized = True
                return True

            # ...otherwise rebuild it from the raw chunks file.
            if self._rebuild_from_chunks():
                self.initialized = True
                return True

            return False

        except Exception as e:
            st.error(f"Initialization error: {str(e)}")
            return False

    def _load_existing_database(self) -> bool:
        """Try to attach to an existing ChromaDB at one of the known paths.

        Returns:
            bool: True if a database with at least one collection was found.
        """
        try:
            # Check multiple possible paths (local dev vs. Spaces layouts).
            db_paths = ['./chroma_db', './chroma', '.']

            for db_path in db_paths:
                try:
                    if os.path.exists(os.path.join(db_path, 'chroma.sqlite3')) or os.path.exists(db_path):
                        self.client = chromadb.PersistentClient(path=db_path)
                        collections = self.client.list_collections()

                        if collections:
                            self.collection = collections[0]  # Use first available collection
                            st.success(f"✅ Loaded database from {db_path} with {self.collection.count()} documents")
                            return True

                except Exception:
                    # This candidate path is unusable; try the next one.
                    continue

            return False

        except Exception as e:
            st.warning(f"Could not load existing database: {str(e)}")
            return False

    def _rebuild_from_chunks(self) -> bool:
        """Rebuild the vector database from chunks.json if available.

        Inserts chunks in batches while showing a Streamlit progress bar.

        Returns:
            bool: True if the database was rebuilt successfully.
        """
        try:
            chunks_file = 'chunks.json'
            if not os.path.exists(chunks_file):
                st.error("❌ No chunks.json file found. Please upload the required data files.")
                return False

            # Explicit encoding avoids platform-dependent decoding of the
            # documentation text.
            with open(chunks_file, 'r', encoding='utf-8') as f:
                chunks = json.load(f)

            if not chunks:
                st.error("❌ Chunks file is empty")
                return False

            # Create new database
            db_path = './chroma_db'
            os.makedirs(db_path, exist_ok=True)

            self.client = chromadb.PersistentClient(path=db_path)

            # get_or_create_collection replaces the previous bare
            # try/except get-then-create dance (bare except: hid real errors).
            collection_name = "sklearn_docs"
            self.collection = self.client.get_or_create_collection(collection_name)

            # Add chunks in batches to bound memory use and allow progress
            # reporting.
            batch_size = 100
            total_chunks = len(chunks)

            progress_bar = st.progress(0)
            status_text = st.empty()

            for i in range(0, total_chunks, batch_size):
                batch = chunks[i:i + batch_size]

                documents = [chunk['content'] for chunk in batch]
                metadatas = [{'source': chunk.get('source', 'unknown')} for chunk in batch]
                # IDs are globally unique: i is the batch offset, j the index
                # within the batch.
                ids = [f"chunk_{i + j}" for j in range(len(batch))]

                self.collection.add(
                    documents=documents,
                    metadatas=metadatas,
                    ids=ids
                )

                progress = (i + len(batch)) / total_chunks
                progress_bar.progress(progress)
                status_text.text(f"Processing chunks: {i + len(batch)}/{total_chunks}")

            progress_bar.empty()
            status_text.empty()

            st.success(f"✅ Successfully rebuilt database with {total_chunks} chunks")
            return True

        except Exception as e:
            st.error(f"Failed to rebuild database: {str(e)}")
            return False

    def search_documents(self, query: str, n_results: int = 5) -> List[Dict]:
        """Search the collection for documents relevant to *query*.

        Args:
            query: Free-text user question.
            n_results: Maximum number of chunks to return.

        Returns:
            List of {'content': str, 'source': str} dicts; empty on error or
            when the system is not initialized.
        """
        try:
            if not self.initialized or not self.collection:
                return []

            results = self.collection.query(
                query_texts=[query],
                n_results=n_results
            )

            documents = []
            # Chroma returns one result list per query text; we issued one.
            if results['documents'] and results['documents'][0]:
                for i, doc in enumerate(results['documents'][0]):
                    documents.append({
                        'content': doc,
                        'source': results['metadatas'][0][i].get('source', 'unknown') if results['metadatas'] else 'unknown'
                    })

            return documents

        except Exception as e:
            st.error(f"Search error: {str(e)}")
            return []

    def generate_response(self, query: str, context_docs: List[Dict]) -> str:
        """Generate an answer to *query* grounded in *context_docs*.

        Args:
            query: The user's question.
            context_docs: Documents from search_documents() to use as context.

        Returns:
            The model's answer, or a user-facing message when the API key is
            missing, no context was found, or the API call failed.
        """
        try:
            # Check for OpenAI API key (sidebar input wins over environment).
            api_key = st.session_state.get('openai_api_key') or os.getenv('OPENAI_API_KEY')

            if not api_key:
                return "⚠️ Please provide your OpenAI API key to generate responses."

            # Rebuild the client whenever the key changes — previously the
            # first client was cached forever, so updating the key in the
            # sidebar silently kept using the stale one.
            if self.openai_client is None or api_key != self._openai_key:
                self.openai_client = openai.OpenAI(api_key=api_key)
                self._openai_key = api_key

            # Prepare context
            context = "\n\n".join([f"Source: {doc['source']}\nContent: {doc['content']}"
                                   for doc in context_docs])

            if not context.strip():
                return "I couldn't find relevant information in the documentation. Please try rephrasing your question."

            # Create prompt
            prompt = f"""Based on the following Scikit-learn documentation, please answer the user's question accurately and helpfully.

Documentation Context:
{context}

User Question: {query}

Please provide a clear, accurate answer based on the documentation provided. If the documentation doesn't contain enough information to answer the question completely, please say so."""

            # Generate response; low temperature keeps answers close to the
            # provided documentation.
            response = self.openai_client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that answers questions about Scikit-learn based on provided documentation."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=1000,
                temperature=0.3
            )

            return response.choices[0].message.content

        except Exception as e:
            return f"Error generating response: {str(e)}"
220
-
221
def main():
    """Main Streamlit application.

    Page flow (order matters for Streamlit):
      1. page config + title,
      2. one-time session-state / chatbot initialization,
      3. sidebar (API key + about),
      4. chat history rendering and the chat input loop.
    """
    st.set_page_config(
        page_title="Scikit-learn RAG Chatbot",
        page_icon="🤖",
        layout="wide"
    )

    st.title("🤖 Scikit-learn RAG Chatbot")
    st.markdown("Ask questions about Scikit-learn and get answers from the official documentation!")

    # Initialize session state — runs only on the first script execution of a
    # session; subsequent reruns reuse the same chatbot and message history.
    if 'chatbot' not in st.session_state:
        st.session_state.chatbot = SimpleRAGChatbot()
        st.session_state.messages = []
        st.session_state.initialized = False

    # Initialize the chatbot if not already done (e.g. after a failed attempt
    # on an earlier rerun).
    if not st.session_state.initialized:
        with st.spinner("Initializing RAG system..."):
            success = st.session_state.chatbot.initialize()
            st.session_state.initialized = success

            if not success:
                # st.stop() halts this script run; the user sees the error.
                st.error("❌ Failed to initialize the system. Please check the data files.")
                st.stop()

    # Sidebar for API key
    with st.sidebar:
        st.header("Configuration")

        api_key = st.text_input(
            "OpenAI API Key",
            type="password",
            value=st.session_state.get('openai_api_key', ''),
            help="Enter your OpenAI API key to enable response generation"
        )

        if api_key:
            # Persist the key so generate_response can read it on any rerun.
            st.session_state.openai_api_key = api_key
            st.success("✅ API key configured")

        st.markdown("---")
        st.markdown("### About")
        st.markdown("""
        This chatbot uses RAG (Retrieval-Augmented Generation) to answer questions about Scikit-learn.

        - **Data**: Official Scikit-learn documentation
        - **Embeddings**: all-MiniLM-L6-v2
        - **Vector DB**: ChromaDB
        - **LLM**: GPT-3.5-turbo
        """)

    # Chat interface
    st.header("💬 Chat")

    # Display chat messages accumulated so far (Streamlit reruns the whole
    # script on each interaction, so history must be re-rendered every time).
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Chat input — the walrus binds the submitted text only when the user
    # actually sent a message this rerun.
    if prompt := st.chat_input("Ask a question about Scikit-learn..."):
        # Add user message
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Generate response
        with st.chat_message("assistant"):
            with st.spinner("Searching documentation and generating response..."):
                # Search for relevant documents
                docs = st.session_state.chatbot.search_documents(prompt)

                if docs:
                    st.markdown("**Found relevant documentation:**")
                    # Show at most the top 3 sources, truncated to 500 chars.
                    for i, doc in enumerate(docs[:3], 1):
                        with st.expander(f"📄 Source {i}: {doc['source']}", expanded=False):
                            st.markdown(doc['content'][:500] + "..." if len(doc['content']) > 500 else doc['content'])

                # Generate response
                response = st.session_state.chatbot.generate_response(prompt, docs)
                st.markdown(response)

        # Add assistant message so it survives the next rerun.
        st.session_state.messages.append({"role": "assistant", "content": response})

if __name__ == "__main__":
    main()