added db
- app.py +17 -0
- app_backup.py +0 -621
- app_hf.py +0 -0
- chroma_db/chroma.sqlite3 +1 -1
- run.py +0 -0
app.py
CHANGED
@@ -488,4 +488,21 @@ def main():
 
 
 if __name__ == "__main__":
+    # Check if running with streamlit vs directly with python
+    import sys
+
+    # Check if this is being run directly with python (not via streamlit)
+    # When run with streamlit, sys.argv[0] typically contains 'streamlit' or the script is run in a different context
+    if len(sys.argv) == 1 and 'streamlit' not in sys.modules:
+        print("⚠️ This is a Streamlit application!")
+        print("🚀 Please run it with: streamlit run app.py")
+        print()
+        print("📝 Instructions:")
+        print("1. Make sure you have streamlit installed: pip install streamlit")
+        print("2. Run the app: streamlit run app.py")
+        print("3. Enter your OpenAI API key in the sidebar")
+        print("4. Start asking questions about Scikit-learn!")
+        sys.exit(0)
+
+    # If we get here, we're likely running via streamlit
     main()
app_backup.py
DELETED
@@ -1,621 +0,0 @@
-#!/usr/bin/env python3
-"""
-Scikit-learn Documentation Q&A Bot
-
-A Retrieval-Augmented Generation (RAG) chatbot built with Streamlit
-that answers questions about Scikit-learn documentation using ChromaDB
-for retrieval and OpenAI for generation.
-
-Author: AI Assistant
-Date: September 2025
-"""
-
-import os
-import sys
-import json
-import logging
-from typing import List, Dict, Any, Optional, Tuple
-import streamlit as st
-import chromadb
-from chromadb.config import Settings
-from sentence_transformers import SentenceTransformer
-from openai import OpenAI
-
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class RAGChatbot:
-    """
-    A Retrieval-Augmented Generation chatbot for Scikit-learn documentation.
-
-    This class handles the complete RAG pipeline: retrieval from ChromaDB,
-    augmentation with context, and generation using OpenAI's API.
-    """
-
-    def __init__(
-        self,
-        db_path: str = './chroma_db',
-        collection_name: str = 'sklearn_docs',
-        embedding_model_name: str = 'all-MiniLM-L6-v2'
-    ):
-        """
-        Initialize the RAG chatbot.
-
-        Args:
-            db_path (str): Path to ChromaDB database
-            collection_name (str): Name of the ChromaDB collection
-            embedding_model_name (str): Name of the embedding model
-        """
-        self.db_path = db_path
-        self.collection_name = collection_name
-        self.embedding_model_name = embedding_model_name
-
-        # Initialize components
-        self.chroma_client = None
-        self.collection = None
-        self.embedding_model = None
-        self.openai_client = None
-
-        # Initialize the retrieval system
-        self._initialize_retrieval_system()
-
-    def _initialize_retrieval_system(self) -> None:
-        """
-        Initialize ChromaDB client and embedding model for retrieval.
-        """
-        try:
-            # Check if we're in Hugging Face Spaces environment
-            if os.path.exists('chroma.sqlite3'):
-                # We're likely in HF Spaces - use current directory
-                self.db_path = '.'
-
-            # Initialize ChromaDB client
-            self.chroma_client = chromadb.PersistentClient(
-                path=self.db_path,
-                settings=Settings(anonymized_telemetry=False)
-            )
-
-            # Get or create collection
-            try:
-                self.collection = self.chroma_client.get_collection(
-                    name=self.collection_name
-                )
-            except Exception:
-                # If collection doesn't exist, try to recreate it from chunks
-                if os.path.exists('chunks.json'):
-                    st.warning("Database collection not found. Rebuilding from chunks...")
-                    self._rebuild_collection_from_chunks()
-                else:
-                    raise Exception("Neither database collection nor chunks.json found. Please build the database first.")
-
-            # Load embedding model (same as used for building the database)
-            self.embedding_model = SentenceTransformer(self.embedding_model_name)
-
-            logger.info("RAG retrieval system initialized successfully")
-
-        except Exception as e:
-            logger.error(f"Failed to initialize retrieval system: {e}")
-            # In Streamlit, show user-friendly error
-            if 'streamlit' in sys.modules:
-                st.error(f"❌ Database initialization failed: {e}")
-                st.info("💡 This might be the first run. The database needs to be built from the scraped content.")
-            raise
-
-    def _rebuild_collection_from_chunks(self) -> None:
-        """
-        Rebuild the ChromaDB collection from chunks.json file.
-        This is useful for Hugging Face Spaces deployment.
-        """
-        try:
-            st.info("🔄 Rebuilding database collection from chunks...")
-
-            # Load chunks
-            with open('chunks.json', 'r', encoding='utf-8') as f:
-                chunks = json.load(f)
-
-            # Create collection
-            try:
-                self.chroma_client.delete_collection(name=self.collection_name)
-            except:
-                pass  # Collection might not exist
-
-            self.collection = self.chroma_client.create_collection(
-                name=self.collection_name,
-                metadata={"description": "Scikit-learn documentation embeddings"}
-            )
-
-            # Load embedding model if not loaded
-            if not hasattr(self, 'embedding_model') or self.embedding_model is None:
-                self.embedding_model = SentenceTransformer(self.embedding_model_name)
-
-            # Process chunks in batches
-            batch_size = 100
-            progress_bar = st.progress(0)
-            status_text = st.empty()
-
-            for i in range(0, len(chunks), batch_size):
-                batch_chunks = chunks[i:i + batch_size]
-
-                # Prepare data
-                texts = [chunk['page_content'] for chunk in batch_chunks]
-                metadatas = []
-
-                for chunk in batch_chunks:
-                    metadata = {
-                        'url': chunk['metadata']['url'],
-                        'chunk_index': str(chunk['metadata']['chunk_index']),
-                        'source': chunk['metadata'].get('source', 'scikit-learn-docs'),
-                        'content_length': str(len(chunk['page_content']))
-                    }
-                    metadatas.append(metadata)
-
-                # Create embeddings
-                embeddings = self.embedding_model.encode(texts).tolist()
-
-                # Generate IDs
-                ids = [f"chunk_{i+j}" for j in range(len(batch_chunks))]
-
-                # Add to collection
-                self.collection.add(
-                    ids=ids,
-                    documents=texts,
-                    metadatas=metadatas,
-                    embeddings=embeddings
-                )
-
-                # Update progress
-                progress = (i + batch_size) / len(chunks)
-                progress_bar.progress(min(progress, 1.0))
-                status_text.text(f"Processing chunks: {min(i + batch_size, len(chunks))}/{len(chunks)}")
-
-            progress_bar.empty()
-            status_text.empty()
-            st.success(f"✅ Successfully rebuilt collection with {len(chunks)} chunks!")
-
-        except Exception as e:
-            st.error(f"❌ Failed to rebuild collection: {e}")
-            raise
-
-    def set_openai_client(self, api_key: str) -> bool:
-        """
-        Initialize OpenAI client with API key.
-
-        Args:
-            api_key (str): OpenAI API key
-
-        Returns:
-            bool: True if successful, False otherwise
-        """
-        try:
-            self.openai_client = OpenAI(api_key=api_key)
-
-            # Test the API key with a simple request
-            self.openai_client.models.list()
-
-            logger.info("OpenAI client initialized successfully")
-            return True
-
-        except Exception as e:
-            logger.error(f"Failed to initialize OpenAI client: {e}")
-            st.error(f"Invalid API key or OpenAI connection error: {e}")
-            return False
-
-    def retrieve_relevant_chunks(
-        self,
-        query: str,
-        n_results: int = 3,
-        min_relevance_score: float = 0.1
-    ) -> List[Dict[str, Any]]:
-        """
-        Retrieve relevant text chunks from the vector database.
-
-        Args:
-            query (str): User question/query
-            n_results (int): Number of chunks to retrieve
-            min_relevance_score (float): Minimum relevance score threshold
-
-        Returns:
-            List[Dict[str, Any]]: Retrieved chunks with content and metadata
-        """
-        try:
-            # Query the collection
-            results = self.collection.query(
-                query_texts=[query],
-                n_results=n_results
-            )
-
-            retrieved_chunks = []
-
-            # Process results
-            if results['documents'] and results['documents'][0]:
-                for i in range(len(results['documents'][0])):
-                    chunk_data = {
-                        'content': results['documents'][0][i],
-                        'metadata': results['metadatas'][0][i],
-                        'distance': results['distances'][0][i] if 'distances' in results else None
-                    }
-
-                    # Filter by relevance score if available
-                    if chunk_data['distance'] is None or chunk_data['distance'] >= min_relevance_score:
-                        retrieved_chunks.append(chunk_data)
-
-            logger.info(f"Retrieved {len(retrieved_chunks)} relevant chunks for query: {query[:50]}...")
-            return retrieved_chunks
-
-        except Exception as e:
-            logger.error(f"Error retrieving chunks: {e}")
-            st.error(f"Error during retrieval: {e}")
-            return []
-
-    def create_rag_prompt(
-        self,
-        user_question: str,
-        retrieved_chunks: List[Dict[str, Any]]
-    ) -> str:
-        """
-        Create an augmented prompt for OpenAI with retrieved context.
-
-        Args:
-            user_question (str): Original user question
-            retrieved_chunks (List[Dict[str, Any]]): Retrieved relevant chunks
-
-        Returns:
-            str: Augmented prompt for OpenAI
-        """
-        # Build context from retrieved chunks
-        context_parts = []
-
-        for i, chunk in enumerate(retrieved_chunks, 1):
-            url = chunk['metadata'].get('url', 'Unknown source')
-            content = chunk['content'].strip()
-
-            context_part = f"--- Context {i} (Source: {url}) ---\n{content}\n"
-            context_parts.append(context_part)
-
-        context = "\n".join(context_parts)
-
-        # Create the RAG prompt
-        rag_prompt = f"""You are an expert AI assistant specializing in Scikit-learn, a popular Python machine learning library. Your task is to answer questions about Scikit-learn based ONLY on the provided context from the official documentation.
-
-CONTEXT:
-{context}
-
-USER QUESTION:
-{user_question}
-
-INSTRUCTIONS:
-1. Answer the question using ONLY the information provided in the context above
-2. Be accurate, helpful, and specific
-3. If the context doesn't contain enough information to fully answer the question, say so clearly
-4. Include relevant code examples if they appear in the context
-5. Mention specific function names, class names, or parameter names when relevant
-6. Structure your answer clearly with appropriate formatting
-
-ANSWER:"""
-
-        return rag_prompt
-
-    def generate_answer(
-        self,
-        prompt: str,
-        model: str = "gpt-3.5-turbo",
-        max_tokens: int = 1000,
-        temperature: float = 0.1
-    ) -> Optional[str]:
-        """
-        Generate answer using OpenAI API.
-
-        Args:
-            prompt (str): Augmented prompt with context
-            model (str): OpenAI model to use
-            max_tokens (int): Maximum tokens in response
-            temperature (float): Temperature for generation
-
-        Returns:
-            Optional[str]: Generated answer or None if failed
-        """
-        try:
-            response = self.openai_client.chat.completions.create(
-                model=model,
-                messages=[
-                    {
-                        "role": "system",
-                        "content": "You are a helpful AI assistant specializing in Scikit-learn documentation. Provide accurate, helpful answers based only on the provided context."
-                    },
-                    {
-                        "role": "user",
-                        "content": prompt
-                    }
-                ],
-                max_tokens=max_tokens,
-                temperature=temperature,
-                top_p=0.9
-            )
-
-            answer = response.choices[0].message.content.strip()
-            logger.info(f"Generated answer of length: {len(answer)}")
-            return answer
-
-        except Exception as e:
-            logger.error(f"Error generating answer: {e}")
-            st.error(f"Error generating answer: {e}")
-            return None
-
-    def get_answer(
-        self,
-        user_question: str,
-        n_chunks: int = 3,
-        model: str = "gpt-3.5-turbo"
-    ) -> Tuple[Optional[str], List[str]]:
-        """
-        Complete RAG pipeline: retrieve, augment, generate.
-
-        Args:
-            user_question (str): User's question
-            n_chunks (int): Number of chunks to retrieve
-            model (str): OpenAI model to use
-
-        Returns:
-            Tuple[Optional[str], List[str]]: Generated answer and source URLs
-        """
-        if not self.openai_client:
-            st.error("OpenAI client not initialized. Please provide a valid API key.")
-            return None, []
-
-        # Step 1: Retrieve relevant chunks
-        with st.spinner("🔍 Searching relevant documentation..."):
-            retrieved_chunks = self.retrieve_relevant_chunks(user_question, n_chunks)
-
-        if not retrieved_chunks:
-            return "I couldn't find relevant information in the Scikit-learn documentation to answer your question. Please try rephrasing your question or ask about a different topic.", []
-
-        # Step 2: Create augmented prompt
-        with st.spinner("📝 Preparing context..."):
-            rag_prompt = self.create_rag_prompt(user_question, retrieved_chunks)
-
-        # Step 3: Generate answer
-        with st.spinner("🤖 Generating answer..."):
-            answer = self.generate_answer(rag_prompt, model)
-
-        # Extract source URLs
-        source_urls = [chunk['metadata'].get('url', 'Unknown') for chunk in retrieved_chunks]
-        source_urls = list(dict.fromkeys(source_urls))  # Remove duplicates while preserving order
-
-        return answer, source_urls
-
-
-def initialize_session_state():
-    """Initialize Streamlit session state variables."""
-    if 'chatbot' not in st.session_state:
-        try:
-            # Show initialization message
-            init_placeholder = st.empty()
-            init_placeholder.info("🔄 Initializing RAG system...")
-
-            st.session_state.chatbot = RAGChatbot()
-            init_placeholder.empty()
-
-        except Exception as e:
-            st.error(f"❌ Failed to initialize chatbot: {e}")
-
-            # Provide helpful instructions
-            st.markdown("""
-            ### 🔧 Troubleshooting
-
-            This error typically occurs when:
-            1. **First deployment**: The database hasn't been built yet
-            2. **Missing files**: Required data files are not available
-
-            ### 📋 Required Files
-            Make sure these files are present:
-            - `chunks.json` (processed text chunks)
-            - `chroma.sqlite3` (database file) OR `chroma_db/` directory
-
-            ### 🚀 Quick Fix for Hugging Face Spaces
-            If you're running this on Hugging Face Spaces, make sure you've uploaded:
-            1. All Python files (`app.py`, `build_vector_db.py`, etc.)
-            2. Data files (`chunks.json`, `scraped_content.json`)
-            3. Database files (`chroma.sqlite3` or the `chroma_db/` folder)
-            """)
-            st.stop()
-
-    if 'openai_initialized' not in st.session_state:
-        st.session_state.openai_initialized = False
-
-    if 'chat_history' not in st.session_state:
-        st.session_state.chat_history = []
-
-
-def main():
-    """Main Streamlit application."""
-
-    # Page configuration
-    st.set_page_config(
-        page_title="Scikit-learn Q&A Bot",
-        page_icon="🤖",
-        layout="wide",
-        initial_sidebar_state="expanded"
-    )
-
-    # Initialize session state
-    initialize_session_state()
-
-    # Main title and description
-    st.title("🤖 Scikit-learn Documentation Q&A Bot")
-
-    # Show database status
-    try:
-        collection_count = st.session_state.chatbot.collection.count()
-        st.success(f"✅ Database ready with {collection_count:,} documentation chunks")
-    except:
-        st.warning("⚠️ Database status unknown")
-
-    st.markdown("""
-    Welcome to the **Scikit-learn Documentation Q&A Bot**! This intelligent assistant can answer your questions about Scikit-learn using the official documentation.
-
-    **How it works:**
-    1. 🔍 **Retrieval**: Searches through 1,249+ documentation chunks
-    2. 📝 **Augmentation**: Provides relevant context to the AI
-    3. 🤖 **Generation**: Uses OpenAI to generate accurate answers
-
-    **👈 To get started**: Enter your OpenAI API key in the sidebar!
-    """)
-
-    # Sidebar for API key and settings
-    with st.sidebar:
-        st.header("⚙️ Configuration")
-
-        # OpenAI API Key input
-        api_key = st.text_input(
-            "🔑 OpenAI API Key",
-            type="password",
-            placeholder="sk-...",
-            help="Enter your OpenAI API key to enable the chatbot"
-        )
-
-        if api_key and not st.session_state.openai_initialized:
-            if st.session_state.chatbot.set_openai_client(api_key):
-                st.session_state.openai_initialized = True
-                st.success("✅ API key validated!")
-                st.rerun()
-
-        # Model selection
-        model = st.selectbox(
-            "🧠 AI Model",
-            ["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo-preview"],
-            index=0,
-            help="Choose the OpenAI model for generating answers"
-        )
-
-        # Number of context chunks
-        n_chunks = st.slider(
-            "📄 Context Chunks",
-            min_value=1,
-            max_value=5,
-            value=3,
-            help="Number of relevant documentation chunks to use for context"
-        )
-
-        st.markdown("---")
-
-        # Database info
-        st.header("📊 Database Info")
-        try:
-            collection_count = st.session_state.chatbot.collection.count()
-            st.metric("Total Documents", f"{collection_count:,}")
-            st.metric("Embedding Model", "all-MiniLM-L6-v2")
-            st.metric("Vector Dimensions", "384")
-        except:
-            st.error("Could not load database info")
-
-        st.markdown("---")
-
-        # Clear chat history
-        if st.button("🗑️ Clear Chat History"):
-            st.session_state.chat_history = []
-            st.rerun()
-
-    # Main chat interface
-    col1, col2 = st.columns([2, 1])
-
-    with col1:
-        st.header("💬 Ask Your Question")
-
-        # Question input
-        default_question = st.session_state.get('selected_question', '')
-        user_question = st.text_input(
-            "Enter your question about Scikit-learn:",
-            value=default_question,
-            placeholder="e.g., How do I perform cross-validation in scikit-learn?",
-            key="question_input"
-        )
-
-        # Clear selected question after using it
-        if 'selected_question' in st.session_state:
-            del st.session_state['selected_question']
-
-        # Submit button
-        submit_button = st.button("🚀 Get Answer", type="primary")
-
-        # Process question
-        if submit_button and user_question:
-            if not st.session_state.openai_initialized:
-                st.error("⚠️ Please enter a valid OpenAI API key in the sidebar first.")
-            else:
-                # Get answer using RAG
-                answer, sources = st.session_state.chatbot.get_answer(
-                    user_question, n_chunks, model
-                )
-
-                if answer:
-                    # Add to chat history
-                    st.session_state.chat_history.append({
-                        'question': user_question,
-                        'answer': answer,
-                        'sources': sources
-                    })
-
-                    # Clear input
-                    st.rerun()
-
-        # Display chat history
-        if st.session_state.chat_history:
-            st.header("📝 Chat History")
-
-            for i, chat in enumerate(reversed(st.session_state.chat_history)):
-                with st.expander(f"Q: {chat['question'][:60]}...", expanded=(i == 0)):
-                    st.markdown(f"**Question:** {chat['question']}")
-                    st.markdown(f"**Answer:**\n{chat['answer']}")
-
-                    if chat['sources']:
-                        st.markdown("**Sources:**")
-                        for j, source in enumerate(chat['sources'], 1):
-                            source_name = source.split('/')[-1] if '/' in source else source
-                            st.markdown(f"{j}. [{source_name}]({source})")
-
-    with col2:
-        st.header("💡 Example Questions")
-
-        example_questions = [
-            "How do I perform cross-validation in scikit-learn?",
-            "What is the difference between Ridge and Lasso regression?",
-            "How do I use GridSearchCV for parameter tuning?",
-            "What clustering algorithms are available in scikit-learn?",
-            "How do I preprocess data using StandardScaler?",
-            "What is the difference between classification and regression?",
-            "How do I handle missing values in my dataset?",
-            "What is feature selection and how do I use it?",
-            "How do I visualize decision trees?",
-            "What is ensemble learning in scikit-learn?"
-        ]
-
-        for question in example_questions:
-            if st.button(question, key=f"example_{hash(question)}"):
-                # Use a different approach to set the question
-                st.session_state['selected_question'] = question
-                st.rerun()
-
-        st.markdown("---")
-
-        st.header("ℹ️ Tips")
-        st.markdown("""
-        **For best results:**
-        - Be specific in your questions
-        - Ask about scikit-learn functionality
-        - Include context when possible
-        - Check the sources for verification
-
-        **The bot can help with:**
-        - API usage and parameters
-        - Algorithm explanations
-        - Code examples
-        - Best practices
-        - Troubleshooting
-        """)
-
-
-if __name__ == "__main__":
-    main()
app_hf.py
DELETED
File without changes

chroma_db/chroma.sqlite3
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:58b93c87e29c6b2a74e2b9bf0d13b8a76878037325a1fb5cfbb1886bc2068e68
 size 13279232

run.py
DELETED
File without changes