Spaces:

fguryel
/

scikit-rag

Sleeping

App Files Community

fguryel committed on Sep 28, 2025

Commit

7ed4bfa

1 Parent(s): b787a9a

add db

Browse files

Files changed (12) hide show

.streamlit/config.toml +0 -14
.streamlit/secrets.toml +0 -5
README.md +12 -3
ab7fa527-b151-425e-9f81-9aa3f7b65f1d/link_lists.bin +0 -0
app.py +42 -131
chroma_db/baa4ce7e-6804-430e-b4a2-be808a468152/data_level0.bin +3 -0
{ab7fa527-b151-425e-9f81-9aa3f7b65f1d → chroma_db/baa4ce7e-6804-430e-b4a2-be808a468152}/header.bin +1 -1
ab7fa527-b151-425e-9f81-9aa3f7b65f1d/data_level0.bin → chroma_db/baa4ce7e-6804-430e-b4a2-be808a468152/index_metadata.pickle +2 -2
{ab7fa527-b151-425e-9f81-9aa3f7b65f1d → chroma_db/baa4ce7e-6804-430e-b4a2-be808a468152}/length.bin +2 -2
chroma_db/baa4ce7e-6804-430e-b4a2-be808a468152/link_lists.bin +3 -0
chroma.sqlite3 → chroma_db/chroma.sqlite3 +2 -2
run.py +0 -39

.streamlit/config.toml DELETED Viewed

@@ -1,14 +0,0 @@
-[server]
-headless = true
-port = 7860
-enableCORS = false
-enableXsrfProtection = false
-[theme]
-base = "light"
-primaryColor = "#1f77b4"
-backgroundColor = "#ffffff"
-secondaryBackgroundColor = "#f0f2f6"
-[browser]
-gatherUsageStats = false

.streamlit/secrets.toml DELETED Viewed

@@ -1,5 +0,0 @@
-# Streamlit secrets for Hugging Face Spaces
-# This helps reduce warning messages
-[general]
-dataFrameSerialization = "legacy"

README.md CHANGED Viewed

@@ -1,4 +1,16 @@
 ---
 title: Scikit-learn Documentation Q&A Bot
 emoji: 🤖
 colorFrom: blue
@@ -9,9 +21,6 @@ app_file: app.py
 pinned: false
 license: mit
 ---
-pinned: false
-license: mit
----
 # Scikit-learn Documentation Q&A Bot 🤖

 ---
+title: Scikit Rag
+emoji: 🐢
+colorFrom: blue
+colorTo: yellow
+sdk: gradio
+sdk_version: 5.47.2
+app_file: app.py
+pinned: false
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+---
 title: Scikit-learn Documentation Q&A Bot
 emoji: 🤖
 colorFrom: blue
 pinned: false
 license: mit
 ---
 # Scikit-learn Documentation Q&A Bot 🤖

ab7fa527-b151-425e-9f81-9aa3f7b65f1d/link_lists.bin DELETED Viewed

File without changes

app.py CHANGED Viewed

@@ -26,10 +26,6 @@ from openai import OpenAI
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Suppress Streamlit context warnings for HF Spaces
-logging.getLogger("streamlit.runtime.scriptrunner_utils.script_run_context").setLevel(logging.ERROR)
-logging.getLogger("streamlit.runtime.state.session_state_proxy").setLevel(logging.ERROR)
 class RAGChatbot:
     """
@@ -71,68 +67,34 @@ class RAGChatbot:
         Initialize ChromaDB client and embedding model for retrieval.
         """
         try:
-            # Detect environment and set appropriate database path
-            current_dir = os.getcwd()
-            # Check for database files in different locations
-            if os.path.exists(os.path.join(current_dir, 'chroma.sqlite3')):
-                self.db_path = current_dir
-                logger.info(f"Using database in current directory: {current_dir}")
-            elif os.path.exists(os.path.join(self.db_path, 'chroma.sqlite3')):
-                logger.info(f"Using database in specified path: {self.db_path}")
-            else:
-                logger.warning("No database file found, will attempt to rebuild from chunks")
-            # Initialize ChromaDB client with error handling
             try:
-                self.chroma_client = chromadb.PersistentClient(
-                    path=self.db_path,
-                    settings=Settings(
-                        anonymized_telemetry=False,
-                        allow_reset=True,
-                        is_persistent=True
-                    )
                 )
-                logger.info(f"ChromaDB client initialized at: {self.db_path}")
-            except Exception as client_error:
-                logger.error(f"ChromaDB client initialization failed: {client_error}")
-                # Try with default settings
-                self.chroma_client = chromadb.PersistentClient(path=self.db_path)
-            # Get or create collection with robust error handling
-            collection_found = False
-            try:
-                # First, list all collections to see what's available
-                collections = self.chroma_client.list_collections()
-                collection_names = [col.name for col in collections]
-                logger.info(f"Available collections: {collection_names}")
-                if self.collection_name in collection_names:
-                    self.collection = self.chroma_client.get_collection(name=self.collection_name)
-                    collection_found = True
-                    logger.info(f"Successfully loaded collection: {self.collection_name}")
-                else:
-                    logger.warning(f"Collection '{self.collection_name}' not found in {collection_names}")
-            except Exception as col_error:
-                logger.error(f"Error accessing collections: {col_error}")
-            # If collection not found, rebuild from chunks
-            if not collection_found:
                 if os.path.exists('chunks.json'):
-                    logger.info("Attempting to rebuild collection from chunks.json")
-                    if 'streamlit' in sys.modules:
-                        st.warning("🔄 Database collection not found. Rebuilding from chunks...")
                     self._rebuild_collection_from_chunks()
                 else:
-                    error_msg = f"Collection '{self.collection_name}' not found and no chunks.json available for rebuilding"
-                    logger.error(error_msg)
-                    raise Exception(error_msg)
-            # Initialize embedding model as None - will be loaded lazily when needed
-            self.embedding_model = None
-            logger.info("RAG retrieval system initialized successfully (embedding model will load on first use)")
         except Exception as e:
             logger.error(f"Failed to initialize retrieval system: {e}")
@@ -148,38 +110,26 @@ class RAGChatbot:
         This is useful for Hugging Face Spaces deployment.
         """
         try:
-            logger.info("Starting collection rebuild from chunks.json")
-            if 'streamlit' in sys.modules:
-                st.info("🔄 Rebuilding database collection from chunks...")
-            # Load chunks with error handling
-            chunks_path = 'chunks.json'
-            if not os.path.exists(chunks_path):
-                raise FileNotFoundError(f"chunks.json not found at {chunks_path}")
-            with open(chunks_path, 'r', encoding='utf-8') as f:
                 chunks = json.load(f)
-            logger.info(f"Loaded {len(chunks)} chunks from {chunks_path}")
-            # Safely create collection
             try:
-                # Try to delete existing collection first
-                existing_collections = [col.name for col in self.chroma_client.list_collections()]
-                if self.collection_name in existing_collections:
-                    logger.info(f"Deleting existing collection: {self.collection_name}")
-                    self.chroma_client.delete_collection(name=self.collection_name)
-            except Exception as del_error:
-                logger.warning(f"Could not delete existing collection: {del_error}")
-            # Create new collection
             self.collection = self.chroma_client.create_collection(
                 name=self.collection_name,
                 metadata={"description": "Scikit-learn documentation embeddings"}
             )
-            # Ensure embedding model is loaded (lazy loading)
-            self._ensure_embedding_model_loaded()
             # Process chunks in batches
             batch_size = 100
@@ -202,9 +152,6 @@ class RAGChatbot:
                     }
                     metadatas.append(metadata)
-                # Ensure embedding model is loaded
-                self._ensure_embedding_model_loaded()
                 # Create embeddings
                 embeddings = self.embedding_model.encode(texts).tolist()
@@ -256,19 +203,6 @@ class RAGChatbot:
             st.error(f"Invalid API key or OpenAI connection error: {e}")
             return False
-    def _ensure_embedding_model_loaded(self):
-        """
-        Lazy loading of embedding model to speed up initialization.
-        """
-        if self.embedding_model is None:
-            logger.info("Loading embedding model (first time use)...")
-            if 'streamlit' in sys.modules:
-                with st.spinner("🔄 Loading embedding model (first time only)..."):
-                    self.embedding_model = SentenceTransformer(self.embedding_model_name)
-            else:
-                self.embedding_model = SentenceTransformer(self.embedding_model_name)
-            logger.info("Embedding model loaded successfully")
     def retrieve_relevant_chunks(
         self,
         query: str,
@@ -456,26 +390,12 @@ ANSWER:"""
 def initialize_session_state():
     """Initialize Streamlit session state variables."""
     if 'chatbot' not in st.session_state:
-        # Use lazy loading to avoid blocking startup
-        st.session_state.chatbot = None
-        st.session_state.chatbot_initialized = False
-    if 'openai_initialized' not in st.session_state:
-        st.session_state.openai_initialized = False
-    if 'chat_history' not in st.session_state:
-        st.session_state.chat_history = []
-def ensure_chatbot_initialized():
-    """Lazy initialization of chatbot to avoid blocking startup."""
-    if st.session_state.chatbot is None or not st.session_state.chatbot_initialized:
         try:
             # Show initialization message
             init_placeholder = st.empty()
-            init_placeholder.info("🔄 Initializing RAG system (first time may take a moment)...")
             st.session_state.chatbot = RAGChatbot()
-            st.session_state.chatbot_initialized = True
             init_placeholder.empty()
         except Exception as e:
@@ -526,15 +446,12 @@ def main():
     # Main title and description
     st.title("🤖 Scikit-learn Documentation Q&A Bot")
-    # Show database status with lazy loading
-    if st.session_state.chatbot_initialized and st.session_state.chatbot:
-        try:
-            collection_count = st.session_state.chatbot.collection.count()
-            st.success(f"✅ Database ready with {collection_count:,} documentation chunks")
-        except:
-            st.warning("⚠️ Database connection issue")
-    else:
-        st.info("💾 Database will be initialized when you start chatting")
     st.markdown("""
     Welcome to the **Scikit-learn Documentation Q&A Bot**! This intelligent assistant can answer your questions about Scikit-learn using the official documentation.
@@ -560,9 +477,7 @@ def main():
         )
         if api_key and not st.session_state.openai_initialized:
-            # Initialize chatbot if needed
-            ensure_chatbot_initialized()
-            if st.session_state.chatbot and st.session_state.chatbot.set_openai_client(api_key):
                 st.session_state.openai_initialized = True
                 st.success("✅ API key validated!")
                 st.rerun()
@@ -630,13 +545,9 @@ def main():
             if not st.session_state.openai_initialized:
                 st.error("⚠️ Please enter a valid OpenAI API key in the sidebar first.")
             else:
-                # Initialize chatbot if not already done
-                ensure_chatbot_initialized()
-                if st.session_state.chatbot:
-                    # Get answer using RAG
-                    answer, sources = st.session_state.chatbot.get_answer(
-                        user_question, n_chunks, model
                 )
                 if answer:

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 class RAGChatbot:
     """
         Initialize ChromaDB client and embedding model for retrieval.
         """
         try:
+            # Check if we're in Hugging Face Spaces environment
+            if os.path.exists('chroma.sqlite3'):
+                # We're likely in HF Spaces - use current directory
+                self.db_path = '.'
+            # Initialize ChromaDB client
+            self.chroma_client = chromadb.PersistentClient(
+                path=self.db_path,
+                settings=Settings(anonymized_telemetry=False)
+            )
+            # Get or create collection
             try:
+                self.collection = self.chroma_client.get_collection(
+                    name=self.collection_name
                 )
+            except Exception:
+                # If collection doesn't exist, try to recreate it from chunks
                 if os.path.exists('chunks.json'):
+                    st.warning("Database collection not found. Rebuilding from chunks...")
                     self._rebuild_collection_from_chunks()
                 else:
+                    raise Exception("Neither database collection nor chunks.json found. Please build the database first.")
+            # Load embedding model (same as used for building the database)
+            self.embedding_model = SentenceTransformer(self.embedding_model_name)
+            logger.info("RAG retrieval system initialized successfully")
         except Exception as e:
             logger.error(f"Failed to initialize retrieval system: {e}")
         This is useful for Hugging Face Spaces deployment.
         """
         try:
+            st.info("🔄 Rebuilding database collection from chunks...")
+            # Load chunks
+            with open('chunks.json', 'r', encoding='utf-8') as f:
                 chunks = json.load(f)
+            # Create collection
             try:
+                self.chroma_client.delete_collection(name=self.collection_name)
+            except:
+                pass  # Collection might not exist
             self.collection = self.chroma_client.create_collection(
                 name=self.collection_name,
                 metadata={"description": "Scikit-learn documentation embeddings"}
             )
+            # Load embedding model if not loaded
+            if not hasattr(self, 'embedding_model') or self.embedding_model is None:
+                self.embedding_model = SentenceTransformer(self.embedding_model_name)
             # Process chunks in batches
             batch_size = 100
                     }
                     metadatas.append(metadata)
                 # Create embeddings
                 embeddings = self.embedding_model.encode(texts).tolist()
             st.error(f"Invalid API key or OpenAI connection error: {e}")
             return False
     def retrieve_relevant_chunks(
         self,
         query: str,
 def initialize_session_state():
     """Initialize Streamlit session state variables."""
     if 'chatbot' not in st.session_state:
         try:
             # Show initialization message
             init_placeholder = st.empty()
+            init_placeholder.info("🔄 Initializing RAG system...")
             st.session_state.chatbot = RAGChatbot()
             init_placeholder.empty()
         except Exception as e:
     # Main title and description
     st.title("🤖 Scikit-learn Documentation Q&A Bot")
+    # Show database status
+    try:
+        collection_count = st.session_state.chatbot.collection.count()
+        st.success(f"✅ Database ready with {collection_count:,} documentation chunks")
+    except:
+        st.warning("⚠️ Database status unknown")
     st.markdown("""
     Welcome to the **Scikit-learn Documentation Q&A Bot**! This intelligent assistant can answer your questions about Scikit-learn using the official documentation.
         )
         if api_key and not st.session_state.openai_initialized:
+            if st.session_state.chatbot.set_openai_client(api_key):
                 st.session_state.openai_initialized = True
                 st.success("✅ API key validated!")
                 st.rerun()
             if not st.session_state.openai_initialized:
                 st.error("⚠️ Please enter a valid OpenAI API key in the sidebar first.")
             else:
+                # Get answer using RAG
+                answer, sources = st.session_state.chatbot.get_answer(
+                    user_question, n_chunks, model
                 )
                 if answer:

chroma_db/baa4ce7e-6804-430e-b4a2-be808a468152/data_level0.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a237be672525b27994290e8ba6f6b7d24fb7ee596722855e7c28426e2b02310
+size 1676000

{ab7fa527-b151-425e-9f81-9aa3f7b65f1d → chroma_db/baa4ce7e-6804-430e-b4a2-be808a468152}/header.bin RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
 size 100

 version https://git-lfs.github.com/spec/v1
+oid sha256:47f6c2dc55a35a27eb2842e8ca379968e83a861a382bdb68505796e318930e07
 size 100

ab7fa527-b151-425e-9f81-9aa3f7b65f1d/data_level0.bin → chroma_db/baa4ce7e-6804-430e-b4a2-be808a468152/index_metadata.pickle RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f97547c2466889737fdadcd740478420160f9c7094c36b6ae29c71d75887824e
-size 167600

 version https://git-lfs.github.com/spec/v1
+oid sha256:531c23a26708ff72912defdb15b34c63b4fdccdb1751c41e1908e084693236de
+size 92132

{ab7fa527-b151-425e-9f81-9aa3f7b65f1d → chroma_db/baa4ce7e-6804-430e-b4a2-be808a468152}/length.bin RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7a12e561363385e9dfeeab326368731c030ed4b374e7f5897ac819159d2884c5
-size 400

 version https://git-lfs.github.com/spec/v1
+oid sha256:b6b598b7fdc2d148c717bb8c4e55b6b52697a7314208d29ab451ae1543c13619
+size 4000

chroma_db/baa4ce7e-6804-430e-b4a2-be808a468152/link_lists.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:773e2b377602f83f08386a73b662d59ceeaa6d44f7201a5801894d4df2f3208b
+size 8624

chroma.sqlite3 → chroma_db/chroma.sqlite3 RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5641e3ed4b6a48b08f13e2b125000fe62c3eec109367b5c2c40799c25517e0ff
-size 13283328

 version https://git-lfs.github.com/spec/v1
+oid sha256:c6ea2884ce08a5f478431a1cbbc51133d4c941ff0ff7c6016db2590352066714
+size 13279232

run.py CHANGED Viewed

@@ -1,39 +0,0 @@
-#!/usr/bin/env python3
-"""
-Runner script for Hugging Face Spaces
-This ensures Streamlit runs properly in HF Spaces environment
-"""
-import subprocess
-import sys
-import os
-def main():
-    """Run the Streamlit app with proper configuration for HF Spaces"""
-    # Set environment variables for better HF Spaces compatibility
-    os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
-    os.environ["STREAMLIT_SERVER_ENABLE_CORS"] = "false"
-    os.environ["STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION"] = "false"
-    # Run streamlit with the app file
-    cmd = [
-        sys.executable, "-m", "streamlit", "run", "app.py",
-        "--server.port=7860",
-        "--server.address=0.0.0.0",
-        "--server.headless=true",
-        "--server.enableCORS=false",
-        "--server.enableXsrfProtection=false",
-        "--theme.base=light"
-    ]
-    print("🚀 Starting Streamlit app for Hugging Face Spaces...")
-    print(f"Command: {' '.join(cmd)}")
-    # Execute the command
-    result = subprocess.run(cmd)
-    return result.returncode
-if __name__ == "__main__":
-    exit_code = main()
-    sys.exit(exit_code)