fguryel committed on
Commit
7ed4bfa
·
1 Parent(s): b787a9a
.streamlit/config.toml DELETED
@@ -1,14 +0,0 @@
1
- [server]
2
- headless = true
3
- port = 7860
4
- enableCORS = false
5
- enableXsrfProtection = false
6
-
7
- [theme]
8
- base = "light"
9
- primaryColor = "#1f77b4"
10
- backgroundColor = "#ffffff"
11
- secondaryBackgroundColor = "#f0f2f6"
12
-
13
- [browser]
14
- gatherUsageStats = false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.streamlit/secrets.toml DELETED
@@ -1,5 +0,0 @@
1
- # Streamlit secrets for Hugging Face Spaces
2
- # This helps reduce warning messages
3
-
4
- [general]
5
- dataFrameSerialization = "legacy"
 
 
 
 
 
 
README.md CHANGED
@@ -1,4 +1,16 @@
1
  ---
 
 
 
 
 
 
 
 
 
 
 
 
2
  title: Scikit-learn Documentation Q&A Bot
3
  emoji: 🤖
4
  colorFrom: blue
@@ -9,9 +21,6 @@ app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
- pinned: false
13
- license: mit
14
- ---
15
 
16
  # Scikit-learn Documentation Q&A Bot 🤖
17
 
 
1
  ---
2
+ title: Scikit Rag
3
+ emoji: 🐒
4
+ colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 5.47.2
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
13
+ ---
14
  title: Scikit-learn Documentation Q&A Bot
15
  emoji: 🤖
16
  colorFrom: blue
 
21
  pinned: false
22
  license: mit
23
  ---
 
 
 
24
 
25
  # Scikit-learn Documentation Q&A Bot 🤖
26
 
ab7fa527-b151-425e-9f81-9aa3f7b65f1d/link_lists.bin DELETED
File without changes
app.py CHANGED
@@ -26,10 +26,6 @@ from openai import OpenAI
26
  logging.basicConfig(level=logging.INFO)
27
  logger = logging.getLogger(__name__)
28
 
29
- # Suppress Streamlit context warnings for HF Spaces
30
- logging.getLogger("streamlit.runtime.scriptrunner_utils.script_run_context").setLevel(logging.ERROR)
31
- logging.getLogger("streamlit.runtime.state.session_state_proxy").setLevel(logging.ERROR)
32
-
33
 
34
  class RAGChatbot:
35
  """
@@ -71,68 +67,34 @@ class RAGChatbot:
71
  Initialize ChromaDB client and embedding model for retrieval.
72
  """
73
  try:
74
- # Detect environment and set appropriate database path
75
- current_dir = os.getcwd()
76
-
77
- # Check for database files in different locations
78
- if os.path.exists(os.path.join(current_dir, 'chroma.sqlite3')):
79
- self.db_path = current_dir
80
- logger.info(f"Using database in current directory: {current_dir}")
81
- elif os.path.exists(os.path.join(self.db_path, 'chroma.sqlite3')):
82
- logger.info(f"Using database in specified path: {self.db_path}")
83
- else:
84
- logger.warning("No database file found, will attempt to rebuild from chunks")
85
 
86
- # Initialize ChromaDB client with error handling
87
  try:
88
- self.chroma_client = chromadb.PersistentClient(
89
- path=self.db_path,
90
- settings=Settings(
91
- anonymized_telemetry=False,
92
- allow_reset=True,
93
- is_persistent=True
94
- )
95
  )
96
- logger.info(f"ChromaDB client initialized at: {self.db_path}")
97
- except Exception as client_error:
98
- logger.error(f"ChromaDB client initialization failed: {client_error}")
99
- # Try with default settings
100
- self.chroma_client = chromadb.PersistentClient(path=self.db_path)
101
-
102
- # Get or create collection with robust error handling
103
- collection_found = False
104
- try:
105
- # First, list all collections to see what's available
106
- collections = self.chroma_client.list_collections()
107
- collection_names = [col.name for col in collections]
108
- logger.info(f"Available collections: {collection_names}")
109
-
110
- if self.collection_name in collection_names:
111
- self.collection = self.chroma_client.get_collection(name=self.collection_name)
112
- collection_found = True
113
- logger.info(f"Successfully loaded collection: {self.collection_name}")
114
- else:
115
- logger.warning(f"Collection '{self.collection_name}' not found in {collection_names}")
116
-
117
- except Exception as col_error:
118
- logger.error(f"Error accessing collections: {col_error}")
119
-
120
- # If collection not found, rebuild from chunks
121
- if not collection_found:
122
  if os.path.exists('chunks.json'):
123
- logger.info("Attempting to rebuild collection from chunks.json")
124
- if 'streamlit' in sys.modules:
125
- st.warning("🔄 Database collection not found. Rebuilding from chunks...")
126
  self._rebuild_collection_from_chunks()
127
  else:
128
- error_msg = f"Collection '{self.collection_name}' not found and no chunks.json available for rebuilding"
129
- logger.error(error_msg)
130
- raise Exception(error_msg)
131
 
132
- # Initialize embedding model as None - will be loaded lazily when needed
133
- self.embedding_model = None
134
 
135
- logger.info("RAG retrieval system initialized successfully (embedding model will load on first use)")
136
 
137
  except Exception as e:
138
  logger.error(f"Failed to initialize retrieval system: {e}")
@@ -148,38 +110,26 @@ class RAGChatbot:
148
  This is useful for Hugging Face Spaces deployment.
149
  """
150
  try:
151
- logger.info("Starting collection rebuild from chunks.json")
152
- if 'streamlit' in sys.modules:
153
- st.info("🔄 Rebuilding database collection from chunks...")
154
 
155
- # Load chunks with error handling
156
- chunks_path = 'chunks.json'
157
- if not os.path.exists(chunks_path):
158
- raise FileNotFoundError(f"chunks.json not found at {chunks_path}")
159
-
160
- with open(chunks_path, 'r', encoding='utf-8') as f:
161
  chunks = json.load(f)
162
 
163
- logger.info(f"Loaded {len(chunks)} chunks from {chunks_path}")
164
-
165
- # Safely create collection
166
  try:
167
- # Try to delete existing collection first
168
- existing_collections = [col.name for col in self.chroma_client.list_collections()]
169
- if self.collection_name in existing_collections:
170
- logger.info(f"Deleting existing collection: {self.collection_name}")
171
- self.chroma_client.delete_collection(name=self.collection_name)
172
- except Exception as del_error:
173
- logger.warning(f"Could not delete existing collection: {del_error}")
174
-
175
- # Create new collection
176
  self.collection = self.chroma_client.create_collection(
177
  name=self.collection_name,
178
  metadata={"description": "Scikit-learn documentation embeddings"}
179
  )
180
 
181
- # Ensure embedding model is loaded (lazy loading)
182
- self._ensure_embedding_model_loaded()
 
183
 
184
  # Process chunks in batches
185
  batch_size = 100
@@ -202,9 +152,6 @@ class RAGChatbot:
202
  }
203
  metadatas.append(metadata)
204
 
205
- # Ensure embedding model is loaded
206
- self._ensure_embedding_model_loaded()
207
-
208
  # Create embeddings
209
  embeddings = self.embedding_model.encode(texts).tolist()
210
 
@@ -256,19 +203,6 @@ class RAGChatbot:
256
  st.error(f"Invalid API key or OpenAI connection error: {e}")
257
  return False
258
 
259
- def _ensure_embedding_model_loaded(self):
260
- """
261
- Lazy loading of embedding model to speed up initialization.
262
- """
263
- if self.embedding_model is None:
264
- logger.info("Loading embedding model (first time use)...")
265
- if 'streamlit' in sys.modules:
266
- with st.spinner("🔄 Loading embedding model (first time only)..."):
267
- self.embedding_model = SentenceTransformer(self.embedding_model_name)
268
- else:
269
- self.embedding_model = SentenceTransformer(self.embedding_model_name)
270
- logger.info("Embedding model loaded successfully")
271
-
272
  def retrieve_relevant_chunks(
273
  self,
274
  query: str,
@@ -456,26 +390,12 @@ ANSWER:"""
456
  def initialize_session_state():
457
  """Initialize Streamlit session state variables."""
458
  if 'chatbot' not in st.session_state:
459
- # Use lazy loading to avoid blocking startup
460
- st.session_state.chatbot = None
461
- st.session_state.chatbot_initialized = False
462
-
463
- if 'openai_initialized' not in st.session_state:
464
- st.session_state.openai_initialized = False
465
-
466
- if 'chat_history' not in st.session_state:
467
- st.session_state.chat_history = []
468
-
469
- def ensure_chatbot_initialized():
470
- """Lazy initialization of chatbot to avoid blocking startup."""
471
- if st.session_state.chatbot is None or not st.session_state.chatbot_initialized:
472
  try:
473
  # Show initialization message
474
  init_placeholder = st.empty()
475
- init_placeholder.info("🔄 Initializing RAG system (first time may take a moment)...")
476
 
477
  st.session_state.chatbot = RAGChatbot()
478
- st.session_state.chatbot_initialized = True
479
  init_placeholder.empty()
480
 
481
  except Exception as e:
@@ -526,15 +446,12 @@ def main():
526
  # Main title and description
527
  st.title("🤖 Scikit-learn Documentation Q&A Bot")
528
 
529
- # Show database status with lazy loading
530
- if st.session_state.chatbot_initialized and st.session_state.chatbot:
531
- try:
532
- collection_count = st.session_state.chatbot.collection.count()
533
- st.success(f"✅ Database ready with {collection_count:,} documentation chunks")
534
- except:
535
- st.warning("⚠️ Database connection issue")
536
- else:
537
- st.info("💾 Database will be initialized when you start chatting")
538
 
539
  st.markdown("""
540
  Welcome to the **Scikit-learn Documentation Q&A Bot**! This intelligent assistant can answer your questions about Scikit-learn using the official documentation.
@@ -560,9 +477,7 @@ def main():
560
  )
561
 
562
  if api_key and not st.session_state.openai_initialized:
563
- # Initialize chatbot if needed
564
- ensure_chatbot_initialized()
565
- if st.session_state.chatbot and st.session_state.chatbot.set_openai_client(api_key):
566
  st.session_state.openai_initialized = True
567
  st.success("✅ API key validated!")
568
  st.rerun()
@@ -630,13 +545,9 @@ def main():
630
  if not st.session_state.openai_initialized:
631
  st.error("⚠️ Please enter a valid OpenAI API key in the sidebar first.")
632
  else:
633
- # Initialize chatbot if not already done
634
- ensure_chatbot_initialized()
635
-
636
- if st.session_state.chatbot:
637
- # Get answer using RAG
638
- answer, sources = st.session_state.chatbot.get_answer(
639
- user_question, n_chunks, model
640
  )
641
 
642
  if answer:
 
26
  logging.basicConfig(level=logging.INFO)
27
  logger = logging.getLogger(__name__)
28
 
 
 
 
 
29
 
30
  class RAGChatbot:
31
  """
 
67
  Initialize ChromaDB client and embedding model for retrieval.
68
  """
69
  try:
70
+ # Check if we're in Hugging Face Spaces environment
71
+ if os.path.exists('chroma.sqlite3'):
72
+ # We're likely in HF Spaces - use current directory
73
+ self.db_path = '.'
74
+
75
+ # Initialize ChromaDB client
76
+ self.chroma_client = chromadb.PersistentClient(
77
+ path=self.db_path,
78
+ settings=Settings(anonymized_telemetry=False)
79
+ )
 
80
 
81
+ # Get or create collection
82
  try:
83
+ self.collection = self.chroma_client.get_collection(
84
+ name=self.collection_name
 
 
 
 
 
85
  )
86
+ except Exception:
87
+ # If collection doesn't exist, try to recreate it from chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  if os.path.exists('chunks.json'):
89
+ st.warning("Database collection not found. Rebuilding from chunks...")
 
 
90
  self._rebuild_collection_from_chunks()
91
  else:
92
+ raise Exception("Neither database collection nor chunks.json found. Please build the database first.")
 
 
93
 
94
+ # Load embedding model (same as used for building the database)
95
+ self.embedding_model = SentenceTransformer(self.embedding_model_name)
96
 
97
+ logger.info("RAG retrieval system initialized successfully")
98
 
99
  except Exception as e:
100
  logger.error(f"Failed to initialize retrieval system: {e}")
 
110
  This is useful for Hugging Face Spaces deployment.
111
  """
112
  try:
113
+ st.info("🔄 Rebuilding database collection from chunks...")
 
 
114
 
115
+ # Load chunks
116
+ with open('chunks.json', 'r', encoding='utf-8') as f:
 
 
 
 
117
  chunks = json.load(f)
118
 
119
+ # Create collection
 
 
120
  try:
121
+ self.chroma_client.delete_collection(name=self.collection_name)
122
+ except:
123
+ pass # Collection might not exist
124
+
 
 
 
 
 
125
  self.collection = self.chroma_client.create_collection(
126
  name=self.collection_name,
127
  metadata={"description": "Scikit-learn documentation embeddings"}
128
  )
129
 
130
+ # Load embedding model if not loaded
131
+ if not hasattr(self, 'embedding_model') or self.embedding_model is None:
132
+ self.embedding_model = SentenceTransformer(self.embedding_model_name)
133
 
134
  # Process chunks in batches
135
  batch_size = 100
 
152
  }
153
  metadatas.append(metadata)
154
 
 
 
 
155
  # Create embeddings
156
  embeddings = self.embedding_model.encode(texts).tolist()
157
 
 
203
  st.error(f"Invalid API key or OpenAI connection error: {e}")
204
  return False
205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  def retrieve_relevant_chunks(
207
  self,
208
  query: str,
 
390
  def initialize_session_state():
391
  """Initialize Streamlit session state variables."""
392
  if 'chatbot' not in st.session_state:
 
 
 
 
 
 
 
 
 
 
 
 
 
393
  try:
394
  # Show initialization message
395
  init_placeholder = st.empty()
396
+ init_placeholder.info("🔄 Initializing RAG system...")
397
 
398
  st.session_state.chatbot = RAGChatbot()
 
399
  init_placeholder.empty()
400
 
401
  except Exception as e:
 
446
  # Main title and description
447
  st.title("🤖 Scikit-learn Documentation Q&A Bot")
448
 
449
+ # Show database status
450
+ try:
451
+ collection_count = st.session_state.chatbot.collection.count()
452
+ st.success(f"✅ Database ready with {collection_count:,} documentation chunks")
453
+ except:
454
+ st.warning("⚠️ Database status unknown")
 
 
 
455
 
456
  st.markdown("""
457
  Welcome to the **Scikit-learn Documentation Q&A Bot**! This intelligent assistant can answer your questions about Scikit-learn using the official documentation.
 
477
  )
478
 
479
  if api_key and not st.session_state.openai_initialized:
480
+ if st.session_state.chatbot.set_openai_client(api_key):
 
 
481
  st.session_state.openai_initialized = True
482
  st.success("✅ API key validated!")
483
  st.rerun()
 
545
  if not st.session_state.openai_initialized:
546
  st.error("⚠️ Please enter a valid OpenAI API key in the sidebar first.")
547
  else:
548
+ # Get answer using RAG
549
+ answer, sources = st.session_state.chatbot.get_answer(
550
+ user_question, n_chunks, model
 
 
 
 
551
  )
552
 
553
  if answer:
chroma_db/baa4ce7e-6804-430e-b4a2-be808a468152/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a237be672525b27994290e8ba6f6b7d24fb7ee596722855e7c28426e2b02310
3
+ size 1676000
{ab7fa527-b151-425e-9f81-9aa3f7b65f1d → chroma_db/baa4ce7e-6804-430e-b4a2-be808a468152}/header.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
3
  size 100
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47f6c2dc55a35a27eb2842e8ca379968e83a861a382bdb68505796e318930e07
3
  size 100
ab7fa527-b151-425e-9f81-9aa3f7b65f1d/data_level0.bin → chroma_db/baa4ce7e-6804-430e-b4a2-be808a468152/index_metadata.pickle RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f97547c2466889737fdadcd740478420160f9c7094c36b6ae29c71d75887824e
3
- size 167600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:531c23a26708ff72912defdb15b34c63b4fdccdb1751c41e1908e084693236de
3
+ size 92132
{ab7fa527-b151-425e-9f81-9aa3f7b65f1d → chroma_db/baa4ce7e-6804-430e-b4a2-be808a468152}/length.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a12e561363385e9dfeeab326368731c030ed4b374e7f5897ac819159d2884c5
3
- size 400
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6b598b7fdc2d148c717bb8c4e55b6b52697a7314208d29ab451ae1543c13619
3
+ size 4000
chroma_db/baa4ce7e-6804-430e-b4a2-be808a468152/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:773e2b377602f83f08386a73b662d59ceeaa6d44f7201a5801894d4df2f3208b
3
+ size 8624
chroma.sqlite3 → chroma_db/chroma.sqlite3 RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5641e3ed4b6a48b08f13e2b125000fe62c3eec109367b5c2c40799c25517e0ff
3
- size 13283328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6ea2884ce08a5f478431a1cbbc51133d4c941ff0ff7c6016db2590352066714
3
+ size 13279232
run.py CHANGED
@@ -1,39 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Runner script for Hugging Face Spaces
4
- This ensures Streamlit runs properly in HF Spaces environment
5
- """
6
-
7
- import subprocess
8
- import sys
9
- import os
10
-
11
- def main():
12
- """Run the Streamlit app with proper configuration for HF Spaces"""
13
-
14
- # Set environment variables for better HF Spaces compatibility
15
- os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
16
- os.environ["STREAMLIT_SERVER_ENABLE_CORS"] = "false"
17
- os.environ["STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION"] = "false"
18
-
19
- # Run streamlit with the app file
20
- cmd = [
21
- sys.executable, "-m", "streamlit", "run", "app.py",
22
- "--server.port=7860",
23
- "--server.address=0.0.0.0",
24
- "--server.headless=true",
25
- "--server.enableCORS=false",
26
- "--server.enableXsrfProtection=false",
27
- "--theme.base=light"
28
- ]
29
-
30
- print("🚀 Starting Streamlit app for Hugging Face Spaces...")
31
- print(f"Command: {' '.join(cmd)}")
32
-
33
- # Execute the command
34
- result = subprocess.run(cmd)
35
- return result.returncode
36
-
37
- if __name__ == "__main__":
38
- exit_code = main()
39
- sys.exit(exit_code)