fguryel committed on
Commit
95d9c92
·
1 Parent(s): 34e78be
.gitattributes CHANGED
@@ -33,4 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
36
  chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+
37
+ # Large database and data files
38
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
39
+ *.json filter=lfs diff=lfs merge=lfs -text
40
  chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -10,10 +10,29 @@ pinned: false
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
13
  # Scikit-learn Documentation Q&A Bot 🤖
14
 
15
  A Retrieval-Augmented Generation (RAG) chatbot that answers questions about Scikit-learn using the official documentation.
16
 
 
 
 
 
 
 
 
17
  ## Features
18
 
19
  - **🔍 Smart Retrieval**: Searches through 1,249+ documentation chunks using semantic similarity
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
13
+ ---
14
+ title: Scikit-learn Documentation Q&A Bot
15
+ emoji: 🤖
16
+ colorFrom: blue
17
+ colorTo: green
18
+ sdk: streamlit
19
+ sdk_version: 1.50.0
20
+ app_file: app.py
21
+ pinned: false
22
+ license: mit
23
+ ---
24
+
25
  # Scikit-learn Documentation Q&A Bot 🤖
26
 
27
  A Retrieval-Augmented Generation (RAG) chatbot that answers questions about Scikit-learn using the official documentation.
28
 
29
+ ## How to Use on Hugging Face Spaces
30
+
31
+ 1. **Enter OpenAI API Key**: In the sidebar, enter your OpenAI API key
32
+ 2. **Ask Questions**: Type any question about Scikit-learn functionality
33
+ 3. **Get Answers**: Receive detailed responses with source documentation links
34
+ 4. **Explore**: Use the example questions or browse chat history
35
+
36
  ## Features
37
 
38
  - **🔍 Smart Retrieval**: Searches through 1,249+ documentation chunks using semantic similarity
ab7fa527-b151-425e-9f81-9aa3f7b65f1d/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f97547c2466889737fdadcd740478420160f9c7094c36b6ae29c71d75887824e
3
+ size 167600
ab7fa527-b151-425e-9f81-9aa3f7b65f1d/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
3
+ size 100
ab7fa527-b151-425e-9f81-9aa3f7b65f1d/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a12e561363385e9dfeeab326368731c030ed4b374e7f5897ac819159d2884c5
3
+ size 400
ab7fa527-b151-425e-9f81-9aa3f7b65f1d/link_lists.bin ADDED
File without changes
app.py CHANGED
@@ -11,6 +11,8 @@ Date: September 2025
11
  """
12
 
13
  import os
 
 
14
  import logging
15
  from typing import List, Dict, Any, Optional, Tuple
16
  import streamlit as st
@@ -65,16 +67,29 @@ class RAGChatbot:
65
  Initialize ChromaDB client and embedding model for retrieval.
66
  """
67
  try:
 
 
 
 
 
68
  # Initialize ChromaDB client
69
  self.chroma_client = chromadb.PersistentClient(
70
  path=self.db_path,
71
  settings=Settings(anonymized_telemetry=False)
72
  )
73
 
74
- # Get collection
75
- self.collection = self.chroma_client.get_collection(
76
- name=self.collection_name
77
- )
 
 
 
 
 
 
 
 
78
 
79
  # Load embedding model (same as used for building the database)
80
  self.embedding_model = SentenceTransformer(self.embedding_model_name)
@@ -83,6 +98,85 @@ class RAGChatbot:
83
 
84
  except Exception as e:
85
  logger.error(f"Failed to initialize retrieval system: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  raise
87
 
88
  def set_openai_client(self, api_key: str) -> bool:
@@ -297,9 +391,35 @@ def initialize_session_state():
297
  """Initialize Streamlit session state variables."""
298
  if 'chatbot' not in st.session_state:
299
  try:
 
 
 
 
300
  st.session_state.chatbot = RAGChatbot()
 
 
301
  except Exception as e:
302
- st.error(f"Failed to initialize chatbot: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  st.stop()
304
 
305
  if 'openai_initialized' not in st.session_state:
@@ -325,6 +445,14 @@ def main():
325
 
326
  # Main title and description
327
  st.title("πŸ€– Scikit-learn Documentation Q&A Bot")
 
 
 
 
 
 
 
 
328
  st.markdown("""
329
  Welcome to the **Scikit-learn Documentation Q&A Bot**! This intelligent assistant can answer your questions about Scikit-learn using the official documentation.
330
 
@@ -332,6 +460,8 @@ def main():
332
  1. πŸ” **Retrieval**: Searches through 1,249+ documentation chunks
333
  2. πŸ“ **Augmentation**: Provides relevant context to the AI
334
  3. πŸ€– **Generation**: Uses OpenAI to generate accurate answers
 
 
335
  """)
336
 
337
  # Sidebar for API key and settings
 
11
  """
12
 
13
  import os
14
+ import sys
15
+ import json
16
  import logging
17
  from typing import List, Dict, Any, Optional, Tuple
18
  import streamlit as st
 
67
  Initialize ChromaDB client and embedding model for retrieval.
68
  """
69
  try:
70
+ # Check if we're in Hugging Face Spaces environment
71
+ if os.path.exists('chroma.sqlite3'):
72
+ # We're likely in HF Spaces - use current directory
73
+ self.db_path = '.'
74
+
75
  # Initialize ChromaDB client
76
  self.chroma_client = chromadb.PersistentClient(
77
  path=self.db_path,
78
  settings=Settings(anonymized_telemetry=False)
79
  )
80
 
81
+ # Get or create collection
82
+ try:
83
+ self.collection = self.chroma_client.get_collection(
84
+ name=self.collection_name
85
+ )
86
+ except Exception:
87
+ # If collection doesn't exist, try to recreate it from chunks
88
+ if os.path.exists('chunks.json'):
89
+ st.warning("Database collection not found. Rebuilding from chunks...")
90
+ self._rebuild_collection_from_chunks()
91
+ else:
92
+ raise Exception("Neither database collection nor chunks.json found. Please build the database first.")
93
 
94
  # Load embedding model (same as used for building the database)
95
  self.embedding_model = SentenceTransformer(self.embedding_model_name)
 
98
 
99
  except Exception as e:
100
  logger.error(f"Failed to initialize retrieval system: {e}")
101
+ # In Streamlit, show user-friendly error
102
+ if 'streamlit' in sys.modules:
103
+ st.error(f"❌ Database initialization failed: {e}")
104
+ st.info("πŸ’‘ This might be the first run. The database needs to be built from the scraped content.")
105
+ raise
106
+
107
+ def _rebuild_collection_from_chunks(self) -> None:
108
+ """
109
+ Rebuild the ChromaDB collection from chunks.json file.
110
+ This is useful for Hugging Face Spaces deployment.
111
+ """
112
+ try:
113
+ st.info("πŸ”„ Rebuilding database collection from chunks...")
114
+
115
+ # Load chunks
116
+ with open('chunks.json', 'r', encoding='utf-8') as f:
117
+ chunks = json.load(f)
118
+
119
+ # Create collection
120
+ try:
121
+ self.chroma_client.delete_collection(name=self.collection_name)
122
+ except:
123
+ pass # Collection might not exist
124
+
125
+ self.collection = self.chroma_client.create_collection(
126
+ name=self.collection_name,
127
+ metadata={"description": "Scikit-learn documentation embeddings"}
128
+ )
129
+
130
+ # Load embedding model if not loaded
131
+ if not hasattr(self, 'embedding_model') or self.embedding_model is None:
132
+ self.embedding_model = SentenceTransformer(self.embedding_model_name)
133
+
134
+ # Process chunks in batches
135
+ batch_size = 100
136
+ progress_bar = st.progress(0)
137
+ status_text = st.empty()
138
+
139
+ for i in range(0, len(chunks), batch_size):
140
+ batch_chunks = chunks[i:i + batch_size]
141
+
142
+ # Prepare data
143
+ texts = [chunk['page_content'] for chunk in batch_chunks]
144
+ metadatas = []
145
+
146
+ for chunk in batch_chunks:
147
+ metadata = {
148
+ 'url': chunk['metadata']['url'],
149
+ 'chunk_index': str(chunk['metadata']['chunk_index']),
150
+ 'source': chunk['metadata'].get('source', 'scikit-learn-docs'),
151
+ 'content_length': str(len(chunk['page_content']))
152
+ }
153
+ metadatas.append(metadata)
154
+
155
+ # Create embeddings
156
+ embeddings = self.embedding_model.encode(texts).tolist()
157
+
158
+ # Generate IDs
159
+ ids = [f"chunk_{i+j}" for j in range(len(batch_chunks))]
160
+
161
+ # Add to collection
162
+ self.collection.add(
163
+ ids=ids,
164
+ documents=texts,
165
+ metadatas=metadatas,
166
+ embeddings=embeddings
167
+ )
168
+
169
+ # Update progress
170
+ progress = (i + batch_size) / len(chunks)
171
+ progress_bar.progress(min(progress, 1.0))
172
+ status_text.text(f"Processing chunks: {min(i + batch_size, len(chunks))}/{len(chunks)}")
173
+
174
+ progress_bar.empty()
175
+ status_text.empty()
176
+ st.success(f"βœ… Successfully rebuilt collection with {len(chunks)} chunks!")
177
+
178
+ except Exception as e:
179
+ st.error(f"❌ Failed to rebuild collection: {e}")
180
  raise
181
 
182
  def set_openai_client(self, api_key: str) -> bool:
 
391
  """Initialize Streamlit session state variables."""
392
  if 'chatbot' not in st.session_state:
393
  try:
394
+ # Show initialization message
395
+ init_placeholder = st.empty()
396
+ init_placeholder.info("πŸ”„ Initializing RAG system...")
397
+
398
  st.session_state.chatbot = RAGChatbot()
399
+ init_placeholder.empty()
400
+
401
  except Exception as e:
402
+ st.error(f"❌ Failed to initialize chatbot: {e}")
403
+
404
+ # Provide helpful instructions
405
+ st.markdown("""
406
+ ### πŸ”§ Troubleshooting
407
+
408
+ This error typically occurs when:
409
+ 1. **First deployment**: The database hasn't been built yet
410
+ 2. **Missing files**: Required data files are not available
411
+
412
+ ### πŸ“‹ Required Files
413
+ Make sure these files are present:
414
+ - `chunks.json` (processed text chunks)
415
+ - `chroma.sqlite3` (database file) OR `chroma_db/` directory
416
+
417
+ ### πŸš€ Quick Fix for Hugging Face Spaces
418
+ If you're running this on Hugging Face Spaces, make sure you've uploaded:
419
+ 1. All Python files (`app.py`, `build_vector_db.py`, etc.)
420
+ 2. Data files (`chunks.json`, `scraped_content.json`)
421
+ 3. Database files (`chroma.sqlite3` or the `chroma_db/` folder)
422
+ """)
423
  st.stop()
424
 
425
  if 'openai_initialized' not in st.session_state:
 
445
 
446
  # Main title and description
447
  st.title("πŸ€– Scikit-learn Documentation Q&A Bot")
448
+
449
+ # Show database status
450
+ try:
451
+ collection_count = st.session_state.chatbot.collection.count()
452
+ st.success(f"βœ… Database ready with {collection_count:,} documentation chunks")
453
+ except:
454
+ st.warning("⚠️ Database status unknown")
455
+
456
  st.markdown("""
457
  Welcome to the **Scikit-learn Documentation Q&A Bot**! This intelligent assistant can answer your questions about Scikit-learn using the official documentation.
458
 
 
460
  1. πŸ” **Retrieval**: Searches through 1,249+ documentation chunks
461
  2. πŸ“ **Augmentation**: Provides relevant context to the AI
462
  3. πŸ€– **Generation**: Uses OpenAI to generate accurate answers
463
+
464
+ **πŸ‘ˆ To get started**: Enter your OpenAI API key in the sidebar!
465
  """)
466
 
467
  # Sidebar for API key and settings
chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f0872c151b5912b9d3bfc3b9d1aef9b3d8770366e7d42ffc3f2a1044407e181
3
  size 13283328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5641e3ed4b6a48b08f13e2b125000fe62c3eec109367b5c2c40799c25517e0ff
3
  size 13283328