Spaces:
Sleeping
Sleeping
db fixed
Browse files
.gitattributes
CHANGED
|
@@ -33,4 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
|
| 37 |
+
# Large database and data files
|
| 38 |
+
*.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
| 40 |
chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -10,10 +10,29 @@ pinned: false
|
|
| 10 |
---
|
| 11 |
|
| 12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
# Scikit-learn Documentation Q&A Bot 🤖
|
| 14 |
|
| 15 |
A Retrieval-Augmented Generation (RAG) chatbot that answers questions about Scikit-learn using the official documentation.
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
## Features
|
| 18 |
|
| 19 |
- **🔍 Smart Retrieval**: Searches through 1,249+ documentation chunks using semantic similarity
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
| 13 |
+
---
|
| 14 |
+
title: Scikit-learn Documentation Q&A Bot
|
| 15 |
+
emoji: 🤖
|
| 16 |
+
colorFrom: blue
|
| 17 |
+
colorTo: green
|
| 18 |
+
sdk: streamlit
|
| 19 |
+
sdk_version: 1.50.0
|
| 20 |
+
app_file: app.py
|
| 21 |
+
pinned: false
|
| 22 |
+
license: mit
|
| 23 |
+
---
|
| 24 |
+
|
| 25 |
# Scikit-learn Documentation Q&A Bot 🤖
|
| 26 |
|
| 27 |
A Retrieval-Augmented Generation (RAG) chatbot that answers questions about Scikit-learn using the official documentation.
|
| 28 |
|
| 29 |
+
## How to Use on Hugging Face Spaces
|
| 30 |
+
|
| 31 |
+
1. **Enter OpenAI API Key**: In the sidebar, enter your OpenAI API key
|
| 32 |
+
2. **Ask Questions**: Type any question about Scikit-learn functionality
|
| 33 |
+
3. **Get Answers**: Receive detailed responses with source documentation links
|
| 34 |
+
4. **Explore**: Use the example questions or browse chat history
|
| 35 |
+
|
| 36 |
## Features
|
| 37 |
|
| 38 |
- **🔍 Smart Retrieval**: Searches through 1,249+ documentation chunks using semantic similarity
|
ab7fa527-b151-425e-9f81-9aa3f7b65f1d/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f97547c2466889737fdadcd740478420160f9c7094c36b6ae29c71d75887824e
|
| 3 |
+
size 167600
|
ab7fa527-b151-425e-9f81-9aa3f7b65f1d/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
|
| 3 |
+
size 100
|
ab7fa527-b151-425e-9f81-9aa3f7b65f1d/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7a12e561363385e9dfeeab326368731c030ed4b374e7f5897ac819159d2884c5
|
| 3 |
+
size 400
|
ab7fa527-b151-425e-9f81-9aa3f7b65f1d/link_lists.bin
ADDED
|
File without changes
|
app.py
CHANGED
|
@@ -11,6 +11,8 @@ Date: September 2025
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
import os
|
|
|
|
|
|
|
| 14 |
import logging
|
| 15 |
from typing import List, Dict, Any, Optional, Tuple
|
| 16 |
import streamlit as st
|
|
@@ -65,16 +67,29 @@ class RAGChatbot:
|
|
| 65 |
Initialize ChromaDB client and embedding model for retrieval.
|
| 66 |
"""
|
| 67 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
# Initialize ChromaDB client
|
| 69 |
self.chroma_client = chromadb.PersistentClient(
|
| 70 |
path=self.db_path,
|
| 71 |
settings=Settings(anonymized_telemetry=False)
|
| 72 |
)
|
| 73 |
|
| 74 |
-
# Get collection
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
# Load embedding model (same as used for building the database)
|
| 80 |
self.embedding_model = SentenceTransformer(self.embedding_model_name)
|
|
@@ -83,6 +98,85 @@ class RAGChatbot:
|
|
| 83 |
|
| 84 |
except Exception as e:
|
| 85 |
logger.error(f"Failed to initialize retrieval system: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
raise
|
| 87 |
|
| 88 |
def set_openai_client(self, api_key: str) -> bool:
|
|
@@ -297,9 +391,35 @@ def initialize_session_state():
|
|
| 297 |
"""Initialize Streamlit session state variables."""
|
| 298 |
if 'chatbot' not in st.session_state:
|
| 299 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
st.session_state.chatbot = RAGChatbot()
|
|
|
|
|
|
|
| 301 |
except Exception as e:
|
| 302 |
-
st.error(f"Failed to initialize chatbot: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
st.stop()
|
| 304 |
|
| 305 |
if 'openai_initialized' not in st.session_state:
|
|
@@ -325,6 +445,14 @@ def main():
|
|
| 325 |
|
| 326 |
# Main title and description
|
| 327 |
st.title("π€ Scikit-learn Documentation Q&A Bot")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
st.markdown("""
|
| 329 |
Welcome to the **Scikit-learn Documentation Q&A Bot**! This intelligent assistant can answer your questions about Scikit-learn using the official documentation.
|
| 330 |
|
|
@@ -332,6 +460,8 @@ def main():
|
|
| 332 |
1. π **Retrieval**: Searches through 1,249+ documentation chunks
|
| 333 |
2. π **Augmentation**: Provides relevant context to the AI
|
| 334 |
3. π€ **Generation**: Uses OpenAI to generate accurate answers
|
|
|
|
|
|
|
| 335 |
""")
|
| 336 |
|
| 337 |
# Sidebar for API key and settings
|
|
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
import os
|
| 14 |
+
import sys
|
| 15 |
+
import json
|
| 16 |
import logging
|
| 17 |
from typing import List, Dict, Any, Optional, Tuple
|
| 18 |
import streamlit as st
|
|
|
|
| 67 |
Initialize ChromaDB client and embedding model for retrieval.
|
| 68 |
"""
|
| 69 |
try:
|
| 70 |
+
# Check if we're in Hugging Face Spaces environment
|
| 71 |
+
if os.path.exists('chroma.sqlite3'):
|
| 72 |
+
# We're likely in HF Spaces - use current directory
|
| 73 |
+
self.db_path = '.'
|
| 74 |
+
|
| 75 |
# Initialize ChromaDB client
|
| 76 |
self.chroma_client = chromadb.PersistentClient(
|
| 77 |
path=self.db_path,
|
| 78 |
settings=Settings(anonymized_telemetry=False)
|
| 79 |
)
|
| 80 |
|
| 81 |
+
# Get or create collection
|
| 82 |
+
try:
|
| 83 |
+
self.collection = self.chroma_client.get_collection(
|
| 84 |
+
name=self.collection_name
|
| 85 |
+
)
|
| 86 |
+
except Exception:
|
| 87 |
+
# If collection doesn't exist, try to recreate it from chunks
|
| 88 |
+
if os.path.exists('chunks.json'):
|
| 89 |
+
st.warning("Database collection not found. Rebuilding from chunks...")
|
| 90 |
+
self._rebuild_collection_from_chunks()
|
| 91 |
+
else:
|
| 92 |
+
raise Exception("Neither database collection nor chunks.json found. Please build the database first.")
|
| 93 |
|
| 94 |
# Load embedding model (same as used for building the database)
|
| 95 |
self.embedding_model = SentenceTransformer(self.embedding_model_name)
|
|
|
|
| 98 |
|
| 99 |
except Exception as e:
|
| 100 |
logger.error(f"Failed to initialize retrieval system: {e}")
|
| 101 |
+
# In Streamlit, show user-friendly error
|
| 102 |
+
if 'streamlit' in sys.modules:
|
| 103 |
+
st.error(f"β Database initialization failed: {e}")
|
| 104 |
+
st.info("π‘ This might be the first run. The database needs to be built from the scraped content.")
|
| 105 |
+
raise
|
| 106 |
+
|
| 107 |
+
def _rebuild_collection_from_chunks(self) -> None:
    """
    Rebuild the ChromaDB collection from the local ``chunks.json`` file.

    This is the fallback used when the persisted collection cannot be
    opened (useful for Hugging Face Spaces deployment). Any existing
    collection with the same name is dropped, a fresh one is created, and
    every chunk is embedded and inserted in batches while progress is
    reported through Streamlit widgets.

    Each entry of ``chunks.json`` is read as a dict with a
    ``page_content`` string and a ``metadata`` dict providing ``url`` and
    ``chunk_index`` (``source`` is optional).

    Raises:
        Exception: Any failure while loading, embedding, or inserting is
            shown with ``st.error`` and then re-raised to the caller.
    """
    try:
        st.info("🔄 Rebuilding database collection from chunks...")

        # Load chunks
        with open('chunks.json', 'r', encoding='utf-8') as f:
            chunks = json.load(f)
        total = len(chunks)

        # Drop any stale collection so create_collection() starts clean.
        try:
            self.chroma_client.delete_collection(name=self.collection_name)
        except Exception as delete_error:
            # Best effort: the collection usually does not exist yet.
            # Catch Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate.
            logger.debug(
                "No existing collection %r to delete: %s",
                self.collection_name,
                delete_error,
            )

        self.collection = self.chroma_client.create_collection(
            name=self.collection_name,
            metadata={"description": "Scikit-learn documentation embeddings"},
        )

        # Load embedding model if not loaded
        if getattr(self, 'embedding_model', None) is None:
            self.embedding_model = SentenceTransformer(self.embedding_model_name)

        # Process chunks in batches
        batch_size = 100
        progress_bar = st.progress(0)
        status_text = st.empty()

        for start in range(0, total, batch_size):
            batch_chunks = chunks[start:start + batch_size]

            # Prepare data; numeric metadata values are stored as strings.
            texts = [chunk['page_content'] for chunk in batch_chunks]
            metadatas = [
                {
                    'url': chunk['metadata']['url'],
                    'chunk_index': str(chunk['metadata']['chunk_index']),
                    'source': chunk['metadata'].get('source', 'scikit-learn-docs'),
                    'content_length': str(len(chunk['page_content'])),
                }
                for chunk in batch_chunks
            ]

            # Create embeddings
            embeddings = self.embedding_model.encode(texts).tolist()

            # IDs follow each chunk's global position in chunks.json.
            ids = [
                f"chunk_{index}"
                for index in range(start, start + len(batch_chunks))
            ]

            # Add to collection
            self.collection.add(
                ids=ids,
                documents=texts,
                metadatas=metadatas,
                embeddings=embeddings,
            )

            # Update progress; the last batch may be short, so clamp.
            processed = min(start + batch_size, total)
            progress_bar.progress(processed / total)
            status_text.text(f"Processing chunks: {processed}/{total}")

        progress_bar.empty()
        status_text.empty()
        st.success(f"✅ Successfully rebuilt collection with {total} chunks!")

    except Exception as e:
        st.error(f"❌ Failed to rebuild collection: {e}")
        raise
|
| 181 |
|
| 182 |
def set_openai_client(self, api_key: str) -> bool:
|
|
|
|
| 391 |
"""Initialize Streamlit session state variables."""
|
| 392 |
if 'chatbot' not in st.session_state:
|
| 393 |
try:
|
| 394 |
+
# Show initialization message
|
| 395 |
+
init_placeholder = st.empty()
|
| 396 |
+
init_placeholder.info("π Initializing RAG system...")
|
| 397 |
+
|
| 398 |
st.session_state.chatbot = RAGChatbot()
|
| 399 |
+
init_placeholder.empty()
|
| 400 |
+
|
| 401 |
except Exception as e:
|
| 402 |
+
st.error(f"β Failed to initialize chatbot: {e}")
|
| 403 |
+
|
| 404 |
+
# Provide helpful instructions
|
| 405 |
+
st.markdown("""
|
| 406 |
+
### π§ Troubleshooting
|
| 407 |
+
|
| 408 |
+
This error typically occurs when:
|
| 409 |
+
1. **First deployment**: The database hasn't been built yet
|
| 410 |
+
2. **Missing files**: Required data files are not available
|
| 411 |
+
|
| 412 |
+
### π Required Files
|
| 413 |
+
Make sure these files are present:
|
| 414 |
+
- `chunks.json` (processed text chunks)
|
| 415 |
+
- `chroma.sqlite3` (database file) OR `chroma_db/` directory
|
| 416 |
+
|
| 417 |
+
### π Quick Fix for Hugging Face Spaces
|
| 418 |
+
If you're running this on Hugging Face Spaces, make sure you've uploaded:
|
| 419 |
+
1. All Python files (`app.py`, `build_vector_db.py`, etc.)
|
| 420 |
+
2. Data files (`chunks.json`, `scraped_content.json`)
|
| 421 |
+
3. Database files (`chroma.sqlite3` or the `chroma_db/` folder)
|
| 422 |
+
""")
|
| 423 |
st.stop()
|
| 424 |
|
| 425 |
if 'openai_initialized' not in st.session_state:
|
|
|
|
| 445 |
|
| 446 |
# Main title and description
|
| 447 |
st.title("π€ Scikit-learn Documentation Q&A Bot")
|
| 448 |
+
|
| 449 |
+
# Show database status
|
| 450 |
+
try:
|
| 451 |
+
collection_count = st.session_state.chatbot.collection.count()
|
| 452 |
+
st.success(f"β
Database ready with {collection_count:,} documentation chunks")
|
| 453 |
+
except:
|
| 454 |
+
st.warning("β οΈ Database status unknown")
|
| 455 |
+
|
| 456 |
st.markdown("""
|
| 457 |
Welcome to the **Scikit-learn Documentation Q&A Bot**! This intelligent assistant can answer your questions about Scikit-learn using the official documentation.
|
| 458 |
|
|
|
|
| 460 |
1. π **Retrieval**: Searches through 1,249+ documentation chunks
|
| 461 |
2. π **Augmentation**: Provides relevant context to the AI
|
| 462 |
3. π€ **Generation**: Uses OpenAI to generate accurate answers
|
| 463 |
+
|
| 464 |
+
**π To get started**: Enter your OpenAI API key in the sidebar!
|
| 465 |
""")
|
| 466 |
|
| 467 |
# Sidebar for API key and settings
|
chroma.sqlite3
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 13283328
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5641e3ed4b6a48b08f13e2b125000fe62c3eec109367b5c2c40799c25517e0ff
|
| 3 |
size 13283328
|