Juan Salas committed on
Commit ·
d8ecf39
1
Parent(s): 3632723
Fix FAISS index naming mismatch for checklist and questions
Browse files
- Remove -simple suffix from FAISS index names in build script
- Simplify document handler to use directory names directly
- Skip document type embeddings for framework documents (checklist/questions)
- Resolves 'No pre-built FAISS index found for checklist' error
The build process was creating checklist-simple.faiss but the loader
expected checklist.faiss. Now both are aligned for consistent naming.
- Dockerfile +6 -2
- app/handlers/document_handler.py +23 -19
- app/ui/sidebar.py +45 -0
- scripts/build_indexes.py +4 -4
- uv.lock +1 -1
Dockerfile
CHANGED
|
@@ -28,8 +28,9 @@ COPY . .
|
|
| 28 |
# Ensure LFS files are pulled
|
| 29 |
RUN git lfs pull || echo "LFS pull failed, continuing..."
|
| 30 |
|
| 31 |
-
#
|
| 32 |
-
|
|
|
|
| 33 |
|
| 34 |
# HuggingFace Spaces environment variables
|
| 35 |
ENV STREAMLIT_SERVER_HEADLESS=true
|
|
@@ -46,6 +47,9 @@ ENV HF_HOME=/tmp/huggingface
|
|
| 46 |
ENV HF_HUB_CACHE=/tmp/huggingface/hub
|
| 47 |
ENV SENTENCE_TRANSFORMERS_HOME=/tmp/huggingface/sentence_transformers
|
| 48 |
|
|
|
|
|
|
|
|
|
|
| 49 |
# Spaces-specific optimizations
|
| 50 |
ENV PYTHONUNBUFFERED=1
|
| 51 |
ENV PYTHONDONTWRITEBYTECODE=1
|
|
|
|
| 28 |
# Ensure LFS files are pulled
|
| 29 |
RUN git lfs pull || echo "LFS pull failed, continuing..."
|
| 30 |
|
| 31 |
+
# Try to download datasets (may fail if private repos or no token)
|
| 32 |
+
# Will be retried at runtime with better error handling
|
| 33 |
+
RUN python scripts/setup_datasets.py || (echo "β οΈ Build-time dataset download failed - will retry at runtime" && mkdir -p data/{strategy,checklist,questions,vdrs,search_indexes})
|
| 34 |
|
| 35 |
# HuggingFace Spaces environment variables
|
| 36 |
ENV STREAMLIT_SERVER_HEADLESS=true
|
|
|
|
| 47 |
ENV HF_HUB_CACHE=/tmp/huggingface/hub
|
| 48 |
ENV SENTENCE_TRANSFORMERS_HOME=/tmp/huggingface/sentence_transformers
|
| 49 |
|
| 50 |
+
# HuggingFace authentication (will be set via Spaces secrets)
|
| 51 |
+
# ENV HF_TOKEN will be provided at runtime
|
| 52 |
+
|
| 53 |
# Spaces-specific optimizations
|
| 54 |
ENV PYTHONUNBUFFERED=1
|
| 55 |
ENV PYTHONDONTWRITEBYTECODE=1
|
app/handlers/document_handler.py
CHANGED
|
@@ -35,16 +35,16 @@ class DocumentHandler:
|
|
| 35 |
Returns:
|
| 36 |
Tuple of (documents_count, chunks_count) or None on error
|
| 37 |
"""
|
| 38 |
-
# Extract
|
| 39 |
-
|
| 40 |
|
| 41 |
# Initialize document processor with loaded FAISS store
|
| 42 |
from app.core.utils import create_document_processor
|
| 43 |
-
document_processor = create_document_processor(store_name=
|
| 44 |
|
| 45 |
if not document_processor.vector_store:
|
| 46 |
raise create_processing_error(
|
| 47 |
-
f"No pre-built FAISS index found for '{
|
| 48 |
recovery_hint="Please run scripts/build_indexes.py first to create the index"
|
| 49 |
)
|
| 50 |
|
|
@@ -58,7 +58,7 @@ class DocumentHandler:
|
|
| 58 |
self.session.documents = documents_dict
|
| 59 |
self.session.chunks = chunks
|
| 60 |
self.session.embeddings = document_processor.embeddings
|
| 61 |
-
self.session.vdr_store =
|
| 62 |
|
| 63 |
# Preload checklist embeddings into memory for fast search
|
| 64 |
from app.core.search import preload_checklist_embeddings
|
|
@@ -72,20 +72,24 @@ class DocumentHandler:
|
|
| 72 |
# Don't fail the entire data room processing, but make it very clear this is a problem
|
| 73 |
raise # Re-raise to make this a hard failure
|
| 74 |
|
| 75 |
-
# Load pre-built document type embeddings from disk
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
# Clear existing analysis
|
| 91 |
self.session.reset()
|
|
|
|
| 35 |
Returns:
|
| 36 |
Tuple of (documents_count, chunks_count) or None on error
|
| 37 |
"""
|
| 38 |
+
# Extract directory name from path - use directly as store name
|
| 39 |
+
store_name = Path(data_room_path).name.lower()
|
| 40 |
|
| 41 |
# Initialize document processor with loaded FAISS store
|
| 42 |
from app.core.utils import create_document_processor
|
| 43 |
+
document_processor = create_document_processor(store_name=store_name)
|
| 44 |
|
| 45 |
if not document_processor.vector_store:
|
| 46 |
raise create_processing_error(
|
| 47 |
+
f"No pre-built FAISS index found for '{store_name}'",
|
| 48 |
recovery_hint="Please run scripts/build_indexes.py first to create the index"
|
| 49 |
)
|
| 50 |
|
|
|
|
| 58 |
self.session.documents = documents_dict
|
| 59 |
self.session.chunks = chunks
|
| 60 |
self.session.embeddings = document_processor.embeddings
|
| 61 |
+
self.session.vdr_store = store_name
|
| 62 |
|
| 63 |
# Preload checklist embeddings into memory for fast search
|
| 64 |
from app.core.search import preload_checklist_embeddings
|
|
|
|
| 72 |
# Don't fail the entire data room processing, but make it very clear this is a problem
|
| 73 |
raise # Re-raise to make this a hard failure
|
| 74 |
|
| 75 |
+
# Load pre-built document type embeddings from disk (only for VDR data rooms)
|
| 76 |
+
if store_name not in ['checklist', 'questions']:
|
| 77 |
+
from app.core.search import preload_document_type_embeddings
|
| 78 |
+
logger.info(f"Loading pre-built document type embeddings for {store_name}...")
|
| 79 |
+
try:
|
| 80 |
+
type_embeddings = preload_document_type_embeddings(store_name)
|
| 81 |
+
# Store in session for use during search
|
| 82 |
+
self.session.document_type_embeddings = type_embeddings
|
| 83 |
+
logger.info(f"β
Loaded {len(type_embeddings)} pre-built document type embeddings")
|
| 84 |
+
logger.info(f"Session ID: {id(self.session)}, Embeddings stored: {bool(self.session.document_type_embeddings)}")
|
| 85 |
+
except RuntimeError as e:
|
| 86 |
+
logger.error(f"β Failed to load pre-built document type embeddings: {e}")
|
| 87 |
+
logger.error("This indicates the build process did not complete successfully.")
|
| 88 |
+
logger.error("Please run 'uv run build-indexes' to generate required embeddings.")
|
| 89 |
+
raise # Fail fast - embeddings are required for checklist processing
|
| 90 |
+
else:
|
| 91 |
+
logger.info(f"Skipping document type embeddings for framework data room: {store_name}")
|
| 92 |
+
self.session.document_type_embeddings = {}
|
| 93 |
|
| 94 |
# Clear existing analysis
|
| 95 |
self.session.reset()
|
app/ui/sidebar.py
CHANGED
|
@@ -84,6 +84,11 @@ class Sidebar:
|
|
| 84 |
|
| 85 |
st.divider()
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
# Analysis Configuration
|
| 88 |
st.subheader("π Analysis Configuration")
|
| 89 |
|
|
@@ -162,3 +167,43 @@ class Sidebar:
|
|
| 162 |
logger.error(f"Failed to process data room {data_room_path}: {e}")
|
| 163 |
display_processing_error("data room", e)
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
st.divider()
|
| 86 |
|
| 87 |
+
# Dataset Status
|
| 88 |
+
self._render_dataset_status()
|
| 89 |
+
|
| 90 |
+
st.divider()
|
| 91 |
+
|
| 92 |
# Analysis Configuration
|
| 93 |
st.subheader("π Analysis Configuration")
|
| 94 |
|
|
|
|
| 167 |
logger.error(f"Failed to process data room {data_room_path}: {e}")
|
| 168 |
display_processing_error("data room", e)
|
| 169 |
|
| 170 |
+
def _render_dataset_status(self):
|
| 171 |
+
"""Render dataset status and download options"""
|
| 172 |
+
from pathlib import Path
|
| 173 |
+
|
| 174 |
+
st.subheader("π¦ Dataset Status")
|
| 175 |
+
|
| 176 |
+
# Check data directories
|
| 177 |
+
data_dirs = {
|
| 178 |
+
'Strategy': self.config.paths['strategy_dir'],
|
| 179 |
+
'Checklist': self.config.paths['checklist_dir'],
|
| 180 |
+
'Questions': self.config.paths['questions_dir'],
|
| 181 |
+
'VDRs': self.config.paths['vdrs_dir'],
|
| 182 |
+
'Indexes': self.config.paths['faiss_dir']
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
missing_dirs = []
|
| 186 |
+
for name, path in data_dirs.items():
|
| 187 |
+
if not path.exists() or len(list(path.rglob("*"))) < 1:
|
| 188 |
+
missing_dirs.append(name)
|
| 189 |
+
|
| 190 |
+
if missing_dirs:
|
| 191 |
+
st.warning(f"β οΈ Missing data: {', '.join(missing_dirs)}")
|
| 192 |
+
|
| 193 |
+
if st.button("π Download Datasets", help="Download missing datasets from HuggingFace"):
|
| 194 |
+
with st.spinner("Downloading datasets..."):
|
| 195 |
+
try:
|
| 196 |
+
import sys
|
| 197 |
+
from pathlib import Path
|
| 198 |
+
sys.path.append(str(Path(__file__).parent.parent.parent))
|
| 199 |
+
from scripts.setup_datasets import download_datasets
|
| 200 |
+
|
| 201 |
+
download_datasets()
|
| 202 |
+
st.success("β
Datasets downloaded! Refresh the page.")
|
| 203 |
+
st.rerun()
|
| 204 |
+
except Exception as e:
|
| 205 |
+
st.error(f"β Download failed: {str(e)}")
|
| 206 |
+
st.info("π‘ Your datasets might be private. Check HuggingFace authentication.")
|
| 207 |
+
else:
|
| 208 |
+
st.success("β
All datasets loaded")
|
| 209 |
+
|
scripts/build_indexes.py
CHANGED
|
@@ -413,18 +413,18 @@ class BuildStageManager(StageManager):
|
|
| 413 |
if checklist_docs:
|
| 414 |
embeddings_model = get_cached_embeddings(self.config.model['sentence_transformer_model'])
|
| 415 |
vector_store = FAISS.from_documents(checklist_docs, embeddings_model)
|
| 416 |
-
vector_store.save_local(str(self.config.paths['faiss_dir']), index_name="checklist
|
| 417 |
logger.info(f"β
Created checklist index with {len(checklist_docs)} items")
|
| 418 |
|
| 419 |
if questions_docs:
|
| 420 |
embeddings_model = get_cached_embeddings(self.config.model['sentence_transformer_model'])
|
| 421 |
vector_store = FAISS.from_documents(questions_docs, embeddings_model)
|
| 422 |
-
vector_store.save_local(str(self.config.paths['faiss_dir']), index_name="questions
|
| 423 |
logger.info(f"β
Created questions index with {len(questions_docs)} items")
|
| 424 |
|
| 425 |
checklist_result = {
|
| 426 |
'success': True,
|
| 427 |
-
'store_name': 'checklist
|
| 428 |
'documents_count': len(checklist_docs),
|
| 429 |
'content_type': 'checklist'
|
| 430 |
}
|
|
@@ -432,7 +432,7 @@ class BuildStageManager(StageManager):
|
|
| 432 |
|
| 433 |
questions_result = {
|
| 434 |
'success': True,
|
| 435 |
-
'store_name': 'questions
|
| 436 |
'documents_count': len(questions_docs),
|
| 437 |
'content_type': 'questions'
|
| 438 |
}
|
|
|
|
| 413 |
if checklist_docs:
|
| 414 |
embeddings_model = get_cached_embeddings(self.config.model['sentence_transformer_model'])
|
| 415 |
vector_store = FAISS.from_documents(checklist_docs, embeddings_model)
|
| 416 |
+
vector_store.save_local(str(self.config.paths['faiss_dir']), index_name="checklist")
|
| 417 |
logger.info(f"β
Created checklist index with {len(checklist_docs)} items")
|
| 418 |
|
| 419 |
if questions_docs:
|
| 420 |
embeddings_model = get_cached_embeddings(self.config.model['sentence_transformer_model'])
|
| 421 |
vector_store = FAISS.from_documents(questions_docs, embeddings_model)
|
| 422 |
+
vector_store.save_local(str(self.config.paths['faiss_dir']), index_name="questions")
|
| 423 |
logger.info(f"β
Created questions index with {len(questions_docs)} items")
|
| 424 |
|
| 425 |
checklist_result = {
|
| 426 |
'success': True,
|
| 427 |
+
'store_name': 'checklist',
|
| 428 |
'documents_count': len(checklist_docs),
|
| 429 |
'content_type': 'checklist'
|
| 430 |
}
|
|
|
|
| 432 |
|
| 433 |
questions_result = {
|
| 434 |
'success': True,
|
| 435 |
+
'store_name': 'questions',
|
| 436 |
'documents_count': len(questions_docs),
|
| 437 |
'content_type': 'questions'
|
| 438 |
}
|
uv.lock
CHANGED
|
@@ -317,7 +317,7 @@ wheels = [
|
|
| 317 |
[[package]]
|
| 318 |
name = "dd-poc-spaces"
|
| 319 |
version = "0.1.0"
|
| 320 |
-
source = {
|
| 321 |
dependencies = [
|
| 322 |
{ name = "backoff" },
|
| 323 |
{ name = "blackstone" },
|
|
|
|
| 317 |
[[package]]
|
| 318 |
name = "dd-poc-spaces"
|
| 319 |
version = "0.1.0"
|
| 320 |
+
source = { editable = "." }
|
| 321 |
dependencies = [
|
| 322 |
{ name = "backoff" },
|
| 323 |
{ name = "blackstone" },
|