Juan Salas committed on
Commit
d8ecf39
·
1 Parent(s): 3632723

Fix FAISS index naming mismatch for checklist and questions

Browse files

- Remove -simple suffix from FAISS index names in build script
- Simplify document handler to use directory names directly
- Skip document type embeddings for framework documents (checklist/questions)
- Resolves 'No pre-built FAISS index found for checklist' error

The build process was creating checklist-simple.faiss but the loader
expected checklist.faiss. Now both are aligned for consistent naming.

Dockerfile CHANGED
@@ -28,8 +28,9 @@ COPY . .
28
  # Ensure LFS files are pulled
29
  RUN git lfs pull || echo "LFS pull failed, continuing..."
30
 
31
- # Download datasets from HuggingFace repos during build
32
- RUN python scripts/setup_datasets.py || echo "Dataset download failed, will retry at runtime"
 
33
 
34
  # HuggingFace Spaces environment variables
35
  ENV STREAMLIT_SERVER_HEADLESS=true
@@ -46,6 +47,9 @@ ENV HF_HOME=/tmp/huggingface
46
  ENV HF_HUB_CACHE=/tmp/huggingface/hub
47
  ENV SENTENCE_TRANSFORMERS_HOME=/tmp/huggingface/sentence_transformers
48
 
 
 
 
49
  # Spaces-specific optimizations
50
  ENV PYTHONUNBUFFERED=1
51
  ENV PYTHONDONTWRITEBYTECODE=1
 
28
  # Ensure LFS files are pulled
29
  RUN git lfs pull || echo "LFS pull failed, continuing..."
30
 
31
+ # Try to download datasets (may fail if private repos or no token)
32
+ # Will be retried at runtime with better error handling
33
+ RUN python scripts/setup_datasets.py || (echo "⚠️ Build-time dataset download failed - will retry at runtime" && mkdir -p data/{strategy,checklist,questions,vdrs,search_indexes})
34
 
35
  # HuggingFace Spaces environment variables
36
  ENV STREAMLIT_SERVER_HEADLESS=true
 
47
  ENV HF_HUB_CACHE=/tmp/huggingface/hub
48
  ENV SENTENCE_TRANSFORMERS_HOME=/tmp/huggingface/sentence_transformers
49
 
50
+ # HuggingFace authentication (will be set via Spaces secrets)
51
+ # ENV HF_TOKEN will be provided at runtime
52
+
53
  # Spaces-specific optimizations
54
  ENV PYTHONUNBUFFERED=1
55
  ENV PYTHONDONTWRITEBYTECODE=1
app/handlers/document_handler.py CHANGED
@@ -35,16 +35,16 @@ class DocumentHandler:
35
  Returns:
36
  Tuple of (documents_count, chunks_count) or None on error
37
  """
38
- # Extract company name from path
39
- company_name = Path(data_room_path).name.lower()
40
 
41
  # Initialize document processor with loaded FAISS store
42
  from app.core.utils import create_document_processor
43
- document_processor = create_document_processor(store_name=company_name)
44
 
45
  if not document_processor.vector_store:
46
  raise create_processing_error(
47
- f"No pre-built FAISS index found for '{company_name}'",
48
  recovery_hint="Please run scripts/build_indexes.py first to create the index"
49
  )
50
 
@@ -58,7 +58,7 @@ class DocumentHandler:
58
  self.session.documents = documents_dict
59
  self.session.chunks = chunks
60
  self.session.embeddings = document_processor.embeddings
61
- self.session.vdr_store = company_name
62
 
63
  # Preload checklist embeddings into memory for fast search
64
  from app.core.search import preload_checklist_embeddings
@@ -72,20 +72,24 @@ class DocumentHandler:
72
  # Don't fail the entire data room processing, but make it very clear this is a problem
73
  raise # Re-raise to make this a hard failure
74
 
75
- # Load pre-built document type embeddings from disk
76
- from app.core.search import preload_document_type_embeddings
77
- logger.info(f"Loading pre-built document type embeddings for {company_name}...")
78
- try:
79
- type_embeddings = preload_document_type_embeddings(company_name)
80
- # Store in session for use during search
81
- self.session.document_type_embeddings = type_embeddings
82
- logger.info(f"βœ… Loaded {len(type_embeddings)} pre-built document type embeddings")
83
- logger.info(f"Session ID: {id(self.session)}, Embeddings stored: {bool(self.session.document_type_embeddings)}")
84
- except RuntimeError as e:
85
- logger.error(f"❌ Failed to load pre-built document type embeddings: {e}")
86
- logger.error("This indicates the build process did not complete successfully.")
87
- logger.error("Please run 'uv run build-indexes' to generate required embeddings.")
88
- raise # Fail fast - embeddings are required for checklist processing
 
 
 
 
89
 
90
  # Clear existing analysis
91
  self.session.reset()
 
35
  Returns:
36
  Tuple of (documents_count, chunks_count) or None on error
37
  """
38
+ # Extract directory name from path - use directly as store name
39
+ store_name = Path(data_room_path).name.lower()
40
 
41
  # Initialize document processor with loaded FAISS store
42
  from app.core.utils import create_document_processor
43
+ document_processor = create_document_processor(store_name=store_name)
44
 
45
  if not document_processor.vector_store:
46
  raise create_processing_error(
47
+ f"No pre-built FAISS index found for '{store_name}'",
48
  recovery_hint="Please run scripts/build_indexes.py first to create the index"
49
  )
50
 
 
58
  self.session.documents = documents_dict
59
  self.session.chunks = chunks
60
  self.session.embeddings = document_processor.embeddings
61
+ self.session.vdr_store = store_name
62
 
63
  # Preload checklist embeddings into memory for fast search
64
  from app.core.search import preload_checklist_embeddings
 
72
  # Don't fail the entire data room processing, but make it very clear this is a problem
73
  raise # Re-raise to make this a hard failure
74
 
75
+ # Load pre-built document type embeddings from disk (only for VDR data rooms)
76
+ if store_name not in ['checklist', 'questions']:
77
+ from app.core.search import preload_document_type_embeddings
78
+ logger.info(f"Loading pre-built document type embeddings for {store_name}...")
79
+ try:
80
+ type_embeddings = preload_document_type_embeddings(store_name)
81
+ # Store in session for use during search
82
+ self.session.document_type_embeddings = type_embeddings
83
+ logger.info(f"βœ… Loaded {len(type_embeddings)} pre-built document type embeddings")
84
+ logger.info(f"Session ID: {id(self.session)}, Embeddings stored: {bool(self.session.document_type_embeddings)}")
85
+ except RuntimeError as e:
86
+ logger.error(f"❌ Failed to load pre-built document type embeddings: {e}")
87
+ logger.error("This indicates the build process did not complete successfully.")
88
+ logger.error("Please run 'uv run build-indexes' to generate required embeddings.")
89
+ raise # Fail fast - embeddings are required for checklist processing
90
+ else:
91
+ logger.info(f"Skipping document type embeddings for framework data room: {store_name}")
92
+ self.session.document_type_embeddings = {}
93
 
94
  # Clear existing analysis
95
  self.session.reset()
app/ui/sidebar.py CHANGED
@@ -84,6 +84,11 @@ class Sidebar:
84
 
85
  st.divider()
86
 
 
 
 
 
 
87
  # Analysis Configuration
88
  st.subheader("πŸ“‹ Analysis Configuration")
89
 
@@ -162,3 +167,43 @@ class Sidebar:
162
  logger.error(f"Failed to process data room {data_room_path}: {e}")
163
  display_processing_error("data room", e)
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  st.divider()
86
 
87
+ # Dataset Status
88
+ self._render_dataset_status()
89
+
90
+ st.divider()
91
+
92
  # Analysis Configuration
93
  st.subheader("πŸ“‹ Analysis Configuration")
94
 
 
167
  logger.error(f"Failed to process data room {data_room_path}: {e}")
168
  display_processing_error("data room", e)
169
 
170
+ def _render_dataset_status(self):
171
+ """Render dataset status and download options"""
172
+ from pathlib import Path
173
+
174
+ st.subheader("πŸ“¦ Dataset Status")
175
+
176
+ # Check data directories
177
+ data_dirs = {
178
+ 'Strategy': self.config.paths['strategy_dir'],
179
+ 'Checklist': self.config.paths['checklist_dir'],
180
+ 'Questions': self.config.paths['questions_dir'],
181
+ 'VDRs': self.config.paths['vdrs_dir'],
182
+ 'Indexes': self.config.paths['faiss_dir']
183
+ }
184
+
185
+ missing_dirs = []
186
+ for name, path in data_dirs.items():
187
+ if not path.exists() or len(list(path.rglob("*"))) < 1:
188
+ missing_dirs.append(name)
189
+
190
+ if missing_dirs:
191
+ st.warning(f"⚠️ Missing data: {', '.join(missing_dirs)}")
192
+
193
+ if st.button("πŸ”„ Download Datasets", help="Download missing datasets from HuggingFace"):
194
+ with st.spinner("Downloading datasets..."):
195
+ try:
196
+ import sys
197
+ from pathlib import Path
198
+ sys.path.append(str(Path(__file__).parent.parent.parent))
199
+ from scripts.setup_datasets import download_datasets
200
+
201
+ download_datasets()
202
+ st.success("βœ… Datasets downloaded! Refresh the page.")
203
+ st.rerun()
204
+ except Exception as e:
205
+ st.error(f"❌ Download failed: {str(e)}")
206
+ st.info("πŸ’‘ Your datasets might be private. Check HuggingFace authentication.")
207
+ else:
208
+ st.success("βœ… All datasets loaded")
209
+
scripts/build_indexes.py CHANGED
@@ -413,18 +413,18 @@ class BuildStageManager(StageManager):
413
  if checklist_docs:
414
  embeddings_model = get_cached_embeddings(self.config.model['sentence_transformer_model'])
415
  vector_store = FAISS.from_documents(checklist_docs, embeddings_model)
416
- vector_store.save_local(str(self.config.paths['faiss_dir']), index_name="checklist-simple")
417
  logger.info(f"βœ… Created checklist index with {len(checklist_docs)} items")
418
 
419
  if questions_docs:
420
  embeddings_model = get_cached_embeddings(self.config.model['sentence_transformer_model'])
421
  vector_store = FAISS.from_documents(questions_docs, embeddings_model)
422
- vector_store.save_local(str(self.config.paths['faiss_dir']), index_name="questions-simple")
423
  logger.info(f"βœ… Created questions index with {len(questions_docs)} items")
424
 
425
  checklist_result = {
426
  'success': True,
427
- 'store_name': 'checklist-simple',
428
  'documents_count': len(checklist_docs),
429
  'content_type': 'checklist'
430
  }
@@ -432,7 +432,7 @@ class BuildStageManager(StageManager):
432
 
433
  questions_result = {
434
  'success': True,
435
- 'store_name': 'questions-simple',
436
  'documents_count': len(questions_docs),
437
  'content_type': 'questions'
438
  }
 
413
  if checklist_docs:
414
  embeddings_model = get_cached_embeddings(self.config.model['sentence_transformer_model'])
415
  vector_store = FAISS.from_documents(checklist_docs, embeddings_model)
416
+ vector_store.save_local(str(self.config.paths['faiss_dir']), index_name="checklist")
417
  logger.info(f"βœ… Created checklist index with {len(checklist_docs)} items")
418
 
419
  if questions_docs:
420
  embeddings_model = get_cached_embeddings(self.config.model['sentence_transformer_model'])
421
  vector_store = FAISS.from_documents(questions_docs, embeddings_model)
422
+ vector_store.save_local(str(self.config.paths['faiss_dir']), index_name="questions")
423
  logger.info(f"βœ… Created questions index with {len(questions_docs)} items")
424
 
425
  checklist_result = {
426
  'success': True,
427
+ 'store_name': 'checklist',
428
  'documents_count': len(checklist_docs),
429
  'content_type': 'checklist'
430
  }
 
432
 
433
  questions_result = {
434
  'success': True,
435
+ 'store_name': 'questions',
436
  'documents_count': len(questions_docs),
437
  'content_type': 'questions'
438
  }
uv.lock CHANGED
@@ -317,7 +317,7 @@ wheels = [
317
  [[package]]
318
  name = "dd-poc-spaces"
319
  version = "0.1.0"
320
- source = { virtual = "." }
321
  dependencies = [
322
  { name = "backoff" },
323
  { name = "blackstone" },
 
317
  [[package]]
318
  name = "dd-poc-spaces"
319
  version = "0.1.0"
320
+ source = { editable = "." }
321
  dependencies = [
322
  { name = "backoff" },
323
  { name = "blackstone" },