hamxaameer committed on
Commit
1604786
·
verified ·
1 Parent(s): 252f46a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -12
app.py CHANGED
@@ -188,22 +188,37 @@ def load_vector_store(embeddings):
188
  if len(matches) > 100:
189
  logger.info(f" Found {len(matches)} potential document fragments")
190
 
191
- # Create simple documents from extracted text
 
 
 
 
192
  new_docstore_dict = {}
193
  index_to_docstore_id = {}
194
 
195
- for idx, match in enumerate(matches[:15000]): # Limit to 15k docs
 
196
  try:
197
- content = match.decode('utf-8', errors='ignore').strip()
198
- if len(content) > 50: # Only keep substantial content
199
- doc_id = str(idx)
200
- new_doc = Document(
201
- page_content=content,
202
- metadata={}
203
- )
204
- new_docstore_dict[doc_id] = new_doc
205
- index_to_docstore_id[idx] = doc_id
206
- except:
 
 
 
 
 
 
 
 
 
 
207
  continue
208
 
209
  logger.info(f" ✅ Reconstructed {len(new_docstore_dict)} documents from raw data")
 
188
  if len(matches) > 100:
189
  logger.info(f" Found {len(matches)} potential document fragments")
190
 
191
+ # Get total vectors in index
192
+ num_vectors = index.ntotal
193
+ logger.info(f" FAISS index has {num_vectors} vectors")
194
+
195
+ # Create documents matching the number of vectors
196
  new_docstore_dict = {}
197
  index_to_docstore_id = {}
198
 
199
+ # Use the actual number of vectors, not extracted matches
200
+ for idx in range(min(num_vectors, len(matches))):
201
  try:
202
+ # Get content from matches
203
+ if idx < len(matches):
204
+ content = matches[idx].decode('utf-8', errors='ignore').strip()
205
+ else:
206
+ content = f"Fashion document {idx}"
207
+
208
+ if len(content) < 50:
209
+ content = f"Fashion advice and style guide entry {idx}"
210
+
211
+ # Create document with string ID
212
+ doc_id = str(idx)
213
+ new_doc = Document(
214
+ page_content=content,
215
+ metadata={"source": "reconstructed"}
216
+ )
217
+ new_docstore_dict[doc_id] = new_doc
218
+ # CRITICAL: Use string keys for index_to_docstore_id
219
+ index_to_docstore_id[str(idx)] = doc_id
220
+ except Exception as e:
221
+ logger.warning(f" Error creating doc {idx}: {e}")
222
  continue
223
 
224
  logger.info(f" ✅ Reconstructed {len(new_docstore_dict)} documents from raw data")