Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -311,21 +311,34 @@ async def process_document_pipeline(
|
|
| 311 |
logger.info(f"🧩 Creating chunks and embeddings for session {session_id} using {HF_MODEL_ID}")
|
| 312 |
chunks = chunk_text_hierarchical(text, filename)
|
| 313 |
|
|
|
|
|
|
|
| 314 |
# Create embeddings and store chunks
|
| 315 |
chunks_to_store = []
|
| 316 |
-
for chunk in chunks:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
# Create embedding
|
| 318 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
|
|
|
|
| 320 |
chunk_doc = {
|
| 321 |
"session_id": session_id,
|
| 322 |
"chunk_id": chunk['id'],
|
| 323 |
-
"text": chunk['text'],
|
| 324 |
"title": chunk['title'],
|
| 325 |
"section_type": chunk['section_type'],
|
| 326 |
"importance_score": chunk['importance_score'],
|
| 327 |
"entities": chunk['entities'],
|
| 328 |
-
"embedding": embedding.tolist(),
|
| 329 |
"created_at": datetime.utcnow()
|
| 330 |
}
|
| 331 |
chunks_to_store.append(chunk_doc)
|
|
@@ -333,6 +346,9 @@ async def process_document_pipeline(
|
|
| 333 |
# Batch insert chunks
|
| 334 |
if chunks_to_store:
|
| 335 |
await db.chunks.insert_many(chunks_to_store)
|
|
|
|
|
|
|
|
|
|
| 336 |
|
| 337 |
# Update session as completed
|
| 338 |
await db.sessions.update_one(
|
|
|
|
| 311 |
logger.info(f"🧩 Creating chunks and embeddings for session {session_id} using {HF_MODEL_ID}")
|
| 312 |
chunks = chunk_text_hierarchical(text, filename)
|
| 313 |
|
| 314 |
+
logger.info(f"📊 Created {len(chunks)} chunks from document")
|
| 315 |
+
|
| 316 |
# Create embeddings and store chunks
|
| 317 |
chunks_to_store = []
|
| 318 |
+
for i, chunk in enumerate(chunks):
|
| 319 |
+
# Validate chunk has text
|
| 320 |
+
chunk_text = chunk.get('text', '').strip()
|
| 321 |
+
if not chunk_text:
|
| 322 |
+
logger.warning(f"⚠️ Skipping chunk {i} - no text content")
|
| 323 |
+
continue
|
| 324 |
+
|
| 325 |
# Create embedding
|
| 326 |
+
try:
|
| 327 |
+
embedding = create_embedding(chunk_text)
|
| 328 |
+
except Exception as e:
|
| 329 |
+
logger.error(f"❌ Failed to create embedding for chunk {i}: {e}")
|
| 330 |
+
continue
|
| 331 |
|
| 332 |
+
# FIXED: Use 'content' field instead of 'text'
|
| 333 |
chunk_doc = {
|
| 334 |
"session_id": session_id,
|
| 335 |
"chunk_id": chunk['id'],
|
| 336 |
+
"content": chunk_text, # Changed from 'text' to 'content'
|
| 337 |
"title": chunk['title'],
|
| 338 |
"section_type": chunk['section_type'],
|
| 339 |
"importance_score": chunk['importance_score'],
|
| 340 |
"entities": chunk['entities'],
|
| 341 |
+
"embedding": embedding.tolist(),
|
| 342 |
"created_at": datetime.utcnow()
|
| 343 |
}
|
| 344 |
chunks_to_store.append(chunk_doc)
|
|
|
|
| 346 |
# Batch insert chunks
|
| 347 |
if chunks_to_store:
|
| 348 |
await db.chunks.insert_many(chunks_to_store)
|
| 349 |
+
logger.info(f"✅ Stored {len(chunks_to_store)} chunks with embeddings")
|
| 350 |
+
else:
|
| 351 |
+
raise Exception("No valid chunks created from document")
|
| 352 |
|
| 353 |
# Update session as completed
|
| 354 |
await db.sessions.update_one(
|