Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -514,60 +514,66 @@ async def chat(request: ChatRequest):
|
|
| 514 |
raise HTTPException(status_code=500, detail=str(e))
|
| 515 |
|
| 516 |
def process_heavy_files_background(space_id: str, saved_file_paths: List[Dict]):
|
| 517 |
-
"""Runs in the background
|
| 518 |
try:
|
| 519 |
initialize_space(space_id)
|
| 520 |
processor = DocumentProcessor()
|
| 521 |
-
all_chunks = []
|
| 522 |
-
processed_files = []
|
| 523 |
|
| 524 |
for file_info in saved_file_paths:
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 536 |
'filename': filename,
|
| 537 |
-
'
|
| 538 |
-
'
|
| 539 |
-
'
|
| 540 |
-
}
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
'
|
| 547 |
-
'processed_at': datetime.now().isoformat()
|
| 548 |
-
})
|
| 549 |
-
|
| 550 |
-
# Upload to Qdrant
|
| 551 |
-
if all_chunks:
|
| 552 |
-
texts = [chunk['content'] for chunk in all_chunks]
|
| 553 |
-
metadatas = [chunk['metadata'] for chunk in all_chunks]
|
| 554 |
-
ids = [f"{space_id}_{idx}_{uuid.uuid4().hex[:8]}" for idx in range(len(all_chunks))]
|
| 555 |
-
|
| 556 |
-
# Batch size set to 100 for Qdrant Cloud limits
|
| 557 |
-
batch_size = 100
|
| 558 |
-
for i in range(0, len(texts), batch_size):
|
| 559 |
-
vector_db.add_documents(
|
| 560 |
-
texts[i:i + batch_size],
|
| 561 |
-
metadatas[i:i + batch_size],
|
| 562 |
-
ids[i:i + batch_size]
|
| 563 |
-
)
|
| 564 |
-
|
| 565 |
-
# Save metadata directly to MongoDB
|
| 566 |
-
if processed_files and MONGO_URI:
|
| 567 |
-
files_collection.insert_many(processed_files)
|
| 568 |
|
| 569 |
except Exception as e:
|
| 570 |
-
print(f"Background
|
| 571 |
|
| 572 |
@app.post("/api/spaces/{space_id}/upload")
|
| 573 |
async def upload_files(
|
|
|
|
| 514 |
raise HTTPException(status_code=500, detail=str(e))
|
| 515 |
|
| 516 |
def process_heavy_files_background(space_id: str, saved_file_paths: List[Dict]):
    """Process uploaded files one at a time: chunk, index, and record each.

    For every entry in ``saved_file_paths`` the file is parsed and chunked,
    its chunks are sent to ``vector_db`` in batches, and (when ``MONGO_URI``
    is set) a summary record is inserted into ``files_collection``. Handling
    one file completely before moving to the next means only one file's
    chunks are held in memory at a time.

    Args:
        space_id: Identifier of the space the files belong to; passed to
            ``initialize_space`` and stored on every MongoDB record.
        saved_file_paths: One dict per file, read via the ``'path'`` and
            ``'name'`` keys.

    Returns:
        None. Errors are printed rather than raised: a failure on one file
        is reported and the loop continues; a failure outside the per-file
        loop (e.g. in ``initialize_space``) aborts the whole run.
    """
    try:
        initialize_space(space_id)
        processor = DocumentProcessor()

        for file_info in saved_file_paths:
            try:
                file_path = Path(file_info['path'])
                filename = file_info['name']

                print(f"Processing: {filename}...")

                # 1. Parse and chunk just this one file.
                file_data = processor.process_file(file_path)
                chunks = processor.chunk_text(
                    file_data['content'],
                    chunk_size=512,
                    overlap=50,
                    semantic=True,
                )
                total_chunks = len(chunks)

                # 2. Upload to the vector store right away so this file's
                #    chunks do not have to stay in memory for the next file.
                if total_chunks:
                    texts = list(chunks)
                    metadatas = [
                        {
                            'filename': filename,
                            'chunk_index': idx,
                            'total_chunks': total_chunks,
                            'source_type': file_data['format'],
                        }
                        for idx in range(total_chunks)
                    ]
                    # The filename is part of the ID so chunks of different
                    # files in the same space cannot collide.
                    # NOTE(review): Qdrant itself only accepts unsigned ints
                    # or UUIDs as point IDs -- confirm that
                    # vector_db.add_documents converts these strings.
                    ids = [
                        f"{space_id}_{filename}_{idx}_{uuid.uuid4().hex[:8]}"
                        for idx in range(total_chunks)
                    ]

                    batch_size = 100
                    for start in range(0, total_chunks, batch_size):
                        stop = start + batch_size
                        vector_db.add_documents(
                            texts[start:stop],
                            metadatas[start:stop],
                            ids[start:stop],
                        )

                # 3. Record the file in MongoDB as soon as it is done so
                #    clients can list it without waiting for the whole batch.
                if MONGO_URI:
                    files_collection.insert_one({
                        'filename': filename,
                        'space_id': space_id,
                        'chunks': total_chunks,
                        'processed_at': datetime.now().isoformat(),
                    })

                print(f"Successfully finished: {filename}")

            except Exception as file_e:
                # One bad file must not stop the rest. Look the name up
                # defensively: if the failure *was* a missing/invalid 'name',
                # indexing it again here would raise inside the handler and
                # abort every remaining file.
                if isinstance(file_info, dict):
                    failed_name = file_info.get('name', '<unknown>')
                else:
                    failed_name = '<unknown>'
                print(f"Failed to process file {failed_name}: {file_e}")

    except Exception as e:
        print(f"Background worker completely crashed: {e}")
|
| 577 |
|
| 578 |
@app.post("/api/spaces/{space_id}/upload")
|
| 579 |
async def upload_files(
|