mohhhhhit committed on
Commit
f8e6236
·
verified ·
1 Parent(s): 421fdaa

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +52 -46
main.py CHANGED
@@ -514,60 +514,66 @@ async def chat(request: ChatRequest):
514
  raise HTTPException(status_code=500, detail=str(e))
515
 
516
  def process_heavy_files_background(space_id: str, saved_file_paths: List[Dict]):
517
- """Runs in the background so the HTTP request doesn't timeout"""
518
  try:
519
  initialize_space(space_id)
520
  processor = DocumentProcessor()
521
- all_chunks = []
522
- processed_files = []
523
 
524
  for file_info in saved_file_paths:
525
- file_path = Path(file_info['path'])
526
- filename = file_info['name']
527
-
528
- # The heavy CPU work
529
- file_data = processor.process_file(file_path)
530
- chunks = processor.chunk_text(file_data['content'], chunk_size=512, overlap=50, semantic=True)
531
-
532
- for idx, chunk in enumerate(chunks):
533
- all_chunks.append({
534
- 'content': chunk,
535
- 'metadata': {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
536
  'filename': filename,
537
- 'chunk_index': idx,
538
- 'total_chunks': len(chunks),
539
- 'source_type': file_data['format']
540
- }
541
- })
542
-
543
- processed_files.append({
544
- 'filename': filename,
545
- 'space_id': space_id, # Link it to the space
546
- 'chunks': len(chunks),
547
- 'processed_at': datetime.now().isoformat()
548
- })
549
-
550
- # Upload to Qdrant
551
- if all_chunks:
552
- texts = [chunk['content'] for chunk in all_chunks]
553
- metadatas = [chunk['metadata'] for chunk in all_chunks]
554
- ids = [f"{space_id}_{idx}_{uuid.uuid4().hex[:8]}" for idx in range(len(all_chunks))]
555
-
556
- # Batch size set to 100 for Qdrant Cloud limits
557
- batch_size = 100
558
- for i in range(0, len(texts), batch_size):
559
- vector_db.add_documents(
560
- texts[i:i + batch_size],
561
- metadatas[i:i + batch_size],
562
- ids[i:i + batch_size]
563
- )
564
-
565
- # Save metadata directly to MongoDB
566
- if processed_files and MONGO_URI:
567
- files_collection.insert_many(processed_files)
568
 
569
  except Exception as e:
570
- print(f"Background processing failed: {e}")
571
 
572
  @app.post("/api/spaces/{space_id}/upload")
573
  async def upload_files(
 
514
  raise HTTPException(status_code=500, detail=str(e))
515
 
516
  def process_heavy_files_background(space_id: str, saved_file_paths: List[Dict]):
517
+ """Runs in the background, processing and saving ONE file at a time."""
518
  try:
519
  initialize_space(space_id)
520
  processor = DocumentProcessor()
 
 
521
 
522
  for file_info in saved_file_paths:
523
+ try:
524
+ file_path = Path(file_info['path'])
525
+ filename = file_info['name']
526
+
527
+ print(f"Processing: {filename}...")
528
+
529
+ # 1. Process just this one file
530
+ file_data = processor.process_file(file_path)
531
+ chunks = processor.chunk_text(file_data['content'], chunk_size=512, overlap=50, semantic=True)
532
+
533
+ file_chunks = []
534
+ for idx, chunk in enumerate(chunks):
535
+ file_chunks.append({
536
+ 'content': chunk,
537
+ 'metadata': {
538
+ 'filename': filename,
539
+ 'chunk_index': idx,
540
+ 'total_chunks': len(chunks),
541
+ 'source_type': file_data['format']
542
+ }
543
+ })
544
+
545
+ # 2. Upload to Qdrant immediately (This clears the RAM for the next file!)
546
+ if file_chunks:
547
+ texts = [chunk['content'] for chunk in file_chunks]
548
+ metadatas = [chunk['metadata'] for chunk in file_chunks]
549
+ # Make UUID unique to the file to prevent collisions
550
+ ids = [f"{space_id}_{filename}_{idx}_{uuid.uuid4().hex[:8]}" for idx in range(len(file_chunks))]
551
+
552
+ batch_size = 100
553
+ for i in range(0, len(texts), batch_size):
554
+ vector_db.add_documents(
555
+ texts[i:i + batch_size],
556
+ metadatas[i:i + batch_size],
557
+ ids[i:i + batch_size]
558
+ )
559
+
560
+ # 3. Save metadata directly to MongoDB so it appears in Flutter instantly
561
+ if MONGO_URI:
562
+ files_collection.insert_one({
563
  'filename': filename,
564
+ 'space_id': space_id,
565
+ 'chunks': len(chunks),
566
+ 'processed_at': datetime.now().isoformat()
567
+ })
568
+
569
+ print(f"Successfully finished: {filename}")
570
+
571
+ except Exception as file_e:
572
+ # If ONE file has a corrupted page, skip it but KEEP GOING for the rest!
573
+ print(f"Failed to process file {file_info['name']}: {file_e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
574
 
575
  except Exception as e:
576
+ print(f"Background worker completely crashed: {e}")
577
 
578
  @app.post("/api/spaces/{space_id}/upload")
579
  async def upload_files(