TilanB committed on
Commit
110632f
·
verified ·
1 Parent(s): b95549b

Update content_analyzer/document_parser.py

Browse files
Files changed (1) hide show
  1. content_analyzer/document_parser.py +69 -48
content_analyzer/document_parser.py CHANGED
@@ -434,57 +434,78 @@ class DocumentProcessor:
434
  # === PHASE 1: PARALLEL LOCAL CHART DETECTION (CPU-BOUND) ===
435
  logger.info("Phase 1: Detecting charts and caching to disk...")
436
  batch_size = parameters.CHART_BATCH_SIZE
437
- page_image_tuples = []
438
- for start_page in range(1, total_pages + 1, batch_size):
439
- end_page = min(start_page + batch_size - 1, total_pages)
440
- try:
441
- images = convert_from_path(
442
- file_path,
443
- dpi=parameters.CHART_DPI,
444
- first_page=start_page,
445
- last_page=end_page,
446
- fmt='jpeg',
447
- jpegopt={'quality': 85, 'optimize': True}
448
- )
449
- for idx, image in enumerate(images):
450
- page_num = start_page + idx
451
- stats['pages_scanned'] += 1
452
- # Resize if needed
453
- max_dimension = parameters.CHART_MAX_IMAGE_SIZE
454
- if max(image.size) > max_dimension:
455
- ratio = max_dimension / max(image.size)
456
- new_size = tuple(int(dim * ratio) for dim in image.size)
457
- image = image.resize(new_size, Image.Resampling.LANCZOS)
458
- page_image_tuples.append((page_num, image))
459
- del images
460
- except Exception as e:
461
- logger.warning(f"Failed to process pages {start_page}-{end_page}: {e}")
462
- continue
463
-
464
  detected_charts = []
465
- if use_local and parameters.CHART_SKIP_GEMINI_DETECTION and page_image_tuples:
466
  logger.info("Parallel local chart detection using ProcessPoolExecutor...")
467
- # Limit parallelism to avoid memory errors
468
- with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
469
- results = list(executor.map(detect_chart_on_page, page_image_tuples))
470
- for page_num, image, detection_result in results:
471
- if not detection_result['has_chart']:
472
- logger.debug(f"Page {page_num}: No chart detected (skipping)")
473
- stats['api_calls_saved'] += 1
474
- continue
475
- confidence = detection_result['confidence']
476
- if confidence < parameters.CHART_MIN_CONFIDENCE:
477
- logger.debug(f"Page {page_num}: Low confidence ({confidence:.0%}), skipping")
478
- stats['api_calls_saved'] += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
479
  continue
480
- logger.info(f"📈 Chart detected on page {page_num} (confidence: {confidence:.0%})")
481
- stats['charts_detected_local'] += 1
482
- image_path = os.path.join(temp_dir, f'chart_page_{page_num}.jpg')
483
- image.save(image_path, 'JPEG', quality=90)
484
- detected_charts.append((page_num, image_path, detection_result))
485
- # Release memory
486
- del image
487
- gc.collect()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
488
  else:
489
  # Fallback: sequential detection
490
  for page_num, image in page_image_tuples:
 
434
  # === PHASE 1: PARALLEL LOCAL CHART DETECTION (CPU-BOUND) ===
435
  logger.info("Phase 1: Detecting charts and caching to disk...")
436
  batch_size = parameters.CHART_BATCH_SIZE
437
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
  detected_charts = []
439
+ if use_local and parameters.CHART_SKIP_GEMINI_DETECTION:
440
  logger.info("Parallel local chart detection using ProcessPoolExecutor...")
441
+ # Use optimal worker count: min of CPU count or 4 to avoid memory issues
442
+ import os
443
+ max_workers = min(os.cpu_count() or 2, 4)
444
+ logger.info(f"Using {max_workers} workers for parallel chart detection")
445
+
446
+ # MEMORY OPTIMIZATION: Process pages in streaming batches instead of loading all at once
447
+ # This reduces peak memory by 60-80% for large PDFs
448
+ detection_batch_size = 20 # Process 20 pages at a time to limit memory
449
+
450
+ for batch_start in range(1, total_pages + 1, detection_batch_size):
451
+ batch_end = min(batch_start + detection_batch_size - 1, total_pages)
452
+ logger.debug(f"Processing detection batch: pages {batch_start}-{batch_end}")
453
+
454
+ # Load only this batch of pages into memory
455
+ page_image_tuples = []
456
+ try:
457
+ images = convert_from_path(
458
+ file_path,
459
+ dpi=parameters.CHART_DPI,
460
+ first_page=batch_start,
461
+ last_page=batch_end,
462
+ fmt='jpeg',
463
+ jpegopt={'quality': 85, 'optimize': True}
464
+ )
465
+ for idx, image in enumerate(images):
466
+ page_num = batch_start + idx
467
+ stats['pages_scanned'] += 1
468
+ # Resize if needed
469
+ max_dimension = parameters.CHART_MAX_IMAGE_SIZE
470
+ if max(image.size) > max_dimension:
471
+ ratio = max_dimension / max(image.size)
472
+ new_size = tuple(int(dim * ratio) for dim in image.size)
473
+ image = image.resize(new_size, Image.Resampling.LANCZOS)
474
+ page_image_tuples.append((page_num, image))
475
+ del images
476
+ except Exception as e:
477
+ logger.warning(f"Failed to process pages {batch_start}-{batch_end}: {e}")
478
  continue
479
+
480
+ # Process this batch with parallel detection
481
+ if page_image_tuples:
482
+ with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
483
+ results = list(executor.map(detect_chart_on_page, page_image_tuples))
484
+
485
+ # Process detection results and save charts to disk
486
+ for page_num, image, detection_result in results:
487
+ if not detection_result['has_chart']:
488
+ logger.debug(f"Page {page_num}: No chart detected (skipping)")
489
+ stats['api_calls_saved'] += 1
490
+ continue
491
+ confidence = detection_result['confidence']
492
+ if confidence < parameters.CHART_MIN_CONFIDENCE:
493
+ logger.debug(f"Page {page_num}: Low confidence ({confidence:.0%}), skipping")
494
+ stats['api_calls_saved'] += 1
495
+ continue
496
+ logger.info(f"📈 Chart detected on page {page_num} (confidence: {confidence:.0%})")
497
+ stats['charts_detected_local'] += 1
498
+ image_path = os.path.join(temp_dir, f'chart_page_{page_num}.jpg')
499
+ image.save(image_path, 'JPEG', quality=90)
500
+ detected_charts.append((page_num, image_path, detection_result))
501
+ # Release memory immediately
502
+ del image
503
+
504
+ # Clean up batch memory
505
+ del page_image_tuples
506
+ del results
507
+ gc.collect()
508
+ logger.debug(f"Batch {batch_start}-{batch_end} complete, memory released")
509
  else:
510
  # Fallback: sequential detection
511
  for page_num, image in page_image_tuples: