Spaces:
Sleeping
Sleeping
Update content_analyzer/document_parser.py
Browse files
content_analyzer/document_parser.py
CHANGED
|
@@ -434,57 +434,78 @@ class DocumentProcessor:
|
|
| 434 |
# === PHASE 1: PARALLEL LOCAL CHART DETECTION (CPU-BOUND) ===
|
| 435 |
logger.info("Phase 1: Detecting charts and caching to disk...")
|
| 436 |
batch_size = parameters.CHART_BATCH_SIZE
|
| 437 |
-
|
| 438 |
-
for start_page in range(1, total_pages + 1, batch_size):
|
| 439 |
-
end_page = min(start_page + batch_size - 1, total_pages)
|
| 440 |
-
try:
|
| 441 |
-
images = convert_from_path(
|
| 442 |
-
file_path,
|
| 443 |
-
dpi=parameters.CHART_DPI,
|
| 444 |
-
first_page=start_page,
|
| 445 |
-
last_page=end_page,
|
| 446 |
-
fmt='jpeg',
|
| 447 |
-
jpegopt={'quality': 85, 'optimize': True}
|
| 448 |
-
)
|
| 449 |
-
for idx, image in enumerate(images):
|
| 450 |
-
page_num = start_page + idx
|
| 451 |
-
stats['pages_scanned'] += 1
|
| 452 |
-
# Resize if needed
|
| 453 |
-
max_dimension = parameters.CHART_MAX_IMAGE_SIZE
|
| 454 |
-
if max(image.size) > max_dimension:
|
| 455 |
-
ratio = max_dimension / max(image.size)
|
| 456 |
-
new_size = tuple(int(dim * ratio) for dim in image.size)
|
| 457 |
-
image = image.resize(new_size, Image.Resampling.LANCZOS)
|
| 458 |
-
page_image_tuples.append((page_num, image))
|
| 459 |
-
del images
|
| 460 |
-
except Exception as e:
|
| 461 |
-
logger.warning(f"Failed to process pages {start_page}-{end_page}: {e}")
|
| 462 |
-
continue
|
| 463 |
-
|
| 464 |
detected_charts = []
|
| 465 |
-
if use_local and parameters.CHART_SKIP_GEMINI_DETECTION
|
| 466 |
logger.info("Parallel local chart detection using ProcessPoolExecutor...")
|
| 467 |
-
#
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 479 |
continue
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
else:
|
| 489 |
# Fallback: sequential detection
|
| 490 |
for page_num, image in page_image_tuples:
|
|
|
|
| 434 |
# === PHASE 1: PARALLEL LOCAL CHART DETECTION (CPU-BOUND) ===
|
| 435 |
logger.info("Phase 1: Detecting charts and caching to disk...")
|
| 436 |
batch_size = parameters.CHART_BATCH_SIZE
|
| 437 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
detected_charts = []
|
| 439 |
+
if use_local and parameters.CHART_SKIP_GEMINI_DETECTION:
|
| 440 |
logger.info("Parallel local chart detection using ProcessPoolExecutor...")
|
| 441 |
+
# Use optimal worker count: min of CPU count or 4 to avoid memory issues
|
| 442 |
+
import os
|
| 443 |
+
max_workers = min(os.cpu_count() or 2, 4)
|
| 444 |
+
logger.info(f"Using {max_workers} workers for parallel chart detection")
|
| 445 |
+
|
| 446 |
+
# MEMORY OPTIMIZATION: Process pages in streaming batches instead of loading all at once
|
| 447 |
+
# This reduces peak memory by 60-80% for large PDFs
|
| 448 |
+
detection_batch_size = 20 # Process 20 pages at a time to limit memory
|
| 449 |
+
|
| 450 |
+
for batch_start in range(1, total_pages + 1, detection_batch_size):
|
| 451 |
+
batch_end = min(batch_start + detection_batch_size - 1, total_pages)
|
| 452 |
+
logger.debug(f"Processing detection batch: pages {batch_start}-{batch_end}")
|
| 453 |
+
|
| 454 |
+
# Load only this batch of pages into memory
|
| 455 |
+
page_image_tuples = []
|
| 456 |
+
try:
|
| 457 |
+
images = convert_from_path(
|
| 458 |
+
file_path,
|
| 459 |
+
dpi=parameters.CHART_DPI,
|
| 460 |
+
first_page=batch_start,
|
| 461 |
+
last_page=batch_end,
|
| 462 |
+
fmt='jpeg',
|
| 463 |
+
jpegopt={'quality': 85, 'optimize': True}
|
| 464 |
+
)
|
| 465 |
+
for idx, image in enumerate(images):
|
| 466 |
+
page_num = batch_start + idx
|
| 467 |
+
stats['pages_scanned'] += 1
|
| 468 |
+
# Resize if needed
|
| 469 |
+
max_dimension = parameters.CHART_MAX_IMAGE_SIZE
|
| 470 |
+
if max(image.size) > max_dimension:
|
| 471 |
+
ratio = max_dimension / max(image.size)
|
| 472 |
+
new_size = tuple(int(dim * ratio) for dim in image.size)
|
| 473 |
+
image = image.resize(new_size, Image.Resampling.LANCZOS)
|
| 474 |
+
page_image_tuples.append((page_num, image))
|
| 475 |
+
del images
|
| 476 |
+
except Exception as e:
|
| 477 |
+
logger.warning(f"Failed to process pages {batch_start}-{batch_end}: {e}")
|
| 478 |
continue
|
| 479 |
+
|
| 480 |
+
# Process this batch with parallel detection
|
| 481 |
+
if page_image_tuples:
|
| 482 |
+
with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
|
| 483 |
+
results = list(executor.map(detect_chart_on_page, page_image_tuples))
|
| 484 |
+
|
| 485 |
+
# Process detection results and save charts to disk
|
| 486 |
+
for page_num, image, detection_result in results:
|
| 487 |
+
if not detection_result['has_chart']:
|
| 488 |
+
logger.debug(f"Page {page_num}: No chart detected (skipping)")
|
| 489 |
+
stats['api_calls_saved'] += 1
|
| 490 |
+
continue
|
| 491 |
+
confidence = detection_result['confidence']
|
| 492 |
+
if confidence < parameters.CHART_MIN_CONFIDENCE:
|
| 493 |
+
logger.debug(f"Page {page_num}: Low confidence ({confidence:.0%}), skipping")
|
| 494 |
+
stats['api_calls_saved'] += 1
|
| 495 |
+
continue
|
| 496 |
+
logger.info(f"📈 Chart detected on page {page_num} (confidence: {confidence:.0%})")
|
| 497 |
+
stats['charts_detected_local'] += 1
|
| 498 |
+
image_path = os.path.join(temp_dir, f'chart_page_{page_num}.jpg')
|
| 499 |
+
image.save(image_path, 'JPEG', quality=90)
|
| 500 |
+
detected_charts.append((page_num, image_path, detection_result))
|
| 501 |
+
# Release memory immediately
|
| 502 |
+
del image
|
| 503 |
+
|
| 504 |
+
# Clean up batch memory
|
| 505 |
+
del page_image_tuples
|
| 506 |
+
del results
|
| 507 |
+
gc.collect()
|
| 508 |
+
logger.debug(f"Batch {batch_start}-{batch_end} complete, memory released")
|
| 509 |
else:
|
| 510 |
# Fallback: sequential detection
|
| 511 |
for page_num, image in page_image_tuples:
|