IsmatS committed on
Commit
585eacf
·
1 Parent(s): c4f0859
Files changed (2) hide show
  1. .env.example +3 -0
  2. app/main.py +54 -42
.env.example CHANGED
@@ -47,6 +47,9 @@ VECTOR_DB_TYPE=pinecone
47
  API_HOST=0.0.0.0
48
  API_PORT=8000
49
 
 
 
 
50
  # Production SSL/Security Configuration
51
  # Set these for production deployment (see docs/markdowns/SSL_CAA_SETUP.md)
52
  PRODUCTION=false
 
47
  API_HOST=0.0.0.0
48
  API_PORT=8000
49
 
50
+ # OCR Configuration
51
+ OCR_MAX_PAGES=0 # 0 = unlimited pages. Set a positive limit on memory-constrained hosting (e.g., 5 for 512MB of RAM)
52
+
53
  # Production SSL/Security Configuration
54
  # Set these for production deployment (see docs/markdowns/SSL_CAA_SETUP.md)
55
  PRODUCTION=false
app/main.py CHANGED
@@ -9,6 +9,7 @@ import os
9
  import re
10
  import time
11
  import base64
 
12
  from typing import List, Dict
13
  from pathlib import Path
14
  from io import BytesIO
@@ -452,45 +453,39 @@ class OCRPageResponse(BaseModel):
452
  MD_text: str
453
 
454
 
455
def pdf_to_images(pdf_bytes: bytes, dpi: int = 100) -> List[Image.Image]:
    """Convert PDF bytes to PIL Images.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Rendering resolution; each page is scaled by ``dpi / 72``.

    Returns:
        One RGB image per page, in document order.
    """
    # The scaling matrix is identical for every page, so build it once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        images = []
        for page in doc:
            pix = page.get_pixmap(matrix=mat)
            images.append(
                Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            )
        return images
    finally:
        # Close the document even when rendering a page raises.
        doc.close()
 
 
470
 
 
 
471
 
472
def image_to_base64(image: Image.Image, format: str = "JPEG", quality: int = 85) -> str:
    """Encode a PIL image as base64 text.

    The image is saved into an in-memory buffer using ``format`` and
    ``quality`` (with ``optimize=True``); the buffer's bytes are then
    base64-encoded and returned as a UTF-8 string.
    """
    with BytesIO() as stream:
        image.save(stream, format=format, quality=quality, optimize=True)
        encoded_bytes = base64.b64encode(stream.getvalue())
    return encoded_bytes.decode("utf-8")
477
 
 
 
478
 
479
def detect_images_in_pdf(pdf_bytes: bytes) -> Dict[int, int]:
    """
    Detect images in each page of PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns dict: {page_number: image_count}
        Page numbers are 1-based; the count is the length of the list
        returned by ``page.get_images()`` for that page.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        return {
            page_num: len(page.get_images())
            for page_num, page in enumerate(doc, start=1)
        }
    finally:
        # Close the document even when inspecting a page raises.
        doc.close()
494
 
495
 
496
  @app.post("/ocr", response_model=List[OCRPageResponse])
@@ -498,9 +493,14 @@ async def ocr_endpoint(file: UploadFile = File(...)):
498
  """
499
  OCR endpoint for PDF text extraction with image detection.
500
 
 
 
 
 
 
 
501
  Uses VLM (Llama-4-Maverick-17B) for best accuracy:
502
  - Character Success Rate: 87.75%
503
- - Word Success Rate: 61.91%
504
  - Processing: ~6s per page
505
 
506
  Returns:
@@ -511,11 +511,18 @@ async def ocr_endpoint(file: UploadFile = File(...)):
511
  pdf_bytes = await file.read()
512
  pdf_filename = file.filename or "document.pdf"
513
 
514
- # Convert to images
515
- images = pdf_to_images(pdf_bytes, dpi=100)
516
-
517
- # Detect images per page
518
- image_counts = detect_images_in_pdf(pdf_bytes)
 
 
 
 
 
 
 
519
 
520
  # OCR system prompt
521
  system_prompt = """You are an expert OCR system for historical oil & gas documents.
@@ -529,13 +536,13 @@ Extract ALL text from the image with 100% accuracy. Follow these rules:
529
 
530
  Output ONLY the extracted text. No explanations, no descriptions."""
531
 
532
- # Process each page
533
  results = []
534
  client = get_azure_client()
535
 
536
- for page_num, image in enumerate(images, 1):
537
- # Convert image to base64
538
- image_base64 = image_to_base64(image, format="JPEG", quality=85)
539
 
540
  # VLM OCR
541
  messages = [
@@ -559,7 +566,6 @@ Output ONLY the extracted text. No explanations, no descriptions."""
559
  page_text = response.choices[0].message.content
560
 
561
  # Add image references if images exist on this page
562
- num_images = image_counts.get(page_num, 0)
563
  if num_images > 0:
564
  for img_idx in range(1, num_images + 1):
565
  page_text += f"\n\n![Image]({pdf_filename}/page_{page_num}/image_{img_idx})\n\n"
@@ -569,8 +575,14 @@ Output ONLY the extracted text. No explanations, no descriptions."""
569
  "MD_text": page_text
570
  })
571
 
 
 
 
 
572
  return results
573
 
 
 
574
  except Exception as e:
575
  raise HTTPException(status_code=500, detail=f"OCR Error: {str(e)}")
576
 
 
9
  import re
10
  import time
11
  import base64
12
+ import gc
13
  from typing import List, Dict
14
  from pathlib import Path
15
  from io import BytesIO
 
453
  MD_text: str
454
 
455
 
456
def process_pdf_page(pdf_bytes: bytes, page_num: int, dpi: int = 100) -> tuple[str, int]:
    """
    Process a single PDF page for OCR (memory efficient).

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        page_num: 1-based number of the page to render.
        dpi: Rendering resolution; the page is scaled by ``dpi / 72``.

    Returns: (base64_image, num_embedded_images)
        base64_image is the rendered page saved as a quality-85 JPEG and
        base64-encoded; num_embedded_images is the number of entries
        returned by ``page.get_images()``.

    Raises:
        IndexError: If page_num is below 1 or beyond the last page.
    """
    # Reject non-positive page numbers up front: page_num - 1 would otherwise
    # become a negative index and could silently select a page from the end.
    if page_num < 1:
        raise IndexError(f"page_num must be >= 1, got {page_num}")

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        if page_num > len(doc):
            raise IndexError(
                f"page_num {page_num} is out of range for a {len(doc)}-page PDF"
            )
        page = doc[page_num - 1]  # 0-indexed

        # Convert page to image
        zoom = dpi / 72
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)

        # Convert to PIL Image
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

        # Count embedded images
        num_images = len(page.get_images())

        del pix, page  # Explicit cleanup
    finally:
        # Close the document even when rendering fails, so a bad page
        # does not leave the document open.
        doc.close()
    del doc

    # Convert to base64 JPEG with good quality
    buffered = BytesIO()
    img.save(buffered, format="JPEG", quality=85, optimize=True)
    img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")

    del img, buffered  # Explicit cleanup
    gc.collect()  # Force garbage collection

    return img_base64, num_images
 
489
 
490
 
491
  @app.post("/ocr", response_model=List[OCRPageResponse])
 
493
  """
494
  OCR endpoint for PDF text extraction with image detection.
495
 
496
+ **Memory-optimized**:
497
+ - Processes ONE page at a time (not all pages in memory)
498
+ - 100 DPI for best OCR accuracy
499
+ - JPEG quality 85%
500
+ - Immediate garbage collection after each page
501
+
502
  Uses VLM (Llama-4-Maverick-17B) for best accuracy:
503
  - Character Success Rate: 87.75%
 
504
  - Processing: ~6s per page
505
 
506
  Returns:
 
511
  pdf_bytes = await file.read()
512
  pdf_filename = file.filename or "document.pdf"
513
 
514
+ # Get page count
515
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
516
+ total_pages = len(doc)
517
+ doc.close()
518
+
519
+ # Optional page limit (configurable via env var, default: no limit)
520
+ max_pages = int(os.getenv("OCR_MAX_PAGES", "0")) # 0 = unlimited
521
+ if max_pages > 0 and total_pages > max_pages:
522
+ raise HTTPException(
523
+ status_code=400,
524
+ detail=f"PDF has {total_pages} pages. Current limit is {max_pages} pages. Please split your PDF or increase OCR_MAX_PAGES environment variable."
525
+ )
526
 
527
  # OCR system prompt
528
  system_prompt = """You are an expert OCR system for historical oil & gas documents.
 
536
 
537
  Output ONLY the extracted text. No explanations, no descriptions."""
538
 
539
+ # Process each page ONE AT A TIME (memory efficient)
540
  results = []
541
  client = get_azure_client()
542
 
543
+ for page_num in range(1, total_pages + 1):
544
+ # Process single page (returns base64 image and releases memory immediately)
545
+ image_base64, num_images = process_pdf_page(pdf_bytes, page_num, dpi=100)
546
 
547
  # VLM OCR
548
  messages = [
 
566
  page_text = response.choices[0].message.content
567
 
568
  # Add image references if images exist on this page
 
569
  if num_images > 0:
570
  for img_idx in range(1, num_images + 1):
571
  page_text += f"\n\n![Image]({pdf_filename}/page_{page_num}/image_{img_idx})\n\n"
 
575
  "MD_text": page_text
576
  })
577
 
578
+ # Force cleanup after each page
579
+ del image_base64, messages, response
580
+ gc.collect()
581
+
582
  return results
583
 
584
+ except HTTPException:
585
+ raise
586
  except Exception as e:
587
  raise HTTPException(status_code=500, detail=f"OCR Error: {str(e)}")
588