init
Browse files- .env.example +3 -0
- app/main.py +54 -42
.env.example
CHANGED
|
@@ -47,6 +47,9 @@ VECTOR_DB_TYPE=pinecone
|
|
| 47 |
API_HOST=0.0.0.0
|
| 48 |
API_PORT=8000
|
| 49 |
|
|
|
|
|
|
|
|
|
|
| 50 |
# Production SSL/Security Configuration
|
| 51 |
# Set these for production deployment (see docs/markdowns/SSL_CAA_SETUP.md)
|
| 52 |
PRODUCTION=false
|
|
|
|
| 47 |
API_HOST=0.0.0.0
|
| 48 |
API_PORT=8000
|
| 49 |
|
| 50 |
+
# OCR Configuration
|
| 51 |
+
OCR_MAX_PAGES=0 # 0 = unlimited pages. Set a positive page limit on memory-constrained hosting (e.g., 5 for 512MB)
|
| 52 |
+
|
| 53 |
# Production SSL/Security Configuration
|
| 54 |
# Set these for production deployment (see docs/markdowns/SSL_CAA_SETUP.md)
|
| 55 |
PRODUCTION=false
|
app/main.py
CHANGED
|
@@ -9,6 +9,7 @@ import os
|
|
| 9 |
import re
|
| 10 |
import time
|
| 11 |
import base64
|
|
|
|
| 12 |
from typing import List, Dict
|
| 13 |
from pathlib import Path
|
| 14 |
from io import BytesIO
|
|
@@ -452,45 +453,39 @@ class OCRPageResponse(BaseModel):
|
|
| 452 |
MD_text: str
|
| 453 |
|
| 454 |
|
| 455 |
-
def
|
| 456 |
-
"""
|
| 457 |
-
|
| 458 |
-
images = []
|
| 459 |
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
pix = page.get_pixmap(matrix=mat)
|
| 465 |
-
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
| 466 |
-
images.append(img)
|
| 467 |
|
| 468 |
-
|
| 469 |
-
|
|
|
|
|
|
|
| 470 |
|
|
|
|
|
|
|
| 471 |
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
image.save(buffered, format=format, quality=quality, optimize=True)
|
| 476 |
-
return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 477 |
|
|
|
|
|
|
|
| 478 |
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
"""
|
| 484 |
-
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 485 |
-
image_counts = {}
|
| 486 |
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
image_list = page.get_images()
|
| 490 |
-
image_counts[page_num + 1] = len(image_list)
|
| 491 |
|
| 492 |
-
|
| 493 |
-
return image_counts
|
| 494 |
|
| 495 |
|
| 496 |
@app.post("/ocr", response_model=List[OCRPageResponse])
|
|
@@ -498,9 +493,14 @@ async def ocr_endpoint(file: UploadFile = File(...)):
|
|
| 498 |
"""
|
| 499 |
OCR endpoint for PDF text extraction with image detection.
|
| 500 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
Uses VLM (Llama-4-Maverick-17B) for best accuracy:
|
| 502 |
- Character Success Rate: 87.75%
|
| 503 |
-
- Word Success Rate: 61.91%
|
| 504 |
- Processing: ~6s per page
|
| 505 |
|
| 506 |
Returns:
|
|
@@ -511,11 +511,18 @@ async def ocr_endpoint(file: UploadFile = File(...)):
|
|
| 511 |
pdf_bytes = await file.read()
|
| 512 |
pdf_filename = file.filename or "document.pdf"
|
| 513 |
|
| 514 |
-
#
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 519 |
|
| 520 |
# OCR system prompt
|
| 521 |
system_prompt = """You are an expert OCR system for historical oil & gas documents.
|
|
@@ -529,13 +536,13 @@ Extract ALL text from the image with 100% accuracy. Follow these rules:
|
|
| 529 |
|
| 530 |
Output ONLY the extracted text. No explanations, no descriptions."""
|
| 531 |
|
| 532 |
-
# Process each page
|
| 533 |
results = []
|
| 534 |
client = get_azure_client()
|
| 535 |
|
| 536 |
-
for page_num
|
| 537 |
-
#
|
| 538 |
-
image_base64 =
|
| 539 |
|
| 540 |
# VLM OCR
|
| 541 |
messages = [
|
|
@@ -559,7 +566,6 @@ Output ONLY the extracted text. No explanations, no descriptions."""
|
|
| 559 |
page_text = response.choices[0].message.content
|
| 560 |
|
| 561 |
# Add image references if images exist on this page
|
| 562 |
-
num_images = image_counts.get(page_num, 0)
|
| 563 |
if num_images > 0:
|
| 564 |
for img_idx in range(1, num_images + 1):
|
| 565 |
page_text += f"\n\n\n\n"
|
|
@@ -569,8 +575,14 @@ Output ONLY the extracted text. No explanations, no descriptions."""
|
|
| 569 |
"MD_text": page_text
|
| 570 |
})
|
| 571 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
return results
|
| 573 |
|
|
|
|
|
|
|
| 574 |
except Exception as e:
|
| 575 |
raise HTTPException(status_code=500, detail=f"OCR Error: {str(e)}")
|
| 576 |
|
|
|
|
| 9 |
import re
|
| 10 |
import time
|
| 11 |
import base64
|
| 12 |
+
import gc
|
| 13 |
from typing import List, Dict
|
| 14 |
from pathlib import Path
|
| 15 |
from io import BytesIO
|
|
|
|
| 453 |
MD_text: str
|
| 454 |
|
| 455 |
|
| 456 |
+
def process_pdf_page(pdf_bytes: bytes, page_num: int, dpi: int = 100) -> tuple[str, int]:
    """
    Process a single PDF page for OCR (memory efficient).

    The document is opened, exactly one page is rendered, and the document is
    closed again before the JPEG is encoded, so at most one rendered page is
    alive at a time.

    Args:
        pdf_bytes: Raw bytes of the PDF file.
        page_num: 1-indexed page number to render.
        dpi: Render resolution; the page is scaled by ``dpi / 72``.

    Returns: (base64_image, num_embedded_images)
        base64_image is the rendered page as a base64-encoded JPEG string;
        num_embedded_images is ``len(page.get_images())`` for that page.

    Raises:
        IndexError: If ``page_num`` is not within ``1..len(doc)``.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        # Reject out-of-range pages explicitly: page_num <= 0 would otherwise
        # turn into a negative index and silently select a page from the end.
        if not 1 <= page_num <= len(doc):
            raise IndexError(
                f"page_num {page_num} out of range (document has {len(doc)} pages)"
            )
        page = doc[page_num - 1]  # 0-indexed

        # Convert page to image; zoom is relative to the 72-per-inch base.
        zoom = dpi / 72
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)

        # Convert to PIL Image
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

        # Count embedded images
        num_images = len(page.get_images())

        # Drop the pixmap and page before encoding to keep peak memory low.
        del pix, mat, page
    finally:
        # Always release the document, even if indexing or rendering failed.
        doc.close()
    del doc

    # Convert to base64 JPEG with good quality
    buffered = BytesIO()
    img.save(buffered, format="JPEG", quality=85, optimize=True)
    img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")

    del img, buffered  # Explicit cleanup
    gc.collect()  # Force garbage collection

    return img_base64, num_images
|
|
|
|
| 489 |
|
| 490 |
|
| 491 |
@app.post("/ocr", response_model=List[OCRPageResponse])
|
|
|
|
| 493 |
"""
|
| 494 |
OCR endpoint for PDF text extraction with image detection.
|
| 495 |
|
| 496 |
+
**Memory-optimized**:
|
| 497 |
+
- Processes ONE page at a time (not all pages in memory)
|
| 498 |
+
- 100 DPI for best OCR accuracy
|
| 499 |
+
- JPEG quality 85%
|
| 500 |
+
- Immediate garbage collection after each page
|
| 501 |
+
|
| 502 |
Uses VLM (Llama-4-Maverick-17B) for best accuracy:
|
| 503 |
- Character Success Rate: 87.75%
|
|
|
|
| 504 |
- Processing: ~6s per page
|
| 505 |
|
| 506 |
Returns:
|
|
|
|
| 511 |
pdf_bytes = await file.read()
|
| 512 |
pdf_filename = file.filename or "document.pdf"
|
| 513 |
|
| 514 |
+
# Get page count
|
| 515 |
+
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 516 |
+
total_pages = len(doc)
|
| 517 |
+
doc.close()
|
| 518 |
+
|
| 519 |
+
# Optional page limit (configurable via env var, default: no limit)
|
| 520 |
+
max_pages = int(os.getenv("OCR_MAX_PAGES", "0")) # 0 = unlimited
|
| 521 |
+
if max_pages > 0 and total_pages > max_pages:
|
| 522 |
+
raise HTTPException(
|
| 523 |
+
status_code=400,
|
| 524 |
+
detail=f"PDF has {total_pages} pages. Current limit is {max_pages} pages. Please split your PDF or increase OCR_MAX_PAGES environment variable."
|
| 525 |
+
)
|
| 526 |
|
| 527 |
# OCR system prompt
|
| 528 |
system_prompt = """You are an expert OCR system for historical oil & gas documents.
|
|
|
|
| 536 |
|
| 537 |
Output ONLY the extracted text. No explanations, no descriptions."""
|
| 538 |
|
| 539 |
+
# Process each page ONE AT A TIME (memory efficient)
|
| 540 |
results = []
|
| 541 |
client = get_azure_client()
|
| 542 |
|
| 543 |
+
for page_num in range(1, total_pages + 1):
|
| 544 |
+
# Process single page (returns base64 image and releases memory immediately)
|
| 545 |
+
image_base64, num_images = process_pdf_page(pdf_bytes, page_num, dpi=100)
|
| 546 |
|
| 547 |
# VLM OCR
|
| 548 |
messages = [
|
|
|
|
| 566 |
page_text = response.choices[0].message.content
|
| 567 |
|
| 568 |
# Add image references if images exist on this page
|
|
|
|
| 569 |
if num_images > 0:
|
| 570 |
for img_idx in range(1, num_images + 1):
|
| 571 |
page_text += f"\n\n\n\n"
|
|
|
|
| 575 |
"MD_text": page_text
|
| 576 |
})
|
| 577 |
|
| 578 |
+
# Force cleanup after each page
|
| 579 |
+
del image_base64, messages, response
|
| 580 |
+
gc.collect()
|
| 581 |
+
|
| 582 |
return results
|
| 583 |
|
| 584 |
+
except HTTPException:
|
| 585 |
+
raise
|
| 586 |
except Exception as e:
|
| 587 |
raise HTTPException(status_code=500, detail=f"OCR Error: {str(e)}")
|
| 588 |
|