# Intelligent_PID / pdf_processor.py
# msIntui
# feat: initial clean deployment
# 910e0d4
import os
import fitz # PyMuPDF
import cv2
import numpy as np
from pathlib import Path
import logging
from storage import StorageInterface
import shutil
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class DocumentProcessor:
    """Rasterise a PDF or a single image into PNG pages at a fixed DPI.

    Each page is encoded to PNG in memory and handed to the injected storage
    object through ``storage.save_file(path, data)``.
    """

    # PyMuPDF renders at 72 DPI when the zoom factor is 1, so the zoom needed
    # for ``target_dpi`` is ``target_dpi / 72``.
    _PDF_BASE_DPI = 72
    # Raster inputs are rescaled as if they were 72 DPI; no DPI metadata is
    # read from the file.
    _ASSUMED_IMAGE_DPI = 72
    _IMAGE_SUFFIXES = ('.png', '.jpg', '.jpeg')

    def __init__(self, storage: "StorageInterface"):
        """Store the storage backend used to persist rendered pages."""
        self.storage = storage
        self.target_dpi = 600  # Fixed at 600 DPI

    def clean_results_folder(self, output_dir: str):
        """Delete ``output_dir`` (if present) and recreate it empty.

        Any error raised while removing the directory is logged and re-raised.

        NOTE(review): this works on the local filesystem directly, while page
        images are written through ``self.storage`` - confirm both refer to
        the same location.
        """
        if os.path.exists(output_dir):
            try:
                shutil.rmtree(output_dir)
                logger.info(f"Cleaned results directory: {output_dir}")
            except Exception as e:
                logger.error(f"Error cleaning results directory: {str(e)}")
                raise
        os.makedirs(output_dir, exist_ok=True)

    def process_document(self, file_path: str, output_dir: str) -> list:
        """Process a document (PDF/PNG/JPG) and return the output page paths.

        The input is validated *before* ``output_dir`` is wiped, so a rejected
        input never destroys previously generated results.

        Args:
            file_path: Path of the document to process.
            output_dir: Directory that is cleared and then filled with pages.

        Returns:
            List of output paths, one per processed page.

        Raises:
            ValueError: If the file extension is unsupported, or if the input
                file is located inside ``output_dir`` (it would be deleted by
                the clean-up step before it could be read).
        """
        file_ext = Path(file_path).suffix.lower()
        if file_ext == '.pdf':
            handler = self._process_pdf
        elif file_ext in self._IMAGE_SUFFIXES:
            handler = self._process_image
        else:
            raise ValueError(f"Unsupported file format: {file_ext}")

        # Cleaning the output directory removes everything below it, which
        # would include the input itself if it lives there.
        source = Path(file_path).resolve()
        target = Path(output_dir).resolve()
        if target in source.parents:
            raise ValueError(
                f"Input file {file_path} is inside output directory "
                f"{output_dir}, which is cleared before processing"
            )

        self.clean_results_folder(output_dir)
        return handler(file_path, output_dir)

    def _process_pdf(self, pdf_path: str, output_dir: str) -> list:
        """Render every page of a PDF to ``<stem>_page_<n>.png``.

        Pages are numbered from 1. Errors are logged and re-raised.

        Returns:
            List of output paths in page order.
        """
        processed_pages = []
        base_name = Path(pdf_path).stem
        try:
            zoom = self.target_dpi / self._PDF_BASE_DPI
            matrix = fitz.Matrix(zoom, zoom)
            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_path) as doc:
                for page_num, page in enumerate(doc, start=1):
                    # Get high-res image
                    pix = page.get_pixmap(matrix=matrix)
                    # View the raw samples as a (height, width, channels) array
                    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                        pix.height, pix.width, pix.n
                    )
                    if pix.n == 4:  # RGBA
                        img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
                    output_path = os.path.join(
                        output_dir, f"{base_name}_page_{page_num}.png"
                    )
                    self._save_image(img, output_path)
                    processed_pages.append(output_path)
            return processed_pages
        except Exception as e:
            logger.error(f"Error processing PDF: {str(e)}")
            raise

    def _process_image(self, image_path: str, output_dir: str) -> list:
        """Upscale a single image to the target DPI and save it as page 1.

        The source is assumed to be 72 DPI, so every image is enlarged by
        ``target_dpi / 72`` along each axis (about 8.3x at 600 DPI).

        Returns:
            One-element list containing the output path.

        Raises:
            ValueError: If OpenCV cannot read the image.
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                raise ValueError(f"Could not read image: {image_path}")
            # OpenCV loads BGR; the rest of the pipeline works in RGB.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            scale = self.target_dpi / self._ASSUMED_IMAGE_DPI
            img = cv2.resize(
                img, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC
            )
            base_name = Path(image_path).stem
            output_path = os.path.join(output_dir, f"{base_name}_page_1.png")
            self._save_image(img, output_path)
            return [output_path]
        except Exception as e:
            logger.error(f"Error processing image: {str(e)}")
            raise

    def _save_image(self, img: np.ndarray, output_path: str):
        """Encode an RGB image as PNG and pass the bytes to the storage backend.

        Raises:
            RuntimeError: If OpenCV reports that PNG encoding failed.
        """
        # imencode expects BGR channel order.
        ok, buffer = cv2.imencode('.png', cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
        if not ok:
            raise RuntimeError(f"Could not encode image as PNG: {output_path}")
        self.storage.save_file(output_path, buffer.tobytes())
if __name__ == "__main__":
    # Script entry point: rasterise a sample PDF and print a size report.
    from storage import StorageFactory

    # Without a configured handler the INFO messages emitted during
    # processing are silently dropped. basicConfig is a no-op if the root
    # logger already has handlers.
    logging.basicConfig(level=logging.INFO)

    # Initialize storage and processor
    storage = StorageFactory.get_storage()
    processor = DocumentProcessor(storage)

    # Process PDF
    pdf_path = "samples/001.pdf"
    output_dir = "results"  # Changed from "processed_pages" to "results"
    bytes_per_mb = 1024 * 1024

    try:
        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)
        results = processor.process_document(
            file_path=pdf_path,
            output_dir=output_dir
        )

        # Print detailed results
        print("\nProcessing Results:")
        print(f"Output Directory: {os.path.abspath(output_dir)}")
        for page_path in results:
            file_size = os.path.getsize(page_path) / bytes_per_mb
            print(f"- {os.path.basename(page_path)} ({file_size:.2f} MB)")

        # Total size of everything currently in the output directory
        total_size = sum(
            os.path.getsize(os.path.join(output_dir, f))
            for f in os.listdir(output_dir)
        ) / bytes_per_mb
        print(f"\nTotal output size: {total_size:.2f} MB")
    except Exception as e:
        logger.error(f"Error processing PDF: {str(e)}")
        raise