|
|
import logging |
|
|
from pathlib import Path |
|
|
|
|
|
import fitz |
|
|
|
|
|
# Module-level logger named after this module; used by DocumentProcessor below.
logger = logging.getLogger(__name__)
|
|
|
|
|
class DocumentProcessor:
    """Simplified document processor for the API service.

    Extracts text from PDF payloads with PyMuPDF (``fitz``). Image
    extensions are accepted as supported formats, but no text can be read
    from them because OCR is not implemented in this simplified version.
    """

    def __init__(self):
        """Initialize the document processor."""
        # Lower-case file extensions (leading dot included) that
        # process_document accepts.
        self.supported_formats = {'.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.bmp'}

    def process_document(
        self,
        file_data: bytes,
        filename: str,
        use_ocr: bool = False
    ) -> str:
        """Extract text from a document (PDF or image).

        Args:
            file_data: Raw file content.
            filename: Original filename; only its extension is used, compared
                case-insensitively against ``self.supported_formats``.
            use_ocr: Whether to use OCR. OCR is not implemented in this
                simplified version, and the flag is ignored for PDFs.

        Returns:
            The extracted text for a PDF. For an image with ``use_ocr``
            false, a fixed message saying that OCR must be enabled.

        Raises:
            ValueError: If the extension is not supported, or the PDF
                payload is empty.
            NotImplementedError: If ``use_ocr`` is true for an image.
            Exception: Any error raised while reading the PDF is logged and
                re-raised unchanged.
        """
        try:
            file_ext = Path(filename).suffix.lower()
            logger.info(
                "Processing file: %s with extension: %s", filename, file_ext
            )

            if file_ext not in self.supported_formats:
                raise ValueError(f"Unsupported file format: {file_ext}")

            if file_ext == '.pdf':
                return self._process_pdf(file_data)

            # Every remaining supported format is an image.
            if use_ocr:
                raise NotImplementedError("OCR for images not implemented")
            return "Text extraction from images requires OCR to be enabled"
        except Exception as e:
            # Log at this boundary, then let the caller decide how to react.
            logger.error("Error processing document: %s", e)
            raise

    def _process_pdf(self, file_data: bytes) -> str:
        """Process a PDF to extract its text using PyMuPDF.

        Args:
            file_data: Raw bytes of the PDF file.

        Returns:
            The text of every page, in page order, joined by a blank line.

        Raises:
            ValueError: If ``file_data`` is empty.
            Exception: Any error raised by PyMuPDF is logged and re-raised
                unchanged.
        """
        try:
            # Reject an empty payload explicitly so the failure is a clear
            # ValueError rather than whatever the PDF library does with it.
            if not file_data:
                raise ValueError("Cannot process empty PDF data")

            with fitz.open(stream=file_data, filetype="pdf") as pdf_doc:
                # Join while the document is still open: pages are read lazily.
                return "\n\n".join(page.get_text() for page in pdf_doc)
        except Exception as e:
            logger.error("Error processing PDF: %s", e)
            raise
|
|
|