ocr-engine-3 / src /controllers /_parser_controller.py
kanha-upadhyay's picture
Enhance PDF processing: validate file type, implement temporary file handling, and add preprocessing functionality
4994e6b
import os
import aiofiles
from fastapi import APIRouter, Body, File, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from loguru import logger
from pydantic import BaseModel
from src.services import PDFProcessorService
from src.utils import PDFPreprocessor
class EntityExtractorSchema(BaseModel):
    """Request body accepted by the entity-extraction endpoint."""

    # Raw text that is handed to the service's ``extract_entity`` call.
    text: str
class ParserController:
    """Controller that registers the PDF parsing and entity extraction routes.

    Creating an instance builds an ``APIRouter`` with two POST routes:

    * ``/pdf``    -> :meth:`parse_pdf`
    * ``/entity`` -> :meth:`extract_entity`

    Both handlers delegate the actual work to a ``PDFProcessorService``
    instance stored on ``self.service``.
    """

    def __init__(self):
        self.router = APIRouter()
        self.service = PDFProcessorService()
        self.router.add_api_route("/pdf", self.parse_pdf, methods=["POST"])
        self.router.add_api_route("/entity", self.extract_entity, methods=["POST"])

    async def parse_pdf(
        self, file: UploadFile = File(...), preprocess: bool = Body(False)
    ):
        """Validate an uploaded PDF, optionally preprocess it, and extract data.

        Args:
            file: The uploaded file. Its name must end in ``.pdf``
                (case-insensitive) and its content type must be
                ``application/pdf``.
            preprocess: When true, the upload is passed through
                ``PDFPreprocessor.preprocess`` before extraction.

        Returns:
            A ``JSONResponse`` of the form ``{"data": <extracted data>}``.

        Raises:
            HTTPException: 400 when the upload is not a PDF or preprocessing
                returns ``None``; 500 for any other failure.
        """
        # Tracked separately from the context-manager variable so the cleanup
        # in ``finally`` is safe even when validation fails before the
        # temporary file is ever created.
        temp_pdf_path = None
        try:
            # ``filename`` can be missing on an upload; treat that as an
            # invalid file type instead of crashing on ``None.lower()``.
            filename = file.filename or ""
            if not filename.lower().endswith(".pdf"):
                raise HTTPException(
                    status_code=400,
                    detail="Invalid file type. Only PDF files are accepted.",
                )
            if file.content_type != "application/pdf":
                raise HTTPException(
                    status_code=400,
                    detail="Invalid content type. Only PDF files are accepted.",
                )

            # The filename is client-controlled: keep only its final path
            # component so separators cannot redirect the temp file elsewhere.
            safe_name = os.path.basename(filename.replace("\\", "/"))
            async with aiofiles.tempfile.NamedTemporaryFile(
                prefix=safe_name + "_original_", suffix=".pdf", delete=False
            ) as temp_pdf:
                temp_pdf_path = temp_pdf.name
                content = await file.read()
                await temp_pdf.write(content)
                await temp_pdf.flush()
            # Rewind so downstream consumers read the upload from the start.
            await file.seek(0)

            if preprocess:
                logger.info("Preprocessing the uploaded PDF file.")
                file = await PDFPreprocessor.preprocess(file=file)
                if file is None:
                    raise HTTPException(
                        status_code=400,
                        detail="Failed to preprocess the PDF file.",
                    )

            async with self.service as processor:
                extracted_data = await processor.process_pdf(file)
            return JSONResponse(content={"data": extracted_data})
        except HTTPException:
            # Deliberate HTTP errors pass through untouched.
            raise
        except Exception as e:
            logger.exception(e)
            raise HTTPException(
                status_code=500,
                detail=str(e),
            ) from e
        finally:
            # ``delete=False`` above means the temp file must be removed here.
            if temp_pdf_path is not None:
                try:
                    os.remove(temp_pdf_path)
                except FileNotFoundError:
                    # Already gone; nothing left to clean up.
                    pass

    async def extract_entity(
        self, entity_extractor_schema: EntityExtractorSchema = Body(...)
    ):
        """Run entity extraction on the text supplied in the request body.

        Args:
            entity_extractor_schema: Request body whose ``text`` field is
                forwarded to ``self.service.extract_entity``.

        Returns:
            A ``JSONResponse`` of the form ``{"data": <extracted entity>}``.

        Raises:
            HTTPException: Re-raised unchanged if the service raises one;
                any other exception is logged and reported as a 500.
        """
        try:
            extracted_entity = await self.service.extract_entity(
                entity_extractor_schema.text
            )
            return JSONResponse(content={"data": extracted_entity})
        except HTTPException:
            raise
        except Exception as e:
            logger.exception(e)
            raise HTTPException(
                status_code=500,
                detail=str(e),
            ) from e