Spaces:

sifars
/

ocr-engine

Running

File size: 2,999 Bytes

import os

import aiofiles
from fastapi import APIRouter, Body, File, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from loguru import logger
from pydantic import BaseModel

from src.services import PDFProcessorService
from src.utils import PDFPreprocessor


class EntityExtractorSchema(BaseModel):
    """Request body accepted by ``ParserController.extract_entity``."""

    # Text handed to the service's ``extract_entity`` call.
    text: str


class ParserController:
    """Controller that wires the PDF-parsing and entity-extraction endpoints.

    On construction it creates its own ``APIRouter`` and a
    ``PDFProcessorService`` instance, then registers two POST routes:

    * ``/pdf``    -> :meth:`parse_pdf`
    * ``/entity`` -> :meth:`extract_entity`
    """

    def __init__(self):
        self.router = APIRouter()
        self.service = PDFProcessorService()
        self.router.add_api_route("/pdf", self.parse_pdf, methods=["POST"])
        self.router.add_api_route("/entity", self.extract_entity, methods=["POST"])

    async def parse_pdf(
        self, file: UploadFile = File(...), preprocess: bool = Body(False)
    ):
        """Validate an uploaded PDF, optionally preprocess it, and extract data.

        Args:
            file: The uploaded file. Its name must end in ``.pdf`` (case
                insensitive) and its content type must be ``application/pdf``.
            preprocess: When true, the upload is passed through
                ``PDFPreprocessor.preprocess`` before extraction.

        Returns:
            A ``JSONResponse`` of the form ``{"data": <extracted data>}``.

        Raises:
            HTTPException: 400 for a wrong file name / content type or when
                preprocessing returns ``None``; 500 for any other failure.
        """
        # Bound before the ``try`` so the ``finally`` clause can run safely
        # even when validation rejects the upload before a temp file exists.
        temp_path = None
        try:
            # ``filename`` can be missing on an upload; treat that like any
            # other non-PDF name instead of crashing on ``None.lower()``.
            filename = file.filename or ""
            if not filename.lower().endswith(".pdf"):
                raise HTTPException(
                    status_code=400,
                    detail="Invalid file type. Only PDF files are accepted.",
                )
            if file.content_type != "application/pdf":
                raise HTTPException(
                    status_code=400,
                    detail="Invalid content type. Only PDF files are accepted.",
                )

            # The name is client-controlled: drop any directory components
            # (both separator styles) so it cannot steer where the temp file
            # is created.
            safe_name = os.path.basename(filename.replace("\\", "/"))

            # NOTE(review): nothing in this method reads the temp copy back;
            # it is written and then removed in ``finally``. Kept as-is.
            async with aiofiles.tempfile.NamedTemporaryFile(
                prefix=safe_name + "_original_", suffix=".pdf", delete=False
            ) as temp_pdf:
                temp_path = temp_pdf.name
                content = await file.read()
                await temp_pdf.write(content)
                await temp_pdf.flush()
                # Rewind so later consumers read the upload from the start.
                await file.seek(0)
                if preprocess:
                    logger.info("Preprocessing the uploaded PDF file.")
                    file = await PDFPreprocessor.preprocess(file=file)
                    if file is None:
                        raise HTTPException(
                            status_code=400,
                            detail="Failed to preprocess the PDF file.",
                        )
            async with self.service as processor:
                extracted_data = await processor.process_pdf(file)
            return JSONResponse(content={"data": extracted_data})
        except HTTPException:
            # Deliberate HTTP errors pass through untouched.
            raise
        except Exception as e:
            logger.exception(e)
            # NOTE(review): ``str(e)`` is returned to the client and may
            # expose internal details.
            raise HTTPException(
                status_code=500,
                detail=str(e),
            ) from e
        finally:
            # delete=False above means the temp file must be removed here.
            if temp_path is not None:
                try:
                    os.remove(temp_path)
                except FileNotFoundError:
                    pass

    async def extract_entity(
        self, entity_extractor_schema: EntityExtractorSchema = Body(...)
    ):
        """Run entity extraction on the text supplied in the request body.

        Args:
            entity_extractor_schema: Request body whose ``text`` field is
                passed to the service's ``extract_entity``.

        Returns:
            A ``JSONResponse`` of the form ``{"data": <extracted entity>}``.

        Raises:
            HTTPException: Re-raised unchanged if the service raises one;
                otherwise any failure is reported as a 500.
        """
        try:
            extracted_entity = await self.service.extract_entity(
                entity_extractor_schema.text
            )
            return JSONResponse(content={"data": extracted_entity})
        except HTTPException:
            raise
        except Exception as e:
            logger.exception(e)
            # NOTE(review): ``str(e)`` is returned to the client and may
            # expose internal details.
            raise HTTPException(
                status_code=500,
                detail=str(e),
            ) from e