File size: 2,999 Bytes
4994e6b
 
 
e42e330
 
 
 
 
 
4994e6b
e42e330
 
 
 
 
 
 
 
 
 
 
 
 
 
4994e6b
 
 
e42e330
4994e6b
 
 
 
 
e42e330
4994e6b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e42e330
 
 
 
 
 
 
 
 
 
 
4994e6b
 
 
e42e330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os

import aiofiles
from fastapi import APIRouter, Body, File, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from loguru import logger
from pydantic import BaseModel

from src.services import PDFProcessorService
from src.utils import PDFPreprocessor


# Request body model for the entity-extraction endpoint: it carries the raw
# text that ParserController.extract_entity forwards to the service.
class EntityExtractorSchema(BaseModel):
    # Text from which entities are to be extracted.
    text: str


class ParserController:
    """Expose the PDF parsing and entity extraction endpoints.

    The controller owns an ``APIRouter`` (``self.router``) on which it
    registers ``POST /pdf`` and ``POST /entity``, and a single
    ``PDFProcessorService`` instance (``self.service``) that both handlers
    delegate to.
    """

    def __init__(self):
        self.router = APIRouter()
        self.service = PDFProcessorService()
        self.router.add_api_route("/pdf", self.parse_pdf, methods=["POST"])
        self.router.add_api_route("/entity", self.extract_entity, methods=["POST"])

    async def parse_pdf(
        self, file: UploadFile = File(...), preprocess: bool = Body(False)
    ):
        """Validate an uploaded PDF, optionally preprocess it, and extract data.

        Args:
            file: The uploaded file. Its name must end in ``.pdf`` and its
                content type must be ``application/pdf``.
            preprocess: When true, the upload is passed through
                ``PDFPreprocessor.preprocess`` before processing.

        Returns:
            A ``JSONResponse`` of the form ``{"data": <extracted data>}``.

        Raises:
            HTTPException: 400 when the upload is not a PDF or preprocessing
                yields ``None``; 500 for any other failure.
        """
        # Tracked separately so the cleanup in ``finally`` is safe even when
        # we fail before the temporary file has been created.
        temp_path = None
        try:
            # The client may omit the filename entirely; treat that as a
            # non-PDF upload instead of crashing on ``None.lower()``.
            filename = file.filename or ""
            if not filename.lower().endswith(".pdf"):
                raise HTTPException(
                    status_code=400,
                    detail="Invalid file type. Only PDF files are accepted.",
                )
            if file.content_type != "application/pdf":
                raise HTTPException(
                    status_code=400,
                    detail="Invalid content type. Only PDF files are accepted.",
                )
            # The filename is client-controlled: keep only its final path
            # component so separators or ".." cannot steer the temporary file
            # outside the temp directory.
            safe_name = os.path.basename(filename.replace("\\", "/"))
            async with aiofiles.tempfile.NamedTemporaryFile(
                prefix=safe_name + "_original_", suffix=".pdf", delete=False
            ) as temp_pdf:
                temp_path = temp_pdf.name
                content = await file.read()
                await temp_pdf.write(content)
                await temp_pdf.flush()
                # Rewind the upload so downstream consumers read it from the
                # start again.
                await file.seek(0)
                if preprocess:
                    logger.info("Preprocessing the uploaded PDF file.")
                    file = await PDFPreprocessor.preprocess(file=file)
                    if file is None:
                        raise HTTPException(
                            status_code=400,
                            detail="Failed to preprocess the PDF file.",
                        )
            async with self.service as processor:
                extracted_data = await processor.process_pdf(file)
            return JSONResponse(content={"data": extracted_data})
        except HTTPException:
            # Deliberate HTTP errors pass through untouched.
            raise
        except Exception as e:
            logger.exception(e)
            raise HTTPException(
                status_code=500,
                detail=str(e),
            ) from e
        finally:
            # The file was created with delete=False, so remove it ourselves.
            if temp_path is not None and os.path.exists(temp_path):
                os.remove(temp_path)

    async def extract_entity(
        self, entity_extractor_schema: EntityExtractorSchema = Body(...)
    ):
        """Extract entities from the text supplied in the request body.

        Args:
            entity_extractor_schema: Request body whose ``text`` field is
                forwarded to ``self.service.extract_entity``.

        Returns:
            A ``JSONResponse`` of the form ``{"data": <extracted entity>}``.

        Raises:
            HTTPException: 500 when the service call fails; an
                ``HTTPException`` raised by the service is re-raised as-is.
        """
        try:
            extracted_entity = await self.service.extract_entity(
                entity_extractor_schema.text
            )
            return JSONResponse(content={"data": extracted_entity})
        except HTTPException:
            raise
        except Exception as e:
            logger.exception(e)
            raise HTTPException(
                status_code=500,
                detail=str(e),
            ) from e