Spaces:
Running
Running
File size: 2,999 Bytes
4994e6b e42e330 4994e6b e42e330 4994e6b e42e330 4994e6b e42e330 4994e6b e42e330 4994e6b e42e330 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
import os
import aiofiles
from fastapi import APIRouter, Body, File, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from loguru import logger
from pydantic import BaseModel
from src.services import PDFProcessorService
from src.utils import PDFPreprocessor
class EntityExtractorSchema(BaseModel):
    """Request body for the entity-extraction endpoint."""

    # Raw text that is handed to the service's ``extract_entity`` call.
    text: str
class ParserController:
    """HTTP controller exposing PDF parsing and entity extraction routes.

    Builds an ``APIRouter`` and registers two POST endpoints on it:
    ``/pdf`` (handled by :meth:`parse_pdf`) and ``/entity`` (handled by
    :meth:`extract_entity`). Both delegate to a ``PDFProcessorService``.
    """

    def __init__(self):
        """Create the router and the service, then register both routes."""
        self.router = APIRouter()
        self.service = PDFProcessorService()
        self.router.add_api_route("/pdf", self.parse_pdf, methods=["POST"])
        self.router.add_api_route("/entity", self.extract_entity, methods=["POST"])

    async def parse_pdf(
        self, file: UploadFile = File(...), preprocess: bool = Body(False)
    ):
        """Validate an uploaded PDF, optionally preprocess it, and extract data.

        Args:
            file: The uploaded file. Its name must end in ``.pdf``
                (case-insensitive) and its content type must be
                ``application/pdf``.
            preprocess: When true, the upload is passed through
                ``PDFPreprocessor.preprocess`` before extraction.

        Returns:
            A ``JSONResponse`` of the form ``{"data": <extracted data>}``.

        Raises:
            HTTPException: 400 when the upload is not a PDF or preprocessing
                returns ``None``; 500 for any other failure, with the
                exception text as the detail.
        """
        # Bound before the ``try`` so the cleanup in ``finally`` never touches
        # an unbound name when validation rejects the upload early.
        temp_path = None
        try:
            # ``filename`` may be missing on a malformed upload; treat that as
            # an invalid file type rather than letting ``.lower()`` blow up.
            filename = file.filename or ""
            if not filename.lower().endswith(".pdf"):
                raise HTTPException(
                    status_code=400,
                    detail="Invalid file type. Only PDF files are accepted.",
                )
            if file.content_type != "application/pdf":
                raise HTTPException(
                    status_code=400,
                    detail="Invalid content type. Only PDF files are accepted.",
                )

            # The name is client-controlled: keep only its last path component
            # so separators cannot move the temp file out of the temp dir.
            safe_name = os.path.basename(filename.replace("\\", "/"))

            # NOTE(review): nothing in this method reads the temp copy back;
            # it is written here and removed in ``finally``. Confirm whether
            # it is still needed.
            async with aiofiles.tempfile.NamedTemporaryFile(
                prefix=safe_name + "_original_", suffix=".pdf", delete=False
            ) as temp_pdf:
                # Record the path first so a failed write is still cleaned up.
                temp_path = temp_pdf.name
                content = await file.read()
                await temp_pdf.write(content)
                await temp_pdf.flush()

            # Rewind so the downstream consumers read the upload from the start.
            await file.seek(0)

            if preprocess:
                logger.info("Preprocessing the uploaded PDF file.")
                file = await PDFPreprocessor.preprocess(file=file)
                if file is None:
                    raise HTTPException(
                        status_code=400,
                        detail="Failed to preprocess the PDF file.",
                    )

            async with self.service as processor:
                extracted_data = await processor.process_pdf(file)
            return JSONResponse(content={"data": extracted_data})
        except HTTPException:
            # Already carries the intended status code; pass it through as is.
            raise
        except Exception as e:
            logger.exception(e)
            raise HTTPException(
                status_code=500,
                detail=str(e),
            ) from e
        finally:
            # ``delete=False`` above means the temp file must be removed here.
            if temp_path is not None and os.path.exists(temp_path):
                os.remove(temp_path)

    async def extract_entity(
        self, entity_extractor_schema: EntityExtractorSchema = Body(...)
    ):
        """Extract entities from the text in the request body.

        Args:
            entity_extractor_schema: Request body whose ``text`` field is
                forwarded to the service's ``extract_entity`` call.

        Returns:
            A ``JSONResponse`` of the form ``{"data": <extracted entity>}``.

        Raises:
            HTTPException: Re-raised unchanged if the service raises one;
                otherwise 500 with the exception text as the detail.
        """
        try:
            extracted_entity = await self.service.extract_entity(
                entity_extractor_schema.text
            )
            return JSONResponse(content={"data": extracted_entity})
        except HTTPException:
            raise
        except Exception as e:
            logger.exception(e)
            raise HTTPException(
                status_code=500,
                detail=str(e),
            ) from e
|