Spaces:
Running
Running
Enhance PDF processing: validate file type, implement temporary file handling, and add preprocessing functionality
4994e6b
| import os | |
| import aiofiles | |
| from fastapi import APIRouter, Body, File, HTTPException, UploadFile | |
| from fastapi.responses import JSONResponse | |
| from loguru import logger | |
| from pydantic import BaseModel | |
| from src.services import PDFProcessorService | |
| from src.utils import PDFPreprocessor | |
class EntityExtractorSchema(BaseModel):
    """Request body accepted by the entity-extraction endpoint."""

    # Raw text that is handed to the service's entity extractor.
    text: str
class ParserController:
    """HTTP controller exposing PDF parsing and entity extraction routes.

    On construction it creates its own ``APIRouter``, instantiates a
    ``PDFProcessorService`` and registers two ``POST`` routes:
    ``/pdf`` (``parse_pdf``) and ``/entity`` (``extract_entity``).
    """

    def __init__(self):
        self.router = APIRouter()
        self.service = PDFProcessorService()
        self.router.add_api_route("/pdf", self.parse_pdf, methods=["POST"])
        self.router.add_api_route("/entity", self.extract_entity, methods=["POST"])

    async def parse_pdf(
        self, file: UploadFile = File(...), preprocess: bool = Body(False)
    ):
        """Validate an uploaded PDF, optionally preprocess it, and extract data.

        Args:
            file: The uploaded file. Its name must end in ``.pdf``
                (case-insensitive) and its content type must be
                ``application/pdf``.
            preprocess: When true, the upload is passed through
                ``PDFPreprocessor.preprocess`` before extraction.
                NOTE(review): declared with ``Body`` next to a file upload;
                confirm clients send it as a multipart form field.

        Returns:
            ``JSONResponse`` with the payload ``{"data": <extracted data>}``.

        Raises:
            HTTPException: 400 for a missing/non-PDF filename, a wrong content
                type, or when preprocessing returns ``None``; 500 for any
                other failure (the exception text is used as ``detail``).
        """
        # Tracked separately from the context-manager variable so the cleanup
        # in ``finally`` is safe even when validation fails before any
        # temporary file has been created.
        temp_path = None
        try:
            # ``filename`` can be missing on an upload; treat that as an
            # invalid file type rather than letting ``.lower()`` blow up.
            filename = file.filename or ""
            if not filename.lower().endswith(".pdf"):
                raise HTTPException(
                    status_code=400,
                    detail="Invalid file type. Only PDF files are accepted.",
                )
            if file.content_type != "application/pdf":
                raise HTTPException(
                    status_code=400,
                    detail="Invalid content type. Only PDF files are accepted.",
                )

            # The filename is client-controlled: keep only its final path
            # component so the prefix cannot steer the temporary file outside
            # the temp directory.
            safe_name = os.path.basename(filename.replace("\\", "/"))

            # Keep a copy of the original upload on disk for the duration of
            # the request. Nothing in this method reads it back; it is removed
            # in ``finally``.
            async with aiofiles.tempfile.NamedTemporaryFile(
                prefix=safe_name + "_original_", suffix=".pdf", delete=False
            ) as temp_pdf:
                temp_path = temp_pdf.name
                content = await file.read()
                await temp_pdf.write(content)
                await temp_pdf.flush()

            # Rewind so downstream consumers read the upload from the start.
            await file.seek(0)

            if preprocess:
                logger.info("Preprocessing the uploaded PDF file.")
                file = await PDFPreprocessor.preprocess(file=file)
                if file is None:
                    raise HTTPException(
                        status_code=400,
                        detail="Failed to preprocess the PDF file.",
                    )

            async with self.service as processor:
                extracted_data = await processor.process_pdf(file)
            return JSONResponse(content={"data": extracted_data})
        except HTTPException:
            # Deliberate HTTP errors pass through unchanged.
            raise
        except Exception as e:
            logger.exception(e)
            raise HTTPException(
                status_code=500,
                detail=str(e),
            ) from e
        finally:
            if temp_path is not None:
                try:
                    os.remove(temp_path)
                except FileNotFoundError:
                    # Already gone; nothing left to clean up.
                    pass

    async def extract_entity(
        self, entity_extractor_schema: EntityExtractorSchema = Body(...)
    ):
        """Run entity extraction on the text supplied in the request body.

        Args:
            entity_extractor_schema: Request body; its ``text`` field is
                forwarded to ``self.service.extract_entity``.

        Returns:
            ``JSONResponse`` with the payload ``{"data": <extracted entity>}``.

        Raises:
            HTTPException: Re-raised unchanged if the service raises one;
                otherwise any failure becomes a 500 whose ``detail`` is the
                exception text.
        """
        try:
            extracted_entity = await self.service.extract_entity(
                entity_extractor_schema.text
            )
            return JSONResponse(content={"data": extracted_entity})
        except HTTPException:
            raise
        except Exception as e:
            logger.exception(e)
            raise HTTPException(
                status_code=500,
                detail=str(e),
            ) from e