Spaces:
Running
Running
Commit
·
4994e6b
1
Parent(s):
0abf29d
Enhance PDF processing: validate file type, implement temporary file handling, and add preprocessing functionality
Browse files
src/controllers/_parser_controller.py
CHANGED
|
@@ -1,9 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from fastapi import APIRouter, Body, File, HTTPException, UploadFile
|
| 2 |
from fastapi.responses import JSONResponse
|
| 3 |
from loguru import logger
|
| 4 |
from pydantic import BaseModel
|
| 5 |
|
| 6 |
from src.services import PDFProcessorService
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
class EntityExtractorSchema(BaseModel):
|
|
@@ -18,12 +22,35 @@ class ParserController:
|
|
| 18 |
self.router.add_api_route("/pdf", self.parse_pdf, methods=["POST"])
|
| 19 |
self.router.add_api_route("/entity", self.extract_entity, methods=["POST"])
|
| 20 |
|
| 21 |
-
async def parse_pdf(
|
|
|
|
|
|
|
| 22 |
try:
|
| 23 |
-
if not file:
|
| 24 |
-
raise HTTPException(
|
|
|
|
|
|
|
|
|
|
| 25 |
if file.content_type != "application/pdf":
|
| 26 |
-
raise HTTPException(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
async with self.service as processor:
|
| 28 |
extracted_data = await processor.process_pdf(file)
|
| 29 |
return JSONResponse(content={"data": extracted_data})
|
|
@@ -35,6 +62,9 @@ class ParserController:
|
|
| 35 |
status_code=500,
|
| 36 |
detail=str(e),
|
| 37 |
)
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
async def extract_entity(
|
| 40 |
self, entity_extractor_schema: EntityExtractorSchema = Body(...)
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
import aiofiles
|
| 4 |
from fastapi import APIRouter, Body, File, HTTPException, UploadFile
|
| 5 |
from fastapi.responses import JSONResponse
|
| 6 |
from loguru import logger
|
| 7 |
from pydantic import BaseModel
|
| 8 |
|
| 9 |
from src.services import PDFProcessorService
|
| 10 |
+
from src.utils import PDFPreprocessor
|
| 11 |
|
| 12 |
|
| 13 |
class EntityExtractorSchema(BaseModel):
|
|
|
|
| 22 |
self.router.add_api_route("/pdf", self.parse_pdf, methods=["POST"])
|
| 23 |
self.router.add_api_route("/entity", self.extract_entity, methods=["POST"])
|
| 24 |
|
| 25 |
+
async def parse_pdf(
|
| 26 |
+
self, file: UploadFile = File(...), preprocess: bool = Body(False)
|
| 27 |
+
):
|
| 28 |
try:
|
| 29 |
+
if not file.filename.lower().endswith(".pdf"):
|
| 30 |
+
raise HTTPException(
|
| 31 |
+
status_code=400,
|
| 32 |
+
detail="Invalid file type. Only PDF files are accepted.",
|
| 33 |
+
)
|
| 34 |
if file.content_type != "application/pdf":
|
| 35 |
+
raise HTTPException(
|
| 36 |
+
status_code=400,
|
| 37 |
+
detail="Invalid content type. Only PDF files are accepted.",
|
| 38 |
+
)
|
| 39 |
+
async with aiofiles.tempfile.NamedTemporaryFile(
|
| 40 |
+
prefix=file.filename + "_original_", suffix=".pdf", delete=False
|
| 41 |
+
) as temp_pdf:
|
| 42 |
+
content = await file.read()
|
| 43 |
+
await temp_pdf.write(content)
|
| 44 |
+
await temp_pdf.flush()
|
| 45 |
+
await file.seek(0)
|
| 46 |
+
if preprocess:
|
| 47 |
+
logger.info("Preprocessing the uploaded PDF file.")
|
| 48 |
+
file = await PDFPreprocessor.preprocess(file=file)
|
| 49 |
+
if file is None:
|
| 50 |
+
raise HTTPException(
|
| 51 |
+
status_code=400,
|
| 52 |
+
detail="Failed to preprocess the PDF file.",
|
| 53 |
+
)
|
| 54 |
async with self.service as processor:
|
| 55 |
extracted_data = await processor.process_pdf(file)
|
| 56 |
return JSONResponse(content={"data": extracted_data})
|
|
|
|
| 62 |
status_code=500,
|
| 63 |
detail=str(e),
|
| 64 |
)
|
| 65 |
+
finally:
|
| 66 |
+
if os.path.exists(temp_pdf.name):
|
| 67 |
+
os.remove(temp_pdf.name)
|
| 68 |
|
| 69 |
async def extract_entity(
|
| 70 |
self, entity_extractor_schema: EntityExtractorSchema = Body(...)
|
src/services/_pdf_processor_service.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import asyncio
|
|
|
|
| 2 |
import re
|
| 3 |
import tempfile
|
| 4 |
from pathlib import Path
|
|
@@ -181,6 +182,9 @@ class PDFProcessorService:
|
|
| 181 |
except Exception as e:
|
| 182 |
logger.error(f"Error processing PDF: {e}")
|
| 183 |
raise
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
async def extract_entity(self, text: str):
|
| 186 |
logger.debug(f"Extracting entities from text: {text[:100]}...")
|
|
|
|
| 1 |
import asyncio
|
| 2 |
+
import os
|
| 3 |
import re
|
| 4 |
import tempfile
|
| 5 |
from pathlib import Path
|
|
|
|
| 182 |
except Exception as e:
|
| 183 |
logger.error(f"Error processing PDF: {e}")
|
| 184 |
raise
|
| 185 |
+
finally:
|
| 186 |
+
if os.path.exists(pdf_path):
|
| 187 |
+
os.remove(pdf_path)
|
| 188 |
|
| 189 |
async def extract_entity(self, text: str):
|
| 190 |
logger.debug(f"Extracting entities from text: {text[:100]}...")
|
src/utils/__init__.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
from ._model_manager import
|
| 2 |
-
from .
|
| 3 |
-
|
| 4 |
-
__all__ = ["model_manager", "TextExtractor"]
|
|
|
|
| 1 |
+
from ._model_manager import *
|
| 2 |
+
from ._preprocessor import *
|
| 3 |
+
from ._text_extractor import *
|
|
|
src/utils/_preprocessor.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
|
| 3 |
+
import fitz
|
| 4 |
+
from fastapi import UploadFile
|
| 5 |
+
from PIL import Image, ImageEnhance, ImageFilter
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class PDFPreprocessor:
    """Rasterise and enhance the pages of an uploaded PDF.

    Every page is rendered to a grayscale image, sharpened, contrast-boosted
    and placed as a full-page image into a new PDF whose pages have the same
    dimensions as the source pages.
    """

    @staticmethod
    def _render_enhanced_page(page, dpi: int, contrast: float) -> bytes:
        """Render one source page and return the enhanced image as PNG bytes."""
        pixmap = page.get_pixmap(dpi=dpi, colorspace=fitz.csGRAY)
        image = Image.open(io.BytesIO(pixmap.tobytes()))
        image = image.filter(
            ImageFilter.UnsharpMask(radius=1, percent=150, threshold=3)
        )
        image = ImageEnhance.Contrast(image).enhance(contrast)
        png_buf = io.BytesIO()
        image.save(png_buf, format="PNG")
        # getvalue() returns the whole buffer regardless of the cursor
        # position, so no rewind is needed here.
        return png_buf.getvalue()

    @staticmethod
    async def preprocess(
        file: UploadFile, *, dpi: int = 300, contrast: float = 1.5
    ) -> UploadFile:
        """Return a new upload whose PDF pages are enhanced page images.

        Args:
            file: Uploaded PDF. It is read from its current position to the
                end; its filename and headers are copied onto the result.
            dpi: Resolution used when rendering each page to an image.
            contrast: Factor passed to Pillow's contrast enhancer.

        Returns:
            An ``UploadFile`` backed by an in-memory buffer positioned at
            offset 0 and containing the rebuilt PDF.

        Raises:
            Any exception raised by PyMuPDF or Pillow while opening,
            rendering or saving propagates unchanged; this method never
            returns ``None``.
        """
        content = await file.read()

        # NOTE(review): everything below runs synchronously inside this
        # coroutine (there are no awaits in the page loop), so large PDFs keep
        # the event loop busy. Check PyMuPDF's threading constraints before
        # moving this work to an executor.
        source_doc = fitz.open(stream=content, filetype="pdf")
        try:
            processed_doc = fitz.open()
            try:
                for page in source_doc:
                    png_bytes = PDFPreprocessor._render_enhanced_page(
                        page, dpi=dpi, contrast=contrast
                    )
                    # new_page() returns the page it creates, so use it
                    # directly rather than looking it up again by index.
                    new_page = processed_doc.new_page(
                        width=page.rect.width, height=page.rect.height
                    )
                    new_page.insert_image(new_page.rect, stream=png_bytes)

                output_buf = io.BytesIO()
                processed_doc.save(output_buf)
            finally:
                # Close the output document even when rendering or saving
                # fails, so no native document handle is left open.
                processed_doc.close()
        finally:
            source_doc.close()

        output_buf.seek(0)
        return UploadFile(
            file=output_buf,
            filename=file.filename,
            headers=file.headers,
        )
|