kanha-upadhyay committed on
Commit
4994e6b
·
1 Parent(s): 0abf29d

Enhance PDF processing: validate file type, implement temporary file handling, and add preprocessing functionality

Browse files
src/controllers/_parser_controller.py CHANGED
@@ -1,9 +1,13 @@
 
 
 
1
  from fastapi import APIRouter, Body, File, HTTPException, UploadFile
2
  from fastapi.responses import JSONResponse
3
  from loguru import logger
4
  from pydantic import BaseModel
5
 
6
  from src.services import PDFProcessorService
 
7
 
8
 
9
  class EntityExtractorSchema(BaseModel):
@@ -18,12 +22,35 @@ class ParserController:
18
  self.router.add_api_route("/pdf", self.parse_pdf, methods=["POST"])
19
  self.router.add_api_route("/entity", self.extract_entity, methods=["POST"])
20
 
21
- async def parse_pdf(self, file: UploadFile = File(...)):
 
 
22
  try:
23
- if not file:
24
- raise HTTPException(status_code=400, detail="No file uploaded")
 
 
 
25
  if file.content_type != "application/pdf":
26
- raise HTTPException(status_code=400, detail="Invalid file type")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  async with self.service as processor:
28
  extracted_data = await processor.process_pdf(file)
29
  return JSONResponse(content={"data": extracted_data})
@@ -35,6 +62,9 @@ class ParserController:
35
  status_code=500,
36
  detail=str(e),
37
  )
 
 
 
38
 
39
  async def extract_entity(
40
  self, entity_extractor_schema: EntityExtractorSchema = Body(...)
 
1
+ import os
2
+
3
+ import aiofiles
4
  from fastapi import APIRouter, Body, File, HTTPException, UploadFile
5
  from fastapi.responses import JSONResponse
6
  from loguru import logger
7
  from pydantic import BaseModel
8
 
9
  from src.services import PDFProcessorService
10
+ from src.utils import PDFPreprocessor
11
 
12
 
13
  class EntityExtractorSchema(BaseModel):
 
22
  self.router.add_api_route("/pdf", self.parse_pdf, methods=["POST"])
23
  self.router.add_api_route("/entity", self.extract_entity, methods=["POST"])
24
 
25
+ async def parse_pdf(
26
+ self, file: UploadFile = File(...), preprocess: bool = Body(False)
27
+ ):
28
  try:
29
+ if not file.filename.lower().endswith(".pdf"):
30
+ raise HTTPException(
31
+ status_code=400,
32
+ detail="Invalid file type. Only PDF files are accepted.",
33
+ )
34
  if file.content_type != "application/pdf":
35
+ raise HTTPException(
36
+ status_code=400,
37
+ detail="Invalid content type. Only PDF files are accepted.",
38
+ )
39
+ async with aiofiles.tempfile.NamedTemporaryFile(
40
+ prefix=file.filename + "_original_", suffix=".pdf", delete=False
41
+ ) as temp_pdf:
42
+ content = await file.read()
43
+ await temp_pdf.write(content)
44
+ await temp_pdf.flush()
45
+ await file.seek(0)
46
+ if preprocess:
47
+ logger.info("Preprocessing the uploaded PDF file.")
48
+ file = await PDFPreprocessor.preprocess(file=file)
49
+ if file is None:
50
+ raise HTTPException(
51
+ status_code=400,
52
+ detail="Failed to preprocess the PDF file.",
53
+ )
54
  async with self.service as processor:
55
  extracted_data = await processor.process_pdf(file)
56
  return JSONResponse(content={"data": extracted_data})
 
62
  status_code=500,
63
  detail=str(e),
64
  )
65
+ finally:
66
+ if os.path.exists(temp_pdf.name):
67
+ os.remove(temp_pdf.name)
68
 
69
  async def extract_entity(
70
  self, entity_extractor_schema: EntityExtractorSchema = Body(...)
src/services/_pdf_processor_service.py CHANGED
@@ -1,4 +1,5 @@
1
  import asyncio
 
2
  import re
3
  import tempfile
4
  from pathlib import Path
@@ -181,6 +182,9 @@ class PDFProcessorService:
181
  except Exception as e:
182
  logger.error(f"Error processing PDF: {e}")
183
  raise
 
 
 
184
 
185
  async def extract_entity(self, text: str):
186
  logger.debug(f"Extracting entities from text: {text[:100]}...")
 
1
  import asyncio
2
+ import os
3
  import re
4
  import tempfile
5
  from pathlib import Path
 
182
  except Exception as e:
183
  logger.error(f"Error processing PDF: {e}")
184
  raise
185
+ finally:
186
+ if os.path.exists(pdf_path):
187
+ os.remove(pdf_path)
188
 
189
  async def extract_entity(self, text: str):
190
  logger.debug(f"Extracting entities from text: {text[:100]}...")
src/utils/__init__.py CHANGED
@@ -1,4 +1,3 @@
1
- from ._model_manager import model_manager
2
- from ._text_extractor import TextExtractor
3
-
4
- __all__ = ["model_manager", "TextExtractor"]
 
1
+ from ._model_manager import *
2
+ from ._preprocessor import *
3
+ from ._text_extractor import *
 
src/utils/_preprocessor.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+
3
+ import fitz
4
+ from fastapi import UploadFile
5
+ from PIL import Image, ImageEnhance, ImageFilter
6
+
7
+
8
class PDFPreprocessor:
    """Rebuilds uploaded PDFs from enhanced raster images of their pages."""

    @staticmethod
    async def preprocess(file: UploadFile) -> UploadFile:
        """Return a new upload whose pages are sharpened, contrast-boosted images.

        Every page of the input is rendered to a 300 DPI grayscale pixmap,
        passed through an unsharp mask and a 1.5x contrast enhancement, and
        inserted as a PNG image into a fresh page of the same width and height
        in a new document.

        Args:
            file: The uploaded PDF. Its stream is read here and is not rewound
                afterwards.

        Returns:
            An ``UploadFile`` wrapping an in-memory buffer (positioned at the
            start) that holds the rebuilt PDF, with the original filename and
            headers.

        Raises:
            Any exception raised while opening, rendering, or saving the
            document propagates to the caller; both documents are closed
            before it does.
        """
        content = await file.read()

        # NOTE(review): everything below runs synchronously (nothing is
        # awaited), so a large document occupies the event loop while it is
        # rendered -- confirm this is acceptable for expected file sizes.

        # Context managers guarantee both documents are closed even when a
        # page fails to render or the save fails; the original only closed
        # them on the success path.
        with fitz.open() as processed_doc:
            with fitz.open(stream=content, filetype="pdf") as doc:
                for page in doc:
                    pix = page.get_pixmap(dpi=300, colorspace=fitz.csGRAY)
                    img = Image.open(io.BytesIO(pix.tobytes()))
                    img = img.filter(
                        ImageFilter.UnsharpMask(radius=1, percent=150, threshold=3)
                    )
                    img = ImageEnhance.Contrast(img).enhance(1.5)

                    buf = io.BytesIO()
                    img.save(buf, format="PNG")

                    # Keep the original page geometry so the enhanced image
                    # fills a page of identical size.
                    new_page = processed_doc.new_page(
                        width=page.rect.width, height=page.rect.height
                    )
                    new_page.insert_image(new_page.rect, stream=buf.getvalue())

            # The source document is closed at this point, matching the
            # original order (close source, then save the rebuilt document).
            output_buf = io.BytesIO()
            processed_doc.save(output_buf)

        output_buf.seek(0)
        return UploadFile(
            file=output_buf,
            filename=file.filename,
            headers=file.headers,
        )