kanha-upadhyay committed on
Commit
698a3c4
·
1 Parent(s): 6874c0e

Add support for CUDA availability check in PDF processing and raise HTTPException for unsupported scanned PDFs

Browse files
src/services/_pdf_processor_service.py CHANGED
@@ -7,7 +7,8 @@ from typing import List
7
 
8
  import aiofiles
9
  import fitz
10
- from fastapi import UploadFile
 
11
  from loguru import logger
12
 
13
  from src.utils import TextExtractor, model_manager
@@ -151,6 +152,10 @@ class PDFProcessorService:
151
  text_extractor = TextExtractor(self.doctr_model)
152
 
153
  if is_scanned:
 
 
 
 
154
  logger.info(f"PDF {pdf_path} is scanned, using OCR extraction")
155
  extracted_text_list = (
156
  await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
@@ -168,6 +173,10 @@ class PDFProcessorService:
168
  pdf_text += " " + line["line"]
169
  text_noisy = text_extractor.is_text_noisy(pdf_text)
170
  if text_noisy:
 
 
 
 
171
  logger.warning("Text is noisy, falling back to OCR extraction")
172
  extracted_text_list = (
173
  await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
 
7
 
8
  import aiofiles
9
  import fitz
10
+ import torch
11
+ from fastapi import HTTPException, UploadFile
12
  from loguru import logger
13
 
14
  from src.utils import TextExtractor, model_manager
 
152
  text_extractor = TextExtractor(self.doctr_model)
153
 
154
  if is_scanned:
155
+ if not torch.cuda.is_available():
156
+ raise HTTPException(
157
+ status_code=400, detail="Scanned PDFs are not supported."
158
+ )
159
  logger.info(f"PDF {pdf_path} is scanned, using OCR extraction")
160
  extracted_text_list = (
161
  await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
 
173
  pdf_text += " " + line["line"]
174
  text_noisy = text_extractor.is_text_noisy(pdf_text)
175
  if text_noisy:
176
+ if not torch.cuda.is_available():
177
+ raise HTTPException(
178
+ status_code=400, detail="Scanned PDFs are not supported."
179
+ )
180
  logger.warning("Text is noisy, falling back to OCR extraction")
181
  extracted_text_list = (
182
  await text_extractor.extract_lines_with_bbox_from_scanned_pdf(