ocr-engine-3

Sleeping

kanha-upadhyay committed on Nov 7, 2025

Commit

698a3c4

1 Parent(s): 6874c0e

Add a CUDA availability check to PDF processing and raise an HTTPException (400) when a PDF requires OCR (scanned, or extracted text is noisy) but CUDA is unavailable

Files changed (1) hide show

src/services/_pdf_processor_service.py CHANGED Viewed

@@ -7,7 +7,8 @@ from typing import List
 import aiofiles
 import fitz
-from fastapi import UploadFile
 from loguru import logger
 from src.utils import TextExtractor, model_manager
@@ -151,6 +152,10 @@ class PDFProcessorService:
             text_extractor = TextExtractor(self.doctr_model)
             if is_scanned:
                 logger.info(f"PDF {pdf_path} is scanned, using OCR extraction")
                 extracted_text_list = (
                     await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
@@ -168,6 +173,10 @@ class PDFProcessorService:
                         pdf_text += " " + line["line"]
                 text_noisy = text_extractor.is_text_noisy(pdf_text)
                 if text_noisy:
                     logger.warning("Text is noisy, falling back to OCR extraction")
                     extracted_text_list = (
                         await text_extractor.extract_lines_with_bbox_from_scanned_pdf(

 import aiofiles
 import fitz
+import torch
+from fastapi import HTTPException, UploadFile
 from loguru import logger
 from src.utils import TextExtractor, model_manager
             text_extractor = TextExtractor(self.doctr_model)
             if is_scanned:
+                if not torch.cuda.is_available():
+                    raise HTTPException(
+                        status_code=400, detail="Scanned PDFs are not supported."
+                    )
                 logger.info(f"PDF {pdf_path} is scanned, using OCR extraction")
                 extracted_text_list = (
                     await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
                         pdf_text += " " + line["line"]
                 text_noisy = text_extractor.is_text_noisy(pdf_text)
                 if text_noisy:
+                    if not torch.cuda.is_available():
+                        raise HTTPException(
+                            status_code=400, detail="Scanned PDFs are not supported."
+                        )
                     logger.warning("Text is noisy, falling back to OCR extraction")
                     extracted_text_list = (
                         await text_extractor.extract_lines_with_bbox_from_scanned_pdf(