Spaces:
Sleeping
Sleeping
Commit
·
698a3c4
1
Parent(s):
6874c0e
Add a CUDA availability check to PDF processing: raise HTTPException (400, "Scanned PDFs are not supported.") when OCR is needed (scanned PDF or noisy extracted text) but CUDA is unavailable
Browse files
src/services/_pdf_processor_service.py
CHANGED
|
@@ -7,7 +7,8 @@ from typing import List
|
|
| 7 |
|
| 8 |
import aiofiles
|
| 9 |
import fitz
|
| 10 |
-
|
|
|
|
| 11 |
from loguru import logger
|
| 12 |
|
| 13 |
from src.utils import TextExtractor, model_manager
|
|
@@ -151,6 +152,10 @@ class PDFProcessorService:
|
|
| 151 |
text_extractor = TextExtractor(self.doctr_model)
|
| 152 |
|
| 153 |
if is_scanned:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
logger.info(f"PDF {pdf_path} is scanned, using OCR extraction")
|
| 155 |
extracted_text_list = (
|
| 156 |
await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
|
|
@@ -168,6 +173,10 @@ class PDFProcessorService:
|
|
| 168 |
pdf_text += " " + line["line"]
|
| 169 |
text_noisy = text_extractor.is_text_noisy(pdf_text)
|
| 170 |
if text_noisy:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
logger.warning("Text is noisy, falling back to OCR extraction")
|
| 172 |
extracted_text_list = (
|
| 173 |
await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
|
|
|
|
| 7 |
|
| 8 |
import aiofiles
|
| 9 |
import fitz
|
| 10 |
+
import torch
|
| 11 |
+
from fastapi import HTTPException, UploadFile
|
| 12 |
from loguru import logger
|
| 13 |
|
| 14 |
from src.utils import TextExtractor, model_manager
|
|
|
|
| 152 |
text_extractor = TextExtractor(self.doctr_model)
|
| 153 |
|
| 154 |
if is_scanned:
|
| 155 |
+
if not torch.cuda.is_available():
|
| 156 |
+
raise HTTPException(
|
| 157 |
+
status_code=400, detail="Scanned PDFs are not supported."
|
| 158 |
+
)
|
| 159 |
logger.info(f"PDF {pdf_path} is scanned, using OCR extraction")
|
| 160 |
extracted_text_list = (
|
| 161 |
await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
|
|
|
|
| 173 |
pdf_text += " " + line["line"]
|
| 174 |
text_noisy = text_extractor.is_text_noisy(pdf_text)
|
| 175 |
if text_noisy:
|
| 176 |
+
if not torch.cuda.is_available():
|
| 177 |
+
raise HTTPException(
|
| 178 |
+
status_code=400, detail="Scanned PDFs are not supported."
|
| 179 |
+
)
|
| 180 |
logger.warning("Text is noisy, falling back to OCR extraction")
|
| 181 |
extracted_text_list = (
|
| 182 |
await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
|