"""PDF processing service: saves uploads, detects scanned PDFs, and extracts text."""
import asyncio
import os
import re
import tempfile
from difflib import SequenceMatcher
from pathlib import Path
from typing import List

import aiofiles
import fitz
import torch
from fastapi import HTTPException, UploadFile
from loguru import logger

from src.utils import TextExtractor, model_manager
class PDFProcessorService:
    """Extract text from uploaded PDFs, using OCR for scanned documents.

    Digital PDFs are read directly with PyMuPDF (``fitz``); scanned PDFs —
    and digital PDFs whose extracted text looks noisy — fall back to the
    docTR OCR model held by the shared ``model_manager``. OCR requires CUDA;
    without it a 400 ``HTTPException`` is raised.
    """

    def __init__(self):
        logger.info("Initializing PDFProcessorService")
        self._ensure_models_loaded()

    def _ensure_models_loaded(self):
        # Touch the lazy doctr_model property so the expensive model load
        # happens once at service construction instead of on first request.
        if not model_manager.models_loaded:
            logger.info("Models not loaded, initializing model manager...")
            _ = model_manager.doctr_model
            logger.debug("Model manager initialization completed")

    @property
    def doctr_model(self):
        # Delegate to the shared manager so every service reuses one model.
        return model_manager.doctr_model

    @property
    def device(self):
        return model_manager.device

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # No per-instance resources are held between calls; nothing to release.
        pass

    async def is_pdf_scanned(self, pdf_path: str) -> bool:
        """Return True when no page of the PDF has an extractable text layer.

        The blocking PyMuPDF work runs in the default executor so the event
        loop is not stalled by large documents.
        """
        logger.debug(f"Checking if PDF is scanned: {pdf_path}")

        def _check_scanned() -> bool:
            try:
                # Context manager closes the document deterministically
                # (the original leaked the fitz document handle).
                with fitz.open(pdf_path) as doc:
                    for page in doc:
                        if page.get_text().strip():
                            return False
                return True
            except Exception as e:
                logger.error(f"Error checking if PDF is scanned: {e}")
                raise

        return await asyncio.get_running_loop().run_in_executor(None, _check_scanned)

    async def save_uploaded_file(self, uploaded_file: "UploadFile") -> str:
        """Persist an upload to a named temporary file and return its path.

        The caller owns the returned path and must delete it (see
        ``process_pdf``). On failure the partially written temp file is
        removed before the exception propagates.
        """
        logger.info(f"Saving uploaded file: {uploaded_file.filename}")
        temp_path = None
        try:
            suffix = Path(uploaded_file.filename).suffix
            # NamedTemporaryFile only reserves a name with the right suffix;
            # the content is written asynchronously below via aiofiles.
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                temp_path = tmp.name
            async with aiofiles.open(temp_path, "wb") as f:
                await f.write(await uploaded_file.read())
            logger.debug(f"File saved to temporary path: {temp_path}")
            return temp_path
        except Exception as e:
            logger.error(f"Error saving uploaded file: {e}")
            # Don't leave an orphaned temp file behind on failure.
            if temp_path and os.path.exists(temp_path):
                os.remove(temp_path)
            raise

    async def extract_text_from_digital_pdf(self, pdf_path: str) -> List[List[str]]:
        """Extract per-page lists of cleaned text lines from a digital PDF.

        Bug fix: the original handed an ``async def`` closure to
        ``run_in_executor``, which merely *returned* an un-awaited coroutine
        object instead of the extracted text. The blocking PyMuPDF read now
        runs in the executor; the (cheap) per-line cleanup is awaited on the
        event loop.
        """
        logger.debug(f"Extracting text from digital PDF: {pdf_path}")

        def _read_pages() -> List[str]:
            # Blocking I/O only; closes the document when done.
            with fitz.open(pdf_path) as doc:
                return [page.get_text() for page in doc]

        try:
            pages = await asyncio.get_running_loop().run_in_executor(
                None, _read_pages
            )
            extracted_data: List[List[str]] = []
            for ptext in pages:
                # Pages with no text layer contribute no entry at all
                # (matches the original control flow).
                if not ptext:
                    continue
                data = []
                for line in ptext.splitlines():
                    cleaned_line = await self._split_on_repeated_pattern(
                        line.strip()
                    )
                    if cleaned_line:
                        data.append(cleaned_line[0])
                extracted_data.append(data)
            logger.info(
                f"Successfully extracted text from {len(extracted_data)} pages"
            )
            return extracted_data
        except Exception as e:
            logger.error(f"Error extracting text from digital PDF: {e}")
            raise

    async def _split_on_repeated_pattern(
        self, line: str, min_space: int = 10
    ) -> List[str]:
        """Collapse a line that repeats the same content across wide gaps.

        Lines such as ``"Invoice          Invoice"`` (typical of multi-column
        PDF extraction) are reduced to a single copy. Returns a one-element
        list: ``[first_chunk]`` when a later chunk is a near-duplicate
        (SequenceMatcher ratio > 0.8) of the first, else ``[original_line]``.
        """
        original_line = line.strip()
        # Runs of at least `min_space` spaces are candidate column gaps.
        space_spans = [
            (m.start(), len(m.group()))
            for m in re.finditer(r" {%d,}" % min_space, original_line)
        ]
        if not space_spans:
            return [original_line]
        # Pick the "dominant" gap, weighting gap width by its frequency.
        gap_counts = {}
        for _, width in space_spans:
            gap_counts[width] = gap_counts.get(width, 0) + 1
        dominant_gap = max(gap_counts.items(), key=lambda kv: kv[0] * kv[1])[0]
        # Plain %-format regex (the original mixed f-string braces with
        # %-interpolation, which was correct but hard to read).
        chunks = re.split(r" {%d,}" % dominant_gap, original_line)
        base = chunks[0].strip()
        for chunk in chunks[1:]:
            chunk = chunk.strip()
            if chunk and SequenceMatcher(None, base, chunk).ratio() > 0.8:
                return [base]
        return [original_line]

    async def _run_ocr_extraction(self, text_extractor, pdf_path: str):
        """Run OCR extraction, failing fast with a 400 when CUDA is absent."""
        if not torch.cuda.is_available():
            raise HTTPException(
                status_code=400, detail="Scanned PDFs are not supported."
            )
        return await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
            pdf_path
        )

    async def process_pdf(self, file):
        """Save, classify (scanned vs digital) and extract text from an upload.

        Returns the extractor's list of text blocks. The temporary copy of
        the upload is always deleted, even on failure.

        Raises:
            HTTPException(400): when OCR is required but CUDA is unavailable.
        """
        logger.info(f"Processing PDF file: {file.filename}")
        pdf_path = None  # bound before try so the finally block is always safe
        try:
            pdf_path = await self.save_uploaded_file(file)
            is_scanned = await self.is_pdf_scanned(pdf_path)
            text_extractor = TextExtractor(self.doctr_model)
            if is_scanned:
                logger.info(f"PDF {pdf_path} is scanned, using OCR extraction")
                extracted_text_list = await self._run_ocr_extraction(
                    text_extractor, pdf_path
                )
            else:
                logger.info(f"PDF {pdf_path} is digital, extracting text directly")
                extracted_text_list = await text_extractor.extract_lines_with_bbox(
                    pdf_path
                )
                # Digital extraction can still yield garbage (bad font
                # encodings, etc.); fall back to OCR when the text is noisy.
                # The original also re-checked OCR output and re-ran the same
                # OCR on it — a no-op re-run, now skipped.
                pdf_text = "".join(
                    " " + line["line"]
                    for block in extracted_text_list
                    for line in block
                )
                if text_extractor.is_text_noisy(pdf_text):
                    logger.warning("Text is noisy, falling back to OCR extraction")
                    extracted_text_list = await self._run_ocr_extraction(
                        text_extractor, pdf_path
                    )
            logger.info(
                f"Successfully processed PDF with {len(extracted_text_list)} text blocks"
            )
            return extracted_text_list
        except Exception as e:
            logger.error(f"Error processing PDF: {e}")
            raise
        finally:
            # Always clean up; pdf_path is None when saving itself failed
            # (the original crashed here with UnboundLocalError).
            if pdf_path and os.path.exists(pdf_path):
                os.remove(pdf_path)

    async def extract_entity(self, text: str):
        """Return the most relevant named entity in ``text`` (ORG preferred).

        Falls back to the first entity of any type, and to the
        punctuation-stripped input itself when spaCy finds nothing or raises.
        """
        logger.debug(f"Extracting entities from text: {text[:100]}...")
        try:
            # Punctuation confuses NER on short, header-like strings.
            text = re.sub(r"[^\w\s]", " ", text)
            doc = model_manager.spacy_model(text)
            entities = {ent.text: ent.label_ for ent in doc.ents}
            for key, value in entities.items():
                if value == "ORG":
                    logger.info(f"Found organization entity: {key}")
                    return key
            if entities:
                entity = next(iter(entities))
                logger.info(f"Found entity: {entity}")
                return entity
            logger.debug("No entities found, returning original text")
            return text
        except Exception as e:
            logger.error(f"Error extracting entities: {e}")
            return text