File size: 5,939 Bytes
e42e330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import asyncio
import re
import tempfile
from pathlib import Path
from typing import List

import aiofiles
import fitz
from fastapi import UploadFile
from loguru import logger

from src.utils import TextExtractor, model_manager


class PDFProcessorService:
    """Async PDF processor for handling both digital and scanned PDFs.

    Digital PDFs are read directly with PyMuPDF (``fitz``); scanned PDFs
    (no extractable text) are routed through the OCR model held by the
    shared ``model_manager``. Also usable as an async context manager.
    """

    def __init__(self):
        # Models are process-wide singletons owned by the centralized model
        # manager; make sure they are loaded before this service is used.
        self._ensure_models_loaded()

    def _ensure_models_loaded(self):
        """Trigger model loading through the shared model manager if needed."""
        if model_manager.models_loaded:
            return
        logger.info("🔄 Models not loaded, initializing model manager...")
        # Touching the property forces the manager's lazy initialization.
        _ = model_manager.doctr_model

    @property
    def doctr_model(self):
        """The doctr OCR model owned by the shared model manager."""
        return model_manager.doctr_model

    @property
    def device(self):
        """The compute device reported by the shared model manager.

        NOTE(review): presumably a torch device — confirm in src.utils.
        """
        return model_manager.device

    async def __aenter__(self):
        # Async context-manager entry: no per-context resources to acquire.
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # Nothing to release; models belong to the shared model manager.
        # Returning None lets any exception from the with-block propagate.
        pass

    async def is_pdf_scanned(self, pdf_path: str) -> bool:
        """Return True if the PDF has no extractable text on any page.

        A single page with non-whitespace text marks the document as
        digital. The blocking PyMuPDF scan runs in the default thread-pool
        executor so the event loop is not blocked.
        """

        def _check_scanned() -> bool:
            doc = fitz.open(pdf_path)
            try:
                for page in doc:
                    if page.get_text().strip():
                        return False
                return True
            finally:
                # fitz documents hold an open OS file handle; close it
                # explicitly instead of relying on garbage collection.
                doc.close()

        # get_running_loop() is the non-deprecated way to reach the loop
        # from inside a coroutine.
        return await asyncio.get_running_loop().run_in_executor(None, _check_scanned)

    async def save_uploaded_file(self, uploaded_file: UploadFile) -> str:
        """Copy an uploaded file into a named temp file and return its path.

        The temp file is created with ``delete=False``, so the caller is
        responsible for removing it once processing is finished.
        """
        # UploadFile.filename is optional; fall back to an empty suffix
        # rather than crashing on Path(None).
        suffix = Path(uploaded_file.filename or "").suffix
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            temp_path = tmp.name
        async with aiofiles.open(temp_path, "wb") as f:
            await f.write(await uploaded_file.read())
        return temp_path

    async def extract_text_from_digital_pdf(self, pdf_path: str) -> List[List[str]]:
        """Extract text from a digital PDF using PyMuPDF (``fitz``).

        Returns one list of cleaned, de-duplicated lines per page.

        Fixes over the previous version: the blocking extraction body is a
        plain (sync) function — passing an ``async def`` to
        ``run_in_executor`` only produced an un-awaited coroutine object —
        and each page's line list is appended once per page instead of once
        per line.
        """

        def _read_pages() -> List[str]:
            # Blocking PyMuPDF work happens in the executor thread.
            doc = fitz.open(pdf_path)
            try:
                return [page.get_text() for page in doc]
            finally:
                doc.close()

        page_texts = await asyncio.get_running_loop().run_in_executor(
            None, _read_pages
        )

        extracted_data: List[List[str]] = []
        for ptext in page_texts:
            if not ptext:
                continue
            data: List[str] = []
            for line in ptext.splitlines():
                cleaned_line = await self._split_on_repeated_pattern(line.strip())
                if cleaned_line:
                    data.append(cleaned_line[0])
            # One entry per page (previously appended inside the line loop).
            extracted_data.append(data)
        return extracted_data

    async def _split_on_repeated_pattern(
        self, line: str, min_space: int = 10
    ) -> List[str]:
        """Split line on repeated pattern."""
        import re
        from difflib import SequenceMatcher

        original_line = line.strip()

        # Find all spans of spaces >= min_space
        space_spans = [
            (m.start(), len(m.group()))
            for m in re.finditer(r" {%d,}" % min_space, original_line)
        ]

        if not space_spans:
            return [original_line]

        # Count how often each gap size occurs
        gaps = [span[1] for span in space_spans]
        gap_counts = {}
        for g in gaps:
            gap_counts[g] = gap_counts.get(g, 0) + 1

        # Sort gaps by size × count (more dominant gaps first)
        sorted_gaps = sorted(
            gap_counts.items(), key=lambda x: x[1] * x[0], reverse=True
        )

        # No significant gaps, return original
        if not sorted_gaps:
            return [original_line]

        dominant_gap = sorted_gaps[0][0]

        # Use the dominant large gap to split
        chunks = re.split(rf" {{%d,}}" % dominant_gap, original_line)

        # Check if it's actually repeated using fuzzy match
        base = chunks[0].strip()
        repeated = False
        for chunk in chunks[1:]:
            chunk = chunk.strip()
            if chunk and SequenceMatcher(None, base, chunk).ratio() > 0.8:
                repeated = True
                break

        return [base] if repeated else [original_line]

    async def process_pdf(self, file):
        """Save an uploaded PDF, pick an extraction strategy, return the lines.

        Scanned PDFs — and digital PDFs whose extracted text looks noisy —
        go through the OCR path; clean digital PDFs use direct bbox
        extraction. The temporary copy created for processing is always
        removed, even on failure.
        """
        pdf_path = await self.save_uploaded_file(file)
        try:
            is_scanned = await self.is_pdf_scanned(pdf_path)
            text_extractor = TextExtractor(self.doctr_model)
            if is_scanned:
                logger.info(f"{pdf_path} is likely a scanned PDF.")
                extracted_text_list = (
                    await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
                        pdf_path
                    )
                )
            else:
                logger.info(f"{pdf_path} is not a scanned PDF. Extracting text...")
                extracted_text_list = await text_extractor.extract_lines_with_bbox(
                    pdf_path
                )
                # Join all extracted lines once (linear) to judge noisiness,
                # instead of quadratic string concatenation.
                pdf_text = " ".join(
                    line["line"] for block in extracted_text_list for line in block
                )
                if text_extractor.is_text_noisy(pdf_text):
                    logger.info("Text is noisy. Extracting text again...")
                    extracted_text_list = (
                        await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
                            pdf_path
                        )
                    )
            return extracted_text_list
        finally:
            # save_uploaded_file creates the temp file with delete=False;
            # remove it here so repeated uploads do not leak disk space.
            Path(pdf_path).unlink(missing_ok=True)

    async def extract_entity(self, text: str):
        """Best-effort entity pick from *text* via the shared spaCy model.

        Punctuation is replaced by spaces before NER. Preference order:
        first ORG entity, then the first entity of any label, and finally
        the cleaned text itself when no entities were found.
        """
        cleaned = re.sub(r"[^\w\s]", " ", text)
        doc = model_manager.spacy_model(cleaned)
        entities = {ent.text: ent.label_ for ent in doc.ents}
        org = next(
            (name for name, label in entities.items() if label == "ORG"), None
        )
        if org is not None:
            return org
        if entities:
            return next(iter(entities))
        return cleaned