File size: 4,656 Bytes
4e71548
2a728d0
 
 
 
 
4e71548
2a728d0
 
 
 
4e71548
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import asyncio
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import fitz  # PyMuPDF
import cv2
import numpy as np
from PIL import Image
from config.config import settings
from models.account_models import LineData, WordData
from utils import model_manager
from pdf2image import convert_from_path
from doctr.models import ocr_predictor
from doctr.io import DocumentFile
import torch


class PDFProcessor:
    """Async PDF processor for handling both digital and scanned PDFs.

    Blocking work (PyMuPDF, PyPDF2, file writes) is pushed onto the event
    loop's default executor so the coroutines here do not block the loop.
    The OCR model and device are not owned by this class; they are read
    from the shared ``model_manager`` module.
    """

    def __init__(self):
        # Use the centralized model manager
        self._ensure_models_loaded()

    def _ensure_models_loaded(self):
        """Ensure models are loaded via the model manager."""
        if not model_manager.models_loaded:
            print("🔄 Models not loaded, initializing model manager...")
            # Accessing the attribute is expected to trigger loading if it
            # has not happened yet (model_manager is defined elsewhere --
            # confirm against its implementation).
            _ = model_manager.doctr_model

    @property
    def doctr_model(self):
        """Get the loaded doctr model from model manager."""
        return model_manager.doctr_model

    @property
    def device(self):
        """Get the device being used from model manager."""
        return model_manager.device

    async def __aenter__(self):
        """Enter the async context; returns this processor unchanged."""
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        """Exit the async context; nothing to release, exceptions propagate."""

    async def is_pdf_scanned(self, pdf_path: str) -> bool:
        """Check if PDF is scanned (no extractable text).

        Args:
            pdf_path: Path of the PDF to inspect.

        Returns:
            ``False`` as soon as any page yields non-whitespace text,
            ``True`` otherwise (including a document with no pages).
        """
        def _check_scanned() -> bool:
            # The context manager closes the document on every path,
            # including the short-circuit when text is found early.
            with fitz.open(pdf_path) as doc:
                return not any(page.get_text().strip() for page in doc)

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _check_scanned)

    async def save_uploaded_file(self, uploaded_file) -> str:
        """Save uploaded file to temporary location.

        Args:
            uploaded_file: Object exposing ``read()`` that returns bytes.

        Returns:
            The path written to, i.e. ``settings.temp_file_name``.

        NOTE(review): every call writes to the same ``settings.temp_file_name``
        path, so overlapping calls would overwrite each other -- confirm
        callers never run this concurrently.
        """
        def _save_file() -> str:
            target = settings.temp_file_name
            with open(target, "wb") as out:
                out.write(uploaded_file.read())
            return target

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _save_file)

    async def extract_text_from_digital_pdf(self, pdf_path: str) -> List[List[str]]:
        """Extract text from digital PDF using PyPDF2.

        Args:
            pdf_path: Path of the PDF to read.

        Returns:
            One list of lines per page that produced text. Pages whose
            extraction is empty are skipped entirely, so the outer index
            is not necessarily the page number. Each line is stripped and
            passed through ``_split_on_repeated_pattern`` to collapse
            duplicated content.
        """
        from PyPDF2 import PdfReader

        def _extract_text() -> List[List[str]]:
            reader = PdfReader(pdf_path)
            pages: List[List[str]] = []

            for page in reader.pages:
                page_text = page.extract_text()
                if not page_text:
                    continue

                lines = []
                for raw_line in page_text.splitlines():
                    parts = self._split_on_repeated_pattern(raw_line.strip())
                    if parts:
                        lines.append(parts[0])
                pages.append(lines)

            return pages

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _extract_text)

    def _split_on_repeated_pattern(self, line: str, min_space: int = 10) -> List[str]:
        """Collapse a line whose content is repeated across wide space gaps.

        The line is split on its dominant run of spaces; if any later chunk
        is a near-duplicate of the first chunk, only the first chunk is kept.

        Args:
            line: Text line to inspect; surrounding whitespace is ignored.
            min_space: Minimum run of consecutive spaces treated as a gap.

        Returns:
            A single-element list: the first chunk when a repetition is
            detected, otherwise the stripped line unchanged.
        """
        import re
        from collections import Counter
        from difflib import SequenceMatcher

        text = line.strip()
        gap_pattern = r" {%d,}"

        # Tally the length of every run of at least `min_space` spaces.
        gap_counts = Counter(
            len(match.group())
            for match in re.finditer(gap_pattern % min_space, text)
        )
        if not gap_counts:
            return [text]

        # The dominant gap maximizes size x count; ties go to the gap
        # length seen first (Counter keeps insertion order).
        dominant_gap = max(gap_counts, key=lambda gap: gap * gap_counts[gap])

        # Split on any run at least as long as the dominant gap.
        first, *others = re.split(gap_pattern % dominant_gap, text)
        base = first.strip()

        # Fuzzy comparison tolerates small extraction differences
        # between the repeated copies.
        repeated = any(
            chunk.strip()
            and SequenceMatcher(None, base, chunk.strip()).ratio() > 0.8
            for chunk in others
        )

        return [base] if repeated else [text]