kanha-upadhyay commited on
Commit
e42e330
·
0 Parent(s):
.env.example ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ CORS_ALLOW_ORIGINS=http://localhost, http://127.0.0.1
2
+
3
+ SPACY_MODEL_NAME=en_core_web_trf
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
133
+
134
+ # pytype static type analyzer
135
+ .pytype/
136
+
137
+ # Cython debug symbols
138
+ cython_debug/
139
+
140
+ # Formatter cache files
141
+ .mypy_cache/
142
+ .black_cache/
143
+ .ruff_cache/
144
+
145
+ .vscode
146
+ *.db
Dockerfile ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12-slim

WORKDIR /app

# Install system dependencies including those needed for OCR and ML models:
# poppler-utils for pdf2image, the libgl/libglib/X11 libraries for image
# handling, and libgomp1 for torch's OpenMP runtime.
RUN apt-get update && apt-get install -y \
    curl \
    poppler-utils \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Install poetry
RUN pip install poetry

# Configure poetry: install straight into the system interpreter — no
# virtualenv is needed inside the container.
RUN poetry config virtualenvs.create false

# Copy dependency files first so the dependency layer is cached independently
# of source-code changes.
COPY pyproject.toml poetry.lock* /app/

# Install dependencies
RUN poetry install --only main --no-root

# Download spacy model
# NOTE(review): the image bakes in en_core_web_sm and sets SPACY_MODEL_NAME
# to match, while .env.example suggests en_core_web_trf — confirm which model
# is intended for production.
RUN python -m spacy download en_core_web_sm

# Create user (non-root for runtime)
RUN useradd -m -u 1000 appuser

# Copy source code
COPY --chown=appuser src /app/src
COPY --chown=appuser main.py /app/

# Change ownership
RUN chown -R appuser /app

USER appuser

EXPOSE 8001

ENV PYTHONUNBUFFERED=1
ENV SPACY_MODEL_NAME=en_core_web_sm

CMD ["python", "main.py"]
main.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
if __name__ == "__main__":
    import os

    import uvicorn

    # ``reload=True`` spawns a file-watcher subprocess: useful in development,
    # wasteful in the container (this script is the Docker CMD).  Allow it to
    # be switched off via UVICORN_RELOAD; the default preserves the original
    # behaviour.
    reload_enabled = os.getenv("UVICORN_RELOAD", "true").strip().lower() in (
        "1",
        "true",
        "yes",
    )
    uvicorn.run("src.app:app", host="0.0.0.0", port=8001, reload=reload_enabled)
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "parser"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = [
6
+ {name = "Kanha Upadhyay",email = "kanha.upadhyay@sifars.com"}
7
+ ]
8
+ readme = "README.md"
9
+ requires-python = "3.12.*"
10
+ dependencies = [
11
+ "fastapi (>=0.116.1,<0.117.0)",
12
+ "uvicorn (>=0.35.0,<0.36.0)",
13
+ "loguru (>=0.7.3,<0.8.0)",
14
+ "python-dotenv (>=1.1.1,<2.0.0)",
15
+ "pymupdf (>=1.26.3,<2.0.0)",
16
+ "pdf2image (>=1.17.0,<2.0.0)",
17
+ "torch (>=2.8.0,<3.0.0)",
18
+ "fuzzywuzzy (>=0.18.0,<0.19.0)",
19
+ "spacy (>=3.8.7,<4.0.0)",
20
+ "python-doctr (>=1.0.0,<2.0.0)",
21
+ "aiofiles (>=24.1.0,<25.0.0)",
22
+ "numpy (>=1.24.0,<2.0.0)",
23
+ "python-multipart (>=0.0.9,<0.1.0)"
24
+ ]
25
+
26
+
27
+ [build-system]
28
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
29
+ build-backend = "poetry.core.masonry.api"
src/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# Re-export the FastAPI application object so callers may simply write
# ``from src import app``.
from .app import app

__all__ = ["app"]
src/app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""FastAPI application entry point: env loading, CORS, health check, routing."""

import os

from dotenv import load_dotenv

# ===========================
# !!! ATTENTION !!!
# KEEP THIS AT THE TOP TO ENSURE ENVIRONMENT VARIABLES ARE LOADED BEFORE ANY IMPORTS
# ===========================
load_dotenv()

from contextlib import asynccontextmanager

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from loguru import logger

from src.controllers import api_router
from src.utils import model_manager

# NOTE: ``import os`` previously appeared twice (before and after
# ``load_dotenv``); the duplicate has been removed.


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: pre-load all ML models before serving.

    Raises whatever ``ensure_models_loaded`` raises so a broken model setup
    fails startup loudly instead of serving 500s later.
    """
    try:
        logger.info("Starting up the application...")
        await model_manager.ensure_models_loaded()
        logger.info("Application started successfully...")
        yield
    except Exception as e:
        logger.error(f"Error during startup: {str(e)}")
        raise
    finally:
        logger.info("Application shutdown complete.")


app = FastAPI(lifespan=lifespan)


# Allowed origins come from CORS_ALLOW_ORIGINS (comma-separated); the default
# mirrors .env.example.
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        origin.strip()
        for origin in os.getenv(
            "CORS_ALLOW_ORIGINS", "http://localhost, http://127.0.0.1"
        ).split(",")
    ],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/")
async def check_health():
    """Liveness probe: returns a static payload when the service is up."""
    return {"response": "Service is healthy!"}


app.include_router(api_router, prefix="/api")
src/controllers/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
from fastapi import APIRouter

from ._parser_controller import ParserController

# Version-1 API router: every parser endpoint is mounted under /v1/parser.
api_router = APIRouter(prefix="/v1")
api_router.include_router(ParserController().router, prefix="/parser", tags=["parser"])

__all__ = ["api_router"]
src/controllers/_parser_controller.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import APIRouter, Body, File, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from loguru import logger
from pydantic import BaseModel

from src.services import PDFProcessorService


class EntityExtractorSchema(BaseModel):
    """Request body for POST /entity: the raw text to run NER over."""

    text: str


class ParserController:
    """Groups the parser endpoints (POST /pdf, POST /entity) on one router."""

    def __init__(self):
        self.router = APIRouter()
        self.service = PDFProcessorService()
        self.router.add_api_route("/pdf", self.parse_pdf, methods=["POST"])
        self.router.add_api_route("/entity", self.extract_entity, methods=["POST"])

    async def parse_pdf(self, file: UploadFile = File(...)):
        """Parse an uploaded PDF and return extracted lines with bboxes.

        Raises 400 for a missing/non-PDF upload, 500 for processing errors.
        """
        try:
            if not file:
                raise HTTPException(status_code=400, detail="No file uploaded")
            if file.content_type != "application/pdf":
                raise HTTPException(status_code=400, detail="Invalid file type")
            async with self.service as processor:
                extracted_data = await processor.process_pdf(file)
            return JSONResponse(content={"data": extracted_data})
        except HTTPException:
            # Bare ``raise`` (not ``raise e``) keeps the original traceback.
            raise
        except Exception as e:
            logger.exception(e)
            raise HTTPException(
                status_code=500,
                detail=str(e),
            )

    async def extract_entity(
        self, entity_extractor_schema: EntityExtractorSchema = Body(...)
    ):
        """Run spaCy NER over raw text and return the chosen entity."""
        try:
            extracted_entity = await self.service.extract_entity(
                entity_extractor_schema.text
            )
            return JSONResponse(content={"data": extracted_entity})
        except HTTPException:
            raise
        except Exception as e:
            logger.exception(e)
            raise HTTPException(
                status_code=500,
                detail=str(e),
            )
src/services/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
from ._pdf_processor_service import PDFProcessorService

# BUG FIX: this was previously ``all = [...]`` — a typo that shadowed the
# ``all`` builtin and declared nothing; ``__all__`` is the correct dunder.
__all__ = ["PDFProcessorService"]
src/services/_pdf_processor_service.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import re
3
+ import tempfile
4
+ from pathlib import Path
5
+ from typing import List
6
+
7
+ import aiofiles
8
+ import fitz
9
+ from fastapi import UploadFile
10
+ from loguru import logger
11
+
12
+ from src.utils import TextExtractor, model_manager
13
+
14
+
15
+ class PDFProcessorService:
16
+ """Async PDF processor for handling both digital and scanned PDFs."""
17
+
18
+ def __init__(self):
19
+ # Use the centralized model manager
20
+ self._ensure_models_loaded()
21
+
22
+ def _ensure_models_loaded(self):
23
+ """Ensure models are loaded via the model manager."""
24
+ if not model_manager.models_loaded:
25
+ logger.info("🔄 Models not loaded, initializing model manager...")
26
+ # This will trigger model loading if not already done
27
+ _ = model_manager.doctr_model
28
+
29
+ @property
30
+ def doctr_model(self):
31
+ """Get the loaded doctr model from model manager."""
32
+ return model_manager.doctr_model
33
+
34
+ @property
35
+ def device(self):
36
+ """Get the device being used from model manager."""
37
+ return model_manager.device
38
+
39
+ async def __aenter__(self):
40
+ return self
41
+
42
+ async def __aexit__(self, exc_type, exc_value, traceback):
43
+ pass
44
+
45
+ async def is_pdf_scanned(self, pdf_path: str) -> bool:
46
+ """Check if PDF is scanned (no extractable text)."""
47
+
48
+ def _check_scanned():
49
+ doc = fitz.open(pdf_path)
50
+ for page in doc:
51
+ text = page.get_text()
52
+ if text.strip():
53
+ return False
54
+ return True
55
+
56
+ return await asyncio.get_event_loop().run_in_executor(None, _check_scanned)
57
+
58
+ async def save_uploaded_file(self, uploaded_file: UploadFile) -> str:
59
+ file_name = uploaded_file.filename
60
+ suffix = Path(file_name).suffix
61
+ with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
62
+ temp_path = tmp.name
63
+ async with aiofiles.open(temp_path, "wb") as f:
64
+ await f.write(await uploaded_file.read())
65
+ return temp_path
66
+
67
+ async def extract_text_from_digital_pdf(self, pdf_path: str) -> List[List[str]]:
68
+ """Extract text from digital PDF using PyPDF2."""
69
+
70
+ async def _extract_text():
71
+ doc = fitz.open(pdf_path)
72
+ extracted_data = []
73
+
74
+ for page in doc:
75
+ ptext = page.get_text()
76
+ if ptext:
77
+ data = []
78
+ for line in ptext.splitlines():
79
+ cleaned_line = await self._split_on_repeated_pattern(
80
+ line.strip()
81
+ )
82
+ if cleaned_line:
83
+ data.append(cleaned_line[0])
84
+ extracted_data.append(data)
85
+
86
+ return extracted_data
87
+
88
+ return await asyncio.get_event_loop().run_in_executor(None, _extract_text)
89
+
90
+ async def _split_on_repeated_pattern(
91
+ self, line: str, min_space: int = 10
92
+ ) -> List[str]:
93
+ """Split line on repeated pattern."""
94
+ import re
95
+ from difflib import SequenceMatcher
96
+
97
+ original_line = line.strip()
98
+
99
+ # Find all spans of spaces >= min_space
100
+ space_spans = [
101
+ (m.start(), len(m.group()))
102
+ for m in re.finditer(r" {%d,}" % min_space, original_line)
103
+ ]
104
+
105
+ if not space_spans:
106
+ return [original_line]
107
+
108
+ # Count how often each gap size occurs
109
+ gaps = [span[1] for span in space_spans]
110
+ gap_counts = {}
111
+ for g in gaps:
112
+ gap_counts[g] = gap_counts.get(g, 0) + 1
113
+
114
+ # Sort gaps by size × count (more dominant gaps first)
115
+ sorted_gaps = sorted(
116
+ gap_counts.items(), key=lambda x: x[1] * x[0], reverse=True
117
+ )
118
+
119
+ # No significant gaps, return original
120
+ if not sorted_gaps:
121
+ return [original_line]
122
+
123
+ dominant_gap = sorted_gaps[0][0]
124
+
125
+ # Use the dominant large gap to split
126
+ chunks = re.split(rf" {{%d,}}" % dominant_gap, original_line)
127
+
128
+ # Check if it's actually repeated using fuzzy match
129
+ base = chunks[0].strip()
130
+ repeated = False
131
+ for chunk in chunks[1:]:
132
+ chunk = chunk.strip()
133
+ if chunk and SequenceMatcher(None, base, chunk).ratio() > 0.8:
134
+ repeated = True
135
+ break
136
+
137
+ return [base] if repeated else [original_line]
138
+
139
+ async def process_pdf(self, file):
140
+ pdf_path = await self.save_uploaded_file(file)
141
+ is_scanned = await self.is_pdf_scanned(pdf_path)
142
+ text_extractor = TextExtractor(self.doctr_model)
143
+ if is_scanned:
144
+ logger.info(f"{pdf_path} is likely a scanned PDF.")
145
+ extracted_text_list = (
146
+ await text_extractor.extract_lines_with_bbox_from_scanned_pdf(pdf_path)
147
+ )
148
+ else:
149
+ logger.info(f"{pdf_path} is not a scanned PDF. Extracting text...")
150
+ extracted_text_list = await text_extractor.extract_lines_with_bbox(pdf_path)
151
+ pdf_text = ""
152
+ for block in extracted_text_list:
153
+ for line in block:
154
+ pdf_text += " " + line["line"]
155
+ text_noisy = text_extractor.is_text_noisy(pdf_text)
156
+ if text_noisy:
157
+ logger.info("Text is noisy. Extracting text again...")
158
+ extracted_text_list = (
159
+ await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
160
+ pdf_path
161
+ )
162
+ )
163
+ return extracted_text_list
164
+
165
+ async def extract_entity(self, text: str):
166
+ text = re.sub(r"[^\w\s]", " ", text)
167
+ doc = model_manager.spacy_model(text)
168
+ entities = {ent.text: ent.label_ for ent in doc.ents}
169
+ for key, value in entities.items():
170
+ if value == "ORG":
171
+ return key
172
+ if entities:
173
+ return list(entities.keys())[0]
174
+ return text
src/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from ._model_manager import model_manager
2
+ from ._text_extractor import TextExtractor
3
+
4
+ __all__ = ["model_manager", "TextExtractor"]
src/utils/_model_manager.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import spacy
4
+ import torch
5
+ from doctr.models import ocr_predictor
6
+ from loguru import logger
7
+
8
+
9
class ModelManager:
    """Singleton that pre-loads the OCR and NER models once per process.

    All state lives on the class, so every instantiation shares the same
    models; ``model_manager`` below is the canonical shared instance.
    """

    _instance = None
    _doctr_model = None
    _spacy_model = None
    _device = None
    _models_loaded = False

    def __new__(cls):
        # Classic singleton: always hand back the single shared instance.
        if cls._instance is None:
            cls._instance = super(ModelManager, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        # Intentionally empty: state is class-level and loaded lazily via
        # ensure_models_loaded().
        pass

    async def _load_models(self):
        """Load all models.

        NOTE: despite being ``async`` this loads synchronously and blocks the
        event loop for the duration; it is only invoked once at startup (from
        the FastAPI lifespan hook).
        """
        logger.info("🚀 Starting model pre-loading...")

        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"📱 Using device: {self._device}")

        # Load doctr model and move both detector and recognizer to the device.
        logger.info("🔄 Loading doctr OCR model...")
        self._doctr_model = ocr_predictor(pretrained=True)
        self._doctr_model.det_predictor.model = (
            self._doctr_model.det_predictor.model.to(self._device)
        )
        self._doctr_model.reco_predictor.model = (
            self._doctr_model.reco_predictor.model.to(self._device)
        )
        logger.info("✅ Doctr model loaded successfully!")

        # Load spaCy model; the name is configurable via SPACY_MODEL_NAME.
        # (The log call was an f-string with no placeholders — F541 — now a
        # plain string with identical output.)
        self._spacy_model = spacy.load(os.getenv("SPACY_MODEL_NAME", "en_core_web_trf"))
        logger.info("✅ spaCy model loaded successfully!")
        self._models_loaded = True
        logger.info("🎉 All models loaded successfully!")

    @property
    def doctr_model(self):
        """The loaded doctr model, or None before loading."""
        return self._doctr_model

    @property
    def spacy_model(self):
        """The loaded spaCy pipeline, or None before loading."""
        return self._spacy_model

    @property
    def device(self):
        """The torch device in use, or None before loading."""
        return self._device

    @property
    def models_loaded(self):
        """True once _load_models has completed successfully."""
        return self._models_loaded

    async def ensure_models_loaded(self):
        """Idempotently load the models; safe to call multiple times."""
        if not self._models_loaded:
            await self._load_models()
        return True

    async def get_model_status(self):
        """Return a status snapshot suitable for a diagnostics endpoint."""
        return {
            "doctr_model": self._doctr_model is not None,
            "spacy_model": self._spacy_model is not None,
            "device": str(self._device),
            "models_loaded": self._models_loaded,
            "spacy_model_name": os.getenv("SPACY_MODEL_NAME"),
        }


# Global model manager instance
model_manager = ModelManager()
src/utils/_text_extractor.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import math
3
+ import multiprocessing
4
+ import re
5
+ from collections import Counter
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from typing import Dict, List
8
+
9
+ import fitz
10
+ import numpy as np
11
+ from pdf2image import convert_from_path
12
+
13
+
14
+ class TextExtractor:
15
+ """Async text extractor for extracting text with bounding boxes."""
16
+
17
+ def __init__(self, doctr_model):
18
+ self.doctr_model = doctr_model
19
+ self.noise_pattern = [
20
+ r"\b[A-Z]{6,}\b",
21
+ r"[\[\]\\\^\@\#\$\%\&\*]{2,}",
22
+ r"(\d)\1{5,}",
23
+ r"\b(?=[A-Za-z]*\d)(?=\d*[A-Za-z])[A-Za-z\d]{8,}\b",
24
+ ]
25
+
26
+ async def __aenter__(self):
27
+ return self
28
+
29
+ async def __aexit__(self, exc_type, exc_value, traceback):
30
+ pass
31
+
32
+ def normalize_bbox(self, bbox, width: float, height: float) -> List[float]:
33
+ """Normalize bounding box (x0, y0, x1, y1) to range [0, 1]."""
34
+ x0, y0, x1, y1 = bbox
35
+ return [
36
+ round(x0 / width, 6),
37
+ round(y0 / height, 6),
38
+ round(x1 / width, 6),
39
+ round(y1 / height, 6),
40
+ ]
41
+
42
+ def remove_consecutive_items(self, line: List[str]) -> List[str]:
43
+ """Remove consecutive duplicate items from a list."""
44
+ if not line:
45
+ return line
46
+ result = [line[0]]
47
+ for item in line[1:]:
48
+ if item != result[-1]:
49
+ result.append(item)
50
+ return result
51
+
52
+ def remove_consecutive_words(self, word_data: List[Dict]) -> List[Dict]:
53
+ """Remove consecutive duplicate words from word data."""
54
+ if not word_data:
55
+ return word_data
56
+ result = [word_data[0]]
57
+ for i in range(1, len(word_data)):
58
+ if word_data[i]["word"] != result[-1]["word"]:
59
+ result.append(word_data[i])
60
+ return result
61
+
62
+ def shannon_entropy(self, text: str) -> float:
63
+ if not text:
64
+ return 0.0
65
+ counts = Counter(text)
66
+ length = len(text)
67
+ return -sum(
68
+ (count / length) * math.log2(count / length) for count in counts.values()
69
+ )
70
+
71
+ def reconstruct_line_from_bboxes(self, words, space_unit=5):
72
+ """
73
+ Reconstructs a line with appropriate spacing based on word bounding boxes.
74
+
75
+ Parameters:
76
+ - words: list of dicts with 'word' and 'bbox' (bbox = [x0, y0, x1, y1])
77
+ - space_unit: how many pixels roughly correspond to one space
78
+
79
+ Returns:
80
+ - str: reconstructed line with spaces
81
+ """
82
+ # Sort words by x-coordinate (left to right)
83
+ words = sorted(words, key=lambda w: w["bbox"][0])
84
+
85
+ line = ""
86
+ prev_end_x = 0
87
+ for word_info in words:
88
+ word = word_info["word"]
89
+ start_x = word_info["bbox"][0]
90
+
91
+ if prev_end_x is not None:
92
+ # Calculate gap between previous word and current word
93
+ gap = max(0, start_x - prev_end_x)
94
+ num_spaces = int(round(gap / space_unit))
95
+ line += " " * num_spaces
96
+
97
+ line += word
98
+ prev_end_x = word_info["bbox"][2] # x1 of current word
99
+
100
+ return line
101
+
102
+ def is_text_noisy(self, text: str) -> bool:
103
+ """Check if text is noisy (contains special characters)."""
104
+ total_chars = len(text)
105
+ if total_chars < 50: # skip empty or small pages
106
+ return True
107
+
108
+ tokens = re.findall(r"\b\w+\b", text)
109
+ total_words = len(tokens)
110
+
111
+ # Symbol & digit density
112
+ digit_count = len(re.findall(r"\d", text))
113
+ symbol_count = len(
114
+ re.findall(r"[^\w\s]", text)
115
+ ) # anything not a word char or whitespace
116
+ symbol_density = symbol_count / total_chars
117
+ digit_density = digit_count / total_chars
118
+
119
+ # Repeating char patterns like "22222222222" or "!!!!!!"
120
+ long_repeats = len(re.findall(r"(.)\1{5,}", text)) # any char repeated 6+ times
121
+
122
+ # Entropy: randomness of characters
123
+ entropy = self.shannon_entropy(text)
124
+
125
+ # Heuristics tuned for your sample
126
+ if (
127
+ entropy > 4.0
128
+ and symbol_density > 0.15
129
+ and digit_density > 0.15
130
+ and long_repeats > 1
131
+ and total_words > 30
132
+ ):
133
+ return True
134
+ return False
135
+
136
+ async def extract_lines_with_bbox(self, pdf_path: str, y_threshold: float = 3.0):
137
+ """Extract lines with bounding boxes from digital PDF."""
138
+
139
+ def _extract_lines():
140
+ doc = fitz.open(pdf_path)
141
+ page_lines_with_bbox = []
142
+
143
+ for page in doc:
144
+ words = page.get_text(
145
+ "words"
146
+ ) # (x0, y0, x1, y1, word, block_no, line_no, word_no)
147
+ words.sort(key=lambda w: (round(w[1], 1), w[0])) # sort by y then x
148
+
149
+ lines = []
150
+ current_line = []
151
+ current_y = None
152
+ current_word_data = []
153
+
154
+ for w in words:
155
+ x0, y0, x1, y1, word = w[:5]
156
+ if (
157
+ word == "|"
158
+ or not word
159
+ or word == "."
160
+ or word == "#"
161
+ or re.sub(r"[^\w\s-]", "", word) == ""
162
+ or re.sub(r"\d{19,}", "", word) == ""
163
+ ):
164
+ continue
165
+ word = word.lower()
166
+ word = word.replace("$", "")
167
+ word_data = {"word": word.strip(), "bbox": (x0, y0, x1, y1)}
168
+
169
+ if current_y is None or abs(y0 - current_y) < y_threshold:
170
+ current_line.append((x0, y0, word))
171
+ current_y = y0
172
+ current_word_data.append(word_data)
173
+ else:
174
+ current_line.sort()
175
+ line_words = [w[2] for w in current_line]
176
+ clean_line = self.remove_consecutive_items(line_words)
177
+ current_word_data = sorted(
178
+ current_word_data, key=lambda w: w["bbox"][0]
179
+ )
180
+ clean_word_data = self.remove_consecutive_words(
181
+ current_word_data
182
+ )
183
+
184
+ if clean_line:
185
+ x_start = min([w[0] for w in current_line])
186
+ y_start = min([w[1] for w in current_line])
187
+ if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
188
+ lines.append(
189
+ {
190
+ "line": " ".join(clean_line),
191
+ "bbox": [x_start, y_start],
192
+ "words": clean_word_data,
193
+ }
194
+ )
195
+ current_line = [(x0, y0, word)]
196
+ current_y = y0
197
+ current_word_data = [word_data]
198
+
199
+ # Process remaining line
200
+ if current_line:
201
+ current_line.sort()
202
+ line_words = [w[2] for w in current_line]
203
+ clean_line = self.remove_consecutive_items(line_words)
204
+ current_word_data = sorted(
205
+ current_word_data, key=lambda w: w["bbox"][0]
206
+ )
207
+ clean_word_data = self.remove_consecutive_words(current_word_data)
208
+
209
+ if clean_line:
210
+ x_start = min([w[0] for w in current_line])
211
+ y_start = min([w[1] for w in current_line])
212
+ if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
213
+ lines.append(
214
+ {
215
+ "line": " ".join(clean_line),
216
+ "bbox": [x_start, y_start],
217
+ "words": clean_word_data,
218
+ }
219
+ )
220
+
221
+ page_lines_with_bbox.append(lines)
222
+
223
+ return page_lines_with_bbox
224
+
225
+ return await asyncio.get_event_loop().run_in_executor(None, _extract_lines)
226
+
227
+ def create_page_chunks(self, num_pages: int, cpu_core: int):
228
+ final_ranges = []
229
+ page_per_cpu = 2
230
+ for i in range(1, num_pages + 1, page_per_cpu + 1):
231
+ final_ranges.append([i, min(i + page_per_cpu, num_pages)])
232
+ return final_ranges
233
+
234
+ def process_page_parallel_async(
235
+ self, pdf_path: str, page_range: List[int], instance
236
+ ):
237
+ loop = asyncio.new_event_loop()
238
+ asyncio.set_event_loop(loop)
239
+ try:
240
+ return loop.run_until_complete(
241
+ self.process_pages_concurrently(pdf_path, page_range)
242
+ )
243
+ finally:
244
+ loop.close()
245
+
246
+ async def process_pages_concurrently(self, pdf_path: str, page_range: List[int]):
247
+ start_page = page_range[0]
248
+ end_page = page_range[1]
249
+
250
+ tasks = []
251
+ for page in range(start_page, end_page + 1):
252
+ tasks.append(self.process_page_parallel(pdf_path, page))
253
+
254
+ page_results = await asyncio.gather(*tasks)
255
+ page_results.sort(key=lambda x: x[0])
256
+
257
+ chunk_outputs = [output for page_num, output in page_results]
258
+
259
+ return page_range, chunk_outputs
260
+
261
+ async def process_page_parallel(self, pdf_path: str, i: int):
262
+ print(f"Processing page {i}")
263
+ pages = convert_from_path(pdf_path, dpi=300, first_page=i, last_page=i)
264
+ page_imgs = [page.convert("RGB") for page in pages]
265
+ output = self.doctr_model([np.array(img) for img in page_imgs])
266
+ return i, output
267
+
268
+ async def extract_lines_with_bbox_from_scanned_pdf(
269
+ self, pdf_path: str, y_threshold: float = 5.0, first_page: bool = False
270
+ ):
271
+ """Extract lines with bounding boxes from scanned PDF using OCR."""
272
+
273
+ def _extract_from_scanned():
274
+ result = None
275
+ doc = None
276
+
277
+ if first_page:
278
+ number_of_pages = fitz.open(pdf_path).page_count
279
+ if number_of_pages < 3:
280
+ pages = convert_from_path(
281
+ pdf_path, dpi=300, first_page=1, last_page=number_of_pages
282
+ )
283
+ else:
284
+ pages = convert_from_path(
285
+ pdf_path, dpi=300, first_page=1, last_page=3
286
+ )
287
+ first_page_img = [page.convert("RGB") for page in pages]
288
+ result = self.doctr_model([np.array(img) for img in first_page_img])
289
+ doc = [np.array(img) for img in first_page_img]
290
+ else:
291
+ pdf = fitz.open(pdf_path)
292
+ num_pages = pdf.page_count
293
+ page_witdh_f = pdf[0].rect.width
294
+ page_height_f = pdf[0].rect.height
295
+ page_chunks = self.create_page_chunks(
296
+ num_pages, multiprocessing.cpu_count()
297
+ )
298
+ with ThreadPoolExecutor(
299
+ max_workers=multiprocessing.cpu_count()
300
+ ) as executor:
301
+ futures = []
302
+ for chunk in page_chunks:
303
+ futures.append(
304
+ executor.submit(
305
+ self.process_page_parallel_async, pdf_path, chunk, self
306
+ )
307
+ )
308
+ results = [f.result() for f in futures]
309
+ results.sort(key=lambda x: x[0][0])
310
+ result = []
311
+ for r in results:
312
+ result.extend(r[1])
313
+ results = result
314
+ page_lines_with_bbox = []
315
+
316
+ for result in results:
317
+ for page in result.pages:
318
+ if first_page:
319
+ img_width, img_height = doc[0].shape[1], doc[0].shape[0]
320
+ else:
321
+ img_width, img_height = page_witdh_f, page_height_f
322
+ words = []
323
+
324
+ for block in page.blocks:
325
+ for line in block.lines:
326
+ for word in line.words:
327
+ x0, y0 = word.geometry[0]
328
+ x1, y1 = word.geometry[1]
329
+ abs_x0 = x0 * img_width
330
+ abs_y0 = y0 * img_height
331
+ abs_x1 = x1 * img_width
332
+ abs_y1 = y1 * img_height
333
+ text = word.value.strip().lower()
334
+ text = re.sub(r"[#*]", " ", text)
335
+ text = re.sub(f"[$]", "", text)
336
+ text = text.strip()
337
+
338
+ if (
339
+ text == "|"
340
+ or not text
341
+ or text == "."
342
+ or text == "#"
343
+ or re.sub(r"[^\w\s-]", "", text) == ""
344
+ or re.sub(r"\d{19,}", "", text) == ""
345
+ ):
346
+ continue
347
+ words.append(
348
+ {
349
+ "word": text,
350
+ "bbox": [abs_x0, abs_y0, abs_x1, abs_y1],
351
+ }
352
+ )
353
+
354
+ # Sort words by y then x
355
+ words.sort(key=lambda w: (round(w["bbox"][1], 3), w["bbox"][0]))
356
+
357
+ lines = []
358
+ current_line = []
359
+ current_word_data = []
360
+ current_y = None
361
+
362
+ for w in words:
363
+ y0 = w["bbox"][1]
364
+ if current_y is None or abs(y0 - current_y) < y_threshold:
365
+ current_line.append((w["bbox"][0], y0, w["word"]))
366
+ current_word_data.append(w)
367
+ current_y = y0
368
+ else:
369
+ current_line.sort()
370
+ line_words = [x[2] for x in current_line]
371
+ clean_line = self.remove_consecutive_items(line_words)
372
+ current_word_data = sorted(
373
+ current_word_data, key=lambda w: w["bbox"][0]
374
+ )
375
+ clean_word_data = self.remove_consecutive_words(
376
+ current_word_data
377
+ )
378
+
379
+ if clean_line:
380
+ x_start = min(x[0] for x in current_line)
381
+ y_start = min(x[1] for x in current_line)
382
+ if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
383
+ lines.append(
384
+ {
385
+ "line": " ".join(clean_line),
386
+ "bbox": [x_start, y_start],
387
+ "words": clean_word_data,
388
+ }
389
+ )
390
+ current_line = [(w["bbox"][0], y0, w["word"])]
391
+ current_word_data = [w]
392
+ current_y = y0
393
+
394
+ # Final remaining line
395
+ if current_line:
396
+ current_line.sort()
397
+ line_words = [x[2] for x in current_line]
398
+ clean_line = self.remove_consecutive_items(line_words)
399
+ current_word_data = sorted(
400
+ current_word_data, key=lambda w: w["bbox"][0]
401
+ )
402
+ clean_word_data = self.remove_consecutive_words(current_word_data)
403
+
404
+ if clean_line:
405
+ x_start = min(x[0] for x in current_line)
406
+ y_start = min(x[1] for x in current_line)
407
+ if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
408
+ lines.append(
409
+ {
410
+ "line": " ".join(clean_line),
411
+ "bbox": [x_start, y_start],
412
+ "words": clean_word_data,
413
+ }
414
+ )
415
+
416
+ page_lines_with_bbox.append(lines)
417
+
418
+ return page_lines_with_bbox
419
+
420
+ return await asyncio.get_event_loop().run_in_executor(
421
+ None, _extract_from_scanned
422
+ )