kanha-upadhyay commited on
Commit
e42e330
·
0 Parent(s):
.env.example ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ CORS_ALLOW_ORIGINS=http://localhost, http://127.0.0.1
2
+
3
+ SPACY_MODEL_NAME=en_core_web_trf
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
133
+
134
+ # pytype static type analyzer
135
+ .pytype/
136
+
137
+ # Cython debug symbols
138
+ cython_debug/
139
+
140
+ # Formatter cache files
141
+ .mypy_cache/
142
+ .black_cache/
143
+ .ruff_cache/
144
+
145
+ .vscode
146
+ *.db
Dockerfile ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12-slim

WORKDIR /app

# Install system dependencies including those needed for OCR and ML models:
# poppler-utils for pdf2image, the libgl/libglib/X11 libraries for image
# handling, and libgomp1 for torch's OpenMP runtime.
RUN apt-get update && apt-get install -y \
    curl \
    poppler-utils \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Install poetry
RUN pip install poetry

# Configure poetry: install straight into the system interpreter — no
# virtualenv is needed inside the container.
RUN poetry config virtualenvs.create false

# Copy dependency files first so the dependency layer is cached independently
# of source-code changes.
COPY pyproject.toml poetry.lock* /app/

# Install dependencies
RUN poetry install --only main --no-root

# Download spacy model
# NOTE(review): the image bakes in en_core_web_sm and sets SPACY_MODEL_NAME
# to match, while .env.example suggests en_core_web_trf — confirm which model
# is intended for production.
RUN python -m spacy download en_core_web_sm

# Create user (non-root for runtime)
RUN useradd -m -u 1000 appuser

# Copy source code
COPY --chown=appuser src /app/src
COPY --chown=appuser main.py /app/

# Change ownership
RUN chown -R appuser /app

USER appuser

EXPOSE 8001

ENV PYTHONUNBUFFERED=1
ENV SPACY_MODEL_NAME=en_core_web_sm

CMD ["python", "main.py"]
main.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
if __name__ == "__main__":
    import os

    import uvicorn

    # ``reload=True`` spawns a file-watcher subprocess: useful in development,
    # wasteful in the container (this script is the Docker CMD).  Allow it to
    # be switched off via UVICORN_RELOAD; the default preserves the original
    # behaviour.
    reload_enabled = os.getenv("UVICORN_RELOAD", "true").strip().lower() in (
        "1",
        "true",
        "yes",
    )
    uvicorn.run("src.app:app", host="0.0.0.0", port=8001, reload=reload_enabled)
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "parser"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = [
6
+ {name = "Kanha Upadhyay",email = "kanha.upadhyay@sifars.com"}
7
+ ]
8
+ readme = "README.md"
9
+ requires-python = "3.12.*"
10
+ dependencies = [
11
+ "fastapi (>=0.116.1,<0.117.0)",
12
+ "uvicorn (>=0.35.0,<0.36.0)",
13
+ "loguru (>=0.7.3,<0.8.0)",
14
+ "python-dotenv (>=1.1.1,<2.0.0)",
15
+ "pymupdf (>=1.26.3,<2.0.0)",
16
+ "pdf2image (>=1.17.0,<2.0.0)",
17
+ "torch (>=2.8.0,<3.0.0)",
18
+ "fuzzywuzzy (>=0.18.0,<0.19.0)",
19
+ "spacy (>=3.8.7,<4.0.0)",
20
+ "python-doctr (>=1.0.0,<2.0.0)",
21
+ "aiofiles (>=24.1.0,<25.0.0)",
22
+ "numpy (>=1.24.0,<2.0.0)",
23
+ "python-multipart (>=0.0.9,<0.1.0)"
24
+ ]
25
+
26
+
27
+ [build-system]
28
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
29
+ build-backend = "poetry.core.masonry.api"
src/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# Re-export the FastAPI application object so callers may simply write
# ``from src import app``.
from .app import app

__all__ = ["app"]
src/app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""FastAPI application entry point: env loading, CORS, health check, routing."""

import os

from dotenv import load_dotenv

# ===========================
# !!! ATTENTION !!!
# KEEP THIS AT THE TOP TO ENSURE ENVIRONMENT VARIABLES ARE LOADED BEFORE ANY IMPORTS
# ===========================
load_dotenv()

from contextlib import asynccontextmanager

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from loguru import logger

from src.controllers import api_router
from src.utils import model_manager

# NOTE: ``import os`` previously appeared twice (before and after
# ``load_dotenv``); the duplicate has been removed.


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: pre-load all ML models before serving.

    Raises whatever ``ensure_models_loaded`` raises so a broken model setup
    fails startup loudly instead of serving 500s later.
    """
    try:
        logger.info("Starting up the application...")
        await model_manager.ensure_models_loaded()
        logger.info("Application started successfully...")
        yield
    except Exception as e:
        logger.error(f"Error during startup: {str(e)}")
        raise
    finally:
        logger.info("Application shutdown complete.")


app = FastAPI(lifespan=lifespan)


# Allowed origins come from CORS_ALLOW_ORIGINS (comma-separated); the default
# mirrors .env.example.
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        origin.strip()
        for origin in os.getenv(
            "CORS_ALLOW_ORIGINS", "http://localhost, http://127.0.0.1"
        ).split(",")
    ],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/")
async def check_health():
    """Liveness probe: returns a static payload when the service is up."""
    return {"response": "Service is healthy!"}


app.include_router(api_router, prefix="/api")
src/controllers/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
from fastapi import APIRouter

from ._parser_controller import ParserController

# Version-1 API router: every parser endpoint is mounted under /v1/parser.
api_router = APIRouter(prefix="/v1")
api_router.include_router(ParserController().router, prefix="/parser", tags=["parser"])

__all__ = ["api_router"]
src/controllers/_parser_controller.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import APIRouter, Body, File, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from loguru import logger
from pydantic import BaseModel

from src.services import PDFProcessorService


class EntityExtractorSchema(BaseModel):
    """Request body for POST /entity: the raw text to run NER over."""

    text: str


class ParserController:
    """Groups the parser endpoints (POST /pdf, POST /entity) on one router."""

    def __init__(self):
        self.router = APIRouter()
        self.service = PDFProcessorService()
        self.router.add_api_route("/pdf", self.parse_pdf, methods=["POST"])
        self.router.add_api_route("/entity", self.extract_entity, methods=["POST"])

    async def parse_pdf(self, file: UploadFile = File(...)):
        """Parse an uploaded PDF and return extracted lines with bboxes.

        Raises 400 for a missing/non-PDF upload, 500 for processing errors.
        """
        try:
            if not file:
                raise HTTPException(status_code=400, detail="No file uploaded")
            if file.content_type != "application/pdf":
                raise HTTPException(status_code=400, detail="Invalid file type")
            async with self.service as processor:
                extracted_data = await processor.process_pdf(file)
            return JSONResponse(content={"data": extracted_data})
        except HTTPException:
            # Bare ``raise`` (not ``raise e``) keeps the original traceback.
            raise
        except Exception as e:
            logger.exception(e)
            raise HTTPException(
                status_code=500,
                detail=str(e),
            )

    async def extract_entity(
        self, entity_extractor_schema: EntityExtractorSchema = Body(...)
    ):
        """Run spaCy NER over raw text and return the chosen entity."""
        try:
            extracted_entity = await self.service.extract_entity(
                entity_extractor_schema.text
            )
            return JSONResponse(content={"data": extracted_entity})
        except HTTPException:
            raise
        except Exception as e:
            logger.exception(e)
            raise HTTPException(
                status_code=500,
                detail=str(e),
            )
src/services/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
from ._pdf_processor_service import PDFProcessorService

# BUG FIX: this was previously ``all = [...]`` — a typo that shadowed the
# ``all`` builtin and declared nothing; ``__all__`` is the correct dunder.
__all__ = ["PDFProcessorService"]
src/services/_pdf_processor_service.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import re
3
+ import tempfile
4
+ from pathlib import Path
5
+ from typing import List
6
+
7
+ import aiofiles
8
+ import fitz
9
+ from fastapi import UploadFile
10
+ from loguru import logger
11
+
12
+ from src.utils import TextExtractor, model_manager
13
+
14
+
15
+ class PDFProcessorService:
16
+ """Async PDF processor for handling both digital and scanned PDFs."""
17
+
18
+ def __init__(self):
19
+ # Use the centralized model manager
20
+ self._ensure_models_loaded()
21
+
22
+ def _ensure_models_loaded(self):
23
+ """Ensure models are loaded via the model manager."""
24
+ if not model_manager.models_loaded:
25
+ logger.info("🔄 Models not loaded, initializing model manager...")
26
+ # This will trigger model loading if not already done
27
+ _ = model_manager.doctr_model
28
+
29
+ @property
30
+ def doctr_model(self):
31
+ """Get the loaded doctr model from model manager."""
32
+ return model_manager.doctr_model
33
+
34
+ @property
35
+ def device(self):
36
+ """Get the device being used from model manager."""
37
+ return model_manager.device
38
+
39
+ async def __aenter__(self):
40
+ return self
41
+
42
+ async def __aexit__(self, exc_type, exc_value, traceback):
43
+ pass
44
+
45
+ async def is_pdf_scanned(self, pdf_path: str) -> bool:
46
+ """Check if PDF is scanned (no extractable text)."""
47
+
48
+ def _check_scanned():
49
+ doc = fitz.open(pdf_path)
50
+ for page in doc:
51
+ text = page.get_text()
52
+ if text.strip():
53
+ return False
54
+ return True
55
+
56
+ return await asyncio.get_event_loop().run_in_executor(None, _check_scanned)
57
+
58
+ async def save_uploaded_file(self, uploaded_file: UploadFile) -> str:
59
+ file_name = uploaded_file.filename
60
+ suffix = Path(file_name).suffix
61
+ with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
62
+ temp_path = tmp.name
63
+ async with aiofiles.open(temp_path, "wb") as f:
64
+ await f.write(await uploaded_file.read())
65
+ return temp_path
66
+
67
+ async def extract_text_from_digital_pdf(self, pdf_path: str) -> List[List[str]]:
68
+ """Extract text from digital PDF using PyPDF2."""
69
+
70
+ async def _extract_text():
71
+ doc = fitz.open(pdf_path)
72
+ extracted_data = []
73
+
74
+ for page in doc:
75
+ ptext = page.get_text()
76
+ if ptext:
77
+ data = []
78
+ for line in ptext.splitlines():
79
+ cleaned_line = await self._split_on_repeated_pattern(
80
+ line.strip()
81
+ )
82
+ if cleaned_line:
83
+ data.append(cleaned_line[0])
84
+ extracted_data.append(data)
85
+
86
+ return extracted_data
87
+
88
+ return await asyncio.get_event_loop().run_in_executor(None, _extract_text)
89
+
90
+ async def _split_on_repeated_pattern(
91
+ self, line: str, min_space: int = 10
92
+ ) -> List[str]:
93
+ """Split line on repeated pattern."""
94
+ import re
95
+ from difflib import SequenceMatcher
96
+
97
+ original_line = line.strip()
98
+
99
+ # Find all spans of spaces >= min_space
100
+ space_spans = [
101
+ (m.start(), len(m.group()))
102
+ for m in re.finditer(r" {%d,}" % min_space, original_line)
103
+ ]
104
+
105
+ if not space_spans:
106
+ return [original_line]
107
+
108
+ # Count how often each gap size occurs
109
+ gaps = [span[1] for span in space_spans]
110
+ gap_counts = {}
111
+ for g in gaps:
112
+ gap_counts[g] = gap_counts.get(g, 0) + 1
113
+
114
+ # Sort gaps by size × count (more dominant gaps first)
115
+ sorted_gaps = sorted(
116
+ gap_counts.items(), key=lambda x: x[1] * x[0], reverse=True
117
+ )
118
+
119
+ # No significant gaps, return original
120
+ if not sorted_gaps:
121
+ return [original_line]
122
+
123
+ dominant_gap = sorted_gaps[0][0]
124
+
125
+ # Use the dominant large gap to split
126
+ chunks = re.split(rf" {{%d,}}" % dominant_gap, original_line)
127
+
128
+ # Check if it's actually repeated using fuzzy match
129
+ base = chunks[0].strip()
130
+ repeated = False
131
+ for chunk in chunks[1:]:
132
+ chunk = chunk.strip()
133
+ if chunk and SequenceMatcher(None, base, chunk).ratio() > 0.8:
134
+ repeated = True
135
+ break
136
+
137
+ return [base] if repeated else [original_line]
138
+
139
+ async def process_pdf(self, file):
140
+ pdf_path = await self.save_uploaded_file(file)
141
+ is_scanned = await self.is_pdf_scanned(pdf_path)
142
+ text_extractor = TextExtractor(self.doctr_model)
143
+ if is_scanned:
144
+ logger.info(f"{pdf_path} is likely a scanned PDF.")
145
+ extracted_text_list = (
146
+ await text_extractor.extract_lines_with_bbox_from_scanned_pdf(pdf_path)
147
+ )
148
+ else:
149
+ logger.info(f"{pdf_path} is not a scanned PDF. Extracting text...")
150
+ extracted_text_list = await text_extractor.extract_lines_with_bbox(pdf_path)
151
+ pdf_text = ""
152
+ for block in extracted_text_list:
153
+ for line in block:
154
+ pdf_text += " " + line["line"]
155
+ text_noisy = text_extractor.is_text_noisy(pdf_text)
156
+ if text_noisy:
157
+ logger.info("Text is noisy. Extracting text again...")
158
+ extracted_text_list = (
159
+ await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
160
+ pdf_path
161
+ )
162
+ )
163
+ return extracted_text_list
164
+
165
+ async def extract_entity(self, text: str):
166
+ text = re.sub(r"[^\w\s]", " ", text)
167
+ doc = model_manager.spacy_model(text)
168
+ entities = {ent.text: ent.label_ for ent in doc.ents}
169
+ for key, value in entities.items():
170
+ if value == "ORG":
171
+ return key
172
+ if entities:
173
+ return list(entities.keys())[0]
174
+ return text
src/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from ._model_manager import model_manager
2
+ from ._text_extractor import TextExtractor
3
+
4
+ __all__ = ["model_manager", "TextExtractor"]
src/utils/_model_manager.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import spacy
4
+ import torch
5
+ from doctr.models import ocr_predictor
6
+ from loguru import logger
7
+
8
+
9
class ModelManager:
    """Singleton that pre-loads the OCR and NER models once per process.

    All state lives on the class, so every instantiation shares the same
    models; ``model_manager`` below is the canonical shared instance.
    """

    _instance = None
    _doctr_model = None
    _spacy_model = None
    _device = None
    _models_loaded = False

    def __new__(cls):
        # Classic singleton: always hand back the single shared instance.
        if cls._instance is None:
            cls._instance = super(ModelManager, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        # Intentionally empty: state is class-level and loaded lazily via
        # ensure_models_loaded().
        pass

    async def _load_models(self):
        """Load all models.

        NOTE: despite being ``async`` this loads synchronously and blocks the
        event loop for the duration; it is only invoked once at startup (from
        the FastAPI lifespan hook).
        """
        logger.info("🚀 Starting model pre-loading...")

        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"📱 Using device: {self._device}")

        # Load doctr model and move both detector and recognizer to the device.
        logger.info("🔄 Loading doctr OCR model...")
        self._doctr_model = ocr_predictor(pretrained=True)
        self._doctr_model.det_predictor.model = (
            self._doctr_model.det_predictor.model.to(self._device)
        )
        self._doctr_model.reco_predictor.model = (
            self._doctr_model.reco_predictor.model.to(self._device)
        )
        logger.info("✅ Doctr model loaded successfully!")

        # Load spaCy model; the name is configurable via SPACY_MODEL_NAME.
        # (The log call was an f-string with no placeholders — F541 — now a
        # plain string with identical output.)
        self._spacy_model = spacy.load(os.getenv("SPACY_MODEL_NAME", "en_core_web_trf"))
        logger.info("✅ spaCy model loaded successfully!")
        self._models_loaded = True
        logger.info("🎉 All models loaded successfully!")

    @property
    def doctr_model(self):
        """The loaded doctr model, or None before loading."""
        return self._doctr_model

    @property
    def spacy_model(self):
        """The loaded spaCy pipeline, or None before loading."""
        return self._spacy_model

    @property
    def device(self):
        """The torch device in use, or None before loading."""
        return self._device

    @property
    def models_loaded(self):
        """True once _load_models has completed successfully."""
        return self._models_loaded

    async def ensure_models_loaded(self):
        """Idempotently load the models; safe to call multiple times."""
        if not self._models_loaded:
            await self._load_models()
        return True

    async def get_model_status(self):
        """Return a status snapshot suitable for a diagnostics endpoint."""
        return {
            "doctr_model": self._doctr_model is not None,
            "spacy_model": self._spacy_model is not None,
            "device": str(self._device),
            "models_loaded": self._models_loaded,
            "spacy_model_name": os.getenv("SPACY_MODEL_NAME"),
        }


# Global model manager instance
model_manager = ModelManager()
src/utils/_text_extractor.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import math
3
+ import multiprocessing
4
+ import re
5
+ from collections import Counter
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from typing import Dict, List
8
+
9
+ import fitz
10
+ import numpy as np
11
+ from pdf2image import convert_from_path
12
+
13
+
14
+ class TextExtractor:
15
+ """Async text extractor for extracting text with bounding boxes."""
16
+
17
+ def __init__(self, doctr_model):
18
+ self.doctr_model = doctr_model
19
+ self.noise_pattern = [
20
+ r"\b[A-Z]{6,}\b",
21
+ r"[\[\]\\\^\@\#\$\%\&\*]{2,}",
22
+ r"(\d)\1{5,}",
23
+ r"\b(?=[A-Za-z]*\d)(?=\d*[A-Za-z])[A-Za-z\d]{8,}\b",
24
+ ]
25
+
26
+ async def __aenter__(self):
27
+ return self
28
+
29
+ async def __aexit__(self, exc_type, exc_value, traceback):
30
+ pass
31
+
32
+ def normalize_bbox(self, bbox, width: float, height: float) -> List[float]:
33
+ """Normalize bounding box (x0, y0, x1, y1) to range [0, 1]."""
34
+ x0, y0, x1, y1 = bbox
35
+ return [
36
+ round(x0 / width, 6),
37
+ round(y0 / height, 6),
38
+ round(x1 / width, 6),
39
+ round(y1 / height, 6),
40
+ ]
41
+
42
+ def remove_consecutive_items(self, line: List[str]) -> List[str]:
43
+ """Remove consecutive duplicate items from a list."""
44
+ if not line:
45
+ return line
46
+ result = [line[0]]
47
+ for item in line[1:]:
48
+ if item != result[-1]:
49
+ result.append(item)
50
+ return result
51
+
52
+ def remove_consecutive_words(self, word_data: List[Dict]) -> List[Dict]:
53
+ """Remove consecutive duplicate words from word data."""
54
+ if not word_data:
55
+ return word_data
56
+ result = [word_data[0]]
57
+ for i in range(1, len(word_data)):
58
+ if word_data[i]["word"] != result[-1]["word"]:
59
+ result.append(word_data[i])
60
+ return result
61
+
62
+ def shannon_entropy(self, text: str) -> float:
63
+ if not text:
64
+ return 0.0
65
+ counts = Counter(text)
66
+ length = len(text)
67
+ return -sum(
68
+ (count / length) * math.log2(count / length) for count in counts.values()
69
+ )
70
+
71
+ def reconstruct_line_from_bboxes(self, words, space_unit=5):
72
+ """
73
+ Reconstructs a line with appropriate spacing based on word bounding boxes.
74
+
75
+ Parameters:
76
+ - words: list of dicts with 'word' and 'bbox' (bbox = [x0, y0, x1, y1])
77
+ - space_unit: how many pixels roughly correspond to one space
78
+
79
+ Returns:
80
+ - str: reconstructed line with spaces
81
+ """
82
+ # Sort words by x-coordinate (left to right)
83
+ words = sorted(words, key=lambda w: w["bbox"][0])
84
+
85
+ line = ""
86
+ prev_end_x = 0
87
+ for word_info in words:
88
+ word = word_info["word"]
89
+ start_x = word_info["bbox"][0]
90
+
91
+ if prev_end_x is not None:
92
+ # Calculate gap between previous word and current word
93
+ gap = max(0, start_x - prev_end_x)
94
+ num_spaces = int(round(gap / space_unit))
95
+ line += " " * num_spaces
96
+
97
+ line += word
98
+ prev_end_x = word_info["bbox"][2] # x1 of current word
99
+
100
+ return line
101
+
102
+ def is_text_noisy(self, text: str) -> bool:
103
+ """Check if text is noisy (contains special characters)."""
104
+ total_chars = len(text)
105
+ if total_chars < 50: # skip empty or small pages
106
+ return True
107
+
108
+ tokens = re.findall(r"\b\w+\b", text)
109
+ total_words = len(tokens)
110
+
111
+ # Symbol & digit density
112
+ digit_count = len(re.findall(r"\d", text))
113
+ symbol_count = len(
114
+ re.findall(r"[^\w\s]", text)
115
+ ) # anything not a word char or whitespace
116
+ symbol_density = symbol_count / total_chars
117
+ digit_density = digit_count / total_chars
118
+
119
+ # Repeating char patterns like "22222222222" or "!!!!!!"
120
+ long_repeats = len(re.findall(r"(.)\1{5,}", text)) # any char repeated 6+ times
121
+
122
+ # Entropy: randomness of characters
123
+ entropy = self.shannon_entropy(text)
124
+
125
+ # Heuristics tuned for your sample
126
+ if (
127
+ entropy > 4.0
128
+ and symbol_density > 0.15
129
+ and digit_density > 0.15
130
+ and long_repeats > 1
131
+ and total_words > 30
132
+ ):
133
+ return True
134
+ return False
135
+
136
+ async def extract_lines_with_bbox(self, pdf_path: str, y_threshold: float = 3.0):
137
+ """Extract lines with bounding boxes from digital PDF."""
138
+
139
+ def _extract_lines():
140
+ doc = fitz.open(pdf_path)
141
+ page_lines_with_bbox = []
142
+
143
+ for page in doc:
144
+ words = page.get_text(
145
+ "words"
146
+ ) # (x0, y0, x1, y1, word, block_no, line_no, word_no)
147
+ words.sort(key=lambda w: (round(w[1], 1), w[0])) # sort by y then x
148
+
149
+ lines = []
150
+ current_line = []
151
+ current_y = None
152
+ current_word_data = []
153
+
154
+ for w in words:
155
+ x0, y0, x1, y1, word = w[:5]
156
+ if (
157
+ word == "|"
158
+ or not word
159
+ or word == "."
160
+ or word == "#"
161
+ or re.sub(r"[^\w\s-]", "", word) == ""
162
+ or re.sub(r"\d{19,}", "", word) == ""
163
+ ):
164
+ continue
165
+ word = word.lower()
166
+ word = word.replace("$", "")
167
+ word_data = {"word": word.strip(), "bbox": (x0, y0, x1, y1)}
168
+
169
+ if current_y is None or abs(y0 - current_y) < y_threshold:
170
+ current_line.append((x0, y0, word))
171
+ current_y = y0
172
+ current_word_data.append(word_data)
173
+ else:
174
+ current_line.sort()
175
+ line_words = [w[2] for w in current_line]
176
+ clean_line = self.remove_consecutive_items(line_words)
177
+ current_word_data = sorted(
178
+ current_word_data, key=lambda w: w["bbox"][0]
179
+ )
180
+ clean_word_data = self.remove_consecutive_words(
181
+ current_word_data
182
+ )
183
+
184
+ if clean_line:
185
+ x_start = min([w[0] for w in current_line])
186
+ y_start = min([w[1] for w in current_line])
187
+ if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
188
+ lines.append(
189
+ {
190
+ "line": " ".join(clean_line),
191
+ "bbox": [x_start, y_start],
192
+ "words": clean_word_data,
193
+ }
194
+ )
195
+ current_line = [(x0, y0, word)]
196
+ current_y = y0
197
+ current_word_data = [word_data]
198
+
199
+ # Process remaining line
200
+ if current_line:
201
+ current_line.sort()
202
+ line_words = [w[2] for w in current_line]
203
+ clean_line = self.remove_consecutive_items(line_words)
204
+ current_word_data = sorted(
205
+ current_word_data, key=lambda w: w["bbox"][0]
206
+ )
207
+ clean_word_data = self.remove_consecutive_words(current_word_data)
208
+
209
+ if clean_line:
210
+ x_start = min([w[0] for w in current_line])
211
+ y_start = min([w[1] for w in current_line])
212
+ if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
213
+ lines.append(
214
+ {
215
+ "line": " ".join(clean_line),
216
+ "bbox": [x_start, y_start],
217
+ "words": clean_word_data,
218
+ }
219
+ )
220
+
221
+ page_lines_with_bbox.append(lines)
222
+
223
+ return page_lines_with_bbox
224
+
225
+ return await asyncio.get_event_loop().run_in_executor(None, _extract_lines)
226
+
227
+ def create_page_chunks(self, num_pages: int, cpu_core: int):
228
+ final_ranges = []
229
+ page_per_cpu = 2
230
+ for i in range(1, num_pages + 1, page_per_cpu + 1):
231
+ final_ranges.append([i, min(i + page_per_cpu, num_pages)])
232
+ return final_ranges
233
+
234
+ def process_page_parallel_async(
235
+ self, pdf_path: str, page_range: List[int], instance
236
+ ):
237
+ loop = asyncio.new_event_loop()
238
+ asyncio.set_event_loop(loop)
239
+ try:
240
+ return loop.run_until_complete(
241
+ self.process_pages_concurrently(pdf_path, page_range)
242
+ )
243
+ finally:
244
+ loop.close()
245
+
246
+ async def process_pages_concurrently(self, pdf_path: str, page_range: List[int]):
247
+ start_page = page_range[0]
248
+ end_page = page_range[1]
249
+
250
+ tasks = []
251
+ for page in range(start_page, end_page + 1):
252
+ tasks.append(self.process_page_parallel(pdf_path, page))
253
+
254
+ page_results = await asyncio.gather(*tasks)
255
+ page_results.sort(key=lambda x: x[0])
256
+
257
+ chunk_outputs = [output for page_num, output in page_results]
258
+
259
+ return page_range, chunk_outputs
260
+
261
+ async def process_page_parallel(self, pdf_path: str, i: int):
262
+ print(f"Processing page {i}")
263
+ pages = convert_from_path(pdf_path, dpi=300, first_page=i, last_page=i)
264
+ page_imgs = [page.convert("RGB") for page in pages]
265
+ output = self.doctr_model([np.array(img) for img in page_imgs])
266
+ return i, output
267
+
268
+ async def extract_lines_with_bbox_from_scanned_pdf(
269
+ self, pdf_path: str, y_threshold: float = 5.0, first_page: bool = False
270
+ ):
271
+ """Extract lines with bounding boxes from scanned PDF using OCR."""
272
+
273
+ def _extract_from_scanned():
274
+ result = None
275
+ doc = None
276
+
277
+ if first_page:
278
+ number_of_pages = fitz.open(pdf_path).page_count
279
+ if number_of_pages < 3:
280
+ pages = convert_from_path(
281
+ pdf_path, dpi=300, first_page=1, last_page=number_of_pages
282
+ )
283
+ else:
284
+ pages = convert_from_path(
285
+ pdf_path, dpi=300, first_page=1, last_page=3
286
+ )
287
+ first_page_img = [page.convert("RGB") for page in pages]
288
+ result = self.doctr_model([np.array(img) for img in first_page_img])
289
+ doc = [np.array(img) for img in first_page_img]
290
+ else:
291
+ pdf = fitz.open(pdf_path)
292
+ num_pages = pdf.page_count
293
+ page_witdh_f = pdf[0].rect.width
294
+ page_height_f = pdf[0].rect.height
295
+ page_chunks = self.create_page_chunks(
296
+ num_pages, multiprocessing.cpu_count()
297
+ )
298
+ with ThreadPoolExecutor(
299
+ max_workers=multiprocessing.cpu_count()
300
+ ) as executor:
301
+ futures = []
302
+ for chunk in page_chunks:
303
+ futures.append(
304
+ executor.submit(
305
+ self.process_page_parallel_async, pdf_path, chunk, self
306
+ )
307
+ )
308
+ results = [f.result() for f in futures]
309
+ results.sort(key=lambda x: x[0][0])
310
+ result = []
311
+ for r in results:
312
+ result.extend(r[1])
313
+ results = result
314
+ page_lines_with_bbox = []
315
+
316
+ for result in results:
317
+ for page in result.pages:
318
+ if first_page:
319
+ img_width, img_height = doc[0].shape[1], doc[0].shape[0]
320
+ else:
321
+ img_width, img_height = page_witdh_f, page_height_f
322
+ words = []
323
+
324
+ for block in page.blocks:
325
+ for line in block.lines:
326
+ for word in line.words:
327
+ x0, y0 = word.geometry[0]
328
+ x1, y1 = word.geometry[1]
329
+ abs_x0 = x0 * img_width
330
+ abs_y0 = y0 * img_height
331
+ abs_x1 = x1 * img_width
332
+ abs_y1 = y1 * img_height
333
+ text = word.value.strip().lower()
334
+ text = re.sub(r"[#*]", " ", text)
335
+ text = re.sub(f"[$]", "", text)
336
+ text = text.strip()
337
+
338
+ if (
339
+ text == "|"
340
+ or not text
341
+ or text == "."
342
+ or text == "#"
343
+ or re.sub(r"[^\w\s-]", "", text) == ""
344
+ or re.sub(r"\d{19,}", "", text) == ""
345
+ ):
346
+ continue
347
+ words.append(
348
+ {
349
+ "word": text,
350
+ "bbox": [abs_x0, abs_y0, abs_x1, abs_y1],
351
+ }
352
+ )
353
+
354
+ # Sort words by y then x
355
+ words.sort(key=lambda w: (round(w["bbox"][1], 3), w["bbox"][0]))
356
+
357
+ lines = []
358
+ current_line = []
359
+ current_word_data = []
360
+ current_y = None
361
+
362
+ for w in words:
363
+ y0 = w["bbox"][1]
364
+ if current_y is None or abs(y0 - current_y) < y_threshold:
365
+ current_line.append((w["bbox"][0], y0, w["word"]))
366
+ current_word_data.append(w)
367
+ current_y = y0
368
+ else:
369
+ current_line.sort()
370
+ line_words = [x[2] for x in current_line]
371
+ clean_line = self.remove_consecutive_items(line_words)
372
+ current_word_data = sorted(
373
+ current_word_data, key=lambda w: w["bbox"][0]
374
+ )
375
+ clean_word_data = self.remove_consecutive_words(
376
+ current_word_data
377
+ )
378
+
379
+ if clean_line:
380
+ x_start = min(x[0] for x in current_line)
381
+ y_start = min(x[1] for x in current_line)
382
+ if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
383
+ lines.append(
384
+ {
385
+ "line": " ".join(clean_line),
386
+ "bbox": [x_start, y_start],
387
+ "words": clean_word_data,
388
+ }
389
+ )
390
+ current_line = [(w["bbox"][0], y0, w["word"])]
391
+ current_word_data = [w]
392
+ current_y = y0
393
+
394
+ # Final remaining line
395
+ if current_line:
396
+ current_line.sort()
397
+ line_words = [x[2] for x in current_line]
398
+ clean_line = self.remove_consecutive_items(line_words)
399
+ current_word_data = sorted(
400
+ current_word_data, key=lambda w: w["bbox"][0]
401
+ )
402
+ clean_word_data = self.remove_consecutive_words(current_word_data)
403
+
404
+ if clean_line:
405
+ x_start = min(x[0] for x in current_line)
406
+ y_start = min(x[1] for x in current_line)
407
+ if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
408
+ lines.append(
409
+ {
410
+ "line": " ".join(clean_line),
411
+ "bbox": [x_start, y_start],
412
+ "words": clean_word_data,
413
+ }
414
+ )
415
+
416
+ page_lines_with_bbox.append(lines)
417
+
418
+ return page_lines_with_bbox
419
+
420
+ return await asyncio.get_event_loop().run_in_executor(
421
+ None, _extract_from_scanned
422
+ )