Spaces:
Sleeping
Sleeping
| """ | |
| OCR pipeline for complaint documents. | |
| Pre-processing steps applied before Tesseract: | |
| 1. Convert to greyscale | |
| 2. Denoise (3x3 median filter) | |
| 3. Deskew (pytesseract OSD; graceful no-op if unavailable) | |
| 4. Adaptive threshold (Gaussian-blur background subtraction) | |
| Supported formats: PDF (pdfplumber), PNG / JPG / JPEG / WEBP / BMP / TIFF. | |
| Public API: | |
| extract_text(file_path) -> str | |
| extract_with_entities(file_path) -> tuple[str, list[Entity]] | |
| preprocess_image(image_path) -> PIL.Image | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import re | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from typing import Optional | |
| import numpy as np | |
| import pytesseract | |
| from PIL import Image, ImageFilter | |
| from src.ner.model import Entity | |
logger = logging.getLogger(__name__)

# Lower-case file suffixes (leading dot included) that extract_text() routes
# to the image OCR path. Matching is done against Path(...).suffix.lower().
SUPPORTED_IMAGE_EXTS: frozenset[str] = frozenset(
    (".bmp", ".jpeg", ".jpg", ".png", ".tiff", ".webp")
)
| # --------------------------------------------------------------------------- | |
| # Image pre-processing | |
| # --------------------------------------------------------------------------- | |
def _adaptive_threshold(img: Image.Image) -> Image.Image:
    """
    Binarise a greyscale image using local background estimation.

    A Gaussian-blurred copy of *img* (radius 15) serves as the local
    background estimate. Pixels darker than that background by more than
    10 grey levels become black (foreground text); everything else becomes
    white.

    Returns a new image built from a uint8 array holding only 0 and 255.
    """
    bg = img.filter(ImageFilter.GaussianBlur(radius=15))
    # Signed 32-bit so the subtraction below cannot wrap around the way
    # uint8 arithmetic would.
    arr = np.array(img, dtype=np.int32)
    arr_bg = np.array(bg, dtype=np.int32)
    # Background (NOT clearly darker than its surroundings) -> 255 (white);
    # text -> 0 (black). The previous mask had these two values swapped,
    # producing white text on black, which contradicted the documented
    # contract and the white fill colour used when deskewing.
    binary = ((arr_bg - arr) <= 10).astype(np.uint8) * 255
    return Image.fromarray(binary)
def _deskew(img: Image.Image) -> Image.Image:
    """
    Rotate *img* upright using pytesseract's Orientation and Script Detection.

    The OSD "rotate" value is read from the result dict; when its magnitude
    exceeds 1 the image is rotated by the negated angle, with the canvas
    expanded and the exposed corners filled with 255 (white).

    Best-effort: any failure (OSD traineddata not installed, too little text
    for detection, ...) leaves the image unchanged. The reason is logged at
    DEBUG level instead of being dropped silently.

    NOTE(review): Tesseract OSD is understood to report orientation in coarse
    steps, so this likely fixes page orientation rather than small skew
    angles — confirm against the Tesseract documentation.
    """
    try:
        osd = pytesseract.image_to_osd(
            img, output_type=pytesseract.Output.DICT, nice=0
        )
        angle = osd.get("rotate", 0)
        if abs(angle) > 1:
            img = img.rotate(-angle, expand=True, fillcolor=255)
    except Exception as exc:
        # Deliberately broad: deskewing is optional and must never abort OCR.
        logger.debug("Deskew skipped (%s)", exc)
    return img
def _preprocess_pil(img: Image.Image) -> Image.Image:
    """
    Run the complete pre-processing chain on an in-memory PIL image.

    Order of operations: greyscale conversion, 3-pixel median denoise,
    deskew (performed while still greyscale), then binarisation.
    """
    grey = img.convert("L")
    denoised = grey.filter(ImageFilter.MedianFilter(3))
    upright = _deskew(denoised)
    return _adaptive_threshold(upright)
def preprocess_image(image_path: str) -> Image.Image:
    """
    Load *image_path* and return a pre-processed PIL image ready for Tesseract.

    The source file is opened in a ``with`` block so its handle is released
    as soon as pre-processing is done, rather than whenever the image object
    happens to be garbage-collected. Errors raised while opening the file
    propagate to the caller.
    """
    with Image.open(image_path) as img:
        # The pipeline starts with img.convert("L"), which yields a new
        # in-memory image, so the result does not depend on the open file.
        return _preprocess_pil(img)
| # --------------------------------------------------------------------------- | |
| # Regex-based entity extraction from OCR text | |
| # --------------------------------------------------------------------------- | |
| # Patterns ordered by specificity; each produces an (Entity) object. | |
| _PATTERNS: list[tuple[str, re.Pattern]] = [ | |
| ("AMOUNT", re.compile( | |
| r"(?:₹|Rs\.?|INR|Rupees?)\s*[\d,]+(?:\.\d{1,2})?" | |
| r"|[\d,]{2,}(?:\.\d{2})?\s*/\s*-", | |
| re.IGNORECASE, | |
| )), | |
| ("DATE", re.compile( | |
| r"\b\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4}\b" | |
| r"|\b\d{1,2}\s+(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?" | |
| r"|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?" | |
| r"|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{4}\b" | |
| r"|\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?" | |
| r"|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?" | |
| r"|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2},?\s+\d{4}\b", | |
| re.IGNORECASE, | |
| )), | |
| ("REF_ID", re.compile( | |
| r"(?:Order|Ref(?:erence)?|TXN|Txn|Transaction\s*ID|Ticket|Receipt" | |
| r"|Invoice|Booking|Payment\s*ID|Claim|Case|Complaint)\s*[#:\s]*" | |
| r"([A-Z0-9][-A-Z0-9]{3,24})", | |
| re.IGNORECASE, | |
| )), | |
| ("ACCOUNT", re.compile( | |
| r"(?:A/c|Account|Acct)\.?\s*(?:No\.?|Number|#)?\s*:?\s*" | |
| r"([Xx\*\d]{4,}(?:[- ][Xx\*\d]{4,})*)" | |
| r"|ending\s+(?:in|with)\s+(\d{4})", | |
| re.IGNORECASE, | |
| )), | |
| ("ORG", re.compile( | |
| r"\b(?:Flipkart|Amazon(?:\s+India)?|Myntra|Snapdeal|Meesho" | |
| r"|HDFC\s+Bank|ICICI\s+Bank|State\s+Bank\s+of\s+India|SBI|Axis\s+Bank" | |
| r"|Kotak(?:\s+Mahindra)?\s+Bank|Punjab\s+National\s+Bank|PNB|Bank\s+of\s+Baroda" | |
| r"|Airtel|Reliance\s+Jio|Jio|Vodafone(?:\s+Idea)?|Vi|BSNL" | |
| r"|LIC(?:\s+of\s+India)?|Star\s+Health|New\s+India\s+Assurance" | |
| r"|ICICI\s+Lombard|HDFC\s+ERGO|CIBIL|Experian(?:\s+India)?" | |
| r"|Swiggy|Zomato|Ola(?:\s+Cabs)?|Uber(?:\s+India)?|IRCTC|MakeMyTrip|Paytm|PhonePe)\b", | |
| re.IGNORECASE, | |
| )), | |
| ] | |
def _extract_entities_from_text(text: str, base_confidence: float = 0.75) -> list[Entity]:
    """
    Apply every pattern in ``_PATTERNS`` to *text* and return Entity spans.

    For patterns with capture groups (REF_ID, ACCOUNT) the first group that
    participated in the match is used as the span; otherwise the whole match
    is used. Spans are whitespace-stripped and empty spans are skipped.

    Each Entity is built with ``text``, ``label``, ``start``, ``end`` and
    ``confidence`` (always *base_confidence*). ``start``/``end`` are character
    offsets into *text* such that ``text[start:end] == entity.text``.
    """
    entities: list[Entity] = []
    for label, pattern in _PATTERNS:
        for m in pattern.finditer(text):
            # Index of the first capture group that matched; 0 = whole match.
            group_idx = next(
                (i for i, g in enumerate(m.groups(), start=1) if g is not None),
                0,
            )
            raw = m.group(group_idx)
            span_text = raw.strip()
            if not span_text:
                continue
            # Take the offset from the match object itself. Searching for the
            # span text from m.start() could hit an earlier occurrence inside
            # the matched prefix (e.g. "CASECASE" -> offset 0 instead of 4).
            start = m.start(group_idx) + (len(raw) - len(raw.lstrip()))
            entities.append(Entity(
                text=span_text,
                label=label,
                start=start,
                end=start + len(span_text),
                confidence=base_confidence,
            ))
    return entities
| # --------------------------------------------------------------------------- | |
| # Text extraction — images | |
| # --------------------------------------------------------------------------- | |
| def _clean_text(raw: str) -> str: | |
| """Normalise whitespace and remove form-feed characters from OCR output.""" | |
| raw = raw.replace("\f", "\n") | |
| raw = re.sub(r"\n{3,}", "\n\n", raw) | |
| return raw.strip() | |
def ocr_image(image_path: str) -> str:
    """Return cleaned English Tesseract text for the image file at *image_path*."""
    return _clean_text(
        pytesseract.image_to_string(preprocess_image(image_path), lang="eng")
    )
def _ocr_pil(img: Image.Image) -> str:
    """
    OCR an in-memory PIL image (the PDF path uses this for rendered pages).

    The image is pre-processed, passed to Tesseract with the English model,
    and the resulting text is cleaned before being returned.
    """
    ready = _preprocess_pil(img)
    raw_text = pytesseract.image_to_string(ready, lang="eng")
    return _clean_text(raw_text)
| # --------------------------------------------------------------------------- | |
| # Text extraction — PDF | |
| # --------------------------------------------------------------------------- | |
# Pages whose directly extracted text is shorter than this are treated as scans.
_MIN_NATIVE_TEXT_CHARS = 20
# Resolution used when rasterising a scanned page for OCR.
_PDF_RENDER_DPI = 200


def _render_pdf_page(page, pdf_path: str) -> Optional[Image.Image]:
    """
    Render one pdfplumber *page* to a PIL image, or return None on failure.

    pdfplumber's own renderer is tried first; if it fails (or yields no
    image), pdf2image is tried for the same page number of *pdf_path*.
    Both failures are logged at WARNING level.
    """
    pil: Optional[Image.Image] = None
    try:
        pil = page.to_image(resolution=_PDF_RENDER_DPI).original
    except Exception as exc:
        logger.warning(
            "PDF page %d: could not render to image (%s). "
            "Install ImageMagick/wand or pdf2image+poppler for scanned PDF support.",
            page.page_number, exc,
        )
    if pil is not None:
        return pil

    try:
        # Imported lazily: pdf2image is an optional dependency.
        from pdf2image import convert_from_path

        imgs = convert_from_path(
            pdf_path,
            dpi=_PDF_RENDER_DPI,
            first_page=page.page_number,
            last_page=page.page_number,
        )
    except Exception as exc:
        # Previously swallowed silently; say why the page cannot be OCR-ed.
        logger.warning(
            "PDF page %d: pdf2image fallback failed (%s); page will not be OCR-ed.",
            page.page_number, exc,
        )
        return None
    return imgs[0] if imgs else None


def extract_text_pdf(pdf_path: str) -> str:
    """
    Extract text from *pdf_path* using pdfplumber.

    Text-native pages: direct character extraction.
    Scanned pages (extracted text < 20 chars): rendered to image and OCR-ed.
    If OCR yields text it replaces the short extracted text; if the page
    cannot be rendered or OCR returns nothing, whatever short text was
    extracted directly is kept rather than discarded.

    Page rendering requires either pdfplumber's image backend or
    pdf2image + poppler; if neither works the page is not OCR-ed and a
    WARNING is logged rather than crashing.

    Returns the per-page texts joined by blank lines; pages that produce no
    text at all are omitted.
    """
    import pdfplumber

    page_texts: list[str] = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = (page.extract_text() or "").strip()
            if len(text) < _MIN_NATIVE_TEXT_CHARS:
                # Probably a scanned page: try image render + OCR.
                pil = _render_pdf_page(page, pdf_path)
                if pil is not None:
                    text = _ocr_pil(pil) or text
            if text:
                page_texts.append(text)
    return "\n\n".join(page_texts).strip()
| # --------------------------------------------------------------------------- | |
| # Public dispatch | |
| # --------------------------------------------------------------------------- | |
def extract_text(file_path: str) -> str:
    """
    Return clean text for *file_path*, choosing the reader by file suffix.

    ``.pdf`` (case-insensitive) goes to extract_text_pdf; any suffix in
    SUPPORTED_IMAGE_EXTS goes to ocr_image.

    Raises:
        ValueError: if the suffix is neither ``.pdf`` nor a supported image
            extension.
    """
    ext = Path(file_path).suffix.lower()
    # Guard clause: reject unknown suffixes before picking a reader.
    if ext != ".pdf" and ext not in SUPPORTED_IMAGE_EXTS:
        raise ValueError(
            f"Unsupported file extension {ext!r}. "
            f"Supported: .pdf, {', '.join(sorted(SUPPORTED_IMAGE_EXTS))}"
        )
    reader = extract_text_pdf if ext == ".pdf" else ocr_image
    return reader(file_path)
def extract_with_entities(
    file_path: str,
) -> tuple[str, list[Entity]]:
    """
    Extract the text of *file_path* together with its regex-detected entities.

    Returns a ``(text, entities)`` pair. The entities use the Entity schema
    (text/label/start/end/confidence), the same one the original docstring
    attributes to EvidenceNER. Per that docstring, these regex spans
    complement the model-based spans from EvidenceNER and DocumentViT, and
    the DocumentProcessor merges all three sources; none of those components
    is visible in this module.
    """
    text = extract_text(file_path)
    return text, _extract_entities_from_text(text)