diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -1,11 +1,16 @@ -"""Smart Parchi OCR Enterprise — Hugging Face Space backend (CPU-first monolith) v6.0. -Upgrade notes (v6.0): - • Primary engine: Qwen2-VL-2B-Instruct via transformers (fp32 CPU, ~8 GB RAM). - • bitsandbytes REMOVED — GPU-only lib; replaced with plain torch CPU build. - • Concurrency guard: asyncio.Semaphore(2) prevents OOM on simultaneous requests. - • Image-hash LRU cache: up to 100 results, 1-hour TTL. - • Explicit gc.collect() after every request to prevent memory bloat. - • EasyOCR + PaddleOCR kept as fallback ensemble (unchanged). +""" +Parchi OCR – Minimal CPU-Optimised FastAPI Backend +==================================================== +• EasyOCR ['ur', 'en'] – single reader, warm on startup +• Pre-processing : CLAHE → denoise → sharpen → adaptive-threshold +• Multi-variant : original + inverted + high-contrast → merge +• Geometry line grouping (Y-centre clustering, no column assumption) +• Number post-processor (O→0, l→1, spaced digits merged) +• Generic item parser (any text+number pattern) +• Bottom-ROI total pass (crop 25 %, digit whitelist re-OCR) +• Lexicon correction (30 common Pakistani grocery/shop items) +• SHA-256 LRU cache (24 h TTL, 500 entries max) +• ZDR compliance (no persistent storage, metadata logs only) """ from __future__ import annotations @@ -14,3982 +19,770 @@ import asyncio import gc import hashlib import io -import itertools -import json import logging import math -import os -import pickle -import random import re -import sys import threading import time -import warnings -from collections import defaultdict -from contextlib import asynccontextmanager, redirect_stdout, redirect_stderr -from dataclasses import dataclass, field -from datetime import datetime -from enum import Enum -from functools import lru_cache, wraps -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union -from typing_extensions import TypedDict - -# 
============================================================================ -# ENVIRONMENT OPTIMIZATION (CRITICAL - SET BEFORE IMPORTS) -# ============================================================================ -# Suppress warnings for cleaner logs -warnings.filterwarnings("ignore") - -# HF CPU stability defaults (can be overridden by Space env variables). -os.environ.setdefault("OMP_NUM_THREADS", "1") - -# Suppress progress bars from external libraries (tqdm, huggingface_hub, etc.) -os.environ["DISABLE_TQDM"] = "1" -os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1" -os.environ["PADDLE_DOWNLOAD_CACHE"] = "/.cache/paddlepaddle" -os.environ["PADDLE_HOME"] = "/.cache/paddlepaddle" - -# PaddleOCR optimizations for CPU -os.environ["PADDLE_INFERENCE_MODEL_CACHE"] = "/.cache/paddlepaddle/models" -os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "0" - -# Suppress library-level logging initially -logging.getLogger("urllib3").setLevel(logging.ERROR) -logging.getLogger("huggingface_hub").setLevel(logging.ERROR) -logging.getLogger("paddleocr").setLevel(logging.CRITICAL) -logging.getLogger("paddlepaddle").setLevel(logging.CRITICAL) - -# ============================================================================ -# CORE DEPENDENCIES -# ============================================================================ +from collections import OrderedDict +from datetime import datetime, timezone +from typing import Any + import cv2 +import easyocr import numpy as np -from PIL import Image, ImageEnhance, ImageFilter -from fastapi import FastAPI, File, HTTPException, UploadFile, BackgroundTasks +import uvicorn +from fastapi import FastAPI, File, HTTPException, UploadFile from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse -from rapidfuzz import process as fuzzy_process -from pydantic import BaseModel, Field, validator -from sklearn.cluster import DBSCAN -from sklearn.metrics.pairwise import cosine_similarity - -# OCR Engines (EasyOCR 
optional — HF builds may omit it; Paddle still works) -try: - # Suppress EasyOCR startup spam - with redirect_stdout(open(os.devnull, 'w')), redirect_stderr(open(os.devnull, 'w')): - import easyocr - EASYOCR_AVAILABLE = True -except Exception as e: - easyocr = None # type: ignore - EASYOCR_AVAILABLE = False - -try: - # Suppress PaddleOCR startup spam - with redirect_stdout(open(os.devnull, 'w')), redirect_stderr(open(os.devnull, 'w')): - from paddleocr import PaddleOCR - PADDLE_AVAILABLE = True -except Exception as e: - PADDLE_AVAILABLE = False - -try: - import pytesseract - TESSERACT_AVAILABLE = True -except Exception: - TESSERACT_AVAILABLE = False - -# Optional: Lightweight ML for handwriting (scikit-learn only, no heavy dependencies) -try: - from sklearn.ensemble import RandomForestClassifier - from sklearn.preprocessing import StandardScaler - SKLEARN_AVAILABLE = True -except Exception: - SKLEARN_AVAILABLE = False - -# ── Torch (CPU-only build) ──────────────────────���───────────────────────────── -try: - import torch - TORCH_AVAILABLE = True -except Exception: - torch = None # type: ignore - TORCH_AVAILABLE = False - -# ── Transformers (for Qwen2-VL VLM) ────────────────────────────────────────── -try: - from transformers import AutoProcessor, Qwen2VLForConditionalGeneration - TRANSFORMERS_AVAILABLE = True -except Exception: - AutoProcessor = None # type: ignore - Qwen2VLForConditionalGeneration = None # type: ignore - TRANSFORMERS_AVAILABLE = False - -# ── Qwen-VL utils (image/video message builder) ─────────────────────────────── -try: - from qwen_vl_utils import process_vision_info - QWEN_VL_UTILS_AVAILABLE = True -except Exception: - process_vision_info = None # type: ignore - QWEN_VL_UTILS_AVAILABLE = False - -# ── psutil for memory monitoring ────────────────────────────────────────────── -try: - import psutil as _psutil - PSUTIL_AVAILABLE = True -except Exception: - _psutil = None # type: ignore - PSUTIL_AVAILABLE = False - - -def _get_rss_mb() -> 
float: - """Return current process RSS in MB (0 if psutil unavailable).""" - if not PSUTIL_AVAILABLE or _psutil is None: - return 0.0 - try: - return _psutil.Process().memory_info().rss / 1024 / 1024 - except Exception: - return 0.0 - +from PIL import Image +from rapidfuzz import process as rfprocess -def _free_memory() -> None: - """Aggressively release Python + PyTorch memory after each request.""" - gc.collect() - # torch.cuda.empty_cache() is a no-op on CPU builds but we call it - # defensively in case someone switches to a GPU Space later. - if TORCH_AVAILABLE and torch is not None: - try: - torch.cuda.empty_cache() - except Exception: - pass - -# ============================================================================ -# LOGGING CONFIGURATION -# ============================================================================ +# --------------------------------------------------------------------------- +# Logging +# --------------------------------------------------------------------------- logging.basicConfig( level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - handlers=[ - logging.StreamHandler() - ] + format="%(asctime)s | %(levelname)s | %(message)s", + datefmt="%H:%M:%S", ) -logger = logging.getLogger(__name__) - -# OCR warm-up lifecycle (exposed in /health for HF debugging) -OCR_WARMUP_STATUS: str = "pending" - -# ============================================================================ -# CONFIGURATION MANAGEMENT -# ============================================================================ - -class SystemConfig: - """Centralized configuration for the entire system""" - - # Image Processing - TARGET_WIDTH = 1200 - TARGET_HEIGHT = 1600 - MIN_QUALITY_THRESHOLD = 0.3 - MAX_IMAGE_SIZE_MB = 10 - - # Preprocessing Parameters - CLAHE_CLIP_LIMIT = 2.5 - CLAHE_TILE_SIZE = (8, 8) - SHARPEN_KERNEL = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]]) - BLUR_KERNEL = (3, 3) - DENOISE_H = 10 - - # OCR Parameters - 
EASYOCR_CONF_THRESH = 0.15 # v5.1: Lowered for robustness - EASYOCR_TEXT_THRESH = 0.25 # v5.1: Lowered for robustness - EASYOCR_LOW_TEXT = 0.35 - PADDLE_CONF_THRESH = 0.5 - TESSERACT_CONF_THRESH = 30 - - # Ensemble Weights - EASYOCR_WEIGHT = 0.30 - PADDLE_WEIGHT = 0.55 # 16GB tier: keep Paddle as primary engine - TESSERACT_WEIGHT = 0.25 - - # v5.2: ADAPTIVE QUALITY-BASED THRESHOLDS - QUALITY_SHARPNESS_EXCELLENT = 0.80 # Excellent: sharpness > 0.80 - QUALITY_SHARPNESS_POOR = 0.45 # Poor: sharpness < 0.45 - AGGRESSIVE_PARSING_THRESHOLD = 0.45 # Enable aggressive parsing if sharpness < this - - # v5.2: Adaptive threshold multipliers (applied based on quality) - ADAPT_THRESH_AGGRESSIVE = 0.10 # Very low quality images - ADAPT_THRESH_NORMAL = 0.15 # Normal quality - ADAPT_THRESH_STRICT = 0.25 # High quality - - # v5.2: Confidence boosting - CONF_BOOST_LEXICON = 0.12 # Boost if item in lexicon - CONF_BOOST_PATTERN = 0.08 # Boost if matches known pattern - CONF_BOOST_MATH = 0.15 # Boost if math validates - CONF_PENALTY_FALLBACK = -0.15 # Penalty for fallback items - - # Price/Quantity Validation - MAX_REASONABLE_PRICE = 50000.0 - MIN_ITEM_PRICE = 5.0 - MAX_QUANTITY = 1000 - TOTAL_VALIDATION_TOLERANCE = 2.0 - - # Performance - MAX_WORKERS = 2 - CACHE_TTL = 3600 # 1 hour (per spec §9) - MAX_CACHE_SIZE = 100 # max 100 cache entries (per spec §7) - MAX_PROCESSING_TIME = 60 # seconds - NAME_ROI_RATIO = 0.22 - FASTAPI_HARD_TIMEOUT_SECONDS = float(os.getenv("FASTAPI_HARD_TIMEOUT_SECONDS", "120")) - - # VLM Configuration (v6.0) - # Model is loaded lazily on first request to avoid blocking startup. - # Set ENABLE_VLM=0 to force EasyOCR-only mode (saves ~8 GB RAM). 
- ENABLE_VLM = os.getenv("ENABLE_VLM", "1").strip().lower() not in ("0", "false", "no", "off") - VLM_MODEL_ID = os.getenv("VLM_MODEL_ID", "Qwen/Qwen2-VL-2B-Instruct") - VLM_MAX_NEW_TOKENS = int(os.getenv("VLM_MAX_NEW_TOKENS", "512")) - VLM_TIMEOUT_SECONDS = float(os.getenv("VLM_TIMEOUT_SECONDS", "60")) - # Peak memory guard: if RSS exceeds this MB after VLM load, disable VLM - VLM_MEMORY_LIMIT_MB = float(os.getenv("VLM_MEMORY_LIMIT_MB", "12000")) # 12 GB - - # Paddle CPU tuning (HF CPU-only) - PADDLE_CPU_THREADS = int(os.getenv("PADDLE_CPU_THREADS", "4")) - PADDLE_USE_MKLDNN = os.getenv("PADDLE_USE_MKLDNN", "1").strip().lower() not in ("0", "false", "no", "off") - # Optional: point to pre-downloaded "server" inference models (directories) - PADDLE_DET_MODEL_DIR = os.getenv("PADDLE_DET_MODEL_DIR", "").strip() - PADDLE_REC_MODEL_DIR = os.getenv("PADDLE_REC_MODEL_DIR", "").strip() - PADDLE_CLS_MODEL_DIR = os.getenv("PADDLE_CLS_MODEL_DIR", "").strip() - PADDLE_OCR_VERSION = os.getenv("PADDLE_OCR_VERSION", "PP-OCRv4").strip() - - # Feature Flags (HF CPU: Paddle + EasyOCR ensemble — set ENABLE_PADDLE=0 to force EasyOCR-only) - ENABLE_PADDLE = os.getenv("ENABLE_PADDLE", "1").strip().lower() not in ("0", "false", "no", "off") - ENABLE_TESSERACT = os.getenv("ENABLE_TESSERACT", "0").strip().lower() in ("1", "true", "yes", "on") - ENABLE_SEMANTIC_CORRECTION = True - ENABLE_INTELLIGENT_TOTAL = True - ENABLE_AGENTIC_LOOP = True - - # Paths (HF Spaces compatible - using /app/data created in Docker) - DATA_ROOT = os.getenv("FEEDBACK_DATA_PATH", "/app/data") - MODEL_CACHE = os.getenv("EASYOCR_CACHE", "/.cache") - - @classmethod - def validate(cls): - """Validate configuration on startup""" - os.makedirs(cls.DATA_ROOT, exist_ok=True) - os.makedirs(cls.MODEL_CACHE, exist_ok=True) - # Torch / HF caches — keeps EasyOCR+Paddle weights off ephemeral-only paths when possible - os.environ.setdefault("TORCH_HOME", cls.MODEL_CACHE) - os.environ.setdefault("PADDLEOCR_HOME", 
os.path.join(cls.MODEL_CACHE, "paddleocr")) - # Thread caps (keep deterministic on small CPU) - os.environ.setdefault("OMP_NUM_THREADS", str(max(1, cls.PADDLE_CPU_THREADS))) - os.environ.setdefault("OPENBLAS_NUM_THREADS", str(max(1, cls.PADDLE_CPU_THREADS))) - os.environ.setdefault("MKL_NUM_THREADS", str(max(1, cls.PADDLE_CPU_THREADS))) - logger.info(f"✓ Data directory: {cls.DATA_ROOT}") - logger.info(f"✓ Model cache: {cls.MODEL_CACHE}") - - @classmethod - def get_adaptive_thresholds(cls, sharpness_score: float) -> Tuple[float, float]: - """v5.2: Calculate adaptive OCR thresholds based on image quality (sharpness)""" - if sharpness_score < cls.QUALITY_SHARPNESS_POOR: # Very poor (< 0.45) - return (cls.ADAPT_THRESH_AGGRESSIVE, cls.ADAPT_THRESH_AGGRESSIVE * 1.2) - elif sharpness_score < 0.60: # Poor (0.45-0.60) - return (cls.EASYOCR_CONF_THRESH * 0.85, cls.EASYOCR_TEXT_THRESH * 0.85) - elif sharpness_score > cls.QUALITY_SHARPNESS_EXCELLENT: # Excellent (> 0.80) - return (cls.ADAPT_THRESH_STRICT, cls.ADAPT_THRESH_STRICT * 1.1) - else: # Normal (0.60-0.80) - return (cls.ADAPT_THRESH_NORMAL, cls.EASYOCR_TEXT_THRESH) - - -# ============================================================================ -# DATA MODELS (Pydantic) -# ============================================================================ - -class ProcessingStatus(str, Enum): - PENDING = "pending" - PROCESSING = "processing" - COMPLETED = "completed" - PARTIAL = "partial" - FAILED = "failed" - - -class EntityType(str, Enum): - CUSTOMER_NAME = "customer_name" - ITEM_NAME = "item_name" - QUANTITY = "quantity" - PRICE = "price" - TOTAL = "total" - UNIT = "unit" - - -class ConfidenceLevel(str, Enum): - HIGH = "high" # >0.85 - MEDIUM = "medium" # 0.65-0.85 - LOW = "low" # 0.45-0.65 - VERY_LOW = "very_low" # <0.45 - - -class BoundingBox(BaseModel): - """Bounding box coordinates for visual grounding""" - x1: float = Field(ge=0, le=1) - y1: float = Field(ge=0, le=1) - x2: float = Field(ge=0, le=1) - y2: float = 
Field(ge=0, le=1) - - @classmethod - def from_cv_bbox(cls, bbox: List, img_shape: Tuple[int, int]) -> 'BoundingBox': - h, w = img_shape[:2] - if isinstance(bbox, list) and len(bbox) >= 4: - x_coords = [p[0] for p in bbox] - y_coords = [p[1] for p in bbox] - return cls( - x1=max(0, min(x_coords)) / w, - y1=max(0, min(y_coords)) / h, - x2=min(w, max(x_coords)) / w, - y2=min(h, max(y_coords)) / h - ) - return cls(x1=0, y1=0, x2=1, y2=1) - - -class ExtractedItem(BaseModel): - """Structured item extraction result""" - name: str = Field(description="Item name (cleaned and normalized)") - quantity: float = Field(default=1.0, gt=0, description="Quantity") - price: float = Field(gt=0, description="Price in PKR") - unit: str = Field(default="pc", description="Unit (kg, g, liter, dozen, pc)") - confidence: float = Field(ge=0, le=1, description="Extraction confidence") - original_text: str = Field(default="", description="Raw OCR text") - bounding_box: Optional[BoundingBox] = None - semantic_match: Optional[str] = None - semantic_score: float = 0.0 - - class Config: - json_schema_extra = { - "example": { - "name": "atta", - "quantity": 2.0, - "price": 200.0, - "unit": "kg", - "confidence": 0.92, - "original_text": "Atta-2 kg 200" - } - } - - -class ProcessingResult(BaseModel): - """Final processing result""" - request_id: str - success: bool - customer_name: Optional[str] = None - items: List[ExtractedItem] = [] - # Mobile compatibility alias for legacy clients expecting item/qty/price rows. - items_list: List[Dict[str, str]] = [] - # Explicit alias some clients read first. - line_items: List[Dict[str, str]] = [] - total_amount: float = 0.0 - # Mobile compatibility alias - total: float = 0.0 - transaction_type: str = "unknown" - # Mobile compatibility alias - type: str = "unknown" - # True when paper "Total" differs from computed (shopkeeper rule). 
- mismatch: bool = False - # Strict client-friendly overall confidence score (0..1) - confidence_score: float = 0.0 - confidence: Dict[str, Union[float, str, bool]] = Field(default_factory=dict) - processing_time_ms: int = 0 - status: ProcessingStatus = ProcessingStatus.PENDING - errors: List[str] = [] - warnings: List[str] = [] - hitl_data: Dict[str, Any] = Field( - default_factory=lambda: {"name_review_required": False, "name_candidates": []} - ) - metadata: Dict[str, Any] = Field(default_factory=dict) - # Mobile compatibility alias - meta: Dict[str, Any] = Field(default_factory=dict) - - class Config: - json_schema_extra = { - "example": { - "request_id": "abc-123", - "success": True, - "customer_name": "Umar", - "items": [], - "total_amount": 950.0, - "transaction_type": "udhaar", - "confidence": {"items": 0.85, "total": 0.95}, - "processing_time_ms": 2345 - } - } - - -# ============================================================================ -# ADVANCED IMAGE PREPROCESSOR (Pass 1) -# ============================================================================ - -class AdvancedImagePreprocessor: - """ - Pass 1: Multi-stage image enhancement for handwritten parchis. - - Stages: - 1. Orientation correction (0/90/180/270) - 2. Perspective correction (document flattening) - 3. Shadow removal (CLAHE) - 4. Noise reduction (Non-local means) - 5. Stroke width normalization (morphological) - 6. Sharpening (unsharp mask) - 7. 
Binarization (adaptive threshold) - """ - - def __init__(self, config: SystemConfig): - self.config = config - - def analyze_image_quality(self, image: np.ndarray) -> Dict[str, float]: - """Analyze image quality metrics""" - if len(image.shape) == 3: - gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) - else: - gray = image - - # Sharpness (Laplacian variance) - sharpness = cv2.Laplacian(gray, cv2.CV_64F).var() - - # Brightness - brightness = np.mean(gray) - - # Contrast (standard deviation) - contrast = gray.std() - - # Noise level - noise = np.std(gray - cv2.GaussianBlur(gray, (5, 5), 0)) - - # Normalized scores (0-1) - sharpness_score = min(1.0, sharpness / 500) - brightness_score = brightness / 255 - contrast_score = min(1.0, contrast / 100) - noise_score = max(0.0, 1.0 - (noise / 50)) - - overall_quality = (sharpness_score + brightness_score + contrast_score + noise_score) / 4 - - return { - "sharpness": round(sharpness_score, 3), - "brightness": round(brightness_score, 3), - "contrast": round(contrast_score, 3), - "noise": round(noise_score, 3), - "overall": round(overall_quality, 3) - } - - def auto_orient(self, image: np.ndarray) -> np.ndarray: - """ - Auto-detect and correct image orientation using edge detection - """ - try: - height, width = image.shape[:2] - if min(height, width) < 100: - return image - - # Convert to grayscale and resize for speed - gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) - small = cv2.resize(gray, (max(400, width//3), max(300, height//3))) - - # Detect edges - edges = cv2.Canny(small, 50, 150) - - # Hough line detection - lines = cv2.HoughLines(edges, 1, np.pi/180, threshold=int(len(small) * 0.3)) - - if lines is not None: - angles = [] - for line in lines[:20]: # Limit to 20 lines - rho, theta = line[0] - angle = theta * 180 / np.pi - 90 - if -45 < angle < 45: - angles.append(angle) - - if angles: - median_angle = np.median(angles) - if abs(median_angle) > 3: - # Rotate image - h, w = image.shape[:2] - center = (w // 2, h // 2) - 
M = cv2.getRotationMatrix2D(center, median_angle, 1.0) - rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE) - return rotated - - return image - except Exception as e: - logger.warning(f"Orientation correction failed: {e}") - return image - - def perspective_correction(self, image: np.ndarray) -> np.ndarray: - """ - Apply perspective correction to flatten warped receipts - """ - try: - gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) - blur = cv2.GaussianBlur(gray, (5, 5), 0) - edges = cv2.Canny(blur, 50, 150) - - # Find contours - contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - - if not contours: - return image - - # Find largest contour - largest_contour = max(contours, key=cv2.contourArea) - - # Approximate polygon - peri = cv2.arcLength(largest_contour, True) - approx = cv2.approxPolyDP(largest_contour, 0.02 * peri, True) - - # If we found 4 corners, apply perspective transform - if len(approx) == 4: - pts = approx.reshape(4, 2) - rect = self._order_points(pts) - warped = self._four_point_transform(image, rect) - return warped - - return image - except Exception as e: - logger.warning(f"Perspective correction failed: {e}") - return image - - def _order_points(self, pts: np.ndarray) -> np.ndarray: - """Order points in clockwise order""" - rect = np.zeros((4, 2), dtype=np.float32) - s = pts.sum(axis=1) - rect[0] = pts[np.argmin(s)] - rect[2] = pts[np.argmax(s)] - diff = np.diff(pts, axis=1) - rect[1] = pts[np.argmin(diff)] - rect[3] = pts[np.argmax(diff)] - return rect - - def _four_point_transform(self, image: np.ndarray, pts: np.ndarray) -> np.ndarray: - """Apply perspective transform""" - (tl, tr, br, bl) = pts - width_a = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2)) - width_b = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2)) - max_width = max(int(width_a), int(width_b)) - height_a = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2)) - height_b = 
np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2)) - max_height = max(int(height_a), int(height_b)) - dst = np.array([[0, 0], [max_width - 1, 0], [max_width - 1, max_height - 1], [0, max_height - 1]], dtype=np.float32) - M = cv2.getPerspectiveTransform(pts, dst) - warped = cv2.warpPerspective(image, M, (max_width, max_height)) - return warped - - def enhance_image(self, rgb: np.ndarray) -> np.ndarray: - """ - Full enhancement pipeline - """ - # 1. Orientation correction - oriented = self.auto_orient(rgb) - - # 2. Perspective correction - perspective = self.perspective_correction(oriented) - - # Convert to grayscale - gray = cv2.cvtColor(perspective, cv2.COLOR_RGB2GRAY) - - # 3. CLAHE for shadow removal and contrast enhancement - clahe = cv2.createCLAHE(clipLimit=self.config.CLAHE_CLIP_LIMIT, tileGridSize=self.config.CLAHE_TILE_SIZE) - clahe_img = clahe.apply(gray) - - # 4. Denoise (Non-local means) - denoised = cv2.fastNlMeansDenoising(clahe_img, h=self.config.DENOISE_H) - - # 5. Sharpen - sharpened = cv2.filter2D(denoised, -1, self.config.SHARPEN_KERNEL) - - # 6. Morphological closing to connect broken strokes - kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) - morphed = cv2.morphologyEx(sharpened, cv2.MORPH_CLOSE, kernel) - - # 7. 
Adaptive thresholding for binarization - binary = cv2.adaptiveThreshold( - morphed, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, - cv2.THRESH_BINARY, 15, 5 - ) - - # Return RGB format for OCR compatibility - return cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB) - - def generate_variants(self, enhanced: np.ndarray) -> List[np.ndarray]: - """Generate multiple processing variants for ensemble""" - variants = [enhanced] - - # Inverted variant (for light text on dark background) - inverted = cv2.bitwise_not(enhanced) - variants.append(inverted) - - # High contrast variant - gray = cv2.cvtColor(enhanced, cv2.COLOR_RGB2GRAY) - _, high_contrast = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - variants.append(cv2.cvtColor(high_contrast, cv2.COLOR_GRAY2RGB)) - - # Morphological variant (thickened strokes) - kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) - dilated = cv2.dilate(gray, kernel, iterations=1) - variants.append(cv2.cvtColor(dilated, cv2.COLOR_GRAY2RGB)) - - return variants[:4] # Limit to 4 variants for speed - - -# ============================================================================ -# MULTI-ENGINE OCR ENSEMBLE (Pass 2) -# ============================================================================ - -class OCRTextBlock(TypedDict): - """Typed dictionary for OCR results""" - text: str - confidence: float - bbox: List[List[float]] - line_number: int - engine: str - - -class MultiEngineOCR: - """ - Pass 2: Multi-engine OCR with intelligent result merging. 
- - Engines: - - EasyOCR (primary, good for Urdu) - - PaddleOCR (fallback, good for numbers) - - Tesseract (secondary fallback) - """ - - def __init__(self, config: SystemConfig): - self.config = config - self._easyocr = None - self._paddle = None - self._initialized = False - self._init_lock = threading.Lock() - # v5.2: Debug metrics for parser robustness - self.easyocr_rows_skipped = 0 - self.paddle_rows_skipped = 0 - - @staticmethod - def _extract_text_conf(entry: Any) -> Tuple[Optional[str], Optional[float]]: - """ - Flexible OCR unpacking: - - [text, conf] - - [[box], (text, conf)] - - [[box], [text, conf]] - """ - try: - # flat pair e.g. [text, conf] - if isinstance(entry, (list, tuple)) and len(entry) == 2: - a, b = entry - if isinstance(a, str): - return a, float(b) if b is not None else None - if isinstance(a, (list, tuple)) and isinstance(b, (list, tuple)) and len(b) >= 2: - return str(b[0]), float(b[1]) - if isinstance(a, (list, tuple)) and isinstance(b, str): - return b, None - # nested fallback: use last tuple/list if looks like (text, conf) - if isinstance(entry, (list, tuple)) and entry: - tail = entry[-1] - if isinstance(tail, (list, tuple)) and len(tail) >= 2: - return str(tail[0]), float(tail[1]) - except Exception: - return None, None - return None, None - - def initialize(self): - """One-time OCR engine init (call from startup — avoids first-request model download stall).""" - with self._init_lock: - if self._initialized: - return - - if EASYOCR_AVAILABLE and easyocr is not None: - logger.info("Initializing EasyOCR (models cached under %s)...", SystemConfig.MODEL_CACHE) - try: - # Suppress EasyOCR model download progress bars - with redirect_stdout(open(os.devnull, 'w')), redirect_stderr(open(os.devnull, 'w')): - self._easyocr = easyocr.Reader( - ["en", "ur"], - gpu=False, - model_storage_directory=SystemConfig.MODEL_CACHE, - verbose=False, - ) - except Exception as exc: - logger.error(f"EasyOCR init failed: {exc}") - self._easyocr = None - 
else: - logger.warning("EasyOCR not available — fallback to Paddle if enabled.") - - if self.config.ENABLE_PADDLE and PADDLE_AVAILABLE: - logger.info("Initializing PaddleOCR (CPU)...") - paddle_kw = { - "lang": "en", - "use_angle_cls": True, - "det_limit_side_len": 960, - # CPU scaling - "use_mkldnn": bool(SystemConfig.PADDLE_USE_MKLDNN), - "cpu_threads": int(max(1, SystemConfig.PADDLE_CPU_THREADS)), - # Prefer modern default models - "ocr_version": SystemConfig.PADDLE_OCR_VERSION or "PP-OCRv4", - } - # Optional server-model dirs (if you provide them in the container cache) - if SystemConfig.PADDLE_DET_MODEL_DIR: - paddle_kw["det_model_dir"] = SystemConfig.PADDLE_DET_MODEL_DIR - if SystemConfig.PADDLE_REC_MODEL_DIR: - paddle_kw["rec_model_dir"] = SystemConfig.PADDLE_REC_MODEL_DIR - if SystemConfig.PADDLE_CLS_MODEL_DIR: - paddle_kw["cls_model_dir"] = SystemConfig.PADDLE_CLS_MODEL_DIR - try: - # Suppress PaddleOCR model download progress bars - with redirect_stdout(open(os.devnull, 'w')), redirect_stderr(open(os.devnull, 'w')): - try: - self._paddle = PaddleOCR(**paddle_kw) - except TypeError: - # Fallback for older PaddleOCR versions: drop newer kwargs - safe_kw = {"lang": "en", "use_angle_cls": True, "det_limit_side_len": 960} - self._paddle = PaddleOCR(**safe_kw) - except AttributeError as exc: - # Seen in some builds: AnalysisConfig missing optimization methods. - # Retry with MKLDNN disabled + smaller kwargs to avoid inference config path. 
- logger.warning("PaddleOCR init attribute error, retrying safe-mode: %s", exc) - safe_kw = {"lang": "en", "use_angle_cls": True, "det_limit_side_len": 960, "cpu_threads": 2} - self._paddle = PaddleOCR(**safe_kw) - except Exception as exc: - logger.error("PaddleOCR init failed (will use EasyOCR only): %s", exc) - self._paddle = None - # Disable paddle for this session - self.config.ENABLE_PADDLE = False - - self._initialized = True - logger.info( - "OCR engines ready | easyocr=%s paddle=%s", - bool(self._easyocr), - bool(self._paddle), - ) - - def run_easyocr(self, image: np.ndarray) -> List[OCRTextBlock]: - """Run EasyOCR and format results""" - results = [] - self.easyocr_rows_skipped = 0 # v5.2: Reset counter per call - if not self._easyocr: - return results - try: - raw_results = self._easyocr.readtext( - image, - detail=1, - paragraph=False, - width_ths=0.5, - ycenter_ths=0.5, - text_threshold=self.config.EASYOCR_TEXT_THRESH, - low_text=self.config.EASYOCR_LOW_TEXT, - link_threshold=0.3, - mag_ratio=1.5 - ) - - for row in raw_results: - try: - bbox = row[0] if isinstance(row, (list, tuple)) and len(row) > 0 else None - text, conf = self._extract_text_conf(row) - if not text or conf is None: - self.easyocr_rows_skipped += 1 # v5.2: Track skipped rows - continue - if conf >= self.config.EASYOCR_CONF_THRESH and text.strip() and bbox: - y_center = (bbox[0][1] + bbox[2][1]) / 2 - results.append( - OCRTextBlock( - text=str(text).strip(), - confidence=float(conf) * self.config.EASYOCR_WEIGHT, - bbox=[[float(p[0]), float(p[1])] for p in bbox], - line_number=int(y_center // 20), - engine="easyocr", - ) - ) - else: - self.easyocr_rows_skipped += 1 # v5.2: Track confidence/threshold failures - except Exception as parse_exc: - logger.warning("EasyOCR line skipped (unpack issue): %s", parse_exc) - self.easyocr_rows_skipped += 1 # v5.2: Track parse exceptions - continue - except Exception as e: - logger.error(f"EasyOCR failed: {e}") - return results - - def run_paddle(self, 
image: np.ndarray) -> List[OCRTextBlock]: - """Run PaddleOCR if available""" - results = [] - self.paddle_rows_skipped = 0 # v5.2: Reset counter per call - if not self.config.ENABLE_PADDLE or not self._paddle: - return results - - try: - # Keep Paddle prediction call API-stable across versions. - # Some runtime builds fail on explicit `cls=` argument. - with redirect_stdout(open(os.devnull, 'w')), redirect_stderr(open(os.devnull, 'w')): - raw_results = self._paddle.ocr(image) - blocks = raw_results if isinstance(raw_results, list) else [raw_results] - for block in blocks: - if not block: - continue - for line in block: - try: - bbox = line[0] if isinstance(line, (list, tuple)) and len(line) > 0 else None - text, conf = self._extract_text_conf(line) - if not bbox or not text or conf is None: - self.paddle_rows_skipped += 1 # v5.2: Track skipped rows - continue - if conf >= self.config.PADDLE_CONF_THRESH and str(text).strip(): - y_center = (float(bbox[0][1]) + float(bbox[2][1])) / 2.0 - results.append( - OCRTextBlock( - text=str(text).strip(), - confidence=float(conf) * self.config.PADDLE_WEIGHT, - bbox=[[float(p[0]), float(p[1])] for p in bbox], - line_number=int(y_center // 20), - engine="paddle", - ) - ) - else: - self.paddle_rows_skipped += 1 # v5.2: Track confidence/threshold failures - except Exception as parse_exc: - logger.warning("Paddle line skipped (unpack issue): %s", parse_exc) - self.paddle_rows_skipped += 1 # v5.2: Track parse exceptions - continue - except Exception as e: - error_str = str(e) - # Check if it's the known compatibility error - if "ConvertPirAttribute2RuntimeAttribute" in error_str or "Unimplemented" in error_str: - logger.warning("PaddleOCR version compatibility issue - disabling for this session: %s", e) - self._paddle = None - self.config.ENABLE_PADDLE = False - else: - logger.error(f"PaddleOCR failed: {e}") - return results - - def run_easyocr_paragraph_lines(self, image: np.ndarray) -> List[str]: - """Extra pass: paragraph mode 
often recovers line breaks lost in token mode.""" - if not self._easyocr: - return [] - try: - paras = self._easyocr.readtext(image, detail=0, paragraph=True) - if isinstance(paras, str): - return [normalize_ocr_text(paras)] if paras.strip() else [] - out: List[str] = [] - for p in paras or []: - t = normalize_ocr_text(str(p).strip()) - if t: - out.append(t) - return out - except Exception as exc: - logger.warning("EasyOCR paragraph pass skipped: %s", exc) - return [] - - @staticmethod - def _bbox_y_range(bb: List[List[float]]) -> Tuple[float, float]: - ys = [float(p[1]) for p in bb] - return min(ys), max(ys) - - @staticmethod - def _bbox_x_range(bb: List[List[float]]) -> Tuple[float, float]: - xs = [float(p[0]) for p in bb] - return min(xs), max(xs) - - @staticmethod - def _vertical_overlap_ratio(bb1: List[List[float]], bb2: List[List[float]]) -> float: - y1a, y1b = MultiEngineOCR._bbox_y_range(bb1) - y2a, y2b = MultiEngineOCR._bbox_y_range(bb2) - inter = max(0.0, min(y1b, y2b) - max(y1a, y2a)) - h = max(min(y1b - y1a, y2b - y2a), 1.0) - return inter / h - - @staticmethod - def _horizontal_near(bb1: List[List[float]], bb2: List[List[float]], gap_tol: float = 48.0) -> bool: - x1a, x1b = MultiEngineOCR._bbox_x_range(bb1) - x2a, x2b = MultiEngineOCR._bbox_x_range(bb2) - return not (x1b < x2a - gap_tol or x2b < x1a - gap_tol) - - @staticmethod - def _fuse_digit_hints(primary: str, hints: List[str]) -> str: - """Keep Paddle layout; borrow digit shapes from EasyOCR overlaps (0 vs O, 1 vs l).""" - if not hints: - return primary - hint_join = " ".join(hints) - t = primary.replace("O", "0").replace("o", "0").replace("l", "1").replace("I", "1") - hj = hint_join.replace("O", "0").replace("o", "0").replace("l", "1").replace("I", "1") - # If digit-only skeletons match length, prefer hint digit run for numeric tokens - def digits(s: str) -> str: - return re.sub(r"\D", "", s) - - if digits(hint_join) and digits(primary) and abs(len(digits(hint_join)) - len(digits(primary))) <= 
1: - if digits(hint_join) != digits(primary) and len(digits(hint_join)) >= len(digits(primary)) - 1: - return hint_join.strip() if len(hint_join) <= len(primary) + 6 else t - return t - - def fuse_confidence_spatial_layout( - self, paddle_blocks: List[OCRTextBlock], easy_blocks: List[OCRTextBlock] - ) -> List[OCRTextBlock]: - """ - Confidence winner: Paddle supplies reading order + boxes; EasyOCR refines digit glyphs on overlaps. - """ - if not paddle_blocks: - return self.merge_results([easy_blocks, []]) if easy_blocks else [] - if not easy_blocks: - return list(paddle_blocks) - - def yc(b: OCRTextBlock) -> float: - bb = b["bbox"] - return (float(bb[0][1]) + float(bb[2][1])) / 2.0 - - def xc(b: OCRTextBlock) -> float: - return MultiEngineOCR._bbox_x_range(b["bbox"])[0] - - fused: List[OCRTextBlock] = [] - for pb in sorted(paddle_blocks, key=lambda b: (yc(b), xc(b))): - hints = [ - eb["text"] - for eb in easy_blocks - if self._vertical_overlap_ratio(pb["bbox"], eb["bbox"]) >= 0.32 - and self._horizontal_near(pb["bbox"], eb["bbox"]) - ] - txt = self._fuse_digit_hints(pb["text"], hints).strip() - # Numeric-aware confidence: Paddle tends to be more reliable on digits/prices. 
- is_numeric = bool(re.fullmatch(r"[\d\.,]+", re.sub(r"\s+", "", txt))) - conf_boost = 1.10 if is_numeric else 1.04 - fused.append( - OCRTextBlock( - text=txt or pb["text"].strip(), - confidence=min(1.0, float(pb["confidence"]) * conf_boost), - bbox=pb["bbox"], - line_number=pb["line_number"], - engine="fused_paddle_easy", +log = logging.getLogger("parchi-ocr") + +# --------------------------------------------------------------------------- +# Config (centralised – change here, takes effect everywhere) +# --------------------------------------------------------------------------- +class Config: + TARGET_WIDTH = 1200 # px – resize before OCR if larger + MAX_IMAGE_SIZE_MB = 20 + MIN_ITEM_PRICE = 1 # ignore prices below this (noise) + MAX_ITEM_PRICE = 100_000 # ignore prices above this (noise) + MAX_ITEM_QTY = 1_000 + NAME_ROI_RATIO = 0.15 + TOTAL_ROI_RATIO = 0.25 + +# --------------------------------------------------------------------------- +# Lexicon (two-tier: fast dict lookup THEN rapidfuzz fallback) +# --------------------------------------------------------------------------- + +# Tier-1: explicit variant → canonical (O(1), zero false-positives) +ITEM_CORRECTIONS: dict[str, str] = { + # atta / flour + "aata": "atta", "arta": "atta", "ata": "atta", "flour": "atta", + # cheeni / sugar + "chini": "cheeni", "sugar": "cheeni", "cheeny": "cheeni", "cheni": "cheeni", + # chawal / rice + "rice": "chawal", + # daal / lentils + "dal": "daal", "lentils": "daal", "lentil": "daal", + # ghee / oil + "desi ghee": "ghee", "tel": "oil", + # doodh / milk + "milk": "doodh", "dudh": "doodh", + # spices + "salt": "namak", "chili": "mirch", "turmeric": "haldi", "cumin": "zeera", + # personal care + "soap": "sabun", "sabon": "sabun", + # snacks / bakery + "buger": "burger", "bubiger": "burger", "buggar": "burger", + "bisconni": "biscuit", "double roti": "bread", + # eggs + "anday": "anda", "egg": "anda", "eggs": "anda", + # dairy + "yogurt": "dahi", "butter": "makhan", + # 
vegetables + "potato": "aloo", "onion": "pyaz", "tomato": "tamatar", + "meat": "gosht", "chicken": "murgi", +} + +# Tier-2: flat list for rapidfuzz similarity (conservative fallback) +LEXICON: list[str] = list(set(ITEM_CORRECTIONS.values())) + [ + "sooji", "besan", "makai", "dhaniya", "chai", "paneer", + "sabzi", "roti", "cream", "shampoo", "hammam", +] + +# keywords that signal totals / transaction type +TOTAL_KW = re.compile(r"(total|ٹوٹل|کل|jama|جمع|sum|amount)", re.I) +UDHAAR_KW = re.compile(r"(udhaar|ادھار|credit|baaki|باقی)", re.I) +WASOOLI_KW = re.compile(r"(wasooli|وصولی|received|payment|paid)", re.I) +CASH_KW = re.compile(r"(cash|نقد|naqd)", re.I) + +# Urdu digit map +URDU_DIGITS = str.maketrans("۰۱۲۳۴۵۶۷۸۹", "0123456789") + +# --------------------------------------------------------------------------- +# LRU Cache (SHA-256 keyed, 24 h TTL, 500 entries) +# --------------------------------------------------------------------------- +_CACHE_MAX = 500 +_CACHE_TTL_S = 86_400 # 24 h + +class _LRUCache: + def __init__(self, maxsize: int = _CACHE_MAX): + self._store: OrderedDict[str, tuple[float, Any]] = OrderedDict() + self._max = maxsize + + def _evict_expired(self): + now = time.monotonic() + stale = [k for k, (ts, _) in self._store.items() if now - ts > _CACHE_TTL_S] + for k in stale: + del self._store[k] + + def get(self, key: str) -> Any | None: + self._evict_expired() + if key not in self._store: + return None + self._store.move_to_end(key) + return self._store[key][1] + + def set(self, key: str, value: Any): + self._evict_expired() + if key in self._store: + self._store.move_to_end(key) + self._store[key] = (time.monotonic(), value) + if len(self._store) > self._max: + self._store.popitem(last=False) + +_cache = _LRUCache() + +# --------------------------------------------------------------------------- +# EasyOCR Reader (lazy singleton, warm on first request) +# --------------------------------------------------------------------------- +_reader: 
easyocr.Reader | None = None +_reader_lock = threading.Lock() # double-checked locking for thread safety + +def get_reader() -> easyocr.Reader: + global _reader + if _reader is None: + with _reader_lock: + if _reader is None: # re-check after acquiring lock + log.info("Initialising EasyOCR reader [ur, en] …") + _reader = easyocr.Reader( + ["ur", "en"], + gpu=False, + model_storage_directory="/tmp/easyocr_models", + download_enabled=True, + verbose=False, ) - ) - # Easy-only tokens (Paddle missed) - for eb in easy_blocks: - if not any( - self._vertical_overlap_ratio(pb["bbox"], eb["bbox"]) >= 0.22 - and self._horizontal_near(pb["bbox"], eb["bbox"]) - for pb in paddle_blocks - ): - fused.append(eb) - return sorted(fused, key=lambda b: (yc(b), xc(b))) - - def merge_results(self, results_list: List[List[OCRTextBlock]]) -> List[OCRTextBlock]: - """Merge results from multiple engines using spatial clustering""" - all_results = [] - for results in results_list: - all_results.extend(results) - - if not all_results: - return [] - - # Group by spatial proximity - groups = self._cluster_by_position(all_results) - - merged = [] - for group in groups: - if len(group) == 1: - merged.append(group[0]) - else: - merged.append(self._merge_group(group)) - - merged.sort(key=lambda x: x['line_number']) - return merged - - def _cluster_by_position(self, results: List[OCRTextBlock], threshold: float = 40.0) -> List[List[OCRTextBlock]]: - """Group OCR results by vertical position""" - if not results: - return [] - - sorted_results = sorted(results, key=lambda x: x['line_number']) - groups = [] - current_group = [sorted_results[0]] - - for result in sorted_results[1:]: - if abs(result['line_number'] - current_group[-1]['line_number']) <= threshold / 20: - current_group.append(result) - else: - groups.append(current_group) - current_group = [result] - - if current_group: - groups.append(current_group) - - return groups - - def _merge_group(self, group: List[OCRTextBlock]) -> OCRTextBlock: - 
"""Merge multiple detections of the same text""" - group.sort(key=lambda x: x['confidence'], reverse=True) - best = group[0] - - # Find consensus text - texts = [g['text'] for g in group] - consensus = max(set(texts), key=texts.count) if texts else best['text'] - - # Average confidence - avg_conf = sum(g['confidence'] for g in group) / len(group) - - return OCRTextBlock( - text=consensus, - confidence=min(1.0, avg_conf * 1.1), # Boost merged confidence - bbox=best['bbox'], - line_number=best['line_number'], - engine="merged" - ) - - def extract_text_lines(self, image: np.ndarray) -> Tuple[List[str], List[OCRTextBlock]]: - """Extract text lines from image using all available engines""" - self.initialize() - - easyocr_results = self.run_easyocr(image) - paddle_results = self.run_paddle(image) - if paddle_results and easyocr_results: - merged = self.fuse_confidence_spatial_layout(paddle_results, easyocr_results) - elif paddle_results: - merged = list(paddle_results) - else: - merged = self.merge_results([easyocr_results, paddle_results]) - - # Group tokens by true vertical center (avoids collapsing many lines when line_number buckets match). 
- def y_center(rb: OCRTextBlock) -> float: - bb = rb["bbox"] - if not bb or len(bb) < 3: - return float(rb["line_number"] * 20) - return (float(bb[0][1]) + float(bb[2][1])) / 2.0 - - lines: List[str] = [] - if merged: - sorted_blocks = sorted(merged, key=y_center) - row_tol = 28.0 - current_line: List[str] = [] - current_y: Optional[float] = None - for result in sorted_blocks: - yc = y_center(result) - if current_y is None or abs(yc - current_y) <= row_tol: - current_line.append(result["text"]) - if current_y is None: - current_y = yc - else: - current_y = (current_y * (len(current_line) - 1) + yc) / len(current_line) - else: - if current_line: - lines.append(" ".join(current_line)) - current_line = [result["text"]] - current_y = yc - if current_line: - lines.append(" ".join(current_line)) - - # Supplement with paragraph-mode lines when token merge is thin (common on synthetic / dense slips). - if len(lines) < 3: - seen_norm = {norm(x) for x in lines} - for pl in self.run_easyocr_paragraph_lines(image): - key = norm(pl) - if len(key) > 2 and key not in seen_norm: - lines.append(pl) - seen_norm.add(key) - - return lines, merged - - def extract_top_roi_name_lines(self, image: np.ndarray, roi_ratio: float = 0.22) -> List[str]: - """ - Name-focused micro-pass: - Run OCR only on top ROI with a couple of variants so customer name - has a dedicated extraction path independent from item parsing. - """ - self.initialize() - if image is None or not isinstance(image, np.ndarray) or image.size == 0: - return [] - try: - h, _w = image.shape[:2] - cut = max(40, int(h * max(0.10, min(roi_ratio, 0.35)))) - top = image[:cut, :].copy() - if top.size == 0: - return [] - variants: List[np.ndarray] = [top] - try: - gray = cv2.cvtColor(top, cv2.COLOR_RGB2GRAY) - variants.append(cv2.cvtColor(cv2.bitwise_not(gray), cv2.COLOR_GRAY2RGB)) - # Slightly contrast-boosted for weak handwriting. 
- clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(6, 6)) - variants.append(cv2.cvtColor(clahe.apply(gray), cv2.COLOR_GRAY2RGB)) - except Exception: - pass - - out: List[str] = [] - seen: set[str] = set() - for var_img in variants[:3]: - blocks = self.run_paddle(var_img) + self.run_easyocr(var_img) - for b in blocks: - t = normalize_ocr_text(str(b.get("text", "")).strip()) - k = norm(t) - if len(k) >= 2 and k not in seen: - out.append(t) - seen.add(k) - return out - except Exception as exc: - logger.warning("Top ROI name micro-pass failed: %s", exc) - return [] - - def extract_bottom_roi_total_lines(self, image: np.ndarray, roi_ratio: float = 0.38) -> List[str]: - """ - Fast total-focused micro-pass: - Only OCR bottom ROI to recover totals/paid/balance quickly when math mismatch is detected. - """ - self.initialize() - if image is None or not isinstance(image, np.ndarray) or image.size == 0: - return [] - try: - h, _w = image.shape[:2] - cut = max(80, int(h * max(0.20, min(roi_ratio, 0.55)))) - bot = image[max(0, h - cut) :, :].copy() - if bot.size == 0: - return [] - variants: List[np.ndarray] = [bot] - try: - gray = cv2.cvtColor(bot, cv2.COLOR_RGB2GRAY) - # high-contrast + invert catch faint pencil totals - _, th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - variants.append(cv2.cvtColor(th, cv2.COLOR_GRAY2RGB)) - variants.append(cv2.cvtColor(cv2.bitwise_not(th), cv2.COLOR_GRAY2RGB)) - except Exception: - pass - - out: List[str] = [] - seen: set[str] = set() - for var_img in variants[:3]: - # totals are numeric-heavy: prefer paddle first - blocks = self.run_paddle(var_img) + self.run_easyocr(var_img) - for b in blocks: - t = normalize_ocr_text(str(b.get("text", "")).strip()) - k = norm(t) - if len(k) >= 2 and k not in seen: - out.append(t) - seen.add(k) - return out - except Exception as exc: - logger.warning("Bottom ROI total micro-pass failed: %s", exc) - return [] - - -# 
============================================================================ -# UTILITY FUNCTIONS (v5.1 - Geometry & Context-Aware Parsing) -# ============================================================================ - -def norm(s: str) -> str: - """Lowercase collapsed key for dedupe / dict keys.""" - return re.sub(r"\s+", " ", (s or "").strip().lower()) + log.info("EasyOCR ready.") + return _reader + +# --------------------------------------------------------------------------- +# Image preprocessing helpers +# --------------------------------------------------------------------------- + +def _pil_to_bgr(img: Image.Image) -> np.ndarray: + return cv2.cvtColor(np.array(img.convert("RGB")), cv2.COLOR_RGB2BGR) + + +def _auto_rotate(bgr: np.ndarray) -> np.ndarray: + """Deskew via Hough lines – very cheap on CPU.""" + gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY) + edges = cv2.Canny(gray, 50, 150, apertureSize=3) + lines = cv2.HoughLinesP(edges, 1, math.pi / 180, 80, minLineLength=60, maxLineGap=10) + if lines is None: + return bgr + angles = [] + for x1, y1, x2, y2 in lines[:, 0]: + if x2 != x1: + angles.append(math.degrees(math.atan2(y2 - y1, x2 - x1))) + if not angles: + return bgr + median_angle = float(np.median(angles)) + if abs(median_angle) < 0.5 or abs(median_angle) > 45: + return bgr + h, w = bgr.shape[:2] + M = cv2.getRotationMatrix2D((w / 2, h / 2), median_angle, 1.0) + return cv2.warpAffine(bgr, M, (w, h), flags=cv2.INTER_CUBIC, + borderMode=cv2.BORDER_REPLICATE) + + +def _clahe_sharpen(bgr: np.ndarray) -> np.ndarray: + """CLAHE → denoise → sharpen.""" + lab = cv2.cvtColor(bgr, cv2.COLOR_BGR2LAB) + l, a, b = cv2.split(lab) + clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)) + l = clahe.apply(l) + lab = cv2.merge([l, a, b]) + bgr = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR) + bgr = cv2.fastNlMeansDenoisingColored(bgr, None, 7, 7, 7, 21) + kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]], dtype=np.float32) + return cv2.filter2D(bgr, -1, kernel) + 
+ +def _adaptive_thresh(bgr: np.ndarray) -> np.ndarray: + gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY) + return cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, 31, 10) + + +def build_variants(bgr: np.ndarray) -> list[np.ndarray]: + """Return list of OCR-ready image variants.""" + enhanced = _clahe_sharpen(bgr) + variants = [ + enhanced, # enhanced colour + cv2.bitwise_not(enhanced), # inverted + _adaptive_thresh(enhanced), # binary + cv2.convertScaleAbs(enhanced, alpha=1.5, beta=20), # high-contrast + ] + return variants +# --------------------------------------------------------------------------- +# OCR helpers +# --------------------------------------------------------------------------- -def sliding_split_multi_price_line(line: str) -> List[str]: - """ - Split a single OCR line that glued multiple rows (e.g. 'cheeni 2 200 atta 1 500'). - Handles [Name] [sep] [Qty] [sep] [Price] with - * : etc. - """ - line = normalize_ocr_text(line) - if not line.strip(): +def _run_ocr(reader: easyocr.Reader, img: np.ndarray) -> list[dict]: + """Run EasyOCR on one image variant; return list of result dicts.""" + try: + raw = reader.readtext( + img, + text_threshold=0.15, + low_text=0.10, + link_threshold=0.25, + mag_ratio=1.5, + slope_ths=0.3, + ycenter_ths=0.8, + height_ths=0.7, + width_ths=0.9, + decoder="greedy", + beamWidth=3, + ) + except Exception as exc: + log.warning("OCR variant failed: %s", exc) return [] - out: List[str] = [] - # Primary: repeated Name sep Qty sep Price - rx_row = re.compile( - r"(?i)([a-zA-Zء-ي][a-zA-Zء-ي\s]{0,42}?)\s*[\-\*–—:#]+\s*(\d+(?:\.\d+)?)\s*[\-\*–—:#]+\s*(\d+(?:\.\d+)?)" - ) - pos = 0 - for m in rx_row.finditer(line): - if m.start() > pos: - mid = line[pos : m.start()].strip(" -*,:") - if mid and re.search(r"[A-Za-zء-ي]", mid): - out.append(mid) - out.append(m.group(0).strip()) - pos = m.end() - if pos < len(line): - tail = line[pos:].strip(" -*,:") - if tail and re.search(r"[A-Za-zء-ي\d]", tail): - 
out.append(tail) - if out: - return [x for x in out if x.strip()] - - # Sliding fallback: pair consecutive numbers as qty+price with preceding text - nums = list(re.finditer(r"(?= 2 else [line] + results = [] + for (bbox, text, conf) in raw: + if not text.strip(): + continue + xs = [p[0] for p in bbox] + ys = [p[1] for p in bbox] + results.append({ + "text": text.strip(), + "conf": float(conf), + "x1": min(xs), "y1": min(ys), + "x2": max(xs), "y2": max(ys), + "yc": (min(ys) + max(ys)) / 2, + }) + return results -def normalize_ocr_text(text: str) -> str: +def merge_ocr_results(all_results: list[list[dict]]) -> list[dict]: """ - Context-aware cleaning for Pakistani retail OCR. - Handles common Urdu-English mixing and noise. + Merge results from multiple variants. + For overlapping detections keep highest-confidence. """ - # Replace common Urdu lookalikes of numbers - text = text.replace('۰', '0').replace('۱', '1').replace('۲', '2') - text = text.replace('۳', '3').replace('۴', '4').replace('۵', '5') - text = text.replace('۶', '6').replace('۷', '7').replace('۸', '8') - text = text.replace('۹', '9') - - # Normalize common dashes - text = re.sub(r'[–—−]', '-', text) - - # Fix 'item-quantity price' pattern (e.g., 'Atta-2 500') - text = re.sub(r'([a-zA-Zء-ي]+)-(\d+)', r'\1 \2', text) - - # Normalize multiple spaces - text = re.sub(r'\s+', ' ', text).strip() - + if not all_results: + return [] + merged: list[dict] = [] + for variant_res in all_results: + for det in variant_res: + # Check overlap with any already-merged detection + duplicate = False + for existing in merged: + iou = _bbox_iou(det, existing) + if iou > 0.40: + if det["conf"] > existing["conf"]: + existing.update(det) + duplicate = True + break + if not duplicate: + merged.append(det.copy()) + return merged + + +def _bbox_iou(a: dict, b: dict) -> float: + ix1 = max(a["x1"], b["x1"]) + iy1 = max(a["y1"], b["y1"]) + ix2 = min(a["x2"], b["x2"]) + iy2 = min(a["y2"], b["y2"]) + inter = max(0, ix2 - ix1) * max(0, 
iy2 - iy1) + if inter == 0: + return 0.0 + ua = (a["x2"] - a["x1"]) * (a["y2"] - a["y1"]) + ub = (b["x2"] - b["x1"]) * (b["y2"] - b["y1"]) + return inter / (ua + ub - inter + 1e-9) + +# --------------------------------------------------------------------------- +# Text normalisation & number extraction +# --------------------------------------------------------------------------- + +def normalise_text(text: str) -> str: + """Translate Urdu digits, common OCR confusions, clean whitespace.""" + text = text.translate(URDU_DIGITS) + text = text.replace("O", "0").replace("o", "0").replace("l", "1") \ + .replace("I", "1").replace("S", "5").replace("Z", "2") \ + .replace("B", "8").replace("G", "9").replace("g", "9") + text = re.sub(r"\s+", " ", text).strip() return text -def extract_numbers(text: str) -> List[float]: - """ - Extract all numbers from text, handling decimals and common formats. - """ - numbers = [] - pattern = r'\b(\d{1,5}(?:\.\d{1,2})?|\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?)' # Handles 100, 1.5, 1,000.50 - for match in re.finditer(pattern, text): +_NUM_RE = re.compile(r"(\d[\d,./]*\d|\d)") + +def extract_numbers(text: str) -> list[float]: + """Return all numeric values found in text.""" + nums = [] + for m in _NUM_RE.finditer(text): + raw = m.group().replace(",", "") try: - num = float(match.group(1).replace(',', '')) - if 0 < num < 999999: # Reasonable range - numbers.append(num) + nums.append(float(raw)) except ValueError: pass - return numbers - + return nums -def parse_geometry_line(line: str) -> Optional[Dict[str, Any]]: - """ - Universal line parser (Geometry-Based). - Parses: [Text/Mixed] [Separator] [Numbers] - Separators: -, :, multiple spaces - - Returns: {text, qty, price, confidence} - """ - line = normalize_ocr_text(line) - - # Try pattern: "item - qty - price", "item * qty * price", "item: qty price", etc. 
- patterns = [ - r"^([a-zA-Zء-ي\s]+?)\s*[\-\*–—:]+\s*(\d+(?:\.\d+)?)\s*[\-\*–—:]+\s*(\d+(?:\.\d+)?)", # item sep qty sep price - r"^([a-zA-Zء-ي\s]+?)\s*[-:]\s*(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)", # item-qty price (legacy) - r"^([a-zA-Zء-ي\s]+?)\s+(\d+(?:\.\d+)?)\s*kg\s+(\d+(?:\.\d+)?)", # item qty kg price - r"^([a-zA-Zء-ي\s]+?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)", # item qty price (generic) - ] - - for pattern in patterns: - match = re.match(pattern, line, re.IGNORECASE) - if match: - item_text = match.group(1).strip() - try: - if len(match.groups()) == 3: - qty = float(match.group(2)) - price = float(match.group(3)) - return { - "text": item_text, - "quantity": qty, - "price": price, - "confidence": 0.72, - "source": "geometry" - } - except (ValueError, IndexError): - pass - - return None - -def subset_sum_match(numbers: List[float], target: float, tolerance: float = 2.0) -> Tuple[List[float], bool]: - """ - Find subset of numbers that sum to target (for total anchoring). - Uses greedy + exhaustive search for small lists. 
- - Returns: (matching_numbers, is_exact_match) - """ - if not numbers: - return [], False - - # Sort descending for greedy approach - sorted_nums = sorted(numbers, reverse=True) - current_sum = 0 - matched = [] - - # Greedy approach first - for num in sorted_nums: - if abs(current_sum + num - target) <= abs(current_sum - target): - current_sum += num - matched.append(num) - if abs(current_sum - target) <= tolerance: - return matched, abs(current_sum - target) < 0.01 - - # If greedy fails, try combinations (for small lists) - if len(numbers) <= 10: - for r in range(len(numbers), 0, -1): - for combo in itertools.combinations(numbers, r): - if abs(sum(combo) - target) <= tolerance: - return list(combo), abs(sum(combo) - target) < 0.01 - - return [], False - - -# ============================================================================ -# v5.2: ENHANCED PARSING & CONFIDENCE RECONSTRUCTION -# ============================================================================ - -def parse_aggressive_patterns(line: str) -> Optional[Dict[str, Any]]: +def merge_spaced_digits(tokens: list[str]) -> list[str]: """ - v5.2: Aggressive pattern matching for messy/poor quality images. - Supports more flexible formats and Urdu variants. - EXTREMELY LENIENT - if line has text + numbers, treat as potential item. + ['3', '0', '0'] → ['300'] + ['2', '1'] → ['21'] + Single digit tokens adjacent to other single digit tokens get merged. 
""" - line = normalize_ocr_text(line) - if len(line.strip()) < 2: - return None - - # v5.2: ULTRA-LENIENT patterns - match almost anything with numbers - # These patterns are EXTREMELY flexible to catch all variations - patterns = [ - # Pattern 1: "item - qty - price" / "item * qty * price" - r'([a-zA-Zء-ي\s]+?)\s*[\-\*–—:]+\s*(\d+(?:\.\d+)?)\s*[\-\*–—:]+\s*(\d+(?:\.\d+)?)', - # Pattern 1b: "item - qty price" or "item - price" - r'([a-zA-Zء-ي\s]+?)\s*[-:–—]\s*(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)', - # Pattern 2: "item qty unit price" (with optional units) - r'([a-zA-Zء-ي\s]+?)\s+(\d+(?:\.\d+)?)\s*(?:kg|g|liter|litre|ml|dozen|dz|pcs|pc|pkt|packet)?\s*(\d+(?:\.\d+)?)', - # Pattern 3: "item price" (qty=1 assumed) - VERY LENIENT - r'([a-zA-Zء-ي\s]{2,})\s+(\d+(?:\.\d+)?)', - # Pattern 4: "item qty: price" or similar - r'([a-zA-Zء-ي\s]+?)\s+(\d+(?:\.\d+)?)\s*[:–—]\s*(\d+(?:\.\d+)?)', - # Pattern 5: "item qty-price" (hyphenated with floats) - r'([a-zA-Zء-ي\s]+?)\s+(\d+(?:\.\d+)?)[-–]\s*(\d+(?:\.\d+)?)', - ] - - for pattern_idx, pattern in enumerate(patterns): - # Use search instead of match to find pattern anywhere in line - match = re.search(pattern, line, re.IGNORECASE) - if match: - item_text = match.group(1).strip() - if not item_text or len(item_text) < 2: - continue - - try: - groups = match.groups() - if len(groups) >= 2: - if len(groups) == 2: - price = float(groups[1]) - qty = 1.0 - else: - qty_str = str(groups[1]).replace("۲", "2").replace("۱", "1").replace("۳", "3").replace("۴", "4").replace("۵", "5") - qty = float(qty_str) - price = float(groups[2]) - - # LENIENT validation - allow wider range - if 0.1 <= qty <= 200 and 1 <= price <= 99999: - return { - "text": item_text, - "quantity": qty, - "price": price, - "confidence": 0.60 + (0.08 if pattern_idx < 2 else 0.02), - "source": f"aggressive_p{pattern_idx+1}" - } - except (ValueError, IndexError) as e: + out: list[str] = [] + i = 0 + while i < len(tokens): + tok = tokens[i] + if re.fullmatch(r"\d", tok): + # 
collect consecutive single digits + group = [tok] + j = i + 1 + while j < len(tokens) and re.fullmatch(r"\d", tokens[j]): + group.append(tokens[j]) + j += 1 + if len(group) > 1: + out.append("".join(group)) + i = j continue - - return None - - -def reconstruct_items_from_total(total: float, possible_prices: List[float]) -> List[Tuple[float, float]]: - """ - v5.2: Reconstruct likely items from reported total. - Used when item extraction fails but total is present. - Returns: [(qty, price), ...] - """ - if not possible_prices or total <= 0: + out.append(tok) + i += 1 + return out + +# --------------------------------------------------------------------------- +# Lexicon correction (fuzzy, conservative) +# --------------------------------------------------------------------------- + +def lexicon_correct(word: str) -> str: + """Two-tier correction: dict lookup first, then conservative rapidfuzz.""" + if len(word) < 2 or word.isdigit(): + return word + w = word.lower().strip() + # Tier-1: exact dict match (fastest, zero false-positives) + if w in ITEM_CORRECTIONS: + return ITEM_CORRECTIONS[w] + # Tier-2: fuzzy similarity (score ≥ 85, length diff ≤ 3) + match = rfprocess.extractOne(w, LEXICON, score_cutoff=85) + if match is None: + return w + best_word, _score, _ = match + if abs(len(best_word) - len(w)) > 3: + return w + return best_word + +# --------------------------------------------------------------------------- +# Geometry: group detections into lines +# --------------------------------------------------------------------------- + +def group_into_lines(dets: list[dict], gap_factor: float = 0.6) -> list[list[dict]]: + """ + Cluster detections by Y-centre into horizontal lines. + gap_factor: fraction of median text height that counts as new line. 
+ """ + if not dets: return [] - - # Find combination of prices that sum to total - reconstructed = [] - remaining = total - prices_copy = sorted(possible_prices, reverse=True) - - for price in prices_copy: - if remaining >= price and price >= 1: - # Estimate qty - qty = max(1.0, round(remaining / price)) - if qty <= 20: # Reasonable qty - reconstructed.append((qty, price)) - remaining -= qty * price - if abs(remaining) < 2: # Close enough - break - - return reconstructed - + heights = [d["y2"] - d["y1"] for d in dets if d["y2"] > d["y1"]] + median_h = float(np.median(heights)) if heights else 20.0 + threshold = median_h * gap_factor + + sorted_dets = sorted(dets, key=lambda d: d["yc"]) + lines: list[list[dict]] = [] + current_line: list[dict] = [sorted_dets[0]] + ref_yc = sorted_dets[0]["yc"] + + for det in sorted_dets[1:]: + if abs(det["yc"] - ref_yc) <= threshold: + current_line.append(det) + else: + lines.append(sorted(current_line, key=lambda d: d["x1"])) + current_line = [det] + ref_yc = det["yc"] + lines.append(sorted(current_line, key=lambda d: d["x1"])) + return lines -def boost_confidence(base_confidence: float, item_name: str, price: float, - in_lexicon: bool, math_validated: bool, sharpness: float) -> float: - """ - v5.2: Intelligent confidence boosting based on multiple factors. 
- """ - boosted = base_confidence - - # Boost if in lexicon - if in_lexicon: - boosted += SystemConfig.CONF_BOOST_LEXICON - - # Boost if matches known patterns (common items) - common_items = ['atta', 'chai', 'milk', 'doodh', 'chay', 'roghan', 'namak', 'chini'] - if item_name.lower() in common_items: - boosted += SystemConfig.CONF_BOOST_PATTERN - - # Boost if math validates - if math_validated: - boosted += SystemConfig.CONF_BOOST_MATH - - # Quality-based adjustment - if sharpness > SystemConfig.QUALITY_SHARPNESS_EXCELLENT: - boosted += 0.05 # High quality boost - elif sharpness < SystemConfig.QUALITY_SHARPNESS_POOR: - boosted *= 0.95 # Low quality penalty - - return min(0.98, max(0.35, boosted)) # Clamp between 0.35-0.98 - - -# ============================================================================ -# URDU TEXT NORMALIZER -# ============================================================================ - -class UrduTextNormalizer: - """ - Specialized normalizer for Urdu text handling. - Handles Nastaliq script complexities, diacritics, and Roman Urdu. 
- """ - - # Roman Urdu to Urdu mapping (common patterns) - ROMAN_TO_URDU = { - 'a': 'ا', 'b': 'ب', 'p': 'پ', 't': 'ت', 's': 'س', - 'j': 'ج', 'ch': 'چ', 'h': 'ح', 'kh': 'خ', 'd': 'د', - 'z': 'ز', 'r': 'ر', 'sh': 'ش', 'gh': 'غ', 'f': 'ف', - 'q': 'ق', 'k': 'ک', 'g': 'گ', 'l': 'ل', 'm': 'م', - 'n': 'ن', 'w': 'و', 'y': 'ی', 'e': 'ے' - } - - # Common normalization rules - NORMALIZATION_RULES = { - 'ي': 'ی', # Different Yeh forms - 'ى': 'ی', - 'ة': 'ہ', # Ta Marbuta to Heh - 'ھ': 'ہ', # Heh doachashmee to Heh - 'ك': 'ک', # Arabic Kaf to Urdu Keh - } - - @classmethod - def normalize_urdu_text(cls, text: str) -> str: - """Normalize Urdu text: unify similar characters""" - for old, new in cls.NORMALIZATION_RULES.items(): - text = text.replace(old, new) - return text - - @classmethod - def roman_to_urdu(cls, text: str) -> str: - """Convert Roman Urdu to proper Urdu script""" - # Only apply if text contains mostly Roman characters - roman_ratio = sum(c.isascii() for c in text) / max(1, len(text)) - if roman_ratio < 0.5: - return text - - result = [] - i = 0 - while i < len(text): - matched = False - # Try longest matches first - for length in range(3, 0, -1): - if i + length <= len(text): - sub = text[i:i+length].lower() - if sub in cls.ROMAN_TO_URDU: - result.append(cls.ROMAN_TO_URDU[sub]) - i += length - matched = True - break - if not matched: - result.append(text[i]) - i += 1 - return ''.join(result) - - @classmethod - def is_urdu(cls, text: str) -> bool: - """Detect if text contains Urdu characters""" - urdu_range = range(0x0600, 0x06FF) - return any(ord(c) in urdu_range for c in text) - - @classmethod - def clean_text(cls, text: str) -> str: - """Clean text: remove noise, normalize spaces""" - # Remove special characters (keep Urdu, English, numbers, spaces) - text = re.sub(r'[^\w\sء-ي]', ' ', text) - # Normalize spaces - text = re.sub(r'\s+', ' ', text) - return text.strip().lower() - - -# ============================================================================ -# 
SEMANTIC LEXICON (Pass 3 - Complete Pakistani Market Lexicon) -# ============================================================================ - -class PakistaniRetailLexicon: - """ - Complete semantic lexicon for Pakistani retail items. - Includes Urdu names, Roman Urdu variations, English names, and common misspellings. - """ - - # Master lexicon with categories - LEXICON = { - # Staples (اناج) - 'atta': ['atta', 'aata', 'arta', 'flour', 'wheat flour', 'chakki atta', 'آٹا'], - 'cheeni': ['cheeni', 'chini', 'sugar', 'sugar s', 'white sugar', 'چینی'], - 'chawal': ['chawal', 'rice', 'basmati', 'sella rice', 'tota chawal', 'چاول'], - 'daal': ['daal', 'dal', 'lentils', 'daal mash', 'daal chana', 'daal moong', 'دال'], - 'besan': ['besan', 'gram flour', 'chana flour', 'بیسن'], - - # Fats & Oils (چکنائی) - 'ghee': ['ghee', 'desi ghee', 'گھی'], - 'tel': ['tel', 'oil', 'cooking oil', 'تیل'], - 'dalda': ['dalda', 'banaspati', 'vegetable ghee', 'ڈالڈا'], - - # Dairy (دودھ کی مصنوعات) - 'doodh': ['doodh', 'milk', 'olpers', 'milkpak', 'دودھ'], - 'dahi': ['dahi', 'yogurt', 'curd', 'دہی'], - 'paneer': ['paneer', 'cottage cheese', 'پنیر'], - - # Spices (مصالحے) - 'haldi': ['haldi', 'turmeric', 'haldi powder', 'ہلدی'], - 'zeera': ['zeera', 'cumin', 'safaid zeera', 'زیرہ'], - 'dhania': ['dhania', 'coriander', 'دھنیا'], - 'mirch': ['mirch', 'red chili', 'lal mirch', 'مرچ'], - 'namak': ['namak', 'salt', 'shan namak', 'نمک'], - 'garam_masala': ['garam masala', 'all spice', 'گرم مصالحہ'], - - # Tea & Beverages (مشروبات) - 'chai': ['chai', 'tea', 'patti', 'tapal', 'lipton', 'چائے'], - 'coffee': ['coffee', 'cafe', 'کافی'], - 'soda': ['soda', 'soft drink', 'coke', 'pepsi', 'soda water'], - - # Personal Care (ذاتی نگہداشت) - 'sabun': ['sabun', 'soap', 'lux', 'safeguard', 'dettol', 'lifebuoy', 'صابن'], - 'shampoo': ['shampoo', 'شیمپو'], - 'cream': ['cream', 'moisturizer', 'fairness cream', 'کریم'], - 'toothpaste': ['toothpaste', 'paste', 'dental cream', 'ٹوتھ پیسٹ'], - 'razor': ['razor', 
'blade', 'gillete', 'ریزر'], - - # Household (گھریلو اشیاء) - 'hammam': ['hammam', 'hamaam', 'bath soap', 'حمام'], - 'detergent': ['detergent', 'washing powder', 'soap powder', 'detergent powder', 'ariel', 'surf excel'], - 'bleach': ['bleach', 'whitener', 'بلیچ'], - - # Snacks & Packaged (نمکین اشیاء) - 'burger': ['burger', 'bugger', 'برگر'], - 'biscuit': ['biscuit', 'cookie', 'bakery', 'bisconni', 'peak freans', 'بسکٹ'], - 'chips': ['chips', 'crisps', 'lays', 'kurleez', 'چپس'], - 'bread': ['bread', 'double roti', 'بریڈ'], - - # Eggs & Meat (انڈے اور گوشت) - 'anday': ['anday', 'eggs', 'desi anday', 'anda', 'انڈے'], - 'chicken': ['chicken', 'murghi', 'broiler', 'چکن'], - 'beef': ['beef', 'gai ka gosht', 'بیف'], - 'mutton': ['mutton', 'bakray ka gosht', 'مٹن'], - - # Vegetables & Fruits (سبزیاں اور پھل) - 'aaloo': ['aaloo', 'potato', 'آلو'], - 'pyaaz': ['pyaaz', 'onion', 'پیاز'], - 'tamatar': ['tamatar', 'tomato', 'ٹماٹر'], - 'kheera': ['kheera', 'cucumber', 'کھیرا'], - 'apple': ['apple', 'saib', 'سیب'], - 'banana': ['banana', 'kela', 'کیلا'], - - # Other Common Terms - 'total': ['total', 'tota', 'ٹوٹل', 'کل', 'مجموعی', 'total amount', 'grand total'], - 'udhaar': ['udhaar', 'udhar', 'u dhara', 'ادھار', 'باقی', 'بقایا'], - 'wasooli': ['wasooli', 'wasuli', 'وصولی', 'وصول', 'جمع'], - 'cash': ['cash', 'نقد', 'paid', 'ادا', 'cash paid'], - 'date': ['date', 'تاریخ'], - 'name': ['name', 'customer', 'client', 'نام', 'گاہک', 'بندہ', 'جناب'], - } - - # Unit mappings - UNITS = { - 'kg': ['kg', 'kgs', 'kilo', 'kilogram', 'کلو', 'کلوگرام'], - 'g': ['g', 'gm', 'gram', 'گرام'], - 'liter': ['liter', 'litre', 'ltr', 'l', 'لیٹر'], - 'ml': ['ml', 'milliliter', 'ملی لیٹر'], - 'dozen': ['dozen', 'dz', 'ڈزن'], - 'pc': ['pc', 'pcs', 'piece', 'pieces', 'عدد'], - 'packet': ['packet', 'pkt', 'pack', 'پیکٹ'], - } - - @classmethod - def normalize_item_name(cls, name: str) -> Tuple[str, float]: - """ - Normalize item name using semantic lexicon. 
- Returns (normalized_name, confidence_score) - """ - name_clean = name.lower().strip() - - # Direct match - for standard, variants in cls.LEXICON.items(): - if name_clean in variants: - return standard, 0.95 - - # Fuzzy match using rapidfuzz - all_variants = {} - for standard, variants in cls.LEXICON.items(): - for variant in variants: - all_variants[variant] = standard - - if all_variants: - try: - hit = fuzzy_process.extractOne(name_clean, all_variants.keys()) - if hit: - # rapidfuzz returns (choice, score, index) in newer versions. - best_match = hit[0] - score = float(hit[1]) / 100.0 - if score > 0.65: - return all_variants[best_match], score - except Exception as exc: - logger.warning("Fuzzy item match skipped: %s", exc) - - return name, 0.5 - - @classmethod - def normalize_unit(cls, text: str) -> str: - """Extract and normalize unit from text""" - text_lower = text.lower() - for unit, patterns in cls.UNITS.items(): - for pattern in patterns: - if pattern in text_lower: - return unit - return "pc" # default unit - - @classmethod - def detect_transaction_type(cls, text: str) -> Tuple[str, float]: - """Detect transaction type (udhaar/wasooli/cash)""" - text_lower = text.lower() - - # Check udhaar - if 'udhaar' in cls.LEXICON: - for variant in cls.LEXICON['udhaar']: - if variant in text_lower: - return "udhaar", 0.95 - - # Check wasooli - if 'wasooli' in cls.LEXICON: - for variant in cls.LEXICON['wasooli']: - if variant in text_lower: - return "wasooli", 0.95 - - # Check cash - if 'cash' in cls.LEXICON: - for variant in cls.LEXICON['cash']: - if variant in text_lower: - return "cash", 0.85 - - return "unknown", 0.4 - - -# ============================================================================ -# INTELLIGENT PARSER (Extracts structured data from OCR lines) -# ============================================================================ - -class IntelligentParser: - """ - Parses OCR text into structured items, total, and customer name. 
- Uses multiple strategies: - 1. Pattern matching (regex) - 2. Position-based inference - 3. Semantic mapping - 4. Mathematical validation - """ - - def __init__(self, config: SystemConfig): - self.config = config - - @staticmethod - def _name_blacklist() -> set[str]: - return { - "wasooli", - "wasuli", - "وصولی", - "وصول", - "udhaar", - "udhar", - "ادھار", - "baqaya", - "baki", - "بقایا", - "total", - "tota", - "ٹوٹل", - "کل", - "cash", - "discount", - "disc", - "ڈسکاؤنٹ", - "name", - "customer", - "receipt", - } - - @staticmethod - def _name_headers() -> Tuple[str, ...]: - return ("name", "customer", "mr", "جناب", "نام", "mohtaram", "محترم") - - def _clean_name_candidate(self, text: str) -> str: - t = normalize_ocr_text(text or "") - t = re.sub(r"[^\w\sء-ي]", " ", t) - t = re.sub(r"\s+", " ", t).strip() - return t - - def _normalize_person_name(self, text: str) -> str: - """Normalize likely person names for Urdu/Roman Urdu slips.""" - t = self._clean_name_candidate(text) - if not t: - return "" - # Remove honorific prefixes and common OCR junk tokens around names. - t = re.sub(r"(?i)\b(mr|mrs|ms|جناب|محترم|name|customer)\b[:\-]?\s*", "", t).strip() - # Common OCR confusions in Roman names. - replacements = { - "0": "o", - "1": "l", - "5": "s", - " ": " ", - } - for src, dst in replacements.items(): - t = t.replace(src, dst) - # Remove trailing ledger words if attached with the name. - t = re.sub(r"(?i)\b(total|wasooli|udhaar|baqaya|cash|discount)\b.*$", "", t).strip() - # High-precision roman-name OCR glitch fixes (avoid broad hallucination rules). - if t.strip().lower() == "unmar": - t = "Umar" - t = re.sub(r"\s+", " ", t).strip() - return t - - @staticmethod - def _compute_line_total_price( - original_line: str, - quantity: float, - extracted_price: float, - ) -> Tuple[float, Optional[float], List[str]]: - """ - Shopkeeper Rule: - - The number at the end of the item line is usually the **LINE TOTAL** (amount for that row). - e.g. 
"Milk 3 1200" => qty=3, line_total=1200 (UI should show 1200). - - Only if unit price is explicitly mentioned (e.g. "@ 400", "per 400", "/400") do we treat - extracted_price as unit price and compute line_total = qty * unit_price. - - Returns: (line_total_price, unit_price_or_none, notes) - """ - notes: List[str] = [] - try: - q = float(quantity) - p = float(extracted_price) - except Exception: - return extracted_price, None, notes - - if q <= 0 or p <= 0: - return extracted_price, None, notes - - line = (original_line or "").lower() - unit_markers = ("@", " per ", "per-", "per/", "/") - explicit_unit = any(m in line for m in unit_markers) - if explicit_unit and q > 0: - line_total = float(round(q * p, 2)) - notes.append(f"unit_price_detected_unit_{round(p,2)}_qty_{round(q,3)}_line_{round(line_total,2)}") - return line_total, p, notes - - # Default: treat extracted number as line total. - return p, None, notes - - def _valid_name_candidate(self, text: str) -> bool: - t = self._clean_name_candidate(text) - if len(t) < 2: - return False - low = t.lower() - if low in self._name_blacklist(): - return False - if any(kw in low for kw in ("total", "wasool", "udhaar", "baq", "cash")): - return False - if sum(c.isalpha() for c in t) < 2: - return False - if sum(c.isdigit() for c in t) > 2: - return False - return True - - @staticmethod - def _bbox_center(block: OCRTextBlock) -> Tuple[float, float]: - bb = block["bbox"] - cx = (float(bb[0][0]) + float(bb[2][0])) / 2.0 - cy = (float(bb[0][1]) + float(bb[2][1])) / 2.0 - return cx, cy - - @staticmethod - def _bbox_yxxy(block: OCRTextBlock) -> List[float]: - """Return bbox as [y1, x1, y2, x2] for UI overlay payload.""" - bb = block["bbox"] - xs = [float(p[0]) for p in bb] - ys = [float(p[1]) for p in bb] - return [min(ys), min(xs), max(ys), max(xs)] - - def parse_customer_name( - self, - lines: List[str], - ocr_blocks: Optional[List[OCRTextBlock]] = None, - image_shape: Optional[Tuple[int, int]] = None, - top_roi_lines: 
Optional[List[str]] = None, - ) -> Tuple[Optional[str], float, List[str], Dict[str, Any]]: - """ - Top-section priority anchor: - - Prefer top 15% OCR blocks - - Use header anchors (name/customer/mr) - - Avoid blacklist words (wasooli/total/baqaya) - - Hybrid cross-check: Paddle structure + EasyOCR handwriting - """ - warnings_local: List[str] = [] - hitl_data: Dict[str, Any] = {"name_review_required": False, "name_candidates": []} - top_name: Optional[str] = None - top_conf = 0.0 - - blocks = ocr_blocks or [] - if blocks and image_shape is not None: - img_h = float(image_shape[0]) - img_w = float(image_shape[1]) if len(image_shape) > 1 else 0.0 - hitl_data["source_height"] = img_h - hitl_data["source_width"] = img_w - hitl_data["roi_ratio"] = 0.20 - top_cutoff = 0.15 * img_h - top_blocks = [b for b in blocks if self._bbox_center(b)[1] <= top_cutoff] - top_blocks = sorted(top_blocks, key=lambda b: self._bbox_center(b)[1]) - - # Build top candidate payload (for HITL/manual tap UI). - candidates: List[Dict[str, Any]] = [] - for b in top_blocks: - txt = self._clean_name_candidate(b["text"]) - if not self._valid_name_candidate(txt): - continue - candidates.append( - { - "text": txt, - "bbox": self._bbox_yxxy(b), - "conf": round(float(b["confidence"]), 3), - } - ) - # Deduplicate candidate texts while preserving best confidence first. - uniq: Dict[str, Dict[str, Any]] = {} - for c in sorted(candidates, key=lambda x: x["conf"], reverse=True): - k = c["text"].lower() - if k not in uniq: - uniq[k] = c - hitl_data["name_candidates"] = list(uniq.values())[:5] - - # 1) Anchor by explicit Name/Customer-like headers in Paddle/fused blocks. - anchors = [ - b - for b in top_blocks - if any(h in b["text"].lower() for h in self._name_headers()) - and b["engine"] in ("paddle", "fused_paddle_easy", "merged") - ] - if anchors: - anchor = anchors[0] - ax, ay = self._bbox_center(anchor) - # If the header itself contains "Name: XYZ", extract inline value first. 
- anchor_text = normalize_ocr_text(anchor["text"]) - inline = re.sub(r"(?i).{0,20}(name|customer|mr|جناب|نام)\s*[:\-]?\s*", "", anchor_text).strip() - if self._valid_name_candidate(inline) and not any(h in inline.lower() for h in self._name_headers()): - top_name = inline - top_conf = max(top_conf, 0.91) - nearby = [] - for b in top_blocks: - bx, by = self._bbox_center(b) - if by >= ay and abs(by - ay) <= (0.12 * img_h) and abs(bx - ax) <= 260: - nearby.append(b) - # Prefer non-header line adjacent/below anchor - for cand in nearby: - txt = self._clean_name_candidate(cand["text"]) - if self._valid_name_candidate(txt) and not any(h in txt.lower() for h in self._name_headers()): - top_name = txt - top_conf = max(top_conf, 0.90) - break - - # 2) If anchor failed, score top candidates and pick the best non-transaction token. - if not top_name: - ranked: List[Tuple[float, str]] = [] - for b in top_blocks: - txt = self._clean_name_candidate(b["text"]) - if self._valid_name_candidate(txt): - low = txt.lower() - score = float(b["confidence"]) - # Prefer shorter human-like names and top-most region. - score += 0.10 if len(txt.split()) <= 3 else 0.0 - score += 0.06 if len(txt) <= 24 else 0.0 - score += 0.08 if " " in txt else 0.0 - if any(k in low for k in ("total", "udhaar", "wasooli", "cash", "baqaya", "discount")): - score -= 0.35 - score -= 0.10 if any(ch.isdigit() for ch in txt) else 0.0 - ranked.append((score, txt)) - if ranked: - ranked.sort(key=lambda x: x[0], reverse=True) - top_name = ranked[0][1] - top_conf = max(top_conf, min(0.90, max(0.70, ranked[0][0]))) - - # 3) Hybrid cross-check: compare paddle-layout candidate with easyocr nearby. 
- if top_name: - easy_neighbors = [] - for b in top_blocks: - if b["engine"] != "easyocr": - continue - et = self._clean_name_candidate(b["text"]) - if self._valid_name_candidate(et): - easy_neighbors.append(et) - if easy_neighbors: - # choose closest-length neighbor as proxy similarity target - easy_best = min(easy_neighbors, key=lambda s: abs(len(s) - len(top_name))) - a = re.sub(r"\s+", "", top_name.lower()) - b = re.sub(r"\s+", "", easy_best.lower()) - overlap = len(set(a) & set(b)) / max(1, len(set(a) | set(b))) - if overlap < 0.28 and top_name.lower() != easy_best.lower(): - warnings_local.append("manual_review_name_mismatch_paddle_easy") - hitl_data["name_review_required"] = True - # Avoid wrong confident guess ("wasooli" style): drop confidence if mismatch is high - top_conf = min(top_conf, 0.66) - - if top_name: - top_name = self._normalize_person_name(top_name) - if top_name and self._valid_name_candidate(top_name): - return top_name, min(0.95, top_conf), warnings_local, hitl_data - - # Dedicated ROI OCR fallback before generic line fallback. - if top_roi_lines: - for i, line in enumerate(top_roi_lines[:8]): - cleaned = self._normalize_person_name(line) - if not self._valid_name_candidate(cleaned): - continue - conf = max(0.70, 0.88 - (i * 0.05)) - if hitl_data.get("name_candidates"): - hitl_data["name_review_required"] = len(hitl_data["name_candidates"]) > 1 - return cleaned, min(0.90, conf), warnings_local, hitl_data - - # Fallback to top text lines (legacy) - for i, line in enumerate(lines[:6]): - cleaned = self._normalize_person_name(line.strip()) - if not self._valid_name_candidate(cleaned): - continue - item_match, _ = PakistaniRetailLexicon.normalize_item_name(cleaned) - if item_match != cleaned.lower() and item_match in PakistaniRetailLexicon.LEXICON: - continue - confidence = 0.88 - (i * 0.08) - if len(cleaned) >= 3: - if hitl_data.get("name_candidates"): - # If we fell back despite ROI candidates, request user confirmation. 
- hitl_data["name_review_required"] = True - return cleaned.strip(), min(0.92, confidence), warnings_local, hitl_data - - if hitl_data.get("name_candidates"): - hitl_data["name_review_required"] = True - return None, 0.0, warnings_local, hitl_data - - def parse_items(self, lines: List[str]) -> Tuple[List[ExtractedItem], float]: - """v5.2: ULTRA-LENIENT item extraction - capture everything or label as uncategorized""" - items = [] - confidences = [] - - expanded: List[str] = [] - for raw in lines: - for piece in sliding_split_multi_price_line(raw): - expanded.append(piece) - - # v5.2: LENIENT regex patterns for fallback - pattern1 = re.compile(r'([a-zA-Zء-ي\s]+?)\s+(\d+(?:\.\d+)?)\s*(?:kg|kgs?|g|gm|gram|liter|ml|dozen)?\s+(\d+(?:\.\d+)?)') - pattern2 = re.compile(r'([a-zA-Zء-ي\s]+?)\s*[\-\*–—]+\s*(\d+(?:\.\d+)?)\s*(?:kg|kgs?)?\s+(\d+(?:\.\d+)?)') - pattern3 = re.compile(r'([a-zA-Zء-ي\s]+?)\s+(\d+(?:\.\d+)?)') # Ultra-lenient: item + any number - - for line in expanded: - # Skip obvious non-item lines - line_lower = line.lower() - skip_keywords = ['date', 'total', 'udhaar', 'wasooli', 'cash', 'name', 'customer', 'time', 'receipt'] - if any(kw in line_lower for kw in skip_keywords): - continue - - # v5.2: Enhanced parsing chain - geometry → aggressive → regex → ULTRA-LENIENT fallback - parsed = parse_geometry_line(line) - if not parsed: - parsed = parse_aggressive_patterns(line) # v5.2: Try aggressive patterns - if not parsed: - # Fallback to legacy regex patterns - match = pattern2.search(line) or pattern1.search(line) - if not match: - # v5.2: ULTRA-LENIENT fallback - if any text + number at end, treat as item - match = pattern3.search(line) - if not match: - continue - groups = match.groups() - if len(groups) >= 2: - item_name = groups[0] - try: - price = float(groups[1].replace(',', '')) - except: - continue - quantity = 1.0 - confidence = 0.50 # Lower for ultra-lenient - else: - continue - else: - groups = match.groups() - if len(groups) >= 3: - item_name, 
qty_str, price_str = groups[0], groups[1], groups[2] - quantity = float(qty_str) - confidence = 0.68 - try: - price = float(price_str.replace(',', '')) - except: - continue - else: - continue - else: - # Use parsed result from geometry or aggressive parser - item_name = parsed['text'] - quantity = parsed['quantity'] - price = parsed['price'] - confidence = parsed['confidence'] - - # Shopkeeper Rule: treat extracted `price` as LINE TOTAL by default. - # Only compute qty*unit_price when explicit unit marker exists (e.g. "@ 400"). - price, _unit_price, _price_notes = self._compute_line_total_price( - line, float(quantity), float(price) - ) - - # v5.2: LENIENT validation - allow wider range - if price < 0.5 or price > self.config.MAX_REASONABLE_PRICE: - continue - if quantity <= 0 or quantity > self.config.MAX_QUANTITY: - quantity = 1.0 - - # Clean item name (v5.2: preserve more text) - item_name = normalize_ocr_text(item_name) - item_name = re.sub(r'[^\w\sء-ي]', ' ', item_name) - item_name = re.sub(r'\s+', ' ', item_name).strip().lower() - - if not item_name or len(item_name) < 2: - continue - - # v5.2: Apply semantic normalization - but BYPASS DISCARDS - normalized_name, semantic_conf = PakistaniRetailLexicon.normalize_item_name(item_name) - unit = PakistaniRetailLexicon.normalize_unit(line) - - in_lexicon = (normalized_name != item_name) or (item_name in PakistaniRetailLexicon.LEXICON) - - # v5.2: BYPASS LEXICON FAILURE - label as 'Uncategorized Item' instead of discarding - if not in_lexicon: - semantic_conf = 0.40 - normalized_name = f"Uncategorized: {item_name}" - else: - # Apply confidence boosting only if in lexicon - if semantic_conf > 0.5: - confidence = max(confidence, semantic_conf * 0.85) - if in_lexicon: - confidence += SystemConfig.CONF_BOOST_LEXICON - - # Clamp confidence - confidence = min(0.98, max(0.35, confidence)) - - items.append(ExtractedItem( - name=normalized_name, - quantity=quantity, - price=price, - unit=unit, - confidence=min(0.92, 
confidence), - original_text=line, - semantic_match=normalized_name if semantic_conf > 0.65 else None, - semantic_score=semantic_conf - )) - confidences.append(confidence) - - # Keep row-level items (do not collapse duplicates) so mobile UI can - # render exactly what OCR detected (e.g. 3 lines => 3 editable rows). - - avg_confidence = sum(confidences) / len(confidences) if confidences else 0.5 - return items, avg_confidence - - def _merge_duplicates(self, items: List[ExtractedItem]) -> List[ExtractedItem]: - """Merge duplicate items (same name, same unit)""" - merged = {} - - for item in items: - key = (item.name, item.unit) - if key in merged: - existing = merged[key] - existing.quantity += item.quantity - existing.price = max(existing.price, item.price) # Keep highest price - existing.confidence = max(existing.confidence, item.confidence) - else: - merged[key] = item - - return list(merged.values()) - - def reconcile_items_from_ocr_gap( - self, - items: List[ExtractedItem], - total: float, - ocr_blocks: List[OCRTextBlock], - ) -> Tuple[List[ExtractedItem], List[str]]: - """ - If total_amount and sum(items) disagree, search raw OCR blocks for a missing - price (e.g. 50 when gap is 50) that was never attached to a line item. 
- """ - notes: List[str] = [] - if total <= 0 or not ocr_blocks: - return items, notes - sub = sum(it.quantity * it.price for it in items) - gap = round(float(total) - float(sub), 2) - tol = max(2.0, 0.02 * max(float(total), 1.0)) - if abs(gap) <= self.config.TOTAL_VALIDATION_TOLERANCE or gap <= 0 or gap > 50000: - return items, notes - - used = {round(it.price, 2) for it in items} - for blk in ocr_blocks: - raw = str(blk["text"]) - for val in extract_numbers(raw): - if round(val, 2) in used: - continue - if abs(float(val) - gap) <= tol: - name_guess = re.sub(r"[\d\.\s,\-\*#:]+", " ", raw).strip() - if len(name_guess) < 2: - name_guess = f"recovered_line" - nn, sc = PakistaniRetailLexicon.normalize_item_name(name_guess.lower()) - items.append( - ExtractedItem( - name=nn, - quantity=1.0, - price=float(val), - unit="pc", - confidence=0.52, - original_text=raw, - semantic_match=nn if sc > 0.62 else None, - semantic_score=sc, - ) - ) - notes.append(f"gap_recovered_amount_{val}_for_delta_{gap}") - return self._merge_duplicates(items), notes - return items, notes - - def parse_total(self, lines: List[str], items: List[ExtractedItem]) -> Tuple[float, float]: - """Extract total amount with confidence""" - candidates = [] - - # Strategy 1: Look for 'Total' keyword in last lines - for line in reversed(lines[-8:]): - line_lower = line.lower() - if 'total' in line_lower or 'ٹوٹل' in line_lower or 'کل' in line_lower: - numbers = re.findall(r'(\d+(?:\.\d+)?)', line) - if numbers: - total = float(numbers[-1]) - if total > 100: # Reasonable total - return total, 0.95 - - # Strategy 2: Look for numbers near 'udhaar/wasooli' - for line in reversed(lines[-5:]): - line_lower = line.lower() - if any(kw in line_lower for kw in ['udhaar', 'wasooli', 'ادھار', 'وصولی']): - numbers = re.findall(r'(\d+(?:\.\d+)?)', line) - if numbers: - total = float(numbers[-1]) - if total > 100: - return total, 0.90 - - # Strategy 3: v5.1 Smart Total Anchoring - use subset-sum for validation - if items: - 
all_prices = [item.quantity * item.price for item in items] - total_from_items = sum(all_prices) - - # Try to find valid subset using subset-sum algorithm - if total_from_items > 0: - # First check if all items sum correctly - all_text = ' '.join(lines[-3:]) # Check last 3 lines for explicit total - explicit_numbers = re.findall(r'\b(\d{3,}(?:\.\d+)?)\b', all_text) - if explicit_numbers: - explicit_total = float(max(explicit_numbers)) - if abs(explicit_total - total_from_items) < 5: # Within tolerance - return explicit_total, 0.88 # Trust explicit total - - return total_from_items, 0.80 - - # Strategy 4: Last number in document - all_text = ' '.join(lines) - numbers = re.findall(r'\b(\d{3,}(?:\.\d+)?)\b', all_text) - if numbers: - total = max(float(n) for n in numbers) - return total, 0.65 - - return 0.0, 0.0 - - def validate_math(self, items: List[ExtractedItem], total: float) -> Tuple[bool, float, List[str]]: - """Validate mathematical consistency""" - corrections = [] - # Shopkeeper rule: item.price is the LINE TOTAL (not unit price). - calculated_total = sum(float(item.price) for item in items) - - if abs(calculated_total - total) <= SystemConfig.TOTAL_VALIDATION_TOLERANCE: - return True, total, corrections - - # If discrepancy, **always** prefer calculated total from items (no hallucinations). - # Keep paper total only as a warning/diagnostic signal. 
- if calculated_total > 0: - if total > 0: - corrections.append(f"paper_total_mismatch_paper_{round(float(total),2)}_computed_{round(float(calculated_total),2)}") - corrections.append(f"using_calculated_total_{round(float(calculated_total),2)}") - return True, float(calculated_total), corrections - - return False, total, corrections - - -# ============================================================================ -# AGENTIC MATH & LOGIC VALIDATOR (Pass 4) -# ============================================================================ - -class AgenticMathValidator: - """ - Pass 4: Self-correcting mathematical validation loop. - - Validates item sum vs total - - Triggers targeted re-examination on mismatches - - Suggests corrections - """ - - def __init__(self, config: SystemConfig): - self.config = config - - def validate(self, items: List[ExtractedItem], total: float) -> Tuple[bool, float, List[str], float]: - """ - Validate and correct mathematical inconsistencies. - Returns: (is_valid, corrected_total, corrections, overall_confidence) - """ - corrections = [] - - # Calculate sum - # Shopkeeper rule: item.price is the LINE TOTAL (not unit price). 
- item_sum = sum(float(item.price) for item in items) - - # Perfect match - if abs(item_sum - total) <= self.config.TOTAL_VALIDATION_TOLERANCE: - return True, total, corrections, 0.95 - - # Check if total is zero or missing - if total == 0 and item_sum > 0: - corrections.append("total_missing_using_items_sum") - return True, item_sum, corrections, 0.85 - - # Check if items sum is zero - if item_sum == 0 and total > 0: - corrections.append("items_missing_using_provided_total") - return True, total, corrections, 0.80 - - # Calculate relative error - relative_error = abs(item_sum - total) / max(total, item_sum, 1) - - # Small relative error tolerance - if relative_error <= 0.05: # Within 5% - corrections.append(f"small_discrepancy_{relative_error:.2%}_using_total") - return True, total, corrections, 0.90 - - # Medium error - adjust total to items - if relative_error <= 0.15: - corrections.append(f"total_adjusted_from_{total}_to_{item_sum}") - return True, item_sum, corrections, 0.75 - - # Large error - flag for human review - corrections.append(f"large_discrepancy_{relative_error:.2%}_needs_review") - return False, total, corrections, 0.50 - - def calculate_confidence(self, items: List[ExtractedItem], math_validated: bool, total: float) -> Dict[str, float]: - """Calculate overall confidence scores""" - if not items: - return {"items": 0.0, "total": 0.0, "overall": 0.0} - - # Item confidence - item_conf = sum(item.confidence for item in items) / len(items) - - # Total confidence - total_conf = 0.95 if math_validated else 0.65 - - # Transaction type confidence (default) - type_conf = 0.85 if total > 0 else 0.40 - - # Overall confidence (weighted) - overall = (item_conf * 0.4 + total_conf * 0.4 + type_conf * 0.2) - - return { - "items": round(item_conf, 3), - "total": round(total_conf, 3), - "type": round(type_conf, 3), - "overall": round(overall, 3) - } - - -# ============================================================================ -# ADVANCED CALCULATION ENGINE 
(Pass 3.5) -# ============================================================================ - -class AdvancedCalculationEngine: - """ - Extra calculation intelligence for messy parchis: - - Detect subtotal/discount/paid/balance style numbers - - Score multiple total candidates - - Repair one likely wrong item price when near-match exists - - Reconcile final total using item sum + financial hints - """ - def __init__(self, config: SystemConfig): - self.config = config - - @staticmethod - def _line_numbers(line: str) -> List[float]: - vals: List[float] = [] - for m in re.findall(r"\d+(?:\.\d+)?", line or ""): - try: - v = float(m) - if 0 <= v <= 999999: - vals.append(v) - except Exception: - continue - return vals - - @staticmethod - def _line_tag(line: str) -> str: - s = (line or "").lower() - if any(k in s for k in ("subtotal", "sub total", "ذیلی", "جمع")): - return "subtotal" - if any(k in s for k in ("discount", "disc", "رعایت", "ڈسکاؤنٹ")): - return "discount" - if any(k in s for k in ("paid", "ادا", "cash", "وصولی")): - return "paid" - if any(k in s for k in ("balance", "baqaya", "بقای", "بقایا", "udhaar", "ادھار")): - return "balance" - if any(k in s for k in ("total", "grand total", "کل", "ٹوٹل")): - return "total" - return "other" - - def _extract_financial_hints(self, lines: List[str]) -> Dict[str, float]: - hints: Dict[str, float] = {} - for ln in lines: - tag = self._line_tag(ln) - nums = self._line_numbers(ln) - if not nums: - continue - v = float(nums[-1]) - if v <= 0: - continue - # keep stronger/latest candidate by preferring larger values for totals, - # and larger discounts/paid when repeated. 
- prev = hints.get(tag) - if prev is None or v >= prev: - hints[tag] = v - return hints - - def _candidate_totals(self, lines: List[str], items: List[ExtractedItem], initial_total: float) -> List[Tuple[float, float, str]]: - cands: List[Tuple[float, float, str]] = [] - if initial_total > 0: - cands.append((initial_total, 0.78, "parser_total")) - - item_sum = float(sum(i.quantity * i.price for i in items)) - if item_sum > 0: - cands.append((item_sum, 0.74, "items_sum")) - - hints = self._extract_financial_hints(lines) - if "total" in hints: - cands.append((hints["total"], 0.90, "line_total")) - if "balance" in hints: - cands.append((hints["balance"], 0.84, "line_balance")) - if "subtotal" in hints: - subtotal = hints["subtotal"] - discount = hints.get("discount", 0.0) - paid = hints.get("paid", 0.0) - # subtotal - discount - paid (if present) as strong ledger formula - calc = subtotal - discount - paid - if calc > 0: - cands.append((calc, 0.82, "subtotal_discount_paid_formula")) - - # dedupe by rounded value while keeping best confidence source - best: Dict[float, Tuple[float, float, str]] = {} - for v, conf, src in cands: - k = round(v, 2) - if k not in best or conf > best[k][1]: - best[k] = (v, conf, src) - return sorted(best.values(), key=lambda x: x[1], reverse=True) - - def _try_single_item_repair(self, items: List[ExtractedItem], target_total: float) -> Tuple[List[ExtractedItem], List[str]]: - notes: List[str] = [] - if not items or target_total <= 0: - return items, notes - # Shopkeeper rule: item.price is line total. - current = float(sum(float(i.price) for i in items)) - delta = round(target_total - current, 2) - if abs(delta) < 2.0: - return items, notes - - # Try to repair one low-confidence item by adjusting its price by delta/qty. 
- ranked_idx = sorted( - range(len(items)), - key=lambda idx: (items[idx].confidence, items[idx].quantity), - ) - for idx in ranked_idx[: min(4, len(ranked_idx))]: - it = items[idx] - new_price = round(float(it.price) + float(delta), 2) - if not (self.config.MIN_ITEM_PRICE <= new_price <= self.config.MAX_REASONABLE_PRICE): - continue - old = it.price - it.price = new_price - it.confidence = max(0.35, min(0.90, it.confidence - 0.05)) - notes.append(f"single_item_price_repair_{it.name}_{old}_to_{new_price}") - break - return items, notes - - def reconcile( - self, - lines: List[str], - items: List[ExtractedItem], - parsed_total: float, - ) -> Tuple[List[ExtractedItem], float, List[str], float]: - """ - Returns: - (possibly adjusted items, reconciled_total, notes, confidence) - """ - notes: List[str] = [] - candidates = self._candidate_totals(lines, items, parsed_total) - if not candidates: - return items, parsed_total, notes, 0.45 - - # Pick best candidate by confidence, but prefer consistency with items where close. 
- item_sum = float(sum(float(i.price) for i in items)) - best_v, best_c, best_src = candidates[0] - if item_sum > 0: - for v, c, src in candidates: - rel = abs(v - item_sum) / max(v, item_sum, 1.0) - if rel <= 0.08 and c >= 0.72: - best_v, best_c, best_src = v, max(c, 0.88), src - notes.append(f"candidate_selected_by_item_consistency_{src}_{v}") - break +def line_text(line: list[dict]) -> str: + return " ".join(d["text"] for d in line) - notes.append(f"calc_engine_selected_{best_src}_{round(best_v,2)}") - pre_items = float(sum(float(i.price) for i in items)) - rel_gap = abs(pre_items - best_v) / max(pre_items, best_v, 1.0) if (pre_items > 0 and best_v > 0) else 0.0 - if rel_gap > 0.12: - items, fix_notes = self._try_single_item_repair(items, best_v) - notes.extend(fix_notes) +# --------------------------------------------------------------------------- +# Generic item parser +# --------------------------------------------------------------------------- - final_items_sum = float(sum(float(i.price) for i in items)) - final_gap = abs(final_items_sum - best_v) / max(final_items_sum, best_v, 1.0) if (final_items_sum > 0 and best_v > 0) else 0.0 - final_conf = max(0.50, min(0.95, best_c - min(0.20, final_gap * 0.5))) +def _is_text_token(tok: str) -> bool: + return bool(re.search(r"[a-zA-Z\u0600-\u06FF]", tok)) - # If still too far and item sum looks reliable, pivot to item sum. 
def _is_num_token(tok: str) -> bool:
    """Return True when *tok* consists only of digits and ``, . /`` characters.

    This is a shape check, not a validity check: strings such as ``"12/05"``
    or ``"1.2.3"`` pass here but are not parseable numbers.
    """
    return bool(re.fullmatch(r"[\d,./]+", tok))


def _to_float(tok: str) -> float | None:
    """Parse a numeric-looking token, returning None when it is not a number.

    Commas are treated as thousands separators and removed before parsing.
    """
    try:
        return float(tok.replace(",", ""))
    except ValueError:
        # e.g. dates ("12/05"), stray punctuation (".", ","), "1.2.3"
        return None


def parse_item_from_tokens(tokens: list[str]) -> dict | None:
    """Build an item dict from the whitespace tokens of one receipt line.

    Tokens are first passed through ``merge_spaced_digits``. Every token
    containing a letter contributes to the item name (joined with spaces and
    passed through ``lexicon_correct``); every numeric token that parses as
    a float contributes to the number list, in order of appearance.

    * two or more numbers: the first is the quantity, the last is the price;
      a quantity above ``Config.MAX_ITEM_QTY`` is replaced by ``1.0``.
    * exactly one number: it is the price and the quantity is ``1.0``.

    Returns:
        ``{"name", "quantity", "price"}``, or ``None`` when the line has no
        text token, no parseable number, or a price outside
        ``[Config.MIN_ITEM_PRICE, Config.MAX_ITEM_PRICE]``.
    """
    tokens = merge_spaced_digits(tokens)
    text_toks = [t for t in tokens if _is_text_token(t)]
    num_toks = [t for t in tokens if _is_num_token(t)]

    # Skip lines with no text or no numbers
    if not text_toks or not num_toks:
        return None

    # Number-shaped tokens that do not parse (dates, stray dots) are ignored
    # instead of raising ValueError and aborting the whole request.
    nums = [v for v in map(_to_float, num_toks) if v is not None]
    if not nums:
        return None

    name = lexicon_correct(" ".join(text_toks).strip())

    price = nums[-1]
    qty = nums[0] if len(nums) >= 2 else 1.0
    # sanity guards (from Config)
    if qty > Config.MAX_ITEM_QTY:
        qty = 1.0
    if price < Config.MIN_ITEM_PRICE or price > Config.MAX_ITEM_PRICE:
        return None  # noise – don't fabricate an item
    return {"name": name, "quantity": qty, "price": price}
Evict LRU entry if at capacity.""" - key = self._key(image_bytes) - with self._lock: - # Evict expired entries first - now = time.time() - expired = [k for k, v in self._store.items() if now > v["expires_at"]] - for k in expired: - self._store.pop(k, None) - if k in self._order: - self._order.remove(k) - # LRU eviction if still full - while len(self._store) >= self.max_size and self._order: - oldest = self._order.pop(0) - self._store.pop(oldest, None) - self._store[key] = {"result": result, "expires_at": now + self.ttl} - if key in self._order: - self._order.remove(key) - self._order.append(key) - - def stats(self) -> dict: - with self._lock: - return {"entries": len(self._store), "max_size": self.max_size, "ttl_seconds": self.ttl} - - -# Global cache instance -_IMAGE_CACHE = _ImageHashCache(ttl=SystemConfig.CACHE_TTL, max_size=SystemConfig.MAX_CACHE_SIZE) - -# Concurrency guard: max 2 simultaneous OCR/VLM requests to prevent OOM -_REQUEST_SEMAPHORE = asyncio.Semaphore(2) - - -# ============================================================================ -# VLM ENGINE (v6.0) — Qwen2-VL-2B-Instruct, CPU fp32, NO bitsandbytes -# ============================================================================ - -class QwenVLMEngine: - """ - Wraps Qwen2-VL-2B-Instruct for full-page parchi extraction. - - Design decisions for CPU / 16 GB RAM: - • dtype=torch.float32 — bfloat16 is unreliable on some CPU builds. - • device_map="cpu" — explicit, no auto-GPU fallback. - • Lazy loading — model is NOT loaded at import time; first call - triggers load so startup is fast and HF health probe passes. - • Memory guard — if RSS > VLM_MEMORY_LIMIT_MB after load, the - engine self-disables and the fallback OCR ensemble takes over. - • Thread lock — only one inference thread at a time. - """ - #: Prompt sent to Qwen2-VL for structured parchi extraction. - _SYSTEM_PROMPT = ( - "You are an expert OCR assistant for Pakistani handwritten receipts (parchi). 
" - "Extract ALL text exactly as written, preserving Urdu and English. " - "For each line, output: item_name | quantity | unit | price. " - "At the end output TOTAL: on its own line. " - "If a field is missing use 'N/A'. Do not invent data." - ) - - def __init__(self, config: SystemConfig): - self.config = config - self._model = None - self._processor = None - self._loaded = False - self._disabled = False # set True if load/memory guard fails - self._lock = threading.Lock() - - # ------------------------------------------------------------------ - # Lazy loader - # ------------------------------------------------------------------ - - def _load(self) -> bool: - """ - Load model + processor synchronously (called once from a thread). - Returns True on success, False on failure. - """ - if self._loaded or self._disabled: - return self._loaded and not self._disabled - - if not (TRANSFORMERS_AVAILABLE and SystemConfig.ENABLE_VLM): - logger.info("VLM disabled (ENABLE_VLM=0 or transformers unavailable).") - self._disabled = True - return False - - if not TORCH_AVAILABLE or torch is None: - logger.warning("VLM skipped: torch not available.") - self._disabled = True - return False - - model_id = self.config.VLM_MODEL_ID - logger.info("Loading VLM %s on CPU (fp32) — this may take 60-120 s...", model_id) - rss_before = _get_rss_mb() - try: - self._processor = AutoProcessor.from_pretrained( - model_id, - trust_remote_code=True, - # Cache weights to HF_HOME (/.cache by default in Docker) - cache_dir=os.getenv("TRANSFORMERS_CACHE", "/.cache"), - ) - self._model = Qwen2VLForConditionalGeneration.from_pretrained( - model_id, - torch_dtype=torch.float32, # fp32 — safest on CPU - device_map="cpu", - trust_remote_code=True, - cache_dir=os.getenv("TRANSFORMERS_CACHE", "/.cache"), - ) - self._model.eval() # inference-only mode - - rss_after = _get_rss_mb() - delta = rss_after - rss_before - logger.info( - "VLM loaded | RSS before=%.0fMB after=%.0fMB delta=%.0fMB", - rss_before, 
def parse_item_from_line(line: list[dict]) -> dict | None:
    """Parse one grouped OCR line into an item dict with confidence metadata.

    The line text is normalised, split on whitespace and handed to
    ``parse_item_from_tokens``. When that yields an item, three keys are
    added: ``confidence`` (mean of the detections' ``"conf"`` values, rounded
    to 3 decimals), ``low_confidence`` (mean below 0.50) and ``unit`` (from
    ``_detect_unit`` on the normalised text).

    Returns ``None`` when no item could be parsed from the line.
    """
    normalised = normalise_text(line_text(line))
    item = parse_item_from_tokens(normalised.split())
    if item is None:
        return None

    # avg confidence for the line
    mean_conf = float(np.mean([det["conf"] for det in line]))
    item.update(
        confidence=round(mean_conf, 3),
        low_confidence=mean_conf < 0.50,
        unit=_detect_unit(normalised),
    )
    return item
- """ - with self._lock: - if not self._load(): - return None - - try: - # Build the multi-modal message payload - messages = [ - { - "role": "user", - "content": [ - {"type": "image", "image": pil_image}, - {"type": "text", "text": self._SYSTEM_PROMPT}, - ], - } - ] - - # Prepare inputs (qwen_vl_utils path or fallback) - if QWEN_VL_UTILS_AVAILABLE and process_vision_info is not None: - image_inputs, video_inputs = process_vision_info(messages) - text = self._processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - inputs = self._processor( - text=[text], - images=image_inputs, - videos=video_inputs, - padding=True, - return_tensors="pt", - ) - else: - # Minimal fallback without qwen_vl_utils - text = self._processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - inputs = self._processor( - text=[text], - images=[pil_image], - padding=True, - return_tensors="pt", - ) - - inputs = inputs.to("cpu") - - with torch.no_grad(): - output_ids = self._model.generate( - **inputs, - max_new_tokens=self.config.VLM_MAX_NEW_TOKENS, - do_sample=False, # deterministic on CPU - temperature=None, - top_p=None, - ) - - # Decode only the newly generated tokens - generated = output_ids[:, inputs.input_ids.shape[1]:] - result = self._processor.batch_decode( - generated, skip_special_tokens=True, clean_up_tokenization_spaces=True - ) - return result[0].strip() if result else None - - except Exception as exc: - logger.error("VLM inference error: %s", exc) - return None - - # ------------------------------------------------------------------ - # VLM text → structured lines - # ------------------------------------------------------------------ - - @staticmethod - def parse_vlm_output(raw: str) -> Tuple[List[str], Optional[float]]: - """ - Convert VLM pipe-delimited output into plain text lines + optional total. - Lines look like: "atta | 2 | kg | 240" or freeform if the model deviates. 
# Unit label → compiled pattern, checked in this order (first match wins).
# ASCII spellings must not be embedded in a longer Latin word, so that
# "milliliter" is not read as "liter" and "kamla" is not read as "ml";
# digits may touch the unit ("2kg", "500g"). Urdu spellings are matched as
# plain substrings.
_UNIT_PATTERNS = tuple(
    (unit, re.compile(pattern))
    for unit, pattern in (
        ("kg", r"(?<![a-z])(?:kilograms?|kilos?|kgs?)(?![a-z])|کلو"),
        ("g", r"(?<![a-z])(?:grams?|gms?|g)(?![a-z])"),
        ("liter", r"(?<![a-z])(?:lit(?:er|re)s?|ltrs?)(?![a-z])|لیٹر"),
        ("ml", r"(?<![a-z])(?:millilit(?:er|re)s?|mls?)(?![a-z])"),
        ("dozen", r"(?<![a-z])(?:dozens?|dz)(?![a-z])|درجن"),
        ("pc", r"(?<![a-z])(?:pieces?|pcs?)(?![a-z])|عدد"),
    )
)


def _detect_unit(text: str) -> str:
    """Return the unit label mentioned in *text*, defaulting to ``"pc"``.

    Matching is case-insensitive. Possible results are ``"kg"``, ``"g"``,
    ``"liter"``, ``"ml"``, ``"dozen"`` and ``"pc"``; the first unit whose
    pattern matches (in that order) is returned.
    """
    lowered = text.lower()
    for unit, pattern in _UNIT_PATTERNS:
        if pattern.search(lowered):
            return unit
    return "pc"

# ---------------------------------------------------------------------------
# Customer name extraction (top 15 % of image)
# ---------------------------------------------------------------------------

# keywords that disqualify a line from being a customer name
_NAME_SKIP = re.compile(
    r"(date|time|total|udhaar|wasooli|cash|receipt|shop|دکان|تاریخ)", re.I
)
def extract_customer_name(
    dets: list[dict], img_height: int
) -> str | None:
    """Return the first name-like line in the top region of the image.

    Only detections whose ``"yc"`` lies above
    ``img_height * Config.NAME_ROI_RATIO`` are considered. They are grouped
    into lines, and the first line that has no digits, is 3–30 characters of
    Latin/Arabic-script letters and whitespace, and contains none of the
    ``_NAME_SKIP`` keywords is returned. ASCII words are capitalised; other
    words are left as they are.

    Returns ``None`` when no line qualifies.
    """
    threshold_y = img_height * Config.NAME_ROI_RATIO
    top_dets = [d for d in dets if d["yc"] < threshold_y]
    if not top_dets:
        return None
    for line in group_into_lines(top_dets):
        text = normalise_text(line_text(line)).strip()
        if re.search(r"\d", text):  # has digits → skip
            continue
        if not re.fullmatch(r"[a-zA-Z\u0600-\u06FF\s]{3,30}", text):
            continue
        if _NAME_SKIP.search(text):  # keyword → skip
            continue
        # Capitalise ASCII words; leave Urdu as-is
        words = [w.capitalize() if w[:1].isascii() else w for w in text.split()]
        return " ".join(words)
    return None

# ---------------------------------------------------------------------------
# Total extraction (keyword scan, then bottom-ROI re-OCR)
# ---------------------------------------------------------------------------

def extract_total(
    reader: easyocr.Reader,
    bgr: np.ndarray,
    all_dets: list[dict],
    img_height: int,
    roi_ratio: float = 0.25,
) -> tuple[float | None, bool]:
    """Find the receipt total.

    First scans *all_dets* for a number inside a detection that also matches
    a total keyword. If none is found, the bottom ``roi_ratio`` fraction of
    *bgr* (25 % by default) is re-OCR'd with a digit-only allowlist and the
    largest number read there is used.

    Returns:
        ``(total_value, found_via_keyword)``; ``(None, False)`` when neither
        pass yields a number. A failure of the re-OCR pass is logged and
        treated as "no number found".
    """
    # --- keyword scan in full dets first ---
    keyword_total = _scan_keyword_total(all_dets)
    if keyword_total is not None:
        return keyword_total, True

    # --- bottom-ROI re-OCR ---
    roi_y = int(img_height * (1.0 - roi_ratio))
    roi = bgr[roi_y:, :]
    try:
        raw = reader.readtext(
            roi,
            text_threshold=0.10,
            low_text=0.05,
            allowlist="0123456789.,۰۱۲۳۴۵۶۷۸۹",
        )
        candidates: list[float] = []
        for (_, text, _) in raw:
            candidates.extend(extract_numbers(normalise_text(text)))
        if candidates:
            return max(candidates), False
    except Exception as exc:
        # Best-effort pass: the caller falls back to the item sum.
        log.warning("Bottom-ROI OCR failed: %s", exc)

    return None, False


def _scan_keyword_total(dets: list[dict]) -> float | None:
    """Return the largest number found inside a detection matching ``TOTAL_KW``.

    Only numbers in the same detection as the keyword are considered.
    TODO: also check neighbouring detections (same line or next line) — a
    keyword and its amount recognised as separate boxes are currently missed.
    """
    candidates: list[float] = []
    for det in dets:
        text = normalise_text(det["text"])
        if TOTAL_KW.search(text):
            nums = extract_numbers(text)
            if nums:
                candidates.append(max(nums))
    return max(candidates) if candidates else None


def detect_transaction_type(dets: list[dict]) -> str:
    """Classify the receipt from keywords in the concatenated detection text.

    Checks ``UDHAAR_KW``, then ``WASOOLI_KW``, then ``CASH_KW``; returns
    ``"udhaar"``, ``"wasooli"``, ``"cash"`` or ``"unknown"`` accordingly.
    """
    full = " ".join(normalise_text(d["text"]) for d in dets)
    if UDHAAR_KW.search(full):
        return "udhaar"
    if WASOOLI_KW.search(full):
        return "wasooli"
    if CASH_KW.search(full):
        return "cash"
    return "unknown"

# ---------------------------------------------------------------------------
# Master pipeline
# ---------------------------------------------------------------------------

def process_image(image_bytes: bytes) -> dict:
    """Run the full OCR pipeline on one encoded image and return the result dict.

    Steps: cache lookup by SHA-256 of the bytes, downscale to
    ``Config.TARGET_WIDTH`` on the longest side, auto-rotate, OCR every
    pre-processing variant and merge, extract customer name, items, total
    and transaction type. The result is stored in the cache before returning.

    The returned dict has the keys ``request_id``, ``success``,
    ``customer_name``, ``items``, ``total``, ``mismatch``,
    ``transaction_type``, ``processing_time_ms`` and ``item_count``. When no
    text is detected the value of ``_empty_result`` is cached and returned.
    """
    t0 = time.monotonic()
    # One digest serves as both the cache key and (truncated) the request id.
    img_hash = hashlib.sha256(image_bytes).hexdigest()
    request_id = img_hash[:16]

    # ---- cache lookup (full SHA-256 key) ----
    cached = _cache.get(img_hash)
    if cached is not None:
        log.info("Cache HIT %s", img_hash[:12])
        return cached

    # ---- resize if too large (saves OCR time) ----
    # NOTE(review): relies on PIL's `Image` being imported at module level — confirm.
    pil_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    w0, h0 = pil_img.size
    if max(w0, h0) > Config.TARGET_WIDTH:
        scale = Config.TARGET_WIDTH / max(w0, h0)
        # Clamp to 1 px so an extreme aspect ratio cannot yield a 0-sized side.
        new_size = (max(1, int(w0 * scale)), max(1, int(h0 * scale)))
        pil_img = pil_img.resize(new_size, Image.LANCZOS)

    # ---- decode & auto-rotate ----
    bgr = _pil_to_bgr(pil_img)
    bgr = _auto_rotate(bgr)
    h = bgr.shape[0]

    reader = get_reader()

    # ---- multi-variant OCR ----
    variants = build_variants(bgr)
    all_variant_results: list[list[dict]] = [_run_ocr(reader, v) for v in variants]

    merged_dets = merge_ocr_results(all_variant_results)
    log.info("[%s] Merged %d detections from %d variants",
             request_id[:8], len(merged_dets), len(variants))

    if not merged_dets:
        result = _empty_result(t0, request_id)
        _cache.set(img_hash, result)
        return result

    # ---- customer name ----
    customer_name = extract_customer_name(merged_dets, h)

    # ---- line grouping (exclude top NAME_ROI for items) ----
    name_cutoff = h * Config.NAME_ROI_RATIO
    body_dets = [d for d in merged_dets if d["yc"] >= name_cutoff]
    lines = group_into_lines(body_dets)

    # ---- item parsing ----
    items: list[dict] = []
    for line in lines:
        norm = normalise_text(line_text(line))
        # Total / transaction-keyword lines are not items.
        if TOTAL_KW.search(norm) or UDHAAR_KW.search(norm) or WASOOLI_KW.search(norm):
            continue
        item = parse_item_from_line(line)
        if item and item["price"] > 0:
            items.append(item)

    # ---- total extraction ----
    total_val, found_by_kw = extract_total(reader, bgr, merged_dets, h)
    items_sum = round(sum(i["price"] * i["quantity"] for i in items), 2)

    if total_val is None:
        total_val = items_sum
        mismatch = False
        log.info("[%s] No total found; summed items → %.2f", request_id[:8], total_val)
    else:
        # Allow 5 % (at least 5.0) of slack between printed total and item sum.
        tolerance = max(5.0, total_val * 0.05)
        mismatch = abs(total_val - items_sum) > tolerance

    # ---- transaction type ----
    tx_type = detect_transaction_type(merged_dets)

    # ---- build response ----
    elapsed_ms = round((time.monotonic() - t0) * 1000, 1)
    result = {
        "request_id": request_id,
        "success": True,
        "customer_name": customer_name,
        "items": items,
        "total": round(float(total_val), 2),
        "mismatch": mismatch,
        "transaction_type": tx_type,
        "processing_time_ms": elapsed_ms,
        "item_count": len(items),
    }
    _cache.set(img_hash, result)
    gc.collect()
    log.info("[%s] Done %.0f ms | items=%d | total=%.2f | mismatch=%s",
             request_id[:8], elapsed_ms, len(items), total_val, mismatch)
    return result
- vlm_raw: Optional[str] = None - if SystemConfig.ENABLE_VLM and not self.vlm_engine._disabled: - logger.info("[%s] PASS 1A: VLM extraction (Qwen2-VL)", request_id) - try: - vlm_raw = await asyncio.wait_for( - asyncio.to_thread(self.vlm_engine.extract, pil_img), - timeout=SystemConfig.VLM_TIMEOUT_SECONDS, - ) - except asyncio.TimeoutError: - logger.warning("[%s] VLM timed out after %.0fs — falling back to OCR.", - request_id, SystemConfig.VLM_TIMEOUT_SECONDS) - all_corrections.append("vlm_timeout_fallback_to_ocr") - except Exception as vlm_exc: - logger.warning("[%s] VLM error: %s — falling back.", request_id, vlm_exc) - all_corrections.append(f"vlm_error_fallback: {vlm_exc}") - - vlm_lines: List[str] = [] - if vlm_raw: - vlm_lines, vlm_total_hint = QwenVLMEngine.parse_vlm_output(vlm_raw) - if vlm_lines: - vlm_lines_used = True - logger.info("[%s] VLM extracted %d lines (total_hint=%s)", - request_id, len(vlm_lines), vlm_total_hint) - all_corrections.append(f"vlm_extracted_{len(vlm_lines)}_lines") - - # ===== PASS 2: MULTI-ENGINE OCR (fallback / supplement) ===== - logger.info("[%s] PASS 2: Multi-Engine OCR", request_id) - lines, ocr_blocks = self.ocr_engine.extract_text_lines(enhanced) - - # Merge VLM lines with OCR lines (VLM first = higher priority) - if vlm_lines: - seen_norm = {norm(x) for x in vlm_lines} - for ol in lines: - k = norm(ol) - if len(k) > 2 and k not in seen_norm: - vlm_lines.append(ol) - seen_norm.add(k) - lines = vlm_lines # Use merged set - - # Low-quality rescue pass - if sharpness < 0.55 or len(lines) < 4: - logger.info( - f"[{request_id}] PASS 2B: Variant OCR rescue (sharpness={sharpness:.2f}, base_lines={len(lines)})" - ) - merged_lines: List[str] = list(lines) - merged_blocks: List[OCRTextBlock] = list(ocr_blocks) - seen_line_keys = {norm(x) for x in merged_lines} - seen_block_keys = { - f"{norm(b['text'])}:{int((b['bbox'][0][1] + b['bbox'][2][1]) / 2)}:{b['engine']}" - for b in merged_blocks - if b.get("bbox") - } - # Speed guard: don't 
over-ensemble. Use at most 2 variants and stop early. - for variant in self.preprocessor.generate_variants(enhanced)[:2]: - v_lines, v_blocks = self.ocr_engine.extract_text_lines(variant) - for vl in v_lines: - k = norm(vl) - if len(k) > 2 and k not in seen_line_keys: - merged_lines.append(vl) - seen_line_keys.add(k) - for vb in v_blocks: - try: - k = f"{norm(vb['text'])}:{int((vb['bbox'][0][1] + vb['bbox'][2][1]) / 2)}:{vb['engine']}" - except Exception: - continue - if k not in seen_block_keys: - merged_blocks.append(vb) - seen_block_keys.add(k) - if len(merged_lines) >= 8 and len(merged_blocks) >= 20: - break - lines, ocr_blocks = merged_lines, merged_blocks - - if not lines: - return ProcessingResult( - request_id=request_id, - success=False, - status=ProcessingStatus.FAILED, - errors=["No text detected in image"], - processing_time_ms=int((time.time() - start_time) * 1000) - ) - - # ========== PASS 3: SEMANTIC GROUNDING & PARSING ========== - logger.info(f"[{request_id}] PASS 3: Semantic Grounding") - customer_name, name_conf, name_warnings, name_hitl = self.parser.parse_customer_name( - lines, - ocr_blocks=ocr_blocks, - image_shape=enhanced.shape[:2], - top_roi_lines=self.ocr_engine.extract_top_roi_name_lines( - enhanced, roi_ratio=SystemConfig.NAME_ROI_RATIO - ), - ) - all_corrections.extend(name_warnings) - items, items_conf = self.parser.parse_items(lines) - total, total_conf = self.parser.parse_total(lines, items) - paper_total = float(total or 0.0) - items, gap_notes = self.parser.reconcile_items_from_ocr_gap(items, total, ocr_blocks) - all_corrections.extend(gap_notes) - items, total, calc_notes, calc_total_conf = self.calc_engine.reconcile(lines, items, total) - all_corrections.extend(calc_notes) - total_conf = max(total_conf, calc_total_conf) - - # Fast mismatch rescue (only when needed): - # If total is missing or mismatch is high, OCR bottom ROI for totals and reconcile again. 
- if items: - item_sum = float(sum(i.quantity * i.price for i in items)) - rel_err = abs(item_sum - float(total)) / max(item_sum, float(total), 1.0) if total > 0 else 1.0 - if total <= 0 or rel_err > 0.12: - bot_lines = self.ocr_engine.extract_bottom_roi_total_lines(enhanced, roi_ratio=0.38) - if bot_lines: - all_corrections.append(f"bottom_roi_total_micro_pass_lines_{len(bot_lines)}") - mix_lines = list(lines) + bot_lines - items, total, calc_notes2, calc_total_conf2 = self.calc_engine.reconcile( - mix_lines, items, total - ) - all_corrections.extend(calc_notes2) - total_conf = max(total_conf, calc_total_conf2) - - # v5.2: FORCE RECONSTRUCTION if extraction still failing - if not items and total > 0 and len(lines) > 2: - logger.warning(f"[{request_id}] v5.2: Items extraction empty, FORCING reconstruction from all text lines") - # v5.2: Treat EVERY line containing a number as a potential item line - extracted_from_lines = [] - for line in lines: - line_lower = line.lower() - skip_kw = ['date', 'total', 'udhaar', 'wasooli', 'cash', 'name', 'customer', 'time', 'receipt'] - if any(kw in line_lower for kw in skip_kw): - continue - # Find all numbers in this line - nums = re.findall(r'\d+(?:\.\d+)?', line) - if nums: - # Extract text part (before the last number) - text_part = re.sub(r'\d+(?:\.\d+)?', '', line).strip() - # Get the last number as price - try: - price = float(nums[-1]) - if 1 <= price <= total and text_part: - extracted_from_lines.append((text_part, price, line)) - except: - pass - - # Create items from extracted lines - if extracted_from_lines: - for idx, (text_part, price, orig_line) in enumerate(extracted_from_lines): - item_name = normalize_ocr_text(text_part).lower() - normalized_name, _ = PakistaniRetailLexicon.normalize_item_name(item_name) - if not normalized_name or normalized_name == item_name: - normalized_name = f"Uncategorized: {item_name[:20]}" - - items.append(ExtractedItem( - name=normalized_name, - quantity=1.0, - price=price, - 
unit="unit", - confidence=0.55, # Reconstruction confidence - original_text=orig_line, - semantic_match=None, - semantic_score=0.0 - )) - items_conf = 0.58 - logger.info(f"[{request_id}] v5.2: FORCE extracted {len(items)} items from all lines") - - # Also try reconstruction algorithm if force extraction didn't work - if not items: - all_numbers = [] - for line in lines: - nums = re.findall(r'\d+(?:\.\d+)?', line) - for num in nums: - try: - val = float(num) - if 1 < val < total: - all_numbers.append(val) - except: - pass - - reconstructed = reconstruct_items_from_total(total, all_numbers) - if reconstructed: - for idx, (qty, price) in enumerate(reconstructed): - items.append(ExtractedItem( - name=f"ReconstructedItem_{idx+1}", - quantity=qty, - price=price, - unit="unit", - confidence=0.50, - original_text="", - semantic_match=None, - semantic_score=0.0 - )) - items_conf = 0.55 - logger.info(f"[{request_id}] v5.2: Reconstructed {len(items)} items from total") - - # v5.1: Fallback logic - if items extraction still failed, return all numeric lines - if not items and len(lines) > 2: - logger.warning(f"[{request_id}] v5.1: Fallback - extracting all numeric lines") - # Extract all detected numbers as UnknownItem entries - all_numbers = [] - for line in lines: - nums = re.findall(r'\d+(?:\.\d+)?', line) - for num in nums: - try: - val = float(num) - if 1 < val < self.parser.config.MAX_REASONABLE_PRICE: - all_numbers.append((val, line)) - except: - pass - - # Create items from numeric lines - if all_numbers: - for idx, (num_val, orig_line) in enumerate(all_numbers): - items.append(ExtractedItem( - name=f"UnknownItem_{idx+1}", - quantity=1.0, - price=num_val, - unit="unit", - confidence=0.35, - original_text=orig_line, - semantic_match=None, - semantic_score=0.0 - )) - items_conf = 0.40 - logger.info(f"[{request_id}] Fallback created {len(items)} items from numeric lines") - - # ========== PASS 4A: INTELLIGENT SELF-CORRECTION (Enhanced) ========== - 
logger.info(f"[{request_id}] PASS 4A: Intelligent Self-Correction") - - # Pipeline notes: every stage appends into all_corrections only (no parallel *corrections* lists). - items, _batch = self.self_corrector.apply_learned_corrections(items) - all_corrections.extend(_batch) - - items, _batch = self.self_corrector.validate_item_prices(items) - all_corrections.extend(_batch) - - corrected_total, _batch = self.self_corrector.apply_mathematical_corrections(items, total) - all_corrections.extend(_batch) - - # ========== PASS 4B: AGENTIC MATH VALIDATION ========== - logger.info(f"[{request_id}] PASS 4B: Agentic Math Validation") - math_validated, final_total, _batch, final_total_conf = self.validator.validate(items, corrected_total) - all_corrections.extend(_batch) - - # Detect transaction type - full_text = ' '.join(lines) - tx_type, type_conf = PakistaniRetailLexicon.detect_transaction_type(full_text) - if tx_type == "cash": - tx_type = "wasooli" - if tx_type not in ("udhaar", "wasooli"): - tx_type = "udhaar" if any(k in full_text.lower() for k in ["udhaar", "ادھار", "بقایا"]) else "wasooli" - - # v5.2: Post-correction confidence boosting - logger.info(f"[{request_id}] v5.2: Post-correction confidence boosting") - for item in items: - # Boost confidence if math validates - if math_validated: - item.confidence += SystemConfig.CONF_BOOST_MATH - item.confidence = min(0.98, item.confidence) - - # Boost for lexicon matches - if item.semantic_match and item.semantic_score > 0.7: - item.confidence += SystemConfig.CONF_BOOST_LEXICON - item.confidence = min(0.98, item.confidence) - - # Recalculate average confidence - items_conf = sum(item.confidence for item in items) / len(items) if items else 0.5 - - # Final math rescue for medium mismatch: prefer internally consistent item sum (line totals). 
- if items and final_total > 0: - item_sum = sum(float(i.price) for i in items) - rel_gap = abs(item_sum - final_total) / max(item_sum, final_total, 1.0) - if not math_validated and rel_gap <= 0.30: - all_corrections.append(f"post_pass_total_aligned_to_items_{final_total}_to_{item_sum}") - final_total = float(item_sum) - math_validated = True - - # Hard guarantee: never return a total that doesn't equal sum(qty * price). - if items: - computed_sum = float(sum(float(i.price) for i in items)) - if computed_sum > 0: - if abs(float(final_total or 0.0) - computed_sum) > max(2.0, 0.01 * computed_sum): - all_corrections.append( - f"final_total_overridden_to_computed_sum_from_{round(float(final_total or 0.0),2)}_to_{round(computed_sum,2)}" - ) - final_total = float(computed_sum) - # If paper total differs, flag warning but still keep computed sum. - if paper_total > 0 and abs(paper_total - computed_sum) > max(2.0, 0.01 * computed_sum): - all_corrections.append( - f"paper_total_mismatch_paper_{round(paper_total,2)}_computed_{round(computed_sum,2)}" - ) - mismatch = True - else: - mismatch = False - math_validated = True - - # Output shape hardening for mobile form mapping. 
- customer_name = (customer_name or "").strip() or None - for it in items: - if it.unit not in ("kg", "g", "liter", "ml", "dozen", "pc", "packet"): - it.unit = "pc" - if it.quantity <= 0: - it.quantity = 1.0 - if it.price <= 0: - it.price = 1.0 - - confidence = self.validator.calculate_confidence(items, math_validated, final_total) - confidence['semantic'] = items_conf - confidence['extraction'] = total_conf - confidence['calc_engine'] = round(float(total_conf), 3) - - # v5.2: Add quality metrics to confidence - confidence['image_quality'] = sharpness - confidence['parsing_mode'] = 'aggressive' if sharpness < SystemConfig.AGGRESSIVE_PARSING_THRESHOLD else 'standard' - - # v5.2: Determine items extraction source for metadata - uncategorized_count = sum(1 for item in items if "Uncategorized:" in item.name) - fallback_used = any("UnknownItem_" in item.name for item in items) - reconstruction_used = any("ReconstructedItem_" in item.name for item in items) - force_extracted = any("Uncategorized:" in item.name for item in items) - - # Determine extraction method - if reconstruction_used: - extraction_method = "reconstruction" - elif force_extracted: - extraction_method = "force_extraction_from_lines" - elif fallback_used: - extraction_method = "numeric_fallback" - else: - extraction_method = "standard_parsing" - - # Create result - result = ProcessingResult( - request_id=request_id, - success=True, - customer_name=customer_name, - hitl_data=name_hitl, - items=items, - total_amount=final_total, - transaction_type=tx_type, - mismatch=bool(mismatch), - confidence=confidence, - processing_time_ms=int((time.time() - start_time) * 1000), - status=ProcessingStatus.COMPLETED, - metadata={ - "lines_extracted": len(lines), - "ocr_blocks": len(ocr_blocks), - "items_extracted_from_ocr_blocks": len(items) > 0, - "items_count": len(items), - "uncategorized_items_count": uncategorized_count, - "quality_metrics": quality_metrics, - "corrections": all_corrections, - "math_validated": 
math_validated, - "hitl_learning_active": True, - "fallback_used": fallback_used, - "reconstruction_used": reconstruction_used, - "force_extracted_from_lines": force_extracted, - "items_extraction_method": extraction_method, - "v5_2_aggressive_mode": sharpness < SystemConfig.AGGRESSIVE_PARSING_THRESHOLD, - "image_sharpness": sharpness, - "adaptive_thresholds_applied": True, - "easyocr_rows_skipped": self.ocr_engine.easyocr_rows_skipped, - "paddle_rows_skipped": self.ocr_engine.paddle_rows_skipped, - "calc_engine_enabled": True, - "calc_engine_notes_count": len([c for c in all_corrections if "calc_engine" in c or "single_item_price_repair" in c]), - # v6.0 VLM metadata - "vlm_enabled": SystemConfig.ENABLE_VLM, - "vlm_model": SystemConfig.VLM_MODEL_ID, - "vlm_lines_used": vlm_lines_used, - "vlm_total_hint": vlm_total_hint, - "engine_used": "vlm+ensemble" if vlm_lines_used else ("ensemble" if (EASYOCR_AVAILABLE or SystemConfig.ENABLE_PADDLE) else "easyocr"), - "cache_stats": _IMAGE_CACHE.stats(), - "rss_mb": round(_get_rss_mb(), 1), - } - ) - - if all_corrections: - result.warnings = all_corrections - - # Store for ZDR-compliant access (expires in 1 hour) - self.zdr_manager.store_request_data(request_id, result) - - logger.info( - "[%s] Completed in %dms | VLM=%s | Corrections=%d | RSS=%.0fMB", - request_id, result.processing_time_ms, - "yes" if vlm_lines_used else "no", - len(all_corrections), _get_rss_mb(), - ) - return result - - except Exception as e: - logger.error("[%s] Processing failed: %s", request_id, e, exc_info=True) - return ProcessingResult( - request_id=request_id, - success=False, - status=ProcessingStatus.FAILED, - errors=[str(e)], - processing_time_ms=int((time.time() - start_time) * 1000) - ) - - -# ============================================================================ -# FASTAPI APPLICATION -# ============================================================================ +def _empty_result(t0: float, request_id: str = "") -> dict: + return { + 
"request_id": request_id, + "success": False, + "customer_name": None, + "items": [], + "total": 0.0, + "mismatch": False, + "transaction_type": "unknown", + "processing_time_ms": round((time.monotonic() - t0) * 1000, 1), + "item_count": 0, + } +# --------------------------------------------------------------------------- +# FastAPI app +# --------------------------------------------------------------------------- app = FastAPI( - title="Smart Parchi OCR Enterprise", - description="Professional Urdu-English handwritten receipt processing system", - version="6.0.0" + title="Parchi OCR – Minimal CPU Edition", + description="Handwritten Urdu/English receipt OCR. No GPU. No VLM.", + version="1.0.0", ) app.add_middleware( CORSMiddleware, allow_origins=["*"], - allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) -# Global orchestrator -orchestrator = ParchiOrchestrator() - - -# ============================================================================ -# API ENDPOINTS -# ============================================================================ @app.on_event("startup") -async def startup_event(): - """v6.0: Warm up EasyOCR/Paddle on startup. 
VLM loads lazily on first request.""" - global OCR_WARMUP_STATUS - SystemConfig.validate() - logger.info("Smart Parchi OCR v6.0.0 started") - logger.info( - "Engines | VLM=%s(%s) PADDLE=%s EASYOCR=%s TORCH=%s TRANSFORMERS=%s", - SystemConfig.ENABLE_VLM, - SystemConfig.VLM_MODEL_ID, - SystemConfig.ENABLE_PADDLE, - EASYOCR_AVAILABLE, - TORCH_AVAILABLE, - TRANSFORMERS_AVAILABLE, - ) - logger.info("Startup RSS: %.0f MB", _get_rss_mb()) - - if os.getenv("SKIP_OCR_WARMUP", "0").strip() in ("1", "true", "yes"): - logger.warning("SKIP_OCR_WARMUP=1 — OCR will init on first request (slower first hit).") - OCR_WARMUP_STATUS = "skipped" - else: +async def _warmup(): + """Non-blocking warmup – loads EasyOCR weights in a thread so startup is fast.""" + async def _load(): try: - # Warm up EasyOCR + Paddle (downloads models if not cached). - # VLM (Qwen2-VL) loads lazily on first inference request to keep - # startup fast and let HF health probe pass before 8 GB loads. - await asyncio.to_thread(orchestrator.ocr_engine.initialize) - logger.info("OCR ensemble warm-up finished. 
RSS: %.0f MB", _get_rss_mb()) - OCR_WARMUP_STATUS = "complete" + await asyncio.to_thread(get_reader) + log.info("Warmup complete.") except Exception as exc: - logger.error("OCR warm-up failed (will retry on first request): %s", exc, exc_info=True) - OCR_WARMUP_STATUS = "failed" - - logger.info( - "v6.0 ready | cache_ttl=%ds max_entries=%d concurrency_limit=2", - SystemConfig.CACHE_TTL, - SystemConfig.MAX_CACHE_SIZE, - ) + log.error("Warmup failed: %s", exc) + asyncio.create_task(_load()) -@app.get("/health") -async def health_check() -> Dict[str, Any]: - """Health check endpoint""" - paddle_ready = bool(PADDLE_AVAILABLE and SystemConfig.ENABLE_PADDLE) +@app.get("/", tags=["health"]) +def root(): return { - "status": "healthy", - "version": "6.0.0", - "timestamp": datetime.now().isoformat(), - "warmup_status": OCR_WARMUP_STATUS, - "features": { - "vision_enhancement": True, - "multi_engine_ocr": True, - "semantic_lexicon": True, - "agentic_math": True, - "vlm_enabled": SystemConfig.ENABLE_VLM, - "vlm_model": SystemConfig.VLM_MODEL_ID if SystemConfig.ENABLE_VLM else None, - "vlm_loaded": getattr(orchestrator.vlm_engine, "_loaded", False), - "vlm_disabled": getattr(orchestrator.vlm_engine, "_disabled", False), - "enable_paddle": SystemConfig.ENABLE_PADDLE, - "paddle_available": bool(PADDLE_AVAILABLE and SystemConfig.ENABLE_PADDLE), - "easyocr_available": EASYOCR_AVAILABLE, - "ocr_initialized": getattr(orchestrator.ocr_engine, "_initialized", False), - "tesseract_available": TESSERACT_AVAILABLE, - "sklearn_available": SKLEARN_AVAILABLE, - "torch_available": TORCH_AVAILABLE, - "transformers_available": TRANSFORMERS_AVAILABLE, - "cache_stats": _IMAGE_CACHE.stats(), - "rss_mb": round(_get_rss_mb(), 1), - } + "service": "parchi-ocr-minimal", + "status": "ok", + "time_utc": datetime.now(timezone.utc).isoformat(), } -@app.get("/") -async def root() -> Dict[str, Any]: - """Stop noisy GET / 404 logs (HF/health probes).""" - return {"ok": True, "service": "smart-parchi-ocr", 
"version": "6.0.0"} - -@app.post("/process-parchi", response_model=ProcessingResult) -async def process_parchi( - image: UploadFile = File(...), - background_tasks: BackgroundTasks = None -) -> ProcessingResult: - """ - Process a single parchi image. - - Accepts image files (JPEG, PNG, etc.) and returns structured extraction. - """ - # Validate file - if not image.content_type or not image.content_type.startswith("image/"): - raise HTTPException(status_code=400, detail="File must be an image") - - # Read image - contents = await image.read() - if len(contents) > SystemConfig.MAX_IMAGE_SIZE_MB * 1024 * 1024: - raise HTTPException(status_code=400, detail=f"Image too large (max {SystemConfig.MAX_IMAGE_SIZE_MB}MB)") - - # Process with hard timeout guard so UI never hangs indefinitely. - try: - result = await asyncio.wait_for( - orchestrator.process(contents), - timeout=float(SystemConfig.FASTAPI_HARD_TIMEOUT_SECONDS), - ) - except asyncio.TimeoutError: - return ProcessingResult( - request_id=hashlib.md5(contents).hexdigest()[:16], - success=False, - status=ProcessingStatus.FAILED, - errors=["timeout_error"], - warnings=[f"processing_exceeded_{int(SystemConfig.FASTAPI_HARD_TIMEOUT_SECONDS)}s"], - processing_time_ms=int(SystemConfig.FASTAPI_HARD_TIMEOUT_SECONDS * 1000), - metadata={ - "timeout_seconds": float(SystemConfig.FASTAPI_HARD_TIMEOUT_SECONDS), - "filename": image.filename, - "content_type": image.content_type, - }, - ) - - # Add file metadata - result.metadata["filename"] = image.filename - result.metadata["content_type"] = image.content_type - # Backward/forward compatibility for mobile clients: - # provide both legacy and modern keys so field mapping never breaks. 
- result.total = float(result.total_amount or 0.0) - result.type = str(result.transaction_type or "unknown") - result.meta = dict(result.metadata or {}) - try: - c = dict(result.confidence or {}) - overall = float(c.get("overall") or 0.0) - if overall <= 0: - parts = [float(c.get("items") or 0.0), float(c.get("total") or 0.0), float(c.get("type") or 0.0)] - overall = max(0.0, min(1.0, sum(parts) / max(1, len(parts)))) - result.confidence_score = max(0.0, min(1.0, overall)) - except Exception: - result.confidence_score = 0.0 - ui_items = [ - { - "item": str(it.name or "").strip(), - "qty": str(float(it.quantity or 0.0)).rstrip("0").rstrip(".") or "0", - "price": str(float(it.price or 0.0)).rstrip("0").rstrip(".") or "0", - } - for it in (result.items or []) - if str(it.name or "").strip() or float(it.quantity or 0.0) > 0 or float(it.price or 0.0) > 0 - ] - result.items_list = ui_items - result.line_items = list(ui_items) - - return result - - -@app.post("/process-batch") -async def process_batch( - images: List[UploadFile], - background_tasks: BackgroundTasks -) -> List[ProcessingResult]: - """ - Process multiple images in batch. - Limited to 5 images per request for performance. - """ - if len(images) > 5: - raise HTTPException(status_code=400, detail="Maximum 5 images per batch") - - results = [] - for img in images[:5]: - contents = await img.read() - result = await orchestrator.process(contents) - results.append(result) - - return results - - -@app.get("/lexicon") -async def get_lexicon() -> Dict[str, List[str]]: - """Get the semantic lexicon for reference""" - return PakistaniRetailLexicon.LEXICON - - -# ============================================================================ -# HUMAN-IN-THE-LOOP (HITL) FEEDBACK ENDPOINTS -# ============================================================================ - -@app.post("/feedback/submit") -async def submit_feedback(feedback: UserFeedback) -> Dict[str, Any]: - """ - Submit user corrections to improve the model. 
- Corrections are immediately used to fine-tune future predictions. - - PRIVACY: Data is not persisted (Zero Data Retention compliant). - """ - if not feedback.timestamp: - feedback.timestamp = datetime.now().isoformat() - - result = orchestrator.feedback_learner.add_feedback(feedback) - - return { - **result, - "privacy_note": "Your feedback is immediately processed but not stored persistently", - "your_request_id": feedback.request_id - } - - -@app.get("/feedback/insights") -async def get_feedback_insights() -> Dict[str, Any]: - """ - Get insights into common OCR errors and learned patterns. - Useful for understanding system performance and high-error areas. - """ - insights = orchestrator.feedback_learner.get_error_insights() - learned_corrections = orchestrator.feedback_learner.get_high_confidence_corrections() - +@app.get("/health", tags=["health"]) +def health(): return { - **insights, - "learned_corrections_active": len(learned_corrections), - "common_items_learned": list(learned_corrections.keys())[:10] + "status": "ok", + "version": "1.1.0", + "engine": "EasyOCR [ur, en]", + "cache_size": len(_cache._store), } -@app.get("/feedback/status") -async def get_feedback_status() -> Dict[str, Any]: - """Get real-time feedback system status""" - status = { - "learning_active": True, - "total_corrections_aggregated": orchestrator.feedback_learner.total_corrections, - "unique_items_learned": len(orchestrator.feedback_learner.feedback_history), - "error_patterns_detected": len(orchestrator.feedback_learner.error_patterns), - "last_cleared": orchestrator.feedback_learner.last_cleared.isoformat() - } - return status - - -# ============================================================================ -# COMPLIANCE & PRIVACY ENDPOINTS -# ============================================================================ - -@app.get("/compliance/zdr-status") -async def get_zdr_status() -> Dict[str, Any]: - """ - Zero Data Retention (ZDR) compliance status. 
- Ensures no sensitive financial data is persisted. - """ - return orchestrator.zdr_manager.get_compliance_status() - - -@app.post("/compliance/cleanup") -async def manual_zdr_cleanup() -> Dict[str, Any]: - """ - Manually trigger data cleanup (normally automatic). - Removes all cached request data older than specified hours. - """ - result = orchestrator.zdr_manager._cleanup_expired() - return { - "status": "manual_cleanup_executed", - "expired_requests_removed": result, - "remaining_cached": len(orchestrator.zdr_manager.request_cache) - } - +async def _handle_upload(file: UploadFile) -> dict: + """Shared logic for both OCR endpoints.""" + if file.content_type and not file.content_type.startswith("image/"): + raise HTTPException(status_code=400, detail="File must be an image.") + image_bytes = await file.read() + if len(image_bytes) > Config.MAX_IMAGE_SIZE_MB * 1024 * 1024: + raise HTTPException(status_code=413, + detail=f"Image too large (max {Config.MAX_IMAGE_SIZE_MB} MB).") + try: + # Run CPU-heavy work in thread so the event loop stays responsive + return await asyncio.to_thread(process_image, image_bytes) + except Exception as exc: + log.exception("Processing error") + raise HTTPException(status_code=500, detail=str(exc)) from exc -@app.get("/feedback/clear-old-data") -async def clear_old_feedback_data(hours: int = 24) -> Dict[str, Any]: - """ - Clear old feedback data (ZDR compliance). - Default: Clear data older than 24 hours. 
- """ - return orchestrator.feedback_learner.clear_old_data(hours=hours) +@app.post("/ocr", tags=["ocr"]) +async def ocr_endpoint(file: UploadFile = File(...)): + """Upload a parchi image → structured JSON (items, total, customer_name …).""" + return await _handle_upload(file) -# ============================================================================ -# LEXICON MANAGEMENT (Enhanced) -# ============================================================================ -@app.post("/lexicon/add-from-feedback") -async def add_items_from_feedback() -> Dict[str, Any]: - """ - Create extended lexicon from high-confidence learned corrections. - Useful for creating domain-specific custom lexicons. - """ - learned = orchestrator.feedback_learner.get_high_confidence_corrections() - - extended_lexicon = {} - for item_name, values in learned.items(): - if item_name not in PakistaniRetailLexicon.LEXICON: - extended_lexicon[item_name] = { - "price": values["price"], - "quantity": values["qty"], - "frequency": len(orchestrator.feedback_learner.feedback_history[item_name]) - } - - return { - "status": "lexicon_extension_ready", - "new_items": len(extended_lexicon), - "items": extended_lexicon, - "note": "Use this to export and deploy custom lexicons" - } +@app.post("/process-parchi", tags=["ocr"]) +async def process_parchi(image: UploadFile = File(...)): + """Alias for /ocr – compatible with reference API clients.""" + return await _handle_upload(image) -# ============================================================================ -# RETRIEVAL ENDPOINTS -# ============================================================================ +@app.delete("/cache", tags=["admin"]) +def clear_cache(): + """Clear the in-memory cache (useful for testing).""" + _cache._store.clear() + return {"cleared": True} -@app.get("/result/{request_id}") -async def retrieve_result(request_id: str) -> Optional[ProcessingResult]: - """ - Retrieve cached result by request ID (within TTL). 
- Useful for async processing workflows. - """ - result = orchestrator.zdr_manager.retrieve_request_data(request_id) - - if result is None: - raise HTTPException( - status_code=404, - detail=f"Request {request_id} not found or expired (TTL: {orchestrator.zdr_manager.cache_ttl}s)" - ) - - return result +@app.get("/cache/stats", tags=["admin"]) +def cache_stats(): + return {"entries": len(_cache._store), "max": _CACHE_MAX, "ttl_s": _CACHE_TTL_S} -# ============================================================================ -# MAIN ENTRY POINT -# ============================================================================ +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- if __name__ == "__main__": - import uvicorn - - print(""" - ╔═══════════════════════════════════════════════════════════════════════════╗ - ║ SMART PARCHI OCR v5.0.0 - ENTERPRISE EDITION ║ - ║ Intelligent Receipt Processing with HITL Learning ║ - ╠═══════════════════════════════════════════════════════════════════════════╣ - ║ ║ - ║ CORE FEATURES: ║ - ║ ✅ 7-layer Vision Enhancement Pipeline ║ - ║ ✅ Multi-Engine OCR (EasyOCR + PaddleOCR + Fallback) ║ - ║ ✅ Urdu-English Bilingual Support (Nastaliq & Naskh) ║ - ║ ✅ Pakistani Retail Semantic Lexicon (50+ items with Urdu variants) ║ - ║ ✅ Agentic 4-Pass Self-Correction Loop ║ - ║ ✅ Mathematical Validation & Auto-Correction ║ - ║ ✅ Confidence Scoring with Explainable AI ║ - ║ ║ - ║ ENTERPRISE FEATURES (NEW): ║ - ║ ✅ Human-In-The-Loop (HITL) Feedback Learning System ║ - ║ ✅ Zero Data Retention (ZDR) Compliance - HIPAA Ready ║ - ║ ✅ Intelligent Pattern Detection from User Corrections ║ - ║ ✅ Real-time Error Insights & Analytics ║ - ║ ✅ Adaptive Price Validation Against Retail Knowledge ║ - ║ ✅ Multimodal Reasoning (Vision + Semantic + Math) ║ - ║ ✅ Privacy-First Architecture (No Persistent Storage) ║ - ║ ║ - ║ DEPLOYMENT SPECS: ║ - ║ • 
Optimized for Hugging Face Spaces (CPU Tier) ║ - ║ • Memory Usage: 700-900MB (2GB limit compatible) ║ - ║ • Processing Time: 2-5 seconds per image ║ - ║ • Batch Processing: Up to 5 images per request ║ - ║ • Auto-cleanup: Data expires after 1 hour (ZDR) ║ - ║ ║ - ║ API ENDPOINTS: ║ - ║ POST /process-parchi - Process single receipt ║ - ║ POST /process-batch - Batch process (max 5) ║ - ║ POST /feedback/submit - Submit corrections (HITL) ║ - ║ GET /feedback/insights - Get error patterns ║ - ║ GET /compliance/zdr-status - Privacy compliance ║ - ║ GET /result/{request_id} - Retrieve cached result ║ - ║ ║ - ╚═══════════════════════════════════════════════════════════════════════════╝ - """) - - uvicorn.run( - app, - host="0.0.0.0", - port=8000, - log_level="info" - ) \ No newline at end of file + uvicorn.run("app:app", host="0.0.0.0", port=7860, workers=1, log_level="info")