e / ocr.py
Shaikhsarib's picture
Upload 11 files
57e072f verified
"""
app/services/ocr.py
EasyOCR wrapper with lazy loading, caching, and label-presence detection.
"""
import hashlib
import logging
import os
import re
import threading
from io import BytesIO

import numpy as np
from PIL import Image

from app.models.db import get_ocr_cache, set_ocr_cache
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# "data" directory under the working directory the process was started in
# (resolved once, at import time).
DATA_DIR = os.path.join(os.getcwd(), "data")

# Cache root: taken from the HF_HOME environment variable when it is set,
# otherwise "/app/.cache".
CACHE_DIR = os.environ.get("HF_HOME", "/app/.cache")

# Directory handed to EasyOCR as its model storage location.
MODEL_DIR = os.path.join(CACHE_DIR, "easyocr_models")
# ── Lazy EasyOCR (avoids 30s startup freeze) ──────────────────────────
# Readers created so far, keyed by the sorted, "_"-joined language list.
_LANG_READERS: dict = {}
# Serialises reader construction so a given key is only built once.
_READERS_LOCK = threading.Lock()
# Language hint -> list of EasyOCR language codes to load together.
_EASYOCR_LANG_MAP = {
    "en": ["en"],
    "hi": ["en", "hi"],
    "zh": ["en", "ch_sim"],
    "ta": ["en", "ta"],
    "te": ["en", "te"],
    "bn": ["en", "bn"],
}


def get_reader_for(lang_hint: str):
    """Return the EasyOCR reader for ``lang_hint``, creating it on first use.

    Hints missing from ``_EASYOCR_LANG_MAP`` fall back to English only.
    ``easyocr`` is imported inside the function, so the import cost is paid
    on the first cache miss rather than when this module is loaded.

    Args:
        lang_hint: Language code used to look up the EasyOCR language list.

    Returns:
        The reader stored in ``_LANG_READERS`` for the resolved languages.
    """
    languages = _EASYOCR_LANG_MAP.get(lang_hint, ["en"])
    cache_key = "_".join(sorted(languages))

    # Fast path: the reader already exists, no locking needed.
    if cache_key in _LANG_READERS:
        return _LANG_READERS[cache_key]

    with _READERS_LOCK:
        # Check again under the lock: another thread may have created the
        # reader while this one was waiting.
        if cache_key not in _LANG_READERS:
            import easyocr as _easyocr

            logger.info("Loading EasyOCR for langs=%s", languages)
            _LANG_READERS[cache_key] = _easyocr.Reader(
                languages, gpu=False, model_storage_directory=MODEL_DIR
            )
    return _LANG_READERS[cache_key]
def run_ocr(content: bytes, lang_hint: str = "en") -> dict:
    """Extract text from image bytes, using the OCR cache when possible.

    The cache key is the MD5 hex digest of ``content`` joined with
    ``lang_hint``; a truthy cached entry is returned as-is. Otherwise the
    image is decoded, converted to RGB, shrunk in place to fit within
    1200x1200, and passed to the EasyOCR reader for ``lang_hint``.

    Args:
        content: Raw bytes of the encoded image.
        lang_hint: Language code forwarded to ``get_reader_for``.

    Returns:
        A dict with:
            ``text``: all detected fragments joined by single spaces.
            ``word_count``: number of detected fragments (one per OCR
                result, not a whitespace-split word count).
            ``avg_confidence``: mean confidence rounded to 3 decimals,
                ``0.0`` when nothing was detected.
            ``is_readable``: ``True`` when at least 3 fragments were found
                and the mean confidence exceeds 0.15.
    """
    cache_key = f"{hashlib.md5(content).hexdigest()}_{lang_hint}"
    cached = get_ocr_cache(cache_key)
    if cached:
        return cached

    img = Image.open(BytesIO(content)).convert("RGB")
    img.thumbnail((1200, 1200))
    results = get_reader_for(lang_hint).readtext(np.array(img), detail=1)

    # Each result is indexed as (_, text, confidence).
    words = [r[1] for r in results]
    # Coerce to built-in float: OCR confidences may arrive as NumPy scalars,
    # which would make avg_confidence a NumPy float and is_readable a
    # numpy.bool_ - types that can break serialisation of the result.
    confidences = [float(r[2]) for r in results]
    avg_conf = sum(confidences) / len(confidences) if confidences else 0.0

    result = {
        "text": " ".join(words),
        "word_count": len(words),
        "avg_confidence": round(avg_conf, 3),
        "is_readable": len(words) >= 3 and avg_conf > 0.15,
    }
    set_ocr_cache(cache_key, result)
    return result
# ── Label presence detection ───────────────────────────────────────────
# Terms that typically appear on the ingredients / nutrition side of a pack.
LABEL_KEYWORDS = [
    'ingredients', 'nutrition', 'nutritional', 'calories', 'calorie', 'protein',
    'fat', 'carbohydrate', 'carbs', 'sodium', 'sugar', 'sugars', 'fiber', 'fibre',
    'serving', 'cholesterol', 'saturated', 'trans', 'vitamin', 'calcium', 'iron',
    'per 100g', 'per 100 g', 'daily value', 'daily values', 'amount per', 'total fat',
    'contains', 'may contain', 'preservative', 'flavour', 'flavor', 'emulsifier',
    'mg', 'mcg', 'kcal', 'kj', '% dv', '%dv', 'g per', 'per serving',
    'fssai', 'best before', 'mfg', 'mrp', 'net wt', 'manufactured', 'packed',
]

# Marketing wording that typically appears on the front of a pack.
FRONT_PACK_SIGNALS = [
    'new', 'improved', 'original', 'classic', 'natural', 'organic', 'premium',
    'delicious', 'flavoured', 'variety', 'crunchy', 'crispy', 'fresh', 'tasty',
    'yummy', 'light', 'baked', 'roasted',
]

# Must have >=2 of these to confirm a nutrition-facts panel
NUTRITION_TABLE_ANCHORS = [
    'per 100g', 'per 100 g', 'per serving', 'serving size', 'amount per',
    'daily value', 'daily values', '% dv', '%dv', 'calories', 'calorie', 'kcal',
    'kj', 'energy', 'nutrition facts', 'nutritional information', 'total fat',
    'saturated fat', 'trans fat', 'total carbohydrate', 'dietary fiber',
    'ingredients:', 'fssai', 'best before', 'mfg', 'mrp', 'net wt',
]


def detect_label_presence(ocr_text: str) -> dict:
    """Decide whether OCR text looks like a nutrition / ingredients label.

    The text is lower-cased and every keyword list is matched by plain
    substring containment, so short keywords such as ``'mg'`` or ``'fat'``
    also match inside longer words.

    Args:
        ocr_text: Text extracted from the image; may be empty or ``None``.

    Returns:
        A dict with:
            ``has_label``: whether a label is considered present.
            ``confidence``: ``'high'``, ``'medium'`` or ``'low'``.
            ``label_hits``: matched ``LABEL_KEYWORDS`` (first 5 only in the
                strong-match case).
            ``front_hits``: matched ``FRONT_PACK_SIGNALS`` (first 3 only in
                the strong-match and no-label cases).
            ``suggestion``: ``None``, ``'no_text'``, ``'wrong_side'``,
                ``'no_label'`` or ``'partial'``.
    """
    if not ocr_text:
        return {'has_label': False, 'confidence': 'high',
                'label_hits': [], 'front_hits': [], 'suggestion': 'no_text'}

    lowered = ocr_text.lower()
    label_hits = [kw for kw in LABEL_KEYWORDS if kw in lowered]
    front_hits = [kw for kw in FRONT_PACK_SIGNALS if kw in lowered]
    anchor_hits = [kw for kw in NUTRITION_TABLE_ANCHORS if kw in lowered]

    ls, fs = len(label_hits), len(front_hits)
    has_table = len(anchor_hits) >= 2

    # Strong match: a table plus several label terms.
    if has_table and ls >= 3:
        return {'has_label': True,
                'confidence': 'high' if ls >= 6 else 'medium',
                'label_hits': label_hits[:5], 'front_hits': front_hits[:3],
                'suggestion': None}

    # Weak match: a table, at least one label term, little marketing wording.
    if has_table and ls >= 1 and fs <= 2:
        return {'has_label': True, 'confidence': 'low',
                'label_hits': label_hits, 'front_hits': front_hits,
                'suggestion': None}

    if fs > ls or not has_table:
        # Front-pack wording means the wrong side was photographed; with none
        # of it there is simply no label in view. (The previous condition
        # also tested `not has_table`, which made 'no_label' unreachable.)
        sug = 'wrong_side' if fs > 0 else 'no_label'
        return {'has_label': False, 'confidence': 'high',
                'label_hits': label_hits, 'front_hits': front_hits[:3],
                'suggestion': sug}

    # Fallback: a table was detected but none of the rules above applied.
    return {'has_label': True, 'confidence': 'low',
            'label_hits': label_hits, 'front_hits': front_hits,
            'suggestion': 'partial'}