e
File size: 5,001 Bytes
57e072f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""
app/services/ocr.py
EasyOCR wrapper with lazy loading, caching, and label-presence detection.
"""
import hashlib
import logging
import os
import re
import threading
from io import BytesIO

import numpy as np
from PIL import Image

from app.models.db import get_ocr_cache, set_ocr_cache

logger = logging.getLogger(__name__)

# "data" directory under the process's working directory at import time.
DATA_DIR = os.path.join(os.getcwd(), "data")
# Cache root: the HF_HOME environment variable, or "/app/.cache" when it is unset.
CACHE_DIR = os.environ.get("HF_HOME", "/app/.cache")
# Handed to easyocr.Reader as its model_storage_directory.
MODEL_DIR = os.path.join(CACHE_DIR, "easyocr_models")

# ── Lazy EasyOCR (avoids 30s startup freeze) ──────────────────────────
# Reader instances built so far; get_reader_for keys them by the sorted
# language codes joined with "_".
_LANG_READERS: dict = {}

# Held by get_reader_for while it constructs and stores a new reader.
_READERS_LOCK = threading.Lock()

# Language hint -> list of EasyOCR language codes to load for that hint.
# Every entry includes "en"; hints not listed here fall back to ["en"]
# in get_reader_for.
_EASYOCR_LANG_MAP = {
    "en": ["en"],
    "hi": ["en", "hi"],
    "zh": ["en", "ch_sim"],
    "ta": ["en", "ta"],
    "te": ["en", "te"],
    "bn": ["en", "bn"],
}


def get_reader_for(lang_hint: str):
    """Return the EasyOCR reader for ``lang_hint``, building it on first use.

    The hint is looked up in ``_EASYOCR_LANG_MAP``; unknown hints fall back to
    English only. Readers are stored in ``_LANG_READERS`` under the sorted
    language codes joined with "_", so later calls for the same language set
    reuse the same instance.
    """
    languages = _EASYOCR_LANG_MAP.get(lang_hint, ["en"])
    cache_key = "_".join(sorted(languages))

    # Fast path: an existing reader is returned without taking the lock.
    if cache_key in _LANG_READERS:
        return _LANG_READERS[cache_key]

    with _READERS_LOCK:
        # Look again now that the lock is held; the entry may have been
        # stored between the check above and acquiring the lock.
        if cache_key not in _LANG_READERS:
            # Imported here rather than at module level so easyocr is only
            # loaded when a reader is actually requested.
            import easyocr as _easyocr

            logger.info("Loading EasyOCR for langs=%s", languages)
            reader = _easyocr.Reader(
                languages, gpu=False, model_storage_directory=MODEL_DIR
            )
            _LANG_READERS[cache_key] = reader
        return _LANG_READERS[cache_key]


def run_ocr(content: bytes, lang_hint: str = "en") -> dict:
    """Extract text from encoded image bytes with EasyOCR, caching the result.

    Results are cached via ``get_ocr_cache`` / ``set_ocr_cache`` under a key
    made of the MD5 hex digest of ``content`` plus ``lang_hint``; a truthy
    cached value is returned as-is without running OCR.

    Args:
        content: Bytes of an image file that Pillow can open.
        lang_hint: Language hint forwarded to ``get_reader_for``.

    Returns:
        A dict with:
          * ``text`` - all recognised strings joined with single spaces.
          * ``word_count`` - number of entries returned by ``readtext``
            (one per detected text region, not per whitespace-separated word).
          * ``avg_confidence`` - mean confidence rounded to 3 decimals, or
            0.0 when nothing was detected.
          * ``is_readable`` - True when there are at least 3 entries and the
            mean confidence is above 0.15.
    """
    cache_key = f"{hashlib.md5(content).hexdigest()}_{lang_hint}"
    cached = get_ocr_cache(cache_key)
    if cached:
        return cached

    img = Image.open(BytesIO(content)).convert("RGB")
    # Shrinks in place to fit within 1200x1200; smaller images are untouched.
    img.thumbnail((1200, 1200))
    img_np = np.array(img)

    # With detail=1 each entry is indexed as (box, text, confidence).
    results = get_reader_for(lang_hint).readtext(img_np, detail=1)
    words = [entry[1] for entry in results]
    confidences = [entry[2] for entry in results]
    text = " ".join(words)

    # EasyOCR may report confidences as NumPy scalars; coerce to built-in
    # float/bool so the dict that is cached and returned holds plain Python
    # values. NOTE(review): confirm what set_ocr_cache requires of its value.
    avg_conf = float(sum(confidences) / len(confidences)) if confidences else 0.0

    result = {
        "text": text,
        "word_count": len(words),
        "avg_confidence": round(avg_conf, 3),
        "is_readable": bool(len(words) >= 3 and avg_conf > 0.15),
    }
    set_ocr_cache(cache_key, result)
    return result


# ── Label presence detection ───────────────────────────────────────────
# Terms that typically appear on the back-of-pack label (nutrition panel,
# ingredient list, regulatory/packing marks).
LABEL_KEYWORDS = [
    'ingredients', 'nutrition', 'nutritional', 'calories', 'calorie', 'protein',
    'fat', 'carbohydrate', 'carbs', 'sodium', 'sugar', 'sugars', 'fiber', 'fibre',
    'serving', 'cholesterol', 'saturated', 'trans', 'vitamin', 'calcium', 'iron',
    'per 100g', 'per 100 g', 'daily value', 'daily values', 'amount per', 'total fat',
    'contains', 'may contain', 'preservative', 'flavour', 'flavor', 'emulsifier',
    'mg', 'mcg', 'kcal', 'kj', '% dv', '%dv', 'g per', 'per serving',
    'fssai', 'best before', 'mfg', 'mrp', 'net wt', 'manufactured', 'packed',
]
# Marketing-style words taken as a sign that the front of the pack was scanned.
FRONT_PACK_SIGNALS = [
    'new', 'improved', 'original', 'classic', 'natural', 'organic', 'premium',
    'delicious', 'flavoured', 'variety', 'crunchy', 'crispy', 'fresh', 'tasty',
    'yummy', 'light', 'baked', 'roasted',
]
# Must have >= 2 of these to confirm a nutrition-facts panel
NUTRITION_TABLE_ANCHORS = [
    'per 100g', 'per 100 g', 'per serving', 'serving size', 'amount per',
    'daily value', 'daily values', '% dv', '%dv', 'calories', 'calorie', 'kcal',
    'kj', 'energy', 'nutrition facts', 'nutritional information', 'total fat',
    'saturated fat', 'trans fat', 'total carbohydrate', 'dietary fiber',
    'ingredients:', 'fssai', 'best before', 'mfg', 'mrp', 'net wt',
]


def detect_label_presence(ocr_text: str) -> dict:
    """Decide whether OCR text looks like a nutrition/ingredient label.

    Matching is plain substring search on the lower-cased text, so a short
    keyword such as 'mg' also matches inside a longer token like '120mg'.

    Args:
        ocr_text: Text extracted from the image; may be empty.

    Returns:
        A dict with:
          * ``has_label`` - whether a label is considered present.
          * ``confidence`` - 'high', 'medium' or 'low'.
          * ``label_hits`` - matched ``LABEL_KEYWORDS`` (first 5 only on the
            high/medium path).
          * ``front_hits`` - matched ``FRONT_PACK_SIGNALS`` (first 3 only on
            the high/medium and the negative paths).
          * ``suggestion`` - None when a label is accepted; otherwise
            'no_text' (empty input), 'wrong_side' (front-of-pack words were
            found), 'no_label' (neither a table nor front-of-pack words), or
            'partial' (fallback when no other rule applied).
    """
    if not ocr_text:
        return {'has_label': False, 'confidence': 'high',
                'label_hits': [], 'front_hits': [], 'suggestion': 'no_text'}

    tl = ocr_text.lower()
    label_hits = [kw for kw in LABEL_KEYWORDS if kw in tl]
    front_hits = [kw for kw in FRONT_PACK_SIGNALS if kw in tl]
    anchor_hits = [kw for kw in NUTRITION_TABLE_ANCHORS if kw in tl]
    ls, fs = len(label_hits), len(front_hits)
    has_table = len(anchor_hits) >= 2

    # Strong evidence: a table plus several label keywords.
    if has_table and ls >= 3:
        return {'has_label': True,
                'confidence': 'high' if ls >= 6 else 'medium',
                'label_hits': label_hits[:5], 'front_hits': front_hits[:3],
                'suggestion': None}

    # Weak evidence: a table, few label keywords, and little front-pack text.
    if has_table and ls >= 1 and fs <= 2:
        return {'has_label': True, 'confidence': 'low',
                'label_hits': label_hits, 'front_hits': front_hits, 'suggestion': None}

    if fs > ls or not has_table:
        # Only blame the wrong side of the pack when front-of-pack words were
        # actually seen. The previous condition also tested `not has_table`,
        # which made it always true here and left 'no_label' unreachable.
        sug = 'wrong_side' if fs > 0 else 'no_label'
        return {'has_label': False, 'confidence': 'high',
                'label_hits': label_hits, 'front_hits': front_hits[:3], 'suggestion': sug}

    # Fallback when a table was detected but none of the rules above matched.
    return {'has_label': True, 'confidence': 'low',
            'label_hits': label_hits, 'front_hits': front_hits, 'suggestion': 'partial'}