"""
app/services/llm.py
LLM abstraction layer + proprietary food database population.

Doc says: "Your 'AI analysis' is a prompt. Prompt engineering is not a
competitive advantage."
This module starts building the real moat: every verified scan result goes
into the food_products table.
"""
import os
import re
import json
import math
import hashlib
import logging
import asyncio

from app.models.db import get_ai_cache, set_ai_cache, db_conn

logger = logging.getLogger(__name__)

GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
_groq_client = None
if GROQ_API_KEY:
    # Imported lazily so the module still loads when no API key is configured.
    from groq import Groq
    _groq_client = Groq(api_key=GROQ_API_KEY)

MEDICAL_DISCLAIMER = (
    "⚕️ For informational purposes only — not medical advice. "
    "Consult a qualified nutritionist or physician before making dietary decisions."
)

LANGUAGE_MAP = {
    "en": "English",
    "zh": "Simplified Chinese",
    "es": "Spanish",
    "ar": "Arabic",
    "fr": "French",
    "hi": "Hindi (हिन्दी)",
    "pt": "Portuguese",
    "de": "German",
}

# Models are tried in this order; the next one is used when the previous fails.
_LLM_MODELS = ("llama-3.3-70b-versatile", "llama-3.1-8b-instant")

# First unsigned decimal number inside a string such as "34g" or "1.5 mg".
_NUMBER_RE = re.compile(r"\d+\.?\d*")

# Fallback used when the model's chart_data cannot be normalised.
_DEFAULT_CHART = (70, 20, 10)

# Result keys that must always be lists.
_LIST_FIELDS = ("nutrient_breakdown", "pros", "cons", "age_warnings")

_DEFAULT_SCORE = 5


def call_llm(prompt: str, max_tokens: int = 2500) -> str:
    """Provider-agnostic LLM call. Swap Groq → Anthropic → Ollama here.

    Each model in ``_LLM_MODELS`` is tried in order and the first non-empty
    completion is returned.

    Args:
        prompt: Full user prompt; the call requests a JSON-object response.
        max_tokens: Completion token limit passed to the provider.

    Returns:
        The raw completion text.

    Raises:
        RuntimeError: If no Groq client is configured, or if every model
            either raised or returned an empty completion. The last provider
            exception, if any, is chained as ``__cause__``.
    """
    if not _groq_client:
        raise RuntimeError("GROQ_API_KEY not set")

    last_exc = None
    for model in _LLM_MODELS:
        try:
            comp = _groq_client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=max_tokens,
                response_format={"type": "json_object"},
            )
            content = comp.choices[0].message.content
        except Exception as exc:  # provider boundary: any failure means "try next"
            last_exc = exc
            logger.warning("LLM %s failed: %s", model, exc)
            continue
        if content:
            return content
        # An empty/None completion is unusable downstream; fall through.
        logger.warning("LLM %s returned an empty response", model)

    raise RuntimeError("All LLM models failed") from last_exc


def build_analysis_prompt(extracted_text: str, persona: str, age_group: str,
                          product_category: str, language: str,
                          web_context: str, label_confidence: str,
                          blur_info: dict) -> str:
    """Build the analysis prompt sent to the LLM.

    Args:
        extracted_text: Text read from the product label.
        persona: Persona string inserted verbatim into the prompt.
        age_group: Age group string inserted verbatim into the prompt.
        product_category: Category string inserted verbatim into the prompt.
        language: Key into ``LANGUAGE_MAP``; unknown keys fall back to English.
        web_context: Extra context inserted verbatim into the prompt.
        label_confidence: ``"low"`` adds a partial-label warning to the prompt.
        blur_info: Mapping read for the keys ``detected``, ``deblurred`` and
            ``severity``. ``None`` or an empty mapping means "no blur note".

    Returns:
        The complete prompt string.
    """
    lang_name = LANGUAGE_MAP.get(language, "English")
    conf_note = (
        "⚠️ Label text may be partial — only list nutrients you can read confidently."
        if label_confidence == "low" else ""
    )

    blur_info = blur_info or {}
    blur_ctx = ""
    if blur_info.get("detected"):
        verb = ("enhanced via Wiener deconvolution"
                if blur_info.get("deblurred") else "blurry, used original")
        severity = blur_info.get("severity")
        # A missing severity must not abort the scan; degrade to plain "blurry".
        blurry = f"{severity}ly blurry" if severity else "blurry"
        blur_ctx = f"IMAGE: {blurry} ({verb}). Only report confident values."

    # Numeric slots carry <...> placeholders so the template tells the model
    # what type belongs in each position.
    return f"""[INST] You are an expert nutritional scientist and food safety auditor.
CRITICAL: Respond ENTIRELY in {lang_name}. Every text field MUST be in {lang_name}.

Persona: {persona} | Age: {age_group} | Category: {product_category}
{conf_note}
{blur_ctx}

Label Text: "{extracted_text}"
Web Context: "{web_context}"

Return ONLY valid JSON — no markdown, no preamble:
{{
  "product_name" : "Short name from label",
  "product_category" : "Snack|Dairy|Beverage|Cereal|Supplement|etc.",
  "score" : <integer 1-10 per the rubric below>,
  "verdict" : "Two-word verdict in {lang_name}",
  "chart_data" : [<number>, <number>, <number>],
  "summary" : "2-sentence professional summary in {lang_name}.",
  "eli5_explanation" : "Child-friendly explanation with emojis in {lang_name}.",
  "molecular_insight" : "1-2 sentences on biochemical body impact in {lang_name}.",
  "paragraph_benefits": "Full paragraph on genuine benefits in {lang_name}.",
  "paragraph_uniqueness": "Unique characteristics OR 2 better alternatives in {lang_name}.",
  "is_unique" : true,
  "nutrient_breakdown": [
    {{"name":"Protein","value":<number>,"unit":"g","rating":"good","impact":"brief note in {lang_name}"}},
    {{"name":"Sugar","value":<number>,"unit":"g","rating":"moderate","impact":"brief note"}},
    {{"name":"Fat","value":<number>,"unit":"g","rating":"good","impact":"brief note"}},
    {{"name":"Sodium","value":<number>,"unit":"mg","rating":"caution","impact":"brief note"}},
    {{"name":"Fiber","value":<number>,"unit":"g","rating":"good","impact":"brief note"}}
  ],
  "pros" : ["Benefit 1 in {lang_name}", "Benefit 2", "Benefit 3"],
  "cons" : ["Risk 1 in {lang_name}", "Risk 2"],
  "age_warnings" : [
    {{"group":"Children","emoji":"👶","status":"warning","message":"in {lang_name}"}},
    {{"group":"Adults","emoji":"🧑","status":"good","message":"in {lang_name}"}},
    {{"group":"Seniors","emoji":"👴","status":"caution","message":"in {lang_name}"}},
    {{"group":"Pregnant","emoji":"🤰","status":"caution","message":"in {lang_name}"}}
  ],
  "better_alternative": "A specific healthier alternative in {lang_name}.",
  "is_low_confidence" : false
}}

SCORING RUBRIC — MANDATORY, never use 6 or 7 as defaults:
  9-10: Whole food, no added sugar, low sodium, high fibre/protein
  7-8 : Mildly processed, sugar <5g/100g, reasonable sodium
  5-6 : Processed, sugar 5-15g/100g OR sodium 400-700mg/100g
  3-4 : High sugar >15g/100g OR sodium >700mg/100g OR poor profile
  1-2 : Ultra-processed, very high sugar/sodium/sat-fat

RULES: chart_data sums to 100 | rating: good|moderate|caution|bad | status: good|caution|warning
[/INST]"""


def _is_number(value) -> bool:
    """Return True for a finite int/float; bools, NaN and infinities are rejected."""
    return (
        isinstance(value, (int, float))
        and not isinstance(value, bool)
        and math.isfinite(value)
    )


def _normalise_chart(chart) -> list:
    """Return a three-element chart that sums to 100, or the default chart."""
    usable = (
        isinstance(chart, list)
        and len(chart) == 3
        and all(_is_number(v) and v >= 0 for v in chart)
    )
    if not usable:
        return list(_DEFAULT_CHART)
    total = sum(chart)
    if total <= 0:
        # All zeros cannot be rescaled.
        return list(_DEFAULT_CHART)
    if total == 100:
        return chart
    scaled = [round(v * 100 / total) for v in chart]
    # Put the rounding remainder on the largest slice so the sum is exactly 100.
    scaled[scaled.index(max(scaled))] += 100 - sum(scaled)
    return scaled


def _coerce_score(raw):
    """Return *raw* as a number clamped to 1..10, or the default score."""
    if _is_number(raw):
        value = raw
    elif isinstance(raw, str):
        match = _NUMBER_RE.search(raw.replace(",", "."))
        if not match:
            return _DEFAULT_SCORE
        value = float(match.group())
    else:
        return _DEFAULT_SCORE
    value = min(10, max(1, value))
    return int(value) if float(value).is_integer() else value


def sanitise_result(result: dict) -> dict:
    """Fix all known LLM output issues: chart rounding, unit strings, defaults.

    The dict is modified in place and also returned.

    * ``chart_data`` becomes three non-negative numbers summing to 100;
      anything unusable is replaced by ``[70, 20, 10]``.
    * ``nutrient_breakdown``, ``pros``, ``cons`` and ``age_warnings`` are
      always lists; non-dict nutrient entries are dropped.
    * Nutrient values such as ``"34g"`` or ``"1,5 g"`` become floats; values
      containing no digits are left untouched.
    * ``score`` is coerced to a number within 1..10 (default 5).
    * ``verdict``, ``product_name`` and ``is_low_confidence`` get defaults
      when missing or null (the two strings also when empty).
    """
    result["chart_data"] = _normalise_chart(result.get("chart_data"))

    for key in _LIST_FIELDS:
        if not isinstance(result.get(key), list):
            result[key] = []

    nutrients = [n for n in result["nutrient_breakdown"] if isinstance(n, dict)]
    for nutrient in nutrients:
        value = nutrient.get("value", "")
        if _is_number(value):
            # Already numeric: convert directly so "1e-05" is not re-parsed as 1.
            nutrient["value"] = float(value)
            continue
        # Nutrient value "34g" → 34.0 (a decimal comma is treated as a point).
        match = _NUMBER_RE.search(str(value).replace(",", "."))
        if match:
            nutrient["value"] = float(match.group())
    result["nutrient_breakdown"] = nutrients

    result["score"] = _coerce_score(result.get("score"))
    if result.get("verdict") in (None, ""):
        result["verdict"] = "Analyzed"
    if result.get("product_name") in (None, ""):
        result["product_name"] = "Unknown Product"
    if result.get("is_low_confidence") is None:
        result["is_low_confidence"] = False
    return result


async def analyse_label(
    extracted_text: str,
    persona: str,
    age_group: str,
    product_category: str,
    language: str,
    web_context: str,
    blur_info: dict,
    label_confidence: str,
) -> dict:
    """Full analysis pipeline: cache → LLM → sanitise → return.

    Args:
        extracted_text: Text read from the product label.
        persona, age_group, product_category, language, web_context,
        blur_info, label_confidence: Passed through to
            ``build_analysis_prompt``.

    Returns:
        The sanitised analysis dict with a ``disclaimer`` entry added, or the
        cached value for the same language/persona/age group/label text.

    Raises:
        RuntimeError: Propagated from ``call_llm``.
        ValueError: If the LLM output is not valid JSON
            (``json.JSONDecodeError``) or is not a JSON object.
    """
    # Hash the whole label text: keying on a prefix lets two different labels
    # with the same opening characters share one cache entry.
    text_digest = hashlib.sha256(
        extracted_text.encode("utf-8", errors="replace")
    ).hexdigest()
    cache_key = f"v4:{language}:{persona}:{age_group}:{text_digest}"

    cached = get_ai_cache(cache_key)
    if cached:
        return cached

    prompt = build_analysis_prompt(
        extracted_text, persona, age_group, product_category,
        language, web_context, label_confidence, blur_info
    )
    # call_llm blocks on network I/O, so it runs in a worker thread.
    raw = await asyncio.to_thread(call_llm, prompt, 2500)
    parsed = json.loads(raw)
    if not isinstance(parsed, dict):
        raise ValueError("LLM response is not a JSON object")

    result = sanitise_result(parsed)
    result["disclaimer"] = MEDICAL_DISCLAIMER

    # Cache (without ephemeral fields)
    cacheable = {k: v for k, v in result.items()
                 if k not in ("blur_info", "scan_meta", "allergen_warning")}
    set_ai_cache(cache_key, cacheable)
    return result


# ── Phase 2: Proprietary food database population ─────────────────────

def _nutrient_value(nutrients, *keys, exclude=()) -> float:
    """Return the first non-zero numeric value whose name matches a key.

    Keys are tried in order. A nutrient matches when its lower-cased name
    contains the key and none of the *exclude* substrings. Returns 0.0 when
    nothing matches.
    """
    for key in keys:
        for entry in nutrients:
            if not isinstance(entry, dict):
                continue
            label = str(entry.get("name") or "").lower()
            if key not in label or any(bad in label for bad in exclude):
                continue
            value = entry.get("value", 0)
            if _is_number(value) and value:
                return float(value)
    return 0.0


def upsert_food_product(
    name: str,
    nutrients: list,
    score: int,
    ingredients_raw: str = "",
    barcode: str | None = None,
    brand: str = "",
    category: str = "",
    source: str = "llm_scan",
) -> int:
    """
    Insert or update a product in the proprietary food_products table.
    Every scan calls this. Over time this builds a data moat.

    An existing row is looked up by barcode when one is given, otherwise by
    exact name + brand. A hit only increments ``scan_count``; the stored
    nutrient values are not rewritten.

    Args:
        name: Product name; surrounding whitespace is stripped.
        nutrients: List of dicts with ``name`` and ``value`` keys; only
            numeric values are used. ``None`` is treated as an empty list.
        score: Stored in the ``eatlytic_score`` column.
        ingredients_raw: Stored as given.
        barcode: Blank or ``None`` is stored as NULL.
        brand: Surrounding whitespace is stripped.
        category: Stored as given.
        source: Stored as given.

    Returns the product id.
    """
    nutrients = nutrients or []
    clean_name = (name or "").strip()
    # The lookup and the INSERT must agree on the stripped brand, otherwise a
    # padded brand can never be found again and every scan adds a duplicate.
    clean_brand = (brand or "").strip()
    barcode = (barcode or "").strip() or None

    cal = _nutrient_value(nutrients, "calorie", "energy", "kcal")
    prot = _nutrient_value(nutrients, "protein")
    carb = _nutrient_value(nutrients, "carb")
    # "sat" also covers "unsaturated"; total fat must not pick up a sub-type.
    fat = _nutrient_value(nutrients, "fat", exclude=("sat", "trans"))
    sod = _nutrient_value(nutrients, "sodium")
    fib = _nutrient_value(nutrients, "fiber", "fibre")
    sug = _nutrient_value(nutrients, "sugar")
    sat = _nutrient_value(nutrients, "saturated", exclude=("unsaturated",))

    with db_conn() as conn:
        # Try to find existing by barcode (most reliable) or name+brand
        if barcode:
            existing = conn.execute(
                "SELECT id, scan_count FROM food_products WHERE barcode=?",
                (barcode,)
            ).fetchone()
        else:
            existing = conn.execute(
                "SELECT id, scan_count FROM food_products WHERE name=? AND brand=?",
                (clean_name, clean_brand)
            ).fetchone()

        if existing:
            # Increment scan_count — this is how we know which products are popular
            conn.execute(
                """UPDATE food_products
                   SET scan_count=scan_count+1, updated_at=datetime('now')
                   WHERE id=?""",
                (existing["id"],)
            )
            return existing["id"]

        cursor = conn.execute(
            """INSERT INTO food_products
               (name,brand,category,barcode,calories_100g,protein_100g,carbs_100g,
                fat_100g,sodium_100g,fiber_100g,sugar_100g,sat_fat_100g,
                eatlytic_score,ingredients_raw,source,scan_count)
               VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,1)""",
            (clean_name, clean_brand, category, barcode, cal, prot, carb,
             fat, sod, fib, sug, sat, score, ingredients_raw, source)
        )
        return cursor.lastrowid


def get_food_from_db(name: str = "", barcode: str = "") -> dict | None:
    """Look up a product in our proprietary DB before hitting LLM.

    Only rows with ``verified=1`` are considered. A barcode, when given, is
    matched exactly; otherwise the name is matched as a substring and the
    row with the highest ``scan_count`` wins.

    Returns:
        The row as a dict, or ``None`` when both arguments are blank or
        nothing matches.
    """
    barcode = (barcode or "").strip()
    name = (name or "").strip()
    if not barcode and not name:
        return None

    with db_conn() as conn:
        if barcode:
            row = conn.execute(
                "SELECT * FROM food_products WHERE barcode=? AND verified=1",
                (barcode,)
            ).fetchone()
        else:
            # Escape LIKE wildcards so "%" or "_" in a name match literally.
            escaped = (name.replace("\\", "\\\\")
                           .replace("%", "\\%")
                           .replace("_", "\\_"))
            row = conn.execute(
                """SELECT * FROM food_products
                   WHERE name LIKE ? ESCAPE '\\' AND verified=1
                   ORDER BY scan_count DESC LIMIT 1""",
                (f"%{escaped}%",)
            ).fetchone()
    return dict(row) if row else None