# demo.py — Depth Pro + YOLO segmentation + Portion & Nutrition post-processing (tables version) import sys import json import numpy as np import cv2 import torch from PIL import Image import gradio as gr from ultralytics import YOLO # ----------------------------------------------------------- # 1. Import depth_pro (adjust path if needed) # ----------------------------------------------------------- # If depth_pro is in a local folder "ml-depth-pro/src" next to this file: sys.path.append("ml-depth-pro/src") import depth_pro # noqa: E402 # ----------------------------------------------------------- # 2. Device selection # ----------------------------------------------------------- device = "cuda" if torch.cuda.is_available() else "cpu" print(f"[INFO] Using device: {device}") # ----------------------------------------------------------- # 3. Load Depth Pro model # ----------------------------------------------------------- print("[INFO] Loading Depth Pro model...") dp_model, dp_transform = depth_pro.create_model_and_transforms() dp_model = dp_model.to(device) dp_model.eval() print("[INFO] Depth Pro ready.") # ----------------------------------------------------------- # 4. Load YOLO segmentation model # ----------------------------------------------------------- # TODO: change this to your actual best.pt path YOLO_MODEL_PATH = r"C:\Users\monol\Desktop\Senior_demo\ml-depth-pro\model\yolo-seg.pt" print(f"[INFO] Loading YOLO model from: {YOLO_MODEL_PATH}") yolo_model = YOLO(YOLO_MODEL_PATH) print("[INFO] YOLO ready.") # ----------------------------------------------------------- # 5. Load preset + nutrition metadata # ----------------------------------------------------------- try: with open("presetdata.json", "r", encoding="utf-8") as f: PRESET_LIST = json.load(f) PRESET_BY_CLASS = {item["class"]: item for item in PRESET_LIST} print(f"[INFO] Loaded {len(PRESET_LIST)} preset entries.") except Exception as e: print("[WARN] Could not load presetdata.json:", e) PRESET_LIST = [] PRESET_BY_CLASS = {} try: with open("nutrition_data.json", "r", encoding="utf-8") as f: NUTRITION_LIST = json.load(f) NUTR_BY_CLASS = {item["class"]: item for item in NUTRITION_LIST} print(f"[INFO] Loaded {len(NUTRITION_LIST)} nutrition entries.") except Exception as e: print("[WARN] Could not load nutrition_data.json:", e) NUTRITION_LIST = [] NUTR_BY_CLASS = {} # ----------------------------------------------------------- # 6. Helper: make depth visualization (RGB uint8) # ----------------------------------------------------------- def make_depth_vis(depth: np.ndarray) -> np.ndarray: """ depth: HxW float (meters), may contain NaNs returns: HxWx3 uint8 RGB image """ d = depth.copy() d[~np.isfinite(d)] = np.nan if not np.isfinite(d).any(): return np.zeros((*depth.shape, 3), dtype=np.uint8) d_min = np.nanpercentile(d, 1) d_max = np.nanpercentile(d, 99) if d_max <= d_min: d_max = d_min + 1e-6 d_norm = (d - d_min) / (d_max - d_min) d_norm = np.clip(d_norm, 0.0, 1.0) d_uint8 = (d_norm * 255).astype(np.uint8) depth_color_bgr = cv2.applyColorMap(d_uint8, cv2.COLORMAP_INFERNO) depth_color_rgb = cv2.cvtColor(depth_color_bgr, cv2.COLOR_BGR2RGB) return depth_color_rgb # ----------------------------------------------------------- # 7. Portion + nutrition helper functions # Using your equation: # Mass_in = Mass_ref * (%area_in / %area_ref) * (Z_in / Z_ref)^2 # ----------------------------------------------------------- def estimate_portion_for_class(cls_name, area_in_pct, z_in_m, default_z_in=None): """ Estimate portion (grams) for one class using preset reference + depth. area_in_pct: percentage area of image (0-100) z_in_m: median depth for that class (meters) """ preset = PRESET_BY_CLASS.get(cls_name) if not preset: return None try: mass_ref = float(preset["portion"]) # grams area_ref = float(preset["mask_region"]) # % area in reference z_ref = float(preset["center_depth"]) # meters except (KeyError, ValueError, TypeError): return None if area_ref <= 0 or z_ref <= 0: return None if z_in_m is None: z_in_m = default_z_in if z_in_m is None or not np.isfinite(z_in_m) or z_in_m <= 0: return None # Apply your scaling equation mass_in = mass_ref * (area_in_pct / area_ref) * (z_in_m / z_ref) ** 2 return { "class": cls_name, "estimated_portion_g": float(mass_in), "area_in_pct": float(area_in_pct), "area_ref_pct": float(area_ref), "z_in_m": float(z_in_m), "z_ref_m": float(z_ref), "mass_ref_g": float(mass_ref), } def estimate_nutrition_for_mass(class_name, mass_g): """ Use nutrition_data.json to scale nutrition by mass. Typically data is per 100 g. """ nutr = NUTR_BY_CLASS.get(class_name) if not nutr: return None try: ref_mass = float(nutr["amount"]) calories = float(nutr["calories"]) protein = float(nutr["protein"]) fat = float(nutr["fat"]) carbs = float(nutr["carbohydrates"]) sodium = float(nutr["sodium"]) except (KeyError, ValueError, TypeError): return None if ref_mass <= 0: return None factor = mass_g / ref_mass return { "class": class_name, "mass_g": float(mass_g), "calories": calories * factor, "protein": protein * factor, "fat": fat * factor, "carbohydrates": carbs * factor, "sodium": sodium * factor, } def breakdown_ingredients(dish_class_name, dish_mass_g): """ Split a dish (e.g., pad kaprao) into ingredients using presetdata.json, then compute ingredient-level nutrition if available in nutrition_data.json. """ preset = PRESET_BY_CLASS.get(dish_class_name) if not preset or "ingredients" not in preset: return [], [] try: portion_ref = float(preset["portion"]) except (KeyError, ValueError, TypeError): return [], [] if portion_ref <= 0: return [], [] ingredient_masses = [] ingredient_nutrition = [] for ing in preset["ingredients"]: ing_name = ing.get("name") try: ing_ref_mass = float(ing["amount"]) except (KeyError, ValueError, TypeError): continue ratio = ing_ref_mass / portion_ref ing_mass_in = dish_mass_g * ratio ingredient_masses.append({ "dish_class": dish_class_name, "ingredient": ing_name, "mass_g": float(ing_mass_in), }) nutr = estimate_nutrition_for_mass(ing_name, ing_mass_in) if nutr: nutr["dish_class"] = dish_class_name ingredient_nutrition.append(nutr) return ingredient_masses, ingredient_nutrition def postprocess_ai_results(rows, center_depth_m): """ rows: list of [class_name, area_pct, median_depth_m] center_depth_m: depth at center of image (meters) Returns: - portions_json: list of dicts like { "class": "pad kaprao", "portion": 100, "portion_label": "gram", "center_depth": "0.47", "mask_region": "5.07" } - dish_nutr_json: list of dish-level nutrition dicts - ingredient_nutr_json: list of ingredient-level nutrition dicts """ portions_json = [] dish_nutr_json = [] ingredient_nutr_json = [] for cls_name, area_pct, md in rows: if area_pct is None: continue # Use median depth if available; otherwise use global center depth if md is not None and np.isfinite(md): z_in = md else: z_in = center_depth_m portion_info = estimate_portion_for_class( cls_name=cls_name, area_in_pct=area_pct, z_in_m=z_in, default_z_in=center_depth_m, ) if portion_info is None: continue # Portion JSON in your requested-ish format portions_json.append({ "class": portion_info["class"], "portion": round(portion_info["estimated_portion_g"], 2), "portion_label": "gram", "center_depth": f"{portion_info['z_in_m']:.2f}", "mask_region": f"{portion_info['area_in_pct']:.2f}", }) # Dish-level nutrition dish_n = estimate_nutrition_for_mass( cls_name, portion_info["estimated_portion_g"] ) if dish_n: dish_nutr_json.append({ "class": dish_n["class"], "mass_g": round(dish_n["mass_g"], 2), "calories": round(dish_n["calories"], 1), "protein": round(dish_n["protein"], 1), "fat": round(dish_n["fat"], 1), "carbohydrates": round(dish_n["carbohydrates"], 1), "sodium": round(dish_n["sodium"], 1), }) # Ingredient-level nutrition (show ALL ingredients, even if we don’t know nutrition) ing_masses, ing_nutrition = breakdown_ingredients( dish_class_name=cls_name, dish_mass_g=portion_info["estimated_portion_g"], ) # Build a quick lookup: (dish_class, ingredient_name) -> nutrition dict nutr_lookup = {} for n in ing_nutrition: key = (n.get("dish_class", cls_name), n["class"]) nutr_lookup[key] = n for mass_rec in ing_masses: dish_cls = mass_rec["dish_class"] ing_name = mass_rec["ingredient"] mass_g = mass_rec["mass_g"] key = (dish_cls, ing_name) n = nutr_lookup.get(key) if n is not None: # We have nutrition data for this ingredient ingredient_nutr_json.append({ "dish_class": dish_cls, "ingredient": ing_name, "mass_g": round(mass_g, 2), "calories": round(n["calories"], 1), "protein": round(n["protein"], 1), "fat": round(n["fat"], 1), "carbohydrates": round(n["carbohydrates"], 1), "sodium": round(n["sodium"], 1), }) else: # No nutrition data -> still show ingredient with mass, leave nutrients blank ingredient_nutr_json.append({ "dish_class": dish_cls, "ingredient": ing_name, "mass_g": round(mass_g, 2), "calories": None, "protein": None, "fat": None, "carbohydrates": None, "sodium": None, }) return portions_json, dish_nutr_json, ingredient_nutr_json # ----------------------------------------------------------- # 8. Main pipeline: Depth Pro + YOLO segmentation + post-processing # ----------------------------------------------------------- def analyze_image(pil_img: Image.Image): # ---------- safety ---------- if pil_img is None: blank = np.zeros((10, 10, 3), dtype=np.uint8) return blank, blank, "Please upload an image first.", [], [], [], [] # Ensure RGB pil_img = pil_img.convert("RGB") rgb_np = np.array(pil_img) H_s, W_s, _ = rgb_np.shape # ======================================================= # A) YOLO segmentation (for mask & class percentages) # ======================================================= seg_vis = rgb_np.copy() class_to_mask = {} # class_name -> combined bool mask H_s x W_s # YOLO expects BGR typically; convert bgr_np = cv2.cvtColor(rgb_np, cv2.COLOR_RGB2BGR) try: results = yolo_model.predict( source=bgr_np, save=False, # we don't save images to disk conf=0.25, iou=0.7, verbose=False ) r = results[0] # visualization (BGR -> RGB) seg_plot_bgr = r.plot() seg_vis = cv2.cvtColor(seg_plot_bgr, cv2.COLOR_BGR2RGB) if r.masks is not None and len(r.masks.data) > 0: masks = r.masks.data.cpu().numpy() # [N, H, W] in YOLO image space boxes = r.boxes for i in range(len(masks)): cls_id = int(boxes.cls[i]) cls_name = yolo_model.names[cls_id] mask_i = masks[i] > 0.5 # bool H_s x W_s if cls_name not in class_to_mask: class_to_mask[cls_name] = mask_i else: class_to_mask[cls_name] |= mask_i else: print("[YOLO] No masks found.") except Exception as e: print("[YOLO ERROR]", e) seg_vis = seg_vis.astype(np.uint8) # ======================================================= # B) Depth Pro (distance from camera) # ======================================================= try: dp_in = dp_transform(pil_img).to(device) with torch.no_grad(): pred = dp_model.infer(dp_in, f_px=None) depth = pred["depth"] if isinstance(depth, torch.Tensor): depth = depth.squeeze().cpu().numpy() except Exception as e: blank = np.zeros((10, 10, 3), dtype=np.uint8) return blank, seg_vis, f"Depth estimation error: {e}", [], [], [], [] if depth is None or not np.isfinite(depth).any(): blank = np.zeros((10, 10, 3), dtype=np.uint8) return blank, seg_vis, "Depth map invalid (NaN/empty).", [], [], [], [] H_d, W_d = depth.shape # depth visualization (resized to original image size) depth_vis = make_depth_vis(depth) depth_vis_big = cv2.resize(depth_vis, (W_s, H_s), interpolation=cv2.INTER_LINEAR) depth_vis_big = depth_vis_big.astype(np.uint8) # ------------------------------------------------------- # Global depth summary (center + ROI) # ------------------------------------------------------- cx_d, cy_d = W_d // 2, H_d // 2 center_depth = float(depth[cy_d, cx_d]) roi = depth[int(H_d * 0.4):int(H_d * 0.6), int(W_d * 0.4):int(W_d * 0.6)] roi = roi[np.isfinite(roi)] roi_depth = float(np.median(roi)) if roi.size > 0 else float("nan") depth_lines = [ "### Depth Estimate", f"- Center depth: **{center_depth:.2f} m**", ] if np.isfinite(roi_depth): depth_lines.append(f"- Center ROI median depth: **{roi_depth:.2f} m**") # ======================================================= # C) Compute % area + median depth per class # ======================================================= total_pixels = H_s * W_s rows = [] # for segmentation stats table: [class, area%, median_depth] for cls_name, mask in class_to_mask.items(): # percentage of image area area_px = int(mask.sum()) area_pct = 100.0 * area_px / total_pixels if total_pixels > 0 else 0.0 # resize mask to depth resolution to sample depth correctly mask_u8 = (mask.astype(np.uint8) * 255) mask_depth = cv2.resize( mask_u8, (W_d, H_d), interpolation=cv2.INTER_NEAREST ) > 0 obj_depths = depth[mask_depth & np.isfinite(depth)] if obj_depths.size > 0: median_depth = float(np.median(obj_depths)) else: median_depth = float("nan") rows.append([ cls_name, round(area_pct, 2), None if not np.isfinite(median_depth) else round(median_depth, 2) ]) # Post-processing: portions + nutrition based on rows + center_depth portions_json, dish_nutr_json, ingredient_nutr_json = postprocess_ai_results( rows, center_depth ) if rows: depth_lines.append("\n### Object distances (per class)") for cls_name, area_pct, md in rows: if md is None: depth_lines.append( f"- {cls_name}: {area_pct:.2f}% of image, depth: N/A" ) else: depth_lines.append( f"- {cls_name}: {area_pct:.2f}% of image, median depth **{md:.2f} m**" ) else: depth_lines.append("\n_No segmentation masks detected._") depth_text = "\n".join(depth_lines) # ------------------------------------------------------- # Convert JSON-like results to table rows for Dataframe # ------------------------------------------------------- # Portions table: class, portion(g), center_depth(m), mask_region(%) portions_table_rows = [ [ p["class"], p["portion"], p["portion_label"], p["center_depth"], p["mask_region"], ] for p in portions_json ] # Dish nutrition table: class, mass_g, kcal, protein, fat, carbs, sodium dish_table_rows = [ [ d["class"], d["mass_g"], d["calories"], d["protein"], d["fat"], d["carbohydrates"], d["sodium"], ] for d in dish_nutr_json ] # Ingredient nutrition table: # dish_class, ingredient, mass_g, kcal, protein, fat, carbs, sodium ingredient_table_rows = [ [ ing["dish_class"], ing["ingredient"], ing["mass_g"], ing["calories"], ing["protein"], ing["fat"], ing["carbohydrates"], ing["sodium"], ] for ing in ingredient_nutr_json ] return ( depth_vis_big, seg_vis, depth_text, rows, portions_table_rows, dish_table_rows, ingredient_table_rows, ) # ----------------------------------------------------------- # 9. Gradio UI (using tables/Dataframe instead of JSON) # ----------------------------------------------------------- with gr.Blocks() as demo: gr.Markdown( "
" "Upload a food image → get depth map, object distance, estimated portion, and nutrition per dish & ingredient." "
" ) with gr.Row(): input_img = gr.Image(label="Upload food image", type="pil") with gr.Row(): depth_out = gr.Image(label="Depth overlay", type="numpy") seg_out = gr.Image(label="Segmentation result", type="numpy") with gr.Row(): depth_info = gr.Markdown(label="Depth estimate") seg_table = gr.Dataframe( headers=["Class", "Area % of image", "Median depth (m)"], datatype=["str", "number", "number"], label="Segmentation stats" ) portions_table = gr.Dataframe( headers=["Class", "Portion (g)", "Unit", "Center depth (m)", "Mask region (%)"], datatype=["str", "number", "str", "str", "str"], label="Estimated Portions (per class)", ) dish_nutrition_table = gr.Dataframe( headers=["Class", "Mass (g)", "Calories", "Protein (g)", "Fat (g)", "Carbs (g)", "Sodium (mg)"], datatype=["str", "number", "number", "number", "number", "number", "number"], label="Dish Nutrition (per class)", ) ingredient_nutrition_table = gr.Dataframe( headers=["Dish", "Ingredient", "Mass (g)", "Calories", "Protein (g)", "Fat (g)", "Carbs (g)", "Sodium (mg)"], datatype=["str", "str", "number", "number", "number", "number", "number", "number"], label="Ingredient Nutrition (per ingredient)", ) run_btn = gr.Button("Run analysis") run_btn.click( fn=analyze_image, inputs=input_img, outputs=[ depth_out, seg_out, depth_info, seg_table, portions_table, dish_nutrition_table, ingredient_nutrition_table, ], ) demo.launch(server_name="0.0.0.0", server_port=7860)