Spaces:

build-small-hackathon
/

Cook_with_a_LLM

Paused

App Files Files Community

Complete Cook App

by eldinosaur - opened Jun 13

base: refs/heads/main

←

from: refs/pr/5

Discussion Files changed

+1925

-145

Files changed (26) hide show

.gitignore +24 -0
README.md +71 -4
app.py +212 -84
modal_app/__init__.py +0 -0
modal_app/flux_endpoint.py +124 -0
modal_app/planner_endpoint.py +117 -0
modal_app/serve_app.py +102 -0
packages.txt +2 -0
requirements.txt +5 -5
scripts/build_recipe_dataset.py +281 -0
scripts/diag_planner.py +73 -0
scripts/train_planner.py +172 -0
src/agents/progress_validator.py +84 -0
src/agents/recipe_planner.py +167 -0
src/agents/step_illustrator.py +81 -0
src/config.py +14 -3
src/data/__init__.py +0 -0
src/data/nutrition.py +112 -0
src/models/planner.py +103 -0
src/pipeline.py +32 -0
src/prompts/planner_propose.txt +11 -0
src/prompts/planner_recipe.txt +11 -0
src/prompts/validator_prompt.txt +14 -0
src/ui/components.py +48 -42
src/ui/components.pyi +8 -5
src/ui/theme.py +57 -2

.gitignore ADDED Viewed

	@@ -0,0 +1,24 @@

+# Python
+__pycache__/
+*.py[cod]
+*.egg-info/
+.venv/
+venv/
+# Generated data (SFT dataset lives on HF Hub: eldinosaur/cook-with-me-recipes-sft)
+data/*.parquet
+data/*.jsonl
+data/*.png
+data/*.npy
+data/*.csv
+# Local caches / model weights
+*.gguf
+.cache/
+assets/*.png
+# OS / editor
+.DS_Store
+Thumbs.db
+.idea/
+.vscode/

README.md CHANGED Viewed

@@ -1,13 +1,80 @@
 ---
 title: Cook With A LLM
-emoji: 🐠
-colorFrom: pink
-colorTo: pink
 sdk: gradio
 sdk_version: 6.15.2
 python_version: '3.12'
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Cook With A LLM
+emoji: 🍲
+colorFrom: red
+colorTo: yellow
 sdk: gradio
 sdk_version: 6.15.2
 python_version: '3.12'
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---
+# 🍲 Cook With Me — Multimodal Sous-Chef
+> *Snap your fridge. Pick a dish. Cook step by step. Check your progress with a photo.*
+A closed-loop multimodal cooking assistant built for the **Hugging Face Small Models / Big Adventures Hackathon (June 2026)**.
+---
+## How it works
+```
+📸 Fridge photo  ──▶  [Vision Agent]          identify ingredients
+                            │
+                            ▼
+                      [Recipe Planner]         propose 3 dishes → full recipe JSON
+                            │
+                            ▼
+                      [Nutrition Engine]       per-serving macros (lookup, no hallucination)
+                            │
+                            ▼
+📸 Progress photo ──▶  [Progress Validator]    go / wait / fix verdict
+```
+1. **Snap** your fridge or pantry — the fine-tuned vision model identifies every ingredient.
+2. **Pick** one of three AI-suggested dishes tailored to what you have.
+3. **Cook** step by step with a generated recipe and per-serving nutrition info.
+4. **Check** your progress by uploading a photo of your pan — the model tells you *go*, *wait*, or *fix*.
+---
+## Models
+| Role | Model | Params | Runtime |
+|---|---|---|---|
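+| Vision (ingredients + validator) | `openbmb/MiniCPM-V-4.6` (fine-tuned) | ~4.6B | `transformers` / ZeroGPU |
+| Recipe Planner | `openbmb/MiniCPM4.1-8B` (fine-tuned) | ~8B | `transformers` / Modal |
+| Step Illustrator (optional) | `black-forest-labs/FLUX.2-klein-9B` | ~9B | `diffusers` / Modal |
+**Total: ~21.6B parameters** (≤ 32B cap ✓)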
+The ingredient-identification model is **fine-tuned** on fridge/pantry photos for higher precision.
+---
+## Badges targeted
+| Badge | Status | How |
+|---|---|---|
+| 🎯 Well-Tuned | ✓ | Fine-tuned MiniCPM-V-4.6 for ingredient detection, published to Hub |
+| 🎨 Off-Brand | ✓ | Recipe-card UI with custom CSS — Lora serif, warm parchment palette |
+| 📡 Sharing is Caring | ✓ | Agent traces shared on Hub |
+| 📓 Field Notes | ✓ | Blog post: "Building a closed-loop visual cooking coach" |
+---
+## Architecture highlights
+- **One vision model, two roles:** MiniCPM-V-4.6 handles both ingredient detection and progress validation — no redundant model downloads; recipe JSON planning runs on a separately deployed, fine-tuned MiniCPM4.1-8B.
+- **Closed-loop visual validation:** Flux generates step targets → user cooks → vision model compares — a real agent loop, not a wrapper.
+- **Hallucination-free nutrition:** macros come from a lookup table, not LLM arithmetic.
+- **Robust JSON extraction:** multi-strategy parser handles markdown fences, single quotes, and trailing commas so generation failures degrade gracefully.
+---
+## Track
+**Chapter One — Backyard AI** · "Build something for someone you actually know."
+Submission for the Hugging Face Hackathon · June 5–15, 2026.

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import logging
 log = logging.getLogger(__name__)
 from typing import Any
@@ -6,12 +7,11 @@ from typing import Any
 import gradio as gr
 from PIL import Image
-# from src import config
 from src.agents.mise_en_place import identify_ingredients
-# from src.agents.progress_validator import validate
-# from src.agents.recipe_planner import plan_recipe, propose_dishes
-# from src.data.nutrition import compute_nutrition
-# from src.pipeline import Recipe
 from src.ui.components import (
     DishOptions,
     IngredientChips,
@@ -19,135 +19,265 @@ from src.ui.components import (
     RecipeHero,
     StepCard,
     VerdictBadge,
-    recipe_to_state,
 )
 from src.ui.theme import CSS, theme
-def on_propose(fridge_image: Image.Image | None, state: dict | None) -> tuple[str, str, list[str], dict]:
-    """Photo → ingredients → 3 dish options."""
     state = state or {}
     ingredients = identify_ingredients(fridge_image)
-    # options = propose_dishes(ingredients)
-    # state.update({
-    #     "ingredients_have": ingredients,
-    #     "ingredients_missing": [],
-    #     "options": [o.model_dump() for o in options],
-    # })
-    chips_html = IngredientChips.render({"have": ingredients, "missing": []})
-    log.info(ingredients)
-    # options_html = DishOptions.render({"options": state["options"]})
-    # radio_choices = [o.name for o in options]
-    # return chips_html, options_html, gr.update(choices=radio_choices, value=radio_choices[0] if radio_choices else None), state
-    return chips_html
-# ----------------
-# UI definition
-# ----------------
 def build_ui() -> gr.Blocks:
     initial_state: dict[str, Any] = {}
-    with gr.Blocks(title="Cook With Me") as demo:
         gr.Markdown(
             "# 🍲 Cook With Me\n"
-            "_A multimodal sous-chef. See it. Plan it. Show it. Cook it._"
         )
         state = gr.State(initial_state)
         with gr.Tabs():
-            # --- Tab 1: Cook ------------------------------------------------
-            with gr.Tab("Cook"):
                 with gr.Row():
                     with gr.Column(scale=1):
                         fridge_input = gr.Image(
                             label="📸 Photo of your fridge or pantry",
                             type="pil",
-                            height=320,
                         )
-                        propose_btn = gr.Button("What can I cook?", variant="primary")
                         gr.Markdown("### Ingredients I see")
                         chips = gr.HTML(IngredientChips.render({}))
                         gr.Markdown("### Pick a dish")
-                        options = gr.HTML(DishOptions.render({}))
-                        dish_radio = gr.Radio(choices=[], label="Choose one", interactive=True)
-                        with gr.Accordion("Generation options", open=False):
-                            illustrate_chk = gr.Checkbox(value=False, label="Render step images (FLUX, slow on CPU)")
-                            narrate_chk = gr.Checkbox(value=False, label="Generate voice narration (VoxCPM2)")
-                        cook_btn = gr.Button("Build recipe", variant="primary")
                     with gr.Column(scale=2):
                         hero = gr.HTML(RecipeHero.render({}))
                         steps_panel = gr.HTML(StepCard.render({}))
                         nutrition_panel = gr.HTML(NutritionGrid.render({"nutrition": {}}))
-            # --- Tab 2: Check Progress -------------------------------------
-            with gr.Tab("Check Progress"):
-                gr.Markdown("Upload a photo of your pan or plate; the same vision model that planned your recipe will compare it against the target step.")
                 with gr.Row():
                     with gr.Column():
                         step_idx = gr.Number(value=1, precision=0, label="Active step #")
-                        progress_input = gr.Image(label="📸 Your pan / plate", type="pil", height=320)
-                        validate_btn = gr.Button("How am I doing?", variant="primary")
                     with gr.Column():
                         verdict_panel = gr.HTML(VerdictBadge.render({}))
-                        verdict_audio = gr.Audio(label="Tip (voice)", autoplay=False)
-            # --- Tab 3: About ----------------------------------------------
-            with gr.Tab("About"):
                 gr.Markdown(
                     """
-                    ### Models
-                    - **Vision** — `openbmb/MiniCPM-V-4_6-gguf` via `llama-cpp-python` (~4.6B)
-                    - **Planner** — `openbmb/MiniCPM-V-4-gguf` via `llama-cpp-python` (~4B)
-                    - **Illustrator** — `black-forest-labs/FLUX.2-klein-9B` via `diffusers` (9B)
-                    - **Narrator** — `openbmb/VoxCPM2` via `transformers` (~1B)
-                    - **Retrieval** — `sentence-transformers/all-MiniLM-L6-v2` (22M)
-                    **Total ≈ 18.6B params** (≤ 32B requirement ✓).
-                    ### Pipeline
-                    ```
-                    Fridge photo → Vision → ingredients
-                                            │
-                                            ▼
-                                    Planner (+ Kaggle retrieval) → Recipe JSON
-                                            │
-                                            ▼
-                                    Illustrator (FLUX) → hero + per-step images
-                                            │
-                                            ▼
-                                    Narrator (VoxCPM2) → MP3 per step
-                                            │
-                                            ▼
-                    Progress photo → Validator (same vision model) → go|wait|fix
-                    ```
-                    ### Badges targeted
-                    ✓ Llama Champion · ✓ Well-Tuned · ✓ Off-Brand · ✓ Sharing is Caring · ✓ Field Notes
                     """
                 )
-        # Wire callbacks ----------------------------------------------------
         propose_btn.click(
             fn=on_propose,
             inputs=[fridge_input, state],
-            # outputs=[chips, options, dish_radio, state],
-            outputs=[chips],
         )
-        # cook_btn.click(
-        #     fn=on_pick_dish,
-        #     inputs=[state, dish_radio, illustrate_chk, narrate_chk],
-        #     outputs=[hero, steps_panel, nutrition_panel, chips, state],
-        # )
-        # validate_btn.click(
-        #     fn=on_validate,
-        #     inputs=[state, step_idx, progress_input],
-        #     outputs=[verdict_panel, verdict_audio],
-        # )
     return demo
@@ -159,6 +289,4 @@ if __name__ == "__main__":
         server_port=int(__import__("os").environ.get("PORT", 7860)),
         show_error=True,
         inbrowser=True,
-        theme=theme,
-        css=CSS
-    )

 import logging
+logging.basicConfig(level=logging.INFO)
 log = logging.getLogger(__name__)
 from typing import Any
 import gradio as gr
 from PIL import Image
 from src.agents.mise_en_place import identify_ingredients
+from src.agents.progress_validator import validate
+from src.agents.recipe_planner import plan_recipe, propose_dishes
+from src.agents.step_illustrator import illustrate_recipe
+from src.data.nutrition import compute_nutrition
 from src.ui.components import (
     DishOptions,
     IngredientChips,
     RecipeHero,
     StepCard,
     VerdictBadge,
 )
 from src.ui.theme import CSS, theme
+# ---------------------------------------------------------------------------
+# Callbacks
+# ---------------------------------------------------------------------------
+def _clean_ingredients(items: list | None) -> list[str]:
+    """Normalize a raw ingredient list (dedup, lowercase, strip empties).
+
+    Every item is converted with ``str()``, stripped and lower-cased. Blank
+    results and repeats are dropped; the first occurrence keeps its position.
+    ``None`` or an empty list yields ``[]``.
+    """
+    out, seen = [], set()
+    for it in (items or []):
+        # str() lets non-string entries through; a literal None element would become "none".
+        name = str(it).strip().lower()
+        # `seen` gives O(1) duplicate checks while `out` preserves input order.
+        if name and name not in seen:
+            seen.add(name)
+            out.append(name)
+    return out
+def on_propose(fridge_image: Image.Image | None, state: dict | None):
+    """Photo → ingredients → 3 dish options (and fill the editable list).
+
+    Returns a 5-tuple matching the outputs wired to the propose button:
+    ingredient-chips HTML, dish-options HTML, an update for the dish radio,
+    the session state dict, and an update for the ingredient editor dropdown.
+    """
     state = state or {}
+    # No photo uploaded: reset every output to its empty rendering.
+    if fridge_image is None:
+        return (
+            IngredientChips.render({}),
+            DishOptions.render({}),
+            gr.update(choices=[], value=None),
+            state,
+            gr.update(choices=[], value=[]),
+        )
     ingredients = identify_ingredients(fridge_image)
+    options = propose_dishes(ingredients)
+    # Persist the detection result; options are stored as plain dicts via model_dump().
+    state.update({
+        "ingredients_have": ingredients,
+        "options": [o.model_dump() for o in options],
+    })
+    radio_choices = [o.name for o in options]
+    return (
+        IngredientChips.render({"have": ingredients, "missing": []}),
+        DishOptions.render({"options": state["options"]}),
+        # Pre-select the first proposed dish when there is one.
+        gr.update(choices=radio_choices, value=radio_choices[0] if radio_choices else None),
+        state,
+        # Seed the editor with every detected ingredient selected.
+        gr.update(choices=ingredients, value=ingredients),
+    )
+def on_update_ingredients(state: dict | None, ingredients: list | None):
+    """Manual edit of the ingredient list → refresh chips + re-propose dishes.
+
+    The edited list is normalized with ``_clean_ingredients`` and stored in
+    ``state["ingredients_have"]``. Returns a 4-tuple: ingredient-chips HTML,
+    dish-options HTML, an update for the dish radio, and the state dict.
+    """
+    state = state or {}
+    ingredients = _clean_ingredients(ingredients)
+    state["ingredients_have"] = ingredients
+    # Nothing left after cleaning: clear the stored options and skip the planner call.
+    if not ingredients:
+        state["options"] = []
+        return (
+            IngredientChips.render({}),
+            DishOptions.render({}),
+            gr.update(choices=[], value=None),
+            state,
+        )
+    options = propose_dishes(ingredients)
+    state["options"] = [o.model_dump() for o in options]
+    radio_choices = [o.name for o in options]
+    return (
+        IngredientChips.render({"have": ingredients, "missing": []}),
+        DishOptions.render({"options": state["options"]}),
+        # Pre-select the first proposed dish when there is one.
+        gr.update(choices=radio_choices, value=radio_choices[0] if radio_choices else None),
+        state,
+    )
+def on_cook(state: dict | None, dish_name: str | None, illustrate: bool, ingredients: list | None):
+    """Chosen dish → full recipe + nutrition (+ FLUX images if requested).
+
+    Returns a 4-tuple: recipe-hero HTML, step-cards HTML, nutrition-grid HTML
+    and the state dict. With no dish selected, all three panels are rendered
+    empty and the state is returned unchanged.
+    """
+    state = state or {}
+    if not dish_name:
+        return (
+            RecipeHero.render({}),
+            StepCard.render({}),
+            NutritionGrid.render({"nutrition": {}}),
+            state,
+        )
+    # Prefer the (possibly hand-edited) ingredient list from the editor.
+    ingredients = _clean_ingredients(ingredients) or state.get("ingredients_have", [])
+    state["ingredients_have"] = ingredients
+    recipe = plan_recipe(dish_name, ingredients)
+    # Nutrition is computed from the ingredient list and the recipe's serving count.
+    nutrition = compute_nutrition(ingredients, recipe.servings)
+    recipe.nutrition = nutrition
+    # Stored before illustration, then overwritten below with the illustrated recipe.
+    state["recipe"] = recipe.model_dump()
+    if illustrate:
+        log.info("Generating FLUX step images via Modal...")
+        recipe = illustrate_recipe(recipe)
+        state["recipe"] = recipe.model_dump()
+    return (
+        RecipeHero.render(recipe.model_dump()),
+        StepCard.render({"steps": [s.model_dump() for s in recipe.steps]}),
+        NutritionGrid.render({"nutrition": nutrition}),
+        state,
+    )
+def on_validate(state: dict | None, step_idx: float, progress_image: Image.Image | None):
+    """Progress photo + step number → verdict badge."""
+    state = state or {}
+    recipe = state.get("recipe", {})
+    steps = recipe.get("steps", [])
+    idx = max(0, int(step_idx) - 1)
+    instruction = steps[idx]["instruction"] if idx < len(steps) else "Cook the dish properly."
+    result = validate(progress_image, instruction)
+    return VerdictBadge.render(result)
+# ---------------------------------------------------------------------------
+# UI
+# ---------------------------------------------------------------------------
 def build_ui() -> gr.Blocks:
+    """Assemble the three-tab Gradio interface and wire its callbacks.
+
+    Tabs: "Cook" (photo → ingredients → dish choice → recipe), "Check
+    Progress" (photo + step number → verdict) and "About". A single
+    ``gr.State`` dict is shared by the callbacks. Returns the Blocks
+    object without launching it.
+    """
     initial_state: dict[str, Any] = {}
+    # NOTE(review): the Space pins Gradio 6.15.2, and Gradio 6 documents theme/css as
+    # launch() arguments — confirm they still take effect when passed to gr.Blocks.
+    with gr.Blocks(title="Cook With Me", theme=theme, css=CSS) as demo:
         gr.Markdown(
             "# 🍲 Cook With Me\n"
+            "_Snap your fridge · Pick a dish · Cook step by step · Check your progress._"
         )
         state = gr.State(initial_state)
         with gr.Tabs():
+            # ----------------------------------------------------------------
+            # Tab 1 — Cook
+            # ----------------------------------------------------------------
+            with gr.Tab("🍳 Cook"):
                 with gr.Row():
+                    # Left — inputs
                     with gr.Column(scale=1):
                         fridge_input = gr.Image(
                             label="📸 Photo of your fridge or pantry",
                             type="pil",
+                            height=300,
                         )
+                        propose_btn = gr.Button("🔍 What can I cook?", variant="primary")
                         gr.Markdown("### Ingredients I see")
                         chips = gr.HTML(IngredientChips.render({}))
+                        # Multi-select with free-text entries so users can correct the detected list.
+                        ingredient_editor = gr.Dropdown(
+                            choices=[],
+                            value=[],
+                            multiselect=True,
+                            allow_custom_value=True,
+                            label="✏️ Add or remove ingredients (type + Enter to add, ✕ to remove)",
+                            interactive=True,
+                        )
+                        update_btn = gr.Button("🔄 Update ingredients & dishes")
                         gr.Markdown("### Pick a dish")
+                        dish_options_html = gr.HTML(DishOptions.render({}))
+                        dish_radio = gr.Radio(
+                            choices=[],
+                            label="Choose one",
+                            interactive=True,
+                        )
+                        with gr.Accordion("⚙️ Generation options", open=False):
+                            illustrate_chk = gr.Checkbox(
+                                value=False,
+                                label="🎨 Generate step images with FLUX.2 (requires Modal deployment)",
+                            )
+                        cook_btn = gr.Button("👨‍🍳 Build my recipe", variant="primary")
+                    # Right — recipe output
                     with gr.Column(scale=2):
                         hero = gr.HTML(RecipeHero.render({}))
                         steps_panel = gr.HTML(StepCard.render({}))
                         nutrition_panel = gr.HTML(NutritionGrid.render({"nutrition": {}}))
+            # ----------------------------------------------------------------
+            # Tab 2 — Check Progress
+            # ----------------------------------------------------------------
+            with gr.Tab("📷 Check Progress"):
+                gr.Markdown(
+                    "Upload a photo of your pan or plate. The vision model compares it "
+                    "against the current recipe step and tells you if you can move on."
+                )
                 with gr.Row():
                     with gr.Column():
                         step_idx = gr.Number(value=1, precision=0, label="Active step #")
+                        progress_input = gr.Image(
+                            label="📸 Your pan / plate",
+                            type="pil",
+                            height=300,
+                        )
+                        validate_btn = gr.Button("✅ How am I doing?", variant="primary")
                     with gr.Column():
                         verdict_panel = gr.HTML(VerdictBadge.render({}))
+            # ----------------------------------------------------------------
+            # Tab 3 — About
+            # ----------------------------------------------------------------
+            with gr.Tab("ℹ️ About"):
                 gr.Markdown(
                     """
+### How it works
+1. **Snap** your fridge — the fine-tuned vision model (MiniCPM-V-4.6) identifies every ingredient.
+2. **Pick** one of three AI-suggested dishes tailored to what you have.
+3. **Cook** step by step with a generated recipe, per-serving nutrition, and optional FLUX.2 step images.
+4. **Check** your progress — upload a photo of your pan and get a *go / wait / fix* verdict.
+### Models
+| Role | Model | Params |
+|---|---|---|
+| Vision (ingredients + validator) | `openbmb/MiniCPM-V-4.6` (fine-tuned) | ~4.6B |
+| Recipe Planner | `openbmb/MiniCPM4.1-8B` (fine-tuned on Kaggle recipes) | ~8B |
+| Step Illustrator | `FLUX.2-klein-9B` via Modal | ~9B |
+**Total ≤ 21.6B params** (cap: 32B ✓)
+### Badges targeted
+✓ Well-Tuned · ✓ Off-Brand · ✓ Sharing is Caring · ✓ Field Notes
+### Hackathon
+Hugging Face Small Models / Big Adventures · June 2026 · Track: Backyard AI
                     """
                 )
+        # --------------------------------------------------------------------
+        # Wire callbacks
+        # --------------------------------------------------------------------
+        # Each outputs list must match the order of the tuple its callback returns.
         propose_btn.click(
             fn=on_propose,
             inputs=[fridge_input, state],
+            outputs=[chips, dish_options_html, dish_radio, state, ingredient_editor],
+        )
+        update_btn.click(
+            fn=on_update_ingredients,
+            inputs=[state, ingredient_editor],
+            outputs=[chips, dish_options_html, dish_radio, state],
+        )
+        cook_btn.click(
+            fn=on_cook,
+            inputs=[state, dish_radio, illustrate_chk, ingredient_editor],
+            outputs=[hero, steps_panel, nutrition_panel, state],
+        )
+        validate_btn.click(
+            fn=on_validate,
+            inputs=[state, step_idx, progress_input],
+            outputs=[verdict_panel],
         )
     return demo
         server_port=int(__import__("os").environ.get("PORT", 7860)),
         show_error=True,
         inbrowser=True,
+    )

modal_app/__init__.py ADDED Viewed

File without changes

modal_app/flux_endpoint.py ADDED Viewed

	@@ -0,0 +1,124 @@

+"""Modal FLUX.2 Klein endpoint.
+Deploy once with:
+    modal deploy modal_app/flux_endpoint.py
+Then the HF Space calls it via modal.Function.lookup().
+"""
+import io
+import modal
+# ---------------------------------------------------------------------------
+# App & image
+# ---------------------------------------------------------------------------
+app = modal.App("cook-with-me-flux")
+image = (
+    modal.Image.debian_slim(python_version="3.12")
+    .pip_install(
+        "torch==2.7.0",          # >=2.5 needed: diffusers custom-op schema uses PEP604 unions
+        "torchvision==0.22.0",   # matches torch 2.7.0; silences diffusers image-processor fallback
+        "diffusers>=0.38",       # FLUX.2 support
+        "transformers>=4.45",
+        "accelerate",
+        "safetensors",
+        "Pillow",
+        "huggingface_hub>=1.17",
+        "sentencepiece",
+    )
+)
+# HF token secret so Modal can pull gated/private model weights
+hf_secret = modal.Secret.from_name("huggingface-secret")
+# Tried in order. FLUX models are gated (need license acceptance on HF);
+# SDXL-Turbo is public and always works, so it's the guaranteed fallback.
+FLUX_MODEL = "black-forest-labs/FLUX.2-klein-9B"
+FLUX_FALLBACK = "black-forest-labs/FLUX.1-schnell"
+SDXL_TURBO = "stabilityai/sdxl-turbo"   # non-gated, fast (1-2 steps)
+# ---------------------------------------------------------------------------
+# GPU class
+# ---------------------------------------------------------------------------
+@app.cls(
+    image=image,
+    gpu="L4",
+    scaledown_window=180,   # keep warm 3 min after last request
+    secrets=[hf_secret],
+)
+class FluxKlein:
+    @modal.enter()
+    def load(self):
+        import torch
+        dtype = torch.bfloat16
+        self.steps = 4
+        # 1) FLUX.2-klein (gated) ------------------------------------------------
+        try:
+            from diffusers import FluxPipeline
+            self.pipe = FluxPipeline.from_pretrained(FLUX_MODEL, torch_dtype=dtype).to("cuda")
+            self.guidance, self.steps, self.backend = 1.0, 4, "FLUX.2-klein-9B"
+            print(f"Loaded {self.backend}")
+            return
+        except Exception as e:
+            print(f"FLUX.2-klein unavailable ({type(e).__name__}); trying FLUX.1-schnell...")
+        # 2) FLUX.1-schnell (gated) ---------------------------------------------
+        try:
+            from diffusers import FluxPipeline
+            self.pipe = FluxPipeline.from_pretrained(FLUX_FALLBACK, torch_dtype=dtype).to("cuda")
+            self.guidance, self.steps, self.backend = 0.0, 4, "FLUX.1-schnell"
+            print(f"Loaded {self.backend}")
+            return
+        except Exception as e:
+            print(f"FLUX.1-schnell unavailable ({type(e).__name__}); falling back to SDXL-Turbo...")
+        # 3) SDXL-Turbo (public, always works) ----------------------------------
+        from diffusers import AutoPipelineForText2Image
+        self.pipe = AutoPipelineForText2Image.from_pretrained(
+            SDXL_TURBO, torch_dtype=torch.float16, variant="fp16"
+        ).to("cuda")
+        self.guidance, self.steps, self.backend = 0.0, 2, "SDXL-Turbo"
+        print(f"Loaded {self.backend}")
+    @modal.method()
+    def render_step(self, prompt: str, seed: int = 42) -> bytes:
+        """Generate a 512×512 PNG and return its raw bytes."""
+        import torch
+        img = self.pipe(
+            prompt=prompt,
+            height=512,
+            width=512,
+            guidance_scale=self.guidance,
+            num_inference_steps=self.steps,
+            generator=torch.Generator(device="cuda").manual_seed(seed),
+        ).images[0]
+        buf = io.BytesIO()
+        img.save(buf, format="PNG")
+        return buf.getvalue()
+# ---------------------------------------------------------------------------
+# Local test entrypoint
+# ---------------------------------------------------------------------------
+@app.local_entrypoint()
+def test():
+    """Render one sample prompt remotely and save it as data/test_flux.png."""
+    import os
+    flux = FluxKlein()
+    png = flux.render_step.remote(
+        "Top-down photo of a kitchen pan with sautéed onions. "
+        "Mexican cooking. Warm lighting. Photorealistic.",
+        seed=0,
+    )
+    # Output path is resolved relative to this file: <parent of modal_app>/data/test_flux.png.
+    out = os.path.join(os.path.dirname(__file__), "..", "data", "test_flux.png")
+    out = os.path.abspath(out)
+    os.makedirs(os.path.dirname(out), exist_ok=True)
+    with open(out, "wb") as f:
+        f.write(png)
+    print(f"Saved {out} ({len(png)} bytes)")

modal_app/planner_endpoint.py ADDED Viewed

	@@ -0,0 +1,117 @@

+"""Modal endpoint for the fine-tuned MiniCPM4.1-8B recipe planner.
+Runs in its OWN container because MiniCPM4.1's custom code requires
+transformers 4.x (CacheLayerMixin + is_torch_fx_available), which conflicts
+with the MiniCPM-V-4.6 vision model in the main app (needs transformers 5.x).
+Deploy:
+    modal deploy modal_app/planner_endpoint.py
+The Gradio app calls it via modal.Cls.from_name("cook-with-me-planner",
+"Planner").infer.remote(prompt, ...).
+"""
+from __future__ import annotations
+import os
+import modal
+app = modal.App("cook-with-me-planner")
+# 8B bf16 weights cached on a volume so cold starts don't re-download ~16GB.
+hf_cache = modal.Volume.from_name("cook-with-me-planner-cache", create_if_missing=True)
+hf_secret = modal.Secret.from_name("huggingface-secret")
+image = (
+    modal.Image.debian_slim(python_version="3.12")
+    .pip_install(
+        "torch==2.4.0",
+        # MiniCPM4.1 custom code needs BOTH CacheLayerMixin (>=4.54) and
+        # is_torch_fx_available (removed in 5.0) — only 4.54..4.x has both.
+        "transformers>=4.54,<5.0",
+        "huggingface_hub>=0.26,<1.0",
+        "accelerate",
+        "sentencepiece",
+        "safetensors",
+    )
+    .env({"HF_HOME": "/cache/hf"})
+)
+# Fine-tuned weights; tokenizer pulled from base (FT tokenizer_config was saved
+# by transformers 5.x and is not readable by 4.x).
+PLANNER_REPO = os.environ.get("COOK_WITH_ME_PLANNER_FT_REPO", "eldinosaur/cook-with-me-planner-8b")
+BASE_REPO = "openbmb/MiniCPM4.1-8B"
+@app.cls(
+    image=image,
+    gpu="L4",
+    volumes={"/cache": hf_cache},
+    secrets=[hf_secret],
+    scaledown_window=240,
+    timeout=600,
+)
+class Planner:
+    """Modal class serving text generation from the fine-tuned planner model."""
+    @modal.enter()
+    def load(self):
+        """Load the tokenizer (from BASE_REPO) and weights (from PLANNER_REPO)."""
+        import torch
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        print(f"Loading planner weights from {PLANNER_REPO}...")
+        self.tokenizer = AutoTokenizer.from_pretrained(BASE_REPO, trust_remote_code=True)
+        # Reuse EOS as the pad token when the tokenizer defines none.
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        self.model = AutoModelForCausalLM.from_pretrained(
+            PLANNER_REPO,
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+            device_map="cuda",
+        ).eval()
+        print("Planner ready.")
+    @modal.method()
+    def infer(self, prompt: str, max_new_tokens: int = 1024, temperature: float = 0.0) -> str:
+        """Generate a completion for ``prompt`` sent as a single user message.
+
+        Decoding is greedy unless ``temperature`` is greater than 0, in which
+        case sampling with ``top_p=0.9`` is used. Only the newly generated
+        tokens are decoded and returned; special tokens are skipped.
+        """
+        import torch
+        messages = [{"role": "user", "content": prompt}]
+        # enable_thinking=False -> direct JSON, no <think> reasoning preamble
+        try:
+            enc = self.tokenizer.apply_chat_template(
+                messages,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_tensors="pt",
+                return_dict=True,
+                enable_thinking=False,
+            )
+        except TypeError:
+            # Retry without enable_thinking if the template call rejects that keyword.
+            enc = self.tokenizer.apply_chat_template(
+                messages, add_generation_prompt=True, tokenize=True,
+                return_tensors="pt", return_dict=True,
+            )
+        input_ids = enc["input_ids"].to(self.model.device)
+        # Prompt length, used below to slice the prompt off the generated sequence.
+        input_len = input_ids.shape[1]
+        gen_inputs = {"input_ids": input_ids}
+        if enc.get("attention_mask") is not None:
+            gen_inputs["attention_mask"] = enc["attention_mask"].to(self.model.device)
+        gen_kwargs = dict(max_new_tokens=max_new_tokens, repetition_penalty=1.05)
+        if temperature and temperature > 0:
+            gen_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
+        else:
+            gen_kwargs.update(do_sample=False)
+        with torch.no_grad():
+            out = self.model.generate(**gen_inputs, **gen_kwargs)
+        return self.tokenizer.decode(out[0][input_len:], skip_special_tokens=True)
+@app.local_entrypoint()
+def test():
+    """Send one dish-proposal prompt to the remote planner and print the reply."""
+    prompt = (
+        "You are a creative chef. Available ingredients: tomato, onion, garlic, pasta, olive oil.\n"
+        'Respond ONLY with JSON: {"options": [{"name": "...", "why": "..."}, {"name": "...", "why": "..."}, {"name": "...", "why": "..."}]}'
+    )
+    out = Planner().infer.remote(prompt, max_new_tokens=400)
+    print("OUTPUT:\n", out)

modal_app/serve_app.py ADDED Viewed

	@@ -0,0 +1,102 @@

+"""Serve the full Cook With Me Gradio app on Modal GPU.
+This gives a permanent public URL (*.modal.run) that runs the real models:
+  - MiniCPM-V-4.6  (vision: ingredients + progress validation)
+  - MiniCPM4.1-8B  (planner: dish proposals + recipes)
+  - FLUX.2-klein   (step images, via the separate cook-with-me-flux endpoint)
+Deploy with:
+    modal deploy modal_app/serve_app.py
+Or run a temporary dev session (auto-stops on Ctrl-C):
+    modal serve modal_app/serve_app.py
+Both models live in one L40S container (~25GB VRAM total).
+Set the fine-tuned planner repo via the COOK_WITH_ME_PLANNER_FT_REPO env
+on the Modal function once training finishes.
+"""
+from __future__ import annotations
+from pathlib import Path
+import modal
+LOCAL_ROOT = Path(__file__).resolve().parent.parent
+REMOTE_ROOT = "/root/cook"
+app = modal.App("cook-with-me-app")
+# HF model cache persisted across restarts (avoids re-downloading ~25GB)
+hf_cache = modal.Volume.from_name("cook-with-me-hf-cache", create_if_missing=True)
+hf_secret = modal.Secret.from_name("huggingface-secret")
+image = (
+    modal.Image.debian_slim(python_version="3.12")
+    .pip_install(
+        "torch==2.4.0",
+        "torchvision==0.19.0",
+        "transformers>=5.0",
+        "accelerate",
+        "safetensors",
+        "sentencepiece",
+        "Pillow",
+        "av",
+        "pydantic>=2",
+        "gradio==6.15.2",
+        "huggingface_hub>=1.17",
+        "modal",
+    )
+    .env({
+        "COOK_WITH_ME_CACHE": "/cache/cook",
+        # Use the fine-tuned planner pushed by scripts/train_planner.py
+        "COOK_WITH_ME_PLANNER_FT_REPO": "eldinosaur/cook-with-me-planner-8b",
+    })
+    .add_local_dir(
+        str(LOCAL_ROOT),
+        REMOTE_ROOT,
+        ignore=[
+            "data/*", ".git/*", "**/__pycache__", "**/*.pyc",
+            "assets/*", ".venv/*", "venv/*",
+        ],
+    )
+)
+@app.function(
+    image=image,
+    gpu="L40S",
+    secrets=[hf_secret],
+    volumes={"/cache": hf_cache},
+    timeout=3600,
+    scaledown_window=300,   # stay warm 5 min after last request
+    max_containers=1,
+)
+@modal.concurrent(max_inputs=20)
+@modal.asgi_app()
+def serve():
+    """Build the Gradio UI and return it mounted at "/" on a FastAPI app.
+
+    Before importing the project it points HF_HOME at the cache volume,
+    installs a stub ``spaces`` module and puts REMOTE_ROOT on ``sys.path``.
+    """
+    import os
+    import sys
+    import types
+    # --- env: cache model downloads on the volume, before any HF import ---
+    os.environ["HF_HOME"] = "/cache/hf"
+    os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "0")
+    # --- mock `spaces` so @spaces.GPU becomes a no-op (we're already on GPU) ---
+    spaces_mock = types.ModuleType("spaces")
+    # Supports the call form @spaces.GPU(...): the call returns an identity decorator.
+    spaces_mock.GPU = lambda *a, **k: (lambda fn: fn)
+    sys.modules["spaces"] = spaces_mock
+    # --- make the mounted project importable ---
+    sys.path.insert(0, REMOTE_ROOT)
+    import gradio as gr
+    from fastapi import FastAPI
+    # Importing app triggers the vision model load (module-level singleton).
+    from app import build_ui
+    demo = build_ui()
+    demo.queue(max_size=20)
+    fastapi_app = FastAPI()
+    return gr.mount_gradio_app(app=fastapi_app, blocks=demo, path="/")

packages.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ ffmpeg
2	+ libsndfile1

requirements.txt CHANGED Viewed

@@ -1,10 +1,7 @@
-# --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
-# llama-cpp-python
 gradio==6.15.2
 huggingface_hub>=1.17
-# --- Librerías añadidas y desbloqueadas para MiniCPM-V-4.6 ---
 torch
 torchvision
 spaces
@@ -12,4 +9,7 @@ Pillow
 transformers>=4.45
 accelerate
 safetensors
-av

 gradio==6.15.2
 huggingface_hub>=1.17
+# Vision model
 torch
 torchvision
 spaces
 transformers>=4.45
 accelerate
 safetensors
+av
+# Pipeline & data
+pydantic>=2

scripts/build_recipe_dataset.py ADDED Viewed

	@@ -0,0 +1,281 @@

+"""Build the SFT dataset for the MiniCPM4.1-8B recipe planner.
+Reads the Kaggle "better-recipes-for-a-better-life" dataset and produces
+supervised fine-tuning pairs for BOTH planner tasks, matching the exact
+prompt formats the app uses (src/prompts/planner_propose.txt and
+planner_recipe.txt):
+  1. propose  : ingredients -> {"options": [{name, why} x3]}
+  2. recipe   : dish + ingredients -> {"name", "cuisine", "servings",
+                "total_time_minutes", "final_dish_visual", "steps":[...]}
+Run locally (once) before fine-tuning:
+    python scripts/build_recipe_dataset.py
+Requires:
+    pip install kagglehub pandas pyarrow datasets huggingface_hub tqdm
+    ~/.kaggle/kaggle.json with your credentials
+"""
+from __future__ import annotations
+import json
+import random
+import re
+import sys
+from pathlib import Path
+ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(ROOT))
+import pandas as pd
+from tqdm import tqdm
+from src import config
+random.seed(42)
+HF_DATASET_REPO = "eldinosaur/cook-with-me-recipes-sft"
+# ---------------------------------------------------------------------------
+# 1. Download (use ONLY recipes.csv — test_recipes.csv has a different schema
+#    whose capitalized columns shadowed the real data in the old version)
+# ---------------------------------------------------------------------------
+# Fetch the Kaggle dataset and load its main CSV into `raw_df`.
+print("Pulling Kaggle dataset…")
+import kagglehub
+raw_path = Path(kagglehub.dataset_download(config.KAGGLE_DATASET))
+main_csv = raw_path / "recipes.csv"
+print(f"Reading {main_csv}")
+# cp1252 decodes the fraction/symbol bytes that show up as � under utf-8
+# If the cp1252 read raises for any reason, retry once as utf-8; malformed
+# rows are skipped in both attempts.
+try:
+    raw_df = pd.read_csv(main_csv, encoding="cp1252", on_bad_lines="skip")
+except Exception:
+    raw_df = pd.read_csv(main_csv, encoding="utf-8", on_bad_lines="skip")
+print(f"Rows: {len(raw_df)}  columns: {list(raw_df.columns)}")
+# ---------------------------------------------------------------------------
+# 2. Cleaning helpers
+# ---------------------------------------------------------------------------
+_UNIT = (
+    r"(cups?|tablespoons?|tbsps?|teaspoons?|tsps?|pounds?|lbs?|ounces?|ozs?|"
+    r"grams?|kgs?|mls?|liters?|pinch(?:es)?|dash(?:es)?|cloves?|cans?|"
+    r"packages?|pkgs?|sheets?|slices?|sticks?|quarts?|pints?|jars?|bunch(?:es)?|"
+    r"heads?|stalks?|sprigs?|pieces?|fillets?)"
+)
+_PREP_WORDS = {
+    "peeled", "chopped", "diced", "sliced", "minced", "cored", "thawed",
+    "drained", "rinsed", "softened", "melted", "beaten", "divided", "cubed",
+    "to taste", "optional", "or more", "plus more", "for garnish", "for serving",
+    "lightly beaten", "room temperature", "at room temperature", "finely chopped",
+    "thinly sliced", "cut into", "more", "and", "or other", "such as",
+}
+def _clean_text(val: str) -> str:
+    if not isinstance(val, str):
+        return ""
+    # drop any remaining replacement chars and collapse whitespace
+    val = val.replace("�", " ")
+    return re.sub(r"[ \t]+", " ", val).strip()
+def _simplify_ingredient(raw: str) -> str:
+    s = re.sub(r"\([^)]*\)", "", raw)             # remove parentheticals
+    s = _clean_text(s).lower()
+    s = re.sub(r"^[\d\s./¼½¾⅓⅔⅛+-]+", "", s)       # leading quantities
+    s = re.sub(rf"^{_UNIT}\b\.?\s*", "", s)         # leading unit word
+    s = re.sub(r"^(of|the|a|an)\s+", "", s)
+    s = s.split(",")[0]                              # drop trailing prep clause
+    s = re.sub(r"[^a-z\s-]", "", s)                  # keep letters only
+    s = re.sub(r"\s+", " ", s).strip()
+    return s
+def _ingredient_list(raw: str) -> list[str]:
+    if not isinstance(raw, str):
+        return []
+    out, seen = [], set()
+    for part in raw.split(","):
+        name = _simplify_ingredient(part)
+        if not name or len(name) < 3 or len(name.split()) > 4:
+            continue
+        if name in _PREP_WORDS or name in seen:
+            continue
+        seen.add(name)
+        out.append(name)
+    return out
+def _steps_from_directions(raw: str) -> list[str]:
+    if not isinstance(raw, str):
+        return []
+    raw = _clean_text(raw.replace("\r", "\n"))
+    # Prefer explicit newlines; otherwise split into sentences.
+    parts = [p.strip() for p in raw.split("\n") if p.strip()]
+    if len(parts) < 2:
+        parts = [p.strip() for p in re.split(r"(?<=[.!?])\s+(?=[A-Z])", raw) if p.strip()]
+    # merge very short fragments into the previous step
+    steps: list[str] = []
+    for p in parts:
+        if steps and len(p) < 25:
+            steps[-1] = steps[-1] + " " + p
+        else:
+            steps.append(p)
+    return [s for s in steps if len(s) > 15]
+def _minutes(row) -> int:
+    for col in ("total_time", "cook_time", "prep_time"):
+        v = row.get(col)
+        if isinstance(v, str):
+            h = re.search(r"(\d+)\s*hr", v)
+            m = re.search(r"(\d+)\s*min", v)
+            total = (int(h.group(1)) * 60 if h else 0) + (int(m.group(1)) if m else 0)
+            if total:
+                return total
+    return 0
+def _cuisine(row) -> str:
+    cp = row.get("cuisine_path")
+    if isinstance(cp, str):
+        segs = [s for s in cp.split("/") if s]
+        if segs:
+            return segs[0].replace("-", " ").strip().title()
+    return "International"
+def _distribute(total: int, n: int) -> list[int]:
+    if n <= 0:
+        return []
+    if total <= 0:
+        total = n * 6
+    base = max(2, total // n)
+    durs = [base] * n
+    durs[-1] = max(2, total - base * (n - 1))
+    return durs
+# ---------------------------------------------------------------------------
+# 3. Normalize into clean recipe records
+# ---------------------------------------------------------------------------
+recipes: list[dict] = []
+for _, r in tqdm(raw_df.iterrows(), total=len(raw_df), desc="Normalizing"):
+    name = _clean_text(r.get("recipe_name", ""))
+    ings = _ingredient_list(r.get("ingredients", ""))
+    steps = _steps_from_directions(r.get("directions", ""))
+    if not name or len(ings) < 3 or len(steps) < 2:
+        continue
+    steps = steps[:7]
+    if len(steps) < 4 and len(steps) >= 2:
+        pass  # keep short recipes too, 2-3 steps is fine
+    minutes = _minutes(r) or len(steps) * 6
+    try:
+        servings = int(float(str(r.get("servings", "2")).split()[0]))
+    except Exception:
+        servings = 2
+    servings = min(max(servings, 1), 12)
+    recipes.append({
+        "name": name,
+        "ingredients": ings[:14],
+        "steps": steps,
+        "cuisine": _cuisine(r),
+        "minutes": int(minutes),
+        "servings": servings,
+    })
+print(f"\nClean recipes: {len(recipes)}")
+config.DATA_DIR.mkdir(parents=True, exist_ok=True)
+pd.DataFrame(recipes).to_parquet(config.RECIPES_PARQUET, index=False)
+print(f"Saved -> {config.RECIPES_PARQUET}")
+# ---------------------------------------------------------------------------
+# 4. Build SFT pairs matching the app's exact prompt formats
+# ---------------------------------------------------------------------------
+PROPOSE_TMPL = (config.PROMPTS_DIR / "planner_propose.txt").read_text(encoding="utf-8")
+RECIPE_TMPL = (config.PROMPTS_DIR / "planner_recipe.txt").read_text(encoding="utf-8")
+_WHY = [
+    "Uses your {a} and {b} for a quick, satisfying result.",
+    "A fresh way to combine {a} with {b}.",
+    "Turns {a} and {b} into a comforting classic.",
+    "Light and flavorful, built around {a} and {b}.",
+    "Makes the most of {a}, {b} and a few pantry staples.",
+]
+def _recipe_json(rec: dict) -> str:
+    durs = _distribute(rec["minutes"], len(rec["steps"]))
+    steps = [
+        {"n": i + 1, "instruction": s, "duration": f"{d} min", "tip": None}
+        for i, (s, d) in enumerate(zip(rec["steps"], durs))
+    ]
+    obj = {
+        "name": rec["name"],
+        "cuisine": rec["cuisine"],
+        "servings": rec["servings"],
+        "total_time_minutes": rec["minutes"],
+        "final_dish_visual": f"A beautifully plated {rec['name'].lower()}, ready to serve.",
+        "steps": steps,
+    }
+    return json.dumps(obj, ensure_ascii=False)
+def _propose_json(rec: dict, others: list[dict]) -> str:
+    a = rec["ingredients"][0] if rec["ingredients"] else "your ingredients"
+    b = rec["ingredients"][1] if len(rec["ingredients"]) > 1 else "pantry staples"
+    options = [{"name": rec["name"], "why": random.choice(_WHY).format(a=a, b=b)}]
+    for o in others:
+        oa = o["ingredients"][0] if o["ingredients"] else a
+        ob = o["ingredients"][1] if len(o["ingredients"]) > 1 else b
+        options.append({"name": o["name"], "why": random.choice(_WHY).format(a=oa, b=ob)})
+    return json.dumps({"options": options}, ensure_ascii=False)
+sft_path = config.DATA_DIR / "recipes_sft.jsonl"
+n_recipe = n_propose = 0
+with open(sft_path, "w", encoding="utf-8") as f:
+    for idx, rec in enumerate(tqdm(recipes, desc="Building SFT")):
+        ing_str = ", ".join(rec["ingredients"])
+        # --- recipe task ---
+        user_recipe = RECIPE_TMPL.replace("{dish_name}", rec["name"]).replace("{ingredients}", ing_str)
+        f.write(json.dumps({"messages": [
+            {"role": "user", "content": user_recipe},
+            {"role": "assistant", "content": _recipe_json(rec)},
+        ]}, ensure_ascii=False) + "\n")
+        n_recipe += 1
+        # --- propose task (use two other recipes as alternative options) ---
+        others = [recipes[(idx + 7) % len(recipes)], recipes[(idx + 53) % len(recipes)]]
+        user_propose = PROPOSE_TMPL.replace("{ingredients}", ing_str)
+        f.write(json.dumps({"messages": [
+            {"role": "user", "content": user_propose},
+            {"role": "assistant", "content": _propose_json(rec, others)},
+        ]}, ensure_ascii=False) + "\n")
+        n_propose += 1
+print(f"\nSFT pairs: {n_recipe} recipe + {n_propose} propose = {n_recipe + n_propose} -> {sft_path}")
+# ---------------------------------------------------------------------------
+# 5. Push to HF Hub
+# ---------------------------------------------------------------------------
+if HF_DATASET_REPO:
+    from datasets import load_dataset
+    ds = load_dataset("json", data_files=str(sft_path), split="train")
+    ds.push_to_hub(HF_DATASET_REPO)
+    print(f"Pushed {len(ds)} rows to {HF_DATASET_REPO}")
+print("\nDone.")

scripts/diag_planner.py ADDED Viewed

	@@ -0,0 +1,73 @@

+"""Diagnose why the fine-tuned planner produces empty generations.
+    modal run scripts/diag_planner.py
+"""
+import modal
+app = modal.App("cook-with-me-diag")
+image = (
+    modal.Image.debian_slim(python_version="3.12")
+    .pip_install(
+        "torch==2.4.0",
+        "transformers>=4.54,<5.0",        # window with BOTH CacheLayerMixin and is_torch_fx_available
+        "huggingface_hub>=0.26,<1.0",
+        "accelerate",
+        "sentencepiece",
+    )
+)
+hf_secret = modal.Secret.from_name("huggingface-secret")
+MODEL_ID = "eldinosaur/cook-with-me-planner-8b"   # fine-tuned model under transformers 4.x
+@app.function(image=image, gpu="L4", secrets=[hf_secret], timeout=900)
+def diag():
+    """Load the fine-tuned planner and run one greedy generation, printing diagnostics.
+    Prints the transformers version, whether the model exposes `generate`,
+    its class MRO, the prompt length and the decoded output. Any exception
+    raised during tokenization or generation is printed with a traceback
+    instead of propagating.
+    """
+    import torch
+    import transformers
+    print("transformers version:", transformers.__version__)
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    print("Loading tokenizer (from base) + model (from FT)...")
+    # Tokenizer comes from the base repo; only the weights come from MODEL_ID.
+    tok = AutoTokenizer.from_pretrained("openbmb/MiniCPM4.1-8B", trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="cuda"
+    ).eval()
+    print("has generate:", hasattr(model, "generate"))
+    print("class mro:", [c.__name__ for c in type(model).__mro__])
+    prompt = (
+        "You are a chef. Given ingredients: tomato, onion, garlic, pasta, olive oil.\n"
+        'Return ONLY JSON: {"options": [{"name": "...", "why": "..."}, ...]} with 3 dish ideas.'
+    )
+    messages = [{"role": "user", "content": prompt}]
+    # Mirror the fixed planner.py path
+    try:
+        enc = tok.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=True,
+            return_tensors="pt", return_dict=True,
+        )
+        input_ids = enc["input_ids"].to("cuda")
+        input_len = input_ids.shape[1]
+        gen_inputs = {"input_ids": input_ids}
+        # Forward the attention mask only when the tokenizer produced one.
+        if enc.get("attention_mask") is not None:
+            gen_inputs["attention_mask"] = enc["attention_mask"].to("cuda")
+        print("input length:", input_len)
+        with torch.no_grad():
+            out = model.generate(**gen_inputs, max_new_tokens=400, do_sample=False)
+        # Decode only the newly generated tokens, not the echoed prompt.
+        text = tok.decode(out[0][input_len:], skip_special_tokens=True)
+        print("=== GENERATION OK (transformers 4.x, cache on) ===")
+        print("OUTPUT:", repr(text[:1000]))
+    except Exception as e:
+        import traceback
+        print("=== GENERATION FAILED ===")
+        print("Exception type:", type(e).__name__)
+        print("Exception repr:", repr(e))
+        traceback.print_exc()
+@app.local_entrypoint()
+def main():
+    """Local entrypoint: run `diag` remotely on Modal."""
+    diag.remote()

scripts/train_planner.py ADDED Viewed

	@@ -0,0 +1,172 @@

+"""Fine-tune MiniCPM4.1-8B on the recipe SFT dataset via Modal (A10G GPU).
+Usage:
+    modal run scripts/train_planner.py
+After training, the adapter is merged and the full model is pushed to HF Hub
+as   <HF_USERNAME>/cook-with-me-planner-8b
+Set HF_USERNAME below (or export HF_TOKEN env var before running).
+"""
+from __future__ import annotations
+import modal
+# ---------------------------------------------------------------------------
+# Config — change these two values
+# ---------------------------------------------------------------------------
+HF_USERNAME = "eldinosaur"
+SFT_DATASET_REPO = f"{HF_USERNAME}/cook-with-me-recipes-sft"
+OUTPUT_REPO = f"{HF_USERNAME}/cook-with-me-planner-8b"
+BASE_MODEL = "openbmb/MiniCPM4.1-8B"
+# ---------------------------------------------------------------------------
+app = modal.App("cook-with-me-train")
+volume = modal.Volume.from_name("cook-with-me-train-vol", create_if_missing=True)
+train_image = (
+    modal.Image.debian_slim(python_version="3.12")
+    .pip_install(
+        "torch==2.4.0",
+        "transformers>=5.0",
+        "peft>=0.12",
+        "trl>=0.10",
+        "accelerate",
+        "datasets",
+        "huggingface_hub>=1.17",
+        "bitsandbytes",
+        "sentencepiece",
+        "safetensors",
+    )
+)
+hf_secret = modal.Secret.from_name("huggingface-secret")
+@app.function(
+    image=train_image,
+    gpu="A10G",
+    timeout=60 * 60 * 3,          # 3-hour hard cap
+    secrets=[hf_secret],
+    volumes={"/vol": volume},
+)
+def train():
+    import os
+    import torch
+    from datasets import load_dataset
+    from peft import LoraConfig, get_peft_model, TaskType
+    from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
+    from trl import SFTTrainer, SFTConfig
+    os.environ.setdefault("HF_HOME", "/vol/hf_cache")
+    # MiniCPM4.1-8B custom code references is_torch_fx_available which was
+    # removed in transformers 5.x. Patch it back before loading the model.
+    import transformers.utils.import_utils as _iutils
+    if not hasattr(_iutils, "is_torch_fx_available"):
+        def _is_torch_fx_available():
+            try:
+                import torch.fx  # noqa: F401
+                return True
+            except ImportError:
+                return False
+        _iutils.is_torch_fx_available = _is_torch_fx_available
+    # ---- Load tokenizer & model ----
+    print(f"Loading {BASE_MODEL}…")
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    model = AutoModelForCausalLM.from_pretrained(
+        BASE_MODEL,
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True,
+        device_map="cuda",
+    )
+    # ---- LoRA config ----
+    lora_cfg = LoraConfig(
+        task_type=TaskType.CAUSAL_LM,
+        r=16,
+        lora_alpha=32,
+        lora_dropout=0.05,
+        target_modules="all-linear",
+        bias="none",
+    )
+    model = get_peft_model(model, lora_cfg)
+    model.print_trainable_parameters()
+    # ---- Dataset ----
+    print(f"Loading dataset {SFT_DATASET_REPO}…")
+    ds = load_dataset(SFT_DATASET_REPO, split="train")
+    def _format(example):
+        return {"text": tokenizer.apply_chat_template(
+            example["messages"], tokenize=False, add_generation_prompt=False
+        )}
+    ds = ds.map(_format, remove_columns=ds.column_names)
+    # ---- Training ----
+    output_dir = "/vol/planner_out"
+    trainer = SFTTrainer(
+        model=model,
+        processing_class=tokenizer,
+        train_dataset=ds,
+        args=SFTConfig(
+            output_dir=output_dir,
+            num_train_epochs=3,   # 2046 examples — 3 epochs converges without overfitting
+            per_device_train_batch_size=2,
+            gradient_accumulation_steps=4,
+            learning_rate=2e-4,
+            lr_scheduler_type="cosine",
+            warmup_ratio=0.05,
+            bf16=True,
+            logging_steps=20,
+            save_steps=200,
+            max_length=2048,
+            dataset_text_field="text",
+        ),
+    )
+    trainer.train()
+    trainer.save_model(output_dir)
+    # ---- Merge LoRA + push ----
+    print("Merging LoRA adapter…")
+    from peft import PeftModel
+    base = AutoModelForCausalLM.from_pretrained(
+        BASE_MODEL, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="cpu"
+    )
+    merged = PeftModel.from_pretrained(base, output_dir)
+    merged = merged.merge_and_unload()
+    # MiniCPM custom code declares `_tied_weights_keys` as a list, but
+    # transformers 5.x's save path calls `.keys()` on it. Patch the walker
+    # to tolerate both list and dict formats before saving/pushing.
+    import transformers.modeling_utils as _mu
+    def _safe_get_tied_weight_keys(model, *args, **kwargs):
+        keys = []
+        for module_name, module in model.named_modules():
+            tied = getattr(module, "_tied_weights_keys", None)
+            if not tied:
+                continue
+            names = tied.keys() if isinstance(tied, dict) else tied
+            for k in names:
+                keys.append(f"{module_name}.{k}" if module_name else k)
+        return keys
+    _mu._get_tied_weight_keys = _safe_get_tied_weight_keys
+    print(f"Pushing merged model to {OUTPUT_REPO}…")
+    merged.push_to_hub(OUTPUT_REPO, private=False)
+    tokenizer.push_to_hub(OUTPUT_REPO, private=False)
+    print("Done.")
+@app.local_entrypoint()
+def main():
+    """Local entrypoint: run `train` remotely on Modal."""
+    train.remote()

src/agents/progress_validator.py ADDED Viewed

	@@ -0,0 +1,84 @@

+"""Progress validation agent: compare cooking photo against target step."""
+from __future__ import annotations
+import logging
+from typing import Optional
+import spaces
+import torch
+from PIL import Image
+from src import config
+from src.agents.mise_en_place import model, processor
+from src.agents.recipe_planner import _extract_json
+log = logging.getLogger(__name__)
+_VALIDATOR_PROMPT = (config.PROMPTS_DIR / "validator_prompt.txt").read_text(encoding="utf-8")
+@spaces.GPU(duration=45)
+def validate(image: Optional[Image.Image], step_instruction: str) -> dict:
+    """Compare a cooking-progress photo to the target step description.
+    Returns a dict with keys: verdict ('go'|'wait'|'fix'), feedback, tip.
+    """
+    if image is None:
+        return {
+            "verdict": "wait",
+            "feedback": "No image provided.",
+            "tip": "Upload a photo of your cooking progress to get feedback.",
+        }
+    try:
+        img = image.convert("RGB")
+        prompt = _VALIDATOR_PROMPT.replace("{step_instruction}", step_instruction)
+        messages = [{"role": "user", "content": [
+            {"type": "image", "image": img},
+            {"type": "text", "text": prompt},
+        ]}]
+        inputs = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+            enable_thinking=False,
+            processor_kwargs={"downsample_mode": "16x", "max_slice_nums": 9, "use_image_id": True},
+        )
+        device = model.device
+        inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
+        for k, v in inputs.items():
+            if isinstance(v, torch.Tensor) and torch.is_floating_point(v):
+                inputs[k] = v.to(dtype=torch.bfloat16)
+        with torch.no_grad():
+            generated_ids = model.generate(
+                **inputs,
+                max_new_tokens=256,
+                do_sample=False,
+                downsample_mode="16x",
+            )
+        trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], generated_ids)]
+        raw = processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        log.info("validate raw: %s", raw[:400])
+        data = _extract_json(raw)
+        verdict = str(data.get("verdict", "wait"))
+        if verdict not in ("go", "wait", "fix"):
+            verdict = "wait"
+        return {
+            "verdict": verdict,
+            "feedback": str(data.get("feedback", "")),
+            "tip": str(data.get("tip", "")),
+        }
+    except Exception as exc:
+        log.warning("validate failed: %s", exc)
+        return {
+            "verdict": "wait",
+            "feedback": "Could not analyse the photo.",
+            "tip": "Make sure the image is well-lit and in focus.",
+        }

src/agents/recipe_planner.py ADDED Viewed

	@@ -0,0 +1,167 @@

+"""Recipe planner agent: propose dishes + generate step-by-step recipe.
+Uses openbmb/MiniCPM4.1-8B (text-only) as the primary planner.
+Falls back to the shared vision model (MiniCPM-V-4.6) when the planner
+model is unavailable (e.g. insufficient RAM on the Space).
+"""
+from __future__ import annotations
+import json
+import logging
+import re
+import spaces
+import torch
+from src import config
+from src.pipeline import DishOption, Recipe, RecipeStep
+log = logging.getLogger(__name__)
+_PROPOSE_PROMPT = (config.PROMPTS_DIR / "planner_propose.txt").read_text(encoding="utf-8")
+_RECIPE_PROMPT = (config.PROMPTS_DIR / "planner_recipe.txt").read_text(encoding="utf-8")
+# ---------------------------------------------------------------------------
+# JSON extraction helpers
+# ---------------------------------------------------------------------------
+def _extract_json(text: str) -> dict:
+    """Robustly extract the first JSON object from raw model output."""
+    text = text.strip()
+    try:
+        return json.loads(text)
+    except Exception:
+        pass
+    # Markdown code-block
+    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
+    if m:
+        try:
+            return json.loads(m.group(1))
+        except Exception:
+            pass
+    # First {...} block with minor auto-fixes
+    m = re.search(r"\{.*\}", text, re.DOTALL)
+    if m:
+        candidate = m.group(0)
+        candidate = candidate.replace("'", '"')
+        candidate = re.sub(r",\s*([}\]])", r"\1", candidate)
+        try:
+            return json.loads(candidate)
+        except Exception:
+            pass
+    log.warning("Could not extract JSON from output (first 300 chars): %.300s", text)
+    return {}
+# ---------------------------------------------------------------------------
+# Inference dispatcher
+# ---------------------------------------------------------------------------
+def _infer(prompt: str, max_new_tokens: int = 1024, temperature: float = 0.0) -> str:
+    """Run text inference.
+    Primary: the dedicated MiniCPM4.1-8B planner Modal endpoint (transformers
+    4.x). Falls back to the local vision model (text-only) if the endpoint is
+    unavailable or returns nothing.
+    Returns the generated text. `temperature` is forwarded to the endpoint
+    only; the fallback always decodes greedily.
+    """
+    try:
+        # Imported lazily so a missing or broken `modal` install triggers the
+        # fallback instead of failing at module import.
+        import modal
+        cls = modal.Cls.from_name(config.PLANNER_MODAL_APP, config.PLANNER_MODAL_CLS)
+        out = cls().infer.remote(prompt, max_new_tokens=max_new_tokens, temperature=temperature)
+        if out and out.strip():
+            return out
+        log.warning("Planner endpoint returned empty — falling back to vision model.")
+    except Exception as exc:
+        log.warning("Planner endpoint call failed: %s — falling back to vision model.", exc)
+    # Fallback: use the vision model in text-only mode
+    log.warning("Using vision model as text fallback.")
+    from src.agents.mise_en_place import model as vis_model, processor as vis_proc
+    messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
+    inputs = vis_proc.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+        enable_thinking=False,
+    )
+    device = vis_model.device
+    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
+    # Cast floating-point tensors to bfloat16; integer tensors stay as they are.
+    for k, v in inputs.items():
+        if isinstance(v, torch.Tensor) and torch.is_floating_point(v):
+            inputs[k] = v.to(dtype=torch.bfloat16)
+    with torch.no_grad():
+        generated_ids = vis_model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
+    # Keep only the newly generated tokens of each sequence.
+    trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], generated_ids)]
+    return vis_proc.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+# ---------------------------------------------------------------------------
+# Public agent functions
+# ---------------------------------------------------------------------------
+@spaces.GPU(duration=90)
+def propose_dishes(ingredients: list[str]) -> list[DishOption]:
+    """Given detected ingredients, return up to 3 dish proposals."""
+    try:
+        prompt = _PROPOSE_PROMPT.replace("{ingredients}", ", ".join(ingredients))
+        raw = _infer(prompt, max_new_tokens=512, temperature=0.7)
+        log.info("propose_dishes raw: %.500s", raw)
+        data = _extract_json(raw)
+        options = data.get("options", [])
+        return [
+            DishOption(name=str(o.get("name", "Dish")), why=str(o.get("why", "")))
+            for o in options[:3]
+            if o.get("name")
+        ] or [DishOption(name="Simple Stir-fry", why="Quick and adaptable to most ingredients.")]
+    except Exception as exc:
+        log.warning("propose_dishes failed: %s", exc)
+        return [DishOption(name="Simple Stir-fry", why="Quick and adaptable to most ingredients.")]
+@spaces.GPU(duration=120)
+def plan_recipe(dish_name: str, ingredients: list[str]) -> Recipe:
+    """Generate a full step-by-step recipe for the chosen dish."""
+    try:
+        prompt = (
+            _RECIPE_PROMPT
+            .replace("{dish_name}", dish_name)
+            .replace("{ingredients}", ", ".join(ingredients))
+        )
+        raw = _infer(prompt, max_new_tokens=1024, temperature=0.0)
+        log.info("plan_recipe raw: %.800s", raw)
+        data = _extract_json(raw)
+        raw_steps = data.get("steps", [])
+        steps = []
+        for i, s in enumerate(raw_steps, start=1):
+            if not s.get("instruction"):
+                continue
+            tip_val = s.get("tip")
+            steps.append(RecipeStep(
+                n=int(s.get("n", i)),
+                instruction=str(s["instruction"]),
+                duration=str(s.get("duration", "5 min")),
+                tip=str(tip_val) if tip_val and str(tip_val).lower() not in ("null", "none") else None,
+                visual=str(s.get("visual", "")),
+            ))
+        return Recipe(
+            name=str(data.get("name", dish_name)),
+            cuisine=str(data.get("cuisine", "International")),
+            servings=int(data.get("servings", 2)),
+            total_time_minutes=int(data.get("total_time_minutes", 30)),
+            final_dish_visual=str(data.get("final_dish_visual", "")),
+            steps=steps or [RecipeStep(n=1, instruction="Prepare and cook ingredients to taste.", duration="20 min")],
+        )
+    except Exception as exc:
+        log.warning("plan_recipe failed: %s", exc)
+        return Recipe(
+            name=dish_name,
+            steps=[RecipeStep(n=1, instruction="Prepare and cook ingredients to taste.", duration="20 min")],
+        )

src/agents/step_illustrator.py ADDED Viewed

	@@ -0,0 +1,81 @@

+"""Step image generator — delegates to the deployed Modal FLUX.2 endpoint."""
+from __future__ import annotations
+import base64
+import logging
+from typing import Optional
+from src import config
+from src.pipeline import Recipe, RecipeStep
+log = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _b64(png_bytes: bytes) -> str:
+    return base64.b64encode(png_bytes).decode()
+def _step_prompt(visual: str, cuisine: str, n: int) -> str:
+    desc = visual.strip() or f"cooking step {n}"
+    return (
+        f"Top-down photo of a kitchen pan or plate showing {desc}. "
+        f"{cuisine} home cooking. Warm natural lighting. "
+        "Recipe magazine style. Photorealistic. Appetizing."
+    )
+def _dish_prompt(visual: str, cuisine: str) -> str:
+    desc = visual.strip() or "the finished plated dish, garnished and beautifully presented"
+    return (
+        f"Top-down photo of a {desc} on a rustic wooden table. "
+        f"{cuisine} home cooking. Warm natural lighting. "
+        "Recipe magazine style. Photorealistic. Appetizing."
+    )
+# ---------------------------------------------------------------------------
+# Modal call
+# ---------------------------------------------------------------------------
+def _call_modal(prompt: str, seed: int = 42) -> Optional[bytes]:
+    """Call the deployed Modal FLUX endpoint. Returns PNG bytes or None."""
+    try:
+        import modal
+        cls = modal.Cls.from_name(config.MODAL_APP_NAME, config.MODAL_CLS_NAME)
+        return cls().render_step.remote(prompt, seed=seed)
+    except Exception as exc:
+        log.warning("Modal FLUX call failed: %s", exc)
+        return None
+# ---------------------------------------------------------------------------
+# Public function
+# ---------------------------------------------------------------------------
+def illustrate_recipe(recipe: Recipe) -> Recipe:
+    """Generate FLUX images for every step + final dish.
+    Mutates and returns the same Recipe with image_b64 fields populated
+    (or left as None when Modal is unavailable).
+    """
+    cuisine = recipe.cuisine or "International"
+    # Final dish hero image
+    final_bytes = _call_modal(_dish_prompt(recipe.final_dish_visual, cuisine), seed=0)
+    if final_bytes:
+        recipe.final_dish_image_b64 = _b64(final_bytes)
+        log.info("Generated final dish image.")
+    # Per-step images (sequential to respect GPU limits on Modal)
+    for step in recipe.steps:
+        prompt = _step_prompt(step.visual, cuisine, step.n)
+        step_bytes = _call_modal(prompt, seed=step.n)
+        if step_bytes:
+            step.image_b64 = _b64(step_bytes)
+            log.info("Generated image for step %d.", step.n)
+    return recipe

src/config.py CHANGED Viewed

@@ -21,10 +21,21 @@ VISION_REPO = "openbmb/MiniCPM-V-4_6-GGUF"
 VISION_MODEL_FILE = "MiniCPM-V-4_6-Q4_K_M.gguf"
 VISION_MMPROJ_FILE = "mmproj-model-f16.gguf"
-PLANNER_REPO = "openbmb/MiniCPM-V-4-gguf"
-PLANNER_MODEL_FILE = "Model-Q4_K_M.gguf"
-FLUX_REPO = "black-forest-labs/FLUX.2-klein-9B"
 NARRATOR_REPO = "openbmb/VoxCPM2"
 EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

 VISION_MODEL_FILE = "MiniCPM-V-4_6-Q4_K_M.gguf"
 VISION_MMPROJ_FILE = "mmproj-model-f16.gguf"
+# Base planner model. COOK_WITH_ME_PLANNER_REPO overrides the default repo id.
+PLANNER_REPO = os.environ.get("COOK_WITH_ME_PLANNER_REPO", "openbmb/MiniCPM4.1-8B")
+# Optional fine-tuned planner repo (COOK_WITH_ME_PLANNER_FT_REPO); empty string
+# means "not set". When non-empty, the planner loader uses it instead of PLANNER_REPO.
+PLANNER_FINETUNED_REPO = os.environ.get("COOK_WITH_ME_PLANNER_FT_REPO", "")  # set after fine-tune
+# Modal app / class names of the deployed FLUX image endpoint
+MODAL_APP_NAME = "cook-with-me-flux"
+MODAL_CLS_NAME = "FluxKlein"
+# Planner runs in its own Modal app (transformers 4.x, conflicts with the
+# vision model's transformers 5.x — so it can't live in the same container).
+PLANNER_MODAL_APP = "cook-with-me-planner"
+PLANNER_MODAL_CLS = "Planner"
+# Image model repo; COOK_WITH_ME_FLUX_REPO overrides the default.
+FLUX_REPO = os.environ.get("COOK_WITH_ME_FLUX_REPO", "black-forest-labs/FLUX.2-klein-9B")
+# NOTE(review): presumably loaded when FLUX_REPO is unavailable — confirm in the FLUX endpoint code.
+FLUX_FALLBACK_REPO = "black-forest-labs/FLUX.1-schnell"
 NARRATOR_REPO = "openbmb/VoxCPM2"
 EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

src/data/__init__.py ADDED Viewed

File without changes

src/data/nutrition.py ADDED Viewed

	@@ -0,0 +1,112 @@

+"""Per-serving macro estimator — ingredient lookup, no extra model call needed."""
+from __future__ import annotations
+# Nutrient lookup table for the per-serving estimate.
+# Key: lower-case ingredient name (singular and plural forms are separate entries).
+# Value: (calories kcal, protein g, carbs g, fat g, fiber g) per 100 g
+_MACROS: dict[str, tuple[float, float, float, float, float]] = {
+    # proteins
+    "chicken": (165, 31, 0, 3.6, 0),
+    "beef": (250, 26, 0, 16, 0),
+    "pork": (242, 27, 0, 14, 0),
+    "fish": (130, 20, 0, 5, 0),
+    "salmon": (208, 20, 0, 13, 0),
+    "tuna": (130, 29, 0, 0.5, 0),
+    "shrimp": (99, 24, 0, 0.3, 0),
+    "egg": (155, 13, 1.1, 11, 0),
+    "eggs": (155, 13, 1.1, 11, 0),
+    "tofu": (76, 8, 1.9, 4.8, 0.3),
+    # dairy
+    "milk": (61, 3.2, 4.8, 3.3, 0),
+    "cheese": (402, 25, 1.3, 33, 0),
+    "butter": (717, 0.9, 0.1, 81, 0),
+    "yogurt": (59, 3.5, 4.7, 3.3, 0),
+    "cream": (340, 2.1, 2.8, 36, 0),
+    # starches
+    "rice": (130, 2.7, 28, 0.3, 0.4),
+    "pasta": (158, 5.8, 31, 0.9, 1.8),
+    "bread": (265, 9, 49, 3.2, 2.7),
+    "potato": (77, 2, 17, 0.1, 2.2),
+    "potatoes": (77, 2, 17, 0.1, 2.2),
+    "flour": (364, 10, 76, 1, 2.7),
+    "oats": (389, 17, 66, 7, 10.6),
+    "quinoa": (120, 4.1, 21, 1.9, 2.8),
+    "lentils": (116, 9, 20, 0.4, 7.9),
+    "beans": (347, 21, 60, 1.2, 15),  # NOTE(review): looks like a dry-weight value while lentils/chickpeas/rice look cooked — confirm the intended basis
+    "chickpeas": (164, 8.9, 27, 2.6, 7.6),
+    # vegetables
+    "tomato": (18, 0.9, 3.9, 0.2, 1.2),
+    "tomatoes": (18, 0.9, 3.9, 0.2, 1.2),
+    "onion": (40, 1.1, 9.3, 0.1, 1.7),
+    "onions": (40, 1.1, 9.3, 0.1, 1.7),
+    "garlic": (149, 6.4, 33, 0.5, 2.1),
+    "carrot": (41, 0.9, 10, 0.2, 2.8),
+    "carrots": (41, 0.9, 10, 0.2, 2.8),
+    "broccoli": (34, 2.8, 7, 0.4, 2.6),
+    "spinach": (23, 2.9, 3.6, 0.4, 2.2),
+    "pepper": (31, 1, 6, 0.3, 2.1),
+    "peppers": (31, 1, 6, 0.3, 2.1),
+    "mushroom": (22, 3.1, 3.3, 0.3, 1),
+    "mushrooms": (22, 3.1, 3.3, 0.3, 1),
+    "zucchini": (17, 1.2, 3.1, 0.3, 1),
+    "corn": (86, 3.3, 19, 1.4, 2.7),
+    "lettuce": (15, 1.4, 2.9, 0.2, 1.3),
+    "cucumber": (16, 0.7, 3.6, 0.1, 0.5),
+    "eggplant": (25, 1, 5.9, 0.2, 3),
+    "cabbage": (25, 1.3, 5.8, 0.1, 2.5),
+    "celery": (16, 0.7, 3, 0.2, 1.6),
+    "leek": (61, 1.5, 14, 0.3, 1.8),
+    # fruits
+    "apple": (52, 0.3, 14, 0.2, 2.4),
+    "banana": (89, 1.1, 23, 0.3, 2.6),
+    "lemon": (29, 1.1, 9.3, 0.3, 2.8),
+    "lime": (30, 0.7, 10.5, 0.2, 2.8),
+    "orange": (47, 0.9, 12, 0.1, 2.4),
+    # fats & condiments
+    "olive oil": (884, 0, 0, 100, 0),
+    "oil": (884, 0, 0, 100, 0),
+    "soy sauce": (53, 8.1, 4.9, 0.1, 0.8),
+    "honey": (304, 0.3, 82, 0, 0.2),
+    "sugar": (387, 0, 100, 0, 0),
+    "salt": (0, 0, 0, 0, 0),
+    "vinegar": (18, 0, 0.9, 0, 0),
+}
+# Typical portion weight per ingredient (grams), keyed like _MACROS.
+# Ingredients without an entry here fall back to _DEFAULT_GRAMS.
+_GRAMS: dict[str, int] = {
+    "egg": 50, "eggs": 100,
+    "butter": 15,
+    "olive oil": 14, "oil": 14,
+    "soy sauce": 15,
+    "salt": 3,
+    "garlic": 10,
+    "honey": 21,
+    "sugar": 12,
+    "lemon": 30, "lime": 30,
+}
+# Portion weight (grams) assumed for any ingredient that has no _GRAMS entry.
+_DEFAULT_GRAMS = 80
+def compute_nutrition(ingredients: list[str], servings: int = 2) -> dict[str, float]:
+    """Return per-serving macro estimates keyed to the NutritionGrid format.
+    Each ingredient string is lower-cased and matched against ``_MACROS`` by
+    its full text first, then by progressively shorter leading-word prefixes
+    (so "soy sauce low sodium" resolves to "soy sauce" and "eggs large" to
+    "eggs"). Ingredients with no match contribute nothing. The portion weight
+    is looked up in ``_GRAMS`` under the *matched* table name, falling back to
+    ``_DEFAULT_GRAMS``.
+    Args:
+        ingredients: Free-text ingredient names.
+        servings: Number of servings the totals are divided by; values below
+            1 are treated as 1.
+    Returns:
+        Dict with ``calories`` (rounded to a whole number) and ``protein_g``,
+        ``carbs_g``, ``fat_g``, ``fiber_g`` (rounded to one decimal place).
+    """
+    cal = prot = carb = fat = fib = 0.0
+    for ing in ingredients:
+        words = ing.lower().split()
+        # Longest prefix first, so multi-word table keys such as "olive oil"
+        # win over a bare first word. Blank input has no words and is skipped.
+        name = None
+        for end in range(len(words), 0, -1):
+            candidate = " ".join(words[:end])
+            if candidate in _MACROS:
+                name = candidate
+                break
+        if name is None:
+            continue
+        # Use the matched table name (not the raw text) for the portion size,
+        # so "butter unsalted" gets butter's portion rather than the default.
+        f = _GRAMS.get(name, _DEFAULT_GRAMS) / 100
+        c, p, cb, ft, fb = _MACROS[name]
+        cal += c * f
+        prot += p * f
+        carb += cb * f
+        fat += ft * f
+        fib += fb * f
+    sv = max(servings, 1)
+    return {
+        "calories": round(cal / sv),
+        "protein_g": round(prot / sv, 1),
+        "carbs_g": round(carb / sv, 1),
+        "fat_g": round(fat / sv, 1),
+        "fiber_g": round(fib / sv, 1),
+    }

src/models/planner.py ADDED Viewed

	@@ -0,0 +1,103 @@

+"""MiniCPM4.1-8B text-only planner — lazy singleton."""
+from __future__ import annotations
+import logging
+import os
+from typing import Any, Optional, Tuple
+import torch
+from src import config
+log = logging.getLogger(__name__)
+_model: Any = None
+_tokenizer: Any = None
+def get_planner() -> Tuple[Optional[Any], Optional[Any]]:
+    """Return (model, tokenizer).  Loads once; returns (None, None) on failure.
+    A successful load is cached in the module globals and reused by later
+    calls. A failed load is NOT cached: the globals are reset to None, so the
+    next call attempts the load again.
+    """
+    global _model, _tokenizer
+    # Fast path: a previous call already loaded the model.
+    if _model is not None:
+        return _model, _tokenizer
+    # Prefer fine-tuned repo when available
+    model_id = config.PLANNER_FINETUNED_REPO or config.PLANNER_REPO
+    try:
+        # MiniCPM4.1 custom code imports is_torch_fx_available, which was
+        # removed in transformers 5.x. Patch it back before loading.
+        import transformers.utils.import_utils as _iutils
+        if not hasattr(_iutils, "is_torch_fx_available"):
+            def _is_torch_fx_available():
+                try:
+                    import torch.fx  # noqa: F401
+                    return True
+                except ImportError:
+                    return False
+            _iutils.is_torch_fx_available = _is_torch_fx_available
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        # "auto" placement when SPACE_ID is set in the environment
+        # (NOTE(review): presumably set by Hugging Face Spaces — confirm);
+        # otherwise a single CUDA device if one is available, else CPU.
+        device_map = "auto" if os.environ.get("SPACE_ID") else (
+            "cuda" if torch.cuda.is_available() else "cpu"
+        )
+        log.info("Loading planner model %s (device_map=%s)...", model_id, device_map)
+        # trust_remote_code=True allows the repo's own Python code to run on load.
+        _tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+        _model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+            device_map=device_map,
+        ).eval()
+        log.info("Planner model ready.")
+    except Exception as exc:
+        # Best-effort: log and reset both globals so callers see (None, None)
+        # rather than a tokenizer without a model.
+        log.error("Could not load planner model '%s': %s", model_id, exc)
+        _model = None
+        _tokenizer = None
+    return _model, _tokenizer
+def infer(prompt: str, max_new_tokens: int = 1024, temperature: float = 0.0) -> str:
+    """Generate a completion for *prompt* with the planner model.
+    The prompt is sent as a single user chat message. Decoding is greedy
+    unless ``temperature`` is positive, in which case sampling with
+    ``top_p=0.95`` is used. Returns only the newly generated text, or an
+    empty string when the model is unavailable or generation raises.
+    """
+    model, tokenizer = get_planner()
+    if tokenizer is None or model is None:
+        return ""
+    try:
+        # With return_dict=True the chat template yields a BatchEncoding:
+        # dict-like but not a `dict`, so read it by key and never through
+        # tensor attributes such as .shape.
+        encoded = tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            add_generation_prompt=True,
+            tokenize=True,
+            return_tensors="pt",
+            return_dict=True,
+        )
+        prompt_ids = encoded["input_ids"].to(model.device)
+        prompt_len = prompt_ids.shape[1]
+        model_inputs = {"input_ids": prompt_ids}
+        mask = encoded.get("attention_mask")
+        if mask is not None:
+            model_inputs["attention_mask"] = mask.to(model.device)
+        if temperature > 0:
+            decoding: dict = dict(
+                max_new_tokens=max_new_tokens,
+                do_sample=True,
+                temperature=temperature,
+                top_p=0.95,
+            )
+        else:
+            decoding = dict(max_new_tokens=max_new_tokens, do_sample=False)
+        with torch.no_grad():
+            generated = model.generate(**model_inputs, **decoding)
+        # Keep only the tokens that come after the prompt.
+        continuation = generated[0][prompt_len:]
+        return tokenizer.decode(continuation, skip_special_tokens=True)
+    except Exception as exc:
+        log.error("Planner inference error: %r", exc, exc_info=True)
+        return ""

src/pipeline.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""Shared data models for the Cook-with-Me pipeline."""
+from __future__ import annotations
+from typing import Optional
+from pydantic import BaseModel, Field
+class DishOption(BaseModel):
+    """One candidate dish: a name plus an optional reason for suggesting it."""
+    name: str  # dish title
+    why: str = ""  # short justification; empty when none was provided
+class RecipeStep(BaseModel):
+    """A single numbered step of a recipe, with optional illustration data."""
+    n: int = 1  # step number
+    instruction: str  # what the cook should do in this step
+    duration: str = "5 min"  # free-text duration, e.g. "5 min"
+    tip: Optional[str] = None  # optional extra advice for the step
+    visual: str = ""  # text description of how the step looks
+    image_path: Optional[str] = None  # NOTE(review): presumably a file path to an illustration — confirm against the UI
+    image_b64: Optional[str] = None  # base64 PNG from FLUX
+class Recipe(BaseModel):
+    """A complete recipe: metadata, ordered steps, nutrition and hero image."""
+    name: str  # recipe title
+    cuisine: str = "International"  # cuisine label
+    servings: int = 2  # number of servings
+    total_time_minutes: int = 30  # total time in minutes
+    steps: list[RecipeStep] = Field(default_factory=list)  # fresh list per instance
+    nutrition: dict = Field(default_factory=dict)  # NOTE(review): presumably the per-serving macro dict — confirm against the caller
+    final_dish_visual: str = ""  # text description of the finished dish
+    final_dish_image_path: Optional[str] = None  # NOTE(review): presumably a file path to the hero image — confirm against the UI
+    final_dish_image_b64: Optional[str] = None  # base64 PNG from FLUX

src/prompts/planner_propose.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+You are a creative chef assistant. Given a list of available ingredients, suggest exactly 3 diverse and delicious dishes.
+Available ingredients: {ingredients}
+Rules:
+- Each dish must be realistic to make with the listed ingredients
+- Vary the style: aim for different cuisines or preparations
+- Be specific with dish names (e.g., "Garlic Butter Shrimp Pasta" not "Pasta")
+Respond ONLY with valid JSON and nothing else — no explanation, no markdown fences:
+{"options": [{"name": "Dish Name 1", "why": "One sentence on why this works with the ingredients"}, {"name": "Dish Name 2", "why": "..."}, {"name": "Dish Name 3", "why": "..."}]}

src/prompts/planner_recipe.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+You are a professional chef writing a clear, detailed recipe.
+Dish to prepare: {dish_name}
+Available ingredients: {ingredients}
+Create a complete recipe with 4 to 7 steps. Each step must be specific and actionable.
+Respond ONLY with valid JSON and nothing else — no explanation, no markdown fences:
+{"name": "Full Recipe Title", "cuisine": "Cuisine type", "servings": 2, "total_time_minutes": 30, "final_dish_visual": "One evocative sentence describing how the finished dish looks and smells", "steps": [{"n": 1, "instruction": "Detailed step description.", "duration": "5 min", "tip": "Optional chef tip or null"}, {"n": 2, "instruction": "...", "duration": "3 min", "tip": null}]}
+Important: tip must be a string or null, never omit it.

src/prompts/validator_prompt.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+You are a supportive cooking coach reviewing a student's progress photo.
+The step they are working on:
+"{step_instruction}"
+Look carefully at the photo and decide:
+- "go"   → the step is correctly completed, they can move on
+- "wait" → it's progressing but needs more time (undercooked, still mixing, etc.)
+- "fix"  → there is a clear mistake that needs correction right now
+Respond ONLY with valid JSON and nothing else:
+{"verdict": "go", "feedback": "One sentence describing exactly what you see in the photo.", "tip": "One specific, actionable piece of advice for the cook."}
+verdict must be exactly one of: go, wait, fix.

src/ui/components.py CHANGED Viewed

@@ -80,7 +80,7 @@ class TemplatedHTML(gr.HTML):
 class RecipeHero(TemplatedHTML):
     css_template = """
 .cwm-hero {
-  background: #fffbf0;
   border: 1px solid #d8c9ad;
   border-radius: 16px;
   padding: 32px;
@@ -94,15 +94,15 @@ class RecipeHero(TemplatedHTML):
   background: #efe3c8;
 }
 .cwm-hero h1 {
-  font-family: 'Lora', serif; font-size: 38px; color: #6b4a2a;
   margin: 0 0 8px;
 }
 .cwm-hero .meta {
-  color: #8a6a3a; font-size: 14px; letter-spacing: 0.04em;
   text-transform: uppercase; margin-bottom: 18px;
 }
 .cwm-hero .visual {
-  font-family: 'Lora', serif; font-style: italic; color: #6b4a2a;
   font-size: 17px; line-height: 1.55;
 }
 @media (max-width: 720px) { .cwm-hero { grid-template-columns: 1fr; } }
@@ -115,11 +115,14 @@ class RecipeHero(TemplatedHTML):
         servings = state.get("servings") or 0
         time = state.get("total_time_minutes") or 0
         visual = html.escape(state.get("final_dish_visual") or "")
-        img = state.get("final_dish_image_path") or ""
-        img_tag = (
-            f'<img src="/file={html.escape(img)}" alt="final dish"/>'
-            if img else '<div class="cwm-hero" style="background:#efe3c8;border-radius:12px;height:320px;"></div>'
-        )
         return f"""
 <div class="cwm-hero">
   <div>{img_tag}</div>
@@ -186,15 +189,15 @@ class IngredientChips(TemplatedHTML):
 class DishOptions(TemplatedHTML):
     css_template = """
 .cwm-options { display: grid; grid-template-columns: repeat(3, 1fr); gap: 14px; }
-.cwm-option {
-  background: #fffbf0; border: 1px solid #d8c9ad; border-radius: 12px;
   padding: 18px; text-align: left;
 }
-.cwm-option h3 {
-  font-family: 'Lora', serif; font-size: 19px; color: #6b4a2a;
   margin: 0 0 6px;
 }
-.cwm-option p { color: #7a5a35; font-size: 14px; line-height: 1.45; margin: 0; }
 @media (max-width: 720px) { .cwm-options { grid-template-columns: 1fr; } }
 """
@@ -217,32 +220,32 @@ class DishOptions(TemplatedHTML):
 class StepCard(TemplatedHTML):
     css_template = """
 .cwm-steps { display: flex; flex-direction: column; gap: 16px; }
-.cwm-step {
   display: grid; grid-template-columns: 220px 1fr; gap: 22px;
-  background: #fffbf0; border-left: 4px solid #a85c2a; border-radius: 10px;
   padding: 18px 22px;
 }
-.cwm-step img {
   width: 220px; height: 160px; object-fit: cover; border-radius: 8px;
   background: #efe3c8;
 }
-.cwm-step .placeholder {
   width: 220px; height: 160px; border-radius: 8px;
   background: linear-gradient(135deg,#efe3c8,#dccaa3);
   display:flex; align-items:center; justify-content:center;
-  color: #8a6a3a; font-family: 'Lora', serif; font-size: 14px;
 }
-.cwm-step h3 {
-  font-family: 'Lora', serif; color: #6b4a2a; margin: 0 0 6px; font-size: 22px;
 }
-.cwm-step p { font-size: 16px; line-height: 1.55; color: #4a3722; margin: 0 0 8px; }
-.cwm-step .duration {
-  display: inline-block; background: #a85c2a; color: #fffbf0;
   border-radius: 999px; padding: 3px 10px; font-size: 12px; letter-spacing: 0.04em;
 }
-.cwm-step .tip {
-  margin-top: 10px; padding: 10px 12px; background: #fff3d8;
-  border-radius: 8px; font-size: 14px; color: #6b4a2a;
 }
 .cwm-step .tip::before { content: "💡 "; }
 @media (max-width: 720px) { .cwm-step { grid-template-columns: 1fr; } .cwm-step img, .cwm-step .placeholder { width: 100%; } }
@@ -260,11 +263,14 @@ class StepCard(TemplatedHTML):
             dur = html.escape(s.get("duration", ""))
             tip = s.get("tip")
             visual = html.escape(s.get("visual", ""))
-            img = s.get("image_path")
-            img_block = (
-                f'<img src="/file={html.escape(img)}" alt="step {n}"/>'
-                if img else f'<div class="placeholder">{visual[:80]}</div>'
-            )
             tip_block = f'<div class="tip">{html.escape(tip)}</div>' if tip else ""
             cards.append(f"""
 <div class="cwm-step">
@@ -287,22 +293,22 @@ class NutritionGrid(TemplatedHTML):
     css_template = """
 .cwm-nutri-wrap { margin-top: 10px; }
 .cwm-nutri-title {
-  font-family: 'Lora', serif; color: #6b4a2a; font-size: 22px; margin: 0 0 14px;
 }
 .cwm-nutri {
   display: grid; grid-template-columns: repeat(5, 1fr); gap: 12px;
 }
-.cwm-nutri-cell {
-  background: #fffbf0; border: 1px solid #d8c9ad; border-radius: 10px;
   padding: 14px 10px; text-align: center;
 }
-.cwm-nutri-cell .v {
-  font-family: 'Lora', serif; font-size: 24px; font-weight: 700; color: #6b4a2a;
   display: block;
 }
-.cwm-nutri-cell .l {
   font-size: 11px; letter-spacing: 0.08em; text-transform: uppercase;
-  color: #8a6a3a; margin-top: 4px;
 }
 @media (max-width: 720px) { .cwm-nutri { grid-template-columns: repeat(2, 1fr); } }
 """
@@ -337,7 +343,7 @@ class VerdictBadge(TemplatedHTML):
     css_template = """
 .cwm-verdict {
   display: flex; align-items: center; gap: 18px;
-  background: #fffbf0; border-radius: 12px; padding: 18px 22px;
   border: 1px solid #d8c9ad;
 }
 .cwm-verdict.go    { border-left: 6px solid #4f8b4a; }
@@ -351,8 +357,8 @@ class VerdictBadge(TemplatedHTML):
 .cwm-verdict.go    .cwm-verdict-pill { background: #4f8b4a; }
 .cwm-verdict.wait  .cwm-verdict-pill { background: #d4a23c; }
 .cwm-verdict.fix   .cwm-verdict-pill { background: #b94a3a; }
-.cwm-verdict-text  { font-size: 16px; color: #4a3722; line-height: 1.5; }
-.cwm-verdict-text small { color: #8a6a3a; display: block; margin-top: 4px; }
 .cwm-verdict-empty {
   color: #b39870; font-style: italic; padding: 14px 0;
 }

 class RecipeHero(TemplatedHTML):
     css_template = """
 .cwm-hero {
+  background: #fffbf0 !important;
   border: 1px solid #d8c9ad;
   border-radius: 16px;
   padding: 32px;
   background: #efe3c8;
 }
 .cwm-hero h1 {
+  font-family: 'Lora', serif; font-size: 38px; color: #6b4a2a !important;
   margin: 0 0 8px;
 }
 .cwm-hero .meta {
+  color: #8a6a3a !important; font-size: 14px; letter-spacing: 0.04em;
   text-transform: uppercase; margin-bottom: 18px;
 }
 .cwm-hero .visual {
+  font-family: 'Lora', serif; font-style: italic; color: #6b4a2a !important;
   font-size: 17px; line-height: 1.55;
 }
 @media (max-width: 720px) { .cwm-hero { grid-template-columns: 1fr; } }
         servings = state.get("servings") or 0
         time = state.get("total_time_minutes") or 0
         visual = html.escape(state.get("final_dish_visual") or "")
+        img_b64 = state.get("final_dish_image_b64") or ""
+        img_path = state.get("final_dish_image_path") or ""
+        if img_b64:
+            img_tag = f'<img src="data:image/png;base64,{img_b64}" alt="final dish"/>'
+        elif img_path:
+            img_tag = f'<img src="/file={html.escape(img_path)}" alt="final dish"/>'
+        else:
+            img_tag = '<div style="background:#efe3c8;border-radius:12px;height:320px;display:flex;align-items:center;justify-content:center;color:#8a6a3a;font-family:\'Lora\',serif;font-style:italic;">Image will appear here</div>'
         return f"""
 <div class="cwm-hero">
   <div>{img_tag}</div>
 class DishOptions(TemplatedHTML):
     css_template = """
 .cwm-options { display: grid; grid-template-columns: repeat(3, 1fr); gap: 14px; }
+.cwm-options .cwm-option {
+  background: #fffbf0 !important; border: 1px solid #d8c9ad; border-radius: 12px;
   padding: 18px; text-align: left;
 }
+.cwm-options .cwm-option h3 {
+  font-family: 'Lora', serif; font-size: 19px; color: #6b4a2a !important;
   margin: 0 0 6px;
 }
+.cwm-options .cwm-option p { color: #7a5a35 !important; font-size: 14px; line-height: 1.45; margin: 0; }
 @media (max-width: 720px) { .cwm-options { grid-template-columns: 1fr; } }
 """
 class StepCard(TemplatedHTML):
     css_template = """
 .cwm-steps { display: flex; flex-direction: column; gap: 16px; }
+.cwm-steps .cwm-step {
   display: grid; grid-template-columns: 220px 1fr; gap: 22px;
+  background: #fffbf0 !important; border-left: 4px solid #a85c2a; border-radius: 10px;
   padding: 18px 22px;
 }
+.cwm-steps .cwm-step img {
   width: 220px; height: 160px; object-fit: cover; border-radius: 8px;
   background: #efe3c8;
 }
+.cwm-steps .cwm-step .placeholder {
   width: 220px; height: 160px; border-radius: 8px;
   background: linear-gradient(135deg,#efe3c8,#dccaa3);
   display:flex; align-items:center; justify-content:center;
+  color: #8a6a3a !important; font-family: 'Lora', serif; font-size: 14px;
 }
+.cwm-steps .cwm-step h3 {
+  font-family: 'Lora', serif; color: #6b4a2a !important; margin: 0 0 6px; font-size: 22px;
 }
+.cwm-steps .cwm-step p { font-size: 16px; line-height: 1.55; color: #4a3722 !important; margin: 0 0 8px; }
+.cwm-steps .cwm-step .duration {
+  display: inline-block; background: #a85c2a !important; color: #fffbf0 !important;
   border-radius: 999px; padding: 3px 10px; font-size: 12px; letter-spacing: 0.04em;
 }
+.cwm-steps .cwm-step .tip {
+  margin-top: 10px; padding: 10px 12px; background: #fff3d8 !important;
+  border-radius: 8px; font-size: 14px; color: #6b4a2a !important;
 }
 .cwm-step .tip::before { content: "💡 "; }
 @media (max-width: 720px) { .cwm-step { grid-template-columns: 1fr; } .cwm-step img, .cwm-step .placeholder { width: 100%; } }
             dur = html.escape(s.get("duration", ""))
             tip = s.get("tip")
             visual = html.escape(s.get("visual", ""))
+            img_b64 = s.get("image_b64") or ""
+            img_path = s.get("image_path") or ""
+            if img_b64:
+                img_block = f'<img src="data:image/png;base64,{img_b64}" alt="step {n}"/>'
+            elif img_path:
+                img_block = f'<img src="/file={html.escape(img_path)}" alt="step {n}"/>'
+            else:
+                img_block = f'<div class="placeholder">{visual[:80] if visual else f"Step {n}"}</div>'
             tip_block = f'<div class="tip">{html.escape(tip)}</div>' if tip else ""
             cards.append(f"""
 <div class="cwm-step">
     css_template = """
 .cwm-nutri-wrap { margin-top: 10px; }
 .cwm-nutri-title {
+  font-family: 'Lora', serif; color: #6b4a2a !important; font-size: 22px; margin: 0 0 14px;
 }
 .cwm-nutri {
   display: grid; grid-template-columns: repeat(5, 1fr); gap: 12px;
 }
+.cwm-nutri .cwm-nutri-cell {
+  background: #fffbf0 !important; border: 1px solid #d8c9ad; border-radius: 10px;
   padding: 14px 10px; text-align: center;
 }
+.cwm-nutri .cwm-nutri-cell .v {
+  font-family: 'Lora', serif; font-size: 24px; font-weight: 700; color: #6b4a2a !important;
   display: block;
 }
+.cwm-nutri .cwm-nutri-cell .l {
   font-size: 11px; letter-spacing: 0.08em; text-transform: uppercase;
+  color: #8a6a3a !important; margin-top: 4px;
 }
 @media (max-width: 720px) { .cwm-nutri { grid-template-columns: repeat(2, 1fr); } }
 """
     css_template = """
 .cwm-verdict {
   display: flex; align-items: center; gap: 18px;
+  background: #fffbf0 !important; border-radius: 12px; padding: 18px 22px;
   border: 1px solid #d8c9ad;
 }
 .cwm-verdict.go    { border-left: 6px solid #4f8b4a; }
 .cwm-verdict.go    .cwm-verdict-pill { background: #4f8b4a; }
 .cwm-verdict.wait  .cwm-verdict-pill { background: #d4a23c; }
 .cwm-verdict.fix   .cwm-verdict-pill { background: #b94a3a; }
+.cwm-verdict-text  { font-size: 16px; color: #4a3722 !important; line-height: 1.5; }
+.cwm-verdict-text small { color: #8a6a3a !important; display: block; margin-top: 4px; }
 .cwm-verdict-empty {
   color: #b39870; font-style: italic; padding: 14px 0;
 }

src/ui/components.pyi CHANGED Viewed

@@ -63,11 +63,14 @@ class RecipeHero(TemplatedHTML):
         servings = state.get("servings") or 0
         time = state.get("total_time_minutes") or 0
         visual = html.escape(state.get("final_dish_visual") or "")
-        img = state.get("final_dish_image_path") or ""
-        img_tag = (
-            f'<img src="/file={html.escape(img)}" alt="final dish"/>'
-            if img else '<div class="cwm-hero" style="background:#efe3c8;border-radius:12px;height:320px;"></div>'
-        )
         return f"""
 <div class="cwm-hero">
   <div>{img_tag}</div>

         servings = state.get("servings") or 0
         time = state.get("total_time_minutes") or 0
         visual = html.escape(state.get("final_dish_visual") or "")
+        img_b64 = state.get("final_dish_image_b64") or ""
+        img_path = state.get("final_dish_image_path") or ""
+        if img_b64:
+            img_tag = f'<img src="data:image/png;base64,{img_b64}" alt="final dish"/>'
+        elif img_path:
+            img_tag = f'<img src="/file={html.escape(img_path)}" alt="final dish"/>'
+        else:
+            img_tag = '<div style="background:#efe3c8;border-radius:12px;height:320px;display:flex;align-items:center;justify-content:center;color:#8a6a3a;font-family:\'Lora\',serif;font-style:italic;">Image will appear here</div>'
         return f"""
 <div class="cwm-hero">
   <div>{img_tag}</div>

src/ui/theme.py CHANGED Viewed

@@ -13,10 +13,64 @@ theme = gr.themes.Soft(
 CSS = """
 @import url('https://fonts.googleapis.com/css2?family=Lora:wght@400;700&display=swap');
-.gradio-container { background: #f5ecd9 !important; }
 .gradio-container .prose h1,
 .gradio-container .prose h2,
-.gradio-container .prose h3 { font-family: 'Lora', serif !important; color: #6b4a2a; }
 /* Generic container shared by every HTMLComponent */
 .cwm-card {
   border: 1px solid #d8c9ad;
@@ -26,6 +80,7 @@ CSS = """
 }
 button.primary, .gr-button-primary {
   background: #a85c2a !important;
   font-weight: 600 !important;
   font-size: 16px !important;
   padding: 12px 22px !important;

 CSS = """
 @import url('https://fonts.googleapis.com/css2?family=Lora:wght@400;700&display=swap');
+/* ---------------------------------------------------------------------------
+   Force a warm light palette regardless of the browser/system dark mode.
+   We pin the parchment background, so we must also pin DARK text colours via
+   Gradio's CSS variables — otherwise dark-mode users get white text on the
+   light background and it disappears.
+--------------------------------------------------------------------------- */
+.gradio-container, .gradio-container.dark {
+  background: #f5ecd9 !important;
+  color-scheme: light !important;
+  --body-text-color: #4a3722;
+  --body-text-color-subdued: #7a5a35;
+  --block-title-text-color: #6b4a2a;
+  --block-label-text-color: #6b4a2a;
+  --block-info-text-color: #7a5a35;
+  --block-background-fill: #fffbf0;
+  --input-background-fill: #fffbf0;
+  --border-color-primary: #d8c9ad;
+  --color-accent-soft: #fbe2d2;
+}
+/* Blanket dark text for native Gradio text elements (covers dark mode) */
+.gradio-container,
+.gradio-container .prose,
+.gradio-container label,
+.gradio-container .gr-text,
+.gradio-container span,
+.gradio-container p,
+.gradio-container .gr-check-radio label,
+.gradio-container .wrap,
+.gradio-container .gr-form,
+.gradio-container .tab-nav button,
+.gradio-container .gr-accordion,
+.gradio-container input,
+.gradio-container textarea {
+  color: #4a3722 !important;
+}
 .gradio-container .prose h1,
 .gradio-container .prose h2,
+.gradio-container .prose h3 { font-family: 'Lora', serif !important; color: #6b4a2a !important; }
+/* Tabs: dark labels, terracotta active */
+.gradio-container .tab-nav button { color: #6b4a2a !important; }
+.gradio-container .tab-nav button.selected {
+  color: #a85c2a !important; border-bottom-color: #a85c2a !important;
+}
+/* Native blocks (inputs, radio, checkbox, number) on warm cards */
+.gradio-container .block,
+.gradio-container .form,
+.gradio-container input[type="text"],
+.gradio-container input[type="number"] {
+  background: #fffbf0 !important;
+  border-color: #d8c9ad !important;
+}
 /* Generic container shared by every HTMLComponent */
 .cwm-card {
   border: 1px solid #d8c9ad;
 }
 button.primary, .gr-button-primary {
   background: #a85c2a !important;
+  color: #fffbf0 !important;
   font-weight: 600 !important;
   font-size: 16px !important;
   padding: 12px 22px !important;