.gitignore ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .venv/
6
+ venv/
7
+
8
+ # Generated data (SFT dataset lives on HF Hub: eldinosaur/cook-with-me-recipes-sft)
9
+ data/*.parquet
10
+ data/*.jsonl
11
+ data/*.png
12
+ data/*.npy
13
+ data/*.csv
14
+
15
+ # Local caches / model weights
16
+ *.gguf
17
+ .cache/
18
+ assets/*.png
19
+
20
+ # OS / editor
21
+ .DS_Store
22
+ Thumbs.db
23
+ .idea/
24
+ .vscode/
README.md CHANGED
@@ -1,13 +1,80 @@
1
  ---
2
  title: Cook With A LLM
3
- emoji: 🐠
4
- colorFrom: pink
5
- colorTo: pink
6
  sdk: gradio
7
  sdk_version: 6.15.2
8
  python_version: '3.12'
9
  app_file: app.py
10
  pinned: false
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Cook With A LLM
3
+ emoji: 🍲
4
+ colorFrom: red
5
+ colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 6.15.2
8
  python_version: '3.12'
9
  app_file: app.py
10
  pinned: false
11
+ license: apache-2.0
12
  ---
13
 
14
+ # 🍲 Cook With Me Multimodal Sous-Chef
15
+
16
+ > *Snap your fridge. Pick a dish. Cook step by step. Check your progress with a photo.*
17
+
18
+ A closed-loop multimodal cooking assistant built for the **Hugging Face Small Models / Big Adventures Hackathon (June 2026)**.
19
+
20
+ ---
21
+
22
+ ## How it works
23
+
24
+ ```
25
+ 📸 Fridge photo ──▶ [Vision Agent] identify ingredients
26
+
27
+
28
+ [Recipe Planner] propose 3 dishes → full recipe JSON
29
+
30
+
31
+ [Nutrition Engine] per-serving macros (lookup, no hallucination)
32
+
33
+
34
+ 📸 Progress photo ──▶ [Progress Validator] go / wait / fix verdict
35
+ ```
36
+
37
+ 1. **Snap** your fridge or pantry — the fine-tuned vision model identifies every ingredient.
38
+ 2. **Pick** one of three AI-suggested dishes tailored to what you have.
39
+ 3. **Cook** step by step with a generated recipe and per-serving nutrition info.
40
+ 4. **Check** your progress by uploading a photo of your pan — the model tells you *go*, *wait*, or *fix*.
41
+
42
+ ---
43
+
44
+ ## Models
45
+
46
+ | Role | Model | Params | Runtime |
47
+ |---|---|---|---|
48
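+ | Vision (ingredients + progress validator) | `openbmb/MiniCPM-V-4.6` (fine-tuned) | ~4.6B | `transformers` / ZeroGPU |
+ | Recipe planner | `openbmb/MiniCPM4.1-8B` (fine-tuned) | ~8B | `transformers` on Modal |
+ | Step illustrator (optional) | `black-forest-labs/FLUX.2-klein-9B` | ~9B | `diffusers` on Modal |
49
+
50
+ **Total: ~21.6B parameters** (≤ 32B cap ✓ — significant headroom)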
51
+
52
+ The ingredient-identification model is **fine-tuned** on fridge/pantry photos for higher precision.
53
+
54
+ ---
55
+
56
+ ## Badges targeted
57
+
58
+ | Badge | Status | How |
59
+ |---|---|---|
60
+ | 🎯 Well-Tuned | ✓ | Fine-tuned MiniCPM-V-4.6 for ingredient detection, published to Hub |
61
+ | 🎨 Off-Brand | ✓ | Recipe-card UI with custom CSS — Lora serif, warm parchment palette |
62
+ | 📡 Sharing is Caring | ✓ | Agent traces shared on Hub |
63
+ | 📓 Field Notes | ✓ | Blog post: "Building a closed-loop visual cooking coach" |
64
+
65
+ ---
66
+
67
+ ## Architecture highlights
68
+
69
+ - **One vision model, two roles:** MiniCPM-V-4.6 handles both ingredient detection and progress validation; recipe planning runs on a separate fine-tuned MiniCPM4.1-8B endpoint, so the Space itself only loads one model.
70
+ - **Closed-loop visual validation:** Flux generates step targets → user cooks → vision model compares — a real agent loop, not a wrapper.
71
+ - **Hallucination-free nutrition:** macros come from a lookup table, not LLM arithmetic.
72
+ - **Robust JSON extraction:** multi-strategy parser handles markdown fences, single quotes, and trailing commas so generation failures degrade gracefully.
73
+
74
+ ---
75
+
76
+ ## Track
77
+
78
+ **Chapter One — Backyard AI** · "Build something for someone you actually know."
79
+
80
+ Submission for the Hugging Face Hackathon · June 5–15, 2026.
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import logging
 
2
  log = logging.getLogger(__name__)
3
 
4
  from typing import Any
@@ -6,12 +7,11 @@ from typing import Any
6
  import gradio as gr
7
  from PIL import Image
8
 
9
- # from src import config
10
  from src.agents.mise_en_place import identify_ingredients
11
- # from src.agents.progress_validator import validate
12
- # from src.agents.recipe_planner import plan_recipe, propose_dishes
13
- # from src.data.nutrition import compute_nutrition
14
- # from src.pipeline import Recipe
15
  from src.ui.components import (
16
  DishOptions,
17
  IngredientChips,
@@ -19,135 +19,265 @@ from src.ui.components import (
19
  RecipeHero,
20
  StepCard,
21
  VerdictBadge,
22
- recipe_to_state,
23
  )
24
  from src.ui.theme import CSS, theme
25
 
26
- def on_propose(fridge_image: Image.Image | None, state: dict | None) -> tuple[str, str, list[str], dict]:
27
- """Photo → ingredients → 3 dish options."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  state = state or {}
 
 
 
 
 
 
 
 
 
29
  ingredients = identify_ingredients(fridge_image)
30
- # options = propose_dishes(ingredients)
31
 
32
- # state.update({
33
- # "ingredients_have": ingredients,
34
- # "ingredients_missing": [],
35
- # "options": [o.model_dump() for o in options],
36
- # })
37
- chips_html = IngredientChips.render({"have": ingredients, "missing": []})
38
- log.info(ingredients)
39
- # options_html = DishOptions.render({"options": state["options"]})
40
- # radio_choices = [o.name for o in options]
41
- # return chips_html, options_html, gr.update(choices=radio_choices, value=radio_choices[0] if radio_choices else None), state
42
- return chips_html
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- # ----------------
47
- # UI definition
48
- # ----------------
49
  def build_ui() -> gr.Blocks:
50
  initial_state: dict[str, Any] = {}
51
 
52
- with gr.Blocks(title="Cook With Me") as demo:
53
  gr.Markdown(
54
  "# 🍲 Cook With Me\n"
55
- "_A multimodal sous-chef. See it. Plan it. Show it. Cook it._"
56
  )
57
 
58
  state = gr.State(initial_state)
59
 
60
  with gr.Tabs():
61
- # --- Tab 1: Cook ------------------------------------------------
62
- with gr.Tab("Cook"):
 
 
63
  with gr.Row():
 
64
  with gr.Column(scale=1):
65
  fridge_input = gr.Image(
66
  label="📸 Photo of your fridge or pantry",
67
  type="pil",
68
- height=320,
69
  )
70
- propose_btn = gr.Button("What can I cook?", variant="primary")
71
 
72
  gr.Markdown("### Ingredients I see")
73
  chips = gr.HTML(IngredientChips.render({}))
74
 
 
 
 
 
 
 
 
 
 
 
75
  gr.Markdown("### Pick a dish")
76
- options = gr.HTML(DishOptions.render({}))
77
- dish_radio = gr.Radio(choices=[], label="Choose one", interactive=True)
 
 
 
 
78
 
79
- with gr.Accordion("Generation options", open=False):
80
- illustrate_chk = gr.Checkbox(value=False, label="Render step images (FLUX, slow on CPU)")
81
- narrate_chk = gr.Checkbox(value=False, label="Generate voice narration (VoxCPM2)")
 
 
82
 
83
- cook_btn = gr.Button("Build recipe", variant="primary")
84
 
 
85
  with gr.Column(scale=2):
86
  hero = gr.HTML(RecipeHero.render({}))
87
  steps_panel = gr.HTML(StepCard.render({}))
88
  nutrition_panel = gr.HTML(NutritionGrid.render({"nutrition": {}}))
89
 
90
- # --- Tab 2: Check Progress -------------------------------------
91
- with gr.Tab("Check Progress"):
92
- gr.Markdown("Upload a photo of your pan or plate; the same vision model that planned your recipe will compare it against the target step.")
 
 
 
 
 
93
  with gr.Row():
94
  with gr.Column():
95
  step_idx = gr.Number(value=1, precision=0, label="Active step #")
96
- progress_input = gr.Image(label="📸 Your pan / plate", type="pil", height=320)
97
- validate_btn = gr.Button("How am I doing?", variant="primary")
 
 
 
 
98
  with gr.Column():
99
  verdict_panel = gr.HTML(VerdictBadge.render({}))
100
- verdict_audio = gr.Audio(label="Tip (voice)", autoplay=False)
101
 
102
- # --- Tab 3: About ----------------------------------------------
103
- with gr.Tab("About"):
 
 
104
  gr.Markdown(
105
  """
106
- ### Models
107
- - **Vision** — `openbmb/MiniCPM-V-4_6-gguf` via `llama-cpp-python` (~4.6B)
108
- - **Planner** `openbmb/MiniCPM-V-4-gguf` via `llama-cpp-python` (~4B)
109
- - **Illustrator** `black-forest-labs/FLUX.2-klein-9B` via `diffusers` (9B)
110
- - **Narrator** — `openbmb/VoxCPM2` via `transformers` (~1B)
111
- - **Retrieval** — `sentence-transformers/all-MiniLM-L6-v2` (22M)
112
- **Total ≈ 18.6B params** (≤ 32B requirement ✓).
113
- ### Pipeline
114
- ```
115
- Fridge photo Vision ingredients
116
-
117
-
118
- Planner (+ Kaggle retrieval) → Recipe JSON
119
-
120
-
121
- Illustrator (FLUX) → hero + per-step images
122
-
123
-
124
- Narrator (VoxCPM2) → MP3 per step
125
-
126
-
127
- Progress photo → Validator (same vision model) → go|wait|fix
128
- ```
129
- ### Badges targeted
130
- ✓ Llama Champion · ✓ Well-Tuned · ✓ Off-Brand · ✓ Sharing is Caring · ✓ Field Notes
131
  """
132
  )
133
 
134
- # Wire callbacks ----------------------------------------------------
 
 
135
  propose_btn.click(
136
  fn=on_propose,
137
  inputs=[fridge_input, state],
138
- # outputs=[chips, options, dish_radio, state],
139
- outputs=[chips],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  )
141
- # cook_btn.click(
142
- # fn=on_pick_dish,
143
- # inputs=[state, dish_radio, illustrate_chk, narrate_chk],
144
- # outputs=[hero, steps_panel, nutrition_panel, chips, state],
145
- # )
146
- # validate_btn.click(
147
- # fn=on_validate,
148
- # inputs=[state, step_idx, progress_input],
149
- # outputs=[verdict_panel, verdict_audio],
150
- # )
151
 
152
  return demo
153
 
@@ -159,6 +289,4 @@ if __name__ == "__main__":
159
  server_port=int(__import__("os").environ.get("PORT", 7860)),
160
  show_error=True,
161
  inbrowser=True,
162
- theme=theme,
163
- css=CSS
164
- )
 
1
  import logging
2
+ logging.basicConfig(level=logging.INFO)
3
  log = logging.getLogger(__name__)
4
 
5
  from typing import Any
 
7
  import gradio as gr
8
  from PIL import Image
9
 
 
10
  from src.agents.mise_en_place import identify_ingredients
11
+ from src.agents.progress_validator import validate
12
+ from src.agents.recipe_planner import plan_recipe, propose_dishes
13
+ from src.agents.step_illustrator import illustrate_recipe
14
+ from src.data.nutrition import compute_nutrition
15
  from src.ui.components import (
16
  DishOptions,
17
  IngredientChips,
 
19
  RecipeHero,
20
  StepCard,
21
  VerdictBadge,
 
22
  )
23
  from src.ui.theme import CSS, theme
24
 
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Callbacks
28
+ # ---------------------------------------------------------------------------
29
+
30
+ def _clean_ingredients(items: list | None) -> list[str]:
31
+ """Normalize a raw ingredient list (dedup, lowercase, strip empties)."""
32
+ out, seen = [], set()
33
+ for it in (items or []):
34
+ name = str(it).strip().lower()
35
+ if name and name not in seen:
36
+ seen.add(name)
37
+ out.append(name)
38
+ return out
39
+
40
+
41
def on_propose(fridge_image: Image.Image | None, state: dict | None):
    """Photo → ingredients → 3 dish options (and fill the editable list).

    Args:
        fridge_image: The uploaded fridge/pantry photo, or ``None`` when the
            user clicked the button without providing one.
        state: The per-session state dict (``None`` is treated as empty).

    Returns:
        A 5-tuple in the order the callback is wired to its outputs:
        ingredient-chips HTML, dish-options HTML, an update for the dish
        radio, the state dict, and an update for the ingredient editor.
    """
    state = state or {}

    def _nothing_to_show():
        # Same "empty" payload for both "no photo" and "nothing detected".
        return (
            IngredientChips.render({}),
            DishOptions.render({}),
            gr.update(choices=[], value=None),
            state,
            gr.update(choices=[], value=[]),
        )

    if fridge_image is None:
        return _nothing_to_show()

    # Normalize exactly like on_update_ingredients does, so the chips, the
    # editor and the stored state agree whichever path produced the list.
    # NOTE(review): assumes identify_ingredients returns an iterable of names.
    ingredients = _clean_ingredients(identify_ingredients(fridge_image))
    state["ingredients_have"] = ingredients

    if not ingredients:
        # Nothing recognized: skip the planner call (as the manual-edit path
        # does) instead of asking it to propose dishes from no ingredients.
        state["options"] = []
        return _nothing_to_show()

    options = propose_dishes(ingredients)
    state["options"] = [o.model_dump() for o in options]

    radio_choices = [o.name for o in options]
    return (
        IngredientChips.render({"have": ingredients, "missing": []}),
        DishOptions.render({"options": state["options"]}),
        gr.update(choices=radio_choices, value=radio_choices[0] if radio_choices else None),
        state,
        gr.update(choices=ingredients, value=ingredients),
    )
69
+
70
+
71
def on_update_ingredients(state: dict | None, ingredients: list | None):
    """Apply a manual edit of the ingredient list.

    The edited list is normalized and stored in the session state, then the
    chips are re-rendered and the dish proposals are regenerated from it.
    An empty list clears the proposals without calling the planner.

    Args:
        state: The per-session state dict (``None`` is treated as empty).
        ingredients: The current value of the ingredient editor.

    Returns:
        A 4-tuple: ingredient-chips HTML, dish-options HTML, an update for
        the dish radio, and the state dict.
    """
    state = state or {}
    cleaned = _clean_ingredients(ingredients)
    state["ingredients_have"] = cleaned

    if cleaned:
        proposals = propose_dishes(cleaned)
        state["options"] = [p.model_dump() for p in proposals]
        names = [p.name for p in proposals]
        chips_html = IngredientChips.render({"have": cleaned, "missing": []})
        options_html = DishOptions.render({"options": state["options"]})
        # Preselect the first proposal when there is one.
        radio_update = gr.update(choices=names, value=names[0] if names else None)
    else:
        state["options"] = []
        chips_html = IngredientChips.render({})
        options_html = DishOptions.render({})
        radio_update = gr.update(choices=[], value=None)

    return chips_html, options_html, radio_update, state
95
+
96
+
97
def on_cook(state: dict | None, dish_name: str | None, illustrate: bool, ingredients: list | None):
    """Chosen dish → full recipe + nutrition (+ FLUX images if requested).

    Args:
        state: The per-session state dict (``None`` is treated as empty).
        dish_name: The dish picked in the radio; falsy means nothing chosen.
        illustrate: Whether to additionally request step images.
        ingredients: The current value of the ingredient editor.

    Returns:
        A 4-tuple: recipe hero HTML, step cards HTML, nutrition grid HTML,
        and the state dict (with the recipe stored under ``"recipe"``).
    """
    state = state or {}
    if not dish_name:
        return (
            RecipeHero.render({}),
            StepCard.render({}),
            NutritionGrid.render({"nutrition": {}}),
            state,
        )

    # Prefer the (possibly hand-edited) ingredient list from the editor.
    ingredients = _clean_ingredients(ingredients) or state.get("ingredients_have", [])
    state["ingredients_have"] = ingredients
    recipe = plan_recipe(dish_name, ingredients)

    nutrition = compute_nutrition(ingredients, recipe.servings)
    recipe.nutrition = nutrition

    if illustrate:
        log.info("Generating FLUX step images via Modal...")
        try:
            recipe = illustrate_recipe(recipe)
        except Exception:
            # Illustration is optional: keep the recipe we already planned
            # rather than failing the whole request, but leave a trace in the
            # logs and tell the user why there are no images.
            log.exception("Step illustration failed; continuing without images")
            gr.Warning("Could not generate step images; showing the recipe without them.")

    # Store the final recipe once, after the optional illustration pass.
    state["recipe"] = recipe.model_dump()

    return (
        RecipeHero.render(recipe.model_dump()),
        StepCard.render({"steps": [s.model_dump() for s in recipe.steps]}),
        NutritionGrid.render({"nutrition": nutrition}),
        state,
    )
128
+
129
+
130
def on_validate(state: dict | None, step_idx: float | None, progress_image: Image.Image | None):
    """Progress photo + step number → verdict badge.

    Args:
        state: The per-session state dict (``None`` is treated as empty).
        step_idx: 1-based number of the active step. A cleared or non-numeric
            field falls back to the first step.
        progress_image: Photo of the pan/plate, or ``None`` if none was given.

    Returns:
        The verdict badge HTML. An empty badge is returned when there is no
        photo to judge.
    """
    state = state or {}
    if progress_image is None:
        # Nothing to compare: show the empty badge instead of calling the
        # vision model without an image.
        return VerdictBadge.render({})

    recipe = state.get("recipe") or {}
    steps = recipe.get("steps") or []

    try:
        idx = max(0, int(step_idx) - 1)
    except (TypeError, ValueError, OverflowError):
        # The number field yields None when cleared; nan/inf cannot be
        # converted either. Default to the first step.
        idx = 0

    # Without a recipe, or past its last step, fall back to a generic
    # instruction.
    instruction = steps[idx]["instruction"] if idx < len(steps) else "Cook the dish properly."
    result = validate(progress_image, instruction)
    return VerdictBadge.render(result)
139
+
140
+
141
+ # ---------------------------------------------------------------------------
142
+ # UI
143
+ # ---------------------------------------------------------------------------
144
 
 
 
 
145
  def build_ui() -> gr.Blocks:
146
  initial_state: dict[str, Any] = {}
147
 
148
+ with gr.Blocks(title="Cook With Me", theme=theme, css=CSS) as demo:
149
  gr.Markdown(
150
  "# 🍲 Cook With Me\n"
151
+ "_Snap your fridge · Pick a dish · Cook step by step · Check your progress._"
152
  )
153
 
154
  state = gr.State(initial_state)
155
 
156
  with gr.Tabs():
157
+ # ----------------------------------------------------------------
158
+ # Tab 1 — Cook
159
+ # ----------------------------------------------------------------
160
+ with gr.Tab("🍳 Cook"):
161
  with gr.Row():
162
+ # Left — inputs
163
  with gr.Column(scale=1):
164
  fridge_input = gr.Image(
165
  label="📸 Photo of your fridge or pantry",
166
  type="pil",
167
+ height=300,
168
  )
169
+ propose_btn = gr.Button("🔍 What can I cook?", variant="primary")
170
 
171
  gr.Markdown("### Ingredients I see")
172
  chips = gr.HTML(IngredientChips.render({}))
173
 
174
+ ingredient_editor = gr.Dropdown(
175
+ choices=[],
176
+ value=[],
177
+ multiselect=True,
178
+ allow_custom_value=True,
179
+ label="✏️ Add or remove ingredients (type + Enter to add, ✕ to remove)",
180
+ interactive=True,
181
+ )
182
+ update_btn = gr.Button("🔄 Update ingredients & dishes")
183
+
184
  gr.Markdown("### Pick a dish")
185
+ dish_options_html = gr.HTML(DishOptions.render({}))
186
+ dish_radio = gr.Radio(
187
+ choices=[],
188
+ label="Choose one",
189
+ interactive=True,
190
+ )
191
 
192
+ with gr.Accordion("⚙️ Generation options", open=False):
193
+ illustrate_chk = gr.Checkbox(
194
+ value=False,
195
+ label="🎨 Generate step images with FLUX.2 (requires Modal deployment)",
196
+ )
197
 
198
+ cook_btn = gr.Button("👨‍🍳 Build my recipe", variant="primary")
199
 
200
+ # Right — recipe output
201
  with gr.Column(scale=2):
202
  hero = gr.HTML(RecipeHero.render({}))
203
  steps_panel = gr.HTML(StepCard.render({}))
204
  nutrition_panel = gr.HTML(NutritionGrid.render({"nutrition": {}}))
205
 
206
+ # ----------------------------------------------------------------
207
+ # Tab 2 — Check Progress
208
+ # ----------------------------------------------------------------
209
+ with gr.Tab("📷 Check Progress"):
210
+ gr.Markdown(
211
+ "Upload a photo of your pan or plate. The vision model compares it "
212
+ "against the current recipe step and tells you if you can move on."
213
+ )
214
  with gr.Row():
215
  with gr.Column():
216
  step_idx = gr.Number(value=1, precision=0, label="Active step #")
217
+ progress_input = gr.Image(
218
+ label="📸 Your pan / plate",
219
+ type="pil",
220
+ height=300,
221
+ )
222
+ validate_btn = gr.Button("✅ How am I doing?", variant="primary")
223
  with gr.Column():
224
  verdict_panel = gr.HTML(VerdictBadge.render({}))
 
225
 
226
+ # ----------------------------------------------------------------
227
+ # Tab 3 — About
228
+ # ----------------------------------------------------------------
229
+ with gr.Tab("ℹ️ About"):
230
  gr.Markdown(
231
  """
232
+ ### How it works
233
+ 1. **Snap** your fridge the fine-tuned vision model (MiniCPM-V-4.6) identifies every ingredient.
234
+ 2. **Pick** one of three AI-suggested dishes tailored to what you have.
235
+ 3. **Cook** step by step with a generated recipe, per-serving nutrition, and optional FLUX.2 step images.
236
+ 4. **Check** your progress upload a photo of your pan and get a *go / wait / fix* verdict.
237
+
238
+ ### Models
239
+ | Role | Model | Params |
240
+ |---|---|---|
241
+ | Vision (ingredients + validator) | `openbmb/MiniCPM-V-4.6` (fine-tuned) | ~4.6B |
242
+ | Recipe Planner | `openbmb/MiniCPM4.1-8B` (fine-tuned on Kaggle recipes) | ~8B |
243
+ | Step Illustrator | `FLUX.2-klein-9B` via Modal | ~9B |
244
+
245
+ **Total ≤ 21.6B params** (cap: 32B ✓)
246
+
247
+ ### Badges targeted
248
+ ✓ Well-Tuned · ✓ Off-Brand · ✓ Sharing is Caring · ✓ Field Notes
249
+
250
+ ### Hackathon
251
+ Hugging Face Small Models / Big Adventures · June 2026 · Track: Backyard AI
 
 
 
 
 
252
  """
253
  )
254
 
255
+ # --------------------------------------------------------------------
256
+ # Wire callbacks
257
+ # --------------------------------------------------------------------
258
  propose_btn.click(
259
  fn=on_propose,
260
  inputs=[fridge_input, state],
261
+ outputs=[chips, dish_options_html, dish_radio, state, ingredient_editor],
262
+ )
263
+
264
+ update_btn.click(
265
+ fn=on_update_ingredients,
266
+ inputs=[state, ingredient_editor],
267
+ outputs=[chips, dish_options_html, dish_radio, state],
268
+ )
269
+
270
+ cook_btn.click(
271
+ fn=on_cook,
272
+ inputs=[state, dish_radio, illustrate_chk, ingredient_editor],
273
+ outputs=[hero, steps_panel, nutrition_panel, state],
274
+ )
275
+
276
+ validate_btn.click(
277
+ fn=on_validate,
278
+ inputs=[state, step_idx, progress_input],
279
+ outputs=[verdict_panel],
280
  )
 
 
 
 
 
 
 
 
 
 
281
 
282
  return demo
283
 
 
289
  server_port=int(__import__("os").environ.get("PORT", 7860)),
290
  show_error=True,
291
  inbrowser=True,
292
+ )
 
 
modal_app/__init__.py ADDED
File without changes
modal_app/flux_endpoint.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Modal FLUX.2 Klein endpoint.
2
+
3
+ Deploy once with:
4
+ modal deploy modal_app/flux_endpoint.py
5
+
6
+ Then the HF Space calls it via modal.Cls.from_name("cook-with-me-flux", "FluxKlein").
7
+ """
8
+ import io
9
+ import modal
10
+
11
+ # ---------------------------------------------------------------------------
12
+ # App & image
13
+ # ---------------------------------------------------------------------------
14
+
15
+ app = modal.App("cook-with-me-flux")
16
+
17
+ image = (
18
+ modal.Image.debian_slim(python_version="3.12")
19
+ .pip_install(
20
+ "torch==2.7.0", # >=2.5 needed: diffusers custom-op schema uses PEP604 unions
21
+ "torchvision==0.22.0", # matches torch 2.7.0; silences diffusers image-processor fallback
22
+ "diffusers>=0.38", # FLUX.2 support
23
+ "transformers>=4.45",
24
+ "accelerate",
25
+ "safetensors",
26
+ "Pillow",
27
+ "huggingface_hub>=1.17",
28
+ "sentencepiece",
29
+ )
30
+ )
31
+
32
+ # HF token secret so Modal can pull gated/private model weights
33
+ hf_secret = modal.Secret.from_name("huggingface-secret")
34
+
35
+ # Tried in order. FLUX models are gated (need license acceptance on HF);
36
+ # SDXL-Turbo is public and always works, so it's the guaranteed fallback.
37
+ FLUX_MODEL = "black-forest-labs/FLUX.2-klein-9B"
38
+ FLUX_FALLBACK = "black-forest-labs/FLUX.1-schnell"
39
+ SDXL_TURBO = "stabilityai/sdxl-turbo" # non-gated, fast (1-2 steps)
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # GPU class
43
+ # ---------------------------------------------------------------------------
44
+
45
@app.cls(
    image=image,
    gpu="L4",
    scaledown_window=180,  # keep warm 3 min after last request
    secrets=[hf_secret],
)
class FluxKlein:
    """Text-to-image worker with a three-level model fallback.

    On container start, ``load`` tries FLUX.2-klein, then FLUX.1-schnell,
    then SDXL-Turbo, and keeps the first pipeline that loads. It records the
    matching guidance scale, step count and a backend label on the instance
    for ``render_step`` to use.
    """

    @modal.enter()
    def load(self):
        """Load the first available pipeline onto the GPU.

        Sets ``self.pipe``, ``self.guidance``, ``self.steps`` and
        ``self.backend``. Failures of the two FLUX attempts are reported and
        swallowed so the next candidate is tried; a failure of the final
        SDXL-Turbo attempt propagates.
        """
        import gc

        import torch

        def _release_gpu_memory():
            # A failed attempt may leave partially loaded weights behind;
            # free them so the next candidate starts with an empty GPU.
            gc.collect()
            torch.cuda.empty_cache()

        dtype = torch.bfloat16
        self.steps = 4

        # 1) FLUX.2-klein (gated) ------------------------------------------------
        try:
            # DiffusionPipeline picks the pipeline class named in the repo's
            # model_index.json. FluxPipeline is the FLUX.1 pipeline, so forcing
            # it onto a FLUX.2 checkpoint would always drop to the fallback.
            from diffusers import DiffusionPipeline
            self.pipe = DiffusionPipeline.from_pretrained(FLUX_MODEL, torch_dtype=dtype).to("cuda")
            self.guidance, self.steps, self.backend = 1.0, 4, "FLUX.2-klein-9B"
            print(f"Loaded {self.backend}")
            return
        except Exception as e:
            print(f"FLUX.2-klein unavailable ({type(e).__name__}); trying FLUX.1-schnell...")
        _release_gpu_memory()

        # 2) FLUX.1-schnell (gated) ---------------------------------------------
        try:
            from diffusers import FluxPipeline
            self.pipe = FluxPipeline.from_pretrained(FLUX_FALLBACK, torch_dtype=dtype).to("cuda")
            self.guidance, self.steps, self.backend = 0.0, 4, "FLUX.1-schnell"
            print(f"Loaded {self.backend}")
            return
        except Exception as e:
            print(f"FLUX.1-schnell unavailable ({type(e).__name__}); falling back to SDXL-Turbo...")
        _release_gpu_memory()

        # 3) SDXL-Turbo (public, always works) ----------------------------------
        from diffusers import AutoPipelineForText2Image
        self.pipe = AutoPipelineForText2Image.from_pretrained(
            SDXL_TURBO, torch_dtype=torch.float16, variant="fp16"
        ).to("cuda")
        self.guidance, self.steps, self.backend = 0.0, 2, "SDXL-Turbo"
        print(f"Loaded {self.backend}")

    @modal.method()
    def render_step(self, prompt: str, seed: int = 42) -> bytes:
        """Generate a 512×512 PNG and return its raw bytes.

        Args:
            prompt: Text description of the image to generate.
            seed: Seed for the CUDA generator, so a given prompt/seed pair is
                reproducible on the same backend.

        Returns:
            The PNG-encoded image.
        """
        import torch

        img = self.pipe(
            prompt=prompt,
            height=512,
            width=512,
            guidance_scale=self.guidance,
            num_inference_steps=self.steps,
            generator=torch.Generator(device="cuda").manual_seed(seed),
        ).images[0]

        buf = io.BytesIO()
        img.save(buf, format="PNG")
        return buf.getvalue()
104
+
105
+
106
+ # ---------------------------------------------------------------------------
107
+ # Local test entrypoint
108
+ # ---------------------------------------------------------------------------
109
+
110
@app.local_entrypoint()
def test():
    """Smoke test: render one image remotely and save it as data/test_flux.png."""
    import os

    prompt = (
        "Top-down photo of a kitchen pan with sautéed onions. "
        "Mexican cooking. Warm lighting. Photorealistic."
    )
    png_bytes = FluxKlein().render_step.remote(prompt, seed=0)

    # <repo>/data sits next to this file's parent directory.
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data"))
    os.makedirs(data_dir, exist_ok=True)
    target = os.path.join(data_dir, "test_flux.png")

    with open(target, "wb") as handle:
        handle.write(png_bytes)
    print(f"Saved {target} ({len(png_bytes)} bytes)")
modal_app/planner_endpoint.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Modal endpoint for the fine-tuned MiniCPM4.1-8B recipe planner.
2
+
3
+ Runs in its OWN container because MiniCPM4.1's custom code requires
4
+ transformers 4.x (CacheLayerMixin + is_torch_fx_available), which conflicts
5
+ with the MiniCPM-V-4.6 vision model in the main app (needs transformers 5.x).
6
+
7
+ Deploy:
8
+ modal deploy modal_app/planner_endpoint.py
9
+
10
+ The Gradio app calls it via modal.Cls.from_name("cook-with-me-planner",
11
+ "Planner").infer.remote(prompt, ...).
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import os
16
+
17
+ import modal
18
+
19
+ app = modal.App("cook-with-me-planner")
20
+
21
+ # 8B bf16 weights cached on a volume so cold starts don't re-download ~16GB.
22
+ hf_cache = modal.Volume.from_name("cook-with-me-planner-cache", create_if_missing=True)
23
+ hf_secret = modal.Secret.from_name("huggingface-secret")
24
+
25
+ image = (
26
+ modal.Image.debian_slim(python_version="3.12")
27
+ .pip_install(
28
+ "torch==2.4.0",
29
+ # MiniCPM4.1 custom code needs BOTH CacheLayerMixin (>=4.54) and
30
+ # is_torch_fx_available (removed in 5.0) — only 4.54..4.x has both.
31
+ "transformers>=4.54,<5.0",
32
+ "huggingface_hub>=0.26,<1.0",
33
+ "accelerate",
34
+ "sentencepiece",
35
+ "safetensors",
36
+ )
37
+ .env({"HF_HOME": "/cache/hf"})
38
+ )
39
+
40
+ # Fine-tuned weights; tokenizer pulled from base (FT tokenizer_config was saved
41
+ # by transformers 5.x and is not readable by 4.x).
42
+ PLANNER_REPO = os.environ.get("COOK_WITH_ME_PLANNER_FT_REPO", "eldinosaur/cook-with-me-planner-8b")
43
+ BASE_REPO = "openbmb/MiniCPM4.1-8B"
44
+
45
+
46
@app.cls(
    image=image,
    gpu="L4",
    volumes={"/cache": hf_cache},
    secrets=[hf_secret],
    scaledown_window=240,
    timeout=600,
)
class Planner:
    """Remote text-generation worker for the recipe planner model."""

    @modal.enter()
    def load(self):
        """Load the tokenizer (from the base repo) and the planner weights."""
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        print(f"Loading planner weights from {PLANNER_REPO}...")

        tokenizer = AutoTokenizer.from_pretrained(BASE_REPO, trust_remote_code=True)
        if tokenizer.pad_token is None:
            # Reuse EOS as the padding token when none is defined.
            tokenizer.pad_token = tokenizer.eos_token
        self.tokenizer = tokenizer

        model = AutoModelForCausalLM.from_pretrained(
            PLANNER_REPO,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map="cuda",
        )
        self.model = model.eval()
        print("Planner ready.")

    @modal.method()
    def infer(self, prompt: str, max_new_tokens: int = 1024, temperature: float = 0.0) -> str:
        """Generate a completion for a single user prompt.

        Args:
            prompt: The user message, wrapped in the model's chat template.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Sampling temperature; a falsy or non-positive value
                selects greedy decoding.

        Returns:
            The decoded continuation only (the prompt tokens are cut off),
            with special tokens removed.
        """
        import torch

        chat = [{"role": "user", "content": prompt}]
        template_args = dict(
            add_generation_prompt=True,
            tokenize=True,
            return_tensors="pt",
            return_dict=True,
        )
        # enable_thinking=False -> direct JSON, no <think> reasoning preamble
        try:
            encoded = self.tokenizer.apply_chat_template(
                chat, enable_thinking=False, **template_args
            )
        except TypeError:
            # Retry without the flag when the call rejects it.
            encoded = self.tokenizer.apply_chat_template(chat, **template_args)

        device = self.model.device
        prompt_ids = encoded["input_ids"].to(device)
        prompt_len = prompt_ids.shape[1]

        model_inputs = {"input_ids": prompt_ids}
        mask = encoded.get("attention_mask")
        if mask is not None:
            model_inputs["attention_mask"] = mask.to(device)

        if temperature and temperature > 0:
            decoding = dict(do_sample=True, temperature=temperature, top_p=0.9)
        else:
            decoding = dict(do_sample=False)

        with torch.no_grad():
            generated = self.model.generate(
                **model_inputs,
                max_new_tokens=max_new_tokens,
                repetition_penalty=1.05,
                **decoding,
            )

        # Decode only the tokens produced after the prompt.
        return self.tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
108
+
109
+
110
@app.local_entrypoint()
def test():
    """Smoke test: send one dish-proposal prompt to the remote planner and print the reply."""
    instructions = "You are a creative chef. Available ingredients: tomato, onion, garlic, pasta, olive oil.\n"
    schema = 'Respond ONLY with JSON: {"options": [{"name": "...", "why": "..."}, {"name": "...", "why": "..."}, {"name": "...", "why": "..."}]}'

    reply = Planner().infer.remote(instructions + schema, max_new_tokens=400)
    print("OUTPUT:\n", reply)
modal_app/serve_app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Serve the full Cook With Me Gradio app on Modal GPU.
2
+
3
+ This gives a permanent public URL (*.modal.run) that runs the real models:
4
+ - MiniCPM-V-4.6 (vision: ingredients + progress validation)
5
+ - MiniCPM4.1-8B (planner: dish proposals + recipes)
6
+ - FLUX.2-klein (step images, via the separate cook-with-me-flux endpoint)
7
+
8
+ Deploy with:
9
+ modal deploy modal_app/serve_app.py
10
+ Or run a temporary dev session (auto-stops on Ctrl-C):
11
+ modal serve modal_app/serve_app.py
12
+
13
+ Both models live in one L40S (48GB) container (~25GB VRAM total).
14
+ Set the fine-tuned planner repo via the COOK_WITH_ME_PLANNER_FT_REPO env
15
+ on the Modal function once training finishes.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ from pathlib import Path
20
+
21
+ import modal
22
+
23
+ LOCAL_ROOT = Path(__file__).resolve().parent.parent
24
+ REMOTE_ROOT = "/root/cook"
25
+
26
+ app = modal.App("cook-with-me-app")
27
+
28
+ # HF model cache persisted across restarts (avoids re-downloading ~25GB)
29
+ hf_cache = modal.Volume.from_name("cook-with-me-hf-cache", create_if_missing=True)
30
+ hf_secret = modal.Secret.from_name("huggingface-secret")
31
+
32
+ image = (
33
+ modal.Image.debian_slim(python_version="3.12")
34
+ .pip_install(
35
+ "torch==2.4.0",
36
+ "torchvision==0.19.0",
37
+ "transformers>=5.0",
38
+ "accelerate",
39
+ "safetensors",
40
+ "sentencepiece",
41
+ "Pillow",
42
+ "av",
43
+ "pydantic>=2",
44
+ "gradio==6.15.2",
45
+ "huggingface_hub>=1.17",
46
+ "modal",
47
+ )
48
+ .env({
49
+ "COOK_WITH_ME_CACHE": "/cache/cook",
50
+ # Use the fine-tuned planner pushed by scripts/train_planner.py
51
+ "COOK_WITH_ME_PLANNER_FT_REPO": "eldinosaur/cook-with-me-planner-8b",
52
+ })
53
+ .add_local_dir(
54
+ str(LOCAL_ROOT),
55
+ REMOTE_ROOT,
56
+ ignore=[
57
+ "data/*", ".git/*", "**/__pycache__", "**/*.pyc",
58
+ "assets/*", ".venv/*", "venv/*",
59
+ ],
60
+ )
61
+ )
62
+
63
+
64
@app.function(
    image=image,
    gpu="L40S",
    secrets=[hf_secret],
    volumes={"/cache": hf_cache},
    timeout=3600,
    scaledown_window=300,  # stay warm 5 min after last request
    max_containers=1,
)
@modal.concurrent(max_inputs=20)
@modal.asgi_app()
def serve():
    """Build the Gradio UI and return it mounted on a FastAPI app.

    Before importing the project it points the Hugging Face cache at the
    mounted volume, installs a stand-in ``spaces`` module, and puts the
    mounted project root on ``sys.path``.

    Returns:
        The ASGI application serving the Gradio Blocks at ``/``.
    """
    import os
    import sys
    import types

    # --- env: cache model downloads on the volume, before any HF import ---
    os.environ["HF_HOME"] = "/cache/hf"
    os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "0")

    # --- mock `spaces` so @spaces.GPU becomes a no-op (we're already on GPU) ---
    def _gpu_passthrough(*args, **kwargs):
        # Bare form `@spaces.GPU`: called with the function itself, which
        # must come back unchanged (not be replaced by a decorator).
        if len(args) == 1 and not kwargs and callable(args[0]):
            return args[0]
        # Parametrized form `@spaces.GPU(duration=...)`: return a decorator.
        return lambda fn: fn

    spaces_mock = types.ModuleType("spaces")
    spaces_mock.GPU = _gpu_passthrough
    sys.modules["spaces"] = spaces_mock

    # --- make the mounted project importable ---
    sys.path.insert(0, REMOTE_ROOT)

    import gradio as gr
    from fastapi import FastAPI

    # Importing app triggers the vision model load (module-level singleton).
    from app import build_ui

    demo = build_ui()
    demo.queue(max_size=20)

    fastapi_app = FastAPI()
    return gr.mount_gradio_app(app=fastapi_app, blocks=demo, path="/")
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ffmpeg
2
+ libsndfile1
requirements.txt CHANGED
@@ -1,10 +1,7 @@
1
- # --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
2
- # llama-cpp-python
3
  gradio==6.15.2
4
  huggingface_hub>=1.17
5
 
6
-
7
- # --- Librerías añadidas y desbloqueadas para MiniCPM-V-4.6 ---
8
  torch
9
  torchvision
10
  spaces
@@ -12,4 +9,7 @@ Pillow
12
  transformers>=4.45
13
  accelerate
14
  safetensors
15
- av
 
 
 
 
 
 
1
  gradio==6.15.2
2
  huggingface_hub>=1.17
3
 
4
+ # Vision model
 
5
  torch
6
  torchvision
7
  spaces
 
9
  transformers>=4.45
10
  accelerate
11
  safetensors
12
+ av
13
+
14
+ # Pipeline & data
15
+ pydantic>=2
scripts/build_recipe_dataset.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Build the SFT dataset for the MiniCPM4.1-8B recipe planner.
2
+
3
+ Reads the Kaggle "better-recipes-for-a-better-life" dataset and produces
4
+ supervised fine-tuning pairs for BOTH planner tasks, matching the exact
5
+ prompt formats the app uses (src/prompts/planner_propose.txt and
6
+ planner_recipe.txt):
7
+
8
+ 1. propose : ingredients -> {"options": [{name, why} x3]}
9
+ 2. recipe : dish + ingredients -> {"name", "cuisine", "servings",
10
+ "total_time_minutes", "final_dish_visual", "steps":[...]}
11
+
12
+ Run locally (once) before fine-tuning:
13
+ python scripts/build_recipe_dataset.py
14
+
15
+ Requires:
16
+ pip install kagglehub pandas pyarrow datasets huggingface_hub tqdm
17
+ ~/.kaggle/kaggle.json with your credentials
18
+ """
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import random
23
+ import re
24
+ import sys
25
+ from pathlib import Path
26
+
27
+ ROOT = Path(__file__).resolve().parent.parent
28
+ sys.path.insert(0, str(ROOT))
29
+
30
+ import pandas as pd
31
+ from tqdm import tqdm
32
+
33
+ from src import config
34
+
35
+ random.seed(42)
36
+
37
+ HF_DATASET_REPO = "eldinosaur/cook-with-me-recipes-sft"
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # 1. Download (use ONLY recipes.csv — test_recipes.csv has a different schema
41
+ # whose capitalized columns shadowed the real data in the old version)
42
+ # ---------------------------------------------------------------------------
43
+
44
+ print("Pulling Kaggle dataset…")
45
+ import kagglehub
46
+
47
+ raw_path = Path(kagglehub.dataset_download(config.KAGGLE_DATASET))
48
+ main_csv = raw_path / "recipes.csv"
49
+ print(f"Reading {main_csv}")
50
+
51
+ # cp1252 decodes the fraction/symbol bytes that show up as � under utf-8
52
+ try:
53
+ raw_df = pd.read_csv(main_csv, encoding="cp1252", on_bad_lines="skip")
54
+ except Exception:
55
+ raw_df = pd.read_csv(main_csv, encoding="utf-8", on_bad_lines="skip")
56
+
57
+ print(f"Rows: {len(raw_df)} columns: {list(raw_df.columns)}")
58
+
59
+
60
+ # ---------------------------------------------------------------------------
61
+ # 2. Cleaning helpers
62
+ # ---------------------------------------------------------------------------
63
+
64
+ _UNIT = (
65
+ r"(cups?|tablespoons?|tbsps?|teaspoons?|tsps?|pounds?|lbs?|ounces?|ozs?|"
66
+ r"grams?|kgs?|mls?|liters?|pinch(?:es)?|dash(?:es)?|cloves?|cans?|"
67
+ r"packages?|pkgs?|sheets?|slices?|sticks?|quarts?|pints?|jars?|bunch(?:es)?|"
68
+ r"heads?|stalks?|sprigs?|pieces?|fillets?)"
69
+ )
70
+ _PREP_WORDS = {
71
+ "peeled", "chopped", "diced", "sliced", "minced", "cored", "thawed",
72
+ "drained", "rinsed", "softened", "melted", "beaten", "divided", "cubed",
73
+ "to taste", "optional", "or more", "plus more", "for garnish", "for serving",
74
+ "lightly beaten", "room temperature", "at room temperature", "finely chopped",
75
+ "thinly sliced", "cut into", "more", "and", "or other", "such as",
76
+ }
77
+
78
+
79
+ def _clean_text(val: str) -> str:
80
+ if not isinstance(val, str):
81
+ return ""
82
+ # drop any remaining replacement chars and collapse whitespace
83
+ val = val.replace("�", " ")
84
+ return re.sub(r"[ \t]+", " ", val).strip()
85
+
86
+
87
def _simplify_ingredient(raw: str) -> str:
    """Reduce one raw ingredient phrase to a bare, lowercase ingredient name."""
    # Parentheticals go first so their contents never reach the later passes.
    text = _clean_text(re.sub(r"\([^)]*\)", "", raw)).lower()
    # Strip, in this order: leading quantities, a leading unit word, a leading article.
    for prefix_pattern in (
        r"^[\d\s./¼½¾⅓⅔⅛+-]+",
        rf"^{_UNIT}\b\.?\s*",
        r"^(of|the|a|an)\s+",
    ):
        text = re.sub(prefix_pattern, "", text)
    text = text.split(",")[0]  # drop trailing prep clause
    text = re.sub(r"[^a-z\s-]", "", text)  # keep letters, whitespace and hyphens
    return re.sub(r"\s+", " ", text).strip()
97
+
98
+
99
def _ingredient_list(raw: str) -> list[str]:
    """Split a comma-separated ingredient string into unique simplified names.

    Names shorter than 3 characters, longer than 4 words, listed in
    ``_PREP_WORDS`` or already seen are dropped; first-seen order is kept.
    """
    if not isinstance(raw, str):
        return []
    names: list[str] = []
    taken: set[str] = set()
    for chunk in raw.split(","):
        candidate = _simplify_ingredient(chunk)
        too_short = len(candidate) < 3  # also covers the empty string
        too_wordy = len(candidate.split()) > 4
        if too_short or too_wordy or candidate in _PREP_WORDS or candidate in taken:
            continue
        taken.add(candidate)
        names.append(candidate)
    return names
112
+
113
+
114
def _steps_from_directions(raw: str) -> list[str]:
    """Turn a free-text directions field into a list of step strings.

    Splits on newlines when there are at least two non-empty lines, otherwise
    on sentence boundaries. Fragments under 25 characters are appended to the
    previous step, and any resulting step of 15 characters or fewer is dropped.
    """
    if not isinstance(raw, str):
        return []
    text = _clean_text(raw.replace("\r", "\n"))
    chunks = [line.strip() for line in text.split("\n") if line.strip()]
    if len(chunks) < 2:
        # No usable line breaks: split after ., ! or ? when a capital letter follows.
        sentences = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
        chunks = [sentence.strip() for sentence in sentences if sentence.strip()]
    merged: list[str] = []
    for chunk in chunks:
        if merged and len(chunk) < 25:
            merged[-1] = merged[-1] + " " + chunk
        else:
            merged.append(chunk)
    return [step for step in merged if len(step) > 15]
130
+
131
+
132
+ def _minutes(row) -> int:
133
+ for col in ("total_time", "cook_time", "prep_time"):
134
+ v = row.get(col)
135
+ if isinstance(v, str):
136
+ h = re.search(r"(\d+)\s*hr", v)
137
+ m = re.search(r"(\d+)\s*min", v)
138
+ total = (int(h.group(1)) * 60 if h else 0) + (int(m.group(1)) if m else 0)
139
+ if total:
140
+ return total
141
+ return 0
142
+
143
+
144
+ def _cuisine(row) -> str:
145
+ cp = row.get("cuisine_path")
146
+ if isinstance(cp, str):
147
+ segs = [s for s in cp.split("/") if s]
148
+ if segs:
149
+ return segs[0].replace("-", " ").strip().title()
150
+ return "International"
151
+
152
+
153
+ def _distribute(total: int, n: int) -> list[int]:
154
+ if n <= 0:
155
+ return []
156
+ if total <= 0:
157
+ total = n * 6
158
+ base = max(2, total // n)
159
+ durs = [base] * n
160
+ durs[-1] = max(2, total - base * (n - 1))
161
+ return durs
162
+
163
+
164
+ # ---------------------------------------------------------------------------
165
+ # 3. Normalize into clean recipe records
166
+ # ---------------------------------------------------------------------------
167
+
168
recipes: list[dict] = []
for _, r in tqdm(raw_df.iterrows(), total=len(raw_df), desc="Normalizing"):
    name = _clean_text(r.get("recipe_name", ""))
    ings = _ingredient_list(r.get("ingredients", ""))
    steps = _steps_from_directions(r.get("directions", ""))
    # Keep only rows with a name, at least 3 ingredients and at least 2 steps.
    if not name or len(ings) < 3 or len(steps) < 2:
        continue
    steps = steps[:7]  # cap long recipes; short 2-3 step recipes are kept as they are
    minutes = _minutes(r) or len(steps) * 6
    try:
        servings = int(float(str(r.get("servings", "2")).split()[0]))
    except (ValueError, IndexError, OverflowError):
        # blank cell -> IndexError, non-numeric / NaN -> ValueError, inf -> OverflowError
        servings = 2
    servings = min(max(servings, 1), 12)
    recipes.append({
        "name": name,
        "ingredients": ings[:14],
        "steps": steps,
        "cuisine": _cuisine(r),
        "minutes": int(minutes),
        "servings": servings,
    })
192
+
193
+ print(f"\nClean recipes: {len(recipes)}")
194
+
195
+ config.DATA_DIR.mkdir(parents=True, exist_ok=True)
196
+ pd.DataFrame(recipes).to_parquet(config.RECIPES_PARQUET, index=False)
197
+ print(f"Saved -> {config.RECIPES_PARQUET}")
198
+
199
+
200
+ # ---------------------------------------------------------------------------
201
+ # 4. Build SFT pairs matching the app's exact prompt formats
202
+ # ---------------------------------------------------------------------------
203
+
204
+ PROPOSE_TMPL = (config.PROMPTS_DIR / "planner_propose.txt").read_text(encoding="utf-8")
205
+ RECIPE_TMPL = (config.PROMPTS_DIR / "planner_recipe.txt").read_text(encoding="utf-8")
206
+
207
+ _WHY = [
208
+ "Uses your {a} and {b} for a quick, satisfying result.",
209
+ "A fresh way to combine {a} with {b}.",
210
+ "Turns {a} and {b} into a comforting classic.",
211
+ "Light and flavorful, built around {a} and {b}.",
212
+ "Makes the most of {a}, {b} and a few pantry staples.",
213
+ ]
214
+
215
+
216
def _recipe_json(rec: dict) -> str:
    """Serialize a normalized recipe record as the assistant's recipe-task JSON."""
    durations = _distribute(rec["minutes"], len(rec["steps"]))
    step_objs = []
    for number, (text, mins) in enumerate(zip(rec["steps"], durations), start=1):
        step_objs.append(
            {"n": number, "instruction": text, "duration": f"{mins} min", "tip": None}
        )
    dish = rec["name"]
    payload = {
        "name": dish,
        "cuisine": rec["cuisine"],
        "servings": rec["servings"],
        "total_time_minutes": rec["minutes"],
        "final_dish_visual": f"A beautifully plated {dish.lower()}, ready to serve.",
        "steps": step_objs,
    }
    return json.dumps(payload, ensure_ascii=False)
231
+
232
+
233
def _propose_json(rec: dict, others: list[dict]) -> str:
    """Serialize the assistant's propose-task JSON: *rec* first, then each of *others*.

    Every option's "why" is a randomly chosen ``_WHY`` template filled with
    that recipe's first two ingredients; missing ingredients fall back to the
    main recipe's values (or generic wording for the main recipe itself).
    """
    main_ings = rec["ingredients"]
    a = main_ings[0] if main_ings else "your ingredients"
    b = main_ings[1] if len(main_ings) > 1 else "pantry staples"
    options = [{"name": rec["name"], "why": random.choice(_WHY).format(a=a, b=b)}]
    for alt in others:
        alt_ings = alt["ingredients"]
        first = alt_ings[0] if alt_ings else a
        second = alt_ings[1] if len(alt_ings) > 1 else b
        reason = random.choice(_WHY).format(a=first, b=second)
        options.append({"name": alt["name"], "why": reason})
    return json.dumps({"options": options}, ensure_ascii=False)
242
+
243
+
244
sft_path = config.DATA_DIR / "recipes_sft.jsonl"
n_recipe = n_propose = 0
# Write one JSON object per line. Each object is a two-message chat: the user
# prompt rendered from the app's template and the assistant's JSON answer.
# Every recipe contributes one "recipe" pair followed by one "propose" pair.
with open(sft_path, "w", encoding="utf-8") as f:
    for idx, rec in enumerate(tqdm(recipes, desc="Building SFT")):
        ing_str = ", ".join(rec["ingredients"])

        # --- recipe task ---
        user_recipe = RECIPE_TMPL.replace("{dish_name}", rec["name"]).replace("{ingredients}", ing_str)
        f.write(json.dumps({"messages": [
            {"role": "user", "content": user_recipe},
            {"role": "assistant", "content": _recipe_json(rec)},
        ]}, ensure_ascii=False) + "\n")
        n_recipe += 1

        # --- propose task (use two other recipes as alternative options) ---
        # Alternatives are picked at fixed offsets (+7 and +53), wrapping around the list.
        others = [recipes[(idx + 7) % len(recipes)], recipes[(idx + 53) % len(recipes)]]
        user_propose = PROPOSE_TMPL.replace("{ingredients}", ing_str)
        f.write(json.dumps({"messages": [
            {"role": "user", "content": user_propose},
            {"role": "assistant", "content": _propose_json(rec, others)},
        ]}, ensure_ascii=False) + "\n")
        n_propose += 1

print(f"\nSFT pairs: {n_recipe} recipe + {n_propose} propose = {n_recipe + n_propose} -> {sft_path}")
268
+
269
+
270
+ # ---------------------------------------------------------------------------
271
+ # 5. Push to HF Hub
272
+ # ---------------------------------------------------------------------------
273
+
274
+ if HF_DATASET_REPO:
275
+ from datasets import load_dataset
276
+
277
+ ds = load_dataset("json", data_files=str(sft_path), split="train")
278
+ ds.push_to_hub(HF_DATASET_REPO)
279
+ print(f"Pushed {len(ds)} rows to {HF_DATASET_REPO}")
280
+
281
+ print("\nDone.")
scripts/diag_planner.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Diagnose why the fine-tuned planner produces empty generations.
2
+
3
+ modal run scripts/diag_planner.py
4
+ """
5
+ import modal
6
+
7
+ app = modal.App("cook-with-me-diag")
8
+
9
+ image = (
10
+ modal.Image.debian_slim(python_version="3.12")
11
+ .pip_install(
12
+ "torch==2.4.0",
13
+ "transformers>=4.54,<5.0", # window with BOTH CacheLayerMixin and is_torch_fx_available
14
+ "huggingface_hub>=0.26,<1.0",
15
+ "accelerate",
16
+ "sentencepiece",
17
+ )
18
+ )
19
+ hf_secret = modal.Secret.from_name("huggingface-secret")
20
+
21
+ MODEL_ID = "eldinosaur/cook-with-me-planner-8b" # fine-tuned model under transformers 4.x
22
+
23
+
24
@app.function(image=image, gpu="L4", secrets=[hf_secret], timeout=900)
def diag():
    """Load the fine-tuned planner and run one greedy generation, printing diagnostics.

    The tokenizer comes from the base model repo and the weights from
    ``MODEL_ID``. Any exception raised while generating is printed with its
    traceback instead of propagating.
    """
    import torch
    import transformers
    print("transformers version:", transformers.__version__)

    from transformers import AutoModelForCausalLM, AutoTokenizer

    print("Loading tokenizer (from base) + model (from FT)...")
    tok = AutoTokenizer.from_pretrained("openbmb/MiniCPM4.1-8B", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="cuda"
    ).eval()
    print("has generate:", hasattr(model, "generate"))
    print("class mro:", [c.__name__ for c in type(model).__mro__])

    prompt = (
        "You are a chef. Given ingredients: tomato, onion, garlic, pasta, olive oil.\n"
        'Return ONLY JSON: {"options": [{"name": "...", "why": "..."}, ...]} with 3 dish ideas.'
    )
    messages = [{"role": "user", "content": prompt}]

    # Mirror the fixed planner.py path
    try:
        enc = tok.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=True,
            return_tensors="pt", return_dict=True,
        )
        input_ids = enc["input_ids"].to("cuda")
        input_len = input_ids.shape[1]
        gen_inputs = {"input_ids": input_ids}
        # Forward the attention mask only when the tokenizer produced one.
        if enc.get("attention_mask") is not None:
            gen_inputs["attention_mask"] = enc["attention_mask"].to("cuda")
        print("input length:", input_len)
        with torch.no_grad():
            out = model.generate(**gen_inputs, max_new_tokens=400, do_sample=False)
        # Decode only the newly generated tokens (everything after the prompt).
        text = tok.decode(out[0][input_len:], skip_special_tokens=True)
        print("=== GENERATION OK (transformers 4.x, cache on) ===")
        print("OUTPUT:", repr(text[:1000]))
    except Exception as e:
        import traceback
        print("=== GENERATION FAILED ===")
        print("Exception type:", type(e).__name__)
        print("Exception repr:", repr(e))
        traceback.print_exc()
69
+
70
+
71
@app.local_entrypoint()
def main():
    """Local entrypoint: invoke ``diag`` remotely."""
    diag.remote()
scripts/train_planner.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Fine-tune MiniCPM4.1-8B on the recipe SFT dataset via Modal (A10G GPU).
2
+
3
+ Usage:
4
+ modal run scripts/train_planner.py
5
+
6
+ After training, the adapter is merged and the full model is pushed to HF Hub
7
+ as <HF_USERNAME>/cook-with-me-planner-8b
8
+
9
+ Set HF_USERNAME below (or export HF_TOKEN env var before running).
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import modal
14
+
15
+ # ---------------------------------------------------------------------------
16
+ # Config — change these two values
17
+ # ---------------------------------------------------------------------------
18
+ HF_USERNAME = "eldinosaur"
19
+ SFT_DATASET_REPO = f"{HF_USERNAME}/cook-with-me-recipes-sft"
20
+ OUTPUT_REPO = f"{HF_USERNAME}/cook-with-me-planner-8b"
21
+ BASE_MODEL = "openbmb/MiniCPM4.1-8B"
22
+ # ---------------------------------------------------------------------------
23
+
24
+ app = modal.App("cook-with-me-train")
25
+
26
+ volume = modal.Volume.from_name("cook-with-me-train-vol", create_if_missing=True)
27
+
28
+ train_image = (
29
+ modal.Image.debian_slim(python_version="3.12")
30
+ .pip_install(
31
+ "torch==2.4.0",
32
+ "transformers>=5.0",
33
+ "peft>=0.12",
34
+ "trl>=0.10",
35
+ "accelerate",
36
+ "datasets",
37
+ "huggingface_hub>=1.17",
38
+ "bitsandbytes",
39
+ "sentencepiece",
40
+ "safetensors",
41
+ )
42
+ )
43
+
44
+ hf_secret = modal.Secret.from_name("huggingface-secret")
45
+
46
+
47
@app.function(
    image=train_image,
    gpu="A10G",
    timeout=60 * 60 * 3,  # 3-hour hard cap
    secrets=[hf_secret],
    volumes={"/vol": volume},
)
def train():
    """LoRA-fine-tune BASE_MODEL on SFT_DATASET_REPO, then merge and push to OUTPUT_REPO.

    Steps: load tokenizer and model, wrap the model with a LoRA adapter, render
    each chat example to text with the tokenizer's chat template, train with
    TRL's SFTTrainer, reload the base model on CPU, merge the saved adapter
    into it, and push the merged model plus tokenizer to the Hub.
    """
    import os

    # Set the cache location BEFORE importing any Hugging Face library:
    # huggingface_hub and datasets resolve their cache directories from
    # HF_HOME at import time, so setting it afterwards has no effect and
    # downloads would not land on the mounted volume.
    os.environ.setdefault("HF_HOME", "/vol/hf_cache")

    import torch
    from datasets import load_dataset
    from peft import LoraConfig, PeftModel, TaskType, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from trl import SFTConfig, SFTTrainer

    # MiniCPM4.1-8B custom code references is_torch_fx_available which was
    # removed in transformers 5.x. Patch it back before loading the model.
    import transformers.utils.import_utils as _iutils
    if not hasattr(_iutils, "is_torch_fx_available"):
        def _is_torch_fx_available():
            try:
                import torch.fx  # noqa: F401
                return True
            except ImportError:
                return False
        _iutils.is_torch_fx_available = _is_torch_fx_available

    # ---- Load tokenizer & model ----
    print(f"Loading {BASE_MODEL}…")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="cuda",
    )

    # ---- LoRA config ----
    lora_cfg = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules="all-linear",
        bias="none",
    )
    model = get_peft_model(model, lora_cfg)
    model.print_trainable_parameters()

    # ---- Dataset ----
    print(f"Loading dataset {SFT_DATASET_REPO}…")
    ds = load_dataset(SFT_DATASET_REPO, split="train")

    def _format(example):
        # Render the full conversation (user + assistant) to one training string.
        return {"text": tokenizer.apply_chat_template(
            example["messages"], tokenize=False, add_generation_prompt=False
        )}

    ds = ds.map(_format, remove_columns=ds.column_names)

    # ---- Training ----
    output_dir = "/vol/planner_out"
    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=ds,
        args=SFTConfig(
            output_dir=output_dir,
            num_train_epochs=3,  # 2046 examples — 3 epochs converges without overfitting
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            learning_rate=2e-4,
            lr_scheduler_type="cosine",
            warmup_ratio=0.05,
            bf16=True,
            logging_steps=20,
            save_steps=200,
            max_length=2048,
            dataset_text_field="text",
        ),
    )
    trainer.train()
    trainer.save_model(output_dir)

    # ---- Merge LoRA + push ----
    print("Merging LoRA adapter…")

    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="cpu"
    )
    merged = PeftModel.from_pretrained(base, output_dir)
    merged = merged.merge_and_unload()

    # MiniCPM custom code declares `_tied_weights_keys` as a list, but
    # transformers 5.x's save path calls `.keys()` on it. Patch the walker
    # to tolerate both list and dict formats before saving/pushing.
    import transformers.modeling_utils as _mu

    def _safe_get_tied_weight_keys(model, *args, **kwargs):
        keys = []
        for module_name, module in model.named_modules():
            tied = getattr(module, "_tied_weights_keys", None)
            if not tied:
                continue
            names = tied.keys() if isinstance(tied, dict) else tied
            for k in names:
                keys.append(f"{module_name}.{k}" if module_name else k)
        return keys

    _mu._get_tied_weight_keys = _safe_get_tied_weight_keys

    print(f"Pushing merged model to {OUTPUT_REPO}…")
    merged.push_to_hub(OUTPUT_REPO, private=False)
    tokenizer.push_to_hub(OUTPUT_REPO, private=False)
    print("Done.")
168
+
169
+
170
@app.local_entrypoint()
def main():
    """Local entrypoint: invoke ``train`` remotely."""
    train.remote()
src/agents/progress_validator.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Progress validation agent: compare cooking photo against target step."""
2
+ from __future__ import annotations
3
+
4
+ import logging
5
+ from typing import Optional
6
+
7
+ import spaces
8
+ import torch
9
+ from PIL import Image
10
+
11
+ from src import config
12
+ from src.agents.mise_en_place import model, processor
13
+ from src.agents.recipe_planner import _extract_json
14
+
15
+ log = logging.getLogger(__name__)
16
+
17
+ _VALIDATOR_PROMPT = (config.PROMPTS_DIR / "validator_prompt.txt").read_text(encoding="utf-8")
18
+
19
+
20
@spaces.GPU(duration=45)
def validate(image: Optional[Image.Image], step_instruction: str) -> dict:
    """Compare a cooking-progress photo to the target step description.

    Returns a dict with keys: verdict ('go'|'wait'|'fix'), feedback, tip.
    A missing image, an unrecognised verdict or any failure during inference
    yields a 'wait' verdict. The verdict is matched case-insensitively, and
    null/missing feedback or tip values come back as empty strings.
    """
    if image is None:
        return {
            "verdict": "wait",
            "feedback": "No image provided.",
            "tip": "Upload a photo of your cooking progress to get feedback.",
        }
    try:
        img = image.convert("RGB")
        prompt = _VALIDATOR_PROMPT.replace("{step_instruction}", step_instruction)

        messages = [{"role": "user", "content": [
            {"type": "image", "image": img},
            {"type": "text", "text": prompt},
        ]}]

        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
            enable_thinking=False,
            processor_kwargs={"downsample_mode": "16x", "max_slice_nums": 9, "use_image_id": True},
        )
        # Move tensors to the model's device; floating-point tensors are cast to bfloat16.
        device = model.device
        inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor) and torch.is_floating_point(v):
                inputs[k] = v.to(dtype=torch.bfloat16)

        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=256,
                do_sample=False,
                downsample_mode="16x",
            )

        # Drop the prompt tokens so only the newly generated text is decoded.
        trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], generated_ids)]
        raw = processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        log.info("validate raw: %s", raw[:400])

        data = _extract_json(raw)
        # Normalise before matching so "Go" / " fix " are accepted; a JSON null
        # falls back to "wait" instead of becoming the string "None".
        verdict = str(data.get("verdict") or "wait").strip().lower()
        if verdict not in ("go", "wait", "fix"):
            verdict = "wait"

        return {
            "verdict": verdict,
            # `or ""` keeps a JSON null from being shown to the user as "None"
            "feedback": str(data.get("feedback") or ""),
            "tip": str(data.get("tip") or ""),
        }
    except Exception as exc:
        log.warning("validate failed: %s", exc)
        return {
            "verdict": "wait",
            "feedback": "Could not analyse the photo.",
            "tip": "Make sure the image is well-lit and in focus.",
        }
src/agents/recipe_planner.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Recipe planner agent: propose dishes + generate step-by-step recipe.
2
+
3
+ Uses openbmb/MiniCPM4.1-8B (text-only) as the primary planner.
4
+ Falls back to the shared vision model (MiniCPM-V-4.6) when the planner
5
+ model is unavailable (e.g. insufficient RAM on the Space).
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import logging
11
+ import re
12
+
13
+ import spaces
14
+ import torch
15
+
16
+ from src import config
17
+ from src.pipeline import DishOption, Recipe, RecipeStep
18
+
19
+ log = logging.getLogger(__name__)
20
+
21
+ _PROPOSE_PROMPT = (config.PROMPTS_DIR / "planner_propose.txt").read_text(encoding="utf-8")
22
+ _RECIPE_PROMPT = (config.PROMPTS_DIR / "planner_recipe.txt").read_text(encoding="utf-8")
23
+
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # JSON extraction helpers
27
+ # ---------------------------------------------------------------------------
28
+
29
+ def _extract_json(text: str) -> dict:
30
+ """Robustly extract the first JSON object from raw model output."""
31
+ text = text.strip()
32
+ try:
33
+ return json.loads(text)
34
+ except Exception:
35
+ pass
36
+ # Markdown code-block
37
+ m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
38
+ if m:
39
+ try:
40
+ return json.loads(m.group(1))
41
+ except Exception:
42
+ pass
43
+ # First {...} block with minor auto-fixes
44
+ m = re.search(r"\{.*\}", text, re.DOTALL)
45
+ if m:
46
+ candidate = m.group(0)
47
+ candidate = candidate.replace("'", '"')
48
+ candidate = re.sub(r",\s*([}\]])", r"\1", candidate)
49
+ try:
50
+ return json.loads(candidate)
51
+ except Exception:
52
+ pass
53
+ log.warning("Could not extract JSON from output (first 300 chars): %.300s", text)
54
+ return {}
55
+
56
+
57
+ # ---------------------------------------------------------------------------
58
+ # Inference dispatcher
59
+ # ---------------------------------------------------------------------------
60
+
61
def _infer(prompt: str, max_new_tokens: int = 1024, temperature: float = 0.0) -> str:
    """Generate text for *prompt*.

    The planner class looked up on Modal is tried first; if that call raises
    or returns empty/whitespace-only text, the local vision model is used with
    a text-only message instead. The fallback decodes greedily and does not
    use *temperature*.
    """
    try:
        import modal
        planner_cls = modal.Cls.from_name(config.PLANNER_MODAL_APP, config.PLANNER_MODAL_CLS)
        remote_text = planner_cls().infer.remote(
            prompt, max_new_tokens=max_new_tokens, temperature=temperature
        )
        if remote_text and remote_text.strip():
            return remote_text
        log.warning("Planner endpoint returned empty — falling back to vision model.")
    except Exception as exc:
        log.warning("Planner endpoint call failed: %s — falling back to vision model.", exc)

    # Fallback: use the vision model in text-only mode
    log.warning("Using vision model as text fallback.")
    from src.agents.mise_en_place import model as vis_model, processor as vis_proc

    chat = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
    encoded = vis_proc.apply_chat_template(
        chat,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        enable_thinking=False,
    )

    # Tensors go to the model's device; floating-point ones are also cast to bfloat16.
    target = vis_model.device
    inputs = {}
    for key, value in encoded.items():
        if isinstance(value, torch.Tensor):
            value = value.to(target)
            if torch.is_floating_point(value):
                value = value.to(dtype=torch.bfloat16)
        inputs[key] = value

    with torch.no_grad():
        generated_ids = vis_model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # Keep only the tokens produced after each prompt.
    completions = []
    for prompt_ids, full_ids in zip(inputs["input_ids"], generated_ids):
        completions.append(full_ids[len(prompt_ids):])
    decoded = vis_proc.batch_decode(
        completions, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0]
102
+
103
+
104
+ # ---------------------------------------------------------------------------
105
+ # Public agent functions
106
+ # ---------------------------------------------------------------------------
107
+
108
@spaces.GPU(duration=90)
def propose_dishes(ingredients: list[str]) -> list[DishOption]:
    """Return up to three dish proposals for the detected ingredients.

    A single generic stir-fry option is returned when the model output has no
    usable options or anything in the pipeline raises.
    """
    try:
        prompt = _PROPOSE_PROMPT.replace("{ingredients}", ", ".join(ingredients))
        raw = _infer(prompt, max_new_tokens=512, temperature=0.7)
        log.info("propose_dishes raw: %.500s", raw)
        parsed = _extract_json(raw)

        proposals = []
        for option in parsed.get("options", [])[:3]:
            if not option.get("name"):
                continue
            proposals.append(
                DishOption(name=str(option.get("name", "Dish")), why=str(option.get("why", "")))
            )
        if proposals:
            return proposals
        return [DishOption(name="Simple Stir-fry", why="Quick and adaptable to most ingredients.")]
    except Exception as exc:
        log.warning("propose_dishes failed: %s", exc)
        return [DishOption(name="Simple Stir-fry", why="Quick and adaptable to most ingredients.")]
125
+
126
+
127
def _coerce_int(value, default: int) -> int:
    """Best-effort int conversion for a model-produced field; *default* on failure."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        pass
    if value is None:
        return default
    # e.g. "4 servings" or "30.0" -> leading integer found in the text
    digits = re.search(r"-?\d+", str(value))
    return int(digits.group(0)) if digits else default


@spaces.GPU(duration=120)
def plan_recipe(dish_name: str, ingredients: list[str]) -> Recipe:
    """Generate a full step-by-step recipe for the chosen dish.

    The model's JSON is parsed leniently: steps that are not objects or have
    no instruction are skipped, numeric fields that cannot be read fall back
    to defaults (servings 2, total time 30, step number = position), and
    null text fields fall back to their defaults instead of becoming "None".
    If nothing usable is produced, or anything raises, a single generic
    step is returned for *dish_name*.
    """
    try:
        prompt = (
            _RECIPE_PROMPT
            .replace("{dish_name}", dish_name)
            .replace("{ingredients}", ", ".join(ingredients))
        )
        raw = _infer(prompt, max_new_tokens=1024, temperature=0.0)
        log.info("plan_recipe raw: %.800s", raw)
        data = _extract_json(raw)
        if not isinstance(data, dict):
            data = {}

        raw_steps = data.get("steps")
        if not isinstance(raw_steps, list):
            raw_steps = []

        steps = []
        for i, s in enumerate(raw_steps, start=1):
            # One malformed entry should not throw away the rest of the recipe.
            if not isinstance(s, dict) or not s.get("instruction"):
                continue
            tip_val = s.get("tip")
            steps.append(RecipeStep(
                n=_coerce_int(s.get("n", i), i),
                instruction=str(s["instruction"]),
                duration=str(s.get("duration") or "5 min"),
                tip=str(tip_val) if tip_val and str(tip_val).lower() not in ("null", "none") else None,
                visual=str(s.get("visual") or ""),
            ))

        return Recipe(
            name=str(data.get("name") or dish_name),
            cuisine=str(data.get("cuisine") or "International"),
            servings=_coerce_int(data.get("servings", 2), 2),
            total_time_minutes=_coerce_int(data.get("total_time_minutes", 30), 30),
            final_dish_visual=str(data.get("final_dish_visual") or ""),
            steps=steps or [RecipeStep(n=1, instruction="Prepare and cook ingredients to taste.", duration="20 min")],
        )
    except Exception as exc:
        log.warning("plan_recipe failed: %s", exc)
        return Recipe(
            name=dish_name,
            steps=[RecipeStep(n=1, instruction="Prepare and cook ingredients to taste.", duration="20 min")],
        )
src/agents/step_illustrator.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Step image generator — delegates to the deployed Modal FLUX.2 endpoint."""
2
+ from __future__ import annotations
3
+
4
+ import base64
5
+ import logging
6
+ from typing import Optional
7
+
8
+ from src import config
9
+ from src.pipeline import Recipe, RecipeStep
10
+
11
+ log = logging.getLogger(__name__)
12
+
13
+
14
+ # ---------------------------------------------------------------------------
15
+ # Helpers
16
+ # ---------------------------------------------------------------------------
17
+
18
+ def _b64(png_bytes: bytes) -> str:
19
+ return base64.b64encode(png_bytes).decode()
20
+
21
+
22
+ def _step_prompt(visual: str, cuisine: str, n: int) -> str:
23
+ desc = visual.strip() or f"cooking step {n}"
24
+ return (
25
+ f"Top-down photo of a kitchen pan or plate showing {desc}. "
26
+ f"{cuisine} home cooking. Warm natural lighting. "
27
+ "Recipe magazine style. Photorealistic. Appetizing."
28
+ )
29
+
30
+
31
+ def _dish_prompt(visual: str, cuisine: str) -> str:
32
+ desc = visual.strip() or "the finished plated dish, garnished and beautifully presented"
33
+ return (
34
+ f"Top-down photo of a {desc} on a rustic wooden table. "
35
+ f"{cuisine} home cooking. Warm natural lighting. "
36
+ "Recipe magazine style. Photorealistic. Appetizing."
37
+ )
38
+
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Modal call
42
+ # ---------------------------------------------------------------------------
43
+
44
def _call_modal(prompt: str, seed: int = 42) -> Optional[bytes]:
    """Render *prompt* through the deployed Modal class; return its result or None.

    Any failure (import, lookup or remote call) is logged as a warning and
    reported as None rather than raised.
    """
    try:
        import modal
        flux_cls = modal.Cls.from_name(config.MODAL_APP_NAME, config.MODAL_CLS_NAME)
        rendered = flux_cls().render_step.remote(prompt, seed=seed)
    except Exception as exc:
        log.warning("Modal FLUX call failed: %s", exc)
        rendered = None
    return rendered
53
+
54
+
55
+ # ---------------------------------------------------------------------------
56
+ # Public function
57
+ # ---------------------------------------------------------------------------
58
+
59
def illustrate_recipe(recipe: Recipe) -> Recipe:
    """Attach generated images to *recipe* (final dish first, then each step).

    The same Recipe object is mutated and returned. An image field is only
    assigned when the Modal call returned data; otherwise it is left as it was.
    The final dish uses seed 0 and each step uses its step number as seed.
    """
    cuisine = recipe.cuisine or "International"

    # Final dish hero image
    hero = _call_modal(_dish_prompt(recipe.final_dish_visual, cuisine), seed=0)
    if hero:
        recipe.final_dish_image_b64 = _b64(hero)
        log.info("Generated final dish image.")

    # Per-step images, one call after another
    for step in recipe.steps:
        rendered = _call_modal(_step_prompt(step.visual, cuisine, step.n), seed=step.n)
        if not rendered:
            continue
        step.image_b64 = _b64(rendered)
        log.info("Generated image for step %d.", step.n)

    return recipe
src/config.py CHANGED
@@ -21,10 +21,21 @@ VISION_REPO = "openbmb/MiniCPM-V-4_6-GGUF"
21
  VISION_MODEL_FILE = "MiniCPM-V-4_6-Q4_K_M.gguf"
22
  VISION_MMPROJ_FILE = "mmproj-model-f16.gguf"
23
 
24
- PLANNER_REPO = "openbmb/MiniCPM-V-4-gguf"
25
- PLANNER_MODEL_FILE = "Model-Q4_K_M.gguf"
 
26
 
27
- FLUX_REPO = "black-forest-labs/FLUX.2-klein-9B"
 
 
 
 
 
 
 
 
 
 
28
  NARRATOR_REPO = "openbmb/VoxCPM2"
29
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
30
 
 
21
  VISION_MODEL_FILE = "MiniCPM-V-4_6-Q4_K_M.gguf"
22
  VISION_MMPROJ_FILE = "mmproj-model-f16.gguf"
23
 
24
# Planner base model. Override with the COOK_WITH_ME_PLANNER_REPO environment
# variable, e.g. to point at a fine-tuned HF repo.
PLANNER_REPO = os.getenv("COOK_WITH_ME_PLANNER_REPO", "openbmb/MiniCPM4.1-8B")
# Empty until a fine-tuned checkpoint exists; set via COOK_WITH_ME_PLANNER_FT_REPO.
PLANNER_FINETUNED_REPO = os.getenv("COOK_WITH_ME_PLANNER_FT_REPO", "")

# Modal app / class names for the FLUX image endpoint
MODAL_APP_NAME = "cook-with-me-flux"
MODAL_CLS_NAME = "FluxKlein"

# The planner runs in its own Modal app: it needs transformers 4.x, which
# conflicts with the vision model's transformers 5.x, so the two cannot share
# a container.
PLANNER_MODAL_APP = "cook-with-me-planner"
PLANNER_MODAL_CLS = "Planner"

# Image model; COOK_WITH_ME_FLUX_REPO overrides the default repo.
FLUX_REPO = os.getenv("COOK_WITH_ME_FLUX_REPO", "black-forest-labs/FLUX.2-klein-9B")
FLUX_FALLBACK_REPO = "black-forest-labs/FLUX.1-schnell"
39
  NARRATOR_REPO = "openbmb/VoxCPM2"
40
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
41
 
src/data/__init__.py ADDED
File without changes
src/data/nutrition.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Per-serving macro estimator — ingredient lookup, no extra model call needed."""
2
+ from __future__ import annotations
3
+
4
+ # (calories kcal, protein g, carbs g, fat g, fiber g) per 100 g
5
+ _MACROS: dict[str, tuple[float, float, float, float, float]] = {
6
+ # proteins
7
+ "chicken": (165, 31, 0, 3.6, 0),
8
+ "beef": (250, 26, 0, 16, 0),
9
+ "pork": (242, 27, 0, 14, 0),
10
+ "fish": (130, 20, 0, 5, 0),
11
+ "salmon": (208, 20, 0, 13, 0),
12
+ "tuna": (130, 29, 0, 0.5, 0),
13
+ "shrimp": (99, 24, 0, 0.3, 0),
14
+ "egg": (155, 13, 1.1, 11, 0),
15
+ "eggs": (155, 13, 1.1, 11, 0),
16
+ "tofu": (76, 8, 1.9, 4.8, 0.3),
17
+ # dairy
18
+ "milk": (61, 3.2, 4.8, 3.3, 0),
19
+ "cheese": (402, 25, 1.3, 33, 0),
20
+ "butter": (717, 0.9, 0.1, 81, 0),
21
+ "yogurt": (59, 3.5, 4.7, 3.3, 0),
22
+ "cream": (340, 2.1, 2.8, 36, 0),
23
+ # starches
24
+ "rice": (130, 2.7, 28, 0.3, 0.4),
25
+ "pasta": (158, 5.8, 31, 0.9, 1.8),
26
+ "bread": (265, 9, 49, 3.2, 2.7),
27
+ "potato": (77, 2, 17, 0.1, 2.2),
28
+ "potatoes": (77, 2, 17, 0.1, 2.2),
29
+ "flour": (364, 10, 76, 1, 2.7),
30
+ "oats": (389, 17, 66, 7, 10.6),
31
+ "quinoa": (120, 4.1, 21, 1.9, 2.8),
32
+ "lentils": (116, 9, 20, 0.4, 7.9),
33
+ "beans": (347, 21, 60, 1.2, 15),
34
+ "chickpeas": (164, 8.9, 27, 2.6, 7.6),
35
+ # vegetables
36
+ "tomato": (18, 0.9, 3.9, 0.2, 1.2),
37
+ "tomatoes": (18, 0.9, 3.9, 0.2, 1.2),
38
+ "onion": (40, 1.1, 9.3, 0.1, 1.7),
39
+ "onions": (40, 1.1, 9.3, 0.1, 1.7),
40
+ "garlic": (149, 6.4, 33, 0.5, 2.1),
41
+ "carrot": (41, 0.9, 10, 0.2, 2.8),
42
+ "carrots": (41, 0.9, 10, 0.2, 2.8),
43
+ "broccoli": (34, 2.8, 7, 0.4, 2.6),
44
+ "spinach": (23, 2.9, 3.6, 0.4, 2.2),
45
+ "pepper": (31, 1, 6, 0.3, 2.1),
46
+ "peppers": (31, 1, 6, 0.3, 2.1),
47
+ "mushroom": (22, 3.1, 3.3, 0.3, 1),
48
+ "mushrooms": (22, 3.1, 3.3, 0.3, 1),
49
+ "zucchini": (17, 1.2, 3.1, 0.3, 1),
50
+ "corn": (86, 3.3, 19, 1.4, 2.7),
51
+ "lettuce": (15, 1.4, 2.9, 0.2, 1.3),
52
+ "cucumber": (16, 0.7, 3.6, 0.1, 0.5),
53
+ "eggplant": (25, 1, 5.9, 0.2, 3),
54
+ "cabbage": (25, 1.3, 5.8, 0.1, 2.5),
55
+ "celery": (16, 0.7, 3, 0.2, 1.6),
56
+ "leek": (61, 1.5, 14, 0.3, 1.8),
57
+ # fruits
58
+ "apple": (52, 0.3, 14, 0.2, 2.4),
59
+ "banana": (89, 1.1, 23, 0.3, 2.6),
60
+ "lemon": (29, 1.1, 9.3, 0.3, 2.8),
61
+ "lime": (30, 0.7, 10.5, 0.2, 2.8),
62
+ "orange": (47, 0.9, 12, 0.1, 2.4),
63
+ # fats & condiments
64
+ "olive oil": (884, 0, 0, 100, 0),
65
+ "oil": (884, 0, 0, 100, 0),
66
+ "soy sauce": (53, 8.1, 4.9, 0.1, 0.8),
67
+ "honey": (304, 0.3, 82, 0, 0.2),
68
+ "sugar": (387, 0, 100, 0, 0),
69
+ "salt": (0, 0, 0, 0, 0),
70
+ "vinegar": (18, 0, 0.9, 0, 0),
71
+ }
72
+
73
+ # Typical portion weight per ingredient (grams)
74
+ _GRAMS: dict[str, int] = {
75
+ "egg": 50, "eggs": 100,
76
+ "butter": 15,
77
+ "olive oil": 14, "oil": 14,
78
+ "soy sauce": 15,
79
+ "salt": 3,
80
+ "garlic": 10,
81
+ "honey": 21,
82
+ "sugar": 12,
83
+ "lemon": 30, "lime": 30,
84
+ }
85
+ _DEFAULT_GRAMS = 80
86
+
87
+
88
+ def compute_nutrition(ingredients: list[str], servings: int = 2) -> dict[str, float]:
89
+ """Return per-serving macro estimates keyed to the NutritionGrid format."""
90
+ cal = prot = carb = fat = fib = 0.0
91
+ for ing in ingredients:
92
+ key = ing.lower().strip()
93
+ row = _MACROS.get(key) or _MACROS.get(key.split()[0]) if key else None
94
+ if row is None:
95
+ continue
96
+ grams = _GRAMS.get(key, _DEFAULT_GRAMS)
97
+ f = grams / 100
98
+ c, p, cb, ft, fb = row
99
+ cal += c * f
100
+ prot += p * f
101
+ carb += cb * f
102
+ fat += ft * f
103
+ fib += fb * f
104
+
105
+ sv = max(servings, 1)
106
+ return {
107
+ "calories": round(cal / sv),
108
+ "protein_g": round(prot / sv, 1),
109
+ "carbs_g": round(carb / sv, 1),
110
+ "fat_g": round(fat / sv, 1),
111
+ "fiber_g": round(fib / sv, 1),
112
+ }
src/models/planner.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MiniCPM4.1-8B text-only planner — lazy singleton."""
2
+ from __future__ import annotations
3
+
4
+ import logging
5
+ import os
6
+ from typing import Any, Optional, Tuple
7
+
8
+ import torch
9
+
10
+ from src import config
11
+
12
+ log = logging.getLogger(__name__)
13
+
14
# Process-wide cache filled by get_planner(); both stay None until a load succeeds.
_model: Any = None
_tokenizer: Any = None


def get_planner() -> Tuple[Optional[Any], Optional[Any]]:
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

    The fine-tuned repo is preferred when configured, otherwise the base
    planner repo is used. Any load failure is logged and reported as
    ``(None, None)``; because nothing is cached in that case, the next call
    attempts the load again.
    """
    global _model, _tokenizer
    if _model is not None:
        return _model, _tokenizer

    # Prefer fine-tuned repo when available
    model_id = config.PLANNER_FINETUNED_REPO or config.PLANNER_REPO
    try:
        # MiniCPM4.1's custom code imports is_torch_fx_available, which was
        # removed in transformers 5.x, so restore it before loading.
        import transformers.utils.import_utils as import_utils

        if not hasattr(import_utils, "is_torch_fx_available"):
            def _is_torch_fx_available():
                try:
                    import torch.fx  # noqa: F401
                except ImportError:
                    return False
                return True

            import_utils.is_torch_fx_available = _is_torch_fx_available

        from transformers import AutoModelForCausalLM, AutoTokenizer

        # When SPACE_ID is set, let the loader place the weights; otherwise
        # pin to the GPU when one is present, else the CPU.
        if os.environ.get("SPACE_ID"):
            device_map = "auto"
        elif torch.cuda.is_available():
            device_map = "cuda"
        else:
            device_map = "cpu"

        log.info("Loading planner model %s (device_map=%s)...", model_id, device_map)
        _tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        loaded = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map=device_map,
        )
        _model = loaded.eval()
        log.info("Planner model ready.")
    except Exception as exc:
        log.error("Could not load planner model '%s': %s", model_id, exc)
        # Never leave a half-initialised pair behind.
        _model = None
        _tokenizer = None

    return _model, _tokenizer
59
+
60
+
61
def infer(prompt: str, max_new_tokens: int = 1024, temperature: float = 0.0) -> str:
    """Generate a completion for a single-turn user prompt with the planner.

    Args:
        prompt: Text sent as the only ``user`` message of the chat.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: ``0`` (or less) selects greedy decoding; a positive value
            enables sampling at that temperature with ``top_p=0.95``.

    Returns:
        The decoded completion without the prompt tokens, or an empty string
        when the model is unavailable or generation raised.
    """
    model, tokenizer = get_planner()
    if model is None or tokenizer is None:
        return ""

    try:
        # With return_dict=True the template call returns a BatchEncoding: a
        # mapping holding input_ids (+ attention_mask). It is neither a plain
        # dict nor a tensor, so read it by key and never via .shape.
        encoded = tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            add_generation_prompt=True,
            tokenize=True,
            return_tensors="pt",
            return_dict=True,
        )
        prompt_ids = encoded["input_ids"].to(model.device)
        prompt_len = prompt_ids.shape[1]

        generate_args: dict = {"input_ids": prompt_ids}
        mask = encoded.get("attention_mask")
        if mask is not None:
            generate_args["attention_mask"] = mask.to(model.device)

        generate_args["max_new_tokens"] = max_new_tokens
        if temperature > 0:
            generate_args.update(do_sample=True, temperature=temperature, top_p=0.95)
        else:
            generate_args["do_sample"] = False

        with torch.no_grad():
            generated = model.generate(**generate_args)

        # The output echoes the prompt; keep only the newly generated tail.
        return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)

    except Exception as exc:
        log.error("Planner inference error: %r", exc, exc_info=True)
        return ""
src/pipeline.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared data models for the Cook-with-Me pipeline."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Optional
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class DishOption(BaseModel):
    """One candidate dish offered to the user."""

    # Dish title.
    name: str
    # Short justification for the suggestion; empty when none was given.
    # Presumably filled from the "why" field of the propose-prompt JSON — confirm against the caller.
    why: str = ""
11
+
12
+
13
class RecipeStep(BaseModel):
    """A single recipe step plus its optional illustration."""

    # Step number (the recipe prompt's example numbers steps from 1).
    n: int = 1
    # What the cook has to do in this step.
    instruction: str
    # Free-form duration text such as "5 min".
    duration: str = "5 min"
    # Optional chef tip; None when the step has none.
    tip: Optional[str] = None
    # Description of how the step looks; empty when not provided.
    visual: str = ""
    # Path to an image file for this step, if any.
    image_path: Optional[str] = None
    image_b64: Optional[str] = None  # base64 PNG from FLUX
21
+
22
+
23
class Recipe(BaseModel):
    """A complete recipe: metadata, ordered steps, nutrition and hero image."""

    # Recipe title.
    name: str
    # Cuisine label.
    cuisine: str = "International"
    # Number of servings the recipe yields.
    servings: int = 2
    # Total preparation + cooking time, in minutes.
    total_time_minutes: int = 30
    # Steps of the recipe; default_factory gives every instance its own list.
    steps: list[RecipeStep] = Field(default_factory=list)
    # Nutrition figures; presumably the per-serving macro dict — TODO confirm against the producer.
    nutrition: dict = Field(default_factory=dict)
    # Description of how the finished dish looks.
    final_dish_visual: str = ""
    # Path to an image file of the finished dish, if any.
    final_dish_image_path: Optional[str] = None
    final_dish_image_b64: Optional[str] = None  # base64 PNG from FLUX
src/prompts/planner_propose.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a creative chef assistant. Given a list of available ingredients, suggest exactly 3 diverse and delicious dishes.
2
+
3
+ Available ingredients: {ingredients}
4
+
5
+ Rules:
6
+ - Each dish must be realistic to make with the listed ingredients
7
+ - Vary the style: aim for different cuisines or preparations
8
+ - Be specific with dish names (e.g., "Garlic Butter Shrimp Pasta" not "Pasta")
9
+
10
+ Respond ONLY with valid JSON and nothing else — no explanation, no markdown fences:
11
+ {"options": [{"name": "Dish Name 1", "why": "One sentence on why this works with the ingredients"}, {"name": "Dish Name 2", "why": "..."}, {"name": "Dish Name 3", "why": "..."}]}
src/prompts/planner_recipe.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a professional chef writing a clear, detailed recipe.
2
+
3
+ Dish to prepare: {dish_name}
4
+ Available ingredients: {ingredients}
5
+
6
+ Create a complete recipe with 4 to 7 steps. Each step must be specific and actionable.
7
+
8
+ Respond ONLY with valid JSON and nothing else — no explanation, no markdown fences:
9
+ {"name": "Full Recipe Title", "cuisine": "Cuisine type", "servings": 2, "total_time_minutes": 30, "final_dish_visual": "One evocative sentence describing how the finished dish looks and smells", "steps": [{"n": 1, "instruction": "Detailed step description.", "duration": "5 min", "tip": "Optional chef tip or null"}, {"n": 2, "instruction": "...", "duration": "3 min", "tip": null}]}
10
+
11
+ Important: tip must be a string or null, never omit it.
src/prompts/validator_prompt.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a supportive cooking coach reviewing a student's progress photo.
2
+
3
+ The step they are working on:
4
+ "{step_instruction}"
5
+
6
+ Look carefully at the photo and decide:
7
+ - "go" → the step is correctly completed, they can move on
8
+ - "wait" → it's progressing but needs more time (undercooked, still mixing, etc.)
9
+ - "fix" → there is a clear mistake that needs correction right now
10
+
11
+ Respond ONLY with valid JSON and nothing else:
12
+ {"verdict": "go", "feedback": "One sentence describing exactly what you see in the photo.", "tip": "One specific, actionable piece of advice for the cook."}
13
+
14
+ verdict must be exactly one of: go, wait, fix.
src/ui/components.py CHANGED
@@ -80,7 +80,7 @@ class TemplatedHTML(gr.HTML):
80
  class RecipeHero(TemplatedHTML):
81
  css_template = """
82
  .cwm-hero {
83
- background: #fffbf0;
84
  border: 1px solid #d8c9ad;
85
  border-radius: 16px;
86
  padding: 32px;
@@ -94,15 +94,15 @@ class RecipeHero(TemplatedHTML):
94
  background: #efe3c8;
95
  }
96
  .cwm-hero h1 {
97
- font-family: 'Lora', serif; font-size: 38px; color: #6b4a2a;
98
  margin: 0 0 8px;
99
  }
100
  .cwm-hero .meta {
101
- color: #8a6a3a; font-size: 14px; letter-spacing: 0.04em;
102
  text-transform: uppercase; margin-bottom: 18px;
103
  }
104
  .cwm-hero .visual {
105
- font-family: 'Lora', serif; font-style: italic; color: #6b4a2a;
106
  font-size: 17px; line-height: 1.55;
107
  }
108
  @media (max-width: 720px) { .cwm-hero { grid-template-columns: 1fr; } }
@@ -115,11 +115,14 @@ class RecipeHero(TemplatedHTML):
115
  servings = state.get("servings") or 0
116
  time = state.get("total_time_minutes") or 0
117
  visual = html.escape(state.get("final_dish_visual") or "")
118
- img = state.get("final_dish_image_path") or ""
119
- img_tag = (
120
- f'<img src="/file={html.escape(img)}" alt="final dish"/>'
121
- if img else '<div class="cwm-hero" style="background:#efe3c8;border-radius:12px;height:320px;"></div>'
122
- )
 
 
 
123
  return f"""
124
  <div class="cwm-hero">
125
  <div>{img_tag}</div>
@@ -186,15 +189,15 @@ class IngredientChips(TemplatedHTML):
186
  class DishOptions(TemplatedHTML):
187
  css_template = """
188
  .cwm-options { display: grid; grid-template-columns: repeat(3, 1fr); gap: 14px; }
189
- .cwm-option {
190
- background: #fffbf0; border: 1px solid #d8c9ad; border-radius: 12px;
191
  padding: 18px; text-align: left;
192
  }
193
- .cwm-option h3 {
194
- font-family: 'Lora', serif; font-size: 19px; color: #6b4a2a;
195
  margin: 0 0 6px;
196
  }
197
- .cwm-option p { color: #7a5a35; font-size: 14px; line-height: 1.45; margin: 0; }
198
  @media (max-width: 720px) { .cwm-options { grid-template-columns: 1fr; } }
199
  """
200
 
@@ -217,32 +220,32 @@ class DishOptions(TemplatedHTML):
217
  class StepCard(TemplatedHTML):
218
  css_template = """
219
  .cwm-steps { display: flex; flex-direction: column; gap: 16px; }
220
- .cwm-step {
221
  display: grid; grid-template-columns: 220px 1fr; gap: 22px;
222
- background: #fffbf0; border-left: 4px solid #a85c2a; border-radius: 10px;
223
  padding: 18px 22px;
224
  }
225
- .cwm-step img {
226
  width: 220px; height: 160px; object-fit: cover; border-radius: 8px;
227
  background: #efe3c8;
228
  }
229
- .cwm-step .placeholder {
230
  width: 220px; height: 160px; border-radius: 8px;
231
  background: linear-gradient(135deg,#efe3c8,#dccaa3);
232
  display:flex; align-items:center; justify-content:center;
233
- color: #8a6a3a; font-family: 'Lora', serif; font-size: 14px;
234
  }
235
- .cwm-step h3 {
236
- font-family: 'Lora', serif; color: #6b4a2a; margin: 0 0 6px; font-size: 22px;
237
  }
238
- .cwm-step p { font-size: 16px; line-height: 1.55; color: #4a3722; margin: 0 0 8px; }
239
- .cwm-step .duration {
240
- display: inline-block; background: #a85c2a; color: #fffbf0;
241
  border-radius: 999px; padding: 3px 10px; font-size: 12px; letter-spacing: 0.04em;
242
  }
243
- .cwm-step .tip {
244
- margin-top: 10px; padding: 10px 12px; background: #fff3d8;
245
- border-radius: 8px; font-size: 14px; color: #6b4a2a;
246
  }
247
  .cwm-step .tip::before { content: "💡 "; }
248
  @media (max-width: 720px) { .cwm-step { grid-template-columns: 1fr; } .cwm-step img, .cwm-step .placeholder { width: 100%; } }
@@ -260,11 +263,14 @@ class StepCard(TemplatedHTML):
260
  dur = html.escape(s.get("duration", ""))
261
  tip = s.get("tip")
262
  visual = html.escape(s.get("visual", ""))
263
- img = s.get("image_path")
264
- img_block = (
265
- f'<img src="/file={html.escape(img)}" alt="step {n}"/>'
266
- if img else f'<div class="placeholder">{visual[:80]}</div>'
267
- )
 
 
 
268
  tip_block = f'<div class="tip">{html.escape(tip)}</div>' if tip else ""
269
  cards.append(f"""
270
  <div class="cwm-step">
@@ -287,22 +293,22 @@ class NutritionGrid(TemplatedHTML):
287
  css_template = """
288
  .cwm-nutri-wrap { margin-top: 10px; }
289
  .cwm-nutri-title {
290
- font-family: 'Lora', serif; color: #6b4a2a; font-size: 22px; margin: 0 0 14px;
291
  }
292
  .cwm-nutri {
293
  display: grid; grid-template-columns: repeat(5, 1fr); gap: 12px;
294
  }
295
- .cwm-nutri-cell {
296
- background: #fffbf0; border: 1px solid #d8c9ad; border-radius: 10px;
297
  padding: 14px 10px; text-align: center;
298
  }
299
- .cwm-nutri-cell .v {
300
- font-family: 'Lora', serif; font-size: 24px; font-weight: 700; color: #6b4a2a;
301
  display: block;
302
  }
303
- .cwm-nutri-cell .l {
304
  font-size: 11px; letter-spacing: 0.08em; text-transform: uppercase;
305
- color: #8a6a3a; margin-top: 4px;
306
  }
307
  @media (max-width: 720px) { .cwm-nutri { grid-template-columns: repeat(2, 1fr); } }
308
  """
@@ -337,7 +343,7 @@ class VerdictBadge(TemplatedHTML):
337
  css_template = """
338
  .cwm-verdict {
339
  display: flex; align-items: center; gap: 18px;
340
- background: #fffbf0; border-radius: 12px; padding: 18px 22px;
341
  border: 1px solid #d8c9ad;
342
  }
343
  .cwm-verdict.go { border-left: 6px solid #4f8b4a; }
@@ -351,8 +357,8 @@ class VerdictBadge(TemplatedHTML):
351
  .cwm-verdict.go .cwm-verdict-pill { background: #4f8b4a; }
352
  .cwm-verdict.wait .cwm-verdict-pill { background: #d4a23c; }
353
  .cwm-verdict.fix .cwm-verdict-pill { background: #b94a3a; }
354
- .cwm-verdict-text { font-size: 16px; color: #4a3722; line-height: 1.5; }
355
- .cwm-verdict-text small { color: #8a6a3a; display: block; margin-top: 4px; }
356
  .cwm-verdict-empty {
357
  color: #b39870; font-style: italic; padding: 14px 0;
358
  }
 
80
  class RecipeHero(TemplatedHTML):
81
  css_template = """
82
  .cwm-hero {
83
+ background: #fffbf0 !important;
84
  border: 1px solid #d8c9ad;
85
  border-radius: 16px;
86
  padding: 32px;
 
94
  background: #efe3c8;
95
  }
96
  .cwm-hero h1 {
97
+ font-family: 'Lora', serif; font-size: 38px; color: #6b4a2a !important;
98
  margin: 0 0 8px;
99
  }
100
  .cwm-hero .meta {
101
+ color: #8a6a3a !important; font-size: 14px; letter-spacing: 0.04em;
102
  text-transform: uppercase; margin-bottom: 18px;
103
  }
104
  .cwm-hero .visual {
105
+ font-family: 'Lora', serif; font-style: italic; color: #6b4a2a !important;
106
  font-size: 17px; line-height: 1.55;
107
  }
108
  @media (max-width: 720px) { .cwm-hero { grid-template-columns: 1fr; } }
 
115
  servings = state.get("servings") or 0
116
  time = state.get("total_time_minutes") or 0
117
  visual = html.escape(state.get("final_dish_visual") or "")
118
+ img_b64 = state.get("final_dish_image_b64") or ""
119
+ img_path = state.get("final_dish_image_path") or ""
120
+ if img_b64:
121
+ img_tag = f'<img src="data:image/png;base64,{img_b64}" alt="final dish"/>'
122
+ elif img_path:
123
+ img_tag = f'<img src="/file={html.escape(img_path)}" alt="final dish"/>'
124
+ else:
125
+ img_tag = '<div style="background:#efe3c8;border-radius:12px;height:320px;display:flex;align-items:center;justify-content:center;color:#8a6a3a;font-family:\'Lora\',serif;font-style:italic;">Image will appear here</div>'
126
  return f"""
127
  <div class="cwm-hero">
128
  <div>{img_tag}</div>
 
189
  class DishOptions(TemplatedHTML):
190
  css_template = """
191
  .cwm-options { display: grid; grid-template-columns: repeat(3, 1fr); gap: 14px; }
192
+ .cwm-options .cwm-option {
193
+ background: #fffbf0 !important; border: 1px solid #d8c9ad; border-radius: 12px;
194
  padding: 18px; text-align: left;
195
  }
196
+ .cwm-options .cwm-option h3 {
197
+ font-family: 'Lora', serif; font-size: 19px; color: #6b4a2a !important;
198
  margin: 0 0 6px;
199
  }
200
+ .cwm-options .cwm-option p { color: #7a5a35 !important; font-size: 14px; line-height: 1.45; margin: 0; }
201
  @media (max-width: 720px) { .cwm-options { grid-template-columns: 1fr; } }
202
  """
203
 
 
220
  class StepCard(TemplatedHTML):
221
  css_template = """
222
  .cwm-steps { display: flex; flex-direction: column; gap: 16px; }
223
+ .cwm-steps .cwm-step {
224
  display: grid; grid-template-columns: 220px 1fr; gap: 22px;
225
+ background: #fffbf0 !important; border-left: 4px solid #a85c2a; border-radius: 10px;
226
  padding: 18px 22px;
227
  }
228
+ .cwm-steps .cwm-step img {
229
  width: 220px; height: 160px; object-fit: cover; border-radius: 8px;
230
  background: #efe3c8;
231
  }
232
+ .cwm-steps .cwm-step .placeholder {
233
  width: 220px; height: 160px; border-radius: 8px;
234
  background: linear-gradient(135deg,#efe3c8,#dccaa3);
235
  display:flex; align-items:center; justify-content:center;
236
+ color: #8a6a3a !important; font-family: 'Lora', serif; font-size: 14px;
237
  }
238
+ .cwm-steps .cwm-step h3 {
239
+ font-family: 'Lora', serif; color: #6b4a2a !important; margin: 0 0 6px; font-size: 22px;
240
  }
241
+ .cwm-steps .cwm-step p { font-size: 16px; line-height: 1.55; color: #4a3722 !important; margin: 0 0 8px; }
242
+ .cwm-steps .cwm-step .duration {
243
+ display: inline-block; background: #a85c2a !important; color: #fffbf0 !important;
244
  border-radius: 999px; padding: 3px 10px; font-size: 12px; letter-spacing: 0.04em;
245
  }
246
+ .cwm-steps .cwm-step .tip {
247
+ margin-top: 10px; padding: 10px 12px; background: #fff3d8 !important;
248
+ border-radius: 8px; font-size: 14px; color: #6b4a2a !important;
249
  }
250
  .cwm-step .tip::before { content: "💡 "; }
251
  @media (max-width: 720px) { .cwm-step { grid-template-columns: 1fr; } .cwm-step img, .cwm-step .placeholder { width: 100%; } }
 
263
  dur = html.escape(s.get("duration", ""))
264
  tip = s.get("tip")
265
  visual = html.escape(s.get("visual", ""))
266
+ img_b64 = s.get("image_b64") or ""
267
+ img_path = s.get("image_path") or ""
268
+ if img_b64:
269
+ img_block = f'<img src="data:image/png;base64,{img_b64}" alt="step {n}"/>'
270
+ elif img_path:
271
+ img_block = f'<img src="/file={html.escape(img_path)}" alt="step {n}"/>'
272
+ else:
273
+ img_block = f'<div class="placeholder">{visual[:80] if visual else f"Step {n}"}</div>'
274
  tip_block = f'<div class="tip">{html.escape(tip)}</div>' if tip else ""
275
  cards.append(f"""
276
  <div class="cwm-step">
 
293
  css_template = """
294
  .cwm-nutri-wrap { margin-top: 10px; }
295
  .cwm-nutri-title {
296
+ font-family: 'Lora', serif; color: #6b4a2a !important; font-size: 22px; margin: 0 0 14px;
297
  }
298
  .cwm-nutri {
299
  display: grid; grid-template-columns: repeat(5, 1fr); gap: 12px;
300
  }
301
+ .cwm-nutri .cwm-nutri-cell {
302
+ background: #fffbf0 !important; border: 1px solid #d8c9ad; border-radius: 10px;
303
  padding: 14px 10px; text-align: center;
304
  }
305
+ .cwm-nutri .cwm-nutri-cell .v {
306
+ font-family: 'Lora', serif; font-size: 24px; font-weight: 700; color: #6b4a2a !important;
307
  display: block;
308
  }
309
+ .cwm-nutri .cwm-nutri-cell .l {
310
  font-size: 11px; letter-spacing: 0.08em; text-transform: uppercase;
311
+ color: #8a6a3a !important; margin-top: 4px;
312
  }
313
  @media (max-width: 720px) { .cwm-nutri { grid-template-columns: repeat(2, 1fr); } }
314
  """
 
343
  css_template = """
344
  .cwm-verdict {
345
  display: flex; align-items: center; gap: 18px;
346
+ background: #fffbf0 !important; border-radius: 12px; padding: 18px 22px;
347
  border: 1px solid #d8c9ad;
348
  }
349
  .cwm-verdict.go { border-left: 6px solid #4f8b4a; }
 
357
  .cwm-verdict.go .cwm-verdict-pill { background: #4f8b4a; }
358
  .cwm-verdict.wait .cwm-verdict-pill { background: #d4a23c; }
359
  .cwm-verdict.fix .cwm-verdict-pill { background: #b94a3a; }
360
+ .cwm-verdict-text { font-size: 16px; color: #4a3722 !important; line-height: 1.5; }
361
+ .cwm-verdict-text small { color: #8a6a3a !important; display: block; margin-top: 4px; }
362
  .cwm-verdict-empty {
363
  color: #b39870; font-style: italic; padding: 14px 0;
364
  }
src/ui/components.pyi CHANGED
@@ -63,11 +63,14 @@ class RecipeHero(TemplatedHTML):
63
  servings = state.get("servings") or 0
64
  time = state.get("total_time_minutes") or 0
65
  visual = html.escape(state.get("final_dish_visual") or "")
66
- img = state.get("final_dish_image_path") or ""
67
- img_tag = (
68
- f'<img src="/file={html.escape(img)}" alt="final dish"/>'
69
- if img else '<div class="cwm-hero" style="background:#efe3c8;border-radius:12px;height:320px;"></div>'
70
- )
 
 
 
71
  return f"""
72
  <div class="cwm-hero">
73
  <div>{img_tag}</div>
 
63
  servings = state.get("servings") or 0
64
  time = state.get("total_time_minutes") or 0
65
  visual = html.escape(state.get("final_dish_visual") or "")
66
+ img_b64 = state.get("final_dish_image_b64") or ""
67
+ img_path = state.get("final_dish_image_path") or ""
68
+ if img_b64:
69
+ img_tag = f'<img src="data:image/png;base64,{img_b64}" alt="final dish"/>'
70
+ elif img_path:
71
+ img_tag = f'<img src="/file={html.escape(img_path)}" alt="final dish"/>'
72
+ else:
73
+ img_tag = '<div style="background:#efe3c8;border-radius:12px;height:320px;display:flex;align-items:center;justify-content:center;color:#8a6a3a;font-family:\'Lora\',serif;font-style:italic;">Image will appear here</div>'
74
  return f"""
75
  <div class="cwm-hero">
76
  <div>{img_tag}</div>
src/ui/theme.py CHANGED
@@ -13,10 +13,64 @@ theme = gr.themes.Soft(
13
 
14
  CSS = """
15
  @import url('https://fonts.googleapis.com/css2?family=Lora:wght@400;700&display=swap');
16
- .gradio-container { background: #f5ecd9 !important; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  .gradio-container .prose h1,
18
  .gradio-container .prose h2,
19
- .gradio-container .prose h3 { font-family: 'Lora', serif !important; color: #6b4a2a; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  /* Generic container shared by every HTMLComponent */
21
  .cwm-card {
22
  border: 1px solid #d8c9ad;
@@ -26,6 +80,7 @@ CSS = """
26
  }
27
  button.primary, .gr-button-primary {
28
  background: #a85c2a !important;
 
29
  font-weight: 600 !important;
30
  font-size: 16px !important;
31
  padding: 12px 22px !important;
 
13
 
14
  CSS = """
15
  @import url('https://fonts.googleapis.com/css2?family=Lora:wght@400;700&display=swap');
16
+
17
+ /* ---------------------------------------------------------------------------
18
+ Force a warm light palette regardless of the browser/system dark mode.
19
+ We pin the parchment background, so we must also pin DARK text colours via
20
+ Gradio's CSS variables — otherwise dark-mode users get white text on the
21
+ light background and it disappears.
22
+ --------------------------------------------------------------------------- */
23
+ .gradio-container, .gradio-container.dark {
24
+ background: #f5ecd9 !important;
25
+ color-scheme: light !important;
26
+
27
+ --body-text-color: #4a3722;
28
+ --body-text-color-subdued: #7a5a35;
29
+ --block-title-text-color: #6b4a2a;
30
+ --block-label-text-color: #6b4a2a;
31
+ --block-info-text-color: #7a5a35;
32
+ --block-background-fill: #fffbf0;
33
+ --input-background-fill: #fffbf0;
34
+ --border-color-primary: #d8c9ad;
35
+ --color-accent-soft: #fbe2d2;
36
+ }
37
+
38
+ /* Blanket dark text for native Gradio text elements (covers dark mode) */
39
+ .gradio-container,
40
+ .gradio-container .prose,
41
+ .gradio-container label,
42
+ .gradio-container .gr-text,
43
+ .gradio-container span,
44
+ .gradio-container p,
45
+ .gradio-container .gr-check-radio label,
46
+ .gradio-container .wrap,
47
+ .gradio-container .gr-form,
48
+ .gradio-container .tab-nav button,
49
+ .gradio-container .gr-accordion,
50
+ .gradio-container input,
51
+ .gradio-container textarea {
52
+ color: #4a3722 !important;
53
+ }
54
+
55
  .gradio-container .prose h1,
56
  .gradio-container .prose h2,
57
+ .gradio-container .prose h3 { font-family: 'Lora', serif !important; color: #6b4a2a !important; }
58
+
59
+ /* Tabs: dark labels, terracotta active */
60
+ .gradio-container .tab-nav button { color: #6b4a2a !important; }
61
+ .gradio-container .tab-nav button.selected {
62
+ color: #a85c2a !important; border-bottom-color: #a85c2a !important;
63
+ }
64
+
65
+ /* Native blocks (inputs, radio, checkbox, number) on warm cards */
66
+ .gradio-container .block,
67
+ .gradio-container .form,
68
+ .gradio-container input[type="text"],
69
+ .gradio-container input[type="number"] {
70
+ background: #fffbf0 !important;
71
+ border-color: #d8c9ad !important;
72
+ }
73
+
74
  /* Generic container shared by every HTMLComponent */
75
  .cwm-card {
76
  border: 1px solid #d8c9ad;
 
80
  }
81
  button.primary, .gr-button-primary {
82
  background: #a85c2a !important;
83
+ color: #fffbf0 !important;
84
  font-weight: 600 !important;
85
  font-size: 16px !important;
86
  padding: 12px 22px !important;