Liori25 committed on
Commit
56afb7e
·
verified ·
1 Parent(s): 1c885d8

Upload SynthaticDataGeneration (2).ipynb

Browse files
Files changed (1) hide show
  1. SynthaticDataGeneration (2).ipynb +740 -0
SynthaticDataGeneration (2).ipynb ADDED
@@ -0,0 +1,740 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "A100"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU",
17
+ "widgets": {
18
+ "application/vnd.jupyter.widget-state+json": {
19
+ "362ad3c800864e88b4718c36c61aff6f": {
20
+ "model_module": "@jupyter-widgets/controls",
21
+ "model_name": "HBoxModel",
22
+ "model_module_version": "1.5.0",
23
+ "state": {
24
+ "_dom_classes": [],
25
+ "_model_module": "@jupyter-widgets/controls",
26
+ "_model_module_version": "1.5.0",
27
+ "_model_name": "HBoxModel",
28
+ "_view_count": null,
29
+ "_view_module": "@jupyter-widgets/controls",
30
+ "_view_module_version": "1.5.0",
31
+ "_view_name": "HBoxView",
32
+ "box_style": "",
33
+ "children": [
34
+ "IPY_MODEL_04671a1d41404a3f8d3118d963162d55",
35
+ "IPY_MODEL_c140514ea9094d0a83d0eb871e1c96d8",
36
+ "IPY_MODEL_db4e7a6835774140a26c28d8af93457b"
37
+ ],
38
+ "layout": "IPY_MODEL_7c570d0c1dce4a218f7a9d537ceb2b43"
39
+ }
40
+ },
41
+ "04671a1d41404a3f8d3118d963162d55": {
42
+ "model_module": "@jupyter-widgets/controls",
43
+ "model_name": "HTMLModel",
44
+ "model_module_version": "1.5.0",
45
+ "state": {
46
+ "_dom_classes": [],
47
+ "_model_module": "@jupyter-widgets/controls",
48
+ "_model_module_version": "1.5.0",
49
+ "_model_name": "HTMLModel",
50
+ "_view_count": null,
51
+ "_view_module": "@jupyter-widgets/controls",
52
+ "_view_module_version": "1.5.0",
53
+ "_view_name": "HTMLView",
54
+ "description": "",
55
+ "description_tooltip": null,
56
+ "layout": "IPY_MODEL_f7b4a83a6921499c841a38ec75c09d27",
57
+ "placeholder": "​",
58
+ "style": "IPY_MODEL_11ebd72684464498bd59b5677d26fb6f",
59
+ "value": "Loading checkpoint shards: 100%"
60
+ }
61
+ },
62
+ "c140514ea9094d0a83d0eb871e1c96d8": {
63
+ "model_module": "@jupyter-widgets/controls",
64
+ "model_name": "FloatProgressModel",
65
+ "model_module_version": "1.5.0",
66
+ "state": {
67
+ "_dom_classes": [],
68
+ "_model_module": "@jupyter-widgets/controls",
69
+ "_model_module_version": "1.5.0",
70
+ "_model_name": "FloatProgressModel",
71
+ "_view_count": null,
72
+ "_view_module": "@jupyter-widgets/controls",
73
+ "_view_module_version": "1.5.0",
74
+ "_view_name": "ProgressView",
75
+ "bar_style": "success",
76
+ "description": "",
77
+ "description_tooltip": null,
78
+ "layout": "IPY_MODEL_baee00ce0df8491c8813b15e1341e545",
79
+ "max": 2,
80
+ "min": 0,
81
+ "orientation": "horizontal",
82
+ "style": "IPY_MODEL_01d8bca7a75a41249a9af5c40d397286",
83
+ "value": 2
84
+ }
85
+ },
86
+ "db4e7a6835774140a26c28d8af93457b": {
87
+ "model_module": "@jupyter-widgets/controls",
88
+ "model_name": "HTMLModel",
89
+ "model_module_version": "1.5.0",
90
+ "state": {
91
+ "_dom_classes": [],
92
+ "_model_module": "@jupyter-widgets/controls",
93
+ "_model_module_version": "1.5.0",
94
+ "_model_name": "HTMLModel",
95
+ "_view_count": null,
96
+ "_view_module": "@jupyter-widgets/controls",
97
+ "_view_module_version": "1.5.0",
98
+ "_view_name": "HTMLView",
99
+ "description": "",
100
+ "description_tooltip": null,
101
+ "layout": "IPY_MODEL_1fe94bdf961f4fa1bb724499ab5ce5e3",
102
+ "placeholder": "​",
103
+ "style": "IPY_MODEL_5efb08d4cf874b9c8c424f09cdd2f1e8",
104
+ "value": " 2/2 [00:01<00:00,  1.12it/s]"
105
+ }
106
+ },
107
+ "7c570d0c1dce4a218f7a9d537ceb2b43": {
108
+ "model_module": "@jupyter-widgets/base",
109
+ "model_name": "LayoutModel",
110
+ "model_module_version": "1.2.0",
111
+ "state": {
112
+ "_model_module": "@jupyter-widgets/base",
113
+ "_model_module_version": "1.2.0",
114
+ "_model_name": "LayoutModel",
115
+ "_view_count": null,
116
+ "_view_module": "@jupyter-widgets/base",
117
+ "_view_module_version": "1.2.0",
118
+ "_view_name": "LayoutView",
119
+ "align_content": null,
120
+ "align_items": null,
121
+ "align_self": null,
122
+ "border": null,
123
+ "bottom": null,
124
+ "display": null,
125
+ "flex": null,
126
+ "flex_flow": null,
127
+ "grid_area": null,
128
+ "grid_auto_columns": null,
129
+ "grid_auto_flow": null,
130
+ "grid_auto_rows": null,
131
+ "grid_column": null,
132
+ "grid_gap": null,
133
+ "grid_row": null,
134
+ "grid_template_areas": null,
135
+ "grid_template_columns": null,
136
+ "grid_template_rows": null,
137
+ "height": null,
138
+ "justify_content": null,
139
+ "justify_items": null,
140
+ "left": null,
141
+ "margin": null,
142
+ "max_height": null,
143
+ "max_width": null,
144
+ "min_height": null,
145
+ "min_width": null,
146
+ "object_fit": null,
147
+ "object_position": null,
148
+ "order": null,
149
+ "overflow": null,
150
+ "overflow_x": null,
151
+ "overflow_y": null,
152
+ "padding": null,
153
+ "right": null,
154
+ "top": null,
155
+ "visibility": null,
156
+ "width": null
157
+ }
158
+ },
159
+ "f7b4a83a6921499c841a38ec75c09d27": {
160
+ "model_module": "@jupyter-widgets/base",
161
+ "model_name": "LayoutModel",
162
+ "model_module_version": "1.2.0",
163
+ "state": {
164
+ "_model_module": "@jupyter-widgets/base",
165
+ "_model_module_version": "1.2.0",
166
+ "_model_name": "LayoutModel",
167
+ "_view_count": null,
168
+ "_view_module": "@jupyter-widgets/base",
169
+ "_view_module_version": "1.2.0",
170
+ "_view_name": "LayoutView",
171
+ "align_content": null,
172
+ "align_items": null,
173
+ "align_self": null,
174
+ "border": null,
175
+ "bottom": null,
176
+ "display": null,
177
+ "flex": null,
178
+ "flex_flow": null,
179
+ "grid_area": null,
180
+ "grid_auto_columns": null,
181
+ "grid_auto_flow": null,
182
+ "grid_auto_rows": null,
183
+ "grid_column": null,
184
+ "grid_gap": null,
185
+ "grid_row": null,
186
+ "grid_template_areas": null,
187
+ "grid_template_columns": null,
188
+ "grid_template_rows": null,
189
+ "height": null,
190
+ "justify_content": null,
191
+ "justify_items": null,
192
+ "left": null,
193
+ "margin": null,
194
+ "max_height": null,
195
+ "max_width": null,
196
+ "min_height": null,
197
+ "min_width": null,
198
+ "object_fit": null,
199
+ "object_position": null,
200
+ "order": null,
201
+ "overflow": null,
202
+ "overflow_x": null,
203
+ "overflow_y": null,
204
+ "padding": null,
205
+ "right": null,
206
+ "top": null,
207
+ "visibility": null,
208
+ "width": null
209
+ }
210
+ },
211
+ "11ebd72684464498bd59b5677d26fb6f": {
212
+ "model_module": "@jupyter-widgets/controls",
213
+ "model_name": "DescriptionStyleModel",
214
+ "model_module_version": "1.5.0",
215
+ "state": {
216
+ "_model_module": "@jupyter-widgets/controls",
217
+ "_model_module_version": "1.5.0",
218
+ "_model_name": "DescriptionStyleModel",
219
+ "_view_count": null,
220
+ "_view_module": "@jupyter-widgets/base",
221
+ "_view_module_version": "1.2.0",
222
+ "_view_name": "StyleView",
223
+ "description_width": ""
224
+ }
225
+ },
226
+ "baee00ce0df8491c8813b15e1341e545": {
227
+ "model_module": "@jupyter-widgets/base",
228
+ "model_name": "LayoutModel",
229
+ "model_module_version": "1.2.0",
230
+ "state": {
231
+ "_model_module": "@jupyter-widgets/base",
232
+ "_model_module_version": "1.2.0",
233
+ "_model_name": "LayoutModel",
234
+ "_view_count": null,
235
+ "_view_module": "@jupyter-widgets/base",
236
+ "_view_module_version": "1.2.0",
237
+ "_view_name": "LayoutView",
238
+ "align_content": null,
239
+ "align_items": null,
240
+ "align_self": null,
241
+ "border": null,
242
+ "bottom": null,
243
+ "display": null,
244
+ "flex": null,
245
+ "flex_flow": null,
246
+ "grid_area": null,
247
+ "grid_auto_columns": null,
248
+ "grid_auto_flow": null,
249
+ "grid_auto_rows": null,
250
+ "grid_column": null,
251
+ "grid_gap": null,
252
+ "grid_row": null,
253
+ "grid_template_areas": null,
254
+ "grid_template_columns": null,
255
+ "grid_template_rows": null,
256
+ "height": null,
257
+ "justify_content": null,
258
+ "justify_items": null,
259
+ "left": null,
260
+ "margin": null,
261
+ "max_height": null,
262
+ "max_width": null,
263
+ "min_height": null,
264
+ "min_width": null,
265
+ "object_fit": null,
266
+ "object_position": null,
267
+ "order": null,
268
+ "overflow": null,
269
+ "overflow_x": null,
270
+ "overflow_y": null,
271
+ "padding": null,
272
+ "right": null,
273
+ "top": null,
274
+ "visibility": null,
275
+ "width": null
276
+ }
277
+ },
278
+ "01d8bca7a75a41249a9af5c40d397286": {
279
+ "model_module": "@jupyter-widgets/controls",
280
+ "model_name": "ProgressStyleModel",
281
+ "model_module_version": "1.5.0",
282
+ "state": {
283
+ "_model_module": "@jupyter-widgets/controls",
284
+ "_model_module_version": "1.5.0",
285
+ "_model_name": "ProgressStyleModel",
286
+ "_view_count": null,
287
+ "_view_module": "@jupyter-widgets/base",
288
+ "_view_module_version": "1.2.0",
289
+ "_view_name": "StyleView",
290
+ "bar_color": null,
291
+ "description_width": ""
292
+ }
293
+ },
294
+ "1fe94bdf961f4fa1bb724499ab5ce5e3": {
295
+ "model_module": "@jupyter-widgets/base",
296
+ "model_name": "LayoutModel",
297
+ "model_module_version": "1.2.0",
298
+ "state": {
299
+ "_model_module": "@jupyter-widgets/base",
300
+ "_model_module_version": "1.2.0",
301
+ "_model_name": "LayoutModel",
302
+ "_view_count": null,
303
+ "_view_module": "@jupyter-widgets/base",
304
+ "_view_module_version": "1.2.0",
305
+ "_view_name": "LayoutView",
306
+ "align_content": null,
307
+ "align_items": null,
308
+ "align_self": null,
309
+ "border": null,
310
+ "bottom": null,
311
+ "display": null,
312
+ "flex": null,
313
+ "flex_flow": null,
314
+ "grid_area": null,
315
+ "grid_auto_columns": null,
316
+ "grid_auto_flow": null,
317
+ "grid_auto_rows": null,
318
+ "grid_column": null,
319
+ "grid_gap": null,
320
+ "grid_row": null,
321
+ "grid_template_areas": null,
322
+ "grid_template_columns": null,
323
+ "grid_template_rows": null,
324
+ "height": null,
325
+ "justify_content": null,
326
+ "justify_items": null,
327
+ "left": null,
328
+ "margin": null,
329
+ "max_height": null,
330
+ "max_width": null,
331
+ "min_height": null,
332
+ "min_width": null,
333
+ "object_fit": null,
334
+ "object_position": null,
335
+ "order": null,
336
+ "overflow": null,
337
+ "overflow_x": null,
338
+ "overflow_y": null,
339
+ "padding": null,
340
+ "right": null,
341
+ "top": null,
342
+ "visibility": null,
343
+ "width": null
344
+ }
345
+ },
346
+ "5efb08d4cf874b9c8c424f09cdd2f1e8": {
347
+ "model_module": "@jupyter-widgets/controls",
348
+ "model_name": "DescriptionStyleModel",
349
+ "model_module_version": "1.5.0",
350
+ "state": {
351
+ "_model_module": "@jupyter-widgets/controls",
352
+ "_model_module_version": "1.5.0",
353
+ "_model_name": "DescriptionStyleModel",
354
+ "_view_count": null,
355
+ "_view_module": "@jupyter-widgets/base",
356
+ "_view_module_version": "1.2.0",
357
+ "_view_name": "StyleView",
358
+ "description_width": ""
359
+ }
360
+ }
361
+ }
362
+ }
363
+ },
364
+ "cells": [
365
+ {
366
+ "cell_type": "markdown",
367
+ "source": [
368
+ "# Final Project DS Course"
369
+ ],
370
+ "metadata": {
371
+ "id": "_64vlsYnLasu"
372
+ }
373
+ },
374
+ {
375
+ "cell_type": "markdown",
376
+ "source": [
377
+ "## Part 1: Synthetic Data Generation"
378
+ ],
379
+ "metadata": {
380
+ "id": "JkWTOu9TMGHS"
381
+ }
382
+ },
383
+ {
384
+ "cell_type": "markdown",
385
+ "source": [
386
+ "**Project Overview:**\n",
387
+ "This project builds an AI-powered application that digitizes handwritten recipes from images using Optical Character Recognition (OCR) and Natural Language Processing (NLP). The system generates a vector embedding of the extracted text and uses it to retrieve the three most semantically similar recipes from a synthetically generated dataset of 10,000 entries; this notebook (Part 1) produces that dataset. The final solution is deployed as an interactive web interface on Hugging Face Spaces, bridging the gap between physical archives and digital accessibility."
388
+ ],
389
+ "metadata": {
390
+ "id": "IgUr5Or9L_0y"
391
+ }
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "source": [
396
+ "!pip install -q transformers torch accelerate\n",
397
+ "print(\"✅ Installations complete.\")"
398
+ ],
399
+ "metadata": {
400
+ "colab": {
401
+ "base_uri": "https://localhost:8080/"
402
+ },
403
+ "id": "j8Ws9fnAZGEb",
404
+ "outputId": "77ed15cc-91b1-4ae4-a551-90d1bc1d1d14"
405
+ },
406
+ "execution_count": 2,
407
+ "outputs": [
408
+ {
409
+ "output_type": "stream",
410
+ "name": "stdout",
411
+ "text": [
412
+ "✅ Installations complete.\n"
413
+ ]
414
+ }
415
+ ]
416
+ },
417
+ {
418
+ "cell_type": "code",
419
+ "source": [
420
+ "# ================================\n",
421
+ "# ONE-SHOT: FAST + STABLE 10K RECIPE GENERATION (A100 OPTIMIZED)\n",
422
+ "# NOTE: the tokenizer is left-padded (see section 5), as batched generation with a decoder-only model requires\n",
423
+ "# ================================\n",
424
+ "\n",
425
+ "import os, json, random, re, time\n",
426
+ "import pandas as pd\n",
427
+ "from tqdm.auto import tqdm\n",
428
+ "\n",
429
+ "import torch\n",
430
+ "from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM\n",
431
+ "\n",
432
+ "# ----------------\n",
433
+ "# 1) SETTINGS\n",
434
+ "# ----------------\n",
435
+ "TARGET_COUNT = 10_000\n",
436
+ "SAVE_EVERY = 500\n",
437
+ "BATCH_SIZE = 64\n",
438
+ "MAX_NEW_TOKENS = 150\n",
439
+ "OUT_JSONL = \"RecipeData_10K.jsonl\"\n",
440
+ "OUT_CSV = \"RecipeData_10K.csv\"\n",
441
+ "\n",
442
+ "# Model: Qwen 2.5 3B (Fast & Smart)\n",
443
+ "MODEL_ID = \"Qwen/Qwen2.5-3B-Instruct\"\n",
444
+ "\n",
445
+ "# ----------------\n",
446
+ "# 2) EXAMPLE TEMPLATE\n",
447
+ "# ----------------\n",
448
+ "grandma_template = \"\"\"\n",
449
+ "Title: Granma's Meatballs\n",
450
+ "Ingredients:\n",
451
+ "- Meat 1kg\n",
452
+ "- Tomatos 8\n",
453
+ "- Onion (as much as you like)\n",
454
+ "- Spices: salt, pepper, chili\n",
455
+ "- Parsley\n",
456
+ "- Bread crumbs (2 spoons)\n",
457
+ "Instructions:\n",
458
+ "In one bowl mix it all, eventually create the meat balls, put in a pot, and cook it all for 40 minutes approximately.\n",
459
+ "<END_RECIPE>\n",
460
+ "\"\"\".strip()\n",
461
+ "\n",
462
+ "# ----------------\n",
463
+ "# 3) MENU GENERATOR\n",
464
+ "# ----------------\n",
465
+ "cuisine_profiles = {\n",
466
+ " \"Italian\": {\n",
467
+ " \"adjs\": [\"Classic\",\"Rustic\",\"Creamy\",\"Baked\",\"Cheesy\",\"Tomato-Basil\",\"Garlic\",\"Sicilian\",\"Tuscan\",\"Spicy\",\"Homemade\",\"Nonna's\"],\n",
468
+ " \"mains\": [\"Pasta\",\"Risotto\",\"Lasagna\",\"Chicken Parmesan\",\"Gnocchi\",\"Polenta\",\"Ravioli\",\"Meatballs\",\"Ziti\",\"Alfredo\"],\n",
469
+ " \"extras\": [\"with Mushrooms\",\"with Spinach\",\"Al Forno\",\"Primavera\",\"Supremo\",\"Rustica\",\"Delight\",\"Special\"]\n",
470
+ " },\n",
471
+ " \"Mediterranean\": {\n",
472
+ " \"adjs\": [\"Spicy\",\"Fresh\",\"Roasted\",\"Grandma's\",\"Tahini-Drizzled\",\"Zesty\",\"Lemon\",\"Grilled\",\"Golden\",\"Herbed\"],\n",
473
+ " \"mains\": [\"Shakshuka\",\"Eggplant\",\"Falafel\",\"Hummus Plate\",\"Kebab\",\"Couscous\",\"Shawarma\",\"Lamb Chops\",\"Fish Fillet\"],\n",
474
+ " \"extras\": [\"with Pita\",\"Bowl\",\"Platter\",\"Salad\",\"Stew\",\"with Yogurt Sauce\",\"Feast\",\"Medley\"]\n",
475
+ " },\n",
476
+ " \"Asian_Fusion\": {\n",
477
+ " \"adjs\": [\"Spicy\",\"Golden\",\"Soy-Glazed\",\"Ginger\",\"Crispy\",\"Steamed\",\"Wok-Fried\",\"Teriyaki\",\"Szechuan\",\"Sweet & Sour\"],\n",
478
+ " \"mains\": [\"Chicken\",\"Tofu\",\"Beef\",\"Rice Bowl\",\"Noodles\",\"Dumplings\",\"Stir-Fry\",\"Duck\",\"Prawns\"],\n",
479
+ " \"extras\": [\"Delight\",\"Surprise\",\"Box\",\"Feast\",\"with Cashews\",\"with Broccoli\",\"Dragon Style\"]\n",
480
+ " },\n",
481
+ " \"Dessert\": {\n",
482
+ " \"adjs\": [\"Sweet\",\"Chocolate\",\"Fluffy\",\"Cinnamon\",\"Glazed\",\"Homemade\",\"Vanilla\",\"Berry\",\"Dark\",\"Creamy\"],\n",
483
+ " \"mains\": [\"Cake\",\"Cookies\",\"Apple Pie\",\"Brownies\",\"Pudding\",\"Rugelach\",\"Muffins\",\"Cheesecake\",\"Tart\"],\n",
484
+ " \"extras\": [\"Swirl\",\"Crumble\",\"Bites\",\"Bars\",\"Supreme\",\"Dream\",\"Celebration\"]\n",
485
+ " }\n",
486
+ "}\n",
487
+ "\n",
488
+ "def build_prompts(target_count: int):\n",
489
+ " prompt_data = []\n",
490
+ " per_cuisine = max(1, target_count // len(cuisine_profiles))\n",
491
+ "\n",
492
+ " for cuisine, data in cuisine_profiles.items():\n",
493
+ " for _ in range(per_cuisine):\n",
494
+ " dish_name = f\"{random.choice(data['adjs'])} {cuisine} {random.choice(data['mains'])} {random.choice(data['extras'])}\"\n",
495
+ "\n",
496
+ " prompt = f\"\"\"<|im_start|>system\n",
497
+ "You are a helpful assistant. Follow the exact format of the example provided. Be brief.\n",
498
+ "Rules:\n",
499
+ "- Keep output short.\n",
500
+ "- MUST include: Title:, Ingredients:, Instructions:\n",
501
+ "- MUST end with: <END_RECIPE>\n",
502
+ "- Output ONLY the recipe (no extra commentary).\n",
503
+ "<|im_end|>\n",
504
+ "<|im_start|>user\n",
505
+ "Example:\n",
506
+ "{grandma_template}\n",
507
+ "\n",
508
+ "Task:\n",
509
+ "Generate a recipe for '{dish_name}' using exactly the same style and format.\n",
510
+ "<|im_end|>\n",
511
+ "<|im_start|>assistant\n",
512
+ "\"\"\"\n",
513
+ " prompt_data.append({\"title\": dish_name, \"prompt\": prompt})\n",
514
+ "\n",
515
+ " while len(prompt_data) < target_count:\n",
516
+ " prompt_data.append(random.choice(prompt_data))\n",
517
+ "\n",
518
+ " random.shuffle(prompt_data)\n",
519
+ " return prompt_data[:target_count]\n",
520
+ "\n",
521
+ "# ----------------\n",
522
+ "# 4) PARSER\n",
523
+ "# ----------------\n",
524
+ "def parse_recipe(clean_text: str, fallback_title: str):\n",
525
+ " if \"<END_RECIPE>\" in clean_text:\n",
526
+ " clean_text = clean_text.split(\"<END_RECIPE>\")[0].strip()\n",
527
+ "\n",
528
+ " title = fallback_title\n",
529
+ " ingredients = \"Parse Error\"\n",
530
+ " instructions = clean_text\n",
531
+ "\n",
532
+ " m = re.search(r'(?im)^\\s*Title:\\s*(.+)\\s*$', clean_text)\n",
533
+ " if m:\n",
534
+ " title = m.group(1).strip()\n",
535
+ "\n",
536
+ " parts = re.split(r'(?im)^\\s*Ingredients:\\s*$|^\\s*Instructions:\\s*$', clean_text)\n",
537
+ " if len(parts) >= 3:\n",
538
+ " ingredients = parts[1].strip()\n",
539
+ " instructions = parts[2].strip()\n",
540
+ "\n",
541
+ " return title, ingredients, instructions, clean_text\n",
542
+ "\n",
543
+ "# ----------------\n",
544
+ "# 5) PIPELINE SETUP (FIXED)\n",
545
+ "# ----------------\n",
546
+ "print(f\"CUDA Available: {torch.cuda.is_available()}\")\n",
547
+ "dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n",
548
+ "\n",
549
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)\n",
550
+ "\n",
551
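+ "# --- Left padding: a decoder-only model continues from the last token, so pad tokens must sit on the left in batched generation ---\n",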
552
+ "tokenizer.padding_side = \"left\" # Explicitly set left padding\n",
553
+ "# -----------------------\n",
554
+ "\n",
555
+ "model = AutoModelForCausalLM.from_pretrained(\n",
556
+ " MODEL_ID,\n",
557
+ " torch_dtype=dtype,\n",
558
+ " device_map=\"auto\"\n",
559
+ ")\n",
560
+ "\n",
561
+ "if tokenizer.pad_token_id is None:\n",
562
+ " tokenizer.pad_token = tokenizer.eos_token\n",
563
+ "\n",
564
+ "pipe = pipeline(\n",
565
+ " \"text-generation\",\n",
566
+ " model=model,\n",
567
+ " tokenizer=tokenizer\n",
568
+ ")\n",
569
+ "\n",
570
+ "gen_kwargs = dict(\n",
571
+ " max_new_tokens=MAX_NEW_TOKENS,\n",
572
+ " do_sample=True,\n",
573
+ " temperature=0.9,\n",
574
+ " top_p=0.95,\n",
575
+ " repetition_penalty=1.05,\n",
576
+ " return_full_text=False,\n",
577
+ " pad_token_id=tokenizer.pad_token_id,\n",
578
+ " eos_token_id=tokenizer.eos_token_id\n",
579
+ ")\n",
580
+ "\n",
581
+ "# ----------------\n",
582
+ "# 6) RESUME SUPPORT & GENERATION\n",
583
+ "# ----------------\n",
584
+ "existing = 0\n",
585
+ "if os.path.exists(OUT_JSONL):\n",
586
+ " with open(OUT_JSONL, \"r\", encoding=\"utf-8\") as f:\n",
587
+ " for _ in f:\n",
588
+ " existing += 1\n",
589
+ " print(f\"Found existing {existing} rows. Resuming...\")\n",
590
+ "\n",
591
+ "need = max(0, TARGET_COUNT - existing)\n",
592
+ "\n",
593
+ "if need > 0:\n",
594
+ " prompt_data = build_prompts(need)\n",
595
+ " print(f\"🚀 Starting generation for {len(prompt_data)} recipes...\")\n",
596
+ "\n",
597
+ " def run_with_batchsize(prompts, batch_size):\n",
598
+ " with torch.inference_mode():\n",
599
+ " return pipe(prompts, batch_size=batch_size, **gen_kwargs)\n",
600
+ "\n",
601
+ " start = time.time()\n",
602
+ " written = 0\n",
603
+ "\n",
604
+ " with open(OUT_JSONL, \"a\", encoding=\"utf-8\") as f_out:\n",
605
+ " for i in tqdm(range(0, len(prompt_data), SAVE_EVERY), desc=\"Generating chunks\"):\n",
606
+ " chunk = prompt_data[i:i+SAVE_EVERY]\n",
607
+ " chunk_prompts = [x[\"prompt\"] for x in chunk]\n",
608
+ "\n",
609
+ " try:\n",
610
+ " results = run_with_batchsize(chunk_prompts, BATCH_SIZE)\n",
611
+ " except RuntimeError as e:\n",
612
+ " if \"out of memory\" in str(e).lower():\n",
613
+ " torch.cuda.empty_cache()\n",
614
+ " print(\"⚠️ OOM detected. Retrying with reduced batch size (8)...\")\n",
615
+ " results = run_with_batchsize(chunk_prompts, 8)\n",
616
+ " else:\n",
617
+ " raise\n",
618
+ "\n",
619
+ " for j, out in enumerate(results):\n",
620
+ " gen_text = out[0][\"generated_text\"] if isinstance(out, list) else out.get(\"generated_text\", \"\")\n",
621
+ "\n",
622
+ " clean_text = gen_text.strip()\n",
623
+ " title, ingreds, instrs, raw = parse_recipe(clean_text, chunk[j][\"title\"])\n",
624
+ "\n",
625
+ " row = {\n",
626
+ " \"Title\": title,\n",
627
+ " \"Ingredients\": ingreds,\n",
628
+ " \"Instructions\": instrs,\n",
629
+ " \"Raw_Output\": raw\n",
630
+ " }\n",
631
+ " f_out.write(json.dumps(row, ensure_ascii=False) + \"\\n\")\n",
632
+ " written += 1\n",
633
+ "\n",
634
+ " f_out.flush()\n",
635
+ "\n",
636
+ " elapsed = time.time() - start\n",
637
+ " print(f\"✅ Generation done! {written} recipes in {elapsed/60:.1f} minutes.\")\n",
638
+ "\n",
639
+ "else:\n",
640
+ " print(\"✅ Target reached. No new generation needed.\")\n",
641
+ "\n",
642
+ "# ----------------\n",
643
+ "# 7) EXPORT TO CSV\n",
644
+ "# ----------------\n",
645
+ "print(\"Exporting to CSV...\")\n",
646
+ "rows = []\n",
647
+ "with open(OUT_JSONL, \"r\", encoding=\"utf-8\") as f:\n",
648
+ " for line in f:\n",
649
+ " rows.append(json.loads(line))\n",
650
+ "\n",
651
+ "df = pd.DataFrame(rows)\n",
652
+ "df.to_csv(OUT_CSV, index=False)\n",
653
+ "print(f\"🎉 FINAL SUCCESS! Saved '{OUT_CSV}' with {len(df)} recipes.\")\n",
654
+ "print(df[['Title', 'Ingredients']].head())"
655
+ ],
656
+ "metadata": {
657
+ "colab": {
658
+ "base_uri": "https://localhost:8080/",
659
+ "height": 379,
660
+ "referenced_widgets": [
661
+ "362ad3c800864e88b4718c36c61aff6f",
662
+ "04671a1d41404a3f8d3118d963162d55",
663
+ "c140514ea9094d0a83d0eb871e1c96d8",
664
+ "db4e7a6835774140a26c28d8af93457b",
665
+ "7c570d0c1dce4a218f7a9d537ceb2b43",
666
+ "f7b4a83a6921499c841a38ec75c09d27",
667
+ "11ebd72684464498bd59b5677d26fb6f",
668
+ "baee00ce0df8491c8813b15e1341e545",
669
+ "01d8bca7a75a41249a9af5c40d397286",
670
+ "1fe94bdf961f4fa1bb724499ab5ce5e3",
671
+ "5efb08d4cf874b9c8c424f09cdd2f1e8"
672
+ ]
673
+ },
674
+ "id": "WhYOWuJPXLcT",
675
+ "outputId": "ce796ad9-b1d3-4a7e-bfab-a6118c763c3c"
676
+ },
677
+ "execution_count": null,
678
+ "outputs": [
679
+ {
680
+ "output_type": "stream",
681
+ "name": "stdout",
682
+ "text": [
683
+ "CUDA Available: True\n"
684
+ ]
685
+ },
686
+ {
687
+ "output_type": "display_data",
688
+ "data": {
689
+ "text/plain": [
690
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
691
+ ],
692
+ "application/vnd.jupyter.widget-view+json": {
693
+ "version_major": 2,
694
+ "version_minor": 0,
695
+ "model_id": "362ad3c800864e88b4718c36c61aff6f"
696
+ }
697
+ },
698
+ "metadata": {}
699
+ },
700
+ {
701
+ "output_type": "stream",
702
+ "name": "stderr",
703
+ "text": [
704
+ "Device set to use cuda:0\n"
705
+ ]
706
+ },
707
+ {
708
+ "output_type": "stream",
709
+ "name": "stdout",
710
+ "text": [
711
+ "Found existing 10000 rows. Resuming...\n",
712
+ "✅ Target reached. No new generation needed.\n",
713
+ "Exporting to CSV...\n",
714
+ "🎉 FINAL SUCCESS! Saved 'RecipeData_10K.csv' with 10000 recipes.\n",
715
+ " Title \\\n",
716
+ "0 Zesty Mediterranean Lamb Chops Platter \n",
717
+ "1 Szechuan Asian_Fusion Tofu with Cashews \n",
718
+ "2 Zesty Mediterranean Hummus Plate Medley \n",
719
+ "3 Tuscan Italian Ravioli with Mushrooms \n",
720
+ "4 Lemon Mediterranean Shawarma with Yogurt Sauce \n",
721
+ "\n",
722
+ " Ingredients \n",
723
+ "0 - Lamb Chops 6\\n- Lemon (freshly squeezed) 1\\n... \n",
724
+ "1 - Tofu 500g\\n- Cashews 100g\\n- Soy Sauce 3 tbs... \n",
725
+ "2 - Chickpeas 500g\\n- Olive Oil 2 tbsp\\n- Lemon ... \n",
726
+ "3 - Flour 500g\\n- Eggs 3\\n- Fillings: ricotta ch... \n",
727
+ "4 - Chicken or lamb (1kg)\\n- Olive oil\\n- Lemon ... \n"
728
+ ]
729
+ }
730
+ ]
731
+ },
732
+ {
733
+ "cell_type": "markdown",
734
+ "source": [],
735
+ "metadata": {
736
+ "id": "RUYFuxuXqJmB"
737
+ }
738
+ }
739
+ ]
740
+ }