Spaces:

Liori25
/

CookBookAI

Sleeping

App Files Files Community

Liori25 committed on Jan 13

Commit

8573333

verified ·

1 Parent(s): aaebbeb

Upload SynthaticDataGeneration (1).ipynb

Browse files

Files changed (1) hide show

SynthaticDataGeneration (1).ipynb +744 -0

SynthaticDataGeneration (1).ipynb ADDED Viewed

	@@ -0,0 +1,744 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "A100"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU",
+    "widgets": {
+      "application/vnd.jupyter.widget-state+json": {
+        "362ad3c800864e88b4718c36c61aff6f": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HBoxModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_04671a1d41404a3f8d3118d963162d55",
+              "IPY_MODEL_c140514ea9094d0a83d0eb871e1c96d8",
+              "IPY_MODEL_db4e7a6835774140a26c28d8af93457b"
+            ],
+            "layout": "IPY_MODEL_7c570d0c1dce4a218f7a9d537ceb2b43"
+          }
+        },
+        "04671a1d41404a3f8d3118d963162d55": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_f7b4a83a6921499c841a38ec75c09d27",
+            "placeholder": "",
+            "style": "IPY_MODEL_11ebd72684464498bd59b5677d26fb6f",
+            "value": "Loading checkpoint shards: 100%"
+          }
+        },
+        "c140514ea9094d0a83d0eb871e1c96d8": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "FloatProgressModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_baee00ce0df8491c8813b15e1341e545",
+            "max": 2,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_01d8bca7a75a41249a9af5c40d397286",
+            "value": 2
+          }
+        },
+        "db4e7a6835774140a26c28d8af93457b": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_1fe94bdf961f4fa1bb724499ab5ce5e3",
+            "placeholder": "",
+            "style": "IPY_MODEL_5efb08d4cf874b9c8c424f09cdd2f1e8",
+            "value": " 2/2 [00:01&lt;00:00,  1.12it/s]"
+          }
+        },
+        "7c570d0c1dce4a218f7a9d537ceb2b43": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "f7b4a83a6921499c841a38ec75c09d27": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "11ebd72684464498bd59b5677d26fb6f": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "baee00ce0df8491c8813b15e1341e545": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "01d8bca7a75a41249a9af5c40d397286": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "ProgressStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
+        },
+        "1fe94bdf961f4fa1bb724499ab5ce5e3": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "5efb08d4cf874b9c8c424f09cdd2f1e8": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        }
+      }
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Final Project DS Course"
+      ],
+      "metadata": {
+        "id": "_64vlsYnLasu"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Part 1: Synthetic Data Generation"
+      ],
+      "metadata": {
+        "id": "JkWTOu9TMGHS"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**Project Overview:**\n",
+        "This project involves building an AI-powered application that digitizes handwritten recipes from images using Optical Character Recognition (OCR) and Natural Language Processing. By generating vector embeddings of the extracted text, the system identifies and retrieves three semantically similar recipes from a synthetically generated dataset of 10,000 entries. The final solution is deployed as an interactive web interface on Hugging Face Spaces, bridging the gap between physical archives and digital accessibility."
+      ],
+      "metadata": {
+        "id": "IgUr5Or9L_0y"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install -q -U transformers torch accelerate pandas tqdm\n",
+        "print(\"✅ Installations complete.\")"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "j8Ws9fnAZGEb",
+        "outputId": "9556a8c4-d980-4366-d4bc-d18218ad33bf"
+      },
+      "execution_count": 10,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m91.2/91.2 kB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.4/12.4 MB\u001b[0m \u001b[31m137.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+            "google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.\u001b[0m\u001b[31m\n",
+            "\u001b[0m✅ Installations complete.\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# ================================\n",
+        "# ONE-SHOT: FAST + STABLE 10K RECIPE GENERATION (A100 OPTIMIZED)\n",
+        "# FIXED: Padding Side Error\n",
+        "# ================================\n",
+        "\n",
+        "import os, json, random, re, time\n",
+        "import pandas as pd\n",
+        "from tqdm.auto import tqdm\n",
+        "\n",
+        "import torch\n",
+        "from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM\n",
+        "\n",
+        "# ----------------\n",
+        "# 1) SETTINGS\n",
+        "# ----------------\n",
+        "TARGET_COUNT = 10_000\n",
+        "SAVE_EVERY = 500\n",
+        "BATCH_SIZE = 64\n",
+        "MAX_NEW_TOKENS = 150\n",
+        "OUT_JSONL = \"RecipeData_10K.jsonl\"\n",
+        "OUT_CSV = \"RecipeData_10K.csv\"\n",
+        "\n",
+        "# Model: Qwen 2.5 3B (Fast & Smart)\n",
+        "MODEL_ID = \"Qwen/Qwen2.5-3B-Instruct\"\n",
+        "\n",
+        "# ----------------\n",
+        "# 2) EXAMPLE TEMPLATE\n",
+        "# ----------------\n",
+        "grandma_template = \"\"\"\n",
+        "Title: Granma's Meatballs\n",
+        "Ingredients:\n",
+        "- Meat 1kg\n",
+        "- Tomatos 8\n",
+        "- Onion (as much as you like)\n",
+        "- Spices: salt, pepper, chili\n",
+        "- Parsley\n",
+        "- Bread crumbs (2 spoons)\n",
+        "Instructions:\n",
+        "In one bowl mix it all, eventually create the meat balls, put in a pot, and cook it all for 40 minutes approximately.\n",
+        "<END_RECIPE>\n",
+        "\"\"\".strip()\n",
+        "\n",
+        "# ----------------\n",
+        "# 3) MENU GENERATOR\n",
+        "# ----------------\n",
+        "cuisine_profiles = {\n",
+        "    \"Italian\": {\n",
+        "        \"adjs\": [\"Classic\",\"Rustic\",\"Creamy\",\"Baked\",\"Cheesy\",\"Tomato-Basil\",\"Garlic\",\"Sicilian\",\"Tuscan\",\"Spicy\",\"Homemade\",\"Nonna's\"],\n",
+        "        \"mains\": [\"Pasta\",\"Risotto\",\"Lasagna\",\"Chicken Parmesan\",\"Gnocchi\",\"Polenta\",\"Ravioli\",\"Meatballs\",\"Ziti\",\"Alfredo\"],\n",
+        "        \"extras\": [\"with Mushrooms\",\"with Spinach\",\"Al Forno\",\"Primavera\",\"Supremo\",\"Rustica\",\"Delight\",\"Special\"]\n",
+        "    },\n",
+        "    \"Mediterranean\": {\n",
+        "        \"adjs\": [\"Spicy\",\"Fresh\",\"Roasted\",\"Grandma's\",\"Tahini-Drizzled\",\"Zesty\",\"Lemon\",\"Grilled\",\"Golden\",\"Herbed\"],\n",
+        "        \"mains\": [\"Shakshuka\",\"Eggplant\",\"Falafel\",\"Hummus Plate\",\"Kebab\",\"Couscous\",\"Shawarma\",\"Lamb Chops\",\"Fish Fillet\"],\n",
+        "        \"extras\": [\"with Pita\",\"Bowl\",\"Platter\",\"Salad\",\"Stew\",\"with Yogurt Sauce\",\"Feast\",\"Medley\"]\n",
+        "    },\n",
+        "    \"Asian_Fusion\": {\n",
+        "        \"adjs\": [\"Spicy\",\"Golden\",\"Soy-Glazed\",\"Ginger\",\"Crispy\",\"Steamed\",\"Wok-Fried\",\"Teriyaki\",\"Szechuan\",\"Sweet & Sour\"],\n",
+        "        \"mains\": [\"Chicken\",\"Tofu\",\"Beef\",\"Rice Bowl\",\"Noodles\",\"Dumplings\",\"Stir-Fry\",\"Duck\",\"Prawns\"],\n",
+        "        \"extras\": [\"Delight\",\"Surprise\",\"Box\",\"Feast\",\"with Cashews\",\"with Broccoli\",\"Dragon Style\"]\n",
+        "    },\n",
+        "    \"Dessert\": {\n",
+        "        \"adjs\": [\"Sweet\",\"Chocolate\",\"Fluffy\",\"Cinnamon\",\"Glazed\",\"Homemade\",\"Vanilla\",\"Berry\",\"Dark\",\"Creamy\"],\n",
+        "        \"mains\": [\"Cake\",\"Cookies\",\"Apple Pie\",\"Brownies\",\"Pudding\",\"Rugelach\",\"Muffins\",\"Cheesecake\",\"Tart\"],\n",
+        "        \"extras\": [\"Swirl\",\"Crumble\",\"Bites\",\"Bars\",\"Supreme\",\"Dream\",\"Celebration\"]\n",
+        "    }\n",
+        "}\n",
+        "\n",
+        "def build_prompts(target_count: int):\n",
+        "    prompt_data = []\n",
+        "    per_cuisine = max(1, target_count // len(cuisine_profiles))\n",
+        "\n",
+        "    for cuisine, data in cuisine_profiles.items():\n",
+        "        for _ in range(per_cuisine):\n",
+        "            dish_name = f\"{random.choice(data['adjs'])} {cuisine} {random.choice(data['mains'])} {random.choice(data['extras'])}\"\n",
+        "\n",
+        "            prompt = f\"\"\"<|im_start|>system\n",
+        "You are a helpful assistant. Follow the exact format of the example provided. Be brief.\n",
+        "Rules:\n",
+        "- Keep output short.\n",
+        "- MUST include: Title:, Ingredients:, Instructions:\n",
+        "- MUST end with: <END_RECIPE>\n",
+        "- Output ONLY the recipe (no extra commentary).\n",
+        "<|im_end|>\n",
+        "<|im_start|>user\n",
+        "Example:\n",
+        "{grandma_template}\n",
+        "\n",
+        "Task:\n",
+        "Generate a recipe for '{dish_name}' using exactly the same style and format.\n",
+        "<|im_end|>\n",
+        "<|im_start|>assistant\n",
+        "\"\"\"\n",
+        "            prompt_data.append({\"title\": dish_name, \"prompt\": prompt})\n",
+        "\n",
+        "    while len(prompt_data) < target_count:\n",
+        "        prompt_data.append(random.choice(prompt_data))\n",
+        "\n",
+        "    random.shuffle(prompt_data)\n",
+        "    return prompt_data[:target_count]\n",
+        "\n",
+        "# ----------------\n",
+        "# 4) PARSER\n",
+        "# ----------------\n",
+        "def parse_recipe(clean_text: str, fallback_title: str):\n",
+        "    if \"<END_RECIPE>\" in clean_text:\n",
+        "        clean_text = clean_text.split(\"<END_RECIPE>\")[0].strip()\n",
+        "\n",
+        "    title = fallback_title\n",
+        "    ingredients = \"Parse Error\"\n",
+        "    instructions = clean_text\n",
+        "\n",
+        "    m = re.search(r'(?im)^\\s*Title:\\s*(.+)\\s*$', clean_text)\n",
+        "    if m:\n",
+        "        title = m.group(1).strip()\n",
+        "\n",
+        "    parts = re.split(r'(?im)^\\s*Ingredients:\\s*$|^\\s*Instructions:\\s*$', clean_text)\n",
+        "    if len(parts) >= 3:\n",
+        "        ingredients = parts[1].strip()\n",
+        "        instructions = parts[2].strip()\n",
+        "\n",
+        "    return title, ingredients, instructions, clean_text\n",
+        "\n",
+        "# ----------------\n",
+        "# 5) PIPELINE SETUP (FIXED)\n",
+        "# ----------------\n",
+        "print(f\"CUDA Available: {torch.cuda.is_available()}\")\n",
+        "dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n",
+        "\n",
+        "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)\n",
+        "\n",
+        "# --- THE FIX IS HERE ---\n",
+        "tokenizer.padding_side = \"left\"   # Explicitly set left padding\n",
+        "# -----------------------\n",
+        "\n",
+        "model = AutoModelForCausalLM.from_pretrained(\n",
+        "    MODEL_ID,\n",
+        "    torch_dtype=dtype,\n",
+        "    device_map=\"auto\"\n",
+        ")\n",
+        "\n",
+        "if tokenizer.pad_token_id is None:\n",
+        "    tokenizer.pad_token = tokenizer.eos_token\n",
+        "\n",
+        "pipe = pipeline(\n",
+        "    \"text-generation\",\n",
+        "    model=model,\n",
+        "    tokenizer=tokenizer\n",
+        ")\n",
+        "\n",
+        "gen_kwargs = dict(\n",
+        "    max_new_tokens=MAX_NEW_TOKENS,\n",
+        "    do_sample=True,\n",
+        "    temperature=0.9,\n",
+        "    top_p=0.95,\n",
+        "    repetition_penalty=1.05,\n",
+        "    return_full_text=False,\n",
+        "    pad_token_id=tokenizer.pad_token_id,\n",
+        "    eos_token_id=tokenizer.eos_token_id\n",
+        ")\n",
+        "\n",
+        "# ----------------\n",
+        "# 6) RESUME SUPPORT & GENERATION\n",
+        "# ----------------\n",
+        "existing = 0\n",
+        "if os.path.exists(OUT_JSONL):\n",
+        "    with open(OUT_JSONL, \"r\", encoding=\"utf-8\") as f:\n",
+        "        for _ in f:\n",
+        "            existing += 1\n",
+        "    print(f\"Found existing {existing} rows. Resuming...\")\n",
+        "\n",
+        "need = max(0, TARGET_COUNT - existing)\n",
+        "\n",
+        "if need > 0:\n",
+        "    prompt_data = build_prompts(need)\n",
+        "    print(f\"🚀 Starting generation for {len(prompt_data)} recipes...\")\n",
+        "\n",
+        "    def run_with_batchsize(prompts, batch_size):\n",
+        "        with torch.inference_mode():\n",
+        "            return pipe(prompts, batch_size=batch_size, **gen_kwargs)\n",
+        "\n",
+        "    start = time.time()\n",
+        "    written = 0\n",
+        "\n",
+        "    with open(OUT_JSONL, \"a\", encoding=\"utf-8\") as f_out:\n",
+        "        for i in tqdm(range(0, len(prompt_data), SAVE_EVERY), desc=\"Generating chunks\"):\n",
+        "            chunk = prompt_data[i:i+SAVE_EVERY]\n",
+        "            chunk_prompts = [x[\"prompt\"] for x in chunk]\n",
+        "\n",
+        "            try:\n",
+        "                results = run_with_batchsize(chunk_prompts, BATCH_SIZE)\n",
+        "            except RuntimeError as e:\n",
+        "                if \"out of memory\" in str(e).lower():\n",
+        "                    torch.cuda.empty_cache()\n",
+        "                    print(\"⚠️ OOM detected. Retrying with reduced batch size (8)...\")\n",
+        "                    results = run_with_batchsize(chunk_prompts, 8)\n",
+        "                else:\n",
+        "                    raise\n",
+        "\n",
+        "            for j, out in enumerate(results):\n",
+        "                gen_text = out[0][\"generated_text\"] if isinstance(out, list) else out.get(\"generated_text\", \"\")\n",
+        "\n",
+        "                clean_text = gen_text.strip()\n",
+        "                title, ingreds, instrs, raw = parse_recipe(clean_text, chunk[j][\"title\"])\n",
+        "\n",
+        "                row = {\n",
+        "                    \"Title\": title,\n",
+        "                    \"Ingredients\": ingreds,\n",
+        "                    \"Instructions\": instrs,\n",
+        "                    \"Raw_Output\": raw\n",
+        "                }\n",
+        "                f_out.write(json.dumps(row, ensure_ascii=False) + \"\\n\")\n",
+        "                written += 1\n",
+        "\n",
+        "            f_out.flush()\n",
+        "\n",
+        "    elapsed = time.time() - start\n",
+        "    print(f\"✅ Generation done! {written} recipes in {elapsed/60:.1f} minutes.\")\n",
+        "\n",
+        "else:\n",
+        "    print(\"✅ Target reached. No new generation needed.\")\n",
+        "\n",
+        "# ----------------\n",
+        "# 7) EXPORT TO CSV\n",
+        "# ----------------\n",
+        "print(\"Exporting to CSV...\")\n",
+        "rows = []\n",
+        "with open(OUT_JSONL, \"r\", encoding=\"utf-8\") as f:\n",
+        "    for line in f:\n",
+        "        rows.append(json.loads(line))\n",
+        "\n",
+        "df = pd.DataFrame(rows)\n",
+        "df.to_csv(OUT_CSV, index=False)\n",
+        "print(f\"🎉 FINAL SUCCESS! Saved '{OUT_CSV}' with {len(df)} recipes.\")\n",
+        "print(df[['Title', 'Ingredients']].head())"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 379,
+          "referenced_widgets": [
+            "362ad3c800864e88b4718c36c61aff6f",
+            "04671a1d41404a3f8d3118d963162d55",
+            "c140514ea9094d0a83d0eb871e1c96d8",
+            "db4e7a6835774140a26c28d8af93457b",
+            "7c570d0c1dce4a218f7a9d537ceb2b43",
+            "f7b4a83a6921499c841a38ec75c09d27",
+            "11ebd72684464498bd59b5677d26fb6f",
+            "baee00ce0df8491c8813b15e1341e545",
+            "01d8bca7a75a41249a9af5c40d397286",
+            "1fe94bdf961f4fa1bb724499ab5ce5e3",
+            "5efb08d4cf874b9c8c424f09cdd2f1e8"
+          ]
+        },
+        "id": "WhYOWuJPXLcT",
+        "outputId": "ce796ad9-b1d3-4a7e-bfab-a6118c763c3c"
+      },
+      "execution_count": 14,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "CUDA Available: True\n"
+          ]
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+            ],
+            "application/vnd.jupyter.widget-view+json": {
+              "version_major": 2,
+              "version_minor": 0,
+              "model_id": "362ad3c800864e88b4718c36c61aff6f"
+            }
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "Device set to use cuda:0\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Found existing 10000 rows. Resuming...\n",
+            "✅ Target reached. No new generation needed.\n",
+            "Exporting to CSV...\n",
+            "🎉 FINAL SUCCESS! Saved 'RecipeData_10K.csv' with 10000 recipes.\n",
+            "                                            Title  \\\n",
+            "0          Zesty Mediterranean Lamb Chops Platter   \n",
+            "1         Szechuan Asian_Fusion Tofu with Cashews   \n",
+            "2         Zesty Mediterranean Hummus Plate Medley   \n",
+            "3           Tuscan Italian Ravioli with Mushrooms   \n",
+            "4  Lemon Mediterranean Shawarma with Yogurt Sauce   \n",
+            "\n",
+            "                                         Ingredients  \n",
+            "0  - Lamb Chops 6\\n- Lemon (freshly squeezed) 1\\n...  \n",
+            "1  - Tofu 500g\\n- Cashews 100g\\n- Soy Sauce 3 tbs...  \n",
+            "2  - Chickpeas 500g\\n- Olive Oil 2 tbsp\\n- Lemon ...  \n",
+            "3  - Flour 500g\\n- Eggs 3\\n- Fillings: ricotta ch...  \n",
+            "4  - Chicken or lamb (1kg)\\n- Olive oil\\n- Lemon ...  \n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [],
+      "metadata": {
+        "id": "RUYFuxuXqJmB"
+      }
+    }
+  ]
+}