Spaces:

Liori25
/

CookBookAI

Sleeping

App Files Files Community

Liori25 committed on Jan 13

Commit

aaebbeb

verified ·

1 Parent(s): 6c9827f

Delete SynthaticDataGeneration.ipynb

Browse files

Files changed (1) hide show

SynthaticDataGeneration.ipynb +0 -744

SynthaticDataGeneration.ipynb DELETED Viewed

@@ -1,744 +0,0 @@
-{
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": [],
-      "gpuType": "A100"
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    },
-    "accelerator": "GPU",
-    "widgets": {
-      "application/vnd.jupyter.widget-state+json": {
-        "362ad3c800864e88b4718c36c61aff6f": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_name": "HBoxModel",
-          "model_module_version": "1.5.0",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HBoxModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HBoxView",
-            "box_style": "",
-            "children": [
-              "IPY_MODEL_04671a1d41404a3f8d3118d963162d55",
-              "IPY_MODEL_c140514ea9094d0a83d0eb871e1c96d8",
-              "IPY_MODEL_db4e7a6835774140a26c28d8af93457b"
-            ],
-            "layout": "IPY_MODEL_7c570d0c1dce4a218f7a9d537ceb2b43"
-          }
-        },
-        "04671a1d41404a3f8d3118d963162d55": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_name": "HTMLModel",
-          "model_module_version": "1.5.0",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_f7b4a83a6921499c841a38ec75c09d27",
-            "placeholder": "",
-            "style": "IPY_MODEL_11ebd72684464498bd59b5677d26fb6f",
-            "value": "Loading checkpoint shards: 100%"
-          }
-        },
-        "c140514ea9094d0a83d0eb871e1c96d8": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_name": "FloatProgressModel",
-          "model_module_version": "1.5.0",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "FloatProgressModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "ProgressView",
-            "bar_style": "success",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_baee00ce0df8491c8813b15e1341e545",
-            "max": 2,
-            "min": 0,
-            "orientation": "horizontal",
-            "style": "IPY_MODEL_01d8bca7a75a41249a9af5c40d397286",
-            "value": 2
-          }
-        },
-        "db4e7a6835774140a26c28d8af93457b": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_name": "HTMLModel",
-          "model_module_version": "1.5.0",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_1fe94bdf961f4fa1bb724499ab5ce5e3",
-            "placeholder": "",
-            "style": "IPY_MODEL_5efb08d4cf874b9c8c424f09cdd2f1e8",
-            "value": " 2/2 [00:01&lt;00:00,  1.12it/s]"
-          }
-        },
-        "7c570d0c1dce4a218f7a9d537ceb2b43": {
-          "model_module": "@jupyter-widgets/base",
-          "model_name": "LayoutModel",
-          "model_module_version": "1.2.0",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "f7b4a83a6921499c841a38ec75c09d27": {
-          "model_module": "@jupyter-widgets/base",
-          "model_name": "LayoutModel",
-          "model_module_version": "1.2.0",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "11ebd72684464498bd59b5677d26fb6f": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_name": "DescriptionStyleModel",
-          "model_module_version": "1.5.0",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "baee00ce0df8491c8813b15e1341e545": {
-          "model_module": "@jupyter-widgets/base",
-          "model_name": "LayoutModel",
-          "model_module_version": "1.2.0",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "01d8bca7a75a41249a9af5c40d397286": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_name": "ProgressStyleModel",
-          "model_module_version": "1.5.0",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "ProgressStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "bar_color": null,
-            "description_width": ""
-          }
-        },
-        "1fe94bdf961f4fa1bb724499ab5ce5e3": {
-          "model_module": "@jupyter-widgets/base",
-          "model_name": "LayoutModel",
-          "model_module_version": "1.2.0",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "5efb08d4cf874b9c8c424f09cdd2f1e8": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_name": "DescriptionStyleModel",
-          "model_module_version": "1.5.0",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        }
-      }
-    }
-  },
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "source": [
-        "# Final Project DS Course"
-      ],
-      "metadata": {
-        "id": "_64vlsYnLasu"
-      }
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "## Part 1: Synthetic Data Generation"
-      ],
-      "metadata": {
-        "id": "JkWTOu9TMGHS"
-      }
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "**Project Overview:**\n",
-        "This project involves building an AI-powered application that digitizes handwritten recipes from images using Optical Character Recognition (OCR) and Natural Language Processing. By generating vector embeddings of the extracted text, the system identifies and retrieves three semantically similar recipes from a synthetically generated dataset of 10,000 entries. The final solution is deployed as an interactive web interface on Hugging Face Spaces, bridging the gap between physical archives and digital accessibility."
-      ],
-      "metadata": {
-        "id": "IgUr5Or9L_0y"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# Install only what is missing. A blanket -U upgraded pandas past the version Colab pins\n",
-        "# (pip reported the conflict), so only the real floor is stated: Qwen2 models need transformers >= 4.37.\n",
-        "# %pip (not !pip) installs into the environment of the running kernel.\n",
-        "%pip install -q \"transformers>=4.37.0\" torch accelerate pandas tqdm\n",
-        "print(\"✅ Installations complete.\")"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "j8Ws9fnAZGEb",
-        "outputId": "9556a8c4-d980-4366-d4bc-d18218ad33bf"
-      },
-      "execution_count": 10,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m91.2/91.2 kB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.4/12.4 MB\u001b[0m \u001b[31m137.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
-            "google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.\u001b[0m\u001b[31m\n",
-            "\u001b[0m✅ Installations complete.\n"
-          ]
-        }
-      ]
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# ================================\n",
-        "# ONE-SHOT: FAST + STABLE 10K RECIPE GENERATION (A100 OPTIMIZED)\n",
-        "# FIXED: Padding Side Error\n",
-        "# ================================\n",
-        "\n",
-        "import os, json, random, re, time\n",
-        "import pandas as pd\n",
-        "from tqdm.auto import tqdm\n",
-        "\n",
-        "import torch\n",
-        "from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM\n",
-        "\n",
-        "# ----------------\n",
-        "# 1) SETTINGS\n",
-        "# ----------------\n",
-        "# Checkpoint used for generation.\n",
-        "MODEL_ID = \"Qwen/Qwen2.5-3B-Instruct\"\n",
-        "\n",
-        "# JSONL is the append-only working file; the CSV is the final export.\n",
-        "OUT_JSONL, OUT_CSV = \"RecipeData_10K.jsonl\", \"RecipeData_10K.csv\"\n",
-        "\n",
-        "TARGET_COUNT = 10_000   # total rows wanted in OUT_JSONL\n",
-        "SAVE_EVERY = 500        # prompts per chunk; output is flushed after each chunk\n",
-        "BATCH_SIZE = 64         # batch size handed to the pipeline\n",
-        "MAX_NEW_TOKENS = 150    # generation cap per prompt\n",
-        "\n",
-        "# ----------------\n",
-        "# 2) EXAMPLE TEMPLATE\n",
-        "# ----------------\n",
-        "# One-shot example pasted into every prompt.\n",
-        "grandma_template = \"\\n\".join([\n",
-        "    \"Title: Granma's Meatballs\",\n",
-        "    \"Ingredients:\",\n",
-        "    \"- Meat 1kg\",\n",
-        "    \"- Tomatos 8\",\n",
-        "    \"- Onion (as much as you like)\",\n",
-        "    \"- Spices: salt, pepper, chili\",\n",
-        "    \"- Parsley\",\n",
-        "    \"- Bread crumbs (2 spoons)\",\n",
-        "    \"Instructions:\",\n",
-        "    \"In one bowl mix it all, eventually create the meat balls, put in a pot, and cook it all for 40 minutes approximately.\",\n",
-        "    \"<END_RECIPE>\",\n",
-        "])\n",
-        "\n",
-        "# ----------------\n",
-        "# 3) MENU GENERATOR\n",
-        "# ----------------\n",
-        "# Word lists per cuisine; a dish name is built as \"<adj> <cuisine> <main> <extra>\".\n",
-        "cuisine_profiles = {\n",
-        "    \"Italian\": {\n",
-        "        \"adjs\": [\"Classic\",\"Rustic\",\"Creamy\",\"Baked\",\"Cheesy\",\"Tomato-Basil\",\"Garlic\",\"Sicilian\",\"Tuscan\",\"Spicy\",\"Homemade\",\"Nonna's\"],\n",
-        "        \"mains\": [\"Pasta\",\"Risotto\",\"Lasagna\",\"Chicken Parmesan\",\"Gnocchi\",\"Polenta\",\"Ravioli\",\"Meatballs\",\"Ziti\",\"Alfredo\"],\n",
-        "        \"extras\": [\"with Mushrooms\",\"with Spinach\",\"Al Forno\",\"Primavera\",\"Supremo\",\"Rustica\",\"Delight\",\"Special\"]\n",
-        "    },\n",
-        "    \"Mediterranean\": {\n",
-        "        \"adjs\": [\"Spicy\",\"Fresh\",\"Roasted\",\"Grandma's\",\"Tahini-Drizzled\",\"Zesty\",\"Lemon\",\"Grilled\",\"Golden\",\"Herbed\"],\n",
-        "        \"mains\": [\"Shakshuka\",\"Eggplant\",\"Falafel\",\"Hummus Plate\",\"Kebab\",\"Couscous\",\"Shawarma\",\"Lamb Chops\",\"Fish Fillet\"],\n",
-        "        \"extras\": [\"with Pita\",\"Bowl\",\"Platter\",\"Salad\",\"Stew\",\"with Yogurt Sauce\",\"Feast\",\"Medley\"]\n",
-        "    },\n",
-        "    \"Asian_Fusion\": {\n",
-        "        \"adjs\": [\"Spicy\",\"Golden\",\"Soy-Glazed\",\"Ginger\",\"Crispy\",\"Steamed\",\"Wok-Fried\",\"Teriyaki\",\"Szechuan\",\"Sweet & Sour\"],\n",
-        "        \"mains\": [\"Chicken\",\"Tofu\",\"Beef\",\"Rice Bowl\",\"Noodles\",\"Dumplings\",\"Stir-Fry\",\"Duck\",\"Prawns\"],\n",
-        "        \"extras\": [\"Delight\",\"Surprise\",\"Box\",\"Feast\",\"with Cashews\",\"with Broccoli\",\"Dragon Style\"]\n",
-        "    },\n",
-        "    \"Dessert\": {\n",
-        "        \"adjs\": [\"Sweet\",\"Chocolate\",\"Fluffy\",\"Cinnamon\",\"Glazed\",\"Homemade\",\"Vanilla\",\"Berry\",\"Dark\",\"Creamy\"],\n",
-        "        \"mains\": [\"Cake\",\"Cookies\",\"Apple Pie\",\"Brownies\",\"Pudding\",\"Rugelach\",\"Muffins\",\"Cheesecake\",\"Tart\"],\n",
-        "        \"extras\": [\"Swirl\",\"Crumble\",\"Bites\",\"Bars\",\"Supreme\",\"Dream\",\"Celebration\"]\n",
-        "    }\n",
-        "}\n",
-        "\n",
-        "def build_prompts(target_count: int):\n",
-        "    \"\"\"Return `target_count` shuffled prompt records.\n",
-        "\n",
-        "    Each record is {\"title\": dish_name, \"prompt\": chat_prompt}. Cuisines are\n",
-        "    taken round-robin from `cuisine_profiles`, so the split is as even as\n",
-        "    possible and every record gets a freshly drawn dish name (no record is\n",
-        "    a copy of another one just to fill the count).\n",
-        "\n",
-        "    Returns an empty list when target_count <= 0.\n",
-        "    \"\"\"\n",
-        "    cuisines = list(cuisine_profiles.items())\n",
-        "    prompt_data = []\n",
-        "\n",
-        "    for i in range(max(0, target_count)):\n",
-        "        cuisine, data = cuisines[i % len(cuisines)]\n",
-        "        # Dict keys such as \"Asian_Fusion\" are identifiers; titles should read with spaces.\n",
-        "        cuisine_label = cuisine.replace(\"_\", \" \")\n",
-        "        dish_name = f\"{random.choice(data['adjs'])} {cuisine_label} {random.choice(data['mains'])} {random.choice(data['extras'])}\"\n",
-        "\n",
-        "        prompt = f\"\"\"<|im_start|>system\n",
-        "You are a helpful assistant. Follow the exact format of the example provided. Be brief.\n",
-        "Rules:\n",
-        "- Keep output short.\n",
-        "- MUST include: Title:, Ingredients:, Instructions:\n",
-        "- MUST end with: <END_RECIPE>\n",
-        "- Output ONLY the recipe (no extra commentary).\n",
-        "<|im_end|>\n",
-        "<|im_start|>user\n",
-        "Example:\n",
-        "{grandma_template}\n",
-        "\n",
-        "Task:\n",
-        "Generate a recipe for '{dish_name}' using exactly the same style and format.\n",
-        "<|im_end|>\n",
-        "<|im_start|>assistant\n",
-        "\"\"\"\n",
-        "        prompt_data.append({\"title\": dish_name, \"prompt\": prompt})\n",
-        "\n",
-        "    random.shuffle(prompt_data)\n",
-        "    return prompt_data\n",
-        "\n",
-        "# ----------------\n",
-        "# 4) PARSER\n",
-        "# ----------------\n",
-        "def parse_recipe(clean_text: str, fallback_title: str):\n",
-        "    \"\"\"Split one generated recipe into (title, ingredients, instructions, raw_text).\n",
-        "\n",
-        "    Everything from the first <END_RECIPE> marker onward is dropped. Section\n",
-        "    headers are matched case-insensitively at the start of a line; their\n",
-        "    content may begin on the header line itself (\"Instructions: Mix ...\")\n",
-        "    or on the following line.\n",
-        "\n",
-        "    Fallbacks: `fallback_title` when no non-empty \"Title:\" line exists;\n",
-        "    ingredients = \"Parse Error\" and instructions = the whole text when an\n",
-        "    \"Ingredients:\" header followed by an \"Instructions:\" header is not found.\n",
-        "    \"\"\"\n",
-        "    if \"<END_RECIPE>\" in clean_text:\n",
-        "        clean_text = clean_text.split(\"<END_RECIPE>\")[0].strip()\n",
-        "\n",
-        "    title = fallback_title\n",
-        "    ingredients = \"Parse Error\"\n",
-        "    instructions = clean_text\n",
-        "\n",
-        "    # Only horizontal whitespace may follow the colon, so an empty \"Title:\" line cannot capture the next line.\n",
-        "    m = re.search(r'(?im)^[ \\t]*Title:[ \\t]*(\\S.*)$', clean_text)\n",
-        "    if m:\n",
-        "        title = m.group(1).strip()\n",
-        "\n",
-        "    # Locate the headers instead of splitting on header-only lines: inline content after the colon is kept,\n",
-        "    # and \"Instructions:\" is only accepted after \"Ingredients:\" so the two sections cannot be swapped.\n",
-        "    ing = re.search(r'(?im)^[ \\t]*Ingredients:', clean_text)\n",
-        "    ins = re.compile(r'(?im)^[ \\t]*Instructions:').search(clean_text, ing.end()) if ing else None\n",
-        "    if ing and ins:\n",
-        "        ingredients = clean_text[ing.end():ins.start()].strip()\n",
-        "        instructions = clean_text[ins.end():].strip()\n",
-        "\n",
-        "    return title, ingredients, instructions, clean_text\n",
-        "\n",
-        "# ----------------\n",
-        "# 5) RESUME SUPPORT\n",
-        "# ----------------\n",
-        "# Count finished rows before touching the model: a completed run then skips the checkpoint load entirely.\n",
-        "existing = 0\n",
-        "if os.path.exists(OUT_JSONL):\n",
-        "    with open(OUT_JSONL, \"r\", encoding=\"utf-8\") as f:\n",
-        "        existing = sum(1 for _ in f)\n",
-        "    print(f\"Found existing {existing} rows. Resuming...\")\n",
-        "\n",
-        "need = max(0, TARGET_COUNT - existing)\n",
-        "\n",
-        "# ----------------\n",
-        "# 6) PIPELINE SETUP & GENERATION\n",
-        "# ----------------\n",
-        "if need > 0:\n",
-        "    print(f\"CUDA Available: {torch.cuda.is_available()}\")\n",
-        "    dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n",
-        "\n",
-        "    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)\n",
-        "    # Batched generation with a decoder-only model needs left padding, so new tokens\n",
-        "    # continue from the real prompt text rather than from pad tokens.\n",
-        "    tokenizer.padding_side = \"left\"\n",
-        "    if tokenizer.pad_token_id is None:\n",
-        "        tokenizer.pad_token = tokenizer.eos_token\n",
-        "\n",
-        "    model = AutoModelForCausalLM.from_pretrained(\n",
-        "        MODEL_ID,\n",
-        "        torch_dtype=dtype,\n",
-        "        device_map=\"auto\"\n",
-        "    )\n",
-        "\n",
-        "    pipe = pipeline(\n",
-        "        \"text-generation\",\n",
-        "        model=model,\n",
-        "        tokenizer=tokenizer\n",
-        "    )\n",
-        "\n",
-        "    gen_kwargs = dict(\n",
-        "        max_new_tokens=MAX_NEW_TOKENS,\n",
-        "        do_sample=True,\n",
-        "        temperature=0.9,\n",
-        "        top_p=0.95,\n",
-        "        repetition_penalty=1.05,\n",
-        "        return_full_text=False,\n",
-        "        pad_token_id=tokenizer.pad_token_id,\n",
-        "        eos_token_id=tokenizer.eos_token_id\n",
-        "    )\n",
-        "\n",
-        "    prompt_data = build_prompts(need)\n",
-        "    print(f\"🚀 Starting generation for {len(prompt_data)} recipes...\")\n",
-        "\n",
-        "    def run_with_batchsize(prompts, batch_size):\n",
-        "        \"\"\"Run the pipeline over `prompts` at the given batch size inside torch.inference_mode().\"\"\"\n",
-        "        with torch.inference_mode():\n",
-        "            return pipe(prompts, batch_size=batch_size, **gen_kwargs)\n",
-        "\n",
-        "    start = time.time()\n",
-        "    written = 0\n",
-        "\n",
-        "    # Append mode keeps rows from earlier runs; each chunk is flushed once it has been written.\n",
-        "    with open(OUT_JSONL, \"a\", encoding=\"utf-8\") as f_out:\n",
-        "        for i in tqdm(range(0, len(prompt_data), SAVE_EVERY), desc=\"Generating chunks\"):\n",
-        "            chunk = prompt_data[i:i+SAVE_EVERY]\n",
-        "            chunk_prompts = [x[\"prompt\"] for x in chunk]\n",
-        "\n",
-        "            hit_oom = False\n",
-        "            try:\n",
-        "                results = run_with_batchsize(chunk_prompts, BATCH_SIZE)\n",
-        "            except RuntimeError as e:\n",
-        "                if \"out of memory\" not in str(e).lower():\n",
-        "                    raise\n",
-        "                hit_oom = True\n",
-        "\n",
-        "            # Recover outside the except block: while the exception is alive, its traceback keeps the\n",
-        "            # failed batch's tensors referenced, so empty_cache() could not release them.\n",
-        "            if hit_oom:\n",
-        "                torch.cuda.empty_cache()\n",
-        "                print(\"⚠️ OOM detected. Retrying with reduced batch size (8)...\")\n",
-        "                results = run_with_batchsize(chunk_prompts, 8)\n",
-        "\n",
-        "            for j, out in enumerate(results):\n",
-        "                gen_text = out[0][\"generated_text\"] if isinstance(out, list) else out.get(\"generated_text\", \"\")\n",
-        "\n",
-        "                clean_text = gen_text.strip()\n",
-        "                title, ingreds, instrs, raw = parse_recipe(clean_text, chunk[j][\"title\"])\n",
-        "\n",
-        "                row = {\n",
-        "                    \"Title\": title,\n",
-        "                    \"Ingredients\": ingreds,\n",
-        "                    \"Instructions\": instrs,\n",
-        "                    \"Raw_Output\": raw\n",
-        "                }\n",
-        "                f_out.write(json.dumps(row, ensure_ascii=False) + \"\\n\")\n",
-        "                written += 1\n",
-        "\n",
-        "            f_out.flush()\n",
-        "\n",
-        "    elapsed = time.time() - start\n",
-        "    print(f\"✅ Generation done! {written} recipes in {elapsed/60:.1f} minutes.\")\n",
-        "\n",
-        "else:\n",
-        "    print(\"✅ Target reached. No new generation needed.\")\n",
-        "\n",
-        "# ----------------\n",
-        "# 7) EXPORT TO CSV\n",
-        "# ----------------\n",
-        "# Re-read every JSONL row (rows written by earlier runs included) and write them out as one CSV.\n",
-        "print(\"Exporting to CSV...\")\n",
-        "with open(OUT_JSONL, \"r\", encoding=\"utf-8\") as f:\n",
-        "    rows = [json.loads(line) for line in f]\n",
-        "\n",
-        "df = pd.DataFrame(rows)\n",
-        "df.to_csv(OUT_CSV, index=False)\n",
-        "print(f\"🎉 FINAL SUCCESS! Saved '{OUT_CSV}' with {len(df)} recipes.\")\n",
-        "preview = df[['Title', 'Ingredients']].head()\n",
-        "print(preview)"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 379,
-          "referenced_widgets": [
-            "362ad3c800864e88b4718c36c61aff6f",
-            "04671a1d41404a3f8d3118d963162d55",
-            "c140514ea9094d0a83d0eb871e1c96d8",
-            "db4e7a6835774140a26c28d8af93457b",
-            "7c570d0c1dce4a218f7a9d537ceb2b43",
-            "f7b4a83a6921499c841a38ec75c09d27",
-            "11ebd72684464498bd59b5677d26fb6f",
-            "baee00ce0df8491c8813b15e1341e545",
-            "01d8bca7a75a41249a9af5c40d397286",
-            "1fe94bdf961f4fa1bb724499ab5ce5e3",
-            "5efb08d4cf874b9c8c424f09cdd2f1e8"
-          ]
-        },
-        "id": "WhYOWuJPXLcT",
-        "outputId": "ce796ad9-b1d3-4a7e-bfab-a6118c763c3c"
-      },
-      "execution_count": 14,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "CUDA Available: True\n"
-          ]
-        },
-        {
-          "output_type": "display_data",
-          "data": {
-            "text/plain": [
-              "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
-            ],
-            "application/vnd.jupyter.widget-view+json": {
-              "version_major": 2,
-              "version_minor": 0,
-              "model_id": "362ad3c800864e88b4718c36c61aff6f"
-            }
-          },
-          "metadata": {}
-        },
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "Device set to use cuda:0\n"
-          ]
-        },
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Found existing 10000 rows. Resuming...\n",
-            "✅ Target reached. No new generation needed.\n",
-            "Exporting to CSV...\n",
-            "🎉 FINAL SUCCESS! Saved 'RecipeData_10K.csv' with 10000 recipes.\n",
-            "                                            Title  \\\n",
-            "0          Zesty Mediterranean Lamb Chops Platter   \n",
-            "1         Szechuan Asian_Fusion Tofu with Cashews   \n",
-            "2         Zesty Mediterranean Hummus Plate Medley   \n",
-            "3           Tuscan Italian Ravioli with Mushrooms   \n",
-            "4  Lemon Mediterranean Shawarma with Yogurt Sauce   \n",
-            "\n",
-            "                                         Ingredients  \n",
-            "0  - Lamb Chops 6\\n- Lemon (freshly squeezed) 1\\n...  \n",
-            "1  - Tofu 500g\\n- Cashews 100g\\n- Soy Sauce 3 tbs...  \n",
-            "2  - Chickpeas 500g\\n- Olive Oil 2 tbsp\\n- Lemon ...  \n",
-            "3  - Flour 500g\\n- Eggs 3\\n- Fillings: ricotta ch...  \n",
-            "4  - Chicken or lamb (1kg)\\n- Olive oil\\n- Lemon ...  \n"
-          ]
-        }
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "source": [],
-      "metadata": {
-        "id": "RUYFuxuXqJmB"
-      }
-    }
-  ]
-}