{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "kernelspec": { "name": "python3", "display_name": "Python 3" },
  "language_info": { "name": "python" }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Open Food Facts: rule-based health labels and synthetic reviews\n",
    "\n",
    "This notebook downloads the Open Food Facts product export, keeps a cleaned random sample of products, ",
    "assigns each one a rule-based health score and label, and generates short synthetic review texts per label.\n",
    "\n",
    "**Outputs:** `clean_food_products.csv` and `synthetic_food_reviews.csv`.\n",
    "\n",
    "**Data source:** https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import random\n",
    "import urllib.request\n",
    "import warnings\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tunable settings live here so the rest of the notebook has no magic numbers.\n",
    "SEED = 2025                # seed for the `random` and `numpy` generators\n",
    "SAMPLE_SEED = 42           # random_state used when sampling products\n",
    "DATA_URL = \"https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv.gz\"\n",
    "file_path = \"food_data.csv.gz\"\n",
    "N_ROWS = 50000             # rows read from the top of the export\n",
    "SAMPLE_SIZE = 3000         # upper bound on the number of products kept\n",
    "REVIEWS_PER_PRODUCT = 3    # upper bound on synthetic reviews per product\n",
    "\n",
    "# NOTE(review): this silences every warning for the whole session; narrow it if warnings matter.\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "random.seed(SEED)\n",
    "np.random.seed(SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download the dataset\n",
    "\n",
    "The download is skipped when the file is already present locally."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not os.path.exists(file_path):\n",
    "    print(\"Downloading dataset... this may take a few minutes.\")\n",
    "    # Download under a temporary name and rename only on success, so an interrupted\n",
    "    # download never leaves a truncated file that a later run mistakes for the dataset.\n",
    "    partial_path = file_path + \".part\"\n",
    "    try:\n",
    "        urllib.request.urlretrieve(DATA_URL, partial_path)\n",
    "        os.replace(partial_path, file_path)\n",
    "    finally:\n",
    "        # Only still present when the download or the rename failed.\n",
    "        if os.path.exists(partial_path):\n",
    "            os.remove(partial_path)\n",
    "    print(\"Download complete.\")\n",
    "else:\n",
    "    print(\"Dataset already exists.\")\n",
    "\n",
    "print(\"File ready:\", file_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load\n",
    "\n",
    "Only the columns needed for scoring are read, and only the first `N_ROWS` rows of the tab-separated export."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns_to_keep = [\n",
    "    \"product_name\",\n",
    "    \"categories\",\n",
    "    \"energy-kcal_100g\",\n",
    "    \"fat_100g\",\n",
    "    \"sugars_100g\",\n",
    "    \"salt_100g\",\n",
    "    \"proteins_100g\",\n",
    "    \"fiber_100g\",\n",
    "    \"nutriscore_grade\",\n",
    "    \"ingredients_text\"\n",
    "]\n",
    "\n",
    "food_raw = pd.read_csv(\n",
    "    file_path,\n",
    "    sep=\"\\t\",\n",
    "    compression=\"gzip\",\n",
    "    usecols=columns_to_keep,\n",
    "    low_memory=False,\n",
    "    nrows=N_ROWS\n",
    ")\n",
    "\n",
    "print(\"Initial shape:\", food_raw.shape)\n",
    "food_raw.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clean\n",
    "\n",
    "Rows without a product name or category are dropped, nutrient columns are coerced to numbers, ",
    "and rows missing energy, sugars, fat or salt are dropped. Fiber and proteins may stay missing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "numeric_cols = [\n",
    "    \"energy-kcal_100g\",\n",
    "    \"fat_100g\",\n",
    "    \"sugars_100g\",\n",
    "    \"salt_100g\",\n",
    "    \"proteins_100g\",\n",
    "    \"fiber_100g\"\n",
    "]\n",
    "required_nutrients = [\n",
    "    \"energy-kcal_100g\",\n",
    "    \"sugars_100g\",\n",
    "    \"fat_100g\",\n",
    "    \"salt_100g\"\n",
    "]\n",
    "\n",
    "# .copy() yields an independent frame, so the assignments below never touch food_raw\n",
    "# and re-running this cell always starts from the same input.\n",
    "food_clean = food_raw.dropna(subset=[\"product_name\", \"categories\"]).copy()\n",
    "\n",
    "# Values that cannot be parsed become NaN; required nutrients are then dropped below.\n",
    "for col in numeric_cols:\n",
    "    food_clean[col] = pd.to_numeric(food_clean[col], errors=\"coerce\")\n",
    "\n",
    "food_clean = food_clean.dropna(subset=required_nutrients)\n",
    "assert len(food_clean) > 0, \"No products left after cleaning; check the input file.\"\n",
    "\n",
    "print(\"After cleaning:\", food_clean.shape)\n",
    "food_clean.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample\n",
    "\n",
    "A fixed-seed random sample of at most `SAMPLE_SIZE` products is kept for labelling."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cap the sample at the number of available rows: DataFrame.sample raises ValueError\n",
    "# when n exceeds the population and sampling is without replacement.\n",
    "n_sample = min(SAMPLE_SIZE, len(food_clean))\n",
    "df_food = food_clean.sample(n=n_sample, random_state=SAMPLE_SEED).reset_index(drop=True)\n",
    "\n",
    "print(\"Final sampled shape:\", df_food.shape)\n",
    "df_food.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Health score and label\n",
    "\n",
    "The score ranges from 0 to 8: up to 2 points each for low sugars, fat and salt, ",
    "plus 1 point each for fiber >= 3 and proteins >= 5 (per 100 g columns). ",
    "Scores of 0-2 are labelled `unhealthy`, 3-5 `moderate`, and 6-8 `healthy`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _tier_points(value, best_max, ok_max):\n",
    "    \"\"\"Return 2 if value <= best_max, 1 if value <= ok_max, otherwise 0 (NaN scores 0).\"\"\"\n",
    "    if value <= best_max:\n",
    "        return 2\n",
    "    if value <= ok_max:\n",
    "        return 1\n",
    "    return 0\n",
    "\n",
    "\n",
    "def generate_health_score(row):\n",
    "    \"\"\"Compute the rule-based health score (0-8) for one product row.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    row : mapping-like (e.g. a pandas Series)\n",
    "        Must provide \"sugars_100g\", \"fat_100g\", \"salt_100g\", \"fiber_100g\"\n",
    "        and \"proteins_100g\". Fiber and proteins may be missing (NaN).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    int\n",
    "        Sum of the tier points for sugars, fat and salt plus one bonus point\n",
    "        each for fiber >= 3 and proteins >= 5.\n",
    "    \"\"\"\n",
    "    score = 0\n",
    "    score += _tier_points(row[\"sugars_100g\"], 5, 12)\n",
    "    score += _tier_points(row[\"fat_100g\"], 3, 10)\n",
    "    score += _tier_points(row[\"salt_100g\"], 0.3, 1.5)\n",
    "\n",
    "    # Fiber and proteins are optional columns: a missing value earns no bonus.\n",
    "    if pd.notna(row[\"fiber_100g\"]) and row[\"fiber_100g\"] >= 3:\n",
    "        score += 1\n",
    "    if pd.notna(row[\"proteins_100g\"]) and row[\"proteins_100g\"] >= 5:\n",
    "        score += 1\n",
    "\n",
    "    return score\n",
    "\n",
    "\n",
    "def get_health_label(score):\n",
    "    \"\"\"Map a health score to \"unhealthy\" (<= 2), \"moderate\" (<= 5) or \"healthy\" (> 5).\"\"\"\n",
    "    if score <= 2:\n",
    "        return \"unhealthy\"\n",
    "    elif score <= 5:\n",
    "        return \"moderate\"\n",
    "    else:\n",
    "        return \"healthy\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_food[\"health_score\"] = df_food.apply(generate_health_score, axis=1)\n",
    "df_food[\"health_label\"] = df_food[\"health_score\"].apply(get_health_label)\n",
    "df_food[\"health_label\"].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Synthetic reviews\n",
    "\n",
    "Each product receives up to `REVIEWS_PER_PRODUCT` template reviews drawn, in random order, ",
    "from the templates for its health label."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "synthetic_reviews_by_health = {\n",
    "    \"healthy\": [\n",
    "        \"Healthy choice\",\n",
    "        \"Good nutrition\",\n",
    "        \"Balanced product\"\n",
    "    ],\n",
    "    \"moderate\": [\n",
    "        \"Okay product\",\n",
    "        \"Average choice\",\n",
    "        \"Moderate nutrition\"\n",
    "    ],\n",
    "    \"unhealthy\": [\n",
    "        \"Too much sugar\",\n",
    "        \"Not healthy\",\n",
    "        \"Should be avoided\"\n",
    "    ]\n",
    "}\n",
    "\n",
    "# A generator local to this cell makes the cell give the same reviews every time it is\n",
    "# re-run, instead of depending on how much of the global `random` state was consumed.\n",
    "review_rng = random.Random(SEED)\n",
    "\n",
    "review_rows = []\n",
    "for product_name, health_label in zip(df_food[\"product_name\"], df_food[\"health_label\"]):\n",
    "    templates = synthetic_reviews_by_health[health_label]\n",
    "    # Never ask for more reviews than there are templates (sample() would raise ValueError).\n",
    "    n_reviews = min(REVIEWS_PER_PRODUCT, len(templates))\n",
    "    for review in review_rng.sample(templates, n_reviews):\n",
    "        review_rows.append({\n",
    "            \"product_name\": product_name,\n",
    "            \"health_label\": health_label,\n",
    "            \"review_text\": review\n",
    "        })\n",
    "\n",
    "# Explicit columns keep the schema stable even if no rows were generated.\n",
    "df_reviews = pd.DataFrame(review_rows, columns=[\"product_name\", \"health_label\", \"review_text\"])\n",
    "df_reviews.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_food.to_csv(\"clean_food_products.csv\", index=False)\n",
    "df_reviews.to_csv(\"synthetic_food_reviews.csv\", index=False)\n",
    "\n",
    "print(\"Saved files successfully\")"
   ]
  }
 ]
}