Spaces:

ESCP
/

codingworkshop_group4B

Sleeping

File size: 15,966 Bytes

{
 "nbformat": 4,
 "nbformat_minor": 5,
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.0"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# \ud83c\udf77 **Wine Data Analysis & Visualization**\n---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## **1.** \ud83d\udce6 Install required packages"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# %pip (not !pip) installs into the environment of the running kernel.\n",
    "%pip install pandas matplotlib seaborn numpy vaderSentiment statsmodels scikit-learn"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## **2.** \u2705 Load & inspect input datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *a. Initial setup*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "import pandas as pd\nimport numpy as np\nimport random"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *b. Load reviews and sales datasets*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Both input CSVs are read via relative paths (resolved against the working directory).\n",
    "df_reviews, df_sales = (\n",
    "    pd.read_csv(csv_name)\n",
    "    for csv_name in (\"synthetic_wine_reviews.csv\", \"synthetic_wine_sales.csv\")\n",
    ")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *c. View first few lines*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Preview the first rows of each input table: sales first, then reviews.\n",
    "for preview_frame in (df_sales, df_reviews):\n",
    "    print(preview_frame.head())"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *d. Quality check*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "def quality_check(df, name=\"DataFrame\"):\n",
    "    \"\"\"Print a plain-text data-quality report for ``df``.\n",
    "\n",
    "    The report covers, in order: shape, column dtypes, missing values per\n",
    "    column, the number of duplicated rows and ``df.describe()``.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame to inspect.\n",
    "        name: Label used in the report header.\n",
    "    \"\"\"\n",
    "    rule_width = len(name) + 25\n",
    "\n",
    "    print(f\"\\n\ud83d\udd0d Quality Check Report for: {name}\")\n",
    "    print(\"=\" * rule_width)\n",
    "    print(f\"\\n\ud83d\udccf Shape: {df.shape}\")\n",
    "    print(\"\\n\ud83d\udd20 Column Types:\", df.dtypes, sep=\"\\n\")\n",
    "    print(\"\\n\u2753 Missing Values:\", df.isnull().sum(), sep=\"\\n\")\n",
    "\n",
    "    n_duplicates = df.duplicated().sum()\n",
    "    print(f\"\\n\ud83d\udccb Duplicate Rows: {n_duplicates}\")\n",
    "    print(\"\\n\ud83d\udcca Summary Statistics:\", df.describe(), sep=\"\\n\")\n",
    "\n",
    "\n",
    "# Run the report on both input tables.\n",
    "for frame, label in ((df_reviews, \"Wine Reviews\"), (df_sales, \"Wine Sales\")):\n",
    "    quality_check(frame, label)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## **3.** \ud83c\udfad Perform sentiment analysis using VADER"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *a. Initial setup*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n\n# Module-level analyzer instance; get_sentiment_label() calls analyzer.polarity_scores().\nanalyzer = SentimentIntensityAnalyzer()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *b. Create sentiment classification function*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "def get_sentiment_label(text):\n",
    "    \"\"\"Classify ``text`` as \"positive\", \"negative\" or \"neutral\" with VADER.\n",
    "\n",
    "    The label comes from the compound score returned by\n",
    "    ``analyzer.polarity_scores``: >= 0.05 is positive, <= -0.05 is negative,\n",
    "    anything in between is neutral.\n",
    "\n",
    "    Missing or non-string values (for example NaN read from an empty CSV\n",
    "    cell) and blank strings are labelled \"neutral\" without being passed\n",
    "    to the analyzer.\n",
    "    \"\"\"\n",
    "    if not isinstance(text, str) or not text.strip():\n",
    "        return \"neutral\"\n",
    "\n",
    "    score = analyzer.polarity_scores(text)[\"compound\"]\n",
    "    if score >= 0.05:\n",
    "        return \"positive\"\n",
    "    if score <= -0.05:\n",
    "        return \"negative\"\n",
    "    return \"neutral\""
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *c. Apply VADER to reviews*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Label every review text and store the result in a new \"vader_sentiment\" column.\n",
    "review_texts = df_reviews[\"review_text\"]\n",
    "df_reviews[\"vader_sentiment\"] = review_texts.apply(get_sentiment_label)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *d. Compare VADER results with original labels*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Share of reviews whose VADER label equals the original \"sentiment_label\" column.\n",
    "labels_agree = df_reviews[\"vader_sentiment\"].eq(df_reviews[\"sentiment_label\"])\n",
    "match_rate = labels_agree.mean()\n",
    "print(f\"VADER agreement with original labels: {match_rate:.1%}\")\n",
    "\n",
    "vader_distribution = df_reviews[\"vader_sentiment\"].value_counts()\n",
    "print(\"\\nVADER sentiment distribution:\", vader_distribution, sep=\"\\n\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## **4.** \ud83d\udcca Data Visualization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *a. Initial setup*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "import matplotlib.pyplot as plt\nimport seaborn as sns\nimport matplotlib.dates as mdates"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *b. Output folders for HF app*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "from pathlib import Path\n",
    "\n",
    "# Artifact layout: artifacts/py/figures and artifacts/py/tables.\n",
    "ART_DIR = Path(\"artifacts\")\n",
    "PY_FIG = ART_DIR.joinpath(\"py\", \"figures\")\n",
    "PY_TAB = ART_DIR.joinpath(\"py\", \"tables\")\n",
    "\n",
    "# Create both leaf folders and any missing parents; existing folders are accepted.\n",
    "for out_dir in (PY_FIG, PY_TAB):\n",
    "    out_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "print(\"\u2705 Output folders ready\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *c. Sample 5 wines per popularity level*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# A fixed seed keeps the sampled wines, and every figure/table built from them,\n",
    "# identical across Restart & Run All.\n",
    "SAMPLE_SEED = 42\n",
    "WINES_PER_LEVEL = 5\n",
    "sampler = random.Random(SAMPLE_SEED)\n",
    "\n",
    "sampled_titles = []\n",
    "for pop_score in sorted(df_reviews[\"popularity_score\"].dropna().unique()):\n",
    "    level_titles = (\n",
    "        df_reviews.loc[df_reviews[\"popularity_score\"] == pop_score, \"title\"]\n",
    "        .dropna()  # NaN never compares equal, so a missing title would select no rows later\n",
    "        .unique()\n",
    "    )\n",
    "    picked = sampler.sample(list(level_titles), min(WINES_PER_LEVEL, len(level_titles)))\n",
    "    sampled_titles.extend(picked)\n",
    "\n",
    "print(f\"Sampled {len(sampled_titles)} wines for visualization\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *d. Prepare sampled data*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Boolean masks selecting the rows that belong to the sampled wines.\n",
    "in_sample_sales = df_sales[\"title\"].isin(sampled_titles)\n",
    "in_sample_reviews = df_reviews[\"title\"].isin(sampled_titles)\n",
    "\n",
    "# Independent copies, so columns added later do not touch the source tables.\n",
    "sampled_sales = df_sales.loc[in_sample_sales].copy()\n",
    "sampled_reviews = df_reviews.loc[in_sample_reviews].copy()\n",
    "sampled_wines = df_reviews.loc[in_sample_reviews].copy()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *e. Plot sales trends over time*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Parse months on the full sales table (safe to re-run).\n",
    "df_sales[\"month\"] = pd.to_datetime(df_sales[\"month\"])\n",
    "\n",
    "popularity_colors = {\n",
    "    1: \"darkred\", 2: \"orangered\", 3: \"gold\", 4: \"mediumseagreen\", 5: \"royalblue\"\n",
    "}\n",
    "\n",
    "# sampled_sales was copied from df_sales before the conversion above, so its\n",
    "# \"month\" column is parsed here as well; unparsed string months would be drawn\n",
    "# as categories instead of on a time axis. Sorting keeps each line chronological.\n",
    "plot_sales = sampled_sales.assign(\n",
    "    month=pd.to_datetime(sampled_sales[\"month\"])\n",
    ").sort_values(\"month\")\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(20, 8))\n",
    "for title in sampled_titles:\n",
    "    # Colour comes from the popularity score on the wine's first review row.\n",
    "    row = sampled_wines[sampled_wines[\"title\"] == title].iloc[0]\n",
    "    color = popularity_colors.get(row[\"popularity_score\"], \"gray\")\n",
    "    subset = plot_sales[plot_sales[\"title\"] == title]\n",
    "    ax.plot(subset[\"month\"], subset[\"units_sold\"], color=color, alpha=0.6, linewidth=1.2)\n",
    "\n",
    "# One legend entry per popularity level rather than one per wine.\n",
    "legend_elements = [plt.Line2D([0], [0], color=c, lw=2, label=f\"Popularity {p}\")\n",
    "                   for p, c in sorted(popularity_colors.items())]\n",
    "ax.legend(handles=legend_elements, fontsize=10)\n",
    "ax.set_title(\"Wine Sales Trends by Popularity Score (Sampled)\", fontsize=14)\n",
    "ax.set_xlabel(\"Month\")\n",
    "ax.set_ylabel(\"Units Sold\")\n",
    "fig.tight_layout()\n",
    "fig.savefig(PY_FIG / \"sales_trends.png\", dpi=150)\n",
    "plt.show()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *f. Sentiment distribution per wine*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "SENTIMENT_ORDER = [\"negative\", \"neutral\", \"positive\"]\n",
    "\n",
    "# Row label: \"<points>pts | <first 30 characters of the title>\".\n",
    "sampled_reviews[\"grouped_title\"] = (\n",
    "    sampled_reviews[\"points\"].astype(str) + \"pts | \" + sampled_reviews[\"title\"].str[:30]\n",
    ")\n",
    "\n",
    "sentiment_counts = (\n",
    "    sampled_reviews.groupby([\"grouped_title\", \"vader_sentiment\"])\n",
    "    .size()\n",
    "    .unstack(fill_value=0)\n",
    "    # reindex instead of [[...]]: a label that never occurs in the sample becomes\n",
    "    # an all-zero column rather than raising KeyError, and the column order stays\n",
    "    # aligned with the colour list used for the bars.\n",
    "    .reindex(columns=SENTIMENT_ORDER, fill_value=0)\n",
    ")\n",
    "\n",
    "sentiment_counts.reset_index().to_csv(PY_TAB / \"sentiment_counts_sampled.csv\", index=False)\n",
    "\n",
    "sentiment_counts.plot(kind=\"barh\", stacked=True, figsize=(14, 12),\n",
    "                      color=[\"#e74c3c\", \"#f39c12\", \"#2ecc71\"])\n",
    "plt.title(\"Sentiment Distribution by Wine (VADER)\", fontsize=14)\n",
    "plt.xlabel(\"Number of Reviews\")\n",
    "plt.ylabel(\"\")\n",
    "plt.tight_layout()\n",
    "plt.savefig(PY_FIG / \"sentiment_distribution.png\", dpi=150)\n",
    "plt.show()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *g. Price vs Points scatter*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Scatter of price against points; marker colour also encodes the points value.\n",
    "df_wines = pd.read_csv(\"wine_data.csv\")\n",
    "wine_points = df_wines[\"points\"]\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "scatter = ax.scatter(wine_points, df_wines[\"price\"],\n",
    "                     alpha=0.4, c=wine_points, cmap=\"viridis\", s=20)\n",
    "fig.colorbar(scatter, ax=ax, label=\"Points\")\n",
    "ax.set_title(\"Wine Price vs. Rating (Points)\", fontsize=14)\n",
    "ax.set_xlabel(\"Points (Rating)\")\n",
    "ax.set_ylabel(\"Price (USD)\")\n",
    "fig.tight_layout()\n",
    "fig.savefig(PY_FIG / \"price_vs_points.png\", dpi=150)\n",
    "plt.show()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## **5.** \ud83d\udd2e Forecast wine sales with ARIMA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *a. Initial setup*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "import statsmodels.api as sm\nfrom itertools import product as iter_product\nimport matplotlib.cm as cm"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *b. Find best ARIMA parameters*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "def find_best_arima(series, p_range=(0, 5), d_range=(0, 2), q_range=(0, 1)):\n",
    "    \"\"\"Grid-search ARIMA(p, d, q) orders and keep the fit with the lowest AIC.\n",
    "\n",
    "    Args:\n",
    "        series: Observations passed unchanged to ``sm.tsa.ARIMA``.\n",
    "        p_range: Inclusive (min, max) bounds for p.\n",
    "        d_range: Inclusive (min, max) bounds for d.\n",
    "        q_range: Inclusive (min, max) bounds for q.\n",
    "\n",
    "    Returns:\n",
    "        ``(best_model, best_order)``: the fitted results object and its\n",
    "        ``(p, d, q)`` tuple, or ``(None, None)`` when no candidate was fitted.\n",
    "    \"\"\"\n",
    "    best_aic = float(\"inf\")\n",
    "    best_order = None\n",
    "    best_model = None\n",
    "\n",
    "    candidate_orders = iter_product(\n",
    "        range(p_range[0], p_range[1] + 1),\n",
    "        range(d_range[0], d_range[1] + 1),\n",
    "        range(q_range[0], q_range[1] + 1),\n",
    "    )\n",
    "    for order in candidate_orders:\n",
    "        try:\n",
    "            fitted = sm.tsa.ARIMA(series, order=order).fit()\n",
    "            if fitted.aic < best_aic:\n",
    "                best_aic = fitted.aic\n",
    "                best_order = order\n",
    "                best_model = fitted\n",
    "        except Exception:\n",
    "            # Skip orders that cannot be fitted. Catching Exception rather than\n",
    "            # using a bare except lets KeyboardInterrupt/SystemExit stop the search.\n",
    "            continue\n",
    "\n",
    "    return best_model, best_order"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *c. Forecast and plot*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "FORECAST_HORIZON = 6  # number of months forecast past the last observation\n",
    "MIN_OBSERVATIONS = 6  # wines with fewer sales rows are skipped\n",
    "\n",
    "colors = sns.color_palette(\"tab10\", len(sampled_titles))\n",
    "\n",
    "plt.figure(figsize=(16, 10))\n",
    "\n",
    "for i, title in enumerate(sampled_titles):\n",
    "    wine_sales = sampled_sales[sampled_sales[\"title\"] == title].copy()\n",
    "    wine_sales[\"month\"] = pd.to_datetime(wine_sales[\"month\"])\n",
    "    wine_sales = wine_sales.sort_values(\"month\").set_index(\"month\")\n",
    "    ts = wine_sales[\"units_sold\"]\n",
    "\n",
    "    if len(ts) < MIN_OBSERVATIONS:\n",
    "        continue\n",
    "\n",
    "    model, order = find_best_arima(ts)\n",
    "    if model is None:\n",
    "        continue\n",
    "\n",
    "    forecast = model.forecast(steps=FORECAST_HORIZON)\n",
    "\n",
    "    # Step forward one calendar month at a time from the last observation.\n",
    "    # date_range(..., freq=\"M\") snaps to month-end, which shifts the forecast\n",
    "    # when the history is dated at month-start, and the \"M\" alias is deprecated\n",
    "    # in recent pandas releases.\n",
    "    last_month = ts.index[-1]\n",
    "    future_dates = pd.DatetimeIndex(\n",
    "        [last_month + pd.DateOffset(months=step) for step in range(1, FORECAST_HORIZON + 1)]\n",
    "    )\n",
    "\n",
    "    # Solid line: history. Dashed line in the same colour: forecast.\n",
    "    plt.plot(ts.index, ts.values, color=colors[i], alpha=0.6, linewidth=1)\n",
    "    plt.plot(future_dates, np.asarray(forecast), color=colors[i], linestyle=\"--\", alpha=0.8, linewidth=1)\n",
    "\n",
    "plt.title(f\"ARIMA Sales Forecast for Sampled Wines ({FORECAST_HORIZON}-Month Horizon)\", fontsize=14)\n",
    "plt.xlabel(\"Month\")\n",
    "plt.ylabel(\"Units Sold\")\n",
    "plt.tight_layout()\n",
    "plt.savefig(PY_FIG / \"arima_forecasts.png\", dpi=150)\n",
    "plt.show()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## **6.** \ud83c\udff7\ufe0f Pricing decisions based on sentiment & sales"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *a. Average sales per wine*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Mean of units_sold over all sales rows of each title, as columns (title, avg_units_sold).\n",
    "avg_sales = (\n",
    "    df_sales.groupby(\"title\")[\"units_sold\"]\n",
    "    .mean()\n",
    "    .rename(\"avg_units_sold\")\n",
    "    .reset_index()\n",
    ")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *b. Sentiment counts and ratios per wine*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# One row per title, one column per VADER label, filled with review counts.\n",
    "sent_counts = (\n",
    "    df_reviews.groupby([\"title\", \"vader_sentiment\"])\n",
    "    .size()\n",
    "    .unstack(fill_value=0)\n",
    ")\n",
    "sent_counts[\"total\"] = sent_counts.sum(axis=1)\n",
    "\n",
    "# Share of positive / negative reviews; .get(..., 0) yields 0 when a label column is absent.\n",
    "for sentiment_name, ratio_column in ((\"positive\", \"positive_ratio\"), (\"negative\", \"negative_ratio\")):\n",
    "    sent_counts[ratio_column] = sent_counts.get(sentiment_name, 0) / sent_counts[\"total\"]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *c. Merge average sales with sentiment ratios*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Left join on title: every wine in avg_sales is kept; wines without a match in\n",
    "# sent_counts get NaN in the sentiment columns.\n",
    "df_decision = pd.merge(avg_sales, sent_counts, how=\"left\", on=\"title\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *d. Pricing decision function*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "def pricing_decision(row, high_sales=120, low_sales=60,\n",
    "                     min_positive_ratio=0.6, min_negative_ratio=0.4):\n",
    "    \"\"\"Map one wine's average sales and sentiment ratios to a pricing action.\n",
    "\n",
    "    Args:\n",
    "        row: Mapping with \"avg_units_sold\", \"positive_ratio\" and \"negative_ratio\".\n",
    "        high_sales: Minimum average units sold for a price increase.\n",
    "        low_sales: Maximum average units sold for a price decrease.\n",
    "        min_positive_ratio: Minimum positive-review share for a price increase.\n",
    "        min_negative_ratio: Minimum negative-review share for a price decrease.\n",
    "\n",
    "    Returns:\n",
    "        \"Increase Price\", \"Decrease Price\" or \"Keep Price\". NaN values fail every\n",
    "        comparison, so rows with missing ratios fall through to \"Keep Price\".\n",
    "    \"\"\"\n",
    "    if row[\"avg_units_sold\"] >= high_sales and row[\"positive_ratio\"] >= min_positive_ratio:\n",
    "        return \"Increase Price\"\n",
    "    if row[\"avg_units_sold\"] <= low_sales and row[\"negative_ratio\"] >= min_negative_ratio:\n",
    "        return \"Decrease Price\"\n",
    "    return \"Keep Price\"\n",
    "\n",
    "\n",
    "# Default thresholds are used; apply() passes each row as the first argument.\n",
    "df_decision[\"pricing_action\"] = df_decision.apply(pricing_decision, axis=1)\n",
    "print(df_decision[[\"title\", \"avg_units_sold\", \"positive_ratio\", \"negative_ratio\", \"pricing_action\"]].head(10))"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### *e. Pricing action distribution*"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Tally of recommended actions across all wines.\n",
    "action_counts = df_decision[\"pricing_action\"].value_counts()\n",
    "print(\"\\nPricing Action Distribution:\", action_counts, sep=\"\\n\")\n",
    "\n",
    "# Export only the columns listed here to the tables folder.\n",
    "export_columns = [\"title\", \"avg_units_sold\", \"positive_ratio\", \"negative_ratio\", \"pricing_action\"]\n",
    "df_decision[export_columns].to_csv(PY_TAB / \"pricing_decisions.csv\", index=False)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## **7.** \ud83d\udcbe Save outputs for HF dashboard"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "import json\n",
    "\n",
    "has_revenue = \"revenue\" in df_sales.columns\n",
    "\n",
    "# Dashboard table: one row per month.\n",
    "# NOTE(review): without a \"revenue\" column, total_revenue is filled with the\n",
    "# units_sold sum - confirm the dashboard expects that fallback.\n",
    "monthly_aggs = {\n",
    "    \"total_units_sold\": (\"units_sold\", \"sum\"),\n",
    "    \"total_revenue\": (\"revenue\", \"sum\") if has_revenue else (\"units_sold\", \"sum\"),\n",
    "}\n",
    "df_dashboard = df_sales.groupby(\"month\", as_index=False).agg(**monthly_aggs)\n",
    "df_dashboard.to_csv(PY_TAB / \"df_dashboard.csv\", index=False)\n",
    "\n",
    "# Top sellers: the 20 titles with the highest total units sold.\n",
    "units_by_title = df_sales.groupby(\"title\")[\"units_sold\"].sum().reset_index()\n",
    "top = units_by_title.sort_values(\"units_sold\", ascending=False).head(20)\n",
    "top.to_csv(PY_TAB / \"top_titles_by_units_sold.csv\", index=False)\n",
    "\n",
    "# KPIs: values are cast to built-in int/float before JSON serialisation.\n",
    "kpis = {\n",
    "    \"n_titles\": int(df_sales[\"title\"].nunique()),\n",
    "    \"n_months\": int(df_sales[\"month\"].nunique()),\n",
    "    \"total_units_sold\": int(df_sales[\"units_sold\"].sum()),\n",
    "}\n",
    "if has_revenue:\n",
    "    kpis[\"total_revenue\"] = round(float(df_sales[\"revenue\"].sum()), 2)\n",
    "\n",
    "with open(PY_TAB / \"kpis.json\", \"w\") as f:\n",
    "    json.dump(kpis, f)\n",
    "\n",
    "print(\"All artifacts saved!\")\n",
    "print(json.dumps(kpis, indent=2))"
   ],
   "outputs": [],
   "execution_count": null
  }
 ]
}