Spaces:

ESCP
/

RX12-BubbleBustersGroup12

Sleeping

File size: 22,186 Bytes
{
 "nbformat": 4,
 "nbformat_minor": 5,
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "# **🤖 Data Analysis & Visualization**"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "## **1.** 📦 Install required packages"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "# %pip (unlike !pip) installs into the environment of the kernel that runs this notebook.\n%pip install pandas matplotlib seaborn numpy scipy"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "## **2.** ✅️ Load & inspect the dataset"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *a. Initial setup*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "import pandas as pd\nimport numpy as np\nimport json, re, warnings\nfrom collections import Counter\nfrom pathlib import Path\nwarnings.filterwarnings(\"ignore\")\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *b. ✋🏻🛑⛔️ Create the df dataframe from the ai_bubble_clean.csv file*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "# Load the cleaned comment dataset (CSV expected in the working directory).\ndf = pd.read_csv(\"ai_bubble_clean.csv\")\n\n# read_csv is called without parse_dates, so convert both date columns explicitly.\nfor date_col in (\"Date\", \"YearMonth\"):\n    df[date_col] = pd.to_datetime(df[date_col])"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *c. ✋🏻🛑⛔️ Visualize the first few lines of df*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "# Preview the first five rows (5 is the pandas default for head()).\ndf.head(n=5)"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *d. Run a quality check on the dataset*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "def quality_check(df, name=\"DataFrame\"):\n    \"\"\"Print and display a quick data-quality report for a DataFrame.\n\n    Parameters\n    ----------\n    df : pandas.DataFrame\n        Frame to inspect; it is only read, never modified.\n    name : str, default \"DataFrame\"\n        Label shown in the report heading.\n\n    The report lists the shape, dtypes, per-column missing counts, the number\n    of duplicated rows, ``describe(include=\"all\")`` transposed, and a random\n    sample of at most five rows. Nothing is returned.\n    \"\"\"\n    underline = \"=\" * (len(name) + 25)\n    print(f\"\\n🔍 Quality Check Report for: {name}\")\n    print(underline)\n    print(f\"\\n📏 Shape: {df.shape}\")\n\n    # Plain-text sections: a heading line followed by the printed Series.\n    text_sections = [\n        (\"\\n🔠 Column Types:\", df.dtypes),\n        (\"\\n❓ Missing Values:\", df.isnull().sum()),\n    ]\n    for heading, values in text_sections:\n        print(heading)\n        print(values)\n\n    n_duplicates = df.duplicated().sum()\n    print(f\"\\n📋 Duplicate Rows: {n_duplicates}\")\n\n    # Rich sections go through the notebook's display() so tables render as HTML.\n    print(\"\\n📊 Summary Statistics:\")\n    summary = df.describe(include=\"all\").transpose()\n    display(summary)\n\n    print(\"\\n👀 Sample Rows:\")\n    sample_size = min(5, len(df))  # sample() cannot draw more rows than the frame holds\n    display(df.sample(sample_size))\n\n\nquality_check(df, \"AI Bubble Sentiment Dataset\")\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "## **3.** 📊 Set up output folders and plot style"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *a. Create artifact folders for the Hugging Face dashboard*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "import matplotlib\nmatplotlib.use(\"Agg\")  # choose the non-interactive Agg backend before pyplot is imported\nimport matplotlib.pyplot as plt\nimport matplotlib.dates as mdates\nfrom matplotlib.ticker import MaxNLocator\nimport seaborn as sns\n\n# Artifact layout: artifacts/py/figures for images, artifacts/py/tables for tables.\nART_DIR = Path(\"artifacts\")\nPY_FIG  = ART_DIR.joinpath(\"py\", \"figures\")\nPY_TAB  = ART_DIR.joinpath(\"py\", \"tables\")\n\n# Create both folders (and any missing parents); existing folders are left alone.\nfor out_dir in (PY_FIG, PY_TAB):\n    out_dir.mkdir(parents=True, exist_ok=True)\n\n# Shared colours: one per sentiment label, plus the accent colour used for titles.\nPALETTE     = dict(bullish=\"#2ecc71\", neutral=\"#3498db\", bearish=\"#e74c3c\")\nESCP_PURPLE = \"#2e0052\"\nsns.set_theme(style=\"whitegrid\", font_scale=1.1)\n\nprint(\"✅ Output folders:\")\nfor out_dir in (PY_FIG, PY_TAB):\n    print(\" -\", out_dir.resolve())\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "## **4.** 🧭 Overall sentiment distribution"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *a. Compute sentiment, platform and topic counts*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "# Sentiment counts in a fixed label order. fill_value=0 keeps the counts as integers\n# when a label never occurs; a bare reindex would insert NaN and turn them into floats.\nsent_counts  = (df[\"Sentiment\"].value_counts()\n                  .reindex([\"bullish\", \"neutral\", \"bearish\"], fill_value=0))\n# Only the six most frequent platforms are kept for the overview chart.\nplat_counts  = df[\"Platform\"].value_counts().head(6)\ntopic_counts = df[\"Topic\"].value_counts()\nprint(sent_counts)\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *b. Plot the 3-panel overview figure*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "fig, axes = plt.subplots(1, 3, figsize=(16, 5))\nax_sent, ax_plat, ax_topic = axes\nfig.suptitle(\"AI Bubble Sentiment — Dataset Overview\", fontsize=15,\n             fontweight=\"bold\", color=ESCP_PURPLE)\n\n# Panel 1: pie chart of sentiment labels; each wedge label also carries the raw count.\npie_labels = [f\"{label.title()}\\n{count}\"\n              for label, count in zip(sent_counts.index, sent_counts)]\npie_colors = [PALETTE[label] for label in sent_counts.index]\nax_sent.pie(sent_counts, labels=pie_labels, colors=pie_colors,\n            autopct=\"%1.1f%%\", startangle=90,\n            wedgeprops={\"edgecolor\": \"white\", \"linewidth\": 2})\nax_sent.set_title(\"Sentiment Distribution\", fontweight=\"bold\")\n\n# Panel 2: horizontal bars per platform, drawn in reversed order of plat_counts.\nplat_names  = plat_counts.index[::-1]\nplat_values = plat_counts.values[::-1]\nax_plat.barh(plat_names, plat_values, color=ESCP_PURPLE, alpha=0.8)\nax_plat.set_title(\"Comments by Platform\", fontweight=\"bold\")\nax_plat.set_xlabel(\"Number of Comments\")\nfor row, count in enumerate(plat_values):\n    # Write the count just to the right of the bar end.\n    ax_plat.text(count + 0.3, row, str(count), va=\"center\", fontsize=9)\n\n# Panel 3: vertical bars per topic with the count printed above each bar.\ntopic_colors = [\"#9b59b6\", \"#3498db\", \"#e67e22\", \"#1abc9c\"]\nax_topic.bar(topic_counts.index, topic_counts.values, color=topic_colors,\n             edgecolor=\"white\", linewidth=1.5)\nax_topic.set_title(\"Comments by Topic\", fontweight=\"bold\")\nax_topic.set_ylabel(\"Count\")\nax_topic.set_xlabel(\"Topic\")\nfor col, count in enumerate(topic_counts.values):\n    ax_topic.text(col, count + 0.3, str(count), ha=\"center\", fontsize=9)\n\nplt.tight_layout()\nplt.savefig(PY_FIG / \"01_overview_distributions.png\", dpi=150)\nplt.show()\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "## **5.** 📈 Sentiment over time"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *a. Aggregate monthly comment counts per sentiment*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "# Comment counts per (month, sentiment); combinations that never occur become 0.\nmonthly = df.groupby([\"YearMonth\", \"Sentiment\"]).size().unstack(fill_value=0)\nmonthly = monthly.reindex(columns=[\"bullish\", \"neutral\", \"bearish\"], fill_value=0)\nmonthly.index = pd.to_datetime(monthly.index)\n\n# Mean SentScore per month, then a rolling mean over 3 rows (partial windows allowed).\n# NOTE(review): the window counts rows, so it spans three calendar months only if every month has data - confirm.\nmonthly_score = df.groupby(\"YearMonth\")[\"SentScore\"].mean()\nmonthly_score.index = pd.to_datetime(monthly_score.index)\nrolling_score = monthly_score.rolling(window=3, min_periods=1).mean()\n\n# Flat export table: month label, the three counts and the monthly mean score.\nmonthly_out = monthly.assign(avg_score=monthly_score)\nmonthly_out.index = monthly_out.index.strftime(\"%Y-%m\")\nmonthly_out = monthly_out.reset_index()\nmonthly_out.columns = [\"month\", \"bullish\", \"neutral\", \"bearish\", \"avg_score\"]\nmonthly_out.to_csv(PY_TAB / \"monthly_sentiment.csv\", index=False)\nprint(monthly_out.head())\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *b. Plot stacked area + rolling sentiment score*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "fig, (ax_volume, ax_score) = plt.subplots(2, 1, figsize=(14, 9), sharex=True)\nfig.suptitle(\"AI Bubble Sentiment Over Time\", fontsize=15,\n             fontweight=\"bold\", color=ESCP_PURPLE)\n\n# Top panel: monthly comment counts stacked in the order bullish, neutral, bearish.\nsentiment_keys = [\"bullish\", \"neutral\", \"bearish\"]\nax_volume.stackplot(\n    monthly.index,\n    *[monthly[key] for key in sentiment_keys],\n    labels=[\"Bullish\", \"Neutral\", \"Bearish\"],\n    colors=[PALETTE[key] for key in sentiment_keys],\n    alpha=0.75,\n)\nax_volume.set_ylabel(\"Number of Comments\")\nax_volume.set_title(\"Monthly Comment Volume by Sentiment\", fontweight=\"bold\")\nax_volume.legend(loc=\"upper left\", framealpha=0.8)\nax_volume.yaxis.set_major_locator(MaxNLocator(integer=True))\n\n# Bottom panel: rolling mean score, shaded with the bullish colour where it is >= 0\n# and with the bearish colour where it is < 0.\nscore_dates = rolling_score.index\nax_score.axhline(0, color=\"black\", lw=0.8, ls=\"--\", alpha=0.6)\nshaded_zones = [\n    (rolling_score >= 0, PALETTE[\"bullish\"], \"Bullish zone\"),\n    (rolling_score < 0, PALETTE[\"bearish\"], \"Bearish zone\"),\n]\nfor zone_mask, zone_color, zone_label in shaded_zones:\n    ax_score.fill_between(score_dates, rolling_score, 0,\n                          where=zone_mask, interpolate=True,\n                          color=zone_color, alpha=0.4, label=zone_label)\nax_score.plot(score_dates, rolling_score, color=\"black\", lw=1.5, label=\"3-month avg\")\nax_score.set_ylabel(\"Avg Sentiment Score\\n(+1=bullish, -1=bearish)\")\nax_score.set_xlabel(\"Date\")\nax_score.set_title(\"Rolling Average Sentiment Score (3-month window)\", fontweight=\"bold\")\nax_score.legend(loc=\"upper left\", framealpha=0.8)\n\n# One tick every fourth month, formatted like \"Jan '24\" and rotated for readability.\nax_score.xaxis.set_major_formatter(mdates.DateFormatter(\"%b '%y\"))\nax_score.xaxis.set_major_locator(mdates.MonthLocator(interval=4))\nplt.setp(ax_score.xaxis.get_majorticklabels(), rotation=45, ha=\"right\")\n\nplt.tight_layout()\nplt.savefig(PY_FIG / \"02_sentiment_over_time.png\", dpi=150)\nplt.show()\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "## **6.** 🔥 Sentiment breakdown by topic"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *a. ✋🏻🛑⛔️ Compute cross-tabulation of Topic vs Sentiment*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "# Topic x Sentiment count table. reindex (instead of [[...]] selection) fixes the column\n# order without raising a KeyError when a sentiment label is absent; such a column is 0.\ncross     = (pd.crosstab(df[\"Topic\"], df[\"Sentiment\"])\n               .reindex(columns=[\"bullish\", \"neutral\", \"bearish\"], fill_value=0))\n# Row-wise percentages: each count divided by its topic's total.\ncross_pct = cross.div(cross.sum(axis=1), axis=0) * 100\ncross.reset_index().to_csv(PY_TAB / \"sentiment_by_topic.csv\", index=False)\nprint(cross)\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *b. Plot absolute counts and % heatmaps side by side*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "fig, (ax_counts, ax_share) = plt.subplots(1, 2, figsize=(15, 5))\nfig.suptitle(\"Sentiment vs. Topic\", fontsize=15, fontweight=\"bold\", color=ESCP_PURPLE)\n\n# Cell borders shared by both heatmaps.\ngrid_style = {\"linewidths\": 0.5, \"linecolor\": \"white\"}\n\n# Left: raw counts from `cross`, annotated as integers.\nsns.heatmap(cross, annot=True, fmt=\"d\", cmap=\"YlOrRd\",\n            cbar_kws={\"label\": \"Count\"}, ax=ax_counts, **grid_style)\nax_counts.set_title(\"Absolute Comment Counts\", fontweight=\"bold\")\n\n# Right: percentages from `cross_pct`; the colour scale is pinned to 0-60,\n# so any share above 60 is drawn with the top colour.\nsns.heatmap(cross_pct, annot=True, fmt=\".1f\", cmap=\"RdYlGn\",\n            cbar_kws={\"label\": \"% within Topic\"}, ax=ax_share,\n            vmin=0, vmax=60, **grid_style)\nax_share.set_title(\"% of Comments per Topic\", fontweight=\"bold\")\n\nplt.tight_layout()\nplt.savefig(PY_FIG / \"03_sentiment_by_topic.png\", dpi=150)\nplt.show()\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "## **7.** 🌐 Sentiment breakdown by platform"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *a. Compute platform cross-tabulation*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "# Restrict the breakdown to the six platforms with the most comments.\ntop_platforms = df[\"Platform\"].value_counts().head(6).index\ndf_plat    = df[df[\"Platform\"].isin(top_platforms)]\n# reindex (instead of [[...]] selection) fixes the column order without raising a KeyError\n# when a sentiment label does not occur in this subset; such a column is filled with 0.\nplat_cross = (pd.crosstab(df_plat[\"Platform\"], df_plat[\"Sentiment\"])\n                .reindex(columns=[\"bullish\", \"neutral\", \"bearish\"], fill_value=0))\n# Row-wise percentages: each count divided by its platform's total.\nplat_pct   = plat_cross.div(plat_cross.sum(axis=1), axis=0) * 100\nplat_cross.reset_index().to_csv(PY_TAB / \"sentiment_by_platform.csv\", index=False)\nprint(plat_pct)\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *b. Plot stacked bar chart by platform*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "fig, ax = plt.subplots(figsize=(12, 6))\n\n# One stacked bar per platform; segments follow the column order of plat_pct.\nsegment_colors = [PALETTE[key] for key in (\"bullish\", \"neutral\", \"bearish\")]\nplat_pct.plot.bar(stacked=True, ax=ax, color=segment_colors,\n                  edgecolor=\"white\", linewidth=0.8)\n\nax.set_title(\"Sentiment Distribution by Platform (%)\", fontsize=14,\n             fontweight=\"bold\", color=ESCP_PURPLE)\nax.set_xlabel(\"Platform\")\nax.set_ylabel(\"% of Comments\")\nax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha=\"right\")\n# Legend sits just outside the right edge of the axes.\nax.legend(title=\"Sentiment\", bbox_to_anchor=(1.01, 1), loc=\"upper left\")\n# Dashed reference line at the 50 % mark.\nax.axhline(50, color=\"black\", ls=\"--\", alpha=0.4, lw=0.9)\n\nplt.tight_layout()\nplt.savefig(PY_FIG / \"04_sentiment_by_platform.png\", dpi=150)\nplt.show()\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "## **8.** 📅 Yearly sentiment shift"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *a. Aggregate by year*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "# Year x Sentiment count table. reindex (instead of [[...]] selection) fixes the column\n# order without raising a KeyError when a sentiment label is absent; such a column is 0.\nyearly     = (pd.crosstab(df[\"Year\"], df[\"Sentiment\"])\n                .reindex(columns=[\"bullish\", \"neutral\", \"bearish\"], fill_value=0))\n# Row-wise percentages: each count divided by its year's total.\nyearly_pct = yearly.div(yearly.sum(axis=1), axis=0) * 100\nyearly.reset_index().to_csv(PY_TAB / \"yearly_sentiment.csv\", index=False)\nprint(yearly_pct)\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *b. ✋🏻🛑⛔️ Plot grouped volume bars and stacked share bars side by side*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))\nfig.suptitle(\"How Sentiment Has Shifted Year-over-Year\", fontsize=14,\n             fontweight=\"bold\", color=ESCP_PURPLE)\n\nsentiment_colors = [PALETTE[\"bullish\"], PALETTE[\"neutral\"], PALETTE[\"bearish\"]]\n\n# Left: grouped bars with the raw comment count per year and sentiment.\nyearly.plot(kind=\"bar\", ax=ax1, color=sentiment_colors, edgecolor=\"white\")\nax1.set_title(\"Comment Volume by Year\", fontweight=\"bold\")\nax1.set_xticklabels(yearly.index, rotation=0)\nax1.legend(title=\"Sentiment\")\n\n# Right: stacked bars with each sentiment's share of the year's comments.\nyearly_pct.plot(kind=\"bar\", stacked=True, ax=ax2, color=sentiment_colors, edgecolor=\"white\")\nax2.set_title(\"Sentiment Share by Year (%)\", fontweight=\"bold\")\nax2.set_xticklabels(yearly_pct.index, rotation=0)\n# loc=\"upper left\" pins the legend's top-left corner to the anchor just outside the axes;\n# without an explicit loc the corner placed at the anchor is chosen by the \"best\" heuristic.\nax2.legend(title=\"Sentiment\", bbox_to_anchor=(1.01, 1), loc=\"upper left\")\n\nplt.tight_layout()\nplt.savefig(PY_FIG / \"05_yearly_sentiment_shift.png\", dpi=150)\nplt.show()\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "## **9.** 🔤 Word frequency by sentiment"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *a. Define stopwords and top_words function*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "# Common English function words that are excluded from the frequency counts.\nSTOPWORDS = {\n    \"the\",\"a\",\"an\",\"is\",\"it\",\"in\",\"of\",\"and\",\"to\",\"for\",\"on\",\"are\",\"that\",\n    \"this\",\"with\",\"as\",\"but\",\"not\",\"be\",\"at\",\"by\",\"or\",\"from\",\"have\",\"has\",\n    \"will\",\"was\",\"were\",\"been\",\"they\",\"their\",\"we\",\"our\",\"i\",\"you\",\"he\",\"she\",\n    \"its\",\"so\",\"if\",\"than\",\"more\",\"just\",\"can\",\"about\",\"what\",\"which\",\"would\",\n    \"also\",\"there\",\"these\",\"those\",\"all\",\"some\",\"any\",\"up\",\"how\",\"very\",\"much\",\n    \"when\",\"who\",\"one\",\"my\",\"do\",\"had\",\"get\",\"out\",\"even\",\"into\",\"like\",\"no\",\n    \"after\",\"them\",\"your\",\"such\",\"because\",\"am\",\"over\",\"does\",\"make\",\"only\",\"really\"\n}\n\ndef top_words(texts, n=20):\n    \"\"\"Return the ``n`` most frequent non-stopword tokens found in ``texts``.\n\n    Parameters\n    ----------\n    texts : iterable\n        Comments to scan. Missing entries (``None`` or NaN) are skipped; any\n        other value is converted with ``str()`` and lower-cased.\n    n : int, default 20\n        Number of (word, count) pairs to return.\n\n    Returns\n    -------\n    list of (str, int)\n        Pairs in descending count order, as produced by ``Counter.most_common``.\n\n    A token is a run of three or more letters a-z delimited by word boundaries,\n    so shorter words and tokens containing digits or non-ASCII letters are not\n    counted.\n    \"\"\"\n    token_re = re.compile(r\"\\b[a-z]{3,}\\b\")\n    counts = Counter()\n    for text in texts:\n        # Skip missing comments: str(None) and str(nan) would otherwise be\n        # counted as the words \"none\" and \"nan\".\n        if text is None or (isinstance(text, float) and text != text):\n            continue\n        tokens = token_re.findall(str(text).lower())\n        counts.update(tok for tok in tokens if tok not in STOPWORDS)\n    return counts.most_common(n)\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *b. ✋🏻🛑⛔️ Plot top 20 words for each sentiment label*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "fig, axes = plt.subplots(1, 3, figsize=(18, 7))\nfig.suptitle(\"Top Words by Sentiment\", fontsize=14, fontweight=\"bold\", color=ESCP_PURPLE)\n\n# One panel per sentiment label, each showing that label's most frequent words.\nfor ax, sent in zip(axes, [\"bullish\",\"neutral\",\"bearish\"]):\n    comments = df.loc[df[\"Sentiment\"] == sent, \"Comment\"]\n    pairs = top_words(comments)\n    ax.set_title(f\"{sent.title()} Comments\", fontweight=\"bold\", color=PALETTE[sent])\n    ax.set_xlabel(\"Frequency\")\n    if not pairs:\n        # zip(*[]) cannot be unpacked into two names; show a placeholder instead\n        # of letting one empty sentiment group abort the whole figure.\n        ax.text(0.5, 0.5, \"No words to display\", ha=\"center\", va=\"center\",\n                transform=ax.transAxes)\n        continue\n    words, freqs = zip(*pairs)\n    # Reverse so the most frequent word ends up at the top of the horizontal bars.\n    ax.barh(list(words)[::-1], list(freqs)[::-1],\n            color=PALETTE[sent], alpha=0.85, edgecolor=\"white\")\n\nplt.tight_layout()\nplt.savefig(PY_FIG / \"06_word_frequency_by_sentiment.png\", dpi=150)\nplt.show()\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "## **10.** 📐 AI Bubble Risk Score"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *a. Define bubble_risk function — bearish share of (bullish+bearish) per month*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "def bubble_risk(group):\n    \"\"\"Return the bearish share among the bullish and bearish labels in ``group``.\n\n    ``group`` is compared element-wise against the label strings, so labels\n    other than \"bearish\" and \"bullish\" do not affect the result. Returns NaN\n    when the group contains neither label.\n    \"\"\"\n    n_bearish = (group == \"bearish\").sum()\n    n_bullish = (group == \"bullish\").sum()\n    n_decided = n_bearish + n_bullish\n    if n_decided > 0:\n        return n_bearish / n_decided\n    return np.nan\n\n\n# Risk per month, then a rolling mean over 3 rows (partial windows allowed).\nmonthly_risk = df.groupby(\"YearMonth\")[\"Sentiment\"].apply(bubble_risk)\nmonthly_risk.index = pd.to_datetime(monthly_risk.index)\nrolling_risk = monthly_risk.rolling(window=3, min_periods=1).mean()\n\n# Export the smoothed score rounded to three decimals, keyed by a YYYY-MM label.\nrisk_table = pd.DataFrame({\n    \"month\": rolling_risk.index.strftime(\"%Y-%m\"),\n    \"bubble_risk_score\": rolling_risk.round(3).values,\n})\nrisk_table.to_csv(PY_TAB / \"bubble_risk_score.csv\", index=False)\nprint(rolling_risk.tail())\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### *b. Plot bubble risk score over time*"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "fig, ax = plt.subplots(figsize=(13, 5))\nrisk_dates = rolling_risk.index\nthreshold = 0.5  # score at which bearish and bullish counts are equal\n\n# Shade the area between the curve and the threshold, one colour per side.\nshaded_zones = [\n    (rolling_risk > threshold, \"#e74c3c\", \"Bearish-dominant\"),\n    (rolling_risk <= threshold, \"#2ecc71\", \"Bullish-dominant\"),\n]\nfor zone_mask, zone_color, zone_label in shaded_zones:\n    ax.fill_between(risk_dates, rolling_risk, threshold,\n                    where=zone_mask, interpolate=True,\n                    color=zone_color, alpha=0.3, label=zone_label)\n\nax.plot(risk_dates, rolling_risk, color=\"#2c3e50\", lw=2,\n        label=\"Risk score (3-mo avg)\")\nax.axhline(threshold, color=\"gray\", ls=\"--\", lw=1, alpha=0.7, label=\"Neutral threshold\")\n\nax.set_ylim(0, 1)\nax.set_ylabel(\"Bubble Risk Score\\n(0 = all bullish, 1 = all bearish)\")\nax.set_title(\"AI Bubble Risk Score Over Time\", fontsize=14,\n             fontweight=\"bold\", color=ESCP_PURPLE)\n\n# One tick every fourth month, formatted like \"Jan '24\" and rotated for readability.\nax.xaxis.set_major_formatter(mdates.DateFormatter(\"%b '%y\"))\nax.xaxis.set_major_locator(mdates.MonthLocator(interval=4))\nplt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha=\"right\")\n\nax.legend(framealpha=0.8)\nplt.tight_layout()\nplt.savefig(PY_FIG / \"07_bubble_risk_score.png\", dpi=150)\nplt.show()\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "## **11.** 💾 Save Python outputs for the Hugging Face dashboard"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "This section exports **HF-ready artifacts** into the folder structure the app expects:\n- `artifacts/py/figures/` — all chart images\n- `artifacts/py/tables/` — tables and KPI JSON\n"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "from scipy.stats import chi2_contingency\n\n# Chi-square test on the Topic x Sentiment count table; the result is saved as a one-row CSV.\nchi2_stat, p, dof, _ = chi2_contingency(cross.values)\npd.DataFrame({\n    \"Test\":               [\"Chi-Square (Sentiment vs Topic)\"],\n    \"Chi2_Statistic\":     [round(chi2_stat, 3)],\n    \"p_value\":            [round(p, 4)],\n    \"Degrees_of_Freedom\": [dof],\n    \"Significant_alpha_05\": [\"Yes\" if p < 0.05 else \"No\"],\n}).to_csv(PY_TAB / \"chi_square_result.csv\", index=False)\n\n# KPIs JSON\n# Latest risk: bearish share among bullish + bearish comments dated within\n# three months of the newest comment.\nlatest_3mo  = df[df[\"Date\"] >= df[\"Date\"].max() - pd.DateOffset(months=3)]\n# max(..., 1) guards the division when the window holds no bullish or bearish comment.\nlatest_risk = (latest_3mo[\"Sentiment\"]==\"bearish\").sum() / max(\n    (latest_3mo[\"Sentiment\"]==\"bearish\").sum() +\n    (latest_3mo[\"Sentiment\"]==\"bullish\").sum(), 1)\n\nkpis = {\n    \"total_comments\":      int(len(df)),\n    \"date_range\":          f\"{df['Date'].min().strftime('%b %Y')} – {df['Date'].max().strftime('%b %Y')}\",\n    \"n_platforms\":         int(df[\"Platform\"].nunique()),\n    \"n_topics\":            int(df[\"Topic\"].nunique()),\n    \"pct_bearish\":         round((df[\"Sentiment\"]==\"bearish\").mean()*100, 1),\n    \"pct_bullish\":         round((df[\"Sentiment\"]==\"bullish\").mean()*100, 1),\n    \"pct_neutral\":         round((df[\"Sentiment\"]==\"neutral\").mean()*100, 1),\n    \"avg_sentiment_score\": round(df[\"SentScore\"].mean(), 3),\n    \"latest_bubble_risk\":  round(float(latest_risk), 3),\n    \"chi2_p_value\":        round(p, 4),\n    \"most_bearish_topic\":  str(cross_pct[\"bearish\"].idxmax()),\n    \"most_bullish_topic\":  str(cross_pct[\"bullish\"].idxmax()),\n    \"dominant_platform\":   str(df[\"Platform\"].value_counts().index[0]),\n}\n# Explicit encoding so the file does not depend on the platform's default.\nwith open(PY_TAB / \"kpis.json\", \"w\", encoding=\"utf-8\") as f:\n    json.dump(kpis, f, indent=2)\n\nprint(\"✅ All Python artifacts saved\")\nprint(f\"   Figures : {len(list(PY_FIG.glob('*.png')))}\")\nprint(f\"   Tables  : {len(list(PY_TAB.glob('*.csv')) + list(PY_TAB.glob('*.json')))}\")\nfor k, v in kpis.items():\n    print(f\"   {k}: {v}\")\n"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "✅ **Output for R notebook**: `ai_bubble_clean.csv` in the working directory (produced by `datacreation.ipynb`)."
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "## **12.** ⬇️ Download all Python artifacts"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": "# ── Download all Python artifacts as a ZIP ────────────────────────────────────\nimport shutil, zipfile, os\nfrom pathlib import Path\n\nzip_path = \"python_analysis_artifacts.zip\"\nartifact_root = Path(\"artifacts/py\")\n\n# Store every file below artifacts/py under its path relative to that folder.\nwith zipfile.ZipFile(zip_path, \"w\", zipfile.ZIP_DEFLATED) as zf:\n    for item in artifact_root.rglob(\"*\"):\n        if not item.is_file():\n            continue  # directories are not stored as separate entries\n        zf.write(item, item.relative_to(artifact_root))\n\n# Re-open the archive read-only and list what was stored.\nprint(\"📦 ZIP contents:\")\nwith zipfile.ZipFile(zip_path, \"r\") as zf:\n    for name in sorted(zf.namelist()):\n        print(f\"   {name}\")\n\n# Colab: triggers a browser download\n# HuggingFace / local: the ZIP is saved next to the notebook\ntry:\n    from google.colab import files\n    files.download(zip_path)\n    print(\"\\n✅ Download started!\")\nexcept ImportError:\n    print(f\"\\n✅ Saved as: {Path(zip_path).resolve()}\")\n"
  }
 ]
}