{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [ "# Data collection, creation, storage, and processing" ], "metadata": { "id": "rTMYVnFH-Abb" } }, { "cell_type": "markdown", "source": [ "## 1. Install required packages" ], "metadata": { "id": "BaXcfRTV8yJO" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "78FfoEGA8tat", "collapsed": true }, "outputs": [], "source": [ "!pip install -q pandas requests beautifulsoup4 lxml yfinance" ] }, { "cell_type": "markdown", "source": [ "## 2. Web-scrape news headlines and stock price data" ], "metadata": { "id": "15bQFwPK-MmX" } }, { "cell_type": "markdown", "source": [ "### a. Initial setup" ], "metadata": { "id": "s09UWUWK-fic" } }, { "cell_type": "code", "source": [ "import time\n", "import requests\n", "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "import yfinance as yf\n", "from datetime import datetime\n", "\n", "# -----------------------------\n", "# Configuration\n", "# -----------------------------\n", "tickers = [\n", " \"AAPL\", \"MSFT\", \"NVDA\", \"AMZN\", \"GOOGL\",\n", " \"META\", \"TSLA\", \"JPM\", \"XOM\", \"NFLX\"\n", "]\n", "\n", "LOOKBACK_DAYS = 7\n", "\n", "today = pd.Timestamp.today().normalize()\n", "start_ts = today - pd.Timedelta(days=LOOKBACK_DAYS)\n", "end_ts_exclusive = today + pd.Timedelta(days=1)\n", "\n", "start_date_str = start_ts.strftime(\"%Y-%m-%d\")\n", "end_date_str = end_ts_exclusive.strftime(\"%Y-%m-%d\")\n", "\n", "base_url = \"https://finviz.com/quote.ashx?t={}\"\n", "headers = {\n", " \"User-Agent\": (\n", " \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \"\n", " \"AppleWebKit/537.36 (KHTML, like Gecko) \"\n", " \"Chrome/124.0.0.0 Safari/537.36\"\n", " )\n", "}\n", "\n", "print(\"Rolling window start:\", start_ts.date())\n", "print(\"Rolling window end:\", today.date())\n", "print(\"Tickers:\", tickers)" ], "metadata": { "id": "SoER8bjH-Lv5", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "cc96b450-fb9d-4e65-a60f-a8ddd9d03491" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Rolling window start: 2026-04-03\n", "Rolling window end: 2026-04-10\n", "Tickers: ['AAPL', 'MSFT', 'NVDA', 'AMZN', 'GOOGL', 'META', 'TSLA', 'JPM', 'XOM', 'NFLX']\n" ] } ] }, { "cell_type": "markdown", "source": [ "### b. Scrape recent news headlines" ], "metadata": { "id": "mLkOtx3x_7jj" } }, { "cell_type": "code", "source": [ "news_data = []\n", "\n", "for ticker in tickers:\n", " url = base_url.format(ticker)\n", " print(f\"Scraping headlines for {ticker}...\")\n", "\n", " try:\n", " response = requests.get(url, headers=headers, timeout=20)\n", " except Exception as e:\n", " print(f\"Request failed for {ticker}: {e}\")\n", " continue\n", "\n", " if response.status_code != 200:\n", " print(f\"Failed to fetch {ticker}: status {response.status_code}\")\n", " continue\n", "\n", " soup = BeautifulSoup(response.content, \"lxml\")\n", " news_table = soup.find(\"table\", id=\"news-table\")\n", "\n", " if news_table is None:\n", " print(f\"No news table found for {ticker}\")\n", " continue\n", "\n", " current_date = None\n", "\n", " for row in news_table.find_all(\"tr\"):\n", " headline_tag = row.find(\"a\")\n", " date_tag = row.find(\"td\")\n", "\n", " if headline_tag is None or date_tag is None:\n", " continue\n", "\n", " headline = headline_tag.get_text(strip=True)\n", " date_text = date_tag.get_text(\" \", strip=True)\n", " parts = date_text.split()\n", "\n", " raw_date = None\n", " news_time_str = None\n", "\n", " if len(parts) == 2:\n", " raw_date = parts[0]\n", " news_time_str = parts[1]\n", "\n", " if raw_date == \"Today\":\n", " current_date = pd.Timestamp.today().normalize()\n", " elif raw_date == \"Yesterday\":\n", " current_date = pd.Timestamp.today().normalize() - pd.Timedelta(days=1)\n", " else:\n", " current_date = pd.to_datetime(raw_date, format=\"%b-%d-%y\", errors=\"coerce\")\n", "\n", " elif len(parts) == 1:\n", " news_time_str = parts[0]\n", " else:\n", " continue\n", "\n", " if pd.isna(current_date) or current_date is None:\n", " continue\n", "\n", " try:\n", " news_time = datetime.strptime(news_time_str, \"%I:%M%p\").time()\n", " news_datetime = pd.Timestamp.combine(current_date.date(), news_time)\n", " except Exception:\n", " news_datetime = pd.NaT\n", "\n", " news_data.append({\n", " \"ticker\": ticker,\n", " \"date\": pd.Timestamp(current_date).normalize(),\n", " \"datetime\": news_datetime,\n", " \"time_text\": news_time_str,\n", " \"headline\": headline\n", " })\n", "\n", " time.sleep(1.5)" ], "metadata": { "id": "miT2o6JV__WH", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "9641f041-911c-4c40-e501-2c44f653c30f" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Scraping headlines for AAPL...\n", "Scraping headlines for MSFT...\n", "Scraping headlines for NVDA...\n", "Scraping headlines for AMZN...\n", "Scraping headlines for GOOGL...\n", "Scraping headlines for META...\n", "Scraping headlines for TSLA...\n", "Scraping headlines for JPM...\n", "Scraping headlines for XOM...\n", "Scraping headlines for NFLX...\n" ] } ] }, { "cell_type": "markdown", "source": [ "### c. Create and clean the news dataframe df_news" ], "metadata": { "id": "wzSnvG8CGP8X" } }, { "cell_type": "code", "source": [ "df_news = pd.DataFrame(news_data)\n", "\n", "if not df_news.empty:\n", " df_news[\"ticker\"] = df_news[\"ticker\"].astype(str).str.upper().str.strip()\n", " df_news[\"date\"] = pd.to_datetime(df_news[\"date\"], errors=\"coerce\").dt.normalize()\n", " df_news[\"datetime\"] = pd.to_datetime(df_news[\"datetime\"], errors=\"coerce\")\n", " df_news[\"headline\"] = df_news[\"headline\"].astype(str).str.strip()\n", "\n", " # Keep only rolling 7-day window\n", " df_news = df_news[\n", " (df_news[\"date\"] >= start_ts) &\n", " (df_news[\"date\"] < end_ts_exclusive)\n", " ].copy()\n", "\n", " # Remove duplicates\n", " df_news = df_news.drop_duplicates(subset=[\"ticker\", \"date\", \"headline\"]).copy()\n", "\n", " # Sort for readability\n", " df_news = df_news.sort_values([\"ticker\", \"date\", \"datetime\"]).reset_index(drop=True)" ], "metadata": { "collapsed": true, "id": "zusyHCF6GT55" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### d. Save df_news to csv" ], "metadata": { "id": "rAt36sXzjJyW" } }, { "cell_type": "code", "source": [ "df_news.to_csv(\"scraped_news_data.csv\", index=False)" ], "metadata": { "id": "QZ46O4CDjMMc" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### e. Download stock closing prices data" ], "metadata": { "id": "hStzeY0jAEyg" } }, { "cell_type": "code", "source": [ "raw_prices = yf.download(\n", " tickers=tickers,\n", " start=start_date_str,\n", " end=end_date_str,\n", " auto_adjust=False,\n", " progress=False,\n", " group_by=\"ticker\",\n", " threads=True\n", ")" ], "metadata": { "id": "SvW3aqPsAGsu" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### f. Create and clean the prices dataframe df_prices" ], "metadata": { "id": "93BQiQNxFsP1" } }, { "cell_type": "code", "source": [ "price_frames = []\n", "\n", "for ticker in tickers:\n", " try:\n", " temp = raw_prices[ticker].copy().reset_index()\n", " except Exception:\n", " print(f\"No price data found for {ticker}\")\n", " continue\n", "\n", " # Standardize column names\n", " temp.columns = [str(col).strip().lower().replace(\" \", \"_\") for col in temp.columns]\n", "\n", " # Keep only date + close\n", " temp = temp[[\"date\", \"close\"]].copy()\n", " temp[\"ticker\"] = ticker\n", "\n", " price_frames.append(temp)\n", "\n", "# Combine all tickers\n", "df_prices = pd.concat(price_frames, ignore_index=True) if price_frames else pd.DataFrame()\n", "\n", "# -----------------------------\n", "# Clean dataframe\n", "# -----------------------------\n", "if not df_prices.empty:\n", " df_prices[\"ticker\"] = df_prices[\"ticker\"].astype(str).str.upper().str.strip()\n", " df_prices[\"date\"] = pd.to_datetime(df_prices[\"date\"], errors=\"coerce\").dt.normalize()\n", "\n", " df_prices = df_prices.dropna(subset=[\"ticker\", \"date\", \"close\"]).copy()\n", " df_prices = df_prices.drop_duplicates(subset=[\"ticker\", \"date\"]).copy()\n", " df_prices = df_prices.sort_values([\"ticker\", \"date\"]).reset_index(drop=True)" ], "metadata": { "id": "3T-rc_lBKNz8" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### g. Aggregate headlines to daily stock-levels" ], "metadata": { "id": "NPIp2aG5MDSL" } }, { "cell_type": "code", "source": [ "if df_news.empty:\n", " news_daily = pd.DataFrame(columns=[\n", " \"ticker\", \"date\", \"headline_count\", \"headlines_text\"\n", " ])\n", "else:\n", " news_daily = (\n", " df_news.groupby([\"ticker\", \"date\"], as_index=False)\n", " .agg(\n", " headline_count=(\"headline\", \"count\"),\n", " headlines_text=(\"headline\", lambda x: \" || \".join(x.astype(str)))\n", " )\n", " )\n" ], "metadata": { "id": "OtT_8XHOMIE_" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### h. Save df_prices to csv" ], "metadata": { "id": "KBAwupZ0i5n8" } }, { "cell_type": "code", "source": [ "df_prices.to_csv(\"stock_price_data.csv\", index=False)" ], "metadata": { "id": "i2xLQEsFi81T" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### i. Create the final real dataset df_final" ], "metadata": { "id": "h6E4hjfWANsW" } }, { "cell_type": "code", "source": [ "df_final = pd.merge(\n", " df_prices,\n", " news_daily,\n", " on=[\"ticker\", \"date\"],\n", " how=\"left\"\n", ")\n", "\n", "df_final[\"headline_count\"] = df_final[\"headline_count\"].fillna(0).astype(int)\n", "df_final[\"headlines_text\"] = df_final[\"headlines_text\"].fillna(\"\")\n", "\n", "df_final = df_final.sort_values([\"ticker\", \"date\"]).reset_index(drop=True)\n", "\n", "# Next-day return for later analysis\n", "df_final[\"next_day_return\"] = (\n", " df_final.groupby(\"ticker\")[\"close\"].pct_change().shift(-1)\n", ")" ], "metadata": { "id": "8icGxSI7AVGi" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### j. Save the final merged dataset as CSV" ], "metadata": { "id": "AaaQbzUOF3Yv" } }, { "cell_type": "code", "source": [ "output_file = \"final_real_dataset.csv\"\n", "df_final.to_csv(output_file, index=False)\n", "print(f\"Saved final dataset: {output_file}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NuU1fTNJF9J5", "outputId": "159daf6d-6d07-463b-fdfa-ad62a03af4e1" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Saved final dataset: final_real_dataset.csv\n" ] } ] }, { "cell_type": "markdown", "source": [ "### k. View the first few lines" ], "metadata": { "id": "XFo4hd20AaZE" } }, { "cell_type": "code", "source": [ "print(\"Final merged dataset preview:\")\n", "display(df_final.head(30))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "sLW7pvU8PJRK", "outputId": "d3bff42f-dc5a-4228-b863-17fd94b77ff4", "collapsed": true }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Final merged dataset preview:\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " date close ticker headline_count \\\n", "0 2026-04-06 258.859985 AAPL 13 \n", "1 2026-04-07 253.500000 AAPL 29 \n", "2 2026-04-08 258.899994 AAPL 26 \n", "3 2026-04-09 260.489990 AAPL 7 \n", "4 2026-04-10 260.480011 AAPL 20 \n", "5 2026-04-06 212.789993 AMZN 0 \n", "6 2026-04-07 213.770004 AMZN 10 \n", "7 2026-04-08 221.250000 AMZN 19 \n", "8 2026-04-09 233.649994 AMZN 43 \n", "9 2026-04-10 238.380005 AMZN 28 \n", "10 2026-04-06 299.989990 GOOGL 0 \n", "11 2026-04-07 305.459991 GOOGL 28 \n", "12 2026-04-08 317.320007 GOOGL 20 \n", "13 2026-04-09 318.489990 GOOGL 28 \n", "14 2026-04-10 317.239990 GOOGL 24 \n", "15 2026-04-06 295.450012 JPM 33 \n", "16 2026-04-07 297.399994 JPM 9 \n", "17 2026-04-08 307.970001 JPM 7 \n", "18 2026-04-09 310.329987 JPM 3 \n", "19 2026-04-10 309.869995 JPM 7 \n", "20 2026-04-06 573.020020 META 0 \n", "21 2026-04-07 575.049988 META 4 \n", "22 2026-04-08 612.419983 META 26 \n", "23 2026-04-09 628.390015 META 48 \n", "24 2026-04-10 629.859985 META 22 \n", "25 2026-04-06 372.880005 MSFT 15 \n", "26 2026-04-07 372.290009 MSFT 24 \n", "27 2026-04-08 374.329987 MSFT 19 \n", "28 2026-04-09 373.070007 MSFT 21 \n", "29 2026-04-10 370.869995 MSFT 11 \n", "\n", " headlines_text next_day_return \n", "0 Jack Dorseys Bitchat Removed From Apple Store ... -0.020706 \n", "1 Apple's iPhone sales continue to surge as iPho... 0.021302 \n", "2 Warren Buffett Is Retired. His Latest Advice C... 0.006141 \n", "3 Is Qualcomm Stock A Cheap Cash Machine With Bi... -0.000038 \n", "4 TSMC's Q1 revenue jumps 35% y/y, beats market ... NaN \n", "5 0.004606 \n", "6 USPS to retain 80% of its Amazon deliveries un... 0.034991 \n", "7 Uber deploys AWS custom chips to scale AI and ... 0.056045 \n", "8 Amazon cloud unit's AI revenue run rate exceed... 0.020244 \n", "9 Amazon CEO Andy Jassy takes a shot at Nvidia, ... NaN \n", "10 0.018234 \n", "11 Anthropic Reports $30 Billion Run Rate, Expand... 0.038827 \n", "12 What Justifies Broadcom's 50x Multiple? || How... 0.003687 \n", "13 Four Risks To Watch For Palantir Stock In The ... -0.003925 \n", "14 Big Tech puts financial heft behind next-gen n... NaN \n", "15 Have Other Epstein Crimes Gone Unpunished? || ... 0.006600 \n", "16 Jamie Dimon says JPMorgan Chase could enter pr... 0.035541 \n", "17 Exxon Sees 6% of Its Worldwide Output Shut on ... 0.007663 \n", "18 Bailey Warns Iran War Is Compounding Private C... -0.001482 \n", "19 Anthropic Has Banks, Regulators Rattled Over A... NaN \n", "20 0.003543 \n", "21 The dividend yield on the S&P 500 is now at 50... 0.064986 \n", "22 What Justifies Broadcom's 50x Multiple? || How... 0.026077 \n", "23 Meta, Oracle and Qualcomm share details on lay... 0.002339 \n", "24 Chinas Alibaba shifts towards revenue over ope... NaN \n", "25 Have Other Epstein Crimes Gone Unpunished? || ... -0.001582 \n", "26 Worldcoin Prices Dips As Sam Altmans Trust Cri... 0.005480 \n", "27 Why some investors are shifting their money fr... -0.003366 \n", "28 Microsoft & Oracle have a 'tremendous opportun... -0.005897 \n", "29 Why Microsoft and Google could be the best Mag... NaN " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dateclosetickerheadline_countheadlines_textnext_day_return
02026-04-06258.859985AAPL13Jack Dorseys Bitchat Removed From Apple Store ...-0.020706
12026-04-07253.500000AAPL29Apple's iPhone sales continue to surge as iPho...0.021302
22026-04-08258.899994AAPL26Warren Buffett Is Retired. His Latest Advice C...0.006141
32026-04-09260.489990AAPL7Is Qualcomm Stock A Cheap Cash Machine With Bi...-0.000038
42026-04-10260.480011AAPL20TSMC's Q1 revenue jumps 35% y/y, beats market ...NaN
52026-04-06212.789993AMZN00.004606
62026-04-07213.770004AMZN10USPS to retain 80% of its Amazon deliveries un...0.034991
72026-04-08221.250000AMZN19Uber deploys AWS custom chips to scale AI and ...0.056045
82026-04-09233.649994AMZN43Amazon cloud unit's AI revenue run rate exceed...0.020244
92026-04-10238.380005AMZN28Amazon CEO Andy Jassy takes a shot at Nvidia, ...NaN
102026-04-06299.989990GOOGL00.018234
112026-04-07305.459991GOOGL28Anthropic Reports $30 Billion Run Rate, Expand...0.038827
122026-04-08317.320007GOOGL20What Justifies Broadcom's 50x Multiple? || How...0.003687
132026-04-09318.489990GOOGL28Four Risks To Watch For Palantir Stock In The ...-0.003925
142026-04-10317.239990GOOGL24Big Tech puts financial heft behind next-gen n...NaN
152026-04-06295.450012JPM33Have Other Epstein Crimes Gone Unpunished? || ...0.006600
162026-04-07297.399994JPM9Jamie Dimon says JPMorgan Chase could enter pr...0.035541
172026-04-08307.970001JPM7Exxon Sees 6% of Its Worldwide Output Shut on ...0.007663
182026-04-09310.329987JPM3Bailey Warns Iran War Is Compounding Private C...-0.001482
192026-04-10309.869995JPM7Anthropic Has Banks, Regulators Rattled Over A...NaN
202026-04-06573.020020META00.003543
212026-04-07575.049988META4The dividend yield on the S&P 500 is now at 50...0.064986
222026-04-08612.419983META26What Justifies Broadcom's 50x Multiple? || How...0.026077
232026-04-09628.390015META48Meta, Oracle and Qualcomm share details on lay...0.002339
242026-04-10629.859985META22Chinas Alibaba shifts towards revenue over ope...NaN
252026-04-06372.880005MSFT15Have Other Epstein Crimes Gone Unpunished? || ...-0.001582
262026-04-07372.290009MSFT24Worldcoin Prices Dips As Sam Altmans Trust Cri...0.005480
272026-04-08374.329987MSFT19Why some investors are shifting their money fr...-0.003366
282026-04-09373.070007MSFT21Microsoft & Oracle have a 'tremendous opportun...-0.005897
292026-04-10370.869995MSFT11Why Microsoft and Google could be the best Mag...NaN
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"display(df_final\",\n \"rows\": 30,\n \"fields\": [\n {\n \"column\": \"date\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2026-04-06 00:00:00\",\n \"max\": \"2026-04-10 00:00:00\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"2026-04-07 00:00:00\",\n \"2026-04-10 00:00:00\",\n \"2026-04-08 00:00:00\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"close\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 126.94489238438246,\n \"min\": 212.7899932861328,\n \"max\": 629.8599853515625,\n \"num_unique_values\": 30,\n \"samples\": [\n 374.3299865722656,\n 295.45001220703125,\n 628.3900146484375\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ticker\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"AAPL\",\n \"AMZN\",\n \"MSFT\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"headline_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12,\n \"min\": 0,\n \"max\": 48,\n \"num_unique_values\": 20,\n \"samples\": [\n 13,\n 15,\n 48\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"headlines_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 28,\n \"samples\": [\n \"Amazon CEO Andy Jassy takes a shot at Nvidia, says its own chips are a better bang for your buck || Chips are still where the AI trade's rubber meets the road || Jeff Bezos could fund 'insulin in America' plus Texas school lunches with a wealth tax and keep $215B, says Warren || Big Tech puts financial heft behind next-gen nuclear power as AI demand surges || Markets Are Buying the Iran Cease-Fire. This Can Turn Optimism Into a Lasting Rally. || Anthropic Has Banks, Regulators Rattled Over AI Cyber Risks. Some Big Names Could Benefit. || Bessent, Powell Rattled by Anthropic Cybersecurity Risks. Why CrowdStrike Could Benefit. || Boom Chaga owner buys Kombucha Town || Amazon adds fuel surcharge for sellers. But will you have to pay more? || Is Microsoft Stock A Smarter Buy Than Google After Its Massive 250% Rally? || Meta stock is still 'dirt cheap' despite Muse Spark rally || CoreWeave Shares Jump on Multi-Year AI Deal With Anthropic || Dreem Health joins Amazons Health Benefits Connector || Amazon to scale up drone delivery in 2025, CEO says || Nasdaq on track to exit correction territory as torrid stock-market rebound picks up steam || Stock Market Rally Revs Higher On Iran Ceasefire; Amazon, AI Names In Focus: Weekly Review || This might be the best time for you to load up on Big Tech stocks || What's Next for Amazon Stock? Charts Say Buy. || Amazon DSPs in NYC fight for survival against no subcontractor proposal || Investing.coms stocks of the week || Bessent, Fed's Powell met with bank CEOs over potent new Anthropic AI || How OpenAI's reported ad plans could change the AI business model || CoreWeave stock surges on multiyear AI deal with Anthropic || A Technician's Take on the Mag 7, Dow Transports, and More || Wall Street Banks Try Out Anthropics Mythos as US Urges Testing || Why Alphabet is a 'really misunderstood story' for investors || 5 To Watch: The Masters Edition & The Business of the Tournament || Wall Street Banks Try Out Anthropics Mythos as US Urges\",\n \"Why some investors are shifting their money from Big Tech to ETFs || Why Iran could end the bull market || How Can Meta Stock Fall 30%? || Retail investors are buying the dip in Tesla stock || Why Microsoft is a 'screaming buy.' || US-Iran ceasefire sends investors back to their most beloved stocks || Anthropic Rolls Out Mythos AI to Select Cyber Partners || Microsoft Seen as Mispriced Despite AI Strength || Nasdaq tech leads strong Wall Street rebound after Iran ceasefire deal || Anthropic launches Mythos Preview and industry cybersecurity initiative Project Glasswing || Apple stock is the new retail investor darling || Anthropic Launches Project Glasswing With Amazon, Apple, Microsoft To Test Mythos AI || Anthropic Limits Mythos Model Release in Bid to Stave Off Hacks || Anthropic Hires Microsoft AI Leader As It Commits $50 Billion To Infrastructure || BofA hikes 2026 chips forecast to $1.3 trillion, names Nvidia, Broadcom, Marvell, AMD as top drivers || How Wall Street from Iran War Angst to Watching for a New S&P 500 Record || How Markets Went from Iran War Angst to Watching for a New S&P 500 Record || Veteran Analyst Says Microsoft, Salesforce to Lead Tech Rally After Ceasefire || Microsoft announces quarterly earnings release date\",\n \"Amazon cloud unit's AI revenue run rate exceeds $15 billion, CEO says || Microsoft Stocks Path To $450 || Amazon Pharmacy to Offer Eli Lilly and Company's New GLP-1 Pill Foundayo via Same-Day Delivery || 3 charts reveal Big Tech's biggest problem this earnings season || Factbox-From OpenAI to Nvidia, firms channel billions into AI infrastructure as demand booms || Amazon's CEO doubles down on AI spending: 'We are not going to be conservative' || Amazon CEO Releases His Shareholder Letter. He Has One Message on AI. || Amazons Jassy says AI revenue run rate is over $15 Bln, hints at future chip sale || AI is supercharging the cybersecurity fight || 'Superyacht' Sales Are Rising Again. Who's Buying and How Much They Cost. || Amazon CEO takes aim at Nvidia, Intel, Starlink, more in annual shareholder letter || Amazon CEO Jassy says company could sell AI chips, raising stakes for Nvidia, AMD || Amazon CEO Says AI Chip Business Is Booming. It Could Be as Big as Broadcom. || Amazon CEO Andy Jassy defends AI spending spree in shareholder letter || Amazon vs Nvidia? CEO floats future chip business to shareholders || Amazon Is Considering Selling Its AI Chips to Other Companies || Amazon CEO Andy Jassy defends AI investment plan in annual letter to shareholders || Big Tech ETF rises as Amazon posts biggest gains in its portfolio || Morgan Stanley survey shows modest improvement in 2026 IT budget expectations || Amazon Stock Rises -- CEO Says AI Business Has Hit $15 Billion Run Rate || Amazon to sell new weight-loss pill Foundayo via kiosks, home delivery || Does Amazon really have more growth avenues than McDonald's? || Amazon Pharmacy adds Eli Lilly weight loss pill Foundayo same-day delivery || Amazon Pharmacy adds same-day delivery for Eli Lilly's Foundayo weight-loss pill || Amazon's Chip Business Tops $20 Billion Run Rate || Amazon Push Into GLP-1 Access Expands Healthcare Reach || US Import Volumes Hold Steady Despite Iran War, but Cost Pressures Concern Retailers || Eli Lilly's Foundayo Weight-Loss Pill Hits the Shelves. It's a Turning Point for the Stock. || Amazon may sell Trainium AI chips to third parties in shot at Nvidia || Amazon CEO Says Chip Business 'On Fire' As AWS Steps Up Challenge To Nvidia || Amazon stock is in focus as DOJ launches antitrust NFL probe || The AI Trade Is Reawakening. Why the Timing Couldn't Be Better. || Amazon Stock Set to Post Biggest Gain in More Than 5 Months. Why CEO Letter Has the Market Excited. || Amazon CEO's AI chip vision could end up challenging Nvidia || Dow Jones Futures: Stocks Extend Rally On Iran War News, Amazon; CPI Inflation, Nvidia Chipmaker Due || Investors Bid Up Amazon Stock After Chief's Annual Letter || Market Talk: The bull market will not be 'derailed' by war || Amazon's CEO Says Its AI Chips Business Is 'On Fire' || Amazons AI-Powered Tool Matches Dogs and Cats with Potential Adopters || Amazon Commits $25 Billion To Data Center Expansion || Amazon Pushes Deeper Into Healthcare With Lilly Drug || Apple to shutter its first unionized US store in Maryland || Amazon signals chip export ambitions as in-house silicon business tops US$20 billion run rate\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"next_day_return\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.02054147248470628,\n \"min\": -0.020706117804507373,\n \"max\": 0.06498564630983283,\n \"num_unique_values\": 24,\n \"samples\": [\n 0.01823394579408988,\n 0.0035425782564793007,\n -0.020706117804507373\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "## 3. Create a meaningful connection between real & synthetic datasets" ], "metadata": { "id": "OBJAqsEY0KSI" } }, { "cell_type": "markdown", "source": [ "### a. Initial setup" ], "metadata": { "id": "Nlep0DuK0DYo" } }, { "cell_type": "code", "source": [ "import numpy as np\n", "import random\n", "\n", "# Reproducibility\n", "np.random.seed(42)\n", "random.seed(42)\n", "\n", "# Work on a copy\n", "df_real = df_final.copy()" ], "metadata": { "id": "30rkSD3Tz_3e" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### b. Generate market attention scores based on headline_count (with some randomness)" ], "metadata": { "id": "OlLdFMkw0cW4" } }, { "cell_type": "code", "source": [ "# Find the maximum headline count in the real dataset\n", "max_headlines = df_real[\"headline_count\"].max()\n", "\n", "print(\"Maximum headline count in dataset:\", max_headlines)\n", "\n", "\n", "def generate_market_attention_score(headline_count, max_headlines):\n", " \"\"\"\n", " Create a synthetic market attention score from the real number of headlines.\n", " The score is scaled as a percentage of the maximum observed headline count,\n", " with a small amount of randomness added to make the feature more realistic.\n", " \"\"\"\n", " if pd.isna(headline_count) or max_headlines == 0:\n", " return 0\n", "\n", " # Scale headline count to a 0-100 range\n", " base_score = (headline_count / max_headlines) * 100\n", "\n", " # Add small random noise\n", " noise = np.random.normal(loc=0, scale=5)\n", "\n", " score = base_score + noise\n", "\n", " # Keep score in a clean 0-100 range\n", " score = max(0, min(100, score))\n", "\n", " return round(score, 2)\n", "\n", "\n", "# Create the market_attention_score column\n", "df_real[\"market_attention_score\"] = df_real[\"headline_count\"].apply(\n", " lambda x: generate_market_attention_score(x, max_headlines)\n", ")\n", "\n", "# Calculate quartiles from the generated score\n", "q1 = df_real[\"market_attention_score\"].quantile(0.25)\n", "q2 = df_real[\"market_attention_score\"].quantile(0.50)\n", "q3 = df_real[\"market_attention_score\"].quantile(0.75)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-bKcfMgpSexp", "outputId": "2f20b02e-e83d-47f9-cc5e-68a93de6e694" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Maximum headline count in dataset: 48\n" ] } ] }, { "cell_type": "markdown", "source": [ "### c. Decide on the sentiment_label based on the market attention score" ], "metadata": { "id": "7v_17aMgw2Rr" } }, { "cell_type": "code", "source": [ "def get_sentiment_label(attention_score, q1, q3):\n", " \"\"\"\n", " Convert the market attention score into a synthetic sentiment label\n", " using quartile-based thresholds.\n", " \"\"\"\n", " if pd.isna(attention_score):\n", " return \"neutral\"\n", " elif attention_score < q1:\n", " return \"negative\"\n", " elif attention_score > q3:\n", " return \"positive\"\n", " else:\n", " return \"neutral\"\n", "\n", "\n", "# Create the sentiment_label column using quartiles\n", "df_real[\"sentiment_label\"] = df_real[\"market_attention_score\"].apply(\n", " lambda x: get_sentiment_label(x, q1, q3)\n", ")\n", "\n", "# View result\n", "df_real[[\"ticker\", \"date\", \"headline_count\", \"market_attention_score\", \"sentiment_label\"]].head(10)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 556 }, "id": "pTWWb1dMS7XJ", "outputId": "731c8fac-b7c1-46d5-d5f9-a1893d92bcab" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ticker date headline_count market_attention_score sentiment_label\n", "0 AAPL 2026-04-06 13 29.57 neutral\n", "1 AAPL 2026-04-07 29 59.73 positive\n", "2 AAPL 2026-04-08 26 57.41 positive\n", "3 AAPL 2026-04-09 7 22.20 neutral\n", "4 AAPL 2026-04-10 20 40.50 neutral\n", "5 AMZN 2026-04-06 0 0.00 negative\n", "6 AMZN 2026-04-07 10 28.73 neutral\n", "7 AMZN 2026-04-08 19 43.42 neutral\n", "8 AMZN 2026-04-09 43 87.24 positive\n", "9 AMZN 2026-04-10 28 61.05 positive" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tickerdateheadline_countmarket_attention_scoresentiment_label
0AAPL2026-04-061329.57neutral
1AAPL2026-04-072959.73positive
2AAPL2026-04-082657.41positive
3AAPL2026-04-09722.20neutral
4AAPL2026-04-102040.50neutral
5AMZN2026-04-0600.00negative
6AMZN2026-04-071028.73neutral
7AMZN2026-04-081943.42neutral
8AMZN2026-04-094387.24positive
9AMZN2026-04-102861.05positive
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"df_real[[\\\"ticker\\\", \\\"date\\\", \\\"headline_count\\\", \\\"market_attention_score\\\", \\\"sentiment_label\\\"]]\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"ticker\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"AMZN\",\n \"AAPL\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"date\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2026-04-06 00:00:00\",\n \"max\": \"2026-04-10 00:00:00\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"2026-04-07 00:00:00\",\n \"2026-04-10 00:00:00\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"headline_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12,\n \"min\": 0,\n \"max\": 43,\n \"num_unique_values\": 10,\n \"samples\": [\n 43,\n 29\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"market_attention_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 24.591288091517285,\n \"min\": 0.0,\n \"max\": 87.24,\n \"num_unique_values\": 10,\n \"samples\": [\n 87.24,\n 59.73\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "markdown", "source": [ "### d. Savings the enriched real dataset & view first few lines" ], "metadata": { "id": "d4Mf15Q-Jxd5" } }, { "cell_type": "code", "source": [ "df_real.to_csv(\"real_dataset_with_market_attention_and_sentiment.csv\", index=False)\n", "print(\"Saved: real_dataset_with_market_attention_and_sentiment.csv\")" ], "metadata": { "id": "vJGpI6itJyKf", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "a4e25cd2-d2e9-4be2-eaf6-ab1bf3e9d030" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Saved: real_dataset_with_market_attention_and_sentiment.csv\n" ] } ] }, { "cell_type": "markdown", "source": [ "## 4. Generate synthetic stock movement data" ], "metadata": { "id": "rAs6ET4KdBbK" } }, { "cell_type": "markdown", "source": [ "### a. Create a generate_price_path function that generates stock-price patterns based on sentiment_label (with some randomness)" ], "metadata": { "id": "jMe9BJyn0dOQ" } }, { "cell_type": "code", "source": [ "def generate_price_path(start_price, sentiment_label, num_days=10):\n", " \"\"\"\n", " Generate a synthetic stock-price path influenced by sentiment.\n", "\n", " Positive sentiment -> upward trend\n", " Neutral sentiment -> mostly flat / mild movement\n", " Negative sentiment -> downward trend\n", "\n", " Some randomness is included to keep the paths realistic.\n", " \"\"\"\n", " prices = []\n", " current_price = float(start_price)\n", "\n", " # Set drift and volatility by sentiment label\n", " if sentiment_label == \"positive\":\n", " drift = 0.006\n", " volatility = 0.012\n", " elif sentiment_label == \"neutral\":\n", " drift = 0.0005\n", " volatility = 0.010\n", " else: # negative\n", " drift = -0.005\n", " volatility = 0.013\n", "\n", " for day in range(1, num_days + 1):\n", " # Random day-to-day variation\n", " random_shock = np.random.normal(loc=0, scale=volatility)\n", "\n", " # Slight trend reinforcement over time\n", " momentum = drift * (day / num_days)\n", "\n", " # Final daily return\n", " daily_return = drift + momentum + random_shock\n", "\n", " # Optional occasional larger shock for realism\n", " if np.random.rand() < 0.10:\n", " daily_return += np.random.normal(0, 0.02)\n", "\n", " # Update synthetic price\n", " current_price = current_price * (1 + daily_return)\n", "\n", " prices.append({\n", " \"day_offset\": day,\n", " \"synthetic_daily_return\": round(daily_return, 5),\n", " \"synthetic_close\": round(current_price, 2)\n", " })\n", "\n", " return prices\n", "\n", "\n", "# Run the function as part of building synthetic_stock_data\n", "synthetic_stock_data = []\n", "\n", "for _, row in df_real.iterrows():\n", " base_price = row[\"close\"]\n", " sentiment = row[\"sentiment_label\"]\n", "\n", " if pd.isna(base_price):\n", " continue\n", "\n", " synthetic_path = generate_price_path(\n", " start_price=base_price,\n", " sentiment_label=sentiment,\n", " num_days=10\n", " )\n", "\n", " for point in synthetic_path:\n", " synthetic_stock_data.append({\n", " \"ticker\": row[\"ticker\"],\n", " \"source_date\": row[\"date\"],\n", " \"sentiment_label\": sentiment,\n", " \"market_attention_score\": row[\"market_attention_score\"],\n", " \"day_offset\": point[\"day_offset\"],\n", " \"synthetic_daily_return\": point[\"synthetic_daily_return\"],\n", " \"synthetic_close\": point[\"synthetic_close\"]\n", " })\n", "\n", "# Create dataframe\n", "df_synthetic_stock = pd.DataFrame(synthetic_stock_data)\n" ], "metadata": { "id": "YsDt8ijgdGMY" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### b. Create a df_synthetic_stock dataframe from synthetic_stock_data" ], "metadata": { "id": "BYs58qYMeTr3" } }, { "cell_type": "code", "source": [ "df_synthetic_stock = pd.DataFrame(synthetic_stock_data)" ], "metadata": { "id": "WrsijmHqeTOO" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### Save df_synthetic_stock as synthetic_stock_movement_data.csv & view first few lines" ], "metadata": { "id": "LfgkdZS9Tpu5" } }, { "cell_type": "code", "source": [ "df_synthetic_stock.to_csv(\"synthetic_stock_movement_data.csv\", index=False)\n", "print(\"Saved: synthetic_stock_movement_data.csv\")\n", "\n", "df_synthetic_stock.head(10)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 400 }, "id": "ysqeLXbYTxzO", "outputId": "b0750bc5-8d8b-4a26-a137-e59d36c57c9e" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Saved: synthetic_stock_movement_data.csv\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " ticker source_date sentiment_label market_attention_score day_offset \\\n", "0 AAPL 2026-04-06 neutral 29.57 1 \n", "1 AAPL 2026-04-06 neutral 29.57 2 \n", "2 AAPL 2026-04-06 neutral 29.57 3 \n", "3 AAPL 2026-04-06 neutral 29.57 4 \n", "4 AAPL 2026-04-06 neutral 29.57 5 \n", "5 AAPL 2026-04-06 neutral 29.57 6 \n", "6 AAPL 2026-04-06 neutral 29.57 7 \n", "7 AAPL 2026-04-06 neutral 29.57 8 \n", "8 AAPL 2026-04-06 neutral 29.57 9 \n", "9 AAPL 2026-04-06 neutral 29.57 10 \n", "\n", " synthetic_daily_return synthetic_close \n", "0 -0.00391 257.85 \n", "1 0.00642 259.50 \n", "2 0.00953 261.98 \n", "3 -0.00137 261.62 \n", "4 -0.00548 260.18 \n", "5 0.00771 262.19 \n", "6 -0.01021 259.51 \n", "7 -0.01106 256.64 \n", "8 0.00023 256.70 \n", "9 0.01104 259.53 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tickersource_datesentiment_labelmarket_attention_scoreday_offsetsynthetic_daily_returnsynthetic_close
0AAPL2026-04-06neutral29.571-0.00391257.85
1AAPL2026-04-06neutral29.5720.00642259.50
2AAPL2026-04-06neutral29.5730.00953261.98
3AAPL2026-04-06neutral29.574-0.00137261.62
4AAPL2026-04-06neutral29.575-0.00548260.18
5AAPL2026-04-06neutral29.5760.00771262.19
6AAPL2026-04-06neutral29.577-0.01021259.51
7AAPL2026-04-06neutral29.578-0.01106256.64
8AAPL2026-04-06neutral29.5790.00023256.70
9AAPL2026-04-06neutral29.57100.01104259.53
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_synthetic_stock", "summary": "{\n \"name\": \"df_synthetic_stock\",\n \"rows\": 500,\n \"fields\": [\n {\n \"column\": \"ticker\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"TSLA\",\n \"AMZN\",\n \"MSFT\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"source_date\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2026-04-06 00:00:00\",\n \"max\": \"2026-04-10 00:00:00\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"2026-04-07 00:00:00\",\n \"2026-04-10 00:00:00\",\n \"2026-04-08 00:00:00\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"positive\",\n \"negative\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"market_attention_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 25.52431212351776,\n \"min\": 0.0,\n \"max\": 92.88,\n \"num_unique_values\": 47,\n \"samples\": [\n 40.75,\n 59.19,\n 41.46\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"day_offset\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2,\n \"min\": 1,\n \"max\": 10,\n \"num_unique_values\": 10,\n \"samples\": [\n 9,\n 2,\n 6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"synthetic_daily_return\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.014840355688254502,\n \"min\": -0.05456,\n \"max\": 0.05703,\n \"num_unique_values\": 482,\n \"samples\": [\n 0.00208,\n -0.01429,\n -0.02077\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"synthetic_close\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 137.3968449514224,\n \"min\": 84.56,\n \"max\": 702.53,\n \"num_unique_values\": 500,\n \"samples\": [\n 174.61,\n 222.87,\n 180.21\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 19 } ] }, { "cell_type": "markdown", "source": [ "## 5. Generate synthetic financial headlines" ], "metadata": { "id": "lCyGDIr9dHA-" } }, { "cell_type": "markdown", "source": [ "### a. Create a list of 50 distinct generic financial headlines for the sentiment labels \"positive\", \"neutral\", and \"negative\"" ], "metadata": { "id": "zRTG9qhZe1k7" } }, { "cell_type": "code", "source": [ "positive_headlines = [\n", " \"Company reports stronger-than-expected quarterly earnings\",\n", " \"Shares climb after upbeat revenue forecast\",\n", " \"Analysts raise price target following strong results\",\n", " \"Stock gains as market reacts positively to guidance\",\n", " \"Firm announces successful product launch\",\n", " \"Investors welcome improved profit margins\",\n", " \"Company expands into promising new market\",\n", " \"Stock rises after strategic partnership announcement\",\n", " \"Business outlook improves on solid demand\",\n", " \"Shares advance as growth accelerates\",\n", " \"Market responds positively to cost-cutting measures\",\n", " \"Company posts robust sales growth\",\n", " \"Stock rallies after optimistic management comments\",\n", " \"Firm beats expectations on both revenue and earnings\",\n", " \"Investors reward strong operating performance\",\n", " \"Company benefits from favorable industry trends\",\n", " \"Stock jumps after positive sector momentum\",\n", " \"Business confidence rises following update\",\n", " \"Shares move higher on bullish sentiment\",\n", " \"Company sees improving customer demand\",\n", " \"Stock gains after successful expansion plan\",\n", " \"Strong quarterly update boosts investor confidence\",\n", " \"Firm reports better-than-anticipated cash flow\",\n", " \"Investors react positively to operational improvements\",\n", " \"Company delivers resilient performance in key segment\",\n", " \"Shares increase as margins beat estimates\",\n", " \"Stock trends upward after favorable outlook\",\n", " \"Business posts solid year-over-year growth\",\n", " \"Company strengthens position in competitive market\",\n", " \"Positive momentum continues after earnings release\",\n", " \"Shares rise on strong guidance revision\",\n", " \"Firm reports encouraging user growth\",\n", " \"Company records stable gains across major divisions\",\n", " \"Stock advances after improved forecast\",\n", " \"Investor optimism grows around company performance\",\n", " \"Company announces value-creating acquisition\",\n", " \"Shares benefit from positive analyst sentiment\",\n", " \"Stock rises after earnings surprise\",\n", " \"Company posts healthy balance sheet update\",\n", " \"Market confidence improves after strategic announcement\",\n", " \"Business update signals continued expansion\",\n", " \"Shares climb on stronger profitability\",\n", " \"Company sees solid momentum entering new quarter\",\n", " \"Stock reacts well to favorable macro conditions\",\n", " \"Firm posts positive free cash flow results\",\n", " \"Investors respond well to management execution\",\n", " \"Company demonstrates strong competitive advantage\",\n", " \"Stock gains on rising institutional interest\",\n", " \"Business outlook remains upbeat amid strong demand\",\n", " \"Shares finish higher after positive market reaction\"\n", "]\n", "\n", "neutral_headlines = [\n", " \"Company shares trade mostly flat after market open\",\n", " \"Investors await upcoming earnings release\",\n", " \"Stock holds steady amid mixed market signals\",\n", " \"Firm announces routine operational update\",\n", " \"Company maintains previous guidance\",\n", " \"Shares show limited movement in quiet session\",\n", " \"Analysts keep neutral outlook on stock\",\n", " \"Business performance remains broadly stable\",\n", " \"Company reports in-line quarterly results\",\n", " \"Stock trades sideways as investors assess outlook\",\n", " \"Firm sees balanced risks in coming quarter\",\n", " \"Market reaction remains muted after company update\",\n", " \"Investors monitor developments without major shift\",\n", " \"Stock closes little changed after announcement\",\n", " \"Company continues steady performance in core markets\",\n", " \"Shares remain stable following routine filing\",\n", " \"Analyst commentary remains mixed on company\",\n", " \"Business update suggests moderate momentum\",\n", " \"Company maintains current market position\",\n", " \"Stock shows limited volatility during trading\",\n", " \"Investors assess company outlook cautiously\",\n", " \"Firm posts results broadly in line with expectations\",\n", " \"Company activity remains consistent quarter over quarter\",\n", " \"Shares remain range-bound in recent session\",\n", " \"Business conditions remain unchanged for now\",\n", " \"Stock pauses after previous gains\",\n", " \"Company provides standard corporate update\",\n", " \"Investors remain on the sidelines ahead of catalyst\",\n", " \"Trading volume stays near average levels\",\n", " \"Firm continues to operate in stable conditions\",\n", " \"Company outlook appears balanced\",\n", " \"Stock reflects mixed sentiment among investors\",\n", " \"Business trends remain steady across segments\",\n", " \"Shares drift without clear direction\",\n", " \"Company sees modest changes in recent activity\",\n", " \"Market participants await further information\",\n", " \"Stock performance remains largely unchanged\",\n", " \"Firm maintains stable demand environment\",\n", " \"Company update draws limited market response\",\n", " \"Shares remain near recent average levels\",\n", " \"Analysts note both opportunities and risks\",\n", " \"Business remains resilient but unchanged\",\n", " \"Stock consolidates after recent movement\",\n", " \"Company reports no major surprises this quarter\",\n", " \"Investors continue to monitor company fundamentals\",\n", " \"Trading sentiment remains neutral around stock\",\n", " \"Company performance stays in expected range\",\n", " \"Shares show minor moves during session\",\n", " \"Business environment remains relatively stable\",\n", " \"Stock holds near prior closing levels\"\n", "]\n", "\n", "negative_headlines = [\n", " \"Company misses earnings expectations in latest quarter\",\n", " \"Shares fall after weak revenue guidance\",\n", " \"Analysts cut price target following disappointing results\",\n", " \"Stock declines as outlook weakens\",\n", " \"Firm reports lower-than-expected demand\",\n", " \"Investors react negatively to margin pressure\",\n", " \"Company faces headwinds in key markets\",\n", " \"Stock drops after cautious management comments\",\n", " \"Business update raises concerns about growth\",\n", " \"Shares slide on weaker profitability\",\n", " \"Market responds poorly to earnings miss\",\n", " \"Company lowers forecast for upcoming quarter\",\n", " \"Stock falls as uncertainty increases\",\n", " \"Firm reports declining sales in core segment\",\n", " \"Investors worry about rising costs\",\n", " \"Company struggles with slowing momentum\",\n", " \"Shares retreat after unfavorable guidance\",\n", " \"Stock under pressure amid sector weakness\",\n", " \"Business outlook deteriorates after update\",\n", " \"Company sees softer customer demand\",\n", " \"Shares tumble on disappointing announcement\",\n", " \"Analysts highlight downside risks for stock\",\n", " \"Stock weakens following operational setback\",\n", " \"Company posts lower profit than expected\",\n", " \"Investors react cautiously to weaker cash flow\",\n", " \"Shares move lower after negative market reaction\",\n", " \"Firm warns of near-term uncertainty\",\n", " \"Stock declines on falling investor confidence\",\n", " \"Company reports subdued performance in key division\",\n", " \"Shares retreat as growth slows\",\n", " \"Business update disappoints the market\",\n", " \"Stock reacts negatively to lowered expectations\",\n", " \"Company faces pressure from macroeconomic conditions\",\n", " \"Investors question strength of company outlook\",\n", " \"Shares weaken after mixed earnings report\",\n", " \"Stock slides on increased competitive pressure\",\n", " \"Company issues cautious forward guidance\",\n", " \"Market sentiment turns negative after results\",\n", " \"Business momentum appears to be fading\",\n", " \"Shares fall despite management reassurances\",\n", " \"Firm reports slowdown in customer activity\",\n", " \"Stock drops on weak sector sentiment\",\n", " \"Company underperforms market expectations\",\n", " \"Investors respond poorly to strategic uncertainty\",\n", " \"Shares lose ground after unfavorable update\",\n", " \"Company sees margin compression in latest quarter\",\n", " \"Stock remains under pressure after report\",\n", " \"Business faces near-term downside risks\",\n", " \"Shares extend losses on disappointing signals\",\n", " \"Company outlook remains challenged in current environment\"\n", "]\n", "\n", "print(len(positive_headlines), len(neutral_headlines), len(negative_headlines))" ], "metadata": { "id": "chxnRox1ezg4", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "e1470429-7927-4d62-fd8c-cab17b2d1123" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "50 50 50\n" ] } ] }, { "cell_type": "markdown", "source": [ "### b. Generate 10 headlines per stock-date using random sampling from the corresponding 50" ], "metadata": { "id": "LsIBiv8ye6FB" } }, { "cell_type": "code", "source": [ "synthetic_headline_rows = []\n", "\n", "# Full headline pools\n", "headline_dict = {\n", " \"positive\": positive_headlines,\n", " \"neutral\": neutral_headlines,\n", " \"negative\": negative_headlines\n", "}\n", "\n", "# Sampling weights by stock-date sentiment\n", "# These control the probability of drawing each headline type\n", "sampling_weights = {\n", " \"positive\": {\"positive\": 0.6, \"neutral\": 0.3, \"negative\": 0.1},\n", " \"neutral\": {\"positive\": 0.2, \"neutral\": 0.6, \"negative\": 0.2},\n", " \"negative\": {\"positive\": 0.1, \"neutral\": 0.3, \"negative\": 0.6}\n", "}\n", "\n", "for _, row in df_real.iterrows():\n", " row_sentiment = row[\"sentiment_label\"]\n", " weights = sampling_weights[row_sentiment]\n", "\n", " used_headlines = set()\n", "\n", " for i in range(1, 11): # 10 headlines total per stock-date\n", " # Randomly choose a headline category based on weighted probabilities\n", " chosen_category = random.choices(\n", " population=[\"positive\", \"neutral\", \"negative\"],\n", " weights=[\n", " weights[\"positive\"],\n", " weights[\"neutral\"],\n", " weights[\"negative\"]\n", " ],\n", " k=1\n", " )[0]\n", "\n", " # Sample one headline from that category, avoiding duplicates within the same stock-date if possible\n", " available_headlines = [\n", " h for h in headline_dict[chosen_category]\n", " if h not in used_headlines\n", " ]\n", "\n", " if not available_headlines:\n", " available_headlines = headline_dict[chosen_category]\n", "\n", " chosen_headline = random.choice(available_headlines)\n", " used_headlines.add(chosen_headline)\n", "\n", " synthetic_headline_rows.append({\n", " \"ticker\": row[\"ticker\"],\n", " \"source_date\": row[\"date\"],\n", " \"sentiment_label\": row_sentiment,\n", " \"headline_number\": i,\n", " \"synthetic_headline\": chosen_headline,\n", " \"headline_category\": chosen_category\n", " })\n", "\n", "# Create dataframe\n", "df_synthetic_headlines = pd.DataFrame(synthetic_headline_rows)" ], "metadata": { "id": "sCkZf_2Pe9Ga" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### c. Create the final dataframe df_synthetic_headlines & save it as synthetic_financial_headlines.csv" ], "metadata": { "id": "DjmaVlxAfC3O" } }, { "cell_type": "code", "source": [ "df_synthetic_headlines = pd.DataFrame(synthetic_headline_rows)\n", "\n", "df_synthetic_headlines.to_csv(\"synthetic_financial_headlines.csv\", index=False)\n", "print(\"Saved: synthetic_financial_headlines.csv\")\n", "\n", "display(df_synthetic_headlines.head(20))" ], "metadata": { "id": "Kgxosw9QfHZr", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "outputId": "189510d2-e41a-4a6f-cbd8-242231be2ae0" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Saved: synthetic_financial_headlines.csv\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " ticker source_date sentiment_label headline_number \\\n", "0 AAPL 2026-04-06 neutral 1 \n", "1 AAPL 2026-04-06 neutral 2 \n", "2 AAPL 2026-04-06 neutral 3 \n", "3 AAPL 2026-04-06 neutral 4 \n", "4 AAPL 2026-04-06 neutral 5 \n", "5 AAPL 2026-04-06 neutral 6 \n", "6 AAPL 2026-04-06 neutral 7 \n", "7 AAPL 2026-04-06 neutral 8 \n", "8 AAPL 2026-04-06 neutral 9 \n", "9 AAPL 2026-04-06 neutral 10 \n", "10 AAPL 2026-04-07 positive 1 \n", "11 AAPL 2026-04-07 positive 2 \n", "12 AAPL 2026-04-07 positive 3 \n", "13 AAPL 2026-04-07 positive 4 \n", "14 AAPL 2026-04-07 positive 5 \n", "15 AAPL 2026-04-07 positive 6 \n", "16 AAPL 2026-04-07 positive 7 \n", "17 AAPL 2026-04-07 positive 8 \n", "18 AAPL 2026-04-07 positive 9 \n", "19 AAPL 2026-04-07 positive 10 \n", "\n", " synthetic_headline headline_category \n", "0 Investors await upcoming earnings release neutral \n", "1 Analyst commentary remains mixed on company neutral \n", "2 Stock holds near prior closing levels neutral \n", "3 Stock gains on rising institutional interest positive \n", "4 Investors react negatively to margin pressure negative \n", "5 Firm announces routine operational update neutral \n", "6 Firm beats expectations on both revenue and ea... positive \n", "7 Business remains resilient but unchanged neutral \n", "8 Stock rallies after optimistic management comm... positive \n", "9 Business environment remains relatively stable neutral \n", "10 Investors reward strong operating performance positive \n", "11 Shares move higher on bullish sentiment positive \n", "12 Company shares trade mostly flat after market ... neutral \n", "13 Market reaction remains muted after company up... neutral \n", "14 Shares remain range-bound in recent session neutral \n", "15 Firm beats expectations on both revenue and ea... positive \n", "16 Analysts highlight downside risks for stock negative \n", "17 Business posts solid year-over-year growth positive \n", "18 Shares increase as margins beat estimates positive \n", "19 Firm announces routine operational update neutral " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tickersource_datesentiment_labelheadline_numbersynthetic_headlineheadline_category
0AAPL2026-04-06neutral1Investors await upcoming earnings releaseneutral
1AAPL2026-04-06neutral2Analyst commentary remains mixed on companyneutral
2AAPL2026-04-06neutral3Stock holds near prior closing levelsneutral
3AAPL2026-04-06neutral4Stock gains on rising institutional interestpositive
4AAPL2026-04-06neutral5Investors react negatively to margin pressurenegative
5AAPL2026-04-06neutral6Firm announces routine operational updateneutral
6AAPL2026-04-06neutral7Firm beats expectations on both revenue and ea...positive
7AAPL2026-04-06neutral8Business remains resilient but unchangedneutral
8AAPL2026-04-06neutral9Stock rallies after optimistic management comm...positive
9AAPL2026-04-06neutral10Business environment remains relatively stableneutral
10AAPL2026-04-07positive1Investors reward strong operating performancepositive
11AAPL2026-04-07positive2Shares move higher on bullish sentimentpositive
12AAPL2026-04-07positive3Company shares trade mostly flat after market ...neutral
13AAPL2026-04-07positive4Market reaction remains muted after company up...neutral
14AAPL2026-04-07positive5Shares remain range-bound in recent sessionneutral
15AAPL2026-04-07positive6Firm beats expectations on both revenue and ea...positive
16AAPL2026-04-07positive7Analysts highlight downside risks for stocknegative
17AAPL2026-04-07positive8Business posts solid year-over-year growthpositive
18AAPL2026-04-07positive9Shares increase as margins beat estimatespositive
19AAPL2026-04-07positive10Firm announces routine operational updateneutral
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"display(df_synthetic_headlines\",\n \"rows\": 20,\n \"fields\": [\n {\n \"column\": \"ticker\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"AAPL\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"source_date\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2026-04-06 00:00:00\",\n \"max\": \"2026-04-07 00:00:00\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"2026-04-07 00:00:00\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"headline_number\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2,\n \"min\": 1,\n \"max\": 10,\n \"num_unique_values\": 10,\n \"samples\": [\n 9\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"synthetic_headline\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"Investors await upcoming earnings release\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"headline_category\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "## 6. Combine synthetic components into one final synthetic dataset" ], "metadata": { "id": "05eQtBPhgHRY" } }, { "cell_type": "markdown", "source": [ "### a. Create a synthetic combined dataset" ], "metadata": { "id": "oOx_mgz71Yte" } }, { "cell_type": "code", "source": [ "# Summarise synthetic stock paths to one row per ticker-date\n", "synthetic_stock_summary = (\n", " df_synthetic_stock.groupby([\"ticker\", \"source_date\"], as_index=False)\n", " .agg(\n", " avg_synthetic_return=(\"synthetic_daily_return\", \"mean\"),\n", " synthetic_volatility=(\"synthetic_daily_return\", \"std\"),\n", " final_synthetic_close=(\"synthetic_close\", \"last\")\n", " )\n", ")\n", "\n", "# Summarise synthetic headlines to one row per ticker-date\n", "synthetic_headline_summary = (\n", " df_synthetic_headlines.groupby([\"ticker\", \"source_date\"], as_index=False)\n", " .agg(\n", " synthetic_headline_count=(\"synthetic_headline\", \"count\"),\n", " synthetic_headlines_text=(\"synthetic_headline\", lambda x: \" || \".join(x.astype(str))),\n", " positive_headline_count=(\"headline_category\", lambda x: (x == \"positive\").sum()),\n", " neutral_headline_count=(\"headline_category\", lambda x: (x == \"neutral\").sum()),\n", " negative_headline_count=(\"headline_category\", lambda x: (x == \"negative\").sum())\n", " )\n", ")\n", "\n", "# Merge synthetic summaries into one final synthetic dataset\n", "df_synthetic_final = pd.merge(\n", " df_real[[\"ticker\", \"date\", \"close\", \"headline_count\", \"market_attention_score\", \"sentiment_label\"]],\n", " synthetic_stock_summary,\n", " left_on=[\"ticker\", \"date\"],\n", " right_on=[\"ticker\", \"source_date\"],\n", " how=\"left\"\n", ")\n", "\n", "df_synthetic_final = pd.merge(\n", " df_synthetic_final,\n", " synthetic_headline_summary,\n", " left_on=[\"ticker\", \"date\"],\n", " right_on=[\"ticker\", \"source_date\"],\n", " how=\"left\"\n", ")\n", "\n", "# Drop duplicate merge keys\n", "df_synthetic_final = df_synthetic_final.drop(\n", " columns=[\"source_date_x\", \"source_date_y\"],\n", " errors=\"ignore\"\n", ")\n", "\n", "# Optional: reorder columns to match the real dataset style more closely\n", "df_synthetic_final = df_synthetic_final[\n", " [\n", " \"ticker\",\n", " \"date\",\n", " \"close\",\n", " \"headline_count\",\n", " \"market_attention_score\",\n", " \"sentiment_label\",\n", " \"avg_synthetic_return\",\n", " \"synthetic_volatility\",\n", " \"final_synthetic_close\",\n", " \"synthetic_headline_count\",\n", " \"positive_headline_count\",\n", " \"neutral_headline_count\",\n", " \"negative_headline_count\",\n", " \"synthetic_headlines_text\"\n", " ]\n", "].copy()" ], "metadata": { "collapsed": true, "id": "YEQwi_sqgJ-g" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### b. Save it as synthetic_only_dataset.csv" ], "metadata": { "id": "KIZiN0o3haD0" } }, { "cell_type": "code", "source": [ "df_synthetic_final.to_csv(\"final_synthetic_dataset.csv\", index=False)\n", "print(\"Saved: final_synthetic_dataset.csv\")" ], "metadata": { "id": "GixlTkUyhdmB", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "aad5358a-92d9-4537-ad6b-5c2eac5a5df9" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Saved: final_synthetic_dataset.csv\n" ] } ] }, { "cell_type": "markdown", "source": [ "### c. View first few lines" ], "metadata": { "id": "i73ElhKF1yiR" } }, { "cell_type": "code", "source": [ "display(df_synthetic_final.head(10))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 574 }, "id": "6gWk1gc611on", "outputId": "20acbe91-51c0-4b4f-ab22-15daca255f58" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ " ticker date close headline_count market_attention_score \\\n", "0 AAPL 2026-04-06 258.859985 13 29.57 \n", "1 AAPL 2026-04-07 253.500000 29 59.73 \n", "2 AAPL 2026-04-08 258.899994 26 57.41 \n", "3 AAPL 2026-04-09 260.489990 7 22.20 \n", "4 AAPL 2026-04-10 260.480011 20 40.50 \n", "5 AMZN 2026-04-06 212.789993 0 0.00 \n", "6 AMZN 2026-04-07 213.770004 10 28.73 \n", "7 AMZN 2026-04-08 221.250000 19 43.42 \n", "8 AMZN 2026-04-09 233.649994 43 87.24 \n", "9 AMZN 2026-04-10 238.380005 28 61.05 \n", "\n", " sentiment_label avg_synthetic_return synthetic_volatility \\\n", "0 neutral 0.000290 0.008071 \n", "1 positive 0.007028 0.012363 \n", "2 positive 0.008240 0.012922 \n", "3 neutral 0.001543 0.009618 \n", "4 neutral 0.000151 0.017220 \n", "5 negative -0.008547 0.017343 \n", "6 neutral 0.005595 0.011042 \n", "7 neutral 0.005049 0.015248 \n", "8 positive 0.018966 0.015949 \n", "9 positive 0.011784 0.018745 \n", "\n", " final_synthetic_close synthetic_headline_count positive_headline_count \\\n", "0 259.53 10 3 \n", "1 271.70 10 5 \n", "2 280.83 10 5 \n", "3 264.43 10 2 \n", "4 260.53 10 1 \n", "5 195.02 10 0 \n", "6 225.92 10 1 \n", "7 232.44 10 1 \n", "8 281.63 10 6 \n", "9 267.59 10 7 \n", "\n", " neutral_headline_count negative_headline_count \\\n", "0 6 1 \n", "1 4 1 \n", "2 4 1 \n", "3 6 2 \n", "4 7 2 \n", "5 3 7 \n", "6 6 3 \n", "7 5 4 \n", "8 2 2 \n", "9 2 1 \n", "\n", " synthetic_headlines_text \n", "0 Investors await upcoming earnings release || A... \n", "1 Investors reward strong operating performance ... \n", "2 Company sees modest changes in recent activity... \n", "3 Analysts note both opportunities and risks || ... \n", "4 Stock holds near prior closing levels || Compa... \n", "5 Shares retreat after unfavorable guidance || C... \n", "6 Market participants await further information ... \n", "7 Analysts keep neutral outlook on stock || Comp... \n", "8 Company benefits from favorable industry trend... \n", "9 Investors monitor developments without major s... " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tickerdatecloseheadline_countmarket_attention_scoresentiment_labelavg_synthetic_returnsynthetic_volatilityfinal_synthetic_closesynthetic_headline_countpositive_headline_countneutral_headline_countnegative_headline_countsynthetic_headlines_text
0AAPL2026-04-06258.8599851329.57neutral0.0002900.008071259.5310361Investors await upcoming earnings release || A...
1AAPL2026-04-07253.5000002959.73positive0.0070280.012363271.7010541Investors reward strong operating performance ...
2AAPL2026-04-08258.8999942657.41positive0.0082400.012922280.8310541Company sees modest changes in recent activity...
3AAPL2026-04-09260.489990722.20neutral0.0015430.009618264.4310262Analysts note both opportunities and risks || ...
4AAPL2026-04-10260.4800112040.50neutral0.0001510.017220260.5310172Stock holds near prior closing levels || Compa...
5AMZN2026-04-06212.78999300.00negative-0.0085470.017343195.0210037Shares retreat after unfavorable guidance || C...
6AMZN2026-04-07213.7700041028.73neutral0.0055950.011042225.9210163Market participants await further information ...
7AMZN2026-04-08221.2500001943.42neutral0.0050490.015248232.4410154Analysts keep neutral outlook on stock || Comp...
8AMZN2026-04-09233.6499944387.24positive0.0189660.015949281.6310622Company benefits from favorable industry trend...
9AMZN2026-04-10238.3800052861.05positive0.0117840.018745267.5910721Investors monitor developments without major s...
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"display(df_synthetic_final\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"ticker\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"AMZN\",\n \"AAPL\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"date\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2026-04-06 00:00:00\",\n \"max\": \"2026-04-10 00:00:00\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"2026-04-07 00:00:00\",\n \"2026-04-10 00:00:00\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"close\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 19.84038400018096,\n \"min\": 212.7899932861328,\n \"max\": 260.489990234375,\n \"num_unique_values\": 10,\n \"samples\": [\n 233.64999389648438,\n 253.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"headline_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12,\n \"min\": 0,\n \"max\": 43,\n \"num_unique_values\": 10,\n \"samples\": [\n 43,\n 29\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"market_attention_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 24.591288091517285,\n \"min\": 0.0,\n \"max\": 87.24,\n \"num_unique_values\": 10,\n \"samples\": [\n 87.24,\n 59.73\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"avg_synthetic_return\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.007432709995538489,\n \"min\": -0.008547,\n \"max\": 0.018966,\n \"num_unique_values\": 10,\n \"samples\": [\n 0.018966,\n 0.0070279999999999995\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"synthetic_volatility\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0035921213498186816,\n \"min\": 0.008071472397689697,\n \"max\": 0.018744805621232187,\n \"num_unique_values\": 10,\n \"samples\": [\n 0.0159487263162646,\n 0.012362548101243351\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"final_synthetic_close\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 27.679106761438504,\n \"min\": 195.02,\n \"max\": 281.63,\n \"num_unique_values\": 10,\n \"samples\": [\n 281.63,\n 271.7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"synthetic_headline_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 10,\n \"max\": 10,\n \"num_unique_values\": 1,\n \"samples\": [\n 10\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"positive_headline_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2,\n \"min\": 0,\n \"max\": 7,\n \"num_unique_values\": 7,\n \"samples\": [\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"neutral_headline_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 2,\n \"max\": 7,\n \"num_unique_values\": 6,\n \"samples\": [\n 6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"negative_headline_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 7,\n \"num_unique_values\": 5,\n \"samples\": [\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"synthetic_headlines_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Company benefits from favorable industry trends || Shares show limited movement in quiet session || Company records stable gains across major divisions || Stock holds near prior closing levels || Business outlook improves on solid demand || Stock advances after improved forecast || Market responds poorly to earnings miss || Company sees solid momentum entering new quarter || Investors reward strong operating performance || Company outlook remains challenged in current environment\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} } ] } ] }