{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "import datetime\n", "import pytz\n", "import numpy as np\n", "from utils.ipynb_helpers import read_data, write_df, convert_tz\n", "\n", "\n", "# Location to open raw data from data providers\n", "DATA_RAW = \"data/raw\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Read Data From All-Data CSV (Multi Index Columns)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "stock=True\n", "df_all = read_data(os.path.join(DATA_RAW, \"realdata_pol_1h.csv\"), stock=stock)\n", "# df_all = read_data(os.path.join(DATA_RAW, \"other/PEMSBAY.csv\"), stock=stock)\n", "\n", "df_all = df_all[df_all.columns[:-12]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Filtering & Processing the Master Dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def percentage_nans(data, sort=True):\n", " percent_missing = data.isnull().sum() * 100 / len(data)\n", " missing_value_df = pd.DataFrame(\n", " {\"percent_missing\": percent_missing} #'column_name': data.columns,\n", " )\n", " if sort:\n", " missing_value_df.sort_values(\"percent_missing\", inplace=True)\n", " return missing_value_df\n", "\n", "\n", "def filter_percentage_nans(data, thresh=0.1):\n", " thresh *= 100\n", " per_nans = percentage_nans(data, sort=False)\n", " return data.loc[:, per_nans[per_nans[\"percent_missing\"] < thresh].index]\n", "\n", "\n", "def filter_intra_ticker(data, cols=[\"close\"]):\n", " if cols is None:\n", " return data\n", " return data.iloc[\n", " :, data.columns.get_level_values(1).isin(cols)\n", " ] # data.xs(\"close\",level=1, axis=1)\n", "\n", "\n", "def no_premarket_after_hours(data):\n", " mkt_start = datetime.time(hour=9, minute=30, tzinfo=pytz.timezone(\"US/Eastern\"))\n", " mkt_end = datetime.time(hour=15, minute=59, tzinfo=pytz.timezone(\"US/Eastern\"))\n", " data = convert_tz(data, time_zone=\"US/Eastern\")\n", " data = data.between_time(mkt_start, mkt_end)\n", " data = convert_tz(data, time_zone=\"UTC\")\n", " return data\n", "\n", "\n", "def add_technical(data):\n", " for ticker in data.columns.get_level_values(0).unique():\n", " # Assumption: close/open values are positive and a zero value means that datapoint is missing so we say no change\n", " data[ticker, \"pctchange\"] = (\n", " data[ticker, \"close\"] / data[ticker, \"open\"] - 1\n", " ).fillna(0.0).replace([np.inf, -np.inf, -1], 0.0)\n", " data[ticker, \"logpctchange\"] = np.log(\n", " data[ticker, \"close\"] / data[ticker, \"open\"]\n", " ).fillna(0.0).replace([np.inf, -np.inf], 0.0)\n", "\n", "\n", " # data[ticker, \"pctchange-1\"] = data[ticker, \"pctchange\"].shift(1,fill_value=0.0)\n", " # data[ticker, \"pctchange-2\"] = data[ticker, \"pctchange\"].shift(2,fill_value=0.0)\n", "\n", " data[ticker, \"shortsma\"] = (\n", " data[ticker, \"close\"].rolling(5).mean().fillna(data[ticker, \"close\"])\n", " )\n", " # data[ticker,'shortma-1'] = data[ticker,'shortsma'].shift(1)\n", " # data[ticker,'shortma-2'] = data[ticker,'shortsma'].shift(2)\n", " # print(data.columns.sort_values())\n", " data = data.reindex(sorted(data.columns), axis=1)\n", " # data.reindex(columns=data.columns.sort_values().get_level_values(0).unique(), level=0)\n", " return data\n", "\n", "if stock:\n", " # Filter df_all to normal hours\n", " df_all = no_premarket_after_hours(df_all)\n", "\n", "percentage_nans(df_all).tail(40)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = filter_percentage_nans(df_all, 0.08) #0.40\n", "print(df.columns.get_level_values(0).unique())\n", "df.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Add & filter columns\n", "df = add_technical(df)\n", "\n", "# None\n", "# [\"close\"]\n", "# [\"pctchange\"]\n", "# [\"open\", \"high\", \"low\", \"close\", \"volume\", 'pctchange', \"shortsma\"]\n", "df = filter_intra_ticker(\n", " df, cols=[\"open\", \"close\", \"pctchange\", \"logpctchange\", \"shortsma\"]\n", ")\n", "\n", "df.head(20)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "df_t = df[\"WTI\", \"pctchange\"]\n", "start_date = \"2022-10-01\"\n", "end_date = \"2022-11-01\"\n", "f1 = df_t[df.index > start_date]\n", "f2 = f1[f1.index < end_date]\n", "print(f2)\n", "# f = plt.figure()\n", "# f.set_figwidth(60)\n", "# f.set_figheight(20)\n", "plt.figure(figsize=(24,4))\n", "plt.plot(np.arange(f2.index.to_numpy().shape[0]), 3.3* np.cumprod(f2.to_numpy()+1))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Fill NaNs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def ffill_nans(data):\n", " data = data.fillna(method=\"ffill\")\n", " data = data.dropna()\n", " return data\n", "\n", "\n", "def del_nans_ffill(data, thresh):\n", " data = data.dropna(thresh=thresh)\n", " data = ffill_nans(data)\n", " return data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = ffill_nans(df)\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Clip Outliers" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def clip_outliers(data, p=0.005):\n", " lower = data.quantile(p)\n", " upper = data.quantile(1 - p)\n", "\n", " return data.clip(lower=lower, upper=upper, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if stock:\n", " df = clip_outliers(df)\n", "\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Save Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sometimes it errors bc the path doesn't exist but just run it again\n", "write_df(df, \"data/stock/material_1h.csv\")\n", "# write_df(df, \"data/other/PEMSBAY.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Extras" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Read data and convert to percent delta" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# df_new = read_data(\"data/stock/close_1h.csv\")\n", "\n", "# print(\"Before:\\n\", df_new.head())\n", "# df_new = df_new.pct_change()\n", "# df_new.iloc[0] = 0\n", "\n", "# print(\"After:\\n\",df_new.head())\n", "# write_df(df_new, \"data/stock/close_1h_pct_change.csv\")" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" }, "vscode": { "interpreter": { "hash": "51980e48e269f7c05efac26b22569386591d7f1d45336266d53ed7fc3ab7efc6" } } }, "nbformat": 4, "nbformat_minor": 2 }