{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from time import sleep\n", "import datetime\n", "import os\n", "from utils.ipynb_helpers import read_data, write_df, convert_tz, add_tz\n", "from dotenv import load_dotenv\n", "\n", "# Create a .env file and add your keys\n", "load_dotenv()\n", "\n", "# Location to save raw data from data providers\n", "DATA_RAW = \"data/raw\"\n", "\n", "\n", "equities = [\"XOM\", \"CVX\", \"COP\", \"BP\", \"PBR\", \"WTI\", \"TTE\", \"EQNR\", \"EOG\", \"ENB\", \"SLB\"]\n", "more_equities = []\n", "\n", "crude_oil = [\"CL=F\", \"BZ=F\"] # wti, brent,\n", "random = [\"TSLA\", \"AAPL\"]\n", "\n", "materials_equities = [\"BHP\", \"LIN\", \"RIO\", \"VALE\", \"APD\", \"FCX\", \"SHW\", \"SCCO\", \"CTVA\", \"ECL\", \"NUE\", \"NTR\"]\n", "\n", "\n", "# https://en.wikipedia.org/wiki/List_of_countries_by_oil_production\n", "# https://www.weforum.org/agenda/2016/05/which-economies-are-most-reliant-on-oil/\n", "# OPEC: Iran, Iraq, Kuwait, Saudi Arabia, Venezuela\n", "# fx_opec = [_, \"C:USDIQD\", \"C:USDKWD\", \"C:USDSAR\", \"C:USDVEF\"]\n", "\n", "# OPEC+: Algeria, Angola, Congo, Equatorial Guinea, Gabon, Libya, Nigeria, United Arab Emirates\n", "# fx_opec_pp = [\"C:USDDZD\",_, \"C:USDCDF\", \"C:USDGNF\", _, \"C:USDLYD\", \"C:USDNGN\", \"C:USDAED\"]\n", "\n", "# Large: US, Russia, China, Canada, Norway\n", "# Other important: Qatar, Kazakhstan\n", "# fx_other= [\"C:USDQAR\", \"C:USDKZT\"]\n", "\n", "fx = [\"C:USDSAR\", \"C:USDAED\"]\n", "\n", "\n", "tickers = equities # + crude_oil" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Get Data From Data Provider" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Y Finance\n", "\n", "import yfinance as yf\n", "\n", "\n", "def use_yfinance(\n", " tickers, out_file, timeframe=\"day\", start=\"2000-01-01\", end=\"2023-01-01\"\n", "):\n", " assert timeframe == \"day\", \"Use day timeframe for day\"\n", "\n", " data = yf.download(tickers, start=start, end=end, group_by=\"ticker\")\n", "\n", " if len(tickers) == 1:\n", " data = pd.concat([data], axis=1, keys=[tickers[0]])\n", "\n", " data.index.rename(\"date\", inplace=True)\n", " data.rename(columns=lambda x: str.lower(x), level=1, inplace=True)\n", "\n", " if data.index.to_series().dt.tz is None:\n", " print(\"Adding time\")\n", " data = add_tz(data, time_zone=\"UTC\")\n", "\n", " if out_file is not None:\n", " write_df(data, out_file)\n", "\n", " return data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Alpha Vantage\n", "\n", "\n", "def csv_str_to_df(decoded_content, ticker):\n", " \"\"\"CSV string to df\"\"\"\n", " lines = decoded_content.splitlines()\n", " data = pd.DataFrame(\n", " [row.split(\",\") for row in lines[1:]],\n", " columns=[\"date\", *lines[0].split(\",\")[1:]],\n", " )\n", "\n", " data = data.reset_index(drop=True).set_index(\"date\")\n", " data.index = pd.to_datetime(data.index)\n", "\n", " # Add timezome -- we assume it is sent in with unlabled eastern time\n", " if data.index.to_series().dt.tz is None:\n", " print(\"CONVERTING TIME\")\n", " data = add_tz(data, time_zone=\"US/Eastern\")\n", " data = convert_tz(data, time_zone=\"UTC\")\n", " data = pd.concat([data], axis=1, keys=[ticker])\n", " return data\n", "\n", "\n", "def alpha_vantage_get_ticker_data(ticker, time=\"1min\", year=1, month=1):\n", " \"\"\"Function to get (ticker, year, month) data using alpha vantage's time series intraday extended API\"\"\"\n", " ALPHA_VANTAGE_API_KEY = os.environ.get(\"ALPHA_VANTAGE_API_KEY\")\n", " import requests\n", "\n", " CSV_URL = f\"https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY_EXTENDED&symbol={ticker}&interval={time}&slice=year{year}month{month}&apikey={ALPHA_VANTAGE_API_KEY}\"\n", "\n", " while True:\n", " with requests.Session() as s:\n", " download = s.get(CSV_URL)\n", " decoded_content = download.content.decode(\"utf-8\")\n", " print(\n", " f\"ticker: {ticker}, y{year} m{month}; response length: {len(decoded_content)}\"\n", " )\n", "\n", " if len(decoded_content) == 236:\n", " # API too many requests\n", " sleep(60)\n", " elif len(decoded_content) <= 243:\n", " # Token doesn't exist or something\n", " print(f\"Error getting {ticker}, y{year}, m{month}. We are skipping\")\n", " print(decoded_content)\n", " return None\n", " else:\n", " return csv_str_to_df(decoded_content, ticker)\n", "\n", "\n", "def use_alpha_vantage(tickers, out_file, time=\"1min\"):\n", " \"\"\"Function to get multiple full tickers data using alpha vantage's time series intraday extended API\"\"\"\n", "\n", " dfs = []\n", " for ticker in tickers:\n", " t_dfs = []\n", " for year in range(1, 3):\n", " for month in range(1, 13):\n", " df_temp = alpha_vantage_get_ticker_data(\n", " ticker, time=time, year=year, month=month\n", " )\n", " if df_temp is not None:\n", " t_dfs.append(df_temp)\n", "\n", " if len(t_dfs):\n", " dfs.append(pd.concat(t_dfs, axis=0))\n", " else:\n", " print(f\"Skipped {ticker}.\")\n", " df = pd.concat(dfs, axis=1, sort=True)\n", " df.index.rename(\"date\", inplace=True)\n", "\n", " write_df(df, out_file)\n", "\n", " return df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Alpaca\n", "\n", "\n", "def use_alpaca(tickers, out_file, timeframe=\"1Minute\", start=\"2017-01-01\"):\n", " APCA_API_BASE_URL = os.environ.get(\"APCA_API_BASE_URL\")\n", " APCA_API_KEY_ID = os.environ.get(\"APCA_API_KEY_ID\")\n", " APCA_API_SECRET_KEY = os.environ.get(\"APCA_API_SECRET_KEY\")\n", " import alpaca_trade_api as tradeapi\n", "\n", " alpaca = tradeapi.REST(\n", " key_id=APCA_API_KEY_ID,\n", " secret_key=APCA_API_SECRET_KEY,\n", " base_url=APCA_API_BASE_URL,\n", " )\n", " account = alpaca.get_account()\n", " print(account.status)\n", "\n", " dfs = []\n", " for ticker in tickers:\n", " print(\"Getting\", ticker)\n", " df = alpaca.get_bars(ticker, timeframe, start).df\n", " print(\"Recieved\", ticker)\n", " df.index.name = \"date\"\n", " df = pd.concat([df], axis=1, keys=[ticker])\n", " dfs.append(df)\n", " df = pd.concat(dfs, axis=1, sort=True)\n", " df.index.rename(\"date\", inplace=True)\n", "\n", " if out_file is not None:\n", " write_df(df, out_file)\n", "\n", " return df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Polygon\n", "\n", "\n", "def use_polygon(tickers, out_file, multiplier=1, timespan=\"minute\", start=\"2000-01-01\"):\n", " POLYGON_API_KEY = os.environ.get(\"POLYGON_API_KEY\")\n", " from polygon import RESTClient\n", "\n", " client = RESTClient(POLYGON_API_KEY)\n", " dfs = []\n", " end = datetime.datetime.utcnow()\n", " start_og = start\n", " for ticker in tickers:\n", " start = start_og\n", " df_agg = None\n", " response_len = None\n", " i = 0\n", " print(\"Getting\", ticker)\n", " while response_len != 1:\n", " i += 1\n", " aggs = client.get_aggs(\n", " ticker,\n", " multiplier,\n", " timespan,\n", " start,\n", " end,\n", " adjusted=True,\n", " sort=\"asc\",\n", " limit=50000,\n", " )\n", " df = pd.DataFrame(aggs)\n", " df.index = pd.DatetimeIndex(\n", " pd.to_datetime(df[\"timestamp\"], unit=\"ms\", utc=True)\n", " )\n", " df.index.name = \"date\"\n", " df = df.filter([\"open\", \"high\", \"low\", \"close\", \"volume\", \"vwap\"], axis=1)\n", " response_len = len(df.index)\n", " start = df.last_valid_index()\n", " print(i, response_len)\n", " if df_agg is not None:\n", " df_agg.drop(index=df_agg.index[-1], axis=0, inplace=True)\n", " df_agg = pd.merge(df_agg.reset_index(), df.reset_index(), how=\"outer\")\n", " df_agg = df_agg.set_index(\"date\")\n", " else:\n", " df_agg = df\n", " sleep(1) # Attempt to be nice\n", " df_agg = pd.concat([df_agg], axis=1, keys=[ticker])\n", " dfs.append(df_agg)\n", " print(\"Recieved\", ticker)\n", "\n", " df = pd.concat(dfs, axis=1, sort=True)\n", " df.index.rename(\"date\", inplace=True)\n", "\n", " if out_file is not None:\n", " write_df(df, out_file)\n", "\n", " return df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Yahoo Finance\n", "df = use_yfinance(\n", " [\"AAPL\", \"TSLA\"], os.path.join(DATA_RAW, \"aapl_day_full.csv\"), start=\"1970-01-01\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Alpha Vantage\n", "df = use_alpha_vantage(tickers, os.path.join(DATA_RAW, \"realdata.csv\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Alpaca\n", "df = use_alpaca(\n", " tickers + random, os.path.join(DATA_RAW, \"realdata_alp_1h.csv\"), timeframe=\"1Hour\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Polygon\n", "df = use_polygon(\n", " materials_equities,\n", " os.path.join(DATA_RAW, \"materials_1h.csv\"),\n", " multiplier=1,\n", " timespan=\"hour\",\n", " start=\"2000-01-01\",\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Extras" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Read Data From All-Data CSV (Multi Index Columns)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_all = read_data(os.path.join(DATA_RAW, \"realdata.csv\"))\n", "# df = read_data(\"tsla_aapl.csv\")\n", "print(df_all.head())\n", "print(df.head())\n", "print(df_all.columns)\n", "print(df.columns)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Concatenate two datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "run = False\n", "if run and not df.columns.equals(df_all.columns):\n", " df_new = write_df(\n", " pd.concat([df_all, df], axis=1), os.path.join(DATA_RAW, \"realdata.csv\")\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Remove rows with a lot of NANs\n", "This is important when using FX data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_f = df.copy()\n", "df_f = df_f.dropna(axis=0, thresh=50) #80\n", "write_df(df_f, os.path.join(DATA_RAW, \"realdata_pol_1h.csv\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.tail(80)" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" }, "vscode": { "interpreter": { "hash": "51980e48e269f7c05efac26b22569386591d7f1d45336266d53ed7fc3ab7efc6" } } }, "nbformat": 4, "nbformat_minor": 2 }