{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from time import sleep\n", "import datetime\n", "import os\n", "from utils.ipynb_helpers import read_data, write_df, convert_tz, add_tz\n", "from dotenv import load_dotenv\n", "\n", "# Create a .env file and add your keys\n", "load_dotenv()\n", "\n", "# Location to save raw data from data providers\n", "DATA_RAW = \"data/raw\"\n", "\n", "\n", "equities = [\"XOM\", \"CVX\", \"COP\", \"BP\", \"PBR\", \"WTI\", \"TTE\", \"EQNR\", \"EOG\", \"ENB\", \"SLB\"]\n", "more_equities = []\n", "\n", "crude_oil = [\"CL=F\", \"BZ=F\"] # wti, brent,\n", "random = [\"TSLA\", \"AAPL\"]\n", "\n", "materials_equities = [\"BHP\", \"LIN\", \"RIO\", \"VALE\", \"APD\", \"FCX\", \"SHW\", \"SCCO\", \"CTVA\", \"ECL\", \"NUE\", \"NTR\"]\n", "\n", "\n", "# https://en.wikipedia.org/wiki/List_of_countries_by_oil_production\n", "# https://www.weforum.org/agenda/2016/05/which-economies-are-most-reliant-on-oil/\n", "# OPEC: Iran, Iraq, Kuwait, Saudi Arabia, Venezuela\n", "# fx_opec = [_, \"C:USDIQD\", \"C:USDKWD\", \"C:USDSAR\", \"C:USDVEF\"]\n", "\n", "# OPEC+: Algeria, Angola, Congo, Equatorial Guinea, Gabon, Libya, Nigeria, United Arab Emirates\n", "# fx_opec_pp = [\"C:USDDZD\",_, \"C:USDCDF\", \"C:USDGNF\", _, \"C:USDLYD\", \"C:USDNGN\", \"C:USDAED\"]\n", "\n", "# Large: US, Russia, China, Canada, Norway\n", "# Other important: Qatar, Kazakhstan\n", "# fx_other= [\"C:USDQAR\", \"C:USDKZT\"]\n", "\n", "fx = [\"C:USDSAR\", \"C:USDAED\"]\n", "\n", "tickers = equities # + crude_oil" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Get Data From Data Provider" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Y Finance\n", "\n", "import yfinance as yf\n", "\n", "\n", "def use_yfinance(\n", " tickers, out_file, timeframe=\"day\", start=\"2000-01-01\", end=\"2023-01-01\"\n", "):\n", " assert timeframe == \"day\", \"Use day timeframe for day\"\n", "\n", " data = yf.download(tickers, start=start, end=end, group_by=\"ticker\")\n", "\n", " if len(tickers) == 1:\n", " data = pd.concat([data], axis=1, keys=[tickers[0]])\n", "\n", " data.index.rename(\"date\", inplace=True)\n", " data.rename(columns=lambda x: str.lower(x), level=1, inplace=True)\n", "\n", " if data.index.to_series().dt.tz is None:\n", " print(\"Adding time\")\n", " data = add_tz(data, time_zone=\"UTC\")\n", "\n", " if out_file is not None:\n", " write_df(data, out_file)\n", "\n", " return data" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# Alpha Vantage\n", "\n", "\n", "def csv_str_to_df(decoded_content, ticker):\n", " \"\"\"CSV string to df\"\"\"\n", " print(decoded_content[:1000])\n", " lines = decoded_content.splitlines()\n", " print(len(lines), lines[0].split(\",\")[1:])\n", " print(lines[2])\n", " #while(1):pass\n", " data = pd.DataFrame(\n", " [row.split(\",\") for row in lines[1:]],\n", " columns=[\"date\", *lines[0].split(\",\")[1:]],\n", " )\n", " \n", "\n", " data = data.reset_index(drop=True).set_index(\"date\")\n", " data.index = pd.to_datetime(data.index)\n", "\n", " # Add timezome -- we assume it is sent in with unlabled eastern time\n", " if data.index.to_series().dt.tz is None:\n", " print(\"CONVERTING TIME\")\n", " data = add_tz(data, time_zone=\"US/Eastern\")\n", " data = convert_tz(data, time_zone=\"UTC\")\n", " data = pd.concat([data], axis=1, keys=[ticker])\n", " return data\n", "\n", "\n", "def alpha_vantage_get_ticker_data(ticker, time=\"1min\", year=1, month=1):\n", " \"\"\"Function to get (ticker, year, month) data using alpha vantage's time series intraday extended API\"\"\"\n", " ALPHA_VANTAGE_API_KEY = os.environ.get(\"ALPHA_VANTAGE_API_KEY\")\n", " import requests\n", "\n", " CSV_URL = f\"https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol={ticker}&interval={time}&month={2026-year}-{11-month:02d}&outputsize=full&apikey={ALPHA_VANTAGE_API_KEY}\"\n", "\n", " while True:\n", " with requests.Session() as s:\n", " download = s.get(CSV_URL)\n", " decoded_content = download.content.decode(\"utf-8\")\n", " print(\n", " f\"ticker: {ticker}, y{year} m{month}; response length: {len(decoded_content)}\"\n", " )\n", "\n", " if len(decoded_content) == 236:\n", " # API too many requests\n", " sleep(60)\n", " elif len(decoded_content) <= 243:\n", " # Token doesn't exist or something\n", " print(f\"Error getting {ticker}, y{year}, m{month}. We are skipping\")\n", " print(decoded_content)\n", " return None\n", " else:\n", " return csv_str_to_df(decoded_content, ticker)\n", "\n", "\n", "def use_alpha_vantage(tickers, out_file, time=\"1min\"):\n", " \"\"\"Function to get multiple full tickers data using alpha vantage's time series intraday extended API\"\"\"\n", "\n", " dfs = []\n", " for ticker in tickers:\n", " t_dfs = []\n", " for year in range(1, 3):\n", " for month in range(1, 13):\n", " df_temp = alpha_vantage_get_ticker_data(\n", " ticker, time=time, year=year, month=month\n", " )\n", " if df_temp is not None:\n", " t_dfs.append(df_temp)\n", "\n", " if len(t_dfs):\n", " dfs.append(pd.concat(t_dfs, axis=0))\n", " else:\n", " print(f\"Skipped {ticker}.\")\n", " df = pd.concat(dfs, axis=1, sort=True)\n", " df.index.rename(\"date\", inplace=True)\n", "\n", " write_df(df, out_file)\n", "\n", " return df" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# Alpaca\n", "\n", "\n", "def use_alpaca(tickers, out_file, timeframe=\"1Minute\", start=\"2017-01-01\"):\n", " APCA_API_BASE_URL = os.environ.get(\"APCA_API_BASE_URL\")\n", " APCA_API_KEY_ID = os.environ.get(\"APCA_API_KEY_ID\")\n", " APCA_API_SECRET_KEY = os.environ.get(\"APCA_API_SECRET_KEY\")\n", " import alpaca_trade_api as tradeapi\n", "\n", " alpaca = tradeapi.REST(\n", " key_id=APCA_API_KEY_ID,\n", " secret_key=APCA_API_SECRET_KEY,\n", " base_url=APCA_API_BASE_URL,\n", " )\n", " account = alpaca.get_account()\n", " print(account.status)\n", "\n", " dfs = []\n", " for ticker in tickers:\n", " print(\"Getting\", ticker)\n", " df = alpaca.get_bars(ticker, timeframe, start).df\n", " print(\"Recieved\", ticker)\n", " df.index.name = \"date\"\n", " df = pd.concat([df], axis=1, keys=[ticker])\n", " dfs.append(df)\n", " df = pd.concat(dfs, axis=1, sort=True)\n", " df.index.rename(\"date\", inplace=True)\n", "\n", " if out_file is not None:\n", " write_df(df, out_file)\n", "\n", " return df" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# Polygon\n", "\n", "\n", "def use_polygon(tickers, out_file, multiplier=1, timespan=\"minute\", start=\"2000-01-01\"):\n", " POLYGON_API_KEY = os.environ.get(\"POLYGON_API_KEY\")\n", " from polygon import RESTClient\n", "\n", " client = RESTClient(POLYGON_API_KEY)\n", " dfs = []\n", " end = datetime.datetime.utcnow()\n", " start_og = start\n", " for ticker in tickers:\n", " start = start_og\n", " df_agg = None\n", " response_len = None\n", " i = 0\n", " print(\"Getting\", ticker)\n", " while response_len != 1:\n", " i += 1\n", " aggs = client.get_aggs(\n", " ticker,\n", " multiplier,\n", " timespan,\n", " start,\n", " end,\n", " adjusted=True,\n", " sort=\"asc\",\n", " limit=50000,\n", " )\n", " df = pd.DataFrame(aggs)\n", " df.index = pd.DatetimeIndex(\n", " pd.to_datetime(df[\"timestamp\"], unit=\"ms\", utc=True)\n", " )\n", " df.index.name = \"date\"\n", " df = df.filter([\"open\", \"high\", \"low\", \"close\", \"volume\", \"vwap\"], axis=1)\n", " response_len = len(df.index)\n", " start = df.last_valid_index()\n", " print(i, response_len)\n", " if df_agg is not None:\n", " df_agg.drop(index=df_agg.index[-1], axis=0, inplace=True)\n", " df_agg = pd.merge(df_agg.reset_index(), df.reset_index(), how=\"outer\")\n", " df_agg = df_agg.set_index(\"date\")\n", " else:\n", " df_agg = df\n", " sleep(1) # Attempt to be nice\n", " df_agg = pd.concat([df_agg], axis=1, keys=[ticker])\n", " dfs.append(df_agg)\n", " print(\"Recieved\", ticker)\n", "\n", " df = pd.concat(dfs, axis=1, sort=True)\n", " df.index.rename(\"date\", inplace=True)\n", "\n", " if out_file is not None:\n", " write_df(df, out_file)\n", "\n", " return df" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7521/3255818553.py:11: FutureWarning: YF.download() has changed argument auto_adjust default to True\n", " data = yf.download(tickers, start=start, end=end, group_by=\"ticker\")\n", "[*********************100%***********************] 2 of 2 completed" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Adding time\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# Yahoo Finance\n", "df = use_yfinance(\n", " [\"AAPL\", \"TSLA\"], os.path.join(DATA_RAW, \"aapl_day_full.csv\"), start=\"1970-01-01\"\n", ")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ticker: XOM, y1 m1; response length: 2481441\n", "{\n", " \"Meta Data\": {\n", " \"1. Information\": \"Intraday (1min) open, high, low, close prices and volume\",\n", " \"2. Symbol\": \"XOM\",\n", " \"3. Last Refreshed\": \"2025-10-22 19:59:00\",\n", " \"4. Interval\": \"1min\",\n", " \"5. Output Size\": \"Full size\",\n", " \"6. Time Zone\": \"US/Eastern\"\n", " },\n", " \"Time Series (1min)\": {\n", " \"2025-10-22 19:59:00\": {\n", " \"1. open\": \"115.2100\",\n", " \"2. high\": \"115.3900\",\n", " \"3. low\": \"115.2100\",\n", " \"4. close\": \"115.3900\",\n", " \"5. volume\": \"105\"\n", " },\n", " \"2025-10-22 19:58:00\": {\n", " \"1. open\": \"115.4800\",\n", " \"2. high\": \"115.4800\",\n", " \"3. low\": \"115.2000\",\n", " \"4. close\": \"115.2000\",\n", " \"5. volume\": \"6\"\n", " },\n", " \"2025-10-22 19:57:00\": {\n", " \"1. open\": \"115.4800\",\n", " \"2. high\": \"115.4800\",\n", " \"3. low\": \"115.3800\",\n", " \"4. close\": \"115.3800\",\n", " \"5. volume\": \"170\"\n", " },\n", " \"2025-10-22 19:56:00\n", "80589 []\n", " \"1. Information\": \"Intraday (1min) open, high, low, close prices and volume\",\n" ] }, { "ename": "ValueError", "evalue": "1 columns passed, passed data had 5 columns", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/internals/construction.py:939\u001b[0m, in \u001b[0;36m_finalize_columns_and_data\u001b[0;34m(content, columns, dtype)\u001b[0m\n\u001b[1;32m 938\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 939\u001b[0m columns \u001b[38;5;241m=\u001b[39m \u001b[43m_validate_or_indexify_columns\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontents\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 940\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mAssertionError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m 941\u001b[0m \u001b[38;5;66;03m# GH#26429 do not raise user-facing AssertionError\u001b[39;00m\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/internals/construction.py:986\u001b[0m, in \u001b[0;36m_validate_or_indexify_columns\u001b[0;34m(content, columns)\u001b[0m\n\u001b[1;32m 984\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_mi_list \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(columns) \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mlen\u001b[39m(content): \u001b[38;5;66;03m# pragma: no cover\u001b[39;00m\n\u001b[1;32m 985\u001b[0m \u001b[38;5;66;03m# caller's responsibility to check for this...\u001b[39;00m\n\u001b[0;32m--> 986\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAssertionError\u001b[39;00m(\n\u001b[1;32m 987\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(columns)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m columns passed, passed data had \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 988\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(content)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m columns\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 989\u001b[0m )\n\u001b[1;32m 990\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_mi_list:\n\u001b[1;32m 991\u001b[0m \u001b[38;5;66;03m# check if nested list column, length of each sub-list should be equal\u001b[39;00m\n", "\u001b[0;31mAssertionError\u001b[0m: 1 columns passed, passed data had 5 columns", "\nThe above exception was the direct cause of the following exception:\n", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[25], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Alpha Vantage\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43muse_alpha_vantage\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtickers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mDATA_RAW\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrealdata.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", "Cell \u001b[0;32mIn[22], line 64\u001b[0m, in \u001b[0;36muse_alpha_vantage\u001b[0;34m(tickers, out_file, time)\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m year \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m3\u001b[39m):\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m month \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m13\u001b[39m):\n\u001b[0;32m---> 64\u001b[0m df_temp \u001b[38;5;241m=\u001b[39m \u001b[43malpha_vantage_get_ticker_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 65\u001b[0m \u001b[43m \u001b[49m\u001b[43mticker\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtime\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtime\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43myear\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43myear\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmonth\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmonth\u001b[49m\n\u001b[1;32m 66\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 67\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m df_temp \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 68\u001b[0m t_dfs\u001b[38;5;241m.\u001b[39mappend(df_temp)\n", "Cell \u001b[0;32mIn[22], line 53\u001b[0m, in \u001b[0;36malpha_vantage_get_ticker_data\u001b[0;34m(ticker, time, year, month)\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 53\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcsv_str_to_df\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdecoded_content\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mticker\u001b[49m\u001b[43m)\u001b[49m\n", "Cell \u001b[0;32mIn[22], line 11\u001b[0m, in \u001b[0;36mcsv_str_to_df\u001b[0;34m(decoded_content, ticker)\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28mprint\u001b[39m(lines[\u001b[38;5;241m2\u001b[39m])\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m#while(1):pass\u001b[39;00m\n\u001b[0;32m---> 11\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mDataFrame\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mrow\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m,\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrow\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mlines\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 13\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdate\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mlines\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m,\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 17\u001b[0m data \u001b[38;5;241m=\u001b[39m data\u001b[38;5;241m.\u001b[39mreset_index(drop\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\u001b[38;5;241m.\u001b[39mset_index(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdate\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 18\u001b[0m data\u001b[38;5;241m.\u001b[39mindex \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mto_datetime(data\u001b[38;5;241m.\u001b[39mindex)\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/frame.py:851\u001b[0m, in \u001b[0;36mDataFrame.__init__\u001b[0;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[1;32m 849\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m columns \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 850\u001b[0m columns \u001b[38;5;241m=\u001b[39m ensure_index(columns)\n\u001b[0;32m--> 851\u001b[0m arrays, columns, index \u001b[38;5;241m=\u001b[39m \u001b[43mnested_data_to_arrays\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 852\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# error: Argument 3 to \"nested_data_to_arrays\" has incompatible\u001b[39;49;00m\n\u001b[1;32m 853\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type \"Optional[Collection[Any]]\"; expected \"Optional[Index]\"\u001b[39;49;00m\n\u001b[1;32m 854\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 855\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 856\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[arg-type]\u001b[39;49;00m\n\u001b[1;32m 857\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 858\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 859\u001b[0m mgr \u001b[38;5;241m=\u001b[39m arrays_to_mgr(\n\u001b[1;32m 860\u001b[0m arrays,\n\u001b[1;32m 861\u001b[0m columns,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 864\u001b[0m typ\u001b[38;5;241m=\u001b[39mmanager,\n\u001b[1;32m 865\u001b[0m )\n\u001b[1;32m 866\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/internals/construction.py:520\u001b[0m, in \u001b[0;36mnested_data_to_arrays\u001b[0;34m(data, columns, index, dtype)\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_named_tuple(data[\u001b[38;5;241m0\u001b[39m]) \u001b[38;5;129;01mand\u001b[39;00m columns \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 518\u001b[0m columns \u001b[38;5;241m=\u001b[39m ensure_index(data[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39m_fields)\n\u001b[0;32m--> 520\u001b[0m arrays, columns \u001b[38;5;241m=\u001b[39m \u001b[43mto_arrays\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 521\u001b[0m columns \u001b[38;5;241m=\u001b[39m ensure_index(columns)\n\u001b[1;32m 523\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m index \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/internals/construction.py:845\u001b[0m, in \u001b[0;36mto_arrays\u001b[0;34m(data, columns, dtype)\u001b[0m\n\u001b[1;32m 842\u001b[0m data \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mtuple\u001b[39m(x) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m data]\n\u001b[1;32m 843\u001b[0m arr \u001b[38;5;241m=\u001b[39m _list_to_arrays(data)\n\u001b[0;32m--> 845\u001b[0m content, columns \u001b[38;5;241m=\u001b[39m \u001b[43m_finalize_columns_and_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 846\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m content, columns\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/internals/construction.py:942\u001b[0m, in \u001b[0;36m_finalize_columns_and_data\u001b[0;34m(content, columns, dtype)\u001b[0m\n\u001b[1;32m 939\u001b[0m columns \u001b[38;5;241m=\u001b[39m _validate_or_indexify_columns(contents, columns)\n\u001b[1;32m 940\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mAssertionError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m 941\u001b[0m \u001b[38;5;66;03m# GH#26429 do not raise user-facing AssertionError\u001b[39;00m\n\u001b[0;32m--> 942\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(err) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 944\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(contents) \u001b[38;5;129;01mand\u001b[39;00m contents[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m np\u001b[38;5;241m.\u001b[39mobject_:\n\u001b[1;32m 945\u001b[0m contents \u001b[38;5;241m=\u001b[39m convert_object_array(contents, dtype\u001b[38;5;241m=\u001b[39mdtype)\n", "\u001b[0;31mValueError\u001b[0m: 1 columns passed, passed data had 5 columns" ] } ], "source": [ "# Alpha Vantage\n", "df = use_alpha_vantage(tickers, os.path.join(DATA_RAW, \"realdata.csv\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Alpaca\n", "df = use_alpaca(\n", " tickers + random, os.path.join(DATA_RAW, \"realdata_alp_1h.csv\"), timeframe=\"1Hour\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Polygon\n", "df = use_polygon(\n", " materials_equities,\n", " os.path.join(DATA_RAW, \"materials_1h.csv\"),\n", " multiplier=1,\n", " timespan=\"hour\",\n", " start=\"2000-01-01\",\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Extras" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Read Data From All-Data CSV (Multi Index Columns)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_all = read_data(os.path.join(DATA_RAW, \"realdata.csv\"))\n", "# df = read_data(\"tsla_aapl.csv\")\n", "print(df_all.head())\n", "print(df.head())\n", "print(df_all.columns)\n", "print(df.columns)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Concatenate two datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "run = False\n", "if run and not df.columns.equals(df_all.columns):\n", " df_new = write_df(\n", " pd.concat([df_all, df], axis=1), os.path.join(DATA_RAW, \"realdata.csv\")\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Remove rows with a lot of NANs\n", "This is important when using FX data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_f = df.copy()\n", "df_f = df_f.dropna(axis=0, thresh=50) #80\n", "write_df(df_f, os.path.join(DATA_RAW, \"realdata_pol_1h.csv\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.tail(80)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" }, "vscode": { "interpreter": { "hash": "51980e48e269f7c05efac26b22569386591d7f1d45336266d53ed7fc3ab7efc6" } } }, "nbformat": 4, "nbformat_minor": 4 }