#!/usr/bin/env python # coding: utf-8 # In[1]: import pandas as pd from time import sleep import datetime import os from utils.ipynb_helpers import read_data, write_df, convert_tz, add_tz from dotenv import load_dotenv import traceback # Create a .env file and add your keys load_dotenv() # Location to save raw data from data providers DATA_RAW = "data/raw" equities = ["XOM", "CVX", "COP", "BP", "PBR", "WTI", "TTE", "EQNR", "EOG", "ENB", "SLB"] more_equities = [] crude_oil = ["CL=F", "BZ=F"] # wti, brent, random = ["TSLA", "AAPL"] materials_equities = ["BHP", "LIN", "RIO", "VALE", "APD", "FCX", "SHW", "SCCO", "CTVA", "ECL", "NUE", "NTR"] # https://en.wikipedia.org/wiki/List_of_countries_by_oil_production # https://www.weforum.org/agenda/2016/05/which-economies-are-most-reliant-on-oil/ # OPEC: Iran, Iraq, Kuwait, Saudi Arabia, Venezuela # fx_opec = [_, "C:USDIQD", "C:USDKWD", "C:USDSAR", "C:USDVEF"] # OPEC+: Algeria, Angola, Congo, Equatorial Guinea, Gabon, Libya, Nigeria, United Arab Emirates # fx_opec_pp = ["C:USDDZD",_, "C:USDCDF", "C:USDGNF", _, "C:USDLYD", "C:USDNGN", "C:USDAED"] # Large: US, Russia, China, Canada, Norway # Other important: Qatar, Kazakhstan # fx_other= ["C:USDQAR", "C:USDKZT"] fx = ["C:USDSAR", "C:USDAED"] tickers = equities # + crude_oil # ##### Get Data From Data Provider # In[2]: # Y Finance import yfinance as yf def use_yfinance( tickers, out_file, timeframe="day", start="2000-01-01", end="2023-01-01" ): assert timeframe == "day", "Use day timeframe for day" data = yf.download(tickers, start=start, end=end, group_by="ticker", threads=False) if len(tickers) == 1: data = pd.concat([data], axis=1, keys=[tickers[0]]) data.index.rename("date", inplace=True) data.rename(columns=lambda x: str.lower(x), level=1, inplace=True) if data.index.to_series().dt.tz is None: print("Adding time") data = add_tz(data, time_zone="UTC") if out_file is not None: write_df(data, out_file) return data # In[22]: # Alpha Vantage def csv_str_to_df(decoded_content, ticker): """CSV string to df""" lines = decoded_content.splitlines() print(lines[-20:]) lines = [ "".join([ lines[i+j][8:-3] if j//6==0 else lines[i+j][12:-1] for j in range(6) ]) for i in range(10, len(lines), 6)] print(len(lines)) print(lines[-20:]) while(1):pass data = pd.DataFrame( [row.split(",") for row in lines[1:]], columns=["date", "open", "high", "low", "close", "volume"], ) data = data.reset_index(drop=True).set_index("date") data.index = pd.to_datetime(data.index) # Add timezome -- we assume it is sent in with unlabled eastern time if data.index.to_series().dt.tz is None: print("CONVERTING TIME") data = add_tz(data, time_zone="US/Eastern") data = convert_tz(data, time_zone="UTC") data = pd.concat([data], axis=1, keys=[ticker]) return data def alpha_vantage_get_ticker_data(ticker, time="1min", year=1, month=1): """Function to get (ticker, year, month) data using alpha vantage's time series intraday extended API""" ALPHA_VANTAGE_API_KEY = "VGRS7MNEHU6K8FAZ" import requests CSV_URL = f"https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol={ticker}&interval={time}&month={2026-year}-{11-month:02d}&outputsize=full&apikey={ALPHA_VANTAGE_API_KEY}" while True: with requests.Session() as s: download = s.get(CSV_URL) # save to local file decoded_content = download.content.decode("utf-8") print( f"ticker: {ticker}, y{year} m{month}; response length: {len(decoded_content)}" ) if len(decoded_content) == 236: # API too many requests sleep(60) elif len(decoded_content) <= 243: # Token doesn't exist or something print(f"Error getting {ticker}, y{year}, m{month}. We are skipping") print(decoded_content) return None else: return csv_str_to_df(decoded_content, ticker) def use_alpha_vantage(tickers, out_file, time="1min"): """Function to get multiple full tickers data using alpha vantage's time series intraday extended API""" dfs = [] for ticker in tickers: t_dfs = [] for year in range(1, 3): for month in range(1, 13): df_temp = alpha_vantage_get_ticker_data( ticker, time=time, year=year, month=month ) if df_temp is not None: t_dfs.append(df_temp) if len(t_dfs): dfs.append(pd.concat(t_dfs, axis=0)) else: print(f"Skipped {ticker}.") df = pd.concat(dfs, axis=1, sort=True) while(1):pass df.index.rename("date", inplace=True) write_df(df, out_file) return df # In[23]: # Alpaca def use_alpaca(tickers, out_file, timeframe="1Minute", start="2017-01-01"): APCA_API_BASE_URL = os.environ.get("APCA_API_BASE_URL") APCA_API_KEY_ID = os.environ.get("APCA_API_KEY_ID") APCA_API_SECRET_KEY = os.environ.get("APCA_API_SECRET_KEY") import alpaca_trade_api as tradeapi alpaca = tradeapi.REST( key_id=APCA_API_KEY_ID, secret_key=APCA_API_SECRET_KEY, base_url=APCA_API_BASE_URL, ) account = alpaca.get_account() print(account.status) dfs = [] for ticker in tickers: print("Getting", ticker) df = alpaca.get_bars(ticker, timeframe, start).df print("Recieved", ticker) df.index.name = "date" df = pd.concat([df], axis=1, keys=[ticker]) dfs.append(df) df = pd.concat(dfs, axis=1, sort=True) df.index.rename("date", inplace=True) if out_file is not None: write_df(df, out_file) return df # In[24]: # Polygon def try_until_suc(request_func, *args, **kwargs): while True: try: res = request_func(*args, **kwargs) except Exception as e: print("Error Message:", e) print(f"Traceback Details: {traceback.format_exc()}") # Get full traceback as a string print("retry sending request...") sleep(5) else: break return res def use_polygon(tickers, out_file, multiplier=1, timespan="minute", start="2000-01-01"): POLYGON_API_KEY = "i0tmf9psII0FV_W7cAHs5PSKSVlqns72" from polygon import RESTClient client = RESTClient(POLYGON_API_KEY) # aggs = client.get_aggs("AAPL", 1, "day", "2000-01-01", "2001-01-01") # print(aggs[0].timestamp) # while(1):pass dfs = [] end = datetime.datetime.utcnow() start_og = start for ticker in tickers: start = start_og df_agg = None response_len = None i = 0 print("Getting", ticker) while response_len != 1: i += 1 aggs = try_until_suc( client.get_aggs, ticker, multiplier, timespan, start, end, adjusted=True, sort="asc", limit=50000, ) df = pd.DataFrame(aggs) df.index = pd.DatetimeIndex( pd.to_datetime(df["timestamp"], unit="ms", utc=True) ) df.index.name = "date" df = df.filter(["open", "high", "low", "close", "volume", "vwap"], axis=1) response_len = len(df.index) start = df.last_valid_index() print(i, response_len) if df_agg is not None: df_agg.drop(index=df_agg.index[-1], axis=0, inplace=True) df_agg = pd.merge(df_agg.reset_index(), df.reset_index(), how="outer") df_agg = df_agg.set_index("date") else: df_agg = df sleep(12) # Attempt to be nice df_agg = pd.concat([df_agg], axis=1, keys=[ticker]) dfs.append(df_agg) print("Recieved", ticker) df = pd.concat(dfs, axis=1, sort=True) df.index.rename("date", inplace=True) if out_file is not None: write_df(df, out_file) return df # In[6]: # Yahoo Finance # df = use_yfinance( # random, os.path.join(DATA_RAW, "aapl_day_full.csv"), start="1970-01-01", # ) # In[25]: # Alpha Vantage # df = use_alpha_vantage(tickers, os.path.join(DATA_RAW, "realdata.csv"), time="1h") # In[ ]: # # Alpaca # df = use_alpaca( # tickers + random, os.path.join(DATA_RAW, "realdata_alp_1h.csv"), timeframe="1Hour" # ) # # In[ ]: # Polygon df = use_polygon( tickers, os.path.join(DATA_RAW, "realdata.csv"), multiplier=1, timespan="hour", start="2000-01-01", ) # In[ ]: df.head() # ## Extras # ##### Read Data From All-Data CSV (Multi Index Columns) # In[ ]: df_all = read_data(os.path.join(DATA_RAW, "realdata.csv")) # df = read_data("tsla_aapl.csv") print(df_all.head()) print(df.head()) print(df_all.columns) print(df.columns) # ##### Concatenate two datasets # In[ ]: run = False if run and not df.columns.equals(df_all.columns): df_new = write_df( pd.concat([df_all, df], axis=1), os.path.join(DATA_RAW, "realdata.csv") ) # ### Remove rows with a lot of NANs # This is important when using FX data # In[ ]: df_f = df.copy() df_f = df_f.dropna(axis=0, thresh=50) #80 write_df(df_f, os.path.join(DATA_RAW, "realdata_pol_1h.csv")) # In[ ]: df.tail(80)