| |
| |
|
|
| |
|
|
|
|
| import pandas as pd |
| from time import sleep |
| import datetime |
| import os |
| from utils.ipynb_helpers import read_data, write_df, convert_tz, add_tz |
| from dotenv import load_dotenv |
| import traceback |
|
|
| |
| load_dotenv() |
|
|
| |
| DATA_RAW = "data/raw" |
|
|
|
|
| equities = ["XOM", "CVX", "COP", "BP", "PBR", "WTI", "TTE", "EQNR", "EOG", "ENB", "SLB"] |
| more_equities = [] |
|
|
| crude_oil = ["CL=F", "BZ=F"] |
| random = ["TSLA", "AAPL"] |
|
|
| materials_equities = ["BHP", "LIN", "RIO", "VALE", "APD", "FCX", "SHW", "SCCO", "CTVA", "ECL", "NUE", "NTR"] |
|
|
|
|
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
|
|
| fx = ["C:USDSAR", "C:USDAED"] |
|
|
| tickers = equities |
|
|
|
|
| |
|
|
| |
|
|
|
|
| |
|
|
| import yfinance as yf |
|
|
|
|
| def use_yfinance( |
| tickers, out_file, timeframe="day", start="2000-01-01", end="2023-01-01" |
| ): |
| assert timeframe == "day", "Use day timeframe for day" |
|
|
| data = yf.download(tickers, start=start, end=end, group_by="ticker", threads=False) |
|
|
| if len(tickers) == 1: |
| data = pd.concat([data], axis=1, keys=[tickers[0]]) |
|
|
| data.index.rename("date", inplace=True) |
| data.rename(columns=lambda x: str.lower(x), level=1, inplace=True) |
|
|
| if data.index.to_series().dt.tz is None: |
| print("Adding time") |
| data = add_tz(data, time_zone="UTC") |
|
|
| if out_file is not None: |
| write_df(data, out_file) |
|
|
| return data |
|
|
|
|
| |
|
|
|
|
| |
|
|
|
|
| def csv_str_to_df(decoded_content, ticker): |
| """CSV string to df""" |
| lines = decoded_content.splitlines() |
| print(lines[-20:]) |
| lines = [ "".join([ lines[i+j][8:-3] if j//6==0 else lines[i+j][12:-1] for j in range(6) ]) for i in range(10, len(lines), 6)] |
| print(len(lines)) |
| print(lines[-20:]) |
| while(1):pass |
| data = pd.DataFrame( |
| [row.split(",") for row in lines[1:]], |
| columns=["date", "open", "high", "low", "close", "volume"], |
| ) |
|
|
|
|
| data = data.reset_index(drop=True).set_index("date") |
| data.index = pd.to_datetime(data.index) |
|
|
| |
| if data.index.to_series().dt.tz is None: |
| print("CONVERTING TIME") |
| data = add_tz(data, time_zone="US/Eastern") |
| data = convert_tz(data, time_zone="UTC") |
| data = pd.concat([data], axis=1, keys=[ticker]) |
| return data |
|
|
|
|
| def alpha_vantage_get_ticker_data(ticker, time="1min", year=1, month=1): |
| """Function to get (ticker, year, month) data using alpha vantage's time series intraday extended API""" |
| ALPHA_VANTAGE_API_KEY = "VGRS7MNEHU6K8FAZ" |
| import requests |
|
|
| CSV_URL = f"https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol={ticker}&interval={time}&month={2026-year}-{11-month:02d}&outputsize=full&apikey={ALPHA_VANTAGE_API_KEY}" |
|
|
| while True: |
| with requests.Session() as s: |
| download = s.get(CSV_URL) |
| |
| decoded_content = download.content.decode("utf-8") |
| print( |
| f"ticker: {ticker}, y{year} m{month}; response length: {len(decoded_content)}" |
| ) |
|
|
| if len(decoded_content) == 236: |
| |
| sleep(60) |
| elif len(decoded_content) <= 243: |
| |
| print(f"Error getting {ticker}, y{year}, m{month}. We are skipping") |
| print(decoded_content) |
| return None |
| else: |
| return csv_str_to_df(decoded_content, ticker) |
|
|
|
|
| def use_alpha_vantage(tickers, out_file, time="1min"): |
| """Function to get multiple full tickers data using alpha vantage's time series intraday extended API""" |
|
|
| dfs = [] |
| for ticker in tickers: |
| t_dfs = [] |
| for year in range(1, 3): |
| for month in range(1, 13): |
| df_temp = alpha_vantage_get_ticker_data( |
| ticker, time=time, year=year, month=month |
| ) |
| if df_temp is not None: |
| t_dfs.append(df_temp) |
|
|
| if len(t_dfs): |
| dfs.append(pd.concat(t_dfs, axis=0)) |
| else: |
| print(f"Skipped {ticker}.") |
| df = pd.concat(dfs, axis=1, sort=True) |
| while(1):pass |
| df.index.rename("date", inplace=True) |
|
|
| write_df(df, out_file) |
|
|
| return df |
|
|
|
|
| |
|
|
|
|
| |
|
|
|
|
| def use_alpaca(tickers, out_file, timeframe="1Minute", start="2017-01-01"): |
| APCA_API_BASE_URL = os.environ.get("APCA_API_BASE_URL") |
| APCA_API_KEY_ID = os.environ.get("APCA_API_KEY_ID") |
| APCA_API_SECRET_KEY = os.environ.get("APCA_API_SECRET_KEY") |
| import alpaca_trade_api as tradeapi |
|
|
| alpaca = tradeapi.REST( |
| key_id=APCA_API_KEY_ID, |
| secret_key=APCA_API_SECRET_KEY, |
| base_url=APCA_API_BASE_URL, |
| ) |
| account = alpaca.get_account() |
| print(account.status) |
|
|
| dfs = [] |
| for ticker in tickers: |
| print("Getting", ticker) |
| df = alpaca.get_bars(ticker, timeframe, start).df |
| print("Recieved", ticker) |
| df.index.name = "date" |
| df = pd.concat([df], axis=1, keys=[ticker]) |
| dfs.append(df) |
| df = pd.concat(dfs, axis=1, sort=True) |
| df.index.rename("date", inplace=True) |
|
|
| if out_file is not None: |
| write_df(df, out_file) |
|
|
| return df |
|
|
|
|
| |
|
|
|
|
| |
|
|
| def try_until_suc(request_func, *args, **kwargs): |
| while True: |
| try: |
| res = request_func(*args, **kwargs) |
| except Exception as e: |
| print("Error Message:", e) |
| print(f"Traceback Details: {traceback.format_exc()}") |
| print("retry sending request...") |
| sleep(5) |
| else: |
| break |
| return res |
|
|
|
|
| def use_polygon(tickers, out_file, multiplier=1, timespan="minute", start="2000-01-01"): |
| POLYGON_API_KEY = "i0tmf9psII0FV_W7cAHs5PSKSVlqns72" |
| from polygon import RESTClient |
|
|
| client = RESTClient(POLYGON_API_KEY) |
| |
| |
| |
| dfs = [] |
| end = datetime.datetime.utcnow() |
| start_og = start |
| for ticker in tickers: |
| start = start_og |
| df_agg = None |
| response_len = None |
| i = 0 |
| print("Getting", ticker) |
| while response_len != 1: |
| i += 1 |
| aggs = try_until_suc( |
| client.get_aggs, |
| ticker, |
| multiplier, |
| timespan, |
| start, |
| end, |
| adjusted=True, |
| sort="asc", |
| limit=50000, |
| ) |
| df = pd.DataFrame(aggs) |
| df.index = pd.DatetimeIndex( |
| pd.to_datetime(df["timestamp"], unit="ms", utc=True) |
| ) |
| df.index.name = "date" |
| df = df.filter(["open", "high", "low", "close", "volume", "vwap"], axis=1) |
| response_len = len(df.index) |
| start = df.last_valid_index() |
| print(i, response_len) |
| if df_agg is not None: |
| df_agg.drop(index=df_agg.index[-1], axis=0, inplace=True) |
| df_agg = pd.merge(df_agg.reset_index(), df.reset_index(), how="outer") |
| df_agg = df_agg.set_index("date") |
| else: |
| df_agg = df |
| sleep(12) |
| df_agg = pd.concat([df_agg], axis=1, keys=[ticker]) |
| dfs.append(df_agg) |
| print("Recieved", ticker) |
|
|
| df = pd.concat(dfs, axis=1, sort=True) |
| df.index.rename("date", inplace=True) |
|
|
| if out_file is not None: |
| write_df(df, out_file) |
|
|
| return df |
|
|
|
|
| |
|
|
|
|
| |
| |
| |
| |
|
|
|
|
| |
|
|
|
|
| |
| |
|
|
|
|
| |
|
|
|
|
| |
| |
| |
| |
|
|
|
|
| |
|
|
|
|
| |
| df = use_polygon( |
| tickers, |
| os.path.join(DATA_RAW, "realdata.csv"), |
| multiplier=1, |
| timespan="hour", |
| start="2000-01-01", |
| ) |
|
|
|
|
| |
|
|
|
|
| df.head() |
|
|
|
|
| |
|
|
| |
|
|
| |
|
|
|
|
| df_all = read_data(os.path.join(DATA_RAW, "realdata.csv")) |
| |
| print(df_all.head()) |
| print(df.head()) |
| print(df_all.columns) |
| print(df.columns) |
|
|
|
|
| |
|
|
| |
|
|
|
|
| run = False |
| if run and not df.columns.equals(df_all.columns): |
| df_new = write_df( |
| pd.concat([df_all, df], axis=1), os.path.join(DATA_RAW, "realdata.csv") |
| ) |
|
|
|
|
| |
| |
|
|
| |
|
|
|
|
| df_f = df.copy() |
| df_f = df_f.dropna(axis=0, thresh=50) |
| write_df(df_f, os.path.join(DATA_RAW, "realdata_pol_1h.csv")) |
|
|
|
|
| |
|
|
|
|
| df.tail(80) |
|
|
|
|