#!/usr/bin/env python # coding: utf-8 # In[ ]: import pandas as pd import os import datetime import pytz import numpy as np from utils.ipynb_helpers import read_data, write_df, convert_tz # Location to open raw data from data providers DATA_RAW = "data/raw" # ##### Read Data From All-Data CSV (Multi Index Columns) # In[ ]: stock=True df_all = read_data(os.path.join(DATA_RAW, "realdata_pol_1h.csv"), stock=stock) # df_all = read_data(os.path.join(DATA_RAW, "other/PEMSBAY.csv"), stock=stock) df_all = df_all[df_all.columns[:-12]] # # Filtering & Processing the Master Dataset # In[ ]: def percentage_nans(data, sort=True): percent_missing = data.isnull().sum() * 100 / len(data) missing_value_df = pd.DataFrame( {"percent_missing": percent_missing} #'column_name': data.columns, ) if sort: missing_value_df.sort_values("percent_missing", inplace=True) return missing_value_df def filter_percentage_nans(data, thresh=0.1): thresh *= 100 per_nans = percentage_nans(data, sort=False) return data.loc[:, per_nans[per_nans["percent_missing"] < thresh].index] def filter_intra_ticker(data, cols=["close"]): if cols is None: return data return data.iloc[ :, data.columns.get_level_values(1).isin(cols) ] # data.xs("close",level=1, axis=1) def no_premarket_after_hours(data): mkt_start = datetime.time(hour=9, minute=30, tzinfo=pytz.timezone("US/Eastern")) mkt_end = datetime.time(hour=15, minute=59, tzinfo=pytz.timezone("US/Eastern")) data = convert_tz(data, time_zone="US/Eastern") data = data.between_time(mkt_start, mkt_end) data = convert_tz(data, time_zone="UTC") return data def add_technical(data): for ticker in data.columns.get_level_values(0).unique(): # Assumption: close/open values are positive and a zero value means that datapoint is missing so we say no change data[ticker, "pctchange"] = ( data[ticker, "close"] / data[ticker, "open"] - 1 ).fillna(0.0).replace([np.inf, -np.inf, -1], 0.0) data[ticker, "logpctchange"] = np.log( data[ticker, "close"] / data[ticker, "open"] ).fillna(0.0).replace([np.inf, -np.inf], 0.0) # data[ticker, "pctchange-1"] = data[ticker, "pctchange"].shift(1,fill_value=0.0) # data[ticker, "pctchange-2"] = data[ticker, "pctchange"].shift(2,fill_value=0.0) data[ticker, "shortsma"] = ( data[ticker, "close"].rolling(5).mean().fillna(data[ticker, "close"]) ) # data[ticker,'shortma-1'] = data[ticker,'shortsma'].shift(1) # data[ticker,'shortma-2'] = data[ticker,'shortsma'].shift(2) # print(data.columns.sort_values()) data = data.reindex(sorted(data.columns), axis=1) # data.reindex(columns=data.columns.sort_values().get_level_values(0).unique(), level=0) return data if stock: # Filter df_all to normal hours df_all = no_premarket_after_hours(df_all) percentage_nans(df_all).tail(40) # In[ ]: df = filter_percentage_nans(df_all, 0.08) #0.40 print(df.columns.get_level_values(0).unique()) df.columns # In[ ]: # Add & filter columns df = add_technical(df) # None # ["close"] # ["pctchange"] # ["open", "high", "low", "close", "volume", 'pctchange', "shortsma"] df = filter_intra_ticker( df, cols=["open", "close", "pctchange", "logpctchange", "shortsma"] ) df.head(20) # In[ ]: import matplotlib.pyplot as plt df_t = df["WTI", "pctchange"] start_date = "2022-10-01" end_date = "2022-11-01" f1 = df_t[df.index > start_date] f2 = f1[f1.index < end_date] print(f2) # f = plt.figure() # f.set_figwidth(60) # f.set_figheight(20) plt.figure(figsize=(24,4)) plt.plot(np.arange(f2.index.to_numpy().shape[0]), 3.3* np.cumprod(f2.to_numpy()+1)) # ##### Fill NaNs # In[ ]: def ffill_nans(data): data = data.ffill() # data = data.fillna(method="ffill") data = data.dropna() return data def del_nans_ffill(data, thresh): data = data.dropna(thresh=thresh) data = ffill_nans(data) return data # In[ ]: df = ffill_nans(df) df.head() # #### Clip Outliers # In[ ]: def clip_outliers(data, p=0.005): lower = data.quantile(p) upper = data.quantile(1 - p) return data.clip(lower=lower, upper=upper, axis=1) # In[ ]: if stock: df = clip_outliers(df) df.head() # ##### Save Data # In[ ]: # Sometimes it errors bc the path doesn't exist but just run it again write_df(df, "data/stock/material_1h.csv") # write_df(df, "data/other/PEMSBAY.csv") # ## Extras # ##### Read data and convert to percent delta # In[ ]: # df_new = read_data("data/stock/close_1h.csv") # print("Before:\n", df_new.head()) # df_new = df_new.pct_change() # df_new.iloc[0] = 0 # print("After:\n",df_new.head()) # write_df(df_new, "data/stock/close_1h_pct_change.csv") plt.show()