WaveLSFromer / data_prepare.py
ducheng678
Initial WaveLSFromer project
093b0a5
Raw
History Blame Contribute Delete
4.85 kB
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import pandas as pd
import os
import datetime
import pytz
import numpy as np
from utils.ipynb_helpers import read_data, write_df, convert_tz
# Location to open raw data from data providers
DATA_RAW = "data/raw"
# ##### Read Data From All-Data CSV (Multi Index Columns)
# In[ ]:
stock=True
df_all = read_data(os.path.join(DATA_RAW, "realdata_pol_1h.csv"), stock=stock)
# df_all = read_data(os.path.join(DATA_RAW, "other/PEMSBAY.csv"), stock=stock)
df_all = df_all[df_all.columns[:-12]]
# # Filtering & Processing the Master Dataset
# In[ ]:
def percentage_nans(data, sort=True):
percent_missing = data.isnull().sum() * 100 / len(data)
missing_value_df = pd.DataFrame(
{"percent_missing": percent_missing} #'column_name': data.columns,
)
if sort:
missing_value_df.sort_values("percent_missing", inplace=True)
return missing_value_df
def filter_percentage_nans(data, thresh=0.1):
thresh *= 100
per_nans = percentage_nans(data, sort=False)
return data.loc[:, per_nans[per_nans["percent_missing"] < thresh].index]
def filter_intra_ticker(data, cols=["close"]):
if cols is None:
return data
return data.iloc[
:, data.columns.get_level_values(1).isin(cols)
] # data.xs("close",level=1, axis=1)
def no_premarket_after_hours(data):
mkt_start = datetime.time(hour=9, minute=30, tzinfo=pytz.timezone("US/Eastern"))
mkt_end = datetime.time(hour=15, minute=59, tzinfo=pytz.timezone("US/Eastern"))
data = convert_tz(data, time_zone="US/Eastern")
data = data.between_time(mkt_start, mkt_end)
data = convert_tz(data, time_zone="UTC")
return data
def add_technical(data):
for ticker in data.columns.get_level_values(0).unique():
# Assumption: close/open values are positive and a zero value means that datapoint is missing so we say no change
data[ticker, "pctchange"] = (
data[ticker, "close"] / data[ticker, "open"] - 1
).fillna(0.0).replace([np.inf, -np.inf, -1], 0.0)
data[ticker, "logpctchange"] = np.log(
data[ticker, "close"] / data[ticker, "open"]
).fillna(0.0).replace([np.inf, -np.inf], 0.0)
# data[ticker, "pctchange-1"] = data[ticker, "pctchange"].shift(1,fill_value=0.0)
# data[ticker, "pctchange-2"] = data[ticker, "pctchange"].shift(2,fill_value=0.0)
data[ticker, "shortsma"] = (
data[ticker, "close"].rolling(5).mean().fillna(data[ticker, "close"])
)
# data[ticker,'shortma-1'] = data[ticker,'shortsma'].shift(1)
# data[ticker,'shortma-2'] = data[ticker,'shortsma'].shift(2)
# print(data.columns.sort_values())
data = data.reindex(sorted(data.columns), axis=1)
# data.reindex(columns=data.columns.sort_values().get_level_values(0).unique(), level=0)
return data
if stock:
# Filter df_all to normal hours
df_all = no_premarket_after_hours(df_all)
percentage_nans(df_all).tail(40)
# In[ ]:
df = filter_percentage_nans(df_all, 0.08) #0.40
print(df.columns.get_level_values(0).unique())
df.columns
# In[ ]:
# Add & filter columns
df = add_technical(df)
# None
# ["close"]
# ["pctchange"]
# ["open", "high", "low", "close", "volume", 'pctchange', "shortsma"]
df = filter_intra_ticker(
df, cols=["open", "close", "pctchange", "logpctchange", "shortsma"]
)
df.head(20)
# In[ ]:
import matplotlib.pyplot as plt
df_t = df["WTI", "pctchange"]
start_date = "2022-10-01"
end_date = "2022-11-01"
f1 = df_t[df.index > start_date]
f2 = f1[f1.index < end_date]
print(f2)
# f = plt.figure()
# f.set_figwidth(60)
# f.set_figheight(20)
plt.figure(figsize=(24,4))
plt.plot(np.arange(f2.index.to_numpy().shape[0]), 3.3* np.cumprod(f2.to_numpy()+1))
# ##### Fill NaNs
# In[ ]:
def ffill_nans(data):
data = data.ffill()
# data = data.fillna(method="ffill")
data = data.dropna()
return data
def del_nans_ffill(data, thresh):
data = data.dropna(thresh=thresh)
data = ffill_nans(data)
return data
# In[ ]:
df = ffill_nans(df)
df.head()
# #### Clip Outliers
# In[ ]:
def clip_outliers(data, p=0.005):
lower = data.quantile(p)
upper = data.quantile(1 - p)
return data.clip(lower=lower, upper=upper, axis=1)
# In[ ]:
if stock:
df = clip_outliers(df)
df.head()
# ##### Save Data
# In[ ]:
# Sometimes it errors bc the path doesn't exist but just run it again
write_df(df, "data/stock/material_1h.csv")
# write_df(df, "data/other/PEMSBAY.csv")
# ## Extras
# ##### Read data and convert to percent delta
# In[ ]:
# df_new = read_data("data/stock/close_1h.csv")
# print("Before:\n", df_new.head())
# df_new = df_new.pct_change()
# df_new.iloc[0] = 0
# print("After:\n",df_new.head())
# write_df(df_new, "data/stock/close_1h_pct_change.csv")
plt.show()