| |
| |
|
|
| |
|
|
|
|
| import pandas as pd |
| import os |
| import datetime |
| import pytz |
| import numpy as np |
| from utils.ipynb_helpers import read_data, write_df, convert_tz |
|
|
|
|
| |
# Directory that holds the raw input files.
DATA_RAW = "data/raw"

# Flag forwarded to read_data; later top-level steps also branch on it.
stock = True
df_all = read_data(
    os.path.join(DATA_RAW, "realdata_pol_1h.csv"),
    stock=stock,
)

# Keep every column except the trailing 12.
# NOTE(review): what those 12 columns contain is not visible here -- confirm.
df_all = df_all[df_all.columns[:-12]]
|
|
|
|
| |
|
|
| |
|
|
|
|
def percentage_nans(data, sort=True):
    """Report the percentage of null entries in each column of ``data``.

    Args:
        data: DataFrame to inspect.
        sort: When true, order the rows by ascending percentage.

    Returns:
        A DataFrame indexed by the columns of ``data`` with a single
        ``"percent_missing"`` column.
    """
    null_share = data.isnull().sum() * 100 / len(data)
    report = null_share.to_frame("percent_missing")
    if not sort:
        return report
    return report.sort_values("percent_missing")
|
|
|
|
def filter_percentage_nans(data, thresh=0.1):
    """Drop the columns whose share of null values reaches ``thresh``.

    Args:
        data: DataFrame to filter.
        thresh: Maximum tolerated null share as a fraction (0.1 means 10%).
            Only columns strictly below it are kept.

    Returns:
        ``data`` restricted to the columns that pass the threshold.
    """
    limit_percent = thresh * 100
    missing = percentage_nans(data, sort=False)
    passing = missing[missing["percent_missing"] < limit_percent]
    return data.loc[:, passing.index]
|
|
|
|
def filter_intra_ticker(data, cols=["close"]):
    """Select the columns whose second-level label is listed in ``cols``.

    Args:
        data: DataFrame whose columns have at least two levels.
        cols: Second-level labels to keep. ``None`` disables filtering and
            returns ``data`` itself. The list default is only read, never
            mutated.

    Returns:
        ``data`` restricted to the matching columns, or ``data`` unchanged
        when ``cols`` is ``None``.
    """
    if cols is not None:
        second_level = data.columns.get_level_values(1)
        keep_mask = second_level.isin(cols)
        return data.iloc[:, keep_mask]
    return data
|
|
|
|
def no_premarket_after_hours(data):
    """Keep only the rows that fall between 09:30 and 15:59 US/Eastern.

    The frame is passed through ``convert_tz`` with ``"US/Eastern"``,
    filtered with ``between_time``, and passed through ``convert_tz`` again
    with ``"UTC"`` before being returned.

    Args:
        data: Time-indexed DataFrame accepted by ``convert_tz``.

    Returns:
        The filtered frame as returned by the final ``convert_tz`` call.
    """
    eastern = pytz.timezone("US/Eastern")
    # NOTE(review): passing a pytz zone through ``tzinfo=`` is the pattern the
    # pytz docs warn about; confirm whether the tz on these bounds matters to
    # ``between_time`` at all.
    session_open = datetime.time(hour=9, minute=30, tzinfo=eastern)
    session_close = datetime.time(hour=15, minute=59, tzinfo=eastern)
    in_eastern = convert_tz(data, time_zone="US/Eastern")
    regular_hours = in_eastern.between_time(session_open, session_close)
    return convert_tz(regular_hours, time_zone="UTC")
|
|
|
|
def add_technical(data):
    """Add per-ticker technical features and return a column-sorted frame.

    For every first-level column label (ticker) three columns are added:

    * ``pctchange``: ``close / open - 1``; NaN, +/-inf and exactly -1 are
      replaced by 0.0.
    * ``logpctchange``: ``log(close / open)``; NaN and +/-inf are replaced
      by 0.0.
    * ``shortsma``: 5-row rolling mean of ``close``; rows without a full
      window fall back to ``close`` itself.

    Args:
        data: DataFrame with two-level columns that provides ``"open"`` and
            ``"close"`` for every ticker.

    Returns:
        A new DataFrame with the added columns and all columns sorted. The
        frame passed in is left untouched.
    """
    # Work on a copy: assigning new columns directly on the argument would
    # modify the caller's frame (and warn when that frame is a slice).
    data = data.copy()
    for ticker in data.columns.get_level_values(0).unique():
        close = data[ticker, "close"]
        # The close/open ratio feeds both return features; compute it once.
        ratio = close / data[ticker, "open"]
        data[ticker, "pctchange"] = (
            (ratio - 1).fillna(0.0).replace([np.inf, -np.inf, -1], 0.0)
        )
        data[ticker, "logpctchange"] = (
            np.log(ratio).fillna(0.0).replace([np.inf, -np.inf], 0.0)
        )
        data[ticker, "shortsma"] = close.rolling(5).mean().fillna(close)
    return data.reindex(sorted(data.columns), axis=1)
|
|
# Only the stock configuration is trimmed to the 09:30-15:59 Eastern window.
if stock:
    df_all = no_premarket_after_hours(data=df_all)

# Notebook-style inspection: the 40 columns with the largest null percentage
# (the report is sorted ascending, so they sit at the tail).
percentage_nans(df_all, sort=True).tail(40)
|
|
|
|
| |
|
|
|
|
# Keep only the columns with less than 8% null values.
df = filter_percentage_nans(df_all, thresh=0.08)

# Show which first-level labels survived, then the full column index.
surviving = df.columns.get_level_values(0).unique()
print(surviving)
df.columns
|
|
|
|
| |
|
|
|
|
| |
# Derive pctchange / logpctchange / shortsma for every ticker.
df = add_technical(data=df)

# Restrict each ticker to the raw prices plus the derived features.
df = filter_intra_ticker(
    data=df,
    cols=["open", "close", "pctchange", "logpctchange", "shortsma"],
)

# Notebook-style preview of the first 20 rows.
df.head(n=20)
|
|
|
|
| |
|
|
|
|
import matplotlib.pyplot as plt

# Percentage-change series of the "WTI" ticker, limited to the rows strictly
# between start_date and end_date.
df_t = df["WTI", "pctchange"]
start_date = "2022-10-01"
end_date = "2022-11-01"
f1 = df_t[df.index > start_date]
f2 = f1[f1.index < end_date]
print(f2)

# Compound the returns into a cumulative curve and plot it against row number.
# NOTE(review): 3.3 looks like a starting price used for scaling -- confirm.
plt.figure(figsize=(24, 4))
plt.plot(
    np.arange(f2.index.to_numpy().shape[0]),
    3.3 * np.cumprod(f2.to_numpy() + 1),
)
|
|
|
|
| |
|
|
| |
|
|
|
|
def ffill_nans(data):
    """Forward-fill null values, then drop the rows that still contain any.

    Args:
        data: DataFrame to clean.

    Returns:
        A new DataFrame without null values. Rows that still have a null
        after forward-filling (nothing earlier to fill from) are removed.
    """
    filled = data.ffill()
    return filled.dropna()
|
|
|
|
def del_nans_ffill(data, thresh):
    """Drop sparse rows, then forward-fill and drop what is still null.

    Args:
        data: DataFrame to clean.
        thresh: Forwarded to ``DataFrame.dropna(thresh=...)``, i.e. the
            minimum number of non-null values a row needs to be kept.

    Returns:
        The result of ``ffill_nans`` applied to the remaining rows.
    """
    dense_rows = data.dropna(thresh=thresh)
    return ffill_nans(dense_rows)
|
|
|
|
| |
|
|
|
|
# Forward-fill the gaps and drop the rows that still contain nulls.
df = ffill_nans(data=df)

# Notebook-style preview of the cleaned frame.
df.head()
|
|
|
|
| |
|
|
| |
|
|
|
|
def clip_outliers(data, p=0.005):
    """Clip every column to its own ``p`` and ``1 - p`` quantiles.

    Args:
        data: DataFrame to clip.
        p: Tail probability cut off on each side.

    Returns:
        A new DataFrame in which values below a column's ``p`` quantile or
        above its ``1 - p`` quantile are replaced by that quantile.
    """
    floor = data.quantile(q=p)
    ceiling = data.quantile(q=1 - p)
    # axis=1 aligns the per-column bounds with the columns of ``data``.
    return data.clip(lower=floor, upper=ceiling, axis=1)
|
|
|
|
| |
|
|
|
|
# Quantile clipping is applied only in the stock configuration.
if stock:
    df = clip_outliers(data=df)

# Notebook-style preview after clipping.
df.head()
|
|
|
|
| |
|
|
| |
|
|
|
|
| |
# Hand the processed frame to write_df together with its output path.
write_df(df, "data/stock/material_1h.csv")

# Display any figures created above.
plt.show()
|
|
|
|