import pandas as pd
import numpy as np
from typing import List
def clean_indicator(
df : pd.DataFrame,
feature_cols : List[str] | None = None,
drop_col_frac_threshold : float = 0.2
) -> pd.DataFrame:
"""
function to clean dataframe (remove all NaN values without lookahead)
args:
df : pandas dataframe
feature_cols : list of colms to treat as feats. Default -> None => use all colmns
drop_col_frac_threshold : drop the colmns with total NaN values greater than this
"""
df = df.copy()
if feature_cols is None:
exclude = {
"Adj Close",
"Dividends",
"Stock Splits",
} # keep Close/Open/High/Low/Volume
feature_cols = [c for c in df.columns if c not in exclude]
#compute first valid and last valid positions
n = len(df)
first_positions = {}
last_positions = {}
for c in feature_cols:
fv = df[c].first_valid_index()
lv = df[c].last_valid_index()
first_positions[c] = df.index.get_loc(fv) if fv is not None else n
last_positions[c] = df.index.get_loc(lv) if lv is not None else -1
#triming window logic -> remove head warmi
#find features where all have values
start_pos = max(
first_positions.values()
)
#find last position where all features have values
end_pos = min(last_positions.values())
if start_pos >= end_pos:
# not enough overlap: as fallback, choose start = median of first positions, end = max of last positions
start_pos = int(np.median(list(first_positions.values())))
end_pos = int(np.median([pos for pos in last_positions.values() if pos >= 0]))
df_trim = df.iloc[start_pos : (end_pos + 1)].copy()
frac_nans = df_trim.isna().mean()
drop_cols = frac_nans[frac_nans > drop_col_frac_threshold].index.tolist()
# don't drop imp price columns
essential = {"Open", "High", "Low", "Close", "Volume"}
drop_cols = [c for c in drop_cols if c not in essential]
df_trim = df_trim.drop(columns=drop_cols)
df_imputed = df_trim.fillna(method="ffill") # type: ignore
medians = df_imputed.median()
df_imputed = df_imputed.fillna(medians)
return df_imputed |