Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from typing import List | |
| def clean_indicator( | |
| df : pd.DataFrame, | |
| feature_cols : List[str] | None = None, | |
| drop_col_frac_threshold : float = 0.2 | |
| ) -> pd.DataFrame: | |
| """ | |
| function to clean dataframe (remove all NaN values without lookahead) | |
| args: | |
| df : pandas dataframe | |
| feature_cols : list of colms to treat as feats. Default -> None => use all colmns | |
| drop_col_frac_threshold : drop the colmns with total NaN values greater than this | |
| """ | |
| df = df.copy() | |
| if feature_cols is None: | |
| exclude = { | |
| "Adj Close", | |
| "Dividends", | |
| "Stock Splits", | |
| } # keep Close/Open/High/Low/Volume | |
| feature_cols = [c for c in df.columns if c not in exclude] | |
| #compute first valid and last valid positions | |
| n = len(df) | |
| first_positions = {} | |
| last_positions = {} | |
| for c in feature_cols: | |
| fv = df[c].first_valid_index() | |
| lv = df[c].last_valid_index() | |
| first_positions[c] = df.index.get_loc(fv) if fv is not None else n | |
| last_positions[c] = df.index.get_loc(lv) if lv is not None else -1 | |
| #triming window logic -> remove head warmi | |
| #find features where all have values | |
| start_pos = max( | |
| first_positions.values() | |
| ) | |
| #find last position where all features have values | |
| end_pos = min(last_positions.values()) | |
| if start_pos >= end_pos: | |
| # not enough overlap: as fallback, choose start = median of first positions, end = max of last positions | |
| start_pos = int(np.median(list(first_positions.values()))) | |
| end_pos = int(np.median([pos for pos in last_positions.values() if pos >= 0])) | |
| df_trim = df.iloc[start_pos : (end_pos + 1)].copy() | |
| frac_nans = df_trim.isna().mean() | |
| drop_cols = frac_nans[frac_nans > drop_col_frac_threshold].index.tolist() | |
| # don't drop imp price columns | |
| essential = {"Open", "High", "Low", "Close", "Volume"} | |
| drop_cols = [c for c in drop_cols if c not in essential] | |
| df_trim = df_trim.drop(columns=drop_cols) | |
| df_imputed = df_trim.fillna(method="ffill") # type: ignore | |
| medians = df_imputed.median() | |
| df_imputed = df_imputed.fillna(medians) | |
| return df_imputed |