File size: 2,157 Bytes
2d00e44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pandas as pd
import numpy as np

from typing import List

def clean_indicator(
    df : pd.DataFrame,
    feature_cols : List[str] | None = None,
    drop_col_frac_threshold : float = 0.2
) -> pd.DataFrame:
  
  """
  function to clean dataframe (remove all NaN values without lookahead)
  args:
    df : pandas dataframe
    feature_cols : list of colms to treat as feats. Default -> None => use all colmns
    drop_col_frac_threshold : drop the colmns with total NaN values greater than this
  """

  df = df.copy()

  if feature_cols is None:

    exclude = {
            "Adj Close",
            "Dividends",
            "Stock Splits",
        }  # keep Close/Open/High/Low/Volume
    
    feature_cols = [c for c in df.columns if c not in exclude]

  #compute first valid and last valid positions
  n = len(df)
  first_positions = {}
  last_positions = {}

  for c in feature_cols:
    fv = df[c].first_valid_index()
    lv = df[c].last_valid_index()
    first_positions[c] = df.index.get_loc(fv) if fv is not None else n
    last_positions[c] = df.index.get_loc(lv) if lv is not None else -1

  #triming window logic -> remove head warmi
  #find features where all have values
  start_pos = max(
        first_positions.values()
    )
  #find last position where all features have values
  end_pos = min(last_positions.values())  
  
  if start_pos >= end_pos:
    # not enough overlap: as fallback, choose start = median of first positions, end = max of last positions
    start_pos = int(np.median(list(first_positions.values())))
    end_pos = int(np.median([pos for pos in last_positions.values() if pos >= 0]))

  df_trim = df.iloc[start_pos : (end_pos + 1)].copy()

  frac_nans = df_trim.isna().mean()
  drop_cols = frac_nans[frac_nans > drop_col_frac_threshold].index.tolist()
  # don't drop imp price columns
  essential = {"Open", "High", "Low", "Close", "Volume"}
  
  drop_cols = [c for c in drop_cols if c not in essential]
  
  df_trim = df_trim.drop(columns=drop_cols)

  df_imputed = df_trim.fillna(method="ffill") # type: ignore
  
  medians = df_imputed.median()
  df_imputed = df_imputed.fillna(medians)


  return df_imputed