MHamzaShahid committed on
Commit
c323a8d
·
verified ·
1 Parent(s): a3c60c5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -83
app.py CHANGED
@@ -13,58 +13,34 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
13
  # ================================
14
  # 1️⃣ Custom Preprocessing Functions
15
  # ================================
16
-
17
  def temp_cat(X):
18
- if isinstance(X, pd.DataFrame):
19
- X['avg_temp_cat'] = pd.cut(
20
- X['avg_temp'],
21
- bins=[0, 5, 10, 20, 30, np.inf],
22
- labels=['very_cold', 'cold', 'warm', 'hot', 'very_hot']
23
- )
24
- return X
25
- else:
26
- X = pd.DataFrame(X)
27
- X['avg_temp_cat'] = pd.cut(
28
- X['avg_temp'],
29
- bins=[0, 5, 10, 20, 30, np.inf],
30
- labels=['very_cold', 'cold', 'warm', 'hot', 'very_hot']
31
- )
32
- return X
33
-
34
 
35
  def clean(X):
36
- if isinstance(X, pd.DataFrame):
37
- return X.dropna()
38
- else:
39
- return pd.DataFrame(X).dropna()
40
-
41
 
42
  def proxy_humidity(X):
43
- if isinstance(X, pd.DataFrame):
44
- X["proxy_humidity"] = X["average_rain_fall_mm_per_year"] / (X["avg_temp"] + 1)
45
- return X
46
- else:
47
- X = pd.DataFrame(X)
48
- X["proxy_humidity"] = X["average_rain_fall_mm_per_year"] / (X["avg_temp"] + 1)
49
- return X
50
 
51
 
52
  # ================================
53
  # 2️⃣ Transformers and Pipelines
54
  # ================================
55
-
56
- temp_cat_transformer = FunctionTransformer(temp_cat)
57
  temp_cat_pipeline = make_pipeline(
58
- temp_cat_transformer,
59
- OrdinalEncoder(
60
- handle_unknown='use_encoded_value',
61
- unknown_value=-1
62
- )
63
  )
64
 
65
- clean_transformer = FunctionTransformer(clean)
66
  clean_pipeline = make_pipeline(
67
- clean_transformer,
68
  StandardScaler()
69
  )
70
 
@@ -73,25 +49,19 @@ cat_pipeline = make_pipeline(
73
  OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
74
  )
75
 
76
- proxy_humidity_transformer = FunctionTransformer(proxy_humidity)
77
  proxy_humidity_pipeline = make_pipeline(
78
- proxy_humidity_transformer,
79
  StandardScaler()
80
  )
81
 
82
- square_transformer = FunctionTransformer(np.square)
83
- square_pipeline = make_pipeline(square_transformer, StandardScaler())
84
-
85
- log_transformer = FunctionTransformer(np.log1p)
86
- log_pipeline = make_pipeline(log_transformer, StandardScaler())
87
-
88
  default_num_pipeline = make_pipeline(StandardScaler())
89
 
90
 
91
  # ================================
92
  # 3️⃣ Custom Feature Selector
93
  # ================================
94
-
95
  class CorrelationThresholdSelector(BaseEstimator, TransformerMixin):
96
  def __init__(self, threshold=0.9, target_threshold=0.0, method="pearson", min_variance=0.0):
97
  self.threshold = threshold
@@ -100,69 +70,47 @@ class CorrelationThresholdSelector(BaseEstimator, TransformerMixin):
100
  self.min_variance = min_variance
101
 
102
  def fit(self, X, y):
103
- X_original = X
104
  X_arr, y_arr = check_X_y(X, y, accept_sparse=False, dtype=np.float64)
105
  n_features = X_arr.shape[1]
106
- self.n_features_in_ = n_features
107
-
108
- if hasattr(X_original, "columns"):
109
- self.feature_names_in_ = np.asarray(X_original.columns)
110
- else:
111
- self.feature_names_in_ = np.array([f"f{i}" for i in range(n_features)])
112
-
113
- if n_features <= 1:
114
- self.features_to_drop_ = np.array([], dtype=int)
115
- self.selected_features_ = np.arange(n_features, dtype=int)
116
- return self
117
 
118
  X_df = pd.DataFrame(X_arr, columns=self.feature_names_in_)
119
  variances = X_df.var(numeric_only=True)
120
- low_var_mask = variances <= self.min_variance
121
- low_var_idx = np.where(low_var_mask)[0].tolist()
122
 
123
  corr_mat = X_df.corr(method=self.method).abs().values
124
  np.fill_diagonal(corr_mat, 0.0)
125
 
126
  y_series = pd.Series(y_arr)
127
- target_corr_series = X_df.corrwith(y_series, method=self.method).abs().fillna(0.0)
128
- target_corr = target_corr_series.values
129
 
130
  visited, drops = set(), set()
131
-
132
  for i in range(n_features):
133
  if i in visited or i in low_var_idx:
134
  continue
135
  correlated_idx = set(np.where(corr_mat[i] > self.threshold)[0].tolist())
136
  cluster = {i} | correlated_idx
137
  visited |= cluster
138
- if len(cluster) == 1:
139
- continue
140
- best = max(cluster, key=lambda idx: (target_corr[idx], X_df.iloc[:, idx].var()))
141
- if self.target_threshold > 0 and target_corr[best] < self.target_threshold:
142
- drops |= cluster
143
- else:
144
- cluster.remove(best)
145
- drops |= cluster
146
 
147
  drops |= set(low_var_idx)
148
- self.features_to_drop_ = np.array(sorted(drops), dtype=int)
149
- retained = sorted(set(range(n_features)) - set(self.features_to_drop_))
150
- self.selected_features_ = np.array(retained, dtype=int)
151
- self.selected_feature_names_ = self.feature_names_in_[self.selected_features_].tolist()
152
- self.dropped_feature_names_ = self.feature_names_in_[self.features_to_drop_].tolist()
153
  return self
154
 
155
  def transform(self, X):
156
  check_is_fitted(self, "selected_features_")
157
  X_arr = check_array(X, accept_sparse=False, dtype=np.float64)
158
- if self.selected_features_.size == 0:
159
- return np.empty((X_arr.shape[0], 0), dtype=X_arr.dtype)
160
- sel = np.asarray(self.selected_features_, dtype=int)
161
- return X_arr[:, sel]
162
 
163
 
164
  # ================================
165
- # 4️⃣ Register All Functions for joblib
166
  # ================================
167
  sys.modules['__main__'].temp_cat = temp_cat
168
  sys.modules['__main__'].clean = clean
@@ -230,7 +178,7 @@ def predict_yield(data: CropInput):
230
 
231
 
232
  # ================================
233
- # 9️⃣ Local Run
234
  # ================================
235
  if __name__ == "__main__":
236
  import uvicorn
 
13
  # ================================
14
  # 1️⃣ Custom Preprocessing Functions
15
  # ================================
 
16
  def temp_cat(X):
17
+ X = pd.DataFrame(X)
18
+ X['avg_temp_cat'] = pd.cut(
19
+ X['avg_temp'],
20
+ bins=[0, 5, 10, 20, 30, np.inf],
21
+ labels=['very_cold', 'cold', 'warm', 'hot', 'very_hot']
22
+ )
23
+ return X
 
 
 
 
 
 
 
 
 
24
 
25
  def clean(X):
26
+ return pd.DataFrame(X).dropna()
 
 
 
 
27
 
28
  def proxy_humidity(X):
29
+ X = pd.DataFrame(X)
30
+ X["proxy_humidity"] = X["average_rain_fall_mm_per_year"] / (X["avg_temp"] + 1)
31
+ return X
 
 
 
 
32
 
33
 
34
  # ================================
35
  # 2️⃣ Transformers and Pipelines
36
  # ================================
 
 
37
  temp_cat_pipeline = make_pipeline(
38
+ FunctionTransformer(temp_cat),
39
+ OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
 
 
 
40
  )
41
 
 
42
  clean_pipeline = make_pipeline(
43
+ FunctionTransformer(clean),
44
  StandardScaler()
45
  )
46
 
 
49
  OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
50
  )
51
 
 
52
  proxy_humidity_pipeline = make_pipeline(
53
+ FunctionTransformer(proxy_humidity),
54
  StandardScaler()
55
  )
56
 
57
+ square_pipeline = make_pipeline(FunctionTransformer(np.square), StandardScaler())
58
+ log_pipeline = make_pipeline(FunctionTransformer(np.log1p), StandardScaler())
 
 
 
 
59
  default_num_pipeline = make_pipeline(StandardScaler())
60
 
61
 
62
  # ================================
63
  # 3️⃣ Custom Feature Selector
64
  # ================================
 
65
  class CorrelationThresholdSelector(BaseEstimator, TransformerMixin):
66
  def __init__(self, threshold=0.9, target_threshold=0.0, method="pearson", min_variance=0.0):
67
  self.threshold = threshold
 
70
  self.min_variance = min_variance
71
 
72
  def fit(self, X, y):
 
73
  X_arr, y_arr = check_X_y(X, y, accept_sparse=False, dtype=np.float64)
74
  n_features = X_arr.shape[1]
75
+ self.feature_names_in_ = np.array(getattr(X, "columns", [f"f{i}" for i in range(n_features)]))
 
 
 
 
 
 
 
 
 
 
76
 
77
  X_df = pd.DataFrame(X_arr, columns=self.feature_names_in_)
78
  variances = X_df.var(numeric_only=True)
79
+ low_var_idx = np.where(variances <= self.min_variance)[0].tolist()
 
80
 
81
  corr_mat = X_df.corr(method=self.method).abs().values
82
  np.fill_diagonal(corr_mat, 0.0)
83
 
84
  y_series = pd.Series(y_arr)
85
+ target_corr = X_df.corrwith(y_series, method=self.method).abs().fillna(0.0).values
 
86
 
87
  visited, drops = set(), set()
 
88
  for i in range(n_features):
89
  if i in visited or i in low_var_idx:
90
  continue
91
  correlated_idx = set(np.where(corr_mat[i] > self.threshold)[0].tolist())
92
  cluster = {i} | correlated_idx
93
  visited |= cluster
94
+ if len(cluster) > 1:
95
+ best = max(cluster, key=lambda idx: (target_corr[idx], X_df.iloc[:, idx].var()))
96
+ if self.target_threshold > 0 and target_corr[best] < self.target_threshold:
97
+ drops |= cluster
98
+ else:
99
+ cluster.remove(best)
100
+ drops |= cluster
 
101
 
102
  drops |= set(low_var_idx)
103
+ self.selected_features_ = np.array(sorted(set(range(n_features)) - drops), dtype=int)
 
 
 
 
104
  return self
105
 
106
  def transform(self, X):
107
  check_is_fitted(self, "selected_features_")
108
  X_arr = check_array(X, accept_sparse=False, dtype=np.float64)
109
+ return X_arr[:, self.selected_features_]
 
 
 
110
 
111
 
112
  # ================================
113
+ # 4️⃣ Register Custom Functions for joblib
114
  # ================================
115
  sys.modules['__main__'].temp_cat = temp_cat
116
  sys.modules['__main__'].clean = clean
 
178
 
179
 
180
  # ================================
181
+ # 9️⃣ Local or Hugging Face Run
182
  # ================================
183
  if __name__ == "__main__":
184
  import uvicorn