singhn9 committed on
Commit
bafc17b
·
verified ·
1 Parent(s): 71b41b9

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +119 -40
src/streamlit_app.py CHANGED
@@ -37,13 +37,30 @@ ENSEMBLE_ARTIFACT = os.path.join(DATA_DIR, "ensemble_models.joblib")
37
  # -------------------------
38
  # Utility: generate advanced dataset if missing
39
  # -------------------------
40
- def generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=60):
 
 
 
 
 
 
41
  """
42
  Generates a large synthetic, physics-aligned dataset with many engineered features.
43
- Saves CSV and metadata JSON and a short annotated bibliography PDF (text).
 
 
 
 
 
 
 
 
44
  """
45
  np.random.seed(random_seed)
46
  os.makedirs(DATA_DIR, exist_ok=True)
 
 
 
47
  # --- base natural features across 8 use cases (expanded)
48
  natural_feats = [
49
  "vibration_x","vibration_y","motor_current","rpm","bearing_temp","ambient_temp","lube_pressure","power_factor",
@@ -55,49 +72,76 @@ def generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=6
55
  "batch_id_numeric","weight_input","weight_output","time_in_queue","conveyor_speed",
56
  "shell_temp","lining_thickness","water_flow","cooling_out_temp","heat_flux"
57
  ]
58
- # dedupe if duplicated names
59
- natural_feats = list(dict.fromkeys(natural_feats))
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  # helper sampling heuristics
62
  def sample_col(name, n):
63
  name_l = name.lower()
64
  if "furnace_temp" in name_l or name_l.endswith("_temp") or "tap_temp" in name_l:
65
- return np.random.normal(1550, 50, n)
 
66
  if name_l in ("tap_temp","mold_temp","shell_temp","cooling_out_temp","exit_temp"):
67
- return np.random.normal(200 if "mold" not in name_l else 1500, 30, n)
 
68
  if "offgas_co2" in name_l:
69
- return np.abs(np.random.normal(15,4,n))
 
70
  if "offgas_co" in name_l:
71
- return np.abs(np.random.normal(20,5,n))
 
72
  if "o2" in name_l:
73
- return np.clip(np.random.normal(5,1,n), 0.01, 60)
 
74
  if "arc_power" in name_l or "motor_load" in name_l:
75
- return np.abs(np.random.normal(600,120,n))
 
76
  if "rpm" in name_l:
77
- return np.abs(np.random.normal(120,30,n))
 
78
  if "vibration" in name_l:
79
- return np.abs(np.random.normal(0.4,0.15,n))
 
80
  if "bearing_temp" in name_l:
81
- return np.random.normal(65,5,n)
 
82
  if "chemical" in name_l or "spectro" in name_l:
83
- return np.random.normal(0.7,0.15,n)
 
84
  if "weight" in name_l:
85
- return np.random.normal(1000,100,n)
 
86
  if "conveyor_speed" in name_l or "casting_speed" in name_l:
87
- return np.random.normal(2.5,0.6,n)
 
88
  if "power_factor" in name_l:
89
- return np.clip(np.random.normal(0.92,0.03,n),0.6,1.0)
 
90
  if "image_entropy_proxy" in name_l:
91
- return np.abs(np.random.normal(0.5,0.25,n))
 
92
  if "batch_id" in name_l:
93
  return np.random.randint(1000,9999,n)
94
  if "time_since" in name_l or "time_in_queue" in name_l:
95
- return np.abs(np.random.normal(30,20,n))
 
96
  if "heat_flux" in name_l:
97
- return np.abs(np.random.normal(1000,300,n))
98
- return np.random.normal(0,1,n)
 
99
 
100
- # build DF
101
  df = pd.DataFrame({c: sample_col(c, n_rows) for c in natural_feats})
102
 
103
  # timestamps & metadata
@@ -129,19 +173,15 @@ def generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=6
129
  df["carbon_power_ratio"] = df["carbon_proxy"] / (df["arc_power"] + 1e-6)
130
  df["temp_power_sqrt"] = df["furnace_temp"] * np.sqrt(np.abs(df["arc_power"]) + 1e-6)
131
 
132
- # polynomial features limited to first 12 numeric columns to avoid explosion
133
  numeric = df.select_dtypes(include=[np.number]).fillna(0)
134
  poly_source_cols = numeric.columns[:12].tolist()
135
  poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
136
  poly_mat = poly.fit_transform(numeric[poly_source_cols])
137
  poly_names = poly.get_feature_names_out(poly_source_cols)
138
  poly_df = pd.DataFrame(poly_mat, columns=[f"poly__{n}" for n in poly_names], index=df.index)
139
- # drop identical originals and limit new cols
140
  keep_poly = [c for c in poly_df.columns if c.replace("poly__","") not in poly_source_cols]
141
- if len(keep_poly) > 0:
142
- poly_df = poly_df[keep_poly].iloc[:, :max_polynomial_new]
143
- else:
144
- poly_df = poly_df.iloc[:, :0]
145
  df = pd.concat([df, poly_df], axis=1)
146
 
147
  # PCA embeddings across numeric sensors
@@ -156,22 +196,19 @@ def generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=6
156
  kmeans = KMeans(n_clusters=6, random_state=42, n_init=10)
157
  df["operating_mode"] = kmeans.fit_predict(scaled)
158
 
159
- # surrogate models to create short-horizon predicted states (fast regressors)
160
- # furnace_temp_next surrogate
161
  surrogate_df = df.copy()
162
  surrogate_df["furnace_temp_next"] = surrogate_df["furnace_temp"].shift(-1).fillna(method="ffill")
163
  features_for_surrogate = [c for c in ["furnace_temp","arc_power","o2_probe_pct","offgas_co","offgas_co2"] if c in df.columns]
164
  if len(features_for_surrogate) >= 2:
165
  X = surrogate_df[features_for_surrogate].fillna(0)
166
  y = surrogate_df["furnace_temp_next"]
167
- from sklearn.ensemble import RandomForestRegressor
168
  rf = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)
169
  rf.fit(X, y)
170
  df["pred_temp_30s"] = rf.predict(X)
171
  else:
172
  df["pred_temp_30s"] = df["furnace_temp"]
173
 
174
- # surrogate for carbon proxy
175
  if all(c in df.columns for c in ["offgas_co","offgas_co2","o2_probe_pct"]):
176
  X2 = df[["offgas_co","offgas_co2","o2_probe_pct"]].fillna(0)
177
  rf2 = RandomForestRegressor(n_estimators=50, random_state=1, n_jobs=-1)
@@ -184,7 +221,7 @@ def generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=6
184
  df["refractory_limit_flag"] = (df["lining_thickness"] < 140).astype(int)
185
  df["max_allowed_power_delta"] = np.clip(df["arc_power"].diff().abs().fillna(0), 0, 2000)
186
 
187
- # simple rule-based target action for demo
188
  df["ARC_ON"] = ((df["arc_power"] > df["arc_power"].median()) & (df["carbon_proxy"] < 1.0)).astype(int)
189
  df["prediction_confidence"] = np.clip(np.random.beta(2,5, n_rows), 0.05, 0.99)
190
 
@@ -195,7 +232,6 @@ def generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=6
195
 
196
  # save CSV & metadata
197
  df.to_csv(CSV_PATH, index=False)
198
-
199
  meta = []
200
  for col in df.columns:
201
  if col in natural_feats:
@@ -215,7 +251,7 @@ def generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=6
215
  with open(META_PATH, "w") as f:
216
  json.dump(meta, f, indent=2)
217
 
218
- # annotated bibliography text saved as simple PDF-like text (clients accept PDF)
219
  try:
220
  from fpdf import FPDF
221
  pdf = FPDF('P','mm','A4')
@@ -241,9 +277,9 @@ def generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=6
241
  pdf.ln(2)
242
  pdf.output(PDF_PATH)
243
  except Exception as e:
244
- # fallback: simple text file
245
  with open(PDF_PATH.replace(".pdf",".txt"), "w") as tf:
246
  tf.write("Annotated bibliography generated. Install fpdf for PDF output.\n")
 
247
  return CSV_PATH, META_PATH, PDF_PATH
248
 
249
  # -------------------------
@@ -349,16 +385,59 @@ with tabs[4]:
349
  arc_power_sd = st.slider("Synthetic Arc Power σ (spread)", 50, 300, 120, step=10)
350
  st.markdown("---")
351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  if st.button("Regenerate Synthetic Dataset with Updated Variance"):
353
  with st.spinner("Regenerating synthetic data..."):
 
 
 
 
354
  CSV_PATH, META_PATH, PDF_PATH = generate_advanced_flatfile(
355
  n_rows=3000,
356
- random_seed=random_seed,
357
- max_polynomial_new=60
 
 
358
  )
 
 
359
  st.cache_data.clear()
360
- df, meta_df = load_data()
361
- st.success("Synthetic dataset regenerated with new variance settings.")
 
 
 
 
 
 
 
 
 
 
 
362
 
363
  if train_button:
364
  with st.spinner("Preparing data and training ensemble..."):
 
37
  # -------------------------
38
  # Utility: generate advanced dataset if missing
39
  # -------------------------
40
+ def generate_advanced_flatfile(
41
+ n_rows=3000,
42
+ random_seed=42,
43
+ max_polynomial_new=60,
44
+ global_variance_multiplier=1.0,
45
+ variance_overrides=None,
46
+ ):
47
  """
48
  Generates a large synthetic, physics-aligned dataset with many engineered features.
49
+ Allows control of variability per feature (through variance_overrides) or globally
50
+ (via global_variance_multiplier).
51
+
52
+ Args:
53
+ n_rows: number of samples
54
+ random_seed: RNG seed
55
+ max_polynomial_new: limit on number of polynomial expansion features
56
+ global_variance_multiplier: multiplier applied to all default stddevs
57
+ variance_overrides: dict mapping feature name or substring → stddev multiplier
58
  """
59
  np.random.seed(random_seed)
60
  os.makedirs(DATA_DIR, exist_ok=True)
61
+ if variance_overrides is None:
62
+ variance_overrides = {}
63
+
64
  # --- base natural features across 8 use cases (expanded)
65
  natural_feats = [
66
  "vibration_x","vibration_y","motor_current","rpm","bearing_temp","ambient_temp","lube_pressure","power_factor",
 
72
  "batch_id_numeric","weight_input","weight_output","time_in_queue","conveyor_speed",
73
  "shell_temp","lining_thickness","water_flow","cooling_out_temp","heat_flux"
74
  ]
75
+ natural_feats = list(dict.fromkeys(natural_feats)) # dedupe
76
+
77
+ # helper: compute adjusted stddev
78
def effective_sd(feature_name, base_sd):
    """Return the standard deviation to use when sampling ``feature_name``.

    Values in ``variance_overrides`` are stddev *multipliers* (the caller
    passes ratios such as ``furnace_temp_sd / 50``), so a matching override
    scales ``base_sd`` rather than replacing it.

    Lookup order:
      1. exact match on ``feature_name``;
      2. first override key (in insertion order) that is a substring of
         ``feature_name``;
      3. no override: ``base_sd * global_variance_multiplier``.

    A matching override takes the place of the global multiplier; the two
    are not combined.

    Args:
        feature_name: feature (or feature-family) name used for the lookup.
        base_sd: default standard deviation for that feature.

    Returns:
        The adjusted standard deviation as a float.
    """
    base = float(base_sd)
    # exact name override wins over substring matches
    if feature_name in variance_overrides:
        return base * float(variance_overrides[feature_name])
    # substring override
    for key, val in variance_overrides.items():
        if key in feature_name:
            return base * float(val)
    # fallback: scaled base
    return base * float(global_variance_multiplier)
88
 
89
  # helper sampling heuristics
90
  def sample_col(name, n):
91
  name_l = name.lower()
92
  if "furnace_temp" in name_l or name_l.endswith("_temp") or "tap_temp" in name_l:
93
+ sd = effective_sd("furnace_temp", 50)
94
+ return np.random.normal(1550, sd, n)
95
  if name_l in ("tap_temp","mold_temp","shell_temp","cooling_out_temp","exit_temp"):
96
+ sd = effective_sd(name_l, 30)
97
+ return np.random.normal(200 if "mold" not in name_l else 1500, sd, n)
98
  if "offgas_co2" in name_l:
99
+ sd = effective_sd("offgas_co2", 4)
100
+ return np.abs(np.random.normal(15, sd, n))
101
  if "offgas_co" in name_l:
102
+ sd = effective_sd("offgas_co", 5)
103
+ return np.abs(np.random.normal(20, sd, n))
104
  if "o2" in name_l:
105
+ sd = effective_sd("o2_probe_pct", 1)
106
+ return np.clip(np.random.normal(5, sd, n), 0.01, 60)
107
  if "arc_power" in name_l or "motor_load" in name_l:
108
+ sd = effective_sd("arc_power", 120)
109
+ return np.abs(np.random.normal(600, sd, n))
110
  if "rpm" in name_l:
111
+ sd = effective_sd("rpm", 30)
112
+ return np.abs(np.random.normal(120, sd, n))
113
  if "vibration" in name_l:
114
+ sd = effective_sd("vibration", 0.15)
115
+ return np.abs(np.random.normal(0.4, sd, n))
116
  if "bearing_temp" in name_l:
117
+ sd = effective_sd("bearing_temp", 5)
118
+ return np.random.normal(65, sd, n)
119
  if "chemical" in name_l or "spectro" in name_l:
120
+ sd = effective_sd("chemical", 0.15)
121
+ return np.random.normal(0.7, sd, n)
122
  if "weight" in name_l:
123
+ sd = effective_sd("weight", 100)
124
+ return np.random.normal(1000, sd, n)
125
  if "conveyor_speed" in name_l or "casting_speed" in name_l:
126
+ sd = effective_sd("casting_speed", 0.6)
127
+ return np.random.normal(2.5, sd, n)
128
  if "power_factor" in name_l:
129
+ sd = effective_sd("power_factor", 0.03)
130
+ return np.clip(np.random.normal(0.92, sd, n), 0.6, 1.0)
131
  if "image_entropy_proxy" in name_l:
132
+ sd = effective_sd("image_entropy_proxy", 0.25)
133
+ return np.abs(np.random.normal(0.5, sd, n))
134
  if "batch_id" in name_l:
135
  return np.random.randint(1000,9999,n)
136
  if "time_since" in name_l or "time_in_queue" in name_l:
137
+ sd = effective_sd("time_since", 20)
138
+ return np.abs(np.random.normal(30, sd, n))
139
  if "heat_flux" in name_l:
140
+ sd = effective_sd("heat_flux", 300)
141
+ return np.abs(np.random.normal(1000, sd, n))
142
+ return np.random.normal(0, effective_sd(name_l, 1), n)
143
 
144
+ # build DataFrame
145
  df = pd.DataFrame({c: sample_col(c, n_rows) for c in natural_feats})
146
 
147
  # timestamps & metadata
 
173
  df["carbon_power_ratio"] = df["carbon_proxy"] / (df["arc_power"] + 1e-6)
174
  df["temp_power_sqrt"] = df["furnace_temp"] * np.sqrt(np.abs(df["arc_power"]) + 1e-6)
175
 
176
+ # polynomial features limited to first 12 numeric columns
177
  numeric = df.select_dtypes(include=[np.number]).fillna(0)
178
  poly_source_cols = numeric.columns[:12].tolist()
179
  poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
180
  poly_mat = poly.fit_transform(numeric[poly_source_cols])
181
  poly_names = poly.get_feature_names_out(poly_source_cols)
182
  poly_df = pd.DataFrame(poly_mat, columns=[f"poly__{n}" for n in poly_names], index=df.index)
 
183
  keep_poly = [c for c in poly_df.columns if c.replace("poly__","") not in poly_source_cols]
184
+ poly_df = poly_df[keep_poly].iloc[:, :max_polynomial_new] if len(keep_poly) > 0 else poly_df.iloc[:, :0]
 
 
 
185
  df = pd.concat([df, poly_df], axis=1)
186
 
187
  # PCA embeddings across numeric sensors
 
196
  kmeans = KMeans(n_clusters=6, random_state=42, n_init=10)
197
  df["operating_mode"] = kmeans.fit_predict(scaled)
198
 
199
+ # surrogate models
 
200
  surrogate_df = df.copy()
201
  surrogate_df["furnace_temp_next"] = surrogate_df["furnace_temp"].shift(-1).fillna(method="ffill")
202
  features_for_surrogate = [c for c in ["furnace_temp","arc_power","o2_probe_pct","offgas_co","offgas_co2"] if c in df.columns]
203
  if len(features_for_surrogate) >= 2:
204
  X = surrogate_df[features_for_surrogate].fillna(0)
205
  y = surrogate_df["furnace_temp_next"]
 
206
  rf = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)
207
  rf.fit(X, y)
208
  df["pred_temp_30s"] = rf.predict(X)
209
  else:
210
  df["pred_temp_30s"] = df["furnace_temp"]
211
 
 
212
  if all(c in df.columns for c in ["offgas_co","offgas_co2","o2_probe_pct"]):
213
  X2 = df[["offgas_co","offgas_co2","o2_probe_pct"]].fillna(0)
214
  rf2 = RandomForestRegressor(n_estimators=50, random_state=1, n_jobs=-1)
 
221
  df["refractory_limit_flag"] = (df["lining_thickness"] < 140).astype(int)
222
  df["max_allowed_power_delta"] = np.clip(df["arc_power"].diff().abs().fillna(0), 0, 2000)
223
 
224
+ # rule-based target
225
  df["ARC_ON"] = ((df["arc_power"] > df["arc_power"].median()) & (df["carbon_proxy"] < 1.0)).astype(int)
226
  df["prediction_confidence"] = np.clip(np.random.beta(2,5, n_rows), 0.05, 0.99)
227
 
 
232
 
233
  # save CSV & metadata
234
  df.to_csv(CSV_PATH, index=False)
 
235
  meta = []
236
  for col in df.columns:
237
  if col in natural_feats:
 
251
  with open(META_PATH, "w") as f:
252
  json.dump(meta, f, indent=2)
253
 
254
+ # annotated bibliography
255
  try:
256
  from fpdf import FPDF
257
  pdf = FPDF('P','mm','A4')
 
277
  pdf.ln(2)
278
  pdf.output(PDF_PATH)
279
  except Exception as e:
 
280
  with open(PDF_PATH.replace(".pdf",".txt"), "w") as tf:
281
  tf.write("Annotated bibliography generated. Install fpdf for PDF output.\n")
282
+
283
  return CSV_PATH, META_PATH, PDF_PATH
284
 
285
  # -------------------------
 
385
  arc_power_sd = st.slider("Synthetic Arc Power σ (spread)", 50, 300, 120, step=10)
386
  st.markdown("---")
387
 
388
+ # --- Variance Controls UI ---
389
+ st.markdown("#### Variance controls (global & per-feature)")
390
+ global_var_mult = st.slider(
391
+ "Global variance multiplier", 0.1, 5.0, 1.0, step=0.1,
392
+ help="Multiply base standard deviations by this factor for all features."
393
+ )
394
+
395
+ # Optional: choose features to override
396
+ feat_for_override = st.multiselect(
397
+ "Select features to override variance (optional)", numeric_cols, max_selections=8
398
+ )
399
+ variance_overrides = {}
400
+ if feat_for_override:
401
+ st.markdown("Set multipliers for selected features")
402
+ for f in feat_for_override:
403
+ mult = st.number_input(
404
+ f"Variance multiplier for {f}", min_value=0.1, max_value=10.0,
405
+ value=1.0, step=0.1, key=f"mult_{f}"
406
+ )
407
+ variance_overrides[f] = float(mult)
408
+
409
+ st.markdown("---")
410
+
411
+ # --- Regeneration button ---
412
  if st.button("Regenerate Synthetic Dataset with Updated Variance"):
413
  with st.spinner("Regenerating synthetic data..."):
414
+ variance_overrides.update({
415
+ "furnace_temp": furnace_temp_sd / 50,
416
+ "arc_power": arc_power_sd / 120
417
+ })
418
  CSV_PATH, META_PATH, PDF_PATH = generate_advanced_flatfile(
419
  n_rows=3000,
420
+ random_seed=int(random_seed),
421
+ max_polynomial_new=60,
422
+ global_variance_multiplier=float(global_var_mult),
423
+ variance_overrides=variance_overrides,
424
  )
425
+
426
+ # Clear cache and reload fresh
427
  st.cache_data.clear()
428
+ df, meta_df = load_data(csv_path=CSV_PATH + f"?t={int(time.time())}", meta_path=META_PATH)
429
+
430
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
431
+
432
+ st.success(
433
+ f"Synthetic dataset regenerated — {len(df)} rows × {len(df.columns)} features "
434
+ f"(Global×{global_var_mult:.2f}; Overrides={len(variance_overrides)})"
435
+ )
436
+ st.caption(
437
+ f"Mean furnace_temp: {df['furnace_temp'].mean():.2f}, "
438
+ f"Std furnace_temp: {df['furnace_temp'].std():.2f}"
439
+ )
440
+
441
 
442
  if train_button:
443
  with st.spinner("Preparing data and training ensemble..."):