singhn9 committed on
Commit
a66dff8
·
verified ·
1 Parent(s): 6ed7e1a

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +35 -8
src/streamlit_app.py CHANGED
@@ -27,7 +27,8 @@ import shap
27
  # Config & paths
28
  # -------------------------
29
  st.set_page_config(page_title="AI Feature Universe Explorer — Advanced + SHAP", layout="wide")
30
- DATA_DIR = "/mnt/data"
 
31
  CSV_PATH = os.path.join(DATA_DIR, "flatfile_universe_advanced.csv")
32
  META_PATH = os.path.join(DATA_DIR, "feature_metadata_advanced.json")
33
  PDF_PATH = os.path.join(DATA_DIR, "annotated_bibliography.pdf")
@@ -249,7 +250,7 @@ def generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=6
249
  # Ensure dataset exists
250
  # -------------------------
251
  if not os.path.exists(CSV_PATH) or not os.path.exists(META_PATH):
252
- with st.spinner("Generating advanced feature universe (this may take ~20-60s)..."):
253
  CSV_PATH, META_PATH, PDF_PATH = generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=80)
254
  st.success(f"Generated dataset and metadata: {CSV_PATH}")
255
 
@@ -268,7 +269,7 @@ df, meta_df = load_data()
268
  # -------------------------
269
  # Sidebar filters & UI
270
  # -------------------------
271
- st.sidebar.title("🔎 Feature Explorer - Advanced + SHAP")
272
  feat_types = sorted(meta_df["source_type"].unique().tolist())
273
  selected_types = st.sidebar.multiselect("Feature type", feat_types, default=feat_types)
274
  numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
@@ -332,6 +333,31 @@ with tabs[4]:
332
  features = st.multiselect("Model input features (select many; start with defaults)", numeric_cols, default=default_features)
333
  sample_size = st.slider("Sample rows to use for training (speed vs fidelity)", min_value=200, max_value=min(4000, df.shape[0]), value=1000, step=100)
334
  train_button = st.button("Train ensemble & compute SHAP (recommended sample only)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
 
336
  if train_button:
337
  with st.spinner("Preparing data and training ensemble..."):
@@ -457,9 +483,10 @@ with tabs[4]:
457
  st.warning(f"Could not plot waterfall: {e}")
458
 
459
 
460
- # ----- 📌 Target & Business Impact tab
 
461
  with tabs[5]:
462
- st.subheader("🎯 Recommended Target Variables by Use Case")
463
  st.markdown("Each use case maps to a practical target variable that drives measurable business impact.")
464
 
465
  target_table = pd.DataFrame([
@@ -475,7 +502,7 @@ with tabs[5]:
475
  st.dataframe(target_table, use_container_width=True)
476
 
477
  st.markdown("---")
478
- st.subheader(" Business Framing for Clients")
479
  st.markdown("These metrics show approximate annual benefits from small process improvements.")
480
 
481
  business_table = pd.DataFrame([
@@ -489,9 +516,9 @@ with tabs[5]:
489
  st.dataframe(business_table, use_container_width=True)
490
  st.info("These numbers are indicative averages; actual benefits depend on plant capacity and process efficiency.")
491
 
492
- # ----- 📚 Bibliography tab
493
  with tabs[6]:
494
- st.subheader("📚 Annotated Bibliography & Feature Justification")
495
  st.markdown("""
496
  This section summarizes published research supporting the feature design and modeling choices.
497
  """)
 
27
  # Config & paths
28
  # -------------------------
29
  st.set_page_config(page_title="AI Feature Universe Explorer — Advanced + SHAP", layout="wide")
30
+ DATA_DIR = os.getenv("DATA_DIR", "./data")
31
+ os.makedirs(DATA_DIR, exist_ok=True)
32
  CSV_PATH = os.path.join(DATA_DIR, "flatfile_universe_advanced.csv")
33
  META_PATH = os.path.join(DATA_DIR, "feature_metadata_advanced.json")
34
  PDF_PATH = os.path.join(DATA_DIR, "annotated_bibliography.pdf")
 
250
  # Ensure dataset exists
251
  # -------------------------
252
  if not os.path.exists(CSV_PATH) or not os.path.exists(META_PATH):
253
+ with st.spinner("Generating synthetic features (this may take ~20-60s)..."):
254
  CSV_PATH, META_PATH, PDF_PATH = generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=80)
255
  st.success(f"Generated dataset and metadata: {CSV_PATH}")
256
 
 
269
  # -------------------------
270
  # Sidebar filters & UI
271
  # -------------------------
272
+ st.sidebar.title("Feature Explorer - Advanced + SHAP")
273
  feat_types = sorted(meta_df["source_type"].unique().tolist())
274
  selected_types = st.sidebar.multiselect("Feature type", feat_types, default=feat_types)
275
  numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
 
333
  features = st.multiselect("Model input features (select many; start with defaults)", numeric_cols, default=default_features)
334
  sample_size = st.slider("Sample rows to use for training (speed vs fidelity)", min_value=200, max_value=min(4000, df.shape[0]), value=1000, step=100)
335
  train_button = st.button("Train ensemble & compute SHAP (recommended sample only)")
336
+ # Model Remediation & Tuning Options
337
+ st.markdown("### Model Remediation & Tuning Options")
338
+ st.info("Use these to improve flat or low-variance predictions without editing code.")
339
+
340
+ colA, colB, colC = st.columns(3)
341
+ with colA:
342
+ apply_scaling = st.checkbox("Apply StandardScaler()", value=False)
343
+ feature_filter = st.checkbox("Use key furnace-relevant features", value=True)
344
+ with colB:
345
+ random_seed = st.number_input("Random Seed", min_value=0, max_value=9999, value=42)
346
+ n_estimators = st.slider("n_estimators (trees)", 50, 300, 150, step=25)
347
+ with colC:
348
+ furnace_temp_sd = st.slider("Synthetic Furnace Temp σ (spread)", 20, 200, 50, step=10)
349
+ arc_power_sd = st.slider("Synthetic Arc Power σ (spread)", 50, 300, 120, step=10)
350
+ st.markdown("---")
351
+
352
+ if st.button("Regenerate Synthetic Dataset with Updated Variance"):
353
+ with st.spinner("Regenerating synthetic data..."):
354
+ CSV_PATH, META_PATH, PDF_PATH = generate_advanced_flatfile(
355
+ n_rows=3000,
356
+ random_seed=random_seed,
357
+ max_polynomial_new=60
358
+ )
359
+ df, meta_df = load_data()
360
+ st.success("Synthetic dataset regenerated with new variance settings.")
361
 
362
  if train_button:
363
  with st.spinner("Preparing data and training ensemble..."):
 
483
  st.warning(f"Could not plot waterfall: {e}")
484
 
485
 
486
+
487
+ # ----- Target & Business Impact tab
488
  with tabs[5]:
489
+ st.subheader("Recommended Target Variables by Use Case")
490
  st.markdown("Each use case maps to a practical target variable that drives measurable business impact.")
491
 
492
  target_table = pd.DataFrame([
 
502
  st.dataframe(target_table, use_container_width=True)
503
 
504
  st.markdown("---")
505
+ st.subheader("Business Framing for Clients")
506
  st.markdown("These metrics show approximate annual benefits from small process improvements.")
507
 
508
  business_table = pd.DataFrame([
 
516
  st.dataframe(business_table, use_container_width=True)
517
  st.info("These numbers are indicative averages; actual benefits depend on plant capacity and process efficiency.")
518
 
519
+ # ----- Bibliography tab
520
  with tabs[6]:
521
+ st.subheader("Annotated Bibliography & Feature Justification")
522
  st.markdown("""
523
  This section summarizes published research supporting the feature design and modeling choices.
524
  """)