Update src/streamlit_app.py
Browse files- src/streamlit_app.py +35 -8
src/streamlit_app.py
CHANGED
|
@@ -27,7 +27,8 @@ import shap
|
|
| 27 |
# Config & paths
|
| 28 |
# -------------------------
|
| 29 |
st.set_page_config(page_title="AI Feature Universe Explorer — Advanced + SHAP", layout="wide")
|
| 30 |
-
DATA_DIR = "
|
|
|
|
| 31 |
CSV_PATH = os.path.join(DATA_DIR, "flatfile_universe_advanced.csv")
|
| 32 |
META_PATH = os.path.join(DATA_DIR, "feature_metadata_advanced.json")
|
| 33 |
PDF_PATH = os.path.join(DATA_DIR, "annotated_bibliography.pdf")
|
|
@@ -249,7 +250,7 @@ def generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=6
|
|
| 249 |
# Ensure dataset exists
|
| 250 |
# -------------------------
|
| 251 |
if not os.path.exists(CSV_PATH) or not os.path.exists(META_PATH):
|
| 252 |
-
with st.spinner("Generating
|
| 253 |
CSV_PATH, META_PATH, PDF_PATH = generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=80)
|
| 254 |
st.success(f"Generated dataset and metadata: {CSV_PATH}")
|
| 255 |
|
|
@@ -268,7 +269,7 @@ df, meta_df = load_data()
|
|
| 268 |
# -------------------------
|
| 269 |
# Sidebar filters & UI
|
| 270 |
# -------------------------
|
| 271 |
-
st.sidebar.title("
|
| 272 |
feat_types = sorted(meta_df["source_type"].unique().tolist())
|
| 273 |
selected_types = st.sidebar.multiselect("Feature type", feat_types, default=feat_types)
|
| 274 |
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
|
@@ -332,6 +333,31 @@ with tabs[4]:
|
|
| 332 |
features = st.multiselect("Model input features (select many; start with defaults)", numeric_cols, default=default_features)
|
| 333 |
sample_size = st.slider("Sample rows to use for training (speed vs fidelity)", min_value=200, max_value=min(4000, df.shape[0]), value=1000, step=100)
|
| 334 |
train_button = st.button("Train ensemble & compute SHAP (recommended sample only)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
|
| 336 |
if train_button:
|
| 337 |
with st.spinner("Preparing data and training ensemble..."):
|
|
@@ -457,9 +483,10 @@ with tabs[4]:
|
|
| 457 |
st.warning(f"Could not plot waterfall: {e}")
|
| 458 |
|
| 459 |
|
| 460 |
-
|
|
|
|
| 461 |
with tabs[5]:
|
| 462 |
-
st.subheader("
|
| 463 |
st.markdown("Each use case maps to a practical target variable that drives measurable business impact.")
|
| 464 |
|
| 465 |
target_table = pd.DataFrame([
|
|
@@ -475,7 +502,7 @@ with tabs[5]:
|
|
| 475 |
st.dataframe(target_table, use_container_width=True)
|
| 476 |
|
| 477 |
st.markdown("---")
|
| 478 |
-
st.subheader("
|
| 479 |
st.markdown("These metrics show approximate annual benefits from small process improvements.")
|
| 480 |
|
| 481 |
business_table = pd.DataFrame([
|
|
@@ -489,9 +516,9 @@ with tabs[5]:
|
|
| 489 |
st.dataframe(business_table, use_container_width=True)
|
| 490 |
st.info("These numbers are indicative averages; actual benefits depend on plant capacity and process efficiency.")
|
| 491 |
|
| 492 |
-
# -----
|
| 493 |
with tabs[6]:
|
| 494 |
-
st.subheader("
|
| 495 |
st.markdown("""
|
| 496 |
This section summarizes published research supporting the feature design and modeling choices.
|
| 497 |
""")
|
|
|
|
| 27 |
# Config & paths
|
| 28 |
# -------------------------
|
| 29 |
st.set_page_config(page_title="AI Feature Universe Explorer — Advanced + SHAP", layout="wide")
|
| 30 |
+
DATA_DIR = os.getenv("DATA_DIR", "./data")
|
| 31 |
+
os.makedirs(DATA_DIR, exist_ok=True)
|
| 32 |
CSV_PATH = os.path.join(DATA_DIR, "flatfile_universe_advanced.csv")
|
| 33 |
META_PATH = os.path.join(DATA_DIR, "feature_metadata_advanced.json")
|
| 34 |
PDF_PATH = os.path.join(DATA_DIR, "annotated_bibliography.pdf")
|
|
|
|
| 250 |
# Ensure dataset exists
|
| 251 |
# -------------------------
|
| 252 |
if not os.path.exists(CSV_PATH) or not os.path.exists(META_PATH):
|
| 253 |
+
with st.spinner("Generating synthetic features (this may take ~20-60s)..."):
|
| 254 |
CSV_PATH, META_PATH, PDF_PATH = generate_advanced_flatfile(n_rows=3000, random_seed=42, max_polynomial_new=80)
|
| 255 |
st.success(f"Generated dataset and metadata: {CSV_PATH}")
|
| 256 |
|
|
|
|
| 269 |
# -------------------------
|
| 270 |
# Sidebar filters & UI
|
| 271 |
# -------------------------
|
| 272 |
+
st.sidebar.title("Feature Explorer - Advanced + SHAP")
|
| 273 |
feat_types = sorted(meta_df["source_type"].unique().tolist())
|
| 274 |
selected_types = st.sidebar.multiselect("Feature type", feat_types, default=feat_types)
|
| 275 |
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
|
|
|
| 333 |
features = st.multiselect("Model input features (select many; start with defaults)", numeric_cols, default=default_features)
|
| 334 |
sample_size = st.slider("Sample rows to use for training (speed vs fidelity)", min_value=200, max_value=min(4000, df.shape[0]), value=1000, step=100)
|
| 335 |
train_button = st.button("Train ensemble & compute SHAP (recommended sample only)")
|
| 336 |
+
# Model Remediation & Tuning Options
|
| 337 |
+
st.markdown("### Model Remediation & Tuning Options")
|
| 338 |
+
st.info("Use these to improve flat or low-variance predictions without editing code.")
|
| 339 |
+
|
| 340 |
+
colA, colB, colC = st.columns(3)
|
| 341 |
+
with colA:
|
| 342 |
+
apply_scaling = st.checkbox("Apply StandardScaler()", value=False)
|
| 343 |
+
feature_filter = st.checkbox("Use key furnace-relevant features", value=True)
|
| 344 |
+
with colB:
|
| 345 |
+
random_seed = st.number_input("Random Seed", min_value=0, max_value=9999, value=42)
|
| 346 |
+
n_estimators = st.slider("n_estimators (trees)", 50, 300, 150, step=25)
|
| 347 |
+
with colC:
|
| 348 |
+
furnace_temp_sd = st.slider("Synthetic Furnace Temp σ (spread)", 20, 200, 50, step=10)
|
| 349 |
+
arc_power_sd = st.slider("Synthetic Arc Power σ (spread)", 50, 300, 120, step=10)
|
| 350 |
+
st.markdown("---")
|
| 351 |
+
|
| 352 |
+
if st.button("Regenerate Synthetic Dataset with Updated Variance"):
|
| 353 |
+
with st.spinner("Regenerating synthetic data..."):
|
| 354 |
+
CSV_PATH, META_PATH, PDF_PATH = generate_advanced_flatfile(
|
| 355 |
+
n_rows=3000,
|
| 356 |
+
random_seed=random_seed,
|
| 357 |
+
max_polynomial_new=60
|
| 358 |
+
)
|
| 359 |
+
df, meta_df = load_data()
|
| 360 |
+
st.success("Synthetic dataset regenerated with new variance settings.")
|
| 361 |
|
| 362 |
if train_button:
|
| 363 |
with st.spinner("Preparing data and training ensemble..."):
|
|
|
|
| 483 |
st.warning(f"Could not plot waterfall: {e}")
|
| 484 |
|
| 485 |
|
| 486 |
+
|
| 487 |
+
# ----- Target & Business Impact tab
|
| 488 |
with tabs[5]:
|
| 489 |
+
st.subheader("Recommended Target Variables by Use Case")
|
| 490 |
st.markdown("Each use case maps to a practical target variable that drives measurable business impact.")
|
| 491 |
|
| 492 |
target_table = pd.DataFrame([
|
|
|
|
| 502 |
st.dataframe(target_table, use_container_width=True)
|
| 503 |
|
| 504 |
st.markdown("---")
|
| 505 |
+
st.subheader("Business Framing for Clients")
|
| 506 |
st.markdown("These metrics show approximate annual benefits from small process improvements.")
|
| 507 |
|
| 508 |
business_table = pd.DataFrame([
|
|
|
|
| 516 |
st.dataframe(business_table, use_container_width=True)
|
| 517 |
st.info("These numbers are indicative averages; actual benefits depend on plant capacity and process efficiency.")
|
| 518 |
|
| 519 |
+
# ----- Bibliography tab
|
| 520 |
with tabs[6]:
|
| 521 |
+
st.subheader("Annotated Bibliography & Feature Justification")
|
| 522 |
st.markdown("""
|
| 523 |
This section summarizes published research supporting the feature design and modeling choices.
|
| 524 |
""")
|