Update src/streamlit_app.py
Browse files- src/streamlit_app.py +53 -27
src/streamlit_app.py
CHANGED
|
@@ -320,6 +320,8 @@ def load_data(csv_path=CSV_PATH, meta_path=META_PATH):
|
|
| 320 |
return df_local, pd.DataFrame(meta_local)
|
| 321 |
|
| 322 |
df, meta_df = load_data()
|
|
|
|
|
|
|
| 323 |
# -------------------------
|
| 324 |
# Sidebar filters & UI
|
| 325 |
# -------------------------
|
|
@@ -483,37 +485,61 @@ with tabs[4]:
|
|
| 483 |
|
| 484 |
# Select only valid feature columns
|
| 485 |
cols_needed = [c for c in features if c in df.columns]
|
| 486 |
-
#
|
| 487 |
-
if target
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 496 |
else:
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
|
| 503 |
|
| 504 |
# Construct X and y
|
| 505 |
-
X = sub_df[
|
| 506 |
-
y = sub_df[
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
if isinstance(y, pd.DataFrame):
|
| 510 |
-
if y.shape[1] == 1:
|
| 511 |
-
y = y.iloc[:, 0]
|
| 512 |
-
else:
|
| 513 |
-
st.error(f"Multi-output target detected (shape {y.shape}). Select a single target column.")
|
| 514 |
-
st.stop()
|
| 515 |
-
|
| 516 |
-
y = pd.Series(np.ravel(y), name=target)
|
| 517 |
|
| 518 |
|
| 519 |
# Drop known leak or identifier columns
|
|
|
|
| 320 |
return df_local, pd.DataFrame(meta_local)
|
| 321 |
|
| 322 |
df, meta_df = load_data()
|
| 323 |
+
df = df.loc[:, ~df.columns.duplicated()]
|
| 324 |
+
|
| 325 |
# -------------------------
|
| 326 |
# Sidebar filters & UI
|
| 327 |
# -------------------------
|
|
|
|
| 485 |
|
| 486 |
# Select only valid feature columns
|
| 487 |
cols_needed = [c for c in features if c in df.columns]
|
| 488 |
+
# Match exact name first
|
| 489 |
+
if isinstance(target, (list, tuple)):
|
| 490 |
+
st.warning(f"Target provided as list/tuple; using first element `{target[0]}` as target.")
|
| 491 |
+
target = target[0]
|
| 492 |
+
|
| 493 |
+
# Select only valid feature columns
|
| 494 |
+
cols_needed = [c for c in features if c in df.columns]
|
| 495 |
+
|
| 496 |
+
# --- Force single exact target column ---
|
| 497 |
+
if target in df.columns:
|
| 498 |
+
target_col = target
|
| 499 |
+
else:
|
| 500 |
+
# Case-insensitive exact match
|
| 501 |
+
matches = [c for c in df.columns if c.lower() == target.lower()]
|
| 502 |
+
if matches:
|
| 503 |
+
target_col = matches[0]
|
| 504 |
+
st.info(f"Auto-corrected to exact match: `{target_col}`")
|
| 505 |
else:
|
| 506 |
+
# Partial substring match (e.g., 'furnace_temp' vs 'furnace_temp_next')
|
| 507 |
+
matches = [c for c in df.columns if target.lower() in c.lower()]
|
| 508 |
+
if len(matches) == 1:
|
| 509 |
+
target_col = matches[0]
|
| 510 |
+
st.info(f"Auto-corrected to closest match: `{target_col}`")
|
| 511 |
+
elif len(matches) > 1:
|
| 512 |
+
# Prefer '_temp', '_ratio', or exact substring equality
|
| 513 |
+
preferred = [m for m in matches if m.endswith("_temp") or m.endswith("_ratio") or m == target]
|
| 514 |
+
if preferred:
|
| 515 |
+
target_col = preferred[0]
|
| 516 |
+
st.warning(f"Multiple matches found {matches}. Using `{target_col}`.")
|
| 517 |
+
else:
|
| 518 |
+
target_col = matches[0]
|
| 519 |
+
st.warning(f"Multiple matches found {matches}. Using first: `{target_col}`.")
|
| 520 |
+
else:
|
| 521 |
+
st.error(f"Target `{target}` not found in dataframe columns.")
|
| 522 |
+
st.stop()
|
| 523 |
+
|
| 524 |
+
# --- Build sub_df safely — ensure unique and valid target ---
|
| 525 |
+
valid_features = [c for c in cols_needed if c in df.columns and c != target_col]
|
| 526 |
+
if not valid_features:
|
| 527 |
+
st.error("No valid feature columns remain after cleaning. Check feature selection.")
|
| 528 |
+
st.stop()
|
| 529 |
+
|
| 530 |
+
sub_df = df.loc[:, valid_features + [target_col]].copy()
|
| 531 |
+
sub_df = sub_df.sample(n=sample_size, random_state=42).reset_index(drop=True)
|
| 532 |
+
|
| 533 |
+
# --- Construct clean X and y ---
|
| 534 |
+
X = sub_df.drop(columns=[target_col])
|
| 535 |
+
y = pd.Series(np.ravel(sub_df[target_col]), name=target_col)
|
| 536 |
|
| 537 |
|
| 538 |
# Construct X and y
|
| 539 |
+
X = sub_df.drop(columns=[target_col])
|
| 540 |
+
y = sub_df[target_col]
|
| 541 |
+
y = pd.Series(np.ravel(y), name=target_col)
|
| 542 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
|
| 544 |
|
| 545 |
# Drop known leak or identifier columns
|