singhn9 committed on
Commit
85836c8
·
verified ·
1 Parent(s): d49d8b0

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +53 -27
src/streamlit_app.py CHANGED
@@ -320,6 +320,8 @@ def load_data(csv_path=CSV_PATH, meta_path=META_PATH):
320
  return df_local, pd.DataFrame(meta_local)
321
 
322
  df, meta_df = load_data()
 
 
323
  # -------------------------
324
  # Sidebar filters & UI
325
  # -------------------------
@@ -483,37 +485,61 @@ with tabs[4]:
483
 
484
  # Select only valid feature columns
485
  cols_needed = [c for c in features if c in df.columns]
486
- # --- Build sub_df safely (force exact column match) ---
487
- if target not in df.columns:
488
- # try case-insensitive or partial fallback once
489
- matches = [c for c in df.columns if target.lower() in c.lower()]
490
- if len(matches) == 1:
491
- target = matches[0]
492
- st.info(f"Auto-corrected target to exact match: `{target}`")
493
- elif len(matches) > 1:
494
- st.warning(f"Multiple columns match '{target}': {matches}. Using first: {matches[0]}")
495
- target = matches[0]
 
 
 
 
 
 
 
496
  else:
497
- st.error(f"Target `{target}` not found in dataframe columns.")
498
- st.stop()
499
-
500
- # Now build sub_df strictly
501
- sub_df = df.loc[:, cols_needed + [target]].sample(n=sample_size, random_state=42).reset_index(drop=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
502
 
503
 
504
  # Construct X and y
505
- X = sub_df[cols_needed].copy()
506
- y = sub_df[[target]].copy()
507
-
508
- # Convert y to 1-D Series
509
- if isinstance(y, pd.DataFrame):
510
- if y.shape[1] == 1:
511
- y = y.iloc[:, 0]
512
- else:
513
- st.error(f"Multi-output target detected (shape {y.shape}). Select a single target column.")
514
- st.stop()
515
-
516
- y = pd.Series(np.ravel(y), name=target)
517
 
518
 
519
  # Drop known leak or identifier columns
 
320
  return df_local, pd.DataFrame(meta_local)
321
 
322
  df, meta_df = load_data()
323
+ df = df.loc[:, ~df.columns.duplicated()]
324
+
325
  # -------------------------
326
  # Sidebar filters & UI
327
  # -------------------------
 
485
 
486
  # Select only valid feature columns
487
  cols_needed = [c for c in features if c in df.columns]
488
+ # Match exact name first
489
+ if isinstance(target, (list, tuple)):
490
+ st.warning(f"Target provided as list/tuple; using first element `{target[0]}` as target.")
491
+ target = target[0]
492
+
493
+ # Select only valid feature columns
494
+ cols_needed = [c for c in features if c in df.columns]
495
+
496
+ # --- Force single exact target column ---
497
+ if target in df.columns:
498
+ target_col = target
499
+ else:
500
+ # Case-insensitive exact match
501
+ matches = [c for c in df.columns if c.lower() == target.lower()]
502
+ if matches:
503
+ target_col = matches[0]
504
+ st.info(f"Auto-corrected to exact match: `{target_col}`")
505
  else:
506
+ # Partial substring match (e.g., 'furnace_temp' vs 'furnace_temp_next')
507
+ matches = [c for c in df.columns if target.lower() in c.lower()]
508
+ if len(matches) == 1:
509
+ target_col = matches[0]
510
+ st.info(f"Auto-corrected to closest match: `{target_col}`")
511
+ elif len(matches) > 1:
512
+ # Prefer '_temp', '_ratio', or exact substring equality
513
+ preferred = [m for m in matches if m.endswith("_temp") or m.endswith("_ratio") or m == target]
514
+ if preferred:
515
+ target_col = preferred[0]
516
+ st.warning(f"Multiple matches found {matches}. Using `{target_col}`.")
517
+ else:
518
+ target_col = matches[0]
519
+ st.warning(f"Multiple matches found {matches}. Using first: `{target_col}`.")
520
+ else:
521
+ st.error(f"Target `{target}` not found in dataframe columns.")
522
+ st.stop()
523
+
524
+ # --- Build sub_df safely — ensure unique and valid target ---
525
+ valid_features = [c for c in cols_needed if c in df.columns and c != target_col]
526
+ if not valid_features:
527
+ st.error("No valid feature columns remain after cleaning. Check feature selection.")
528
+ st.stop()
529
+
530
+ sub_df = df.loc[:, valid_features + [target_col]].copy()
531
+ sub_df = sub_df.sample(n=sample_size, random_state=42).reset_index(drop=True)
532
+
533
+ # --- Construct clean X and y ---
534
+ X = sub_df.drop(columns=[target_col])
535
+ y = pd.Series(np.ravel(sub_df[target_col]), name=target_col)
536
 
537
 
538
  # Construct X and y
539
+ X = sub_df.drop(columns=[target_col])
540
+ y = sub_df[target_col]
541
+ y = pd.Series(np.ravel(y), name=target_col)
542
+
 
 
 
 
 
 
 
 
543
 
544
 
545
  # Drop known leak or identifier columns