singhn9 committed on
Commit
4d9b97c
·
verified ·
1 Parent(s): 4605aa4

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +52 -90
src/streamlit_app.py CHANGED
@@ -432,13 +432,14 @@ with tabs[3]:
432
  st.subheader("Summary statistics (numeric features)")
433
  st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
434
 
 
435
  # ----- AutoML + SHAP tab (Expanded)
436
  with tabs[4]:
437
  st.subheader("AutoML Ensemble — Expanded Families + Stacking + SHAP")
438
 
439
- # --- Universal numeric cleaner (runs once per tab) ---
440
  def clean_entire_df(df):
441
- """Cleans dataframe of any bracketed/scientific string numbers like '[1.551E3]'."""
442
  df_clean = df.copy()
443
  for col in df_clean.columns:
444
  if df_clean[col].dtype == object:
@@ -456,7 +457,7 @@ with tabs[4]:
456
  return df_clean
457
 
458
  df = clean_entire_df(df)
459
- st.caption(" Dataset cleaned globally — all numeric-like values converted safely.")
460
 
461
  # --- Use Case Selection ---
462
  use_case = st.selectbox(
@@ -541,49 +542,23 @@ with tabs[4]:
541
 
542
  # --- Family tuner ---
543
  def tune_family(fam, X_local, y_local, n_trials=20):
544
- import optuna
545
- from sklearn.model_selection import cross_val_score
546
- from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
547
-
548
  def obj(trial):
549
  if fam == "RandomForest":
550
  m = RandomForestRegressor(
551
  n_estimators=trial.suggest_int("n_estimators", 100, 800),
552
  max_depth=trial.suggest_int("max_depth", 4, 30),
553
- random_state=42, n_jobs=-1,
554
- )
555
  elif fam == "ExtraTrees":
556
  m = ExtraTreesRegressor(
557
  n_estimators=trial.suggest_int("n_estimators", 100, 800),
558
  max_depth=trial.suggest_int("max_depth", 4, 30),
559
- random_state=42, n_jobs=-1,
560
- )
561
- elif fam == "XGBoost" and optional_families.get("XGBoost"):
562
- m = xgb.XGBRegressor(
563
- n_estimators=trial.suggest_int("n_estimators", 100, 800),
564
- max_depth=trial.suggest_int("max_depth", 3, 12),
565
- learning_rate=trial.suggest_float("lr", 0.01, 0.3, log=True),
566
- tree_method="hist", verbosity=0
567
- )
568
- elif fam == "LightGBM" and optional_families.get("LightGBM"):
569
- m = lgb.LGBMRegressor(
570
- n_estimators=trial.suggest_int("n_estimators", 100, 800),
571
- max_depth=trial.suggest_int("max_depth", 3, 16),
572
- learning_rate=trial.suggest_float("lr", 0.01, 0.3, log=True)
573
- )
574
- elif fam == "CatBoost" and optional_families.get("CatBoost"):
575
- m = cb.CatBoostRegressor(
576
- iterations=trial.suggest_int("iterations", 200, 800),
577
- depth=trial.suggest_int("depth", 4, 10),
578
- learning_rate=trial.suggest_float("lr", 0.01, 0.3, log=True),
579
- verbose=0
580
- )
581
  else:
582
  m = RandomForestRegressor(random_state=42)
583
  try:
584
  return np.mean(cross_val_score(m, X_local, y_local, cv=3, scoring="r2"))
585
  except Exception:
586
- return -999
587
 
588
  study = optuna.create_study(direction="maximize")
589
  study.optimize(obj, n_trials=n_trials, show_progress_bar=False)
@@ -603,33 +578,34 @@ with tabs[4]:
603
  for fam in families:
604
  tuned_results.append(tune_family(fam, X, y, n_trials=max_trials))
605
 
606
- lb = pd.DataFrame([{"family": r["family"], "cv_r2": r["cv_score"]} for r in tuned_results]).sort_values("cv_r2", ascending=False)
 
 
607
  st.dataframe(lb.round(4))
608
 
609
  # --- Stacking ---
610
  from sklearn.feature_selection import SelectKBest, f_regression
611
  from sklearn.linear_model import LinearRegression
612
- from sklearn.model_selection import KFold, train_test_split
613
  from sklearn.metrics import r2_score
614
 
615
  scaler = StandardScaler()
616
  X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
617
  selector = SelectKBest(f_regression, k=min(40, X_scaled.shape[1]))
618
- X_sel = pd.DataFrame(selector.fit_transform(X_scaled, y), columns=[X.columns[i] for i in selector.get_support(indices=True)])
 
 
 
619
 
620
- # --- Safe stacking ensemble build ---
621
  kf = KFold(n_splits=5, shuffle=True, random_state=42)
622
  oof_preds = pd.DataFrame(index=X_sel.index)
623
  base_models = []
624
-
625
- # Explicitly filter valid models (no truthiness eval)
626
- valid_results = []
627
- for r in tuned_results:
628
- m = r.get("model_obj", None)
629
- if isinstance(m, object) and hasattr(m, "fit") and callable(getattr(m, "fit", None)):
630
- valid_results.append((r["family"], r))
631
-
632
- # Train each base model safely
633
  for fam, entry in valid_results:
634
  model = entry["model_obj"]
635
  preds = np.zeros(X_sel.shape[0])
@@ -645,14 +621,6 @@ with tabs[4]:
645
  base_models.append({"family": fam, "model": model})
646
  except Exception as e:
647
  st.warning(f"⚠️ {fam} full-fit failed: {e}")
648
-
649
- # Meta model on OOF predictions
650
- meta = LinearRegression(positive=True)
651
- meta.fit(oof_preds, y)
652
- y_pred = meta.predict(oof_preds)
653
- final_r2 = r2_score(y, y_pred)
654
- st.success(f"Stacked Ensemble R² = {final_r2:.4f}")
655
-
656
 
657
  meta = LinearRegression(positive=True)
658
  meta.fit(oof_preds, y)
@@ -669,8 +637,7 @@ with tabs[4]:
669
  sample_X = X_sel.sample(min(300, len(X_sel)), random_state=42)
670
  expl = shap.TreeExplainer(top_base)
671
  shap_vals = expl.shap_values(sample_X)
672
- if isinstance(shap_vals, list):
673
- shap_vals = shap_vals[0]
674
  imp = pd.DataFrame({
675
  "Feature": sample_X.columns,
676
  "Mean |SHAP|": np.abs(shap_vals).mean(axis=0),
@@ -688,71 +655,66 @@ with tabs[4]:
688
  recs.append(f"`{r['Feature']}` neutral for `{target}`")
689
  st.write("\n".join(recs))
690
 
691
- # --- Hugging Face advisory ---
692
- import requests, json, textwrap
693
-
694
  HF_TOKEN = os.getenv("HF_TOKEN")
695
  if not HF_TOKEN:
696
  st.error("HF_TOKEN not detected in environment or secrets.toml.")
697
  else:
698
- # Correct endpoint per Hugging Face Router API
699
- API_URL = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3-8B-Instruct"
700
-
701
  headers = {
702
  "Authorization": f"Bearer {HF_TOKEN}",
703
  "Content-Type": "application/json",
704
  }
705
-
706
- # Prepare prompt
707
  prompt = textwrap.dedent(f"""
708
  You are an expert metallurgical process advisor.
709
  Analyze these SHAP-based operator recommendations and rewrite them
710
  as a concise 3-line professional advisory note.
711
-
712
  Recommendations: {recs}
713
  Target variable: {target}
714
  Use case: {use_case}
715
  """)
716
-
717
- # HF Router supports both "inputs" and OpenAI-style "messages"
718
  payload = {
719
- "inputs": prompt,
720
- "parameters": {
721
- "max_new_tokens": 200,
722
- "temperature": 0.5,
723
- "top_p": 0.95,
724
- "return_full_text": False
725
- }
 
726
  }
727
-
728
  with st.spinner("Generating operator advisory (Llama 3-8B)…"):
729
  try:
730
  resp = requests.post(API_URL, headers=headers, json=payload, timeout=90)
731
-
732
  if resp.status_code != 200:
733
  st.warning(f"HF API error {resp.status_code}: {resp.text}")
734
  else:
735
  try:
736
  data = resp.json()
737
- text = ""
738
-
739
- # The router returns list-based structure for text generation
740
- if isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
741
- text = data[0]["generated_text"].strip()
742
- elif isinstance(data, dict) and "generated_text" in data:
743
- text = data["generated_text"].strip()
744
-
745
- if text:
746
- st.success(" Operator Advisory Generated:")
747
- st.info(text)
748
  else:
749
- st.warning(f"Operator advisory skipped: no text returned.\nRaw response:\n{data}")
750
-
751
- except json.JSONDecodeError:
752
- st.warning(f"Operator advisory skipped: invalid JSON.\nRaw response:\n{resp.text}")
753
  except Exception as e:
754
  st.warning(f"Operator advisory skipped: {e}")
755
 
 
 
756
 
757
 
758
  # ----- Business Impact tab
 
432
  st.subheader("Summary statistics (numeric features)")
433
  st.dataframe(df.describe().T.style.format("{:.3f}"), height=500)
434
 
435
+ # ----- AutoML + SHAP tab (Expanded)
436
  # ----- AutoML + SHAP tab (Expanded)
437
  with tabs[4]:
438
  st.subheader("AutoML Ensemble — Expanded Families + Stacking + SHAP")
439
 
440
+ # --- Global numeric cleaner ---
441
  def clean_entire_df(df):
442
+ """Cleans dataframe of bracketed/scientific string numbers like '[1.551E3]'."""
443
  df_clean = df.copy()
444
  for col in df_clean.columns:
445
  if df_clean[col].dtype == object:
 
457
  return df_clean
458
 
459
  df = clean_entire_df(df)
460
+ st.caption(" Dataset cleaned globally — all numeric-like values converted safely.")
461
 
462
  # --- Use Case Selection ---
463
  use_case = st.selectbox(
 
542
 
543
  # --- Family tuner ---
544
  def tune_family(fam, X_local, y_local, n_trials=20):
 
 
 
 
545
  def obj(trial):
546
  if fam == "RandomForest":
547
  m = RandomForestRegressor(
548
  n_estimators=trial.suggest_int("n_estimators", 100, 800),
549
  max_depth=trial.suggest_int("max_depth", 4, 30),
550
+ random_state=42, n_jobs=-1)
 
551
  elif fam == "ExtraTrees":
552
  m = ExtraTreesRegressor(
553
  n_estimators=trial.suggest_int("n_estimators", 100, 800),
554
  max_depth=trial.suggest_int("max_depth", 4, 30),
555
+ random_state=42, n_jobs=-1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
556
  else:
557
  m = RandomForestRegressor(random_state=42)
558
  try:
559
  return np.mean(cross_val_score(m, X_local, y_local, cv=3, scoring="r2"))
560
  except Exception:
561
+ return -999.0
562
 
563
  study = optuna.create_study(direction="maximize")
564
  study.optimize(obj, n_trials=n_trials, show_progress_bar=False)
 
578
  for fam in families:
579
  tuned_results.append(tune_family(fam, X, y, n_trials=max_trials))
580
 
581
+ lb = pd.DataFrame(
582
+ [{"family": r["family"], "cv_r2": r["cv_score"]} for r in tuned_results]
583
+ ).sort_values("cv_r2", ascending=False)
584
  st.dataframe(lb.round(4))
585
 
586
  # --- Stacking ---
587
  from sklearn.feature_selection import SelectKBest, f_regression
588
  from sklearn.linear_model import LinearRegression
589
+ from sklearn.model_selection import KFold
590
  from sklearn.metrics import r2_score
591
 
592
  scaler = StandardScaler()
593
  X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
594
  selector = SelectKBest(f_regression, k=min(40, X_scaled.shape[1]))
595
+ X_sel = pd.DataFrame(
596
+ selector.fit_transform(X_scaled, y),
597
+ columns=[X.columns[i] for i in selector.get_support(indices=True)]
598
+ )
599
 
 
600
  kf = KFold(n_splits=5, shuffle=True, random_state=42)
601
  oof_preds = pd.DataFrame(index=X_sel.index)
602
  base_models = []
603
+
604
+ valid_results = [
605
+ (r["family"], r) for r in tuned_results
606
+ if r.get("model_obj") is not None and hasattr(r["model_obj"], "fit")
607
+ ]
608
+
 
 
 
609
  for fam, entry in valid_results:
610
  model = entry["model_obj"]
611
  preds = np.zeros(X_sel.shape[0])
 
621
  base_models.append({"family": fam, "model": model})
622
  except Exception as e:
623
  st.warning(f"⚠️ {fam} full-fit failed: {e}")
 
 
 
 
 
 
 
 
624
 
625
  meta = LinearRegression(positive=True)
626
  meta.fit(oof_preds, y)
 
637
  sample_X = X_sel.sample(min(300, len(X_sel)), random_state=42)
638
  expl = shap.TreeExplainer(top_base)
639
  shap_vals = expl.shap_values(sample_X)
640
+ if isinstance(shap_vals, list): shap_vals = shap_vals[0]
 
641
  imp = pd.DataFrame({
642
  "Feature": sample_X.columns,
643
  "Mean |SHAP|": np.abs(shap_vals).mean(axis=0),
 
655
  recs.append(f"`{r['Feature']}` neutral for `{target}`")
656
  st.write("\n".join(recs))
657
 
658
+ # --- Hugging Face Router Chat API (OpenAI-Compatible Format) ---
659
+ import requests, textwrap
660
+
661
  HF_TOKEN = os.getenv("HF_TOKEN")
662
  if not HF_TOKEN:
663
  st.error("HF_TOKEN not detected in environment or secrets.toml.")
664
  else:
665
+ API_URL = "https://router.huggingface.co/v1/chat/completions"
 
 
666
  headers = {
667
  "Authorization": f"Bearer {HF_TOKEN}",
668
  "Content-Type": "application/json",
669
  }
670
+
 
671
  prompt = textwrap.dedent(f"""
672
  You are an expert metallurgical process advisor.
673
  Analyze these SHAP-based operator recommendations and rewrite them
674
  as a concise 3-line professional advisory note.
675
+
676
  Recommendations: {recs}
677
  Target variable: {target}
678
  Use case: {use_case}
679
  """)
680
+
 
681
  payload = {
682
+ "model": "meta-llama/Meta-Llama-3-8B-Instruct",
683
+ "messages": [
684
+ {"role": "system", "content": "You are a concise metallurgical advisor."},
685
+ {"role": "user", "content": prompt}
686
+ ],
687
+ "temperature": 0.5,
688
+ "max_tokens": 200,
689
+ "stream": False
690
  }
691
+
692
  with st.spinner("Generating operator advisory (Llama 3-8B)…"):
693
  try:
694
  resp = requests.post(API_URL, headers=headers, json=payload, timeout=90)
 
695
  if resp.status_code != 200:
696
  st.warning(f"HF API error {resp.status_code}: {resp.text}")
697
  else:
698
  try:
699
  data = resp.json()
700
+ msg = (
701
+ data.get("choices", [{}])[0]
702
+ .get("message", {})
703
+ .get("content", "")
704
+ .strip()
705
+ )
706
+ if msg:
707
+ st.success("✅ Operator Advisory Generated:")
708
+ st.info(msg)
 
 
709
  else:
710
+ st.warning(f"Operator advisory skipped: empty response.\nRaw: {data}")
711
+ except Exception as e:
712
+ st.warning(f"Operator advisory skipped: JSON parse error — {e}")
 
713
  except Exception as e:
714
  st.warning(f"Operator advisory skipped: {e}")
715
 
716
+ except Exception as e:
717
+ st.warning(f"Operator advisory skipped: {e}")
718
 
719
 
720
  # ----- Business Impact tab