Inframat-x committed on
Commit
a3b7322
·
verified ·
1 Parent(s): b902f03

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -54
app.py CHANGED
@@ -1,8 +1,8 @@
1
  # ================================================================
2
  # Self-Sensing Concrete Assistant — Predictor (XGB) + Hybrid RAG
3
- # - Predictor tab: identical behavior (kept)
4
  # - Literature tab: Hybrid RAG; LLM runs silently when available
5
- # - UX: no visible "LLM & Controls" window; prediction=0.0 if incomplete
6
  # ================================================================
7
 
8
  # ---------------------- Runtime flags (HF-safe) ----------------------
@@ -12,7 +12,7 @@ os.environ["TRANSFORMERS_NO_FLAX"] = "1"
12
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
13
 
14
  # ------------------------------- Imports ------------------------------
15
- import re, time, joblib, warnings, json
16
  from pathlib import Path
17
  from typing import List, Dict, Any
18
 
@@ -49,6 +49,7 @@ LLM_AVAILABLE = (OPENAI_API_KEY is not None and OPENAI_API_KEY.strip() != "" and
49
  # ========================= Predictor (kept) =========================
50
  CF_COL = "Conductive Filler Conc. (wt%)"
51
  TARGET_COL = "Stress GF (MPa-1)"
 
52
 
53
  MAIN_VARIABLES = [
54
  "Filler 1 Type",
@@ -112,24 +113,54 @@ OPTIONAL_FIELDS = {
112
  "Filler 2 Dimensionality",
113
  }
114
 
115
- DIM_CHOICES = ["0D", "1D", "2D", "3D", "NA"]
116
- CURRENT_CHOICES = ["DC", "AC", "NA"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  MODEL_CANDIDATES = [
119
  "stress_gf_xgb.joblib",
120
  "models/stress_gf_xgb.joblib",
121
  "/home/user/app/stress_gf_xgb.joblib",
 
122
  ]
123
 
124
  def _load_model_or_error():
125
- for p in MODEL_CANDIDATES:
126
  if os.path.exists(p):
127
  try:
128
  return joblib.load(p)
129
  except Exception as e:
130
- return f"Could not load model from {p}: {e}"
131
- return ("Model file not found. Upload your trained pipeline as "
132
- "stress_gf_xgb.joblib (or put it in models/).")
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
135
  row = {}
@@ -143,14 +174,16 @@ def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
143
  row[col] = float(v)
144
  except Exception:
145
  row[col] = np.nan
 
 
146
  else:
147
- row[col] = "" if v in (None, "NA") else str(v).strip()
 
 
148
  return pd.DataFrame([row], columns=MAIN_VARIABLES)
149
 
150
  def _is_complete(form_dict: dict) -> bool:
151
- for col in MAIN_VARIABLES:
152
- if col in OPTIONAL_FIELDS:
153
- continue
154
  v = form_dict.get(col, None)
155
  if col in NUMERIC_COLS:
156
  try:
@@ -158,31 +191,48 @@ def _is_complete(form_dict: dict) -> bool:
158
  return False
159
  except Exception:
160
  return False
161
- elif col in CATEGORICAL_COLS:
162
- s = "" if v in (None, "NA") else str(v).strip()
163
- if s == "":
164
- return False
165
  else:
166
- s = "" if v is None else str(v).strip()
167
- if s == "":
168
  return False
169
  return True
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  def predict_fn(**kwargs):
 
172
  if not _is_complete(kwargs):
173
  return 0.0
174
  mdl = _load_model_or_error()
175
- if isinstance(mdl, str):
176
- return mdl
177
  X_new = _coerce_to_row(kwargs)
 
178
  try:
179
- y_log = mdl.predict(X_new) # model predicts log1p(target)
180
- y = float(np.expm1(y_log)[0]) # back to original scale MPa^-1
181
- if -1e-10 < y < 0:
182
- y = 0.0
 
 
 
183
  return y
184
  except Exception as e:
185
- return f"Prediction error: {e}"
 
 
186
 
187
  EXAMPLE = {
188
  "Filler 1 Type": "CNT",
@@ -191,7 +241,7 @@ EXAMPLE = {
191
  "Filler 1 Length (mm)": 1.2,
192
  CF_COL: 0.5,
193
  "Filler 2 Type": "",
194
- "Filler 2 Dimensionality": "NA",
195
  "Filler 2 Diameter (µm)": None,
196
  "Filler 2 Length (mm)": None,
197
  "Specimen Volume (mm3)": 1000,
@@ -219,9 +269,9 @@ def _clear_all():
219
  if col in NUMERIC_COLS:
220
  cleared.append(None)
221
  elif col in {"Filler 1 Dimensionality", "Filler 2 Dimensionality"}:
222
- cleared.append("NA")
223
  elif col == "Current Type":
224
- cleared.append("NA")
225
  else:
226
  cleared.append("")
227
  return cleared
@@ -343,7 +393,7 @@ def build_or_load_hybrid(pdf_dir: Path):
343
  emb = None
344
 
345
  joblib.dump(vectorizer, TFIDF_VECT_PATH)
346
- joblib.dump(X_tfidF:=X_tfidf, TFIDF_MAT_PATH) # assign + save
347
  if BM25Okapi is not None:
348
  joblib.dump(all_tokens, BM25_TOK_PATH)
349
  meta.to_parquet(RAG_META_PATH, index=False)
@@ -354,7 +404,7 @@ bm25 = BM25Okapi(bm25_tokens) if (BM25Okapi is not None and bm25_tokens is not N
354
  st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
355
 
356
  def _extract_page(text_chunk: str) -> str:
357
- m = list(re.finditer(r"\\[\\[PAGE=(\\d+)\\]\\]", text_chunk or ""))
358
  return (m[-1].group(1) if m else "?")
359
 
360
  def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
@@ -383,7 +433,7 @@ def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAUL
383
 
384
  # BM25 scores
385
  if bm25 is not None:
386
- q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\\-\\/\\.%%]+", query)]
387
  bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
388
  else:
389
  bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0
@@ -456,10 +506,9 @@ def mmr_select_sentences(question: str, hits: pd.DataFrame, top_n=4, pool_per_ch
456
  for i in remain:
457
  sim_to_sel = max(sim_fn(i, j) for j in selected_idx) if selected_idx else 0.0
458
  score = lambda_div * rel[i] - (1 - lambda_div) * sim_to_sel
459
- cand_scores.append((score, i))
460
- cand_scores.sort(reverse=True)
461
- best_i = cand_scores[0][1]
462
- selected.append(pool[best_i]); selected_idx.append(best_i); remain.remove(best_i)
463
  return selected
464
 
465
  def compose_extractive(selected: List[Dict[str, Any]]) -> str:
@@ -518,7 +567,6 @@ def rag_reply(
518
  srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
519
  coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
520
 
521
- # Hidden policy: if strict==True → no paraphrasing; else try LLM if available
522
  if strict_quotes_only:
523
  if not selected:
524
  return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
@@ -644,27 +692,28 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
644
  with gr.Row():
645
  with gr.Column(scale=7):
646
  with gr.Accordion("Primary conductive filler", open=True, elem_classes=["card"]):
647
- f1_type = gr.Textbox(label="Filler 1 Type", placeholder="e.g., CNT, Graphite, Steel fiber")
648
- f1_diam = gr.Number(label="Filler 1 Diameter (µm)")
649
- f1_len = gr.Number(label="Filler 1 Length (mm)")
650
- cf_conc = gr.Number(label=f"{CF_COL}", info="Weight percent of total binder")
651
- f1_dim = gr.Dropdown(DIM_CHOICES, value="NA", label="Filler 1 Dimensionality")
 
652
 
653
  with gr.Accordion("Secondary filler (optional)", open=False, elem_classes=["card"]):
654
  f2_type = gr.Textbox(label="Filler 2 Type", placeholder="Optional")
655
  f2_diam = gr.Number(label="Filler 2 Diameter (µm)")
656
  f2_len = gr.Number(label="Filler 2 Length (mm)")
657
- f2_dim = gr.Dropdown(DIM_CHOICES, value="NA", label="Filler 2 Dimensionality")
658
 
659
  with gr.Accordion("Mix design & specimen", open=False, elem_classes=["card"]):
660
- spec_vol = gr.Number(label="Specimen Volume (mm3)")
661
- probe_cnt = gr.Number(label="Probe Count")
662
- probe_mat = gr.Textbox(label="Probe Material", placeholder="e.g., Copper, Silver paste")
663
- wb = gr.Number(label="W/B")
664
- sb = gr.Number(label="S/B")
665
- gauge_len = gr.Number(label="Gauge Length (mm)")
666
- curing = gr.Textbox(label="Curing Condition", placeholder="e.g., 28d water, 20°C")
667
- n_fillers = gr.Number(label="Number of Fillers")
668
 
669
  with gr.Accordion("Processing", open=False, elem_classes=["card"]):
670
  dry_temp = gr.Number(label="Drying Temperature (°C)")
@@ -672,8 +721,8 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
672
 
673
  with gr.Accordion("Mechanical & electrical loading", open=False, elem_classes=["card"]):
674
  load_rate = gr.Number(label="Loading Rate (MPa/s)")
675
- E_mod = gr.Number(label="Modulus of Elasticity (GPa)")
676
- current = gr.Dropdown(CURRENT_CHOICES, value="NA", label="Current Type")
677
  voltage = gr.Number(label="Applied Voltage (V)")
678
 
679
  with gr.Column(scale=5):
@@ -687,7 +736,7 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
687
  with gr.Accordion("About this model", open=False, elem_classes=["card"]):
688
  gr.Markdown(
689
  "- Pipeline: ColumnTransformer → (RobustScaler + OneHot) → XGBoost\n"
690
- "- Target: Stress GF (MPa<sup>-1</sup>) on original scale (model trains on log1p).\n"
691
  "- Missing values are safely imputed per-feature.\n"
692
  "- Trained columns:\n"
693
  f" `{', '.join(MAIN_VARIABLES)}`",
 
1
  # ================================================================
2
  # Self-Sensing Concrete Assistant — Predictor (XGB) + Hybrid RAG
3
+ # - Predictor tab: identical behavior (kept) + * marks for required fields
4
  # - Literature tab: Hybrid RAG; LLM runs silently when available
5
+ # - UX: no visible "LLM & Controls" window; prediction=0.0 if incomplete/error
6
  # ================================================================
7
 
8
  # ---------------------- Runtime flags (HF-safe) ----------------------
 
12
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
13
 
14
  # ------------------------------- Imports ------------------------------
15
+ import re, time, joblib, warnings, json, traceback
16
  from pathlib import Path
17
  from typing import List, Dict, Any
18
 
 
49
  # ========================= Predictor (kept) =========================
50
  CF_COL = "Conductive Filler Conc. (wt%)"
51
  TARGET_COL = "Stress GF (MPa-1)"
52
+ CANON_NA = "NA" # <-- canonical placeholder for categoricals
53
 
54
  MAIN_VARIABLES = [
55
  "Filler 1 Type",
 
113
  "Filler 2 Dimensionality",
114
  }
115
 
116
+ # Only these fields are required
117
+ REQUIRED_FIELDS = {
118
+ "Filler 1 Type",
119
+ "Filler 1 Diameter (µm)",
120
+ "Filler 1 Length (mm)",
121
+ CF_COL,
122
+ "Filler 1 Dimensionality",
123
+ "Specimen Volume (mm3)",
124
+ "Probe Count",
125
+ "Probe Material",
126
+ "W/B",
127
+ "S/B",
128
+ "Gauge Length (mm)",
129
+ "Curing Condition",
130
+ "Number of Fillers",
131
+ "Modulus of Elasticity (GPa)"
132
+ }
133
+
134
+ DIM_CHOICES = ["0D", "1D", "2D", "3D", CANON_NA]
135
+ CURRENT_CHOICES = ["DC", "AC", CANON_NA]
136
 
137
  MODEL_CANDIDATES = [
138
  "stress_gf_xgb.joblib",
139
  "models/stress_gf_xgb.joblib",
140
  "/home/user/app/stress_gf_xgb.joblib",
141
+ os.getenv("MODEL_PATH", "")
142
  ]
143
 
144
  def _load_model_or_error():
145
+ for p in [x for x in MODEL_CANDIDATES if x]:
146
  if os.path.exists(p):
147
  try:
148
  return joblib.load(p)
149
  except Exception as e:
150
+ print(f"[ModelLoad] Error from {p}: {e}")
151
+ traceback.print_exc()
152
+ return None
153
+ print("[ModelLoad] Model file not found. Upload stress_gf_xgb.joblib (or set MODEL_PATH).")
154
+ return None
155
+
156
+ def _canon_cat(v: Any) -> str:
157
+ """Stable, canonical category placeholder normalization."""
158
+ if v is None:
159
+ return CANON_NA
160
+ s = str(v).strip()
161
+ if s == "" or s.upper() in {"N/A", "NONE", "NULL"}:
162
+ return CANON_NA
163
+ return s
164
 
165
  def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
166
  row = {}
 
174
  row[col] = float(v)
175
  except Exception:
176
  row[col] = np.nan
177
+ elif col in CATEGORICAL_COLS:
178
+ row[col] = _canon_cat(v)
179
  else:
180
+ # non-numeric, non-categorical (free text) keep trimmed, but not empty
181
+ s = str(v).strip() if v is not None else ""
182
+ row[col] = s if s else CANON_NA
183
  return pd.DataFrame([row], columns=MAIN_VARIABLES)
184
 
185
  def _is_complete(form_dict: dict) -> bool:
186
+ for col in REQUIRED_FIELDS:
 
 
187
  v = form_dict.get(col, None)
188
  if col in NUMERIC_COLS:
189
  try:
 
191
  return False
192
  except Exception:
193
  return False
 
 
 
 
194
  else:
195
+ s = _canon_cat(v)
196
+ if s == CANON_NA:
197
  return False
198
  return True
199
 
200
+ def _align_columns_to_model(df: pd.DataFrame, mdl) -> pd.DataFrame:
201
+ """Align incoming dataframe columns to the model's expected feature order."""
202
+ try:
203
+ feat = getattr(mdl, "feature_names_in_", None)
204
+ if feat is not None and len(feat) > 0:
205
+ for c in feat:
206
+ if c not in df.columns:
207
+ df[c] = np.nan
208
+ df = df[list(feat)]
209
+ except Exception as e:
210
+ print(f"[Align] Could not align columns: {e}")
211
+ traceback.print_exc()
212
+ return df
213
+
214
def predict_fn(**kwargs):
    """Predict Stress GF (MPa^-1) from the form fields in *kwargs*.

    Always yields a plain float so the gr.Number output stays valid:
    0.0 is returned when required fields are incomplete, when the model
    cannot be loaded, or when prediction raises.
    """
    if not _is_complete(kwargs):
        return 0.0

    model = _load_model_or_error()
    if model is None:
        return 0.0

    features = _align_columns_to_model(_coerce_to_row(kwargs), model)
    try:
        raw = model.predict(features)
        # Models trained on log1p(target) carry a saved flag
        # (target_is_log1p_); undo the transform only when it is set.
        if getattr(model, "target_is_log1p_", False):
            values = np.expm1(raw)
        else:
            values = raw
        first = np.asarray(values).ravel()[0]
        # Clamp tiny negatives to zero and hand back a native float.
        return float(np.maximum(first, 0.0))
    except Exception as exc:
        print(f"[Predict] {exc}")
        traceback.print_exc()
        return 0.0
236
 
237
  EXAMPLE = {
238
  "Filler 1 Type": "CNT",
 
241
  "Filler 1 Length (mm)": 1.2,
242
  CF_COL: 0.5,
243
  "Filler 2 Type": "",
244
+ "Filler 2 Dimensionality": CANON_NA,
245
  "Filler 2 Diameter (µm)": None,
246
  "Filler 2 Length (mm)": None,
247
  "Specimen Volume (mm3)": 1000,
 
269
  if col in NUMERIC_COLS:
270
  cleared.append(None)
271
  elif col in {"Filler 1 Dimensionality", "Filler 2 Dimensionality"}:
272
+ cleared.append(CANON_NA)
273
  elif col == "Current Type":
274
+ cleared.append(CANON_NA)
275
  else:
276
  cleared.append("")
277
  return cleared
 
393
  emb = None
394
 
395
  joblib.dump(vectorizer, TFIDF_VECT_PATH)
396
+ joblib.dump(X_tfidF:=X_tfidf, TFIDF_MAT_PATH)
397
  if BM25Okapi is not None:
398
  joblib.dump(all_tokens, BM25_TOK_PATH)
399
  meta.to_parquet(RAG_META_PATH, index=False)
 
404
  st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
405
 
406
  def _extract_page(text_chunk: str) -> str:
407
+ m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
408
  return (m[-1].group(1) if m else "?")
409
 
410
  def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
 
433
 
434
  # BM25 scores
435
  if bm25 is not None:
436
+ q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\-\/\.%]+", query)]
437
  bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
438
  else:
439
  bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0
 
506
  for i in remain:
507
  sim_to_sel = max(sim_fn(i, j) for j in selected_idx) if selected_idx else 0.0
508
  score = lambda_div * rel[i] - (1 - lambda_div) * sim_to_sel
509
+ cand_scores.sort(reverse=True)
510
+ best_i = cand_scores[0][1]
511
+ selected.append(pool[best_i]); selected_idx.append(best_i); remain.remove(best_i)
 
512
  return selected
513
 
514
  def compose_extractive(selected: List[Dict[str, Any]]) -> str:
 
567
  srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
568
  coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
569
 
 
570
  if strict_quotes_only:
571
  if not selected:
572
  return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
 
692
  with gr.Row():
693
  with gr.Column(scale=7):
694
  with gr.Accordion("Primary conductive filler", open=True, elem_classes=["card"]):
695
+ # * marks on required inputs only
696
+ f1_type = gr.Textbox(label="Filler 1 Type *", placeholder="e.g., CNT, Graphite, Steel fiber")
697
+ f1_diam = gr.Number(label="Filler 1 Diameter (µm) *")
698
+ f1_len = gr.Number(label="Filler 1 Length (mm) *")
699
+ cf_conc = gr.Number(label=f"{CF_COL} *", info="Weight percent of total binder")
700
+ f1_dim = gr.Dropdown(DIM_CHOICES, value=CANON_NA, label="Filler 1 Dimensionality *")
701
 
702
  with gr.Accordion("Secondary filler (optional)", open=False, elem_classes=["card"]):
703
  f2_type = gr.Textbox(label="Filler 2 Type", placeholder="Optional")
704
  f2_diam = gr.Number(label="Filler 2 Diameter (µm)")
705
  f2_len = gr.Number(label="Filler 2 Length (mm)")
706
+ f2_dim = gr.Dropdown(DIM_CHOICES, value=CANON_NA, label="Filler 2 Dimensionality")
707
 
708
  with gr.Accordion("Mix design & specimen", open=False, elem_classes=["card"]):
709
+ spec_vol = gr.Number(label="Specimen Volume (mm3) *")
710
+ probe_cnt = gr.Number(label="Probe Count *")
711
+ probe_mat = gr.Textbox(label="Probe Material *", placeholder="e.g., Copper, Silver paste")
712
+ wb = gr.Number(label="W/B *")
713
+ sb = gr.Number(label="S/B *")
714
+ gauge_len = gr.Number(label="Gauge Length (mm) *")
715
+ curing = gr.Textbox(label="Curing Condition *", placeholder="e.g., 28d water, 20°C")
716
+ n_fillers = gr.Number(label="Number of Fillers *")
717
 
718
  with gr.Accordion("Processing", open=False, elem_classes=["card"]):
719
  dry_temp = gr.Number(label="Drying Temperature (°C)")
 
721
 
722
  with gr.Accordion("Mechanical & electrical loading", open=False, elem_classes=["card"]):
723
  load_rate = gr.Number(label="Loading Rate (MPa/s)")
724
+ E_mod = gr.Number(label="Modulus of Elasticity (GPa) *")
725
+ current = gr.Dropdown(CURRENT_CHOICES, value=CANON_NA, label="Current Type")
726
  voltage = gr.Number(label="Applied Voltage (V)")
727
 
728
  with gr.Column(scale=5):
 
736
  with gr.Accordion("About this model", open=False, elem_classes=["card"]):
737
  gr.Markdown(
738
  "- Pipeline: ColumnTransformer → (RobustScaler + OneHot) → XGBoost\n"
739
+ "- Target: Stress GF (MPa<sup>-1</sup>) on original scale (model may train on log1p; saved flag used at inference).\n"
740
  "- Missing values are safely imputed per-feature.\n"
741
  "- Trained columns:\n"
742
  f" `{', '.join(MAIN_VARIABLES)}`",