OmarOmar91 committed on
Commit
0043242
·
verified ·
1 Parent(s): 41f2c66

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -72
app.py CHANGED
@@ -1,9 +1,8 @@
1
  # ================================================================
2
  # Self-Sensing Concrete Assistant — Predictor (XGB) + Hybrid RAG
3
- # - Predictor tab: identical behavior to the "second code" (kept)
4
- # - Literature tab: from the "first code" (Hybrid RAG + MMR)
5
- # - UX: LLM ON by default (if key present), mutual-exclusion w/ Strict,
6
- # science-oriented styling, and prediction=0.0 if inputs incomplete
7
  # ================================================================
8
 
9
  # ---------------------- Runtime flags (HF-safe) ----------------------
@@ -44,10 +43,10 @@ try:
44
  except Exception:
45
  OpenAI = None
46
 
47
- # LLM availability flag enables default ON when available
48
  LLM_AVAILABLE = (OPENAI_API_KEY is not None and OPENAI_API_KEY.strip() != "" and OpenAI is not None)
49
 
50
- # ========================= Predictor (kept same as 2nd) =========================
51
  CF_COL = "Conductive Filler Conc. (wt%)"
52
  TARGET_COL = "Stress GF (MPa-1)"
53
 
@@ -106,7 +105,6 @@ CATEGORICAL_COLS = {
106
  "Current Type"
107
  }
108
 
109
- # Optional fields (allowed to be missing). All others required for prediction.
110
  OPTIONAL_FIELDS = {
111
  "Filler 2 Type",
112
  "Filler 2 Diameter (µm)",
@@ -150,12 +148,6 @@ def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
150
  return pd.DataFrame([row], columns=MAIN_VARIABLES)
151
 
152
  def _is_complete(form_dict: dict) -> bool:
153
- """
154
- Completeness rule:
155
- - All fields except OPTIONAL_FIELDS must be present.
156
- - For NUMERIC_COLS (except optional), require not-NaN.
157
- - For CATEGORICAL_COLS (except optional), require non-empty string (no 'NA').
158
- """
159
  for col in MAIN_VARIABLES:
160
  if col in OPTIONAL_FIELDS:
161
  continue
@@ -177,10 +169,8 @@ def _is_complete(form_dict: dict) -> bool:
177
  return True
178
 
179
  def predict_fn(**kwargs):
180
- # If incomplete, return 0.0 per requirement
181
  if not _is_complete(kwargs):
182
  return 0.0
183
-
184
  mdl = _load_model_or_error()
185
  if isinstance(mdl, str):
186
  return mdl
@@ -236,8 +226,7 @@ def _clear_all():
236
  cleared.append("")
237
  return cleared
238
 
239
- # ========================= Hybrid RAG (from 1st code) =========================
240
- # Configuration
241
  ARTIFACT_DIR = Path("rag_artifacts"); ARTIFACT_DIR.mkdir(exist_ok=True)
242
  TFIDF_VECT_PATH = ARTIFACT_DIR / "tfidf_vectorizer.joblib"
243
  TFIDF_MAT_PATH = ARTIFACT_DIR / "tfidf_matrix.joblib"
@@ -245,16 +234,13 @@ BM25_TOK_PATH = ARTIFACT_DIR / "bm25_tokens.joblib"
245
  EMB_NPY_PATH = ARTIFACT_DIR / "chunk_embeddings.npy"
246
  RAG_META_PATH = ARTIFACT_DIR / "chunks.parquet"
247
 
248
- # PDF source (HF-safe: rely on local /papers by default)
249
  LOCAL_PDF_DIR = Path("papers"); LOCAL_PDF_DIR.mkdir(exist_ok=True)
250
  USE_ONLINE_SOURCES = os.getenv("USE_ONLINE_SOURCES", "false").lower() == "true"
251
 
252
- # Retrieval weights
253
  W_TFIDF_DEFAULT = 0.50 if not USE_DENSE else 0.30
254
  W_BM25_DEFAULT = 0.50 if not USE_DENSE else 0.30
255
  W_EMB_DEFAULT = 0.00 if USE_DENSE is False else 0.40
256
 
257
- # Simple text processing
258
  _SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
259
  TOKEN_RE = re.compile(r"[A-Za-z0-9_#+\-/\.%]+")
260
  def sent_split(text: str) -> List[str]:
@@ -263,7 +249,6 @@ def sent_split(text: str) -> List[str]:
263
  def tokenize(text: str) -> List[str]:
264
  return [t.lower() for t in TOKEN_RE.findall(text)]
265
 
266
- # PDF text extraction
267
  def _extract_pdf_text(pdf_path: Path) -> str:
268
  try:
269
  import fitz
@@ -305,7 +290,6 @@ def _safe_init_st_model(name: str):
305
  USE_DENSE = False
306
  return None
307
 
308
- # Build or load index
309
  def build_or_load_hybrid(pdf_dir: Path):
310
  have_cache = (TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
311
  and RAG_META_PATH.exists()
@@ -358,13 +342,11 @@ def build_or_load_hybrid(pdf_dir: Path):
358
  print("Dense embedding failed:", e)
359
  emb = None
360
 
361
- # Save artifacts
362
  joblib.dump(vectorizer, TFIDF_VECT_PATH)
363
- joblib.dump(X_tfidf, TFIDF_MAT_PATH)
364
  if BM25Okapi is not None:
365
  joblib.dump(all_tokens, BM25_TOK_PATH)
366
  meta.to_parquet(RAG_META_PATH, index=False)
367
-
368
  return vectorizer, X_tfidf, meta, all_tokens, emb
369
 
370
  tfidf_vectorizer, tfidf_matrix, rag_meta, bm25_tokens, emb_matrix = build_or_load_hybrid(LOCAL_PDF_DIR)
@@ -372,7 +354,7 @@ bm25 = BM25Okapi(bm25_tokens) if (BM25Okapi is not None and bm25_tokens is not N
372
  st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
373
 
374
  def _extract_page(text_chunk: str) -> str:
375
- m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
376
  return (m[-1].group(1) if m else "?")
377
 
378
  def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
@@ -401,7 +383,7 @@ def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAUL
401
 
402
  # BM25 scores
403
  if bm25 is not None:
404
- q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\-/\.%]+", query)]
405
  bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
406
  else:
407
  bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0
@@ -536,6 +518,7 @@ def rag_reply(
536
  srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
537
  coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
538
 
 
539
  if strict_quotes_only:
540
  if not selected:
541
  return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
@@ -585,15 +568,6 @@ def rag_chat_fn(message, history, top_k, n_sentences, include_passages,
585
  except Exception as e:
586
  return f"RAG error: {e}"
587
 
588
- # ---------- Small helpers to keep checkboxes mutually exclusive ----------
589
- def _strict_on(strict, use_llm):
590
- # If strict is turned on, force LLM off; otherwise keep LLM as is.
591
- return False if strict else use_llm
592
-
593
- def _llm_on(use_llm, strict):
594
- # If LLM is turned on, force strict off; otherwise keep strict as is.
595
- return False if use_llm else strict
596
-
597
  # ========================= UI (science-oriented styling) =========================
598
  CSS = """
599
  /* Science-oriented: crisp contrast + readable numerics */
@@ -605,7 +579,7 @@ CSS = """
605
  label {color: #e8f7ff !important; text-shadow: 0 1px 0 rgba(0,0,0,0.35); cursor: pointer;}
606
  input[type="number"] {font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace;}
607
 
608
- /* Checkbox clickability fixes (themes can overlay labels) */
609
  input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !important; }
610
  .gr-checkbox label, .gr-check-radio label { pointer-events: auto !important; cursor: pointer; }
611
  #rag-tab input[type="checkbox"] { accent-color: #60a5fa !important; }
@@ -720,7 +694,6 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
720
  elem_classes=["prose"]
721
  )
722
 
723
- # Wire predictor buttons
724
  inputs_in_order = [
725
  f1_type, f1_diam, f1_len, cf_conc,
726
  f1_dim, f2_type, f2_diam, f2_len,
@@ -742,7 +715,7 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
742
  with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)", elem_id="rag-tab"):
743
  gr.Markdown(
744
  "Upload PDFs into the repository folder <code>papers/</code> then reload the Space. "
745
- "Answers cite (Doc.pdf, p.X). Toggle strict quotes or optional LLM paraphrasing."
746
  )
747
  with gr.Row():
748
  top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
@@ -754,43 +727,21 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
754
  w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
755
  w_emb = gr.Slider(0.0, 1.0, value=(0.0 if not USE_DENSE else 0.40), step=0.05, label="Dense weight (set 0 if disabled)")
756
 
757
- with gr.Accordion("LLM & Controls", open=False):
758
- strict_quotes_only = gr.Checkbox(
759
- value=False, label="Strict quotes only (no paraphrasing)", interactive=True
760
- )
761
- # LLM ON by default when available
762
- use_llm = gr.Checkbox(
763
- value=LLM_AVAILABLE, label="Use LLM to paraphrase selected sentences",
764
- interactive=LLM_AVAILABLE
765
- )
766
- model_name = gr.Textbox(
767
- value=os.getenv("OPENAI_MODEL", OPENAI_MODEL),
768
- label="LLM model", placeholder="e.g., gpt-5 or gpt-5-mini"
769
- )
770
- temperature = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="Temperature")
771
-
772
- # Availability banner
773
- llm_status = gr.Markdown(
774
- "**LLM status:** " + (
775
- "✅ Ready — paraphrasing enabled by default." if LLM_AVAILABLE
776
- else "⚠️ Disabled — set `OPENAI_API_KEY` to enable paraphrasing."
777
- )
778
- )
779
-
780
- # Make the two checkboxes mutually exclusive
781
- strict_quotes_only.change(
782
- _strict_on, inputs=[strict_quotes_only, use_llm], outputs=use_llm
783
- )
784
- use_llm.change(
785
- _llm_on, inputs=[use_llm, strict_quotes_only], outputs=strict_quotes_only
786
- )
787
 
788
  gr.ChatInterface(
789
  fn=rag_chat_fn,
790
- additional_inputs=[top_k, n_sentences, include_passages, use_llm, model_name,
791
- temperature, strict_quotes_only, w_tfidf, w_bm25, w_emb],
 
 
 
792
  title="Literature Q&A",
793
- description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations. Toggle strict/LLM modes."
794
  )
795
 
796
  # ------------- Launch -------------
 
1
  # ================================================================
2
  # Self-Sensing Concrete Assistant — Predictor (XGB) + Hybrid RAG
3
+ # - Predictor tab: identical behavior (kept)
4
+ # - Literature tab: Hybrid RAG; LLM runs silently when available
5
+ # - UX: no visible "LLM & Controls" window; prediction=0.0 if incomplete
 
6
  # ================================================================
7
 
8
  # ---------------------- Runtime flags (HF-safe) ----------------------
 
43
  except Exception:
44
  OpenAI = None
45
 
46
+ # LLM availability flag used internally; UI remains hidden
47
  LLM_AVAILABLE = (OPENAI_API_KEY is not None and OPENAI_API_KEY.strip() != "" and OpenAI is not None)
48
 
49
+ # ========================= Predictor (kept) =========================
50
  CF_COL = "Conductive Filler Conc. (wt%)"
51
  TARGET_COL = "Stress GF (MPa-1)"
52
 
 
105
  "Current Type"
106
  }
107
 
 
108
  OPTIONAL_FIELDS = {
109
  "Filler 2 Type",
110
  "Filler 2 Diameter (µm)",
 
148
  return pd.DataFrame([row], columns=MAIN_VARIABLES)
149
 
150
  def _is_complete(form_dict: dict) -> bool:
 
 
 
 
 
 
151
  for col in MAIN_VARIABLES:
152
  if col in OPTIONAL_FIELDS:
153
  continue
 
169
  return True
170
 
171
  def predict_fn(**kwargs):
 
172
  if not _is_complete(kwargs):
173
  return 0.0
 
174
  mdl = _load_model_or_error()
175
  if isinstance(mdl, str):
176
  return mdl
 
226
  cleared.append("")
227
  return cleared
228
 
229
+ # ========================= Hybrid RAG =========================
 
230
  ARTIFACT_DIR = Path("rag_artifacts"); ARTIFACT_DIR.mkdir(exist_ok=True)
231
  TFIDF_VECT_PATH = ARTIFACT_DIR / "tfidf_vectorizer.joblib"
232
  TFIDF_MAT_PATH = ARTIFACT_DIR / "tfidf_matrix.joblib"
 
234
  EMB_NPY_PATH = ARTIFACT_DIR / "chunk_embeddings.npy"
235
  RAG_META_PATH = ARTIFACT_DIR / "chunks.parquet"
236
 
 
237
  LOCAL_PDF_DIR = Path("papers"); LOCAL_PDF_DIR.mkdir(exist_ok=True)
238
  USE_ONLINE_SOURCES = os.getenv("USE_ONLINE_SOURCES", "false").lower() == "true"
239
 
 
240
  W_TFIDF_DEFAULT = 0.50 if not USE_DENSE else 0.30
241
  W_BM25_DEFAULT = 0.50 if not USE_DENSE else 0.30
242
  W_EMB_DEFAULT = 0.00 if USE_DENSE is False else 0.40
243
 
 
244
  _SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
245
  TOKEN_RE = re.compile(r"[A-Za-z0-9_#+\-/\.%]+")
246
  def sent_split(text: str) -> List[str]:
 
249
  def tokenize(text: str) -> List[str]:
250
  return [t.lower() for t in TOKEN_RE.findall(text)]
251
 
 
252
  def _extract_pdf_text(pdf_path: Path) -> str:
253
  try:
254
  import fitz
 
290
  USE_DENSE = False
291
  return None
292
 
 
293
  def build_or_load_hybrid(pdf_dir: Path):
294
  have_cache = (TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
295
  and RAG_META_PATH.exists()
 
342
  print("Dense embedding failed:", e)
343
  emb = None
344
 
 
345
  joblib.dump(vectorizer, TFIDF_VECT_PATH)
346
+ joblib.dump(X_tfidf, TFIDF_MAT_PATH)
347
  if BM25Okapi is not None:
348
  joblib.dump(all_tokens, BM25_TOK_PATH)
349
  meta.to_parquet(RAG_META_PATH, index=False)
 
350
  return vectorizer, X_tfidf, meta, all_tokens, emb
351
 
352
  tfidf_vectorizer, tfidf_matrix, rag_meta, bm25_tokens, emb_matrix = build_or_load_hybrid(LOCAL_PDF_DIR)
 
354
  st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
355
 
356
  def _extract_page(text_chunk: str) -> str:
357
+ m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
358
  return (m[-1].group(1) if m else "?")
359
 
360
  def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
 
383
 
384
  # BM25 scores
385
  if bm25 is not None:
386
+ q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\-/\.%]+", query)]
387
  bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
388
  else:
389
  bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0
 
518
  srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
519
  coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
520
 
521
+ # Hidden policy: if strict==True → no paraphrasing; else try LLM if available
522
  if strict_quotes_only:
523
  if not selected:
524
  return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
 
568
  except Exception as e:
569
  return f"RAG error: {e}"
570
 
 
 
 
 
 
 
 
 
 
571
  # ========================= UI (science-oriented styling) =========================
572
  CSS = """
573
  /* Science-oriented: crisp contrast + readable numerics */
 
579
  label {color: #e8f7ff !important; text-shadow: 0 1px 0 rgba(0,0,0,0.35); cursor: pointer;}
580
  input[type="number"] {font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace;}
581
 
582
+ /* Checkbox clickability fixes */
583
  input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !important; }
584
  .gr-checkbox label, .gr-check-radio label { pointer-events: auto !important; cursor: pointer; }
585
  #rag-tab input[type="checkbox"] { accent-color: #60a5fa !important; }
 
694
  elem_classes=["prose"]
695
  )
696
 
 
697
  inputs_in_order = [
698
  f1_type, f1_diam, f1_len, cf_conc,
699
  f1_dim, f2_type, f2_diam, f2_len,
 
715
  with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)", elem_id="rag-tab"):
716
  gr.Markdown(
717
  "Upload PDFs into the repository folder <code>papers/</code> then reload the Space. "
718
+ "Answers cite (Doc.pdf, p.X)."
719
  )
720
  with gr.Row():
721
  top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
 
727
  w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
728
  w_emb = gr.Slider(0.0, 1.0, value=(0.0 if not USE_DENSE else 0.40), step=0.05, label="Dense weight (set 0 if disabled)")
729
 
730
+ # ---- Hidden states for LLM behavior (no visible controls) ----
731
+ state_use_llm = gr.State(LLM_AVAILABLE) # True when key present; else False
732
+ state_model_name = gr.State(os.getenv("OPENAI_MODEL", OPENAI_MODEL))
733
+ state_temperature = gr.State(0.2)
734
+ state_strict = gr.State(False) # hidden: default to not-strict
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
735
 
736
  gr.ChatInterface(
737
  fn=rag_chat_fn,
738
+ additional_inputs=[
739
+ top_k, n_sentences, include_passages,
740
+ state_use_llm, state_model_name, state_temperature, state_strict,
741
+ w_tfidf, w_bm25, w_emb
742
+ ],
743
  title="Literature Q&A",
744
+ description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations."
745
  )
746
 
747
  # ------------- Launch -------------