Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
# ================================================================
|
| 2 |
# Self-Sensing Concrete Assistant — Predictor (XGB) + Hybrid RAG
|
| 3 |
-
# - Predictor tab: identical behavior
|
| 4 |
-
# - Literature tab:
|
| 5 |
-
# -
|
| 6 |
# ================================================================
|
| 7 |
|
| 8 |
# ---------------------- Runtime flags (HF-safe) ----------------------
|
|
@@ -43,7 +43,10 @@ try:
|
|
| 43 |
except Exception:
|
| 44 |
OpenAI = None
|
| 45 |
|
| 46 |
-
#
|
|
|
|
|
|
|
|
|
|
| 47 |
CF_COL = "Conductive Filler Conc. (wt%)"
|
| 48 |
TARGET_COL = "Stress GF (MPa-1)"
|
| 49 |
|
|
@@ -102,6 +105,13 @@ CATEGORICAL_COLS = {
|
|
| 102 |
"Current Type"
|
| 103 |
}
|
| 104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
DIM_CHOICES = ["0D", "1D", "2D", "3D", "NA"]
|
| 106 |
CURRENT_CHOICES = ["DC", "AC", "NA"]
|
| 107 |
|
|
@@ -137,7 +147,30 @@ def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
|
|
| 137 |
row[col] = "" if v in (None, "NA") else str(v).strip()
|
| 138 |
return pd.DataFrame([row], columns=MAIN_VARIABLES)
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
def predict_fn(**kwargs):
|
|
|
|
|
|
|
| 141 |
mdl = _load_model_or_error()
|
| 142 |
if isinstance(mdl, str):
|
| 143 |
return mdl
|
|
@@ -193,8 +226,7 @@ def _clear_all():
|
|
| 193 |
cleared.append("")
|
| 194 |
return cleared
|
| 195 |
|
| 196 |
-
# ========================= Hybrid RAG
|
| 197 |
-
# Configuration
|
| 198 |
ARTIFACT_DIR = Path("rag_artifacts"); ARTIFACT_DIR.mkdir(exist_ok=True)
|
| 199 |
TFIDF_VECT_PATH = ARTIFACT_DIR / "tfidf_vectorizer.joblib"
|
| 200 |
TFIDF_MAT_PATH = ARTIFACT_DIR / "tfidf_matrix.joblib"
|
|
@@ -202,16 +234,13 @@ BM25_TOK_PATH = ARTIFACT_DIR / "bm25_tokens.joblib"
|
|
| 202 |
EMB_NPY_PATH = ARTIFACT_DIR / "chunk_embeddings.npy"
|
| 203 |
RAG_META_PATH = ARTIFACT_DIR / "chunks.parquet"
|
| 204 |
|
| 205 |
-
|
| 206 |
-
LOCAL_PDF_DIR = Path("./literature_pdfs"); LOCAL_PDF_DIR.mkdir(exist_ok=True)
|
| 207 |
USE_ONLINE_SOURCES = os.getenv("USE_ONLINE_SOURCES", "false").lower() == "true"
|
| 208 |
|
| 209 |
-
# Retrieval weights
|
| 210 |
W_TFIDF_DEFAULT = 0.50 if not USE_DENSE else 0.30
|
| 211 |
W_BM25_DEFAULT = 0.50 if not USE_DENSE else 0.30
|
| 212 |
-
W_EMB_DEFAULT = 0.00 if
|
| 213 |
|
| 214 |
-
# Simple text processing
|
| 215 |
_SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
|
| 216 |
TOKEN_RE = re.compile(r"[A-Za-z0-9_#+\-/\.%]+")
|
| 217 |
def sent_split(text: str) -> List[str]:
|
|
@@ -220,7 +249,6 @@ def sent_split(text: str) -> List[str]:
|
|
| 220 |
def tokenize(text: str) -> List[str]:
|
| 221 |
return [t.lower() for t in TOKEN_RE.findall(text)]
|
| 222 |
|
| 223 |
-
# PDF text extraction (PyMuPDF preferred; pypdf fallback)
|
| 224 |
def _extract_pdf_text(pdf_path: Path) -> str:
|
| 225 |
try:
|
| 226 |
import fitz
|
|
@@ -262,7 +290,6 @@ def _safe_init_st_model(name: str):
|
|
| 262 |
USE_DENSE = False
|
| 263 |
return None
|
| 264 |
|
| 265 |
-
# Build or load index
|
| 266 |
def build_or_load_hybrid(pdf_dir: Path):
|
| 267 |
have_cache = (TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
|
| 268 |
and RAG_META_PATH.exists()
|
|
@@ -287,13 +314,11 @@ def build_or_load_hybrid(pdf_dir: Path):
|
|
| 287 |
rows.append({"doc_path": str(pdf), "chunk_id": i, "text": ch})
|
| 288 |
all_tokens.append(tokenize(ch))
|
| 289 |
if not rows:
|
| 290 |
-
# create empty stub to avoid crashes; UI will message user to upload PDFs
|
| 291 |
meta = pd.DataFrame(columns=["doc_path", "chunk_id", "text"])
|
| 292 |
vectorizer = None; X_tfidf = None; emb = None; all_tokens = None
|
| 293 |
return vectorizer, X_tfidf, meta, all_tokens, emb
|
| 294 |
|
| 295 |
meta = pd.DataFrame(rows)
|
| 296 |
-
|
| 297 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 298 |
vectorizer = TfidfVectorizer(
|
| 299 |
ngram_range=(1,2),
|
|
@@ -317,13 +342,11 @@ def build_or_load_hybrid(pdf_dir: Path):
|
|
| 317 |
print("Dense embedding failed:", e)
|
| 318 |
emb = None
|
| 319 |
|
| 320 |
-
# Save artifacts
|
| 321 |
joblib.dump(vectorizer, TFIDF_VECT_PATH)
|
| 322 |
-
joblib.dump(X_tfidf, TFIDF_MAT_PATH)
|
| 323 |
if BM25Okapi is not None:
|
| 324 |
joblib.dump(all_tokens, BM25_TOK_PATH)
|
| 325 |
meta.to_parquet(RAG_META_PATH, index=False)
|
| 326 |
-
|
| 327 |
return vectorizer, X_tfidf, meta, all_tokens, emb
|
| 328 |
|
| 329 |
tfidf_vectorizer, tfidf_matrix, rag_meta, bm25_tokens, emb_matrix = build_or_load_hybrid(LOCAL_PDF_DIR)
|
|
@@ -331,7 +354,7 @@ bm25 = BM25Okapi(bm25_tokens) if (BM25Okapi is not None and bm25_tokens is not N
|
|
| 331 |
st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
|
| 332 |
|
| 333 |
def _extract_page(text_chunk: str) -> str:
|
| 334 |
-
m = list(re.finditer(r"
|
| 335 |
return (m[-1].group(1) if m else "?")
|
| 336 |
|
| 337 |
def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
|
|
@@ -360,7 +383,7 @@ def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAUL
|
|
| 360 |
|
| 361 |
# BM25 scores
|
| 362 |
if bm25 is not None:
|
| 363 |
-
q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_
|
| 364 |
bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
|
| 365 |
else:
|
| 366 |
bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0
|
|
@@ -403,7 +426,6 @@ def mmr_select_sentences(question: str, hits: pd.DataFrame, top_n=4, pool_per_ch
|
|
| 403 |
|
| 404 |
sent_texts = [p["sent"] for p in pool]
|
| 405 |
|
| 406 |
-
# Embedding-based relevance if available, else TF-IDF
|
| 407 |
use_dense = USE_DENSE and st_query_model is not None
|
| 408 |
if use_dense:
|
| 409 |
try:
|
|
@@ -446,7 +468,7 @@ def compose_extractive(selected: List[Dict[str, Any]]) -> str:
|
|
| 446 |
return " ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
|
| 447 |
|
| 448 |
def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2) -> str:
|
| 449 |
-
if
|
| 450 |
return None
|
| 451 |
client = OpenAI(api_key=OPENAI_API_KEY)
|
| 452 |
model = model or OPENAI_MODEL
|
|
@@ -496,6 +518,7 @@ def rag_reply(
|
|
| 496 |
srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
|
| 497 |
coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
|
| 498 |
|
|
|
|
| 499 |
if strict_quotes_only:
|
| 500 |
if not selected:
|
| 501 |
return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
|
|
@@ -545,24 +568,60 @@ def rag_chat_fn(message, history, top_k, n_sentences, include_passages,
|
|
| 545 |
except Exception as e:
|
| 546 |
return f"RAG error: {e}"
|
| 547 |
|
| 548 |
-
# ========================= UI (
|
| 549 |
CSS = """
|
| 550 |
-
/*
|
|
|
|
| 551 |
.gradio-container {
|
| 552 |
-
background: linear-gradient(135deg, #
|
| 553 |
}
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
"""
|
| 558 |
|
| 559 |
theme = gr.themes.Soft(
|
| 560 |
primary_hue="blue",
|
| 561 |
neutral_hue="green"
|
| 562 |
).set(
|
| 563 |
-
body_background_fill="#
|
| 564 |
body_text_color="#e0f2fe",
|
| 565 |
-
input_background_fill="#
|
| 566 |
input_border_color="#1e40af",
|
| 567 |
button_primary_background_fill="#2563eb",
|
| 568 |
button_primary_text_color="#ffffff",
|
|
@@ -574,9 +633,8 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
|
|
| 574 |
gr.Markdown(
|
| 575 |
"<h1 style='margin:0'>Self-Sensing Concrete Assistant</h1>"
|
| 576 |
"<p style='opacity:.9'>"
|
| 577 |
-
"Left
|
| 578 |
-
"Right
|
| 579 |
-
"Upload PDFs into <code>papers/</code> in your Space repo."
|
| 580 |
"</p>"
|
| 581 |
)
|
| 582 |
|
|
@@ -620,7 +678,7 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
|
|
| 620 |
|
| 621 |
with gr.Column(scale=5):
|
| 622 |
with gr.Group(elem_classes=["card"]):
|
| 623 |
-
out_pred = gr.Number(label="Predicted Stress GF (MPa-1)", precision=6)
|
| 624 |
with gr.Row():
|
| 625 |
btn_pred = gr.Button("Predict", variant="primary")
|
| 626 |
btn_clear = gr.Button("Clear")
|
|
@@ -628,14 +686,14 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
|
|
| 628 |
|
| 629 |
with gr.Accordion("About this model", open=False, elem_classes=["card"]):
|
| 630 |
gr.Markdown(
|
| 631 |
-
"- Pipeline: ColumnTransformer
|
| 632 |
-
"- Target: Stress GF (MPa
|
| 633 |
"- Missing values are safely imputed per-feature.\n"
|
| 634 |
"- Trained columns:\n"
|
| 635 |
-
f" `{', '.join(MAIN_VARIABLES)}`"
|
|
|
|
| 636 |
)
|
| 637 |
|
| 638 |
-
# Wire predictor buttons
|
| 639 |
inputs_in_order = [
|
| 640 |
f1_type, f1_diam, f1_len, cf_conc,
|
| 641 |
f1_dim, f2_type, f2_diam, f2_len,
|
|
@@ -650,38 +708,42 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
|
|
| 650 |
return predict_fn(**data)
|
| 651 |
|
| 652 |
btn_pred.click(_predict_wrapper, inputs=inputs_in_order, outputs=out_pred)
|
| 653 |
-
btn_clear.click(lambda: _clear_all(), inputs=None, outputs=inputs_in_order)
|
| 654 |
btn_demo.click(lambda: _fill_example(), inputs=None, outputs=inputs_in_order)
|
| 655 |
|
| 656 |
# ------------------------- Literature Tab -------------------------
|
| 657 |
-
with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)"):
|
| 658 |
gr.Markdown(
|
| 659 |
"Upload PDFs into the repository folder <code>papers/</code> then reload the Space. "
|
| 660 |
-
"Answers cite (Doc.pdf, p.X).
|
| 661 |
)
|
| 662 |
with gr.Row():
|
| 663 |
top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
|
| 664 |
n_sentences = gr.Slider(2, 6, value=4, step=1, label="Answer length (sentences)")
|
| 665 |
-
include_passages = gr.Checkbox(value=False, label="Include supporting passages")
|
|
|
|
| 666 |
with gr.Accordion("Retriever weights (advanced)", open=False):
|
| 667 |
w_tfidf = gr.Slider(0.0, 1.0, value=W_TFIDF_DEFAULT, step=0.05, label="TF-IDF weight")
|
| 668 |
w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
|
| 669 |
-
w_emb = gr.Slider(0.0, 1.0, value=
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
|
|
|
| 676 |
gr.ChatInterface(
|
| 677 |
fn=rag_chat_fn,
|
| 678 |
-
additional_inputs=[
|
| 679 |
-
|
|
|
|
|
|
|
|
|
|
| 680 |
title="Literature Q&A",
|
| 681 |
-
description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations.
|
| 682 |
)
|
| 683 |
|
| 684 |
# ------------- Launch -------------
|
| 685 |
if __name__ == "__main__":
|
| 686 |
-
# queue() helps HF Spaces with concurrency; show_error suggests upload PDFs if none
|
| 687 |
demo.queue().launch()
|
|
|
|
| 1 |
# ================================================================
|
| 2 |
# Self-Sensing Concrete Assistant — Predictor (XGB) + Hybrid RAG
|
| 3 |
+
# - Predictor tab: identical behavior (kept)
|
| 4 |
+
# - Literature tab: Hybrid RAG; LLM runs silently when available
|
| 5 |
+
# - UX: no visible "LLM & Controls" window; prediction=0.0 if incomplete
|
| 6 |
# ================================================================
|
| 7 |
|
| 8 |
# ---------------------- Runtime flags (HF-safe) ----------------------
|
|
|
|
| 43 |
except Exception:
|
| 44 |
OpenAI = None
|
| 45 |
|
| 46 |
+
# LLM availability flag — used internally; UI remains hidden
|
| 47 |
+
LLM_AVAILABLE = (OPENAI_API_KEY is not None and OPENAI_API_KEY.strip() != "" and OpenAI is not None)
|
| 48 |
+
|
| 49 |
+
# ========================= Predictor (kept) =========================
|
| 50 |
CF_COL = "Conductive Filler Conc. (wt%)"
|
| 51 |
TARGET_COL = "Stress GF (MPa-1)"
|
| 52 |
|
|
|
|
| 105 |
"Current Type"
|
| 106 |
}
|
| 107 |
|
| 108 |
+
OPTIONAL_FIELDS = {
|
| 109 |
+
"Filler 2 Type",
|
| 110 |
+
"Filler 2 Diameter (µm)",
|
| 111 |
+
"Filler 2 Length (mm)",
|
| 112 |
+
"Filler 2 Dimensionality",
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
DIM_CHOICES = ["0D", "1D", "2D", "3D", "NA"]
|
| 116 |
CURRENT_CHOICES = ["DC", "AC", "NA"]
|
| 117 |
|
|
|
|
| 147 |
row[col] = "" if v in (None, "NA") else str(v).strip()
|
| 148 |
return pd.DataFrame([row], columns=MAIN_VARIABLES)
|
| 149 |
|
| 150 |
+
def _is_complete(form_dict: dict) -> bool:
    """Report whether every required predictor field has a usable value.

    Columns listed in ``OPTIONAL_FIELDS`` are ignored. For each remaining
    column in ``MAIN_VARIABLES`` the value is looked up in ``form_dict``
    (a missing key counts as ``None``) and checked by column kind:

    * numeric columns: ``""``, ``None`` and float NaN count as missing;
      if the check itself raises, the field is treated as missing too.
    * categorical columns: ``None``, ``"NA"`` and blank/whitespace-only
      text count as missing.
    * any other column: ``None`` and blank/whitespace-only text count as
      missing.

    Args:
        form_dict: Mapping of column name to the raw value from the form.

    Returns:
        ``True`` when no required field is missing, ``False`` otherwise.
    """
    required = (name for name in MAIN_VARIABLES if name not in OPTIONAL_FIELDS)
    for name in required:
        value = form_dict.get(name)

        if name in NUMERIC_COLS:
            try:
                empty = value in ("", None) or (
                    isinstance(value, float) and np.isnan(value)
                )
            except Exception:
                # A value that cannot even be compared is not usable input.
                return False
            if empty:
                return False
            continue

        if name in CATEGORICAL_COLS:
            # "NA" is the dropdown placeholder and means "not provided".
            text = "" if value in (None, "NA") else str(value).strip()
        else:
            text = "" if value is None else str(value).strip()

        if not text:
            return False

    return True
|
| 170 |
+
|
| 171 |
def predict_fn(**kwargs):
|
| 172 |
+
if not _is_complete(kwargs):
|
| 173 |
+
return 0.0
|
| 174 |
mdl = _load_model_or_error()
|
| 175 |
if isinstance(mdl, str):
|
| 176 |
return mdl
|
|
|
|
| 226 |
cleared.append("")
|
| 227 |
return cleared
|
| 228 |
|
| 229 |
+
# ========================= Hybrid RAG =========================
|
|
|
|
| 230 |
ARTIFACT_DIR = Path("rag_artifacts"); ARTIFACT_DIR.mkdir(exist_ok=True)
|
| 231 |
TFIDF_VECT_PATH = ARTIFACT_DIR / "tfidf_vectorizer.joblib"
|
| 232 |
TFIDF_MAT_PATH = ARTIFACT_DIR / "tfidf_matrix.joblib"
|
|
|
|
| 234 |
EMB_NPY_PATH = ARTIFACT_DIR / "chunk_embeddings.npy"
|
| 235 |
RAG_META_PATH = ARTIFACT_DIR / "chunks.parquet"
|
| 236 |
|
| 237 |
+
LOCAL_PDF_DIR = Path("papers"); LOCAL_PDF_DIR.mkdir(exist_ok=True)
|
|
|
|
| 238 |
USE_ONLINE_SOURCES = os.getenv("USE_ONLINE_SOURCES", "false").lower() == "true"
|
| 239 |
|
|
|
|
| 240 |
W_TFIDF_DEFAULT = 0.50 if not USE_DENSE else 0.30
|
| 241 |
W_BM25_DEFAULT = 0.50 if not USE_DENSE else 0.30
|
| 242 |
+
W_EMB_DEFAULT = 0.00 if USE_DENSE is False else 0.40
|
| 243 |
|
|
|
|
| 244 |
_SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
|
| 245 |
TOKEN_RE = re.compile(r"[A-Za-z0-9_#+\-/\.%]+")
|
| 246 |
def sent_split(text: str) -> List[str]:
|
|
|
|
| 249 |
def tokenize(text: str) -> List[str]:
    """Split ``text`` into lowercased tokens.

    A token is any maximal run of characters accepted by the module-level
    ``TOKEN_RE`` pattern; tokens are returned in order of appearance.
    """
    return [match.group(0).lower() for match in TOKEN_RE.finditer(text)]
|
| 251 |
|
|
|
|
| 252 |
def _extract_pdf_text(pdf_path: Path) -> str:
|
| 253 |
try:
|
| 254 |
import fitz
|
|
|
|
| 290 |
USE_DENSE = False
|
| 291 |
return None
|
| 292 |
|
|
|
|
| 293 |
def build_or_load_hybrid(pdf_dir: Path):
|
| 294 |
have_cache = (TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
|
| 295 |
and RAG_META_PATH.exists()
|
|
|
|
| 314 |
rows.append({"doc_path": str(pdf), "chunk_id": i, "text": ch})
|
| 315 |
all_tokens.append(tokenize(ch))
|
| 316 |
if not rows:
|
|
|
|
| 317 |
meta = pd.DataFrame(columns=["doc_path", "chunk_id", "text"])
|
| 318 |
vectorizer = None; X_tfidf = None; emb = None; all_tokens = None
|
| 319 |
return vectorizer, X_tfidf, meta, all_tokens, emb
|
| 320 |
|
| 321 |
meta = pd.DataFrame(rows)
|
|
|
|
| 322 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 323 |
vectorizer = TfidfVectorizer(
|
| 324 |
ngram_range=(1,2),
|
|
|
|
| 342 |
print("Dense embedding failed:", e)
|
| 343 |
emb = None
|
| 344 |
|
|
|
|
| 345 |
joblib.dump(vectorizer, TFIDF_VECT_PATH)
|
| 346 |
+
joblib.dump(X_tfidF:=X_tfidf, TFIDF_MAT_PATH) # assign + save
|
| 347 |
if BM25Okapi is not None:
|
| 348 |
joblib.dump(all_tokens, BM25_TOK_PATH)
|
| 349 |
meta.to_parquet(RAG_META_PATH, index=False)
|
|
|
|
| 350 |
return vectorizer, X_tfidf, meta, all_tokens, emb
|
| 351 |
|
| 352 |
tfidf_vectorizer, tfidf_matrix, rag_meta, bm25_tokens, emb_matrix = build_or_load_hybrid(LOCAL_PDF_DIR)
|
|
|
|
| 354 |
st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
|
| 355 |
|
| 356 |
def _extract_page(text_chunk: str) -> str:
|
| 357 |
+
m = list(re.finditer(r"\\[\\[PAGE=(\\d+)\\]\\]", text_chunk or ""))
|
| 358 |
return (m[-1].group(1) if m else "?")
|
| 359 |
|
| 360 |
def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
|
|
|
|
| 383 |
|
| 384 |
# BM25 scores
|
| 385 |
if bm25 is not None:
|
| 386 |
+
q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\\-\\/\\.%%]+", query)]
|
| 387 |
bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
|
| 388 |
else:
|
| 389 |
bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0
|
|
|
|
| 426 |
|
| 427 |
sent_texts = [p["sent"] for p in pool]
|
| 428 |
|
|
|
|
| 429 |
use_dense = USE_DENSE and st_query_model is not None
|
| 430 |
if use_dense:
|
| 431 |
try:
|
|
|
|
| 468 |
return " ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
|
| 469 |
|
| 470 |
def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2) -> str:
|
| 471 |
+
if not LLM_AVAILABLE:
|
| 472 |
return None
|
| 473 |
client = OpenAI(api_key=OPENAI_API_KEY)
|
| 474 |
model = model or OPENAI_MODEL
|
|
|
|
| 518 |
srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
|
| 519 |
coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
|
| 520 |
|
| 521 |
+
# Hidden policy: if strict==True → no paraphrasing; else try LLM if available
|
| 522 |
if strict_quotes_only:
|
| 523 |
if not selected:
|
| 524 |
return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
|
|
|
|
| 568 |
except Exception as e:
|
| 569 |
return f"RAG error: {e}"
|
| 570 |
|
| 571 |
+
# ========================= UI (science-oriented styling) =========================
|
| 572 |
CSS = """
|
| 573 |
+
/* Science-oriented: crisp contrast + readable numerics */
|
| 574 |
+
* {font-family: ui-sans-serif, system-ui, -apple-system, 'Segoe UI', Roboto, 'Helvetica Neue', Arial;}
|
| 575 |
.gradio-container {
|
| 576 |
+
background: linear-gradient(135deg, #0b1020 0%, #0c2b1a 60%, #0a2b4d 100%) !important;
|
| 577 |
}
|
| 578 |
+
.card {background: rgba(255,255,255,0.06) !important; border: 1px solid rgba(255,255,255,0.14); border-radius: 12px;}
|
| 579 |
+
label {color: #e8f7ff !important; text-shadow: 0 1px 0 rgba(0,0,0,0.35); cursor: pointer;}
|
| 580 |
+
input[type="number"] {font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace;}
|
| 581 |
+
|
| 582 |
+
/* Checkbox clickability fixes */
|
| 583 |
+
input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !important; }
|
| 584 |
+
.gr-checkbox label, .gr-check-radio label { pointer-events: auto !important; cursor: pointer; }
|
| 585 |
+
#rag-tab input[type="checkbox"] { accent-color: #60a5fa !important; }
|
| 586 |
+
|
| 587 |
+
/* RAG tab background and elements */
|
| 588 |
+
#rag-tab .block, #rag-tab .group, #rag-tab .accordion {
|
| 589 |
+
background: linear-gradient(160deg, #1f2937 0%, #14532d 55%, #0b3b68 100%) !important;
|
| 590 |
+
border-radius: 12px;
|
| 591 |
+
border: 1px solid rgba(255,255,255,0.14);
|
| 592 |
+
}
|
| 593 |
+
#rag-tab input, #rag-tab textarea, #rag-tab select, #rag-tab .scroll-hide, #rag-tab .chatbot textarea {
|
| 594 |
+
background: rgba(17, 24, 39, 0.85) !important;
|
| 595 |
+
border: 1px solid #60a5fa !important;
|
| 596 |
+
color: #e5f2ff !important;
|
| 597 |
+
}
|
| 598 |
+
#rag-tab input[type="range"] { accent-color: #22c55e !important; }
|
| 599 |
+
#rag-tab button { border-radius: 10px !important; font-weight: 600 !important; }
|
| 600 |
+
#rag-tab .chatbot {
|
| 601 |
+
background: rgba(15, 23, 42, 0.6) !important;
|
| 602 |
+
border: 1px solid rgba(148, 163, 184, 0.35) !important;
|
| 603 |
+
}
|
| 604 |
+
#rag-tab .message.user {
|
| 605 |
+
background: rgba(34, 197, 94, 0.15) !important;
|
| 606 |
+
border-left: 3px solid #22c55e !important;
|
| 607 |
+
}
|
| 608 |
+
#rag-tab .message.bot {
|
| 609 |
+
background: rgba(59, 130, 246, 0.15) !important;
|
| 610 |
+
border-left: 3px solid #60a5fa !important;
|
| 611 |
+
color: #eef6ff !important;
|
| 612 |
+
}
|
| 613 |
+
|
| 614 |
+
/* Predictor output emphasis */
|
| 615 |
+
#pred-out .wrap { font-size: 20px; font-weight: 700; color: #ecfdf5; }
|
| 616 |
"""
|
| 617 |
|
| 618 |
theme = gr.themes.Soft(
|
| 619 |
primary_hue="blue",
|
| 620 |
neutral_hue="green"
|
| 621 |
).set(
|
| 622 |
+
body_background_fill="#0b1020",
|
| 623 |
body_text_color="#e0f2fe",
|
| 624 |
+
input_background_fill="#0f172a",
|
| 625 |
input_border_color="#1e40af",
|
| 626 |
button_primary_background_fill="#2563eb",
|
| 627 |
button_primary_text_color="#ffffff",
|
|
|
|
| 633 |
gr.Markdown(
|
| 634 |
"<h1 style='margin:0'>Self-Sensing Concrete Assistant</h1>"
|
| 635 |
"<p style='opacity:.9'>"
|
| 636 |
+
"Left: ML prediction for Stress Gauge Factor (original scale, MPa<sup>-1</sup>). "
|
| 637 |
+
"Right: Literature Q&A via Hybrid RAG (BM25 + TF-IDF + optional dense) with MMR sentence selection."
|
|
|
|
| 638 |
"</p>"
|
| 639 |
)
|
| 640 |
|
|
|
|
| 678 |
|
| 679 |
with gr.Column(scale=5):
|
| 680 |
with gr.Group(elem_classes=["card"]):
|
| 681 |
+
out_pred = gr.Number(label="Predicted Stress GF (MPa-1)", value=0.0, precision=6, elem_id="pred-out")
|
| 682 |
with gr.Row():
|
| 683 |
btn_pred = gr.Button("Predict", variant="primary")
|
| 684 |
btn_clear = gr.Button("Clear")
|
|
|
|
| 686 |
|
| 687 |
with gr.Accordion("About this model", open=False, elem_classes=["card"]):
|
| 688 |
gr.Markdown(
|
| 689 |
+
"- Pipeline: ColumnTransformer → (RobustScaler + OneHot) → XGBoost\n"
|
| 690 |
+
"- Target: Stress GF (MPa<sup>-1</sup>) on original scale (model trains on log1p).\n"
|
| 691 |
"- Missing values are safely imputed per-feature.\n"
|
| 692 |
"- Trained columns:\n"
|
| 693 |
+
f" `{', '.join(MAIN_VARIABLES)}`",
|
| 694 |
+
elem_classes=["prose"]
|
| 695 |
)
|
| 696 |
|
|
|
|
| 697 |
inputs_in_order = [
|
| 698 |
f1_type, f1_diam, f1_len, cf_conc,
|
| 699 |
f1_dim, f2_type, f2_diam, f2_len,
|
|
|
|
| 708 |
return predict_fn(**data)
|
| 709 |
|
| 710 |
btn_pred.click(_predict_wrapper, inputs=inputs_in_order, outputs=out_pred)
|
| 711 |
+
btn_clear.click(lambda: _clear_all(), inputs=None, outputs=inputs_in_order).then(lambda: 0.0, outputs=out_pred)
|
| 712 |
btn_demo.click(lambda: _fill_example(), inputs=None, outputs=inputs_in_order)
|
| 713 |
|
| 714 |
# ------------------------- Literature Tab -------------------------
|
| 715 |
+
with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)", elem_id="rag-tab"):
|
| 716 |
gr.Markdown(
|
| 717 |
"Upload PDFs into the repository folder <code>papers/</code> then reload the Space. "
|
| 718 |
+
"Answers cite (Doc.pdf, p.X)."
|
| 719 |
)
|
| 720 |
with gr.Row():
|
| 721 |
top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
|
| 722 |
n_sentences = gr.Slider(2, 6, value=4, step=1, label="Answer length (sentences)")
|
| 723 |
+
include_passages = gr.Checkbox(value=False, label="Include supporting passages", interactive=True)
|
| 724 |
+
|
| 725 |
with gr.Accordion("Retriever weights (advanced)", open=False):
|
| 726 |
w_tfidf = gr.Slider(0.0, 1.0, value=W_TFIDF_DEFAULT, step=0.05, label="TF-IDF weight")
|
| 727 |
w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
|
| 728 |
+
w_emb = gr.Slider(0.0, 1.0, value=(0.0 if not USE_DENSE else 0.40), step=0.05, label="Dense weight (set 0 if disabled)")
|
| 729 |
+
|
| 730 |
+
# ---- Hidden states for LLM behavior (no visible controls) ----
|
| 731 |
+
state_use_llm = gr.State(LLM_AVAILABLE) # True when key present; else False
|
| 732 |
+
state_model_name = gr.State(os.getenv("OPENAI_MODEL", OPENAI_MODEL))
|
| 733 |
+
state_temperature = gr.State(0.2)
|
| 734 |
+
state_strict = gr.State(False) # hidden: default to not-strict
|
| 735 |
+
|
| 736 |
gr.ChatInterface(
|
| 737 |
fn=rag_chat_fn,
|
| 738 |
+
additional_inputs=[
|
| 739 |
+
top_k, n_sentences, include_passages,
|
| 740 |
+
state_use_llm, state_model_name, state_temperature, state_strict,
|
| 741 |
+
w_tfidf, w_bm25, w_emb
|
| 742 |
+
],
|
| 743 |
title="Literature Q&A",
|
| 744 |
+
description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations."
|
| 745 |
)
|
| 746 |
|
| 747 |
# ------------- Launch -------------
|
| 748 |
if __name__ == "__main__":
|
|
|
|
| 749 |
demo.queue().launch()
|