OmarOmar91 committed on
Commit
0043242
·
verified ·
1 Parent(s): 41f2c66

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -72
app.py CHANGED
@@ -1,9 +1,8 @@
1
  # ================================================================
2
  # Self-Sensing Concrete Assistant — Predictor (XGB) + Hybrid RAG
3
- # - Predictor tab: identical behavior to the "second code" (kept)
4
- # - Literature tab: from the "first code" (Hybrid RAG + MMR)
5
- # - UX: LLM ON by default (if key present), mutual-exclusion w/ Strict,
6
- # science-oriented styling, and prediction=0.0 if inputs incomplete
7
  # ================================================================
8
 
9
  # ---------------------- Runtime flags (HF-safe) ----------------------
@@ -44,10 +43,10 @@ try:
44
  except Exception:
45
  OpenAI = None
46
 
47
- # LLM availability flag enables default ON when available
48
  LLM_AVAILABLE = (OPENAI_API_KEY is not None and OPENAI_API_KEY.strip() != "" and OpenAI is not None)
49
 
50
- # ========================= Predictor (kept same as 2nd) =========================
51
  CF_COL = "Conductive Filler Conc. (wt%)"
52
  TARGET_COL = "Stress GF (MPa-1)"
53
 
@@ -106,7 +105,6 @@ CATEGORICAL_COLS = {
106
  "Current Type"
107
  }
108
 
109
- # Optional fields (allowed to be missing). All others required for prediction.
110
  OPTIONAL_FIELDS = {
111
  "Filler 2 Type",
112
  "Filler 2 Diameter (µm)",
@@ -150,12 +148,6 @@ def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
150
  return pd.DataFrame([row], columns=MAIN_VARIABLES)
151
 
152
  def _is_complete(form_dict: dict) -> bool:
153
- """
154
- Completeness rule:
155
- - All fields except OPTIONAL_FIELDS must be present.
156
- - For NUMERIC_COLS (except optional), require not-NaN.
157
- - For CATEGORICAL_COLS (except optional), require non-empty string (no 'NA').
158
- """
159
  for col in MAIN_VARIABLES:
160
  if col in OPTIONAL_FIELDS:
161
  continue
@@ -177,10 +169,8 @@ def _is_complete(form_dict: dict) -> bool:
177
  return True
178
 
179
  def predict_fn(**kwargs):
180
- # If incomplete, return 0.0 per requirement
181
  if not _is_complete(kwargs):
182
  return 0.0
183
-
184
  mdl = _load_model_or_error()
185
  if isinstance(mdl, str):
186
  return mdl
@@ -236,8 +226,7 @@ def _clear_all():
236
  cleared.append("")
237
  return cleared
238
 
239
- # ========================= Hybrid RAG (from 1st code) =========================
240
- # Configuration
241
  ARTIFACT_DIR = Path("rag_artifacts"); ARTIFACT_DIR.mkdir(exist_ok=True)
242
  TFIDF_VECT_PATH = ARTIFACT_DIR / "tfidf_vectorizer.joblib"
243
  TFIDF_MAT_PATH = ARTIFACT_DIR / "tfidf_matrix.joblib"
@@ -245,16 +234,13 @@ BM25_TOK_PATH = ARTIFACT_DIR / "bm25_tokens.joblib"
245
  EMB_NPY_PATH = ARTIFACT_DIR / "chunk_embeddings.npy"
246
  RAG_META_PATH = ARTIFACT_DIR / "chunks.parquet"
247
 
248
- # PDF source (HF-safe: rely on local /papers by default)
249
  LOCAL_PDF_DIR = Path("papers"); LOCAL_PDF_DIR.mkdir(exist_ok=True)
250
  USE_ONLINE_SOURCES = os.getenv("USE_ONLINE_SOURCES", "false").lower() == "true"
251
 
252
- # Retrieval weights
253
  W_TFIDF_DEFAULT = 0.50 if not USE_DENSE else 0.30
254
  W_BM25_DEFAULT = 0.50 if not USE_DENSE else 0.30
255
  W_EMB_DEFAULT = 0.00 if USE_DENSE is False else 0.40
256
 
257
- # Simple text processing
258
  _SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
259
  TOKEN_RE = re.compile(r"[A-Za-z0-9_#+\-/\.%]+")
260
  def sent_split(text: str) -> List[str]:
@@ -263,7 +249,6 @@ def sent_split(text: str) -> List[str]:
263
  def tokenize(text: str) -> List[str]:
264
  return [t.lower() for t in TOKEN_RE.findall(text)]
265
 
266
- # PDF text extraction
267
  def _extract_pdf_text(pdf_path: Path) -> str:
268
  try:
269
  import fitz
@@ -305,7 +290,6 @@ def _safe_init_st_model(name: str):
305
  USE_DENSE = False
306
  return None
307
 
308
- # Build or load index
309
  def build_or_load_hybrid(pdf_dir: Path):
310
  have_cache = (TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
311
  and RAG_META_PATH.exists()
@@ -358,13 +342,11 @@ def build_or_load_hybrid(pdf_dir: Path):
358
  print("Dense embedding failed:", e)
359
  emb = None
360
 
361
- # Save artifacts
362
  joblib.dump(vectorizer, TFIDF_VECT_PATH)
363
- joblib.dump(X_tfidf, TFIDF_MAT_PATH)
364
  if BM25Okapi is not None:
365
  joblib.dump(all_tokens, BM25_TOK_PATH)
366
  meta.to_parquet(RAG_META_PATH, index=False)
367
-
368
  return vectorizer, X_tfidf, meta, all_tokens, emb
369
 
370
  tfidf_vectorizer, tfidf_matrix, rag_meta, bm25_tokens, emb_matrix = build_or_load_hybrid(LOCAL_PDF_DIR)
@@ -372,7 +354,7 @@ bm25 = BM25Okapi(bm25_tokens) if (BM25Okapi is not None and bm25_tokens is not N
372
  st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
373
 
374
  def _extract_page(text_chunk: str) -> str:
375
- m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
376
  return (m[-1].group(1) if m else "?")
377
 
378
  def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
@@ -401,7 +383,7 @@ def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAUL
401
 
402
  # BM25 scores
403
  if bm25 is not None:
404
- q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\-/\.%]+", query)]
405
  bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
406
  else:
407
  bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0
@@ -536,6 +518,7 @@ def rag_reply(
536
  srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
537
  coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
538
 
 
539
  if strict_quotes_only:
540
  if not selected:
541
  return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
@@ -585,15 +568,6 @@ def rag_chat_fn(message, history, top_k, n_sentences, include_passages,
585
  except Exception as e:
586
  return f"RAG error: {e}"
587
 
588
- # ---------- Small helpers to keep checkboxes mutually exclusive ----------
589
- def _strict_on(strict, use_llm):
590
- # If strict is turned on, force LLM off; otherwise keep LLM as is.
591
- return False if strict else use_llm
592
-
593
- def _llm_on(use_llm, strict):
594
- # If LLM is turned on, force strict off; otherwise keep strict as is.
595
- return False if use_llm else strict
596
-
597
  # ========================= UI (science-oriented styling) =========================
598
  CSS = """
599
  /* Science-oriented: crisp contrast + readable numerics */
@@ -605,7 +579,7 @@ CSS = """
605
  label {color: #e8f7ff !important; text-shadow: 0 1px 0 rgba(0,0,0,0.35); cursor: pointer;}
606
  input[type="number"] {font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace;}
607
 
608
- /* Checkbox clickability fixes (themes can overlay labels) */
609
  input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !important; }
610
  .gr-checkbox label, .gr-check-radio label { pointer-events: auto !important; cursor: pointer; }
611
  #rag-tab input[type="checkbox"] { accent-color: #60a5fa !important; }
@@ -720,7 +694,6 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
720
  elem_classes=["prose"]
721
  )
722
 
723
- # Wire predictor buttons
724
  inputs_in_order = [
725
  f1_type, f1_diam, f1_len, cf_conc,
726
  f1_dim, f2_type, f2_diam, f2_len,
@@ -742,7 +715,7 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
742
  with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)", elem_id="rag-tab"):
743
  gr.Markdown(
744
  "Upload PDFs into the repository folder <code>papers/</code> then reload the Space. "
745
- "Answers cite (Doc.pdf, p.X). Toggle strict quotes or optional LLM paraphrasing."
746
  )
747
  with gr.Row():
748
  top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
@@ -754,43 +727,21 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
754
  w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
755
  w_emb = gr.Slider(0.0, 1.0, value=(0.0 if not USE_DENSE else 0.40), step=0.05, label="Dense weight (set 0 if disabled)")
756
 
757
- with gr.Accordion("LLM & Controls", open=False):
758
- strict_quotes_only = gr.Checkbox(
759
- value=False, label="Strict quotes only (no paraphrasing)", interactive=True
760
- )
761
- # LLM ON by default when available
762
- use_llm = gr.Checkbox(
763
- value=LLM_AVAILABLE, label="Use LLM to paraphrase selected sentences",
764
- interactive=LLM_AVAILABLE
765
- )
766
- model_name = gr.Textbox(
767
- value=os.getenv("OPENAI_MODEL", OPENAI_MODEL),
768
- label="LLM model", placeholder="e.g., gpt-5 or gpt-5-mini"
769
- )
770
- temperature = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="Temperature")
771
-
772
- # Availability banner
773
- llm_status = gr.Markdown(
774
- "**LLM status:** " + (
775
- "✅ Ready — paraphrasing enabled by default." if LLM_AVAILABLE
776
- else "⚠️ Disabled — set `OPENAI_API_KEY` to enable paraphrasing."
777
- )
778
- )
779
-
780
- # Make the two checkboxes mutually exclusive
781
- strict_quotes_only.change(
782
- _strict_on, inputs=[strict_quotes_only, use_llm], outputs=use_llm
783
- )
784
- use_llm.change(
785
- _llm_on, inputs=[use_llm, strict_quotes_only], outputs=strict_quotes_only
786
- )
787
 
788
  gr.ChatInterface(
789
  fn=rag_chat_fn,
790
- additional_inputs=[top_k, n_sentences, include_passages, use_llm, model_name,
791
- temperature, strict_quotes_only, w_tfidf, w_bm25, w_emb],
 
 
 
792
  title="Literature Q&A",
793
- description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations. Toggle strict/LLM modes."
794
  )
795
 
796
  # ------------- Launch -------------
 
1
  # ================================================================
2
  # Self-Sensing Concrete Assistant — Predictor (XGB) + Hybrid RAG
3
+ # - Predictor tab: identical behavior (kept)
4
+ # - Literature tab: Hybrid RAG; LLM runs silently when available
5
+ # - UX: no visible "LLM & Controls" window; prediction=0.0 if incomplete
 
6
  # ================================================================
7
 
8
  # ---------------------- Runtime flags (HF-safe) ----------------------
 
43
  except Exception:
44
  OpenAI = None
45
 
46
+ # LLM availability flag used internally; UI remains hidden
47
  LLM_AVAILABLE = (OPENAI_API_KEY is not None and OPENAI_API_KEY.strip() != "" and OpenAI is not None)
48
 
49
+ # ========================= Predictor (kept) =========================
50
  CF_COL = "Conductive Filler Conc. (wt%)"
51
  TARGET_COL = "Stress GF (MPa-1)"
52
 
 
105
  "Current Type"
106
  }
107
 
 
108
  OPTIONAL_FIELDS = {
109
  "Filler 2 Type",
110
  "Filler 2 Diameter (µm)",
 
148
  return pd.DataFrame([row], columns=MAIN_VARIABLES)
149
 
150
  def _is_complete(form_dict: dict) -> bool:
 
 
 
 
 
 
151
  for col in MAIN_VARIABLES:
152
  if col in OPTIONAL_FIELDS:
153
  continue
 
169
  return True
170
 
171
  def predict_fn(**kwargs):
 
172
  if not _is_complete(kwargs):
173
  return 0.0
 
174
  mdl = _load_model_or_error()
175
  if isinstance(mdl, str):
176
  return mdl
 
226
  cleared.append("")
227
  return cleared
228
 
229
+ # ========================= Hybrid RAG =========================
 
230
  ARTIFACT_DIR = Path("rag_artifacts"); ARTIFACT_DIR.mkdir(exist_ok=True)
231
  TFIDF_VECT_PATH = ARTIFACT_DIR / "tfidf_vectorizer.joblib"
232
  TFIDF_MAT_PATH = ARTIFACT_DIR / "tfidf_matrix.joblib"
 
234
  EMB_NPY_PATH = ARTIFACT_DIR / "chunk_embeddings.npy"
235
  RAG_META_PATH = ARTIFACT_DIR / "chunks.parquet"
236
 
 
237
  LOCAL_PDF_DIR = Path("papers"); LOCAL_PDF_DIR.mkdir(exist_ok=True)
238
  USE_ONLINE_SOURCES = os.getenv("USE_ONLINE_SOURCES", "false").lower() == "true"
239
 
 
240
  W_TFIDF_DEFAULT = 0.50 if not USE_DENSE else 0.30
241
  W_BM25_DEFAULT = 0.50 if not USE_DENSE else 0.30
242
  W_EMB_DEFAULT = 0.00 if USE_DENSE is False else 0.40
243
 
 
244
  _SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
245
  TOKEN_RE = re.compile(r"[A-Za-z0-9_#+\-/\.%]+")
246
  def sent_split(text: str) -> List[str]:
 
249
  def tokenize(text: str) -> List[str]:
250
  return [t.lower() for t in TOKEN_RE.findall(text)]
251
 
 
252
  def _extract_pdf_text(pdf_path: Path) -> str:
253
  try:
254
  import fitz
 
290
  USE_DENSE = False
291
  return None
292
 
 
293
  def build_or_load_hybrid(pdf_dir: Path):
294
  have_cache = (TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
295
  and RAG_META_PATH.exists()
 
342
  print("Dense embedding failed:", e)
343
  emb = None
344
 
 
345
  joblib.dump(vectorizer, TFIDF_VECT_PATH)
346
+ joblib.dump(X_tfidf, TFIDF_MAT_PATH)
347
  if BM25Okapi is not None:
348
  joblib.dump(all_tokens, BM25_TOK_PATH)
349
  meta.to_parquet(RAG_META_PATH, index=False)
 
350
  return vectorizer, X_tfidf, meta, all_tokens, emb
351
 
352
  tfidf_vectorizer, tfidf_matrix, rag_meta, bm25_tokens, emb_matrix = build_or_load_hybrid(LOCAL_PDF_DIR)
 
354
  st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
355
 
356
  def _extract_page(text_chunk: str) -> str:
357
+ m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
358
  return (m[-1].group(1) if m else "?")
359
 
360
  def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
 
383
 
384
  # BM25 scores
385
  if bm25 is not None:
386
+ q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\-/\.%]+", query)]
387
  bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
388
  else:
389
  bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0
 
518
  srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
519
  coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
520
 
521
+ # Hidden policy: if strict==True → no paraphrasing; else try LLM if available
522
  if strict_quotes_only:
523
  if not selected:
524
  return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
 
568
  except Exception as e:
569
  return f"RAG error: {e}"
570
 
 
 
 
 
 
 
 
 
 
571
  # ========================= UI (science-oriented styling) =========================
572
  CSS = """
573
  /* Science-oriented: crisp contrast + readable numerics */
 
579
  label {color: #e8f7ff !important; text-shadow: 0 1px 0 rgba(0,0,0,0.35); cursor: pointer;}
580
  input[type="number"] {font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace;}
581
 
582
+ /* Checkbox clickability fixes */
583
  input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !important; }
584
  .gr-checkbox label, .gr-check-radio label { pointer-events: auto !important; cursor: pointer; }
585
  #rag-tab input[type="checkbox"] { accent-color: #60a5fa !important; }
 
694
  elem_classes=["prose"]
695
  )
696
 
 
697
  inputs_in_order = [
698
  f1_type, f1_diam, f1_len, cf_conc,
699
  f1_dim, f2_type, f2_diam, f2_len,
 
715
  with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)", elem_id="rag-tab"):
716
  gr.Markdown(
717
  "Upload PDFs into the repository folder <code>papers/</code> then reload the Space. "
718
+ "Answers cite (Doc.pdf, p.X)."
719
  )
720
  with gr.Row():
721
  top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
 
727
  w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
728
  w_emb = gr.Slider(0.0, 1.0, value=(0.0 if not USE_DENSE else 0.40), step=0.05, label="Dense weight (set 0 if disabled)")
729
 
730
+ # ---- Hidden states for LLM behavior (no visible controls) ----
731
+ state_use_llm = gr.State(LLM_AVAILABLE) # True when key present; else False
732
+ state_model_name = gr.State(os.getenv("OPENAI_MODEL", OPENAI_MODEL))
733
+ state_temperature = gr.State(0.2)
734
+ state_strict = gr.State(False) # hidden: default to not-strict
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
735
 
736
  gr.ChatInterface(
737
  fn=rag_chat_fn,
738
+ additional_inputs=[
739
+ top_k, n_sentences, include_passages,
740
+ state_use_llm, state_model_name, state_temperature, state_strict,
741
+ w_tfidf, w_bm25, w_emb
742
+ ],
743
  title="Literature Q&A",
744
+ description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations."
745
  )
746
 
747
  # ------------- Launch -------------