Inframat-x committed on
Commit
ed83d97
·
verified ·
1 Parent(s): 6d84002

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +239 -27
app.py CHANGED
@@ -5,6 +5,7 @@
5
  # - Predictor: safe model caching + safe feature alignment
6
  # - Stable categoricals ("NA"); no over-strict completeness gate
7
  # - Fixed [[PAGE=...]] regex
 
8
  # ================================================================
9
 
10
  # ---------------------- Runtime flags (HF-safe) ----------------------
@@ -14,7 +15,7 @@ os.environ["TRANSFORMERS_NO_FLAX"] = "1"
14
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
15
 
16
  # ------------------------------- Imports ------------------------------
17
- import re, joblib, warnings, json, traceback
18
  from pathlib import Path
19
  from typing import List, Dict, Any
20
 
@@ -548,9 +549,27 @@ def compose_extractive(selected: List[Dict[str, Any]]) -> str:
548
  return ""
549
  return " ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
550
 
551
- def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2) -> str:
552
- if not LLM_AVAILABLE:
 
 
 
 
 
 
 
 
 
 
 
 
553
  return None
 
 
 
 
 
 
554
  client = OpenAI(api_key=OPENAI_API_KEY)
555
  model = model or OPENAI_MODEL
556
  SYSTEM_PROMPT = (
@@ -573,9 +592,19 @@ def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = N
573
  ],
574
  temperature=temperature,
575
  )
576
- return getattr(resp, "output_text", None) or str(resp)
 
 
 
 
 
 
 
 
 
 
577
  except Exception:
578
- return None
579
 
580
  def rag_reply(
581
  question: str,
@@ -590,41 +619,139 @@ def rag_reply(
590
  w_bm25: float = W_BM25_DEFAULT,
591
  w_emb: float = W_EMB_DEFAULT
592
  ) -> str:
 
 
 
 
 
593
  hits = hybrid_search(question, k=k, w_tfidf=w_tfidf, w_bm25=w_bm25, w_emb=w_emb)
594
- if hits is None or hits.empty:
595
- return "No indexed PDFs found. Upload PDFs to the 'papers/' folder and reload the Space."
596
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
597
  selected = mmr_select_sentences(question, hits, top_n=int(n_sentences), pool_per_chunk=6, lambda_div=0.7)
598
  header_cites = "; ".join(f"{Path(r['doc_path']).name} (p.{_extract_page(r['text'])})" for _, r in hits.head(6).iterrows())
599
  srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
600
  coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
601
 
 
 
 
 
 
 
 
 
 
 
 
 
 
602
  if strict_quotes_only:
603
  if not selected:
604
- return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
605
- msg = "**Quoted Passages:**\n- " + "\n- ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
606
- msg += f"\n\n**Citations:** {header_cites}{coverage_note}"
607
- if include_passages:
608
- msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
609
- return msg
610
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611
  extractive = compose_extractive(selected)
 
 
612
  if use_llm and selected:
613
  lines = [f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected]
614
- llm_text = synthesize_with_llm(question, lines, model=model, temperature=temperature)
 
 
 
 
615
  if llm_text:
616
- msg = f"**Answer (LLM synthesis):** {llm_text}\n\n**Citations:** {header_cites}{coverage_note}"
617
  if include_passages:
618
- msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
619
- return msg
620
-
621
- if not extractive:
622
- return f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
623
-
624
- msg = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
625
- if include_passages:
626
- msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
627
- return msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
628
 
629
  def rag_chat_fn(message, history, top_k, n_sentences, include_passages,
630
  use_llm, model_name, temperature, strict_quotes_only,
@@ -664,7 +791,7 @@ input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !i
664
  .gr-checkbox label, .gr-check-radio label { pointer-events: auto !important; cursor: pointer; }
665
  #rag-tab input[type="checkbox"] { accent-color: #60a5fa !important; }
666
 
667
- /* RAG tab background and elements */
668
  #rag-tab .block, #rag-tab .group, #rag-tab .accordion {
669
  background: linear-gradient(160deg, #1f2937 0%, #14532d 55%, #0b3b68 100%) !important;
670
  border-radius: 12px;
@@ -691,6 +818,35 @@ input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !i
691
  color: #eef6ff !important;
692
  }
693
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
694
  /* Predictor output emphasis */
695
  #pred-out .wrap { font-size: 20px; font-weight: 700; color: #ecfdf5; }
696
  """
@@ -826,6 +982,62 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
826
  description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations."
827
  )
828
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
829
  # ------------- Launch -------------
830
  if __name__ == "__main__":
831
  demo.queue().launch()
 
5
  # - Predictor: safe model caching + safe feature alignment
6
  # - Stable categoricals ("NA"); no over-strict completeness gate
7
  # - Fixed [[PAGE=...]] regex
8
+ # - NEW: Lightweight instrumentation (JSONL logs per RAG turn)
9
  # ================================================================
10
 
11
  # ---------------------- Runtime flags (HF-safe) ----------------------
 
15
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
16
 
17
  # ------------------------------- Imports ------------------------------
18
+ import re, joblib, warnings, json, traceback, time, uuid, subprocess, sys
19
  from pathlib import Path
20
  from typing import List, Dict, Any
21
 
 
549
  return ""
550
  return " ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
551
 
552
+ # ========================= NEW: Instrumentation helpers =========================
553
+ LOG_PATH = ARTIFACT_DIR / "rag_logs.jsonl"
554
+ OPENAI_IN_COST_PER_1K = float(os.getenv("OPENAI_COST_IN_PER_1K", "0"))
555
+ OPENAI_OUT_COST_PER_1K = float(os.getenv("OPENAI_COST_OUT_PER_1K", "0"))
556
+
557
+ def _safe_write_jsonl(path: Path, record: dict):
558
+ try:
559
+ with open(path, "a", encoding="utf-8") as f:
560
+ f.write(json.dumps(record, ensure_ascii=False) + "\n")
561
+ except Exception as e:
562
+ print("[Log] write failed:", e)
563
+
564
+ def _calc_cost_usd(prompt_toks, completion_toks):
565
+ if prompt_toks is None or completion_toks is None:
566
  return None
567
+ return (prompt_toks / 1000.0) * OPENAI_IN_COST_PER_1K + (completion_toks / 1000.0) * OPENAI_OUT_COST_PER_1K
568
+
569
+ # ----------------- Modified to return (text, usage_dict) -----------------
570
+ def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2):
571
+ if not LLM_AVAILABLE:
572
+ return None, None
573
  client = OpenAI(api_key=OPENAI_API_KEY)
574
  model = model or OPENAI_MODEL
575
  SYSTEM_PROMPT = (
 
592
  ],
593
  temperature=temperature,
594
  )
595
+ out_text = getattr(resp, "output_text", None) or str(resp)
596
+ usage = None
597
+ try:
598
+ u = getattr(resp, "usage", None)
599
+ if u:
600
+ pt = getattr(u, "prompt_tokens", None) if hasattr(u, "prompt_tokens") else u.get("prompt_tokens", None)
601
+ ct = getattr(u, "completion_tokens", None) if hasattr(u, "completion_tokens") else u.get("completion_tokens", None)
602
+ usage = {"prompt_tokens": pt, "completion_tokens": ct}
603
+ except Exception:
604
+ usage = None
605
+ return out_text, usage
606
  except Exception:
607
+ return None, None
608
 
609
  def rag_reply(
610
  question: str,
 
619
  w_bm25: float = W_BM25_DEFAULT,
620
  w_emb: float = W_EMB_DEFAULT
621
  ) -> str:
622
+ run_id = str(uuid.uuid4())
623
+ t0_total = time.time()
624
+ t0_retr = time.time()
625
+
626
+ # --- Retrieval ---
627
  hits = hybrid_search(question, k=k, w_tfidf=w_tfidf, w_bm25=w_bm25, w_emb=w_emb)
628
+ t1_retr = time.time()
629
+ latency_ms_retriever = int((t1_retr - t0_retr) * 1000)
630
 
631
+ if hits is None or hits.empty:
632
+ final = "No indexed PDFs found. Upload PDFs to the 'papers/' folder and reload the Space."
633
+ record = {
634
+ "run_id": run_id,
635
+ "ts": int(time.time()*1000),
636
+ "inputs": {
637
+ "question": question, "top_k": int(k), "n_sentences": int(n_sentences),
638
+ "w_tfidf": float(w_tfidf), "w_bm25": float(w_bm25), "w_emb": float(w_emb),
639
+ "use_llm": bool(use_llm), "model": model, "temperature": float(temperature)
640
+ },
641
+ "retrieval": {"hits": [], "latency_ms_retriever": latency_ms_retriever},
642
+ "output": {"final_answer": final, "used_sentences": []},
643
+ "latency_ms_total": int((time.time()-t0_total)*1000),
644
+ "openai": None
645
+ }
646
+ _safe_write_jsonl(LOG_PATH, record)
647
+ return final
648
+
649
+ # Select sentences
650
  selected = mmr_select_sentences(question, hits, top_n=int(n_sentences), pool_per_chunk=6, lambda_div=0.7)
651
  header_cites = "; ".join(f"{Path(r['doc_path']).name} (p.{_extract_page(r['text'])})" for _, r in hits.head(6).iterrows())
652
  srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
653
  coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
654
 
655
+ # Prepare retrieval list for logging
656
+ retr_list = []
657
+ for _, r in hits.iterrows():
658
+ retr_list.append({
659
+ "doc": Path(r["doc_path"]).name,
660
+ "page": _extract_page(r["text"]),
661
+ "score_tfidf": float(r.get("score_tfidf", 0.0)),
662
+ "score_bm25": float(r.get("score_bm25", 0.0)),
663
+ "score_dense": float(r.get("score_dense", 0.0)),
664
+ "combo_score": float(r.get("score", 0.0)),
665
+ })
666
+
667
+ # Strict quotes only (no LLM)
668
  if strict_quotes_only:
669
  if not selected:
670
+ final = f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
671
+ else:
672
+ final = "**Quoted Passages:**\n- " + "\n- ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
673
+ final += f"\n\n**Citations:** {header_cites}{coverage_note}"
674
+ if include_passages:
675
+ final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
676
+
677
+ record = {
678
+ "run_id": run_id,
679
+ "ts": int(time.time()*1000),
680
+ "inputs": {
681
+ "question": question, "top_k": int(k), "n_sentences": int(n_sentences),
682
+ "w_tfidf": float(w_tfidf), "w_bm25": float(w_bm25), "w_emb": float(w_emb),
683
+ "use_llm": False, "model": None, "temperature": float(temperature)
684
+ },
685
+ "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
686
+ "output": {
687
+ "final_answer": final,
688
+ "used_sentences": [{"sent": s["sent"], "doc": s["doc"], "page": s["page"]} for s in selected]
689
+ },
690
+ "latency_ms_total": int((time.time()-t0_total)*1000),
691
+ "openai": None
692
+ }
693
+ _safe_write_jsonl(LOG_PATH, record)
694
+ return final
695
+
696
+ # Extractive or LLM synthesis
697
  extractive = compose_extractive(selected)
698
+ llm_usage = None
699
+ llm_latency_ms = None
700
  if use_llm and selected:
701
  lines = [f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected]
702
+ t0_llm = time.time()
703
+ llm_text, llm_usage = synthesize_with_llm(question, lines, model=model, temperature=temperature)
704
+ t1_llm = time.time()
705
+ llm_latency_ms = int((t1_llm - t0_llm) * 1000)
706
+
707
  if llm_text:
708
+ final = f"**Answer (LLM synthesis):** {llm_text}\n\n**Citations:** {header_cites}{coverage_note}"
709
  if include_passages:
710
+ final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
711
+ else:
712
+ if not extractive:
713
+ final = f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
714
+ else:
715
+ final = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
716
+ if include_passages:
717
+ final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
718
+ else:
719
+ if not extractive:
720
+ final = f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
721
+ else:
722
+ final = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
723
+ if include_passages:
724
+ final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
725
+
726
+ # --------- Log full run ---------
727
+ prompt_toks = llm_usage.get("prompt_tokens") if llm_usage else None
728
+ completion_toks = llm_usage.get("completion_tokens") if llm_usage else None
729
+ cost_usd = _calc_cost_usd(prompt_toks, completion_toks)
730
+
731
+ total_ms = int((time.time() - t0_total) * 1000)
732
+ record = {
733
+ "run_id": run_id,
734
+ "ts": int(time.time()*1000),
735
+ "inputs": {
736
+ "question": question, "top_k": int(k), "n_sentences": int(n_sentences),
737
+ "w_tfidf": float(w_tfidf), "w_bm25": float(w_bm25), "w_emb": float(w_emb),
738
+ "use_llm": bool(use_llm), "model": model, "temperature": float(temperature)
739
+ },
740
+ "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
741
+ "output": {
742
+ "final_answer": final,
743
+ "used_sentences": [{"sent": s["sent"], "doc": s["doc"], "page": s["page"]} for s in selected]
744
+ },
745
+ "latency_ms_total": total_ms,
746
+ "latency_ms_llm": llm_latency_ms,
747
+ "openai": {
748
+ "prompt_tokens": prompt_toks,
749
+ "completion_tokens": completion_toks,
750
+ "cost_usd": cost_usd
751
+ } if use_llm else None
752
+ }
753
+ _safe_write_jsonl(LOG_PATH, record)
754
+ return final
755
 
756
  def rag_chat_fn(message, history, top_k, n_sentences, include_passages,
757
  use_llm, model_name, temperature, strict_quotes_only,
 
791
  .gr-checkbox label, .gr-check-radio label { pointer-events: auto !important; cursor: pointer; }
792
  #rag-tab input[type="checkbox"] { accent-color: #60a5fa !important; }
793
 
794
+ /* RAG tab styling */
795
  #rag-tab .block, #rag-tab .group, #rag-tab .accordion {
796
  background: linear-gradient(160deg, #1f2937 0%, #14532d 55%, #0b3b68 100%) !important;
797
  border-radius: 12px;
 
818
  color: #eef6ff !important;
819
  }
820
 
821
+ /* NEW — Evaluate tab dark/high-contrast styling */
822
+ #eval-tab .block, #eval-tab .group, #eval-tab .accordion {
823
+ background: linear-gradient(165deg, #0a0f1f 0%, #0d1a31 60%, #0a1c2e 100%) !important;
824
+ border-radius: 12px;
825
+ border: 1px solid rgba(139, 197, 255, 0.28);
826
+ }
827
+ #eval-tab label, #eval-tab .markdown, #eval-tab .prose, #eval-tab p, #eval-tab span {
828
+ color: #e6f2ff !important;
829
+ }
830
+ #eval-tab input, #eval-tab .gr-file, #eval-tab .scroll-hide, #eval-tab textarea, #eval-tab select {
831
+ background: rgba(8, 13, 26, 0.9) !important;
832
+ border: 1px solid #3b82f6 !important;
833
+ color: #dbeafe !important;
834
+ }
835
+ #eval-tab input[type="range"] { accent-color: #22c55e !important; }
836
+ #eval-tab button {
837
+ border-radius: 10px !important;
838
+ font-weight: 700 !important;
839
+ background: #0ea5e9 !important;
840
+ color: #001321 !important;
841
+ border: 1px solid #7dd3fc !important;
842
+ }
843
+ #eval-tab .gr-json, #eval-tab .markdown pre, #eval-tab .markdown code {
844
+ background: rgba(2, 6, 23, 0.85) !important;
845
+ color: #e2e8f0 !important;
846
+ border: 1px solid rgba(148, 163, 184, 0.3) !important;
847
+ border-radius: 10px !important;
848
+ }
849
+
850
  /* Predictor output emphasis */
851
  #pred-out .wrap { font-size: 20px; font-weight: 700; color: #ecfdf5; }
852
  """
 
982
  description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations."
983
  )
984
 
985
+ # ====== Evaluate (Gold vs Logs) — darker, higher-contrast ======
986
+ with gr.Tab("📏 Evaluate (Gold vs Logs)", elem_id="eval-tab"):
987
+ gr.Markdown("Upload your **gold.csv** and compute metrics against the app logs.")
988
+ with gr.Row():
989
+ gold_file = gr.File(label="gold.csv", file_types=[".csv"], interactive=True)
990
+ k_slider = gr.Slider(3, 12, value=8, step=1, label="k for Hit/Recall/nDCG")
991
+ with gr.Row():
992
+ btn_eval = gr.Button("Compute Metrics", variant="primary")
993
+ with gr.Row():
994
+ out_perq = gr.File(label="Per-question metrics (CSV)")
995
+ out_agg = gr.File(label="Aggregate metrics (JSON)")
996
+ out_json = gr.JSON(label="Aggregate summary")
997
+ out_log = gr.Markdown(label="Run log")
998
+
999
+ def _run_eval_inproc(gold_path: str, k: int = 8):
1000
+ import json as _json
1001
+ out_dir = str(ARTIFACT_DIR)
1002
+ logs = str(LOG_PATH)
1003
+ cmd = [
1004
+ sys.executable, "rag_eval_metrics.py",
1005
+ "--gold_csv", gold_path,
1006
+ "--logs_jsonl", logs,
1007
+ "--k", str(k),
1008
+ "--out_dir", out_dir
1009
+ ]
1010
+ try:
1011
+ p = subprocess.run(cmd, capture_output=True, text=True, check=False)
1012
+ stdout = p.stdout or ""
1013
+ stderr = p.stderr or ""
1014
+ perq = ARTIFACT_DIR / "metrics_per_question.csv"
1015
+ agg = ARTIFACT_DIR / "metrics_aggregate.json"
1016
+ agg_json = {}
1017
+ if agg.exists():
1018
+ agg_json = _json.loads(agg.read_text(encoding="utf-8"))
1019
+ report = "```\n" + (stdout.strip() or "(no stdout)") + ("\n" + stderr.strip() if stderr else "") + "\n```"
1020
+ return (str(perq) if perq.exists() else None,
1021
+ str(agg) if agg.exists() else None,
1022
+ agg_json,
1023
+ report)
1024
+ except Exception as e:
1025
+ return (None, None, {}, f"**Eval error:** {e}")
1026
+
1027
+ def _eval_wrapper(gf, k):
1028
+ from pathlib import Path
1029
+ if gf is None:
1030
+ default_gold = Path("gold.csv")
1031
+ if not default_gold.exists():
1032
+ return None, None, {}, "**No gold.csv provided or found in repo root.**"
1033
+ gold_path = str(default_gold)
1034
+ else:
1035
+ gold_path = gf.name
1036
+ return _run_eval_inproc(gold_path, int(k))
1037
+
1038
+ btn_eval.click(_eval_wrapper, inputs=[gold_file, k_slider],
1039
+ outputs=[out_perq, out_agg, out_json, out_log])
1040
+
1041
  # ------------- Launch -------------
1042
  if __name__ == "__main__":
1043
  demo.queue().launch()