Update app.py
app.py
CHANGED
@@ -5,6 +5,7 @@
 # - Predictor: safe model caching + safe feature alignment
 # - Stable categoricals ("NA"); no over-strict completeness gate
 # - Fixed [[PAGE=...]] regex
+# - NEW: Lightweight instrumentation (JSONL logs per RAG turn)
 # ================================================================

 # ---------------------- Runtime flags (HF-safe) ----------------------
@@ -14,7 +15,7 @@ os.environ["TRANSFORMERS_NO_FLAX"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"

 # ------------------------------- Imports ------------------------------
-import re, joblib, warnings, json, traceback
+import re, joblib, warnings, json, traceback, time, uuid, subprocess, sys
 from pathlib import Path
 from typing import List, Dict, Any

@@ -548,9 +549,27 @@ def compose_extractive(selected: List[Dict[str, Any]]) -> str:
         return ""
     return " ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)

-def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2):
-    if not LLM_AVAILABLE:
+# ========================= NEW: Instrumentation helpers =========================
+LOG_PATH = ARTIFACT_DIR / "rag_logs.jsonl"
+OPENAI_IN_COST_PER_1K = float(os.getenv("OPENAI_COST_IN_PER_1K", "0"))
+OPENAI_OUT_COST_PER_1K = float(os.getenv("OPENAI_COST_OUT_PER_1K", "0"))
+
+def _safe_write_jsonl(path: Path, record: dict):
+    try:
+        with open(path, "a", encoding="utf-8") as f:
+            f.write(json.dumps(record, ensure_ascii=False) + "\n")
+    except Exception as e:
+        print("[Log] write failed:", e)
+
+def _calc_cost_usd(prompt_toks, completion_toks):
+    if prompt_toks is None or completion_toks is None:
         return None
+    return (prompt_toks / 1000.0) * OPENAI_IN_COST_PER_1K + (completion_toks / 1000.0) * OPENAI_OUT_COST_PER_1K
+
+# ----------------- Modified to return (text, usage_dict) -----------------
+def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2):
+    if not LLM_AVAILABLE:
+        return None, None
     client = OpenAI(api_key=OPENAI_API_KEY)
     model = model or OPENAI_MODEL
     SYSTEM_PROMPT = (
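Note on the hunk above: _safe_write_jsonl appends one JSON object per line and swallows write errors (printing them instead), so a failed disk write cannot break a chat turn. A quick usage sketch of the same pattern, with purely illustrative field values mirroring a subset of the schema logged later in this diff:

# --- sketch: what a rag_logs.jsonl line looks like (illustrative values) ---
import json
from pathlib import Path

record = {"run_id": "demo-123", "ts": 1700000000000,
          "inputs": {"question": "What is MMR?", "top_k": 8},
          "latency_ms_total": 412, "openai": None}
with open(Path("rag_logs.jsonl"), "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")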
@@ -573,9 +592,19 @@ def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2):
             ],
             temperature=temperature,
         )
-        return getattr(resp, "output_text", None) or str(resp)
+        out_text = getattr(resp, "output_text", None) or str(resp)
+        usage = None
+        try:
+            u = getattr(resp, "usage", None)
+            if u:
+                pt = getattr(u, "prompt_tokens", None) if hasattr(u, "prompt_tokens") else u.get("prompt_tokens", None)
+                ct = getattr(u, "completion_tokens", None) if hasattr(u, "completion_tokens") else u.get("completion_tokens", None)
+                usage = {"prompt_tokens": pt, "completion_tokens": ct}
+        except Exception:
+            usage = None
+        return out_text, usage
     except Exception:
-        return None
+        return None, None

 def rag_reply(
     question: str,
@@ -590,41 +619,139 @@ def rag_reply(
     w_bm25: float = W_BM25_DEFAULT,
     w_emb: float = W_EMB_DEFAULT
 ) -> str:
+    run_id = str(uuid.uuid4())
+    t0_total = time.time()
+    t0_retr = time.time()
+
+    # --- Retrieval ---
     hits = hybrid_search(question, k=k, w_tfidf=w_tfidf, w_bm25=w_bm25, w_emb=w_emb)
-
-
+    t1_retr = time.time()
+    latency_ms_retriever = int((t1_retr - t0_retr) * 1000)

+    if hits is None or hits.empty:
+        final = "No indexed PDFs found. Upload PDFs to the 'papers/' folder and reload the Space."
+        record = {
+            "run_id": run_id,
+            "ts": int(time.time()*1000),
+            "inputs": {
+                "question": question, "top_k": int(k), "n_sentences": int(n_sentences),
+                "w_tfidf": float(w_tfidf), "w_bm25": float(w_bm25), "w_emb": float(w_emb),
+                "use_llm": bool(use_llm), "model": model, "temperature": float(temperature)
+            },
+            "retrieval": {"hits": [], "latency_ms_retriever": latency_ms_retriever},
+            "output": {"final_answer": final, "used_sentences": []},
+            "latency_ms_total": int((time.time()-t0_total)*1000),
+            "openai": None
+        }
+        _safe_write_jsonl(LOG_PATH, record)
+        return final
+
+    # Select sentences
     selected = mmr_select_sentences(question, hits, top_n=int(n_sentences), pool_per_chunk=6, lambda_div=0.7)
     header_cites = "; ".join(f"{Path(r['doc_path']).name} (p.{_extract_page(r['text'])})" for _, r in hits.head(6).iterrows())
     srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
     coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."

+    # Prepare retrieval list for logging
+    retr_list = []
+    for _, r in hits.iterrows():
+        retr_list.append({
+            "doc": Path(r["doc_path"]).name,
+            "page": _extract_page(r["text"]),
+            "score_tfidf": float(r.get("score_tfidf", 0.0)),
+            "score_bm25": float(r.get("score_bm25", 0.0)),
+            "score_dense": float(r.get("score_dense", 0.0)),
+            "combo_score": float(r.get("score", 0.0)),
+        })
+
+    # Strict quotes only (no LLM)
     if strict_quotes_only:
         if not selected:
-
-
-
-
-
-
-
+            final = f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
+        else:
+            final = "**Quoted Passages:**\n- " + "\n- ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
+            final += f"\n\n**Citations:** {header_cites}{coverage_note}"
+        if include_passages:
+            final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+
+        record = {
+            "run_id": run_id,
+            "ts": int(time.time()*1000),
+            "inputs": {
+                "question": question, "top_k": int(k), "n_sentences": int(n_sentences),
+                "w_tfidf": float(w_tfidf), "w_bm25": float(w_bm25), "w_emb": float(w_emb),
+                "use_llm": False, "model": None, "temperature": float(temperature)
+            },
+            "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
+            "output": {
+                "final_answer": final,
+                "used_sentences": [{"sent": s["sent"], "doc": s["doc"], "page": s["page"]} for s in selected]
+            },
+            "latency_ms_total": int((time.time()-t0_total)*1000),
+            "openai": None
+        }
+        _safe_write_jsonl(LOG_PATH, record)
+        return final
+
+    # Extractive or LLM synthesis
     extractive = compose_extractive(selected)
+    llm_usage = None
+    llm_latency_ms = None
     if use_llm and selected:
         lines = [f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected]
-
+        t0_llm = time.time()
+        llm_text, llm_usage = synthesize_with_llm(question, lines, model=model, temperature=temperature)
+        t1_llm = time.time()
+        llm_latency_ms = int((t1_llm - t0_llm) * 1000)
+
         if llm_text:
-
+            final = f"**Answer (LLM synthesis):** {llm_text}\n\n**Citations:** {header_cites}{coverage_note}"
             if include_passages:
-
-
-
-
-
-
-
-
-
-
+                final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+        else:
+            if not extractive:
+                final = f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+            else:
+                final = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
+                if include_passages:
+                    final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+    else:
+        if not extractive:
+            final = f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+        else:
+            final = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
+            if include_passages:
+                final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+
+    # --------- Log full run ---------
+    prompt_toks = llm_usage.get("prompt_tokens") if llm_usage else None
+    completion_toks = llm_usage.get("completion_tokens") if llm_usage else None
+    cost_usd = _calc_cost_usd(prompt_toks, completion_toks)
+
+    total_ms = int((time.time() - t0_total) * 1000)
+    record = {
+        "run_id": run_id,
+        "ts": int(time.time()*1000),
+        "inputs": {
+            "question": question, "top_k": int(k), "n_sentences": int(n_sentences),
+            "w_tfidf": float(w_tfidf), "w_bm25": float(w_bm25), "w_emb": float(w_emb),
+            "use_llm": bool(use_llm), "model": model, "temperature": float(temperature)
+        },
+        "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
+        "output": {
+            "final_answer": final,
+            "used_sentences": [{"sent": s["sent"], "doc": s["doc"], "page": s["page"]} for s in selected]
+        },
+        "latency_ms_total": total_ms,
+        "latency_ms_llm": llm_latency_ms,
+        "openai": {
+            "prompt_tokens": prompt_toks,
+            "completion_tokens": completion_toks,
+            "cost_usd": cost_usd
+        } if use_llm else None
+    }
+    _safe_write_jsonl(LOG_PATH, record)
+    return final

 def rag_chat_fn(message, history, top_k, n_sentences, include_passages,
                 use_llm, model_name, temperature, strict_quotes_only,
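Note: with this hunk, every rag_reply call appends one JSON record (inputs, per-hit retrieval scores, output, latencies, token usage) to rag_logs.jsonl. A minimal sketch of summarizing that log offline, assuming pandas is available and using "artifacts/" as a stand-in for ARTIFACT_DIR, which is defined elsewhere in app.py and not shown in this diff:

# --- sketch: summarize rag_logs.jsonl (not part of the commit) ---
import json
from pathlib import Path

import pandas as pd

log_path = Path("artifacts/rag_logs.jsonl")  # assumption: ARTIFACT_DIR resolves here

rows = []
with open(log_path, encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        usage = rec.get("openai") or {}  # None when the LLM was not used
        rows.append({
            "run_id": rec["run_id"],
            "question": rec["inputs"]["question"],
            "n_hits": len(rec["retrieval"]["hits"]),
            "latency_ms_total": rec["latency_ms_total"],
            "cost_usd": usage.get("cost_usd"),
        })

print(pd.DataFrame(rows).sort_values("latency_ms_total", ascending=False).head())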
@@ -664,7 +791,7 @@ input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !important; }
 .gr-checkbox label, .gr-check-radio label { pointer-events: auto !important; cursor: pointer; }
 #rag-tab input[type="checkbox"] { accent-color: #60a5fa !important; }

-/* RAG tab
+/* RAG tab styling */
 #rag-tab .block, #rag-tab .group, #rag-tab .accordion {
   background: linear-gradient(160deg, #1f2937 0%, #14532d 55%, #0b3b68 100%) !important;
   border-radius: 12px;
@@ -691,6 +818,35 @@ input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !important; }
   color: #eef6ff !important;
 }

+/* NEW — Evaluate tab dark/high-contrast styling */
+#eval-tab .block, #eval-tab .group, #eval-tab .accordion {
+  background: linear-gradient(165deg, #0a0f1f 0%, #0d1a31 60%, #0a1c2e 100%) !important;
+  border-radius: 12px;
+  border: 1px solid rgba(139, 197, 255, 0.28);
+}
+#eval-tab label, #eval-tab .markdown, #eval-tab .prose, #eval-tab p, #eval-tab span {
+  color: #e6f2ff !important;
+}
+#eval-tab input, #eval-tab .gr-file, #eval-tab .scroll-hide, #eval-tab textarea, #eval-tab select {
+  background: rgba(8, 13, 26, 0.9) !important;
+  border: 1px solid #3b82f6 !important;
+  color: #dbeafe !important;
+}
+#eval-tab input[type="range"] { accent-color: #22c55e !important; }
+#eval-tab button {
+  border-radius: 10px !important;
+  font-weight: 700 !important;
+  background: #0ea5e9 !important;
+  color: #001321 !important;
+  border: 1px solid #7dd3fc !important;
+}
+#eval-tab .gr-json, #eval-tab .markdown pre, #eval-tab .markdown code {
+  background: rgba(2, 6, 23, 0.85) !important;
+  color: #e2e8f0 !important;
+  border: 1px solid rgba(148, 163, 184, 0.3) !important;
+  border-radius: 10px !important;
+}
+
 /* Predictor output emphasis */
 #pred-out .wrap { font-size: 20px; font-weight: 700; color: #ecfdf5; }
 """
@@ -826,6 +982,62 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
         description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations."
     )

+    # ====== Evaluate (Gold vs Logs) — darker, higher-contrast ======
+    with gr.Tab("📏 Evaluate (Gold vs Logs)", elem_id="eval-tab"):
+        gr.Markdown("Upload your **gold.csv** and compute metrics against the app logs.")
+        with gr.Row():
+            gold_file = gr.File(label="gold.csv", file_types=[".csv"], interactive=True)
+            k_slider = gr.Slider(3, 12, value=8, step=1, label="k for Hit/Recall/nDCG")
+        with gr.Row():
+            btn_eval = gr.Button("Compute Metrics", variant="primary")
+        with gr.Row():
+            out_perq = gr.File(label="Per-question metrics (CSV)")
+            out_agg = gr.File(label="Aggregate metrics (JSON)")
+        out_json = gr.JSON(label="Aggregate summary")
+        out_log = gr.Markdown(label="Run log")
+
+        def _run_eval_inproc(gold_path: str, k: int = 8):
+            import json as _json
+            out_dir = str(ARTIFACT_DIR)
+            logs = str(LOG_PATH)
+            cmd = [
+                sys.executable, "rag_eval_metrics.py",
+                "--gold_csv", gold_path,
+                "--logs_jsonl", logs,
+                "--k", str(k),
+                "--out_dir", out_dir
+            ]
+            try:
+                p = subprocess.run(cmd, capture_output=True, text=True, check=False)
+                stdout = p.stdout or ""
+                stderr = p.stderr or ""
+                perq = ARTIFACT_DIR / "metrics_per_question.csv"
+                agg = ARTIFACT_DIR / "metrics_aggregate.json"
+                agg_json = {}
+                if agg.exists():
+                    agg_json = _json.loads(agg.read_text(encoding="utf-8"))
+                report = "```\n" + (stdout.strip() or "(no stdout)") + ("\n" + stderr.strip() if stderr else "") + "\n```"
+                return (str(perq) if perq.exists() else None,
+                        str(agg) if agg.exists() else None,
+                        agg_json,
+                        report)
+            except Exception as e:
+                return (None, None, {}, f"**Eval error:** {e}")
+
+        def _eval_wrapper(gf, k):
+            from pathlib import Path
+            if gf is None:
+                default_gold = Path("gold.csv")
+                if not default_gold.exists():
+                    return None, None, {}, "**No gold.csv provided or found in repo root.**"
+                gold_path = str(default_gold)
+            else:
+                gold_path = gf.name
+            return _run_eval_inproc(gold_path, int(k))
+
+        btn_eval.click(_eval_wrapper, inputs=[gold_file, k_slider],
+                       outputs=[out_perq, out_agg, out_json, out_log])
+
 # ------------- Launch -------------
 if __name__ == "__main__":
     demo.queue().launch()
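Note: the Evaluate tab shells out to rag_eval_metrics.py, which is not part of this commit, so the exact gold.csv schema and metric definitions live there. Purely as an illustration of the Hit/Recall/nDCG@k family the slider refers to, here is a generic nDCG@k over one logged hit list, under the assumption that a (doc, page) pair counts as relevant when it appears in the gold set:

# --- sketch: generic nDCG@k (illustrative, not the script's actual code) ---
import math
from typing import List, Set, Tuple

def ndcg_at_k(ranked: List[Tuple[str, int]], gold: Set[Tuple[str, int]], k: int) -> float:
    """ranked: (doc, page) pairs in retrieval order; gold: the relevant pairs."""
    gains = [1.0 if pair in gold else 0.0 for pair in ranked[:k]]
    dcg = sum(g / math.log2(i + 2) for i, g in enumerate(gains))
    ideal = sum(1.0 / math.log2(i + 2) for i in range(min(len(gold), k)))
    return dcg / ideal if ideal else 0.0

ranked = [("paper1.pdf", 3), ("paper2.pdf", 7), ("paper1.pdf", 4)]
gold = {("paper1.pdf", 3), ("paper1.pdf", 4)}
print(f"nDCG@3 = {ndcg_at_k(ranked, gold, 3):.3f}")  # 0.920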
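One operational detail worth calling out: OPENAI_COST_IN_PER_1K and OPENAI_COST_OUT_PER_1K default to "0", so cost_usd in the logs stays 0.0 until you set them (e.g. as Space variables) to your model's per-1K-token rates. The rates below are placeholders, not real prices; the arithmetic mirrors _calc_cost_usd from the diff:

# --- sketch: what _calc_cost_usd returns for given rates (placeholder rates) ---
OPENAI_IN_COST_PER_1K = 0.0005    # hypothetical USD per 1K prompt tokens
OPENAI_OUT_COST_PER_1K = 0.0015   # hypothetical USD per 1K completion tokens

prompt_toks, completion_toks = 1200, 300
cost_usd = (prompt_toks / 1000.0) * OPENAI_IN_COST_PER_1K + (completion_toks / 1000.0) * OPENAI_OUT_COST_PER_1K
print(f"${cost_usd:.5f}")  # $0.00105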