Update app.py
Browse files
app.py
CHANGED
|
@@ -35,6 +35,19 @@ import uuid
|
|
| 35 |
from pathlib import Path
|
| 36 |
from urllib.parse import quote
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
# ---------------------------------------------------------------------------
|
| 39 |
# Agent import — graceful stub when agent.py is absent during dev/testing
|
| 40 |
# ---------------------------------------------------------------------------
|
|
@@ -86,6 +99,7 @@ EMPTY_REVIEW_DF = pd.DataFrame(columns=REVIEW_COLUMNS)
|
|
| 86 |
MISTRAL_KEY_MISSING = not bool(os.environ.get("MISTRAL_API_KEY", ""))
|
| 87 |
GROQ_KEY_MISSING = not bool(os.environ.get("GROQ_API_KEY", ""))
|
| 88 |
UPLOADS_DIR = Path("uploads")
|
|
|
|
| 89 |
OUTPUTS_DIR = Path(__file__).resolve().parent / "outputs"
|
| 90 |
|
| 91 |
# ---------------------------------------------------------------------------
|
|
@@ -596,6 +610,198 @@ def build_file_list_html(paths: list[str]) -> str:
|
|
| 596 |
return "\n".join(items)
|
| 597 |
|
| 598 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 599 |
# ---------------------------------------------------------------------------
|
| 600 |
# Helper — placeholder chart HTML
|
| 601 |
# ---------------------------------------------------------------------------
|
|
@@ -619,6 +825,158 @@ def build_placeholder_chart(chart_type: str) -> str:
|
|
| 619 |
<style>@keyframes grow {{ from{{width:0%}} to{{width:75%}} }}</style>"""
|
| 620 |
|
| 621 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 622 |
# ---------------------------------------------------------------------------
|
| 623 |
# Core interaction handlers
|
| 624 |
# ---------------------------------------------------------------------------
|
|
@@ -719,12 +1077,24 @@ def submit_review(review_df, agent_state: dict, chat_history: list):
|
|
| 719 |
FIX BUG 3 β write parsed review rows into agent_state["review_df"]
|
| 720 |
BEFORE calling the agent, so _parse_review_df() receives the populated list.
|
| 721 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
# Store the review table in state so agent.py can read it
|
| 723 |
agent_state["review_df"] = review_df.to_dict(orient="records")
|
| 724 |
agent_state["review_submitted"] = True
|
| 725 |
|
| 726 |
# Send a short trigger message β the agent reads state, not the payload
|
| 727 |
-
msg =
|
| 728 |
results = []
|
| 729 |
for state in handle_chat(msg, chat_history, agent_state):
|
| 730 |
results = state
|
|
@@ -732,6 +1102,39 @@ def submit_review(review_df, agent_state: dict, chat_history: list):
|
|
| 732 |
return new_history, new_state, phase_html
|
| 733 |
|
| 734 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 735 |
def refresh_downloads(agent_state: dict):
|
| 736 |
"""Return downloadable artefact paths from agent state."""
|
| 737 |
files = agent_state.get("output_files", [])
|
|
@@ -868,6 +1271,10 @@ def build_app() -> gr.Blocks:
|
|
| 868 |
with gr.Column(elem_classes=["panel-card", "panel-results"]):
|
| 869 |
gr.HTML("""<div class="card-title"><span>Results</span></div>""")
|
| 870 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 871 |
with gr.Tabs(elem_classes=["tabs"]):
|
| 872 |
|
| 873 |
# ββ Tab 1: Review Table βββββββββββββββββββββββββββββ
|
|
@@ -875,8 +1282,11 @@ def build_app() -> gr.Blocks:
|
|
| 875 |
gr.HTML("""
|
| 876 |
<p style='font-size:0.78rem;color:var(--text-muted);margin:0 0 12px;'>
|
| 877 |
Edit <b>Approve</b>, <b>Rename To</b>, and <b>Reasoning</b> columns inline,
|
|
|
|
| 878 |
then click <b>Submit Review</b>. Use <b>verify</b> in chat at Phase 2
|
| 879 |
or Phase 5.5 to see Mistral vs Groq comparisons directly in chat output.
|
|
|
|
|
|
|
| 880 |
</p>""")
|
| 881 |
|
| 882 |
review_table = gr.Dataframe(
|
|
@@ -905,6 +1315,11 @@ def build_app() -> gr.Blocks:
|
|
| 905 |
elem_classes=["btn-success"],
|
| 906 |
)
|
| 907 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 908 |
# ββ Tab 2: Charts βββββββββββββββββββββββββββββββββββ
|
| 909 |
with gr.TabItem("Charts", elem_classes=["tabitem"]):
|
| 910 |
chart_selector = gr.Dropdown(
|
|
@@ -941,6 +1356,70 @@ def build_app() -> gr.Blocks:
|
|
| 941 |
elem_classes=["btn-secondary"],
|
| 942 |
)
|
| 943 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 944 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 945 |
# Event wiring
|
| 946 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -1010,9 +1489,44 @@ def build_app() -> gr.Blocks:
|
|
| 1010 |
refresh_review_table(a),
|
| 1011 |
*refresh_downloads(a),
|
| 1012 |
get_chart_html(selected_chart, a),
|
|
|
|
|
|
|
| 1013 |
),
|
| 1014 |
inputs=[chart_selector, agent_state],
|
| 1015 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1016 |
)
|
| 1017 |
|
| 1018 |
return app
|
|
|
|
| 35 |
from pathlib import Path
|
| 36 |
from urllib.parse import quote
|
| 37 |
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
# Method extraction tools — direct invocation (standalone tab, no agent)
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
try:
|
| 42 |
+
from tools import (
|
| 43 |
+
extract_methods_from_pdfs,
|
| 44 |
+
OUTPUT_DIR as TOOLS_OUTPUT_DIR,
|
| 45 |
+
_load_json as tools_load_json,
|
| 46 |
+
)
|
| 47 |
+
METHOD_TOOLS_AVAILABLE = True
|
| 48 |
+
except ImportError:
|
| 49 |
+
METHOD_TOOLS_AVAILABLE = False
|
| 50 |
+
|
| 51 |
# ---------------------------------------------------------------------------
|
| 52 |
# Agent import — graceful stub when agent.py is absent during dev/testing
|
| 53 |
# ---------------------------------------------------------------------------
|
|
|
|
| 99 |
MISTRAL_KEY_MISSING = not bool(os.environ.get("MISTRAL_API_KEY", ""))
|
| 100 |
GROQ_KEY_MISSING = not bool(os.environ.get("GROQ_API_KEY", ""))
|
| 101 |
UPLOADS_DIR = Path("uploads")
|
| 102 |
+
PDF_UPLOADS_DIR = Path("uploads") / "pdfs"
|
| 103 |
OUTPUTS_DIR = Path(__file__).resolve().parent / "outputs"
|
| 104 |
|
| 105 |
# ---------------------------------------------------------------------------
|
|
|
|
| 610 |
return "\n".join(items)
|
| 611 |
|
| 612 |
|
| 613 |
+
# ---------------------------------------------------------------------------
|
| 614 |
+
# Helper — cluster stats HTML
|
| 615 |
+
# ---------------------------------------------------------------------------
|
| 616 |
+
def build_cluster_stats_html(agent_state: dict) -> str:
    """Render a before/after comparison of the clustering optimization rounds.

    Reads ``optimization.json`` from ``OUTPUTS_DIR / run_key`` (``run_key`` is
    taken from *agent_state* and defaults to ``"abstract"``) and compares the
    first and the last recorded round.

    Args:
        agent_state: Agent state mapping; only the ``"run_key"`` key is read.

    Returns:
        An HTML fragment: the two-card comparison, or a short muted notice
        when the stats file is missing, unreadable, or holds no rounds.
    """
    import math  # function-scope imports keep this block self-contained
    from html import escape

    def _notice(message: str) -> str:
        # Muted one-line paragraph used for every "nothing to show" case.
        return (
            "<p style='color:var(--text-muted);font-size:0.83rem;padding:6px 0 2px;'>"
            f"{message}"
            "</p>"
        )

    def _number(value: object, default: float = 0.0) -> float:
        # Coerce a JSON value to a finite float so formatting can never raise.
        try:
            number = float(value)
        except (TypeError, ValueError):
            return default
        return number if math.isfinite(number) else default

    def _field(entry: object, key: str) -> object:
        # A malformed (non-dict) round yields None, which renders as "no data".
        return entry.get(key, {}) if isinstance(entry, dict) else None

    def _cluster_count(entry: object) -> int:
        metrics = _field(entry, "metrics")
        if not isinstance(metrics, dict):
            return 0
        return int(_number(metrics.get("n_clusters", 0)))

    def _metrics_block(metrics: object) -> str:
        if not isinstance(metrics, dict):
            return "<div style='color:var(--text-muted);'>No metrics</div>"
        return (
            "<div style='display:grid;gap:4px;font-size:0.78rem;'>"
            f"<div>Clusters: <b>{int(_number(metrics.get('n_clusters', 0)))}</b></div>"
            f"<div>Noise ratio: <b>{_number(metrics.get('noise_ratio', 0.0)):.2f}</b></div>"
            f"<div>Min/Med/Mean/Max size: <b>{_number(metrics.get('min_size', 0)):.0f}</b> / "
            f"<b>{_number(metrics.get('median_size', 0)):.0f}</b> / "
            f"<b>{_number(metrics.get('mean_size', 0)):.0f}</b> / "
            f"<b>{_number(metrics.get('max_size', 0)):.0f}</b></div>"
            "</div>"
        )

    def _params_line(params: object) -> str:
        if not isinstance(params, dict):
            return ""
        # Values come from a JSON file; escape them before embedding in HTML.
        shown = {
            name: escape(str(params.get(name, "")))
            for name in ("min_cluster_size", "max_cluster_size", "min_samples")
        }
        return (
            f"min_cluster_size={shown['min_cluster_size']}, "
            f"max_cluster_size={shown['max_cluster_size']}, "
            f"min_samples={shown['min_samples']}"
        )

    run_key = agent_state.get("run_key", "abstract")
    opt_path = OUTPUTS_DIR / run_key / "optimization.json"
    if not opt_path.exists():
        return _notice(
            "No clustering stats yet. Run topic discovery to generate optimization stats."
        )

    try:
        rounds = json.loads(opt_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # Unreadable, badly encoded or invalid JSON: treat as "no stats".
        rounds = []

    if not isinstance(rounds, list) or not rounds:
        return _notice("Optimization stats are unavailable or empty.")

    before_round, after_round = rounds[0], rounds[-1]
    # The "(more)" card always shows the round with the larger cluster count,
    # so swap when the last round ended up with more clusters than the first.
    if _cluster_count(after_round) > _cluster_count(before_round):
        before_round, after_round = after_round, before_round

    before_label = "Before optimization (more)"
    if len(rounds) > 1:
        after_label = "After optimization (less)"
    else:
        after_label = "After optimization (no change)"

    before_params = _params_line(_field(before_round, "params"))
    before_metrics = _metrics_block(_field(before_round, "metrics"))
    after_params = _params_line(_field(after_round, "params"))
    after_metrics = _metrics_block(_field(after_round, "metrics"))

    return f"""
    <div style='display:grid;gap:10px;'>
    <div style='font-size:0.82rem;color:var(--text-secondary);font-weight:600;'>Cluster stats</div>
    <div style='display:grid;grid-template-columns:1fr 1fr;gap:12px;'>
    <div style='background:var(--bg-elevated);border:1px solid var(--border);border-radius:10px;padding:10px 12px;'>
    <div style='font-size:0.78rem;color:var(--text-secondary);margin-bottom:6px;'>{before_label}</div>
    <div style='font-size:0.74rem;color:var(--text-muted);margin-bottom:6px;'>
    {before_params}
    </div>
    {before_metrics}
    </div>
    <div style='background:var(--bg-elevated);border:1px solid var(--border);border-radius:10px;padding:10px 12px;'>
    <div style='font-size:0.78rem;color:var(--text-secondary);margin-bottom:6px;'>{after_label}</div>
    <div style='font-size:0.74rem;color:var(--text-muted);margin-bottom:6px;'>
    {after_params}
    </div>
    {after_metrics}
    </div>
    </div>
    </div>"""
|
| 694 |
+
|
| 695 |
+
|
| 696 |
+
# ---------------------------------------------------------------------------
|
| 697 |
+
# Helper — cluster info HTML
|
| 698 |
+
# ---------------------------------------------------------------------------
|
| 699 |
+
def build_cluster_info_html(agent_state: dict) -> str:
    """Render one collapsible card per labeled cluster.

    Reads ``summaries.json`` and, when present, ``labels.json`` from
    ``OUTPUTS_DIR / run_key`` (``run_key`` is taken from *agent_state* and
    defaults to ``"abstract"``).  A cluster is shown only when a non-empty
    label is found for its ``cluster_id``; the label is the first non-empty
    of ``adjudicated_label``, ``mistral_label`` and ``label``.

    Args:
        agent_state: Agent state mapping; only the ``"run_key"`` key is read.

    Returns:
        An HTML fragment: the list of cluster cards, or a short muted notice
        when summaries are missing, empty, or none of them has a label.
    """
    from html import escape  # function-scope import keeps the block self-contained

    def _notice(message: str) -> str:
        # Muted one-line paragraph used for every "nothing to show" case.
        return (
            "<p style='color:var(--text-muted);font-size:0.83rem;padding:6px 0 2px;'>"
            f"{message}"
            "</p>"
        )

    def _load_json(path) -> object:
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Unreadable, badly encoded or invalid JSON: treat as empty.
            return []

    def _to_int(value: object, default: int) -> int:
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default

    def _escape_html(text: object) -> str:
        # Escapes &, < and > so file-derived text cannot inject markup.
        return escape(str(text or ""), quote=False)

    run_key = agent_state.get("run_key", "abstract")
    summaries_path = OUTPUTS_DIR / run_key / "summaries.json"
    labels_path = OUTPUTS_DIR / run_key / "labels.json"

    if not summaries_path.exists():
        return _notice(
            "No clusters yet. Run topic discovery to generate cluster summaries."
        )

    summaries = _load_json(summaries_path)
    if not isinstance(summaries, list) or not summaries:
        return _notice("Cluster summaries are empty.")

    labels = _load_json(labels_path) if labels_path.exists() else []
    if not isinstance(labels, list):
        labels = []

    label_by_id = {
        _to_int(row.get("cluster_id", -1), -1): (
            row.get("adjudicated_label")
            or row.get("mistral_label")
            or row.get("label")
            or ""
        )
        for row in labels
        if isinstance(row, dict)
    }

    def _format_papers(papers: object) -> str:
        if not isinstance(papers, list):
            return ""
        items = []
        for entry in papers[:3]:  # only the top three papers are listed
            if not isinstance(entry, dict):
                continue
            title = str(entry.get("paper_title") or entry.get("title") or "").strip()
            if not title:
                continue
            count = entry.get("count")
            if count:
                items.append(f"{_escape_html(title)} ({_escape_html(count)})")
            else:
                items.append(_escape_html(title))
        return "; ".join(items)

    def _cluster_card(summary: object) -> str:
        if not isinstance(summary, dict):
            return ""
        cid = _to_int(summary.get("cluster_id", -1), -1)
        label = _escape_html(label_by_id.get(cid, ""))
        if not label:
            # Unlabeled clusters are skipped entirely.
            return ""

        size = _to_int(summary.get("size", 0), 0)
        evidence = summary.get("evidence", [])
        has_evidence = isinstance(evidence, (list, tuple)) and bool(evidence)
        top_evidence = _escape_html(evidence[0]) if has_evidence else ""
        paper_count = escape(str(summary.get("paper_count", "")), quote=False)
        top_papers = _format_papers(summary.get("top_papers", []))

        return (
            "<details style='background:var(--bg-elevated);border:1px solid var(--border);"
            "border-radius:10px;padding:10px 12px;'>"
            "<summary style='cursor:pointer;font-size:0.84rem;font-weight:600;color:var(--text-primary);'>"
            f"Cluster {cid} — {label} ({size} sentences)</summary>"
            "<div style='margin-top:8px;font-size:0.78rem;color:var(--text-secondary);display:grid;gap:6px;'>"
            f"<div><b>Top evidence:</b> {top_evidence}</div>"
            f"<div><b>Papers:</b> {paper_count} | {top_papers}</div>"
            "</div>"
            "</details>"
        )

    cards = "\n".join(card for card in map(_cluster_card, summaries) if card)
    if not cards:
        return _notice(
            "No labeled clusters yet. Run labeling or VERIFY to populate labels."
        )
    return (
        "<div style='display:grid;gap:10px;'>"
        "<div style='font-size:0.82rem;color:var(--text-secondary);font-weight:600;'>"
        "Cluster details</div>"
        f"{cards}"
        "</div>"
    )
|
| 803 |
+
|
| 804 |
+
|
| 805 |
# ---------------------------------------------------------------------------
|
| 806 |
# Helper — placeholder chart HTML
|
| 807 |
# ---------------------------------------------------------------------------
|
|
|
|
| 825 |
<style>@keyframes grow {{ from{{width:0%}} to{{width:75%}} }}</style>"""
|
| 826 |
|
| 827 |
|
| 828 |
+
# ---------------------------------------------------------------------------
|
| 829 |
+
# Method Extraction — helper functions
|
| 830 |
+
# ---------------------------------------------------------------------------
|
| 831 |
+
|
| 832 |
+
def build_method_stats_html(result: dict) -> str:
    """Summarise a method-extraction run as a two-card stats grid.

    Args:
        result: Extraction result mapping.  ``"n_papers"`` and
            ``"n_extracted"`` are read (each defaulting to 0); a falsy
            mapping or a truthy ``"error"`` entry selects the prompt text.

    Returns:
        The stats-grid HTML, or a muted prompt paragraph when there is no
        usable result.
    """
    has_result = bool(result) and not result.get("error")
    if not has_result:
        prompt = (
            "<p style='color:var(--text-muted);font-size:0.83rem;padding:6px 0;'>"
            "Upload PDFs and click <b>Run Method Extraction</b> to start."
            "</p>"
        )
        return prompt

    processed = result.get("n_papers", 0)
    identified = result.get("n_extracted", 0)
    return f"""
    <div class="stats-grid fade-in" style="grid-template-columns:1fr 1fr;">
    <div class="stat-card accent">
    <div class="stat-value">{processed}</div>
    <div class="stat-label">PDFs Processed</div>
    </div>
    <div class="stat-card success">
    <div class="stat-value">{identified}</div>
    <div class="stat-label">Methods Identified</div>
    </div>
    </div>
    """
|
| 854 |
+
|
| 855 |
+
|
| 856 |
+
def get_method_results_df() -> pd.DataFrame:
    """Load the per-paper method summary table.

    Reads ``method_summary.csv`` from ``OUTPUTS_DIR / "methods"``.  Any
    expected column missing from the file is added as empty strings, and only
    the expected columns are returned, in a fixed order.

    Returns:
        The summary frame, or an empty frame with the expected columns when
        the file does not exist or cannot be read.
    """
    expected = ["Paper ID", "Paper Title", "Computational Methods"]
    source = OUTPUTS_DIR / "methods" / "method_summary.csv"

    if not source.exists():
        return pd.DataFrame(columns=expected)

    try:
        frame = pd.read_csv(source)
    except Exception:
        # Best effort: an unreadable CSV is shown as an empty table.
        return pd.DataFrame(columns=expected)

    absent = [name for name in expected if name not in frame.columns]
    for name in absent:
        frame[name] = ""
    return frame[expected]
|
| 874 |
+
|
| 875 |
+
|
| 876 |
+
def get_method_technique_df() -> pd.DataFrame:
    """Load the technique-to-papers table.

    Reads ``technique_to_papers.csv`` from ``OUTPUTS_DIR / "methods"``.  Any
    expected column missing from the file is added as empty strings, and only
    the expected columns are returned, in a fixed order.

    Returns:
        The technique frame, or an empty frame with the expected columns when
        the file does not exist or cannot be read.
    """
    expected = ["Main Computational Technique", "Algorithms", "Papers"]
    source = OUTPUTS_DIR / "methods" / "technique_to_papers.csv"

    if not source.exists():
        return pd.DataFrame(columns=expected)

    try:
        frame = pd.read_csv(source)
    except Exception:
        # Best effort: an unreadable CSV is shown as an empty table.
        return pd.DataFrame(columns=expected)

    absent = [name for name in expected if name not in frame.columns]
    for name in absent:
        frame[name] = ""
    return frame[expected]
|
| 890 |
+
|
| 891 |
+
|
| 892 |
+
def get_method_download_file() -> "list[str] | None":
    """Return the downloadable technique report, if it has been generated.

    Returns:
        A one-element list holding the path of ``technique_to_papers.csv``
        under ``OUTPUTS_DIR / "methods"``, or ``None`` when that file does
        not exist.
    """
    technique_path = OUTPUTS_DIR / "methods" / "technique_to_papers.csv"
    return [str(technique_path)] if technique_path.exists() else None
|
| 898 |
+
|
| 899 |
+
|
| 900 |
+
# ---------------------------------------------------------------------------
|
| 901 |
+
# Method Extraction — interaction handlers
|
| 902 |
+
# ---------------------------------------------------------------------------
|
| 903 |
+
|
| 904 |
+
def handle_pdf_upload(file_objs):
    """Copy uploaded PDFs into ``PDF_UPLOADS_DIR``, replacing earlier uploads.

    Args:
        file_objs: Uploaded files; each item is either an object exposing a
            ``name`` attribute holding its path, or a plain path.  Items
            whose suffix is not ``.pdf`` (case-insensitive) are ignored.

    Returns:
        A ``(status_html, stats_html)`` tuple for the status pill and the
        upload stats panel.
    """
    if not file_objs:
        return (
            "<div class='status-pill idle'><div class='dot'></div>No PDFs uploaded</div>",
            "<p style='color:var(--text-muted);font-size:0.83rem;'>Upload PDF research papers to extract methods.</p>",
        )

    PDF_UPLOADS_DIR.mkdir(parents=True, exist_ok=True)

    # Clear previous uploads.  The suffix test is case-insensitive so that it
    # removes exactly what the copy loop below accepts (e.g. "x.Pdf").
    for stale in PDF_UPLOADS_DIR.iterdir():
        if stale.is_file() and stale.suffix.lower() == ".pdf":
            stale.unlink()

    count = 0
    for f in file_objs:
        src = Path(f.name) if hasattr(f, "name") else Path(f)
        if src.suffix.lower() != ".pdf":
            continue
        # A random prefix avoids collisions between same-named uploads; the
        # suffix is stored in lowercase so "*.pdf" globs always find the copy.
        dst = PDF_UPLOADS_DIR / f"{uuid.uuid4().hex[:8]}_{src.stem}.pdf"
        shutil.copy2(src, dst)
        count += 1

    status = f"<div class='status-pill ready'><div class='dot'></div>{count} PDFs ready</div>"
    stats = f"""
    <div class="stats-grid fade-in">
    <div class="stat-card accent">
    <div class="stat-value">{count}</div>
    <div class="stat-label">PDFs Uploaded</div>
    </div>
    </div>"""
    return status, stats
|
| 936 |
+
|
| 937 |
+
|
| 938 |
+
def run_method_extraction_pipeline():
    """Run method extraction over the uploaded PDFs and build the UI outputs.

    Calls ``extract_methods_from_pdfs.invoke`` with the resolved
    ``PDF_UPLOADS_DIR`` path, unless the tools import failed or no PDF is
    present in that directory.

    Returns:
        A 4-tuple ``(stats_html, status_html, technique_df, download_files)``
        matching the outputs wired to the extraction button.
    """
    from html import escape  # function-scope import keeps the block self-contained

    def _outputs(stats_html: str, status_label: str, pill: str = "idle"):
        # Every exit path refreshes the table and the download list from disk.
        return (
            stats_html,
            f"<div class='status-pill {pill}'><div class='dot'></div>{status_label}</div>",
            get_method_technique_df(),
            get_method_download_file(),
        )

    if not METHOD_TOOLS_AVAILABLE:
        return _outputs(build_method_stats_html({"error": True}), "Tools unavailable")

    # Case-insensitive suffix test, consistent with how uploads are accepted.
    has_pdfs = PDF_UPLOADS_DIR.is_dir() and any(
        entry.suffix.lower() == ".pdf" for entry in PDF_UPLOADS_DIR.iterdir()
    )
    if not has_pdfs:
        return _outputs(
            "<p style='color:var(--danger);font-size:0.83rem;'>No PDFs found. Upload PDFs first.</p>",
            "No PDFs",
        )

    # Step 1: Extract + LLM Processing
    pdf_dir = str(PDF_UPLOADS_DIR.resolve())
    result = extract_methods_from_pdfs.invoke({"pdf_dir": pdf_dir})

    if isinstance(result, dict) and result.get("error"):
        # The error text may contain file names or markup; escape it before
        # embedding it in the HTML panel.
        message = escape(str(result["error"]))
        return _outputs(
            f"<p style='color:var(--danger);font-size:0.83rem;'>{message}</p>",
            "Extraction failed",
        )

    # Build UI outputs
    return _outputs(build_method_stats_html(result), "Extraction complete", pill="ready")
|
| 978 |
+
|
| 979 |
+
|
| 980 |
# ---------------------------------------------------------------------------
|
| 981 |
# Core interaction handlers
|
| 982 |
# ---------------------------------------------------------------------------
|
|
|
|
| 1077 |
FIX BUG 3 β write parsed review rows into agent_state["review_df"]
|
| 1078 |
BEFORE calling the agent, so _parse_review_df() receives the populated list.
|
| 1079 |
"""
|
| 1080 |
+
def _next_phase_message(state: dict) -> str:
|
| 1081 |
+
gate = state.get("stop_gate")
|
| 1082 |
+
if gate == "STOP_GATE_1_AWAIT_REVIEW_TABLE":
|
| 1083 |
+
return "Review table submitted. Please proceed to Phase 3 and consolidate themes."
|
| 1084 |
+
if gate == "STOP_GATE_2_AWAIT_THEME_MERGE":
|
| 1085 |
+
return "Theme merge confirmed. Please proceed to Phase 4 for saturation check."
|
| 1086 |
+
if gate == "STOP_GATE_3_AWAIT_SATURATION_SIGNOFF":
|
| 1087 |
+
return "Saturation sign-off confirmed. Please proceed to Phase 5 for naming themes."
|
| 1088 |
+
if gate == "STOP_GATE_4_AWAIT_TAXONOMY_REVIEW":
|
| 1089 |
+
return "Taxonomy review confirmed. Please proceed to Phase 6 to finalize outputs."
|
| 1090 |
+
return "Review table submitted. Please proceed to the next phase."
|
| 1091 |
+
|
| 1092 |
# Store the review table in state so agent.py can read it
|
| 1093 |
agent_state["review_df"] = review_df.to_dict(orient="records")
|
| 1094 |
agent_state["review_submitted"] = True
|
| 1095 |
|
| 1096 |
# Send a short trigger message β the agent reads state, not the payload
|
| 1097 |
+
msg = _next_phase_message(agent_state)
|
| 1098 |
results = []
|
| 1099 |
for state in handle_chat(msg, chat_history, agent_state):
|
| 1100 |
results = state
|
|
|
|
| 1102 |
return new_history, new_state, phase_html
|
| 1103 |
|
| 1104 |
|
| 1105 |
+
def auto_accept_review(agent_state: dict, chat_history: list, enabled: bool):
    """Approve every pending review row and submit it, when auto-accept is on.

    Nothing is submitted unless *enabled* is true, the state's ``stop_gate``
    is the review-table gate, the review has not already been submitted, this
    gate has not already been auto-accepted, and ``review_df`` holds rows.

    Args:
        agent_state: Agent state mapping (read, and updated on submission).
        chat_history: Current chat history, passed through to the submit step.
        enabled: Value of the auto-accept checkbox.

    Returns:
        A ``(chat_history, agent_state, phase_html)`` tuple; the inputs are
        returned unchanged when nothing is submitted.
    """
    review_gate = "STOP_GATE_1_AWAIT_REVIEW_TABLE"
    current_gate = agent_state.get("stop_gate")
    pending_rows = agent_state.get("review_df", [])

    nothing_to_do = (
        not enabled
        or current_gate != review_gate
        or agent_state.get("review_submitted")
        or agent_state.get("auto_accept_last_gate") == current_gate
        or not pending_rows
    )
    if nothing_to_do:
        return chat_history, agent_state, build_phase_html(agent_state.get("phase", 0))

    table = pd.DataFrame(pending_rows)
    if "Approve" in table.columns:
        table["Approve"] = True
    if "Rename To" in table.columns and "Topic Label" in table.columns:
        # An empty rename falls back to the existing topic label.
        table["Rename To"] = table["Rename To"].fillna("").astype(str)
        table["Rename To"] = [
            renamed or topic
            for renamed, topic in zip(table["Rename To"], table["Topic Label"])
        ]

    updated_history, updated_state, phase_html = submit_review(
        table, agent_state, chat_history
    )
    # Remember the gate so the same review is not auto-submitted twice.
    updated_state["auto_accept_last_gate"] = current_gate
    return updated_history, updated_state, phase_html
|
| 1136 |
+
|
| 1137 |
+
|
| 1138 |
def refresh_downloads(agent_state: dict):
|
| 1139 |
"""Return downloadable artefact paths from agent state."""
|
| 1140 |
files = agent_state.get("output_files", [])
|
|
|
|
| 1271 |
with gr.Column(elem_classes=["panel-card", "panel-results"]):
|
| 1272 |
gr.HTML("""<div class="card-title"><span>Results</span></div>""")
|
| 1273 |
|
| 1274 |
+
cluster_stats = gr.HTML(
|
| 1275 |
+
value=build_cluster_stats_html({}),
|
| 1276 |
+
)
|
| 1277 |
+
|
| 1278 |
with gr.Tabs(elem_classes=["tabs"]):
|
| 1279 |
|
| 1280 |
# ββ Tab 1: Review Table βββββββββββββββββββββββββββββ
|
|
|
|
| 1282 |
gr.HTML("""
|
| 1283 |
<p style='font-size:0.78rem;color:var(--text-muted);margin:0 0 12px;'>
|
| 1284 |
Edit <b>Approve</b>, <b>Rename To</b>, and <b>Reasoning</b> columns inline,
|
| 1285 |
+
and use the <b>Papers</b> column to see the top 3 paper titles per cluster.
|
| 1286 |
then click <b>Submit Review</b>. Use <b>verify</b> in chat at Phase 2
|
| 1287 |
or Phase 5.5 to see Mistral vs Groq comparisons directly in chat output.
|
| 1288 |
+
Phase 2 verification also adds an adjudicated best label.
|
| 1289 |
+
Enable <b>Auto-accept Phase 2 review</b> to skip manual submission.
|
| 1290 |
</p>""")
|
| 1291 |
|
| 1292 |
review_table = gr.Dataframe(
|
|
|
|
| 1315 |
elem_classes=["btn-success"],
|
| 1316 |
)
|
| 1317 |
|
| 1318 |
+
auto_accept_toggle = gr.Checkbox(
|
| 1319 |
+
label="Auto-accept Phase 2 review and continue",
|
| 1320 |
+
value=False,
|
| 1321 |
+
)
|
| 1322 |
+
|
| 1323 |
# ββ Tab 2: Charts βββββββββββββββββββββββββββββββββββ
|
| 1324 |
with gr.TabItem("Charts", elem_classes=["tabitem"]):
|
| 1325 |
chart_selector = gr.Dropdown(
|
|
|
|
| 1356 |
elem_classes=["btn-secondary"],
|
| 1357 |
)
|
| 1358 |
|
| 1359 |
+
# ββ Tab 4: Clusters βββββββββββββββββββββββββββββββββ
|
| 1360 |
+
with gr.TabItem("Clusters", elem_classes=["tabitem"]):
|
| 1361 |
+
cluster_info_html = gr.HTML(
|
| 1362 |
+
value=build_cluster_info_html({}),
|
| 1363 |
+
)
|
| 1364 |
+
|
| 1365 |
+
# ββ METHOD EXTRACTION β Standalone panel ββββββββββββββββββββββ
|
| 1366 |
+
with gr.Column(elem_classes=["panel-card"]):
|
| 1367 |
+
gr.HTML("""
|
| 1368 |
+
<div class="card-title">
|
| 1369 |
+
<span>π Computational Methodology Extraction</span>
|
| 1370 |
+
</div>
|
| 1371 |
+
<p style='font-size:0.78rem;color:var(--text-muted);margin:0 0 12px;'>
|
| 1372 |
+
Upload research PDFs to identify the specific computational methods
|
| 1373 |
+
used in each paper (text-only extraction via PyMuPDF + LLM).
|
| 1374 |
+
</p>
|
| 1375 |
+
""")
|
| 1376 |
+
|
| 1377 |
+
with gr.Row():
|
| 1378 |
+
with gr.Column(scale=1):
|
| 1379 |
+
pdf_upload = gr.File(
|
| 1380 |
+
label="Upload Research PDFs",
|
| 1381 |
+
file_types=[".pdf"],
|
| 1382 |
+
file_count="multiple",
|
| 1383 |
+
interactive=True,
|
| 1384 |
+
elem_id="pdf-upload",
|
| 1385 |
+
)
|
| 1386 |
+
with gr.Column(scale=1):
|
| 1387 |
+
method_status = gr.HTML(
|
| 1388 |
+
value="<div class='status-pill idle'><div class='dot'></div>Awaiting PDF upload</div>"
|
| 1389 |
+
)
|
| 1390 |
+
method_stats = gr.HTML(
|
| 1391 |
+
value="<p style='color:var(--text-muted);font-size:0.83rem;'>"
|
| 1392 |
+
"Upload PDF research papers to extract methods.</p>"
|
| 1393 |
+
)
|
| 1394 |
+
|
| 1395 |
+
run_methods_btn = gr.Button(
|
| 1396 |
+
"π Extract Computational Methods",
|
| 1397 |
+
variant="primary",
|
| 1398 |
+
elem_classes=["btn-primary"],
|
| 1399 |
+
)
|
| 1400 |
+
|
| 1401 |
+
gr.HTML("<hr style='border:none;border-top:1px solid var(--border);margin:12px 0;'>")
|
| 1402 |
+
|
| 1403 |
+
# Results Dataframe
|
| 1404 |
+
gr.HTML("""
|
| 1405 |
+
<div style='font-size:0.82rem;color:var(--text-secondary);font-weight:600;margin-bottom:8px;'>
|
| 1406 |
+
Computational Techniques β Algorithms β Papers
|
| 1407 |
+
</div>""")
|
| 1408 |
+
method_technique_df = gr.Dataframe(
|
| 1409 |
+
headers=["Main Computational Technique", "Algorithms", "Papers"],
|
| 1410 |
+
interactive=False,
|
| 1411 |
+
wrap=True,
|
| 1412 |
+
)
|
| 1413 |
+
|
| 1414 |
+
gr.HTML("<hr style='border:none;border-top:1px solid var(--border);margin:12px 0;'>")
|
| 1415 |
+
|
| 1416 |
+
# CSV Download
|
| 1417 |
+
method_dl_files = gr.File(
|
| 1418 |
+
label="Download CSV Report",
|
| 1419 |
+
file_count="multiple",
|
| 1420 |
+
interactive=False,
|
| 1421 |
+
)
|
| 1422 |
+
|
| 1423 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1424 |
# Event wiring
|
| 1425 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 1489 |
refresh_review_table(a),
|
| 1490 |
*refresh_downloads(a),
|
| 1491 |
get_chart_html(selected_chart, a),
|
| 1492 |
+
build_cluster_stats_html(a),
|
| 1493 |
+
build_cluster_info_html(a),
|
| 1494 |
),
|
| 1495 |
inputs=[chart_selector, agent_state],
|
| 1496 |
+
outputs=[
|
| 1497 |
+
review_table,
|
| 1498 |
+
download_file_list_html,
|
| 1499 |
+
download_files,
|
| 1500 |
+
chart_display,
|
| 1501 |
+
cluster_stats,
|
| 1502 |
+
cluster_info_html,
|
| 1503 |
+
],
|
| 1504 |
+
)
|
| 1505 |
+
|
| 1506 |
+
# Auto-accept Phase 2 review when enabled.
|
| 1507 |
+
chatbot.change(
|
| 1508 |
+
fn=auto_accept_review,
|
| 1509 |
+
inputs=[agent_state, chatbot, auto_accept_toggle],
|
| 1510 |
+
outputs=[chatbot, agent_state, phase_bar],
|
| 1511 |
+
)
|
| 1512 |
+
|
| 1513 |
+
# ββ Method Extraction event wiring βββββββββββββββββββββββββββββ
|
| 1514 |
+
|
| 1515 |
+
pdf_upload.change(
|
| 1516 |
+
fn=handle_pdf_upload,
|
| 1517 |
+
inputs=[pdf_upload],
|
| 1518 |
+
outputs=[method_status, method_stats],
|
| 1519 |
+
)
|
| 1520 |
+
|
| 1521 |
+
run_methods_btn.click(
|
| 1522 |
+
fn=run_method_extraction_pipeline,
|
| 1523 |
+
inputs=[],
|
| 1524 |
+
outputs=[
|
| 1525 |
+
method_stats,
|
| 1526 |
+
method_status,
|
| 1527 |
+
method_technique_df,
|
| 1528 |
+
method_dl_files,
|
| 1529 |
+
],
|
| 1530 |
)
|
| 1531 |
|
| 1532 |
return app
|