atharvthite05 committed on
Commit
ff66bd7
Β·
verified Β·
1 Parent(s): 6cd378e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +516 -2
app.py CHANGED
@@ -35,6 +35,19 @@ import uuid
35
  from pathlib import Path
36
  from urllib.parse import quote
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  # ---------------------------------------------------------------------------
39
  # Agent import — graceful stub when agent.py is absent during dev/testing
40
  # ---------------------------------------------------------------------------
@@ -86,6 +99,7 @@ EMPTY_REVIEW_DF = pd.DataFrame(columns=REVIEW_COLUMNS)
86
  MISTRAL_KEY_MISSING = not bool(os.environ.get("MISTRAL_API_KEY", ""))
87
  GROQ_KEY_MISSING = not bool(os.environ.get("GROQ_API_KEY", ""))
88
  UPLOADS_DIR = Path("uploads")
 
89
  OUTPUTS_DIR = Path(__file__).resolve().parent / "outputs"
90
 
91
  # ---------------------------------------------------------------------------
@@ -596,6 +610,198 @@ def build_file_list_html(paths: list[str]) -> str:
596
  return "\n".join(items)
597
 
598
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599
  # ---------------------------------------------------------------------------
600
  # Helper — placeholder chart HTML
601
  # ---------------------------------------------------------------------------
@@ -619,6 +825,158 @@ def build_placeholder_chart(chart_type: str) -> str:
619
  <style>@keyframes grow {{ from{{width:0%}} to{{width:75%}} }}</style>"""
620
 
621
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
622
  # ---------------------------------------------------------------------------
623
  # Core interaction handlers
624
  # ---------------------------------------------------------------------------
@@ -719,12 +1077,24 @@ def submit_review(review_df, agent_state: dict, chat_history: list):
719
  FIX BUG 3 β€” write parsed review rows into agent_state["review_df"]
720
  BEFORE calling the agent, so _parse_review_df() receives the populated list.
721
  """
 
 
 
 
 
 
 
 
 
 
 
 
722
  # Store the review table in state so agent.py can read it
723
  agent_state["review_df"] = review_df.to_dict(orient="records")
724
  agent_state["review_submitted"] = True
725
 
726
  # Send a short trigger message β€” the agent reads state, not the payload
727
- msg = "Review table submitted. Please proceed to Phase 3 and consolidate themes."
728
  results = []
729
  for state in handle_chat(msg, chat_history, agent_state):
730
  results = state
@@ -732,6 +1102,39 @@ def submit_review(review_df, agent_state: dict, chat_history: list):
732
  return new_history, new_state, phase_html
733
 
734
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
735
  def refresh_downloads(agent_state: dict):
736
  """Return downloadable artefact paths from agent state."""
737
  files = agent_state.get("output_files", [])
@@ -868,6 +1271,10 @@ def build_app() -> gr.Blocks:
868
  with gr.Column(elem_classes=["panel-card", "panel-results"]):
869
  gr.HTML("""<div class="card-title"><span>Results</span></div>""")
870
 
 
 
 
 
871
  with gr.Tabs(elem_classes=["tabs"]):
872
 
873
  # ── Tab 1: Review Table ─────────────────────────────
@@ -875,8 +1282,11 @@ def build_app() -> gr.Blocks:
875
  gr.HTML("""
876
  <p style='font-size:0.78rem;color:var(--text-muted);margin:0 0 12px;'>
877
  Edit <b>Approve</b>, <b>Rename To</b>, and <b>Reasoning</b> columns inline,
 
878
  then click <b>Submit Review</b>. Use <b>verify</b> in chat at Phase 2
879
  or Phase 5.5 to see Mistral vs Groq comparisons directly in chat output.
 
 
880
  </p>""")
881
 
882
  review_table = gr.Dataframe(
@@ -905,6 +1315,11 @@ def build_app() -> gr.Blocks:
905
  elem_classes=["btn-success"],
906
  )
907
 
 
 
 
 
 
908
  # ── Tab 2: Charts ───────────────────────────────────
909
  with gr.TabItem("Charts", elem_classes=["tabitem"]):
910
  chart_selector = gr.Dropdown(
@@ -941,6 +1356,70 @@ def build_app() -> gr.Blocks:
941
  elem_classes=["btn-secondary"],
942
  )
943
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
944
  # ────────────────────────────────────────────────────────────────
945
  # Event wiring
946
  # ────────────────────────────────────────────────────────────────
@@ -1010,9 +1489,44 @@ def build_app() -> gr.Blocks:
1010
  refresh_review_table(a),
1011
  *refresh_downloads(a),
1012
  get_chart_html(selected_chart, a),
 
 
1013
  ),
1014
  inputs=[chart_selector, agent_state],
1015
- outputs=[review_table, download_file_list_html, download_files, chart_display],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1016
  )
1017
 
1018
  return app
 
35
  from pathlib import Path
36
  from urllib.parse import quote
37
 
38
+ # ---------------------------------------------------------------------------
39
+ # Method extraction tools — direct invocation (standalone tab, no agent)
40
+ # ---------------------------------------------------------------------------
41
+ try:
42
+ from tools import (
43
+ extract_methods_from_pdfs,
44
+ OUTPUT_DIR as TOOLS_OUTPUT_DIR,
45
+ _load_json as tools_load_json,
46
+ )
47
+ METHOD_TOOLS_AVAILABLE = True
48
+ except ImportError:
49
+ METHOD_TOOLS_AVAILABLE = False
50
+
51
  # ---------------------------------------------------------------------------
52
  # Agent import — graceful stub when agent.py is absent during dev/testing
53
  # ---------------------------------------------------------------------------
 
99
  MISTRAL_KEY_MISSING = not bool(os.environ.get("MISTRAL_API_KEY", ""))
100
  GROQ_KEY_MISSING = not bool(os.environ.get("GROQ_API_KEY", ""))
101
  UPLOADS_DIR = Path("uploads")
102
+ PDF_UPLOADS_DIR = Path("uploads") / "pdfs"
103
  OUTPUTS_DIR = Path(__file__).resolve().parent / "outputs"
104
 
105
  # ---------------------------------------------------------------------------
 
610
  return "\n".join(items)
611
 
612
 
613
+ # ---------------------------------------------------------------------------
614
+ # Helper — cluster stats HTML
615
+ # ---------------------------------------------------------------------------
616
def build_cluster_stats_html(agent_state: dict) -> str:
    """Render before/after clustering-optimization stats as an HTML snippet.

    Reads ``optimization.json`` from ``OUTPUTS_DIR / <run_key>``, where
    ``run_key`` is taken from ``agent_state`` (default ``"abstract"``). The
    file is expected to hold a JSON list of round records, each with optional
    ``params`` and ``metrics`` dicts.

    The first and last rounds are shown side by side. Whichever of the two has
    more clusters is placed in the "before" card.

    Args:
        agent_state: Agent state dict; only ``run_key`` is read.

    Returns:
        An HTML string: either a short muted notice (file missing, unreadable,
        or empty) or the two-card stats grid.
    """

    def _notice(message: str) -> str:
        # Shared muted paragraph used for every "nothing to show" state.
        return (
            "<p style='color:var(--text-muted);font-size:0.83rem;padding:6px 0 2px;'>"
            f"{message}"
            "</p>"
        )

    def _escape_html(value: object) -> str:
        # Values come from a JSON file on disk; escape before embedding.
        return (
            str(value)
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        )

    def _number(metrics: object, key: str) -> float:
        # Tolerant numeric lookup: null, strings, NaN and infinities all
        # degrade to 0 instead of raising while the page renders.
        if not isinstance(metrics, dict):
            return 0.0
        try:
            number = float(metrics.get(key, 0))
        except (TypeError, ValueError):
            return 0.0
        if number != number or number in (float("inf"), float("-inf")):
            return 0.0
        return number

    def _metrics_block(metrics: object) -> str:
        if not isinstance(metrics, dict):
            return "<div style='color:var(--text-muted);'>No metrics</div>"
        return (
            "<div style='display:grid;gap:4px;font-size:0.78rem;'>"
            f"<div>Clusters: <b>{int(_number(metrics, 'n_clusters'))}</b></div>"
            f"<div>Noise ratio: <b>{_number(metrics, 'noise_ratio'):.2f}</b></div>"
            f"<div>Min/Med/Mean/Max size: <b>{_number(metrics, 'min_size'):.0f}</b> / "
            f"<b>{_number(metrics, 'median_size'):.0f}</b> / "
            f"<b>{_number(metrics, 'mean_size'):.0f}</b> / "
            f"<b>{_number(metrics, 'max_size'):.0f}</b></div>"
            "</div>"
        )

    def _params_line(params: object) -> str:
        if not isinstance(params, dict):
            return ""
        return (
            f"min_cluster_size={_escape_html(params.get('min_cluster_size', ''))}, "
            f"max_cluster_size={_escape_html(params.get('max_cluster_size', ''))}, "
            f"min_samples={_escape_html(params.get('min_samples', ''))}"
        )

    run_key = agent_state.get("run_key", "abstract")
    opt_path = OUTPUTS_DIR / run_key / "optimization.json"
    if not opt_path.exists():
        return _notice(
            "No clustering stats yet. Run topic discovery to generate optimization stats."
        )

    try:
        rounds = json.loads(opt_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both malformed JSON and undecodable bytes.
        rounds = []

    # Keep only well-formed round records so the .get() calls below are safe.
    if isinstance(rounds, list):
        rounds = [entry for entry in rounds if isinstance(entry, dict)]
    else:
        rounds = []
    if not rounds:
        return _notice("Optimization stats are unavailable or empty.")

    first, last = rounds[0], rounds[-1]
    first_clusters = int(_number(first.get("metrics"), "n_clusters"))
    last_clusters = int(_number(last.get("metrics"), "n_clusters"))

    # The "before" card always shows the round with more clusters.
    if last_clusters > first_clusters:
        before_round, after_round = last, first
    else:
        before_round, after_round = first, last

    before_label = "Before optimization (more)"
    after_label = (
        "After optimization (less)"
        if len(rounds) > 1
        else "After optimization (no change)"
    )

    return f"""
<div style='display:grid;gap:10px;'>
  <div style='font-size:0.82rem;color:var(--text-secondary);font-weight:600;'>Cluster stats</div>
  <div style='display:grid;grid-template-columns:1fr 1fr;gap:12px;'>
    <div style='background:var(--bg-elevated);border:1px solid var(--border);border-radius:10px;padding:10px 12px;'>
      <div style='font-size:0.78rem;color:var(--text-secondary);margin-bottom:6px;'>{before_label}</div>
      <div style='font-size:0.74rem;color:var(--text-muted);margin-bottom:6px;'>
        {_params_line(before_round.get('params', {}))}
      </div>
      {_metrics_block(before_round.get('metrics', {}))}
    </div>
    <div style='background:var(--bg-elevated);border:1px solid var(--border);border-radius:10px;padding:10px 12px;'>
      <div style='font-size:0.78rem;color:var(--text-secondary);margin-bottom:6px;'>{after_label}</div>
      <div style='font-size:0.74rem;color:var(--text-muted);margin-bottom:6px;'>
        {_params_line(after_round.get('params', {}))}
      </div>
      {_metrics_block(after_round.get('metrics', {}))}
    </div>
  </div>
</div>"""
694
+
695
+
696
+ # ---------------------------------------------------------------------------
697
+ # Helper — cluster info HTML
698
+ # ---------------------------------------------------------------------------
699
def build_cluster_info_html(agent_state: dict) -> str:
    """Render one collapsible card per labeled cluster as an HTML snippet.

    Reads ``summaries.json`` and, if present, ``labels.json`` from
    ``OUTPUTS_DIR / <run_key>`` (``run_key`` comes from ``agent_state``,
    default ``"abstract"``). A cluster is shown only when a label can be found
    for it; the label is the first non-empty of ``adjudicated_label``,
    ``mistral_label`` and ``label`` in the matching labels row.

    Malformed records (non-dict rows, unparseable cluster ids or sizes) are
    skipped or defaulted rather than raising during rendering.

    Args:
        agent_state: Agent state dict; only ``run_key`` is read.

    Returns:
        An HTML string: either a short muted notice or the cluster card list.
    """

    def _notice(message: str) -> str:
        return (
            "<p style='color:var(--text-muted);font-size:0.83rem;padding:6px 0 2px;'>"
            f"{message}"
            "</p>"
        )

    def _load_list(path) -> list:
        # Anything unreadable or not a JSON list is treated as "no data".
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            return []
        return data if isinstance(data, list) else []

    def _escape_html(text: object) -> str:
        return (
            str(text or "")
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        )

    def _cluster_id(record: dict):
        # A missing id maps to -1; an unparseable id yields None.
        try:
            return int(record.get("cluster_id", -1))
        except (TypeError, ValueError):
            return None

    def _format_papers(papers: object) -> str:
        if not isinstance(papers, list):
            return ""
        items = []
        for entry in papers[:3]:
            if not isinstance(entry, dict):
                continue
            title = str(entry.get("paper_title") or entry.get("title") or "").strip()
            if not title:
                continue
            count = entry.get("count")
            shown = _escape_html(title)
            items.append(f"{shown} ({_escape_html(count)})" if count else shown)
        return "; ".join(items)

    def _cluster_card(summary: object) -> str:
        if not isinstance(summary, dict):
            return ""
        cid = _cluster_id(summary)
        label = _escape_html(label_by_id.get(cid, "")) if cid is not None else ""
        if not label:
            # Unlabeled clusters are intentionally hidden from this view.
            return ""

        try:
            size = int(summary.get("size", 0))
        except (TypeError, ValueError):
            size = 0

        evidence = summary.get("evidence", [])
        if isinstance(evidence, str):
            top_evidence = _escape_html(evidence)
        elif isinstance(evidence, (list, tuple)) and evidence:
            top_evidence = _escape_html(evidence[0])
        else:
            top_evidence = ""

        paper_count = _escape_html(str(summary.get("paper_count", "")))
        top_papers = _format_papers(summary.get("top_papers", []))

        return (
            "<details style='background:var(--bg-elevated);border:1px solid var(--border);"
            "border-radius:10px;padding:10px 12px;'>"
            "<summary style='cursor:pointer;font-size:0.84rem;font-weight:600;color:var(--text-primary);'>"
            # \u2014 is an em dash, written as an escape to stay encoding-safe.
            f"Cluster {cid} \u2014 {label} ({size} sentences)</summary>"
            "<div style='margin-top:8px;font-size:0.78rem;color:var(--text-secondary);display:grid;gap:6px;'>"
            f"<div><b>Top evidence:</b> {top_evidence}</div>"
            f"<div><b>Papers:</b> {paper_count} | {top_papers}</div>"
            "</div>"
            "</details>"
        )

    run_key = agent_state.get("run_key", "abstract")
    summaries_path = OUTPUTS_DIR / run_key / "summaries.json"
    labels_path = OUTPUTS_DIR / run_key / "labels.json"

    if not summaries_path.exists():
        return _notice(
            "No clusters yet. Run topic discovery to generate cluster summaries."
        )

    summaries = _load_list(summaries_path)
    labels = _load_list(labels_path) if labels_path.exists() else []

    label_by_id = {}
    for row in labels:
        if not isinstance(row, dict):
            continue
        row_id = _cluster_id(row)
        if row_id is None:
            continue
        label_by_id[row_id] = (
            row.get("adjudicated_label")
            or row.get("mistral_label")
            or row.get("label")
            or ""
        )

    if not summaries:
        return _notice("Cluster summaries are empty.")

    cards = "\n".join(filter(None, map(_cluster_card, summaries)))
    if not cards:
        return _notice(
            "No labeled clusters yet. Run labeling or VERIFY to populate labels."
        )
    return (
        "<div style='display:grid;gap:10px;'>"
        "<div style='font-size:0.82rem;color:var(--text-secondary);font-weight:600;'>"
        "Cluster details</div>"
        f"{cards}"
        "</div>"
    )
803
+
804
+
805
  # ---------------------------------------------------------------------------
806
  # Helper — placeholder chart HTML
807
  # ---------------------------------------------------------------------------
 
825
  <style>@keyframes grow {{ from{{width:0%}} to{{width:75%}} }}</style>"""
826
 
827
 
828
+ # ---------------------------------------------------------------------------
829
+ # Method Extraction — helper functions
830
+ # ---------------------------------------------------------------------------
831
+
832
def build_method_stats_html(result: dict) -> str:
    """Build the stats HTML shown after a method-extraction run.

    Args:
        result: Extraction result. When it is a dict without a truthy
            ``"error"`` entry, its ``n_papers`` and ``n_extracted`` values
            (default 0) are rendered as two stat cards.

    Returns:
        The two-card stats grid, or the "upload PDFs" placeholder paragraph
        when ``result`` is empty, carries an error, or is not a dict.
    """
    # Guard against non-dict results (e.g. a raw string from the tool):
    # calling .get() on them would raise AttributeError.
    if not isinstance(result, dict) or not result or result.get("error"):
        return (
            "<p style='color:var(--text-muted);font-size:0.83rem;padding:6px 0;'>"
            "Upload PDFs and click <b>Run Method Extraction</b> to start."
            "</p>"
        )
    n_papers = result.get("n_papers", 0)
    n_extracted = result.get("n_extracted", 0)
    return f"""
<div class="stats-grid fade-in" style="grid-template-columns:1fr 1fr;">
  <div class="stat-card accent">
    <div class="stat-value">{n_papers}</div>
    <div class="stat-label">PDFs Processed</div>
  </div>
  <div class="stat-card success">
    <div class="stat-value">{n_extracted}</div>
    <div class="stat-label">Methods Identified</div>
  </div>
</div>
"""
854
+
855
+
856
def get_method_results_df() -> pd.DataFrame:
    """Load the per-paper method summary as a fixed-column dataframe.

    Reads ``method_summary.csv`` under ``OUTPUTS_DIR / "methods"``. Expected
    columns missing from the file are added as empty strings, and only the
    expected columns are returned, in a fixed order. If the file is absent or
    cannot be read, an empty frame with those columns is returned.
    """
    expected = ["Paper ID", "Paper Title", "Computational Methods"]
    summary_csv = OUTPUTS_DIR / "methods" / "method_summary.csv"

    if not summary_csv.exists():
        return pd.DataFrame(columns=expected)

    try:
        frame = pd.read_csv(summary_csv)
    except Exception:
        # Best-effort UI helper: an unreadable CSV just shows an empty table.
        return pd.DataFrame(columns=expected)

    for name in expected:
        if name not in frame.columns:
            frame[name] = ""
    return frame[expected]
874
+
875
+
876
def get_method_technique_df() -> pd.DataFrame:
    """Load the technique-to-papers summary as a fixed-column dataframe.

    Reads ``technique_to_papers.csv`` under ``OUTPUTS_DIR / "methods"``.
    Expected columns missing from the file are added as empty strings, and
    only the expected columns are returned, in a fixed order. If the file is
    absent or cannot be read, an empty frame with those columns is returned.
    """
    expected = ["Main Computational Technique", "Algorithms", "Papers"]
    technique_csv = OUTPUTS_DIR / "methods" / "technique_to_papers.csv"

    if not technique_csv.exists():
        return pd.DataFrame(columns=expected)

    try:
        frame = pd.read_csv(technique_csv)
    except Exception:
        # Best-effort UI helper: an unreadable CSV just shows an empty table.
        return pd.DataFrame(columns=expected)

    for name in expected:
        if name not in frame.columns:
            frame[name] = ""
    return frame[expected]
890
+
891
+
892
def get_method_download_file() -> list[str]:
    """Return the downloadable technique CSV for the method-extraction panel.

    Returns:
        A one-element list holding the path of ``technique_to_papers.csv``
        under ``OUTPUTS_DIR / "methods"`` when that file exists; otherwise
        ``None`` (note: the return annotation does not cover this case).
    """
    report_csv = OUTPUTS_DIR / "methods" / "technique_to_papers.csv"
    return [str(report_csv)] if report_csv.exists() else None
898
+
899
+
900
+ # ---------------------------------------------------------------------------
901
+ # Method Extraction — interaction handlers
902
+ # ---------------------------------------------------------------------------
903
+
904
def handle_pdf_upload(file_objs):
    """Copy uploaded PDFs into ``PDF_UPLOADS_DIR``, replacing earlier uploads.

    Args:
        file_objs: Uploaded files; each item is either an object with a
            ``name`` attribute holding its path, or a path-like value.
            Items whose suffix is not ``.pdf`` (case-insensitive) are ignored.

    Returns:
        A ``(status_html, stats_html)`` tuple for the method-extraction panel.
    """
    if not file_objs:
        return (
            "<div class='status-pill idle'><div class='dot'></div>No PDFs uploaded</div>",
            "<p style='color:var(--text-muted);font-size:0.83rem;'>Upload PDF research papers to extract methods.</p>",
        )

    def _is_pdf(path: Path) -> bool:
        # Same case-insensitive rule for clearing and for accepting files, so
        # mixed-case names such as "paper.Pdf" cannot be left behind.
        return path.suffix.lower() == ".pdf"

    PDF_UPLOADS_DIR.mkdir(parents=True, exist_ok=True)

    # Clear previous uploads. The listing is materialized first so entries
    # are not deleted while the directory is still being iterated.
    for stale in list(PDF_UPLOADS_DIR.iterdir()):
        if stale.is_file() and _is_pdf(stale):
            stale.unlink()

    count = 0
    for f in file_objs:
        src = Path(f.name) if hasattr(f, "name") else Path(f)
        if not _is_pdf(src):
            continue
        # A short random prefix keeps same-named uploads from colliding.
        dst = PDF_UPLOADS_DIR / f"{uuid.uuid4().hex[:8]}_{src.name}"
        shutil.copy2(src, dst)
        count += 1

    status = f"<div class='status-pill ready'><div class='dot'></div>{count} PDFs ready</div>"
    stats = f"""
<div class="stats-grid fade-in">
  <div class="stat-card accent">
    <div class="stat-value">{count}</div>
    <div class="stat-label">PDFs Uploaded</div>
  </div>
</div>"""
    return status, stats
936
+
937
+
938
def run_method_extraction_pipeline():
    """Run method extraction over the uploaded PDFs and build the UI outputs.

    Invokes ``extract_methods_from_pdfs`` on ``PDF_UPLOADS_DIR`` and then
    refreshes the technique table and download list from the output CSVs.

    Returns:
        A 4-tuple ``(stats_html, status_html, technique_df, download_files)``.
        On any failure (tools not importable, no PDFs uploaded, tool error)
        the first two items describe the problem and the last two are still
        refreshed from whatever is on disk.
    """
    # Local imports keep this block self-contained.
    import logging
    from html import escape

    def _outputs(stats_html: str, status_text: str, pill: str = "idle"):
        status_html = (
            f"<div class='status-pill {pill}'><div class='dot'></div>"
            f"{status_text}</div>"
        )
        return (
            stats_html,
            status_html,
            get_method_technique_df(),
            get_method_download_file(),
        )

    def _error_html(message: object) -> str:
        # Tool error text is not trusted markup; escape before embedding.
        return (
            "<p style='color:var(--danger);font-size:0.83rem;'>"
            f"{escape(str(message))}</p>"
        )

    if not METHOD_TOOLS_AVAILABLE:
        return _outputs(build_method_stats_html({"error": True}), "Tools unavailable")

    # Detect PDFs case-insensitively so mixed-case suffixes such as ".Pdf",
    # which the upload handler accepts, are not missed.
    has_pdfs = PDF_UPLOADS_DIR.is_dir() and any(
        entry.suffix.lower() == ".pdf" for entry in PDF_UPLOADS_DIR.iterdir()
    )
    if not has_pdfs:
        return _outputs(_error_html("No PDFs found. Upload PDFs first."), "No PDFs")

    pdf_dir = str(PDF_UPLOADS_DIR.resolve())

    # Step 1: Extract + LLM processing.
    try:
        result = extract_methods_from_pdfs.invoke({"pdf_dir": pdf_dir})
    except Exception as exc:
        # UI boundary: log the traceback and report the failure in the panel
        # instead of leaving a stale status behind.
        logging.getLogger(__name__).exception("Method extraction failed")
        return _outputs(
            _error_html(f"Method extraction failed: {exc}"),
            "Extraction failed",
        )

    if isinstance(result, dict) and result.get("error"):
        return _outputs(_error_html(result["error"]), "Extraction failed")

    # Build UI outputs. A non-dict result carries no counts to display.
    stats_html = build_method_stats_html(result if isinstance(result, dict) else {})
    return _outputs(stats_html, "Extraction complete", pill="ready")
978
+
979
+
980
  # ---------------------------------------------------------------------------
981
  # Core interaction handlers
982
  # ---------------------------------------------------------------------------
 
1077
  FIX BUG 3 β€” write parsed review rows into agent_state["review_df"]
1078
  BEFORE calling the agent, so _parse_review_df() receives the populated list.
1079
  """
1080
+ def _next_phase_message(state: dict) -> str:
1081
+ gate = state.get("stop_gate")
1082
+ if gate == "STOP_GATE_1_AWAIT_REVIEW_TABLE":
1083
+ return "Review table submitted. Please proceed to Phase 3 and consolidate themes."
1084
+ if gate == "STOP_GATE_2_AWAIT_THEME_MERGE":
1085
+ return "Theme merge confirmed. Please proceed to Phase 4 for saturation check."
1086
+ if gate == "STOP_GATE_3_AWAIT_SATURATION_SIGNOFF":
1087
+ return "Saturation sign-off confirmed. Please proceed to Phase 5 for naming themes."
1088
+ if gate == "STOP_GATE_4_AWAIT_TAXONOMY_REVIEW":
1089
+ return "Taxonomy review confirmed. Please proceed to Phase 6 to finalize outputs."
1090
+ return "Review table submitted. Please proceed to the next phase."
1091
+
1092
  # Store the review table in state so agent.py can read it
1093
  agent_state["review_df"] = review_df.to_dict(orient="records")
1094
  agent_state["review_submitted"] = True
1095
 
1096
  # Send a short trigger message β€” the agent reads state, not the payload
1097
+ msg = _next_phase_message(agent_state)
1098
  results = []
1099
  for state in handle_chat(msg, chat_history, agent_state):
1100
  results = state
 
1102
  return new_history, new_state, phase_html
1103
 
1104
 
1105
def auto_accept_review(agent_state: dict, chat_history: list, enabled: bool):
    """Approve every Phase 2 review row and submit it when auto-accept is on.

    Nothing is submitted (the inputs are returned unchanged, together with the
    phase bar HTML for the current phase) when any of these holds:
    the toggle is off, the agent is not waiting at the review-table stop gate,
    the review was already submitted, this gate was already auto-accepted, or
    the state holds no review rows.

    Otherwise every row is marked approved, blank ``Rename To`` cells are
    filled from ``Topic Label``, and the table goes through ``submit_review``.

    Returns:
        A ``(chat_history, agent_state, phase_html)`` tuple.
    """
    review_gate = "STOP_GATE_1_AWAIT_REVIEW_TABLE"

    def _unchanged():
        return chat_history, agent_state, build_phase_html(agent_state.get("phase", 0))

    if not enabled:
        return _unchanged()

    current_gate = agent_state.get("stop_gate")
    already_handled = (
        current_gate != review_gate
        or agent_state.get("review_submitted")
        or agent_state.get("auto_accept_last_gate") == current_gate
    )
    if already_handled:
        return _unchanged()

    pending_rows = agent_state.get("review_df", [])
    if not pending_rows:
        return _unchanged()

    table = pd.DataFrame(pending_rows)
    if "Approve" in table.columns:
        table["Approve"] = True

    if "Rename To" in table.columns and "Topic Label" in table.columns:
        table["Rename To"] = table["Rename To"].fillna("").astype(str)

        def _rename_or_label(row):
            # A blank rename falls back to the existing topic label.
            return row["Rename To"] or row["Topic Label"]

        table["Rename To"] = table.apply(_rename_or_label, axis=1)

    updated_history, updated_state, phase_html = submit_review(
        table, agent_state, chat_history
    )
    # Remember the gate so the same review is not auto-submitted twice.
    updated_state["auto_accept_last_gate"] = current_gate
    return updated_history, updated_state, phase_html
1136
+
1137
+
1138
  def refresh_downloads(agent_state: dict):
1139
  """Return downloadable artefact paths from agent state."""
1140
  files = agent_state.get("output_files", [])
 
1271
  with gr.Column(elem_classes=["panel-card", "panel-results"]):
1272
  gr.HTML("""<div class="card-title"><span>Results</span></div>""")
1273
 
1274
+ cluster_stats = gr.HTML(
1275
+ value=build_cluster_stats_html({}),
1276
+ )
1277
+
1278
  with gr.Tabs(elem_classes=["tabs"]):
1279
 
1280
  # ── Tab 1: Review Table ─────────────────────────────
 
1282
  gr.HTML("""
1283
  <p style='font-size:0.78rem;color:var(--text-muted);margin:0 0 12px;'>
1284
  Edit <b>Approve</b>, <b>Rename To</b>, and <b>Reasoning</b> columns inline,
1285
+ and use the <b>Papers</b> column to see the top 3 paper titles per cluster.
1286
  then click <b>Submit Review</b>. Use <b>verify</b> in chat at Phase 2
1287
  or Phase 5.5 to see Mistral vs Groq comparisons directly in chat output.
1288
+ Phase 2 verification also adds an adjudicated best label.
1289
+ Enable <b>Auto-accept Phase 2 review</b> to skip manual submission.
1290
  </p>""")
1291
 
1292
  review_table = gr.Dataframe(
 
1315
  elem_classes=["btn-success"],
1316
  )
1317
 
1318
+ auto_accept_toggle = gr.Checkbox(
1319
+ label="Auto-accept Phase 2 review and continue",
1320
+ value=False,
1321
+ )
1322
+
1323
  # ── Tab 2: Charts ───────────────────────────────────
1324
  with gr.TabItem("Charts", elem_classes=["tabitem"]):
1325
  chart_selector = gr.Dropdown(
 
1356
  elem_classes=["btn-secondary"],
1357
  )
1358
 
1359
+ # ── Tab 4: Clusters ─────────────────────────────────
1360
+ with gr.TabItem("Clusters", elem_classes=["tabitem"]):
1361
+ cluster_info_html = gr.HTML(
1362
+ value=build_cluster_info_html({}),
1363
+ )
1364
+
1365
+ # ── METHOD EXTRACTION β€” Standalone panel ──────────────────────
1366
+ with gr.Column(elem_classes=["panel-card"]):
1367
+ gr.HTML("""
1368
+ <div class="card-title">
1369
+ <span>πŸ“„ Computational Methodology Extraction</span>
1370
+ </div>
1371
+ <p style='font-size:0.78rem;color:var(--text-muted);margin:0 0 12px;'>
1372
+ Upload research PDFs to identify the specific computational methods
1373
+ used in each paper (text-only extraction via PyMuPDF + LLM).
1374
+ </p>
1375
+ """)
1376
+
1377
+ with gr.Row():
1378
+ with gr.Column(scale=1):
1379
+ pdf_upload = gr.File(
1380
+ label="Upload Research PDFs",
1381
+ file_types=[".pdf"],
1382
+ file_count="multiple",
1383
+ interactive=True,
1384
+ elem_id="pdf-upload",
1385
+ )
1386
+ with gr.Column(scale=1):
1387
+ method_status = gr.HTML(
1388
+ value="<div class='status-pill idle'><div class='dot'></div>Awaiting PDF upload</div>"
1389
+ )
1390
+ method_stats = gr.HTML(
1391
+ value="<p style='color:var(--text-muted);font-size:0.83rem;'>"
1392
+ "Upload PDF research papers to extract methods.</p>"
1393
+ )
1394
+
1395
+ run_methods_btn = gr.Button(
1396
+ "πŸš€ Extract Computational Methods",
1397
+ variant="primary",
1398
+ elem_classes=["btn-primary"],
1399
+ )
1400
+
1401
+ gr.HTML("<hr style='border:none;border-top:1px solid var(--border);margin:12px 0;'>")
1402
+
1403
+ # Results Dataframe
1404
+ gr.HTML("""
1405
+ <div style='font-size:0.82rem;color:var(--text-secondary);font-weight:600;margin-bottom:8px;'>
1406
+ Computational Techniques β†’ Algorithms β†’ Papers
1407
+ </div>""")
1408
+ method_technique_df = gr.Dataframe(
1409
+ headers=["Main Computational Technique", "Algorithms", "Papers"],
1410
+ interactive=False,
1411
+ wrap=True,
1412
+ )
1413
+
1414
+ gr.HTML("<hr style='border:none;border-top:1px solid var(--border);margin:12px 0;'>")
1415
+
1416
+ # CSV Download
1417
+ method_dl_files = gr.File(
1418
+ label="Download CSV Report",
1419
+ file_count="multiple",
1420
+ interactive=False,
1421
+ )
1422
+
1423
  # ────────────────────────────────────────────────────────────────
1424
  # Event wiring
1425
  # ────────────────────────────────────────────────────────────────
 
1489
  refresh_review_table(a),
1490
  *refresh_downloads(a),
1491
  get_chart_html(selected_chart, a),
1492
+ build_cluster_stats_html(a),
1493
+ build_cluster_info_html(a),
1494
  ),
1495
  inputs=[chart_selector, agent_state],
1496
+ outputs=[
1497
+ review_table,
1498
+ download_file_list_html,
1499
+ download_files,
1500
+ chart_display,
1501
+ cluster_stats,
1502
+ cluster_info_html,
1503
+ ],
1504
+ )
1505
+
1506
+ # Auto-accept Phase 2 review when enabled.
1507
+ chatbot.change(
1508
+ fn=auto_accept_review,
1509
+ inputs=[agent_state, chatbot, auto_accept_toggle],
1510
+ outputs=[chatbot, agent_state, phase_bar],
1511
+ )
1512
+
1513
+ # ── Method Extraction event wiring ─────────────────────────────
1514
+
1515
+ pdf_upload.change(
1516
+ fn=handle_pdf_upload,
1517
+ inputs=[pdf_upload],
1518
+ outputs=[method_status, method_stats],
1519
+ )
1520
+
1521
+ run_methods_btn.click(
1522
+ fn=run_method_extraction_pipeline,
1523
+ inputs=[],
1524
+ outputs=[
1525
+ method_stats,
1526
+ method_status,
1527
+ method_technique_df,
1528
+ method_dl_files,
1529
+ ],
1530
  )
1531
 
1532
  return app