Spaces:

atharvthite05
/

BERTopic_Thematic_Analysis_Agent

Sleeping

App Files Community

atharvthite05 committed 6 days ago

Commit

6cd378e

verified ·

1 Parent(s): 60bc73f

Update agent.py

Browse files

Files changed (1) hide show

agent.py +86 -15

agent.py CHANGED Viewed

@@ -92,8 +92,10 @@ DEFAULT_RUN_KEY: str = "abstract"
 THREAD_PREFIX:   str = "TA-"
 MAX_USER_MESSAGE_CHARS: int = 4000
 VERIFY_CHAT_MAX_ROWS: int = 20
-PROVIDER_RETRY_ATTEMPTS: int = 3
-PROVIDER_RETRY_BASE_DELAY_S: float = 1.5
 # FIX ISSUE 4 — surface missing API key immediately at import time
 _KEY_MISSING = not bool(MISTRAL_API_KEY)
@@ -313,6 +315,7 @@ After researcher confirms:
    → Filters publisher boilerplate (copyright, license text)
     → Embeds with SPECTER2 (L2-normalized)
     → UMAP reduces dimensions for HDBSCAN clustering
    → Finds 5 nearest centroid sentences per topic
    → Saves Plotly HTML visualizations
    → Saves embeddings + summaries checkpoints
@@ -360,7 +363,7 @@ After researcher confirms:
    | Research Area | General research area (NOT PACIS — that comes later in Phase 5.5) |
    | Confidence | How well the 5 sentences match the label |
    | Sentences | Number of sentences clustered here |
-   | Papers | Number of unique papers contributing sentences |
    | Approve | Edit: yes/no — keep or reject this topic |
    | Rename To | Edit: type new name if label is wrong |
    | Your Reasoning | Edit: why you renamed/rejected |"
@@ -742,10 +745,25 @@ def _is_transient_provider_error(exc: Exception) -> bool:
         or '"raw_status_code":503' in msg
         or '"raw_status_code":502' in msg
         or '"raw_status_code":504' in msg
         or "service unavailable" in msg
     )
 def _invoke_react_with_retries(enriched: str, thread_id: str) -> dict:
     """Call the ReAct graph with bounded retries for transient provider failures."""
     last_exc: Exception | None = None
@@ -762,7 +780,10 @@ def _invoke_react_with_retries(enriched: str, thread_id: str) -> dict:
                 raise
             last_exc = exc
             if attempt < PROVIDER_RETRY_ATTEMPTS - 1:
-                time.sleep(PROVIDER_RETRY_BASE_DELAY_S * (attempt + 1))
                 continue
             raise last_exc
@@ -825,6 +846,7 @@ def _collect_output_files(state: dict) -> list[str]:
     rdir       = _run_dir(run_key)
     candidates = [
         str(rdir / "summaries.json"),
         str(rdir / "labels.json"),
         str(rdir / "labels_verification.json"),
         str(rdir / "themes.json"),
@@ -933,12 +955,60 @@ def _populate_review_df(state: dict) -> dict:
             "Approve", "Rename To", "Reasoning"
     """
     labels_path = OUTPUT_DIR / state.get("run_key", DEFAULT_RUN_KEY) / "labels.json"
     def _reasoning_cell(row: dict) -> str:
-                return str(
-                        row.get("mistral_reasoning")
-                        or row.get("reasoning", "")
-                ).strip()
     return (
         {
@@ -946,12 +1016,12 @@ def _populate_review_df(state: dict) -> dict:
             "review_df": list(map(
                 lambda r: {
                     "#":           r.get("cluster_id", 0),
-                    "Topic Label": r.get("label") or r.get("mistral_label", ""),
                     "Top Evidence":r["evidence"][0] if r.get("evidence") else "",
                     "Sentences":   r.get("size", 0),
-                    "Papers":      "",
                     "Approve":     False,
-                    "Rename To":   r.get("label") or r.get("mistral_label", ""),
                     "Reasoning":   _reasoning_cell(r),
                 },
                 _load_json(labels_path),
@@ -1031,15 +1101,16 @@ def _build_verify_chat_report(rows: list[dict]) -> str:
     shown = rows[:VERIFY_CHAT_MAX_ROWS]
     header = [
-        "| # | Mistral Label | Groq-Ollama Label | Groq-GPT Label |",
-        "|---|---|---|---|",
     ]
     lines = list(map(
         lambda r: (
             f"| {int(r.get('cluster_id', 0))} "
             f"| {_sanitize_markdown_cell(r.get('mistral_label') or r.get('label', ''))} "
             f"| {_sanitize_markdown_cell(r.get('groq_ollama_label') or r.get('groq_label', ''))} "
-            f"| {_sanitize_markdown_cell(r.get('groq_gpt_label', ''))} |"
         ),
         shown,
     ))
@@ -1122,7 +1193,7 @@ def _handle_verify_command(state: dict) -> tuple[str, dict]:
         reply = (
             "VERIFY complete. Groq-Ollama and Groq-GPT topic labeling has been added for Phase 2 topics.\n\n"
             f"Verified topics: {verified_count}/{labelled_count}\n"
-            "Mistral vs Groq-Ollama vs Groq-GPT comparison is shown below in chat.\n\n"
             f"{report}\n\n"
             "Compare labels, edit Rename To/Approve, then click Submit Review to continue.\n\n"
             "[STOP GATE 1 — AWAITING REVIEW TABLE SUBMISSION]"

 THREAD_PREFIX:   str = "TA-"
 MAX_USER_MESSAGE_CHARS: int = 4000
 VERIFY_CHAT_MAX_ROWS: int = 20
+PROVIDER_RETRY_ATTEMPTS: int = 4
+PROVIDER_RETRY_BASE_DELAY_S: float = 2.0
+PROVIDER_RETRY_RATE_LIMIT_DELAY_S: float = 6.0
+PROVIDER_RETRY_MAX_DELAY_S: float = 18.0
 # FIX ISSUE 4 — surface missing API key immediately at import time
 _KEY_MISSING = not bool(MISTRAL_API_KEY)
    → Filters publisher boilerplate (copyright, license text)
     → Embeds with SPECTER2 (L2-normalized)
     → UMAP reduces dimensions for HDBSCAN clustering
+    → Auto-optimizes HDBSCAN parameters after the first run (optimization.json)
    → Finds 5 nearest centroid sentences per topic
    → Saves Plotly HTML visualizations
    → Saves embeddings + summaries checkpoints
    | Research Area | General research area (NOT PACIS — that comes later in Phase 5.5) |
    | Confidence | How well the 5 sentences match the label |
    | Sentences | Number of sentences clustered here |
+    | Papers | Unique paper count plus top 3 paper titles |
    | Approve | Edit: yes/no — keep or reject this topic |
    | Rename To | Edit: type new name if label is wrong |
    | Your Reasoning | Edit: why you renamed/rejected |"
         or '"raw_status_code":503' in msg
         or '"raw_status_code":502' in msg
         or '"raw_status_code":504' in msg
+        or '"raw_status_code":429' in msg
+        or '"status":429' in msg
+        or "too many requests" in msg
+        or "rate limit" in msg
         or "service unavailable" in msg
     )
+def _is_rate_limit_error(exc: Exception) -> bool:
+    msg = str(exc).lower()
+    return (
+        "rate limit" in msg
+        or "too many requests" in msg
+        or '"raw_status_code":429' in msg
+        or '"status":429' in msg
+        or "status code: 429" in msg
+    )
 def _invoke_react_with_retries(enriched: str, thread_id: str) -> dict:
     """Call the ReAct graph with bounded retries for transient provider failures."""
     last_exc: Exception | None = None
                 raise
             last_exc = exc
             if attempt < PROVIDER_RETRY_ATTEMPTS - 1:
+                delay = PROVIDER_RETRY_BASE_DELAY_S * (attempt + 1)
+                if _is_rate_limit_error(exc):
+                    delay = max(delay, PROVIDER_RETRY_RATE_LIMIT_DELAY_S * (attempt + 1))
+                time.sleep(min(PROVIDER_RETRY_MAX_DELAY_S, delay))
                 continue
             raise last_exc
     rdir       = _run_dir(run_key)
     candidates = [
         str(rdir / "summaries.json"),
+        str(rdir / "optimization.json"),
         str(rdir / "labels.json"),
         str(rdir / "labels_verification.json"),
         str(rdir / "themes.json"),
             "Approve", "Rename To", "Reasoning"
     """
     labels_path = OUTPUT_DIR / state.get("run_key", DEFAULT_RUN_KEY) / "labels.json"
+    summaries_path = OUTPUT_DIR / state.get("run_key", DEFAULT_RUN_KEY) / "summaries.json"
+    summaries = _load_json(summaries_path) if summaries_path.exists() else []
+    summary_by_id = {
+        int(item.get("cluster_id", -1)): item
+        for item in summaries
+        if isinstance(item, dict)
+    }
     def _reasoning_cell(row: dict) -> str:
+        return str(
+            row.get("adjudicated_reasoning")
+            or row.get("mistral_reasoning")
+            or row.get("reasoning", "")
+        ).strip()
+    def _papers_cell(row: dict) -> str:
+        cid = int(row.get("cluster_id", row.get("#", -1)) or -1)
+        summary = summary_by_id.get(cid, {})
+        count = row.get("paper_count")
+        if count is None:
+            count = summary.get("paper_count")
+        top_papers = row.get("top_papers") or summary.get("top_papers", [])
+        if isinstance(top_papers, list) and top_papers:
+            titles = []
+            for entry in top_papers[:3]:
+                if isinstance(entry, dict):
+                    title = str(
+                        entry.get("paper_title")
+                        or entry.get("title")
+                        or ""
+                    ).strip()
+                    paper_count = entry.get("count")
+                    if title:
+                        titles.append(
+                            f"{title} ({paper_count})"
+                            if paper_count
+                            else title
+                        )
+                else:
+                    titles.append(str(entry))
+            title_str = "; ".join(filter(None, titles))
+            if count:
+                return f"{count} | {title_str}" if title_str else str(count)
+            return title_str
+        return str(count) if count else ""
+    def _label_value(row: dict) -> str:
+        return str(
+            row.get("adjudicated_label")
+            or row.get("mistral_label")
+            or row.get("label")
+            or ""
+        ).strip()
     return (
         {
             "review_df": list(map(
                 lambda r: {
                     "#":           r.get("cluster_id", 0),
+                    "Topic Label": _label_value(r),
                     "Top Evidence":r["evidence"][0] if r.get("evidence") else "",
                     "Sentences":   r.get("size", 0),
+                    "Papers":      _papers_cell(r),
                     "Approve":     False,
+                    "Rename To":   _label_value(r),
                     "Reasoning":   _reasoning_cell(r),
                 },
                 _load_json(labels_path),
     shown = rows[:VERIFY_CHAT_MAX_ROWS]
     header = [
+        "| # | Mistral Label | Groq-Ollama Label | Groq-GPT Label | Best Label |",
+        "|---|---|---|---|---|",
     ]
     lines = list(map(
         lambda r: (
             f"| {int(r.get('cluster_id', 0))} "
             f"| {_sanitize_markdown_cell(r.get('mistral_label') or r.get('label', ''))} "
             f"| {_sanitize_markdown_cell(r.get('groq_ollama_label') or r.get('groq_label', ''))} "
+            f"| {_sanitize_markdown_cell(r.get('groq_gpt_label', ''))} "
+            f"| {_sanitize_markdown_cell(r.get('adjudicated_label', ''))} |"
         ),
         shown,
     ))
         reply = (
             "VERIFY complete. Groq-Ollama and Groq-GPT topic labeling has been added for Phase 2 topics.\n\n"
             f"Verified topics: {verified_count}/{labelled_count}\n"
+            "Mistral vs Groq-Ollama vs Groq-GPT comparison (plus adjudicated best label) is shown below in chat.\n\n"
             f"{report}\n\n"
             "Compare labels, edit Rename To/Approve, then click Submit Review to continue.\n\n"
             "[STOP GATE 1 — AWAITING REVIEW TABLE SUBMISSION]"