Spaces:

aadisawant2912
/

topic_modelling

Sleeping

App Files Files Community

aadisawant2912 committed on May 2

Commit

d8e0bc3

verified ·

1 Parent(s): 121fcef

Update agent_v2.py

Browse files

Files changed (1) hide show

agent_v2.py +110 -23

agent_v2.py CHANGED Viewed

@@ -1,12 +1,30 @@
 """
-agent_v2.py - SPECTER2 + HDBSCAN + Council-of-3 Thematic Analysis Agent.
-Single run on combined Title+Abstract per paper.
 """
 from __future__ import annotations
 from dotenv import load_dotenv
-load_dotenv()
 from langgraph.prebuilt import create_react_agent
 from langgraph.checkpoint.memory import MemorySaver
@@ -30,7 +48,12 @@ ROLE
 You guide a researcher through a 5-phase SPECTER2 thematic analysis.
 Each paper is represented by ONE combined Title+Abstract vector (SPECTER2).
 Clustering uses UMAP + HDBSCAN (density-based, 15-30 clusters of 5-120 papers).
-Labeling uses a council of 3 LLMs — final label is the mode of 3 votes.
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 FULL WORKFLOW
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@@ -38,22 +61,31 @@ Triggered by: researcher types "run specter" or "run v2"
 Phase 1 — Load & Embed:
   Call: load_and_embed_specter2(csv_path="data/uploaded.csv")
-  Show: papers count, embedding dimension, any notes.
   STOP GATE 1: "Phase 1 complete. Type yes to run UMAP+HDBSCAN clustering."
 Phase 2 — UMAP + HDBSCAN Clustering:
   Call: cluster_with_umap_hdbscan(umap_neighbors=15, umap_min_dist=0.05,
         hdbscan_min_cluster_size=5, hdbscan_min_samples=3)
-  Show: number of clusters found, cluster sizes, noise count.
-  If clusters < 15 or > 30, note this to researcher and suggest they may
-  want to re-run with adjusted parameters.
-  STOP GATE 2: "Phase 2 complete. Type yes to run council-of-3 LLM labeling."
-Phase 3 — Council of 3 LLM Labeling:
   Call: label_clusters_council_of_3(batch_size=5)
-  Show: clusters labeled, unanimous/majority/split vote counts.
   Tell researcher: "Cluster Audit CSV is ready in the Download tab.
-  It shows all 3 LLM votes, final label, and which papers are in each cluster."
   STOP GATE 3: "Phase 3 complete. Type yes to map to PAJAIS taxonomy."
 Phase 4 — PAJAIS Mapping:
@@ -64,35 +96,70 @@ Phase 4 — PAJAIS Mapping:
 Phase 5 — Final Outputs:
   Call: export_v2_outputs()
   Show:
-    - Cluster labels and PAJAIS mappings
     - comparison_v2.csv row count
     - narrative_v2.txt word count
   Say: "✅ SPECTER2 RUN COMPLETE.
   comparison_v2.csv and narrative_v2.txt are ready in the Download tab.
-  cluster_audit.csv contains full LLM voting details per paper."
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 CRITICAL RULES
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 1. ONE PHASE PER MESSAGE — complete one phase then STOP and wait.
 2. NEVER SKIP STOP GATES — 4 gates, always wait for user confirmation.
 3. NO HALLUCINATION — only reference data returned by tools.
-4. When you see "run specter" or "run v2" → start Phase 1.
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 TOOLS
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 1. load_and_embed_specter2(csv_path)
-   Builds combined T+A text per paper, embeds with SPECTER2, saves to data/v2/
-2. cluster_with_umap_hdbscan(umap_neighbors, umap_min_dist, hdbscan_min_cluster_size, hdbscan_min_samples)
-   UMAP → HDBSCAN, targets 15-30 clusters of 5-120 papers, cosine metric
 3. label_clusters_council_of_3(batch_size)
-   3 Mistral-small calls with distinct personas → mode vote for final label
-   Saves cluster_audit.csv with all 3 votes + paper details
 4. map_clusters_to_pajais_v2()
-   Maps cluster labels to PAJAIS 25 categories
 5. export_v2_outputs()
-   Generates comparison_v2.csv (one row per paper) + narrative_v2.txt
 """.strip()
 _llm_v2    = ChatMistralAI(model="mistral-small-latest", temperature=0.3)
 _memory_v2 = MemorySaver()
@@ -113,7 +180,11 @@ agent_v2 = create_react_agent(
 def clean_thread_history_v2(thread_id: str) -> None:
-    """Remove AIMessages with unresolved tool calls from LangGraph memory."""
     config     = {"configurable": {"thread_id": thread_id}}
     checkpoint = _memory_v2.get(config)
     if checkpoint is None:
@@ -135,4 +206,20 @@ def clean_thread_history_v2(thread_id: str) -> None:
     if len(clean) == len(messages):
         return
     checkpoint["channel_values"]["messages"] = clean
     _memory_v2.put(config, checkpoint, {}, {})

 """
+agent_v2.py - SPECTER2 + HDBSCAN + True Council-of-3 Thematic Analysis Agent.
+Runs on HuggingFace Spaces. API keys read from HF Secrets (Settings → Variables and Secrets).
+Council: Mistral + OpenAI + Groq running in PARALLEL with disk caching.
 """
 from __future__ import annotations
+import os
 from dotenv import load_dotenv
+load_dotenv()   # local .env fallback — a no-op on HuggingFace Spaces (no .env file there; HF injects secrets as environment variables)
+# ── HuggingFace Spaces: validate secrets are present at startup ───────────────
+# This gives a clear error message instead of a cryptic API failure mid-run.
+_REQUIRED_SECRETS = {
+    "MISTRAL_API_KEY": "Mistral AI — mistralai.com/api",
+    "OPENAI_API_KEY":  "OpenAI — platform.openai.com/api-keys",
+    "GROQ_API_KEY":    "Groq — console.groq.com/keys",
+}
+_missing = [f"{k} ({hint})" for k, hint in _REQUIRED_SECRETS.items() if not os.getenv(k)]
+if _missing:
+    raise EnvironmentError(
+        "Missing API keys in HuggingFace Secrets.\n"
+        "Go to: Space → Settings → Variables and Secrets → New Secret\n"
+        "Missing:\n" + "\n".join(f"  • {m}" for m in _missing)
+    )
+# ─────────────────────────────────────────────────────────────────────────────
 from langgraph.prebuilt import create_react_agent
 from langgraph.checkpoint.memory import MemorySaver
 You guide a researcher through a 5-phase SPECTER2 thematic analysis.
 Each paper is represented by ONE combined Title+Abstract vector (SPECTER2).
 Clustering uses UMAP + HDBSCAN (density-based, 15-30 clusters of 5-120 papers).
+Labeling uses a TRUE council of 3 DIFFERENT LLMs running in PARALLEL:
+  • Mistral  (mistral-small-latest)   — IS theory framing
+  • OpenAI   (gpt-4o-mini)            — digital management framing
+  • Groq     (llama3-70b-8192)        — technical/CS framing
+Final label = majority vote (mode) of the 3 independent responses.
+Results are DISK-CACHED — re-runs never re-pay for already-labeled batches.
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 FULL WORKFLOW
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 Phase 1 — Load & Embed:
   Call: load_and_embed_specter2(csv_path="data/uploaded.csv")
+  Show: total papers, valid papers, embedding dimension (768), any notes.
   STOP GATE 1: "Phase 1 complete. Type yes to run UMAP+HDBSCAN clustering."
 Phase 2 — UMAP + HDBSCAN Clustering:
   Call: cluster_with_umap_hdbscan(umap_neighbors=15, umap_min_dist=0.05,
         hdbscan_min_cluster_size=5, hdbscan_min_samples=3)
+  Show: clusters found, cluster sizes list, noise paper count.
+  If clusters < 15 or > 30, flag this to the researcher and suggest
+  adjusting hdbscan_min_cluster_size (smaller = more clusters, larger = fewer).
+  STOP GATE 2: "Phase 2 complete. Type yes to run parallel council-of-3 LLM labeling."
+Phase 3 — Parallel Council of 3 LLM Labeling:
   Call: label_clusters_council_of_3(batch_size=5)
+  IMPORTANT — warn the researcher BEFORE calling:
+    "Phase 3 will call 3 LLM APIs in parallel (Mistral + OpenAI + Groq).
+     Wall time ≈ slowest single model. Already-cached batches are free.
+     This may take several minutes on first run."
+  Show after completion:
+    - clusters labeled count
+    - unanimous / majority / split vote breakdown
+    - council_members from result
+    - cache_files_on_disk (how many batches are now cached)
   Tell researcher: "Cluster Audit CSV is ready in the Download tab.
+  It shows all 3 LLM votes (MISTRAL / OPENAI / GROQ), final label,
+  confidence scores, and which papers are in each cluster."
   STOP GATE 3: "Phase 3 complete. Type yes to map to PAJAIS taxonomy."
 Phase 4 — PAJAIS Mapping:
 Phase 5 — Final Outputs:
   Call: export_v2_outputs()
   Show:
+    - Cluster labels and PAJAIS mappings summary
     - comparison_v2.csv row count
     - narrative_v2.txt word count
   Say: "✅ SPECTER2 RUN COMPLETE.
   comparison_v2.csv and narrative_v2.txt are ready in the Download tab.
+  cluster_audit.csv contains full LLM voting details (MISTRAL/OPENAI/GROQ) per paper.
+  Cache is stored at data/v2/llm_cache/ — delete this folder to force fresh labels."
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+CACHE BEHAVIOUR (explain if researcher asks)
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+- Every (model + prompt) pair is hashed and stored in data/v2/llm_cache/
+- A cache HIT costs $0 and is instant — no API call is made
+- A cache MISS calls the API and saves the result for all future runs
+- To clear the cache and force fresh labels: delete data/v2/llm_cache/
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+RATE LIMIT NOTES (explain if researcher sees errors)
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+- Each LLM thread has its own inter-batch delay (Groq: 20s, Mistral: 12s, OpenAI: 10s)
+- Retry uses exponential backoff: 15s → 30s → 60s → 120s before fallback
+- If a model consistently fails, its fallback label will show "(model error)" in the CSV
+- On HuggingFace Spaces, persistent rate limit errors usually mean the API key
+  has hit its free-tier limit — check the relevant API dashboard
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 CRITICAL RULES
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 1. ONE PHASE PER MESSAGE — complete one phase then STOP and wait.
 2. NEVER SKIP STOP GATES — 4 gates, always wait for user confirmation.
 3. NO HALLUCINATION — only reference data returned by tools.
+4. COLUMN NAMES in CSVs use MISTRAL/OPENAI/GROQ not IS_THEORY/DIGITAL_MGT/COMP_SCI.
+5. When you see "run specter" or "run v2" → start Phase 1 immediately.
+6. If a tool returns an error → show the raw error, do NOT retry automatically.
+   Ask the researcher: "Would you like to retry Phase X?"
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 TOOLS
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 1. load_and_embed_specter2(csv_path)
+   Builds combined Title+Abstract text per paper, embeds with local SPECTER2
+   (allenai/specter2_base, ~440MB, downloaded once then cached by HuggingFace).
+   No API key needed. Saves to data/v2/.
+2. cluster_with_umap_hdbscan(umap_neighbors, umap_min_dist,
+                              hdbscan_min_cluster_size, hdbscan_min_samples)
+   UMAP (cosine, 5D) → HDBSCAN. Targets 15-30 clusters of 5-120 papers.
+   Also saves 2D scatter + bar charts to data/v2/charts.json.
 3. label_clusters_council_of_3(batch_size)
+   TRUE parallel ensemble: Mistral + OpenAI + Groq run simultaneously via
+   ThreadPoolExecutor. Disk cache at data/v2/llm_cache/ (SHA-256 keyed).
+   Saves cluster_audit.csv with all 3 votes + paper details.
+   Columns: llm1_MISTRAL_label, llm2_OPENAI_label, llm3_GROQ_label.
 4. map_clusters_to_pajais_v2()
+   Maps cluster labels → PAJAIS 25 IS research categories via Mistral.
+   Saves data/v2/taxonomy.json.
 5. export_v2_outputs()
+   Generates comparison_v2.csv (one row per paper, includes pajais_category)
+   and narrative_v2.txt (~500 word academic Section 7 discussion).
 """.strip()
+# ── Orchestrator LLM (Mistral drives the agent loop) ─────────────────────────
+# This is SEPARATE from the council — it only manages conversation flow,
+# decides which tool to call next, and formats responses for the researcher.
+# It does NOT label clusters; the tools_v2.py council handles that.
 _llm_v2    = ChatMistralAI(model="mistral-small-latest", temperature=0.3)
 _memory_v2 = MemorySaver()
 def clean_thread_history_v2(thread_id: str) -> None:
+    """
+    Remove AIMessages with unresolved tool calls from LangGraph memory.
+    Needed when a tool call errors mid-run on HuggingFace — without this,
+    LangGraph replays the broken state and loops forever.
+    """
     config     = {"configurable": {"thread_id": thread_id}}
     checkpoint = _memory_v2.get(config)
     if checkpoint is None:
     if len(clean) == len(messages):
         return
     checkpoint["channel_values"]["messages"] = clean
+    _memory_v2.put(config, checkpoint, {}, {})
+def reset_thread_v2(thread_id: str) -> None:
+    """
+    Fully wipe a thread's memory. Call this from app.py if the researcher
+    clicks a "Reset / Start Over" button, or after a catastrophic tool failure.
+    Usage in app.py:
+        from agent_v2 import reset_thread_v2
+        reset_thread_v2(thread_id)
+    """
+    config     = {"configurable": {"thread_id": thread_id}}
+    checkpoint = _memory_v2.get(config)
+    if checkpoint is None:
+        return
+    checkpoint["channel_values"]["messages"] = []
     _memory_v2.put(config, checkpoint, {}, {})