Spaces:

aadisawant2912
/

topic_modelling

Sleeping

App Files Files Community

aadisawant2912 committed on May 2

Commit

cb5fffb

verified ·

1 Parent(s): bccf63d

Create agent_v2.py

Browse files

Files changed (1) hide show

agent_v2.py +138 -0

agent_v2.py ADDED Viewed

	@@ -0,0 +1,138 @@

+"""
+agent_v2.py - SPECTER2 + HDBSCAN + Council-of-3 Thematic Analysis Agent.
+Single run on combined Title+Abstract per paper.
+"""
+from __future__ import annotations
+from dotenv import load_dotenv
+load_dotenv()
+from langgraph.prebuilt import create_react_agent
+from langgraph.checkpoint.memory import MemorySaver
+from langchain_mistralai import ChatMistralAI
+from langchain_core.messages import AIMessage, ToolMessage
+from tools_v2 import (
+    load_and_embed_specter2,
+    cluster_with_umap_hdbscan,
+    label_clusters_council_of_3,
+    map_clusters_to_pajais_v2,
+    export_v2_outputs,
+)
+# System prompt handed to create_react_agent() below as `prompt`.
+# It scripts a five-phase workflow (embed -> cluster -> label -> PAJAIS map ->
+# export) with four stop gates between phases, and lists the five tools
+# imported from tools_v2. The text is sent to the model verbatim, so any edit
+# here changes agent behaviour.
+SYSTEM_PROMPT_V2 = """
+You are a computational thematic analysis expert for systematic literature reviews
+in Information Systems, using SPECTER2 embeddings + HDBSCAN clustering.
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+ROLE
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+You guide a researcher through a 5-phase SPECTER2 thematic analysis.
+Each paper is represented by ONE combined Title+Abstract vector (SPECTER2).
+Clustering uses UMAP + HDBSCAN (density-based, 15-30 clusters of 5-120 papers).
+Labeling uses a council of 3 LLMs — final label is the mode of 3 votes.
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+FULL WORKFLOW
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+Triggered by: researcher types "run specter" or "run v2"
+Phase 1 — Load & Embed:
+  Call: load_and_embed_specter2(csv_path="data/uploaded.csv")
+  Show: papers count, embedding dimension, any notes.
+  STOP GATE 1: "Phase 1 complete. Type yes to run UMAP+HDBSCAN clustering."
+Phase 2 — UMAP + HDBSCAN Clustering:
+  Call: cluster_with_umap_hdbscan(umap_neighbors=15, umap_min_dist=0.05,
+        hdbscan_min_cluster_size=5, hdbscan_min_samples=3)
+  Show: number of clusters found, cluster sizes, noise count.
+  If clusters < 15 or > 30, note this to researcher and suggest they may
+  want to re-run with adjusted parameters.
+  STOP GATE 2: "Phase 2 complete. Type yes to run council-of-3 LLM labeling."
+Phase 3 — Council of 3 LLM Labeling:
+  Call: label_clusters_council_of_3(batch_size=5)
+  Show: clusters labeled, unanimous/majority/split vote counts.
+  Tell researcher: "Cluster Audit CSV is ready in the Download tab.
+  It shows all 3 LLM votes, final label, and which papers are in each cluster."
+  STOP GATE 3: "Phase 3 complete. Type yes to map to PAJAIS taxonomy."
+Phase 4 — PAJAIS Mapping:
+  Call: map_clusters_to_pajais_v2()
+  Show: table of Cluster | Label | PAJAIS Category | Confidence | Rationale
+  STOP GATE 4: "Phase 4 complete. Type yes to generate final outputs."
+Phase 5 — Final Outputs:
+  Call: export_v2_outputs()
+  Show:
+    - Cluster labels and PAJAIS mappings
+    - comparison_v2.csv row count
+    - narrative_v2.txt word count
+  Say: "✅ SPECTER2 RUN COMPLETE.
+  comparison_v2.csv and narrative_v2.txt are ready in the Download tab.
+  cluster_audit.csv contains full LLM voting details per paper."
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+CRITICAL RULES
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+1. ONE PHASE PER MESSAGE — complete one phase then STOP and wait.
+2. NEVER SKIP STOP GATES — 4 gates, always wait for user confirmation.
+3. NO HALLUCINATION — only reference data returned by tools.
+4. When you see "run specter" or "run v2" → start Phase 1.
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+TOOLS
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+1. load_and_embed_specter2(csv_path)
+   Builds combined T+A text per paper, embeds with SPECTER2, saves to data/v2/
+2. cluster_with_umap_hdbscan(umap_neighbors, umap_min_dist, hdbscan_min_cluster_size, hdbscan_min_samples)
+   UMAP → HDBSCAN, targets 15-30 clusters of 5-120 papers, cosine metric
+3. label_clusters_council_of_3(batch_size)
+   3 Mistral-small calls with distinct personas → mode vote for final label
+   Saves cluster_audit.csv with all 3 votes + paper details
+4. map_clusters_to_pajais_v2()
+   Maps cluster labels to PAJAIS 25 categories
+5. export_v2_outputs()
+   Generates comparison_v2.csv (one row per paper) + narrative_v2.txt
+""".strip()
+# Chat model that drives the agent: Mistral "mistral-small-latest" at
+# temperature 0.3.
+_llm_v2    = ChatMistralAI(model="mistral-small-latest", temperature=0.3)
+# Checkpointer shared by the agent and by clean_thread_history_v2(), which
+# reads and rewrites the checkpoint stored for a given thread_id.
+_memory_v2 = MemorySaver()
+# Tools exposed to the agent, listed in the same order as the phases in
+# SYSTEM_PROMPT_V2 (all imported from tools_v2).
+_tools_v2 = [
+    load_and_embed_specter2,
+    cluster_with_umap_hdbscan,
+    label_clusters_council_of_3,
+    map_clusters_to_pajais_v2,
+    export_v2_outputs,
+]
+# Public entry point of this module: the ReAct agent wired to the model,
+# tools, checkpointer and system prompt defined above.
+agent_v2 = create_react_agent(
+    model=_llm_v2,
+    tools=_tools_v2,
+    checkpointer=_memory_v2,
+    prompt=SYSTEM_PROMPT_V2,
+)
+def clean_thread_history_v2(thread_id: str) -> None:
+    """Remove AIMessages with unresolved tool calls from LangGraph memory."""
+    config     = {"configurable": {"thread_id": thread_id}}
+    checkpoint = _memory_v2.get(config)
+    if checkpoint is None:
+        return
+    messages = checkpoint.get("channel_values", {}).get("messages", [])
+    if not messages:
+        return
+    responded_ids = set(
+        msg.tool_call_id
+        for msg in messages
+        if isinstance(msg, ToolMessage)
+    )
+    def is_safe(msg):
+        if not isinstance(msg, AIMessage):
+            return True
+        calls = getattr(msg, "tool_calls", [])
+        return (not calls) or all(c.get("id") in responded_ids for c in calls)
+    clean = list(filter(is_safe, messages))
+    if len(clean) == len(messages):
+        return
+    checkpoint["channel_values"]["messages"] = clean
+    _memory_v2.put(config, checkpoint, {}, {})