aadisawant2912 committed on
Commit
cb5fffb
·
verified ·
1 Parent(s): bccf63d

Create agent_v2.py

Browse files
Files changed (1) hide show
  1. agent_v2.py +138 -0
agent_v2.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ agent_v2.py - SPECTER2 + HDBSCAN + Council-of-3 Thematic Analysis Agent.
3
+ Single run on combined Title+Abstract per paper.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from dotenv import load_dotenv
9
+ load_dotenv()
10
+
11
+ from langgraph.prebuilt import create_react_agent
12
+ from langgraph.checkpoint.memory import MemorySaver
13
+ from langchain_mistralai import ChatMistralAI
14
+ from langchain_core.messages import AIMessage, ToolMessage
15
+
16
+ from tools_v2 import (
17
+ load_and_embed_specter2,
18
+ cluster_with_umap_hdbscan,
19
+ label_clusters_council_of_3,
20
+ map_clusters_to_pajais_v2,
21
+ export_v2_outputs,
22
+ )
23
+
24
+ SYSTEM_PROMPT_V2 = """
25
+ You are a computational thematic analysis expert for systematic literature reviews
26
+ in Information Systems, using SPECTER2 embeddings + HDBSCAN clustering.
27
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
28
+ ROLE
29
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
30
+ You guide a researcher through a 5-phase SPECTER2 thematic analysis.
31
+ Each paper is represented by ONE combined Title+Abstract vector (SPECTER2).
32
+ Clustering uses UMAP + HDBSCAN (density-based, 15-30 clusters of 5-120 papers).
33
+ Labeling uses a council of 3 LLMs β€” final label is the mode of 3 votes.
34
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
35
+ FULL WORKFLOW
36
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
37
+ Triggered by: researcher types "run specter" or "run v2"
38
+
39
+ Phase 1 β€” Load & Embed:
40
+ Call: load_and_embed_specter2(csv_path="data/uploaded.csv")
41
+ Show: papers count, embedding dimension, any notes.
42
+ STOP GATE 1: "Phase 1 complete. Type yes to run UMAP+HDBSCAN clustering."
43
+
44
+ Phase 2 β€” UMAP + HDBSCAN Clustering:
45
+ Call: cluster_with_umap_hdbscan(umap_neighbors=15, umap_min_dist=0.05,
46
+ hdbscan_min_cluster_size=5, hdbscan_min_samples=3)
47
+ Show: number of clusters found, cluster sizes, noise count.
48
+ If clusters < 15 or > 30, note this to researcher and suggest they may
49
+ want to re-run with adjusted parameters.
50
+ STOP GATE 2: "Phase 2 complete. Type yes to run council-of-3 LLM labeling."
51
+
52
+ Phase 3 β€” Council of 3 LLM Labeling:
53
+ Call: label_clusters_council_of_3(batch_size=5)
54
+ Show: clusters labeled, unanimous/majority/split vote counts.
55
+ Tell researcher: "Cluster Audit CSV is ready in the Download tab.
56
+ It shows all 3 LLM votes, final label, and which papers are in each cluster."
57
+ STOP GATE 3: "Phase 3 complete. Type yes to map to PAJAIS taxonomy."
58
+
59
+ Phase 4 β€” PAJAIS Mapping:
60
+ Call: map_clusters_to_pajais_v2()
61
+ Show: table of Cluster | Label | PAJAIS Category | Confidence | Rationale
62
+ STOP GATE 4: "Phase 4 complete. Type yes to generate final outputs."
63
+
64
+ Phase 5 β€” Final Outputs:
65
+ Call: export_v2_outputs()
66
+ Show:
67
+ - Cluster labels and PAJAIS mappings
68
+ - comparison_v2.csv row count
69
+ - narrative_v2.txt word count
70
+ Say: "βœ… SPECTER2 RUN COMPLETE.
71
+ comparison_v2.csv and narrative_v2.txt are ready in the Download tab.
72
+ cluster_audit.csv contains full LLM voting details per paper."
73
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
74
+ CRITICAL RULES
75
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
76
+ 1. ONE PHASE PER MESSAGE β€” complete one phase then STOP and wait.
77
+ 2. NEVER SKIP STOP GATES β€” 4 gates, always wait for user confirmation.
78
+ 3. NO HALLUCINATION β€” only reference data returned by tools.
79
+ 4. When you see "run specter" or "run v2" β†’ start Phase 1.
80
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
81
+ TOOLS
82
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
83
+ 1. load_and_embed_specter2(csv_path)
84
+ Builds combined T+A text per paper, embeds with SPECTER2, saves to data/v2/
85
+ 2. cluster_with_umap_hdbscan(umap_neighbors, umap_min_dist, hdbscan_min_cluster_size, hdbscan_min_samples)
86
+ UMAP β†’ HDBSCAN, targets 15-30 clusters of 5-120 papers, cosine metric
87
+ 3. label_clusters_council_of_3(batch_size)
88
+ 3 Mistral-small calls with distinct personas β†’ mode vote for final label
89
+ Saves cluster_audit.csv with all 3 votes + paper details
90
+ 4. map_clusters_to_pajais_v2()
91
+ Maps cluster labels to PAJAIS 25 categories
92
+ 5. export_v2_outputs()
93
+ Generates comparison_v2.csv (one row per paper) + narrative_v2.txt
94
+ """.strip()
95
+
96
# Tools exposed to the agent, listed in the order the five-phase workflow
# described in SYSTEM_PROMPT_V2 calls them.
_tools_v2 = [
    load_and_embed_specter2,
    cluster_with_umap_hdbscan,
    label_clusters_council_of_3,
    map_clusters_to_pajais_v2,
    export_v2_outputs,
]

# Checkpointer shared by the agent and by clean_thread_history_v2 below;
# conversation state is looked up per thread_id.
_memory_v2 = MemorySaver()

# Chat model that drives the ReAct loop.
_llm_v2 = ChatMistralAI(temperature=0.3, model="mistral-small-latest")

# The compiled agent: model + tools + system prompt, with checkpointed memory.
agent_v2 = create_react_agent(
    model=_llm_v2,
    tools=_tools_v2,
    prompt=SYSTEM_PROMPT_V2,
    checkpointer=_memory_v2,
)
113
+
114
+
115
def clean_thread_history_v2(thread_id: str) -> None:
    """Remove incomplete tool-call exchanges from a thread's saved history.

    An ``AIMessage`` is considered broken when at least one of its tool
    calls has no ``ToolMessage`` carrying the matching ``tool_call_id``.
    Broken ``AIMessage`` objects are dropped, and so are any ``ToolMessage``
    objects that answered *other* calls of the same broken message, so that
    no tool result is left behind without the assistant message that
    requested it.

    The checkpoint is only written back when something was actually removed.

    Args:
        thread_id: Identifier of the conversation thread whose checkpoint
            in ``_memory_v2`` should be cleaned.

    Returns:
        None. The stored checkpoint is updated in place when needed.
    """
    # checkpoint_ns is set explicitly to the root namespace ("").
    # NOTE(review): recent MemorySaver.put implementations appear to index
    # this key directly -- verify against the installed langgraph version.
    config = {"configurable": {"thread_id": thread_id, "checkpoint_ns": ""}}

    checkpoint = _memory_v2.get(config)
    if checkpoint is None:
        return

    messages = checkpoint.get("channel_values", {}).get("messages", [])
    if not messages:
        return

    # Every tool_call_id that received a tool response somewhere in history.
    responded_ids = {
        msg.tool_call_id for msg in messages if isinstance(msg, ToolMessage)
    }

    def call_ids(msg):
        """Return the ids of all tool calls requested by an AIMessage."""
        return [call.get("id") for call in (getattr(msg, "tool_calls", None) or [])]

    # Collect every call id belonging to a broken AIMessage -- including the
    # ids that *were* answered, because those answers must go as well.
    dropped_call_ids = set()
    for msg in messages:
        if not isinstance(msg, AIMessage):
            continue
        ids = call_ids(msg)
        if any(call_id not in responded_ids for call_id in ids):
            dropped_call_ids.update(ids)

    def is_safe(msg):
        """Return True when the message can stay in the history."""
        if isinstance(msg, AIMessage):
            return not any(call_id in dropped_call_ids for call_id in call_ids(msg))
        if isinstance(msg, ToolMessage):
            return msg.tool_call_id not in dropped_call_ids
        return True

    clean = [msg for msg in messages if is_safe(msg)]
    if len(clean) == len(messages):
        # Nothing to remove; avoid a needless checkpoint write.
        return

    checkpoint["channel_values"]["messages"] = clean
    # NOTE(review): metadata and new_versions are passed empty, as before;
    # confirm the installed checkpointer persists the edited channel_values
    # with these arguments.
    _memory_v2.put(config, checkpoint, {}, {})