Spaces:
Sleeping
Sleeping
Create agent_v2.py
Browse files- agent_v2.py +138 -0
agent_v2.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
agent_v2.py - SPECTER2 + HDBSCAN + Council-of-3 Thematic Analysis Agent.
|
| 3 |
+
Single run on combined Title+Abstract per paper.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
from langgraph.prebuilt import create_react_agent
|
| 12 |
+
from langgraph.checkpoint.memory import MemorySaver
|
| 13 |
+
from langchain_mistralai import ChatMistralAI
|
| 14 |
+
from langchain_core.messages import AIMessage, ToolMessage
|
| 15 |
+
|
| 16 |
+
from tools_v2 import (
|
| 17 |
+
load_and_embed_specter2,
|
| 18 |
+
cluster_with_umap_hdbscan,
|
| 19 |
+
label_clusters_council_of_3,
|
| 20 |
+
map_clusters_to_pajais_v2,
|
| 21 |
+
export_v2_outputs,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
SYSTEM_PROMPT_V2 = """
|
| 25 |
+
You are a computational thematic analysis expert for systematic literature reviews
|
| 26 |
+
in Information Systems, using SPECTER2 embeddings + HDBSCAN clustering.
|
| 27 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
+
ROLE
|
| 29 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
+
You guide a researcher through a 5-phase SPECTER2 thematic analysis.
|
| 31 |
+
Each paper is represented by ONE combined Title+Abstract vector (SPECTER2).
|
| 32 |
+
Clustering uses UMAP + HDBSCAN (density-based, 15-30 clusters of 5-120 papers).
|
| 33 |
+
Labeling uses a council of 3 LLMs — final label is the mode of 3 votes.
|
| 34 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 35 |
+
FULL WORKFLOW
|
| 36 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 37 |
+
Triggered by: researcher types "run specter" or "run v2"
|
| 38 |
+
|
| 39 |
+
Phase 1 β Load & Embed:
|
| 40 |
+
Call: load_and_embed_specter2(csv_path="data/uploaded.csv")
|
| 41 |
+
Show: papers count, embedding dimension, any notes.
|
| 42 |
+
STOP GATE 1: "Phase 1 complete. Type yes to run UMAP+HDBSCAN clustering."
|
| 43 |
+
|
| 44 |
+
Phase 2 β UMAP + HDBSCAN Clustering:
|
| 45 |
+
Call: cluster_with_umap_hdbscan(umap_neighbors=15, umap_min_dist=0.05,
|
| 46 |
+
hdbscan_min_cluster_size=5, hdbscan_min_samples=3)
|
| 47 |
+
Show: number of clusters found, cluster sizes, noise count.
|
| 48 |
+
If clusters < 15 or > 30, note this to researcher and suggest they may
|
| 49 |
+
want to re-run with adjusted parameters.
|
| 50 |
+
STOP GATE 2: "Phase 2 complete. Type yes to run council-of-3 LLM labeling."
|
| 51 |
+
|
| 52 |
+
Phase 3 β Council of 3 LLM Labeling:
|
| 53 |
+
Call: label_clusters_council_of_3(batch_size=5)
|
| 54 |
+
Show: clusters labeled, unanimous/majority/split vote counts.
|
| 55 |
+
Tell researcher: "Cluster Audit CSV is ready in the Download tab.
|
| 56 |
+
It shows all 3 LLM votes, final label, and which papers are in each cluster."
|
| 57 |
+
STOP GATE 3: "Phase 3 complete. Type yes to map to PAJAIS taxonomy."
|
| 58 |
+
|
| 59 |
+
Phase 4 β PAJAIS Mapping:
|
| 60 |
+
Call: map_clusters_to_pajais_v2()
|
| 61 |
+
Show: table of Cluster | Label | PAJAIS Category | Confidence | Rationale
|
| 62 |
+
STOP GATE 4: "Phase 4 complete. Type yes to generate final outputs."
|
| 63 |
+
|
| 64 |
+
Phase 5 β Final Outputs:
|
| 65 |
+
Call: export_v2_outputs()
|
| 66 |
+
Show:
|
| 67 |
+
- Cluster labels and PAJAIS mappings
|
| 68 |
+
- comparison_v2.csv row count
|
| 69 |
+
- narrative_v2.txt word count
|
| 70 |
+
Say: "✅ SPECTER2 RUN COMPLETE.
|
| 71 |
+
comparison_v2.csv and narrative_v2.txt are ready in the Download tab.
|
| 72 |
+
cluster_audit.csv contains full LLM voting details per paper."
|
| 73 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 74 |
+
CRITICAL RULES
|
| 75 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 76 |
+
1. ONE PHASE PER MESSAGE β complete one phase then STOP and wait.
|
| 77 |
+
2. NEVER SKIP STOP GATES β 4 gates, always wait for user confirmation.
|
| 78 |
+
3. NO HALLUCINATION β only reference data returned by tools.
|
| 79 |
+
4. When you see "run specter" or "run v2" → start Phase 1.
|
| 80 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 81 |
+
TOOLS
|
| 82 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 83 |
+
1. load_and_embed_specter2(csv_path)
|
| 84 |
+
Builds combined T+A text per paper, embeds with SPECTER2, saves to data/v2/
|
| 85 |
+
2. cluster_with_umap_hdbscan(umap_neighbors, umap_min_dist, hdbscan_min_cluster_size, hdbscan_min_samples)
|
| 86 |
+
UMAP → HDBSCAN, targets 15-30 clusters of 5-120 papers, cosine metric
|
| 87 |
+
3. label_clusters_council_of_3(batch_size)
|
| 88 |
+
3 Mistral-small calls with distinct personas → mode vote for final label
|
| 89 |
+
Saves cluster_audit.csv with all 3 votes + paper details
|
| 90 |
+
4. map_clusters_to_pajais_v2()
|
| 91 |
+
Maps cluster labels to PAJAIS 25 categories
|
| 92 |
+
5. export_v2_outputs()
|
| 93 |
+
Generates comparison_v2.csv (one row per paper) + narrative_v2.txt
|
| 94 |
+
""".strip()
|
| 95 |
+
|
| 96 |
+
# Chat model that drives the agent loop.
_llm_v2 = ChatMistralAI(model="mistral-small-latest", temperature=0.3)

# Checkpointer that stores conversation state, looked up by thread_id.
_memory_v2 = MemorySaver()

# Tools exposed to the agent, listed in the order the workflow phases call them.
_tools_v2 = [
    load_and_embed_specter2,
    cluster_with_umap_hdbscan,
    label_clusters_council_of_3,
    map_clusters_to_pajais_v2,
    export_v2_outputs,
]

# ReAct-style agent wired to the model, tools, checkpointer and system prompt.
agent_v2 = create_react_agent(
    model=_llm_v2,
    tools=_tools_v2,
    checkpointer=_memory_v2,
    prompt=SYSTEM_PROMPT_V2,
)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def clean_thread_history_v2(thread_id: str) -> None:
    """Remove dangling tool-call traffic from a thread's saved message history.

    An ``AIMessage`` is dropped when at least one of its tool calls has no
    matching ``ToolMessage`` in the history.  Any ``ToolMessage`` that answers
    a call issued by a dropped ``AIMessage`` is dropped as well, so the
    history never keeps a tool result whose originating call is gone.

    The checkpoint is written back only when something was removed.

    Args:
        thread_id: Identifier of the conversation thread whose checkpoint
            should be cleaned.

    Returns:
        None. Returns early without writing when the thread has no
        checkpoint, no messages, or nothing to remove.
    """
    config = {"configurable": {"thread_id": thread_id}}
    checkpoint = _memory_v2.get(config)
    if checkpoint is None:
        return

    messages = checkpoint.get("channel_values", {}).get("messages", [])
    if not messages:
        return

    # IDs of every tool call that received a response.
    responded_ids = {
        msg.tool_call_id for msg in messages if isinstance(msg, ToolMessage)
    }

    def _call_ids(msg) -> list:
        """Return the tool-call IDs carried by *msg* (empty when it has none)."""
        return [call.get("id") for call in (getattr(msg, "tool_calls", None) or [])]

    # IDs issued by AIMessages that will be removed because at least one of
    # their calls was never answered.  Responses to these IDs are orphans.
    dropped_call_ids = set()
    for msg in messages:
        if isinstance(msg, AIMessage):
            ids = _call_ids(msg)
            if any(call_id not in responded_ids for call_id in ids):
                dropped_call_ids.update(ids)

    def is_safe(msg) -> bool:
        """Tell whether *msg* can stay in the cleaned history."""
        if isinstance(msg, AIMessage):
            return all(call_id in responded_ids for call_id in _call_ids(msg))
        if isinstance(msg, ToolMessage):
            return msg.tool_call_id not in dropped_call_ids
        return True

    clean = [msg for msg in messages if is_safe(msg)]
    if len(clean) == len(messages):
        return

    checkpoint["channel_values"]["messages"] = clean
    # NOTE(review): metadata and new_versions are passed as empty dicts;
    # confirm that the installed langgraph checkpointer persists the modified
    # channel values (and accepts this config) when called this way.
    _memory_v2.put(config, checkpoint, {}, {})
|