Spaces:

aadisawant2912
/

topic_modelling

Sleeping

App Files Files Community

topic_modelling / agent_v2.py

aadisawant2912

Update agent_v2.py

5b7181e verified 27 days ago

raw

history blame contribute delete

11.7 kB

	"""
	agent_v2.py - SPECTER2 + HDBSCAN + True Council-of-3 Thematic Analysis Agent.
	Runs on HuggingFace Spaces. API keys read from HF Secrets (Settings → Variables and Secrets).
	Council: Mistral + Gemini + Groq running in PARALLEL with disk caching.
	"""

	from __future__ import annotations

	import os
	from dotenv import load_dotenv
	load_dotenv() # local .env fallback — ignored on HuggingFace (HF injects secrets directly)

	# ── Startup secret audit (HuggingFace Spaces) ─────────────────────────────────
	# Report, once at import time, which API keys are visible in the environment.
	# This only prints a status line per key; it never aborts the import.
	import os

	# Keys that are looked up under exactly one environment variable name.
	_key_status = {
	    _name: bool(os.getenv(_name))
	    for _name in ("MISTRAL_API_KEY", "GROQ_API_KEY")
	}
	# The Google key counts as present under either of its two names.
	_key_status["GOOGLE_API_KEY"] = bool(
	    os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")
	)

	for _k, _ok in _key_status.items():
	    _verdict = "✅ found" if _ok else "⚠️ MISSING"
	    print(f" Secret check: {_k} → {_verdict}")

	# Mirror GEMINI_API_KEY into GOOGLE_API_KEY when only the former is set.
	_gemini_alias = os.getenv("GEMINI_API_KEY")
	if _gemini_alias and not os.getenv("GOOGLE_API_KEY"):
	    os.environ["GOOGLE_API_KEY"] = _gemini_alias
	    print(" Remapped GEMINI_API_KEY → GOOGLE_API_KEY")

	from langgraph.prebuilt import create_react_agent
	from langgraph.checkpoint.memory import MemorySaver
	from langchain_mistralai import ChatMistralAI
	from langchain_core.messages import AIMessage, ToolMessage

	from tools_v2 import (
	load_and_embed_specter2,
	cluster_with_umap_hdbscan,
	label_clusters_council_of_3,
	map_clusters_to_pajais_v2,
	export_v2_outputs,
	)

	SYSTEM_PROMPT_V2 = """
	You are a computational thematic analysis expert for systematic literature reviews
	in Information Systems, using SPECTER2 embeddings + HDBSCAN clustering.
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	ROLE
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	You guide a researcher through a 5-phase SPECTER2 thematic analysis.
	Each paper is represented by ONE combined Title+Abstract vector (SPECTER2).
	Clustering uses UMAP + HDBSCAN (density-based, 15-30 clusters of 5-120 papers).
	Labeling uses a TRUE council of 3 DIFFERENT LLMs running in PARALLEL:
	• Mistral (mistral-small-latest)
	• GEMINI
	• Groq (llama3-70b-8192)
	Final label = majority vote (mode) of the 3 independent responses.
	Results are DISK-CACHED — re-runs never re-pay for already-labeled batches.
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	FULL WORKFLOW
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	Triggered by: researcher types "run specter" or "run v2"

	Phase 1 — Load & Embed:
	Call: load_and_embed_specter2(csv_path="data/uploaded.csv")
	Show: total papers, valid papers, embedding dimension (768), any notes.
	STOP GATE 1: "Phase 1 complete. Type yes to run UMAP+HDBSCAN clustering."

	Phase 2 — UMAP + HDBSCAN Clustering:
	Call: cluster_with_umap_hdbscan(umap_neighbors=15, umap_min_dist=0.05,
	hdbscan_min_cluster_size=5, hdbscan_min_samples=3)
	Show: clusters found, cluster sizes list, noise paper count.
	If clusters < 15 or > 30, flag this to the researcher and suggest
	adjusting hdbscan_min_cluster_size (smaller = more clusters, larger = fewer).
	STOP GATE 2: "Phase 2 complete. Type yes to run parallel council-of-3 LLM labeling."

	Phase 3 — Parallel Council of 3 LLM Labeling:
	Call: label_clusters_council_of_3(batch_size=5)
	IMPORTANT — warn the researcher BEFORE calling:
	"Phase 3 will call 3 LLM APIs in parallel (Mistral + OpenAI + Groq).
	Wall time ≈ slowest single model. Already-cached batches are free.
	This may take several minutes on first run."
	Show after completion:
	- clusters labeled count
	- unanimous / majority / split vote breakdown
	- council_members from result
	- cache_files_on_disk (how many batches are now cached)
	Tell researcher: "Cluster Audit CSV is ready in the Download tab.
	It shows all 3 LLM votes (MISTRAL / GEMINI / GROQ), final label,
	confidence scores, and which papers are in each cluster."
	STOP GATE 3: "Phase 3 complete. Type yes to map to PAJAIS taxonomy."

	Phase 4 — PAJAIS Mapping:
	Call: map_clusters_to_pajais_v2()
	Show: table of Cluster \| Label \| PAJAIS Category \| Confidence \| Rationale
	STOP GATE 4: "Phase 4 complete. Type yes to generate final outputs."

	Phase 5 — Final Outputs:
	Call: export_v2_outputs()
	Show:
	- Cluster labels and PAJAIS mappings summary
	- comparison_v2.csv row count
	- narrative_v2.txt word count
	Say: "✅ SPECTER2 RUN COMPLETE.
	comparison_v2.csv and narrative_v2.txt are ready in the Download tab.
	cluster_audit.csv contains full LLM voting details (MISTRAL/OPENAI/GROQ) per paper.
	Cache is stored at data/v2/llm_cache/ — delete this folder to force fresh labels."
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	CACHE BEHAVIOUR (explain if researcher asks)
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	- Every (model + prompt) pair is hashed and stored in data/v2/llm_cache/
	- A cache HIT costs $0 and is instant — no API call is made
	- A cache MISS calls the API and saves the result for all future runs
	- To clear the cache and force fresh labels: delete data/v2/llm_cache/
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	RATE LIMIT NOTES (explain if researcher sees errors)
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	- Each LLM thread has its own inter-batch delay (Groq: 15s, Mistral: 12s, Gemini: 8s)
	- Retry uses exponential backoff: 15s → 30s → 60s → 120s before fallback
	- If a model consistently fails, its fallback label will show "(model error)" in the CSV
	- On HuggingFace Spaces, persistent rate limit errors usually mean the API key
	has hit its free-tier limit — check the relevant API dashboard
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	CRITICAL RULES
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	1. ONE PHASE PER MESSAGE — complete one phase then STOP and wait.
	2. NEVER SKIP STOP GATES — 4 gates, always wait for user confirmation.
	3. NO HALLUCINATION — only reference data returned by tools.
	4. COLUMN NAMES in CSVs use MISTRAL/GEMINI/GROQ not IS_THEORY/DIGITAL_MGT/COMP_SCI.
	5. When you see "run specter" or "run v2" → start Phase 1 immediately.
	6. If a tool returns an error → show the raw error, do NOT retry automatically.
	Ask the researcher: "Would you like to retry Phase X?"
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	TOOLS
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	1. load_and_embed_specter2(csv_path)
	Builds combined Title+Abstract text per paper, embeds with local SPECTER2
	(allenai/specter2_base, ~440MB, downloaded once then cached by HuggingFace).
	No API key needed. Saves to data/v2/.

	2. cluster_with_umap_hdbscan(umap_neighbors, umap_min_dist,
	hdbscan_min_cluster_size, hdbscan_min_samples)
	UMAP (cosine, 5D) → HDBSCAN. Targets 15-30 clusters of 5-120 papers.
	Also saves 2D scatter + bar charts to data/v2/charts.json.

	3. label_clusters_council_of_3(batch_size)
	TRUE parallel ensemble: Mistral + GEMINI + Groq run simultaneously via
	ThreadPoolExecutor. Disk cache at data/v2/llm_cache/ (SHA-256 keyed).
	Saves cluster_audit.csv with all 3 votes + paper details.
	Columns: llm1_MISTRAL_label, llm2_GEMINI_label, llm3_GROQ_label.

	4. map_clusters_to_pajais_v2()
	Maps cluster labels → PAJAIS 25 IS research categories via Mistral.
	Saves data/v2/taxonomy.json.

	5. export_v2_outputs()
	Generates comparison_v2.csv (one row per paper, includes pajais_category)
	and narrative_v2.txt (~500 word academic Section 7 discussion).
	""".strip()

	# ── Orchestrator wiring ───────────────────────────────────────────────────────
	# The objects below form the conversational agent only: one chat model, an
	# in-memory checkpointer for per-thread history, and the five pipeline tools.
	# Cluster labeling itself happens inside the tools imported from tools_v2,
	# not in this model.
	_memory_v2 = MemorySaver()

	# Tools are listed in the same order as the phases in SYSTEM_PROMPT_V2.
	_tools_v2 = [
	    load_and_embed_specter2,
	    cluster_with_umap_hdbscan,
	    label_clusters_council_of_3,
	    map_clusters_to_pajais_v2,
	    export_v2_outputs,
	]

	_llm_v2 = ChatMistralAI(temperature=0.3, model="mistral-small-latest")

	agent_v2 = create_react_agent(
	    _llm_v2,
	    _tools_v2,
	    prompt=SYSTEM_PROMPT_V2,
	    checkpointer=_memory_v2,
	)


	def clean_thread_history_v2(thread_id: str) -> None:
	    """
	    Drop dangling tool-call exchanges from a thread's saved message history.

	    An AIMessage is "dangling" when at least one of its tool calls has no
	    ToolMessage with a matching ``tool_call_id`` in the saved history (for
	    example because the tool raised mid-run). Every dangling AIMessage is
	    removed, together with any ToolMessage that answers one of its calls, so
	    a partially answered batch of tool calls does not leave orphaned replies
	    behind. All other messages are kept in their original order.

	    Does nothing when the thread has no checkpoint, no messages, or no
	    dangling AIMessage.

	    Args:
	        thread_id: Value used as ``configurable.thread_id`` for the agent.
	    """
	    # checkpoint_ns is spelled out (root namespace) so the same config works
	    # for both the read and the write below.
	    # NOTE(review): the in-memory saver's put() appears to index
	    # config["configurable"]["checkpoint_ns"] directly — confirm against the
	    # installed langgraph version.
	    config = {"configurable": {"thread_id": thread_id, "checkpoint_ns": ""}}

	    # get_tuple() is used instead of get() so the stored metadata is available
	    # and can be written back unchanged.
	    saved = _memory_v2.get_tuple(config)
	    if saved is None:
	        return
	    checkpoint = saved.checkpoint
	    messages = checkpoint.get("channel_values", {}).get("messages", [])
	    if not messages:
	        return

	    # Tool-call ids that received a reply.
	    responded_ids = {
	        msg.tool_call_id for msg in messages if isinstance(msg, ToolMessage)
	    }

	    # Positions of dangling AIMessages, plus every call id they issued.
	    dangling_positions = set()
	    orphaned_call_ids = set()
	    for pos, msg in enumerate(messages):
	        if not isinstance(msg, AIMessage):
	            continue
	        calls = getattr(msg, "tool_calls", None) or []
	        call_ids = [call.get("id") for call in calls]
	        if any(call_id not in responded_ids for call_id in call_ids):
	            dangling_positions.add(pos)
	            orphaned_call_ids.update(call_ids)

	    if not dangling_positions:
	        return

	    clean = [
	        msg
	        for pos, msg in enumerate(messages)
	        if pos not in dangling_positions
	        and not (
	            isinstance(msg, ToolMessage)
	            and msg.tool_call_id in orphaned_call_ids
	        )
	    ]
	    checkpoint["channel_values"]["messages"] = clean

	    # Write back with the checkpoint's own metadata rather than an empty dict.
	    # NOTE(review): new_versions is still passed empty, as before; verify the
	    # installed checkpointer persists channel_values on such a call. If it
	    # does not, agent_v2.update_state(...) with RemoveMessage entries is the
	    # alternative to evaluate.
	    _memory_v2.put(config, checkpoint, saved.metadata or {}, {})


	def reset_thread_v2(thread_id: str) -> None:
	    """
	    Fully wipe a thread's memory. Call this from app.py if the researcher
	    clicks a "Reset / Start Over" button, or after a catastrophic tool failure.

	    When the checkpointer exposes ``delete_thread`` it is used to remove
	    everything stored for the thread. Otherwise the latest checkpoint's
	    ``messages`` channel is overwritten with an empty list. Does nothing if
	    the thread has no checkpoint.

	    Usage in app.py:
	        from agent_v2 import reset_thread_v2
	        reset_thread_v2(thread_id)

	    Args:
	        thread_id: Value used as ``configurable.thread_id`` for the agent.
	    """
	    # Preferred path: let the checkpointer drop every record for this thread.
	    # NOTE(review): delete_thread is only present on newer checkpointer
	    # releases, hence the getattr probe — confirm for the pinned version.
	    delete_thread = getattr(_memory_v2, "delete_thread", None)
	    if callable(delete_thread):
	        delete_thread(thread_id)
	        return

	    # Fallback: blank the message list of the latest checkpoint in place.
	    # checkpoint_ns is spelled out (root namespace) so the same config works
	    # for both the read and the write.
	    config = {"configurable": {"thread_id": thread_id, "checkpoint_ns": ""}}
	    saved = _memory_v2.get_tuple(config)
	    if saved is None:
	        return
	    checkpoint = saved.checkpoint
	    checkpoint["channel_values"]["messages"] = []
	    # Keep the checkpoint's existing metadata instead of replacing it with {}.
	    _memory_v2.put(config, checkpoint, saved.metadata or {}, {})