BERTopic_AGENTIC_AI_GROUP_1

Sleeping

App Files Files Community

BERTopic_AGENTIC_AI_GROUP_1 / agent.py

anujjuna

Update agent.py

9c754e7 verified 18 days ago

raw

history blame contribute delete

22.9 kB

	"""
	agent.py
	--------
	LLM-driven topic interpretation and classification module using a 3-LLM ensemble.
	"""

	from __future__ import annotations
	import json
	import logging
	import os
	import time
	from dataclasses import dataclass, asdict
	from typing import Optional
	import pandas as pd
	import requests
	import re
	from groq import Groq

	# ---------------------------------------------------------------------------
	# Logging
	# ---------------------------------------------------------------------------
	# Root logging configuration: every record renders as "LEVEL | message".
	# The separator is a plain pipe; "\|" is not a valid string escape and would
	# put a literal backslash into each log line.
	logging.basicConfig(level=logging.INFO, format="%(levelname)s | %(message)s")
	# Module-level logger shared by every function in this file.
	logger = logging.getLogger(__name__)

	# ---------------------------------------------------------------------------
	# Constants
	# ---------------------------------------------------------------------------
	# Model name handed to the Groq client for the first council vote.
	DEFAULT_MODEL = "llama-3.1-8b-instant"

	# Mistral model name (kept as a named constant next to the Groq default).
	MISTRAL_DEFAULT_MODEL = "mistral-small-latest"

	# Categories offered to the LLMs for the "taxonomy_category" answer field.
	DEFAULT_TAXONOMY_CATEGORIES = [
	    "Artificial Intelligence",
	    "Machine Learning",
	    "Natural Language Processing",
	    "Computer Vision",
	    "Information Systems",
	    "Healthcare & Bioinformatics",
	    "Finance & Economics",
	    "Cybersecurity",
	    "Human-Computer Interaction",
	    "Robotics & Automation",
	    "Education Technology",
	    "Environmental Science",
	    "Social Sciences",
	    "Data Engineering",
	    "Other",
	]

	# ---------------------------------------------------------------------------
	# PAJAIS 2019 Knowledge — what the 2019 taxonomy covers vs does NOT cover
	# ---------------------------------------------------------------------------
	# Themes the PAJAIS 2019 taxonomy covers.
	# Order matters: the prompt builder quotes only the first 10 entries.
	PAJAIS_COVERED = [
	    "IS strategy",
	    "IS adoption",
	    "IS governance",
	    "e-commerce",
	    "enterprise systems",
	    "ERP",
	    "knowledge management",
	    "decision support",
	    "e-government",
	    "social media IS",
	    # --- entries below are not quoted in the prompt ---
	    "IT outsourcing",
	    "IS security",
	    "privacy",
	    "IS education",
	    "mobile commerce",
	    "business intelligence",
	    "data analytics",
	    "IS in healthcare (general)",
	    "human computer interaction",
	    "HCI",
	    "IT project management",
	]

	# Themes the PAJAIS 2019 taxonomy does NOT cover.
	# Order matters: the prompt builder quotes only the first 12 entries.
	PAJAIS_NOT_COVERED = [
	    # Language models / generative AI
	    "large language models",
	    "LLM",
	    "GPT",
	    "generative AI",
	    "RAG",
	    # Process mining
	    "process mining",
	    "event log",
	    "Petri net",
	    "conformance checking",
	    # Privacy-preserving learning
	    "federated learning",
	    "differential privacy",
	    "DP-SGD",
	    # --- entries below are not quoted in the prompt ---
	    # Fairness and explainability
	    "fairness",
	    "algorithmic bias",
	    "responsible AI",
	    "FATE",
	    "XAI",
	    "explainable AI",
	    # Blockchain
	    "blockchain analytics",
	    "smart contract",
	    "DeFi",
	    "tokenomics",
	    # Pandemic
	    "COVID-19 IS",
	    "pandemic informatics",
	    # Security
	    "Android malware",
	    "mobile security",
	    "dark web",
	    "cyber insurance",
	    # Agents
	    "agentic AI",
	    "multi-agent orchestration",
	    # Neural text models
	    "transformer",
	    "BERT",
	    "neural topic model",
	    "BERTopic",
	    # Neural recommenders and graphs
	    "recommender neural",
	    "graph neural network",
	    "GNN",
	    # Infrastructure
	    "heterogeneous computing",
	    "IoT analytics",
	    "edge computing IS",
	    # HR
	    "talent matching",
	    "job-person fit",
	    "HR analytics",
	]

	# Rule-based NOVEL trigger — fires ONLY on specific, unambiguous compound/technical
	# terms that are absent from the PAJAIS 2019 coverage list.
	# Deliberately narrow: generic single words such as "data", "model", "network",
	# "learning", "deep", "smart", "financial" or "detection" are not alternatives here.
	#
	# Pattern notes:
	# - Alternatives are separated by a bare "|" (an escaped "\|" would match a literal
	#   pipe character and disable the alternation entirely).
	# - Stems end in "\w*" so inflected forms match before the closing "\b"
	#   ("federated learning", "retrieval-augmented", "prompt engineering").
	# - "." between words (e.g. "multi.agent", "covid.19") accepts any single separator.
	NOVEL_REGEX_TRIGGERS = re.compile(
	    r'\b('
	    r'llms?|gpt[\-\s]?\d*|large\s+language\s+model|generative\s+ai|'
	    r'federat\w*\s+learn\w*|differential\s+privac\w*|dp\-sgd|'
	    r'process\s+mining|event\s+log|petri\s+net|conformance\s+check\w*|'
	    r'blockchain|smart\s+contract|defi\b|tokenomic\w*|'
	    r'malware|botnet|dark\s+web|cyber\s+insur\w*|'
	    r'responsible\s+ai|explainab\w*\s+ai|algorithmic\s+bias|xai\b|'
	    r'agentic\s+ai|multi.agent\s+orchest\w*|'
	    r'graph\s+neural\s+network|gnn\b|'
	    r'retrieval.augment\w*|prompt\s+engineer\w*|rag\b|'
	    r'talent\s+match\w*|job.person\s+fit|'
	    r'covid.19|pandemic\s+inform\w*'
	    r')\b',
	    re.IGNORECASE
	)

	def _is_deterministic_novel(keywords: list[str], samples: list[str]) -> bool:
	    """Return True when a NOVEL trigger term appears in the keywords or samples.

	    Pure rule-based check (no LLM). The keywords are joined with single spaces
	    into one lowercase string, and so are the samples; each joined string is
	    searched with ``NOVEL_REGEX_TRIGGERS``. Joining lets a compound term such as
	    "process mining" match even when its two words are separate keywords.
	    """
	    # Build both haystacks up front, then test them in keyword-then-sample order.
	    haystacks = (
	        " ".join(keywords).lower(),
	        " ".join(samples).lower(),
	    )
	    return any(NOVEL_REGEX_TRIGGERS.search(text) is not None for text in haystacks)

	# ---------------------------------------------------------------------------
	# Data Classes
	# ---------------------------------------------------------------------------
	@dataclass
	class TopicInterpretation:
	    """Structured interpretation for a single topic.

	    The class is a plain (non-slotted, non-frozen) dataclass on purpose:
	    ``interpret_topic`` attaches extra evidence attributes to instances after
	    construction, and ``dataclasses.asdict`` does not include those extras.
	    """
	    # Topic identifier as supplied by the caller.
	    topic_id: int
	    # Final human-readable label chosen for the topic.
	    label: str
	    # Taxonomy category reported for the topic.
	    category: str
	    # Final classification string ("MAPPED" or "NOVEL" when built by interpret_topic).
	    classification: str
	    # Number of papers assigned to the topic.
	    paper_count: int = 0
	    # Topic keywords; None when not provided.
	    keywords: Optional[list[str]] = None

	# ---------------------------------------------------------------------------
	# API Clients & Calls
	# ---------------------------------------------------------------------------
	def build_groq_client(api_key: Optional[str] = None):
	    """Create a Groq client with client-side retries disabled.

	    Args:
	        api_key: Explicit key; when falsy, the ``GROQ_API_KEY`` environment
	            variable is used instead.

	    Returns:
	        A ``Groq`` instance constructed with ``max_retries=0``.

	    Raises:
	        ValueError: If neither the argument nor the environment provides a key.
	    """
	    resolved_key = api_key if api_key else os.getenv("GROQ_API_KEY")
	    if resolved_key:
	        return Groq(api_key=resolved_key, max_retries=0)
	    raise ValueError("No Groq API key provided.")

	def call_gemini_label(prompt: str, api_key: str) -> dict:
	    """Call the Google AI Studio (Gemini) generateContent endpoint.

	    Args:
	        prompt: Full prompt text sent as the single user part.
	        api_key: Gemini API key; a falsy value short-circuits to ``{}``.

	    Returns:
	        The JSON object parsed from the model's text answer, or ``{}`` when the
	        key is missing, the API reports an error, the request fails, or the
	        answer is not a JSON object. Never raises.
	    """
	    if not api_key:
	        return {}
	    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"
	    # The key goes in a header rather than the query string: request failures are
	    # logged below and their message includes the URL, which must not carry the key.
	    headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}
	    payload = {"contents": [{"parts": [{"text": prompt}]}], "generationConfig": {"temperature": 0.2}}
	    try:
	        response = requests.post(url, headers=headers, json=payload, timeout=10)
	        data = response.json()
	        if "error" in data or "candidates" not in data:
	            logger.error(f"Gemini error / missing candidates. Response: {data}")
	            return {}
	        raw = data["candidates"][0]["content"]["parts"][0]["text"].strip()
	        # Strip markdown code fences, then keep only the outermost {...} span.
	        raw = raw.replace("```json", "").replace("```", "").strip()
	        start = raw.find("{")
	        end = raw.rfind("}") + 1
	        if start != -1 and end != 0:
	            raw = raw[start:end]
	        parsed = json.loads(raw)
	    except Exception as e:
	        # Best-effort call: any transport/shape/parse failure counts as "no answer".
	        logger.warning(f"Gemini call failed: {e}")
	        return {}
	    if not isinstance(parsed, dict):
	        # Callers use dict methods on the result; reject scalars and arrays.
	        logger.warning(f"Gemini call failed: expected a JSON object, got {type(parsed).__name__}")
	        return {}
	    return parsed

	def call_mistral_label(prompt: str, api_key: str, model: Optional[str] = None) -> dict:
	    """Call the Mistral chat-completions API.

	    Args:
	        prompt: Full prompt text sent as a single user message.
	        api_key: Mistral API key; a falsy value short-circuits to ``{}``.
	        model: Model name; defaults to ``MISTRAL_DEFAULT_MODEL`` when omitted.

	    Returns:
	        The JSON object parsed from the model's answer, or ``{}`` when the key
	        is missing, the response has no ``choices``, the request fails, or the
	        answer is not a JSON object. Never raises.
	    """
	    if not api_key:
	        return {}
	    try:
	        response = requests.post(
	            "https://api.mistral.ai/v1/chat/completions",
	            headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
	            json={
	                "model": model or MISTRAL_DEFAULT_MODEL,
	                "messages": [{"role": "user", "content": prompt}],
	                "temperature": 0.2,
	            },
	            timeout=10,
	        )
	        data = response.json()
	        if not isinstance(data, dict) or "choices" not in data:
	            # Log the payload itself so API errors are diagnosable.
	            logger.error(f"Mistral error / missing choices. Response: {data}")
	            return {}
	        raw = data["choices"][0]["message"]["content"].strip()
	        # Strip markdown code fences, then keep only the outermost {...} span.
	        raw = raw.replace("```json", "").replace("```", "").strip()
	        start, end = raw.find("{"), raw.rfind("}") + 1
	        if start != -1 and end != 0:
	            raw = raw[start:end]
	        parsed = json.loads(raw)
	    except Exception as e:
	        # Best-effort call: any transport/shape/parse failure counts as "no answer".
	        logger.warning(f"Mistral call failed: {e}")
	        return {}
	    if not isinstance(parsed, dict):
	        # Callers use dict methods on the result; reject scalars and arrays.
	        logger.warning(f"Mistral call failed: expected a JSON object, got {type(parsed).__name__}")
	        return {}
	    return parsed

	def _call_llm_json(client, prompt: str, model: str) -> dict:
	    """Call the Groq chat-completions API and parse a JSON object from the answer.

	    Args:
	        client: Object exposing ``chat.completions.create(...)``.
	        prompt: Full prompt text sent as a single user message.
	        model: Model name forwarded to the API.

	    Returns:
	        The parsed JSON object, or ``{}`` when the call fails, the answer has no
	        content, or the answer is not a JSON object. Never raises.
	    """
	    try:
	        response = client.chat.completions.create(
	            model=model, messages=[{"role": "user", "content": prompt}], temperature=0.2, timeout=10,
	        )
	        content = response.choices[0].message.content
	        raw = (content or "").strip()
	        # Strip markdown code fences, then keep only the outermost {...} span.
	        raw = raw.replace("```json", "").replace("```", "").strip()
	        start = raw.find("{")
	        end = raw.rfind("}") + 1
	        if start != -1 and end != 0:
	            raw = raw[start:end]
	        parsed = json.loads(raw)
	    except Exception as e:
	        # Best-effort call: any transport/shape/parse failure counts as "no answer".
	        logger.warning(f"Groq call failed: {e}")
	        return {}
	    if not isinstance(parsed, dict):
	        # json.loads accepts scalars and arrays; callers rely on dict methods.
	        logger.warning(f"Groq call failed: expected a JSON object, got {type(parsed).__name__}")
	        return {}
	    return parsed

	# ---------------------------------------------------------------------------
	# Logic Helpers
	# ---------------------------------------------------------------------------
	def convert_numpy_types(obj):
	    """Recursively convert numpy values to native Python types for JSON serialisation.

	    Handles dicts (values only), lists, tuples, ``np.ndarray`` (converted to
	    nested lists), and the numpy bool, integer and floating scalar types.
	    Any other object is returned unchanged.
	    """
	    import numpy as np
	    if isinstance(obj, dict):
	        return {k: convert_numpy_types(v) for k, v in obj.items()}
	    if isinstance(obj, list):
	        return [convert_numpy_types(v) for v in obj]
	    if isinstance(obj, tuple):
	        return tuple(convert_numpy_types(v) for v in obj)
	    if isinstance(obj, np.ndarray):
	        # tolist() yields nested lists; recurse in case of object-dtype contents.
	        return convert_numpy_types(obj.tolist())
	    if isinstance(obj, np.bool_):
	        # np.bool_ is neither np.integer nor JSON-serialisable, so handle it explicitly.
	        return bool(obj)
	    if isinstance(obj, np.integer):
	        return int(obj)
	    if isinstance(obj, np.floating):
	        return float(obj)
	    return obj

	def _safe_capitalize(s: str) -> str:
	    """Upper-case the first character of ``s`` and leave the rest untouched.

	    Falsy input is treated as an empty string; other values are converted with
	    ``str()`` and stripped of surrounding whitespace first.
	    """
	    text = str(s or "").strip()
	    if not text:
	        return ""
	    head, tail = text[0], text[1:]
	    return head.upper() + tail

	def clean_label(label: str) -> str:
	    """Normalise a label: collapse whitespace, trim trailing dots, cap at 60 chars.

	    Falsy input yields "". Labels longer than 60 characters are cut at the last
	    space inside the first 60 characters, or hard-cut at 60 if there is none.
	    """
	    if not label:
	        return ""

	    # split()/join collapses every whitespace run (newlines included) to one space.
	    text = " ".join(label.split()).rstrip(" .")

	    limit = 60
	    if len(text) > limit:
	        head = text[:limit]
	        last_space = head.rfind(" ")
	        # Prefer a word boundary; fall back to a hard cut when there is no space.
	        text = head if last_space == -1 else head[:last_space]

	    return text.strip()

	def _get_keyword_overlap(label: str, keywords: list[str]) -> int:
	    """Count distinct whitespace-separated label words that equal a keyword.

	    Comparison is case-insensitive and uses whole-string equality, so a
	    multi-word keyword never matches a single label word.
	    """
	    words_in_label = set(label.lower().split())
	    lowered_keywords = {kw.lower() for kw in keywords}
	    return len(words_in_label.intersection(lowered_keywords))

	def select_best_interpretation(results: list[dict], keywords: list[str]) -> dict:
	    """Pick one LLM result: majority label first, keyword overlap as tie-breaker.

	    Results that are falsy or lack a "label" key are ignored; ``{}`` is returned
	    when none remain. If some cleaned, lower-cased label occurs at least twice,
	    the first result carrying it wins. Otherwise the result whose cleaned label
	    shares the most words with ``keywords`` wins, shorter labels breaking ties.
	    The chosen dict is returned with its "label" replaced by the cleaned label
	    (the input dict itself is modified).
	    """
	    candidates = [res for res in results if res and "label" in res]
	    if not candidates:
	        return {}

	    cleaned = [clean_label(res["label"]) for res in candidates]
	    vote_keys = [text.lower() for text in cleaned]

	    # Majority vote: the first candidate whose label is shared by another one.
	    for candidate, text, key in zip(candidates, cleaned, vote_keys):
	        if vote_keys.count(key) >= 2:
	            candidate["label"] = text
	            return candidate

	    # No agreement: highest keyword overlap, then shortest label, then input order.
	    def rank(position: int):
	        text = cleaned[position]
	        return (-_get_keyword_overlap(text, keywords), len(text))

	    winner = min(range(len(candidates)), key=rank)
	    chosen = candidates[winner]
	    chosen["label"] = cleaned[winner]
	    return chosen

	def _fallback_label_from_keywords(keywords: list[str], topic_id: int) -> tuple[str, str]:
	    """Derive a (label, category) pair from keywords alone, without any LLM.

	    The rules are tried in order and the first rule with at least one trigger
	    word among the lower-cased keywords wins. With no hit, the label is built
	    from the first two keywords and the category is "Other". ``topic_id`` is
	    accepted but not used.
	    """
	    present = {kw.lower() for kw in keywords}
	    rules = (
	        ({"privacy", "data", "security"}, "Digital Privacy and Security", "Cybersecurity"),
	        ({"ai", "chatbots", "agents"}, "Conversational AI", "Artificial Intelligence"),
	        ({"neural", "network", "deep"}, "Deep Learning Systems", "Machine Learning"),
	    )
	    for triggers, label, category in rules:
	        if not triggers.isdisjoint(present):
	            return label, category

	    leading = ", ".join(keywords[:2])
	    return f"Topic study on {leading}", "Other"

	# ---------------------------------------------------------------------------
	# Core Logic — Prompt Builder
	# ---------------------------------------------------------------------------
	def _build_interpretation_prompt(keywords, samples, cats) -> str:
	    """Build the labelling/classification prompt sent to every LLM.

	    Args:
	        keywords: Topic keywords, listed comma-separated in the prompt.
	        samples: Representative titles; only the first three are included,
	            separated by " | ".
	        cats: Category names offered for the ``taxonomy_category`` field.

	    Returns:
	        The prompt text. It quotes the first 10 entries of ``PAJAIS_COVERED``
	        and the first 12 of ``PAJAIS_NOT_COVERED``, and asks for a JSON object
	        with the keys label, taxonomy_category, classification and reasoning.
	    """
	    pajais_covered_str = "; ".join(PAJAIS_COVERED[:10])
	    pajais_not_str = "; ".join(PAJAIS_NOT_COVERED[:12])
	    # Joined outside the f-string: a backslash-escaped pipe inside a replacement
	    # field is a syntax error before Python 3.12 and a stray backslash afterwards.
	    titles_str = " | ".join(samples[:3])
	    return f"""You are an IS research classifier. A BERTopic algorithm produced the following topic cluster from ACM TMIS papers.

	KEYWORDS: {', '.join(keywords)}
	REPRESENTATIVE PAPER TITLES: {titles_str}

	TASK: Generate a label and classify this topic against the PAJAIS 2019 taxonomy.

	PAJAIS 2019 COVERS — use MAPPED only if the topic clearly fits one of these:
	{pajais_covered_str}

	PAJAIS 2019 DOES NOT COVER — use NOVEL if the topic fits here:
	{pajais_not_str}

	CLASSIFICATION RULES:
	- NOVEL if the topic involves: LLMs/GPT/generative AI, process mining, federated learning, differential privacy, fairness/XAI/responsible AI, blockchain analytics, COVID-19 IS, mobile malware, dark web, agentic AI, IoT analytics, talent matching, cyber insurance, or any technique that postdates 2018.
	- MAPPED only if it clearly fits an existing PAJAIS 2019 category listed above.
	- When in doubt, choose NOVEL. TMIS is a computational journal and most of its recent topics post-date the 2019 taxonomy.

	TAXONOMY CATEGORIES (for the taxonomy_category field only): {', '.join(cats)}

	Respond ONLY with valid JSON — no other text, no markdown fences:
	{{
	"label": "<concise 5-8 word label>",
	"taxonomy_category": "<one category from the list>",
	"classification": "MAPPED or NOVEL",
	"reasoning": "<one sentence explaining the MAPPED vs NOVEL decision>"
	}}"""

	# ---------------------------------------------------------------------------
	# Validation Method 2 — Regex / Pattern-based grounding check (non-LLM)
	# ---------------------------------------------------------------------------
	def validate_label_with_regex(label: str, keywords: list[str]) -> dict:
	    """Check whether a generated label is grounded in the cluster's keywords.

	    Uses only ``re`` and set operations — no LLM involved. Tokens are lowercase
	    runs of at least three ASCII letters, minus a small stop-word list.

	    Args:
	        label: The label to validate.
	        keywords: The topic's keywords.

	    Returns:
	        A dict with the same keys on every path:
	        ``verdict`` ("PASS" when there is at least one exact or stem match,
	        else "FAIL"), ``overlap_score`` (exact-match share of label tokens plus
	        0.15 per stem match, capped at 1.0, rounded to 3 places),
	        ``matched_terms`` (sorted exact matches), ``stem_matches`` (at most five
	        "label≈keyword" pairs), ``label_tokens`` (sorted) and ``reason``.
	    """
	    if not label or not keywords:
	        return {
	            "verdict": "FAIL",
	            "overlap_score": 0,
	            "matched_terms": [],
	            "stem_matches": [],
	            "label_tokens": [],
	            "reason": "Empty label or keywords",
	        }

	    # Normalise: lowercase, keep alphabetic tokens of 3+ letters
	    token_pattern = r'\b[a-z]{3,}\b'
	    label_tokens = set(re.findall(token_pattern, label.lower()))
	    kw_tokens = set(re.findall(token_pattern, " ".join(keywords).lower()))

	    # Remove common stop words that add noise
	    noise = {"the", "and", "for", "with", "using", "based", "from", "into", "this", "that", "are"}
	    label_tokens -= noise
	    kw_tokens -= noise

	    # Sorted views make the reported lists (and the [:5] cut below) reproducible;
	    # raw set iteration order varies between interpreter runs.
	    ordered_label = sorted(label_tokens)
	    ordered_kw = sorted(kw_tokens)

	    matched = sorted(label_tokens & kw_tokens)
	    overlap_score = len(matched) / max(len(label_tokens), 1)

	    # Stem-level match: a label token of 4+ letters and a keyword token where one
	    # starts with the first four letters of the other.
	    stem_matches = [
	        f"{lt}≈{kt}"
	        for lt in ordered_label
	        for kt in ordered_kw
	        if len(lt) >= 4 and (kt.startswith(lt[:4]) or lt.startswith(kt[:4]))
	    ]

	    total_score = min(1.0, overlap_score + 0.15 * len(stem_matches))
	    verdict = "PASS" if (len(matched) >= 1 or len(stem_matches) >= 1) else "FAIL"

	    return {
	        "verdict": verdict,
	        "overlap_score": round(total_score, 3),
	        "matched_terms": matched,
	        "stem_matches": stem_matches[:5],
	        "label_tokens": ordered_label,
	        "reason": f"{len(matched)} exact + {len(stem_matches)} stem matches against {len(kw_tokens)} keyword tokens",
	    }

	# ---------------------------------------------------------------------------
	# Core — Topic Interpretation with 3-LLM Council + dual validation
	# ---------------------------------------------------------------------------
	def interpret_topic(
	    topic_id, keywords, samples, groq_client, mistral_key, gemini_key,
	    paper_count, representative_docs
	) -> TopicInterpretation:
	    """Label and classify one topic with the LLM council plus two regex checks.

	    Args:
	        topic_id: Identifier stored on the result and used in log lines.
	        keywords: Topic keywords.
	        samples: Representative titles passed to the prompt and the NOVEL regex.
	        groq_client: Client handed to ``_call_llm_json``.
	        mistral_key: API key for ``call_mistral_label``.
	        gemini_key: API key for ``call_gemini_label``; Gemini is skipped when falsy.
	        paper_count: Stored on the result unchanged.
	        representative_docs: Accepted for interface compatibility; not used here.

	    Returns:
	        A ``TopicInterpretation`` that additionally carries the attributes
	        ``council_votes``, ``regex_validation``, ``novel_forced_by_regex`` and
	        ``classification_votes``.
	    """
	    prompt = _build_interpretation_prompt(keywords, samples, DEFAULT_TAXONOMY_CATEGORIES)

	    # ------------------------------------------------------------------
	    # Step A: Deterministic non-LLM NOVEL pre-check
	    # If keywords/samples match known NOVEL patterns, the topic is NOVEL
	    # regardless of what the LLMs say. Regex only, no AI.
	    # ------------------------------------------------------------------
	    forced_novel = _is_deterministic_novel(keywords, samples)
	    if forced_novel:
	        logger.info(f"Topic {topic_id}: NOVEL forced by regex trigger on keywords={keywords[:4]}")

	    # ------------------------------------------------------------------
	    # Step B: LLM council
	    # Groq and Mistral are always called; Gemini only when a key is given.
	    # One-second pauses separate the calls.
	    # ------------------------------------------------------------------
	    raw_results = []

	    groq_res = _call_llm_json(groq_client, prompt, DEFAULT_MODEL)
	    raw_results.append({"llm": "Groq/LLaMA-3.1", "response": groq_res})
	    time.sleep(1)

	    mistral_res = call_mistral_label(prompt, mistral_key)
	    raw_results.append({"llm": "Mistral-Small", "response": mistral_res})
	    time.sleep(1)

	    if gemini_key:
	        gemini_res = call_gemini_label(prompt, gemini_key)
	        raw_results.append({"llm": "Gemini-2.5-Flash", "response": gemini_res})

	    # Anything that is not a JSON object counts as "no answer"; the steps below
	    # use dict methods on every response.
	    for entry in raw_results:
	        if not isinstance(entry["response"], dict):
	            entry["response"] = {}

	    results = [r["response"] for r in raw_results]

	    # ------------------------------------------------------------------
	    # Step C: Select best label via majority vote on label text
	    # Falls back to a keyword-derived label when no LLM produced one.
	    # ------------------------------------------------------------------
	    best = select_best_interpretation(results, keywords)
	    if not best:
	        l, c = _fallback_label_from_keywords(keywords, topic_id)
	        best = {"label": l, "taxonomy_category": c, "classification": "MAPPED"}

	    final_label = _safe_capitalize(best.get("label"))

	    # ------------------------------------------------------------------
	    # Step D: Classification vote — separate from the label vote
	    # Only answers that normalise to exactly MAPPED or NOVEL are counted.
	    # ------------------------------------------------------------------
	    classification_votes = []
	    for r in results:
	        if r and "classification" in r:
	            v = str(r["classification"]).upper().strip()
	            if v in ("MAPPED", "NOVEL"):
	                classification_votes.append(v)

	    novel_votes = classification_votes.count("NOVEL")
	    mapped_votes = classification_votes.count("MAPPED")

	    # Decision rule:
	    # - regex trigger fired                → NOVEL
	    # - two or more LLMs voted NOVEL       → NOVEL
	    # - anything else (incl. no votes)     → MAPPED
	    if forced_novel or novel_votes >= 2:
	        final_classification = "NOVEL"
	    else:
	        final_classification = "MAPPED"

	    logger.info(
	        f"Topic {topic_id} classification: NOVEL_votes={novel_votes}, "
	        f"MAPPED_votes={mapped_votes}, regex_forced={forced_novel} → {final_classification}"
	    )

	    # ------------------------------------------------------------------
	    # Step E: Build council vote evidence for UI display
	    # One entry per LLM called; "—" marks a missing answer or field.
	    # ------------------------------------------------------------------
	    council_votes = []
	    for r in raw_results:
	        resp = r["response"]
	        council_votes.append({
	            "llm": r["llm"],
	            "label": clean_label(resp.get("label", "—")) if resp else "—",
	            "category": resp.get("taxonomy_category", "—") if resp else "—",
	            "classification": resp.get("classification", "—") if resp else "—",
	            "reasoning": resp.get("reasoning", "—") if resp else "—",
	        })

	    # ------------------------------------------------------------------
	    # Step F: Regex grounding check on the final label
	    # Verifies the label shares tokens with the cluster keywords.
	    # ------------------------------------------------------------------
	    regex_validation = validate_label_with_regex(final_label, keywords)
	    logger.info(
	        f"Topic {topic_id} label grounding: {regex_validation['verdict']} "
	        f"(score={regex_validation['overlap_score']}, matched={regex_validation['matched_terms']})"
	    )

	    # ------------------------------------------------------------------
	    # Build the final TopicInterpretation object
	    # ------------------------------------------------------------------
	    interp = TopicInterpretation(
	        topic_id=topic_id,
	        label=final_label,
	        category=_safe_capitalize(best.get("taxonomy_category")),
	        classification=final_classification,
	        paper_count=paper_count,
	        keywords=keywords,
	    )

	    # Evidence travels as plain instance attributes; asdict() will not include
	    # them, so the caller has to serialise them explicitly.
	    interp.council_votes = council_votes
	    interp.regex_validation = regex_validation
	    interp.novel_forced_by_regex = forced_novel
	    interp.classification_votes = {"NOVEL": novel_votes, "MAPPED": mapped_votes}

	    return interp

	# ---------------------------------------------------------------------------
	# Run Agent — orchestrates all topics and writes outputs
	# ---------------------------------------------------------------------------
	def run_agent(
	    topic_results,
	    groq_key,
	    mistral_key,
	    gemini_key,
	    output_json="topics.json",
	    output_csv="topics.csv",
	) -> dict:
	    """Interpret every topic and write the results to JSON and CSV.

	    Args:
	        topic_results: Mapping whose "documents" entry holds "topics",
	            "topic_keywords" (topic id -> (word, weight) pairs),
	            "representative_docs" and "topic_freq".
	        groq_key: Key passed to ``build_groq_client``.
	        mistral_key: Key forwarded to ``interpret_topic``.
	        gemini_key: Key forwarded to ``interpret_topic``.
	        output_json: Path of the JSON file to write.
	        output_csv: Path of the CSV file to write.

	    Returns:
	        Dict with "interpretations" (topic id -> TopicInterpretation),
	        "json_path" and "csv_path".

	    Raises:
	        ValueError: Propagated from ``build_groq_client`` when no Groq key exists.
	    """
	    client = build_groq_client(groq_key)
	    res = topic_results["documents"]

	    # Sanity check: distinct non-outlier (-1) assignments vs. keyword entries.
	    num_clusters = len([t for t in set(res["topics"]) if t != -1])
	    num_topics = len(res["topic_keywords"])
	    print(f"Final cluster count: {num_clusters}")
	    print(f"Final topic count: {num_topics}")
	    if num_clusters != num_topics:
	        logger.error(f"CONSISTENCY WARNING: {num_clusters} clusters != {num_topics} topics")

	    interpretations = {}
	    for tid, kw_pairs in res["topic_keywords"].items():
	        interp = interpret_topic(
	            tid,
	            [w for w, _ in kw_pairs],
	            res["representative_docs"].get(tid, []),
	            client,
	            mistral_key,
	            gemini_key,
	            res["topic_freq"].get(tid, 0),
	            res["representative_docs"].get(tid, []),
	        )
	        interpretations[tid] = interp
	        logger.info(f"Interpreted {tid}: {interp.label} [{interp.classification}]")

	    # Build serialisable rows — include all validation evidence
	    interp_list = []
	    for item in interpretations.values():
	        row = asdict(item)
	        # asdict() only captures @dataclass fields; add dynamic attributes manually
	        row["council_votes"] = getattr(item, "council_votes", [])
	        row["regex_validation"] = getattr(item, "regex_validation", {})
	        row["novel_forced_by_regex"] = getattr(item, "novel_forced_by_regex", False)
	        row["classification_votes"] = getattr(item, "classification_votes", {})
	        interp_list.append(row)

	    clean_data = convert_numpy_types(interp_list)

	    # Explicit encoding so the file does not depend on the platform default.
	    with open(output_json, "w", encoding="utf-8") as f:
	        json.dump(clean_data, f, indent=2)

	    df = pd.DataFrame(clean_data)
	    if not df.empty:
	        # An empty frame has no "keywords" column, hence the guard.
	        df["keywords"] = df["keywords"].apply(
	            lambda x: ", ".join(x) if isinstance(x, list) else str(x)
	        )
	    # Written unconditionally so the returned csv_path always points at a file.
	    df.to_csv(output_csv, index=False)

	    return {
	        "interpretations": interpretations,
	        "json_path": output_json,
	        "csv_path": output_csv,
	    }


	# Script entry point: running this file directly currently does nothing.
	if __name__ == "__main__":
	    pass