""" tools.py — BERTopic Thematic Analysis Pipeline Tools ===================================================== Nine LangChain @tool functions implementing Braun & Clarke's (2006) six-phase thematic analysis pipeline. Conventions ----------- - All tools accept / return plain Python dicts (JSON-serialisable). - Artefacts are written to OUTPUT_DIR / run_key / . - Functional style throughout: map, operator, numpy vectorised ops. - No for/while loops, no try/except, no if/else. Fixes applied (v2) ------------------ - BUG 1 : run_bertopic_discovery() now saves sent_labels.npy — per-sentence cluster-label array required by Tool 4. - BUG 1 : consolidate_into_themes() _build_theme() rewritten — centroid computed from actual merged-cluster embeddings via sent_labels.npy mask (no dead `if False` scaffolding). - ISSUE 1: generate_comparison_csv() guards against missing title run with a .exists() check instead of hard-crashing. Dependencies ------------ pip install langchain langchain-core langchain-mistralai langchain-groq sentence-transformers scikit-learn plotly pandas numpy """ # --------------------------------------------------------------------------- # Stdlib # --------------------------------------------------------------------------- import json import os import re import time from functools import reduce from pathlib import Path from operator import itemgetter # --------------------------------------------------------------------------- # Third-party # --------------------------------------------------------------------------- import numpy as np import pandas as pd import plotly.express as px import plotly.graph_objects as go import plotly.figure_factory as ff from sklearn.metrics.pairwise import cosine_similarity from sklearn.preprocessing import normalize from sentence_transformers import SentenceTransformer import hdbscan import umap import fitz # PyMuPDF — text-only PDF extraction from langchain_core.tools import tool from langchain_core.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_mistralai import ChatMistralAI try: from langchain_groq import ChatGroq # type: ignore[import-not-found] except ImportError: ChatGroq = None # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- MISTRAL_API_KEY: str = os.environ.get("MISTRAL_API_KEY", "") MODEL_NAME: str = "mistral-small-latest" GROQ_API_KEY: str = os.environ.get("GROQ_API_KEY", "") GROQ_MODEL_NAME: str = os.environ.get("GROQ_MODEL_NAME", "llama-3.3-70b-versatile") GROQ_OLLAMA_MODEL_NAME: str = os.environ.get("GROQ_OLLAMA_MODEL_NAME", "llama-3.3-70b-versatile") GROQ_GPT_MODEL_NAME: str = os.environ.get("GROQ_GPT_MODEL_NAME", "openai/gpt-oss-120b") GROQ_JUDGE_MODEL_NAME: str = os.environ.get("GROQ_JUDGE_MODEL_NAME", "llama-3.1-8b-instant") EMBED_MODEL: str = "allenai/specter2_base" BASE_DIR: Path = Path(__file__).resolve().parent OUTPUT_DIR: Path = BASE_DIR / "outputs" N_EVIDENCE: int = 5 # sentences kept per cluster centroid DISTANCE_THRESH: float = 0.35 # cosine-distance threshold (1 - similarity) RANDOM_SEED: int = 42 LLM_TIMEOUT_S: int = 45 LLM_MAX_RETRIES: int = 3 MAX_LABEL_CLUSTERS: int = 60 MIN_CLUSTER_SIZE_FOR_LABEL: int = 20 MAX_TOOL_RETURN_PREVIEW: int = 12 PROVIDER_RETRY_ATTEMPTS: int = 4 PROVIDER_RETRY_BASE_DELAY_S: float = 2.0 PROVIDER_RETRY_RATE_LIMIT_DELAY_S: float = 6.0 PROVIDER_RETRY_MAX_DELAY_S: float = 18.0 HDBSCAN_MIN_CLUSTER_SIZE: int = 20 
HDBSCAN_MIN_SAMPLES: int = 5
HDBSCAN_MAX_CLUSTER_SIZE: int = 120

UMAP_N_NEIGHBORS: int = 15
UMAP_MIN_DIST: float = 0.0
UMAP_N_COMPONENTS_CLUSTER: int = 5
UMAP_N_COMPONENTS_VIZ: int = 2

AUTO_OPTIMIZE_CLUSTERS: bool = True
OPTIMIZE_MAX_ITERS: int = 6
OPTIMIZE_STABLE_ROUNDS: int = 2
OPTIMIZE_MIN_IMPROVEMENT: float = 0.01
OPTIMIZE_TARGET_CLUSTER_MIN: int = 20
OPTIMIZE_TARGET_CLUSTER_MAX: int = 120
OPTIMIZE_TARGET_NOISE_MAX: float = 0.50
OPTIMIZE_MIN_CLUSTER_SIZE_MIN: int = 5
OPTIMIZE_MIN_CLUSTER_SIZE_MAX: int = 60
OPTIMIZE_MAX_CLUSTER_SIZE_MIN: int = 40
OPTIMIZE_MAX_CLUSTER_SIZE_MAX: int = 200
OPTIMIZE_MIN_SAMPLES_MIN: int = 1
OPTIMIZE_MIN_SAMPLES_MAX: int = 15

# Run configurations — keys map to source columns
RUN_CONFIGS: dict[str, list[str]] = {
    "abstract": ["Abstract"],
    "title": ["Title"],
    "keywords": [
        "Author Keywords",
        "Author Keywords Plus",
        "Index Keywords",
        "Keywords",
        "Author_Keywords",
    ],
}

# PAJAIS 25-category taxonomy (Pacific Asia Journal of the Association for
# Information Systems)
PAJAIS_TAXONOMY: list[str] = [
    "Artificial Intelligence & Machine Learning",
    "Big Data & Analytics",
    "Blockchain & Distributed Ledger",
    "Cloud Computing & Infrastructure",
    "Cybersecurity & Privacy",
    "Decision Support Systems",
    "Digital Business & E-Commerce",
    "Digital Health & Telemedicine",
    "Digital Innovation & Transformation",
    "Enterprise Systems & ERP",
    "Fintech & Digital Finance",
    "Green IS & Sustainability",
    "Human-Computer Interaction",
    "Information Systems Strategy",
    "IT Governance & Management",
    "Knowledge Management",
    "Mobile Computing & IoT",
    "Natural Language Processing & Text Mining",
    "Organizational Behavior & IS",
    "Platform Ecosystems & APIs",
    "Privacy & Ethics in IS",
    "Smart Cities & Digital Government",
    "Social Media & Collaboration",
    "Supply Chain & Logistics IS",
    "Virtual Reality & Immersive Technologies",
]

# Boilerplate patterns to strip from abstracts
_BOILERPLATE_RE = re.compile(
    r"(©\s*\d{4}.*?(?:rights reserved|elsevier|springer|wiley)[^.]*\.?)"
    r"|(all rights reserved\.?)"
    r"|(published by.*?(?:ltd|inc|llc)[^.]*\.?)"
    r"|(doi:\s*\S+)",
    re.IGNORECASE,
)

# Sentence splitter — split on sentence-boundary punctuation, keep >= 20 chars
_SENT_RE = re.compile(r"(?<=[.!?])\s+")
_KEYWORD_SPLIT_RE = re.compile(r"\s*[;|]\s*")
_KEYWORD_COMMA_RE = re.compile(r"\s*,\s*")


# ---------------------------------------------------------------------------
# Private helpers (small, mostly pure functions)
# ---------------------------------------------------------------------------
def _ensure_dir(path: Path) -> Path:
    path.mkdir(parents=True, exist_ok=True)
    return path


def _run_dir(run_key: str) -> Path:
    return _ensure_dir(OUTPUT_DIR / run_key)


def _clean_text(text: str) -> str:
    return _BOILERPLATE_RE.sub("", str(text)).strip()


def _split_sentences(text: str) -> list[str]:
    return list(filter(
        lambda s: len(s.strip()) >= 20,
        _SENT_RE.split(_clean_text(text)),
    ))


def _split_keywords(text: str) -> list[str]:
    cleaned = _clean_text(text).replace("\n", " ").strip()
    if not cleaned:
        return []
    primary = list(filter(None, map(str.strip, _KEYWORD_SPLIT_RE.split(cleaned))))
    terms = (
        primary
        if len(primary) > 1
        else list(filter(None, map(str.strip, _KEYWORD_COMMA_RE.split(cleaned))))
    )
    return list(dict.fromkeys(filter(lambda t: len(t) >= 2, terms)))


def _resolve_column_name(df: pd.DataFrame, candidates: list[str]) -> str | None:
    normalised = {str(col).strip().lower(): col for col in df.columns}
    return next(
        (
            normalised.get(str(c).strip().lower())
            for c in candidates
            if normalised.get(str(c).strip().lower()) is not None
        ),
        None,
    )
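
# Behaviour sketch for the two text splitters above (sample strings invented,
# purely illustrative):
#
#   _split_sentences("We study AI adoption. © 2024 Elsevier. All rights reserved.")
#   -> ["We study AI adoption."]        # boilerplate stripped, short tails dropped
#
#   _split_keywords("machine learning; trust; e-commerce")
#   -> ["machine learning", "trust", "e-commerce"]   # ';'/'|' first, ',' as fallback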
def _texts_for_candidates(df: pd.DataFrame, candidates: list[str]) -> tuple[list[str], str | None]:
    col = _resolve_column_name(df, candidates)
    return (
        df[col].dropna().astype(str).tolist(),
        col,
    ) if col else ([], None)


def _embed(sentences: list[str]) -> np.ndarray:
    """Encode sentences to L2-normalised SPECTER2 vectors."""
    model = SentenceTransformer(EMBED_MODEL, trust_remote_code=True)
    raw = model.encode(sentences, show_progress_bar=False, batch_size=64)
    return normalize(raw, norm="l2")  # unit-norm -> cosine = dot product


def _umap_reduce(embeddings: np.ndarray, n_components: int) -> np.ndarray:
    reducer = umap.UMAP(
        n_neighbors=UMAP_N_NEIGHBORS,
        min_dist=UMAP_MIN_DIST,
        n_components=n_components,
        metric="cosine",
        random_state=RANDOM_SEED,
    )
    return reducer.fit_transform(embeddings)


def _cluster(embeddings: np.ndarray, min_cluster_size: int, max_cluster_size: int, min_samples: int) -> np.ndarray:
    return hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric="euclidean",
        cluster_selection_method="eom",
        max_cluster_size=max_cluster_size,
    ).fit_predict(embeddings)


def _centroid(embeddings: np.ndarray) -> np.ndarray:
    """Mean-pool rows then re-normalise to unit length."""
    return normalize(embeddings.mean(axis=0, keepdims=True), norm="l2")[0]


def _top_k_indices(embeddings: np.ndarray, centroid: np.ndarray, k: int) -> np.ndarray:
    sims = cosine_similarity(embeddings, centroid.reshape(1, -1)).flatten()
    return np.argsort(sims)[::-1][:k]


def _llm() -> ChatMistralAI:
    return ChatMistralAI(
        model=MODEL_NAME,
        api_key=MISTRAL_API_KEY,
        temperature=0.2,
        random_seed=RANDOM_SEED,
        timeout=LLM_TIMEOUT_S,
        max_retries=LLM_MAX_RETRIES,
    )
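
# Sanity sketch for the centroid/evidence helpers above. Toy 2-D vectors, not
# SPECTER2 output; purely illustrative:
#
#   toy = normalize(np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]]), norm="l2")
#   c = _centroid(toy)              # unit-length mean direction of the rows
#   _top_k_indices(toy, c, k=2)     # indices of the 2 rows most similar to c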
def _llm_groq(model_name: str):
    if ChatGroq is None:
        raise RuntimeError(
            "langchain-groq is not installed. Install dependencies from requirements.txt "
            "to enable Groq topic-label verification."
        )
    return ChatGroq(
        model=model_name,
        api_key=GROQ_API_KEY,
        temperature=0.2,
        timeout=LLM_TIMEOUT_S,
        max_retries=LLM_MAX_RETRIES,
    )


def _groq_ollama_enabled() -> bool:
    return bool(GROQ_API_KEY) and ChatGroq is not None and bool(GROQ_OLLAMA_MODEL_NAME)


def _groq_gpt_enabled() -> bool:
    return bool(GROQ_API_KEY) and ChatGroq is not None and bool(GROQ_GPT_MODEL_NAME)


def _groq_judge_enabled() -> bool:
    return bool(GROQ_API_KEY) and ChatGroq is not None and bool(GROQ_JUDGE_MODEL_NAME)


def _to_float(value: object, fallback: float = 0.0) -> float:
    try:
        return float(value)
    except (TypeError, ValueError):
        return float(fallback)


def _clamp_int(value: object, low: int, high: int, fallback: int) -> int:
    try:
        casted = int(value)
    except (TypeError, ValueError):
        casted = int(fallback)
    return max(low, min(high, casted))


def _cluster_metrics(labels: np.ndarray) -> dict:
    labels_arr = np.array(labels, dtype=np.int32)
    n_sentences = int(labels_arr.shape[0])
    noise_count = int((labels_arr == -1).sum())
    unique_ids = sorted(filter(lambda v: v != -1, set(labels_arr.tolist())))
    sizes = list(map(lambda cid: int((labels_arr == cid).sum()), unique_ids))
    if sizes:
        min_size = float(np.min(sizes))
        median_size = float(np.median(sizes))
        mean_size = float(np.mean(sizes))
        max_size = float(np.max(sizes))
    else:
        min_size = 0.0
        median_size = 0.0
        mean_size = 0.0
        max_size = 0.0
    return {
        "n_sentences": n_sentences,
        "n_clusters": int(len(unique_ids)),
        "noise_ratio": float(noise_count) / float(max(1, n_sentences)),
        "min_size": min_size,
        "median_size": median_size,
        "mean_size": mean_size,
        "max_size": max_size,
    }


def _heuristic_hdbscan_tweak(metrics: dict, params: dict) -> dict:
    n_clusters = int(metrics.get("n_clusters", 0))
    noise_ratio = float(metrics.get("noise_ratio", 0.0))
    min_cluster_size = int(params.get("min_cluster_size", HDBSCAN_MIN_CLUSTER_SIZE))
    max_cluster_size = int(params.get("max_cluster_size", HDBSCAN_MAX_CLUSTER_SIZE))
    min_samples = int(params.get("min_samples", HDBSCAN_MIN_SAMPLES))
    action = "accept"
    reasoning = "Cluster metrics are within target ranges."
    if n_clusters < OPTIMIZE_TARGET_CLUSTER_MIN:
        min_cluster_size = max(
            OPTIMIZE_MIN_CLUSTER_SIZE_MIN,
            int(round(min_cluster_size * 0.8)),
        )
        min_samples = max(OPTIMIZE_MIN_SAMPLES_MIN, min_samples - 1)
        action = "tweak"
        reasoning = "Too few clusters; reducing min_cluster_size and min_samples."
    elif n_clusters > OPTIMIZE_TARGET_CLUSTER_MAX:
        min_cluster_size = min(
            OPTIMIZE_MIN_CLUSTER_SIZE_MAX,
            int(round(min_cluster_size * 1.2)),
        )
        min_samples = min(OPTIMIZE_MIN_SAMPLES_MAX, min_samples + 1)
        action = "tweak"
        reasoning = "Too many clusters; increasing min_cluster_size and min_samples."
    elif noise_ratio > OPTIMIZE_TARGET_NOISE_MAX:
        min_cluster_size = max(
            OPTIMIZE_MIN_CLUSTER_SIZE_MIN,
            int(round(min_cluster_size * 0.85)),
        )
        min_samples = max(OPTIMIZE_MIN_SAMPLES_MIN, min_samples - 1)
        action = "tweak"
        reasoning = "Noise ratio is high; lowering min_cluster_size and min_samples."
    return {
        "action": action,
        "min_cluster_size": min_cluster_size,
        "max_cluster_size": max_cluster_size,
        "min_samples": min_samples,
        "reasoning": reasoning,
    }
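
# Worked example for the heuristic above (numbers invented): with targets
# [20, 120] and current params {min_cluster_size: 20, max_cluster_size: 120,
# min_samples: 5}, metrics {n_clusters: 8, noise_ratio: 0.3} hit the
# "too few clusters" branch and yield min_cluster_size=16 (round(20*0.8)),
# min_samples=4, action="tweak".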
return { "action": action, "min_cluster_size": min_cluster_size, "max_cluster_size": max_cluster_size, "min_samples": min_samples, "reasoning": reasoning, } def _normalize_hdbscan_suggestion(suggestion: dict, current: dict) -> dict: action = str(suggestion.get("action", "accept")).strip().lower() action = action if action in {"accept", "tweak"} else "accept" min_cluster_size = _clamp_int( suggestion.get("min_cluster_size", current.get("min_cluster_size")), OPTIMIZE_MIN_CLUSTER_SIZE_MIN, OPTIMIZE_MIN_CLUSTER_SIZE_MAX, current.get("min_cluster_size", HDBSCAN_MIN_CLUSTER_SIZE), ) max_cluster_size = _clamp_int( suggestion.get("max_cluster_size", current.get("max_cluster_size")), OPTIMIZE_MAX_CLUSTER_SIZE_MIN, OPTIMIZE_MAX_CLUSTER_SIZE_MAX, current.get("max_cluster_size", HDBSCAN_MAX_CLUSTER_SIZE), ) min_samples = _clamp_int( suggestion.get("min_samples", current.get("min_samples")), OPTIMIZE_MIN_SAMPLES_MIN, OPTIMIZE_MIN_SAMPLES_MAX, current.get("min_samples", HDBSCAN_MIN_SAMPLES), ) if max_cluster_size < min_cluster_size: max_cluster_size = min_cluster_size + 1 return { "action": action, "min_cluster_size": min_cluster_size, "max_cluster_size": max_cluster_size, "min_samples": min_samples, "reasoning": str(suggestion.get("reasoning", "")).strip(), } def _metrics_in_target(metrics: dict) -> bool: n_clusters = int(metrics.get("n_clusters", 0)) noise_ratio = float(metrics.get("noise_ratio", 1.0)) return ( OPTIMIZE_TARGET_CLUSTER_MIN <= n_clusters <= OPTIMIZE_TARGET_CLUSTER_MAX and noise_ratio <= OPTIMIZE_TARGET_NOISE_MAX ) def _optimization_score(metrics: dict) -> float: n_clusters = int(metrics.get("n_clusters", 0)) noise_ratio = float(metrics.get("noise_ratio", 1.0)) if n_clusters < OPTIMIZE_TARGET_CLUSTER_MIN: cluster_penalty = (OPTIMIZE_TARGET_CLUSTER_MIN - n_clusters) / max( OPTIMIZE_TARGET_CLUSTER_MIN, 1, ) elif n_clusters > OPTIMIZE_TARGET_CLUSTER_MAX: cluster_penalty = (n_clusters - OPTIMIZE_TARGET_CLUSTER_MAX) / max( OPTIMIZE_TARGET_CLUSTER_MAX, 1, ) else: cluster_penalty = 0.0 noise_penalty = max(0.0, noise_ratio - OPTIMIZE_TARGET_NOISE_MAX) / max( OPTIMIZE_TARGET_NOISE_MAX, 1e-6, ) return 1.0 - min(1.0, cluster_penalty + noise_penalty) def _load_sentence_meta(run_key: str, sentences: list[str]) -> list[dict]: meta_path = OUTPUT_DIR / run_key / "sentence_meta.json" if not meta_path.exists(): return [ { "sentence": s, "paper_title": "", "paper_id": None, } for s in sentences ] meta = _load_json(meta_path) if not isinstance(meta, list): return [ { "sentence": s, "paper_title": "", "paper_id": None, } for s in sentences ] if len(meta) != len(sentences): return [ { "sentence": s, "paper_title": "", "paper_id": None, } for s in sentences ] return meta def _top_papers_for_mask(meta: list[dict], mask: np.ndarray, k: int = 3) -> dict: counts: dict[tuple[object, str], int] = {} for idx, entry in enumerate(meta): if not mask[idx]: continue paper_id = entry.get("paper_id") title = str(entry.get("paper_title") or entry.get("title") or "").strip() if not title: title = f"Paper {paper_id}" if paper_id is not None else "Unknown" key = (paper_id, title) counts[key] = counts.get(key, 0) + 1 ordered = sorted( counts.items(), key=lambda kv: (-kv[1], str(kv[0][1]).lower()), ) top = [ {"paper_id": pid, "paper_title": title, "count": count} for (pid, title), count in ordered[:k] ] return { "paper_count": int(len(counts)), "top_papers": top, } def _is_transient_provider_error(exc: Exception) -> bool: """Detect transient provider outages (Mistral/Groq) that should be retried.""" msg = str(exc).lower() return 
( "unreachable_backend" in msg or "internal server error" in msg or '"code":"1100"' in msg or '"raw_status_code":503' in msg or '"raw_status_code":502' in msg or '"raw_status_code":504' in msg or '"status":503' in msg or '"status":502' in msg or '"status":504' in msg or '"status":429' in msg or "too many requests" in msg or "rate limit" in msg or "gateway timeout" in msg or "service unavailable" in msg ) def _is_rate_limit_error(exc: Exception) -> bool: msg = str(exc).lower() return ( "rate limit" in msg or "too many requests" in msg or '"raw_status_code":429' in msg or '"status":429' in msg or "status code: 429" in msg ) def _invoke_with_retries(fn): """Run an LLM call with bounded linear backoff on transient provider errors.""" last_exc: Exception | None = None for attempt in range(PROVIDER_RETRY_ATTEMPTS): try: return fn() except Exception as exc: if not _is_transient_provider_error(exc): raise last_exc = exc if attempt < PROVIDER_RETRY_ATTEMPTS - 1: delay = PROVIDER_RETRY_BASE_DELAY_S * (attempt + 1) if _is_rate_limit_error(exc): delay = max(delay, PROVIDER_RETRY_RATE_LIMIT_DELAY_S * (attempt + 1)) time.sleep(min(PROVIDER_RETRY_MAX_DELAY_S, delay)) continue raise last_exc raise RuntimeError("Unexpected retry flow in _invoke_with_retries") def _save_json(path: Path, data: object) -> None: path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8") def _load_json(path: Path) -> object: return json.loads(path.read_text(encoding="utf-8")) # --------------------------------------------------------------------------- # Plotly chart builders # --------------------------------------------------------------------------- def _chart_intertopic(summaries: list[dict]) -> go.Figure: df = pd.DataFrame(summaries) return px.scatter( df, x="cx", y="cy", size="size", text="cluster_id", color="size", color_continuous_scale="Blues", title="Intertopic Distance Map", labels={"cx": "Dim-1", "cy": "Dim-2", "size": "Sentences"}, template="plotly_dark", ) def _chart_top_words(summaries: list[dict]) -> go.Figure: df = ( pd.DataFrame(summaries) .nlargest(20, "size") .assign(label=lambda d: d["cluster_id"].astype(str)) ) return px.bar( df, x="size", y="label", orientation="h", title="Top Clusters by Sentence Count", labels={"size": "Sentences", "label": "Cluster"}, color="size", color_continuous_scale="Teal", template="plotly_dark", ) def _chart_hierarchy(labels: list[int], embeddings: np.ndarray) -> go.Figure: unique = sorted(filter(lambda v: v != -1, set(labels))) if not unique: fig = go.Figure() fig.update_layout(title="Cluster Hierarchy", template="plotly_dark") return fig labels_arr = np.array(labels) centroids = np.vstack([ _centroid(embeddings[labels_arr == lbl]) for lbl in unique ]) dist_mat = 1 - cosine_similarity(centroids) fig = ff.create_dendrogram( dist_mat, labels=[str(l) for l in unique], colorscale=px.colors.sequential.Blues, ) fig.update_layout(title="Cluster Hierarchy", template="plotly_dark") return fig def _chart_heatmap(labels: list[int], embeddings: np.ndarray) -> go.Figure: unique = sorted(filter(lambda v: v != -1, set(labels))) if not unique: fig = go.Figure() fig.update_layout(title="Cluster Similarity Heatmap", template="plotly_dark") return fig labels_arr = np.array(labels) centroids = np.vstack([ _centroid(embeddings[labels_arr == lbl]) for lbl in unique ]) sim_mat = cosine_similarity(centroids) return px.imshow( sim_mat, x=[str(l) for l in unique], y=[str(l) for l in unique], color_continuous_scale="Blues", title="Cluster Similarity Heatmap", 
template="plotly_dark", ) def _save_chart(fig: go.Figure, path: Path) -> str: fig.write_html(str(path), full_html=True, include_plotlyjs="cdn") return str(path) _OPTIMIZE_PROMPT = PromptTemplate.from_template( """You are optimizing HDBSCAN clustering parameters for BERTopic. Current parameters: min_cluster_size: {min_cluster_size} max_cluster_size: {max_cluster_size} min_samples: {min_samples} Clustering metrics: n_sentences: {n_sentences} n_clusters: {n_clusters} noise_ratio: {noise_ratio} min_size: {min_size} median_size: {median_size} mean_size: {mean_size} max_size: {max_size} Constraints: - Only adjust min_cluster_size, max_cluster_size, min_samples. - Keep min_cluster_size within [{min_cluster_size_min}, {min_cluster_size_max}]. - Keep max_cluster_size within [{max_cluster_size_min}, {max_cluster_size_max}]. - Keep min_samples within [{min_samples_min}, {min_samples_max}]. - Prefer n_clusters in [{target_cluster_min}, {target_cluster_max}]. - Prefer noise_ratio <= {target_noise_max}. Return RAW JSON with exactly these keys: action: "accept" or "tweak" min_cluster_size: int max_cluster_size: int min_samples: int reasoning: short sentence If clustering already looks good, set action="accept" and repeat the current values. Respond with RAW JSON only. """ ) def _recommend_hdbscan_params(metrics: dict, params: dict) -> dict: if not MISTRAL_API_KEY: return _normalize_hdbscan_suggestion( _heuristic_hdbscan_tweak(metrics, params), params, ) chain = _OPTIMIZE_PROMPT | _llm() | JsonOutputParser() payload = { **metrics, **params, "min_cluster_size_min": OPTIMIZE_MIN_CLUSTER_SIZE_MIN, "min_cluster_size_max": OPTIMIZE_MIN_CLUSTER_SIZE_MAX, "max_cluster_size_min": OPTIMIZE_MAX_CLUSTER_SIZE_MIN, "max_cluster_size_max": OPTIMIZE_MAX_CLUSTER_SIZE_MAX, "min_samples_min": OPTIMIZE_MIN_SAMPLES_MIN, "min_samples_max": OPTIMIZE_MIN_SAMPLES_MAX, "target_cluster_min": OPTIMIZE_TARGET_CLUSTER_MIN, "target_cluster_max": OPTIMIZE_TARGET_CLUSTER_MAX, "target_noise_max": OPTIMIZE_TARGET_NOISE_MAX, } try: suggestion = _invoke_with_retries(lambda: chain.invoke(payload)) except Exception: suggestion = {} if not isinstance(suggestion, dict) or not suggestion: suggestion = _heuristic_hdbscan_tweak(metrics, params) return _normalize_hdbscan_suggestion(suggestion, params) # ============================================================================ # TOOL 1 — load_scopus_csv # ============================================================================ @tool def load_scopus_csv(filepath: str) -> dict: """ Load a Scopus-exported CSV and extract corpus statistics. Parameters ---------- filepath : str Absolute or relative path to the CSV file. 
# ============================================================================
# TOOL 1 — load_scopus_csv
# ============================================================================
@tool
def load_scopus_csv(filepath: str) -> dict:
    """
    Load a Scopus-exported CSV and extract corpus statistics.

    Parameters
    ----------
    filepath : str
        Absolute or relative path to the CSV file.

    Returns
    -------
    dict with keys:
        paper_count, abstract_sentence_count, title_sentence_count,
        keywords_term_count, detected_columns, columns, sample_abstracts,
        filepath
    """
    df = pd.read_csv(filepath).rename(columns=str.strip)
    abstract_texts, abstract_col = _texts_for_candidates(df, RUN_CONFIGS["abstract"])
    title_texts, title_col = _texts_for_candidates(df, RUN_CONFIGS["title"])
    keywords_texts, keywords_col = _texts_for_candidates(df, RUN_CONFIGS["keywords"])
    titles_for_meta = (
        df[title_col].fillna("").astype(str).tolist()
        if title_col
        else [""] * len(df)
    )

    def _build_sentences_and_meta(text_col: str | None, splitter) -> tuple[list[str], list[dict]]:
        if not text_col:
            return [], []
        texts = df[text_col].fillna("").astype(str).tolist()
        sentences: list[str] = []
        meta: list[dict] = []
        for idx, (text, title) in enumerate(zip(texts, titles_for_meta), start=1):
            parts = splitter(text)
            if not parts:
                continue
            sentences.extend(parts)
            meta.extend(
                {
                    "sentence": part,
                    "paper_title": title or f"Paper {idx}",
                    "paper_id": idx,
                }
                for part in parts
            )
        return sentences, meta

    abstract_sentences, abstract_meta = _build_sentences_and_meta(
        abstract_col, _split_sentences
    )
    title_sentences, title_meta = _build_sentences_and_meta(
        title_col, _split_sentences
    )
    keywords_terms, keywords_meta = _build_sentences_and_meta(
        keywords_col, _split_keywords
    )

    _ensure_dir(OUTPUT_DIR / "abstract")
    _ensure_dir(OUTPUT_DIR / "title")
    _ensure_dir(OUTPUT_DIR / "keywords")
    _save_json(OUTPUT_DIR / "abstract" / "sentences.json", abstract_sentences)
    _save_json(OUTPUT_DIR / "abstract" / "sentence_meta.json", abstract_meta)
    _save_json(OUTPUT_DIR / "title" / "sentences.json", title_sentences)
    _save_json(OUTPUT_DIR / "title" / "sentence_meta.json", title_meta)
    _save_json(OUTPUT_DIR / "keywords" / "sentences.json", keywords_terms)
    _save_json(OUTPUT_DIR / "keywords" / "sentence_meta.json", keywords_meta)
    df.to_csv(OUTPUT_DIR / "corpus.csv", index=False)

    return {
        "paper_count": int(len(df)),
        "abstract_sentence_count": int(len(abstract_sentences)),
        "title_sentence_count": int(len(title_sentences)),
        "keywords_term_count": int(len(keywords_terms)),
        "detected_columns": {
            "abstract": abstract_col,
            "title": title_col,
            "keywords": keywords_col,
        },
        "columns": df.columns.tolist(),
        "sample_abstracts": abstract_texts[:3],
        "filepath": str(filepath),
    }
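
# Usage sketch (the path is hypothetical; LangChain tools take a dict via
# .invoke):
#
#   stats = load_scopus_csv.invoke({"filepath": "data/scopus_export.csv"})
#   stats["paper_count"], stats["detected_columns"]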
# ============================================================================
# TOOL 2 — run_bertopic_discovery
# ============================================================================
@tool
def run_bertopic_discovery(
    run_key: str,
    threshold: float = DISTANCE_THRESH,
    min_cluster_size: int = HDBSCAN_MIN_CLUSTER_SIZE,
    max_cluster_size: int = HDBSCAN_MAX_CLUSTER_SIZE,
    min_samples: int = HDBSCAN_MIN_SAMPLES,
    auto_optimize: bool = AUTO_OPTIMIZE_CLUSTERS,
    max_optimize_iters: int = OPTIMIZE_MAX_ITERS,
) -> dict:
    """
    Embed sentences, cluster with UMAP + HDBSCAN, extract evidence, and
    generate four Plotly charts.

    Saved artefacts
    ---------------
    emb.npy           : (N, D) float32 L2-normalised embeddings
    sent_labels.npy   : (N,) int32 per-sentence cluster label [BUG 1 FIX]
    summaries.json    : list of cluster dicts with evidence sentences
    optimization.json : list of optimization rounds and metrics

    Parameters
    ----------
    run_key            : str   — "abstract" or "title" or "keywords"
    threshold          : float — legacy arg (ignored by HDBSCAN)
    min_cluster_size   : int   — HDBSCAN minimum cluster size
    max_cluster_size   : int   — HDBSCAN maximum cluster size
    min_samples        : int   — HDBSCAN min_samples
    auto_optimize      : bool  — run LLM-guided optimization loop
    max_optimize_iters : int   — max optimization rounds after initial run

    Returns
    -------
    dict with keys:
        run_key, n_clusters, n_sentences, threshold, chart_paths,
        summaries_path, embeddings_path, optimization_path
    """
    if run_key not in RUN_CONFIGS:
        return {
            "run_key": run_key,
            "n_clusters": 0,
            "n_sentences": 0,
            "threshold": threshold,
            "chart_paths": {},
            "error": (
                f"Unsupported run_key: {run_key}. "
                f"Use one of: {', '.join(RUN_CONFIGS.keys())}."
            ),
        }
    rdir = _run_dir(run_key)
    sent_path = OUTPUT_DIR / run_key / "sentences.json"
    if not sent_path.exists():
        return {
            "run_key": run_key,
            "n_clusters": 0,
            "n_sentences": 0,
            "threshold": threshold,
            "chart_paths": {},
            "error": (
                f"Missing sentences artifact: {sent_path}. "
                "Run load_scopus_csv first."
            ),
        }
    sentences = _load_json(sent_path)
    if not sentences:
        return {
            "run_key": run_key,
            "n_clusters": 0,
            "n_sentences": 0,
            "threshold": threshold,
            "chart_paths": {},
            "error": (
                f"No sentences/terms found for run_key={run_key}. "
                "Check that the corresponding source column exists in the CSV."
            ),
        }
    sentence_meta = _load_sentence_meta(run_key, sentences)

    emb_path = rdir / "emb.npy"
    embeddings = None
    if emb_path.exists():
        cached = np.load(str(emb_path))
        if cached.shape[0] == len(sentences):
            embeddings = cached
    if embeddings is None:
        embeddings = _embed(sentences)
        np.save(str(emb_path), embeddings)

    cluster_space = _umap_reduce(embeddings, UMAP_N_COMPONENTS_CLUSTER)
    umap_2d = _umap_reduce(embeddings, UMAP_N_COMPONENTS_VIZ)

    def _run_hdbscan(params: dict) -> tuple[list[int], dict]:
        labels_local = _cluster(
            cluster_space,
            min_cluster_size=int(params.get("min_cluster_size", HDBSCAN_MIN_CLUSTER_SIZE)),
            max_cluster_size=int(params.get("max_cluster_size", HDBSCAN_MAX_CLUSTER_SIZE)),
            min_samples=int(params.get("min_samples", HDBSCAN_MIN_SAMPLES)),
        ).tolist()
        return labels_local, _cluster_metrics(np.array(labels_local))

    current_params = {
        "min_cluster_size": int(min_cluster_size),
        "max_cluster_size": int(max_cluster_size),
        "min_samples": int(min_samples),
    }
    labels, metrics = _run_hdbscan(current_params)
    optimization_log = [
        {
            "round": 0,
            "params": current_params,
            "metrics": metrics,
        }
    ]
    best_score = _optimization_score(metrics)
    stable_rounds = 0
    seen_params = {(
        current_params["min_cluster_size"],
        current_params["max_cluster_size"],
        current_params["min_samples"],
    )}

    if bool(auto_optimize) and int(max_optimize_iters) > 0:
        for round_idx in range(1, int(max_optimize_iters) + 1):
            suggestion = _recommend_hdbscan_params(metrics, current_params)
            if suggestion.get("action") == "accept":
                optimization_log.append({
                    "round": round_idx,
                    "params": current_params,
                    "metrics": metrics,
                    "action": "accept",
                    "reasoning": suggestion.get("reasoning", ""),
                })
                break
            next_params = {
                "min_cluster_size": int(suggestion.get("min_cluster_size")),
                "max_cluster_size": int(suggestion.get("max_cluster_size")),
                "min_samples": int(suggestion.get("min_samples")),
            }
next_params["max_cluster_size"], next_params["min_samples"], ) if next_key in seen_params: optimization_log.append({ "round": round_idx, "params": current_params, "metrics": metrics, "action": "stop", "reasoning": "Repeated parameter set; stopping optimization.", }) break labels, metrics = _run_hdbscan(next_params) optimization_log.append({ "round": round_idx, "params": next_params, "metrics": metrics, "reasoning": suggestion.get("reasoning", ""), }) current_params = next_params seen_params.add(next_key) score = _optimization_score(metrics) if score <= best_score + OPTIMIZE_MIN_IMPROVEMENT: stable_rounds += 1 else: best_score = score stable_rounds = 0 if _metrics_in_target(metrics): break if stable_rounds >= OPTIMIZE_STABLE_ROUNDS: break optimization_path = rdir / "optimization.json" _save_json(optimization_path, optimization_log) unique_ids = sorted(filter(lambda v: v != -1, set(labels))) # FIX BUG 1 — persist per-sentence label array so Tool 4 can build # correct cluster masks without any guesswork or scaffolding. np.save(str(rdir / "sent_labels.npy"), np.array(labels, dtype=np.int32)) labels_arr = np.array(labels) if not unique_ids: _save_json(rdir / "summaries.json", []) return { "run_key": run_key, "n_clusters": 0, "n_sentences": int(len(sentences)), "threshold": threshold, "min_cluster_size": int(current_params["min_cluster_size"]), "max_cluster_size": int(current_params["max_cluster_size"]), "min_samples": int(current_params["min_samples"]), "chart_paths": {}, "summaries_path": str(rdir / "summaries.json"), "embeddings_path": str(rdir / "emb.npy"), "optimization_path": str(optimization_path), "error": "HDBSCAN produced no clusters (all points labeled as noise).", } def _cluster_summary(cid: int) -> dict: mask = labels_arr == cid c_emb = embeddings[mask] c_umap = umap_2d[mask] c_sent = list(np.array(sentences)[mask]) ctroid = _centroid(c_emb) top_idx = _top_k_indices(c_emb, ctroid, N_EVIDENCE) coords = ( c_umap.mean(axis=0) if c_umap.shape[0] > 0 else np.zeros(UMAP_N_COMPONENTS_VIZ, dtype=np.float32) ) paper_stats = _top_papers_for_mask(sentence_meta, mask, k=3) return { "cluster_id": int(cid), "size": int(mask.sum()), "cx": float(coords[0]), "cy": float(coords[1]), "evidence": list(np.array(c_sent)[top_idx]), "paper_count": paper_stats.get("paper_count", 0), "top_papers": paper_stats.get("top_papers", []), } summaries = list(map(_cluster_summary, unique_ids)) _save_json(rdir / "summaries.json", summaries) chart_paths = { "Intertopic Map": _save_chart(_chart_intertopic(summaries), rdir / "intertopic.html"), "Top Words": _save_chart(_chart_top_words(summaries), rdir / "topwords.html"), "Hierarchy": _save_chart(_chart_hierarchy(labels, embeddings), rdir / "hierarchy.html"), "Heatmap": _save_chart(_chart_heatmap(labels, embeddings), rdir / "heatmap.html"), } return { "run_key": run_key, "n_clusters": int(len(unique_ids)), "n_sentences": int(len(sentences)), "threshold": threshold, "min_cluster_size": int(current_params["min_cluster_size"]), "max_cluster_size": int(current_params["max_cluster_size"]), "min_samples": int(current_params["min_samples"]), "chart_paths": chart_paths, "summaries_path": str(rdir / "summaries.json"), "embeddings_path": str(rdir / "emb.npy"), "optimization_path": str(optimization_path), } # ============================================================================ # TOOL 3 — label_topics_with_llm # ============================================================================ _LABEL_PROMPT = PromptTemplate.from_template( """You are an expert academic researcher 
# ============================================================================
# TOOL 3 — label_topics_with_llm
# ============================================================================
_LABEL_PROMPT = PromptTemplate.from_template(
    """You are an expert academic researcher specialising in Information Systems.
Given the following cluster of research sentences, return a JSON object with
EXACTLY these keys:
  label      : short research-area name (<= 6 words)
  category   : broader IS research category
  confidence : float 0.0-1.0
  reasoning  : one sentence explaining your choice
  niche      : boolean - true if highly specialised / narrow

Cluster ID     : {cluster_id}
Sentence count : {size}
Evidence sentences:
{evidence}

Respond with RAW JSON only. No markdown, no explanation outside the JSON.
"""
)

_LABEL_JUDGE_PROMPT = PromptTemplate.from_template(
    """You are an expert label adjudicator. Choose the single best label from
the candidates below based on the evidence sentences.

Cluster ID     : {cluster_id}
Sentence count : {size}
Evidence sentences:
{evidence}

Candidate labels:
1) Mistral
   Label: {mistral_label}
   Category: {mistral_category}
   Confidence: {mistral_confidence}
   Reasoning: {mistral_reasoning}
2) Groq-Ollama
   Label: {groq_ollama_label}
   Category: {groq_ollama_category}
   Confidence: {groq_ollama_confidence}
   Reasoning: {groq_ollama_reasoning}
3) Groq-GPT
   Label: {groq_gpt_label}
   Category: {groq_gpt_category}
   Confidence: {groq_gpt_confidence}
   Reasoning: {groq_gpt_reasoning}

Rules:
- Choose exactly one of the three labels. Do not invent a new label.
- Pick the label that best matches the evidence and is most specific.
- If two are equally good, prefer the one with higher confidence.

Return RAW JSON with exactly these keys:
  best_label: string
  best_category: string
  chosen_source: string   # one of: mistral, groq_ollama, groq_gpt
  best_reasoning: string

Respond with RAW JSON only.
"""
)


@tool
def label_topics_with_llm(run_key: str) -> dict:
    """
    Label each cluster with Mistral only (default Phase 2 labeling pass).

    Parameters
    ----------
    run_key : str — "abstract" or "title" or "keywords"

    Returns
    -------
    dict with keys:
        run_key, labels_path, labelled_count, labels_preview (list of dicts)
    """
    rdir = _run_dir(run_key)
    summaries_path = rdir / "summaries.json"
    if not summaries_path.exists():
        return {
            "run_key": run_key,
            "labels_path": str(rdir / "labels.json"),
            "labelled_count": 0,
            "total_clusters": 0,
            "selected_clusters": 0,
            "skipped_clusters": 0,
            "labels_preview": [],
            "error": (
                f"Missing discovery artifact: {summaries_path}. "
                "Run run_bertopic_discovery first for this run_key."
            ),
        }
    summaries = _load_json(summaries_path)
    ranked = sorted(
        filter(lambda s: s.get("size", 0) >= MIN_CLUSTER_SIZE_FOR_LABEL, summaries),
        key=lambda s: s.get("size", 0),
        reverse=True,
    )
    selected = ranked[:MAX_LABEL_CLUSTERS]
    chain_mistral = _LABEL_PROMPT | _llm() | JsonOutputParser()
{s}" for i, s in enumerate(summary["evidence"]) ) def _label_one(summary: dict) -> dict: result = _invoke_with_retries(lambda: chain_mistral.invoke({ "cluster_id": summary["cluster_id"], "size": summary["size"], "evidence": _evidence_block(summary), })) return { **summary, **result, "mistral_label": result.get("label", ""), "mistral_category": result.get("category", ""), "mistral_confidence": _to_float(result.get("confidence"), 0.0), "mistral_reasoning": result.get("reasoning", ""), "mistral_niche": bool(result.get("niche", False)), "groq_label": "", "groq_category": "", "groq_confidence": 0.0, "groq_reasoning": "", "groq_niche": False, "groq_ollama_label": "", "groq_ollama_category": "", "groq_ollama_confidence": 0.0, "groq_ollama_reasoning": "", "groq_ollama_niche": False, "groq_gpt_label": "", "groq_gpt_category": "", "groq_gpt_confidence": 0.0, "groq_gpt_reasoning": "", "groq_gpt_niche": False, "verification_done": False, "verification_done_ollama": False, "verification_done_gpt": False, "verification_note": ( "Run VERIFY in Phase 2 to compare with Groq-Ollama and Groq-GPT labels." ), } labelled = list(map(_label_one, selected)) _save_json(rdir / "labels.json", labelled) # Keep tool output compact so the ReAct transcript does not overflow model context. preview = list(map( lambda r: { "cluster_id": r.get("cluster_id"), "label": r.get("label"), "category": r.get("category"), "confidence": r.get("confidence"), "mistral_label": r.get("mistral_label", ""), "groq_label": r.get("groq_label", ""), "groq_ollama_label": r.get("groq_ollama_label", r.get("groq_label", "")), "groq_gpt_label": r.get("groq_gpt_label", ""), "size": r.get("size"), "niche": r.get("niche", False), }, labelled[:MAX_TOOL_RETURN_PREVIEW], )) return { "run_key": run_key, "labels_path": str(rdir / "labels.json"), "labelled_count": len(labelled), "total_clusters": len(summaries), "selected_clusters": len(selected), "skipped_clusters": max(0, len(summaries) - len(selected)), "groq_enabled": _groq_ollama_enabled() and _groq_gpt_enabled(), "mode_note": "Single-model labeling complete (Mistral). Send VERIFY in Phase 2 to run Groq-Ollama and Groq-GPT verification.", "labels_preview": preview, } @tool def verify_topic_labels_with_groq(run_key: str) -> dict: """ Run Groq topic labeling for already-labeled topics and append comparison fields into labels.json so UI review table can show Mistral vs Groq-Ollama vs Groq-GPT labels, plus an adjudicated best label when GROQ_JUDGE_MODEL_NAME is configured. Parameters ---------- run_key : str — "abstract" or "title" or "keywords" Returns ------- dict with keys: run_key, labels_path, verification_path, verified_count, labels_preview """ rdir = _run_dir(run_key) labels_path = rdir / "labels.json" summaries_path = rdir / "summaries.json" if not _groq_ollama_enabled() or not _groq_gpt_enabled(): return { "run_key": run_key, "labels_path": str(labels_path), "verified_count": 0, "labels_preview": [], "error": ( "GROQ_API_KEY or Groq model config is missing, or langchain-groq is unavailable. " "Set GROQ_API_KEY and GROQ_GPT_MODEL_NAME (and optionally GROQ_OLLAMA_MODEL_NAME) " "and install requirements to use VERIFY." ), } if not labels_path.exists(): return { "run_key": run_key, "labels_path": str(labels_path), "verified_count": 0, "labels_preview": [], "error": ( f"Missing labels artifact: {labels_path}. " "Run label_topics_with_llm first." 
@tool
def verify_topic_labels_with_groq(run_key: str) -> dict:
    """
    Run Groq topic labeling for already-labeled topics and append comparison
    fields into labels.json so the UI review table can show Mistral vs
    Groq-Ollama vs Groq-GPT labels, plus an adjudicated best label when
    GROQ_JUDGE_MODEL_NAME is configured.

    Parameters
    ----------
    run_key : str — "abstract" or "title" or "keywords"

    Returns
    -------
    dict with keys:
        run_key, labels_path, verification_path, verified_count, labels_preview
    """
    rdir = _run_dir(run_key)
    labels_path = rdir / "labels.json"
    summaries_path = rdir / "summaries.json"
    if not _groq_ollama_enabled() or not _groq_gpt_enabled():
        return {
            "run_key": run_key,
            "labels_path": str(labels_path),
            "verified_count": 0,
            "labels_preview": [],
            "error": (
                "GROQ_API_KEY or Groq model config is missing, or langchain-groq is unavailable. "
                "Set GROQ_API_KEY and GROQ_GPT_MODEL_NAME (and optionally GROQ_OLLAMA_MODEL_NAME) "
                "and install requirements to use VERIFY."
            ),
        }
    if not labels_path.exists():
        return {
            "run_key": run_key,
            "labels_path": str(labels_path),
            "verified_count": 0,
            "labels_preview": [],
            "error": (
                f"Missing labels artifact: {labels_path}. "
                "Run label_topics_with_llm first."
            ),
        }
    if not summaries_path.exists():
        return {
            "run_key": run_key,
            "labels_path": str(labels_path),
            "verified_count": 0,
            "labels_preview": [],
            "error": (
                f"Missing summaries artifact: {summaries_path}. "
                "Run run_bertopic_discovery first."
            ),
        }

    labels_data = _load_json(labels_path)
    summaries = _load_json(summaries_path)
    summary_by_id = {int(s.get("cluster_id", -1)): s for s in summaries}
    target_rows = list(filter(
        lambda r: int(r.get("cluster_id", -1)) in summary_by_id,
        labels_data,
    ))

    chain_groq_ollama = _LABEL_PROMPT | _llm_groq(GROQ_OLLAMA_MODEL_NAME) | JsonOutputParser()
    chain_groq_gpt = _LABEL_PROMPT | _llm_groq(GROQ_GPT_MODEL_NAME) | JsonOutputParser()
    chain_judge = (
        _LABEL_JUDGE_PROMPT | _llm_groq(GROQ_JUDGE_MODEL_NAME) | JsonOutputParser()
        if _groq_judge_enabled()
        else None
    )

    def _evidence_block(summary: dict) -> str:
        return "\n".join(
            f"  {i+1}. {s}" for i, s in enumerate(summary.get("evidence", []))
        )

    def _label_with_groq(row: dict) -> tuple[int, dict, dict]:
        cid = int(row.get("cluster_id", -1))
        summary = summary_by_id[cid]
        payload = {
            "cluster_id": summary["cluster_id"],
            "size": summary["size"],
            "evidence": _evidence_block(summary),
        }
        groq_ollama = _invoke_with_retries(lambda: chain_groq_ollama.invoke(payload))
        groq_gpt = _invoke_with_retries(lambda: chain_groq_gpt.invoke(payload))
        return cid, groq_ollama, groq_gpt

    groq_pairs = list(map(_label_with_groq, target_rows))
    groq_ollama_by_id = {cid: data for cid, data, _ in groq_pairs}
    groq_gpt_by_id = {cid: data for cid, _, data in groq_pairs}

    def _judge_label(row: dict) -> tuple[int, dict]:
        if chain_judge is None:
            return int(row.get("cluster_id", -1)), {}
        cid = int(row.get("cluster_id", -1))
        summary = summary_by_id[cid]
        groq_ollama = groq_ollama_by_id.get(cid, {})
        groq_gpt = groq_gpt_by_id.get(cid, {})
        payload = {
            "cluster_id": summary.get("cluster_id"),
            "size": summary.get("size"),
            "evidence": _evidence_block(summary),
            "mistral_label": str(row.get("mistral_label") or row.get("label", "")).strip(),
            "mistral_category": str(row.get("mistral_category") or row.get("category", "")).strip(),
            "mistral_confidence": _to_float(row.get("mistral_confidence", row.get("confidence", 0.0)), 0.0),
            "mistral_reasoning": str(row.get("mistral_reasoning") or row.get("reasoning", "")).strip(),
            "groq_ollama_label": str(groq_ollama.get("label", "")).strip(),
            "groq_ollama_category": str(groq_ollama.get("category", "")).strip(),
            "groq_ollama_confidence": _to_float(groq_ollama.get("confidence"), 0.0),
            "groq_ollama_reasoning": str(groq_ollama.get("reasoning", "")).strip(),
            "groq_gpt_label": str(groq_gpt.get("label", "")).strip(),
            "groq_gpt_category": str(groq_gpt.get("category", "")).strip(),
            "groq_gpt_confidence": _to_float(groq_gpt.get("confidence"), 0.0),
            "groq_gpt_reasoning": str(groq_gpt.get("reasoning", "")).strip(),
        }
        try:
            result = _invoke_with_retries(lambda: chain_judge.invoke(payload))
        except Exception:
            result = {}
        return cid, result

    judge_pairs = list(map(_judge_label, target_rows)) if chain_judge else []
    judge_by_id = {cid: data for cid, data in judge_pairs}

    def _merge_row(row: dict) -> dict:
        cid = int(row.get("cluster_id", -1))
        groq_ollama = groq_ollama_by_id.get(cid, {})
        groq_gpt = groq_gpt_by_id.get(cid, {})
        adjudicated = judge_by_id.get(cid, {})
        has_groq_ollama = bool(groq_ollama)
        has_groq_gpt = bool(groq_gpt)
        mistral_label = str(row.get("mistral_label") or row.get("label", "")).strip()
        groq_ollama_label = str(groq_ollama.get("label", "")).strip()
        groq_gpt_label = str(groq_gpt.get("label", "")).strip()
str(adjudicated.get("best_label", "")).strip() is_agreement = ( all([mistral_label, groq_ollama_label, groq_gpt_label]) and mistral_label.lower() == groq_ollama_label.lower() and mistral_label.lower() == groq_gpt_label.lower() ) return { **row, "mistral_label": mistral_label, "mistral_category": row.get("mistral_category") or row.get("category", ""), "mistral_confidence": _to_float( row.get("mistral_confidence", row.get("confidence", 0.0)), 0.0, ), "mistral_reasoning": row.get("mistral_reasoning") or row.get("reasoning", ""), "mistral_niche": bool(row.get("mistral_niche", row.get("niche", False))), "groq_label": groq_ollama_label, "groq_category": groq_ollama.get("category", ""), "groq_confidence": _to_float(groq_ollama.get("confidence"), 0.0), "groq_reasoning": groq_ollama.get("reasoning", ""), "groq_niche": bool(groq_ollama.get("niche", False)), "groq_ollama_label": groq_ollama_label, "groq_ollama_category": groq_ollama.get("category", ""), "groq_ollama_confidence": _to_float(groq_ollama.get("confidence"), 0.0), "groq_ollama_reasoning": groq_ollama.get("reasoning", ""), "groq_ollama_niche": bool(groq_ollama.get("niche", False)), "groq_gpt_label": groq_gpt_label, "groq_gpt_category": groq_gpt.get("category", ""), "groq_gpt_confidence": _to_float(groq_gpt.get("confidence"), 0.0), "groq_gpt_reasoning": groq_gpt.get("reasoning", ""), "groq_gpt_niche": bool(groq_gpt.get("niche", False)), "adjudicated_label": adjudicated_label, "adjudicated_category": str(adjudicated.get("best_category", "")).strip(), "adjudicated_reasoning": str(adjudicated.get("best_reasoning", "")).strip(), "adjudicated_source": str(adjudicated.get("chosen_source", "")).strip(), "adjudication_done": bool(adjudicated_label), "adjudication_note": ( "Adjudicated label available." if adjudicated_label else "Adjudication unavailable for this topic." ), "verification_done": has_groq_ollama and has_groq_gpt, "verification_done_ollama": has_groq_ollama, "verification_done_gpt": has_groq_gpt, "verification_note": ( "Mistral, Groq-Ollama, and Groq-GPT labels match." if is_agreement else "Model labels differ. Review before approval." ) if has_groq_ollama and has_groq_gpt else "Groq labeling unavailable for this topic.", } verified_rows = list(map(_merge_row, labels_data)) verification_path = rdir / "labels_verification.json" _save_json(labels_path, verified_rows) _save_json(verification_path, verified_rows) preview = list(map( lambda r: { "cluster_id": r.get("cluster_id"), "mistral_label": r.get("mistral_label", ""), "groq_ollama_label": r.get("groq_ollama_label", r.get("groq_label", "")), "groq_gpt_label": r.get("groq_gpt_label", ""), "adjudicated_label": r.get("adjudicated_label", ""), "verification_note": r.get("verification_note", ""), }, verified_rows[:MAX_TOOL_RETURN_PREVIEW], )) verified_count = sum( 1 for row in verified_rows if row.get("groq_ollama_label") and row.get("groq_gpt_label") ) return { "run_key": run_key, "labels_path": str(labels_path), "verification_path": str(verification_path), "verified_count": int(verified_count), "labelled_count": int(len(verified_rows)), "labels_preview": preview, } # ============================================================================ # TOOL 4 — consolidate_into_themes # ============================================================================ @tool def consolidate_into_themes(run_key: str, theme_map: dict) -> dict: """ Merge approved / renamed topics into consolidated themes and recompute centroids from the actual merged-cluster embeddings. 
# ============================================================================
# TOOL 4 — consolidate_into_themes
# ============================================================================
@tool
def consolidate_into_themes(run_key: str, theme_map: dict) -> dict:
    """
    Merge approved / renamed topics into consolidated themes and recompute
    centroids from the actual merged-cluster embeddings.

    Parameters
    ----------
    run_key   : str  — "abstract" or "title" or "keywords"
    theme_map : dict — {new_theme_name: [cluster_id, ...], ...}
                Only approved topics need appear here.

    Returns
    -------
    dict with keys:
        run_key, theme_count, themes_path, themes_preview (list of dicts)
    """
    rdir = _run_dir(run_key)
    labels_data = _load_json(rdir / "labels.json")
    embeddings = np.load(str(rdir / "emb.npy"))           # (N, D) embedding matrix
    sent_labels = np.load(str(rdir / "sent_labels.npy"))  # (N,) — FIX BUG 1

    # Index label dicts by cluster_id for O(1) lookup
    label_idx = {item["cluster_id"]: item for item in labels_data}

    def _build_theme(theme_name: str, cids: list[int]) -> dict:
        """
        Build one consolidated theme from a list of cluster IDs.

        Evidence : top-N sentences pooled across all merged clusters
        Centroid : L2-normalised mean of all embeddings in the merged set
        Size     : total sentence count across merged clusters
        """
        member_labels = list(map(label_idx.get, cids))

        # Pool evidence sentences from all member clusters
        all_evidence = reduce(
            lambda acc, lbl: acc + lbl["evidence"],
            filter(None, member_labels),
            [],
        )
        # Total sentence count across merged clusters
        total_size = reduce(
            lambda acc, lbl: acc + lbl.get("size", 0),
            filter(None, member_labels),
            0,
        )
        # FIX BUG 1 — build correct cluster mask using persisted sent_labels
        cluster_mask = np.isin(sent_labels, np.array(cids, dtype=np.int32))
        theme_embeddings = embeddings[cluster_mask]  # (M, D)
        # Guard: if mask is somehow empty fall back to zero vector
        theme_centroid = (
            _centroid(theme_embeddings)
            if theme_embeddings.shape[0] > 0
            else np.zeros(embeddings.shape[1], dtype=np.float32)
        )
        return {
            "theme_name": theme_name,
            "cluster_ids": cids,
            "size": total_size,
            "evidence": all_evidence[:N_EVIDENCE],
            "centroid": theme_centroid.tolist(),
            "sub_labels": list(map(
                itemgetter("label"),
                filter(None, member_labels),
            )),
        }

    themes = list(map(
        lambda kv: _build_theme(kv[0], kv[1]),
        theme_map.items(),
    ))
    _save_json(rdir / "themes.json", themes)

    preview = list(map(
        lambda t: {
            "theme_name": t.get("theme_name"),
            "size": t.get("size", 0),
            "cluster_count": len(t.get("cluster_ids", [])),
        },
        themes[:MAX_TOOL_RETURN_PREVIEW],
    ))
    return {
        "run_key": run_key,
        "theme_count": len(themes),
        "themes_path": str(rdir / "themes.json"),
        "themes_preview": preview,
    }
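
# Usage sketch — theme_map maps a human-approved theme name to the cluster IDs
# it merges (names and IDs invented):
#
#   consolidate_into_themes.invoke({
#       "run_key": "abstract",
#       "theme_map": {"AI Governance": [3, 7], "Digital Health": [1]},
#   })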
# ============================================================================
# TOOL 5 — compare_with_taxonomy
# ============================================================================
_TAXONOMY_PROMPT = PromptTemplate.from_template(
    """You are an IS research taxonomist. Map the following research theme to
the PAJAIS taxonomy.

Return RAW JSON with EXACTLY these keys:
  theme_name   : the input theme name (unchanged)
  pajais_match : best matching PAJAIS category OR the string "NOVEL"
  confidence   : float 0.0-1.0
  reasoning    : one sentence
  is_novel     : boolean

PAJAIS categories:
{taxonomy}

Theme to map:
  Name     : {theme_name}
  Evidence : {evidence}

Respond with RAW JSON only. No markdown.
"""
)


@tool
def compare_with_taxonomy(run_key: str) -> dict:
    """
    Map consolidated themes to PAJAIS taxonomy via Mistral.

    Parameters
    ----------
    run_key : str — "abstract" or "title" or "keywords"

    Returns
    -------
    dict with keys:
        run_key, taxonomy_path, mapped_count, novel_count, mapping_preview
    """
    rdir = _run_dir(run_key)
    themes = _load_json(rdir / "themes.json")
    chain = _TAXONOMY_PROMPT | _llm() | JsonOutputParser()
    taxonomy_str = "\n".join(f"  - {cat}" for cat in PAJAIS_TAXONOMY)

    def _map_theme(theme: dict) -> dict:
        result = _invoke_with_retries(lambda: chain.invoke({
            "taxonomy": taxonomy_str,
            "theme_name": theme["theme_name"],
            "evidence": " | ".join(theme.get("evidence", [])[:3]),
        }))
        return {**theme, **result}

    taxonomy_map = list(map(_map_theme, themes))
    _save_json(rdir / "taxonomy_map.json", taxonomy_map)

    novel_count = sum(1 for t in taxonomy_map if t.get("is_novel", False))
    mapped_count = len(taxonomy_map) - novel_count
    preview = list(map(
        lambda t: {
            "theme_name": t.get("theme_name"),
            "pajais_match": t.get("pajais_match", "NOVEL"),
            "confidence": t.get("confidence", 0),
            "is_novel": t.get("is_novel", False),
        },
        taxonomy_map[:MAX_TOOL_RETURN_PREVIEW],
    ))
    return {
        "run_key": run_key,
        "taxonomy_path": str(rdir / "taxonomy_map.json"),
        "mapped_count": mapped_count,
        "novel_count": novel_count,
        "mapping_preview": preview,
    }


@tool
def verify_taxonomy_mapping_with_groq(run_key: str) -> dict:
    """
    Run Groq validation for PAJAIS taxonomy mappings and persist side-by-side
    Mistral/Groq mapping fields for each theme.

    Parameters
    ----------
    run_key : str — "abstract" or "title" or "keywords"

    Returns
    -------
    dict with keys:
        run_key, taxonomy_path, verification_path, verified_count, mapping_preview
    """
    if not _groq_ollama_enabled():
        return {
            "run_key": run_key,
            "taxonomy_path": str(_run_dir(run_key) / "taxonomy_map.json"),
            "verified_count": 0,
            "mapping_preview": [],
            "error": (
                "GROQ_API_KEY is missing or langchain-groq is unavailable. "
                "Set GROQ_API_KEY and install requirements to use VERIFY."
            ),
        }
    rdir = _run_dir(run_key)
    themes_path = rdir / "themes.json"
    taxonomy_path = rdir / "taxonomy_map.json"
    if not themes_path.exists():
        return {
            "run_key": run_key,
            "taxonomy_path": str(taxonomy_path),
            "verified_count": 0,
            "mapping_preview": [],
            "error": (
                f"Missing themes artifact: {themes_path}. "
                "Run consolidate_into_themes first."
            ),
        }
    if not taxonomy_path.exists():
        return {
            "run_key": run_key,
            "taxonomy_path": str(taxonomy_path),
            "verified_count": 0,
            "mapping_preview": [],
            "error": (
                f"Missing taxonomy artifact: {taxonomy_path}. "
                "Run compare_with_taxonomy first."
            ),
        }
    themes = _load_json(themes_path)
    taxonomy_map = _load_json(taxonomy_path)
    taxonomy_str = "\n".join(f"  - {cat}" for cat in PAJAIS_TAXONOMY)
    chain_groq = _TAXONOMY_PROMPT | _llm_groq(GROQ_OLLAMA_MODEL_NAME) | JsonOutputParser()

    def _map_theme_with_groq(theme: dict) -> dict:
        return _invoke_with_retries(lambda: chain_groq.invoke({
            "taxonomy": taxonomy_str,
            "theme_name": theme["theme_name"],
            "evidence": " | ".join(theme.get("evidence", [])[:3]),
        }))

    groq_maps = list(map(_map_theme_with_groq, themes))
    groq_by_theme = {
        str(item.get("theme_name", "")).strip(): item for item in groq_maps
    }

    def _merge_mappings(mistral_row: dict) -> dict:
        theme_name = str(mistral_row.get("theme_name", "")).strip()
        groq_row = groq_by_theme.get(theme_name, {})
        groq_match = str(groq_row.get("pajais_match", "")).strip()
        mistral_match = str(mistral_row.get("pajais_match", "")).strip()
        is_same = bool(groq_match) and (groq_match.lower() == mistral_match.lower())
        return {
            **mistral_row,
            "mistral_pajais_match": mistral_match,
            "mistral_confidence": _to_float(
                mistral_row.get("mistral_confidence", mistral_row.get("confidence", 0.0)),
                0.0,
            ),
            "mistral_reasoning": str(
                mistral_row.get("mistral_reasoning", mistral_row.get("reasoning", ""))
            ),
            "mistral_is_novel": bool(
                mistral_row.get("mistral_is_novel", mistral_row.get("is_novel", False))
            ),
            "groq_pajais_match": groq_match,
            "groq_confidence": _to_float(groq_row.get("confidence"), 0.0),
            "groq_reasoning": str(groq_row.get("reasoning", "")),
            "groq_is_novel": bool(groq_row.get("is_novel", False)),
            "taxonomy_verification_done": bool(groq_row),
            "taxonomy_verification_note": (
                "Mistral and Groq taxonomy mapping match."
                if is_same
                else "Mistral and Groq taxonomy mapping differ."
            ) if groq_row
            else "Groq taxonomy mapping unavailable for this theme.",
        }

    merged_rows = list(map(_merge_mappings, taxonomy_map))
    verification_path = rdir / "taxonomy_verification.json"
    _save_json(taxonomy_path, merged_rows)
    _save_json(verification_path, merged_rows)

    preview = list(map(
        lambda row: {
            "theme_name": row.get("theme_name", ""),
            "mistral_pajais_match": row.get("mistral_pajais_match", row.get("pajais_match", "")),
            "groq_pajais_match": row.get("groq_pajais_match", ""),
            "taxonomy_verification_note": row.get("taxonomy_verification_note", ""),
        },
        merged_rows[:MAX_TOOL_RETURN_PREVIEW],
    ))
    verified_count = sum(1 for row in merged_rows if row.get("groq_pajais_match"))
    return {
        "run_key": run_key,
        "taxonomy_path": str(taxonomy_path),
        "verification_path": str(verification_path),
        "verified_count": int(verified_count),
        "mapped_count": int(len(merged_rows)),
        "mapping_preview": preview,
    }
# ============================================================================
# TOOL 6 — generate_comparison_csv
# ============================================================================
@tool
def generate_comparison_csv() -> dict:
    """
    Side-by-side comparison of abstract/title/keywords theme mappings.
    Each run is optional. Missing runs produce empty columns.

    Returns
    -------
    dict with keys:
        csv_path, row_count, columns, preview (list of dicts)
    """
    abstract_path = OUTPUT_DIR / "abstract" / "taxonomy_map.json"
    title_path = OUTPUT_DIR / "title" / "taxonomy_map.json"
    keywords_path = OUTPUT_DIR / "keywords" / "taxonomy_map.json"
    abstract_map = _load_json(abstract_path) if abstract_path.exists() else []
    title_map = _load_json(title_path) if title_path.exists() else []
    keywords_map = _load_json(keywords_path) if keywords_path.exists() else []
    if not (abstract_map or title_map or keywords_map):
        return {
            "csv_path": str(OUTPUT_DIR / "comparison.csv"),
            "row_count": 0,
            "columns": [],
            "preview": [],
            "error": (
                "No taxonomy_map.json files found for abstract/title/keywords. "
                "Run compare_with_taxonomy for at least one run first."
            ),
        }

    def _row(a_theme: dict | None, t_theme: dict | None, k_theme: dict | None) -> dict:
        return {
            "Abstract Theme": a_theme.get("theme_name", "") if a_theme else "",
            "Abstract PAJAIS": a_theme.get("pajais_match", "") if a_theme else "",
            "Abstract Confidence": a_theme.get("confidence", 0) if a_theme else 0,
            "Abstract Novel": a_theme.get("is_novel", False) if a_theme else False,
            "Title Theme": t_theme.get("theme_name", "") if t_theme else "",
            "Title PAJAIS": t_theme.get("pajais_match", "") if t_theme else "",
            "Title Confidence": t_theme.get("confidence", 0) if t_theme else 0,
            "Title Novel": t_theme.get("is_novel", False) if t_theme else False,
            "Keywords Theme": k_theme.get("theme_name", "") if k_theme else "",
            "Keywords PAJAIS": k_theme.get("pajais_match", "") if k_theme else "",
            "Keywords Confidence": k_theme.get("confidence", 0) if k_theme else 0,
            "Keywords Novel": k_theme.get("is_novel", False) if k_theme else False,
        }

    max_len = max(len(abstract_map), len(title_map), len(keywords_map), 1)
    padded_a = abstract_map + [{}] * (max_len - len(abstract_map))
    padded_t = title_map + [{}] * (max_len - len(title_map))
    padded_k = keywords_map + [{}] * (max_len - len(keywords_map))
    rows = list(map(_row, padded_a, padded_t, padded_k))
    df = pd.DataFrame(rows)
    out_path = OUTPUT_DIR / "comparison.csv"
    df.to_csv(out_path, index=False)
    return {
        "csv_path": str(out_path),
        "row_count": len(df),
        "columns": df.columns.tolist(),
        "preview": df.head(5).to_dict(orient="records"),
    }
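
# Note on the CSV layout above: rows are aligned positionally (theme i of each
# run shares a row), not semantically; padding with {} just squares off the
# shorter runs. Example (lengths invented): abstract=5, title=3, keywords=0
# themes produce a 5-row CSV whose Title columns go blank after row 3 and
# whose Keywords columns are blank throughout.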
# ============================================================================
# TOOL 7 — export_narrative
# ============================================================================
_NARRATIVE_PROMPT = PromptTemplate.from_template(
    """You are an academic researcher writing a methodology and findings section.
Write a 500-word academic narrative describing the thematic analysis results below.

Structure: (1) methodology overview, (2) major themes found across runs,
(3) PAJAIS alignment, (4) novel contributions, (5) limitations.
Use formal academic English. Do NOT use bullet points.

Abstract themes & taxonomy:
{abstract_themes}

Title themes & taxonomy:
{title_themes}

Keywords themes & taxonomy:
{keywords_themes}

Respond with plain text only.
"""
)


@tool
def export_narrative(run_key: str) -> dict:
    """
    Generate a 500-word academic narrative and save to narrative.txt.

    Parameters
    ----------
    run_key : str — "abstract" or "title" or "keywords" (primary source)

    Returns
    -------
    dict with keys:
        narrative_path, word_count, preview (first 300 chars)
    """
    rdir = _run_dir(run_key)
    abstract_path = OUTPUT_DIR / "abstract" / "taxonomy_map.json"
    title_path = OUTPUT_DIR / "title" / "taxonomy_map.json"
    keywords_path = OUTPUT_DIR / "keywords" / "taxonomy_map.json"
    abstract_map = _load_json(abstract_path) if abstract_path.exists() else []
    title_map = _load_json(title_path) if title_path.exists() else []
    keywords_map = _load_json(keywords_path) if keywords_path.exists() else []
    if not (abstract_map or title_map or keywords_map):
        return {
            "narrative_path": str(rdir / "narrative.txt"),
            "word_count": 0,
            "preview": "",
            "error": (
                "No taxonomy mappings found for abstract/title/keywords. "
                "Run compare_with_taxonomy before export_narrative."
            ),
        }

    def _theme_summary(t: dict) -> str:
        return (
            f"  - {t.get('theme_name','?')} -> {t.get('pajais_match','?')} "
            f"(conf={t.get('confidence',0):.2f}, novel={t.get('is_novel',False)})"
        )

    abstract_str = "\n".join(map(_theme_summary, abstract_map))
    title_str = "\n".join(map(_theme_summary, title_map)) or "Not run."
    keywords_str = "\n".join(map(_theme_summary, keywords_map)) or "Not run."
    chain = _NARRATIVE_PROMPT | _llm()
    response = _invoke_with_retries(lambda: chain.invoke({
        "abstract_themes": abstract_str,
        "title_themes": title_str,
        "keywords_themes": keywords_str,
    }))
    narrative = response.content if hasattr(response, "content") else str(response)
    out_path = rdir / "narrative.txt"
    out_path.write_text(narrative, encoding="utf-8")
    return {
        "narrative_path": str(out_path),
        "word_count": len(narrative.split()),
        "preview": narrative[:300],
    }


# ============================================================================
# METHOD EXTRACTION — Per-Paper Computational Method Identification
# ============================================================================
def _extract_text_from_pdf(pdf_path: str) -> str:
    """Extract all text from a PDF using PyMuPDF (text only, no images)."""
    doc = fitz.open(pdf_path)  # fitz (PyMuPDF) is imported at module level
    pages = []
    for page in doc:
        pages.append(page.get_text("text"))
    doc.close()
    return "\n".join(pages)


def _extract_title_from_pdf(full_text: str) -> str:
    """Try to extract the paper title from the first few lines of text."""
    lines = full_text.strip().split("\n")
    title_lines = []
    for line in lines[:10]:
        stripped = line.strip()
        if not stripped:
            if title_lines:
                break
            continue
        low = stripped.lower()
        if low.startswith("abstract") or low.startswith("keyword"):
            break
        if len(stripped) > 10:
            title_lines.append(stripped)
        if len(title_lines) >= 2:
            break
    return " ".join(title_lines)[:200] if title_lines else ""


def _chunk_text(text: str, chunk_size: int = 12000, overlap: int = 1000) -> list[str]:
    """Split text into chunks of `chunk_size` characters with `overlap`."""
    if not text:
        return []
    chunks = []
    start = 0
    text_len = len(text)
    while start < text_len:
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_len:
            break
        start = end - overlap
    return chunks
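
# Worked example for _chunk_text (small numbers for illustration):
#   _chunk_text("abcdefghij", chunk_size=4, overlap=1)
#   -> ["abcd", "defg", "ghij"]   # each next chunk restarts 1 char early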

# LLM prompt — extracts computational methods from a single paper's method section
_EXTRACT_METHODS_PROMPT = PromptTemplate.from_template(
    """You are an expert IS research methodologist. Read this excerpt from a
research paper and identify ALL computational techniques used. The excerpt may
come from methods or results.

Use:
- explicit method statements ("this study uses", "we employed")
- analytical technique mentions in results (beta coefficients, BERT scores,
  LDA topics, network centrality)
- sample/data descriptions (N=, dataset, corpus)
- implicit method cues from results presentation (e.g., beta tables imply regression)

Do not guess beyond evidence in the excerpt.

A "computational method" or "analytical technique" refers to specific
algorithms, statistical tests, machine learning models, NLP techniques,
network measures, or simulation/optimization approaches.

Paper: {paper_name}

Excerpt text:
{method_text}

Return a JSON object with EXACTLY this key:
computational_methods : list of specific algorithms, models, or computational
techniques found. Be very specific. DO NOT just say "Machine Learning", name
the algorithm. Examples: ["Random Forest", "BERT", "K-means clustering",
"LSTM", "XGBoost", "LDA topic modeling", "PLS-SEM", "CB-SEM", "OLS Regression",
"ANOVA", "Network centrality", "Louvain community detection",
"Sentiment Analysis (VADER)"]

Return an empty list [] if absolutely no specific computational techniques or
statistical models are mentioned.

Respond with RAW JSON only. No markdown, no explanation.
"""
)


@tool
def extract_methods_from_pdfs(pdf_dir: str) -> dict:
    """
    Extract computational methods from each PDF paper.

    For each PDF: extract text (no images) → split into overlapping chunks →
    send each chunk to Mistral LLM → aggregate identified methods per paper.

    Parameters
    ----------
    pdf_dir : str — directory containing PDF files

    Returns
    -------
    dict with keys: n_papers, n_extracted, results (list of per-paper method
    dicts), csv_path, technique_csv_path
    """
    pdf_dir_path = Path(pdf_dir)
    if not pdf_dir_path.exists():
        return {"error": f"PDF directory not found: {pdf_dir}"}
    # Set union de-duplicates on case-insensitive filesystems, where *.pdf
    # and *.PDF can return the same files twice.
    pdf_files = sorted(
        {str(p) for p in pdf_dir_path.glob("*.pdf")}
        | {str(p) for p in pdf_dir_path.glob("*.PDF")}
    )
    if not pdf_files:
        return {"error": f"No PDF files found in {pdf_dir}"}
    rdir = _ensure_dir(OUTPUT_DIR / "methods")

    # Step 1: Extract full text from all PDFs and chunk them
    paper_chunks = []
    for idx, pdf_path in enumerate(pdf_files, start=1):
        try:
            full_text = _extract_text_from_pdf(pdf_path)
            # Prefer a title scraped from the first page; fall back to the
            # filename stem when nothing usable is found.
            title = _extract_title_from_pdf(full_text) or Path(pdf_path).stem
            chunks = _chunk_text(full_text)
            paper_chunks.append({
                "paper_id": idx,
                "paper_filename": Path(pdf_path).stem,
                "paper_title": title,
                "chunks": chunks,
            })
        except Exception as exc:
            paper_chunks.append({
                "paper_id": idx,
                "paper_filename": Path(pdf_path).stem,
                "paper_title": Path(pdf_path).stem,
                "chunks": [],
                "error": str(exc),
            })

    # Step 2: For each paper, use LLM on all chunks and aggregate
    if not MISTRAL_API_KEY:
        return {
            "n_papers": len(pdf_files),
            "results": paper_chunks,
            "error": (
                "MISTRAL_API_KEY not set — extracted text chunks but cannot "
                "identify methods via LLM."
            ),
        }
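
    # Illustrative: per the prompt contract, a well-formed chunk response is
    # raw JSON such as
    #     {"computational_methods": ["PLS-SEM", "ANOVA"]}
    # JsonOutputParser turns that string into a plain Python dict, so the
    # aggregation below only has to defend against missing keys and models
    # that return a bare string instead of a list.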
    chain = _EXTRACT_METHODS_PROMPT | _llm() | JsonOutputParser()
    paper_results = []
    for entry in paper_chunks:
        chunks = entry.get("chunks", [])
        if not chunks:
            paper_results.append({
                "paper_id": entry["paper_id"],
                "paper_filename": entry["paper_filename"],
                "paper_title": entry.get("paper_title", ""),
                "computational_methods": [],
                "extraction_note": "No text extracted",
            })
            continue
        all_comp_methods = set()
        # Process each chunk; the default argument pins the current chunk so
        # the lambda does not close over the loop variable.
        for chunk in chunks:
            if len(chunk) < 50:
                continue
            try:
                result = _invoke_with_retries(lambda c=chunk: chain.invoke({
                    "paper_name": entry.get("paper_title", entry.get("paper_filename", "")),
                    "method_text": c,
                }))
                # Collect computational methods
                cm = result.get("computational_methods", [])
                if isinstance(cm, list):
                    for item in cm:
                        if isinstance(item, str) and item.strip():
                            all_comp_methods.add(item.strip())
                elif isinstance(cm, str) and cm.strip():
                    all_comp_methods.add(cm.strip())
            except Exception:
                pass  # Skip chunks whose LLM call or JSON parsing failed
        paper_results.append({
            "paper_id": entry["paper_id"],
            "paper_filename": entry["paper_filename"],
            "paper_title": entry.get("paper_title", ""),
            "computational_methods": sorted(all_comp_methods),
            "chunks_processed": len(chunks),
        })

    # Save results
    _save_json(rdir / "method_results.json", paper_results)

    # Build CSV
    rows = []
    for r in paper_results:
        comp_methods = r.get("computational_methods", [])
        if isinstance(comp_methods, list):
            comp_str = ", ".join(comp_methods)
        else:
            comp_str = str(comp_methods)
        rows.append({
            "Paper ID": r.get("paper_id", ""),
            "Paper Title": r.get("paper_title", r.get("paper_filename", "")),
            "Computational Methods": comp_str,
        })
    df = pd.DataFrame(rows)
    csv_path = rdir / "method_summary.csv"
    df.to_csv(csv_path, index=False)

    def _clean_technique_name(name: str) -> str:
        return re.sub(r"\s+", " ", name.strip())

    def _normalize_technique_key(name: str) -> str:
        # Lowercase, fold punctuation to spaces, collapse whitespace, then
        # crudely singularise a handful of frequent plurals.
        cleaned = re.sub(r"[^a-z0-9+ ]", " ", name.lower())
        cleaned = re.sub(r"\s+", " ", cleaned).strip()
        cleaned = cleaned.replace("forests", "forest")
        cleaned = cleaned.replace("trees", "tree")
        cleaned = cleaned.replace("networks", "network")
        cleaned = cleaned.replace("models", "model")
        cleaned = cleaned.replace("transformers", "transformer")
        cleaned = cleaned.replace("embeddings", "embedding")
        cleaned = cleaned.replace("topics", "topic")
        cleaned = cleaned.replace("measures", "measure")
        return cleaned

    # Canonical display names. Patterns are checked in order and the first
    # match wins, so specific entries must precede generic ones
    # (e.g. PLS-SEM before SEM, Bayesian optimization before Bayesian).
    canonical_patterns = [
        (re.compile(r"\bbert\b"), "BERT"),
        (re.compile(r"\broberta\b"), "RoBERTa"),
        (re.compile(r"\bxlm[- ]?roberta\b"), "XLM-RoBERTa"),
        (re.compile(r"\bgpt[- ]?[0-9]*\b"), "GPT"),
        (re.compile(r"\bt5\b"), "T5"),
        (re.compile(r"\bword2vec\b"), "Word2Vec"),
        (re.compile(r"\bglove\b"), "GloVe"),
        (re.compile(r"\bdoc2vec\b"), "Doc2Vec"),
        (re.compile(r"\bfasttext\b"), "fastText"),
        (re.compile(r"\bspecter\b"), "SPECTER"),
        (re.compile(r"\bsentence[- ]?transformer"), "Sentence-Transformers"),
        (re.compile(r"\btf[- ]?idf\b"), "TF-IDF"),
        (re.compile(r"\bbm25\b"), "BM25"),
        (re.compile(r"\bbag of words\b|\bbow\b"), "Bag-of-words"),
        (re.compile(r"\blda\b|\blatent dirichlet allocation\b"), "LDA topic modeling"),
        (re.compile(r"\bnmf\b|\bnon[- ]?negative matrix factorization\b"), "NMF topic modeling"),
        (re.compile(r"\blsa\b|\blsi\b|\blatent semantic analysis\b"), "LSA"),
        (re.compile(r"\bbertopic\b"), "BERTopic"),
        (re.compile(r"\bk[- ]?means\b"), "K-means clustering"),
        (re.compile(r"\bhierarchical clustering\b"), "Hierarchical clustering"),
        (re.compile(r"\bdbscan\b"), "DBSCAN"),
        (re.compile(r"\bhdbscan\b"), "HDBSCAN"),
        # NOTE: first match wins, so a bare "gmm" resolves here rather than to
        # the econometric "generalized method of moments" entry further down.
        (re.compile(r"\bgmm\b|\bgaussian mixture\b"), "Gaussian mixture model"),
        (re.compile(r"\bpca\b|\bprincipal component analysis\b"), "PCA"),
        (re.compile(r"\bsvd\b|\bsingular value decomposition\b"), "SVD"),
        # Keys are normalised to "t sne", so match hyphen, space, or neither.
        (re.compile(r"\bt[- ]?sne\b"), "t-SNE"),
        (re.compile(r"\bumap\b"), "UMAP"),
        (re.compile(r"\bner\b|\bnamed entity recognition\b"), "Named entity recognition"),
        (re.compile(r"\bsentiment\b"), "Sentiment analysis"),
        (re.compile(r"\brandom forest\b"), "Random Forest"),
        (re.compile(r"\bdecision tree\b"), "Decision Tree"),
        (re.compile(r"\bgradient boosting\b|\bxgboost\b|\blightgbm\b|\bcatboost\b"), "Gradient boosting"),
        (re.compile(r"\bsvm\b|\bsupport vector machine\b"), "SVM"),
        (re.compile(r"\bknn\b|\bk[- ]?nearest neighbor\b"), "KNN"),
        (re.compile(r"\bnaive bayes\b"), "Naive Bayes"),
        (re.compile(r"\bneural network\b|\bdeep learning\b|\bmlp\b"), "Neural networks"),
        (re.compile(r"\bcnn\b|\bconvolutional neural network\b"), "CNN"),
        (re.compile(r"\brnn\b|\brecurrent neural network\b"), "RNN"),
        (re.compile(r"\blstm\b"), "LSTM"),
        (re.compile(r"\bgru\b"), "GRU"),
        (re.compile(r"\bautoencoder\b"), "Autoencoder"),
        (re.compile(r"\btransformer\b"), "Transformers"),
        (re.compile(r"\bfine[- ]?tuning\b"), "Model fine-tuning"),
        (re.compile(r"\bpls[- ]?sem\b|\bpartial least squares\b"), "PLS-SEM"),
        (re.compile(r"\bcb[- ]?sem\b|\bcovariance[- ]?based sem\b"), "CB-SEM"),
        (re.compile(r"\bsem\b|\bstructural equation model\b"), "SEM"),
        (re.compile(r"\bglmm\b|\bgeneralized linear mixed model\b"), "GLMM"),
        (re.compile(r"\birt\b|\bitem response theory\b"), "IRT"),
        # Listed before the generic Bayesian entry; after it, this pattern
        # could never match (first match wins).
        (re.compile(r"\bbayesian optimization\b"), "Bayesian optimization"),
        (re.compile(r"\bbayesian\b"), "Bayesian inference"),
        (re.compile(r"\bmediation\b"), "Mediation analysis"),
        (re.compile(r"\bmoderation\b"), "Moderation analysis"),
        (re.compile(r"\bchi[- ]?square\b"), "Chi-square test"),
        (re.compile(r"\banova\b"), "ANOVA"),
        (re.compile(r"\bmanova\b"), "MANOVA"),
        (re.compile(r"\bancova\b"), "ANCOVA"),
        (re.compile(r"\bmancova\b"), "MANCOVA"),
        (re.compile(r"\bt[- ]?test\b"), "t-test"),
        (re.compile(r"\bwilcoxon\b"), "Wilcoxon test"),
        (re.compile(r"\bkruskal[- ]?wallis\b"), "Kruskal-Wallis test"),
        (re.compile(r"\bfactor analysis\b"), "Factor analysis"),
        (re.compile(r"\btime[- ]?series\b"), "Time-series analysis"),
        (re.compile(r"\barima\b"), "ARIMA"),
        (re.compile(r"\bsarima\b"), "SARIMA"),
        (re.compile(r"\bvar\b|\bvector autoregression\b"), "VAR"),
        (re.compile(r"\bprophet\b"), "Prophet"),
        (re.compile(r"\bpanel regression\b|\bpanel data\b"), "Panel regression"),
        (re.compile(r"\bfixed effects\b"), "Fixed-effects regression"),
        (re.compile(r"\brandom effects\b"), "Random-effects regression"),
        (re.compile(r"\bmultilevel\b|\bhierarchical linear model\b|\bhlm\b|\bmixed effects\b"), "Multilevel / mixed-effects regression"),
        (re.compile(r"\bglm\b|\bgeneralized linear model\b"), "Generalized linear model"),
        (re.compile(r"\bgls\b|\bgeneralized least squares\b"), "Generalized least squares"),
        (re.compile(r"\bgee\b|\bgeneralized estimating equation\b"), "GEE"),
        (re.compile(r"\bgmm\b|\bgeneralized method of moments\b"), "GMM"),
        (re.compile(r"\b2sls\b|\btwo[- ]?stage least squares\b"), "2SLS"),
        (re.compile(r"\b3sls\b|\bthree[- ]?stage least squares\b"), "3SLS"),
        (re.compile(r"\binstrumental variable\b|\biv\b"), "Instrumental variables"),
        (re.compile(r"\btobit\b"), "Tobit regression"),
        (re.compile(r"\bheckman\b"), "Heckman selection model"),
        (re.compile(r"\bpoisson\b"), "Poisson regression"),
        (re.compile(r"\bnegative binomial\b"), "Negative binomial regression"),
        (re.compile(r"\bprobit\b"), "Probit regression"),
        (re.compile(r"\bsurvival analysis\b|\bcox\b|\bhazard model\b|\bkaplan[- ]?meier\b"), "Survival analysis"),
        (re.compile(r"\blatent class analysis\b|\blca\b"), "Latent class analysis"),
        (re.compile(r"\blatent profile analysis\b|\blpa\b"), "Latent profile analysis"),
        (re.compile(r"\blogistic regression\b"), "Logistic regression"),
        (re.compile(r"\bols\b|\bordinary least squares\b|\blinear regression\b|\bmultiple regression\b"), "Linear regression (OLS)"),
        (re.compile(r"\bridge regression\b|\bridge\b"), "Ridge regression"),
        (re.compile(r"\blasso\b"), "LASSO regression"),
        (re.compile(r"\belastic net\b"), "Elastic Net regression"),
        (re.compile(r"\bregression\b"), "Regression"),
        (re.compile(r"\bcentrality\b"), "Network centrality"),
        (re.compile(r"\bcommunity detection\b|\blouvain\b|\bleiden\b"), "Community detection"),
        (re.compile(r"\bergm\b|\bexponential random graph\b"), "ERGM"),
        (re.compile(r"\blink prediction\b"), "Link prediction"),
        (re.compile(r"\bpagerank\b|\bpage rank\b"), "PageRank"),
        (re.compile(r"\bgraph neural network\b|\bgnn\b"), "Graph neural networks"),
        (re.compile(r"\bhidden markov\b|\bhmm\b"), "Hidden Markov Model"),
        (re.compile(r"\bmarkov chain\b|\bmarkov model\b"), "Markov models"),
        (re.compile(r"\bkalman filter\b"), "Kalman filter"),
        (re.compile(r"\bstate[- ]?space\b"), "State-space models"),
        (re.compile(r"\bhawkes\b"), "Hawkes process"),
        (re.compile(r"\brecommender\b|\bcollaborative filtering\b|\bmatrix factorization\b"), "Recommender systems"),
        (re.compile(r"\bahp\b|\banalytic hierarchy process\b"), "AHP"),
        (re.compile(r"\btopsis\b"), "TOPSIS"),
        (re.compile(r"\bvikor\b"), "VIKOR"),
        (re.compile(r"\bpromethee\b"), "PROMETHEE"),
        (re.compile(r"\bdematel\b"), "DEMATEL"),
        (re.compile(r"\bdea\b|\bdata envelopment analysis\b"), "DEA"),
        (re.compile(r"\bsfa\b|\bstochastic frontier\b"), "SFA"),
        (re.compile(r"\bagent[- ]?based\b"), "Agent-based simulation"),
        (re.compile(r"\bmonte carlo\b"), "Monte Carlo simulation"),
        (re.compile(r"\blinear programming\b|\binteger programming\b|\bmixed integer\b"), "Mathematical optimization"),
        (re.compile(r"\bgenetic algorithm\b"), "Genetic algorithms"),
        (re.compile(r"\bsimulated annealing\b"), "Simulated annealing"),
    ]

    def _canonicalize_technique(name: str) -> tuple[str, str]:
        cleaned = _normalize_technique_key(name)
        for pattern, canonical in canonical_patterns:
            if pattern.search(cleaned):
                return canonical, canonical.lower()
        # No pattern matched: fall back to a Title-Cased form of the key.
        display = " ".join(word.capitalize() for word in cleaned.split())
        display = display or _clean_technique_name(name)
        return display, display.lower()

    # Coarse category buckets, also checked in order with first match winning.
    category_patterns = [
        (re.compile(r"\b(bert|roberta|xlm roberta|gpt|t5|transformer|fine[- ]?tuning)\b"), "Transformers"),
        (re.compile(r"\b(word2vec|glove|doc2vec|fasttext|specter|sentence[- ]?transformer|embedding|tf[- ]?idf|bm25|bag of words|bow)\b"), "Embeddings / Representation"),
        (re.compile(r"\b(topic modeling|lda|nmf|bertopic|lsa|lsi)\b"), "Topic Modeling"),
        (re.compile(r"\b(k[- ]?means|hierarchical clustering|dbscan|hdbscan|gaussian mixture|gmm|clustering)\b"), "Clustering"),
        (re.compile(r"\b(pca|svd|t[- ]?sne|umap|dimensionality reduction)\b"), "Dimensionality Reduction"),
        (re.compile(r"\b(arima|sarima|var|prophet|time[- ]?series)\b"), "Time Series / Forecasting"),
        (re.compile(r"\b(panel data|panel regression|fixed effects|random effects|multilevel|hierarchical linear model|hlm|mixed effects|glm|gls|gee|gmm|2sls|3sls|instrumental variable|tobit|heckman|poisson|negative binomial|probit|logit)\b"), "Econometric / Panel Models"),
        (re.compile(r"\b(ols|linear regression|logistic regression|ridge|lasso|elastic net|regression)\b"), "Regression"),
        (re.compile(r"\b(sem|pls[- ]?sem|cb[- ]?sem|structural equation|cfa|efa)\b"), "SEM"),
        (re.compile(r"\b(latent class analysis|latent profile analysis|latent variable|mixture model)\b"), "Latent Variable Models"),
        (re.compile(r"\b(grad(ient)? boosting|xgboost|lightgbm|catboost)\b"), "Boosting / Ensembles"),
        (re.compile(r"\b(random forest|decision tree|svm|knn|naive bayes)\b"), "Classic ML"),
        (re.compile(r"\b(neural network|deep learning|lstm|cnn|rnn|gru|mlp|autoencoder)\b"), "Deep Learning"),
        (re.compile(r"\b(ner|named entity recognition|sentiment|nlp|text mining|tokenization|stemming|lemmatization|keyword extraction)\b"), "NLP / Text Mining"),
        (re.compile(r"\b(network|centrality|community detection|louvain|leiden|ergm|link prediction|pagerank|graph neural network|gnn)\b"), "Network Analysis"),
        (re.compile(r"\b(agent[- ]?based|monte carlo|bayesian optimization|linear programming|integer programming|genetic algorithm|simulated annealing)\b"), "Simulation / Optimization"),
        (re.compile(r"\b(survival|cox|hazard|kaplan[- ]?meier)\b"), "Survival / Event History"),
        (re.compile(r"\b(bayesian|mcmc|gibbs|variational)\b"), "Bayesian Methods"),
        (re.compile(r"\b(anova|manova|ancova|mancova|t[- ]?test|chi[- ]?square|factor analysis|glmm|irt|mediation|moderation|wilcoxon|kruskal[- ]?wallis)\b"), "Statistical Tests / Models"),
        (re.compile(r"\b(difference[- ]?in[- ]?differences|did|regression discontinuity|rdd|instrumental variable|iv|propensity score|matching)\b"), "Causal Inference"),
        (re.compile(r"\b(recommender|collaborative filtering|matrix factorization)\b"), "Recommender Systems"),
        (re.compile(r"\b(hidden markov|hmm|markov|kalman|state[- ]?space|hawkes)\b"), "Sequence / Stochastic Processes"),
        (re.compile(r"\b(ahp|analytic hierarchy process|topsis|vikor|promethee|dematel)\b"), "Decision Analysis / MCDA"),
        (re.compile(r"\b(dea|data envelopment analysis|stochastic frontier|sfa|frontier analysis)\b"), "Efficiency / Frontier Analysis"),
    ]
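
    # Ordering example (illustrative): "panel regression" is bucketed as
    # "Econometric / Panel Models" rather than the generic "Regression"
    # category purely because the econometric pattern appears earlier in
    # category_patterns.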

    def _categorize_technique(*names: str) -> str:
        # Normalise all supplied names up front; this also guarantees `keys`
        # is defined even when every name is empty, and the fallback keyword
        # scan below runs over all names rather than only the last one.
        keys = [_normalize_technique_key(name) for name in names if name]
        for key in keys:
            for pattern, category in category_patterns:
                if pattern.search(key):
                    return category
        fallback_keywords = [
            ("Classic ML", ["classifier", "classification", "predictive model", "prediction", "supervised"]),
            ("Clustering", ["cluster", "clustering"]),
            ("Topic Modeling", ["topic", "semantic"]),
            ("Embeddings / Representation", ["embedding", "vector", "tf idf", "bow", "bag of words"]),
            ("Regression", ["regression", "logit", "probit", "panel", "fixed effects", "random effects", "glm", "gls", "gee", "gmm"]),
            ("SEM", ["sem", "structural equation", "factor", "latent"]),
            ("Bayesian Methods", ["bayesian", "mcmc", "gibbs", "prior", "posterior"]),
            ("Time Series / Forecasting", ["time series", "forecast", "arima", "sarima", "var", "prophet"]),
            ("NLP / Text Mining", ["nlp", "text", "token", "lemma", "stem", "language"]),
            ("Network Analysis", ["network", "graph", "node", "edge"]),
            # Keys are normalised (hyphens become spaces), so the keyword is
            # "agent based", not "agent-based".
            ("Simulation / Optimization", ["simulation", "optimi", "heuristic", "metaheuristic", "monte carlo", "agent based"]),
        ]
        joined = " ".join(keys)
        for category, keywords in fallback_keywords:
            if any(k in joined for k in keywords):
                return category
        if any(token in joined for token in ["model", "analysis", "estimation", "test"]):
            return "Statistical Tests / Models"
        return "Other"
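
    # Worked example (illustrative): for the raw string "XGBoost classifiers",
    # _canonicalize_technique returns ("Gradient boosting", ...) via the
    # xgboost pattern, and _categorize_technique("XGBoost classifiers",
    # "Gradient boosting") lands in "Boosting / Ensembles". A name no pattern
    # covers, e.g. "custom heuristic search", reaches the keyword fallback and
    # is bucketed as "Simulation / Optimization" via "heuristic".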

    category_map: dict[str, dict[str, object]] = {}
    for r in paper_results:
        paper_title = r.get("paper_title") or r.get("paper_filename") or ""
        paper_id = r.get("paper_id", "")
        paper_label = str(paper_title or paper_id)
        methods = r.get("computational_methods", [])
        if isinstance(methods, list):
            techniques = {m.strip() for m in methods if isinstance(m, str) and m.strip()}
        elif isinstance(methods, str) and methods.strip():
            techniques = {m.strip() for m in re.split(r"[;,]", methods) if m.strip()}
        else:
            techniques = set()
        for technique in techniques:
            algorithm, _ = _canonicalize_technique(technique)
            if not algorithm:
                continue
            category = _categorize_technique(technique, algorithm)
            key = category.lower()
            if key not in category_map:
                category_map[key] = {
                    "name": category,
                    "algorithms": set(),
                    "papers": set(),
                }
            category_map[key]["algorithms"].add(algorithm)
            category_map[key]["papers"].add(paper_label)

    technique_rows = [
        {
            "Main Computational Technique": entry["name"],
            "Algorithms": ", ".join(sorted(entry["algorithms"])),
            "Papers": " | ".join(sorted(entry["papers"])),
        }
        for entry in sorted(category_map.values(), key=lambda v: str(v["name"]).lower())
    ]
    technique_df = pd.DataFrame(
        technique_rows,
        columns=["Main Computational Technique", "Algorithms", "Papers"],
    )
    technique_csv_path = rdir / "technique_to_papers.csv"
    technique_df.to_csv(technique_csv_path, index=False)

    return {
        "n_papers": len(pdf_files),
        "n_extracted": len(paper_results),
        "csv_path": str(csv_path),
        "technique_csv_path": str(technique_csv_path),
        "results": paper_results,
    }


# ---------------------------------------------------------------------------
# Tool registry — imported by agent.py
# ---------------------------------------------------------------------------
ALL_TOOLS = [
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    verify_topic_labels_with_groq,
    consolidate_into_themes,
    compare_with_taxonomy,
    verify_taxonomy_mapping_with_groq,
    generate_comparison_csv,
    export_narrative,
    extract_methods_from_pdfs,
]
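

# Usage sketch (illustrative; the "papers/" directory and the llm variable
# are assumptions, not part of this module). Each entry in ALL_TOOLS is a
# LangChain tool object, so it can be invoked directly:
#
#     from tools import extract_methods_from_pdfs
#     print(extract_methods_from_pdfs.invoke({"pdf_dir": "papers/"}))
#
# or handed to a tool-calling chat model, e.g. via llm.bind_tools(ALL_TOOLS).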