| """ |
| tools.py — BERTopic Thematic Analysis Pipeline Tools |
| ===================================================== |
| Nine LangChain @tool functions implementing Braun & Clarke's (2006) |
| six-phase thematic analysis pipeline. |
| |
| Conventions |
| ----------- |
| - All tools accept / return plain Python dicts (JSON-serialisable). |
| - Artefacts are written to OUTPUT_DIR / run_key / <file>. |
| - Functional style preferred: map / filter / reduce, operator helpers, and
|   numpy vectorised ops; explicit loops, try/except, and branching are kept
|   to provider retries, artefact guards, and metadata assembly.
| |
| Fixes applied (v2) |
| ------------------ |
| - BUG 1 : run_bertopic_discovery() now saves sent_labels.npy — |
| per-sentence cluster-label array required by Tool 4. |
| - BUG 1 : consolidate_into_themes() _build_theme() rewritten — |
| centroid computed from actual merged-cluster embeddings |
| via sent_labels.npy mask (no dead `if False` scaffolding). |
| - ISSUE 1: generate_comparison_csv() guards against missing runs (abstract,
|            title, or keywords) with .exists() checks instead of hard-crashing.
| |
| Dependencies |
| ------------ |
| pip install langchain langchain-core langchain-mistralai langchain-groq
|             sentence-transformers scikit-learn hdbscan umap-learn
|             plotly pandas numpy pymupdf
| """ |
|
|
| |
| |
| |
| import json |
| import os |
| import re |
| import time |
| from functools import reduce |
| from pathlib import Path |
| from operator import itemgetter |
|
|
| |
| |
| |
| import numpy as np |
| import pandas as pd |
| import plotly.express as px |
| import plotly.graph_objects as go |
| import plotly.figure_factory as ff |
| from scipy.spatial.distance import squareform
| from sklearn.metrics.pairwise import cosine_similarity
| from sklearn.preprocessing import normalize |
| from sentence_transformers import SentenceTransformer |
| import hdbscan |
| import umap |
|
|
| import fitz  # PyMuPDF
|
|
| from langchain_core.tools import tool |
| from langchain_core.prompts import PromptTemplate |
| from langchain_core.output_parsers import JsonOutputParser |
| from langchain_mistralai import ChatMistralAI |
|
|
| try: |
| from langchain_groq import ChatGroq |
| except ImportError: |
| ChatGroq = None |
|
|
| |
| |
| |
|
|
| MISTRAL_API_KEY: str = os.environ.get("MISTRAL_API_KEY", "") |
| MODEL_NAME: str = "mistral-small-latest" |
| GROQ_API_KEY: str = os.environ.get("GROQ_API_KEY", "") |
| GROQ_MODEL_NAME: str = os.environ.get("GROQ_MODEL_NAME", "llama-3.3-70b-versatile") |
| GROQ_OLLAMA_MODEL_NAME: str = os.environ.get("GROQ_OLLAMA_MODEL_NAME", "llama-3.3-70b-versatile") |
| GROQ_GPT_MODEL_NAME: str = os.environ.get("GROQ_GPT_MODEL_NAME", "openai/gpt-oss-120b") |
| GROQ_JUDGE_MODEL_NAME: str = os.environ.get("GROQ_JUDGE_MODEL_NAME", "llama-3.1-8b-instant") |
| EMBED_MODEL: str = "allenai/specter2_base" |
| BASE_DIR: Path = Path(__file__).resolve().parent |
| OUTPUT_DIR: Path = BASE_DIR / "outputs" |
| N_EVIDENCE: int = 5 |
| DISTANCE_THRESH: float = 0.35 |
| RANDOM_SEED: int = 42 |
| LLM_TIMEOUT_S: int = 45 |
| LLM_MAX_RETRIES: int = 3 |
| MAX_LABEL_CLUSTERS: int = 60 |
| MIN_CLUSTER_SIZE_FOR_LABEL: int = 20 |
| MAX_TOOL_RETURN_PREVIEW: int = 12 |
| PROVIDER_RETRY_ATTEMPTS: int = 4 |
| PROVIDER_RETRY_BASE_DELAY_S: float = 2.0 |
| PROVIDER_RETRY_RATE_LIMIT_DELAY_S: float = 6.0 |
| PROVIDER_RETRY_MAX_DELAY_S: float = 18.0 |
| HDBSCAN_MIN_CLUSTER_SIZE: int = 20 |
| HDBSCAN_MIN_SAMPLES: int = 5 |
| HDBSCAN_MAX_CLUSTER_SIZE: int = 120 |
| UMAP_N_NEIGHBORS: int = 15 |
| UMAP_MIN_DIST: float = 0.0 |
| UMAP_N_COMPONENTS_CLUSTER: int = 5 |
| UMAP_N_COMPONENTS_VIZ: int = 2 |
| AUTO_OPTIMIZE_CLUSTERS: bool = True |
| OPTIMIZE_MAX_ITERS: int = 6 |
| OPTIMIZE_STABLE_ROUNDS: int = 2 |
| OPTIMIZE_MIN_IMPROVEMENT: float = 0.01 |
| OPTIMIZE_TARGET_CLUSTER_MIN: int = 20 |
| OPTIMIZE_TARGET_CLUSTER_MAX: int = 120 |
| OPTIMIZE_TARGET_NOISE_MAX: float = 0.50 |
| OPTIMIZE_MIN_CLUSTER_SIZE_MIN: int = 5 |
| OPTIMIZE_MIN_CLUSTER_SIZE_MAX: int = 60 |
| OPTIMIZE_MAX_CLUSTER_SIZE_MIN: int = 40 |
| OPTIMIZE_MAX_CLUSTER_SIZE_MAX: int = 200 |
| OPTIMIZE_MIN_SAMPLES_MIN: int = 1 |
| OPTIMIZE_MIN_SAMPLES_MAX: int = 15 |
|
|
| |
| RUN_CONFIGS: dict[str, list[str]] = { |
| "abstract": ["Abstract"], |
| "title": ["Title"], |
| "keywords": [ |
| "Author Keywords", |
| "Author Keywords Plus", |
| "Index Keywords", |
| "Keywords", |
| "Author_Keywords", |
| ], |
| } |
|
|
| |
| PAJAIS_TAXONOMY: list[str] = [ |
| "Artificial Intelligence & Machine Learning", |
| "Big Data & Analytics", |
| "Blockchain & Distributed Ledger", |
| "Cloud Computing & Infrastructure", |
| "Cybersecurity & Privacy", |
| "Decision Support Systems", |
| "Digital Business & E-Commerce", |
| "Digital Health & Telemedicine", |
| "Digital Innovation & Transformation", |
| "Enterprise Systems & ERP", |
| "Fintech & Digital Finance", |
| "Green IS & Sustainability", |
| "Human-Computer Interaction", |
| "Information Systems Strategy", |
| "IT Governance & Management", |
| "Knowledge Management", |
| "Mobile Computing & IoT", |
| "Natural Language Processing & Text Mining", |
| "Organizational Behavior & IS", |
| "Platform Ecosystems & APIs", |
| "Privacy & Ethics in IS", |
| "Smart Cities & Digital Government", |
| "Social Media & Collaboration", |
| "Supply Chain & Logistics IS", |
| "Virtual Reality & Immersive Technologies", |
| ] |
|
|
| |
| _BOILERPLATE_RE = re.compile( |
| r"(©\s*\d{4}.*?(?:rights reserved|elsevier|springer|wiley)[^.]*\.?)" |
| r"|(all rights reserved\.?)" |
| r"|(published by.*?(?:ltd|inc|llc)[^.]*\.?)" |
| r"|(doi:\s*\S+)", |
| re.IGNORECASE, |
| ) |
|
|
| |
| _SENT_RE = re.compile(r"(?<=[.!?])\s+") |
| _KEYWORD_SPLIT_RE = re.compile(r"\s*[;|]\s*") |
| _KEYWORD_COMMA_RE = re.compile(r"\s*,\s*") |
|
|
|
|
| |
| |
| |
|
|
| def _ensure_dir(path: Path) -> Path: |
| path.mkdir(parents=True, exist_ok=True) |
| return path |
|
|
|
|
| def _run_dir(run_key: str) -> Path: |
| return _ensure_dir(OUTPUT_DIR / run_key) |
|
|
|
|
| def _clean_text(text: str) -> str: |
| return _BOILERPLATE_RE.sub("", str(text)).strip() |
|
|
|
|
| def _split_sentences(text: str) -> list[str]: |
| return list(filter( |
| lambda s: len(s.strip()) >= 20, |
| _SENT_RE.split(_clean_text(text)), |
| )) |
|
|
|
|
| def _split_keywords(text: str) -> list[str]: |
| cleaned = _clean_text(text).replace("\n", " ").strip() |
| if not cleaned: |
| return [] |
| primary = list(filter(None, map(str.strip, _KEYWORD_SPLIT_RE.split(cleaned)))) |
| terms = ( |
| primary |
| if len(primary) > 1 |
| else list(filter(None, map(str.strip, _KEYWORD_COMMA_RE.split(cleaned)))) |
| ) |
| return list(dict.fromkeys(filter(lambda t: len(t) >= 2, terms))) |
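
| # Illustrative behaviour (hedged; the strings are made up): semicolon- or
| # pipe-delimited keyword fields split on ";" / "|", single-field strings fall
| # back to commas, and exact duplicates are dropped while order is kept:
| #   _split_keywords("Digital health; Telemedicine; Digital health")
| #       -> ["Digital health", "Telemedicine"]
| #   _split_keywords("AI, machine learning, AI") -> ["AI", "machine learning"]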
|
|
|
|
| def _resolve_column_name(df: pd.DataFrame, candidates: list[str]) -> str | None: |
| normalised = { |
| str(col).strip().lower(): col |
| for col in df.columns |
| } |
| return next( |
| (normalised.get(str(c).strip().lower()) for c in candidates |
| if normalised.get(str(c).strip().lower()) is not None), |
| None, |
| ) |
|
|
|
|
| def _texts_for_candidates(df: pd.DataFrame, candidates: list[str]) -> tuple[list[str], str | None]: |
| col = _resolve_column_name(df, candidates) |
| return ( |
| df[col].dropna().astype(str).tolist(), |
| col, |
| ) if col else ([], None) |
|
|
|
|
| def _embed(sentences: list[str]) -> np.ndarray: |
| """Encode sentences to L2-normalised SPECTER2 vectors.""" |
| model = SentenceTransformer(EMBED_MODEL, trust_remote_code=True) |
| raw = model.encode(sentences, show_progress_bar=False, batch_size=64) |
| return normalize(raw, norm="l2") |
|
|
|
|
| def _umap_reduce(embeddings: np.ndarray, n_components: int) -> np.ndarray: |
| reducer = umap.UMAP( |
| n_neighbors=UMAP_N_NEIGHBORS, |
| min_dist=UMAP_MIN_DIST, |
| n_components=n_components, |
| metric="cosine", |
| random_state=RANDOM_SEED, |
| ) |
| return reducer.fit_transform(embeddings) |
|
|
|
|
| def _cluster(embeddings: np.ndarray, |
| min_cluster_size: int, |
| max_cluster_size: int, |
| min_samples: int) -> np.ndarray: |
| return hdbscan.HDBSCAN( |
| min_cluster_size=min_cluster_size, |
| min_samples=min_samples, |
| metric="euclidean", |
| cluster_selection_method="eom", |
| max_cluster_size=max_cluster_size, |
| ).fit_predict(embeddings) |
|
|
|
|
| def _centroid(embeddings: np.ndarray) -> np.ndarray: |
| """Mean-pool rows then re-normalise to unit length.""" |
| return normalize(embeddings.mean(axis=0, keepdims=True), norm="l2")[0] |
|
|
|
|
| def _top_k_indices(embeddings: np.ndarray, centroid: np.ndarray, k: int) -> np.ndarray: |
| sims = cosine_similarity(embeddings, centroid.reshape(1, -1)).flatten() |
| return np.argsort(sims)[::-1][:k] |
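
| # Minimal sketch of the evidence-selection step (synthetic 2-D vectors, not
| # pipeline embeddings): the rows closest to the cluster centroid by cosine
| # similarity are returned first.
| #   _demo = normalize(np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]]), norm="l2")
| #   _top_k_indices(_demo, _centroid(_demo), 2)   # -> array([1, 0])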
|
|
|
|
| def _llm() -> ChatMistralAI: |
| return ChatMistralAI( |
| model=MODEL_NAME, |
| api_key=MISTRAL_API_KEY, |
| temperature=0.2, |
| random_seed=RANDOM_SEED, |
| timeout=LLM_TIMEOUT_S, |
| max_retries=LLM_MAX_RETRIES, |
| ) |
|
|
|
|
| def _llm_groq(model_name: str): |
| if ChatGroq is None: |
| raise RuntimeError( |
| "langchain-groq is not installed. Install dependencies from requirements.txt " |
| "to enable Groq topic-label verification." |
| ) |
| return ChatGroq( |
| model=model_name, |
| api_key=GROQ_API_KEY, |
| temperature=0.2, |
| timeout=LLM_TIMEOUT_S, |
| max_retries=LLM_MAX_RETRIES, |
| ) |
|
|
|
|
| def _groq_ollama_enabled() -> bool: |
| return bool(GROQ_API_KEY) and ChatGroq is not None and bool(GROQ_OLLAMA_MODEL_NAME) |
|
|
|
|
| def _groq_gpt_enabled() -> bool: |
| return bool(GROQ_API_KEY) and ChatGroq is not None and bool(GROQ_GPT_MODEL_NAME) |
|
|
|
|
| def _groq_judge_enabled() -> bool: |
| return bool(GROQ_API_KEY) and ChatGroq is not None and bool(GROQ_JUDGE_MODEL_NAME) |
|
|
|
|
| def _to_float(value: object, fallback: float = 0.0) -> float: |
| try: |
| return float(value) |
| except (TypeError, ValueError): |
| return float(fallback) |
|
|
|
|
| def _clamp_int(value: object, low: int, high: int, fallback: int) -> int: |
| try: |
| casted = int(value) |
| except (TypeError, ValueError): |
| casted = int(fallback) |
| return max(low, min(high, casted)) |
|
|
|
|
| def _cluster_metrics(labels: np.ndarray) -> dict: |
| labels_arr = np.array(labels, dtype=np.int32) |
| n_sentences = int(labels_arr.shape[0]) |
| noise_count = int((labels_arr == -1).sum()) |
| unique_ids = sorted(filter(lambda v: v != -1, set(labels_arr.tolist()))) |
| sizes = list(map(lambda cid: int((labels_arr == cid).sum()), unique_ids)) |
|
|
| if sizes: |
| min_size = float(np.min(sizes)) |
| median_size = float(np.median(sizes)) |
| mean_size = float(np.mean(sizes)) |
| max_size = float(np.max(sizes)) |
| else: |
| min_size = 0.0 |
| median_size = 0.0 |
| mean_size = 0.0 |
| max_size = 0.0 |
|
|
| return { |
| "n_sentences": n_sentences, |
| "n_clusters": int(len(unique_ids)), |
| "noise_ratio": float(noise_count) / float(max(1, n_sentences)), |
| "min_size": min_size, |
| "median_size": median_size, |
| "mean_size": mean_size, |
| "max_size": max_size, |
| } |
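
| # Worked example (illustrative): labels [-1, 0, 0, 1] give two clusters of
| # sizes [2, 1] plus one noise point, so
| #   _cluster_metrics(np.array([-1, 0, 0, 1]))
| #     -> n_sentences 4, n_clusters 2, noise_ratio 0.25,
| #        min_size 1.0, median_size 1.5, mean_size 1.5, max_size 2.0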
|
|
|
|
| def _heuristic_hdbscan_tweak(metrics: dict, params: dict) -> dict: |
| n_clusters = int(metrics.get("n_clusters", 0)) |
| noise_ratio = float(metrics.get("noise_ratio", 0.0)) |
|
|
| min_cluster_size = int(params.get("min_cluster_size", HDBSCAN_MIN_CLUSTER_SIZE)) |
| max_cluster_size = int(params.get("max_cluster_size", HDBSCAN_MAX_CLUSTER_SIZE)) |
| min_samples = int(params.get("min_samples", HDBSCAN_MIN_SAMPLES)) |
|
|
| action = "accept" |
| reasoning = "Cluster metrics are within target ranges." |
|
|
| if n_clusters < OPTIMIZE_TARGET_CLUSTER_MIN: |
| min_cluster_size = max( |
| OPTIMIZE_MIN_CLUSTER_SIZE_MIN, |
| int(round(min_cluster_size * 0.8)), |
| ) |
| min_samples = max(OPTIMIZE_MIN_SAMPLES_MIN, min_samples - 1) |
| action = "tweak" |
| reasoning = "Too few clusters; reducing min_cluster_size and min_samples." |
| elif n_clusters > OPTIMIZE_TARGET_CLUSTER_MAX: |
| min_cluster_size = min( |
| OPTIMIZE_MIN_CLUSTER_SIZE_MAX, |
| int(round(min_cluster_size * 1.2)), |
| ) |
| min_samples = min(OPTIMIZE_MIN_SAMPLES_MAX, min_samples + 1) |
| action = "tweak" |
| reasoning = "Too many clusters; increasing min_cluster_size and min_samples." |
| elif noise_ratio > OPTIMIZE_TARGET_NOISE_MAX: |
| min_cluster_size = max( |
| OPTIMIZE_MIN_CLUSTER_SIZE_MIN, |
| int(round(min_cluster_size * 0.85)), |
| ) |
| min_samples = max(OPTIMIZE_MIN_SAMPLES_MIN, min_samples - 1) |
| action = "tweak" |
| reasoning = "Noise ratio is high; lowering min_cluster_size and min_samples." |
|
|
| return { |
| "action": action, |
| "min_cluster_size": min_cluster_size, |
| "max_cluster_size": max_cluster_size, |
| "min_samples": min_samples, |
| "reasoning": reasoning, |
| } |
|
|
|
|
| def _normalize_hdbscan_suggestion(suggestion: dict, current: dict) -> dict: |
| action = str(suggestion.get("action", "accept")).strip().lower() |
| action = action if action in {"accept", "tweak"} else "accept" |
|
|
| min_cluster_size = _clamp_int( |
| suggestion.get("min_cluster_size", current.get("min_cluster_size")), |
| OPTIMIZE_MIN_CLUSTER_SIZE_MIN, |
| OPTIMIZE_MIN_CLUSTER_SIZE_MAX, |
| current.get("min_cluster_size", HDBSCAN_MIN_CLUSTER_SIZE), |
| ) |
| max_cluster_size = _clamp_int( |
| suggestion.get("max_cluster_size", current.get("max_cluster_size")), |
| OPTIMIZE_MAX_CLUSTER_SIZE_MIN, |
| OPTIMIZE_MAX_CLUSTER_SIZE_MAX, |
| current.get("max_cluster_size", HDBSCAN_MAX_CLUSTER_SIZE), |
| ) |
| min_samples = _clamp_int( |
| suggestion.get("min_samples", current.get("min_samples")), |
| OPTIMIZE_MIN_SAMPLES_MIN, |
| OPTIMIZE_MIN_SAMPLES_MAX, |
| current.get("min_samples", HDBSCAN_MIN_SAMPLES), |
| ) |
|
|
| if max_cluster_size < min_cluster_size: |
| max_cluster_size = min_cluster_size + 1 |
|
|
| return { |
| "action": action, |
| "min_cluster_size": min_cluster_size, |
| "max_cluster_size": max_cluster_size, |
| "min_samples": min_samples, |
| "reasoning": str(suggestion.get("reasoning", "")).strip(), |
| } |
|
|
|
|
| def _metrics_in_target(metrics: dict) -> bool: |
| n_clusters = int(metrics.get("n_clusters", 0)) |
| noise_ratio = float(metrics.get("noise_ratio", 1.0)) |
| return ( |
| OPTIMIZE_TARGET_CLUSTER_MIN <= n_clusters <= OPTIMIZE_TARGET_CLUSTER_MAX |
| and noise_ratio <= OPTIMIZE_TARGET_NOISE_MAX |
| ) |
|
|
|
|
| def _optimization_score(metrics: dict) -> float: |
| n_clusters = int(metrics.get("n_clusters", 0)) |
| noise_ratio = float(metrics.get("noise_ratio", 1.0)) |
|
|
| if n_clusters < OPTIMIZE_TARGET_CLUSTER_MIN: |
| cluster_penalty = (OPTIMIZE_TARGET_CLUSTER_MIN - n_clusters) / max( |
| OPTIMIZE_TARGET_CLUSTER_MIN, |
| 1, |
| ) |
| elif n_clusters > OPTIMIZE_TARGET_CLUSTER_MAX: |
| cluster_penalty = (n_clusters - OPTIMIZE_TARGET_CLUSTER_MAX) / max( |
| OPTIMIZE_TARGET_CLUSTER_MAX, |
| 1, |
| ) |
| else: |
| cluster_penalty = 0.0 |
|
|
| noise_penalty = max(0.0, noise_ratio - OPTIMIZE_TARGET_NOISE_MAX) / max( |
| OPTIMIZE_TARGET_NOISE_MAX, |
| 1e-6, |
| ) |
|
|
| return 1.0 - min(1.0, cluster_penalty + noise_penalty) |
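
| # Worked example (with the module defaults above): 10 clusters and a 0.60
| # noise ratio give cluster_penalty (20 - 10) / 20 = 0.5 and noise_penalty
| # (0.60 - 0.50) / 0.50 = 0.2, so the score is 1.0 - min(1.0, 0.7) = 0.3.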
|
|
|
|
| def _load_sentence_meta(run_key: str, sentences: list[str]) -> list[dict]: |
| meta_path = OUTPUT_DIR / run_key / "sentence_meta.json" |
| if not meta_path.exists(): |
| return [ |
| { |
| "sentence": s, |
| "paper_title": "", |
| "paper_id": None, |
| } |
| for s in sentences |
| ] |
|
|
| meta = _load_json(meta_path) |
| if not isinstance(meta, list): |
| return [ |
| { |
| "sentence": s, |
| "paper_title": "", |
| "paper_id": None, |
| } |
| for s in sentences |
| ] |
|
|
| if len(meta) != len(sentences): |
| return [ |
| { |
| "sentence": s, |
| "paper_title": "", |
| "paper_id": None, |
| } |
| for s in sentences |
| ] |
|
|
| return meta |
|
|
|
|
| def _top_papers_for_mask(meta: list[dict], mask: np.ndarray, k: int = 3) -> dict: |
| counts: dict[tuple[object, str], int] = {} |
| for idx, entry in enumerate(meta): |
| if not mask[idx]: |
| continue |
| paper_id = entry.get("paper_id") |
| title = str(entry.get("paper_title") or entry.get("title") or "").strip() |
| if not title: |
| title = f"Paper {paper_id}" if paper_id is not None else "Unknown" |
| key = (paper_id, title) |
| counts[key] = counts.get(key, 0) + 1 |
|
|
| ordered = sorted( |
| counts.items(), |
| key=lambda kv: (-kv[1], str(kv[0][1]).lower()), |
| ) |
|
|
| top = [ |
| {"paper_id": pid, "paper_title": title, "count": count} |
| for (pid, title), count in ordered[:k] |
| ] |
|
|
| return { |
| "paper_count": int(len(counts)), |
| "top_papers": top, |
| } |
|
|
|
|
| def _is_transient_provider_error(exc: Exception) -> bool: |
| """Detect transient provider outages (Mistral/Groq) that should be retried.""" |
| msg = str(exc).lower() |
| return ( |
| "unreachable_backend" in msg |
| or "internal server error" in msg |
| or '"code":"1100"' in msg |
| or '"raw_status_code":503' in msg |
| or '"raw_status_code":502' in msg |
| or '"raw_status_code":504' in msg |
| or '"status":503' in msg |
| or '"status":502' in msg |
| or '"status":504' in msg |
| or '"status":429' in msg |
| or "too many requests" in msg |
| or "rate limit" in msg |
| or "gateway timeout" in msg |
| or "service unavailable" in msg |
| ) |
|
|
|
|
| def _is_rate_limit_error(exc: Exception) -> bool: |
| msg = str(exc).lower() |
| return ( |
| "rate limit" in msg |
| or "too many requests" in msg |
| or '"raw_status_code":429' in msg |
| or '"status":429' in msg |
| or "status code: 429" in msg |
| ) |
|
|
|
|
| def _invoke_with_retries(fn): |
| """Run an LLM call with bounded linear backoff on transient provider errors.""" |
| last_exc: Exception | None = None |
| for attempt in range(PROVIDER_RETRY_ATTEMPTS): |
| try: |
| return fn() |
| except Exception as exc: |
| if not _is_transient_provider_error(exc): |
| raise |
| last_exc = exc |
| if attempt < PROVIDER_RETRY_ATTEMPTS - 1: |
| delay = PROVIDER_RETRY_BASE_DELAY_S * (attempt + 1) |
| if _is_rate_limit_error(exc): |
| delay = max(delay, PROVIDER_RETRY_RATE_LIMIT_DELAY_S * (attempt + 1)) |
| time.sleep(min(PROVIDER_RETRY_MAX_DELAY_S, delay)) |
| continue |
| raise last_exc |
|
|
| raise RuntimeError("Unexpected retry flow in _invoke_with_retries") |
|
|
|
|
| def _save_json(path: Path, data: object) -> None: |
| path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8") |
|
|
|
|
| def _load_json(path: Path) -> object: |
| return json.loads(path.read_text(encoding="utf-8")) |
|
|
|
|
| |
| |
| |
|
|
| def _chart_intertopic(summaries: list[dict]) -> go.Figure: |
| df = pd.DataFrame(summaries) |
| return px.scatter( |
| df, |
| x="cx", y="cy", |
| size="size", |
| text="cluster_id", |
| color="size", |
| color_continuous_scale="Blues", |
| title="Intertopic Distance Map", |
| labels={"cx": "Dim-1", "cy": "Dim-2", "size": "Sentences"}, |
| template="plotly_dark", |
| ) |
|
|
|
|
| def _chart_top_words(summaries: list[dict]) -> go.Figure: |
| df = ( |
| pd.DataFrame(summaries) |
| .nlargest(20, "size") |
| .assign(label=lambda d: d["cluster_id"].astype(str)) |
| ) |
| return px.bar( |
| df, |
| x="size", y="label", |
| orientation="h", |
| title="Top Clusters by Sentence Count", |
| labels={"size": "Sentences", "label": "Cluster"}, |
| color="size", |
| color_continuous_scale="Teal", |
| template="plotly_dark", |
| ) |
|
|
|
|
| def _chart_hierarchy(labels: list[int], embeddings: np.ndarray) -> go.Figure: |
| unique = sorted(filter(lambda v: v != -1, set(labels))) |
| if not unique: |
| fig = go.Figure() |
| fig.update_layout(title="Cluster Hierarchy", template="plotly_dark") |
| return fig |
| labels_arr = np.array(labels) |
| centroids = np.vstack([ |
| _centroid(embeddings[labels_arr == lbl]) |
| for lbl in unique |
| ]) |
|     # dist_mat is already a pairwise cosine-distance matrix, so pass a distfun
|     # that condenses it rather than letting create_dendrogram re-measure
|     # distances between its rows.
|     dist_mat = 1 - cosine_similarity(centroids)
|     fig = ff.create_dendrogram(
|         dist_mat,
|         labels=[str(lbl) for lbl in unique],
|         distfun=lambda m: squareform(m, checks=False),
|         colorscale=px.colors.sequential.Blues,
|     )
| fig.update_layout(title="Cluster Hierarchy", template="plotly_dark") |
| return fig |
|
|
|
|
| def _chart_heatmap(labels: list[int], embeddings: np.ndarray) -> go.Figure: |
| unique = sorted(filter(lambda v: v != -1, set(labels))) |
| if not unique: |
| fig = go.Figure() |
| fig.update_layout(title="Cluster Similarity Heatmap", template="plotly_dark") |
| return fig |
| labels_arr = np.array(labels) |
| centroids = np.vstack([ |
| _centroid(embeddings[labels_arr == lbl]) |
| for lbl in unique |
| ]) |
| sim_mat = cosine_similarity(centroids) |
| return px.imshow( |
| sim_mat, |
| x=[str(l) for l in unique], |
| y=[str(l) for l in unique], |
| color_continuous_scale="Blues", |
| title="Cluster Similarity Heatmap", |
| template="plotly_dark", |
| ) |
|
|
|
|
| def _save_chart(fig: go.Figure, path: Path) -> str: |
| fig.write_html(str(path), full_html=True, include_plotlyjs="cdn") |
| return str(path) |
|
|
|
|
| _OPTIMIZE_PROMPT = PromptTemplate.from_template( |
| """You are optimizing HDBSCAN clustering parameters for BERTopic. |
| |
| Current parameters: |
| min_cluster_size: {min_cluster_size} |
| max_cluster_size: {max_cluster_size} |
| min_samples: {min_samples} |
| |
| Clustering metrics: |
| n_sentences: {n_sentences} |
| n_clusters: {n_clusters} |
| noise_ratio: {noise_ratio} |
| min_size: {min_size} |
| median_size: {median_size} |
| mean_size: {mean_size} |
| max_size: {max_size} |
| |
| Constraints: |
| - Only adjust min_cluster_size, max_cluster_size, min_samples. |
| - Keep min_cluster_size within [{min_cluster_size_min}, {min_cluster_size_max}]. |
| - Keep max_cluster_size within [{max_cluster_size_min}, {max_cluster_size_max}]. |
| - Keep min_samples within [{min_samples_min}, {min_samples_max}]. |
| - Prefer n_clusters in [{target_cluster_min}, {target_cluster_max}]. |
| - Prefer noise_ratio <= {target_noise_max}. |
| |
| Return RAW JSON with exactly these keys: |
| action: "accept" or "tweak" |
| min_cluster_size: int |
| max_cluster_size: int |
| min_samples: int |
| reasoning: short sentence |
| |
| If clustering already looks good, set action="accept" and repeat the current values. |
| Respond with RAW JSON only. |
| """ |
| ) |
|
|
|
|
| def _recommend_hdbscan_params(metrics: dict, params: dict) -> dict: |
| if not MISTRAL_API_KEY: |
| return _normalize_hdbscan_suggestion( |
| _heuristic_hdbscan_tweak(metrics, params), |
| params, |
| ) |
|
|
| chain = _OPTIMIZE_PROMPT | _llm() | JsonOutputParser() |
|
|
| payload = { |
| **metrics, |
| **params, |
| "min_cluster_size_min": OPTIMIZE_MIN_CLUSTER_SIZE_MIN, |
| "min_cluster_size_max": OPTIMIZE_MIN_CLUSTER_SIZE_MAX, |
| "max_cluster_size_min": OPTIMIZE_MAX_CLUSTER_SIZE_MIN, |
| "max_cluster_size_max": OPTIMIZE_MAX_CLUSTER_SIZE_MAX, |
| "min_samples_min": OPTIMIZE_MIN_SAMPLES_MIN, |
| "min_samples_max": OPTIMIZE_MIN_SAMPLES_MAX, |
| "target_cluster_min": OPTIMIZE_TARGET_CLUSTER_MIN, |
| "target_cluster_max": OPTIMIZE_TARGET_CLUSTER_MAX, |
| "target_noise_max": OPTIMIZE_TARGET_NOISE_MAX, |
| } |
|
|
| try: |
| suggestion = _invoke_with_retries(lambda: chain.invoke(payload)) |
| except Exception: |
| suggestion = {} |
|
|
| if not isinstance(suggestion, dict) or not suggestion: |
| suggestion = _heuristic_hdbscan_tweak(metrics, params) |
|
|
| return _normalize_hdbscan_suggestion(suggestion, params) |
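
| # Shape of the normalised recommendation returned above (values illustrative;
| # the integers are clamped to the OPTIMIZE_* bounds defined earlier):
| #   {"action": "tweak", "min_cluster_size": 16, "max_cluster_size": 120,
| #    "min_samples": 4, "reasoning": "Too few clusters; reducing min_cluster_size."}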
|
|
|
|
| |
| |
| |
|
|
| @tool |
| def load_scopus_csv(filepath: str) -> dict: |
| """ |
| Load a Scopus-exported CSV and extract corpus statistics. |
| |
| Parameters |
| ---------- |
| filepath : str |
| Absolute or relative path to the CSV file. |
| |
| Returns |
| ------- |
|     dict with keys:
|         paper_count, abstract_sentence_count, title_sentence_count,
|         keywords_term_count, detected_columns,
|         columns, sample_abstracts, filepath
| """ |
| df = pd.read_csv(filepath).rename(columns=str.strip) |
|
|
| abstract_texts, abstract_col = _texts_for_candidates(df, RUN_CONFIGS["abstract"]) |
| title_texts, title_col = _texts_for_candidates(df, RUN_CONFIGS["title"]) |
| keywords_texts, keywords_col = _texts_for_candidates(df, RUN_CONFIGS["keywords"]) |
|
|
| titles_for_meta = ( |
| df[title_col].fillna("").astype(str).tolist() |
| if title_col |
| else [""] * len(df) |
| ) |
|
|
| def _build_sentences_and_meta(text_col: str | None, splitter) -> tuple[list[str], list[dict]]: |
| if not text_col: |
| return [], [] |
| texts = df[text_col].fillna("").astype(str).tolist() |
| sentences: list[str] = [] |
| meta: list[dict] = [] |
| for idx, (text, title) in enumerate(zip(texts, titles_for_meta), start=1): |
| parts = splitter(text) |
| if not parts: |
| continue |
| sentences.extend(parts) |
| meta.extend( |
| { |
| "sentence": part, |
| "paper_title": title or f"Paper {idx}", |
| "paper_id": idx, |
| } |
| for part in parts |
| ) |
| return sentences, meta |
|
|
| abstract_sentences, abstract_meta = _build_sentences_and_meta( |
| abstract_col, _split_sentences |
| ) |
| title_sentences, title_meta = _build_sentences_and_meta( |
| title_col, _split_sentences |
| ) |
| keywords_terms, keywords_meta = _build_sentences_and_meta( |
| keywords_col, _split_keywords |
| ) |
|
|
| _ensure_dir(OUTPUT_DIR / "abstract") |
| _ensure_dir(OUTPUT_DIR / "title") |
| _ensure_dir(OUTPUT_DIR / "keywords") |
|
|
| _save_json(OUTPUT_DIR / "abstract" / "sentences.json", abstract_sentences) |
| _save_json(OUTPUT_DIR / "abstract" / "sentence_meta.json", abstract_meta) |
| _save_json(OUTPUT_DIR / "title" / "sentences.json", title_sentences) |
| _save_json(OUTPUT_DIR / "title" / "sentence_meta.json", title_meta) |
| _save_json(OUTPUT_DIR / "keywords" / "sentences.json", keywords_terms) |
| _save_json(OUTPUT_DIR / "keywords" / "sentence_meta.json", keywords_meta) |
|
|
| df.to_csv(OUTPUT_DIR / "corpus.csv", index=False) |
|
|
| return { |
| "paper_count": int(len(df)), |
| "abstract_sentence_count": int(len(abstract_sentences)), |
| "title_sentence_count": int(len(title_sentences)), |
| "keywords_term_count": int(len(keywords_terms)), |
| "detected_columns": { |
| "abstract": abstract_col, |
| "title": title_col, |
| "keywords": keywords_col, |
| }, |
| "columns": df.columns.tolist(), |
| "sample_abstracts": abstract_texts[:3], |
| "filepath": str(filepath), |
| } |
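
| # Artefact layout written by load_scopus_csv (paths relative to OUTPUT_DIR;
| # the three sub-directories mirror the RUN_CONFIGS keys):
| #   corpus.csv
| #   abstract/sentences.json    abstract/sentence_meta.json
| #   title/sentences.json       title/sentence_meta.json
| #   keywords/sentences.json    keywords/sentence_meta.json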
|
|
|
|
| |
| |
| |
|
|
| @tool |
| def run_bertopic_discovery( |
| run_key: str, |
| threshold: float = DISTANCE_THRESH, |
| min_cluster_size: int = HDBSCAN_MIN_CLUSTER_SIZE, |
| max_cluster_size: int = HDBSCAN_MAX_CLUSTER_SIZE, |
| min_samples: int = HDBSCAN_MIN_SAMPLES, |
| auto_optimize: bool = AUTO_OPTIMIZE_CLUSTERS, |
| max_optimize_iters: int = OPTIMIZE_MAX_ITERS, |
| ) -> dict: |
| """ |
| Embed sentences, cluster with UMAP + HDBSCAN, extract evidence, |
| and generate four Plotly charts. |
| |
| Saved artefacts |
| --------------- |
| emb.npy : (N, D) float32 L2-normalised embeddings |
| sent_labels.npy : (N,) int32 per-sentence cluster label [BUG 1 FIX] |
| summaries.json : list of cluster dicts with evidence sentences |
| optimization.json : list of optimization rounds and metrics |
| |
| Parameters |
| ---------- |
| run_key : str — "abstract" or "title" or "keywords" |
| threshold : float — legacy arg (ignored by HDBSCAN) |
| min_cluster_size : int — HDBSCAN minimum cluster size |
| max_cluster_size : int — HDBSCAN maximum cluster size |
| min_samples : int — HDBSCAN min_samples |
| auto_optimize : bool — run LLM-guided optimization loop |
| max_optimize_iters : int — max optimization rounds after initial run |
| |
| Returns |
| ------- |
| dict with keys: |
| run_key, n_clusters, n_sentences, threshold, |
| chart_paths, summaries_path, embeddings_path, optimization_path |
| """ |
| if run_key not in RUN_CONFIGS: |
| return { |
| "run_key": run_key, |
| "n_clusters": 0, |
| "n_sentences": 0, |
| "threshold": threshold, |
| "chart_paths": {}, |
| "error": ( |
| f"Unsupported run_key: {run_key}. " |
| f"Use one of: {', '.join(RUN_CONFIGS.keys())}." |
| ), |
| } |
|
|
| rdir = _run_dir(run_key) |
| sent_path = OUTPUT_DIR / run_key / "sentences.json" |
| if not sent_path.exists(): |
| return { |
| "run_key": run_key, |
| "n_clusters": 0, |
| "n_sentences": 0, |
| "threshold": threshold, |
| "chart_paths": {}, |
| "error": ( |
| f"Missing sentences artifact: {sent_path}. " |
| "Run load_scopus_csv first." |
| ), |
| } |
|
|
| sentences = _load_json(sent_path) |
| if not sentences: |
| return { |
| "run_key": run_key, |
| "n_clusters": 0, |
| "n_sentences": 0, |
| "threshold": threshold, |
| "chart_paths": {}, |
| "error": ( |
| f"No sentences/terms found for run_key={run_key}. " |
| "Check that the corresponding source column exists in the CSV." |
| ), |
| } |
|
|
| sentence_meta = _load_sentence_meta(run_key, sentences) |
|
|
| emb_path = rdir / "emb.npy" |
| embeddings = None |
| if emb_path.exists(): |
| cached = np.load(str(emb_path)) |
| if cached.shape[0] == len(sentences): |
| embeddings = cached |
|
|
| if embeddings is None: |
| embeddings = _embed(sentences) |
| np.save(str(emb_path), embeddings) |
|
|
| cluster_space = _umap_reduce(embeddings, UMAP_N_COMPONENTS_CLUSTER) |
| umap_2d = _umap_reduce(embeddings, UMAP_N_COMPONENTS_VIZ) |
|
|
| def _run_hdbscan(params: dict) -> tuple[list[int], dict]: |
| labels_local = _cluster( |
| cluster_space, |
| min_cluster_size=int(params.get("min_cluster_size", HDBSCAN_MIN_CLUSTER_SIZE)), |
| max_cluster_size=int(params.get("max_cluster_size", HDBSCAN_MAX_CLUSTER_SIZE)), |
| min_samples=int(params.get("min_samples", HDBSCAN_MIN_SAMPLES)), |
| ).tolist() |
| return labels_local, _cluster_metrics(np.array(labels_local)) |
|
|
| current_params = { |
| "min_cluster_size": int(min_cluster_size), |
| "max_cluster_size": int(max_cluster_size), |
| "min_samples": int(min_samples), |
| } |
|
|
| labels, metrics = _run_hdbscan(current_params) |
| optimization_log = [ |
| { |
| "round": 0, |
| "params": current_params, |
| "metrics": metrics, |
| } |
| ] |
|
|
| best_score = _optimization_score(metrics) |
| stable_rounds = 0 |
|
|
| seen_params = {( |
| current_params["min_cluster_size"], |
| current_params["max_cluster_size"], |
| current_params["min_samples"], |
| )} |
|
|
| if bool(auto_optimize) and int(max_optimize_iters) > 0: |
| for round_idx in range(1, int(max_optimize_iters) + 1): |
| suggestion = _recommend_hdbscan_params(metrics, current_params) |
| if suggestion.get("action") == "accept": |
| optimization_log.append({ |
| "round": round_idx, |
| "params": current_params, |
| "metrics": metrics, |
| "action": "accept", |
| "reasoning": suggestion.get("reasoning", ""), |
| }) |
| break |
|
|
| next_params = { |
| "min_cluster_size": int(suggestion.get("min_cluster_size")), |
| "max_cluster_size": int(suggestion.get("max_cluster_size")), |
| "min_samples": int(suggestion.get("min_samples")), |
| } |
| next_key = ( |
| next_params["min_cluster_size"], |
| next_params["max_cluster_size"], |
| next_params["min_samples"], |
| ) |
| if next_key in seen_params: |
| optimization_log.append({ |
| "round": round_idx, |
| "params": current_params, |
| "metrics": metrics, |
| "action": "stop", |
| "reasoning": "Repeated parameter set; stopping optimization.", |
| }) |
| break |
|
|
| labels, metrics = _run_hdbscan(next_params) |
| optimization_log.append({ |
| "round": round_idx, |
| "params": next_params, |
| "metrics": metrics, |
| "reasoning": suggestion.get("reasoning", ""), |
| }) |
| current_params = next_params |
| seen_params.add(next_key) |
|
|
| score = _optimization_score(metrics) |
| if score <= best_score + OPTIMIZE_MIN_IMPROVEMENT: |
| stable_rounds += 1 |
| else: |
| best_score = score |
| stable_rounds = 0 |
|
|
| if _metrics_in_target(metrics): |
| break |
|
|
| if stable_rounds >= OPTIMIZE_STABLE_ROUNDS: |
| break |
|
|
| optimization_path = rdir / "optimization.json" |
| _save_json(optimization_path, optimization_log) |
|
|
| unique_ids = sorted(filter(lambda v: v != -1, set(labels))) |
|
|
|     # BUG 1 FIX: persist per-sentence cluster labels so consolidate_into_themes
|     # can mask emb.npy by cluster id when recomputing theme centroids.
| np.save(str(rdir / "sent_labels.npy"), np.array(labels, dtype=np.int32)) |
|
|
| labels_arr = np.array(labels) |
|
|
| if not unique_ids: |
| _save_json(rdir / "summaries.json", []) |
| return { |
| "run_key": run_key, |
| "n_clusters": 0, |
| "n_sentences": int(len(sentences)), |
| "threshold": threshold, |
| "min_cluster_size": int(current_params["min_cluster_size"]), |
| "max_cluster_size": int(current_params["max_cluster_size"]), |
| "min_samples": int(current_params["min_samples"]), |
| "chart_paths": {}, |
| "summaries_path": str(rdir / "summaries.json"), |
| "embeddings_path": str(rdir / "emb.npy"), |
| "optimization_path": str(optimization_path), |
| "error": "HDBSCAN produced no clusters (all points labeled as noise).", |
| } |
|
|
| def _cluster_summary(cid: int) -> dict: |
| mask = labels_arr == cid |
| c_emb = embeddings[mask] |
| c_umap = umap_2d[mask] |
| c_sent = list(np.array(sentences)[mask]) |
| ctroid = _centroid(c_emb) |
| top_idx = _top_k_indices(c_emb, ctroid, N_EVIDENCE) |
| coords = ( |
| c_umap.mean(axis=0) |
| if c_umap.shape[0] > 0 |
| else np.zeros(UMAP_N_COMPONENTS_VIZ, dtype=np.float32) |
| ) |
| paper_stats = _top_papers_for_mask(sentence_meta, mask, k=3) |
| return { |
| "cluster_id": int(cid), |
| "size": int(mask.sum()), |
| "cx": float(coords[0]), |
| "cy": float(coords[1]), |
| "evidence": list(np.array(c_sent)[top_idx]), |
| "paper_count": paper_stats.get("paper_count", 0), |
| "top_papers": paper_stats.get("top_papers", []), |
| } |
|
|
| summaries = list(map(_cluster_summary, unique_ids)) |
| _save_json(rdir / "summaries.json", summaries) |
|
|
| chart_paths = { |
| "Intertopic Map": _save_chart(_chart_intertopic(summaries), rdir / "intertopic.html"), |
| "Top Words": _save_chart(_chart_top_words(summaries), rdir / "topwords.html"), |
| "Hierarchy": _save_chart(_chart_hierarchy(labels, embeddings), rdir / "hierarchy.html"), |
| "Heatmap": _save_chart(_chart_heatmap(labels, embeddings), rdir / "heatmap.html"), |
| } |
|
|
| return { |
| "run_key": run_key, |
| "n_clusters": int(len(unique_ids)), |
| "n_sentences": int(len(sentences)), |
| "threshold": threshold, |
| "min_cluster_size": int(current_params["min_cluster_size"]), |
| "max_cluster_size": int(current_params["max_cluster_size"]), |
| "min_samples": int(current_params["min_samples"]), |
| "chart_paths": chart_paths, |
| "summaries_path": str(rdir / "summaries.json"), |
| "embeddings_path": str(rdir / "emb.npy"), |
| "optimization_path": str(optimization_path), |
| } |
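
| # Shape of one summaries.json entry written above (values illustrative):
| #   {"cluster_id": 7, "size": 42, "cx": 1.93, "cy": -0.41,
| #    "evidence": ["<up to N_EVIDENCE sentences nearest the centroid>", "..."],
| #    "paper_count": 18,
| #    "top_papers": [{"paper_id": 3, "paper_title": "...", "count": 5}]}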
|
|
|
|
| |
| |
| |
|
|
| _LABEL_PROMPT = PromptTemplate.from_template( |
| """You are an expert academic researcher specialising in Information Systems. |
| |
| Given the following cluster of research sentences, return a JSON object with EXACTLY these keys: |
| label : short research-area name (<= 6 words) |
| category : broader IS research category |
| confidence : float 0.0-1.0 |
| reasoning : one sentence explaining your choice |
| niche : boolean - true if highly specialised / narrow |
| |
| Cluster ID : {cluster_id} |
| Sentence count: {size} |
| Evidence sentences: |
| {evidence} |
| |
| Respond with RAW JSON only. No markdown, no explanation outside the JSON. |
| """ |
| ) |
|
|
|
|
| _LABEL_JUDGE_PROMPT = PromptTemplate.from_template( |
| """You are an expert label adjudicator. Choose the single best label from |
| the candidates below based on the evidence sentences. |
| |
| Cluster ID : {cluster_id} |
| Sentence count: {size} |
| Evidence sentences: |
| {evidence} |
| |
| Candidate labels: |
| 1) Mistral |
| Label: {mistral_label} |
| Category: {mistral_category} |
| Confidence: {mistral_confidence} |
| Reasoning: {mistral_reasoning} |
| |
| 2) Groq-Ollama |
| Label: {groq_ollama_label} |
| Category: {groq_ollama_category} |
| Confidence: {groq_ollama_confidence} |
| Reasoning: {groq_ollama_reasoning} |
| |
| 3) Groq-GPT |
| Label: {groq_gpt_label} |
| Category: {groq_gpt_category} |
| Confidence: {groq_gpt_confidence} |
| Reasoning: {groq_gpt_reasoning} |
| |
| Rules: |
| - Choose exactly one of the three labels. Do not invent a new label. |
| - Pick the label that best matches the evidence and is most specific. |
| - If two are equally good, prefer the one with higher confidence. |
| |
| Return RAW JSON with exactly these keys: |
| best_label: string |
| best_category: string |
| chosen_source: string # one of: mistral, groq_ollama, groq_gpt |
| best_reasoning: string |
| |
| Respond with RAW JSON only. |
| """ |
| ) |
|
|
|
|
| @tool |
| def label_topics_with_llm(run_key: str) -> dict: |
| """ |
| Label each cluster with Mistral only (default Phase 2 labeling pass). |
| |
| Parameters |
| ---------- |
| run_key : str — "abstract" or "title" or "keywords" |
| |
| Returns |
| ------- |
|     dict with keys:
|         run_key, labels_path, labelled_count, total_clusters,
|         selected_clusters, skipped_clusters, groq_enabled, mode_note,
|         labels_preview (list of dicts)
| """ |
| rdir = _run_dir(run_key) |
| summaries_path = rdir / "summaries.json" |
| if not summaries_path.exists(): |
| return { |
| "run_key": run_key, |
| "labels_path": str(rdir / "labels.json"), |
| "labelled_count": 0, |
| "total_clusters": 0, |
| "selected_clusters": 0, |
| "skipped_clusters": 0, |
| "labels_preview": [], |
| "error": ( |
| f"Missing discovery artifact: {summaries_path}. " |
| "Run run_bertopic_discovery first for this run_key." |
| ), |
| } |
|
|
| summaries = _load_json(summaries_path) |
|
|
| ranked = sorted( |
| filter(lambda s: s.get("size", 0) >= MIN_CLUSTER_SIZE_FOR_LABEL, summaries), |
| key=lambda s: s.get("size", 0), |
| reverse=True, |
| ) |
| selected = ranked[:MAX_LABEL_CLUSTERS] |
|
|
| chain_mistral = _LABEL_PROMPT | _llm() | JsonOutputParser() |
|
|
| def _evidence_block(summary: dict) -> str: |
| return "\n".join( |
| f" {i+1}. {s}" |
| for i, s in enumerate(summary["evidence"]) |
| ) |
|
|
| def _label_one(summary: dict) -> dict: |
| result = _invoke_with_retries(lambda: chain_mistral.invoke({ |
| "cluster_id": summary["cluster_id"], |
| "size": summary["size"], |
| "evidence": _evidence_block(summary), |
| })) |
|
|
| return { |
| **summary, |
| **result, |
| "mistral_label": result.get("label", ""), |
| "mistral_category": result.get("category", ""), |
| "mistral_confidence": _to_float(result.get("confidence"), 0.0), |
| "mistral_reasoning": result.get("reasoning", ""), |
| "mistral_niche": bool(result.get("niche", False)), |
| "groq_label": "", |
| "groq_category": "", |
| "groq_confidence": 0.0, |
| "groq_reasoning": "", |
| "groq_niche": False, |
| "groq_ollama_label": "", |
| "groq_ollama_category": "", |
| "groq_ollama_confidence": 0.0, |
| "groq_ollama_reasoning": "", |
| "groq_ollama_niche": False, |
| "groq_gpt_label": "", |
| "groq_gpt_category": "", |
| "groq_gpt_confidence": 0.0, |
| "groq_gpt_reasoning": "", |
| "groq_gpt_niche": False, |
| "verification_done": False, |
| "verification_done_ollama": False, |
| "verification_done_gpt": False, |
| "verification_note": ( |
| "Run VERIFY in Phase 2 to compare with Groq-Ollama and Groq-GPT labels." |
| ), |
| } |
|
|
| labelled = list(map(_label_one, selected)) |
| _save_json(rdir / "labels.json", labelled) |
|
|
| |
| preview = list(map( |
| lambda r: { |
| "cluster_id": r.get("cluster_id"), |
| "label": r.get("label"), |
| "category": r.get("category"), |
| "confidence": r.get("confidence"), |
| "mistral_label": r.get("mistral_label", ""), |
| "groq_label": r.get("groq_label", ""), |
| "groq_ollama_label": r.get("groq_ollama_label", r.get("groq_label", "")), |
| "groq_gpt_label": r.get("groq_gpt_label", ""), |
| "size": r.get("size"), |
| "niche": r.get("niche", False), |
| }, |
| labelled[:MAX_TOOL_RETURN_PREVIEW], |
| )) |
|
|
| return { |
| "run_key": run_key, |
| "labels_path": str(rdir / "labels.json"), |
| "labelled_count": len(labelled), |
| "total_clusters": len(summaries), |
| "selected_clusters": len(selected), |
| "skipped_clusters": max(0, len(summaries) - len(selected)), |
| "groq_enabled": _groq_ollama_enabled() and _groq_gpt_enabled(), |
| "mode_note": "Single-model labeling complete (Mistral). Send VERIFY in Phase 2 to run Groq-Ollama and Groq-GPT verification.", |
| "labels_preview": preview, |
| } |
|
|
|
|
| @tool |
| def verify_topic_labels_with_groq(run_key: str) -> dict: |
| """ |
|     Run Groq labeling for already-labeled topics and append comparison fields
|     into labels.json so the UI review table can show Mistral vs Groq-Ollama vs
|     Groq-GPT labels, plus an adjudicated best label when GROQ_JUDGE_MODEL_NAME
|     is configured.
| |
| Parameters |
| ---------- |
| run_key : str — "abstract" or "title" or "keywords" |
| |
| Returns |
| ------- |
| dict with keys: |
| run_key, labels_path, verification_path, verified_count, labels_preview |
| """ |
| rdir = _run_dir(run_key) |
| labels_path = rdir / "labels.json" |
| summaries_path = rdir / "summaries.json" |
|
|
| if not _groq_ollama_enabled() or not _groq_gpt_enabled(): |
| return { |
| "run_key": run_key, |
| "labels_path": str(labels_path), |
| "verified_count": 0, |
| "labels_preview": [], |
| "error": ( |
| "GROQ_API_KEY or Groq model config is missing, or langchain-groq is unavailable. " |
| "Set GROQ_API_KEY and GROQ_GPT_MODEL_NAME (and optionally GROQ_OLLAMA_MODEL_NAME) " |
| "and install requirements to use VERIFY." |
| ), |
| } |
|
|
| if not labels_path.exists(): |
| return { |
| "run_key": run_key, |
| "labels_path": str(labels_path), |
| "verified_count": 0, |
| "labels_preview": [], |
| "error": ( |
| f"Missing labels artifact: {labels_path}. " |
| "Run label_topics_with_llm first." |
| ), |
| } |
|
|
| if not summaries_path.exists(): |
| return { |
| "run_key": run_key, |
| "labels_path": str(labels_path), |
| "verified_count": 0, |
| "labels_preview": [], |
| "error": ( |
| f"Missing summaries artifact: {summaries_path}. " |
| "Run run_bertopic_discovery first." |
| ), |
| } |
|
|
| labels_data = _load_json(labels_path) |
| summaries = _load_json(summaries_path) |
| summary_by_id = { |
| int(s.get("cluster_id", -1)): s |
| for s in summaries |
| } |
|
|
| target_rows = list(filter( |
| lambda r: int(r.get("cluster_id", -1)) in summary_by_id, |
| labels_data, |
| )) |
|
|
| chain_groq_ollama = _LABEL_PROMPT | _llm_groq(GROQ_OLLAMA_MODEL_NAME) | JsonOutputParser() |
| chain_groq_gpt = _LABEL_PROMPT | _llm_groq(GROQ_GPT_MODEL_NAME) | JsonOutputParser() |
| chain_judge = ( |
| _LABEL_JUDGE_PROMPT | _llm_groq(GROQ_JUDGE_MODEL_NAME) | JsonOutputParser() |
| if _groq_judge_enabled() |
| else None |
| ) |
|
|
| def _evidence_block(summary: dict) -> str: |
| return "\n".join( |
| f" {i+1}. {s}" |
| for i, s in enumerate(summary.get("evidence", [])) |
| ) |
|
|
| def _label_with_groq(row: dict) -> tuple[int, dict, dict]: |
| cid = int(row.get("cluster_id", -1)) |
| summary = summary_by_id[cid] |
| payload = { |
| "cluster_id": summary["cluster_id"], |
| "size": summary["size"], |
| "evidence": _evidence_block(summary), |
| } |
| groq_ollama = _invoke_with_retries(lambda: chain_groq_ollama.invoke(payload)) |
| groq_gpt = _invoke_with_retries(lambda: chain_groq_gpt.invoke(payload)) |
| return cid, groq_ollama, groq_gpt |
|
|
| groq_pairs = list(map(_label_with_groq, target_rows)) |
| groq_ollama_by_id = {cid: data for cid, data, _ in groq_pairs} |
| groq_gpt_by_id = {cid: data for cid, _, data in groq_pairs} |
|
|
| def _judge_label(row: dict) -> tuple[int, dict]: |
| if chain_judge is None: |
| return int(row.get("cluster_id", -1)), {} |
| cid = int(row.get("cluster_id", -1)) |
| summary = summary_by_id[cid] |
| groq_ollama = groq_ollama_by_id.get(cid, {}) |
| groq_gpt = groq_gpt_by_id.get(cid, {}) |
| payload = { |
| "cluster_id": summary.get("cluster_id"), |
| "size": summary.get("size"), |
| "evidence": _evidence_block(summary), |
| "mistral_label": str(row.get("mistral_label") or row.get("label", "")).strip(), |
| "mistral_category": str(row.get("mistral_category") or row.get("category", "")).strip(), |
| "mistral_confidence": _to_float(row.get("mistral_confidence", row.get("confidence", 0.0)), 0.0), |
| "mistral_reasoning": str(row.get("mistral_reasoning") or row.get("reasoning", "")).strip(), |
| "groq_ollama_label": str(groq_ollama.get("label", "")).strip(), |
| "groq_ollama_category": str(groq_ollama.get("category", "")).strip(), |
| "groq_ollama_confidence": _to_float(groq_ollama.get("confidence"), 0.0), |
| "groq_ollama_reasoning": str(groq_ollama.get("reasoning", "")).strip(), |
| "groq_gpt_label": str(groq_gpt.get("label", "")).strip(), |
| "groq_gpt_category": str(groq_gpt.get("category", "")).strip(), |
| "groq_gpt_confidence": _to_float(groq_gpt.get("confidence"), 0.0), |
| "groq_gpt_reasoning": str(groq_gpt.get("reasoning", "")).strip(), |
| } |
| try: |
| result = _invoke_with_retries(lambda: chain_judge.invoke(payload)) |
| except Exception: |
| result = {} |
| return cid, result |
|
|
| judge_pairs = list(map(_judge_label, target_rows)) if chain_judge else [] |
| judge_by_id = {cid: data for cid, data in judge_pairs} |
|
|
| def _merge_row(row: dict) -> dict: |
| cid = int(row.get("cluster_id", -1)) |
| groq_ollama = groq_ollama_by_id.get(cid, {}) |
| groq_gpt = groq_gpt_by_id.get(cid, {}) |
| adjudicated = judge_by_id.get(cid, {}) |
| has_groq_ollama = bool(groq_ollama) |
| has_groq_gpt = bool(groq_gpt) |
| mistral_label = str(row.get("mistral_label") or row.get("label", "")).strip() |
| groq_ollama_label = str(groq_ollama.get("label", "")).strip() |
| groq_gpt_label = str(groq_gpt.get("label", "")).strip() |
| adjudicated_label = str(adjudicated.get("best_label", "")).strip() |
| is_agreement = ( |
| all([mistral_label, groq_ollama_label, groq_gpt_label]) |
| and mistral_label.lower() == groq_ollama_label.lower() |
| and mistral_label.lower() == groq_gpt_label.lower() |
| ) |
|
|
| return { |
| **row, |
| "mistral_label": mistral_label, |
| "mistral_category": row.get("mistral_category") or row.get("category", ""), |
| "mistral_confidence": _to_float( |
| row.get("mistral_confidence", row.get("confidence", 0.0)), |
| 0.0, |
| ), |
| "mistral_reasoning": row.get("mistral_reasoning") or row.get("reasoning", ""), |
| "mistral_niche": bool(row.get("mistral_niche", row.get("niche", False))), |
| "groq_label": groq_ollama_label, |
| "groq_category": groq_ollama.get("category", ""), |
| "groq_confidence": _to_float(groq_ollama.get("confidence"), 0.0), |
| "groq_reasoning": groq_ollama.get("reasoning", ""), |
| "groq_niche": bool(groq_ollama.get("niche", False)), |
| "groq_ollama_label": groq_ollama_label, |
| "groq_ollama_category": groq_ollama.get("category", ""), |
| "groq_ollama_confidence": _to_float(groq_ollama.get("confidence"), 0.0), |
| "groq_ollama_reasoning": groq_ollama.get("reasoning", ""), |
| "groq_ollama_niche": bool(groq_ollama.get("niche", False)), |
| "groq_gpt_label": groq_gpt_label, |
| "groq_gpt_category": groq_gpt.get("category", ""), |
| "groq_gpt_confidence": _to_float(groq_gpt.get("confidence"), 0.0), |
| "groq_gpt_reasoning": groq_gpt.get("reasoning", ""), |
| "groq_gpt_niche": bool(groq_gpt.get("niche", False)), |
| "adjudicated_label": adjudicated_label, |
| "adjudicated_category": str(adjudicated.get("best_category", "")).strip(), |
| "adjudicated_reasoning": str(adjudicated.get("best_reasoning", "")).strip(), |
| "adjudicated_source": str(adjudicated.get("chosen_source", "")).strip(), |
| "adjudication_done": bool(adjudicated_label), |
| "adjudication_note": ( |
| "Adjudicated label available." |
| if adjudicated_label |
| else "Adjudication unavailable for this topic." |
| ), |
| "verification_done": has_groq_ollama and has_groq_gpt, |
| "verification_done_ollama": has_groq_ollama, |
| "verification_done_gpt": has_groq_gpt, |
| "verification_note": ( |
| "Mistral, Groq-Ollama, and Groq-GPT labels match." |
| if is_agreement |
| else "Model labels differ. Review before approval." |
| ) |
| if has_groq_ollama and has_groq_gpt |
| else "Groq labeling unavailable for this topic.", |
| } |
|
|
| verified_rows = list(map(_merge_row, labels_data)) |
| verification_path = rdir / "labels_verification.json" |
| _save_json(labels_path, verified_rows) |
| _save_json(verification_path, verified_rows) |
|
|
| preview = list(map( |
| lambda r: { |
| "cluster_id": r.get("cluster_id"), |
| "mistral_label": r.get("mistral_label", ""), |
| "groq_ollama_label": r.get("groq_ollama_label", r.get("groq_label", "")), |
| "groq_gpt_label": r.get("groq_gpt_label", ""), |
| "adjudicated_label": r.get("adjudicated_label", ""), |
| "verification_note": r.get("verification_note", ""), |
| }, |
| verified_rows[:MAX_TOOL_RETURN_PREVIEW], |
| )) |
|
|
| verified_count = sum( |
| 1 |
| for row in verified_rows |
| if row.get("groq_ollama_label") and row.get("groq_gpt_label") |
| ) |
|
|
| return { |
| "run_key": run_key, |
| "labels_path": str(labels_path), |
| "verification_path": str(verification_path), |
| "verified_count": int(verified_count), |
| "labelled_count": int(len(verified_rows)), |
| "labels_preview": preview, |
| } |
|
|
|
|
| |
| |
| |
|
|
| @tool |
| def consolidate_into_themes(run_key: str, theme_map: dict) -> dict: |
| """ |
| Merge approved / renamed topics into consolidated themes and recompute |
| centroids from the actual merged-cluster embeddings. |
| |
| Parameters |
| ---------- |
| run_key : str — "abstract" or "title" or "keywords" |
| theme_map : dict — {new_theme_name: [cluster_id, ...], ...} |
| Only approved topics need appear here. |
| |
| Returns |
| ------- |
| dict with keys: |
| run_key, theme_count, themes_path, themes_preview (list of dicts) |
| """ |
|     rdir = _run_dir(run_key)
|     required = [rdir / "labels.json", rdir / "emb.npy", rdir / "sent_labels.npy"]
|     missing = [str(p) for p in required if not p.exists()]
|     if missing:
|         return {
|             "run_key": run_key,
|             "theme_count": 0,
|             "themes_path": str(rdir / "themes.json"),
|             "themes_preview": [],
|             "error": (
|                 "Missing artefacts: " + ", ".join(missing) + ". "
|                 "Run run_bertopic_discovery and label_topics_with_llm first."
|             ),
|         }
|     labels_data = _load_json(rdir / "labels.json")
|     embeddings = np.load(str(rdir / "emb.npy"))
|     sent_labels = np.load(str(rdir / "sent_labels.npy"))
|
|
|     # Index labelled topics by cluster_id for quick lookup when building themes.
| label_idx = {item["cluster_id"]: item for item in labels_data} |
|
|
| def _build_theme(theme_name: str, cids: list[int]) -> dict: |
| """ |
| Build one consolidated theme from a list of cluster IDs. |
| |
| Evidence : top-N sentences pooled across all merged clusters |
| Centroid : L2-normalised mean of all embeddings in the merged set |
| Size : total sentence count across merged clusters |
| """ |
| member_labels = list(map(label_idx.get, cids)) |
|
|
| |
| all_evidence = reduce( |
| lambda acc, lbl: acc + lbl["evidence"], |
| filter(None, member_labels), |
| [], |
| ) |
|
|
| |
| total_size = reduce( |
| lambda acc, lbl: acc + lbl.get("size", 0), |
| filter(None, member_labels), |
| 0, |
| ) |
|
|
|         # BUG 1 FIX: select every embedding belonging to the merged cluster ids.
| cluster_mask = np.isin(sent_labels, np.array(cids, dtype=np.int32)) |
| theme_embeddings = embeddings[cluster_mask] |
|
|
|         # Centroid of the merged embeddings; zero vector when the mask is empty.
| theme_centroid = ( |
| _centroid(theme_embeddings) |
| if theme_embeddings.shape[0] > 0 |
| else np.zeros(embeddings.shape[1], dtype=np.float32) |
| ) |
|
|
| return { |
| "theme_name": theme_name, |
| "cluster_ids": cids, |
| "size": total_size, |
| "evidence": all_evidence[:N_EVIDENCE], |
| "centroid": theme_centroid.tolist(), |
| "sub_labels": list(map( |
| itemgetter("label"), |
| filter(None, member_labels), |
| )), |
| } |
|
|
| themes = list(map( |
| lambda kv: _build_theme(kv[0], kv[1]), |
| theme_map.items(), |
| )) |
|
|
| _save_json(rdir / "themes.json", themes) |
|
|
| preview = list(map( |
| lambda t: { |
| "theme_name": t.get("theme_name"), |
| "size": t.get("size", 0), |
| "cluster_count": len(t.get("cluster_ids", [])), |
| }, |
| themes[:MAX_TOOL_RETURN_PREVIEW], |
| )) |
|
|
| return { |
| "run_key": run_key, |
| "theme_count": len(themes), |
| "themes_path": str(rdir / "themes.json"), |
| "themes_preview": preview, |
| } |
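
| # Usage sketch (hedged — the cluster ids and theme names below are invented):
| # merge clusters 2 and 9 into one theme and keep cluster 4 as its own theme.
| #   consolidate_into_themes.invoke({
| #       "run_key": "abstract",
| #       "theme_map": {"AI Governance in IS": [2, 9], "Digital Health Adoption": [4]},
| #   })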
|
|
|
|
| |
| |
| |
|
|
| _TAXONOMY_PROMPT = PromptTemplate.from_template( |
| """You are an IS research taxonomist. Map the following research theme to the |
| PAJAIS taxonomy. Return RAW JSON with EXACTLY these keys: |
| theme_name : the input theme name (unchanged) |
| pajais_match : best matching PAJAIS category OR the string "NOVEL" |
| confidence : float 0.0-1.0 |
| reasoning : one sentence |
| is_novel : boolean |
| |
| PAJAIS categories: |
| {taxonomy} |
| |
| Theme to map: |
| Name : {theme_name} |
| Evidence : {evidence} |
| |
| Respond with RAW JSON only. No markdown. |
| """ |
| ) |
|
|
|
|
| @tool |
| def compare_with_taxonomy(run_key: str) -> dict: |
| """ |
| Map consolidated themes to PAJAIS taxonomy via Mistral. |
| |
| Parameters |
| ---------- |
| run_key : str — "abstract" or "title" or "keywords" |
| |
| Returns |
| ------- |
| dict with keys: |
| run_key, taxonomy_path, mapped_count, novel_count, mapping_preview |
| """ |
| rdir = _run_dir(run_key) |
| themes = _load_json(rdir / "themes.json") |
| chain = _TAXONOMY_PROMPT | _llm() | JsonOutputParser() |
|
|
| taxonomy_str = "\n".join(f" - {cat}" for cat in PAJAIS_TAXONOMY) |
|
|
| def _map_theme(theme: dict) -> dict: |
| result = _invoke_with_retries(lambda: chain.invoke({ |
| "taxonomy": taxonomy_str, |
| "theme_name": theme["theme_name"], |
| "evidence": " | ".join(theme.get("evidence", [])[:3]), |
| })) |
| return {**theme, **result} |
|
|
| taxonomy_map = list(map(_map_theme, themes)) |
| _save_json(rdir / "taxonomy_map.json", taxonomy_map) |
|
|
| novel_count = sum(1 for t in taxonomy_map if t.get("is_novel", False)) |
| mapped_count = len(taxonomy_map) - novel_count |
|
|
| preview = list(map( |
| lambda t: { |
| "theme_name": t.get("theme_name"), |
| "pajais_match": t.get("pajais_match", "NOVEL"), |
| "confidence": t.get("confidence", 0), |
| "is_novel": t.get("is_novel", False), |
| }, |
| taxonomy_map[:MAX_TOOL_RETURN_PREVIEW], |
| )) |
|
|
| return { |
| "run_key": run_key, |
| "taxonomy_path": str(rdir / "taxonomy_map.json"), |
| "mapped_count": mapped_count, |
| "novel_count": novel_count, |
| "mapping_preview": preview, |
| } |
|
|
|
|
| @tool |
| def verify_taxonomy_mapping_with_groq(run_key: str) -> dict: |
| """ |
| Run Groq validation for PAJAIS taxonomy mappings and persist side-by-side |
| Mistral/Groq mapping fields for each theme. |
| |
| Parameters |
| ---------- |
| run_key : str — "abstract" or "title" or "keywords" |
| |
| Returns |
| ------- |
| dict with keys: |
| run_key, taxonomy_path, verification_path, |
| verified_count, mapping_preview |
| """ |
| if not _groq_ollama_enabled(): |
| return { |
| "run_key": run_key, |
| "taxonomy_path": str(_run_dir(run_key) / "taxonomy_map.json"), |
| "verified_count": 0, |
| "mapping_preview": [], |
| "error": ( |
| "GROQ_API_KEY is missing or langchain-groq is unavailable. " |
| "Set GROQ_API_KEY and install requirements to use VERIFY." |
| ), |
| } |
|
|
| rdir = _run_dir(run_key) |
| themes_path = rdir / "themes.json" |
| taxonomy_path = rdir / "taxonomy_map.json" |
|
|
| if not themes_path.exists(): |
| return { |
| "run_key": run_key, |
| "taxonomy_path": str(taxonomy_path), |
| "verified_count": 0, |
| "mapping_preview": [], |
| "error": ( |
| f"Missing themes artifact: {themes_path}. " |
| "Run consolidate_into_themes first." |
| ), |
| } |
|
|
| if not taxonomy_path.exists(): |
| return { |
| "run_key": run_key, |
| "taxonomy_path": str(taxonomy_path), |
| "verified_count": 0, |
| "mapping_preview": [], |
| "error": ( |
| f"Missing taxonomy artifact: {taxonomy_path}. " |
| "Run compare_with_taxonomy first." |
| ), |
| } |
|
|
| themes = _load_json(themes_path) |
| taxonomy_map = _load_json(taxonomy_path) |
| taxonomy_str = "\n".join(f" - {cat}" for cat in PAJAIS_TAXONOMY) |
|
|
| chain_groq = _TAXONOMY_PROMPT | _llm_groq(GROQ_OLLAMA_MODEL_NAME) | JsonOutputParser() |
|
|
| def _map_theme_with_groq(theme: dict) -> dict: |
| return _invoke_with_retries(lambda: chain_groq.invoke({ |
| "taxonomy": taxonomy_str, |
| "theme_name": theme["theme_name"], |
| "evidence": " | ".join(theme.get("evidence", [])[:3]), |
| })) |
|
|
| groq_maps = list(map(_map_theme_with_groq, themes)) |
| groq_by_theme = { |
| str(item.get("theme_name", "")).strip(): item |
| for item in groq_maps |
| } |
|
|
| def _merge_mappings(mistral_row: dict) -> dict: |
| theme_name = str(mistral_row.get("theme_name", "")).strip() |
| groq_row = groq_by_theme.get(theme_name, {}) |
| groq_match = str(groq_row.get("pajais_match", "")).strip() |
| mistral_match = str(mistral_row.get("pajais_match", "")).strip() |
| is_same = bool(groq_match) and (groq_match.lower() == mistral_match.lower()) |
|
|
| return { |
| **mistral_row, |
| "mistral_pajais_match": mistral_match, |
| "mistral_confidence": _to_float( |
| mistral_row.get("mistral_confidence", mistral_row.get("confidence", 0.0)), |
| 0.0, |
| ), |
| "mistral_reasoning": str( |
| mistral_row.get("mistral_reasoning", mistral_row.get("reasoning", "")) |
| ), |
| "mistral_is_novel": bool( |
| mistral_row.get("mistral_is_novel", mistral_row.get("is_novel", False)) |
| ), |
| "groq_pajais_match": groq_match, |
| "groq_confidence": _to_float(groq_row.get("confidence"), 0.0), |
| "groq_reasoning": str(groq_row.get("reasoning", "")), |
| "groq_is_novel": bool(groq_row.get("is_novel", False)), |
| "taxonomy_verification_done": bool(groq_row), |
| "taxonomy_verification_note": ( |
| "Mistral and Groq taxonomy mapping match." |
| if is_same |
| else "Mistral and Groq taxonomy mapping differ." |
| ) if groq_row else "Groq taxonomy mapping unavailable for this theme.", |
| } |
|
|
| merged_rows = list(map(_merge_mappings, taxonomy_map)) |
| verification_path = rdir / "taxonomy_verification.json" |
| _save_json(taxonomy_path, merged_rows) |
| _save_json(verification_path, merged_rows) |
|
|
| preview = list(map( |
| lambda row: { |
| "theme_name": row.get("theme_name", ""), |
| "mistral_pajais_match": row.get("mistral_pajais_match", row.get("pajais_match", "")), |
| "groq_pajais_match": row.get("groq_pajais_match", ""), |
| "taxonomy_verification_note": row.get("taxonomy_verification_note", ""), |
| }, |
| merged_rows[:MAX_TOOL_RETURN_PREVIEW], |
| )) |
|
|
| verified_count = sum(1 for row in merged_rows if row.get("groq_pajais_match")) |
|
|
| return { |
| "run_key": run_key, |
| "taxonomy_path": str(taxonomy_path), |
| "verification_path": str(verification_path), |
| "verified_count": int(verified_count), |
| "mapped_count": int(len(merged_rows)), |
| "mapping_preview": preview, |
| } |
|
|
|
|
| |
| |
| |
|
|
| @tool |
| def generate_comparison_csv() -> dict: |
| """ |
| Side-by-side comparison of abstract/title/keywords theme mappings. |
| |
| Each run is optional. Missing runs produce empty columns. |
| |
| Returns |
| ------- |
| dict with keys: |
| csv_path, row_count, columns, preview (list of dicts) |
| """ |
| abstract_path = OUTPUT_DIR / "abstract" / "taxonomy_map.json" |
| title_path = OUTPUT_DIR / "title" / "taxonomy_map.json" |
| keywords_path = OUTPUT_DIR / "keywords" / "taxonomy_map.json" |
|
|
| abstract_map = _load_json(abstract_path) if abstract_path.exists() else [] |
| title_map = _load_json(title_path) if title_path.exists() else [] |
| keywords_map = _load_json(keywords_path) if keywords_path.exists() else [] |
|
|
| if not (abstract_map or title_map or keywords_map): |
| return { |
| "csv_path": str(OUTPUT_DIR / "comparison.csv"), |
| "row_count": 0, |
| "columns": [], |
| "preview": [], |
| "error": ( |
| "No taxonomy_map.json files found for abstract/title/keywords. " |
| "Run compare_with_taxonomy for at least one run first." |
| ), |
| } |
|
|
| def _row(a_theme: dict | None, t_theme: dict | None, k_theme: dict | None) -> dict: |
| return { |
| "Abstract Theme": a_theme.get("theme_name", "") if a_theme else "", |
| "Abstract PAJAIS": a_theme.get("pajais_match", "") if a_theme else "", |
| "Abstract Confidence": a_theme.get("confidence", 0) if a_theme else 0, |
| "Abstract Novel": a_theme.get("is_novel", False) if a_theme else False, |
| "Title Theme": t_theme.get("theme_name", "") if t_theme else "", |
| "Title PAJAIS": t_theme.get("pajais_match", "") if t_theme else "", |
| "Title Confidence": t_theme.get("confidence", 0) if t_theme else 0, |
| "Title Novel": t_theme.get("is_novel", False) if t_theme else False, |
| "Keywords Theme": k_theme.get("theme_name", "") if k_theme else "", |
| "Keywords PAJAIS": k_theme.get("pajais_match", "") if k_theme else "", |
| "Keywords Confidence": k_theme.get("confidence", 0) if k_theme else 0, |
| "Keywords Novel": k_theme.get("is_novel", False) if k_theme else False, |
| } |
|
|
| max_len = max(len(abstract_map), len(title_map), len(keywords_map), 1) |
| padded_a = abstract_map + [{}] * (max_len - len(abstract_map)) |
| padded_t = title_map + [{}] * (max_len - len(title_map)) |
| padded_k = keywords_map + [{}] * (max_len - len(keywords_map)) |
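|     # Worked example (hypothetical counts): with 5 abstract, 3 title and 4 keywords |
|     # mappings, max_len is 5, so title is padded with two empty dicts and keywords with |
|     # one; row i then pairs the i-th mapping of each run (alignment is by index, not by theme). |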
|
|
| rows = list(map(_row, padded_a, padded_t, padded_k)) |
| df = pd.DataFrame(rows) |
|
|
| out_path = OUTPUT_DIR / "comparison.csv" |
| df.to_csv(out_path, index=False) |
|
|
| return { |
| "csv_path": str(out_path), |
| "row_count": len(df), |
| "columns": df.columns.tolist(), |
| "preview": df.head(5).to_dict(orient="records"), |
| } |
|
|
|
|
| |
| |
| |
|
|
| _NARRATIVE_PROMPT = PromptTemplate.from_template( |
| """You are an academic researcher writing a methodology and findings section. |
| |
| Write a 500-word academic narrative describing the thematic analysis results below. |
| Structure: (1) methodology overview, (2) major themes found across runs, |
| (3) PAJAIS alignment, (4) novel contributions, (5) limitations. |
| |
| Use formal academic English. Do NOT use bullet points. |
| |
| Abstract themes & taxonomy: |
| {abstract_themes} |
| |
| Title themes & taxonomy: |
| {title_themes} |
| |
| Keywords themes & taxonomy: |
| {keywords_themes} |
| |
| Respond with plain text only. |
| """ |
| ) |
|
|
|
|
| @tool |
| def export_narrative(run_key: str) -> dict: |
| """ |
| Generate a 500-word academic narrative and save to narrative.txt. |
| |
| Parameters |
| ---------- |
| run_key : str — "abstract" or "title" or "keywords" (primary source) |
| |
| Returns |
| ------- |
| dict with keys: |
| narrative_path, word_count, preview (first 300 chars) |
| """ |
| rdir = _run_dir(run_key) |
| abstract_path = OUTPUT_DIR / "abstract" / "taxonomy_map.json" |
| title_path = OUTPUT_DIR / "title" / "taxonomy_map.json" |
| keywords_path = OUTPUT_DIR / "keywords" / "taxonomy_map.json" |
|
|
| abstract_map = _load_json(abstract_path) if abstract_path.exists() else [] |
| title_map = _load_json(title_path) if title_path.exists() else [] |
| keywords_map = _load_json(keywords_path) if keywords_path.exists() else [] |
|
|
| if not (abstract_map or title_map or keywords_map): |
| return { |
| "narrative_path": str(rdir / "narrative.txt"), |
| "word_count": 0, |
| "preview": "", |
| "error": ( |
| "No taxonomy mappings found for abstract/title/keywords. " |
| "Run compare_with_taxonomy before export_narrative." |
| ), |
| } |
|
|
| def _theme_summary(t: dict) -> str: |
| return ( |
| f" - {t.get('theme_name','?')} -> {t.get('pajais_match','?')} " |
|             f"(conf={_to_float(t.get('confidence', 0.0), 0.0):.2f}, novel={t.get('is_novel',False)})" |
| ) |
|
|
|     abstract_str = "\n".join(map(_theme_summary, abstract_map)) or "Not run." |
| title_str = "\n".join(map(_theme_summary, title_map)) or "Not run." |
| keywords_str = "\n".join(map(_theme_summary, keywords_map)) or "Not run." |
|
|
| chain = _NARRATIVE_PROMPT | _llm() |
| response = _invoke_with_retries(lambda: chain.invoke({ |
| "abstract_themes": abstract_str, |
| "title_themes": title_str, |
| "keywords_themes": keywords_str, |
| })) |
|
|
| narrative = response.content if hasattr(response, "content") else str(response) |
| out_path = rdir / "narrative.txt" |
| out_path.write_text(narrative, encoding="utf-8") |
|
|
| return { |
| "narrative_path": str(out_path), |
| "word_count": len(narrative.split()), |
| "preview": narrative[:300], |
| } |
|
|
|
|
| |
| |
| |
|
|
| def _extract_text_from_pdf(pdf_path: str) -> str: |
| """Extract all text from a PDF using PyMuPDF (text only, no images).""" |
|     doc = fitz.open(pdf_path) |
|     pages = [page.get_text("text") for page in doc] |
| doc.close() |
| return "\n".join(pages) |
|
|
|
|
| def _extract_title_from_pdf(full_text: str) -> str: |
| """Try to extract the paper title from the first few lines of text.""" |
| lines = full_text.strip().split("\n") |
| title_lines = [] |
| for line in lines[:10]: |
| stripped = line.strip() |
| if not stripped: |
| if title_lines: |
| break |
| continue |
| low = stripped.lower() |
| if low.startswith("abstract") or low.startswith("keyword"): |
| break |
| if len(stripped) > 10: |
| title_lines.append(stripped) |
| if len(title_lines) >= 2: |
| break |
| return " ".join(title_lines)[:200] if title_lines else "" |
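| # Illustrative sketch (hypothetical first page): given the lines |
| #   "Explainable AI Adoption in Healthcare:", "A Mixed-Methods Study", "", "Abstract ..." |
| # the heuristic collects the first two long lines and returns |
| #   "Explainable AI Adoption in Healthcare: A Mixed-Methods Study", |
| # stopping early at a blank line or an "Abstract"/"Keyword" heading. |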
|
|
|
|
| def _chunk_text(text: str, chunk_size: int = 12000, overlap: int = 1000) -> list[str]: |
| """Split text into chunks of `chunk_size` characters with `overlap`.""" |
| if not text: |
| return [] |
| chunks = [] |
| start = 0 |
| text_len = len(text) |
| while start < text_len: |
| end = start + chunk_size |
| chunks.append(text[start:end]) |
| if end >= text_len: |
| break |
| start = end - overlap |
| return chunks |
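| # Worked example: for a 25,000-character text with the default chunk_size=12000 and |
| # overlap=1000, the chunks are text[0:12000], text[11000:23000] and text[22000:25000], |
| # so consecutive chunks share 1,000 characters of context across each boundary. |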
|
|
|
|
| |
| _EXTRACT_METHODS_PROMPT = PromptTemplate.from_template( |
| """You are an expert IS research methodologist. Read this excerpt from a research |
| paper and identify ALL computational techniques used. |
| |
| The excerpt may come from methods or results. Use: |
| - explicit method statements ("this study uses", "we employed") |
| - analytical technique mentions in results (beta coefficients, BERT scores, LDA topics, network centrality) |
| - sample/data descriptions (N=, dataset, corpus) |
| - implicit method cues from results presentation (e.g., beta tables imply regression) |
| Do not guess beyond evidence in the excerpt. |
| |
| A "computational method" or "analytical technique" refers to specific algorithms, |
| statistical tests, machine learning models, NLP techniques, network measures, |
| or simulation/optimization approaches. |
| |
| Paper: {paper_name} |
| |
| Excerpt text: |
| {method_text} |
| |
| Return a JSON object with EXACTLY this key: |
| computational_methods : list of specific algorithms, models, or computational techniques found. |
|     Be very specific. DO NOT just say "Machine Learning"; name the exact algorithm. |
| Examples: ["Random Forest", "BERT", "K-means clustering", "LSTM", "XGBoost", |
| "LDA topic modeling", "PLS-SEM", "CB-SEM", "OLS Regression", "ANOVA", |
| "Network centrality", "Louvain community detection", "Sentiment Analysis (VADER)"] |
| Return an empty list [] if absolutely no specific computational |
| techniques or statistical models are mentioned. |
| |
| Respond with RAW JSON only. No markdown, no explanation. |
| """ |
| ) |
|
|
|
|
| @tool |
| def extract_methods_from_pdfs(pdf_dir: str) -> dict: |
| """ |
| Extract computational methods from each PDF paper. |
| |
| For each PDF: extract text (no images) → split into overlapping chunks → |
| send each chunk to Mistral LLM → aggregate identified methods per paper. |
| |
| Parameters |
| ---------- |
| pdf_dir : str — directory containing PDF files |
| |
| Returns |
| ------- |
| dict with keys: |
| n_papers, results (list of per-paper method dicts), csv_path |
| """ |
| pdf_dir_path = Path(pdf_dir) |
| if not pdf_dir_path.exists(): |
| return {"error": f"PDF directory not found: {pdf_dir}"} |
|
|
|     # De-duplicate in case a case-insensitive filesystem returns the same files for both globs. |
|     pdf_files = sorted(set( |
|         [str(p) for p in pdf_dir_path.glob("*.pdf")] |
|         + [str(p) for p in pdf_dir_path.glob("*.PDF")] |
|     )) |
| if not pdf_files: |
| return {"error": f"No PDF files found in {pdf_dir}"} |
|
|
| rdir = _ensure_dir(OUTPUT_DIR / "methods") |
|
|
| |
| paper_chunks = [] |
| for idx, pdf_path in enumerate(pdf_files, start=1): |
| try: |
| full_text = _extract_text_from_pdf(pdf_path) |
|             title = _extract_title_from_pdf(full_text) or Path(pdf_path).stem |
| chunks = _chunk_text(full_text) |
| |
| paper_chunks.append({ |
| "paper_id": idx, |
| "paper_filename": Path(pdf_path).stem, |
| "paper_title": title, |
| "chunks": chunks, |
| }) |
| except Exception as exc: |
| paper_chunks.append({ |
| "paper_id": idx, |
| "paper_filename": Path(pdf_path).stem, |
| "paper_title": Path(pdf_path).stem, |
| "chunks": [], |
| "error": str(exc), |
| }) |
|
|
| |
| if not MISTRAL_API_KEY: |
| return { |
| "n_papers": len(pdf_files), |
| "results": paper_chunks, |
| "error": "MISTRAL_API_KEY not set — extracted text chunks but cannot identify methods via LLM.", |
| } |
|
|
| chain = _EXTRACT_METHODS_PROMPT | _llm() | JsonOutputParser() |
| paper_results = [] |
|
|
| for entry in paper_chunks: |
| chunks = entry.get("chunks", []) |
| if not chunks: |
| paper_results.append({ |
| "paper_id": entry["paper_id"], |
| "paper_filename": entry["paper_filename"], |
| "paper_title": entry.get("paper_title", ""), |
| "computational_methods": [], |
| "extraction_note": "No text extracted", |
| }) |
| continue |
|
|
| all_comp_methods = set() |
|
|
| |
| for chunk in chunks: |
| if len(chunk) < 50: |
| continue |
| try: |
| result = _invoke_with_retries(lambda c=chunk: chain.invoke({ |
| "paper_name": entry.get("paper_title", entry.get("paper_filename", "")), |
| "method_text": c, |
| })) |
| |
| |
| cm = result.get("computational_methods", []) |
| if isinstance(cm, list): |
| for item in cm: |
| if isinstance(item, str) and item.strip(): |
| all_comp_methods.add(item.strip()) |
| elif isinstance(cm, str) and cm.strip(): |
| all_comp_methods.add(cm.strip()) |
|
|
|             except Exception: |
|                 # Skip chunks where LLM extraction fails; other chunks still contribute methods. |
|                 pass |
|
|
| paper_results.append({ |
| "paper_id": entry["paper_id"], |
| "paper_filename": entry["paper_filename"], |
| "paper_title": entry.get("paper_title", ""), |
| "computational_methods": sorted(list(all_comp_methods)), |
| "chunks_processed": len(chunks) |
| }) |
|
|
| |
| _save_json(rdir / "method_results.json", paper_results) |
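|     # Illustrative sketch (hypothetical values): method_results.json holds one dict per paper, e.g. |
|     #   {"paper_id": 3, "paper_filename": "smith_2023_ai_adoption", |
|     #    "paper_title": "AI Adoption in Small Enterprises", |
|     #    "computational_methods": ["BERT", "LDA topic modeling", "Logistic regression"], |
|     #    "chunks_processed": 4} |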
|
|
| |
| rows = [] |
| for r in paper_results: |
| comp_methods = r.get("computational_methods", []) |
| if isinstance(comp_methods, list): |
| comp_str = ", ".join(comp_methods) |
| else: |
| comp_str = str(comp_methods) |
| rows.append({ |
| "Paper ID": r.get("paper_id", ""), |
| "Paper Title": r.get("paper_title", r.get("paper_filename", "")), |
| "Computational Methods": comp_str, |
| }) |
|
|
| df = pd.DataFrame(rows) |
| csv_path = rdir / "method_summary.csv" |
| df.to_csv(csv_path, index=False) |
|
|
| def _clean_technique_name(name: str) -> str: |
| return re.sub(r"\s+", " ", name.strip()) |
|
|
| def _normalize_technique_key(name: str) -> str: |
| cleaned = re.sub(r"[^a-z0-9+ ]", " ", name.lower()) |
| cleaned = re.sub(r"\s+", " ", cleaned).strip() |
| cleaned = cleaned.replace("forests", "forest") |
| cleaned = cleaned.replace("trees", "tree") |
| cleaned = cleaned.replace("networks", "network") |
| cleaned = cleaned.replace("models", "model") |
| cleaned = cleaned.replace("transformers", "transformer") |
| cleaned = cleaned.replace("embeddings", "embedding") |
| cleaned = cleaned.replace("topics", "topic") |
| cleaned = cleaned.replace("measures", "measure") |
| return cleaned |
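|     # Illustrative sketch: _normalize_technique_key("Random Forests (RF)") lowercases the |
|     # string, strips punctuation and singularises common plurals, yielding "random forest rf", |
|     # which the ordered canonical patterns below can then match. |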
|
|
| canonical_patterns = [ |
| (re.compile(r"\bbert\b"), "BERT"), |
| (re.compile(r"\broberta\b"), "RoBERTa"), |
| (re.compile(r"\bxlm[- ]?roberta\b"), "XLM-RoBERTa"), |
| (re.compile(r"\bgpt[- ]?[0-9]*\b"), "GPT"), |
| (re.compile(r"\bt5\b"), "T5"), |
| (re.compile(r"\bword2vec\b"), "Word2Vec"), |
| (re.compile(r"\bglove\b"), "GloVe"), |
| (re.compile(r"\bdoc2vec\b"), "Doc2Vec"), |
| (re.compile(r"\bfasttext\b"), "fastText"), |
| (re.compile(r"\bspecter\b"), "SPECTER"), |
| (re.compile(r"\bsentence[- ]?transformer"), "Sentence-Transformers"), |
| (re.compile(r"\btf[- ]?idf\b"), "TF-IDF"), |
| (re.compile(r"\bbm25\b"), "BM25"), |
| (re.compile(r"\bbag of words\b|\bbow\b"), "Bag-of-words"), |
| (re.compile(r"\blda\b|\blatent dirichlet allocation\b"), "LDA topic modeling"), |
| (re.compile(r"\bnmf\b|\bnon[- ]?negative matrix factorization\b"), "NMF topic modeling"), |
| (re.compile(r"\blsa\b|\blsi\b|\blatent semantic analysis\b"), "LSA"), |
| (re.compile(r"\bbertopic\b"), "BERTopic"), |
| (re.compile(r"\bk[- ]?means\b"), "K-means clustering"), |
| (re.compile(r"\bhierarchical clustering\b"), "Hierarchical clustering"), |
| (re.compile(r"\bdbscan\b"), "DBSCAN"), |
| (re.compile(r"\bhdbscan\b"), "HDBSCAN"), |
| (re.compile(r"\bgmm\b|\bgaussian mixture\b"), "Gaussian mixture model"), |
| (re.compile(r"\bpca\b|\bprincipal component analysis\b"), "PCA"), |
| (re.compile(r"\bsvd\b|\bsingular value decomposition\b"), "SVD"), |
|         (re.compile(r"\bt[- ]?sne\b|\btsne\b"), "t-SNE"), |
| (re.compile(r"\bumap\b"), "UMAP"), |
| (re.compile(r"\bner\b|\bnamed entity recognition\b"), "Named entity recognition"), |
| (re.compile(r"\bsentiment\b"), "Sentiment analysis"), |
| (re.compile(r"\brandom forest\b"), "Random Forest"), |
| (re.compile(r"\bdecision tree\b"), "Decision Tree"), |
| (re.compile(r"\bgradient boosting\b|\bxgboost\b|\blightgbm\b|\bcatboost\b"), "Gradient boosting"), |
| (re.compile(r"\bsvm\b|\bsupport vector machine\b"), "SVM"), |
| (re.compile(r"\bknn\b|\bk[- ]?nearest neighbor\b"), "KNN"), |
| (re.compile(r"\bnaive bayes\b"), "Naive Bayes"), |
|         # Keep specific architectures before the generic neural-network pattern so they are not shadowed. |
|         (re.compile(r"\bcnn\b|\bconvolutional neural network\b"), "CNN"), |
|         (re.compile(r"\brnn\b|\brecurrent neural network\b"), "RNN"), |
|         (re.compile(r"\blstm\b"), "LSTM"), |
|         (re.compile(r"\bgru\b"), "GRU"), |
|         (re.compile(r"\bautoencoder\b"), "Autoencoder"), |
|         (re.compile(r"\bneural network\b|\bdeep learning\b|\bmlp\b"), "Neural networks"), |
| (re.compile(r"\btransformer\b"), "Transformers"), |
| (re.compile(r"\bfine[- ]?tuning\b"), "Model fine-tuning"), |
| (re.compile(r"\bpls[- ]?sem\b|\bpartial least squares\b"), "PLS-SEM"), |
| (re.compile(r"\bcb[- ]?sem\b|\bcovariance[- ]?based sem\b"), "CB-SEM"), |
| (re.compile(r"\bsem\b|\bstructural equation model\b"), "SEM"), |
| (re.compile(r"\bglmm\b|\bgeneralized linear mixed model\b"), "GLMM"), |
| (re.compile(r"\birt\b|\bitem response theory\b"), "IRT"), |
| (re.compile(r"\bbayesian\b"), "Bayesian inference"), |
| (re.compile(r"\bmediation\b"), "Mediation analysis"), |
| (re.compile(r"\bmoderation\b"), "Moderation analysis"), |
| (re.compile(r"\bchi[- ]?square\b|\bchi square\b"), "Chi-square test"), |
| (re.compile(r"\banova\b"), "ANOVA"), |
| (re.compile(r"\bmanova\b"), "MANOVA"), |
| (re.compile(r"\bancova\b"), "ANCOVA"), |
| (re.compile(r"\bmancova\b"), "MANCOVA"), |
| (re.compile(r"\bt[- ]?test\b"), "t-test"), |
| (re.compile(r"\bwilcoxon\b"), "Wilcoxon test"), |
| (re.compile(r"\bkruskal[- ]?wallis\b"), "Kruskal-Wallis test"), |
| (re.compile(r"\bfactor analysis\b"), "Factor analysis"), |
| (re.compile(r"\btime[- ]?series\b"), "Time-series analysis"), |
| (re.compile(r"\barima\b"), "ARIMA"), |
| (re.compile(r"\bsarima\b"), "SARIMA"), |
| (re.compile(r"\bvar\b|\bvector autoregression\b"), "VAR"), |
| (re.compile(r"\bprophet\b"), "Prophet"), |
| (re.compile(r"\bpanel regression\b|\bpanel data\b"), "Panel regression"), |
| (re.compile(r"\bfixed effects\b"), "Fixed-effects regression"), |
| (re.compile(r"\brandom effects\b"), "Random-effects regression"), |
| (re.compile(r"\bmultilevel\b|\bhierarchical linear model\b|\bhlm\b|\bmixed effects\b"), "Multilevel / mixed-effects regression"), |
| (re.compile(r"\bglm\b|\bgeneralized linear model\b"), "Generalized linear model"), |
| (re.compile(r"\bgls\b|\bgeneralized least squares\b"), "Generalized least squares"), |
| (re.compile(r"\bgee\b|\bgeneralized estimating equation\b"), "GEE"), |
| (re.compile(r"\bgmm\b|\bgeneralized method of moments\b"), "GMM"), |
| (re.compile(r"\b2sls\b|\btwo[- ]?stage least squares\b"), "2SLS"), |
| (re.compile(r"\b3sls\b|\bthree[- ]?stage least squares\b"), "3SLS"), |
| (re.compile(r"\binstrumental variable\b|\biv\b"), "Instrumental variables"), |
| (re.compile(r"\btobit\b"), "Tobit regression"), |
| (re.compile(r"\bheckman\b"), "Heckman selection model"), |
| (re.compile(r"\bpoisson\b"), "Poisson regression"), |
| (re.compile(r"\bnegative binomial\b"), "Negative binomial regression"), |
| (re.compile(r"\bprobit\b"), "Probit regression"), |
| (re.compile(r"\bsurvival analysis\b|\bcox\b|\bhazard model\b|\bkaplan[- ]?meier\b"), "Survival analysis"), |
| (re.compile(r"\blatent class analysis\b|\blca\b"), "Latent class analysis"), |
| (re.compile(r"\blatent profile analysis\b|\blpa\b"), "Latent profile analysis"), |
| (re.compile(r"\blogistic regression\b"), "Logistic regression"), |
|             (re.compile(r"\bols\b|\bordinary least squares\b|\blinear regression\b|\bmultiple regression\b"), "Linear regression (OLS)"), |
| (re.compile(r"\bridge regression\b|\bridge\b"), "Ridge regression"), |
| (re.compile(r"\blasso\b"), "LASSO regression"), |
| (re.compile(r"\belastic net\b"), "Elastic Net regression"), |
| (re.compile(r"\bregression\b"), "Regression"), |
| (re.compile(r"\bcentrality\b"), "Network centrality"), |
| (re.compile(r"\bcommunity detection\b|\blouvain\b|\bleiden\b"), "Community detection"), |
| (re.compile(r"\bergm\b|\bexponential random graph\b"), "ERGM"), |
| (re.compile(r"\blink prediction\b"), "Link prediction"), |
| (re.compile(r"\bpagerank\b|\bpage rank\b"), "PageRank"), |
| (re.compile(r"\bgraph neural network\b|\bgnn\b"), "Graph neural networks"), |
| (re.compile(r"\bhidden markov\b|\bhmm\b"), "Hidden Markov Model"), |
| (re.compile(r"\bmarkov chain\b|\bmarkov model\b"), "Markov models"), |
| (re.compile(r"\bkalman filter\b"), "Kalman filter"), |
| (re.compile(r"\bstate[- ]?space\b"), "State-space models"), |
| (re.compile(r"\bhawkes\b"), "Hawkes process"), |
| (re.compile(r"\brecommender\b|\bcollaborative filtering\b|\bmatrix factorization\b"), "Recommender systems"), |
| (re.compile(r"\bahp\b|\banalytic hierarchy process\b"), "AHP"), |
| (re.compile(r"\btopsis\b"), "TOPSIS"), |
| (re.compile(r"\bvikor\b"), "VIKOR"), |
| (re.compile(r"\bpromethee\b"), "PROMETHEE"), |
| (re.compile(r"\bdematel\b"), "DEMATEL"), |
| (re.compile(r"\bdea\b|\bdata envelopment analysis\b"), "DEA"), |
| (re.compile(r"\bsfa\b|\bstochastic frontier\b"), "SFA"), |
| (re.compile(r"\bagent[- ]?based\b"), "Agent-based simulation"), |
| (re.compile(r"\bmonte carlo\b"), "Monte Carlo simulation"), |
| (re.compile(r"\bbayesian optimization\b"), "Bayesian optimization"), |
| (re.compile(r"\blinear programming\b|\binteger programming\b|\bmixed integer\b"), "Mathematical optimization"), |
| (re.compile(r"\bgenetic algorithm\b"), "Genetic algorithms"), |
| (re.compile(r"\bsimulated annealing\b"), "Simulated annealing"), |
| ] |
|
|
| def _canonicalize_technique(name: str) -> tuple[str, str]: |
| cleaned = _normalize_technique_key(name) |
| for pattern, canonical in canonical_patterns: |
| if pattern.search(cleaned): |
| return canonical, canonical.lower() |
| display = " ".join(word.capitalize() for word in cleaned.split()) |
| display = display or _clean_technique_name(name) |
| return display, display.lower() |
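|     # Illustrative sketch (assumed free-text inputs): patterns are tried in order, so |
|     # specific variants win over generic ones, e.g. |
|     #   _canonicalize_technique("OLS regression")              -> ("Linear regression (OLS)", "linear regression (ols)") |
|     #   _canonicalize_technique("Latent Dirichlet Allocation") -> ("LDA topic modeling", "lda topic modeling") |
|     #   _canonicalize_technique("conjoint analysis")           -> ("Conjoint Analysis", "conjoint analysis")  # fallback title-casing |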
|
|
| category_patterns = [ |
| (re.compile(r"\b(bert|roberta|xlm roberta|gpt|t5|transformer|fine[- ]?tuning)\b"), "Transformers"), |
| (re.compile(r"\b(word2vec|glove|doc2vec|fasttext|specter|sentence[- ]?transformer|embedding|tf[- ]?idf|bm25|bag of words|bow)\b"), "Embeddings / Representation"), |
| (re.compile(r"\b(topic modeling|lda|nmf|bertopic|lsa|lsi)\b"), "Topic Modeling"), |
| (re.compile(r"\b(k[- ]?means|hierarchical clustering|dbscan|hdbscan|gaussian mixture|gmm|clustering)\b"), "Clustering"), |
|         (re.compile(r"\b(pca|svd|t[- ]?sne|tsne|umap|dimensionality reduction)\b"), "Dimensionality Reduction"), |
| (re.compile(r"\b(arima|sarima|var|prophet|time[- ]?series)\b"), "Time Series / Forecasting"), |
| (re.compile(r"\b(panel data|panel regression|fixed effects|random effects|multilevel|hierarchical linear model|hlm|mixed effects|glm|gls|gee|gmm|2sls|3sls|instrumental variable|tobit|heckman|poisson|negative binomial|probit|logit)\b"), "Econometric / Panel Models"), |
| (re.compile(r"\b(ols|linear regression|logistic regression|ridge|lasso|elastic net|regression)\b"), "Regression"), |
| (re.compile(r"\b(sem|pls[- ]?sem|cb[- ]?sem|structural equation|cfa|efa)\b"), "SEM"), |
| (re.compile(r"\b(latent class analysis|latent profile analysis|latent variable|mixture model)\b"), "Latent Variable Models"), |
| (re.compile(r"\b(grad(ient)? boosting|xgboost|lightgbm|catboost)\b"), "Boosting / Ensembles"), |
| (re.compile(r"\b(random forest|decision tree|svm|knn|naive bayes)\b"), "Classic ML"), |
| (re.compile(r"\b(neural network|deep learning|lstm|cnn|rnn|gru|mlp|autoencoder)\b"), "Deep Learning"), |
| (re.compile(r"\b(ner|named entity recognition|sentiment|nlp|text mining|tokenization|stemming|lemmatization|keyword extraction)\b"), "NLP / Text Mining"), |
| (re.compile(r"\b(network|centrality|community detection|louvain|leiden|ergm|link prediction|pagerank|graph neural network|gnn)\b"), "Network Analysis"), |
| (re.compile(r"\b(agent[- ]?based|monte carlo|bayesian optimization|linear programming|integer programming|genetic algorithm|simulated annealing)\b"), "Simulation / Optimization"), |
| (re.compile(r"\b(survival|cox|hazard|kaplan[- ]?meier)\b"), "Survival / Event History"), |
| (re.compile(r"\b(bayesian|mcmc|gibbs|variational)\b"), "Bayesian Methods"), |
| (re.compile(r"\b(anova|manova|ancova|mancova|t[- ]?test|chi[- ]?square|factor analysis|glmm|irt|mediation|moderation|wilcoxon|kruskal[- ]?wallis)\b"), "Statistical Tests / Models"), |
| (re.compile(r"\b(difference[- ]?in[- ]?differences|did|regression discontinuity|rdd|instrumental variable|iv|propensity score|matching)\b"), "Causal Inference"), |
| (re.compile(r"\b(recommender|collaborative filtering|matrix factorization)\b"), "Recommender Systems"), |
| (re.compile(r"\b(hidden markov|hmm|markov|kalman|state[- ]?space|hawkes)\b"), "Sequence / Stochastic Processes"), |
| (re.compile(r"\b(ahp|analytic hierarchy process|topsis|vikor|promethee|dematel)\b"), "Decision Analysis / MCDA"), |
| (re.compile(r"\b(dea|data envelopment analysis|stochastic frontier|sfa|frontier analysis)\b"), "Efficiency / Frontier Analysis"), |
| ] |
|
|
| def _categorize_technique(*names: str) -> str: |
| for name in names: |
| if not name: |
| continue |
| key = _normalize_technique_key(name) |
| for pattern, category in category_patterns: |
| if pattern.search(key): |
| return category |
| fallback_keywords = [ |
| ("Classic ML", ["classifier", "classification", "predictive model", "prediction", "supervised"]), |
| ("Clustering", ["cluster", "clustering"]), |
| ("Topic Modeling", ["topic", "semantic"]), |
| ("Embeddings / Representation", ["embedding", "vector", "tf idf", "bow", "bag of words"]), |
| ("Regression", ["regression", "logit", "probit", "panel", "fixed effects", "random effects", "glm", "gls", "gee", "gmm"]), |
| ("SEM", ["sem", "structural equation", "factor", "latent"]), |
| ("Bayesian Methods", ["bayesian", "mcmc", "gibbs", "prior", "posterior"]), |
| ("Time Series / Forecasting", ["time series", "forecast", "arima", "sarima", "var", "prophet"]), |
| ("NLP / Text Mining", ["nlp", "text", "token", "lemma", "stem", "language"]), |
| ("Network Analysis", ["network", "graph", "node", "edge"]), |
| ("Simulation / Optimization", ["simulation", "optimi", "heuristic", "metaheuristic", "monte carlo", "agent-based"]), |
| ] |
| for category, keywords in fallback_keywords: |
| if any(k in key for k in keywords): |
| return category |
| if any(token in key for token in ["model", "analysis", "estimation", "test"]): |
| return "Statistical Tests / Models" |
| return "Other" |
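|     # Illustrative sketch (assumed free-text inputs): |
|     #   _categorize_technique("XGBoost classifier")        -> "Boosting / Ensembles" |
|     #   _categorize_technique("propensity score matching") -> "Causal Inference" |
|     #   _categorize_technique("eye-tracking")              -> "Other" (no pattern or keyword hit) |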
|
|
| category_map: dict[str, dict[str, object]] = {} |
| for r in paper_results: |
| paper_title = r.get("paper_title") or r.get("paper_filename") or "" |
| paper_id = r.get("paper_id", "") |
| paper_label = str(paper_title or paper_id) |
|
|
| methods = r.get("computational_methods", []) |
| if isinstance(methods, list): |
| techniques = set([m.strip() for m in methods if isinstance(m, str) and m.strip()]) |
| elif isinstance(methods, str) and methods.strip(): |
| techniques = set([m.strip() for m in re.split(r"[;,]", methods) if m.strip()]) |
| else: |
| techniques = set() |
|
|
| for technique in techniques: |
| algorithm, _ = _canonicalize_technique(technique) |
| if not algorithm: |
| continue |
| category = _categorize_technique(technique, algorithm) |
| key = category.lower() |
| if key not in category_map: |
| category_map[key] = { |
| "name": category, |
| "algorithms": set(), |
| "papers": set(), |
| } |
| category_map[key]["algorithms"].add(algorithm) |
| category_map[key]["papers"].add(paper_label) |
|
|
| technique_rows = [ |
| { |
| "Main Computational Technique": entry["name"], |
| "Algorithms": ", ".join(sorted(entry["algorithms"])), |
| "Papers": " | ".join(sorted(entry["papers"])), |
| } |
| for entry in sorted(category_map.values(), key=lambda v: str(v["name"]).lower()) |
| ] |
| technique_df = pd.DataFrame( |
| technique_rows, |
| columns=["Main Computational Technique", "Algorithms", "Papers"], |
| ) |
| technique_csv_path = rdir / "technique_to_papers.csv" |
| technique_df.to_csv(technique_csv_path, index=False) |
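|     # Illustrative sketch (hypothetical row): technique_to_papers.csv groups canonical algorithms |
|     # under one category per row, e.g. |
|     #   Main Computational Technique: "Topic Modeling" |
|     #   Algorithms: "BERTopic, LDA topic modeling" |
|     #   Papers: "chen_2022_platform_reviews | smith_2023_ai_adoption" |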
|
|
| return { |
| "n_papers": len(pdf_files), |
| "n_extracted": len(paper_results), |
| "csv_path": str(csv_path), |
| "technique_csv_path": str(technique_csv_path), |
| "results": paper_results, |
| } |
|
|
|
|
| |
| |
| |
|
|
| ALL_TOOLS = [ |
| load_scopus_csv, |
| run_bertopic_discovery, |
| label_topics_with_llm, |
| verify_topic_labels_with_groq, |
| consolidate_into_themes, |
| compare_with_taxonomy, |
| verify_taxonomy_mapping_with_groq, |
| generate_comparison_csv, |
| export_narrative, |
| extract_methods_from_pdfs, |
| ] |
|
|