import json
import os
import re

import numpy as np
import pandas as pd
from langchain_core.tools import tool

# Directory where every tool persists its intermediate artefacts (embeddings,
# topic summaries, labels, themes, taxonomy maps).
CHECKPOINT_DIR = "/tmp/checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Number of sentences closest to a topic centroid that are kept as evidence.
NEAREST_K = 5
# Split after ., ! or ? when followed by whitespace and an upper-case letter.
SENT_SPLIT_RE = r'(?<=[.!?])\s+(?=[A-Z])'
# Sentences whose stripped length is below this are discarded.
MIN_SENT_LEN = 30
# Maps a run key to the CSV columns whose text is clustered for that run.
RUN_CONFIGS = {"abstract": ["Abstract"], "title": ["Title"]}
# In-process state shared between tool calls (loaded frame, models, arrays).
_data = {}

# NOTE: the docstrings of the @tool functions below are used by LangChain as
# the tool description shown to the model, so they are deliberately left
# short and unchanged; implementation notes live in `#` comments instead.


def _split_sentences(text):
    """Split *text* into sentences and keep those of at least MIN_SENT_LEN chars.

    The input is coerced with ``str`` first, so NaN / None values become short
    strings that are filtered out by the length check.
    """
    pieces = re.split(SENT_SPLIT_RE, str(text))
    return [piece for piece in pieces if len(piece.strip()) >= MIN_SENT_LEN]


def _checkpoint_path(run_key, suffix):
    """Return the checkpoint file path for *run_key* with the given *suffix*."""
    return f"{CHECKPOINT_DIR}/rq4_{run_key}_{suffix}"


def _read_json(path):
    """Load a JSON document from *path*, closing the file afterwards."""
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def _write_json(payload, path):
    """Write *payload* to *path* as indented JSON, closing the file afterwards."""
    with open(path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2, default=str)


def _require(key, hint):
    """Return ``_data[key]`` or raise a KeyError that says which step is missing."""
    try:
        return _data[key]
    except KeyError:
        raise KeyError(f"{key!r} is not available; {hint}") from None


def _nearest_to_centroid(member_embs, k=NEAREST_K):
    """Return positions of the *k* rows of *member_embs* closest to their mean.

    Closeness is cosine distance to the mean vector. An empty input yields an
    empty index array instead of a NaN centroid.
    """
    if len(member_embs) == 0:
        return np.array([], dtype=int)
    centroid = member_embs.mean(axis=0)
    norms = np.linalg.norm(member_embs, axis=1) * np.linalg.norm(centroid)
    distances = 1 - (member_embs @ centroid) / (norms + 1e-10)
    return np.argsort(distances)[:k]


def _merge_labels(summaries, presented, labels):
    """Attach LLM-produced label dicts to topic summaries by ``topic_id``.

    Args:
        summaries: all topic summary dicts, in checkpoint order.
        presented: the summaries that were actually sent to the LLM, in the
            order they appeared in the prompt.
        labels: parsed LLM output; expected to be a list of dicts, but a
            single dict or a dict wrapping a list is tolerated.

    Returns:
        One dict per summary, in the same order as *summaries*. Summaries
        without a matching label are returned unchanged. The summary's own
        ``topic_id`` always wins over whatever the LLM echoed back.
    """
    if isinstance(labels, dict):
        if "topic_id" in labels:
            labels = [labels]
        else:
            # Some models wrap the array in an object such as {"topics": [...]}.
            labels = next(
                (value for value in labels.values() if isinstance(value, list)),
                [],
            )
    by_id = {}
    for position, label in enumerate(labels or []):
        if not isinstance(label, dict):
            continue
        try:
            topic_id = int(label.get("topic_id"))
        except (TypeError, ValueError):
            # No usable id: fall back to the order the topics were presented.
            if position >= len(presented):
                continue
            topic_id = presented[position]["topic_id"]
        by_id.setdefault(topic_id, label)
    return [
        {**summary, **by_id.get(summary["topic_id"], {}), "topic_id": summary["topic_id"]}
        for summary in summaries
    ]


# Reads the CSV at `filepath`, stores the frame in `_data["df"]` for the other
# tools, and returns a Markdown summary. The abstract sentence count is an
# estimate extrapolated from the first five abstracts only.
@tool
def load_scopus_csv(filepath: str) -> str:
    """Load and summarize a Scopus CSV dataset."""
    df = pd.read_csv(filepath, encoding="utf-8-sig")
    _data["df"] = df
    cols = [c for c in ["Title", "Abstract", "Author Keywords"] if c in df.columns]
    sample = df[cols].head(3).to_string(max_colwidth=80)
    non_null = ", ".join(f"{c}: {df[c].notna().sum()}/{len(df)}" for c in cols)

    # Guard against a missing column or an empty file: the mean of an empty
    # series is NaN and int(NaN) raises.
    est = 0
    if "Abstract" in df.columns and len(df):
        avg_sents = df["Abstract"].head(5).apply(_split_sentences).apply(len).mean()
        est = int(avg_sents * len(df))
    title_count = int(df["Title"].notna().sum()) if "Title" in df.columns else 0

    return (f"📊 **Dataset Statistics:**\n"
            f"- **Papers:** {len(df)}\n"
            f"- **Abstract sentences:** ~{est}\n"
            f"- **Title sentences:** {title_count}\n"
            f"- **Non-null:** {non_null}\n\n"
            f"Columns: {', '.join(list(df.columns)[:15])}\n\n"
            f"Sample:\n{sample}")


# Pipeline: split the configured column(s) into sentences, drop publisher
# boilerplate, embed the sentences, cluster them with agglomerative
# clustering (cosine distance, cut at `threshold`), then persist embeddings,
# HTML visualisations and per-topic summaries. Results are also cached in
# `_data` under `<run_key>_model/_topics/_embeddings/_sent_df`.
@tool
def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
    """Run BERTopic clustering on abstracts or titles."""
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer
    from sklearn.preprocessing import FunctionTransformer
    from sklearn.cluster import AgglomerativeClustering

    if run_key not in RUN_CONFIGS:
        raise KeyError(f"Unknown run_key {run_key!r}; expected one of {sorted(RUN_CONFIGS)}")
    df = _require("df", "call load_scopus_csv first").copy()

    available = [c for c in RUN_CONFIGS[run_key] if c in df.columns]
    if not available:
        raise ValueError(
            f"None of the columns {RUN_CONFIGS[run_key]} required for run "
            f"{run_key!r} exist in the loaded dataset"
        )
    df["_text"] = df[available].fillna("").agg(" ".join, axis=1)
    df["_paper_id"] = df.index
    df["_sentences"] = df["_text"].apply(_split_sentences)

    # One row per sentence, carrying the paper-level metadata along.
    meta = [c for c in ["_paper_id", "Title", "Author Keywords", "_sentences"] if c in df.columns]
    sent_df = (
        df[meta]
        .explode("_sentences")
        .rename(columns={"_sentences": "text"})
        .dropna(subset=["text"])
        .reset_index(drop=True)
    )
    sent_df["sent_id"] = sent_df.groupby("_paper_id").cumcount()

    # Publisher / copyright boilerplate and generic closing sentences.
    # Groups are non-capturing so pandas does not warn about match groups.
    patterns = (
        r"Licensee MDPI|Published by Informa|Published by Elsevier|Taylor & Francis"
        r"|Copyright ©|Creative Commons|open access article|Inderscience Enterprises"
        r"|All rights reserved|Springer Nature|Emerald Publishing"
        r"|limitations and (?:future|implications|discussed)"
        r"|implications (?:are|were) (?:discussed|presented)"
        r"|concludes with .* implications"
    )
    boilerplate = sent_df["text"].str.contains(patterns, case=False, regex=True, na=False)
    sent_df = sent_df[~boilerplate].reset_index(drop=True)
    if len(sent_df) < 2:
        raise ValueError(
            f"Only {len(sent_df)} usable sentence(s) for run {run_key!r}; "
            "at least 2 are needed for clustering"
        )

    docs = sent_df["text"].tolist()
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    embs = embedder.encode(docs, show_progress_bar=False, normalize_embeddings=True)
    np.save(_checkpoint_path(run_key, "emb.npy"), embs)

    cluster = AgglomerativeClustering(
        n_clusters=None,
        metric="cosine",
        linkage="average",
        distance_threshold=threshold,
    )
    # A default FunctionTransformer is an identity transform, so the
    # embeddings reach the clusterer without dimensionality reduction.
    model = BERTopic(hdbscan_model=cluster, umap_model=FunctionTransformer())
    topics, _ = model.fit_transform(docs, embs)
    topic_arr = np.array(topics)

    _data[f"{run_key}_model"] = model
    _data[f"{run_key}_topics"] = topic_arr
    _data[f"{run_key}_embeddings"] = embs
    _data[f"{run_key}_sent_df"] = sent_df

    # Number of real topics; -1 is excluded as the outlier bucket.
    n = len(set(topics)) - int(-1 in topics)
    # Each visualisation is only written when there are enough topics for it.
    if n >= 3:
        model.visualize_topics().write_html(f"/tmp/rq4_{run_key}_intertopic.html")
    if n >= 1:
        model.visualize_barchart(top_n_topics=min(10, n)).write_html(f"/tmp/rq4_{run_key}_bars.html")
    if n >= 2:
        model.visualize_hierarchy().write_html(f"/tmp/rq4_{run_key}_hierarchy.html")
        model.visualize_heatmap().write_html(f"/tmp/rq4_{run_key}_heatmap.html")

    valid = [r for r in model.get_topic_info().to_dict("records") if r["Topic"] != -1]

    def _summarise(record):
        # Build the summary for one topic: counts, top words, the sentences
        # nearest to the topic centroid, and up to 50 paper titles.
        member_idx = np.where(topic_arr == record["Topic"])[0]
        members = sent_df.iloc[member_idx]
        evidence = []
        for pos in _nearest_to_centroid(embs[member_idx]):
            row = members.iloc[pos]
            evidence.append({
                "sentence": str(row["text"])[:250],
                "paper_id": int(row["_paper_id"]),
                "title": str(row.get("Title", ""))[:150],
                "keywords": str(row.get("Author Keywords", ""))[:150],
            })
        papers = members.drop_duplicates(subset=["_paper_id"])
        titles = [str(row.get("Title", ""))[:200] for _, row in papers.head(50).iterrows()]
        return {
            "topic_id": int(record["Topic"]),
            "sentence_count": int(record["Count"]),
            "paper_count": len(papers),
            "top_words": str(record.get("Name", ""))[:100],
            "nearest": evidence,
            "paper_titles": titles,
        }

    sums = [_summarise(record) for record in valid]
    _write_json(sums, _checkpoint_path(run_key, "summaries.json"))

    lines = [f" Topic {s['topic_id']} ({s['sentence_count']} sents, {s['paper_count']} papers): {s['top_words']}" for s in sums]
    return f"[{run_key}] {n} topics from {len(sent_df)} sentences.\n\n" + "\n".join(lines)


# Sends the 100 largest topics (by sentence count) to the LLM in one prompt
# and writes the merged summaries + labels to `rq4_<run_key>_labels.json`.
# Labels are matched to summaries by topic_id; topics the LLM did not label
# are kept in the output without label fields.
@tool
def label_topics_with_llm(run_key: str) -> str:
    """Use Mistral to generate academic labels for discovered topics."""
    from langchain_mistralai import ChatMistralAI
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import JsonOutputParser

    sums = _read_json(_checkpoint_path(run_key, "summaries.json"))
    to_label = sorted(sums, key=lambda s: s.get("sentence_count", 0), reverse=True)[:100]
    block = "\n\n".join([
        f"Topic {s['topic_id']} ({s['sentence_count']} sents):\n{NEAREST_K} entries:\n"
        + "\n".join([f"- {e['sentence']}\n Paper: {e['title']}" for e in s["nearest"]])
        for s in to_label
    ])
    prompt = PromptTemplate.from_template("Return JSON ARRAY of objects with topic_id, label, category, confidence, reasoning, niche for:\n{topics}")
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0)
    labels = (prompt | llm | JsonOutputParser()).invoke({"topics": block})

    labeled = _merge_labels(sums, to_label, labels)
    _write_json(labeled, _checkpoint_path(run_key, "labels.json"))

    lines = [f" **Topic {item.get('topic_id')}: {item.get('label')}** [{item.get('category')}] ({item.get('sentence_count')} sents)" for item in labeled]
    return f"[{run_key}] {len(labeled)} topics labeled.\n\n" + "\n\n".join(lines)


# Collects the labels checkpoint of every run in RUN_CONFIGS that has one and
# writes them as a single table to /tmp/rq4_comparison.csv.
@tool
def generate_comparison_csv() -> str:
    """Create a comparison matrix across multiple analysis runs."""
    rows = []
    for run in RUN_CONFIGS:
        labels_path = _checkpoint_path(run, "labels.json")
        if not os.path.exists(labels_path):
            continue
        for item in _read_json(labels_path):
            rows.append({
                "run": run,
                "topic_id": item.get("topic_id"),
                "label": item.get("label"),
                "category": item.get("category"),
                "sentences": item.get("sentence_count"),
                "papers": item.get("paper_count"),
            })
    df = pd.DataFrame(rows)
    df.to_csv("/tmp/rq4_comparison.csv", index=False)
    return f"Saved to /tmp/rq4_comparison.csv\n\n{df.to_string(index=False)}"


# Asks the LLM for a narrative based on the labels checkpoint of `run_key`
# and saves the text to /tmp/rq4_narrative.txt.
@tool
def export_narrative(run_key: str) -> str:
    """Generate a 500-word research narrative for the results section."""
    from langchain_mistralai import ChatMistralAI

    ls = _read_json(_checkpoint_path(run_key, "labels.json"))
    txt = "\n".join([f"- {item.get('label')} ({item.get('sentence_count')} sents)" for item in ls])
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.3)
    res = llm.invoke(f"Write a 500-word Section 7 'Topic Modeling Results' for {run_key} run:\n{txt}")
    with open("/tmp/rq4_narrative.txt", "w", encoding="utf-8") as handle:
        handle.write(res.content)
    return f"Saved to /tmp/rq4_narrative.txt\n\n{res.content}"


# `theme_map` maps a theme name to the topic ids it absorbs. Requires the
# in-memory results of run_bertopic_discovery for the same `run_key`; writes
# `rq4_<run_key>_themes.json`.
@tool
def consolidate_into_themes(run_key: str, theme_map: dict) -> str:
    """Merge specific topics into broader research themes."""
    hint = "call run_bertopic_discovery for this run_key first"
    t_arr = _require(f"{run_key}_topics", hint)
    embs = _require(f"{run_key}_embeddings", hint)
    s_df = _require(f"{run_key}_sent_df", hint)

    def _build(name, ids):
        # Pool every sentence belonging to any of the merged topics and pick
        # the ones nearest to the pooled centroid as evidence.
        mask = np.isin(t_arr, ids)
        member_idx = np.where(mask)[0]
        members = s_df.iloc[member_idx]
        evidence = []
        for pos in _nearest_to_centroid(embs[mask]):
            row = members.iloc[pos]
            evidence.append({
                "sentence": str(row["text"])[:250],
                "title": str(row.get("Title", ""))[:150],
            })
        return {
            "label": name,
            "merged_topics": list(ids),
            "sentence_count": int(mask.sum()),
            "paper_count": int(members["_paper_id"].nunique()),
            "nearest": evidence,
        }

    themes = [{"topic_id": i, **_build(name, ids)} for i, (name, ids) in enumerate(theme_map.items())]
    _write_json(themes, _checkpoint_path(run_key, "themes.json"))

    lines = [f" **{t['label']}** ({t['sentence_count']} sents)" for t in themes]
    return f"[{run_key}] {len(themes)} themes.\n\n" + "\n".join(lines)


# Category names the discovered themes are compared against.
PAJAIS = [
    "Electronic Business", "HCI", "IS Strategy", "Business Intelligence",
    "Design Science", "Enterprise Systems", "Adoption", "Social Media",
    "Cultural Issues", "Security", "Smart/IoT", "Knowledge Management",
    "Digital Platform", "Healthcare", "Project Management", "Service Science",
    "Social/Org Aspects", "Research Methods", "E-Finance", "E-Government",
    "Education", "Sustainability",
]


# Uses the themes checkpoint when it exists, otherwise the labels checkpoint,
# and writes the LLM's mapping to `rq4_<run_key>_taxonomy_map.json`.
@tool
def compare_with_taxonomy(run_key: str) -> str:
    """Map themes to the PAJAIS taxonomy or identify novel contributions."""
    from langchain_mistralai import ChatMistralAI
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import JsonOutputParser

    themes_path = _checkpoint_path(run_key, "themes.json")
    src = themes_path if os.path.exists(themes_path) else _checkpoint_path(run_key, "labels.json")
    ts = _read_json(src)

    # Entries the LLM never labelled have no usable "label"; fall back to a
    # placeholder so the prompt can still be built.
    names = [str(t.get("label") or f"Topic {t.get('topic_id')}") for t in ts]

    prompt = PromptTemplate.from_template("Map themes to PAJAIS taxonomy or mark 'NOVEL'. Return JSON array for:\nThemes:\n{ts}\nTaxonomy:\n{tax}")
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0)
    ms = (prompt | llm | JsonOutputParser()).invoke({"ts": "\n".join(names), "tax": "\n".join(PAJAIS)})
    _write_json(ms, _checkpoint_path(run_key, "taxonomy_map.json"))
    return f"[{run_key}] Mapping complete."
def get_all_tools():
    """Return the module's tool objects with ``handle_tool_error`` set to True.

    The attribute is set in place on each tool object before the list is
    returned, so the module-level tool objects themselves are modified.
    NOTE(review): presumably this makes LangChain report tool errors back to
    the agent instead of raising — confirm which exception types it covers.
    """
    registry = [
        load_scopus_csv,
        run_bertopic_discovery,
        label_topics_with_llm,
        consolidate_into_themes,
        compare_with_taxonomy,
        generate_comparison_csv,
        export_narrative,
    ]
    for entry in registry:
        entry.handle_tool_error = True
    return registry