# Daksh C Jain
# Initial commit: EIS Topic Intelligence — UMAP+HDBSCAN+Mistral council, dark EIS theme, 23 clusters from Enterprise Information Systems corpus
# c91d9b4
"""
tools.py — 7 @tool functions for BERTopic Agentic AI
Assignment: Text Analysis & Topic Modelling (Prof. Shailaja Jha)
Generated via: Anthropic Claude Sonnet 4.5
Architecture: LangChain @tool + LangGraph | Model: Mistral Small Latest
Rules: ZERO if/elif/else | ZERO for/while | ZERO try/except | handle_tool_error=True
"""
import os
import re
import json
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from langchain_core.tools import tool
from langchain_mistralai import ChatMistralAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
# ─── CONSTANTS ────────────────────────────────────────────────────────────────
# Every artefact (JSON, .npy, charts, CSV, narrative) is written below this folder.
OUTPUT_DIR = "./outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)
MAX_LABEL_TOPICS = 100  # upper bound on clusters sent to the LLM for labelling
NEAREST_K = 5           # representative sentences kept per cluster
BATCH_SIZE = 20         # clusters per LLM labelling request
# Publisher / front-matter boilerplate that is blanked out before sentence
# splitting.  The copyright alternative matches the real "©" sign; the
# double-decoded spelling "Β©" is still accepted so text that matched
# before keeps matching.
BOILERPLATE_RE = re.compile(
    r"(?:©|Β©)\s*\d{4}[^.]*?\.|All\s+rights\s+reserved\.?|"
    r"Published\s+by\s+[A-Z][^.]*?\.|This\s+is\s+an\s+open\s+access[^.]*?\.|"
    r"Correspondence\s+(to|author):[^.]*?\.|E-?mail:[^.]*?\.|"
    r"Received:[^.]*?Accepted:[^.]*?\.|DOI:\S+|doi:\S+|https?://\S+|"
    r"Keywords:[^.]*?\.|JEL[^.]*?\.|ISSN[^.]*?\.|ISBN[^.]*?\.|"
    r"Elsevier[^.]*?\.|Springer[^.]*?\.|Emerald[^.]*?\.|"
    r"Wiley[^.]*?\.|Taylor\s*&\s*Francis[^.]*?\.|"
    r"This\s+paper\s+is\s+part\s+of[^.]*?\.|"
    r"Conflict\s+of\s+interest[^.]*?\.|"
    r"Funding[^.]*?:\s*[^.]*?\.|"
    r"Acknowledgement[s]?:[^.]*?\.",
    re.IGNORECASE | re.DOTALL,
)
# Sentence boundary: whitespace that follows . ! ? and precedes a capital
# letter, a double quote or an opening parenthesis.
SENT_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z\"(])")
# The 25 PAJAIS categories that themes are mapped against.
PAJAIS_25 = [
    "IS Strategy and Management", "E-Commerce and E-Business",
    "IT Adoption and Diffusion", "Business Intelligence and Analytics",
    "Social Commerce and Social Media", "Mobile Commerce and Applications",
    "Knowledge Management", "Healthcare Information Systems",
    "Privacy, Security and Trust", "Enterprise Systems and ERP",
    "Digital Platforms and Ecosystems", "Blockchain and Distributed Ledgers",
    "Artificial Intelligence and Machine Learning",
    "Human-Computer Interaction and UX",
    "Digital Transformation and Innovation",
    "Financial Technology and Digital Finance",
    "Supply Chain and Logistics IS", "Smart Systems IoT and Smart Cities",
    "IS Research Methods and Theory",
    "Recommender and Personalization Systems",
    "Digital Marketing and Advertising",
    "Virtual Teams and Online Collaboration",
    "Cloud Computing and SaaS", "Big Data Analytics and Data Science",
    "IS Education and Training",
]
_EMBED_MODEL = None


def _get_embed_model():
    """Return the shared sentence encoder, building it on first use.

    The "all-MiniLM-L6-v2" SentenceTransformer is stored in the module-level
    ``_EMBED_MODEL`` slot so later calls reuse the same instance.
    """
    global _EMBED_MODEL
    from sentence_transformers import SentenceTransformer

    encoder = _EMBED_MODEL or SentenceTransformer("all-MiniLM-L6-v2")
    _EMBED_MODEL = encoder
    return encoder
def _get_llm():
    """Build a Mistral chat client for the LLM-backed tools.

    The API key comes from the MISTRAL_API_KEY environment variable (empty
    string when unset).  A new client is created on every call.
    """
    mistral_key = os.getenv("MISTRAL_API_KEY", "")
    return ChatMistralAI(
        model="mistral-small-latest",
        api_key=mistral_key,
        temperature=0.1,
    )
def _clean(text: str) -> str:
    """Blank out every BOILERPLATE_RE match in ``text`` and trim both ends.

    ``text`` is converted with ``str()`` first, so non-string cells are accepted.
    """
    as_text = str(text)
    scrubbed = BOILERPLATE_RE.sub(" ", as_text)
    return scrubbed.strip()
def _split(text: str) -> list:
    """Clean ``text`` and split it into sentences at SENT_RE boundaries.

    Only pieces longer than 30 characters (after trimming) are returned.
    """
    trimmed_pieces = map(str.strip, SENT_RE.split(_clean(text)))
    return [piece for piece in trimmed_pieces if len(piece) > 30]
def _save(data, name: str) -> str:
    """Write ``data`` as indented UTF-8 JSON to OUTPUT_DIR/``name``.

    Non-ASCII characters are written as-is.  Returns the path that was written.
    """
    target = _opath(name)
    with open(target, "w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=2, ensure_ascii=False)
    return target
def _load(name: str):
    """Parse and return the JSON document stored at OUTPUT_DIR/``name``."""
    source = os.path.join(OUTPUT_DIR, name)
    with open(source, "r", encoding="utf-8") as handle:
        return json.loads(handle.read())
def _opath(name: str) -> str:
    """Return the location of ``name`` inside OUTPUT_DIR."""
    location = os.path.join(OUTPUT_DIR, name)
    return location
def _generate_charts(run_key: str, data: list, name_key: str = "cluster_id"):
    """Regenerate the 4 Plotly charts from the latest data (summaries, labels, or themes).

    Args:
        run_key: Run identifier; names the ``{run_key}_charts`` folder under
            OUTPUT_DIR and appears (title-cased) in every chart title.
        data: List of dicts, each providing ``centroid`` and ``sentence_count``.
            The bar chart uses the first 30 entries and the heatmap the first
            20, in the order given.
        name_key: Key holding the display name.  With the default
            ``"cluster_id"`` entries are shown as ``C<cluster_id>``; otherwise
            the value under ``name_key`` is shown, falling back to
            ``C<cluster_id>`` when the key is absent.

    Returns:
        None.  Nothing is written when ``data`` is empty.
    """
    if not data:
        return

    def display_name(entry):
        fallback = f"C{entry.get('cluster_id', '?')}"
        if name_key == "cluster_id":
            return fallback
        return entry.get(name_key, fallback)

    centroids = np.array([entry["centroid"] for entry in data])
    sizes = [entry["sentence_count"] for entry in data]
    names = [display_name(entry) for entry in data]
    n_clusters = len(data)

    # A 2-component PCA needs at least two centroids; a lone cluster is
    # simply drawn at the origin.
    if n_clusters < 2:
        coords = np.zeros((n_clusters, 2))
    else:
        coords = PCA(n_components=2).fit_transform(centroids)

    chart_dir = _opath(f"{run_key}_charts")
    os.makedirs(chart_dir, exist_ok=True)
    run_title = run_key.title()

    def write_chart(fig, filename):
        fig.write_html(os.path.join(chart_dir, filename),
                       include_plotlyjs="cdn", full_html=True)

    fig1 = px.scatter(
        x=coords[:, 0], y=coords[:, 1], size=sizes,
        title=f"Intertopic Distance Map — {run_title}",
        labels={"x": "PC1", "y": "PC2"},
        hover_name=names,
        template="plotly_dark",
    )
    write_chart(fig1, "intertopic_map.html")

    fig2 = px.bar(
        x=names[:30],
        y=sizes[:30],
        title=f"Top 30 Cluster Sizes — {run_title}",
        labels={"x": "Cluster", "y": "Sentences"},
        template="plotly_dark",
    )
    write_chart(fig2, "bar_chart.html")

    fig3 = px.treemap(
        names=names,
        parents=["clusters"] * n_clusters,
        values=sizes,
        title=f"Topic Treemap — {run_title}",
    )
    write_chart(fig3, "treemap.html")

    # Heatmap: fixed 4x5 grid of the first 20 entries.  Unused cells carry a
    # zero count and are labelled "Empty" in every naming mode (previously
    # they showed up as "CEmpty" when name_key was "cluster_id").
    grid_rows, grid_cols = 4, 5
    cell_total = grid_rows * grid_cols
    shown = data[:cell_total]
    padding = cell_total - len(shown)
    cell_values = [entry.get("sentence_count", 0) for entry in shown] + [0] * padding
    cell_names = [display_name(entry) for entry in shown] + ["Empty"] * padding
    heatmap_data = np.array(cell_values).reshape(grid_rows, grid_cols)
    heatmap_text = [cell_names[row * grid_cols:(row + 1) * grid_cols]
                    for row in range(grid_rows)]
    fig4 = go.Figure(go.Heatmap(
        z=heatmap_data, colorscale="Viridis", text=heatmap_text,
        texttemplate="%{text}", showscale=True,
    ))
    fig4.update_layout(title=f"Topic Size Heatmap — {run_title}", template="plotly_dark")
    write_chart(fig4, "heatmap.html")
# ─── TOOL 1: LOAD CSV ─────────────────────────────────────────────────────────
@tool
def load_scopus_csv(filepath: str) -> str:
    """Load a Scopus CSV export file and return statistics.
    Phase 1 of Braun & Clarke (2006) — Familiarisation.
    Call this FIRST before any analysis. filepath must be the absolute path to the CSV."""
    # NOTE(review): the docstring above presumably doubles as the tool
    # description shown to the agent, so implementation notes are kept in
    # comments rather than added to it.
    df = pd.read_csv(filepath, encoding="utf-8-sig", on_bad_lines="skip")
    required = ["Title", "Abstract", "Authors", "Year", "Cited by",
                "Author Keywords", "Source title"]
    found = [c for c in required if c in df.columns]
    missing = [c for c in required if c not in df.columns]

    def sentence_total(column):
        # A column reported under "Missing" contributes zero sentences
        # instead of aborting the whole load with a KeyError.
        cells = df[column].fillna("").tolist() if column in df.columns else []
        return sum(len(_split(cell)) for cell in cells)

    abstract_sentences = sentence_total("Abstract")
    title_sentences = sentence_total("Title")

    # Non-numeric or empty Year cells are ignored; 0 is reported when no
    # usable year exists (int(nan) would raise otherwise).
    years = (pd.to_numeric(df["Year"], errors="coerce").dropna()
             if "Year" in df.columns else pd.Series(dtype="float64"))
    year_min = int(years.min()) if len(years) else 0
    year_max = int(years.max()) if len(years) else 0

    # Most frequent source title; "Unknown" when the column is absent or empty.
    journal_counts = (df["Source title"].value_counts()
                      if "Source title" in df.columns else pd.Series(dtype="int64"))
    journal = str(journal_counts.index[0]) if len(journal_counts) else "Unknown"

    _save({"filepath": filepath, "journal": journal,
           "rows": len(df), "year_min": year_min, "year_max": year_max},
          "corpus_config.json")
    return (
        f"✅ CSV Loaded\nJournal: {journal}\nPapers: {len(df)}\n"
        f"Year Range: {year_min}–{year_max}\n"
        f"Columns Found ({len(found)}/7): {found}\nMissing: {missing}\n"
        f"Abstract sentences: {abstract_sentences:,}\n"
        f"Title sentences: {title_sentences:,}\n"
        f"Type 'run abstract' to begin Phase 2."
    )
# ─── TOOL 2: RUN BERTOPIC DISCOVERY ──────────────────────────────────────────
@tool
def run_bertopic_discovery(run_key: str, target_size: int = 250) -> str:
    """Embed sentences with all-MiniLM-L6-v2 and apply Balanced Agglomerative Clustering.
    Dynamic K selection based on data size (target_size=250 sentences per topic).
    Includes automatic splitting of massive clusters and merging of tiny clusters
    to guarantee minimal size disparity across all discovered topics.
    Saves {run_key}_summaries.json + {run_key}_emb.npy. Phase 2 of Braun & Clarke.
    run_key must be 'abstract' or 'title'. target_size guides the dynamic cluster counts."""
    # ToolException is raised for unusable input so that, with
    # handle_tool_error=True, the message can be handed back to the agent.
    from langchain_core.tools import ToolException

    if target_size < 1:
        raise ToolException("target_size must be a positive integer.")

    cfg = _load("corpus_config.json")
    df = pd.read_csv(cfg["filepath"], encoding="utf-8-sig", on_bad_lines="skip")
    col = "Abstract" if run_key == "abstract" else "Title"
    if col not in df.columns:
        raise ToolException(f"Column '{col}' is missing from {cfg['filepath']}.")

    pairs = [(s, i) for i, t in enumerate(df[col].fillna("").tolist())
             for s in _split(t)]
    sentences = [p[0] for p in pairs]
    paper_ids = [p[1] for p in pairs]
    total_sents = len(sentences)
    if total_sents < 2:
        raise ToolException(
            f"Need at least 2 usable sentences in column '{col}' to cluster; "
            f"found {total_sents}."
        )

    model = _get_embed_model()
    emb = model.encode(sentences, normalize_embeddings=True,
                       batch_size=64, show_progress_bar=True)
    np.save(_opath(f"{run_key}_emb.npy"), emb)
    _save({"sentences": sentences, "paper_ids": paper_ids},
          f"{run_key}_sentences.json")

    # Dynamic sizing: aim for ~target_size sentences per cluster, at least 5
    # clusters, but never more clusters than there are sentences.
    dynamic_k = min(total_sents, max(5, total_sents // target_size))
    max_size = target_size * 2
    min_size = target_size // 2
    labels_arr = AgglomerativeClustering(
        n_clusters=dynamic_k, metric="euclidean", linkage="ward"
    ).fit_predict(emb)

    def nearest_cluster(cid, member_idx, cluster_ids, size_cap=None):
        """Id of the most cosine-similar other cluster, or None.

        Clusters whose size plus ``len(member_idx)`` exceeds ``size_cap``
        are skipped when a cap is given.
        """
        centroid = emb[member_idx].mean(axis=0, keepdims=True)
        # -inf (not -1.0) so that even a similarity of exactly -1 is accepted.
        best_sim, best_id = -np.inf, None
        for other_id in cluster_ids:
            if other_id == cid:
                continue
            other_idx = np.where(labels_arr == other_id)[0]
            if size_cap is not None and len(other_idx) + len(member_idx) > size_cap:
                continue
            other_centroid = emb[other_idx].mean(axis=0, keepdims=True)
            sim = float(cosine_similarity(centroid, other_centroid)[0][0])
            if sim > best_sim:
                best_sim, best_id = sim, int(other_id)
        return best_id

    # 1. Split every cluster larger than max_size until none remains.
    while True:
        u_labels, counts = np.unique(labels_arr, return_counts=True)
        too_big = u_labels[counts > max_size]
        if len(too_big) == 0:
            break
        for cid in too_big:
            idx = np.where(labels_arr == cid)[0]
            split_k = int(np.ceil(len(idx) / target_size))
            sub_labels = AgglomerativeClustering(
                n_clusters=split_k, metric="euclidean", linkage="ward"
            ).fit_predict(emb[idx])
            # Sub-cluster 0 keeps the original id; the rest get fresh ids.
            new_id_start = int(labels_arr.max()) + 1
            for sub_id in range(1, split_k):
                labels_arr[idx[sub_labels == sub_id]] = new_id_start
                new_id_start += 1

    # 2. Merge clusters smaller than min_size, one per pass, while more than
    #    5 clusters remain.
    while True:
        u_labels, counts = np.unique(labels_arr, return_counts=True)
        too_small = u_labels[counts < min_size]
        if len(too_small) == 0 or len(u_labels) <= 5:  # keep at least 5 clusters
            break
        cid = too_small[0]
        idx = np.where(labels_arr == cid)[0]
        # Prefer the nearest cluster that will not grow past 1.5 * max_size;
        # otherwise merge into the nearest cluster regardless of size.
        merge_into = nearest_cluster(cid, idx, u_labels, size_cap=max_size * 1.5)
        if merge_into is None:
            merge_into = nearest_cluster(cid, idx, u_labels)
        if merge_into is None:
            break  # no other cluster to merge into
        labels_arr[idx] = merge_into

    unique_labels = np.unique(labels_arr)
    n_clusters = len(unique_labels)
    # Per-cluster sentence indices as plain Python ints (JSON-serialisable).
    cluster_sentence_idx = {int(cid): list(map(int, np.where(labels_arr == cid)[0]))
                            for cid in unique_labels}

    def make_summary(cid):
        idx = cluster_sentence_idx[int(cid)]
        c_emb = emb[idx]
        centroid = c_emb.mean(axis=0, keepdims=True)
        sims = cosine_similarity(centroid, c_emb)[0]
        top_k = min(NEAREST_K, len(idx))
        # Positions (within the cluster) of the sentences closest to the
        # centroid, most similar first, mapped back to global indices.
        top_local = list(map(int, np.argsort(sims)[-top_k:][::-1]))
        top_global = [idx[j] for j in top_local]
        return {
            "cluster_id": int(cid),
            "sentence_count": len(idx),
            "paper_count": len({paper_ids[i] for i in idx}),
            "top_sentences": [sentences[i] for i in top_global],
            "centroid": centroid[0].tolist(),
            "sentence_indices": idx,
        }

    summaries = sorted(map(make_summary, unique_labels),
                       key=lambda x: x["sentence_count"], reverse=True)
    _save(summaries, f"{run_key}_summaries.json")

    # ── 4 Plotly Charts ───────────────────────────────────────────────────────
    _generate_charts(run_key, summaries, name_key="cluster_id")
    chart_dir = _opath(f"{run_key}_charts")
    return (
        f"✅ BERTopic Discovery Complete ({run_key})\n"
        f"Total sentences: {len(sentences):,}\n"
        f"Topics generated: {n_clusters} (Dynamic via target_size={target_size})\n"
        f"Algorithm: Constrained Agglomerative (Split & Merge Balanced)\n"
        f"Largest cluster: {summaries[0]['sentence_count']} sentences\n"
        f"Smallest cluster: {summaries[-1]['sentence_count']} sentences\n"
        f"Charts saved to {chart_dir}\n"
        f"Now calling label_topics_with_llm..."
    )
# ─── TOOL 3: LABEL TOPICS WITH LLM ───────────────────────────────────────────
@tool
def label_topics_with_llm(run_key: str) -> str:
    """Send top 100 clusters to Mistral for labelling.
    Returns topic labels, categories, confidence scores, reasoning, is_niche.
    Saves {run_key}_labels.json. Phase 2 of Braun & Clarke.
    run_key must be 'abstract' or 'title'."""
    summaries = _load(f"{run_key}_summaries.json")[:MAX_LABEL_TOPICS]
    llm = _get_llm()
    label_prompt = PromptTemplate.from_template(
        "You are a bibliometric research expert.\n"
        "Label each cluster below with a concise research area name.\n"
        "Return ONLY a JSON array — one object per cluster:\n"
        ' {{"cluster_id": N, "label": "...", "category": "...", '
        '"confidence": 0.0-1.0, "reasoning": "...", "is_niche": true/false}}\n\n'
        "Clusters (ID | sentence_count | top 2 sentences):\n{clusters}\n\n"
        "Return valid JSON array only, no markdown fences."
    )
    chain = label_prompt | llm | StrOutputParser()
    # Removes an opening ```/```json fence and a closing ``` fence.
    # str.lstrip("```json") strips a *set of characters*, not this prefix.
    fence_re = re.compile(r"^\s*```(?:json)?\s*|\s*```\s*$", re.IGNORECASE)

    def format_batch(batch):
        return "\n".join(
            f"{s['cluster_id']} | {s['sentence_count']} sents | "
            + " /// ".join(s["top_sentences"][:2])
            for s in batch
        )

    def label_batch(batch):
        raw = chain.invoke({"clusters": format_batch(batch)})
        parsed = json.loads(fence_re.sub("", raw).strip())
        # A single object instead of an array is treated as a one-item array.
        return parsed if isinstance(parsed, list) else [parsed]

    def as_cluster_id(value):
        # The model may answer 3, "3", 3.0 or "C3"; take the first integer.
        digits = re.search(r"-?\d+", str(value))
        return int(digits.group()) if digits else None

    batches = [summaries[start:start + BATCH_SIZE]
               for start in range(0, len(summaries), BATCH_SIZE)]
    results = [item for batch_result in map(label_batch, batches)
               for item in batch_result]
    label_map = {as_cluster_id(r["cluster_id"]): r
                 for r in results
                 if isinstance(r, dict) and "cluster_id" in r}

    def default_label(s):
        return {"label": f"Topic {s['cluster_id']}", "category": "Unknown",
                "confidence": 0.5, "reasoning": "", "is_niche": False}

    # The summary's own integer cluster_id always wins over whatever form
    # the model echoed back, so later int-keyed lookups keep working.
    labeled = [
        {**s,
         **label_map.get(s["cluster_id"], default_label(s)),
         "cluster_id": s["cluster_id"]}
        for s in summaries
    ]
    _save(labeled, f"{run_key}_labels.json")
    _generate_charts(run_key, labeled, name_key="label")
    return (
        f"✅ Labels Generated ({run_key})\n"
        f"Topics labeled: {len(labeled)}\n"
        f"Review table populated. Edit Approve/Rename columns, "
        f"then click Submit Review."
    )
# ─── TOOL 4: CONSOLIDATE INTO THEMES ─────────────────────────────────────────
@tool
def consolidate_into_themes(run_key: str, theme_map: str) -> str:
    """Merge researcher-approved topic groups into consolidated themes.
    theme_map: JSON string — array from review table with cluster_id, approve, rename_to fields.
    Recomputes centroids and paper counts from actual embeddings.
    Saves {run_key}_themes.json. Phase 3 of Braun & Clarke."""
    decisions = json.loads(theme_map)
    emb = np.load(_opath(f"{run_key}_emb.npy"))
    paper_ids = _load(f"{run_key}_sentences.json")["paper_ids"]
    summaries = _load(f"{run_key}_summaries.json")
    # cluster_id → summary (carries the sentence_indices stored at discovery)
    sum_map = {s["cluster_id"]: s for s in summaries}
    approved = [d for d in decisions
                if str(d.get("approve", "")).strip().upper() == "YES"]

    def theme_name(d):
        # rename_to wins when it is non-blank, then the label, then "Topic <id>".
        renamed = str(d.get("rename_to") or "").strip()
        labelled = str(d.get("label") or "").strip()
        return renamed or labelled or f"Topic {d['cluster_id']}"

    # Group cluster ids by theme name, keeping first-seen order.  The review
    # table is small, so the repeated scan is not a concern.
    named = [(theme_name(d), int(d["cluster_id"])) for d in approved]
    theme_groups = {name: [cid for other, cid in named if other == name]
                    for name, _ in named}

    def build_theme(name_cids_tuple):
        name, cids = name_cids_tuple
        known = [cid for cid in cids if cid in sum_map]
        # Only the sentence indices stored at discovery are used.  The former
        # fallback compared paper row numbers with cluster ids and therefore
        # pulled in unrelated sentences; it has been removed.
        use_idx = sorted({i for cid in known
                          for i in sum_map[cid].get("sentence_indices", [])})
        # Without any indices the first embedding row stands in so that a
        # centroid of the right dimension is still produced.
        theme_emb = emb[use_idx] if use_idx else emb[:1]
        centroid = theme_emb.mean(axis=0)
        total_sents = sum(sum_map[cid]["sentence_count"] for cid in known)
        unique_papers = {paper_ids[i] for i in use_idx}
        # Evidence sentences come from the first cluster that has a summary.
        top_sents = sum_map[known[0]]["top_sentences"][:3] if known else []
        return {
            "theme_name": name,
            "merged_cluster_ids": cids,
            "sentence_count": total_sents,
            "paper_count": len(unique_papers),
            "top_sentences": top_sents,
            "centroid": centroid.tolist(),
        }

    themes = sorted(map(build_theme, theme_groups.items()),
                    key=lambda x: x["sentence_count"], reverse=True)
    _save(themes, f"{run_key}_themes.json")
    _generate_charts(run_key, themes, name_key="theme_name")
    return (
        f"✅ Themes Consolidated ({run_key})\n"
        f"Approved topics: {len(approved)}\n"
        f"Final themes: {len(themes)}\n"
        f"Theme names: {[t['theme_name'] for t in themes]}\n"
        f"Review consolidated themes. Click Submit Review to confirm."
    )
# ─── TOOL 5: COMPARE WITH TAXONOMY ───────────────────────────────────────────
@tool
def compare_with_taxonomy(run_key: str) -> str:
    """Map final themes to the PAJAIS taxonomy (Jiang et al. 2019) — 25 categories.
    Classifies each theme as MAPPED or NOVEL.
    Saves taxonomy_map.json. Phase 5.5 of Braun & Clarke.
    run_key must be 'abstract' or 'title'."""
    # Consolidated themes are preferred; the raw labels are the fallback.
    themes_file = (f"{run_key}_themes.json"
                   if os.path.exists(_opath(f"{run_key}_themes.json"))
                   else f"{run_key}_labels.json")
    themes_raw = _load(themes_file)
    theme_names = [t.get("theme_name", t.get("label", "")) for t in themes_raw]
    llm = _get_llm()
    tax_prompt = PromptTemplate.from_template(
        "You are a bibliometric taxonomy expert.\n"
        "Map each theme to the PAJAIS taxonomy (Jiang et al., 2019).\n\n"
        "PAJAIS 25 categories:\n{pajais}\n\n"
        "Themes to classify:\n{themes}\n\n"
        "Return ONLY a JSON array:\n"
        '[{{"theme": "...", "pajais_match": "category or NOVEL", '
        '"match_confidence": 0.0-1.0, "reasoning": "...", "is_novel": true/false}}]\n'
        "If no PAJAIS category fits well, set pajais_match to NOVEL and is_novel to true.\n"
        "No markdown fences, return raw JSON only."
    )
    pajais_str = "\n".join(f"{i+1}. {c}" for i, c in enumerate(PAJAIS_25))
    themes_str = "\n".join(f"- {n}" for n in theme_names)
    raw = (tax_prompt | llm | StrOutputParser()).invoke(
        {"pajais": pajais_str, "themes": themes_str}
    )
    # Remove an opening ```/```json fence and a closing ``` fence.
    # str.lstrip("```json") strips a *set of characters*, not this prefix.
    fence_re = re.compile(r"^\s*```(?:json)?\s*|\s*```\s*$", re.IGNORECASE)
    parsed = json.loads(fence_re.sub("", raw).strip())
    candidates = parsed if isinstance(parsed, list) else [parsed]
    results = [r for r in candidates if isinstance(r, dict)]

    def is_novel(r):
        # The model sometimes sets pajais_match to NOVEL while leaving
        # is_novel false; either signal marks the theme as novel.
        flagged = bool(r.get("is_novel", False))
        matched_novel = str(r.get("pajais_match", "")).strip().upper() == "NOVEL"
        return flagged or matched_novel

    mapped = [r for r in results if not is_novel(r)]
    novel = [r for r in results if is_novel(r)]
    covered = {r.get("pajais_match") for r in mapped}
    gaps = [c for c in PAJAIS_25 if c not in covered]
    novel_names = [r.get("theme", "") for r in novel]
    taxonomy_map = {
        "run_key": run_key,
        "taxonomy_mapping": {r.get("theme", ""): r for r in results},
        "novel_themes": novel_names,
        "pajais_gap_categories": gaps,
        "coverage_stats": {
            "total_themes": len(results),
            "mapped": len(mapped),
            "novel": len(novel),
        },
    }
    _save(taxonomy_map, "taxonomy_map.json")
    return (
        f"✅ PAJAIS Taxonomy Mapped ({run_key})\n"
        f"Themes mapped: {len(mapped)}\n"
        f"NOVEL themes: {len(novel)} → {novel_names}\n"
        f"PAJAIS gaps (top 5): {gaps[:5]}\n"
        f"taxonomy_map.json saved. Review PAJAIS mapping in table. Click Submit Review."
    )
# ─── TOOL 6: GENERATE COMPARISON CSV ─────────────────────────────────────────
@tool
def generate_comparison_csv() -> str:
    """Load themes from both abstract and title runs and create a side-by-side comparison.
    Identifies STABLE (convergent), ABSTRACT-ONLY, and TITLE-ONLY themes.
    Saves comparison.csv. Phase 6 of Braun & Clarke."""
    def load_themes(key):
        # Consolidated themes are preferred; the raw labels are the fallback.
        themed = f"{key}_themes.json"
        return _load(themed if os.path.exists(_opath(themed)) else f"{key}_labels.json")

    def theme_name(t):
        return t.get("theme_name", t.get("label", ""))

    def evidence(t):
        # First representative sentence, or "" when none is stored.
        return " | ".join(t.get("top_sentences", [""])[:1])

    abs_themes = load_themes("abstract")
    ttl_themes = load_themes("title")
    abs_names = [theme_name(t) for t in abs_themes]
    ttl_names = [theme_name(t) for t in ttl_themes]
    abs_name_set = set(abs_names)
    ttl_name_set = set(ttl_names)
    max_len = max(len(abs_themes), len(ttl_themes))

    def pad(values, filler):
        return values + [filler] * (max_len - len(values))

    # Rows that carry an abstract theme are classified from the abstract
    # side.  Rows beyond the last abstract theme hold only a title theme; it
    # is TITLE-ONLY unless the same name also exists among the abstract
    # themes, in which case it is STABLE.
    convergence = (
        ["STABLE" if name in ttl_name_set else "ABSTRACT-ONLY" for name in abs_names]
        + ["STABLE" if name in abs_name_set else "TITLE-ONLY"
           for name in ttl_names[len(abs_names):]]
    )
    df = pd.DataFrame({
        "Abstract_Theme": pad(abs_names, ""),
        "Abstract_Evidence": pad([evidence(t) for t in abs_themes], ""),
        "Abstract_Sentences": pad([t.get("sentence_count", 0) for t in abs_themes], 0),
        "Title_Theme": pad(ttl_names, ""),
        "Title_Evidence": pad([evidence(t) for t in ttl_themes], ""),
        "Title_Sentences": pad([t.get("sentence_count", 0) for t in ttl_themes], 0),
        "Convergence": convergence,
    })
    path = _opath("comparison.csv")
    df.to_csv(path, index=False)
    return (
        f"✅ Comparison CSV Generated\n"
        f"Abstract themes: {len(abs_themes)}\n"
        f"Title themes: {len(ttl_themes)}\n"
        f"Rows: {len(df)}\nFile: {path}\n"
        f"Check Download tab for comparison.csv. Click Submit Review to generate narrative."
    )
# ─── TOOL 7: EXPORT NARRATIVE ─────────────────────────────────────────────────
@tool
def export_narrative(run_key: str) -> str:
    """Generate a 500-word Section 7 narrative via Mistral LLM.
    Uses themes + PAJAIS taxonomy mapping as context.
    Saves narrative.txt. Phase 6 of Braun & Clarke.
    run_key must be 'abstract' or 'title'."""
    cfg = _load("corpus_config.json")
    # Consolidated themes are preferred; the raw labels are the fallback.
    theme_file = (f"{run_key}_themes.json"
                  if os.path.exists(_opath(f"{run_key}_themes.json"))
                  else f"{run_key}_labels.json")
    themes = _load(theme_file)
    tax = _load("taxonomy_map.json")
    theme_names = [t.get("theme_name", t.get("label", "")) for t in themes]
    novel_themes = tax.get("novel_themes", [])
    gaps = tax.get("pajais_gap_categories", [])
    mapped = tax.get("coverage_stats", {}).get("mapped", 0)
    llm = _get_llm()
    # The prompt names the run actually being narrated (it used to say
    # "abstract run" even for the title run).
    narr_prompt = PromptTemplate.from_template(
        "Write a 500-word Section 7 for a conference paper on topic modelling.\n"
        "Journal: {journal} | Papers: {papers} | Years: {y_min}–{y_max}\n"
        "Stable BERTopic themes ({run_key} run): {themes}\n"
        "NOVEL themes (not in PAJAIS 2019): {novel}\n"
        "PAJAIS gap categories: {gaps}\n"
        "Themes mapped to PAJAIS: {mapped}\n\n"
        "Structure: 7.1 Methodology (LDA + BERTopic, Braun & Clarke 2006), "
        "7.2 RQ4 LDA Findings, 7.3 RQ5 Abstract vs Title Comparison, "
        "7.4 RQ6 PAJAIS Taxonomy Mapping with NOVEL theme justification, "
        "7.5 RQ7 Future Research Agenda.\n"
        "Cite: Braun & Clarke (2006), Jiang et al. (2019), Grootendorst (2022).\n"
        "~500 words, academic tone, no bullet points, paragraph form."
    )
    # Missing config values fall back to the same neutral placeholders that
    # load_scopus_csv writes ("Unknown" / 0) instead of figures from an
    # unrelated corpus; empty lists are spelled out as "none".
    narrative = (narr_prompt | llm | StrOutputParser()).invoke({
        "journal": cfg.get("journal", "Unknown"),
        "papers": cfg.get("rows", 0),
        "y_min": cfg.get("year_min", 0),
        "y_max": cfg.get("year_max", 0),
        "run_key": run_key,
        "themes": ", ".join(theme_names[:10]) or "none",
        "novel": ", ".join(novel_themes[:5]) or "none",
        "gaps": ", ".join(gaps[:5]) or "none",
        "mapped": mapped,
    })
    path = _opath("narrative.txt")
    with open(path, "w", encoding="utf-8") as f:
        f.write(narrative)
    return (
        f"✅ Narrative Exported\n"
        f"Words: {len(narrative.split())}\n"
        f"File: {path}\n"
        f"🎉 Pipeline complete! Download narrative.txt from the Download tab.\n"
        f"Deliverables: comparison.csv | taxonomy_map.json | narrative.txt"
    )
# --- SET handle_tool_error ON ALL TOOLS (BaseTool property) ---
# langchain-core 0.3.x: handle_tool_error is a BaseTool property,
# not a @tool() decorator argument. Set via a comprehension - zero loop statements.
_ALL_TOOLS = [
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    consolidate_into_themes,
    compare_with_taxonomy,
    generate_comparison_csv,
    export_narrative,
]
[setattr(registered, "handle_tool_error", True) for registered in _ALL_TOOLS]