THEMIS-BERTopic / tools.py
anujjuna's picture
Update tools.py
b41e337 verified
"""
tools.py — 7 @tool functions for BERTopic Agentic Thematic Analysis
Generated for: Braun & Clarke (2006) 6-Phase Framework Pipeline
"""
import json
import os
import re
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from langchain_core.tools import tool
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_mistralai import ChatMistralAI
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from bertopic import BERTopic
import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import sent_tokenize
# ── Constants ──────────────────────────────────────────────────────────────────
# Model name handed to SentenceTransformer() when embeddings are computed.
EMBED_MODEL = "all-MiniLM-L6-v2"
# Directory holding every checkpoint this module reads or writes
# (CSV copy, .npy embeddings, JSON checkpoints, comparison CSV, narrative).
CHECKPOINT_DIR = "checkpoints"
# How many sentences closest to a centroid are kept as evidence for a topic/theme.
NEAREST_K = 5
# Maximum number of topics sent to the LLM for labelling.
MAX_LABEL_TOPICS = 100
# Import-time side effect: make sure the checkpoint directory exists.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
# Category names offered to the LLM in compare_with_taxonomy; they are
# numbered in list order when inserted into the prompt.
PAJAIS_CATEGORIES = [
    "Artificial Intelligence & Machine Learning",
    "Big Data & Analytics",
    "Blockchain & Distributed Ledger",
    "Business Intelligence & Decision Support",
    "Cloud Computing & Infrastructure",
    "Cybersecurity & Privacy",
    "Digital Transformation & Strategy",
    "E-Commerce & Digital Markets",
    "Enterprise Systems & ERP",
    "Ethics & Governance in IS",
    "Health Informatics & eHealth",
    "Human-Computer Interaction & UX",
    "Information Systems Theory & Foundations",
    "Internet of Things & Cyber-Physical Systems",
    "IS Education & Curriculum",
    "IS in Developing Countries",
    "IS Project Management & Implementation",
    "Knowledge Management & Organizational Learning",
    "Mobile & Ubiquitous Computing",
    "Natural Language Processing & Text Mining",
    "Open Source & Collaborative Systems",
    "Platforms & Ecosystems",
    "Social Media & Online Communities",
    "Supply Chain & Logistics IS",
    "Virtual Reality & Immersive Technologies",
]
# Regular expressions removed from titles/abstracts before sentence splitting.
# They are applied in list order, case-insensitively, each match being replaced
# by a single space; the final pattern collapses the whitespace runs this leaves.
BOILERPLATE_PATTERNS = [
    # Publisher copyright line. The leading character must be the real
    # copyright sign; a mis-encoded "Β©" never matches actual abstracts.
    r"©\s*\d{4}.*?(elsevier|springer|wiley|taylor|emerald|sage|ieee|acm|informs).*?\.",
    r"all rights reserved\.?",
    r"published by.*?\.",
    r"doi:\s*\S+",
    r"http[s]?://\S+",
    r"this article is protected by copyright.*?\.",
    r"please cite this article.*?\.",
    r"accepted manuscript.*?\.",
    r"preprint.*?\.",
    r"peer.reviewed.*?\.",
    r"received:\s*\d+.*?accepted:\s*\d+.*?\.",
    r"keywords:.*?\.",
    r"jel classification.*?\.",
    # Section headings, optionally followed by a hyphen/en dash/em dash.
    r"abstract[-–—]?\s*",
    r"introduction[-–—]?\s*$",
    # Generic framing phrases that carry no topical signal.
    r"in this (paper|study|article|research).*?we (propose|present|examine|investigate|explore)",
    r"the purpose of this (paper|study|article)",
    r"this (paper|study|article) (aims|seeks|investigates|examines|explores|presents)",
    r"we (propose|present|examine|investigate|explore)",
    # Discourse connectives.
    r"\b(furthermore|moreover|however|nevertheless|therefore|thus|hence)\b",
    # Text consisting only of a number.
    r"^\s*\d+\s*$",
    # Must stay last: squeezes runs of whitespace into one space.
    r"\s{2,}",
]
# ── Helpers ────────────────────────────────────────────────────────────────────
# In-process cache for the loaded DataFrame; the only key used is "df".
_df_cache: dict = {}
# In-process cache for embedding arrays, keyed "<run_key>_emb".
_embeddings_cache: dict = {}
def _get_llm():
    """Create the Mistral chat model used by the LLM-backed tools.

    The API key comes from the MISTRAL_API_KEY environment variable; an empty
    string is passed when the variable is not set.
    """
    mistral_key = os.environ.get("MISTRAL_API_KEY", "")
    llm_settings = {
        "model": "mistral-large-latest",
        "temperature": 0.1,
        "api_key": mistral_key,
    }
    return ChatMistralAI(**llm_settings)
def _clean_text(text: str) -> str:
    """Strip boilerplate from *text* and return the trimmed remainder.

    Every pattern in BOILERPLATE_PATTERNS is applied in order,
    case-insensitively, with each match replaced by one space. Anything that
    is not a string yields an empty string.
    """
    if isinstance(text, str):
        cleaned = text
        for pattern in BOILERPLATE_PATTERNS:
            cleaned = re.sub(pattern, " ", cleaned, flags=re.IGNORECASE)
        return cleaned.strip()
    return ""
def _load_df() -> pd.DataFrame:
    """Return the working DataFrame, reading the CSV checkpoint on first use.

    Raises:
        FileNotFoundError: nothing is cached and the checkpoint CSV is absent.
    """
    if "df" not in _df_cache:
        csv_path = os.path.join(CHECKPOINT_DIR, "scopus_data.csv")
        if not os.path.exists(csv_path):
            raise FileNotFoundError("No CSV loaded. Please upload your Scopus CSV first.")
        _df_cache["df"] = pd.read_csv(csv_path)
    return _df_cache["df"]
def _get_sentences(run_key: str) -> list[str]:
    """Collect the cleaned text units that are clustered for *run_key*.

    For 'abstract' each abstract is split into sentences and only sentences
    longer than 30 characters are kept. For any other key each cleaned cell is
    kept whole when longer than 10 characters. The column read is "Title" for
    'title' and "Abstract" for everything else.
    """
    frame = _load_df()
    key = run_key.lower()
    column = {"abstract": "Abstract", "title": "Title"}.get(key, "Abstract")
    split_into_sentences = key == "abstract"

    collected = []
    for raw in frame[column].dropna():
        cleaned = _clean_text(str(raw))
        if split_into_sentences:
            pieces = (piece.strip() for piece in sent_tokenize(cleaned))
            collected.extend(piece for piece in pieces if len(piece) > 30)
        elif len(cleaned.strip()) > 10:
            collected.append(cleaned.strip())
    return collected
def _embed(sentences: list[str], run_key: str) -> np.ndarray:
    """Return one embedding row per sentence, reusing caches when still valid.

    Lookup order is the in-process cache, then the ``<run_key>_emb.npy``
    checkpoint, then a fresh encode with EMBED_MODEL (normalised embeddings).
    A cached array is only reused when its row count equals
    ``len(sentences)``; otherwise it belongs to a different dataset and is
    recomputed and overwritten. The row-count check is a heuristic: a
    different dataset with exactly the same number of sentences would still
    be served from the cache.

    Args:
        sentences: Texts to embed, in the order rows should be returned.
        run_key: Name used to key the cache and the checkpoint file.

    Returns:
        Array with one row per entry in *sentences*.
    """
    cache_key = f"{run_key}_emb"
    emb_path = os.path.join(CHECKPOINT_DIR, f"{run_key}_emb.npy")
    expected_rows = len(sentences)

    cached = _embeddings_cache.get(cache_key)
    if cached is not None and len(cached) == expected_rows:
        return cached

    if os.path.exists(emb_path):
        emb = np.load(emb_path)
        # Rows must line up 1:1 with sentences; a mismatch means the file was
        # produced from another CSV and indexing with it would be wrong.
        if len(emb) == expected_rows:
            _embeddings_cache[cache_key] = emb
            return emb

    model = SentenceTransformer(EMBED_MODEL)
    emb = model.encode(sentences, normalize_embeddings=True, show_progress_bar=False)
    np.save(emb_path, emb)
    _embeddings_cache[cache_key] = emb
    return emb
def _save_json(data, filename: str):
    """Write *data* as indented UTF-8 JSON to *filename* inside CHECKPOINT_DIR.

    Non-ASCII characters are written as-is rather than escaped.
    """
    target = os.path.join(CHECKPOINT_DIR, filename)
    with open(target, "w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=2, ensure_ascii=False)
def _load_json(filename: str):
    """Read a JSON checkpoint from CHECKPOINT_DIR.

    Returns the parsed content, or None when the file does not exist.
    """
    source = os.path.join(CHECKPOINT_DIR, filename)
    if os.path.exists(source):
        with open(source, "r", encoding="utf-8") as handle:
            return json.load(handle)
    return None
# ── Tool 1: Load CSV ───────────────────────────────────────────────────────────
@tool
def load_scopus_csv(filepath: str) -> str:
    """
    Load a Scopus CSV export and prepare it for topic modelling.
    Returns paper count, abstract sentence count, and title sentence count.
    Saves the cleaned dataframe as a checkpoint for subsequent tools.
    Args:
    filepath: Path to the uploaded Scopus CSV file.
    """
    # The docstring above is what LangChain exposes as the tool description,
    # so its wording is kept unchanged.
    #
    # Side effects: writes scopus_data.csv and summaries.json to
    # CHECKPOINT_DIR, replaces the cached DataFrame, and discards embeddings
    # computed for a previously loaded CSV.
    df = pd.read_csv(filepath)

    # Normalise headers: trim whitespace, then map case-insensitive variants
    # of the expected Scopus columns onto their canonical spelling.
    df.columns = [c.strip() for c in df.columns]
    expected = ["Authors", "Title", "Abstract", "Author Keywords", "Cited by", "Source title", "Year"]
    by_lower: dict = {}
    for existing in df.columns:
        by_lower.setdefault(existing.lower(), existing)  # first match wins
    rename_map = {
        by_lower[col.lower()]: col
        for col in expected
        if col not in df.columns and col.lower() in by_lower
    }
    if rename_map:
        df = df.rename(columns=rename_map)

    if "Abstract" not in df.columns and "Title" not in df.columns:
        return "CSV has neither an 'Abstract' nor a 'Title' column. Please upload a Scopus export."

    # Save checkpoint
    save_path = os.path.join(CHECKPOINT_DIR, "scopus_data.csv")
    df.to_csv(save_path, index=False)
    _df_cache["df"] = df

    # Embeddings are cached per run_key, not per dataset; anything computed
    # for the previous CSV no longer lines up with this one.
    _embeddings_cache.clear()
    for name in os.listdir(CHECKPOINT_DIR):
        if name.endswith("_emb.npy"):
            os.remove(os.path.join(CHECKPOINT_DIR, name))

    n_papers = len(df)

    # Count abstract sentences using the same >30-character filter that the
    # abstract run applies.
    abstract_sents = 0
    if "Abstract" in df.columns:
        for text in df["Abstract"].dropna():
            cleaned = _clean_text(str(text))
            abstract_sents += sum(1 for s in sent_tokenize(cleaned) if len(s.strip()) > 30)

    # Titles are counted one per non-empty cell.
    title_sents = int(df["Title"].notna().sum()) if "Title" in df.columns else 0

    # Year may be missing, empty, or contain non-numeric junk.
    year_range = "N/A"
    if "Year" in df.columns:
        years = pd.to_numeric(df["Year"], errors="coerce").dropna()
        if not years.empty:
            year_range = f"{int(years.min())} – {int(years.max())}"

    # Save summaries checkpoint (Phase 1 marker)
    summary_data = {
        "n_papers": n_papers,
        "abstract_sentences": abstract_sents,
        "title_sentences": title_sents,
        "columns": list(df.columns),
        "year_range": year_range,
    }
    _save_json(summary_data, "summaries.json")

    return (
        f"✅ CSV loaded successfully!\n\n"
        f"📄 Papers: {n_papers:,}\n"
        f"📝 Abstract sentences (after cleaning): {abstract_sents:,}\n"
        f"🏷️ Title sentences: {title_sents:,}\n"
        f"📅 Year range: {summary_data['year_range']}\n"
        f"📊 Columns detected: {', '.join(df.columns.tolist())}\n\n"
        f"Phase 1 (Familiarisation) complete. Type 'run abstract' to begin Phase 2."
    )
# ── Tool 2: Run BERTopic ───────────────────────────────────────────────────────
@tool
def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
    """
    Run BERTopic clustering on either abstracts or titles.
    Uses SentenceTransformer embeddings in 384-dimensional space with
    AgglomerativeClustering (cosine metric, average linkage). No UMAP reduction.
    Generates 4 interactive Plotly charts. Saves summaries.json and emb.npy.
    Args:
    run_key: Either 'abstract' or 'title'
    threshold: AgglomerativeClustering distance threshold (default 0.7)
    """
    # The docstring above is what LangChain exposes as the tool description,
    # so its wording is kept unchanged.
    #
    # Writes <run_key>_summaries.json, summaries.json and
    # <run_key>_charts.json to CHECKPOINT_DIR and returns a status message.
    from sklearn.decomposition import PCA

    sentences = _get_sentences(run_key)
    if not sentences:
        return f"No sentences found for run_key='{run_key}'. Check your CSV."

    # Embed
    emb = _embed(sentences, run_key)

    # Cluster in the full embedding space (no UMAP). Agglomerative clustering
    # rejects a single sample, so a lone sentence becomes its own topic.
    if len(sentences) < 2:
        labels = np.zeros(len(sentences), dtype=int)
    else:
        clustering = AgglomerativeClustering(
            metric="cosine",
            linkage="average",
            distance_threshold=threshold,
            n_clusters=None,
        )
        labels = clustering.fit_predict(emb)

    # topic id -> indices into `sentences` / rows of `emb`
    topics: dict = {}
    for idx, label in enumerate(labels):
        if label == -1:
            continue
        topics.setdefault(int(label), []).append(idx)
    n_clusters = len(topics)

    # For each topic keep the NEAREST_K sentences closest to its centroid.
    topic_summaries = []
    for topic_id, sent_indices in sorted(topics.items()):
        topic_embs = emb[sent_indices]
        centroid = topic_embs.mean(axis=0, keepdims=True)
        sims = cosine_similarity(centroid, topic_embs)[0]
        top_k = np.argsort(sims)[::-1][:NEAREST_K]
        top_sentences = [sentences[sent_indices[i]] for i in top_k]
        topic_summaries.append({
            "topic_id": int(topic_id),
            "count": len(sent_indices),
            "top_sentences": top_sentences,
            # Filled in later by the labelling step.
            "label": None,
            "category": None,
            "confidence": None,
            "reasoning": None,
            "niche": None,
            # Reviewer-editable fields.
            "approve": "",
            "rename_to": "",
            "user_reasoning": "",
        })

    # Largest topics first (stable sort keeps id order among equal sizes).
    topic_summaries.sort(key=lambda x: x["count"], reverse=True)

    # Save checkpoint. The threshold is recorded so the clustering that
    # produced these topic ids can be reproduced from the checkpoint alone.
    checkpoint = {"run_key": run_key, "threshold": threshold, "topics": topic_summaries}
    _save_json(checkpoint, f"{run_key}_summaries.json")
    _save_json(checkpoint, "summaries.json")

    # ── Generate Plotly charts ─────────────────────────────────────────────
    # Chart 1: Intertopic map from PCA-projected centroids of the 50 largest topics.
    map_topics = topic_summaries[:50]
    ids = [t["topic_id"] for t in map_topics]
    sizes = [t["count"] for t in map_topics]
    centroids = np.array([emb[topics[tid]].mean(axis=0) for tid in ids])
    if len(centroids) >= 2:
        coords = PCA(n_components=2).fit_transform(centroids)
    else:
        # PCA cannot extract 2 components from one sample; plot it at the origin.
        coords = np.zeros((len(centroids), 2))

    fig1 = go.Figure()
    fig1.add_trace(go.Scatter(
        x=coords[:, 0], y=coords[:, 1],
        mode="markers+text",
        marker=dict(size=[max(10, s / 2) for s in sizes], color=sizes,
                    colorscale="Viridis", showscale=True,
                    colorbar=dict(title="Sentences")),
        text=[f"T{i}" for i in ids],
        textposition="top center",
        hovertext=[f"Topic {i}<br>{s} sentences" for i, s in zip(ids, sizes)],
    ))
    fig1.update_layout(title=f"Intertopic Distance Map — {run_key.title()} ({n_clusters} topics)",
                       template="plotly_dark", height=500,
                       xaxis_title="PC1", yaxis_title="PC2")

    # Chart 2: Topic Size Bar Chart
    top_n = topic_summaries[:30]
    fig2 = px.bar(
        x=[t["count"] for t in top_n],
        y=[f"Topic {t['topic_id']}" for t in top_n],
        orientation="h",
        color=[t["count"] for t in top_n],
        color_continuous_scale="Plasma",
        title=f"Top 30 Topics by Size — {run_key.title()}",
        labels={"x": "Sentence Count", "y": "Topic"},
    )
    fig2.update_layout(template="plotly_dark", height=600)

    # Chart 3: centroid-similarity heatmap over at most the 20 largest topics.
    # Sized from the data so runs with fewer than 20 topics do not fail.
    heat_topics = topic_summaries[:20]
    heat_centroids = np.array([emb[topics[t["topic_id"]]].mean(axis=0) for t in heat_topics])
    sim_matrix = cosine_similarity(heat_centroids)
    heat_labels = [f"T{t['topic_id']}" for t in heat_topics]
    fig3 = go.Figure(go.Heatmap(
        z=sim_matrix,
        x=heat_labels,
        y=heat_labels,
        colorscale="RdBu", zmin=0, zmax=1,
    ))
    fig3.update_layout(title=f"Topic Similarity Heatmap (Top 20) — {run_key.title()}",
                       template="plotly_dark", height=500)

    # Chart 4: Sentence distribution
    fig4 = px.histogram(
        x=[t["count"] for t in topic_summaries],
        nbins=30, title=f"Topic Size Distribution — {run_key.title()}",
        labels={"x": "Sentences per Topic", "y": "Number of Topics"},
        color_discrete_sequence=["#7C3AED"],
    )
    fig4.update_layout(template="plotly_dark", height=400)

    # Save charts as embeddable HTML fragments.
    charts = {
        "intertopic": fig1.to_html(include_plotlyjs="cdn", full_html=False),
        "bars": fig2.to_html(include_plotlyjs="cdn", full_html=False),
        "heatmap": fig3.to_html(include_plotlyjs="cdn", full_html=False),
        "distribution": fig4.to_html(include_plotlyjs="cdn", full_html=False),
    }
    _save_json(charts, f"{run_key}_charts.json")

    return (
        f"✅ BERTopic clustering complete for {run_key}!\n\n"
        f"🔢 Topics discovered: {n_clusters}\n"
        f"📊 Sentences processed: {len(sentences):,}\n"
        f"📐 Embedding dimensions: {emb.shape[1]} (no UMAP reduction)\n"
        f"📏 Distance threshold: {threshold}\n\n"
        f"4 interactive charts saved. Calling label_topics_with_llm next..."
    )
# ── Tool 3: Label Topics ───────────────────────────────────────────────────────
@tool
def label_topics_with_llm(run_key: str) -> str:
    """
    Send top topics to Mistral LLM for labeling with research area names,
    categories, confidence scores, reasoning, and niche flag.
    Saves labels.json checkpoint.
    Args:
    run_key: Either 'abstract' or 'title'
    """
    # The docstring above is what LangChain exposes as the tool description,
    # so its wording is kept unchanged.
    #
    # Writes <run_key>_labels.json and labels.json and returns a status message.
    data = _load_json(f"{run_key}_summaries.json") or _load_json("summaries.json")
    # summaries.json may still hold the CSV-load summary, which has no topics.
    if not data or "topics" not in data:
        return "No topic summaries found. Run run_bertopic_discovery first."

    # Only the first MAX_LABEL_TOPICS topics are sent to the LLM.
    topics = data["topics"][:MAX_LABEL_TOPICS]
    llm = _get_llm()

    # Build prompt payload: up to three evidence sentences per topic,
    # truncated to 300 characters.
    topic_texts = []
    for t in topics:
        sents = " | ".join(t["top_sentences"][:3])
        topic_texts.append(f"Topic {t['topic_id']} ({t['count']} sentences): {sents[:300]}")

    prompt_template = PromptTemplate.from_template(
        """You are an expert academic researcher specializing in Information Systems and Computer Science.
Analyze these research topics extracted from journal abstracts/titles and label each one.
Topics:
{topics}
For each topic, respond with a JSON array. Each element must have:
- topic_id: integer
- label: concise research area name (3-7 words)
- category: broad category (e.g., "AI & ML", "HCI", "Security", "Data Management")
- confidence: float 0.0-1.0
- reasoning: one sentence explaining the label
- niche: boolean (true if highly specialized/narrow)
Respond ONLY with a valid JSON array. No markdown, no preamble, no explanation."""
    )
    parser = JsonOutputParser()
    chain = prompt_template | llm | parser
    result = chain.invoke({"topics": "\n".join(topic_texts)})

    # The model is asked for a bare array but may wrap it in an object;
    # accept the first list found in that case.
    if isinstance(result, dict):
        result = next((v for v in result.values() if isinstance(v, list)), [])

    # Index usable items by integer topic id; skip anything malformed rather
    # than failing the whole batch.
    label_map: dict = {}
    for item in result or []:
        if not isinstance(item, dict):
            continue
        try:
            label_map[int(item["topic_id"])] = item
        except (KeyError, TypeError, ValueError):
            continue

    # Merge labels back into every topic; topics the LLM did not cover get
    # placeholder values.
    labeled_topics = []
    for t in data["topics"]:
        lbl = label_map.get(t["topic_id"], {})
        labeled_topics.append({
            **t,
            "label": lbl.get("label", f"Topic {t['topic_id']}"),
            "category": lbl.get("category", "Uncategorized"),
            "confidence": lbl.get("confidence", 0.5),
            "reasoning": lbl.get("reasoning", ""),
            "niche": lbl.get("niche", False),
        })

    _save_json({"run_key": run_key, "topics": labeled_topics}, f"{run_key}_labels.json")
    _save_json({"run_key": run_key, "topics": labeled_topics}, "labels.json")

    # Report only topics that actually received an LLM label, not the
    # placeholders.
    labeled_count = sum(1 for t in data["topics"] if t["topic_id"] in label_map)
    return (
        f"✅ Topics labeled by LLM!\n\n"
        f"🏷️ Topics labeled: {labeled_count}\n"
        f"📋 Review the table below — check labels, approve or rename topics.\n\n"
        f"**Phase 2 complete. Review the table, edit Approve/Rename columns, then click Submit Review.**"
    )
# ── Tool 4: Consolidate Themes ─────────────────────────────────────────────────
@tool
def consolidate_into_themes(run_key: str, theme_map: str) -> str:
    """
    Consolidate approved topics into researcher-defined themes.
    Recomputes centroids for merged theme groups.
    Saves themes.json checkpoint.
    Args:
    run_key: Either 'abstract' or 'title'
    theme_map: JSON string mapping theme names to lists of topic IDs.
    Example: '{"AI in Healthcare": [0, 3, 7], "Blockchain": [1, 5]}'
    """
    # The docstring above is what LangChain exposes as the tool description,
    # so its wording is kept unchanged.
    #
    # Writes <run_key>_themes.json and themes.json and returns a status message.
    data = _load_json(f"{run_key}_labels.json") or _load_json("labels.json")
    if not data:
        return "No labeled topics found. Run label_topics_with_llm first."

    try:
        groupings = json.loads(theme_map) if isinstance(theme_map, str) else theme_map
    except json.JSONDecodeError as e:
        return f"Invalid theme_map JSON: {e}"
    if not isinstance(groupings, dict):
        return "Invalid theme_map JSON: expected an object mapping theme names to lists of topic IDs."

    sentences = _get_sentences(run_key)
    if not sentences:
        return f"No sentences found for run_key='{run_key}'. Check your CSV."
    emb = _embed(sentences, run_key)

    # Sentence membership per topic is not stored in the checkpoints, so the
    # clustering is re-run on the saved embeddings. Topic ids only match the
    # reviewed ones when the same distance threshold is used, so take it from
    # the summaries checkpoint when recorded there; otherwise use 0.7.
    summaries = _load_json(f"{run_key}_summaries.json") or _load_json("summaries.json")
    threshold = 0.7
    if isinstance(summaries, dict) and isinstance(summaries.get("threshold"), (int, float)):
        threshold = summaries["threshold"]

    if len(sentences) < 2:
        # Agglomerative clustering rejects a single sample.
        labels_arr = np.zeros(len(sentences), dtype=int)
    else:
        clustering = AgglomerativeClustering(
            metric="cosine", linkage="average",
            distance_threshold=threshold, n_clusters=None
        )
        labels_arr = clustering.fit_predict(emb)

    # topic id -> indices into `sentences` / rows of `emb`
    all_topic_indices: dict = {}
    for idx, lbl in enumerate(labels_arr):
        all_topic_indices.setdefault(int(lbl), []).append(idx)

    # Approximation carried over unchanged: every named theme reports the
    # total number of papers, because sentences are not traced back to papers.
    n_papers = len(_load_df())

    themes = []
    used_ids = set()
    for theme_name, raw_ids in groupings.items():
        if not isinstance(raw_ids, (list, tuple)):
            raw_ids = [raw_ids]
        # Topic ids may arrive as strings ("3"); coerce to int, drop anything
        # unusable, and ignore repeats within a theme.
        topic_ids = []
        for raw in raw_ids:
            try:
                tid = int(raw)
            except (TypeError, ValueError):
                continue
            if tid not in topic_ids:
                topic_ids.append(tid)

        merged_sentence_indices = []
        for tid in topic_ids:
            merged_sentence_indices.extend(all_topic_indices.get(tid, []))
            used_ids.add(tid)
        if not merged_sentence_indices:
            continue

        # Evidence = the NEAREST_K sentences closest to the merged centroid.
        theme_embs = emb[merged_sentence_indices]
        centroid = theme_embs.mean(axis=0, keepdims=True)
        sims = cosine_similarity(centroid, theme_embs)[0]
        top_k = np.argsort(sims)[::-1][:NEAREST_K]
        top_sents = [sentences[merged_sentence_indices[i]] for i in top_k]

        themes.append({
            "theme_name": theme_name,
            "topic_ids": topic_ids,
            "sentence_count": len(merged_sentence_indices),
            "paper_count": n_papers,
            "top_sentences": top_sents,
            "approve": "",
            "rename_to": "",
            "user_reasoning": "",
            "pajais_match": None,
            "is_novel": None,
        })

    # Topics not assigned to any theme are pooled into one "Uncategorized" theme.
    uncategorized = [tid for tid in all_topic_indices if tid not in used_ids]
    if uncategorized:
        merged = []
        for tid in uncategorized:
            merged.extend(all_topic_indices[tid])
        if merged:
            themes.append({
                "theme_name": "Uncategorized",
                "topic_ids": uncategorized,
                "sentence_count": len(merged),
                "paper_count": 0,
                "top_sentences": [sentences[i] for i in merged[:3]],
                "approve": "",
                "rename_to": "",
                "user_reasoning": "",
                "pajais_match": None,
                "is_novel": None,
            })

    _save_json({"run_key": run_key, "themes": themes}, f"{run_key}_themes.json")
    _save_json({"run_key": run_key, "themes": themes}, "themes.json")

    return (
        f"✅ Themes consolidated!\n\n"
        f"🗂️ Themes created: {len(themes)}\n"
        f"📊 Total sentences covered: {sum(t['sentence_count'] for t in themes):,}\n\n"
        f"**Phase 3 complete. Review consolidated themes in the table. Click Submit Review.**"
    )
# ── Tool 5: Compare with PAJAIS Taxonomy ──────────────────────────────────────
@tool
def compare_with_taxonomy(run_key: str) -> str:
    """
    Map final themes to PAJAIS 25-category taxonomy using Mistral LLM.
    Identifies NOVEL themes not covered by existing taxonomy.
    Saves taxonomy_map.json checkpoint.
    Args:
    run_key: Either 'abstract' or 'title'
    """
    # The docstring above is what LangChain exposes as the tool description,
    # so its wording is kept unchanged.
    #
    # Writes <run_key>_taxonomy_map.json and taxonomy_map.json and returns a
    # status message.
    data = _load_json(f"{run_key}_themes.json") or _load_json("themes.json")
    themes = (data or {}).get("themes")
    if not themes:
        return "No themes found. Run consolidate_into_themes first."

    llm = _get_llm()

    # Two evidence sentences per theme, truncated to 250 characters.
    theme_descriptions = []
    for t in themes:
        sents = " | ".join(t["top_sentences"][:2])
        theme_descriptions.append(
            f"Theme: {t['theme_name']}\nEvidence: {sents[:250]}"
        )

    prompt_template = PromptTemplate.from_template(
        """You are an expert in Information Systems research taxonomy.
Map each research theme to the PAJAIS (Pacific Asia Journal of the Association for Information Systems) taxonomy categories, or flag as NOVEL if no match exists.
PAJAIS Categories:
{categories}
Themes to map:
{themes}
For each theme, respond with a JSON array. Each element must have:
- theme_name: string (exact match from input)
- pajais_match: string (exact PAJAIS category name, or "NOVEL")
- match_confidence: float 0.0-1.0
- reasoning: one sentence justification
- is_novel: boolean (true if NOVEL)
- evidence_summary: brief description of what the theme covers
Respond ONLY with valid JSON array. No markdown."""
    )
    parser = JsonOutputParser()
    chain = prompt_template | llm | parser
    result = chain.invoke({
        "categories": "\n".join(f"{i+1}. {c}" for i, c in enumerate(PAJAIS_CATEGORIES)),
        "themes": "\n\n".join(theme_descriptions),
    })

    # The model is asked for a bare array but may wrap it in an object.
    if isinstance(result, dict):
        result = next((v for v in result.values() if isinstance(v, list)), [])

    # Index usable items by theme name, skipping malformed entries.
    result_map = {
        item["theme_name"]: item
        for item in result or []
        if isinstance(item, dict) and "theme_name" in item
    }

    taxonomy_themes = []
    for t in themes:
        mapping = result_map.get(t["theme_name"], {})
        pajais_match = mapping.get("pajais_match") or "NOVEL"
        reasoning = mapping.get("reasoning", "")
        # When the model omits is_novel, derive it from the match so the two
        # fields cannot contradict each other.
        is_novel = bool(mapping.get("is_novel", pajais_match == "NOVEL"))
        taxonomy_themes.append({
            **t,
            "pajais_match": pajais_match,
            "match_confidence": mapping.get("match_confidence", 0.0),
            "reasoning": reasoning,
            "is_novel": is_novel,
            "evidence_summary": mapping.get("evidence_summary", ""),
            # The mapping is prepended to the evidence so it appears in the
            # review table's evidence column.
            "top_sentences": [
                f"→ {pajais_match} | {reasoning}"
            ] + t.get("top_sentences", [])[:2],
        })

    novel_count = len([t for t in taxonomy_themes if t.get("is_novel")])
    mapped_count = len(taxonomy_themes) - novel_count
    _save_json({"run_key": run_key, "themes": taxonomy_themes}, f"{run_key}_taxonomy_map.json")
    _save_json({"run_key": run_key, "themes": taxonomy_themes}, "taxonomy_map.json")

    return (
        f"✅ PAJAIS taxonomy mapping complete!\n\n"
        f"✅ MAPPED themes: {mapped_count}\n"
        f"🆕 NOVEL themes: {novel_count}\n\n"
        f"**Phase 5.5 complete. Review PAJAIS mapping in the table (Top Evidence column shows → PAJAIS match). "
        f"Click Submit Review.**"
    )
# ── Tool 6: Generate Comparison CSV ───────────────────────────────────────────
@tool
def generate_comparison_csv() -> str:
    """
    Compare abstract themes vs title themes side-by-side.
    Creates a convergence/divergence analysis CSV.
    Saves comparison.csv checkpoint.
    """
    # The docstring above is what LangChain exposes as the tool description,
    # so its wording is kept unchanged.
    #
    # Writes comparison.csv to CHECKPOINT_DIR and returns a status message.

    def _by_name(payload) -> dict:
        """Index a checkpoint's themes by theme_name (empty if unavailable)."""
        return {
            t["theme_name"]: t
            for t in (payload or {}).get("themes", [])
            if "theme_name" in t
        }

    def _taxonomy_fields(theme: dict, mapped: dict) -> tuple:
        """Return (pajais_match, is_novel), preferring the taxonomy mapping."""
        source = mapped if mapped.get("pajais_match") is not None else theme
        return source.get("pajais_match") or "N/A", bool(source.get("is_novel"))

    def _top_evidence(theme: dict) -> str:
        """First evidence sentence, truncated to 200 characters."""
        sents = theme.get("top_sentences") or [""]
        return str(sents[0])[:200]

    abstract_data = _load_json("abstract_themes.json") or _load_json("themes.json")
    title_data = _load_json("title_themes.json")
    if not abstract_data:
        return "Abstract themes not found. Complete abstract analysis first."
    if not title_data:
        return "Title themes not found. Complete title analysis first (run title analysis)."

    abstract_themes = _by_name(abstract_data)
    title_themes = _by_name(title_data)
    # The themes checkpoints carry pajais_match/is_novel as None; the actual
    # mapping lives in the taxonomy checkpoints, which may not exist yet.
    abstract_tax = _by_name(_load_json("abstract_taxonomy_map.json"))
    title_tax = _by_name(_load_json("title_taxonomy_map.json"))

    columns = [
        "Theme", "Abstract_Sentences", "Title_Sentences",
        "Abstract_PAJAIS", "Title_PAJAIS", "Abstract_Novel", "Title_Novel",
        "Convergence", "Top_Abstract_Evidence", "Top_Title_Evidence",
    ]
    rows = []
    for theme in sorted(set(abstract_themes) | set(title_themes)):
        a = abstract_themes.get(theme, {})
        t = title_themes.get(theme, {})
        if theme in abstract_themes and theme in title_themes:
            convergence = "CONVERGE"
        elif theme in abstract_themes:
            convergence = "ABSTRACT ONLY"
        else:
            convergence = "TITLE ONLY"
        a_match, a_novel = _taxonomy_fields(a, abstract_tax.get(theme, {}))
        t_match, t_novel = _taxonomy_fields(t, title_tax.get(theme, {}))
        rows.append({
            "Theme": theme,
            "Abstract_Sentences": a.get("sentence_count", 0),
            "Title_Sentences": t.get("sentence_count", 0),
            "Abstract_PAJAIS": a_match,
            "Title_PAJAIS": t_match,
            "Abstract_Novel": a_novel,
            "Title_Novel": t_novel,
            "Convergence": convergence,
            "Top_Abstract_Evidence": _top_evidence(a) if a else "",
            "Top_Title_Evidence": _top_evidence(t) if t else "",
        })

    # Explicit columns keep the header row even when there are no themes.
    df = pd.DataFrame(rows, columns=columns)
    save_path = os.path.join(CHECKPOINT_DIR, "comparison.csv")
    df.to_csv(save_path, index=False)

    converge = sum(1 for r in rows if r["Convergence"] == "CONVERGE")
    abstract_only = sum(1 for r in rows if r["Convergence"] == "ABSTRACT ONLY")
    title_only = sum(1 for r in rows if r["Convergence"] == "TITLE ONLY")
    return (
        f"✅ Comparison CSV generated!\n\n"
        f"🔄 Converging themes: {converge}\n"
        f"📝 Abstract-only themes: {abstract_only}\n"
        f"🏷️ Title-only themes: {title_only}\n\n"
        f"**Check the Download tab for comparison.csv. Click Submit Review to confirm.**"
    )
# ── Tool 7: Export Narrative ───────────────────────────────────────────────────
@tool
def export_narrative(run_key: str) -> str:
    """
    Generate a 500-word Section 7 literature review narrative using Mistral LLM.
    References B&C methodology, key themes, PAJAIS mapping, and limitations.
    Saves narrative.txt checkpoint.
    Args:
    run_key: Either 'abstract' or 'title'
    """
    # The docstring above is what LangChain exposes as the tool description,
    # so its wording is kept unchanged.
    #
    # Writes narrative.txt to CHECKPOINT_DIR and returns a status message
    # that includes the first 500 characters of the narrative.
    taxonomy_data = _load_json(f"{run_key}_taxonomy_map.json") or _load_json("taxonomy_map.json")
    if not taxonomy_data:
        return "No taxonomy mapping found. Run compare_with_taxonomy first."

    themes = taxonomy_data.get("themes", [])
    llm = _get_llm()

    # One bullet per theme for the prompt: name, mapping flag, short description.
    theme_summary = []
    for t in themes:
        novel_flag = " [NOVEL]" if t.get("is_novel") else f" [→ {t.get('pajais_match', '')}]"
        # Fall back to the reasoning text when the summary is missing or empty.
        description = t.get("evidence_summary") or t.get("reasoning", "")
        theme_summary.append(f"• {t['theme_name']}{novel_flag}: {description}")

    # summaries.json only holds n_papers until the discovery step overwrites
    # it with topic data, so fall back to counting rows of the loaded CSV.
    summaries_data = _load_json("summaries.json") or {}
    n_papers = summaries_data.get("n_papers")
    if n_papers is None:
        try:
            n_papers = len(_load_df())
        except FileNotFoundError:
            n_papers = "N/A"

    prompt_template = PromptTemplate.from_template(
        """You are an academic writer drafting a Section 7 (Thematic Analysis Results) for a peer-reviewed Information Systems journal paper.
Context:
- Dataset: {n_papers} papers from Scopus
- Method: BERTopic with AgglomerativeClustering (cosine metric, 384d embeddings, no UMAP), Braun & Clarke (2006) 6-phase framework
- Analysis type: {run_key} analysis
Themes discovered:
{themes}
Write a 500-word Section 7 that:
1. Opens with methodology overview (BERTopic, B&C phases, embedding approach)
2. Presents each major theme with evidence and paper count references
3. Discusses PAJAIS taxonomy alignment (MAPPED vs NOVEL themes)
4. Highlights the most significant NOVEL themes and their publication potential
5. Acknowledges limitations (single journal, time period, computational constraints)
6. Closes with implications for future research
Write in formal academic style. Use hedged language where appropriate. Do not use bullet points — write in flowing paragraphs."""
    )
    chain = prompt_template | llm
    response = chain.invoke({
        "n_papers": n_papers,
        "run_key": run_key,
        "themes": "\n".join(theme_summary),
    })
    # Chat models return a message object; anything else is stringified.
    narrative_text = response.content if hasattr(response, "content") else str(response)

    save_path = os.path.join(CHECKPOINT_DIR, "narrative.txt")
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(narrative_text)

    word_count = len(narrative_text.split())
    return (
        f"✅ Section 7 narrative exported!\n\n"
        f"📝 Word count: {word_count}\n"
        f"💾 Saved to: narrative.txt\n\n"
        f"**Phase 6 complete! All B&C phases finished. Check Download tab for all outputs.**\n\n"
        f"---\n\n{narrative_text[:500]}...\n\n*(Full narrative in narrative.txt)*"
    )
# ── All tools list ─────────────────────────────────────────────────────────────
# Every @tool defined in this module, listed in pipeline order.
ALL_TOOLS = [
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    consolidate_into_themes,
    compare_with_taxonomy,
    generate_comparison_csv,
    export_narrative,
]