"""
tools.py — 7 @tool functions for BERTopic Agentic Thematic Analysis
Generated for: Braun & Clarke (2006) 6-Phase Framework Pipeline
"""
import hashlib
import json
import os
import re
from typing import Optional

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from langchain_core.tools import tool
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_mistralai import ChatMistralAI
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from bertopic import BERTopic
import nltk

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import sent_tokenize

# ── Constants ──────────────────────────────────────────────────────────────────
EMBED_MODEL = "all-MiniLM-L6-v2"
CHECKPOINT_DIR = "checkpoints"
NEAREST_K = 5
MAX_LABEL_TOPICS = 100
# Fallback clustering threshold for checkpoints written before the threshold
# was persisted alongside the topic summaries.
DEFAULT_THRESHOLD = 0.7
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

PAJAIS_CATEGORIES = [
    "Artificial Intelligence & Machine Learning",
    "Big Data & Analytics",
    "Blockchain & Distributed Ledger",
    "Business Intelligence & Decision Support",
    "Cloud Computing & Infrastructure",
    "Cybersecurity & Privacy",
    "Digital Transformation & Strategy",
    "E-Commerce & Digital Markets",
    "Enterprise Systems & ERP",
    "Ethics & Governance in IS",
    "Health Informatics & eHealth",
    "Human-Computer Interaction & UX",
    "Information Systems Theory & Foundations",
    "Internet of Things & Cyber-Physical Systems",
    "IS Education & Curriculum",
    "IS in Developing Countries",
    "IS Project Management & Implementation",
    "Knowledge Management & Organizational Learning",
    "Mobile & Ubiquitous Computing",
    "Natural Language Processing & Text Mining",
    "Open Source & Collaborative Systems",
    "Platforms & Ecosystems",
    "Social Media & Online Communities",
    "Supply Chain & Logistics IS",
    "Virtual Reality & Immersive Technologies",
]

BOILERPLATE_PATTERNS = [
    r"©\s*\d{4}.*?(elsevier|springer|wiley|taylor|emerald|sage|ieee|acm|informs).*?\.",
    r"all rights reserved\.?",
    r"published by.*?\.",
    r"doi:\s*\S+",
    r"http[s]?://\S+",
    r"this article is protected by copyright.*?\.",
    r"please cite this article.*?\.",
    r"accepted manuscript.*?\.",
    r"preprint.*?\.",
    r"peer.reviewed.*?\.",
    r"received:\s*\d+.*?accepted:\s*\d+.*?\.",
    r"keywords:.*?\.",
    r"jel classification.*?\.",
    r"abstract[-–—]?\s*",
    r"introduction[-–—]?\s*$",
    r"in this (paper|study|article|research).*?we (propose|present|examine|investigate|explore)",
    r"the purpose of this (paper|study|article)",
    r"this (paper|study|article) (aims|seeks|investigates|examines|explores|presents)",
    r"we (propose|present|examine|investigate|explore)",
    r"\b(furthermore|moreover|however|nevertheless|therefore|thus|hence)\b",
    r"^\s*\d+\s*$",
    r"\s{2,}",
]

# Compiled once at import; _clean_text runs every pattern over every document.
_BOILERPLATE_RES = [re.compile(p, re.IGNORECASE) for p in BOILERPLATE_PATTERNS]

# Which dataframe column feeds each run; unknown run keys fall back to Abstract.
_COLUMN_FOR_RUN = {"abstract": "Abstract", "title": "Title"}

# ── Prompt templates ───────────────────────────────────────────────────────────
_LABEL_PROMPT = """You are an expert academic researcher specializing in Information Systems and Computer Science.
Analyze these research topics extracted from journal abstracts/titles and label each one.

Topics:
{topics}

For each topic, respond with a JSON array. Each element must have:
- topic_id: integer
- label: concise research area name (3-7 words)
- category: broad category (e.g., "AI & ML", "HCI", "Security", "Data Management")
- confidence: float 0.0-1.0
- reasoning: one sentence explaining the label
- niche: boolean (true if highly specialized/narrow)

Respond ONLY with a valid JSON array. No markdown, no preamble, no explanation."""

_TAXONOMY_PROMPT = """You are an expert in Information Systems research taxonomy.
Map each research theme to the PAJAIS (Pacific Asia Journal of the Association for Information Systems)
taxonomy categories, or flag as NOVEL if no match exists.

PAJAIS Categories:
{categories}

Themes to map:
{themes}

For each theme, respond with a JSON array. Each element must have:
- theme_name: string (exact match from input)
- pajais_match: string (exact PAJAIS category name, or "NOVEL")
- match_confidence: float 0.0-1.0
- reasoning: one sentence justification
- is_novel: boolean (true if NOVEL)
- evidence_summary: brief description of what the theme covers

Respond ONLY with valid JSON array. No markdown."""

_NARRATIVE_PROMPT = """You are an academic writer drafting a Section 7 (Thematic Analysis Results) for a
peer-reviewed Information Systems journal paper.

Context:
- Dataset: {n_papers} papers from Scopus
- Method: BERTopic with AgglomerativeClustering (cosine metric, 384d embeddings, no UMAP), Braun & Clarke (2006) 6-phase framework
- Analysis type: {run_key} analysis

Themes discovered:
{themes}

Write a 500-word Section 7 that:
1. Opens with methodology overview (BERTopic, B&C phases, embedding approach)
2. Presents each major theme with evidence and paper count references
3. Discusses PAJAIS taxonomy alignment (MAPPED vs NOVEL themes)
4. Highlights the most significant NOVEL themes and their publication potential
5. Acknowledges limitations (single journal, time period, computational constraints)
6. Closes with implications for future research

Write in formal academic style. Use hedged language where appropriate.
Do not use bullet points — write in flowing paragraphs."""

# ── Helpers ────────────────────────────────────────────────────────────────────
_df_cache: dict = {}
_embeddings_cache: dict = {}


def _get_llm():
    """Return the Mistral chat model used by every LLM-backed tool.

    Raises:
        RuntimeError: if MISTRAL_API_KEY is unset or empty. Failing here gives
            a clear message instead of an opaque auth error at request time.
    """
    api_key = os.environ.get("MISTRAL_API_KEY", "")
    if not api_key:
        raise RuntimeError(
            "MISTRAL_API_KEY is not set. Export it before running LLM tools."
        )
    return ChatMistralAI(
        model="mistral-large-latest",
        temperature=0.1,
        api_key=api_key,
    )


def _clean_text(text: str) -> str:
    """Replace every BOILERPLATE_PATTERNS match with a space and strip the result.

    Non-string input (e.g. NaN from pandas) yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    for pattern in _BOILERPLATE_RES:
        text = pattern.sub(" ", text)
    return text.strip()


def _load_df() -> pd.DataFrame:
    """Return the loaded dataframe from memory, or from the CSV checkpoint.

    Raises:
        FileNotFoundError: if no CSV has been loaded in this checkpoint dir.
    """
    if "df" in _df_cache:
        return _df_cache["df"]
    path = os.path.join(CHECKPOINT_DIR, "scopus_data.csv")
    if os.path.exists(path):
        _df_cache["df"] = pd.read_csv(path)
        return _df_cache["df"]
    raise FileNotFoundError("No CSV loaded. Please upload your Scopus CSV first.")


def _get_sentences_with_papers(run_key: str) -> tuple[list[str], list[int]]:
    """Return the text units for a run plus the dataframe row each came from.

    For 'abstract', each abstract is cleaned, sentence-tokenised, and sentences
    of 30 characters or fewer are dropped. For any other key the whole cleaned
    cell is one unit, kept only if longer than 10 characters. A missing source
    column yields two empty lists rather than a KeyError.
    """
    df = _load_df()
    key = run_key.lower()
    col = _COLUMN_FOR_RUN.get(key, "Abstract")
    sentences: list[str] = []
    paper_rows: list[int] = []
    if col not in df.columns:
        return sentences, paper_rows

    for row_pos, text in enumerate(df[col]):
        if pd.isna(text):
            continue
        cleaned = _clean_text(str(text))
        if key == "abstract":
            for sent in sent_tokenize(cleaned):
                sent = sent.strip()
                if len(sent) > 30:
                    sentences.append(sent)
                    paper_rows.append(row_pos)
        elif len(cleaned) > 10:
            sentences.append(cleaned)
            paper_rows.append(row_pos)
    return sentences, paper_rows


def _get_sentences(run_key: str) -> list[str]:
    """Return only the text units for a run (see _get_sentences_with_papers)."""
    return _get_sentences_with_papers(run_key)[0]


def _fingerprint(sentences: list[str]) -> str:
    """Return a stable digest of the sentence list, used to detect stale caches."""
    digest = hashlib.sha256()
    for sent in sentences:
        digest.update(sent.encode("utf-8"))
        digest.update(b"\x00")  # separator so ["ab", "c"] != ["a", "bc"]
    return digest.hexdigest()


def _embed(sentences: list[str], run_key: str) -> np.ndarray:
    """Return embeddings for `sentences`, reusing caches only when still valid.

    The cache is keyed by run_key, so after a different CSV is loaded the old
    embeddings would no longer line up with the sentences. A fingerprint of the
    sentence list is stored next to the .npy file and checked before reuse.
    Checkpoints that predate the fingerprint file are accepted when their row
    count matches, and are stamped with the current fingerprint.
    """
    cache_key = f"{run_key}_emb"
    fp_key = f"{run_key}_emb_fp"
    emb_path = os.path.join(CHECKPOINT_DIR, f"{run_key}_emb.npy")
    fp_path = os.path.join(CHECKPOINT_DIR, f"{run_key}_emb.sha256")
    fingerprint = _fingerprint(sentences)

    def _remember(emb: np.ndarray) -> np.ndarray:
        with open(fp_path, "w", encoding="utf-8") as f:
            f.write(fingerprint)
        _embeddings_cache[cache_key] = emb
        _embeddings_cache[fp_key] = fingerprint
        return emb

    if cache_key in _embeddings_cache and _embeddings_cache.get(fp_key) == fingerprint:
        return _embeddings_cache[cache_key]

    if os.path.exists(emb_path):
        emb = np.load(emb_path)
        stored_fp = None
        if os.path.exists(fp_path):
            with open(fp_path, "r", encoding="utf-8") as f:
                stored_fp = f.read().strip()
        if len(emb) == len(sentences) and stored_fp in (None, fingerprint):
            return _remember(emb)

    model = SentenceTransformer(EMBED_MODEL)
    emb = np.asarray(
        model.encode(sentences, normalize_embeddings=True, show_progress_bar=False)
    )
    np.save(emb_path, emb)
    return _remember(emb)


def _save_json(data, filename: str):
    """Write `data` as UTF-8 JSON to CHECKPOINT_DIR/filename."""
    path = os.path.join(CHECKPOINT_DIR, filename)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


def _load_json(filename: str):
    """Return the parsed JSON checkpoint, or None if the file does not exist."""
    path = os.path.join(CHECKPOINT_DIR, filename)
    if not os.path.exists(path):
        return None
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _cluster(emb: np.ndarray, threshold: float) -> np.ndarray:
    """Cluster embeddings (cosine metric, average linkage, distance threshold).

    AgglomerativeClustering needs at least two samples, so zero or one
    embedding is placed in a single cluster 0 instead of raising.
    """
    if len(emb) < 2:
        return np.zeros(len(emb), dtype=int)
    clustering = AgglomerativeClustering(
        metric="cosine",
        linkage="average",
        distance_threshold=threshold,
        n_clusters=None,
    )
    return clustering.fit_predict(emb)


def _load_cluster_labels(run_key: str, expected_len: int) -> Optional[np.ndarray]:
    """Return saved per-sentence cluster labels if they match the sentence count."""
    path = os.path.join(CHECKPOINT_DIR, f"{run_key}_cluster_labels.npy")
    if not os.path.exists(path):
        return None
    labels = np.load(path)
    return labels if len(labels) == expected_len else None


def _top_sentences(emb: np.ndarray, sentences: list[str], indices: list[int],
                   k: int = NEAREST_K) -> list[str]:
    """Return up to k member sentences most cosine-similar to the group centroid."""
    member_embs = emb[indices]
    centroid = member_embs.mean(axis=0, keepdims=True)
    sims = cosine_similarity(centroid, member_embs)[0]
    order = np.argsort(sims)[::-1][:k]
    return [sentences[indices[i]] for i in order]


def _index_llm_items(result, key: str, cast=None) -> dict:
    """Index a parsed LLM JSON array by `key`, ignoring malformed elements.

    The model may wrap the array in an object, omit the key, or return ids as
    strings; `cast` (e.g. int) normalises the key and uncastable items are
    skipped instead of raising.
    """
    if isinstance(result, dict):
        wrapped = [v for v in result.values() if isinstance(v, list)]
        result = wrapped[0] if wrapped else [result]
    if not isinstance(result, list):
        return {}
    indexed = {}
    for item in result:
        if not isinstance(item, dict) or key not in item:
            continue
        value = item[key]
        if cast is not None:
            try:
                value = cast(value)
            except (TypeError, ValueError):
                continue
        indexed[value] = item
    return indexed


def _coerce_topic_ids(raw_ids) -> list[int]:
    """Normalise a theme's topic ids to unique ints, preserving order.

    Accepts a single id or a list; ids arriving as strings ("3") are converted
    and values that cannot be read as integers are dropped.
    """
    if not isinstance(raw_ids, (list, tuple)):
        raw_ids = [raw_ids]
    ids = []
    for raw in raw_ids:
        try:
            ids.append(int(raw))
        except (TypeError, ValueError):
            continue
    return list(dict.fromkeys(ids))


def _paper_count():
    """Return the dataset paper count, or "N/A" if it cannot be determined.

    summaries.json is overwritten by run_bertopic_discovery, so the dataset
    stats are read from dataset_summary.json first, then from the legacy file,
    then from the dataframe itself.
    """
    for name in ("dataset_summary.json", "summaries.json"):
        data = _load_json(name) or {}
        if "n_papers" in data:
            return data["n_papers"]
    try:
        return len(_load_df())
    except FileNotFoundError:
        return "N/A"


def _build_charts(run_key: str, emb: np.ndarray, topics: dict,
                  topic_summaries: list, n_clusters: int) -> dict:
    """Build the four Plotly charts and return them as embeddable HTML strings."""
    # Centroids of the 50 largest topics (topic_summaries is sorted by size).
    centroids = []
    sizes = []
    ids = []
    for t in topic_summaries[:50]:
        centroids.append(emb[topics[t["topic_id"]]].mean(axis=0))
        sizes.append(t["count"])
        ids.append(t["topic_id"])
    centroid_arr = np.array(centroids)

    # PCA to 2 components needs at least two samples; a lone topic sits at the origin.
    if len(centroids) >= 2:
        coords = PCA(n_components=2).fit_transform(centroid_arr)
    else:
        coords = np.zeros((len(centroids), 2))

    # Chart 1: Intertopic Map
    fig1 = go.Figure()
    fig1.add_trace(go.Scatter(
        x=coords[:, 0],
        y=coords[:, 1],
        mode="markers+text",
        marker=dict(
            size=[max(10, s / 2) for s in sizes],
            color=sizes,
            colorscale="Viridis",
            showscale=True,
            colorbar=dict(title="Sentences"),
        ),
        text=[f"T{i}" for i in ids],
        textposition="top center",
        hovertext=[f"Topic {i}<br>{s} sentences" for i, s in zip(ids, sizes)],
    ))
    fig1.update_layout(
        title=f"Intertopic Distance Map — {run_key.title()} ({n_clusters} topics)",
        template="plotly_dark",
        height=500,
        xaxis_title="PC1",
        yaxis_title="PC2",
    )

    # Chart 2: Topic Size Bar Chart
    top_n = topic_summaries[:30]
    fig2 = px.bar(
        x=[t["count"] for t in top_n],
        y=[f"Topic {t['topic_id']}" for t in top_n],
        orientation="h",
        color=[t["count"] for t in top_n],
        color_continuous_scale="Plasma",
        title=f"Top 30 Topics by Size — {run_key.title()}",
        labels={"x": "Sentence Count", "y": "Topic"},
    )
    fig2.update_layout(template="plotly_dark", height=600)

    # Chart 3: Topic similarity heatmap over at most the 20 largest topics.
    # Sized to the topics that actually exist, so small corpora do not overflow.
    n_heat = min(20, len(centroids))
    sim_matrix = cosine_similarity(centroid_arr[:n_heat])
    tick_labels = [f"T{i}" for i in ids[:n_heat]]
    fig3 = go.Figure(go.Heatmap(
        z=sim_matrix,
        x=tick_labels,
        y=tick_labels,
        colorscale="RdBu",
        zmin=0,
        zmax=1,
    ))
    fig3.update_layout(
        title=f"Topic Similarity Heatmap (Top 20) — {run_key.title()}",
        template="plotly_dark",
        height=500,
    )

    # Chart 4: Sentence distribution
    fig4 = px.histogram(
        x=[t["count"] for t in topic_summaries],
        nbins=30,
        title=f"Topic Size Distribution — {run_key.title()}",
        labels={"x": "Sentences per Topic", "y": "Number of Topics"},
        color_discrete_sequence=["#7C3AED"],
    )
    fig4.update_layout(template="plotly_dark", height=400)

    return {
        "intertopic": fig1.to_html(include_plotlyjs="cdn", full_html=False),
        "bars": fig2.to_html(include_plotlyjs="cdn", full_html=False),
        "heatmap": fig3.to_html(include_plotlyjs="cdn", full_html=False),
        "distribution": fig4.to_html(include_plotlyjs="cdn", full_html=False),
    }


def _themes_with_taxonomy(themes_data: dict, taxonomy_data) -> dict:
    """Index themes by name, filling PAJAIS fields from the taxonomy checkpoint.

    *_themes.json always stores pajais_match/is_novel as None; the values live
    in *_taxonomy_map.json. Evidence sentences are taken from the themes file
    because the taxonomy file prepends a "→ match | reasoning" line.
    """
    taxonomy = {
        t.get("theme_name"): t for t in (taxonomy_data or {}).get("themes", [])
    }
    merged = {}
    for theme in themes_data.get("themes", []):
        name = theme["theme_name"]
        mapped = taxonomy.get(name, {})
        merged[name] = {
            **theme,
            "pajais_match": mapped.get("pajais_match", theme.get("pajais_match")),
            "is_novel": mapped.get("is_novel", theme.get("is_novel")),
        }
    return merged


def _first_evidence(theme: dict) -> str:
    """Return the first top sentence truncated to 200 chars, or '' if none."""
    sents = theme.get("top_sentences") or []
    return str(sents[0])[:200] if sents else ""


# ── Tool 1: Load CSV ───────────────────────────────────────────────────────────
@tool
def load_scopus_csv(filepath: str) -> str:
    """
    Load a Scopus CSV export and prepare it for topic modelling.
    Returns paper count, abstract sentence count, and title sentence count.
    Saves the cleaned dataframe as a checkpoint for subsequent tools.

    Args:
        filepath: Path to the uploaded Scopus CSV file.
    """
    df = pd.read_csv(filepath)

    # Normalize column names
    df.columns = [str(c).strip() for c in df.columns]
    expected = ["Authors", "Title", "Abstract", "Author Keywords",
                "Cited by", "Source title", "Year"]
    for col in expected:
        if col not in df.columns:
            # Try case-insensitive match
            matches = [c for c in df.columns if c.lower() == col.lower()]
            if matches:
                df.rename(columns={matches[0]: col}, inplace=True)

    # Save checkpoint
    save_path = os.path.join(CHECKPOINT_DIR, "scopus_data.csv")
    df.to_csv(save_path, index=False)
    _df_cache["df"] = df
    # Embeddings held in memory belong to the previously loaded dataset.
    _embeddings_cache.clear()

    n_papers = len(df)

    # Count with the same filter the clustering step uses, so the numbers
    # reported here match what run_bertopic_discovery will process. A missing
    # Abstract/Title column counts as zero instead of raising.
    abstract_sents = len(_get_sentences("abstract"))
    title_sents = len(_get_sentences("title"))

    # Year may contain blanks or non-numeric cells in real exports.
    year_range = "N/A"
    if "Year" in df.columns:
        years = pd.to_numeric(df["Year"], errors="coerce").dropna()
        if not years.empty:
            year_range = f"{int(years.min())} – {int(years.max())}"

    # Save summaries checkpoint (Phase 1 marker)
    summary_data = {
        "n_papers": n_papers,
        "abstract_sentences": abstract_sents,
        "title_sentences": title_sents,
        "columns": list(df.columns),
        "year_range": year_range,
    }
    _save_json(summary_data, "summaries.json")
    # summaries.json is later overwritten with topic summaries; keep a copy of
    # the dataset stats that survives for export_narrative.
    _save_json(summary_data, "dataset_summary.json")

    return (
        f"✅ CSV loaded successfully!\n\n"
        f"📄 Papers: {n_papers:,}\n"
        f"📝 Abstract sentences (after cleaning): {abstract_sents:,}\n"
        f"🏷️ Title sentences: {title_sents:,}\n"
        f"📅 Year range: {summary_data['year_range']}\n"
        f"📊 Columns detected: {', '.join(df.columns.tolist())}\n\n"
        f"Phase 1 (Familiarisation) complete. Type 'run abstract' to begin Phase 2."
    )


# ── Tool 2: Run BERTopic ───────────────────────────────────────────────────────
@tool
def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
    """
    Run BERTopic clustering on either abstracts or titles.
    Uses SentenceTransformer embeddings in 384-dimensional space with
    AgglomerativeClustering (cosine metric, average linkage). No UMAP reduction.
    Generates 4 interactive Plotly charts. Saves summaries.json and emb.npy.

    Args:
        run_key: Either 'abstract' or 'title'
        threshold: AgglomerativeClustering distance threshold (default 0.7)
    """
    sentences = _get_sentences(run_key)
    if not sentences:
        return f"No sentences found for run_key='{run_key}'. Check your CSV."

    # Embed
    emb = _embed(sentences, run_key)

    # Cluster in the embedding space (no UMAP). The labels are persisted so
    # consolidate_into_themes reuses exactly these topic ids.
    labels = _cluster(emb, threshold)
    np.save(os.path.join(CHECKPOINT_DIR, f"{run_key}_cluster_labels.npy"), labels)

    # Group sentence indices by topic id
    topics: dict = {}
    for idx, label in enumerate(labels):
        if label == -1:
            continue
        topics.setdefault(int(label), []).append(idx)
    n_clusters = len(topics)

    # Find nearest-K sentences to centroid
    topic_summaries = []
    for topic_id, sent_indices in sorted(topics.items()):
        topic_summaries.append({
            "topic_id": int(topic_id),
            "count": len(sent_indices),
            "top_sentences": _top_sentences(emb, sentences, sent_indices),
            "label": None,
            "category": None,
            "confidence": None,
            "reasoning": None,
            "niche": None,
            "approve": "",
            "rename_to": "",
            "user_reasoning": "",
        })

    # Sort by size descending
    topic_summaries.sort(key=lambda x: x["count"], reverse=True)

    # Save checkpoint (threshold recorded so later phases can reproduce the run)
    payload = {"run_key": run_key, "threshold": threshold, "topics": topic_summaries}
    _save_json(payload, f"{run_key}_summaries.json")
    _save_json(payload, "summaries.json")

    # ── Generate Plotly charts ─────────────────────────────────────────────────
    charts = _build_charts(run_key, emb, topics, topic_summaries, n_clusters)
    _save_json(charts, f"{run_key}_charts.json")

    return (
        f"✅ BERTopic clustering complete for {run_key}!\n\n"
        f"🔢 Topics discovered: {n_clusters}\n"
        f"📊 Sentences processed: {len(sentences):,}\n"
        f"📐 Embedding dimensions: {emb.shape[1]} (no UMAP reduction)\n"
        f"📏 Distance threshold: {threshold}\n\n"
        f"4 interactive charts saved. Calling label_topics_with_llm next..."
    )


# ── Tool 3: Label Topics ───────────────────────────────────────────────────────
@tool
def label_topics_with_llm(run_key: str) -> str:
    """
    Send top topics to Mistral LLM for labeling with research area names,
    categories, confidence scores, reasoning, and niche flag.
    Saves labels.json checkpoint.

    Args:
        run_key: Either 'abstract' or 'title'
    """
    data = _load_json(f"{run_key}_summaries.json") or _load_json("summaries.json")
    # Before discovery has run, summaries.json holds dataset stats, not topics.
    if not data or "topics" not in data:
        return "No topic summaries found. Run run_bertopic_discovery first."

    topics = data["topics"][:MAX_LABEL_TOPICS]
    llm = _get_llm()

    # Build prompt payload
    topic_texts = []
    for t in topics:
        sents = " | ".join(t["top_sentences"][:3])
        topic_texts.append(
            f"Topic {t['topic_id']} ({t['count']} sentences): {sents[:300]}"
        )

    prompt_template = PromptTemplate.from_template(_LABEL_PROMPT)
    parser = JsonOutputParser()
    chain = prompt_template | llm | parser
    result = chain.invoke({"topics": "\n".join(topic_texts)})

    # Merge labels back into topics; ids may come back as strings.
    label_map = _index_llm_items(result, "topic_id", cast=int)
    labeled_topics = []
    for t in data["topics"]:
        lbl = label_map.get(t["topic_id"], {})
        labeled_topics.append({
            **t,
            "label": lbl.get("label", f"Topic {t['topic_id']}"),
            "category": lbl.get("category", "Uncategorized"),
            "confidence": lbl.get("confidence", 0.5),
            "reasoning": lbl.get("reasoning", ""),
            "niche": lbl.get("niche", False),
        })

    _save_json({"run_key": run_key, "topics": labeled_topics}, f"{run_key}_labels.json")
    _save_json({"run_key": run_key, "topics": labeled_topics}, "labels.json")

    # Count only topics the LLM actually labeled, not placeholder defaults.
    labeled_count = sum(1 for t in labeled_topics if t["topic_id"] in label_map)
    return (
        f"✅ Topics labeled by LLM!\n\n"
        f"🏷️ Topics labeled: {labeled_count}\n"
        f"📋 Review the table below — check labels, approve or rename topics.\n\n"
        f"**Phase 2 complete. Review the table, edit Approve/Rename columns, then click Submit Review.**"
    )


# ── Tool 4: Consolidate Themes ─────────────────────────────────────────────────
@tool
def consolidate_into_themes(run_key: str, theme_map: str) -> str:
    """
    Consolidate approved topics into researcher-defined themes.
    Recomputes centroids for merged theme groups.
    Saves themes.json checkpoint.

    Args:
        run_key: Either 'abstract' or 'title'
        theme_map: JSON string mapping theme names to lists of topic IDs.
            Example: '{"AI in Healthcare": [0, 3, 7], "Blockchain": [1, 5]}'
    """
    data = _load_json(f"{run_key}_labels.json") or _load_json("labels.json")
    if not data:
        return "No labeled topics found. Run label_topics_with_llm first."

    try:
        groupings = json.loads(theme_map) if isinstance(theme_map, str) else theme_map
    except json.JSONDecodeError as e:
        return f"Invalid theme_map JSON: {e}"
    if not isinstance(groupings, dict):
        return (
            "Invalid theme_map JSON: expected an object mapping theme names "
            "to lists of topic IDs."
        )

    sentences, paper_rows = _get_sentences_with_papers(run_key)
    emb = _embed(sentences, run_key)

    # Recover the topic → sentence-index mapping from the discovery run.
    summaries = _load_json(f"{run_key}_summaries.json") or _load_json("summaries.json")
    all_topic_indices: dict = {}
    if summaries:
        labels_arr = _load_cluster_labels(run_key, len(sentences))
        if labels_arr is None:
            # Older checkpoint without saved labels: re-cluster with the
            # threshold recorded at discovery time so topic ids line up.
            labels_arr = _cluster(emb, summaries.get("threshold", DEFAULT_THRESHOLD))
        for idx, lbl in enumerate(labels_arr):
            all_topic_indices.setdefault(int(lbl), []).append(idx)

    themes = []
    used_ids = set()
    skipped = []
    for theme_name, raw_ids in groupings.items():
        topic_ids = _coerce_topic_ids(raw_ids)
        merged_sentence_indices = []
        for tid in topic_ids:
            merged_sentence_indices.extend(all_topic_indices.get(tid, []))
            used_ids.add(tid)
        if not merged_sentence_indices:
            skipped.append(theme_name)
            continue

        themes.append({
            "theme_name": theme_name,
            "topic_ids": topic_ids,
            "sentence_count": len(merged_sentence_indices),
            # Distinct source rows contributing at least one sentence.
            "paper_count": len({paper_rows[i] for i in merged_sentence_indices}),
            "top_sentences": _top_sentences(emb, sentences, merged_sentence_indices),
            "approve": "",
            "rename_to": "",
            "user_reasoning": "",
            "pajais_match": None,
            "is_novel": None,
        })

    # Handle uncategorized topics
    uncategorized = [tid for tid in all_topic_indices if tid not in used_ids]
    if uncategorized:
        merged = []
        for tid in uncategorized:
            merged.extend(all_topic_indices[tid])
        if merged:
            themes.append({
                "theme_name": "Uncategorized",
                "topic_ids": uncategorized,
                "sentence_count": len(merged),
                "paper_count": len({paper_rows[i] for i in merged}),
                "top_sentences": [sentences[i] for i in merged[:3]],
                "approve": "",
                "rename_to": "",
                "user_reasoning": "",
                "pajais_match": None,
                "is_novel": None,
            })

    _save_json({"run_key": run_key, "themes": themes}, f"{run_key}_themes.json")
    _save_json({"run_key": run_key, "themes": themes}, "themes.json")

    skipped_note = (
        f"⚠️ Skipped (no matching topic IDs): {', '.join(map(str, skipped))}\n\n"
        if skipped else ""
    )
    return (
        f"✅ Themes consolidated!\n\n"
        f"🗂️ Themes created: {len(themes)}\n"
        f"📊 Total sentences covered: {sum(t['sentence_count'] for t in themes):,}\n\n"
        f"{skipped_note}"
        f"**Phase 3 complete. Review consolidated themes in the table. Click Submit Review.**"
    )


# ── Tool 5: Compare with PAJAIS Taxonomy ──────────────────────────────────────
@tool
def compare_with_taxonomy(run_key: str) -> str:
    """
    Map final themes to PAJAIS 25-category taxonomy using Mistral LLM.
    Identifies NOVEL themes not covered by existing taxonomy.
    Saves taxonomy_map.json checkpoint.

    Args:
        run_key: Either 'abstract' or 'title'
    """
    data = _load_json(f"{run_key}_themes.json") or _load_json("themes.json")
    if not data:
        return "No themes found. Run consolidate_into_themes first."

    themes = data["themes"]
    llm = _get_llm()

    theme_descriptions = []
    for t in themes:
        sents = " | ".join(t["top_sentences"][:2])
        theme_descriptions.append(
            f"Theme: {t['theme_name']}\nEvidence: {sents[:250]}"
        )

    prompt_template = PromptTemplate.from_template(_TAXONOMY_PROMPT)
    parser = JsonOutputParser()
    chain = prompt_template | llm | parser
    result = chain.invoke({
        "categories": "\n".join(f"{i+1}. {c}" for i, c in enumerate(PAJAIS_CATEGORIES)),
        "themes": "\n\n".join(theme_descriptions),
    })

    # Merge results
    result_map = _index_llm_items(result, "theme_name")
    taxonomy_themes = []
    for t in themes:
        mapping = result_map.get(t["theme_name"], {})
        pajais_match = mapping.get("pajais_match") or "NOVEL"
        reasoning = mapping.get("reasoning", "")
        # If the model omitted the flag, derive it from the match rather than
        # defaulting every such theme to novel.
        is_novel = mapping.get("is_novel")
        if not isinstance(is_novel, bool):
            is_novel = pajais_match == "NOVEL"
        taxonomy_themes.append({
            **t,
            "pajais_match": pajais_match,
            "match_confidence": mapping.get("match_confidence", 0.0),
            "reasoning": reasoning,
            "is_novel": is_novel,
            "evidence_summary": mapping.get("evidence_summary", ""),
            # First entry carries the mapping so the review table shows it.
            "top_sentences": [f"→ {pajais_match} | {reasoning}"]
            + t.get("top_sentences", [])[:2],
        })

    novel_count = len([t for t in taxonomy_themes if t.get("is_novel")])
    mapped_count = len(taxonomy_themes) - novel_count

    _save_json({"run_key": run_key, "themes": taxonomy_themes},
               f"{run_key}_taxonomy_map.json")
    _save_json({"run_key": run_key, "themes": taxonomy_themes}, "taxonomy_map.json")

    return (
        f"✅ PAJAIS taxonomy mapping complete!\n\n"
        f"✅ MAPPED themes: {mapped_count}\n"
        f"🆕 NOVEL themes: {novel_count}\n\n"
        f"**Phase 5.5 complete. Review PAJAIS mapping in the table (Top Evidence column shows → PAJAIS match). "
        f"Click Submit Review.**"
    )


# ── Tool 6: Generate Comparison CSV ───────────────────────────────────────────
@tool
def generate_comparison_csv() -> str:
    """
    Compare abstract themes vs title themes side-by-side.
    Creates a convergence/divergence analysis CSV.
    Saves comparison.csv checkpoint.
    """
    abstract_data = _load_json("abstract_themes.json") or _load_json("themes.json")
    title_data = _load_json("title_themes.json")

    if not abstract_data:
        return "Abstract themes not found. Complete abstract analysis first."
    if not title_data:
        return "Title themes not found. Complete title analysis first (run title analysis)."

    # The themes files carry no PAJAIS values; pull them from the taxonomy maps.
    abstract_themes = _themes_with_taxonomy(
        abstract_data, _load_json("abstract_taxonomy_map.json")
    )
    title_themes = _themes_with_taxonomy(
        title_data, _load_json("title_taxonomy_map.json")
    )

    all_themes = sorted(set(abstract_themes) | set(title_themes))
    rows = []
    for theme in all_themes:
        a = abstract_themes.get(theme, {})
        t = title_themes.get(theme, {})
        if theme in abstract_themes and theme in title_themes:
            convergence = "CONVERGE"
        elif theme in abstract_themes:
            convergence = "ABSTRACT ONLY"
        else:
            convergence = "TITLE ONLY"
        rows.append({
            "Theme": theme,
            "Abstract_Sentences": a.get("sentence_count", 0),
            "Title_Sentences": t.get("sentence_count", 0),
            "Abstract_PAJAIS": a.get("pajais_match") or "N/A",
            "Title_PAJAIS": t.get("pajais_match") or "N/A",
            "Abstract_Novel": bool(a.get("is_novel")),
            "Title_Novel": bool(t.get("is_novel")),
            "Convergence": convergence,
            "Top_Abstract_Evidence": _first_evidence(a),
            "Top_Title_Evidence": _first_evidence(t),
        })

    df = pd.DataFrame(rows)
    save_path = os.path.join(CHECKPOINT_DIR, "comparison.csv")
    df.to_csv(save_path, index=False)

    converge = len([r for r in rows if r["Convergence"] == "CONVERGE"])
    abstract_only = len([r for r in rows if r["Convergence"] == "ABSTRACT ONLY"])
    title_only = len([r for r in rows if r["Convergence"] == "TITLE ONLY"])

    return (
        f"✅ Comparison CSV generated!\n\n"
        f"🔄 Converging themes: {converge}\n"
        f"📝 Abstract-only themes: {abstract_only}\n"
        f"🏷️ Title-only themes: {title_only}\n\n"
        f"**Check the Download tab for comparison.csv. Click Submit Review to confirm.**"
    )


# ── Tool 7: Export Narrative ───────────────────────────────────────────────────
@tool
def export_narrative(run_key: str) -> str:
    """
    Generate a 500-word Section 7 literature review narrative using Mistral LLM.
    References B&C methodology, key themes, PAJAIS mapping, and limitations.
    Saves narrative.txt checkpoint.

    Args:
        run_key: Either 'abstract' or 'title'
    """
    taxonomy_data = (
        _load_json(f"{run_key}_taxonomy_map.json") or _load_json("taxonomy_map.json")
    )
    if not taxonomy_data:
        return "No taxonomy mapping found. Run compare_with_taxonomy first."

    themes = taxonomy_data.get("themes", [])
    llm = _get_llm()

    theme_summary = []
    for t in themes:
        novel_flag = " [NOVEL]" if t.get("is_novel") else f" [→ {t.get('pajais_match', '')}]"
        theme_summary.append(
            f"• {t['theme_name']}{novel_flag}: "
            f"{t.get('evidence_summary', t.get('reasoning', ''))}"
        )

    n_papers = _paper_count()

    prompt_template = PromptTemplate.from_template(_NARRATIVE_PROMPT)
    chain = prompt_template | llm
    response = chain.invoke({
        "n_papers": n_papers,
        "run_key": run_key,
        "themes": "\n".join(theme_summary),
    })
    narrative_text = response.content if hasattr(response, "content") else str(response)

    save_path = os.path.join(CHECKPOINT_DIR, "narrative.txt")
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(narrative_text)

    word_count = len(narrative_text.split())
    return (
        f"✅ Section 7 narrative exported!\n\n"
        f"📝 Word count: {word_count}\n"
        f"💾 Saved to: narrative.txt\n\n"
        f"**Phase 6 complete! All B&C phases finished. Check Download tab for all outputs.**\n\n"
        f"---\n\n{narrative_text[:500]}...\n\n*(Full narrative in narrative.txt)*"
    )


# ── All tools list ─────────────────────────────────────────────────────────────
ALL_TOOLS = [
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    consolidate_into_themes,
    compare_with_taxonomy,
    generate_comparison_csv,
    export_narrative,
]