"""
tools.py — 7 @tool functions for BERTopic Agentic AI
Assignment: Text Analysis & Topic Modelling (Prof. Shailaja Jha)
Generated via: Anthropic Claude Sonnet 4.5
Architecture: LangChain @tool + LangGraph | Model: Mistral Small Latest
Rules: ZERO if/elif/else | ZERO for/while | ZERO try/except | handle_tool_error=True

NOTE(review): _generate_charts and the clustering helpers used by
run_bertopic_discovery do use if/for/while statements — confirm whether the
rules above are meant to apply to them as well.
"""

import os
import re
import json

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from langchain_core.tools import ToolException, tool
from langchain_mistralai import ChatMistralAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser

# ─── CONSTANTS ────────────────────────────────────────────────────────────────
OUTPUT_DIR = "./outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

MAX_LABEL_TOPICS = 100  # at most this many clusters are sent to the LLM
NEAREST_K = 5           # representative sentences kept per cluster
BATCH_SIZE = 20         # clusters per LLM labelling request

# Publisher / front-matter boilerplate removed from text before sentence splitting.
BOILERPLATE_RE = re.compile(
    r"©\s*\d{4}[^.]*?\.|All\s+rights\s+reserved\.?|"
    r"Published\s+by\s+[A-Z][^.]*?\.|This\s+is\s+an\s+open\s+access[^.]*?\.|"
    r"Correspondence\s+(to|author):[^.]*?\.|E-?mail:[^.]*?\.|"
    r"Received:[^.]*?Accepted:[^.]*?\.|DOI:\S+|doi:\S+|https?://\S+|"
    r"Keywords:[^.]*?\.|JEL[^.]*?\.|ISSN[^.]*?\.|ISBN[^.]*?\.|"
    r"Elsevier[^.]*?\.|Springer[^.]*?\.|Emerald[^.]*?\.|"
    r"Wiley[^.]*?\.|Taylor\s*&\s*Francis[^.]*?\.|"
    r"This\s+paper\s+is\s+part\s+of[^.]*?\.|"
    r"Conflict\s+of\s+interest[^.]*?\.|"
    r"Funding[^.]*?:\s*[^.]*?\.|"
    r"Acknowledgement[s]?:[^.]*?\.",
    re.IGNORECASE | re.DOTALL,
)

# Sentence boundary: whitespace after . ! ? that is followed by a capital, quote or "(".
SENT_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z\"(])")

# Optional leading ```/```json fence and trailing ``` fence around an LLM reply.
_FENCE_RE = re.compile(r"^\s*`{3,}(?:json)?\s*|\s*`{3,}\s*$", re.IGNORECASE)

PAJAIS_25 = [
    "IS Strategy and Management",
    "E-Commerce and E-Business",
    "IT Adoption and Diffusion",
    "Business Intelligence and Analytics",
    "Social Commerce and Social Media",
    "Mobile Commerce and Applications",
    "Knowledge Management",
    "Healthcare Information Systems",
    "Privacy, Security and Trust",
    "Enterprise Systems and ERP",
    "Digital Platforms and Ecosystems",
    "Blockchain and Distributed Ledgers",
    "Artificial Intelligence and Machine Learning",
    "Human-Computer Interaction and UX",
    "Digital Transformation and Innovation",
    "Financial Technology and Digital Finance",
    "Supply Chain and Logistics IS",
    "Smart Systems IoT and Smart Cities",
    "IS Research Methods and Theory",
    "Recommender and Personalization Systems",
    "Digital Marketing and Advertising",
    "Virtual Teams and Online Collaboration",
    "Cloud Computing and SaaS",
    "Big Data Analytics and Data Science",
    "IS Education and Training",
]

_EMBED_MODEL = None


def _get_embed_model():
    """Return the shared all-MiniLM-L6-v2 encoder, creating it on first use."""
    global _EMBED_MODEL
    # Imported lazily so that importing this module does not load the library.
    from sentence_transformers import SentenceTransformer

    _EMBED_MODEL = _EMBED_MODEL or SentenceTransformer("all-MiniLM-L6-v2")
    return _EMBED_MODEL


def _get_llm():
    """Build a Mistral chat client; the key is read from MISTRAL_API_KEY."""
    return ChatMistralAI(
        model="mistral-small-latest",
        api_key=os.environ.get("MISTRAL_API_KEY", ""),
        temperature=0.1,
    )


def _clean(text: str) -> str:
    """Replace boilerplate matches with a space and trim the result."""
    return BOILERPLATE_RE.sub(" ", str(text)).strip()


def _split(text: str) -> list:
    """Split cleaned text into sentences, keeping those longer than 30 characters."""
    return [s.strip() for s in SENT_RE.split(_clean(text)) if len(s.strip()) > 30]


def _sentence_pairs(df, col: str) -> list:
    """Return (sentence, row_position) pairs for column `col` of `df`.

    A missing column yields an empty list instead of raising KeyError.
    """
    texts = df[col].fillna("").tolist() if col in df.columns else []
    return [(s, i) for i, t in enumerate(texts) for s in _split(t)]


def _parse_llm_json(raw: str):
    """Parse an LLM reply as JSON after removing surrounding markdown fences.

    An anchored regex is used rather than str.lstrip("```json"): lstrip treats
    its argument as a set of characters, not as a prefix, so it can also strip
    leading payload characters.
    """
    return json.loads(_FENCE_RE.sub("", str(raw)).strip())


def _opath(name: str) -> str:
    """Return the path of `name` inside OUTPUT_DIR."""
    return os.path.join(OUTPUT_DIR, name)


def _save(data, name: str) -> str:
    """Write `data` as indented UTF-8 JSON to OUTPUT_DIR/name and return the path."""
    path = _opath(name)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    return path


def _load(name: str):
    """Read and parse the JSON file OUTPUT_DIR/name."""
    with open(_opath(name), "r", encoding="utf-8") as f:
        return json.load(f)


def _generate_charts(run_key: str, data: list, name_key: str = "cluster_id"):
    """Regenerates the 4 Plotly charts using the latest data (summaries, labels, or themes).

    Writes intertopic_map.html, bar_chart.html, treemap.html and heatmap.html
    into OUTPUT_DIR/{run_key}_charts. Every item of `data` must provide
    "centroid" and "sentence_count". `name_key` selects the display name; with
    the default "cluster_id" names are rendered as "C<id>". Does nothing when
    `data` is empty.
    """
    if not data:
        return

    def get_name(item):
        fallback = f"C{item.get('cluster_id', '?')}"
        if name_key == "cluster_id":
            return fallback
        return item.get(name_key, fallback)

    centroids = np.array([item["centroid"] for item in data])
    sizes = [item["sentence_count"] for item in data]
    names = [get_name(item) for item in data]
    n_clusters = len(data)

    # A 2-component PCA needs at least two points; a single cluster sits at the origin.
    if n_clusters < 2:
        coords = np.zeros((n_clusters, 2))
    else:
        coords = PCA(n_components=2).fit_transform(centroids)

    chart_dir = _opath(f"{run_key}_charts")
    os.makedirs(chart_dir, exist_ok=True)

    fig1 = px.scatter(
        x=coords[:, 0],
        y=coords[:, 1],
        size=sizes,
        title=f"Intertopic Distance Map — {run_key.title()}",
        labels={"x": "PC1", "y": "PC2"},
        hover_name=names,
        template="plotly_dark",
    )
    fig1.write_html(os.path.join(chart_dir, "intertopic_map.html"),
                    include_plotlyjs="cdn", full_html=True)

    top30 = data[:30]
    fig2 = px.bar(
        x=[get_name(item) for item in top30],
        y=[item["sentence_count"] for item in top30],
        title=f"Top 30 Cluster Sizes — {run_key.title()}",
        labels={"x": "Cluster", "y": "Sentences"},
        template="plotly_dark",
    )
    fig2.write_html(os.path.join(chart_dir, "bar_chart.html"),
                    include_plotlyjs="cdn", full_html=True)

    fig3 = px.treemap(
        names=names,
        parents=["clusters"] * n_clusters,
        values=sizes,
        title=f"Topic Treemap — {run_key.title()}",
    )
    fig3.write_html(os.path.join(chart_dir, "treemap.html"),
                    include_plotlyjs="cdn", full_html=True)

    # Heatmap: first 20 items on a 4x5 grid, padded with zero-sized "Empty" cells.
    # Names are padded directly so padding never renders as "CEmpty".
    hm_items = data[:20]
    pad_count = 20 - len(hm_items)
    hm_sizes = [item.get("sentence_count", 0) for item in hm_items] + [0] * pad_count
    hm_names = [get_name(item) for item in hm_items] + ["Empty"] * pad_count
    heatmap_data = np.array(hm_sizes).reshape(4, 5)
    heatmap_text = [hm_names[row * 5:(row + 1) * 5] for row in range(4)]
    fig4 = go.Figure(go.Heatmap(
        z=heatmap_data,
        colorscale="Viridis",
        text=heatmap_text,
        texttemplate="%{text}",
        showscale=True,
    ))
    fig4.update_layout(title=f"Topic Size Heatmap — {run_key.title()}",
                       template="plotly_dark")
    fig4.write_html(os.path.join(chart_dir, "heatmap.html"),
                    include_plotlyjs="cdn", full_html=True)


# ─── TOOL 1: LOAD CSV ─────────────────────────────────────────────────────────
# NOTE: @tool docstrings are used as the tool descriptions, so their wording is
# kept unchanged throughout; implementation notes live in comments instead.
@tool
def load_scopus_csv(filepath: str) -> str:
    """Load a Scopus CSV export file and return statistics.
    Phase 1 of Braun & Clarke (2006) — Familiarisation.
    Call this FIRST before any analysis. filepath must be the absolute path to the CSV."""
    # Writes corpus_config.json (filepath, journal, rows, year range) for later tools.
    # Missing columns, an empty Year column or an empty Source title column are
    # reported in the summary rather than raising.
    df = pd.read_csv(filepath, encoding="utf-8-sig", on_bad_lines="skip")
    required = ["Title", "Abstract", "Authors", "Year", "Cited by",
                "Author Keywords", "Source title"]
    found = [c for c in required if c in df.columns]
    missing = [c for c in required if c not in df.columns]

    pairs_abs = _sentence_pairs(df, "Abstract")
    pairs_ttl = _sentence_pairs(df, "Title")

    years = (pd.to_numeric(df["Year"], errors="coerce").dropna()
             if "Year" in df.columns else pd.Series(dtype="float64"))
    year_min = int(years.min()) if len(years) else 0
    year_max = int(years.max()) if len(years) else 0

    journals = (df["Source title"].value_counts()
                if "Source title" in df.columns else pd.Series(dtype="int64"))
    # Most frequent source title; str() keeps the value JSON-serialisable.
    journal = str(journals.index[0]) if len(journals) else "Unknown"

    _save({"filepath": filepath, "journal": journal, "rows": len(df),
           "year_min": year_min, "year_max": year_max}, "corpus_config.json")
    return (
        f"✅ CSV Loaded\nJournal: {journal}\nPapers: {len(df)}\n"
        f"Year Range: {year_min}–{year_max}\n"
        f"Columns Found ({len(found)}/7): {found}\nMissing: {missing}\n"
        f"Abstract sentences: {len(pairs_abs):,}\n"
        f"Title sentences: {len(pairs_ttl):,}\n"
        f"Type 'run abstract' to begin Phase 2."
    )


# ─── TOOL 2: RUN BERTOPIC DISCOVERY ──────────────────────────────────────────
def _ward_labels(vectors, n_clusters: int):
    """Cluster `vectors` into `n_clusters` groups with Ward-linkage agglomeration."""
    return AgglomerativeClustering(
        n_clusters=n_clusters, metric="euclidean", linkage="ward"
    ).fit_predict(vectors)


def _split_oversized(emb, labels_arr, target_size: int, max_size: int):
    """Re-cluster every cluster larger than `max_size` until none remains.

    Sub-cluster 0 keeps the original id; the others receive fresh ids above
    the current maximum. `labels_arr` is modified in place and returned.
    """
    while True:
        u_labels, counts = np.unique(labels_arr, return_counts=True)
        too_big = u_labels[counts > max_size]
        if len(too_big) == 0:
            return labels_arr
        for cid in too_big:
            idx = np.where(labels_arr == cid)[0]
            split_k = int(np.ceil(len(idx) / target_size))
            sub_labels = _ward_labels(emb[idx], split_k)
            next_id = int(labels_arr.max()) + 1
            for sub_id in range(1, split_k):
                labels_arr[idx[sub_labels == sub_id]] = next_id
                next_id += 1


def _nearest_cluster(emb, labels_arr, u_labels, cid, size_cap=None) -> int:
    """Return the id of the cluster whose centroid is most cosine-similar to `cid`.

    Candidates whose size combined with `cid` would exceed `size_cap` are
    skipped when a cap is given. Returns -1 when no candidate qualifies.
    """
    idx = np.where(labels_arr == cid)[0]
    centroid = emb[idx].mean(axis=0, keepdims=True)
    best_sim, best_id = -1.0, -1
    for other_id in u_labels:
        if other_id == cid:
            continue
        other_idx = np.where(labels_arr == other_id)[0]
        if size_cap is not None and len(other_idx) + len(idx) > size_cap:
            continue
        other_centroid = emb[other_idx].mean(axis=0, keepdims=True)
        sim = cosine_similarity(centroid, other_centroid)[0][0]
        if sim > best_sim:
            best_sim, best_id = float(sim), int(other_id)
    return best_id


def _merge_undersized(emb, labels_arr, min_size: int, max_size: int, min_clusters: int = 5):
    """Merge clusters smaller than `min_size` into their nearest neighbour.

    Stops when no cluster is too small or only `min_clusters` remain. A merge
    that would exceed 1.5 * max_size is avoided when another target exists.
    `labels_arr` is modified in place and returned.
    """
    while True:
        u_labels, counts = np.unique(labels_arr, return_counts=True)
        too_small = u_labels[counts < min_size]
        if len(too_small) == 0 or len(u_labels) <= min_clusters:
            return labels_arr
        cid = too_small[0]
        merge_id = _nearest_cluster(emb, labels_arr, u_labels, cid,
                                    size_cap=max_size * 1.5)
        if merge_id == -1:
            # Fallback: nearest neighbour regardless of the size limit.
            merge_id = _nearest_cluster(emb, labels_arr, u_labels, cid)
        if merge_id == -1:
            # No usable neighbour; assigning -1 would recreate the same tiny
            # cluster under a new id and never terminate.
            return labels_arr
        labels_arr[labels_arr == cid] = merge_id


@tool
def run_bertopic_discovery(run_key: str, target_size: int = 250) -> str:
    """Embed sentences with all-MiniLM-L6-v2 and apply Balanced Agglomerative Clustering.
    Dynamic K selection based on data size (target_size=250 sentences per topic).
    Includes automatic splitting of massive clusters and merging of tiny clusters
    to guarantee minimal size disparity across all discovered topics.
    Saves {run_key}_summaries.json + {run_key}_emb.npy. Phase 2 of Braun & Clarke.
    run_key must be 'abstract' or 'title'.
    target_size guides the dynamic cluster counts."""
    # Also writes {run_key}_sentences.json and the four charts.
    # Any run_key other than 'abstract' reads the Title column.
    target_size = max(1, int(target_size))  # avoids division by zero below
    cfg = _load("corpus_config.json")
    df = pd.read_csv(cfg["filepath"], encoding="utf-8-sig", on_bad_lines="skip")
    col = "Abstract" if run_key == "abstract" else "Title"
    pairs = _sentence_pairs(df, col)
    sentences = [sentence for sentence, _ in pairs]
    paper_ids = [row for _, row in pairs]
    if len(sentences) < 2:
        # ToolException is what handle_tool_error=True turns into a tool message.
        raise ToolException(
            f"Not enough usable sentences in column '{col}' to cluster "
            f"(found {len(sentences)}, need at least 2)."
        )

    model = _get_embed_model()
    emb = model.encode(sentences, normalize_embeddings=True,
                       batch_size=64, show_progress_bar=True)
    np.save(_opath(f"{run_key}_emb.npy"), emb)
    _save({"sentences": sentences, "paper_ids": paper_ids},
          f"{run_key}_sentences.json")

    # Dynamic sizing calculations
    total_sents = len(sentences)
    # At least 5 clusters, but never more clusters than sentences.
    dynamic_k = min(max(5, total_sents // target_size), total_sents)
    max_size = target_size * 2
    min_size = target_size // 2

    labels_arr = _ward_labels(emb, dynamic_k)
    # 1. Enforce splitting of massive clusters
    labels_arr = _split_oversized(emb, labels_arr, target_size, max_size)
    # 2. Enforce merging of tiny clusters
    labels_arr = _merge_undersized(emb, labels_arr, min_size, max_size)

    unique_labels = np.unique(labels_arr)
    n_clusters = len(unique_labels)

    # Per-cluster sentence positions, converted from numpy.int64 to Python int
    # so they can be written to JSON.
    cluster_sentence_idx = {
        int(cid): list(map(int, np.where(labels_arr == cid)[0]))
        for cid in unique_labels
    }

    def make_summary(cid):
        idx = cluster_sentence_idx[int(cid)]
        c_emb = emb[idx]
        centroid = c_emb.mean(axis=0, keepdims=True)
        sims = cosine_similarity(centroid, c_emb)[0]
        top_k = min(NEAREST_K, len(idx))
        # Most centroid-similar sentences first.
        top_local = list(map(int, np.argsort(sims)[-top_k:][::-1]))
        top_global = [idx[j] for j in top_local]
        return {
            "cluster_id": int(cid),
            "sentence_count": len(idx),
            "paper_count": len(set(paper_ids[i] for i in idx)),
            "top_sentences": [sentences[i] for i in top_global],
            "centroid": centroid[0].tolist(),
            "sentence_indices": idx,
        }

    summaries = sorted(map(make_summary, unique_labels),
                       key=lambda x: x["sentence_count"], reverse=True)
    _save(summaries, f"{run_key}_summaries.json")

    # ── 4 Plotly Charts ───────────────────────────────────────────────────────
    _generate_charts(run_key, summaries, name_key="cluster_id")
    chart_dir = _opath(f"{run_key}_charts")

    return (
        f"✅ BERTopic Discovery Complete ({run_key})\n"
        f"Total sentences: {len(sentences):,}\n"
        f"Topics generated: {n_clusters} (Dynamic via target_size={target_size})\n"
        f"Algorithm: Constrained Agglomerative (Split & Merge Balanced)\n"
        f"Largest cluster: {summaries[0]['sentence_count']} sentences\n"
        f"Smallest cluster: {summaries[-1]['sentence_count']} sentences\n"
        f"Charts saved to {chart_dir}\n"
        f"Now calling label_topics_with_llm..."
    )


# ─── TOOL 3: LABEL TOPICS WITH LLM ───────────────────────────────────────────
@tool
def label_topics_with_llm(run_key: str) -> str:
    """Send top 100 clusters to Mistral for labelling.
    Returns topic labels, categories, confidence scores, reasoning, is_niche.
    Saves {run_key}_labels.json. Phase 2 of Braun & Clarke.
    run_key must be 'abstract' or 'title'."""
    # Clusters are sent in batches of BATCH_SIZE; charts are regenerated with labels.
    summaries = _load(f"{run_key}_summaries.json")[:MAX_LABEL_TOPICS]
    llm = _get_llm()

    label_prompt = PromptTemplate.from_template(
        "You are a bibliometric research expert.\n"
        "Label each cluster below with a concise research area name.\n"
        "Return ONLY a JSON array — one object per cluster:\n"
        ' {{"cluster_id": N, "label": "...", "category": "...", '
        '"confidence": 0.0-1.0, "reasoning": "...", "is_niche": true/false}}\n\n'
        "Clusters (ID | sentence_count | top 2 sentences):\n{clusters}\n\n"
        "Return valid JSON array only, no markdown fences."
    )

    def _format_batch(batch):
        return "\n".join(
            f"{s['cluster_id']} | {s['sentence_count']} sents | "
            + " /// ".join(s["top_sentences"][:2])
            for s in batch
        )

    def label_batch(batch):
        raw = (label_prompt | llm | StrOutputParser()).invoke(
            {"clusters": _format_batch(batch)}
        )
        return _parse_llm_json(raw)

    def default_label(summary):
        return {"label": f"Topic {summary['cluster_id']}", "category": "Unknown",
                "confidence": 0.5, "reasoning": "", "is_niche": False}

    batches = [summaries[i:i + BATCH_SIZE]
               for i in range(0, len(summaries), BATCH_SIZE)]
    results = [item for batch_result in map(label_batch, batches)
               for item in batch_result]

    # Keyed by the string form of the id so that a reply using "3" still
    # matches cluster 3; entries that are not label objects are ignored.
    label_map = {
        str(r["cluster_id"]).strip(): r
        for r in results
        if isinstance(r, dict) and "cluster_id" in r
    }
    labeled = [
        {
            **s,
            **label_map.get(str(s["cluster_id"]), default_label(s)),
            # Always keep the integer id produced by discovery.
            "cluster_id": s["cluster_id"],
        }
        for s in summaries
    ]
    _save(labeled, f"{run_key}_labels.json")
    _generate_charts(run_key, labeled, name_key="label")

    return (
        f"✅ Labels Generated ({run_key})\n"
        f"Topics labeled: {len(labeled)}\n"
        f"Review table populated. Edit Approve/Rename columns, "
        f"then click Submit Review."
    )


# ─── TOOL 4: CONSOLIDATE INTO THEMES ─────────────────────────────────────────
@tool
def consolidate_into_themes(run_key: str, theme_map: str) -> str:
    """Merge researcher-approved topic groups into consolidated themes.
    theme_map: JSON string — array from review table with cluster_id, approve, rename_to fields.
    Recomputes centroids and paper counts from actual embeddings.
    Saves {run_key}_themes.json. Phase 3 of Braun & Clarke."""
    # Rows whose "approve" is YES (case-insensitive) are grouped by theme name:
    # rename_to if non-blank, else label, else "Topic <cluster_id>".
    decisions = json.loads(theme_map)
    emb = np.load(_opath(f"{run_key}_emb.npy"))
    paper_ids = _load(f"{run_key}_sentences.json")["paper_ids"]
    sum_map = {s["cluster_id"]: s for s in _load(f"{run_key}_summaries.json")}

    approved = [d for d in decisions
                if str(d.get("approve", "")).strip().upper() == "YES"]

    def theme_name(decision):
        renamed = str(decision.get("rename_to") or "").strip()
        label = str(decision.get("label") or "").strip()
        return renamed or label or f"Topic {decision['cluster_id']}"

    named = [(theme_name(d), int(d["cluster_id"])) for d in approved]
    # Themes keep first-appearance order; a cluster listed twice under one
    # theme is counted once.
    theme_groups = {
        name: list(dict.fromkeys(cid for n, cid in named if n == name))
        for name, _ in named
    }

    def build_theme(name_cids_tuple):
        name, cids = name_cids_tuple
        known = [sum_map[cid] for cid in cids if cid in sum_map]
        # Sentence positions stored by discovery. The previous fallback matched
        # paper_ids against cluster ids, which are unrelated values, so it
        # selected arbitrary sentences; it has been removed.
        use_idx = sorted({i for s in known for i in s.get("sentence_indices", [])})
        # With no sentence positions available the first embedding serves as a
        # placeholder centroid so charting still has a vector to plot.
        theme_emb = emb[use_idx] if use_idx else emb[:1]
        centroid = theme_emb.mean(axis=0)
        top_sents = (sum_map[cids[0]]["top_sentences"][:3]
                     if cids and cids[0] in sum_map else [])
        return {
            "theme_name": name,
            "merged_cluster_ids": cids,
            "sentence_count": sum(s["sentence_count"] for s in known),
            "paper_count": len({paper_ids[i] for i in use_idx}),
            "top_sentences": top_sents,
            "centroid": centroid.tolist(),
        }

    themes = sorted(map(build_theme, theme_groups.items()),
                    key=lambda x: x["sentence_count"], reverse=True)
    _save(themes, f"{run_key}_themes.json")
    _generate_charts(run_key, themes, name_key="theme_name")

    return (
        f"✅ Themes Consolidated ({run_key})\n"
        f"Approved topics: {len(approved)}\n"
        f"Final themes: {len(themes)}\n"
        f"Theme names: {[t['theme_name'] for t in themes]}\n"
        f"Review consolidated themes. Click Submit Review to confirm."
    )


# ─── TOOL 5: COMPARE WITH TAXONOMY ───────────────────────────────────────────
@tool
def compare_with_taxonomy(run_key: str) -> str:
    """Map final themes to the PAJAIS taxonomy (Jiang et al. 2019) — 25 categories.
    Classifies each theme as MAPPED or NOVEL. Saves taxonomy_map.json.
    Phase 5.5 of Braun & Clarke. run_key must be 'abstract' or 'title'."""
    # Uses {run_key}_themes.json when present, otherwise {run_key}_labels.json.
    themes_file = (f"{run_key}_themes.json"
                   if os.path.exists(_opath(f"{run_key}_themes.json"))
                   else f"{run_key}_labels.json")
    themes_raw = _load(themes_file)
    theme_names = [t.get("theme_name", t.get("label", "")) for t in themes_raw]

    llm = _get_llm()
    tax_prompt = PromptTemplate.from_template(
        "You are a bibliometric taxonomy expert.\n"
        "Map each theme to the PAJAIS taxonomy (Jiang et al., 2019).\n\n"
        "PAJAIS 25 categories:\n{pajais}\n\n"
        "Themes to classify:\n{themes}\n\n"
        "Return ONLY a JSON array:\n"
        '[{{"theme": "...", "pajais_match": "category or NOVEL", '
        '"match_confidence": 0.0-1.0, "reasoning": "...", "is_novel": true/false}}]\n'
        "If no PAJAIS category fits well, set pajais_match to NOVEL and is_novel to true.\n"
        "No markdown fences, return raw JSON only."
    )

    pajais_str = "\n".join(f"{i+1}. {c}" for i, c in enumerate(PAJAIS_25))
    themes_str = "\n".join(f"- {n}" for n in theme_names)

    raw = (tax_prompt | llm | StrOutputParser()).invoke(
        {"pajais": pajais_str, "themes": themes_str}
    )
    # Only well-formed entries (objects naming a theme) are kept.
    results = [r for r in _parse_llm_json(raw)
               if isinstance(r, dict) and "theme" in r]

    mapped = [r for r in results if not r.get("is_novel", False)]
    novel = [r for r in results if r.get("is_novel", False)]
    covered = {r.get("pajais_match") for r in mapped}
    gaps = [c for c in PAJAIS_25 if c not in covered]

    taxonomy_map = {
        "run_key": run_key,
        "taxonomy_mapping": {r["theme"]: r for r in results},
        "novel_themes": [r["theme"] for r in novel],
        "pajais_gap_categories": gaps,
        "coverage_stats": {
            "total_themes": len(results),
            "mapped": len(mapped),
            "novel": len(novel),
        },
    }
    _save(taxonomy_map, "taxonomy_map.json")

    return (
        f"✅ PAJAIS Taxonomy Mapped ({run_key})\n"
        f"Themes mapped: {len(mapped)}\n"
        f"NOVEL themes: {len(novel)} → {[r['theme'] for r in novel]}\n"
        f"PAJAIS gaps (top 5): {gaps[:5]}\n"
        f"taxonomy_map.json saved. Review PAJAIS mapping in table. Click Submit Review."
    )


# ─── TOOL 6: GENERATE COMPARISON CSV ─────────────────────────────────────────
@tool
def generate_comparison_csv() -> str:
    """Load themes from both abstract and title runs and create a side-by-side comparison.
    Identifies STABLE (convergent), ABSTRACT-ONLY, and TITLE-ONLY themes.
    Saves comparison.csv. Phase 6 of Braun & Clarke."""
    # Rows are positional: row i holds the i-th abstract theme and the i-th
    # title theme; the shorter list is padded with blanks.

    def load_themes(key):
        fname = (f"{key}_themes.json"
                 if os.path.exists(_opath(f"{key}_themes.json"))
                 else f"{key}_labels.json")
        return _load(fname)

    def names_of(themes):
        return [t.get("theme_name", t.get("label", "")) for t in themes]

    def evidence_of(themes):
        return [" | ".join(t.get("top_sentences", [""])[:1]) for t in themes]

    def sizes_of(themes):
        return [t.get("sentence_count", 0) for t in themes]

    abs_themes = load_themes("abstract")
    ttl_themes = load_themes("title")
    abs_names = names_of(abs_themes)
    ttl_names = names_of(ttl_themes)
    max_len = max(len(abs_themes), len(ttl_themes))

    def pad(values, filler):
        return values + [filler] * (max_len - len(values))

    abs_set = set(abs_names)
    ttl_set = set(ttl_names)
    # Rows with an abstract theme are judged by that theme. Overflow rows hold
    # only a title theme, which is TITLE-ONLY only if no abstract theme shares
    # its name (previously every overflow row was marked TITLE-ONLY).
    convergence = (
        ["STABLE" if a in ttl_set else "ABSTRACT-ONLY" for a in abs_names]
        + ["STABLE" if t in abs_set else "TITLE-ONLY"
           for t in ttl_names[len(abs_names):]]
    )

    df = pd.DataFrame({
        "Abstract_Theme": pad(abs_names, ""),
        "Abstract_Evidence": pad(evidence_of(abs_themes), ""),
        "Abstract_Sentences": pad(sizes_of(abs_themes), 0),
        "Title_Theme": pad(ttl_names, ""),
        "Title_Evidence": pad(evidence_of(ttl_themes), ""),
        "Title_Sentences": pad(sizes_of(ttl_themes), 0),
        "Convergence": convergence,
    })
    path = _opath("comparison.csv")
    df.to_csv(path, index=False)

    return (
        f"✅ Comparison CSV Generated\n"
        f"Abstract themes: {len(abs_themes)}\n"
        f"Title themes: {len(ttl_themes)}\n"
        f"Rows: {len(df)}\nFile: {path}\n"
        f"Check Download tab for comparison.csv. Click Submit Review to generate narrative."
    )


# ─── TOOL 7: EXPORT NARRATIVE ─────────────────────────────────────────────────
@tool
def export_narrative(run_key: str) -> str:
    """Generate a 500-word Section 7 narrative via Mistral LLM.
    Uses themes + PAJAIS taxonomy mapping as context. Saves narrative.txt.
    Phase 6 of Braun & Clarke. run_key must be 'abstract' or 'title'."""
    # Requires corpus_config.json and taxonomy_map.json from the earlier tools.
    cfg = _load("corpus_config.json")
    theme_file = (f"{run_key}_themes.json"
                  if os.path.exists(_opath(f"{run_key}_themes.json"))
                  else f"{run_key}_labels.json")
    themes = _load(theme_file)
    tax = _load("taxonomy_map.json")

    theme_names = [t.get("theme_name", t.get("label", "")) for t in themes]
    novel_themes = tax.get("novel_themes", [])
    gaps = tax.get("pajais_gap_categories", [])
    mapped = tax.get("coverage_stats", {}).get("mapped", 0)

    llm = _get_llm()
    # The run named in the prompt follows run_key (it used to say "abstract run"
    # even for the title run); the prompt for run_key='abstract' is unchanged.
    narr_prompt = PromptTemplate.from_template(
        "Write a 500-word Section 7 for a conference paper on topic modelling.\n"
        "Journal: {journal} | Papers: {papers} | Years: {y_min}–{y_max}\n"
        "Stable BERTopic themes ({run_key} run): {themes}\n"
        "NOVEL themes (not in PAJAIS 2019): {novel}\n"
        "PAJAIS gap categories: {gaps}\n"
        "Themes mapped to PAJAIS: {mapped}\n\n"
        "Structure: 7.1 Methodology (LDA + BERTopic, Braun & Clarke 2006), "
        "7.2 RQ4 LDA Findings, 7.3 RQ5 Abstract vs Title Comparison, "
        "7.4 RQ6 PAJAIS Taxonomy Mapping with NOVEL theme justification, "
        "7.5 RQ7 Future Research Agenda.\n"
        "Cite: Braun & Clarke (2006), Jiang et al. (2019), Grootendorst (2022).\n"
        "~500 words, academic tone, no bullet points, paragraph form."
    )

    narrative = (narr_prompt | llm | StrOutputParser()).invoke({
        "journal": cfg.get("journal", "Electronic Markets"),
        "papers": cfg.get("rows", 908),
        "y_min": cfg.get("year_min", 2007),
        "y_max": cfg.get("year_max", 2026),
        "run_key": run_key,
        "themes": ", ".join(theme_names[:10]),
        "novel": ", ".join(novel_themes[:5]),
        "gaps": ", ".join(gaps[:5]),
        "mapped": mapped,
    })

    path = _opath("narrative.txt")
    with open(path, "w", encoding="utf-8") as f:
        f.write(narrative)

    return (
        f"✅ Narrative Exported\n"
        f"Words: {len(narrative.split())}\n"
        f"File: {path}\n"
        f"🎉 Pipeline complete! Download narrative.txt from the Download tab.\n"
        f"Deliverables: comparison.csv | taxonomy_map.json | narrative.txt"
    )


# --- SET handle_tool_error ON ALL TOOLS (BaseTool property) ---
# langchain-core 0.3.x: handle_tool_error is a BaseTool property,
# not a @tool() decorator argument. Using map() - zero loops.
_ALL_TOOLS = [
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    consolidate_into_themes,
    compare_with_taxonomy,
    generate_comparison_csv,
    export_narrative,
]
list(map(lambda t: setattr(t, "handle_tool_error", True), _ALL_TOOLS))