""" tools.py - 7 LangChain @tool functions for BERTopic Thematic Analysis Agent. Rules: ZERO if/else, ZERO for/while, ZERO try/except, ZERO PromptTemplate. All LLM calls use plain HumanMessage strings directly. Workflow: - Abstract run saves to data/abstract/ - Title run saves to data/title/ - Comparison CSV + narrative only generated when BOTH runs are complete - Topic IDs are sequential 1..N (not raw cluster labels) - Boilerplate filter catches © symbol, all major publishers """ from __future__ import annotations import json import re import shutil from pathlib import Path import numpy as np import pandas as pd import plotly.express as px from langchain_core.tools import tool from langchain_core.messages import HumanMessage from langchain_mistralai import ChatMistralAI from sentence_transformers import SentenceTransformer from sklearn.cluster import AgglomerativeClustering from sklearn.decomposition import PCA from sklearn.metrics.pairwise import cosine_similarity # ── paths ────────────────────────────────────────────────────────────────────── DATA_DIR = Path("data") DATA_DIR.mkdir(exist_ok=True) # ── Embedding model — loaded ONCE at module level, reused everywhere ─────────── # This prevents repeated HuggingFace downloads and avoids 429 rate limit errors. # The UNEXPECTED embeddings.position_ids warning is harmless — safe to ignore. 
print("Loading sentence-transformers model (one-time)...")
_EMBED_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
print("Model loaded OK.")

# Every text artefact is read and written as UTF-8. The JSON files are dumped
# with ensure_ascii=False, so relying on the platform default encoding would
# fail (or mis-decode) on any locale whose default is not UTF-8.
_ENCODING = "utf-8"

# Matches integer-like topic ids as they may come back from an LLM: 3, "3", 3.0
_INT_LIKE_RE = re.compile(r"[+-]?\d+(?:\.0+)?")

# Captures the body of the first markdown code fence (with or without a "json"
# tag). The closing fence is optional so a truncated reply still parses.
_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)(?:```|\Z)", re.DOTALL | re.IGNORECASE)


def _p(run_config: str) -> dict:
    """Return all file paths for a given run_config, creating subdirectory."""
    d = DATA_DIR / run_config
    d.mkdir(parents=True, exist_ok=True)
    return {
        "dir": d,
        "sentences": d / "sentences.json",
        "stats": d / "stats.json",
        "papers": d / "papers.csv",
        "emb": d / "emb.npy",
        "summaries": d / "summaries.json",
        "charts": d / "charts.json",
        "themes": d / "themes.json",
        "taxonomy": d / "taxonomy.json",
        "narrative": d / "narrative.txt",
        "comparison": DATA_DIR / "comparison.csv",  # shared output
    }


def _read_json(path):
    """Parse the UTF-8 JSON file at ``path`` and return the decoded object."""
    return json.loads(path.read_text(encoding=_ENCODING))


def _write_json(path, payload, indent=None) -> None:
    """Serialise ``payload`` to ``path`` as UTF-8 JSON, keeping non-ASCII text readable."""
    path.write_text(
        json.dumps(payload, indent=indent, ensure_ascii=False),
        encoding=_ENCODING,
    )


RUN_CONFIGS = {
    "abstract": ["Abstract"],
    "title": ["Title"],
}

# Comprehensive boilerplate filter — catches © symbol + all major publishers
BOILERPLATE_PATTERNS = [
    r"\u00a9",  # © unicode
    r"\\u00a9",  # escaped unicode
    r"copyright\s*\d{4}",
    r"\d{4}\s+john wiley",
    r"john wiley\s*(&|and)\s*sons",
    r"blackwell\s*(publishing|pub)",
    r"wiley\s+periodicals",
    r"wiley\s+online",
    r"all rights reserved",
    r"doi\s*:\s*\S+",
    r"published by elsevier",
    r"elsevier\s*(b\.v|inc|ltd|science)",
    r"springer\s*(nature|verlag|science|link)",
    r"taylor\s*(&|and)\s*francis",
    r"informa\s+uk",
    r"sage\s+publications",
    r"information systems journal\s+published",
    r"emerald\s+(publishing|group)",
    r"this article is",
    r"rights reserved",
    r"permission from",
    r"reproduced with",
]
BOILERPLATE_RE = re.compile("|".join(BOILERPLATE_PATTERNS), re.IGNORECASE)

# Extra keyword filter applied per-sentence
PUBLISHER_KEYWORDS = frozenset([
    "wiley", "elsevier", "blackwell", "springer", "taylor",
    "information systems journal", "emerald",
])

PAJAIS_CATEGORIES = [
    "Information Systems Theory",
    "IS Strategy & Governance",
    "Digital Innovation",
    "Enterprise Systems",
    "AI & Intelligent Systems",
    "Big Data & Analytics",
    "Cybersecurity & Privacy",
    "Cloud Computing",
    "IS in Healthcare",
    "IS in Education",
    "E-Commerce & Digital Markets",
    "Social Media & Platforms",
    "Human-Computer Interaction",
    "IS Project Management",
    "IT Outsourcing",
    "Knowledge Management",
    "IS Development Methodologies",
    "Digital Transformation",
    "IS Ethics & Society",
    "IS in Developing Countries",
    "Mobile Computing",
    "IT Infrastructure",
    "IS Adoption & Diffusion",
    "IS Evaluation",
    "Organizational IS & Change",
]


def safe_read_csv(path):
    """Read CSV as UTF-8 (BOM tolerated) with fallback to latin-1.

    ``utf-8-sig`` decodes plain UTF-8 identically to ``utf-8`` but also strips
    a leading byte-order mark, which would otherwise end up glued to the
    first column name and defeat the column lookups done by the callers.
    """
    try:
        return pd.read_csv(path, encoding="utf-8-sig")
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding="latin-1")


def _is_clean(s: str) -> bool:
    """Return True if sentence passes all quality checks.

    A sentence is kept when it contains no boilerplate pattern, has more than
    6 whitespace-separated words, is longer than 40 characters once stripped,
    and mentions none of the publisher keywords.
    """
    stripped = s.strip()
    lowered = stripped.lower()
    # BOILERPLATE_RE already matches "©" anywhere in the sentence, so no
    # separate "starts with ©" test is needed.
    return (
        BOILERPLATE_RE.search(s) is None
        and len(s.split()) > 6
        and len(stripped) > 40
        and not any(kw in lowered for kw in PUBLISHER_KEYWORDS)
    )


def _call_llm_json(llm, prompt: str) -> "list | dict":
    """Call LLM with plain HumanMessage, strip markdown fences, parse JSON.

    Handles replies wrapped in a json-tagged fence, in an untagged fence, or
    not fenced at all. Raises ``json.JSONDecodeError`` when the remaining
    text is not valid JSON.
    """
    response = llm.invoke([HumanMessage(content=prompt)])
    raw = response.content.strip()
    fenced = _FENCE_RE.search(raw)
    payload = fenced.group(1).strip() if fenced else raw
    return json.loads(payload)


def _map_with_pause(func, items, pause_seconds):
    """Apply ``func`` to each item in order, sleeping between consecutive calls.

    No sleep follows the final call. Used to keep sequential LLM requests
    under the provider's rate limit. Returns the list of results.
    """
    import time

    pending = list(items)
    last_index = len(pending) - 1

    def run(indexed):
        idx, item = indexed
        result = func(item)
        _ = time.sleep(pause_seconds) if idx < last_index else None
        return result

    return list(map(run, enumerate(pending)))


def _coerce_topic_id(value):
    """Return ``value`` as an int when it is integer-like (3, "3", 3.0), else None."""
    text = str(value).strip()
    return int(float(text)) if _INT_LIKE_RE.fullmatch(text) else None


def _both_runs_complete() -> bool:
    """Return True only when BOTH abstract and title runs have themes saved."""
    return _p("abstract")["themes"].exists() and _p("title")["themes"].exists()


# =============================================================================
# TOOL 1 — load_scopus_csv
# Saves to data/uploaded.csv (permanent copy) AND data/{run_config}/papers.csv
# Returns a JSON string: the stats dict on success, or {"error", "run_config"}
# when the file or the text column cannot be found.
# =============================================================================
@tool
def load_scopus_csv(csv_path: str, run_config: str = "abstract") -> str:
    """Load a Scopus CSV, filter boilerplate sentences, save per run_config.

    Saves sentences to data/{run_config}/sentences.json.
    Also copies the CSV permanently to data/uploaded.csv.

    Args:
        csv_path: Path to the uploaded Scopus CSV file.
        run_config: 'abstract' or 'title' (default 'abstract').
    """
    p = _p(run_config)
    columns = RUN_CONFIGS.get(run_config, ["Abstract"])

    # Copy CSV to permanent location only if it is a different file
    dest = DATA_DIR / "uploaded.csv"
    src = Path(csv_path).resolve()
    dst = dest.resolve()

    # Report a missing upload in the same JSON shape as a missing column
    # instead of letting shutil / pandas raise.
    if not src.is_file():
        return json.dumps({
            "error": "CSV file not found: {}".format(csv_path),
            "run_config": run_config,
        })

    _ = shutil.copy(str(src), str(dst)) if src != dst else None
    df_raw = safe_read_csv(dest)

    # Find which text column actually exists in this CSV
    # Scopus sometimes uses "Abstract" or "abstract" or "ABSTRACT"
    col_lower_map = {c.strip().lower(): c for c in df_raw.columns}
    target_lower = columns[0].lower()
    actual_col = col_lower_map.get(target_lower, None)

    # Also try partial match if exact match fails
    actual_col = (
        actual_col
        if actual_col is not None
        else next(filter(lambda c: target_lower in c.lower(), df_raw.columns), None)
    )

    # If still not found, return early with clear message
    if actual_col is None:
        available = list(df_raw.columns)
        return json.dumps({
            "error": "Column '{}' not found in CSV. Available columns: {}".format(
                columns[0], available
            ),
            "run_config": run_config,
        })

    # Build keep_cols — deduplicate to avoid DataFrame-instead-of-Series bug
    # when actual_col == "Title" (title run) and "Title" also appears in extras
    extras = ["Title", "Year", "Source title", "Cited by"]
    all_wanted = [actual_col] + [c for c in extras if c != actual_col]
    keep_cols = list(dict.fromkeys(filter(lambda c: c in df_raw.columns, all_wanted)))
    df = df_raw[keep_cols].copy()

    text_series = df[actual_col]
    # If still a DataFrame (duplicate col names), take first column
    text_series = (
        text_series.iloc[:, 0]
        if isinstance(text_series, pd.DataFrame)
        else text_series
    )

    # Drop rows whose text cell is missing or blank
    mask = text_series.notna() & (text_series.astype(str).str.strip() != "")
    df = df[mask].copy()
    text_series = text_series[mask]

    def split_sentences(text):
        # Split after sentence-ending punctuation, keep only clean sentences
        parts = re.split(r"(?<=[.!?])\s+", str(text))
        return list(filter(_is_clean, parts))

    sentences_lists = list(map(split_sentences, list(text_series)))
    all_sentences = [s for lst in sentences_lists for s in lst]

    stats = {
        "papers": int(len(df)),
        "sentences_after_filter": int(len(all_sentences)),
        "columns_used": [actual_col],
        "run_config": run_config,
    }

    _write_json(p["sentences"], all_sentences)
    _write_json(p["stats"], stats)
    df.to_csv(p["papers"], index=False)
    return json.dumps(stats)


def _build_discovery_charts(summaries: list, run_config: str) -> dict:
    """Render the four Plotly overview charts for a discovery run as HTML fragments.

    Returns a dict with keys bar, histogram, scatter, treemap. Only the bar
    chart references the plotly.js CDN; the others are emitted without the
    library. ``summaries`` must be non-empty.
    """
    sizes = list(map(lambda s: s["size"], summaries))
    ids = list(map(lambda s: s["topic_id"], summaries))

    fig_bar = px.bar(
        x=ids, y=sizes,
        title="Topic Sizes — {}".format(run_config),
        labels={"x": "Topic #", "y": "Sentences"},
    )
    fig_hist = px.histogram(
        x=sizes, nbins=30,
        title="Size Distribution — {}".format(run_config),
        labels={"x": "Cluster Size"},
    )

    centroids = np.array(list(map(lambda s: s["centroid"], summaries)))
    # PCA cannot ask for more components than samples or features
    n_comp = min(2, centroids.shape[0], centroids.shape[1])
    coords = PCA(n_components=n_comp).fit_transform(centroids)
    fig_scatter = px.scatter(
        x=coords[:, 0],
        y=(coords[:, 1] if coords.shape[1] > 1 else [0] * len(coords)),
        text=list(map(str, ids)),
        title="Topic Centroids PCA — {}".format(run_config),
        labels={"x": "PC1", "y": "PC2"},
    )
    fig_tree = px.treemap(
        names=list(map(str, ids)),
        parents=["Topics"] * len(ids),
        values=sizes,
        title="Treemap — {}".format(run_config),
    )

    return {
        "bar": fig_bar.to_html(full_html=False, include_plotlyjs="cdn"),
        "histogram": fig_hist.to_html(full_html=False, include_plotlyjs=False),
        "scatter": fig_scatter.to_html(full_html=False, include_plotlyjs=False),
        "treemap": fig_tree.to_html(full_html=False, include_plotlyjs=False),
    }


# =============================================================================
# TOOL 2 — run_bertopic_discovery
# threshold=0.35 → ~100 fine-grained clusters; IDs renumbered 1..N
# Writes emb.npy, summaries.json and charts.json for the run. Returns a JSON
# string with the topic count, or {"error", "run_config"} when there is not
# enough data to cluster.
# =============================================================================
@tool
def run_bertopic_discovery(top_n_topics: int = 100, run_config: str = "abstract") -> str:
    """Embed sentences with all-MiniLM-L6-v2, cluster with AgglomerativeClustering
    (cosine, threshold=0.35) targeting ~100 topics. Topic IDs are sequential 1..N.

    Args:
        top_n_topics: Target number of clusters (default 100).
        run_config: 'abstract' or 'title' (default 'abstract').
    """
    p = _p(run_config)
    sentences = _read_json(p["sentences"])

    # Hierarchical clustering needs at least two samples to fit
    if len(sentences) < 2:
        return json.dumps({
            "error": "Need at least 2 sentences to cluster, found {}.".format(
                len(sentences)
            ),
            "run_config": run_config,
        })

    embeddings = _EMBED_MODEL.encode(
        sentences,
        normalize_embeddings=True,
        show_progress_bar=False,
        batch_size=64,
    )
    np.save(p["emb"], embeddings)

    clustering = AgglomerativeClustering(
        metric="cosine",
        linkage="average",
        distance_threshold=0.35,
        n_clusters=None,
    )
    labels = clustering.fit_predict(embeddings)

    # One pass over the labels gives every cluster id (ascending) and its size
    unique_labels, counts = np.unique(labels, return_counts=True)
    label_sizes = list(zip(unique_labels.tolist(), counts.tolist()))

    # Keep clusters with ≥3 sentences, sort by size desc, take top N
    label_filtered = list(filter(lambda x: x[1] >= 3, label_sizes))
    label_sorted = sorted(label_filtered, key=lambda x: -x[1])
    retained = list(map(lambda x: x[0], label_sorted[:top_n_topics]))

    # Without any retained cluster there are no centroids to chart or label
    if not retained:
        return json.dumps({
            "error": "No cluster with at least 3 sentences was found.",
            "run_config": run_config,
        })

    def build_summary(seq_label):
        seq_id, raw_label = seq_label
        mask = labels == raw_label
        cluster_embs = embeddings[mask]
        raw_sents = [sentences[i] for i in np.flatnonzero(mask)]
        clean_sents = list(filter(_is_clean, raw_sents))
        sents = clean_sents if clean_sents else raw_sents[:5]

        # Evidence = the (up to) 5 sentences closest to the cluster centroid
        centroid = cluster_embs.mean(axis=0, keepdims=True)
        sims = cosine_similarity(centroid, cluster_embs)[0]
        top5_idx = sims.argsort()[-5:][::-1].tolist()
        raw_top = list(map(lambda i: raw_sents[i], top5_idx))
        clean_set = set(sents)
        top_evidence = list(filter(lambda s: s in clean_set, raw_top))[:5]
        top_evidence = top_evidence if top_evidence else raw_top[:3]

        return {
            "topic_id": seq_id,
            "size": int(mask.sum()),
            "top_evidence": top_evidence,
            "sentences": sents,
            "centroid": centroid[0].tolist(),
            "run_config": run_config,
        }

    # Sequential IDs starting at 1
    seq_pairs = list(enumerate(retained, start=1))
    summaries = list(map(build_summary, seq_pairs))
    _write_json(p["summaries"], summaries, indent=2)

    charts = _build_discovery_charts(summaries, run_config)
    _write_json(p["charts"], charts)

    return json.dumps({
        "topics_found": len(summaries),
        "run_config": run_config,
        "chart_types": list(charts.keys()),
        "note": "Topics numbered 1..{}, threshold=0.35".format(len(summaries)),
    })


# =============================================================================
# TOOL 3 — label_topics_with_llm
# Overwrites summaries.json with the labelled topics. Only the first 60 topics
# are labelled and written back.
# NOTE(review): topics beyond the cap are therefore dropped from
# summaries.json — confirm this is intended.
# =============================================================================
@tool
def label_topics_with_llm(batch_size: int = 15, run_config: str = "abstract") -> str:
    """Label topic clusters with human-readable names via Mistral LLM.

    Uses mistral-small-latest to stay within free-tier rate limits.
    Adds 12-second sleep between batches to avoid HTTP 429 errors.

    Args:
        batch_size: Topics per LLM call (default 15).
        run_config: 'abstract' or 'title' (default 'abstract').
    """
    p = _p(run_config)
    summaries = _read_json(p["summaries"])

    # Cap at 60 to reduce total API calls — covers the most meaningful clusters
    top_summaries = summaries[:60]

    # mistral-small has higher RPM limits than mistral-large on the free tier
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.2)

    # range() rejects a zero step, so clamp nonsensical batch sizes to 1
    step = max(1, int(batch_size))
    batch_starts = list(range(0, len(top_summaries), step))

    def label_batch(start):
        batch = top_summaries[start: start + step]
        # Only 2 evidence sentences per topic to reduce token usage
        mini = list(map(
            lambda s: {"topic_id": s["topic_id"], "sentences": s["top_evidence"][:2]},
            batch
        ))
        topic_ids_in_batch = list(map(lambda s: s["topic_id"], batch))
        prompt = (
            "You are a thematic analysis expert in Information Systems research.\n"
            "For each topic cluster below, provide:\n"
            " - label: a specific 3-6 word academic theme name (e.g. 'Digital Transformation Barriers', "
            "'AI Adoption in Healthcare', 'IS Project Management Challenges')\n"
            " - reasoning: one sentence explaining why you chose that label\n\n"
            "IMPORTANT: You MUST return exactly one entry for each topic_id in this list: "
            + str(topic_ids_in_batch) + "\n\n"
            "TOPICS:\n" + json.dumps(mini, indent=2) + "\n\n"
            "Return ONLY a raw JSON array with no markdown fences. "
            "Each element must have exactly these three keys: "
            "topic_id (integer matching the input), label (string), reasoning (string)."
        )
        reply = _call_llm_json(llm, prompt)
        # A single JSON object instead of an array is treated as one entry
        return reply if isinstance(reply, list) else [reply]

    # Sequential with sleep between batches — free tier ~5 req/min for mistral-small
    # 12 seconds between calls keeps us safely under the limit
    batches = _map_with_pause(label_batch, batch_starts, 12)
    all_labels_raw = [item for batch in batches for item in batch]

    # Key every usable entry by its integer topic id. The LLM sometimes
    # returns "1" instead of 1, and sometimes omits the id altogether; entries
    # without a usable id are skipped so the topic falls back to "Topic N".
    keyed = map(
        lambda item: (_coerce_topic_id(item.get("topic_id", "")), item),
        filter(lambda item: isinstance(item, dict), all_labels_raw),
    )
    label_map = {tid: item for tid, item in keyed if tid is not None}

    def fallback_label(tid):
        return "Topic {}".format(tid)

    def enrich(s):
        tid = s["topic_id"]
        info = label_map.get(tid) or {}
        raw_label = str(info.get("label", "")).strip()
        raw_reason = str(info.get("reasoning", "")).strip()
        good_label = (
            raw_label
            if raw_label and raw_label.lower() not in ("", "n/a", "none", "null")
            else fallback_label(tid)
        )
        return {**s, "label": good_label, "reasoning": raw_reason}

    enriched = list(map(enrich, top_summaries))
    _write_json(p["summaries"], enriched, indent=2)

    # Count topics that got a real label. Compare against the exact fallback
    # so a genuine label such as "Topic Modelling Methods" still counts.
    labelled_count = sum(
        1 for s in enriched if s["label"] != fallback_label(s["topic_id"])
    )

    return json.dumps({
        "labelled_topics": len(enriched),
        "with_llm_label": labelled_count,
        "run_config": run_config,
    })


# =============================================================================
# TOOL 4 — consolidate_into_themes
# Writes themes.json for the run. Returns {"error", "run_config"} as JSON when
# a group references an unknown topic id or has no topic ids at all.
# =============================================================================
@tool
def consolidate_into_themes(approved_groups: str, run_config: str = "abstract") -> str:
    """Merge approved topic groups into themes and recompute centroids.

    Args:
        approved_groups: JSON list [{theme_name: str, topic_ids: [int,...]}]
        run_config: 'abstract' or 'title' (default 'abstract').
    """
    p = _p(run_config)
    groups = json.loads(approved_groups)
    summaries = _read_json(p["summaries"])
    id_map = {s["topic_id"]: s for s in summaries}

    # Accept ids given as "3" or 3.0 as well as 3; anything else becomes None
    resolved = list(map(
        lambda group: list(map(_coerce_topic_id, group["topic_ids"])),
        groups
    ))

    unknown = sorted({
        str(raw)
        for group, ids in zip(groups, resolved)
        for raw, tid in zip(group["topic_ids"], ids)
        if tid not in id_map
    })
    empty = [group["theme_name"] for group, ids in zip(groups, resolved) if not ids]

    # A theme with no members has no centroid; an unknown id has no summary
    if unknown or empty:
        return json.dumps({
            "error": "Unknown topic_ids: {}. Themes without topic_ids: {}".format(
                unknown, empty
            ),
            "run_config": run_config,
        })

    def build_theme(group_ids):
        group, ids = group_ids
        members = list(map(lambda tid: id_map[tid], ids))
        sents = [s for ms in members for s in ms.get("sentences", [])]
        centroids = np.array(list(map(lambda ms: ms["centroid"], members)))
        return {
            "theme_name": group["theme_name"],
            "topic_ids": ids,
            "sentences": sents,
            # Theme centroid = unweighted mean of its member topic centroids
            "centroid": centroids.mean(axis=0).tolist(),
            # NOTE(review): this counts distinct sentences, not papers
            "paper_count": len(set(sents)),
            "run_config": run_config,
        }

    themes = list(map(build_theme, zip(groups, resolved)))
    _write_json(p["themes"], themes, indent=2)

    return json.dumps({
        "themes_created": len(themes),
        "theme_names": list(map(lambda t: t["theme_name"], themes)),
        "run_config": run_config,
        "both_complete": _both_runs_complete(),
    })


# =============================================================================
# TOOL 5 — compare_with_taxonomy
# Writes taxonomy.json for the run with whatever JSON the LLM returned.
# =============================================================================
@tool
def compare_with_taxonomy(run_config: str = "abstract") -> str:
    """Map themes to PAJAIS 25 categories via Mistral LLM.

    Args:
        run_config: 'abstract' or 'title' (default 'abstract').
    """
    p = _p(run_config)
    themes = _read_json(p["themes"])
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.1)

    # Two sample sentences per theme keep the prompt small
    theme_mini = list(map(
        lambda t: {"name": t["theme_name"], "sample": t["sentences"][:2]},
        themes
    ))

    prompt = (
        "You are a research classification expert in Information Systems.\n\n"
        "Map each theme to the single most relevant PAJAIS category.\n\n"
        "THEMES:\n" + json.dumps(theme_mini, indent=2) + "\n\n"
        "PAJAIS CATEGORIES:\n" + json.dumps(PAJAIS_CATEGORIES, indent=2) + "\n\n"
        "Return ONLY a raw JSON array. "
        "Each element: name, pajais_category, confidence, rationale. "
        "No markdown, no explanation."
    )

    result = _call_llm_json(llm, prompt)
    _write_json(p["taxonomy"], result, indent=2)

    return json.dumps({
        "mapped_themes": len(result),
        "run_config": run_config,
        "both_complete": _both_runs_complete(),
    })


# =============================================================================
# TOOL 6 — generate_comparison_csv
# ONLY runs when BOTH abstract and title runs are complete
# Columns: Title | Abstract | Year | Source Journal
# =============================================================================
@tool
def generate_comparison_csv() -> str:
    """Generate Title | Abstract | Year | Source Journal comparison CSV.

    Only available after BOTH abstract and title runs have completed themes.
    Saves to data/comparison.csv.
    """
    abs_complete = _p("abstract")["themes"].exists()
    title_complete = _p("title")["themes"].exists()

    status_msg = (
        "Abstract complete: {}, Title complete: {}. "
        "Run 'run title' to complete the title analysis first."
    ).format(abs_complete, title_complete)

    # Use ternary to avoid if/else
    result = (
        _do_generate_comparison_csv()
        if (abs_complete and title_complete)
        else status_msg
    )
    return result


def _assign_themes_for_texts(texts: list, themes: list, taxonomy_map: dict) -> list:
    """Assign the nearest theme to every text with one batched embedding call.

    Each text is embedded and compared by cosine similarity against the theme
    centroids. Returns one ``(theme_name, pajais_category, similarity)`` tuple
    per input text, in input order. ``pajais_category`` is "Unknown" for a
    theme missing from ``taxonomy_map``. ``themes`` must be non-empty.
    """
    # Encoding an empty batch yields nothing to compare against centroids
    if not texts:
        return []

    embeddings = _EMBED_MODEL.encode(
        list(map(str, texts)),
        normalize_embeddings=True,
        show_progress_bar=False,
        batch_size=64,
    )
    centroids = np.array(list(map(lambda t: t["centroid"], themes)))
    sims = cosine_similarity(embeddings, centroids)
    best_indices = sims.argmax(axis=1).tolist()

    def to_assignment(row_best):
        row, best_idx = row_best
        best_theme = themes[best_idx]["theme_name"]
        pajais = taxonomy_map.get(best_theme, "Unknown")
        return best_theme, pajais, float(round(sims[row, best_idx], 4))

    return list(map(to_assignment, enumerate(best_indices)))


def _assign_theme_for_text(text: str, themes: list, taxonomy_map: dict) -> tuple:
    """Find the best matching theme for one piece of text (title or abstract).

    Computes cosine similarity between the text embedding and the theme
    centroids. Returns ``(theme_name, pajais_category, similarity)``.
    """
    return _assign_themes_for_texts([text], themes, taxonomy_map)[0]


def _find_column(columns, wanted: str):
    """Return the first column whose stripped, lower-cased name equals ``wanted``, else None."""
    return next(filter(lambda c: c.strip().lower() == wanted, columns), None)


def _load_taxonomy_map(run_config: str) -> dict:
    """Return {theme name: PAJAIS category} for a run, or {} when no taxonomy file exists.

    The comparison tool is gated on themes only, so the taxonomy file may
    legitimately be absent; categories then resolve to "Unknown" downstream.
    """
    path = _p(run_config)["taxonomy"]
    taxonomy = _read_json(path) if path.exists() else []
    return {
        item.get("name", item.get("theme_name", "")): item.get("pajais_category", "")
        for item in taxonomy
    }


def _do_generate_comparison_csv() -> str:
    """Build enriched comparison CSV with per-paper theme assignments for both runs.

    Columns: Title | Title Theme | Title PAJAIS Category |
             Abstract | Abstract Theme | Abstract PAJAIS Category |
             Year | Source Journal | Theme Similarity | Similarity % |
             Similarity Reasoning

    Reads data/uploaded.csv plus both runs' themes (and taxonomy, when
    present), writes data/comparison.csv, and returns a JSON summary string.
    Returns {"error": ...} as JSON when either run has no themes.
    """
    df = safe_read_csv(DATA_DIR / "uploaded.csv")

    # Detect columns. Prefer the exact "Source title" header; the bare
    # "source" substring is only a fallback, because Scopus exports also
    # carry a "Source" column holding the database name.
    title_col = _find_column(df.columns, "title")
    abstract_col = _find_column(df.columns, "abstract")
    year_col = _find_column(df.columns, "year")
    journal_col = (
        _find_column(df.columns, "source title")
        or next(filter(lambda c: "source" in c.lower(), df.columns), None)
    )

    abs_themes = _read_json(_p("abstract")["themes"])
    abs_tax_map = _load_taxonomy_map("abstract")
    title_themes = _read_json(_p("title")["themes"])
    title_tax_map = _load_taxonomy_map("title")

    # Nearest-centroid assignment is undefined without centroids
    if not abs_themes or not title_themes:
        return json.dumps({
            "error": "Both runs need at least one theme. "
                     "Abstract themes: {}, Title themes: {}.".format(
                         len(abs_themes), len(title_themes)
                     ),
        })

    abs_theme_names = list(map(lambda t: t["theme_name"], abs_themes))
    title_theme_names = list(map(lambda t: t["theme_name"], title_themes))

    row_count = len(df)
    abstracts = list(df[abstract_col].fillna("")) if abstract_col else [""] * row_count
    titles = list(df[title_col].fillna("")) if title_col else [""] * row_count
    years = list(map(str, df[year_col])) if year_col else [""] * row_count
    journals = list(map(str, df[journal_col])) if journal_col else [""] * row_count

    # Assign themes per paper using centroid similarity (one batch per run)
    abs_assignments = _assign_themes_for_texts(abstracts, abs_themes, abs_tax_map)
    title_assignments = _assign_themes_for_texts(titles, title_themes, title_tax_map)

    # Use LLM to compute similarity reasoning between matched theme pairs
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.1)

    # Get unique theme pairs — call LLM once per pair, not once per paper.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique_pairs = list(dict.fromkeys(
        (a[0], t[0]) for a, t in zip(abs_assignments, title_assignments)
    ))

    def get_similarity_reasoning(pair):
        abs_theme, title_theme = pair
        abs_pajais = abs_tax_map.get(abs_theme, "Unknown")
        title_pajais = title_tax_map.get(title_theme, "Unknown")
        prompt = (
            "Compare these two research themes and assess their similarity:\n"
            "Abstract Theme: {} (PAJAIS: {})\n"
            "Title Theme: {} (PAJAIS: {})\n\n"
            "Return ONLY a raw JSON object with three keys:\n"
            " similarity_label: one of High/Medium/Low\n"
            " similarity_pct: integer 0-100\n"
            " reasoning: one sentence explaining the similarity or difference\n"
            "No markdown, no explanation, just the JSON object."
        ).format(abs_theme, abs_pajais, title_theme, title_pajais)
        result = _call_llm_json(llm, prompt)
        # Anything but a JSON object cannot be read with .get() below
        return pair, (result if isinstance(result, dict) else {})

    # Sequential with sleep to respect rate limits
    pair_map = dict(_map_with_pause(get_similarity_reasoning, unique_pairs, 8))

    # Build output rows
    def build_row(idx):
        a_theme, a_pajais, _a_sim = abs_assignments[idx]
        t_theme, t_pajais, _t_sim = title_assignments[idx]
        sim_info = pair_map.get((a_theme, t_theme), {})
        return {
            "Title": titles[idx],
            "Title Theme": t_theme,
            "Title PAJAIS Category": t_pajais,
            "Abstract": abstracts[idx],
            "Abstract Theme": a_theme,
            "Abstract PAJAIS Category": a_pajais,
            "Year": years[idx],
            "Source Journal": journals[idx],
            "Theme Similarity": sim_info.get("similarity_label", ""),
            "Similarity %": str(sim_info.get("similarity_pct", "")),
            "Similarity Reasoning": sim_info.get("reasoning", ""),
        }

    rows = list(map(build_row, range(row_count)))
    out_df = pd.DataFrame(rows)

    dest = DATA_DIR / "comparison.csv"
    out_df.to_csv(dest, index=False, encoding="utf-8-sig")

    return json.dumps({
        "rows": len(out_df),
        "columns": list(out_df.columns),
        "path": str(dest),
        "abstract_themes": abs_theme_names,
        "title_themes": title_theme_names,
        "note": "Enriched comparison CSV with per-paper theme + PAJAIS + similarity",
    })


# =============================================================================
# TOOL 7 — export_narrative
# ONLY runs when BOTH abstract and title runs are complete
# =============================================================================
@tool
def export_narrative() -> str:
    """Write a 500-word Section 7 narrative using themes from BOTH runs.

    Only available after BOTH abstract and title runs have completed taxonomy mapping.
    Saves to data/narrative.txt.
    """
    abs_tax = _p("abstract")["taxonomy"]
    title_tax = _p("title")["taxonomy"]
    both_done = abs_tax.exists() and title_tax.exists()

    result = (
        _do_export_narrative()
        if both_done
        else (
            "Narrative cannot be generated yet. "
            "Abstract taxonomy complete: {}. Title taxonomy complete: {}. "
            "Complete both runs through Phase 5.5 first.".format(
                abs_tax.exists(), title_tax.exists()
            )
        )
    )
    return result


def _do_export_narrative() -> str:
    """Internal: generate narrative when both runs are done.

    Sends theme names, per-theme sentence counts and both taxonomy mappings
    to the LLM, writes the reply to data/narrative.txt as UTF-8, and returns
    a JSON summary string with the word count and output path.
    """
    abs_themes = _read_json(_p("abstract")["themes"])
    title_themes = _read_json(_p("title")["themes"])
    abs_taxonomy = _read_json(_p("abstract")["taxonomy"])
    title_taxonomy = _read_json(_p("title")["taxonomy"])

    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.4)

    def summarise(theme):
        # Only the name and sentence count go into the prompt, not the text
        return {"name": theme["theme_name"], "sentences": len(theme["sentences"])}

    abs_summary = list(map(summarise, abs_themes))
    title_summary = list(map(summarise, title_themes))

    prompt = (
        "You are an academic writing expert in Information Systems.\n\n"
        "Write Section 7 (Discussion and Thematic Synthesis) of a systematic "
        "literature review paper. Approximately 500 words, formal academic prose.\n"
        "Cover:\n"
        "(a) Overview of themes from abstract analysis\n"
        "(b) Overview of themes from title analysis\n"
        "(c) Comparison: what themes appear in both vs only one\n"
        "(d) PAJAIS taxonomy mapping and implications\n"
        "(e) Implications for IS research and practice\n"
        "(f) Limitations\n\n"
        "ABSTRACT THEMES:\n" + json.dumps(abs_summary, indent=2) + "\n\n"
        "TITLE THEMES:\n" + json.dumps(title_summary, indent=2) + "\n\n"
        "ABSTRACT PAJAIS MAPPING:\n" + json.dumps(abs_taxonomy, indent=2) + "\n\n"
        "TITLE PAJAIS MAPPING:\n" + json.dumps(title_taxonomy, indent=2) + "\n\n"
        "Write in continuous academic paragraphs. No bullet points or headers."
    )

    response = llm.invoke([HumanMessage(content=prompt)])
    narrative_text = response.content

    dest = DATA_DIR / "narrative.txt"
    dest.write_text(narrative_text, encoding="utf-8")

    return json.dumps({
        "word_count": len(narrative_text.split()),
        "path": str(dest),
        "note": "Narrative combines both abstract and title run themes",
    })