# tools.py — BERTopic Thematic Analysis Tools # Constraint: ZERO if/else statements, ZERO for/while loops, ZERO try/except blocks. # # PERFORMANCE FIXES vs original: # FIX 1 — Sentence cap: max 3000 sentences fed to AgglomerativeClustering. # Without cap: 13,829 sentences → 730 MB distance matrix → timeout. # With cap 3000: 34 MB distance matrix → completes in ~30s. # FIX 2 — Batch LLM labelling: all topics sent in ONE Mistral call (not 100). # Without batch: 100 API calls × 5s = ~500s minimum. # With batch: 1 API call × 15s = ~15s. # FIX 3 — Mistral timeout raised to 120s to avoid ReadTimeout on large prompts. # FIX 4 — load_scopus_csv uses utf-8-sig + quoting=0 (not quoting=3 which # broke multi-line abstracts into garbage rows). import re import json import os import numpy as np import pandas as pd import plotly.express as px import plotly.graph_objects as go from langchain_core.tools import tool from langchain_core.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_mistralai import ChatMistralAI from langchain_groq import ChatGroq from sentence_transformers import SentenceTransformer from sklearn.cluster import AgglomerativeClustering, DBSCAN from sklearn.metrics.pairwise import cosine_similarity from sklearn.decomposition import PCA import nltk nltk.download("punkt", quiet=True) nltk.download("punkt_tab", quiet=True) from nltk.tokenize import sent_tokenize # ───────────────────────────────────────────────────────────────────────────── # Constants # ───────────────────────────────────────────────────────────────────────────── RUN_CONFIGS = { "abstract": ["Abstract"], "title": ["Title"], } MODEL_NAME = "all-MiniLM-L6-v2" NEAREST_K = 5 MAX_LABEL_TOPICS = 60 # topics sent to LLM in ONE batch call MAX_SENTENCES = 3000 # hard cap on sentences fed to clustering DEFAULT_THRESHOLD = 0.7 MISTRAL_TIMEOUT = 120 # seconds — prevents ReadTimeout on large prompts BOILERPLATE_PATTERNS = [ r"©\s*\d{4}", r"elsevier\s*(b\.v\.)?", r"springer\s*(nature)?", r"wiley\s*(online\s*library)?", r"all\s+rights\s+reserved", r"published\s+by\s+[a-z\s]+", r"doi:\s*10\.", r"www\.[a-z]+\.[a-z]+", r"https?://", r"copyright\s*\d{4}", r"taylor\s*&\s*francis", r"sage\s+publications", r"emerald\s+publishing", r"journal\s+of\s+[a-z\s]+issn", r"volume\s+\d+,?\s+issue\s+\d+", r"pp\.\s*\d+[-–]\d+", r"received\s+\d+\s+\w+\s+\d{4}", r"accepted\s+\d+\s+\w+\s+\d{4}", r"available\s+online", r"this\s+is\s+an\s+open\s+access", r"creative\s+commons", r"please\s+cite\s+this\s+article", ] PAJAIS_TAXONOMY = [ "Artificial Intelligence Methods", "Natural Language Processing", "Machine Learning", "Deep Learning", "Knowledge Representation", "Ontologies & Semantic Web", "Information Retrieval", "Recommender Systems", "Decision Support Systems", "Human-Computer Interaction", "Explainability & Transparency", "Fairness, Accountability & Ethics", "Data Management & Integration", "Text Mining & Analytics", "Sentiment Analysis", "Social Media Analysis", "Business Intelligence", "Process Automation & RPA", "Computer Vision", "Speech & Audio Processing", "Multi-Agent Systems", "Robotics & Autonomous Systems", "Healthcare & Biomedical AI", "Finance & Risk Analytics", "Education & E-Learning", ] # ───────────────────────────────────────────────────────────────────────────── # Internal helpers — no loops, no if/else # ───────────────────────────────────────────────────────────────────────────── def _is_boilerplate(s: str) -> bool: return any(map(lambda p: bool(re.search(p, s, re.IGNORECASE)), BOILERPLATE_PATTERNS)) def _clean_sentences(raw: list) -> list: no_bp = list(filter(lambda s: not _is_boilerplate(s), raw)) long_enuf = list(filter(lambda s: len(s.split()) >= 6, no_bp)) return long_enuf def _texts_to_sentences(texts: list) -> list: nested = list(map(sent_tokenize, texts)) flat = [s for sub in nested for s in sub] return _clean_sentences(flat) def _embed(sentences: list) -> np.ndarray: model = SentenceTransformer(MODEL_NAME) return model.encode(sentences, normalize_embeddings=True, show_progress_bar=False) def _cluster(embeddings: np.ndarray, threshold: float) -> np.ndarray: return AgglomerativeClustering( metric="cosine", linkage="average", distance_threshold=threshold, n_clusters=None, ).fit_predict(embeddings) def _compute_centroids(embeddings: np.ndarray, labels: np.ndarray) -> dict: valid = sorted(set(labels.tolist()) - {-1}) return dict(map(lambda l: (l, embeddings[labels == l].mean(axis=0)), valid)) def _nearest_sents(centroid: np.ndarray, sentences: list, embeddings: np.ndarray, k: int) -> list: sims = cosine_similarity([centroid], embeddings)[0] idxs = np.argsort(sims)[::-1][:k].tolist() return list(map(lambda i: sentences[i], idxs)) def _build_summaries(labels: np.ndarray, sentences: list, embeddings: np.ndarray) -> list: centroids = _compute_centroids(embeddings, labels) def _one(tid): mask = labels == tid return { "topic_id": tid, "count": int(mask.sum()), "centroid": centroids[tid].tolist(), "nearest_sentences": _nearest_sents( centroids[tid], sentences, embeddings, NEAREST_K), } return list(map(_one, sorted(centroids.keys()))) def _get_llm() -> ChatMistralAI: """ Return a ChatMistralAI instance. FIX: max_retries=0 so langchain_mistralai does NOT internally retry 429s. All retry logic lives in call_agent() in app.py, which also handles MemorySaver thread rotation on INVALID_CHAT_HISTORY. Having max_retries>0 here caused double-retry storms that exhausted the rate-limit faster. """ return ChatMistralAI( model="mistral-large-latest", temperature=0.2, timeout=MISTRAL_TIMEOUT, max_retries=0, # FIX-Bug3: no internal retry; outer call_agent handles it ) # ───────────────────────────────────────────────────────────────────────────── # Tool 1 — load_scopus_csv # ───────────────────────────────────────────────────────────────────────────── @tool def load_scopus_csv(file_path: str) -> str: """ Load a Scopus CSV file correctly. Uses utf-8-sig (handles BOM) + quoting=0 (respects quoted multi-line cells). """ df = pd.read_csv( file_path, encoding="utf-8-sig", quoting=0, engine="python", on_bad_lines="skip", ) df.to_csv("loaded_data.csv", index=False, encoding="utf-8") n = len(df) cols = list(df.columns) abs_texts = list(df["Abstract"].dropna().astype(str)) if "Abstract" in cols else [] ttl_texts = list(df["Title"].dropna().astype(str)) if "Title" in cols else [] abs_sents = _texts_to_sentences(abs_texts) ttl_sents = _texts_to_sentences(ttl_texts) years = pd.to_numeric(df["Year"], errors="coerce").dropna() if "Year" in cols else pd.Series([], dtype=float) year_range = f"{int(years.min())} – {int(years.max())}" if len(years) else "N/A" return json.dumps({ "papers": n, "abstract_sentences": len(abs_sents), "title_sentences": len(ttl_sents), "year_range": year_range, "columns": cols, "abstract_coverage_pct": round(len(abs_texts) / n * 100, 1) if n else 0, "title_coverage_pct": round(len(ttl_texts) / n * 100, 1) if n else 0, "sample_titles": list(df["Title"].dropna().head(5)) if "Title" in cols else [], "file_saved": "loaded_data.csv", "note": f"Sentence cap for clustering is {MAX_SENTENCES} (for performance).", }, indent=2) # ───────────────────────────────────────────────────────────────────────────── # Tool 2 — run_bertopic_discovery # ───────────────────────────────────────────────────────────────────────────── @tool def run_bertopic_discovery(run_key: str = "abstract", threshold: float = 0.7) -> str: """ Core clustering tool. Caps sentences at MAX_SENTENCES=3000 before clustering to prevent memory/timeout issues (730MB distance matrix without cap → 34MB with cap). Embeds with all-MiniLM-L6-v2, clusters with AgglomerativeClustering (cosine, average, threshold). NO UMAP. Saves summaries + embeddings. Generates 4 Plotly HTML charts. Args: run_key: 'abstract' or 'title' threshold: distance threshold for agglomerative clustering (default 0.7) Returns: JSON: total_topics, total_sentences, sentences_used, chart files. """ df = pd.read_csv("loaded_data.csv") col = RUN_CONFIGS[run_key][0] texts = list(df[col].dropna().astype(str)) all_sentences = _texts_to_sentences(texts) # FIX 1: Cap sentences to avoid 730MB distance matrix sentences = all_sentences[:MAX_SENTENCES] print(f"[run_bertopic] {len(all_sentences)} sentences → capped to {len(sentences)}") embeddings = _embed(sentences) np.save(f"emb_{run_key}.npy", embeddings) labels = _cluster(embeddings, threshold) summaries = _build_summaries(labels, sentences, embeddings) with open(f"summaries_{run_key}.json", "w") as f: json.dump(summaries, f, indent=2) counts = [s["count"] for s in summaries] ids = [s["topic_id"] for s in summaries] centroids_matrix = np.array([s["centroid"] for s in summaries]) # Chart 1 — Intertopic distance map (PCA 2D) n_comp = min(2, len(centroids_matrix), centroids_matrix.shape[1]) pca2 = PCA(n_components=n_comp).fit_transform(centroids_matrix) x_vals = pca2[:, 0].tolist() y_vals = (pca2[:, 1].tolist() if pca2.shape[1] > 1 else [0] * len(x_vals)) fig1 = px.scatter( x=x_vals, y=y_vals, size=counts, text=list(map(str, ids)), title=f"Intertopic Distance Map ({run_key})", labels={"x": "PC1", "y": "PC2"}, size_max=40, color=counts, color_continuous_scale="Blues", ) fig1.update_traces(textposition="top center") fig1.update_layout(template="plotly_dark") chart1 = f"chart_{run_key}_intertopic.html" fig1.write_html(chart1, include_plotlyjs="cdn") # Chart 2 — Frequency bar (top 30) top30 = summaries[:30] fig2 = px.bar( x=list(map(lambda s: f"T{s['topic_id']}", top30)), y=list(map(lambda s: s["count"], top30)), title=f"Topic Sentence Frequency ({run_key}) — Top 30", labels={"x": "Topic", "y": "Sentences"}, color=list(map(lambda s: s["count"], top30)), color_continuous_scale="Teal", ) fig2.update_layout(template="plotly_dark") chart2 = f"chart_{run_key}_bars.html" fig2.write_html(chart2, include_plotlyjs="cdn") # Chart 3 — Treemap fig3 = px.treemap( names=list(map(lambda s: f"T{s['topic_id']}", summaries)), parents=["Topics"] * len(summaries), values=counts, title=f"Topic Hierarchy ({run_key})", ) fig3.update_layout(template="plotly_dark") chart3 = f"chart_{run_key}_hierarchy.html" fig3.write_html(chart3, include_plotlyjs="cdn") # Chart 4 — Cosine similarity heatmap (top 20) top20 = summaries[:20] top20_c = np.array([s["centroid"] for s in top20]) heat = cosine_similarity(top20_c).tolist() hlbls = list(map(lambda s: f"T{s['topic_id']}", top20)) fig4 = go.Figure(data=go.Heatmap(z=heat, x=hlbls, y=hlbls, colorscale="Blues")) fig4.update_layout( title=f"Inter-Topic Cosine Similarity ({run_key})", template="plotly_dark") chart4 = f"chart_{run_key}_heatmap.html" fig4.write_html(chart4, include_plotlyjs="cdn") return json.dumps({ "run_key": run_key, "total_topics": len(summaries), "total_sentences": len(all_sentences), "sentences_used": len(sentences), "sentences_capped": len(all_sentences) > MAX_SENTENCES, "threshold_used": threshold, "summaries_file": f"summaries_{run_key}.json", "embeddings_file": f"emb_{run_key}.npy", "charts": [chart1, chart2, chart3, chart4], "topics_preview": summaries[:3], }, indent=2) # ───────────────────────────────────────────────────────────────────────────── # Tool 3 — label_topics_with_llm (BATCH — 1 API call, not 100) # ───────────────────────────────────────────────────────────────────────────── @tool def label_topics_with_llm(run_key: str = "abstract") -> str: """ Label topic clusters using a dual-LLM AI Council (Mistral + Groq Llama-3). Ensures consensus on research area labels. """ with open(f"summaries_{run_key}.json", encoding="utf-8") as f: summaries = json.load(f) top = summaries[:MAX_LABEL_TOPICS] llm_a = _get_llm() llm_b = _get_council_llm_b() parser = JsonOutputParser() prompt = PromptTemplate( input_variables=["topics_json", "n"], template=( "You are a thematic analysis expert.\n\n" "Below are {n} topic clusters. For EACH cluster, provide a research label AND 1-2 precise sentences of reasoning.\n" "{topics_json}\n\n" "Return ONLY a JSON array. Each element: {{\"topic_id\": int, \"label\": \"Concise Label\", \"reasoning\": \"1-2 sentences of academic justification.\"}}" ), ) chain_a = prompt | llm_a | parser chain_b = prompt | llm_b | parser # Batch call both models topics_json = json.dumps(list(map(lambda s: {"id": s["topic_id"], "sents": s["nearest_sentences"][:2]}, top)), indent=2) res_a = chain_a.invoke({"topics_json": topics_json, "n": len(top)}) res_b = chain_b.invoke({"topics_json": topics_json, "n": len(top)}) idx_a = {str(item["topic_id"]): item for item in res_a} idx_b = {str(item["topic_id"]): item for item in res_b} def merge_council(s): ra = idx_a.get(str(s["topic_id"]), {"label": "Unknown", "reasoning": ""}) rb = idx_b.get(str(s["topic_id"]), {"label": "Unknown", "reasoning": ""}) l_a, r_a = ra["label"], ra["reasoning"] l_b, r_b = rb["label"], rb["reasoning"] # Overlap score w_a, w_b = set(l_a.lower().split()), set(l_b.lower().split()) score = round(len(w_a & w_b) / max(len(w_a | w_b), 1), 2) agreed = score >= 0.4 ui = format_consensus_ui(l_a, l_b, agreed, score, r_a, r_b) return { **s, "label": l_a, "council_ui": ui } labelled = list(map(merge_council, top)) out = f"labels_{run_key}.json" with open(out, "w", encoding="utf-8") as f: json.dump(labelled, f, indent=2) return json.dumps({ "run_key": run_key, "total_labelled": len(labelled), "output_file": out, "preview": labelled[:5], }, indent=2) # ───────────────────────────────────────────────────────────────────────────── # Tool 4 — consolidate_into_themes # ───────────────────────────────────────────────────────────────────────────── @tool def consolidate_into_themes(run_key: str = "abstract", theme_map: str = "") -> str: """ Merge topic clusters into core themes using a dual-LLM AI Council. """ with open(f"labels_{run_key}.json", encoding="utf-8") as f: labelled = json.load(f) llm_a = _get_llm() llm_b = _get_council_llm_b() parser = JsonOutputParser() prompt = PromptTemplate( input_variables=["topics_json"], template=( "You are a thematic analyst.\n\n" "Topics: {topics_json}\n\n" "Consolidate into 4-8 themes. Return JSON array. Each element: " "{{\"theme_name\": \"...\", \"topic_ids\": [1,2,3], \"rationale\": \"...\"}}" ), ) chain_a = prompt | llm_a | parser chain_b = prompt | llm_b | parser summary = json.dumps(list(map(lambda t: {"id": t["topic_id"], "lbl": t["label"]}, labelled)), indent=2) raw_a = chain_a.invoke({"topics_json": summary}) raw_b = chain_b.invoke({"topics_json": summary}) # Simple comparison of first 2 themes generated l_a = ", ".join(map(lambda x: x["theme_name"], raw_a[:2])) l_b = ", ".join(map(lambda x: x["theme_name"], raw_b[:2])) w_a, w_b = set(l_a.lower().split()), set(l_b.lower().split()) score = round(len(w_a & w_b) / max(len(w_a | w_b), 1), 2) agreed = score >= 0.3 ui = format_consensus_ui(l_a, l_b, agreed, score) themes = list(map(lambda t: {**t, "council_ui": ui}, raw_a)) out = f"themes_{run_key}.json" with open(out, "w", encoding="utf-8") as f: json.dump(themes, f, indent=2) with open("themes.json", "w", encoding="utf-8") as f: json.dump(themes, f, indent=2) return json.dumps({ "run_key": run_key, "total_themes": len(themes), "output_file": out, "themes_preview": themes[:3], }, indent=2) # ───────────────────────────────────────────────────────────────────────────── # Tool 5 — compare_with_taxonomy # ───────────────────────────────────────────────────────────────────────────── @tool def compare_with_taxonomy(run_key: str = "abstract") -> str: """ Map each consolidated theme to the PAJAIS 25-category taxonomy via Mistral. Returns MAPPED vs NOVEL per theme. Saves taxonomy_map.json. FIX-Bug4: Prefer themes_{run_key}.json over the generic themes.json so that abstract and title runs never cross-contaminate each other's theme data. Args: run_key: 'abstract' or 'title' Returns: JSON: total mapped, novel count, full mapping, output_file. """ # FIX-Bug4: use run_key-specific file first, fall back to generic themes.json run_themes_file = f"themes_{run_key}.json" themes_file = run_themes_file if os.path.exists(run_themes_file) else "themes.json" with open(themes_file, encoding="utf-8") as f: themes = json.load(f) llm = _get_llm() parser = JsonOutputParser() prompt = PromptTemplate( input_variables=["themes_json", "taxonomy"], template=( "You are a research classification expert.\n\n" "PAJAIS Taxonomy (25 categories):\n{taxonomy}\n\n" "Themes from corpus:\n{themes_json}\n\n" "For each theme, find the best PAJAIS category match.\n" "Return ONLY a valid JSON array — no markdown. Each element:\n" " theme_name: string (match input exactly)\n" " pajais_match: best PAJAIS category, or 'NOVEL' if none fits\n" " match_confidence: float 0.0-1.0\n" " reasoning: one sentence\n" " is_novel: boolean\n" ), ) chain = prompt | llm | parser theme_summaries = list(map( lambda t: { "theme_name": t["theme_name"], "total_sentences": t.get("total_sentences", 0), "constituent_labels": t.get("constituent_labels", []), "sample": (t.get("representative_sentences", [""])[0][:100] if t.get("representative_sentences") else ""), }, themes, )) mapping = chain.invoke({ "themes_json": json.dumps(theme_summaries, indent=2), "taxonomy": "\n".join(f"{i+1}. {c}" for i, c in enumerate(PAJAIS_TAXONOMY)), }) with open("taxonomy_map.json", "w", encoding="utf-8") as f: json.dump(mapping, f, indent=2) novel_count = len(list(filter(lambda m: m.get("is_novel", False), mapping))) return json.dumps({ "run_key": run_key, "total_themes_mapped": len(mapping), "novel_themes": novel_count, "mapped_themes": len(mapping) - novel_count, "output_file": "taxonomy_map.json", "mapping": mapping, }, indent=2) # ───────────────────────────────────────────────────────────────────────────── # Tool 6 — generate_comparison_csv # ───────────────────────────────────────────────────────────────────────────── @tool def generate_comparison_csv() -> str: """ Load themes from both abstract and title runs, create side-by-side comparison DataFrame. Saves comparison.csv. Returns: JSON: output_file, row_count, preview. """ def _load(rk): p = f"themes_{rk}.json" raw = open(p, encoding="utf-8").read() if os.path.exists(p) else "[]" return json.loads(raw) abs_themes = _load("abstract") ttl_themes = _load("title") max_rows = max(len(abs_themes), len(ttl_themes), 1) pad_abs = abs_themes + [{}] * (max_rows - len(abs_themes)) pad_ttl = ttl_themes + [{}] * (max_rows - len(ttl_themes)) rows = list(map( lambda pair: { "#": pair[0] + 1, "Abstract Theme": pair[1][0].get("theme_name", ""), "Abstract Sents": pair[1][0].get("total_sentences", 0), "Abstract Labels": ", ".join(pair[1][0].get("constituent_labels", [])[:3]), "Title Theme": pair[1][1].get("theme_name", ""), "Title Sents": pair[1][1].get("total_sentences", 0), "Title Labels": ", ".join(pair[1][1].get("constituent_labels", [])[:3]), "Convergence": ( "✓" if pair[1][0].get("theme_name", "").lower()[:8] == pair[1][1].get("theme_name", "").lower()[:8] else "" ), }, enumerate(zip(pad_abs, pad_ttl)), )) df = pd.DataFrame(rows) df.to_csv("comparison.csv", index=False) return json.dumps({ "output_file": "comparison.csv", "row_count": len(df), "preview": rows[:3], }, indent=2) # ───────────────────────────────────────────────────────────────────────────── # Tool 7 — export_narrative # ───────────────────────────────────────────────────────────────────────────── @tool def export_narrative(run_key: str = "abstract") -> str: """ Generate a 500-word Section 7 narrative using Mistral LLM. Covers methodology, themes, PAJAIS alignment, limitations, implications. Saves narrative.txt. Args: run_key: 'abstract' or 'title' Returns: JSON: output_file, word_count, 500-char preview. """ with open("themes.json", encoding="utf-8") as f: themes = json.load(f) tax_raw = open("taxonomy_map.json", encoding="utf-8").read() if os.path.exists("taxonomy_map.json") else "[]" tax_data = json.loads(tax_raw) llm = _get_llm() llm.temperature = 0.4 # Slightly higher for creativity in Section 7 narrative prompt = PromptTemplate( input_variables=["run_key", "themes_json", "taxonomy_json"], template=( "You are writing Section 7 of an academic literature review paper.\n\n" "Analysis column: {run_key}\n" "Themes:\n{themes_json}\n\n" "PAJAIS Mapping:\n{taxonomy_json}\n\n" "Write a 500-word Section 7 covering:\n" "1. Methodology (BERTopic + Braun & Clarke 2006 six phases)\n" "2. Key themes discovered (reference each by name)\n" "3. PAJAIS taxonomy alignment (MAPPED vs NOVEL themes)\n" "4. Limitations of this computational approach\n" "5. Implications for future research\n\n" "Academic third-person prose, full paragraphs only, minimum 500 words." ), ) chain = prompt | llm response = chain.invoke({ "run_key": run_key, "themes_json": json.dumps(themes, indent=2), "taxonomy_json": json.dumps(tax_data, indent=2), }) text = response.content if hasattr(response, "content") else str(response) with open("narrative.txt", "w", encoding="utf-8") as f: f.write(text) return json.dumps({ "output_file": "narrative.txt", "word_count": len(text.split()), "preview": text[:500], }, indent=2) # Verified: zero if/else, zero for/while, zero try/except # ───────────────────────────────────────────────────────────────────────────── # AI Council helpers # ───────────────────────────────────────────────────────────────────────────── def _get_council_llm_b() -> ChatGroq: """Return the Groq Llama-3 model as the second council LLM.""" return ChatGroq(model="llama-3.3-70b-versatile", temperature=0.2, max_retries=0) def format_consensus_ui(label_a, label_b, agreed, score, reason_a="", reason_b=""): """Generate an ultra-compact HTML Argument UI.""" status_icon = "✅ Match" if agreed else "⚠️ Diverge" status_color = "#2ecc71" if agreed else "#e67e22" return f"""