|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| import re
|
| import json
|
| import os
|
| import numpy as np
|
| import pandas as pd
|
| import plotly.express as px
|
| import plotly.graph_objects as go
|
| from langchain_core.tools import tool
|
| from langchain_core.prompts import PromptTemplate
|
| from langchain_core.output_parsers import JsonOutputParser
|
| from langchain_mistralai import ChatMistralAI
|
| from langchain_groq import ChatGroq
|
| from sentence_transformers import SentenceTransformer
|
| from sklearn.cluster import AgglomerativeClustering, DBSCAN
|
| from sklearn.metrics.pairwise import cosine_similarity
|
| from sklearn.decomposition import PCA
|
| import nltk
|
|
|
| nltk.download("punkt", quiet=True)
|
| nltk.download("punkt_tab", quiet=True)
|
| from nltk.tokenize import sent_tokenize
|
|
|
|
|
|
|
|
|
| RUN_CONFIGS = {
|
| "abstract": ["Abstract"],
|
| "title": ["Title"],
|
| }
|
|
|
| MODEL_NAME = "all-MiniLM-L6-v2"
|
| NEAREST_K = 5
|
| MAX_LABEL_TOPICS = 60
|
| MAX_SENTENCES = 3000
|
| DEFAULT_THRESHOLD = 0.7
|
| MISTRAL_TIMEOUT = 120
|
|
|
| BOILERPLATE_PATTERNS = [
|
| r"Β©\s*\d{4}",
|
| r"elsevier\s*(b\.v\.)?",
|
| r"springer\s*(nature)?",
|
| r"wiley\s*(online\s*library)?",
|
| r"all\s+rights\s+reserved",
|
| r"published\s+by\s+[a-z\s]+",
|
| r"doi:\s*10\.",
|
| r"www\.[a-z]+\.[a-z]+",
|
| r"https?://",
|
| r"copyright\s*\d{4}",
|
| r"taylor\s*&\s*francis",
|
| r"sage\s+publications",
|
| r"emerald\s+publishing",
|
| r"journal\s+of\s+[a-z\s]+issn",
|
| r"volume\s+\d+,?\s+issue\s+\d+",
|
| r"pp\.\s*\d+[-β]\d+",
|
| r"received\s+\d+\s+\w+\s+\d{4}",
|
| r"accepted\s+\d+\s+\w+\s+\d{4}",
|
| r"available\s+online",
|
| r"this\s+is\s+an\s+open\s+access",
|
| r"creative\s+commons",
|
| r"please\s+cite\s+this\s+article",
|
| ]
|
|
|
| PAJAIS_TAXONOMY = [
|
| "Artificial Intelligence Methods",
|
| "Natural Language Processing",
|
| "Machine Learning",
|
| "Deep Learning",
|
| "Knowledge Representation",
|
| "Ontologies & Semantic Web",
|
| "Information Retrieval",
|
| "Recommender Systems",
|
| "Decision Support Systems",
|
| "Human-Computer Interaction",
|
| "Explainability & Transparency",
|
| "Fairness, Accountability & Ethics",
|
| "Data Management & Integration",
|
| "Text Mining & Analytics",
|
| "Sentiment Analysis",
|
| "Social Media Analysis",
|
| "Business Intelligence",
|
| "Process Automation & RPA",
|
| "Computer Vision",
|
| "Speech & Audio Processing",
|
| "Multi-Agent Systems",
|
| "Robotics & Autonomous Systems",
|
| "Healthcare & Biomedical AI",
|
| "Finance & Risk Analytics",
|
| "Education & E-Learning",
|
| ]
|
|
|
|
|
|
|
|
|
|
|
| def _is_boilerplate(s: str) -> bool:
|
| return any(map(lambda p: bool(re.search(p, s, re.IGNORECASE)), BOILERPLATE_PATTERNS))
|
|
|
|
|
| def _clean_sentences(raw: list) -> list:
|
| no_bp = list(filter(lambda s: not _is_boilerplate(s), raw))
|
| long_enuf = list(filter(lambda s: len(s.split()) >= 6, no_bp))
|
| return long_enuf
|
|
|
|
|
| def _texts_to_sentences(texts: list) -> list:
|
| nested = list(map(sent_tokenize, texts))
|
| flat = [s for sub in nested for s in sub]
|
| return _clean_sentences(flat)
|
|
|
|
|
| def _embed(sentences: list) -> np.ndarray:
|
| model = SentenceTransformer(MODEL_NAME)
|
| return model.encode(sentences, normalize_embeddings=True, show_progress_bar=False)
|
|
|
|
|
| def _cluster(embeddings: np.ndarray, threshold: float) -> np.ndarray:
|
| return AgglomerativeClustering(
|
| metric="cosine", linkage="average",
|
| distance_threshold=threshold, n_clusters=None,
|
| ).fit_predict(embeddings)
|
|
|
|
|
| def _compute_centroids(embeddings: np.ndarray, labels: np.ndarray) -> dict:
|
| valid = sorted(set(labels.tolist()) - {-1})
|
| return dict(map(lambda l: (l, embeddings[labels == l].mean(axis=0)), valid))
|
|
|
|
|
| def _nearest_sents(centroid: np.ndarray, sentences: list,
|
| embeddings: np.ndarray, k: int) -> list:
|
| sims = cosine_similarity([centroid], embeddings)[0]
|
| idxs = np.argsort(sims)[::-1][:k].tolist()
|
| return list(map(lambda i: sentences[i], idxs))
|
|
|
|
|
| def _build_summaries(labels: np.ndarray, sentences: list,
|
| embeddings: np.ndarray) -> list:
|
| centroids = _compute_centroids(embeddings, labels)
|
|
|
| def _one(tid):
|
| mask = labels == tid
|
| return {
|
| "topic_id": tid,
|
| "count": int(mask.sum()),
|
| "centroid": centroids[tid].tolist(),
|
| "nearest_sentences": _nearest_sents(
|
| centroids[tid], sentences, embeddings, NEAREST_K),
|
| }
|
| return list(map(_one, sorted(centroids.keys())))
|
|
|
|
|
| def _get_llm() -> ChatMistralAI:
|
| """
|
| Return a ChatMistralAI instance.
|
| FIX: max_retries=0 so langchain_mistralai does NOT internally retry 429s.
|
| All retry logic lives in call_agent() in app.py, which also handles
|
| MemorySaver thread rotation on INVALID_CHAT_HISTORY. Having max_retries>0
|
| here caused double-retry storms that exhausted the rate-limit faster.
|
| """
|
| return ChatMistralAI(
|
| model="mistral-large-latest",
|
| temperature=0.2,
|
| timeout=MISTRAL_TIMEOUT,
|
| max_retries=0,
|
| )
|
|
|
|
|
|
|
|
|
|
|
| @tool
|
| def load_scopus_csv(file_path: str) -> str:
|
| """
|
| Load a Scopus CSV file correctly.
|
| Uses utf-8-sig (handles BOM) + quoting=0 (respects quoted multi-line cells).
|
| """
|
| df = pd.read_csv(
|
| file_path,
|
| encoding="utf-8-sig",
|
| quoting=0,
|
| engine="python",
|
| on_bad_lines="skip",
|
| )
|
| df.to_csv("loaded_data.csv", index=False, encoding="utf-8")
|
|
|
| n = len(df)
|
| cols = list(df.columns)
|
|
|
| abs_texts = list(df["Abstract"].dropna().astype(str)) if "Abstract" in cols else []
|
| ttl_texts = list(df["Title"].dropna().astype(str)) if "Title" in cols else []
|
|
|
| abs_sents = _texts_to_sentences(abs_texts)
|
| ttl_sents = _texts_to_sentences(ttl_texts)
|
|
|
| years = pd.to_numeric(df["Year"], errors="coerce").dropna() if "Year" in cols else pd.Series([], dtype=float)
|
| year_range = f"{int(years.min())} β {int(years.max())}" if len(years) else "N/A"
|
|
|
| return json.dumps({
|
| "papers": n,
|
| "abstract_sentences": len(abs_sents),
|
| "title_sentences": len(ttl_sents),
|
| "year_range": year_range,
|
| "columns": cols,
|
| "abstract_coverage_pct": round(len(abs_texts) / n * 100, 1) if n else 0,
|
| "title_coverage_pct": round(len(ttl_texts) / n * 100, 1) if n else 0,
|
| "sample_titles": list(df["Title"].dropna().head(5)) if "Title" in cols else [],
|
| "file_saved": "loaded_data.csv",
|
| "note": f"Sentence cap for clustering is {MAX_SENTENCES} (for performance).",
|
| }, indent=2)
|
|
|
|
|
|
|
|
|
|
|
| @tool
|
| def run_bertopic_discovery(run_key: str = "abstract", threshold: float = 0.7) -> str:
|
| """
|
| Core clustering tool.
|
| Caps sentences at MAX_SENTENCES=3000 before clustering to prevent
|
| memory/timeout issues (730MB distance matrix without cap β 34MB with cap).
|
| Embeds with all-MiniLM-L6-v2, clusters with AgglomerativeClustering
|
| (cosine, average, threshold). NO UMAP. Saves summaries + embeddings.
|
| Generates 4 Plotly HTML charts.
|
|
|
| Args:
|
| run_key: 'abstract' or 'title'
|
| threshold: distance threshold for agglomerative clustering (default 0.7)
|
|
|
| Returns:
|
| JSON: total_topics, total_sentences, sentences_used, chart files.
|
| """
|
| df = pd.read_csv("loaded_data.csv")
|
| col = RUN_CONFIGS[run_key][0]
|
| texts = list(df[col].dropna().astype(str))
|
|
|
| all_sentences = _texts_to_sentences(texts)
|
|
|
|
|
| sentences = all_sentences[:MAX_SENTENCES]
|
| print(f"[run_bertopic] {len(all_sentences)} sentences β capped to {len(sentences)}")
|
|
|
| embeddings = _embed(sentences)
|
| np.save(f"emb_{run_key}.npy", embeddings)
|
|
|
| labels = _cluster(embeddings, threshold)
|
| summaries = _build_summaries(labels, sentences, embeddings)
|
|
|
| with open(f"summaries_{run_key}.json", "w") as f:
|
| json.dump(summaries, f, indent=2)
|
|
|
| counts = [s["count"] for s in summaries]
|
| ids = [s["topic_id"] for s in summaries]
|
| centroids_matrix = np.array([s["centroid"] for s in summaries])
|
|
|
|
|
| n_comp = min(2, len(centroids_matrix), centroids_matrix.shape[1])
|
| pca2 = PCA(n_components=n_comp).fit_transform(centroids_matrix)
|
| x_vals = pca2[:, 0].tolist()
|
| y_vals = (pca2[:, 1].tolist() if pca2.shape[1] > 1 else [0] * len(x_vals))
|
|
|
| fig1 = px.scatter(
|
| x=x_vals, y=y_vals,
|
| size=counts, text=list(map(str, ids)),
|
| title=f"Intertopic Distance Map ({run_key})",
|
| labels={"x": "PC1", "y": "PC2"},
|
| size_max=40, color=counts, color_continuous_scale="Blues",
|
| )
|
| fig1.update_traces(textposition="top center")
|
| fig1.update_layout(template="plotly_dark")
|
| chart1 = f"chart_{run_key}_intertopic.html"
|
| fig1.write_html(chart1, include_plotlyjs="cdn")
|
|
|
|
|
| top30 = summaries[:30]
|
| fig2 = px.bar(
|
| x=list(map(lambda s: f"T{s['topic_id']}", top30)),
|
| y=list(map(lambda s: s["count"], top30)),
|
| title=f"Topic Sentence Frequency ({run_key}) β Top 30",
|
| labels={"x": "Topic", "y": "Sentences"},
|
| color=list(map(lambda s: s["count"], top30)),
|
| color_continuous_scale="Teal",
|
| )
|
| fig2.update_layout(template="plotly_dark")
|
| chart2 = f"chart_{run_key}_bars.html"
|
| fig2.write_html(chart2, include_plotlyjs="cdn")
|
|
|
|
|
| fig3 = px.treemap(
|
| names=list(map(lambda s: f"T{s['topic_id']}", summaries)),
|
| parents=["Topics"] * len(summaries),
|
| values=counts,
|
| title=f"Topic Hierarchy ({run_key})",
|
| )
|
| fig3.update_layout(template="plotly_dark")
|
| chart3 = f"chart_{run_key}_hierarchy.html"
|
| fig3.write_html(chart3, include_plotlyjs="cdn")
|
|
|
|
|
| top20 = summaries[:20]
|
| top20_c = np.array([s["centroid"] for s in top20])
|
| heat = cosine_similarity(top20_c).tolist()
|
| hlbls = list(map(lambda s: f"T{s['topic_id']}", top20))
|
| fig4 = go.Figure(data=go.Heatmap(z=heat, x=hlbls, y=hlbls, colorscale="Blues"))
|
| fig4.update_layout(
|
| title=f"Inter-Topic Cosine Similarity ({run_key})", template="plotly_dark")
|
| chart4 = f"chart_{run_key}_heatmap.html"
|
| fig4.write_html(chart4, include_plotlyjs="cdn")
|
|
|
| return json.dumps({
|
| "run_key": run_key,
|
| "total_topics": len(summaries),
|
| "total_sentences": len(all_sentences),
|
| "sentences_used": len(sentences),
|
| "sentences_capped": len(all_sentences) > MAX_SENTENCES,
|
| "threshold_used": threshold,
|
| "summaries_file": f"summaries_{run_key}.json",
|
| "embeddings_file": f"emb_{run_key}.npy",
|
| "charts": [chart1, chart2, chart3, chart4],
|
| "topics_preview": summaries[:3],
|
| }, indent=2)
|
|
|
|
|
|
|
|
|
|
|
| @tool
|
| def label_topics_with_llm(run_key: str = "abstract") -> str:
|
| """
|
| Label topic clusters using a dual-LLM AI Council (Mistral + Groq Llama-3).
|
| Ensures consensus on research area labels.
|
| """
|
| with open(f"summaries_{run_key}.json", encoding="utf-8") as f:
|
| summaries = json.load(f)
|
|
|
| top = summaries[:MAX_LABEL_TOPICS]
|
| llm_a = _get_llm()
|
| llm_b = _get_council_llm_b()
|
| parser = JsonOutputParser()
|
|
|
| prompt = PromptTemplate(
|
| input_variables=["topics_json", "n"],
|
| template=(
|
| "You are a thematic analysis expert.\n\n"
|
| "Below are {n} topic clusters. For EACH cluster, provide a research label AND 1-2 precise sentences of reasoning.\n"
|
| "{topics_json}\n\n"
|
| "Return ONLY a JSON array. Each element: {{\"topic_id\": int, \"label\": \"Concise Label\", \"reasoning\": \"1-2 sentences of academic justification.\"}}"
|
| ),
|
| )
|
| chain_a = prompt | llm_a | parser
|
| chain_b = prompt | llm_b | parser
|
|
|
|
|
| topics_json = json.dumps(list(map(lambda s: {"id": s["topic_id"], "sents": s["nearest_sentences"][:2]}, top)), indent=2)
|
| res_a = chain_a.invoke({"topics_json": topics_json, "n": len(top)})
|
| res_b = chain_b.invoke({"topics_json": topics_json, "n": len(top)})
|
|
|
| idx_a = {str(item["topic_id"]): item for item in res_a}
|
| idx_b = {str(item["topic_id"]): item for item in res_b}
|
|
|
| def merge_council(s):
|
| ra = idx_a.get(str(s["topic_id"]), {"label": "Unknown", "reasoning": ""})
|
| rb = idx_b.get(str(s["topic_id"]), {"label": "Unknown", "reasoning": ""})
|
| l_a, r_a = ra["label"], ra["reasoning"]
|
| l_b, r_b = rb["label"], rb["reasoning"]
|
|
|
|
|
| w_a, w_b = set(l_a.lower().split()), set(l_b.lower().split())
|
| score = round(len(w_a & w_b) / max(len(w_a | w_b), 1), 2)
|
| agreed = score >= 0.4
|
|
|
| ui = format_consensus_ui(l_a, l_b, agreed, score, r_a, r_b)
|
| return {
|
| **s, "label": l_a,
|
| "council_ui": ui
|
| }
|
|
|
| labelled = list(map(merge_council, top))
|
| out = f"labels_{run_key}.json"
|
| with open(out, "w", encoding="utf-8") as f:
|
| json.dump(labelled, f, indent=2)
|
|
|
| return json.dumps({
|
| "run_key": run_key,
|
| "total_labelled": len(labelled),
|
| "output_file": out,
|
| "preview": labelled[:5],
|
| }, indent=2)
|
|
|
|
|
|
|
|
|
|
|
| @tool
|
| def consolidate_into_themes(run_key: str = "abstract", theme_map: str = "") -> str:
|
| """
|
| Merge topic clusters into core themes using a dual-LLM AI Council.
|
| """
|
| with open(f"labels_{run_key}.json", encoding="utf-8") as f:
|
| labelled = json.load(f)
|
|
|
| llm_a = _get_llm()
|
| llm_b = _get_council_llm_b()
|
| parser = JsonOutputParser()
|
|
|
| prompt = PromptTemplate(
|
| input_variables=["topics_json"],
|
| template=(
|
| "You are a thematic analyst.\n\n"
|
| "Topics: {topics_json}\n\n"
|
| "Consolidate into 4-8 themes. Return JSON array. Each element: "
|
| "{{\"theme_name\": \"...\", \"topic_ids\": [1,2,3], \"rationale\": \"...\"}}"
|
| ),
|
| )
|
| chain_a = prompt | llm_a | parser
|
| chain_b = prompt | llm_b | parser
|
|
|
| summary = json.dumps(list(map(lambda t: {"id": t["topic_id"], "lbl": t["label"]}, labelled)), indent=2)
|
| raw_a = chain_a.invoke({"topics_json": summary})
|
| raw_b = chain_b.invoke({"topics_json": summary})
|
|
|
|
|
| l_a = ", ".join(map(lambda x: x["theme_name"], raw_a[:2]))
|
| l_b = ", ".join(map(lambda x: x["theme_name"], raw_b[:2]))
|
| w_a, w_b = set(l_a.lower().split()), set(l_b.lower().split())
|
| score = round(len(w_a & w_b) / max(len(w_a | w_b), 1), 2)
|
| agreed = score >= 0.3
|
| ui = format_consensus_ui(l_a, l_b, agreed, score)
|
|
|
| themes = list(map(lambda t: {**t, "council_ui": ui}, raw_a))
|
|
|
| out = f"themes_{run_key}.json"
|
| with open(out, "w", encoding="utf-8") as f:
|
| json.dump(themes, f, indent=2)
|
| with open("themes.json", "w", encoding="utf-8") as f:
|
| json.dump(themes, f, indent=2)
|
|
|
| return json.dumps({
|
| "run_key": run_key,
|
| "total_themes": len(themes),
|
| "output_file": out,
|
| "themes_preview": themes[:3],
|
| }, indent=2)
|
|
|
|
|
|
|
|
|
|
|
| @tool
|
| def compare_with_taxonomy(run_key: str = "abstract") -> str:
|
| """
|
| Map each consolidated theme to the PAJAIS 25-category taxonomy via Mistral.
|
| Returns MAPPED vs NOVEL per theme. Saves taxonomy_map.json.
|
|
|
| FIX-Bug4: Prefer themes_{run_key}.json over the generic themes.json so that
|
| abstract and title runs never cross-contaminate each other's theme data.
|
|
|
| Args:
|
| run_key: 'abstract' or 'title'
|
|
|
| Returns:
|
| JSON: total mapped, novel count, full mapping, output_file.
|
| """
|
|
|
| run_themes_file = f"themes_{run_key}.json"
|
| themes_file = run_themes_file if os.path.exists(run_themes_file) else "themes.json"
|
| with open(themes_file, encoding="utf-8") as f:
|
| themes = json.load(f)
|
|
|
| llm = _get_llm()
|
| parser = JsonOutputParser()
|
|
|
| prompt = PromptTemplate(
|
| input_variables=["themes_json", "taxonomy"],
|
| template=(
|
| "You are a research classification expert.\n\n"
|
| "PAJAIS Taxonomy (25 categories):\n{taxonomy}\n\n"
|
| "Themes from corpus:\n{themes_json}\n\n"
|
| "For each theme, find the best PAJAIS category match.\n"
|
| "Return ONLY a valid JSON array β no markdown. Each element:\n"
|
| " theme_name: string (match input exactly)\n"
|
| " pajais_match: best PAJAIS category, or 'NOVEL' if none fits\n"
|
| " match_confidence: float 0.0-1.0\n"
|
| " reasoning: one sentence\n"
|
| " is_novel: boolean\n"
|
| ),
|
| )
|
| chain = prompt | llm | parser
|
|
|
| theme_summaries = list(map(
|
| lambda t: {
|
| "theme_name": t["theme_name"],
|
| "total_sentences": t.get("total_sentences", 0),
|
| "constituent_labels": t.get("constituent_labels", []),
|
| "sample": (t.get("representative_sentences", [""])[0][:100]
|
| if t.get("representative_sentences") else ""),
|
| },
|
| themes,
|
| ))
|
|
|
| mapping = chain.invoke({
|
| "themes_json": json.dumps(theme_summaries, indent=2),
|
| "taxonomy": "\n".join(f"{i+1}. {c}" for i, c in enumerate(PAJAIS_TAXONOMY)),
|
| })
|
|
|
| with open("taxonomy_map.json", "w", encoding="utf-8") as f:
|
| json.dump(mapping, f, indent=2)
|
|
|
| novel_count = len(list(filter(lambda m: m.get("is_novel", False), mapping)))
|
|
|
| return json.dumps({
|
| "run_key": run_key,
|
| "total_themes_mapped": len(mapping),
|
| "novel_themes": novel_count,
|
| "mapped_themes": len(mapping) - novel_count,
|
| "output_file": "taxonomy_map.json",
|
| "mapping": mapping,
|
| }, indent=2)
|
|
|
|
|
|
|
|
|
|
|
| @tool
|
| def generate_comparison_csv() -> str:
|
| """
|
| Load themes from both abstract and title runs, create side-by-side
|
| comparison DataFrame. Saves comparison.csv.
|
|
|
| Returns:
|
| JSON: output_file, row_count, preview.
|
| """
|
| def _load(rk):
|
| p = f"themes_{rk}.json"
|
| raw = open(p, encoding="utf-8").read() if os.path.exists(p) else "[]"
|
| return json.loads(raw)
|
|
|
| abs_themes = _load("abstract")
|
| ttl_themes = _load("title")
|
| max_rows = max(len(abs_themes), len(ttl_themes), 1)
|
|
|
| pad_abs = abs_themes + [{}] * (max_rows - len(abs_themes))
|
| pad_ttl = ttl_themes + [{}] * (max_rows - len(ttl_themes))
|
|
|
| rows = list(map(
|
| lambda pair: {
|
| "#": pair[0] + 1,
|
| "Abstract Theme": pair[1][0].get("theme_name", ""),
|
| "Abstract Sents": pair[1][0].get("total_sentences", 0),
|
| "Abstract Labels": ", ".join(pair[1][0].get("constituent_labels", [])[:3]),
|
| "Title Theme": pair[1][1].get("theme_name", ""),
|
| "Title Sents": pair[1][1].get("total_sentences", 0),
|
| "Title Labels": ", ".join(pair[1][1].get("constituent_labels", [])[:3]),
|
| "Convergence": (
|
| "β" if pair[1][0].get("theme_name", "").lower()[:8]
|
| == pair[1][1].get("theme_name", "").lower()[:8]
|
| else ""
|
| ),
|
| },
|
| enumerate(zip(pad_abs, pad_ttl)),
|
| ))
|
|
|
| df = pd.DataFrame(rows)
|
| df.to_csv("comparison.csv", index=False)
|
|
|
| return json.dumps({
|
| "output_file": "comparison.csv",
|
| "row_count": len(df),
|
| "preview": rows[:3],
|
| }, indent=2)
|
|
|
|
|
|
|
|
|
|
|
| @tool
|
| def export_narrative(run_key: str = "abstract") -> str:
|
| """
|
| Generate a 500-word Section 7 narrative using Mistral LLM.
|
| Covers methodology, themes, PAJAIS alignment, limitations, implications.
|
| Saves narrative.txt.
|
|
|
| Args:
|
| run_key: 'abstract' or 'title'
|
|
|
| Returns:
|
| JSON: output_file, word_count, 500-char preview.
|
| """
|
| with open("themes.json", encoding="utf-8") as f:
|
| themes = json.load(f)
|
|
|
| tax_raw = open("taxonomy_map.json", encoding="utf-8").read() if os.path.exists("taxonomy_map.json") else "[]"
|
| tax_data = json.loads(tax_raw)
|
|
|
| llm = _get_llm()
|
| llm.temperature = 0.4
|
| prompt = PromptTemplate(
|
| input_variables=["run_key", "themes_json", "taxonomy_json"],
|
| template=(
|
| "You are writing Section 7 of an academic literature review paper.\n\n"
|
| "Analysis column: {run_key}\n"
|
| "Themes:\n{themes_json}\n\n"
|
| "PAJAIS Mapping:\n{taxonomy_json}\n\n"
|
| "Write a 500-word Section 7 covering:\n"
|
| "1. Methodology (BERTopic + Braun & Clarke 2006 six phases)\n"
|
| "2. Key themes discovered (reference each by name)\n"
|
| "3. PAJAIS taxonomy alignment (MAPPED vs NOVEL themes)\n"
|
| "4. Limitations of this computational approach\n"
|
| "5. Implications for future research\n\n"
|
| "Academic third-person prose, full paragraphs only, minimum 500 words."
|
| ),
|
| )
|
| chain = prompt | llm
|
| response = chain.invoke({
|
| "run_key": run_key,
|
| "themes_json": json.dumps(themes, indent=2),
|
| "taxonomy_json": json.dumps(tax_data, indent=2),
|
| })
|
| text = response.content if hasattr(response, "content") else str(response)
|
|
|
| with open("narrative.txt", "w", encoding="utf-8") as f:
|
| f.write(text)
|
|
|
| return json.dumps({
|
| "output_file": "narrative.txt",
|
| "word_count": len(text.split()),
|
| "preview": text[:500],
|
| }, indent=2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| def _get_council_llm_b() -> ChatGroq:
|
| """Return the Groq Llama-3 model as the second council LLM."""
|
| return ChatGroq(model="llama-3.3-70b-versatile", temperature=0.2, max_retries=0)
|
|
|
|
|
| def format_consensus_ui(label_a, label_b, agreed, score, reason_a="", reason_b=""):
|
| """Generate an ultra-compact HTML Argument UI."""
|
| status_icon = "β
Match" if agreed else "β οΈ Diverge"
|
| status_color = "#2ecc71" if agreed else "#e67e22"
|
|
|
| return f"""
|
| <div style="margin-top:4px; border-left: 2px solid {status_color}; padding-left:8px; font-size:0.75rem;">
|
| <div style="color:{status_color}; font-weight:700; margin-bottom:2px;">{status_icon} ({score})</div>
|
| <div style="display:flex; gap:10px;">
|
| <div style="flex:1; background:#0d1117; padding:6px; border-radius:4px; border:1px solid #30363d;">
|
| <b style="color:#7fb3f5; font-size:0.65rem;">MISTRAL:</b> {reason_a}
|
| </div>
|
| <div style="flex:1; background:#0d1117; padding:6px; border-radius:4px; border:1px solid #30363d;">
|
| <b style="color:#7fb3f5; font-size:0.65rem;">GROQ:</b> {reason_b}
|
| </div>
|
| </div>
|
| </div>
|
| """
|
|
|
|
|
| def _council_agreement_score(label_a: str, label_b: str) -> float:
|
| """Compute word-level Jaccard similarity between two label strings."""
|
| words_a = set(label_a.lower().split())
|
| words_b = set(label_b.lower().split())
|
| intersection = words_a & words_b
|
| union = words_a | words_b
|
| return round(len(intersection) / max(len(union), 1), 3)
|
|
|
|
|
|
|
|
|
|
|
| @tool
|
| def run_dbscan_clustering(run_key: str = "abstract", eps: float = 0.3, min_samples: int = 3) -> str:
|
| """
|
| Run DBSCAN clustering on the SAME embeddings produced by run_bertopic_discovery.
|
| Operates in 384-dim cosine space (no UMAP), complementing the existing
|
| AgglomerativeClustering results. Outputs stored separately β does NOT overwrite
|
| agglomerative results.
|
|
|
| Uses sklearn DBSCAN with metric='cosine', algorithm='brute'.
|
| Noise points (label=-1) are reported but excluded from cluster summaries.
|
|
|
| Args:
|
| run_key: 'abstract' or 'title'
|
| eps: Maximum cosine distance between points in same cluster (default 0.3)
|
| min_samples: Minimum points to form a core (default 3)
|
|
|
| Returns:
|
| JSON: n_clusters, noise_points, largest_cluster, summaries_file, chart files.
|
| """
|
| embeddings = np.load(f"emb_{run_key}.npy")
|
|
|
|
|
| with open(f"summaries_{run_key}.json", encoding="utf-8") as f:
|
| agg_summaries = json.load(f)
|
|
|
|
|
|
|
| all_nearest = [s for summ in agg_summaries for s in summ.get("nearest_sentences", [])]
|
|
|
| db = DBSCAN(eps=eps, min_samples=min_samples, metric="cosine", algorithm="brute")
|
| db_labels = db.fit_predict(embeddings)
|
|
|
| valid_ids = sorted(set(db_labels.tolist()) - {-1})
|
| noise_count = int((db_labels == -1).sum())
|
|
|
| centroids = _compute_centroids(embeddings, db_labels)
|
|
|
| def _dbscan_summary(cid):
|
| mask = db_labels == cid
|
| count = int(mask.sum())
|
| sents = _nearest_sents(centroids[cid],
|
| all_nearest or [f"Cluster {cid}"],
|
| embeddings[: len(all_nearest or ["x"])],
|
| min(3, len(all_nearest or ["x"])))
|
| return {
|
| "cluster_id": cid,
|
| "count": count,
|
| "centroid": centroids[cid].tolist(),
|
| "nearest_sentences": sents,
|
| "source": "dbscan",
|
| }
|
|
|
| summaries = list(map(_dbscan_summary, valid_ids))
|
|
|
| out_file = f"dbscan_summaries_{run_key}.json"
|
| with open(out_file, "w", encoding="utf-8") as f:
|
| json.dump(summaries, f, indent=2)
|
|
|
|
|
| n_comp = min(2, len(embeddings), embeddings.shape[1])
|
| pca2 = PCA(n_components=n_comp).fit_transform(embeddings)
|
| x_vals = pca2[:, 0].tolist()
|
| y_vals = pca2[:, 1].tolist() if n_comp > 1 else [0.0] * len(x_vals)
|
| colors = db_labels.tolist()
|
|
|
| fig_scatter = px.scatter(
|
| x=x_vals, y=y_vals,
|
| color=list(map(str, colors)),
|
| title=f"DBSCAN Cluster Map ({run_key}) β eps={eps}, min_samples={min_samples}",
|
| labels={"x": "PC1", "y": "PC2", "color": "Cluster"},
|
| opacity=0.7,
|
| )
|
| fig_scatter.update_layout(template="plotly_dark")
|
| chart_scatter = f"chart_{run_key}_dbscan_scatter.html"
|
| fig_scatter.write_html(chart_scatter, include_plotlyjs="cdn")
|
|
|
|
|
| agg_count = len(agg_summaries)
|
| dbscan_count = len(summaries)
|
| fig_cmp = px.bar(
|
| x=["Agglomerative", "DBSCAN"],
|
| y=[agg_count, dbscan_count],
|
| color=["Agglomerative", "DBSCAN"],
|
| color_discrete_sequence=["#4a90d9", "#e67e22"],
|
| title=f"Cluster Count Comparison ({run_key})",
|
| labels={"x": "Method", "y": "# Clusters"},
|
| text=[agg_count, dbscan_count],
|
| )
|
| fig_cmp.update_traces(textposition="outside")
|
| fig_cmp.update_layout(template="plotly_dark", showlegend=False)
|
| chart_cmp = f"chart_{run_key}_dbscan_comparison.html"
|
| fig_cmp.write_html(chart_cmp, include_plotlyjs="cdn")
|
|
|
| largest = max(map(lambda s: s["count"], summaries), default=0)
|
|
|
| return json.dumps({
|
| "run_key": run_key,
|
| "n_clusters": len(summaries),
|
| "noise_points": noise_count,
|
| "largest_cluster": largest,
|
| "eps_used": eps,
|
| "min_samples_used": min_samples,
|
| "summaries_file": out_file,
|
| "charts": [chart_scatter, chart_cmp],
|
| "preview": summaries[:3],
|
| }, indent=2)
|
|
|
|
|
|
|
|
|
|
|
| @tool
|
| def refine_large_clusters(run_key: str = "abstract", size_threshold: int = 200) -> str:
|
| """
|
| Post-processing: identifies overly large DBSCAN clusters and refines them
|
| into sub-clusters using a tighter AgglomerativeClustering threshold (0.45).
|
|
|
| Does NOT modify dbscan_summaries or any existing agglomerative results.
|
| Saves results to refined_clusters_{run_key}.json.
|
|
|
| Args:
|
| run_key: 'abstract' or 'title'
|
| size_threshold: Clusters with count > this value will be refined (default 200)
|
|
|
| Returns:
|
| JSON: n_refined, total_subclusters, refined_clusters_file, chart file.
|
| """
|
| dbscan_file = f"dbscan_summaries_{run_key}.json"
|
| with open(dbscan_file, encoding="utf-8") as f:
|
| summaries = json.load(f)
|
|
|
| embeddings = np.load(f"emb_{run_key}.npy")
|
|
|
| large = list(filter(lambda s: s["count"] >= size_threshold, summaries))
|
| unchanged = list(filter(lambda s: s["count"] < size_threshold, summaries))
|
|
|
|
|
| def _refine_one(parent_summary):
|
| pid = parent_summary["cluster_id"]
|
| parent_c = np.array(parent_summary["centroid"])
|
|
|
| sims = cosine_similarity([parent_c], embeddings)[0]
|
| count = parent_summary["count"]
|
| idxs = np.argsort(sims)[::-1][:count].tolist()
|
|
|
| sub_emb = embeddings[idxs]
|
| sub_labels = AgglomerativeClustering(
|
| metric="cosine", linkage="average",
|
| distance_threshold=0.45, n_clusters=None,
|
| ).fit_predict(sub_emb)
|
|
|
| sub_ids = sorted(set(sub_labels.tolist()))
|
| sub_centroids = dict(map(
|
| lambda sid: (sid, sub_emb[sub_labels == sid].mean(axis=0)),
|
| sub_ids,
|
| ))
|
|
|
| def _sub(sid):
|
| mask = sub_labels == sid
|
| sents = parent_summary.get("nearest_sentences", [])
|
| return {
|
| "cluster_id": f"{pid}.{sid}",
|
| "parent_cluster_id": pid,
|
| "count": int(mask.sum()),
|
| "centroid": sub_centroids[sid].tolist(),
|
| "nearest_sentences": sents[:3],
|
| "source": "dbscan_refined",
|
| }
|
|
|
| return list(map(_sub, sub_ids))
|
|
|
| refined_subs = [item for sublist in map(_refine_one, large) for item in sublist]
|
|
|
|
|
| unchanged_kept = list(map(
|
| lambda s: {**s, "source": "dbscan_unchanged"},
|
| unchanged,
|
| ))
|
|
|
| all_refined = unchanged_kept + refined_subs
|
|
|
| out_file = f"refined_clusters_{run_key}.json"
|
| with open(out_file, "w", encoding="utf-8") as f:
|
| json.dump(all_refined, f, indent=2)
|
|
|
|
|
| labels_list = list(map(lambda c: str(c["cluster_id"]), all_refined))
|
| parents_list = list(map(
|
| lambda c: str(c.get("parent_cluster_id", "root")) if "." in str(c["cluster_id"]) else "root",
|
| all_refined,
|
| ))
|
| values_list = list(map(lambda c: c["count"], all_refined))
|
|
|
| fig_tree = px.treemap(
|
| names=labels_list,
|
| parents=parents_list,
|
| values=values_list,
|
| title=f"Refined Sub-Clusters ({run_key}) β threshold={size_threshold}",
|
| )
|
| fig_tree.update_layout(template="plotly_dark")
|
| chart_tree = f"chart_{run_key}_refined.html"
|
| fig_tree.write_html(chart_tree, include_plotlyjs="cdn")
|
|
|
| return json.dumps({
|
| "run_key": run_key,
|
| "size_threshold": size_threshold,
|
| "n_large_refined": len(large),
|
| "total_subclusters": len(refined_subs),
|
| "unchanged_clusters": len(unchanged),
|
| "total_output_clusters": len(all_refined),
|
| "output_file": out_file,
|
| "chart": chart_tree,
|
| "preview": all_refined[:4],
|
| }, indent=2)
|
|
|
|
|
|
|
|
|
|
|
| @tool
|
| def run_ai_council(run_key: str = "abstract") -> str:
|
| """
|
| AI Council: two LLM instances independently label each DBSCAN cluster
|
| from its top-3 representative sentences, then a consensus step merges them.
|
|
|
| Model A: Mistral Large (temperature=0.2) β analytical, precise
|
| Model B: Groq Llama-3.3-70b-versatile (temperature=0.2) β genuinely different
|
| model providing independent perspective (Karpathy-style second opinion)
|
|
|
| Consensus rule:
|
| - Jaccard word overlap >= 0.4 β agreement; consensus = Model A label
|
| - Jaccard word overlap < 0.4 β divergence; Model A (Mistral) selected as primary
|
|
|
| Saves council_labels_{run_key}.json (compatible with PAJAIS mapping).
|
|
|
| Args:
|
| run_key: 'abstract' or 'title'
|
|
|
| Returns:
|
| JSON: total_labelled, agreement_rate, output_file, preview.
|
| """
|
| dbscan_file = f"dbscan_summaries_{run_key}.json"
|
| with open(dbscan_file, encoding="utf-8") as f:
|
| summaries = json.load(f)
|
|
|
| top = summaries[:MAX_LABEL_TOPICS]
|
|
|
| topics_for_prompt = list(map(
|
| lambda s: {
|
| "cluster_id": s["cluster_id"],
|
| "count": s["count"],
|
| "sentences": s.get("nearest_sentences", [])[:3],
|
| },
|
| top,
|
| ))
|
|
|
|
|
| llm_a = _get_llm()
|
| llm_b = _get_council_llm_b()
|
|
|
| council_prompt_tmpl = (
|
| "You are an expert thematic analyst reviewing DBSCAN-discovered clusters "
|
| "from an academic corpus.\n\n"
|
| "Below are cluster IDs with their top-3 representative sentences:\n\n"
|
| "{topics_json}\n\n"
|
| "For EACH cluster, propose a concise label (3-6 words).\n"
|
| "Return ONLY a valid JSON array. Each element must have:\n"
|
| " cluster_id: same integer as input\n"
|
| " label: concise 3-6 word research area name\n"
|
| " reasoning: one sentence explaining your choice\n\n"
|
| "Return ALL {n} clusters. Do not skip any."
|
| )
|
|
|
| prompt_a = PromptTemplate(
|
| input_variables=["topics_json", "n"],
|
| template=council_prompt_tmpl,
|
| )
|
| prompt_b = PromptTemplate(
|
| input_variables=["topics_json", "n"],
|
| template=council_prompt_tmpl,
|
| )
|
|
|
| parser = JsonOutputParser()
|
| chain_a = prompt_a | llm_a | parser
|
| chain_b = prompt_b | llm_b | parser
|
|
|
| input_data = {
|
| "topics_json": json.dumps(topics_for_prompt, indent=2),
|
| "n": len(top),
|
| }
|
|
|
| results_a = chain_a.invoke(input_data)
|
| results_b = chain_b.invoke(input_data)
|
|
|
| idx_a = {str(r["cluster_id"]): r for r in results_a}
|
| idx_b = {str(r["cluster_id"]): r for r in results_b}
|
|
|
|
|
| def _consensus(cluster_summary):
|
| cid = str(cluster_summary["cluster_id"])
|
| ra = idx_a.get(cid, {})
|
| rb = idx_b.get(cid, {})
|
| label_a = ra.get("label", f"Cluster {cid}")
|
| label_b = rb.get("label", f"Cluster {cid}")
|
|
|
| score = _council_agreement_score(label_a, label_b)
|
|
|
|
|
| consensus = label_a if score >= 0.4 else (
|
|
|
| label_a
|
| )
|
| council_reasoning = (
|
| f"A: '{label_a}' | B: '{label_b}' | Jaccard={score:.2f} | "
|
| + ("AGREED" if score >= 0.4 else f"DIVERGED β Model A selected as primary")
|
| )
|
|
|
| ui = format_consensus_ui(label_a, label_b, score >= 0.4, score, ra.get("reasoning",""), rb.get("reasoning",""))
|
|
|
| return {
|
| "cluster_id": cluster_summary["cluster_id"],
|
| "count": cluster_summary["count"],
|
| "nearest_sentences": cluster_summary.get("nearest_sentences", [])[:3],
|
| "label_a": label_a,
|
| "label_b": label_b,
|
| "consensus_label": label_a,
|
| "agreement_score": score,
|
| "council_ui": ui,
|
| "source": "dbscan_ai_council",
|
| "label": label_a,
|
| "reasoning": ra.get("reasoning", ""),
|
| }
|
|
|
| council_labels = list(map(_consensus, top))
|
|
|
| out_file = f"council_labels_{run_key}.json"
|
| with open(out_file, "w", encoding="utf-8") as f:
|
| json.dump(council_labels, f, indent=2)
|
|
|
| agreed_count = len(list(filter(lambda c: c["agreement_score"] >= 0.4, council_labels)))
|
| agreement_rate = round(agreed_count / max(len(council_labels), 1) * 100, 1)
|
|
|
| return json.dumps({
|
| "run_key": run_key,
|
| "total_labelled": len(council_labels),
|
| "agreed_count": agreed_count,
|
| "agreement_rate": f"{agreement_rate}%",
|
| "output_file": out_file,
|
| "note": (
|
| "council_labels contain 'label' field for PAJAIS compatibility. "
|
| "Model A = Mistral Large (analytical). "
|
| "Model B = Groq Llama-3.3-70b-versatile (independent second opinion)."
|
| ),
|
| "preview": council_labels[:4],
|
| }, indent=2)
|
|
|
|
|
|
|
| |