Spaces:

aadisawant2912
/

topic_modelling

Running

App Files Community

aadisawant2912 committed on Apr 11

Commit

0b98cfb

verified ·

1 Parent(s): 9fd51d0

Update tools.py

Browse files

Files changed (1) hide show

tools.py +271 -258

tools.py CHANGED Viewed

@@ -1,14 +1,21 @@
 """
 tools.py - 7 LangChain @tool functions for BERTopic Thematic Analysis Agent.
 Rules: ZERO if/else, ZERO for/while, ZERO try/except, ZERO PromptTemplate.
-All LLM calls use plain HumanMessage strings.
-Supports BOTH abstract and title run_configs simultaneously without overwriting.
 """
 from __future__ import annotations
 import json
 import re
 from pathlib import Path
 import numpy as np
@@ -26,63 +33,62 @@ from sklearn.metrics.pairwise import cosine_similarity
 DATA_DIR = Path("data")
 DATA_DIR.mkdir(exist_ok=True)
-# Run-config-specific paths — keyed by run_config name so they never overwrite
-def _paths(run_config: str) -> dict:
-    prefix = DATA_DIR / run_config          # data/abstract/ or data/title/
-    prefix.mkdir(exist_ok=True)
     return {
-        "sentences":  prefix / "sentences.json",
-        "stats":      prefix / "stats.json",
-        "papers":     prefix / "papers.csv",
-        "emb":        prefix / "emb.npy",
-        "summaries":  prefix / "summaries.json",
-        "charts":     prefix / "charts.json",
-        "themes":     prefix / "themes.json",
-        "taxonomy":   prefix / "taxonomy.json",
-        "narrative":  prefix / "narrative.txt",
-        "comparison": prefix / "comparison.csv",
     }
-# Shared fallback paths (used when run_config not specified)
-SUMMARIES_PATH  = DATA_DIR / "abstract" / "summaries.json"
-THEMES_PATH     = DATA_DIR / "abstract" / "themes.json"
-TAXONOMY_PATH   = DATA_DIR / "abstract" / "taxonomy.json"
-NARRATIVE_PATH  = DATA_DIR / "abstract" / "narrative.txt"
-COMPARISON_PATH = DATA_DIR / "abstract" / "comparison.csv"
-EMB_PATH        = DATA_DIR / "abstract" / "emb.npy"
 RUN_CONFIGS = {
     "abstract": ["Abstract"],
     "title":    ["Title"],
-    "both":     ["Abstract", "Title"],
 }
 BOILERPLATE_PATTERNS = [
-    r"\u00a9",                          # © unicode symbol
-    r"\\u00a9",                        # escaped unicode
-    r"copyright\s*\d{4}",             # copyright 2018
-    r"\d{4}\s+john wiley",            # 2018 John Wiley
-    r"john wiley\s*&\s*sons",         # John Wiley & Sons
-    r"blackwell publishing",             # Blackwell Publishing
-    r"blackwell\s+pub",
     r"wiley\s+periodicals",
     r"all rights reserved",
     r"doi\s*:\s*\S+",
     r"published by elsevier",
-    r"elsevier\s+(b\.v|inc|ltd)",
-    r"springer\s+(nature|verlag|science)",
-    r"taylor\s*&\s*francis",
     r"informa\s+uk",
     r"sage\s+publications",
-    r"information systems journal",      # journal boilerplate
     r"this article is",
     r"rights reserved",
     r"permission from",
     r"reproduced with",
-    r"^\s*abstract\s*$",              # lone word "Abstract"
 ]
 BOILERPLATE_RE = re.compile("|".join(BOILERPLATE_PATTERNS), re.IGNORECASE)
 PAJAIS_CATEGORIES = [
     "Information Systems Theory",    "IS Strategy & Governance",
     "Digital Innovation",            "Enterprise Systems",
@@ -101,203 +107,167 @@ PAJAIS_CATEGORIES = [
 def safe_read_csv(path):
-    """Read CSV with UTF-8, fall back to latin-1 on encoding errors."""
     try:
         return pd.read_csv(path, encoding="utf-8")
     except UnicodeDecodeError:
         return pd.read_csv(path, encoding="latin-1")
 def _call_llm_json(llm, prompt: str) -> list:
-    """Call LLM with plain HumanMessage and parse JSON response."""
     response = llm.invoke([HumanMessage(content=prompt)])
     raw = response.content.strip()
     raw = raw.split("```json")[-1].split("```")[0].strip() if "```" in raw else raw
     return json.loads(raw)
 # =============================================================================
-# TOOL 1 - load_scopus_csv
-# Supports run_config = "abstract", "title", or "both"
-# Each config saves to its own subdirectory so nothing is overwritten
 # =============================================================================
 @tool
 def load_scopus_csv(csv_path: str, run_config: str = "abstract") -> str:
-    """Load a Scopus CSV export, filter boilerplate, save sentences per run_config.
-    Supports run_config = 'abstract', 'title', or 'both'.
-    Each config saves to data/abstract/, data/title/ separately so they
-    never overwrite each other.
     Args:
         csv_path:   Path to the uploaded Scopus CSV file.
-        run_config: 'abstract', 'title', or 'both' (default 'abstract').
     """
-    # Resolve which columns to process
-    configs_to_run = (
-        list(RUN_CONFIGS.items())
-        if run_config == "both"
-        else [(run_config, RUN_CONFIGS.get(run_config, ["Abstract"]))]
-    )
-    df_raw = safe_read_csv(csv_path)
-    def process_config(config_pair):
-        cfg_name, columns = config_pair
-        p = _paths(cfg_name)
-        # Keep only columns that exist in this CSV
-        present_cols = list(filter(
-            lambda c: c in df_raw.columns,
-            columns + ["Title", "Year", "Source title", "Cited by"]
-        ))
-        df = df_raw[present_cols].dropna(subset=list(filter(
-            lambda c: c in df_raw.columns, columns
-        )))
-        def split_sentences(text):
-            raw_sents = re.split(r"(?<=[.!?])\s+", str(text))
-            return list(filter(
-                lambda s: (
-                    not BOILERPLATE_RE.search(s)
-                    and len(s.split()) > 6           # at least 7 words
-                    and len(s.strip()) > 40          # at least 40 chars
-                    and not s.strip().startswith("©")
-                    and "wiley" not in s.lower()
-                    and "elsevier" not in s.lower()
-                    and "blackwell" not in s.lower()
-                    and "springer" not in s.lower()
-                    and "information systems journal" not in s.lower()
-                ),
-                raw_sents
-            ))
-        sentences_lists = list(map(split_sentences, df[columns[0]].tolist()))
-        all_sentences = [s for lst in sentences_lists for s in lst]
-        stats = {
-            "papers":                 int(len(df)),
-            "sentences_after_filter": int(len(all_sentences)),
-            "columns_used":           columns,
-            "csv_path":               str(csv_path),
-            "run_config":             cfg_name,
-        }
-        p["sentences"].write_text(json.dumps(all_sentences, ensure_ascii=False))
-        p["stats"].write_text(json.dumps(stats, ensure_ascii=False))
-        df.to_csv(p["papers"], index=False)
-        return stats
-    all_stats = list(map(process_config, configs_to_run))
-    # Also save original CSV as uploaded.csv for reference
-    import shutil
-    shutil.copy(csv_path, DATA_DIR / "uploaded.csv")
-    return json.dumps({
-        "configs_processed": list(map(lambda s: s["run_config"], all_stats)),
-        "results": all_stats,
-    })
 # =============================================================================
-# TOOL 2 - run_bertopic_discovery
-# run_config controls which sentences file to use
-# distance_threshold=0.35 gives ~100 topics from 2000+ sentences
 # =============================================================================
 @tool
 def run_bertopic_discovery(top_n_topics: int = 100, run_config: str = "abstract") -> str:
-    """Embed sentences, cluster with AgglomerativeClustering (cosine, threshold=0.35),
-    targeting ~100 topics. Saves summaries, embeddings, and 4 Plotly charts.
     Args:
-        top_n_topics: Target number of topic clusters (default 100).
-        run_config:   Which sentences to use: 'abstract' or 'title' (default 'abstract').
     """
-    p = _paths(run_config)
     sentences = json.loads(p["sentences"].read_text())
-    model = SentenceTransformer("all-MiniLM-L6-v2")
     embeddings = model.encode(
-        sentences, normalize_embeddings=True, show_progress_bar=False, batch_size=64
     )
     np.save(p["emb"], embeddings)
-    # threshold=0.35 produces many fine-grained clusters (~100 for 2000+ sentences)
-    # threshold=0.70 produces fewer broad clusters (~40-60)
-    # We use 0.35 to get close to the desired 100 topics
     clustering = AgglomerativeClustering(
-        metric="cosine",
-        linkage="average",
-        distance_threshold=0.35,
-        n_clusters=None,
     )
     labels = clustering.fit_predict(embeddings)
-    # Sort clusters by size descending, take top_n_topics largest
-    all_labels = sorted(set(labels.tolist()))
-    label_sizes = list(map(lambda lb: (lb, int((labels == lb).sum())), all_labels))
-    label_sizes_sorted = sorted(label_sizes, key=lambda x: -x[1])
-    # Only keep clusters with at least 3 sentences (filter noise)
-    label_sizes_filtered = list(filter(lambda x: x[1] >= 3, label_sizes_sorted))
-    retained = list(map(lambda x: x[0], label_sizes_filtered[:top_n_topics]))
-    # Extra boilerplate check for individual sentences
-    def is_clean_sentence(s):
-        return (
-            not BOILERPLATE_RE.search(s)
-            and len(s.split()) > 6
-            and len(s) > 40
-            and not s.strip().startswith("©")
-            and "wiley" not in s.lower()
-            and "elsevier" not in s.lower()
-            and "blackwell" not in s.lower()
-            and "springer" not in s.lower()
-            and "taylor" not in s.lower()
-            and "john wiley" not in s.lower()
-        )
-    def build_cluster_summary(seq_and_label):
-        seq_id, label = seq_and_label   # seq_id = 1-based sequential number
-        mask = labels == label
         cluster_embs  = embeddings[mask]
-        cluster_sents_raw = [sentences[i] for i, m in enumerate(mask.tolist()) if m]
-        # Apply extra boilerplate filter to sentences inside each cluster
-        cluster_sents = list(filter(is_clean_sentence, cluster_sents_raw))
-        # Fall back to raw if filter removed everything
-        cluster_sents = cluster_sents if cluster_sents else cluster_sents_raw[:5]
-        centroid  = cluster_embs.mean(axis=0, keepdims=True)
-        sims      = cosine_similarity(centroid, cluster_embs)[0]
-        top5_idx  = sims.argsort()[-5:][::-1].tolist()
-        # top_evidence: pick from clean sentences preferring highest-similarity ones
-        clean_set = set(cluster_sents)
-        top_evidence_raw = list(map(lambda i: cluster_sents_raw[i], top5_idx))
-        top_evidence = list(filter(lambda s: s in clean_set, top_evidence_raw))[:5]
-        top_evidence = top_evidence if top_evidence else top_evidence_raw[:3]
         return {
-            "topic_id":    seq_id,        # sequential 1-based ID shown in table
-            "raw_label":   int(label),    # original cluster label kept for internal use
             "size":        int(mask.sum()),
             "top_evidence": top_evidence,
-            "sentences":   cluster_sents,
             "centroid":    centroid[0].tolist(),
             "run_config":  run_config,
         }
-    # Enumerate gives (1-based index, raw_label) pairs
-    seq_label_pairs = list(map(lambda x: (x[0]+1, x[1]), enumerate(retained)))
-    summaries = list(map(build_cluster_summary, seq_label_pairs))
     p["summaries"].write_text(json.dumps(summaries, indent=2, ensure_ascii=False))
-    sizes = list(map(lambda s: s["size"], summaries))
     ids   = list(map(lambda s: s["topic_id"], summaries))
-    fig1 = px.bar(x=ids, y=sizes,
-                  labels={"x": "Topic ID", "y": "Sentence Count"},
-                  title="Topic Size Distribution ({})".format(run_config))
-    fig2 = px.histogram(x=sizes, nbins=30, title="Cluster Size Histogram ({})".format(run_config),
                         labels={"x": "Cluster Size"})
     centroids = np.array(list(map(lambda s: s["centroid"], summaries)))
     n_comp    = min(2, centroids.shape[0], centroids.shape[1])
     coords    = PCA(n_components=n_comp).fit_transform(centroids)
@@ -305,14 +275,12 @@ def run_bertopic_discovery(top_n_topics: int = 100, run_config: str = "abstract"
         x=coords[:, 0],
         y=(coords[:, 1] if coords.shape[1] > 1 else [0] * len(coords)),
         text=list(map(str, ids)),
-        title="Topic Centroids PCA ({})".format(run_config),
         labels={"x": "PC1", "y": "PC2"},
     )
     fig4 = px.treemap(
-        names=list(map(str, ids)),
-        parents=["Topics"] * len(ids),
-        values=sizes,
-        title="Topic Treemap ({})".format(run_config),
     )
     charts = {
@@ -327,22 +295,22 @@ def run_bertopic_discovery(top_n_topics: int = 100, run_config: str = "abstract"
         "topics_found": len(summaries),
         "run_config":   run_config,
         "chart_types":  list(charts.keys()),
-        "note": "threshold=0.35 used for ~100 fine-grained clusters",
     })
 # =============================================================================
-# TOOL 3 - label_topics_with_llm
 # =============================================================================
 @tool
 def label_topics_with_llm(batch_size: int = 20, run_config: str = "abstract") -> str:
-    """Send top topics to Mistral LLM to generate human-readable labels and reasoning.
     Args:
-        batch_size:  Topics per LLM batch (default 20).
-        run_config:  Which summaries to label: 'abstract' or 'title' (default 'abstract').
     """
-    p = _paths(run_config)
     summaries     = json.loads(p["summaries"].read_text())
     top_summaries = summaries[:100]
     llm           = ChatMistralAI(model="mistral-large-latest", temperature=0.2)
@@ -355,14 +323,13 @@ def label_topics_with_llm(batch_size: int = 20, run_config: str = "abstract") ->
             batch
         ))
         prompt = (
-            "You are a thematic analysis expert specialising in Information Systems research.\n"
-            "Given the following research topic clusters with sample sentences, "
-            "assign a concise label (3-6 words) and one-sentence reasoning for each.\n\n"
             "TOPICS:\n" + json.dumps(mini, indent=2) + "\n\n"
-            "Return ONLY a valid JSON array. "
-            "Each element must have exactly three keys: "
-            "topic_id (integer), label (string), reasoning (string). "
-            "No markdown fences, no explanation, just the raw JSON array."
         )
         return _call_llm_json(llm, prompt)
@@ -383,32 +350,32 @@ def label_topics_with_llm(batch_size: int = 20, run_config: str = "abstract") ->
 # =============================================================================
-# TOOL 4 - consolidate_into_themes
 # =============================================================================
 @tool
 def consolidate_into_themes(approved_groups: str, run_config: str = "abstract") -> str:
-    """Merge researcher-approved topic groups into themes, recompute centroids.
     Args:
-        approved_groups: JSON string [{theme_name: str, topic_ids: [int, ...]}]
-        run_config:      Which summaries to use (default 'abstract').
     """
-    p         = _paths(run_config)
     groups    = json.loads(approved_groups)
     summaries = json.loads(p["summaries"].read_text())
-    id_to_summary = {s["topic_id"]: s for s in summaries}
     def build_theme(group):
         ids      = group["topic_ids"]
-        members  = list(map(lambda tid: id_to_summary[tid], ids))
-        all_sents = [s for ms in members for s in ms.get("sentences", [])]
         centroids = np.array(list(map(lambda ms: ms["centroid"], members)))
         return {
             "theme_name":  group["theme_name"],
             "topic_ids":   ids,
-            "sentences":   all_sents,
             "centroid":    centroids.mean(axis=0).tolist(),
-            "paper_count": len(set(all_sents)),
             "run_config":  run_config,
         }
@@ -418,20 +385,21 @@ def consolidate_into_themes(approved_groups: str, run_config: str = "abstract")
         "themes_created": len(themes),
         "theme_names":    list(map(lambda t: t["theme_name"], themes)),
         "run_config":     run_config,
     })
 # =============================================================================
-# TOOL 5 - compare_with_taxonomy
 # =============================================================================
 @tool
 def compare_with_taxonomy(run_config: str = "abstract") -> str:
-    """Map consolidated themes to PAJAIS 25 categories via Mistral LLM.
     Args:
-        run_config: Which themes to map (default 'abstract').
     """
-    p      = _paths(run_config)
     themes = json.loads(p["themes"].read_text())
     llm    = ChatMistralAI(model="mistral-large-latest", temperature=0.1)
@@ -439,107 +407,152 @@ def compare_with_taxonomy(run_config: str = "abstract") -> str:
         lambda t: {"name": t["theme_name"], "sample": t["sentences"][:2]},
         themes
     ))
     prompt = (
         "You are a research classification expert in Information Systems.\n\n"
-        "Map each of the following research themes to the single most relevant "
-        "PAJAIS (Pacific Asia Journal of the Association for Information Systems) category.\n\n"
         "THEMES:\n" + json.dumps(theme_mini, indent=2) + "\n\n"
         "PAJAIS CATEGORIES:\n" + json.dumps(PAJAIS_CATEGORIES, indent=2) + "\n\n"
-        "Return ONLY a valid JSON array. "
-        "Each element must have exactly four string keys: "
-        "name, pajais_category, confidence, rationale. "
-        "No markdown fences, no explanation, just the raw JSON array."
     )
     result = _call_llm_json(llm, prompt)
     p["taxonomy"].write_text(json.dumps(result, indent=2, ensure_ascii=False))
-    return json.dumps({"mapped_themes": len(result), "run_config": run_config})
 # =============================================================================
-# TOOL 6 - generate_comparison_csv
-# Columns: Title | Abstract | Year | Source title (journal)
 # =============================================================================
 @tool
-def generate_comparison_csv(run_config: str = "abstract") -> str:
-    """Generate Title | Abstract | Year | Source title comparison CSV.
-    Args:
-        run_config: Which papers.csv to use (default 'abstract').
     """
-    p  = _paths(run_config)
-    df = safe_read_csv(p["papers"])
-    # Detect columns robustly
     title_col    = next(filter(lambda c: c.strip().lower() == "title",    df.columns), None)
     abstract_col = next(filter(lambda c: c.strip().lower() == "abstract", df.columns), None)
     year_col     = next(filter(lambda c: c.strip().lower() == "year",     df.columns), None)
     journal_col  = next(filter(lambda c: "source" in c.lower(),           df.columns), None)
-    # Build ordered column list: Title, Abstract, Year, Source title
     ordered = [title_col, abstract_col, year_col, journal_col]
     present = list(filter(lambda c: c is not None and c in df.columns, ordered))
-    out_df = df[present].copy()
-    # Rename columns to clean standard names
     rename_map = {
         title_col:    "Title",
         abstract_col: "Abstract",
         year_col:     "Year",
         journal_col:  "Source Journal",
     }
-    out_df = out_df.rename(columns={k: v for k, v in rename_map.items() if k in out_df.columns})
-    out_df.to_csv(p["comparison"], index=False, encoding="utf-8-sig")
     return json.dumps({
-        "rows":       len(out_df),
-        "columns":    list(out_df.columns),
-        "path":       str(p["comparison"]),
-        "run_config": run_config,
     })
 # =============================================================================
-# TOOL 7 - export_narrative
 # =============================================================================
 @tool
-def export_narrative(run_config: str = "abstract") -> str:
-    """Generate a 500-word Section 7 narrative report via Mistral LLM.
-    Args:
-        run_config: Which themes and taxonomy to use (default 'abstract').
     """
-    p        = _paths(run_config)
-    themes   = json.loads(p["themes"].read_text())
-    taxonomy = json.loads(p["taxonomy"].read_text())
-    llm      = ChatMistralAI(model="mistral-large-latest", temperature=0.4)
-    theme_summary = list(map(
-        lambda t: {"name": t["theme_name"], "sentence_count": len(t["sentences"])},
-        themes
-    ))
     prompt = (
         "You are an academic writing expert in Information Systems.\n\n"
         "Write Section 7 (Discussion and Thematic Synthesis) of a systematic "
-        "literature review paper. Write approximately 500 words in formal academic prose.\n"
-        "Cover all four of these points:\n"
-        "(a) Overview of the identified themes and their significance\n"
-        "(b) How the themes map to the PAJAIS taxonomy categories\n"
-        "(c) Implications for IS research and practice\n"
-        "(d) Limitations of the thematic analysis\n\n"
-        "IDENTIFIED THEMES:\n" + json.dumps(theme_summary, indent=2) + "\n\n"
-        "PAJAIS TAXONOMY MAPPING:\n" + json.dumps(taxonomy, indent=2) + "\n\n"
-        "Write the full section now in continuous academic paragraphs. "
-        "Do not use bullet points, numbered lists, or section headers."
     )
     response       = llm.invoke([HumanMessage(content=prompt)])
     narrative_text = response.content
-    p["narrative"].write_text(narrative_text, encoding="utf-8")
     return json.dumps({
         "word_count": len(narrative_text.split()),
-        "path":       str(p["narrative"]),
-        "run_config": run_config,
     })

 """
 tools.py - 7 LangChain @tool functions for BERTopic Thematic Analysis Agent.
 Rules: ZERO if/else, ZERO for/while, ZERO try/except, ZERO PromptTemplate.
+All LLM calls use plain HumanMessage strings directly.
+Workflow:
+  - Abstract run saves to data/abstract/
+  - Title run saves to data/title/
+  - Comparison CSV + narrative only generated when BOTH runs are complete
+  - Topic IDs are sequential 1..N (not raw cluster labels)
+  - Boilerplate filter catches © symbol, all major publishers
 """
 from __future__ import annotations
 import json
 import re
+import shutil
 from pathlib import Path
 import numpy as np
 DATA_DIR = Path("data")
 DATA_DIR.mkdir(exist_ok=True)
+def _p(run_config: str) -> dict:
+    """Return all file paths for a given run_config, creating subdirectory."""
+    d = DATA_DIR / run_config
+    d.mkdir(parents=True, exist_ok=True)
     return {
+        "dir":        d,
+        "sentences":  d / "sentences.json",
+        "stats":      d / "stats.json",
+        "papers":     d / "papers.csv",
+        "emb":        d / "emb.npy",
+        "summaries":  d / "summaries.json",
+        "charts":     d / "charts.json",
+        "themes":     d / "themes.json",
+        "taxonomy":   d / "taxonomy.json",
+        "narrative":  d / "narrative.txt",
+        "comparison": DATA_DIR / "comparison.csv",   # shared output
     }
 RUN_CONFIGS = {
     "abstract": ["Abstract"],
     "title":    ["Title"],
 }
+# Comprehensive boilerplate filter — catches © symbol + all major publishers
 BOILERPLATE_PATTERNS = [
+    r"\u00a9",                           # © unicode
+    r"\\u00a9",                          # escaped unicode
+    r"copyright\s*\d{4}",
+    r"\d{4}\s+john wiley",
+    r"john wiley\s*(&|and)\s*sons",
+    r"blackwell\s*(publishing|pub)",
     r"wiley\s+periodicals",
+    r"wiley\s+online",
     r"all rights reserved",
     r"doi\s*:\s*\S+",
     r"published by elsevier",
+    r"elsevier\s*(b\.v|inc|ltd|science)",
+    r"springer\s*(nature|verlag|science|link)",
+    r"taylor\s*(&|and)\s*francis",
     r"informa\s+uk",
     r"sage\s+publications",
+    r"information systems journal\s+published",
+    r"emerald\s+(publishing|group)",
     r"this article is",
     r"rights reserved",
     r"permission from",
     r"reproduced with",
 ]
 BOILERPLATE_RE = re.compile("|".join(BOILERPLATE_PATTERNS), re.IGNORECASE)
+# Extra keyword filter applied per-sentence
+PUBLISHER_KEYWORDS = frozenset([
+    "wiley", "elsevier", "blackwell", "springer",
+    "taylor", "information systems journal", "emerald"
+])
 PAJAIS_CATEGORIES = [
     "Information Systems Theory",    "IS Strategy & Governance",
     "Digital Innovation",            "Enterprise Systems",
 def safe_read_csv(path):
+    """Read CSV with UTF-8 fallback to latin-1."""
     try:
         return pd.read_csv(path, encoding="utf-8")
     except UnicodeDecodeError:
         return pd.read_csv(path, encoding="latin-1")
+def _is_clean(s: str) -> bool:
+    """Return True if sentence passes all quality checks."""
+    sl = s.lower().strip()
+    return (
+        not BOILERPLATE_RE.search(s)
+        and not s.strip().startswith("\u00a9")
+        and not s.strip().startswith("©")
+        and len(s.split()) > 6
+        and len(s.strip()) > 40
+        and not any(kw in sl for kw in PUBLISHER_KEYWORDS)
+    )
 def _call_llm_json(llm, prompt: str) -> list:
+    """Call LLM with plain HumanMessage, strip markdown fences, parse JSON."""
     response = llm.invoke([HumanMessage(content=prompt)])
     raw = response.content.strip()
     raw = raw.split("```json")[-1].split("```")[0].strip() if "```" in raw else raw
     return json.loads(raw)
+def _both_runs_complete() -> bool:
+    """Return True only when BOTH abstract and title runs have themes saved."""
+    return (
+        (_p("abstract")["themes"]).exists()
+        and (_p("title")["themes"]).exists()
+    )
 # =============================================================================
+# TOOL 1 — load_scopus_csv
+# Saves to data/uploaded.csv (permanent copy) AND data/{run_config}/papers.csv
 # =============================================================================
 @tool
 def load_scopus_csv(csv_path: str, run_config: str = "abstract") -> str:
+    """Load a Scopus CSV, filter boilerplate sentences, save per run_config.
+    Saves sentences to data/{run_config}/sentences.json.
+    Also copies the CSV permanently to data/uploaded.csv.
     Args:
         csv_path:   Path to the uploaded Scopus CSV file.
+        run_config: 'abstract' or 'title' (default 'abstract').
     """
+    p = _p(run_config)
+    columns = RUN_CONFIGS.get(run_config, ["Abstract"])
+    # Copy CSV to permanent location only if it is a different file
+    dest = DATA_DIR / "uploaded.csv"
+    src  = Path(csv_path).resolve()
+    dst  = dest.resolve()
+    _ = shutil.copy(str(src), str(dst)) if src != dst else None
+    df_raw = safe_read_csv(dest)
+    present_cols = list(filter(
+        lambda c: c in df_raw.columns,
+        columns + ["Title", "Year", "Source title", "Cited by"]
+    ))
+    text_cols = list(filter(lambda c: c in df_raw.columns, columns))
+    df = df_raw[present_cols].dropna(subset=text_cols)
+    def split_sentences(text):
+        parts = re.split(r"(?<=[.!?])\s+", str(text))
+        return list(filter(_is_clean, parts))
+    sentences_lists = list(map(split_sentences, df[columns[0]].tolist()))
+    all_sentences   = [s for lst in sentences_lists for s in lst]
+    stats = {
+        "papers":                 int(len(df)),
+        "sentences_after_filter": int(len(all_sentences)),
+        "columns_used":           columns,
+        "run_config":             run_config,
+    }
+    p["sentences"].write_text(json.dumps(all_sentences, ensure_ascii=False))
+    p["stats"].write_text(json.dumps(stats, ensure_ascii=False))
+    df.to_csv(p["papers"], index=False)
+    return json.dumps(stats)
 # =============================================================================
+# TOOL 2 — run_bertopic_discovery
+# threshold=0.35 → ~100 fine-grained clusters; IDs renumbered 1..N
 # =============================================================================
 @tool
 def run_bertopic_discovery(top_n_topics: int = 100, run_config: str = "abstract") -> str:
+    """Embed sentences with all-MiniLM-L6-v2, cluster with AgglomerativeClustering
+    (cosine, threshold=0.35) targeting ~100 topics. Topic IDs are sequential 1..N.
     Args:
+        top_n_topics: Maximum number of largest clusters to keep (default 100).
+        run_config:   'abstract' or 'title' (default 'abstract').
     """
+    p = _p(run_config)
     sentences = json.loads(p["sentences"].read_text())
+    model      = SentenceTransformer("all-MiniLM-L6-v2")
     embeddings = model.encode(
+        sentences, normalize_embeddings=True,
+        show_progress_bar=False, batch_size=64
     )
     np.save(p["emb"], embeddings)
     clustering = AgglomerativeClustering(
+        metric="cosine", linkage="average",
+        distance_threshold=0.35, n_clusters=None,
     )
     labels = clustering.fit_predict(embeddings)
+    all_labels     = sorted(set(labels.tolist()))
+    label_sizes    = list(map(lambda lb: (lb, int((labels == lb).sum())), all_labels))
+    # Keep clusters with ≥3 sentences, sort by size desc, take top N
+    label_filtered = list(filter(lambda x: x[1] >= 3, label_sizes))
+    label_sorted   = sorted(label_filtered, key=lambda x: -x[1])
+    retained       = list(map(lambda x: x[0], label_sorted[:top_n_topics]))
+    def build_summary(seq_label):
+        seq_id, raw_label = seq_label
+        mask          = labels == raw_label
         cluster_embs  = embeddings[mask]
+        raw_sents     = [sentences[i] for i, m in enumerate(mask.tolist()) if m]
+        clean_sents   = list(filter(_is_clean, raw_sents))
+        sents         = clean_sents if clean_sents else raw_sents[:5]
+        centroid      = cluster_embs.mean(axis=0, keepdims=True)
+        sims          = cosine_similarity(centroid, cluster_embs)[0]
+        top5_idx      = sims.argsort()[-5:][::-1].tolist()
+        raw_top       = list(map(lambda i: raw_sents[i], top5_idx))
+        clean_set     = set(sents)
+        top_evidence  = list(filter(lambda s: s in clean_set, raw_top))[:5]
+        top_evidence  = top_evidence if top_evidence else raw_top[:3]
         return {
+            "topic_id":    seq_id,
             "size":        int(mask.sum()),
             "top_evidence": top_evidence,
+            "sentences":   sents,
             "centroid":    centroid[0].tolist(),
             "run_config":  run_config,
         }
+    # Sequential IDs starting at 1
+    seq_pairs = list(map(lambda x: (x[0] + 1, x[1]), enumerate(retained)))
+    summaries = list(map(build_summary, seq_pairs))
     p["summaries"].write_text(json.dumps(summaries, indent=2, ensure_ascii=False))
+    sizes = list(map(lambda s: s["size"],     summaries))
     ids   = list(map(lambda s: s["topic_id"], summaries))
+    fig1 = px.bar(x=ids, y=sizes, title="Topic Sizes — {}".format(run_config),
+                  labels={"x": "Topic #", "y": "Sentences"})
+    fig2 = px.histogram(x=sizes, nbins=30, title="Size Distribution — {}".format(run_config),
                         labels={"x": "Cluster Size"})
     centroids = np.array(list(map(lambda s: s["centroid"], summaries)))
     n_comp    = min(2, centroids.shape[0], centroids.shape[1])
     coords    = PCA(n_components=n_comp).fit_transform(centroids)
         x=coords[:, 0],
         y=(coords[:, 1] if coords.shape[1] > 1 else [0] * len(coords)),
         text=list(map(str, ids)),
+        title="Topic Centroids PCA — {}".format(run_config),
         labels={"x": "PC1", "y": "PC2"},
     )
     fig4 = px.treemap(
+        names=list(map(str, ids)), parents=["Topics"] * len(ids),
+        values=sizes, title="Treemap — {}".format(run_config),
     )
     charts = {
         "topics_found": len(summaries),
         "run_config":   run_config,
         "chart_types":  list(charts.keys()),
+        "note":         "Topics numbered 1..{}, threshold=0.35".format(len(summaries)),
     })
 # =============================================================================
+# TOOL 3 — label_topics_with_llm
 # =============================================================================
 @tool
 def label_topics_with_llm(batch_size: int = 20, run_config: str = "abstract") -> str:
+    """Label topic clusters with human-readable names via Mistral LLM.
     Args:
+        batch_size:  Topics per LLM call (default 20).
+        run_config:  'abstract' or 'title' (default 'abstract').
     """
     # Look up the per-run file paths for this run_config.
+    p             = _p(run_config)
     # Topic summaries are read back from the JSON file under the "summaries" path.
     summaries     = json.loads(p["summaries"].read_text())
     # Work on at most the first 100 summaries.
     top_summaries = summaries[:100]
     # Low temperature (0.2) — presumably to keep labels stable between calls; confirm.
     llm           = ChatMistralAI(model="mistral-large-latest", temperature=0.2)
             batch
         ))
         # The prompt embeds the reduced topic list ("mini") as JSON and asks for
         # a bare JSON array carrying topic_id, label and reasoning per topic.
         prompt = (
+            "You are a thematic analysis expert in Information Systems research.\n"
+            "Label each topic cluster with a concise 3-6 word academic label "
+            "and one-sentence reasoning.\n\n"
             "TOPICS:\n" + json.dumps(mini, indent=2) + "\n\n"
+            "Return ONLY a raw JSON array. "
+            "Each element: topic_id (integer), label (string), reasoning (string). "
+            "No markdown, no explanation."
         )
         # NOTE(review): _call_llm_json is defined elsewhere; presumably it sends
         # the prompt and parses the JSON reply — confirm.
         return _call_llm_json(llm, prompt)
 # =============================================================================
+# TOOL 4 — consolidate_into_themes
 # =============================================================================
 @tool
 def consolidate_into_themes(approved_groups: str, run_config: str = "abstract") -> str:
+    """Merge approved topic groups into themes and recompute centroids.
     Args:
+        approved_groups: JSON list [{theme_name: str, topic_ids: [int,...]}]
+        run_config:      'abstract' or 'title' (default 'abstract').
     """
     # Look up the per-run file paths for this run_config.
+    p         = _p(run_config)
     groups    = json.loads(approved_groups)
     summaries = json.loads(p["summaries"].read_text())
     # Index the summaries by topic_id so each group's ids can be looked up directly.
+    id_map    = {s["topic_id"]: s for s in summaries}
     def build_theme(group):
         ids      = group["topic_ids"]
         # Raises KeyError when an approved id has no matching summary.
+        members  = list(map(lambda tid: id_map[tid], ids))
         # Flatten the member topics' sentences into one list; a topic without a
         # "sentences" key contributes nothing.
+        sents    = [s for ms in members for s in ms.get("sentences", [])]
         centroids = np.array(list(map(lambda ms: ms["centroid"], members)))
         return {
             "theme_name":  group["theme_name"],
             "topic_ids":   ids,
+            "sentences":   sents,
             # Theme centroid = unweighted mean of the member topics' centroids.
             "centroid":    centroids.mean(axis=0).tolist(),
             # NOTE(review): this is the number of distinct sentences, not of
             # papers — confirm the key name reflects the intent.
+            "paper_count": len(set(sents)),
             "run_config":  run_config,
         }
         "themes_created": len(themes),
         "theme_names":    list(map(lambda t: t["theme_name"], themes)),
         "run_config":     run_config,
+        "both_complete":  _both_runs_complete(),
     })
 # =============================================================================
+# TOOL 5 — compare_with_taxonomy
 # =============================================================================
 @tool
 def compare_with_taxonomy(run_config: str = "abstract") -> str:
+    """Map themes to PAJAIS 25 categories via Mistral LLM.
     Args:
+        run_config: 'abstract' or 'title' (default 'abstract').
     """
     # Look up the per-run file paths for this run_config.
+    p      = _p(run_config)
     # Themes are read from the JSON file under the "themes" path.
     themes = json.loads(p["themes"].read_text())
     llm    = ChatMistralAI(model="mistral-large-latest", temperature=0.1)
         lambda t: {"name": t["theme_name"], "sample": t["sentences"][:2]},
         themes
     ))
     # Each theme is sent as its name plus its first two sentences as a sample,
     # followed by the full PAJAIS category list.
     prompt = (
         "You are a research classification expert in Information Systems.\n\n"
+        "Map each theme to the single most relevant PAJAIS category.\n\n"
         "THEMES:\n" + json.dumps(theme_mini, indent=2) + "\n\n"
         "PAJAIS CATEGORIES:\n" + json.dumps(PAJAIS_CATEGORIES, indent=2) + "\n\n"
+        "Return ONLY a raw JSON array. "
+        "Each element: name, pajais_category, confidence, rationale. "
+        "No markdown, no explanation."
     )
     # Whatever _call_llm_json returns is persisted as-is under the "taxonomy" path.
     result = _call_llm_json(llm, prompt)
     p["taxonomy"].write_text(json.dumps(result, indent=2, ensure_ascii=False))
     # "mapped_themes" is the number of entries the LLM returned, which is not
     # checked against the number of themes sent.
+    return json.dumps({
+        "mapped_themes": len(result),
+        "run_config":    run_config,
+        "both_complete": _both_runs_complete(),
+    })
 # =============================================================================
+# TOOL 6 — generate_comparison_csv
+# ONLY runs when BOTH abstract and title runs are complete
+# Columns: Title | Abstract | Year | Source Journal
 # =============================================================================
 @tool
 def generate_comparison_csv() -> str:
     """Generate Title | Abstract | Year | Source Journal comparison CSV.
     Only available after BOTH abstract and title runs have completed themes.
     Saves to data/comparison.csv.
     """
     # The docstring above is kept verbatim: @tool publishes it as the tool
     # description.
     # Returns the JSON summary from _do_generate_comparison_csv() when both
     # runs are complete, otherwise a plain-text status message.
     # A run counts as complete once its themes file exists on disk.
     done = {
         "abstract": _p("abstract")["themes"].exists(),
         "title":    _p("title")["themes"].exists(),
     }
     # Name every run that has not produced themes yet, so the hint points at
     # the step that is really missing instead of always blaming the title run.
     missing = list(filter(lambda name: not done[name], done))
     status_msg = (
         "Abstract complete: {}, Title complete: {}. "
         "Run {} to complete the {} analysis first."
     ).format(
         done["abstract"],
         done["title"],
         " and ".join(map("'run {}'".format, missing)),
         " and ".join(missing),
     )
     # Conditional expression keeps the module's "no if/else statements" rule;
     # the CSV is only generated when nothing is missing.
     return _do_generate_comparison_csv() if not missing else status_msg
 def _do_generate_comparison_csv() -> str:
     """Internal: write the comparison CSV once both runs are done.
     Reads data/uploaded.csv, keeps whichever of the title / abstract / year /
     source columns it finds, renames them to Title, Abstract, Year and
     Source Journal, and saves the result to data/comparison.csv.
     Returns:
         JSON string with the row count, the final column names, the output
         path and a short note.
     """
     df = safe_read_csv(DATA_DIR / "uploaded.csv")
     def first_column(matches):
         # First column accepted by the predicate, or None when there is none.
         return next(filter(matches, df.columns), None)
     def named(target):
         # Exact name match, ignoring case and surrounding whitespace.
         return lambda col: col.strip().lower() == target
     # (source column, output header) pairs, in output order.
     wanted = [
         (first_column(named("title")),    "Title"),
         (first_column(named("abstract")), "Abstract"),
         (first_column(named("year")),     "Year"),
         # The first column whose name contains "source" is used as the journal.
         (first_column(lambda col: "source" in col.lower()), "Source Journal"),
     ]
     # Columns missing from the upload are simply left out of the export.
     found   = list(filter(lambda pair: pair[0] is not None, wanted))
     present = list(map(lambda pair: pair[0], found))
     out_df  = df[present].rename(columns=dict(found))
     dest = DATA_DIR / "comparison.csv"
     # utf-8-sig writes a BOM at the start of the file.
     out_df.to_csv(dest, index=False, encoding="utf-8-sig")
     return json.dumps({
         "rows":    len(out_df),
         "columns": list(out_df.columns),
         "path":    str(dest),
         "note":    "Both runs complete — comparison CSV generated",
     })
 # =============================================================================
+# TOOL 7 — export_narrative
+# ONLY runs when BOTH abstract and title runs are complete
 # =============================================================================
 @tool
 def export_narrative() -> str:
     """Write a 500-word Section 7 narrative using themes from BOTH runs.
     Only available after BOTH abstract and title runs have completed taxonomy mapping.
     Saves to data/narrative.txt.
     """
     # The docstring above is kept verbatim: @tool publishes it as the tool
     # description.
     # A run counts as ready once its taxonomy file exists on disk.
     abs_ready   = _p("abstract")["taxonomy"].exists()
     title_ready = _p("title")["taxonomy"].exists()
     not_ready_msg = (
         "Narrative cannot be generated yet. "
         "Abstract taxonomy complete: {}. Title taxonomy complete: {}. "
         "Complete both runs through Phase 5.5 first."
     ).format(abs_ready, title_ready)
     # Conditional expression: the narrative is only generated when both
     # taxonomy files are present.
     return _do_export_narrative() if (abs_ready and title_ready) else not_ready_msg
 def _do_export_narrative() -> str:
     """Internal: build the prompt from both runs, call the LLM, save the text.
     Loads themes and taxonomy mappings of the abstract and title runs, asks
     the Mistral model for the Section 7 narrative and writes the reply to
     data/narrative.txt.
     Returns:
         JSON string with the word count of the narrative, the output path
         and a short note.
     """
     abs_paths, title_paths = _p("abstract"), _p("title")
     abs_themes     = json.loads(abs_paths["themes"].read_text())
     title_themes   = json.loads(title_paths["themes"].read_text())
     abs_taxonomy   = json.loads(abs_paths["taxonomy"].read_text())
     title_taxonomy = json.loads(title_paths["taxonomy"].read_text())
     llm = ChatMistralAI(model="mistral-large-latest", temperature=0.4)
     def condense(themes):
         # Only the theme name and its sentence count go into the prompt.
         return [
             {"name": t["theme_name"], "sentences": len(t["sentences"])}
             for t in themes
         ]
     # Writing instructions first, then the four JSON payloads, then the
     # formatting constraint.
     prompt = "".join([
         "You are an academic writing expert in Information Systems.\n\n",
         "Write Section 7 (Discussion and Thematic Synthesis) of a systematic ",
         "literature review paper. Approximately 500 words, formal academic prose.\n",
         "Cover:\n",
         "(a) Overview of themes from abstract analysis\n",
         "(b) Overview of themes from title analysis\n",
         "(c) Comparison: what themes appear in both vs only one\n",
         "(d) PAJAIS taxonomy mapping and implications\n",
         "(e) Implications for IS research and practice\n",
         "(f) Limitations\n\n",
         "ABSTRACT THEMES:\n", json.dumps(condense(abs_themes), indent=2), "\n\n",
         "TITLE THEMES:\n", json.dumps(condense(title_themes), indent=2), "\n\n",
         "ABSTRACT PAJAIS MAPPING:\n", json.dumps(abs_taxonomy, indent=2), "\n\n",
         "TITLE PAJAIS MAPPING:\n", json.dumps(title_taxonomy, indent=2), "\n\n",
         "Write in continuous academic paragraphs. No bullet points or headers.",
     ])
     reply     = llm.invoke([HumanMessage(content=prompt)])
     narrative = reply.content
     out_path  = DATA_DIR / "narrative.txt"
     out_path.write_text(narrative, encoding="utf-8")
     return json.dumps({
         "word_count": len(narrative.split()),
         "path":       str(out_path),
         "note":       "Narrative combines both abstract and title run themes",
     })