Spaces:
Sleeping
Sleeping
| """ | |
| tools.py - 7 LangChain @tool functions for BERTopic Thematic Analysis Agent. | |
| Rules: ZERO if/else, ZERO for/while, ZERO try/except, ZERO PromptTemplate. | |
| All LLM calls use plain HumanMessage strings directly. | |
| Workflow: | |
| - Abstract run saves to data/abstract/ | |
| - Title run saves to data/title/ | |
| - Comparison CSV + narrative only generated when BOTH runs are complete | |
| - Topic IDs are sequential 1..N (not raw cluster labels) | |
| - Boilerplate filter catches © symbol, all major publishers | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import re | |
| import shutil | |
| from pathlib import Path | |
| import numpy as np | |
| import pandas as pd | |
| import plotly.express as px | |
| from langchain_core.tools import tool | |
| from langchain_core.messages import HumanMessage | |
| from langchain_mistralai import ChatMistralAI | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.cluster import AgglomerativeClustering | |
| from sklearn.decomposition import PCA | |
| from sklearn.metrics.pairwise import cosine_similarity | |
# ── paths ──────────────────────────────────────────────────────────────────────
# Root folder for every artefact written by the tools in this module. It is
# created at import time so it exists before any tool writes into it.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)
# ── Embedding model — loaded ONCE at module level, reused everywhere ───────────
# This prevents repeated HuggingFace downloads and avoids 429 rate limit errors.
# The UNEXPECTED embeddings.position_ids warning is harmless — safe to ignore.
# NOTE: because this runs at module level, importing the module loads the model
# and prints the two progress messages below.
print("Loading sentence-transformers model (one-time)...")
_EMBED_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
print("Model loaded OK.")
def _p(run_config: str) -> dict:
    """Build the artefact-path table for one run, creating its folder on demand.

    Args:
        run_config: Name of the run; used as the sub-directory of DATA_DIR.

    Returns:
        Dict mapping artefact keys ("dir", "sentences", "stats", "papers",
        "emb", "summaries", "charts", "themes", "taxonomy", "narrative",
        "comparison") to Path objects. Every entry lives inside the run
        folder except "comparison", which sits directly under DATA_DIR.
    """
    run_dir = DATA_DIR / run_config
    run_dir.mkdir(parents=True, exist_ok=True)

    # (key, file name) pairs for everything stored inside the run folder.
    per_run_files = (
        ("sentences", "sentences.json"),
        ("stats", "stats.json"),
        ("papers", "papers.csv"),
        ("emb", "emb.npy"),
        ("summaries", "summaries.json"),
        ("charts", "charts.json"),
        ("themes", "themes.json"),
        ("taxonomy", "taxonomy.json"),
        ("narrative", "narrative.txt"),
    )

    paths = {"dir": run_dir}
    for key, filename in per_run_files:
        paths[key] = run_dir / filename
    # Shared output: one comparison file for both runs.
    paths["comparison"] = DATA_DIR / "comparison.csv"
    return paths
# Maps each run name to the CSV column(s) analysed by that run.
RUN_CONFIGS = dict(abstract=["Abstract"], title=["Title"])

# Comprehensive boilerplate filter — catches © symbol + all major publishers.
# Each entry is a regex fragment; they are OR-ed together below.
BOILERPLATE_PATTERNS = [
    # -- copyright markers --
    r"\u00a9",   # © unicode
    r"\\u00a9",  # escaped unicode (the literal text "\u00a9")
    r"copyright\s*\d{4}",
    # -- Wiley / Blackwell --
    r"\d{4}\s+john wiley",
    r"john wiley\s*(&|and)\s*sons",
    r"blackwell\s*(publishing|pub)",
    r"wiley\s+periodicals",
    r"wiley\s+online",
    # -- generic legal / DOI lines --
    r"all rights reserved",
    r"doi\s*:\s*\S+",
    # -- other publishers --
    r"published by elsevier",
    r"elsevier\s*(b\.v|inc|ltd|science)",
    r"springer\s*(nature|verlag|science|link)",
    r"taylor\s*(&|and)\s*francis",
    r"informa\s+uk",
    r"sage\s+publications",
    r"information systems journal\s+published",
    r"emerald\s+(publishing|group)",
    # -- licence / permission phrases --
    r"this article is",
    r"rights reserved",
    r"permission from",
    r"reproduced with",
]
BOILERPLATE_RE = re.compile("|".join(BOILERPLATE_PATTERNS), flags=re.IGNORECASE)

# Extra keyword filter applied per-sentence (plain lowercase substring match).
PUBLISHER_KEYWORDS = frozenset({
    "wiley",
    "elsevier",
    "blackwell",
    "springer",
    "taylor",
    "information systems journal",
    "emerald",
})

# Category names offered to the LLM when mapping themes to the PAJAIS taxonomy.
PAJAIS_CATEGORIES = [
    "Information Systems Theory",
    "IS Strategy & Governance",
    "Digital Innovation",
    "Enterprise Systems",
    "AI & Intelligent Systems",
    "Big Data & Analytics",
    "Cybersecurity & Privacy",
    "Cloud Computing",
    "IS in Healthcare",
    "IS in Education",
    "E-Commerce & Digital Markets",
    "Social Media & Platforms",
    "Human-Computer Interaction",
    "IS Project Management",
    "IT Outsourcing",
    "Knowledge Management",
    "IS Development Methodologies",
    "Digital Transformation",
    "IS Ethics & Society",
    "IS in Developing Countries",
    "Mobile Computing",
    "IT Infrastructure",
    "IS Adoption & Diffusion",
    "IS Evaluation",
    "Organizational IS & Change",
]
def safe_read_csv(path):
    """Load a CSV into a DataFrame, decoding as UTF-8 and retrying as latin-1.

    Args:
        path: Location of the CSV file (anything pandas.read_csv accepts).

    Returns:
        The parsed pandas DataFrame.
    """
    try:
        frame = pd.read_csv(path, encoding="utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to the single-byte latin-1 decoding.
        frame = pd.read_csv(path, encoding="latin-1")
    return frame
def _is_clean(s: str) -> bool:
    """Decide whether a sentence is worth keeping for analysis.

    A sentence is rejected when it matches the boilerplate regex, starts with
    the © symbol, has 6 words or fewer, has 40 characters or fewer after
    stripping, or contains any publisher keyword (case-insensitive).

    Args:
        s: Candidate sentence.

    Returns:
        True when none of the rejection rules apply, False otherwise.
    """
    stripped = s.strip()
    lowered = s.lower().strip()

    if BOILERPLATE_RE.search(s):
        return False
    if stripped.startswith("\u00a9") or stripped.startswith("©"):
        return False
    # Too short to carry a meaningful statement.
    if len(s.split()) <= 6 or len(stripped) <= 40:
        return False
    for keyword in PUBLISHER_KEYWORDS:
        if keyword in lowered:
            return False
    return True
def _call_llm_json(llm, prompt: str) -> list:
    """Send one plain HumanMessage to the LLM and parse the reply as JSON.

    The reply may be bare JSON or JSON wrapped in markdown fences, with or
    without a ``json`` language tag. Fenced segments are tried first (last
    fence first), then the text outside the fences, and finally the whole
    reply, so a stray fence or backticks inside a JSON string do not break
    parsing.

    Args:
        llm: Chat model exposing ``invoke(messages)`` whose result has a
            string ``content`` attribute.
        prompt: Text sent as the single human message.

    Returns:
        The decoded JSON value. Callers in this module expect a list for
        array prompts and a dict for object prompts.

    Raises:
        json.JSONDecodeError: When no candidate segment is valid JSON.
    """
    response = llm.invoke([HumanMessage(content=prompt)])
    raw = response.content.strip()

    # Splitting on the fence marker puts fenced text at the odd indices.
    segments = raw.split("```")
    candidates = segments[1::2][::-1] + segments[0::2] + [raw]

    last_error = None
    for segment in candidates:
        candidate = segment.strip()
        # Drop the optional language tag that follows an opening fence.
        if candidate[:4].lower() == "json":
            candidate = candidate[4:].strip()
        if not candidate:
            continue
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as exc:
            last_error = exc

    if last_error is None:
        last_error = json.JSONDecodeError("No JSON payload found in LLM response", raw, 0)
    raise last_error
def _both_runs_complete() -> bool:
    """Report whether themes.json exists for the abstract AND the title run.

    Returns:
        True only when both runs have saved their themes file.
    """
    # Check the abstract run first; the title run is only inspected when the
    # abstract run is already complete.
    if not _p("abstract")["themes"].exists():
        return False
    return _p("title")["themes"].exists()
| # ============================================================================= | |
| # TOOL 1 — load_scopus_csv | |
| # Saves to data/uploaded.csv (permanent copy) AND data/{run_config}/papers.csv | |
| # ============================================================================= | |
def load_scopus_csv(csv_path: str, run_config: str = "abstract") -> str:
    """Ingest a Scopus CSV export and extract clean sentences for one run.

    The CSV is copied to data/uploaded.csv (unless it already is that file),
    the run's text column is located case-insensitively, rows without text are
    dropped, and every remaining text is split into sentences that pass
    ``_is_clean``. Writes sentences.json, stats.json and papers.csv into
    data/{run_config}/.

    Args:
        csv_path: Path to the uploaded Scopus CSV file.
        run_config: 'abstract' or 'title' (default 'abstract').

    Returns:
        JSON string with the run statistics (papers, sentences_after_filter,
        columns_used, run_config), or a JSON object with an "error" message
        when the text column cannot be found.
    """
    paths = _p(run_config)
    wanted_column = RUN_CONFIGS.get(run_config, ["Abstract"])[0]

    # Keep a permanent copy of the upload, skipping the copy when the caller
    # already points at that very file.
    upload_copy = DATA_DIR / "uploaded.csv"
    source_file = Path(csv_path).resolve()
    target_file = upload_copy.resolve()
    if source_file != target_file:
        shutil.copy(str(source_file), str(target_file))
    raw_frame = safe_read_csv(upload_copy)

    # Locate the text column: exact case-insensitive match first, then the
    # first header that merely contains the wanted name.
    needle = wanted_column.lower()
    headers_by_lower = {header.strip().lower(): header for header in raw_frame.columns}
    text_col = headers_by_lower.get(needle, None)
    if text_col is None:
        text_col = next(
            (header for header in raw_frame.columns if needle in header.lower()),
            None,
        )
    if text_col is None:
        return json.dumps({
            "error": "Column '{}' not found in CSV. Available columns: {}".format(
                wanted_column, list(raw_frame.columns)
            ),
            "run_config": run_config,
        })

    # Text column first, then the metadata columns that exist in this export.
    # The text column is never listed twice, which would otherwise turn the
    # column lookup below into a DataFrame instead of a Series.
    keep_cols = [text_col]
    for extra in ("Title", "Year", "Source title", "Cited by"):
        if extra != text_col and extra in raw_frame.columns:
            keep_cols.append(extra)
    frame = raw_frame[keep_cols].copy()

    texts = frame[text_col]
    if isinstance(texts, pd.DataFrame):
        # Duplicate header names in the CSV: use the first matching column.
        texts = texts.iloc[:, 0]

    has_text = texts.notna() & (texts.astype(str).str.strip() != "")
    frame = frame[has_text].copy()
    texts = texts[has_text]

    # Split after sentence-ending punctuation followed by whitespace.
    sentence_break = re.compile(r"(?<=[.!?])\s+")
    all_sentences = []
    for text in texts:
        for piece in sentence_break.split(str(text)):
            if _is_clean(piece):
                all_sentences.append(piece)

    stats = {
        "papers": int(len(frame)),
        "sentences_after_filter": int(len(all_sentences)),
        "columns_used": [text_col],
        "run_config": run_config,
    }
    paths["sentences"].write_text(json.dumps(all_sentences, ensure_ascii=False))
    paths["stats"].write_text(json.dumps(stats, ensure_ascii=False))
    frame.to_csv(paths["papers"], index=False)
    return json.dumps(stats)
| # ============================================================================= | |
| # TOOL 2 — run_bertopic_discovery | |
| # threshold=0.35 → ~100 fine-grained clusters; IDs renumbered 1..N | |
| # ============================================================================= | |
def run_bertopic_discovery(top_n_topics: int = 100, run_config: str = "abstract") -> str:
    """Embed the run's sentences, cluster them, and save topic summaries + charts.

    Sentences from data/{run_config}/sentences.json are embedded with the
    shared module-level model and clustered with AgglomerativeClustering
    (cosine distance, average linkage, distance_threshold=0.35). Clusters with
    at least 3 sentences are ranked by size and the largest ``top_n_topics``
    are kept and renumbered 1..N. Writes emb.npy, summaries.json and
    charts.json into the run folder.

    Args:
        top_n_topics: Maximum number of clusters to keep (default 100).
        run_config: 'abstract' or 'title' (default 'abstract').

    Returns:
        JSON string with topics_found, run_config, chart_types and note. When
        there are fewer than 2 sentences, or no cluster reaches 3 sentences,
        topics_found is 0, an "error" message is included, and summaries.json
        and charts.json are reset to empty.
    """
    p = _p(run_config)
    sentences = json.loads(p["sentences"].read_text())

    def finish_without_topics(reason):
        # Reset the outputs so later tools never read clusters left over from
        # an earlier run of the same run_config.
        p["summaries"].write_text(json.dumps([], indent=2, ensure_ascii=False))
        p["charts"].write_text(json.dumps({}))
        return json.dumps({
            "topics_found": 0,
            "run_config": run_config,
            "chart_types": [],
            "error": reason,
            "note": reason,
        })

    # Agglomerative clustering needs at least two samples.
    if len(sentences) < 2:
        return finish_without_topics(
            "Need at least 2 sentences to cluster; found {}.".format(len(sentences))
        )

    embeddings = _EMBED_MODEL.encode(
        sentences, normalize_embeddings=True,
        show_progress_bar=False, batch_size=64
    )
    np.save(p["emb"], embeddings)
    clustering = AgglomerativeClustering(
        metric="cosine", linkage="average",
        distance_threshold=0.35, n_clusters=None,
    )
    labels = clustering.fit_predict(embeddings)

    # One pass over the labels: sorted unique cluster ids plus their sizes.
    unique_labels, counts = np.unique(labels, return_counts=True)
    label_sizes = list(zip(unique_labels.tolist(), counts.tolist()))
    # Keep clusters with ≥3 sentences, sort by size desc (ties keep label
    # order because the sort is stable), take top N.
    big_enough = [pair for pair in label_sizes if pair[1] >= 3]
    ranked = sorted(big_enough, key=lambda pair: -pair[1])
    retained = [lb for lb, _size in ranked[:max(int(top_n_topics), 0)]]
    if not retained:
        return finish_without_topics(
            "No cluster reached the minimum size of 3 sentences."
        )

    def build_summary(seq_label):
        seq_id, raw_label = seq_label
        mask = labels == raw_label
        cluster_embs = embeddings[mask]
        raw_sents = [s for s, keep in zip(sentences, mask.tolist()) if keep]
        clean_sents = list(filter(_is_clean, raw_sents))
        sents = clean_sents if clean_sents else raw_sents[:5]
        # Evidence = the sentences closest to the cluster centroid.
        centroid = cluster_embs.mean(axis=0, keepdims=True)
        sims = cosine_similarity(centroid, cluster_embs)[0]
        top5_idx = sims.argsort()[-5:][::-1].tolist()
        raw_top = [raw_sents[i] for i in top5_idx]
        clean_set = set(sents)
        top_evidence = [s for s in raw_top if s in clean_set][:5]
        top_evidence = top_evidence if top_evidence else raw_top[:3]
        return {
            "topic_id": seq_id,
            "size": int(mask.sum()),
            "top_evidence": top_evidence,
            "sentences": sents,
            "centroid": centroid[0].tolist(),
            "run_config": run_config,
        }

    # Sequential IDs starting at 1
    summaries = [build_summary(pair) for pair in enumerate(retained, start=1)]
    p["summaries"].write_text(json.dumps(summaries, indent=2, ensure_ascii=False))

    sizes = [s["size"] for s in summaries]
    ids = [s["topic_id"] for s in summaries]
    fig1 = px.bar(x=ids, y=sizes, title="Topic Sizes — {}".format(run_config),
                  labels={"x": "Topic #", "y": "Sentences"})
    fig2 = px.histogram(x=sizes, nbins=30, title="Size Distribution — {}".format(run_config),
                        labels={"x": "Cluster Size"})

    centroids = np.array([s["centroid"] for s in summaries])
    if centroids.shape[0] < 2:
        # A single topic has no variance to project; plot it at the origin.
        coords = np.zeros((centroids.shape[0], 2))
    else:
        n_comp = min(2, centroids.shape[0], centroids.shape[1])
        coords = PCA(n_components=n_comp).fit_transform(centroids)
    fig3 = px.scatter(
        x=coords[:, 0],
        y=(coords[:, 1] if coords.shape[1] > 1 else [0] * len(coords)),
        text=list(map(str, ids)),
        title="Topic Centroids PCA — {}".format(run_config),
        labels={"x": "PC1", "y": "PC2"},
    )
    fig4 = px.treemap(
        names=list(map(str, ids)), parents=["Topics"] * len(ids),
        values=sizes, title="Treemap — {}".format(run_config),
    )
    # Only the first chart references the plotly.js CDN; the others are
    # exported without their own copy of the library.
    charts = {
        "bar": fig1.to_html(full_html=False, include_plotlyjs="cdn"),
        "histogram": fig2.to_html(full_html=False, include_plotlyjs=False),
        "scatter": fig3.to_html(full_html=False, include_plotlyjs=False),
        "treemap": fig4.to_html(full_html=False, include_plotlyjs=False),
    }
    p["charts"].write_text(json.dumps(charts))
    return json.dumps({
        "topics_found": len(summaries),
        "run_config": run_config,
        "chart_types": list(charts.keys()),
        "note": "Topics numbered 1..{}, threshold=0.35".format(len(summaries)),
    })
| # ============================================================================= | |
| # TOOL 3 — label_topics_with_llm | |
| # ============================================================================= | |
def label_topics_with_llm(batch_size: int = 15, run_config: str = "abstract") -> str:
    """Label topic clusters with human-readable names via Mistral LLM.

    Uses mistral-small-latest to stay within free-tier rate limits and sleeps
    12 seconds between batches to avoid HTTP 429 errors. Only the first 60
    summaries are labelled; the enriched list of those summaries replaces
    summaries.json, so any summaries beyond the first 60 are no longer in the
    file afterwards. Topics the LLM did not label, or labelled with an
    unusable id, fall back to the label "Topic <id>".

    Args:
        batch_size: Topics per LLM call (default 15; values below 1 are
            treated as 1).
        run_config: 'abstract' or 'title' (default 'abstract').

    Returns:
        JSON string with labelled_topics, with_llm_label and run_config.
    """
    import time

    p = _p(run_config)
    summaries = json.loads(p["summaries"].read_text())
    # Cap at 60 to reduce total API calls — covers the most meaningful clusters
    top_summaries = summaries[:60]
    # mistral-small has higher RPM limits than mistral-large on the free tier
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.2)
    # range() rejects a zero step, so never let the batch size drop below 1.
    step = max(1, int(batch_size))
    batch_starts = list(range(0, len(top_summaries), step))

    def label_batch(start):
        batch = top_summaries[start: start + step]
        # Only 2 evidence sentences per topic to reduce token usage
        mini = [
            {"topic_id": s["topic_id"], "sentences": s["top_evidence"][:2]}
            for s in batch
        ]
        topic_ids_in_batch = [s["topic_id"] for s in batch]
        prompt = (
            "You are a thematic analysis expert in Information Systems research.\n"
            "For each topic cluster below, provide:\n"
            " - label: a specific 3-6 word academic theme name (e.g. 'Digital Transformation Barriers', "
            "'AI Adoption in Healthcare', 'IS Project Management Challenges')\n"
            " - reasoning: one sentence explaining why you chose that label\n\n"
            "IMPORTANT: You MUST return exactly one entry for each topic_id in this list: "
            + str(topic_ids_in_batch) + "\n\n"
            "TOPICS:\n" + json.dumps(mini, indent=2) + "\n\n"
            "Return ONLY a raw JSON array with no markdown fences. "
            "Each element must have exactly these three keys: "
            "topic_id (integer matching the input), label (string), reasoning (string)."
        )
        return _call_llm_json(llm, prompt)

    # Sequential with sleep between batches — free tier ~5 req/min for mistral-small
    # 12 seconds between calls keeps us safely under the limit
    all_labels_raw = []
    for idx, start in enumerate(batch_starts):
        answer = label_batch(start)
        # The model occasionally returns a single object instead of an array.
        all_labels_raw.extend(answer if isinstance(answer, list) else [answer])
        if idx < len(batch_starts) - 1:
            time.sleep(12)

    def parse_topic_id(item):
        # LLM sometimes returns "1" (or 1.0) instead of 1; a missing or
        # non-numeric id yields None and the entry is ignored.
        try:
            return int(float(str(item.get("topic_id", "")).strip()))
        except (TypeError, ValueError, OverflowError):
            return None

    label_map = {}
    for item in all_labels_raw:
        if not isinstance(item, dict):
            continue
        tid = parse_topic_id(item)
        if tid is not None:
            label_map[tid] = item

    def enrich(s):
        tid = s["topic_id"]
        info = label_map.get(tid) or {}
        raw_label = str(info.get("label", "")).strip()
        # A null reasoning becomes an empty string rather than the text "None".
        raw_reason = str(info.get("reasoning") or "").strip()
        good_label = (
            raw_label
            if raw_label and raw_label.lower() not in ("", "n/a", "none", "null")
            else "Topic {}".format(tid)
        )
        return {**s, "label": good_label, "reasoning": raw_reason}

    enriched = [enrich(s) for s in top_summaries]
    p["summaries"].write_text(json.dumps(enriched, indent=2, ensure_ascii=False))
    # Labels starting with "Topic " are the fallback names, not LLM output.
    labelled_count = sum(
        1 for s in enriched
        if s.get("label", "").strip() and not s["label"].startswith("Topic ")
    )
    return json.dumps({
        "labelled_topics": len(enriched),
        "with_llm_label": labelled_count,
        "run_config": run_config,
    })
# =============================================================================
# TOOL 4 — consolidate_into_themes
# =============================================================================
def consolidate_into_themes(approved_groups: str, run_config: str = "abstract") -> str:
    """Merge approved topic groups into themes and recompute centroids.

    Each group's member topics are looked up in summaries.json; the theme
    centroid is the mean of the member centroids and the theme sentences are
    the members' sentences concatenated. Topic ids are accepted as integers or
    numeric strings. Ids that do not exist in summaries.json are skipped and
    reported; a group with no valid member produces no theme. themes.json is
    written only when at least one theme was built.

    Args:
        approved_groups: JSON list [{theme_name: str, topic_ids: [int,...]}]
        run_config: 'abstract' or 'title' (default 'abstract').

    Returns:
        JSON string with themes_created, theme_names, run_config,
        both_complete and unknown_topic_ids; an "error" message is added when
        no theme could be built.
    """
    p = _p(run_config)
    groups = json.loads(approved_groups)
    summaries = json.loads(p["summaries"].read_text())
    id_map = {int(s["topic_id"]): s for s in summaries}
    unknown_ids = []

    def resolve(tid):
        # Callers may pass "3" instead of 3; anything unusable is recorded.
        try:
            member = id_map.get(int(tid))
        except (TypeError, ValueError):
            member = None
        if member is None:
            unknown_ids.append(tid)
        return member

    def build_theme(group):
        members = [m for m in map(resolve, group["topic_ids"]) if m is not None]
        if not members:
            # Averaging zero centroids would yield NaN; drop the group instead.
            return None
        sents = [s for ms in members for s in ms.get("sentences", [])]
        centroids = np.array([ms["centroid"] for ms in members])
        return {
            "theme_name": group["theme_name"],
            "topic_ids": [ms["topic_id"] for ms in members],
            "sentences": sents,
            "centroid": centroids.mean(axis=0).tolist(),
            # NOTE: despite the key name this counts distinct sentences.
            "paper_count": len(set(sents)),
            "run_config": run_config,
        }

    themes = [t for t in map(build_theme, groups) if t is not None]
    if not themes:
        # Do not write an empty themes.json: its existence marks the run as
        # complete for the comparison step.
        return json.dumps({
            "themes_created": 0,
            "theme_names": [],
            "run_config": run_config,
            "both_complete": _both_runs_complete(),
            "unknown_topic_ids": unknown_ids,
            "error": "No theme could be built from the supplied topic ids.",
        })

    p["themes"].write_text(json.dumps(themes, indent=2, ensure_ascii=False))
    return json.dumps({
        "themes_created": len(themes),
        "theme_names": [t["theme_name"] for t in themes],
        "run_config": run_config,
        "both_complete": _both_runs_complete(),
        "unknown_topic_ids": unknown_ids,
    })
| # ============================================================================= | |
| # TOOL 5 — compare_with_taxonomy | |
| # ============================================================================= | |
def compare_with_taxonomy(run_config: str = "abstract") -> str:
    """Ask the Mistral LLM to map every saved theme to one PAJAIS category.

    Reads data/{run_config}/themes.json, sends each theme name with its first
    two sentences plus the PAJAIS category list to the model, and stores the
    parsed reply in data/{run_config}/taxonomy.json.

    Args:
        run_config: 'abstract' or 'title' (default 'abstract').

    Returns:
        JSON string with mapped_themes, run_config and both_complete.
    """
    paths = _p(run_config)
    themes = json.loads(paths["themes"].read_text())
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.1)

    # Two sample sentences per theme keep the prompt short.
    theme_mini = [
        {"name": theme["theme_name"], "sample": theme["sentences"][:2]}
        for theme in themes
    ]
    prompt_parts = [
        "You are a research classification expert in Information Systems.\n\n",
        "Map each theme to the single most relevant PAJAIS category.\n\n",
        "THEMES:\n",
        json.dumps(theme_mini, indent=2),
        "\n\n",
        "PAJAIS CATEGORIES:\n",
        json.dumps(PAJAIS_CATEGORIES, indent=2),
        "\n\n",
        "Return ONLY a raw JSON array. ",
        "Each element: name, pajais_category, confidence, rationale. ",
        "No markdown, no explanation.",
    ]
    mapping = _call_llm_json(llm, "".join(prompt_parts))

    paths["taxonomy"].write_text(json.dumps(mapping, indent=2, ensure_ascii=False))
    summary = {
        "mapped_themes": len(mapping),
        "run_config": run_config,
        "both_complete": _both_runs_complete(),
    }
    return json.dumps(summary)
| # ============================================================================= | |
| # TOOL 6 — generate_comparison_csv | |
| # ONLY runs when BOTH abstract and title runs are complete | |
| # Columns: Title | Abstract | Year | Source Journal | |
| # ============================================================================= | |
def generate_comparison_csv() -> str:
    """Produce the comparison CSV once both runs have saved their themes.

    Returns:
        The JSON summary from ``_do_generate_comparison_csv`` when themes.json
        exists for both the abstract and the title run; otherwise a plain-text
        status message reporting which run is complete.
    """
    abstract_ready = _p("abstract")["themes"].exists()
    title_ready = _p("title")["themes"].exists()

    if abstract_ready and title_ready:
        return _do_generate_comparison_csv()

    # Not ready yet: report the state of each run instead.
    template = (
        "Abstract complete: {}, Title complete: {}. "
        "Run 'run title' to complete the title analysis first."
    )
    return template.format(abstract_ready, title_ready)
def _assign_theme_for_text(text: str, themes: list, taxonomy_map: dict) -> tuple:
    """Pick the theme whose centroid is most similar to a piece of text.

    The text (title or abstract) is embedded with the shared model and
    compared to every theme centroid by cosine similarity.

    Args:
        text: Text to classify; converted with ``str`` before embedding.
        themes: Theme dicts, each providing "centroid" and "theme_name".
        taxonomy_map: Theme name -> PAJAIS category lookup.

    Returns:
        A 3-tuple (theme_name, pajais_category, similarity) where
        pajais_category is "Unknown" when the theme is not in taxonomy_map and
        similarity is the best cosine score rounded to 4 decimals.
    """
    query = _EMBED_MODEL.encode([str(text)], normalize_embeddings=True)[0]
    centroid_matrix = np.array([theme["centroid"] for theme in themes])
    scores = cosine_similarity(query.reshape(1, -1), centroid_matrix)[0]

    winner = int(scores.argmax())
    winner_name = themes[winner]["theme_name"]
    category = taxonomy_map.get(winner_name, "Unknown")
    score = float(round(scores[winner], 4))
    return winner_name, category, score
def _do_generate_comparison_csv() -> str:
    """Build enriched comparison CSV with per-paper theme assignments for both runs.

    Every paper in data/uploaded.csv gets the best-matching abstract theme and
    title theme (by centroid similarity), and the LLM is asked once per
    distinct (abstract theme, title theme) pair to rate their similarity.
    A run whose taxonomy.json does not exist yet is treated as having no
    PAJAIS mapping, so its categories come out as "Unknown".

    Columns:
        Title | Title Theme | Title PAJAIS Category |
        Abstract | Abstract Theme | Abstract PAJAIS Category |
        Year | Source Journal |
        Theme Similarity | Similarity % | Similarity Reasoning

    Returns:
        JSON string with rows, columns, path, abstract_themes, title_themes
        and note. The CSV is written to data/comparison.csv.
    """
    import time

    df = safe_read_csv(DATA_DIR / "uploaded.csv")

    def find_col(predicate):
        return next(filter(predicate, df.columns), None)

    # Detect columns
    title_col = find_col(lambda c: c.strip().lower() == "title")
    abstract_col = find_col(lambda c: c.strip().lower() == "abstract")
    year_col = find_col(lambda c: c.strip().lower() == "year")
    # Prefer the exact journal column; fall back to any "source" header.
    journal_col = (
        find_col(lambda c: c.strip().lower() == "source title")
        or find_col(lambda c: "source" in c.lower())
    )

    def load_run(run_config):
        # Themes are required; the taxonomy is optional because the caller
        # only checks that themes.json exists before invoking this function.
        paths = _p(run_config)
        themes = json.loads(paths["themes"].read_text())
        taxonomy = (
            json.loads(paths["taxonomy"].read_text())
            if paths["taxonomy"].exists()
            else []
        )
        tax_map = {
            item.get("name", item.get("theme_name", "")): item.get("pajais_category", "")
            for item in taxonomy
            if isinstance(item, dict)
        }
        return themes, tax_map

    abs_themes, abs_tax_map = load_run("abstract")
    title_themes, title_tax_map = load_run("title")
    abs_theme_names = [t["theme_name"] for t in abs_themes]
    title_theme_names = [t["theme_name"] for t in title_themes]

    def assign_all(texts, themes, tax_map):
        # Embed each distinct text once; identical texts (e.g. blanks for a
        # missing column) share the same assignment.
        cache = {}
        assignments = []
        for text in texts:
            key = str(text)
            if key not in cache:
                cache[key] = _assign_theme_for_text(key, themes, tax_map)
            assignments.append(cache[key])
        return assignments

    abstracts = list(df[abstract_col].fillna("") if abstract_col else [""] * len(df))
    titles = list(df[title_col].fillna("") if title_col else [""] * len(df))
    abs_assignments = assign_all(abstracts, abs_themes, abs_tax_map)
    title_assignments = assign_all(titles, title_themes, title_tax_map)

    # Use LLM to compute similarity reasoning between matched theme pairs
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.1)
    # Get unique theme pairs — call LLM once per pair, not once per paper.
    # dict.fromkeys keeps first-seen order, so the call order is reproducible.
    unique_pairs = list(dict.fromkeys(
        (a[0], t[0]) for a, t in zip(abs_assignments, title_assignments)
    ))

    def get_similarity_reasoning(pair):
        abs_theme, title_theme = pair
        abs_pajais = abs_tax_map.get(abs_theme, "Unknown")
        title_pajais = title_tax_map.get(title_theme, "Unknown")
        prompt = (
            "Compare these two research themes and assess their similarity:\n"
            "Abstract Theme: {} (PAJAIS: {})\n"
            "Title Theme: {} (PAJAIS: {})\n\n"
            "Return ONLY a raw JSON object with three keys:\n"
            " similarity_label: one of High/Medium/Low\n"
            " similarity_pct: integer 0-100\n"
            " reasoning: one sentence explaining the similarity or difference\n"
            "No markdown, no explanation, just the JSON object."
        ).format(abs_theme, abs_pajais, title_theme, title_pajais)
        return _call_llm_json(llm, prompt)

    # Sequential with sleep to respect rate limits
    pair_map = {}
    for idx, pair in enumerate(unique_pairs):
        verdict = get_similarity_reasoning(pair)
        # Anything other than a JSON object leaves the similarity cells blank.
        pair_map[pair] = verdict if isinstance(verdict, dict) else {}
        if idx < len(unique_pairs) - 1:
            time.sleep(8)

    # Build output rows
    rows = []
    for idx, (abs_hit, title_hit) in enumerate(zip(abs_assignments, title_assignments)):
        a_theme, a_pajais, _a_sim = abs_hit
        t_theme, t_pajais, _t_sim = title_hit
        sim_info = pair_map.get((a_theme, t_theme), {})
        rows.append({
            "Title": titles[idx],
            "Title Theme": t_theme,
            "Title PAJAIS Category": t_pajais,
            "Abstract": abstracts[idx],
            "Abstract Theme": a_theme,
            "Abstract PAJAIS Category": a_pajais,
            "Year": str(df[year_col].iloc[idx]) if year_col else "",
            "Source Journal": str(df[journal_col].iloc[idx]) if journal_col else "",
            "Theme Similarity": sim_info.get("similarity_label", ""),
            "Similarity %": str(sim_info.get("similarity_pct", "")),
            "Similarity Reasoning": sim_info.get("reasoning", ""),
        })

    out_df = pd.DataFrame(rows)
    dest = DATA_DIR / "comparison.csv"
    # utf-8-sig writes a BOM at the start of the file.
    out_df.to_csv(dest, index=False, encoding="utf-8-sig")
    return json.dumps({
        "rows": len(out_df),
        "columns": list(out_df.columns),
        "path": str(dest),
        "abstract_themes": abs_theme_names,
        "title_themes": title_theme_names,
        "note": "Enriched comparison CSV with per-paper theme + PAJAIS + similarity",
    })
| # ============================================================================= | |
| # TOOL 7 — export_narrative | |
| # ONLY runs when BOTH abstract and title runs are complete | |
| # ============================================================================= | |
def export_narrative() -> str:
    """Write the Section 7 narrative once both runs have a taxonomy mapping.

    Returns:
        The JSON summary from ``_do_export_narrative`` when taxonomy.json
        exists for both the abstract and the title run; otherwise a plain-text
        message stating which taxonomy files are present.
    """
    abstract_taxonomy = _p("abstract")["taxonomy"]
    title_taxonomy = _p("title")["taxonomy"]

    if abstract_taxonomy.exists() and title_taxonomy.exists():
        return _do_export_narrative()

    # At least one run is missing its taxonomy: explain what is still needed.
    not_ready = (
        "Narrative cannot be generated yet. "
        "Abstract taxonomy complete: {}. Title taxonomy complete: {}. "
        "Complete both runs through Phase 5.5 first."
    )
    return not_ready.format(abstract_taxonomy.exists(), title_taxonomy.exists())
def _do_export_narrative() -> str:
    """Generate the narrative from both runs' themes and taxonomy mappings.

    Loads themes.json and taxonomy.json for the abstract and title runs, asks
    the Mistral LLM for an approximately 500-word discussion section, and
    writes the reply to data/narrative.txt.

    Returns:
        JSON string with word_count, path and note.
    """
    abstract_paths = _p("abstract")
    abs_themes = json.loads(abstract_paths["themes"].read_text())
    title_themes = json.loads(_p("title")["themes"].read_text())
    abs_taxonomy = json.loads(_p("abstract")["taxonomy"].read_text())
    title_taxonomy = json.loads(_p("title")["taxonomy"].read_text())
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.4)

    def condense(themes):
        # The prompt only needs each theme's name and how many sentences it holds.
        return [
            {"name": theme["theme_name"], "sentences": len(theme["sentences"])}
            for theme in themes
        ]

    abs_summary = condense(abs_themes)
    title_summary = condense(title_themes)

    prompt = "".join([
        "You are an academic writing expert in Information Systems.\n\n",
        "Write Section 7 (Discussion and Thematic Synthesis) of a systematic ",
        "literature review paper. Approximately 500 words, formal academic prose.\n",
        "Cover:\n",
        "(a) Overview of themes from abstract analysis\n",
        "(b) Overview of themes from title analysis\n",
        "(c) Comparison: what themes appear in both vs only one\n",
        "(d) PAJAIS taxonomy mapping and implications\n",
        "(e) Implications for IS research and practice\n",
        "(f) Limitations\n\n",
        "ABSTRACT THEMES:\n", json.dumps(abs_summary, indent=2), "\n\n",
        "TITLE THEMES:\n", json.dumps(title_summary, indent=2), "\n\n",
        "ABSTRACT PAJAIS MAPPING:\n", json.dumps(abs_taxonomy, indent=2), "\n\n",
        "TITLE PAJAIS MAPPING:\n", json.dumps(title_taxonomy, indent=2), "\n\n",
        "Write in continuous academic paragraphs. No bullet points or headers.",
    ])

    narrative_text = llm.invoke([HumanMessage(content=prompt)]).content
    # The narrative is shared by both runs, so it lives directly under DATA_DIR.
    dest = DATA_DIR / "narrative.txt"
    dest.write_text(narrative_text, encoding="utf-8")
    return json.dumps({
        "word_count": len(narrative_text.split()),
        "path": str(dest),
        "note": "Narrative combines both abstract and title run themes",
    })