Spaces:
Runtime error
Runtime error
| """ | |
tools.py — 7 @tool functions for BERTopic Agentic Thematic Analysis
Generated for: Braun & Clarke (2006) 6-Phase Framework Pipeline
| """ | |
| import json | |
| import os | |
| import re | |
| import numpy as np | |
| import pandas as pd | |
| import plotly.graph_objects as go | |
| import plotly.express as px | |
| from plotly.subplots import make_subplots | |
| from langchain_core.tools import tool | |
| from langchain_core.prompts import PromptTemplate | |
| from langchain_core.output_parsers import JsonOutputParser | |
| from langchain_mistralai import ChatMistralAI | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.cluster import AgglomerativeClustering | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from bertopic import BERTopic | |
| import nltk | |
| nltk.download('punkt', quiet=True) | |
| nltk.download('punkt_tab', quiet=True) | |
| from nltk.tokenize import sent_tokenize | |
# ── Constants ──────────────────────────────────────────────────────────────────
# Name of the sentence-embedding model handed to SentenceTransformer.
EMBED_MODEL = "all-MiniLM-L6-v2"
# Folder that receives every checkpoint file written by this module.
CHECKPOINT_DIR = "checkpoints"
# How many sentences nearest to a centroid are kept as evidence.
NEAREST_K = 5
# Maximum number of topics sent to the LLM for labelling.
MAX_LABEL_TOPICS = 100

# Make sure the checkpoint folder exists as soon as the module is imported.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
# The 25 taxonomy categories that themes are mapped against in
# compare_with_taxonomy. Order matters: the list is numbered when it is
# rendered into the LLM prompt.
PAJAIS_CATEGORIES = [
    "Artificial Intelligence & Machine Learning",
    "Big Data & Analytics",
    "Blockchain & Distributed Ledger",
    "Business Intelligence & Decision Support",
    "Cloud Computing & Infrastructure",
    "Cybersecurity & Privacy",
    "Digital Transformation & Strategy",
    "E-Commerce & Digital Markets",
    "Enterprise Systems & ERP",
    "Ethics & Governance in IS",
    "Health Informatics & eHealth",
    "Human-Computer Interaction & UX",
    "Information Systems Theory & Foundations",
    "Internet of Things & Cyber-Physical Systems",
    "IS Education & Curriculum",
    "IS in Developing Countries",
    "IS Project Management & Implementation",
    "Knowledge Management & Organizational Learning",
    "Mobile & Ubiquitous Computing",
    "Natural Language Processing & Text Mining",
    "Open Source & Collaborative Systems",
    "Platforms & Ecosystems",
    "Social Media & Online Communities",
    "Supply Chain & Logistics IS",
    "Virtual Reality & Immersive Technologies",
]
# Regular expressions (applied case-insensitively by _clean_text) that strip
# publisher boilerplate, section headers, signposting phrases and discourse
# connectives before sentences are embedded. Each match is replaced by a
# single space; the final pattern collapses the resulting whitespace runs.
#
# Non-ASCII characters are written as \u escapes so the patterns survive any
# re-encoding of this file: \u00a9 is the copyright sign, \u2013 / \u2014 are
# the en and em dashes that commonly follow "Abstract" / "Introduction".
BOILERPLATE_PATTERNS = [
    r"\u00a9\s*\d{4}.*?(elsevier|springer|wiley|taylor|emerald|sage|ieee|acm|informs).*?\.",
    r"all rights reserved\.?",
    r"published by.*?\.",
    r"doi:\s*\S+",
    r"http[s]?://\S+",
    r"this article is protected by copyright.*?\.",
    r"please cite this article.*?\.",
    r"accepted manuscript.*?\.",
    r"preprint.*?\.",
    r"peer.reviewed.*?\.",
    r"received:\s*\d+.*?accepted:\s*\d+.*?\.",
    r"keywords:.*?\.",
    r"jel classification.*?\.",
    r"abstract[-\u2013\u2014]?\s*",
    r"introduction[-\u2013\u2014]?\s*$",
    r"in this (paper|study|article|research).*?we (propose|present|examine|investigate|explore)",
    r"the purpose of this (paper|study|article)",
    r"this (paper|study|article) (aims|seeks|investigates|examines|explores|presents)",
    r"we (propose|present|examine|investigate|explore)",
    r"\b(furthermore|moreover|however|nevertheless|therefore|thus|hence)\b",
    r"^\s*\d+\s*$",
    r"\s{2,}",
]
# ── Helpers ────────────────────────────────────────────────────────────────────
# Per-process memo of the loaded Scopus dataframe (stored under the key "df").
_df_cache: dict = dict()
# Per-process memo of embedding matrices (stored under "<run_key>_emb").
_embeddings_cache: dict = dict()
def _get_llm():
    """Create the Mistral chat client shared by the LLM-backed tools.

    The API key comes from the ``MISTRAL_API_KEY`` environment variable and
    falls back to an empty string when the variable is not set.
    """
    mistral_key = os.environ.get("MISTRAL_API_KEY", "")
    client_options = {
        "model": "mistral-large-latest",
        "temperature": 0.1,
        "api_key": mistral_key,
    }
    return ChatMistralAI(**client_options)
def _clean_text(text: str) -> str:
    """Strip boilerplate from *text* and return the trimmed remainder.

    Every pattern in ``BOILERPLATE_PATTERNS`` is applied in list order,
    case-insensitively, with each match replaced by one space. Anything that
    is not a ``str`` yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    scrubbed = text
    for pattern in BOILERPLATE_PATTERNS:
        scrubbed = re.sub(pattern, " ", scrubbed, flags=re.IGNORECASE)
    return scrubbed.strip()
def _load_df() -> pd.DataFrame:
    """Return the Scopus dataframe, reading the CSV checkpoint on first use.

    Raises:
        FileNotFoundError: when nothing is cached and ``scopus_data.csv`` is
            absent from the checkpoint directory.
    """
    try:
        return _df_cache["df"]
    except KeyError:
        pass  # not cached yet; fall through to the on-disk checkpoint

    csv_path = os.path.join(CHECKPOINT_DIR, "scopus_data.csv")
    if not os.path.exists(csv_path):
        raise FileNotFoundError("No CSV loaded. Please upload your Scopus CSV first.")
    frame = pd.read_csv(csv_path)
    _df_cache["df"] = frame
    return frame
def _get_sentences(run_key: str) -> list[str]:
    """Collect the cleaned text units that will be embedded for *run_key*.

    For ``'abstract'`` each abstract is cleaned, split into sentences and only
    sentences longer than 30 characters are kept. For any other key the
    cleaned cell is kept whole when it is longer than 10 characters. The
    column read is ``Title`` for ``'title'`` and ``Abstract`` otherwise.
    """
    frame = _load_df()
    mode = run_key.lower()
    column = {"abstract": "Abstract", "title": "Title"}.get(mode, "Abstract")
    split_into_sentences = mode == "abstract"

    collected = []
    for raw in frame[column].dropna():
        cleaned = _clean_text(str(raw))
        if split_into_sentences:
            pieces = (piece.strip() for piece in sent_tokenize(cleaned))
            collected.extend([piece for piece in pieces if len(piece) > 30])
        elif len(cleaned.strip()) > 10:
            collected.append(cleaned.strip())
    return collected
def _embed(sentences: list[str], run_key: str) -> np.ndarray:
    """Return one normalised embedding row per sentence, cached per run key.

    Lookup order is the in-memory cache, then ``<run_key>_emb.npy`` in the
    checkpoint directory, then a fresh encode with ``EMBED_MODEL``. A cached
    matrix is only reused when its row count equals ``len(sentences)``;
    otherwise it is treated as stale (for example left over from a previously
    loaded CSV) and recomputed, so that row ``i`` always belongs to
    ``sentences[i]``. The row-count check cannot detect a different corpus
    that happens to have the same number of sentences.

    Args:
        sentences: Text units to embed, in the order rows should appear.
        run_key: Namespace for the cache entry and the ``.npy`` checkpoint.

    Returns:
        The embedding matrix produced by ``SentenceTransformer.encode``.
    """
    cache_key = f"{run_key}_emb"
    emb_path = os.path.join(CHECKPOINT_DIR, f"{run_key}_emb.npy")
    expected_rows = len(sentences)

    def _matches(matrix) -> bool:
        # Usable only if it is a 2-D matrix with exactly one row per sentence.
        return (
            matrix is not None
            and getattr(matrix, "ndim", 0) == 2
            and matrix.shape[0] == expected_rows
        )

    cached = _embeddings_cache.get(cache_key)
    if _matches(cached):
        return cached

    if os.path.exists(emb_path):
        on_disk = np.load(emb_path)
        if _matches(on_disk):
            _embeddings_cache[cache_key] = on_disk
            return on_disk

    # Nothing reusable: encode from scratch and refresh both cache layers.
    model = SentenceTransformer(EMBED_MODEL)
    emb = model.encode(sentences, normalize_embeddings=True, show_progress_bar=False)
    np.save(emb_path, emb)
    _embeddings_cache[cache_key] = emb
    return emb
def _save_json(data, filename: str):
    """Write *data* as pretty-printed UTF-8 JSON into the checkpoint directory.

    Non-ASCII characters are written as-is rather than escaped.
    """
    target = os.path.join(CHECKPOINT_DIR, filename)
    with open(target, "w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=2, ensure_ascii=False)
def _load_json(filename: str):
    """Read a JSON checkpoint by file name.

    Returns the parsed content, or ``None`` when the file does not exist in
    the checkpoint directory.
    """
    target = os.path.join(CHECKPOINT_DIR, filename)
    if os.path.exists(target):
        with open(target, "r", encoding="utf-8") as handle:
            return json.load(handle)
    return None
# ── Tool 1: Load CSV ───────────────────────────────────────────────────────────
def load_scopus_csv(filepath: str) -> str:
    """
    Load a Scopus CSV export and prepare it for topic modelling.
    Returns paper count, abstract sentence count, and title sentence count.
    Saves the cleaned dataframe as a checkpoint for subsequent tools.

    Missing ``Abstract`` / ``Title`` columns are counted as zero sentences and
    an unusable ``Year`` column yields a year range of "N/A" instead of an
    exception.

    Args:
        filepath: Path to the uploaded Scopus CSV file.

    Returns:
        A human-readable status message summarising the loaded data.
    """
    df = pd.read_csv(filepath)

    # Normalize column names: trim whitespace, then map case-insensitive
    # variants of the expected Scopus headers onto their canonical spelling.
    df.columns = [c.strip() for c in df.columns]
    expected = ["Authors", "Title", "Abstract", "Author Keywords", "Cited by", "Source title", "Year"]
    for col in expected:
        if col not in df.columns:
            matches = [c for c in df.columns if c.lower() == col.lower()]
            if matches:
                df.rename(columns={matches[0]: col}, inplace=True)

    # Save checkpoint and prime the in-memory cache for the other tools.
    save_path = os.path.join(CHECKPOINT_DIR, "scopus_data.csv")
    df.to_csv(save_path, index=False)
    _df_cache["df"] = df

    n_papers = len(df)

    # Count abstract sentences using the same >30-character filter that
    # _get_sentences applies.
    abstract_sents = 0
    for text in df.get("Abstract", pd.Series(dtype=object)).dropna():
        cleaned = _clean_text(str(text))
        abstract_sents += sum(1 for s in sent_tokenize(cleaned) if len(s.strip()) > 30)

    # Count title sentences: one per non-empty title, zero without the column.
    title_sents = len(df["Title"].dropna()) if "Title" in df.columns else 0

    # Year range: coerce to numbers so blanks or stray text cannot crash int().
    year_range = "N/A"
    if "Year" in df.columns:
        years = pd.to_numeric(df["Year"], errors="coerce").dropna()
        if not years.empty:
            year_range = f"{int(years.min())} β {int(years.max())}"

    # Save summaries checkpoint (Phase 1 marker).
    summary_data = {
        "n_papers": n_papers,
        "abstract_sentences": abstract_sents,
        "title_sentences": title_sents,
        "columns": list(df.columns),
        "year_range": year_range,
    }
    _save_json(summary_data, "summaries.json")

    return (
        f"β CSV loaded successfully!\n\n"
        f"π Papers: {n_papers:,}\n"
        f"π Abstract sentences (after cleaning): {abstract_sents:,}\n"
        f"π·οΈ Title sentences: {title_sents:,}\n"
        f"π Year range: {summary_data['year_range']}\n"
        f"π Columns detected: {', '.join(df.columns.tolist())}\n\n"
        f"Phase 1 (Familiarisation) complete. Type 'run abstract' to begin Phase 2."
    )
# ── Tool 2: Run BERTopic ───────────────────────────────────────────────────────
def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
    """
    Run BERTopic clustering on either abstracts or titles.
    Uses SentenceTransformer embeddings in 384-dimensional space with
    AgglomerativeClustering (cosine metric, average linkage). No UMAP reduction.
    Generates 4 interactive Plotly charts. Saves summaries.json and emb.npy.

    The distance threshold is stored next to the topics in the summaries
    checkpoint so that later phases can reproduce the same clustering. Small
    corpora are supported: a single sentence becomes one topic, and the charts
    adapt to however many topics exist.

    Args:
        run_key: Either 'abstract' or 'title'
        threshold: AgglomerativeClustering distance threshold (default 0.7)

    Returns:
        A human-readable status message with topic and sentence counts.
    """
    sentences = _get_sentences(run_key)
    if not sentences:
        return f"No sentences found for run_key='{run_key}'. Check your CSV."

    # Embed
    emb = _embed(sentences, run_key)

    # Cluster in 384d (no UMAP)
    if len(sentences) < 2:
        # AgglomerativeClustering refuses fewer than two samples; a lone
        # sentence simply forms its own topic.
        labels = np.zeros(len(sentences), dtype=int)
    else:
        clustering = AgglomerativeClustering(
            metric="cosine",
            linkage="average",
            distance_threshold=threshold,
            n_clusters=None,
        )
        labels = clustering.fit_predict(emb)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

    # Group sentence indices by topic id (plain ints so they round-trip JSON).
    topics = {}
    for idx, label in enumerate(labels):
        if label == -1:
            continue
        topics.setdefault(int(label), []).append(idx)

    # For every topic keep the NEAREST_K sentences closest to its centroid.
    # Centroids are remembered because the charts below reuse them.
    centroid_of = {}
    topic_summaries = []
    for topic_id, sent_indices in sorted(topics.items()):
        topic_embs = emb[sent_indices]
        centroid = topic_embs.mean(axis=0)
        centroid_of[topic_id] = centroid
        sims = cosine_similarity(centroid.reshape(1, -1), topic_embs)[0]
        top_k = np.argsort(sims)[::-1][:NEAREST_K]
        topic_summaries.append({
            "topic_id": int(topic_id),
            "count": len(sent_indices),
            "top_sentences": [sentences[sent_indices[i]] for i in top_k],
            "label": None,
            "category": None,
            "confidence": None,
            "reasoning": None,
            "niche": None,
            "approve": "",
            "rename_to": "",
            "user_reasoning": "",
        })

    # Sort by size descending (stable, so ties stay in topic-id order).
    topic_summaries.sort(key=lambda x: x["count"], reverse=True)

    # Save checkpoint, including the threshold used for this clustering.
    payload = {"run_key": run_key, "threshold": float(threshold), "topics": topic_summaries}
    _save_json(payload, f"{run_key}_summaries.json")
    _save_json(payload, "summaries.json")

    # ── Generate Plotly charts ────────────────────────────────────────────────
    # Intertopic distance map (using PCA-projected centroids of the 50 largest topics)
    from sklearn.decomposition import PCA

    map_topics = topic_summaries[:50]
    centroids = [centroid_of[t["topic_id"]] for t in map_topics]
    sizes = [t["count"] for t in map_topics]
    ids = [t["topic_id"] for t in map_topics]
    if len(centroids) >= 2:
        coords = PCA(n_components=2).fit_transform(np.array(centroids))
    else:
        # PCA cannot produce two components from a single point; plot it at the origin.
        coords = np.zeros((len(centroids), 2))

    # Chart 1: Intertopic Map
    fig1 = go.Figure()
    fig1.add_trace(go.Scatter(
        x=coords[:, 0], y=coords[:, 1],
        mode="markers+text",
        marker=dict(size=[max(10, s / 2) for s in sizes], color=sizes,
                    colorscale="Viridis", showscale=True,
                    colorbar=dict(title="Sentences")),
        text=[f"T{i}" for i in ids],
        textposition="top center",
        hovertext=[f"Topic {i}<br>{s} sentences" for i, s in zip(ids, sizes)],
    ))
    fig1.update_layout(title=f"Intertopic Distance Map β {run_key.title()} ({n_clusters} topics)",
                       template="plotly_dark", height=500,
                       xaxis_title="PC1", yaxis_title="PC2")

    # Chart 2: Topic Size Bar Chart
    top_n = topic_summaries[:30]
    fig2 = px.bar(
        x=[t["count"] for t in top_n],
        y=[f"Topic {t['topic_id']}" for t in top_n],
        orientation="h",
        color=[t["count"] for t in top_n],
        color_continuous_scale="Plasma",
        title=f"Top 30 Topics by Size β {run_key.title()}",
        labels={"x": "Sentence Count", "y": "Topic"},
    )
    fig2.update_layout(template="plotly_dark", height=600)

    # Chart 3: Topic Hierarchy (dendrogram-style via heatmap of similarity).
    # Uses up to 20 topics; with fewer topics the matrix is simply smaller.
    heat_topics = topic_summaries[:20]
    heat_centroids = np.array([centroid_of[t["topic_id"]] for t in heat_topics])
    sim_matrix = cosine_similarity(heat_centroids)
    heat_labels = [f"T{t['topic_id']}" for t in heat_topics]
    fig3 = go.Figure(go.Heatmap(
        z=sim_matrix,
        x=heat_labels,
        y=heat_labels,
        colorscale="RdBu", zmin=0, zmax=1,
    ))
    fig3.update_layout(title=f"Topic Similarity Heatmap (Top 20) β {run_key.title()}",
                       template="plotly_dark", height=500)

    # Chart 4: Sentence distribution
    fig4 = px.histogram(
        x=[t["count"] for t in topic_summaries],
        nbins=30, title=f"Topic Size Distribution β {run_key.title()}",
        labels={"x": "Sentences per Topic", "y": "Number of Topics"},
        color_discrete_sequence=["#7C3AED"],
    )
    fig4.update_layout(template="plotly_dark", height=400)

    # Save charts as embeddable HTML fragments.
    charts = {
        "intertopic": fig1.to_html(include_plotlyjs="cdn", full_html=False),
        "bars": fig2.to_html(include_plotlyjs="cdn", full_html=False),
        "heatmap": fig3.to_html(include_plotlyjs="cdn", full_html=False),
        "distribution": fig4.to_html(include_plotlyjs="cdn", full_html=False),
    }
    _save_json(charts, f"{run_key}_charts.json")

    return (
        f"β BERTopic clustering complete for {run_key}!\n\n"
        f"π’ Topics discovered: {n_clusters}\n"
        f"π Sentences processed: {len(sentences):,}\n"
        f"π Embedding dimensions: 384 (no UMAP reduction)\n"
        f"π Distance threshold: {threshold}\n\n"
        f"4 interactive charts saved. Calling label_topics_with_llm next..."
    )
# ── Tool 3: Label Topics ───────────────────────────────────────────────────────
def label_topics_with_llm(run_key: str) -> str:
    """
    Send top topics to Mistral LLM for labeling with research area names,
    categories, confidence scores, reasoning, and niche flag.
    Saves labels.json checkpoint.

    Only the first ``MAX_LABEL_TOPICS`` topics are sent to the model; topics
    the model did not label receive placeholder values. The reported count
    covers only topics that actually received an LLM label.

    Args:
        run_key: Either 'abstract' or 'title'

    Returns:
        A human-readable status message, or a hint to run discovery first.
    """
    data = _load_json(f"{run_key}_summaries.json") or _load_json("summaries.json")
    # The generic summaries.json may still be the Phase-1 summary, which has
    # no "topics" key; treat that the same as a missing checkpoint.
    if not data or not data.get("topics"):
        return "No topic summaries found. Run run_bertopic_discovery first."

    all_topics = data["topics"]
    topics = all_topics[:MAX_LABEL_TOPICS]
    llm = _get_llm()

    # Build prompt payload: up to three evidence sentences per topic, capped
    # at 300 characters.
    topic_texts = []
    for t in topics:
        sents = " | ".join(t["top_sentences"][:3])
        topic_texts.append(f"Topic {t['topic_id']} ({t['count']} sentences): {sents[:300]}")

    prompt_template = PromptTemplate.from_template(
        """You are an expert academic researcher specializing in Information Systems and Computer Science.
Analyze these research topics extracted from journal abstracts/titles and label each one.
Topics:
{topics}
For each topic, respond with a JSON array. Each element must have:
- topic_id: integer
- label: concise research area name (3-7 words)
- category: broad category (e.g., "AI & ML", "HCI", "Security", "Data Management")
- confidence: float 0.0-1.0
- reasoning: one sentence explaining the label
- niche: boolean (true if highly specialized/narrow)
Respond ONLY with a valid JSON array. No markdown, no preamble, no explanation."""
    )
    parser = JsonOutputParser()
    chain = prompt_template | llm | parser
    result = chain.invoke({"topics": "\n".join(topic_texts)})

    # Normalise the model output: tolerate an object wrapping the array,
    # non-dict entries, and topic ids returned as strings.
    if isinstance(result, dict):
        result = next((v for v in result.values() if isinstance(v, list)), [])
    label_map = {}
    for item in result or []:
        if not isinstance(item, dict):
            continue
        try:
            label_map[int(item["topic_id"])] = item
        except (KeyError, TypeError, ValueError):
            continue

    # Merge labels back into topics
    labeled_topics = []
    for t in all_topics:
        lbl = label_map.get(t["topic_id"], {})
        labeled_topics.append({
            **t,
            "label": lbl.get("label", f"Topic {t['topic_id']}"),
            "category": lbl.get("category", "Uncategorized"),
            "confidence": lbl.get("confidence", 0.5),
            "reasoning": lbl.get("reasoning", ""),
            "niche": lbl.get("niche", False),
        })

    _save_json({"run_key": run_key, "topics": labeled_topics}, f"{run_key}_labels.json")
    _save_json({"run_key": run_key, "topics": labeled_topics}, "labels.json")

    # Count only topics the model really labelled, not placeholder fallbacks.
    labeled_count = sum(1 for t in all_topics if t["topic_id"] in label_map)
    return (
        f"β Topics labeled by LLM!\n\n"
        f"π·οΈ Topics labeled: {labeled_count}\n"
        f"π Review the table below β check labels, approve or rename topics.\n\n"
        f"**Phase 2 complete. Review the table, edit Approve/Rename columns, then click Submit Review.**"
    )
# ── Tool 4: Consolidate Themes ─────────────────────────────────────────────────
def consolidate_into_themes(run_key: str, theme_map: str) -> str:
    """
    Consolidate approved topics into researcher-defined themes.
    Recomputes centroids for merged theme groups.
    Saves themes.json checkpoint.

    Sentence membership is rebuilt by re-clustering the saved embeddings with
    the threshold recorded in the summaries checkpoint (0.7 when none was
    recorded). Topics not assigned to any theme are collected in an
    "Uncategorized" theme.

    Args:
        run_key: Either 'abstract' or 'title'
        theme_map: JSON string mapping theme names to lists of topic IDs.
            Example: '{"AI in Healthcare": [0, 3, 7], "Blockchain": [1, 5]}'

    Returns:
        A human-readable status message, or a description of what is missing
        or invalid.
    """
    data = _load_json(f"{run_key}_labels.json") or _load_json("labels.json")
    if not data:
        return "No labeled topics found. Run label_topics_with_llm first."

    try:
        groupings = json.loads(theme_map) if isinstance(theme_map, str) else theme_map
    except json.JSONDecodeError as e:
        return f"Invalid theme_map JSON: {e}"
    if not isinstance(groupings, dict):
        return "Invalid theme_map JSON: expected an object mapping theme names to lists of topic IDs."

    sentences = _get_sentences(run_key)
    emb = _embed(sentences, run_key)

    # Rebuild topic -> sentence indices by re-clustering the saved embeddings
    # with the same threshold that discovery used.
    summaries = _load_json(f"{run_key}_summaries.json") or _load_json("summaries.json")
    all_topic_indices: dict = {}
    if summaries:
        threshold = float(summaries.get("threshold", 0.7))
        if len(sentences) < 2:
            # AgglomerativeClustering needs at least two samples.
            labels_arr = np.zeros(len(sentences), dtype=int)
        else:
            clustering = AgglomerativeClustering(
                metric="cosine", linkage="average",
                distance_threshold=threshold, n_clusters=None
            )
            labels_arr = clustering.fit_predict(emb)
        for idx, lbl in enumerate(labels_arr):
            all_topic_indices.setdefault(int(lbl), []).append(idx)

    # NOTE: this is the size of the whole dataset, not the number of papers
    # contributing to a theme; no sentence-to-paper mapping is available here.
    n_papers = len(_load_df())

    themes = []
    used_ids = set()
    for theme_name, raw_ids in groupings.items():
        if not isinstance(raw_ids, (list, tuple)):
            raw_ids = [raw_ids]
        # Topic ids may arrive as strings ("3"); normalise and drop anything
        # that is not an integer.
        topic_ids = []
        for raw in raw_ids:
            try:
                topic_ids.append(int(raw))
            except (TypeError, ValueError):
                continue

        merged_sentence_indices = []
        for tid in topic_ids:
            merged_sentence_indices.extend(all_topic_indices.get(tid, []))
            used_ids.add(tid)
        if not merged_sentence_indices:
            continue

        theme_embs = emb[merged_sentence_indices]
        centroid = theme_embs.mean(axis=0, keepdims=True)
        sims = cosine_similarity(centroid, theme_embs)[0]
        top_k = np.argsort(sims)[::-1][:NEAREST_K]
        top_sents = [sentences[merged_sentence_indices[i]] for i in top_k]
        themes.append({
            "theme_name": theme_name,
            "topic_ids": topic_ids,
            "sentence_count": len(merged_sentence_indices),
            "paper_count": n_papers,
            "top_sentences": top_sents,
            "approve": "",
            "rename_to": "",
            "user_reasoning": "",
            "pajais_match": None,
            "is_novel": None,
        })

    # Handle uncategorized topics
    uncategorized = [tid for tid in all_topic_indices if tid not in used_ids]
    if uncategorized:
        merged = []
        for tid in uncategorized:
            merged.extend(all_topic_indices[tid])
        if merged:
            themes.append({
                "theme_name": "Uncategorized",
                "topic_ids": uncategorized,
                "sentence_count": len(merged),
                "paper_count": 0,
                "top_sentences": [sentences[i] for i in merged[:3]],
                "approve": "",
                "rename_to": "",
                "user_reasoning": "",
                "pajais_match": None,
                "is_novel": None,
            })

    _save_json({"run_key": run_key, "themes": themes}, f"{run_key}_themes.json")
    _save_json({"run_key": run_key, "themes": themes}, "themes.json")

    return (
        f"β Themes consolidated!\n\n"
        f"ποΈ Themes created: {len(themes)}\n"
        f"π Total sentences covered: {sum(t['sentence_count'] for t in themes):,}\n\n"
        f"**Phase 3 complete. Review consolidated themes in the table. Click Submit Review.**"
    )
# ── Tool 5: Compare with PAJAIS Taxonomy ──────────────────────────────────────
def compare_with_taxonomy(run_key: str) -> str:
    """
    Map final themes to PAJAIS 25-category taxonomy using Mistral LLM.
    Identifies NOVEL themes not covered by existing taxonomy.
    Saves taxonomy_map.json checkpoint.

    Themes the model did not return a mapping for are recorded as NOVEL with
    confidence 0.0. When the model omits ``is_novel`` it is derived from
    whether ``pajais_match`` equals "NOVEL".

    Args:
        run_key: Either 'abstract' or 'title'

    Returns:
        A human-readable status message with MAPPED / NOVEL counts.
    """
    data = _load_json(f"{run_key}_themes.json") or _load_json("themes.json")
    if not data or not data.get("themes"):
        return "No themes found. Run consolidate_into_themes first."

    themes = data["themes"]
    llm = _get_llm()

    # Each theme is described by its name and up to two evidence sentences,
    # capped at 250 characters.
    theme_descriptions = []
    for t in themes:
        sents = " | ".join(t["top_sentences"][:2])
        theme_descriptions.append(
            f"Theme: {t['theme_name']}\nEvidence: {sents[:250]}"
        )

    prompt_template = PromptTemplate.from_template(
        """You are an expert in Information Systems research taxonomy.
Map each research theme to the PAJAIS (Pacific Asia Journal of the Association for Information Systems) taxonomy categories, or flag as NOVEL if no match exists.
PAJAIS Categories:
{categories}
Themes to map:
{themes}
For each theme, respond with a JSON array. Each element must have:
- theme_name: string (exact match from input)
- pajais_match: string (exact PAJAIS category name, or "NOVEL")
- match_confidence: float 0.0-1.0
- reasoning: one sentence justification
- is_novel: boolean (true if NOVEL)
- evidence_summary: brief description of what the theme covers
Respond ONLY with valid JSON array. No markdown."""
    )
    parser = JsonOutputParser()
    chain = prompt_template | llm | parser
    result = chain.invoke({
        "categories": "\n".join(f"{i+1}. {c}" for i, c in enumerate(PAJAIS_CATEGORIES)),
        "themes": "\n\n".join(theme_descriptions),
    })

    # Normalise the model output: tolerate an object wrapping the array and
    # entries without a usable theme_name. Names are compared ignoring case
    # and surrounding whitespace because models do not always echo them exactly.
    def _norm(name) -> str:
        return str(name).strip().casefold()

    if isinstance(result, dict):
        result = next((v for v in result.values() if isinstance(v, list)), [])
    result_map = {
        _norm(item["theme_name"]): item
        for item in result or []
        if isinstance(item, dict) and item.get("theme_name") is not None
    }

    # Merge results
    taxonomy_themes = []
    for t in themes:
        mapping = result_map.get(_norm(t["theme_name"]), {})
        pajais_match = mapping.get("pajais_match", "NOVEL")
        reasoning = mapping.get("reasoning", "")
        taxonomy_themes.append({
            **t,
            "pajais_match": pajais_match,
            "match_confidence": mapping.get("match_confidence", 0.0),
            "reasoning": reasoning,
            # Stay consistent with pajais_match when the flag itself is missing.
            "is_novel": mapping.get("is_novel", pajais_match == "NOVEL"),
            "evidence_summary": mapping.get("evidence_summary", ""),
            # The first evidence line shows the mapping so the review table displays it.
            "top_sentences": [
                f"β {pajais_match} | {reasoning}"
            ] + t.get("top_sentences", [])[:2],
        })

    novel_count = len([t for t in taxonomy_themes if t.get("is_novel")])
    mapped_count = len(taxonomy_themes) - novel_count

    _save_json({"run_key": run_key, "themes": taxonomy_themes}, f"{run_key}_taxonomy_map.json")
    _save_json({"run_key": run_key, "themes": taxonomy_themes}, "taxonomy_map.json")

    return (
        f"β PAJAIS taxonomy mapping complete!\n\n"
        f"β MAPPED themes: {mapped_count}\n"
        f"π NOVEL themes: {novel_count}\n\n"
        f"**Phase 5.5 complete. Review PAJAIS mapping in the table (Top Evidence column shows β PAJAIS match). "
        f"Click Submit Review.**"
    )
# ── Tool 6: Generate Comparison CSV ───────────────────────────────────────────
def generate_comparison_csv() -> str:
    """
    Compare abstract themes vs title themes side-by-side.
    Creates a convergence/divergence analysis CSV.
    Saves comparison.csv checkpoint.

    Theme rows come from the ``*_themes.json`` checkpoints. PAJAIS match and
    novelty are overlaid from the matching ``*_taxonomy_map.json`` when it
    exists, because the theme checkpoints store those fields as null. The
    generic ``themes.json`` is used for the abstract side only when it does
    not declare a different run key.

    Returns:
        A human-readable status message with convergence counts, or a hint
        about which analysis still has to be completed.
    """
    def _themes_for(run: str, generic_ok: bool):
        """Return {theme_name: theme} for *run*, or None when no checkpoint exists."""
        base = _load_json(f"{run}_themes.json")
        if not base and generic_ok:
            generic = _load_json("themes.json")
            # themes.json holds whichever run finished last; do not mistake
            # title themes for abstract themes.
            if generic and str(generic.get("run_key", run)).lower() == run:
                base = generic
        if not base:
            return None
        by_name = {t["theme_name"]: dict(t) for t in base.get("themes", [])}
        mapped = _load_json(f"{run}_taxonomy_map.json") or {}
        for entry in mapped.get("themes", []):
            target = by_name.get(entry.get("theme_name"))
            if target is None:
                continue
            for field in ("pajais_match", "is_novel"):
                if entry.get(field) is not None:
                    target[field] = entry[field]
        return by_name

    def _first_evidence(theme: dict) -> str:
        """First evidence sentence truncated to 200 characters, or ''."""
        evidence = theme.get("top_sentences") or [""]
        return str(evidence[0])[:200]

    abstract_themes = _themes_for("abstract", generic_ok=True)
    title_themes = _themes_for("title", generic_ok=False)
    if abstract_themes is None:
        return "Abstract themes not found. Complete abstract analysis first."
    if title_themes is None:
        return "Title themes not found. Complete title analysis first (run title analysis)."

    rows = []
    for theme in sorted(set(abstract_themes) | set(title_themes)):
        a = abstract_themes.get(theme, {})
        t = title_themes.get(theme, {})
        if theme in abstract_themes and theme in title_themes:
            convergence = "CONVERGE"
        elif theme in abstract_themes:
            convergence = "ABSTRACT ONLY"
        else:
            convergence = "TITLE ONLY"
        a_match = a.get("pajais_match")
        t_match = t.get("pajais_match")
        rows.append({
            "Theme": theme,
            "Abstract_Sentences": a.get("sentence_count", 0),
            "Title_Sentences": t.get("sentence_count", 0),
            "Abstract_PAJAIS": "N/A" if a_match is None else a_match,
            "Title_PAJAIS": "N/A" if t_match is None else t_match,
            "Abstract_Novel": bool(a.get("is_novel", False)),
            "Title_Novel": bool(t.get("is_novel", False)),
            "Convergence": convergence,
            "Top_Abstract_Evidence": _first_evidence(a) if a else "",
            "Top_Title_Evidence": _first_evidence(t) if t else "",
        })

    df = pd.DataFrame(rows)
    save_path = os.path.join(CHECKPOINT_DIR, "comparison.csv")
    df.to_csv(save_path, index=False)

    converge = len([r for r in rows if r["Convergence"] == "CONVERGE"])
    abstract_only = len([r for r in rows if r["Convergence"] == "ABSTRACT ONLY"])
    title_only = len([r for r in rows if r["Convergence"] == "TITLE ONLY"])

    return (
        f"β Comparison CSV generated!\n\n"
        f"π Converging themes: {converge}\n"
        f"π Abstract-only themes: {abstract_only}\n"
        f"π·οΈ Title-only themes: {title_only}\n\n"
        f"**Check the Download tab for comparison.csv. Click Submit Review to confirm.**"
    )
# ── Tool 7: Export Narrative ───────────────────────────────────────────────────
def export_narrative(run_key: str) -> str:
    """
    Generate a 500-word Section 7 literature review narrative using Mistral LLM.
    References B&C methodology, key themes, PAJAIS mapping, and limitations.
    Saves narrative.txt checkpoint.

    The narrative is also written to ``<run_key>_narrative.txt`` so abstract
    and title runs do not overwrite each other. The paper count given to the
    model comes from ``summaries.json`` when it still holds the Phase-1
    summary, otherwise from the loaded dataframe.

    Args:
        run_key: Either 'abstract' or 'title'

    Returns:
        A human-readable status message followed by the first 500 characters
        of the narrative.
    """
    taxonomy_data = _load_json(f"{run_key}_taxonomy_map.json") or _load_json("taxonomy_map.json")
    if not taxonomy_data:
        return "No taxonomy mapping found. Run compare_with_taxonomy first."

    themes = taxonomy_data.get("themes", [])
    llm = _get_llm()

    theme_summary = []
    for t in themes:
        novel_flag = " [NOVEL]" if t.get("is_novel") else f" [β {t.get('pajais_match', '')}]"
        theme_summary.append(f"β’ {t['theme_name']}{novel_flag}: {t.get('evidence_summary', t.get('reasoning', ''))}")

    # summaries.json is overwritten by run_bertopic_discovery, so by this
    # phase it usually no longer contains n_papers; fall back to the data.
    summaries_data = _load_json("summaries.json") or {}
    n_papers = summaries_data.get("n_papers")
    if n_papers is None:
        try:
            n_papers = len(_load_df())
        except FileNotFoundError:
            n_papers = "N/A"

    prompt_template = PromptTemplate.from_template(
        """You are an academic writer drafting a Section 7 (Thematic Analysis Results) for a peer-reviewed Information Systems journal paper.
Context:
- Dataset: {n_papers} papers from Scopus
- Method: BERTopic with AgglomerativeClustering (cosine metric, 384d embeddings, no UMAP), Braun & Clarke (2006) 6-phase framework
- Analysis type: {run_key} analysis
Themes discovered:
{themes}
Write a 500-word Section 7 that:
1. Opens with methodology overview (BERTopic, B&C phases, embedding approach)
2. Presents each major theme with evidence and paper count references
3. Discusses PAJAIS taxonomy alignment (MAPPED vs NOVEL themes)
4. Highlights the most significant NOVEL themes and their publication potential
5. Acknowledges limitations (single journal, time period, computational constraints)
6. Closes with implications for future research
Write in formal academic style. Use hedged language where appropriate. Do not use bullet points β write in flowing paragraphs."""
    )
    chain = prompt_template | llm
    response = chain.invoke({
        "n_papers": n_papers,
        "run_key": run_key,
        "themes": "\n".join(theme_summary),
    })
    narrative_text = response.content if hasattr(response, "content") else str(response)
    if not isinstance(narrative_text, str):
        # Defensive: content is not always a plain string.
        narrative_text = str(narrative_text)

    # Write the generic file plus a per-run copy.
    for filename in ("narrative.txt", f"{run_key}_narrative.txt"):
        save_path = os.path.join(CHECKPOINT_DIR, filename)
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(narrative_text)

    word_count = len(narrative_text.split())
    return (
        f"β Section 7 narrative exported!\n\n"
        f"π Word count: {word_count}\n"
        f"πΎ Saved to: narrative.txt\n\n"
        f"**Phase 6 complete! All B&C phases finished. Check Download tab for all outputs.**\n\n"
        f"---\n\n{narrative_text[:500]}...\n\n*(Full narrative in narrative.txt)*"
    )
# ── All tools list ─────────────────────────────────────────────────────────────
# Registry of the seven tool functions, in the order they are defined above.
ALL_TOOLS = [
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    consolidate_into_themes,
    compare_with_taxonomy,
    generate_comparison_csv,
    export_narrative,
]