Spaces:
Sleeping
Sleeping
Daksh C Jain
Initial commit: EIS Topic Intelligence β UMAP+HDBSCAN+Mistral council, dark EIS theme, 23 clusters from Enterprise Information Systems corpus
c91d9b4 | """ | |
| tools.py — 7 @tool functions for BERTopic Agentic AI | |
| Assignment: Text Analysis & Topic Modelling (Prof. Shailaja Jha) | |
| Generated via: Anthropic Claude Sonnet 4.5 | |
| Architecture: LangChain @tool + LangGraph | Model: Mistral Small Latest | |
| Rules: ZERO if/elif/else | ZERO for/while | ZERO try/except | handle_tool_error=True | |
| """ | |
| import os | |
| import re | |
| import json | |
| import numpy as np | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from sklearn.cluster import AgglomerativeClustering | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from sklearn.decomposition import PCA | |
| from langchain_core.tools import tool | |
| from langchain_mistralai import ChatMistralAI | |
| from langchain_core.prompts import PromptTemplate | |
| from langchain_core.output_parsers import JsonOutputParser, StrOutputParser | |
| # ─── CONSTANTS ─── | |
# Every artefact the pipeline writes (JSON, .npy, charts, CSV, narrative) goes here.
OUTPUT_DIR = "./outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

MAX_LABEL_TOPICS = 100  # upper bound on clusters sent to the LLM for labelling
NEAREST_K = 5           # representative sentences kept per cluster
BATCH_SIZE = 20         # clusters per LLM labelling request

# Publisher / front-matter boilerplate that is replaced by a space before
# sentence splitting. The first alternative must be the real copyright sign:
# the mis-encoded two-character form never matches a "© 2020 ..." notice.
BOILERPLATE_RE = re.compile(
    r"©\s*\d{4}[^.]*?\.|All\s+rights\s+reserved\.?|"
    r"Published\s+by\s+[A-Z][^.]*?\.|This\s+is\s+an\s+open\s+access[^.]*?\.|"
    r"Correspondence\s+(to|author):[^.]*?\.|E-?mail:[^.]*?\.|"
    r"Received:[^.]*?Accepted:[^.]*?\.|DOI:\S+|doi:\S+|https?://\S+|"
    r"Keywords:[^.]*?\.|JEL[^.]*?\.|ISSN[^.]*?\.|ISBN[^.]*?\.|"
    r"Elsevier[^.]*?\.|Springer[^.]*?\.|Emerald[^.]*?\.|"
    r"Wiley[^.]*?\.|Taylor\s*&\s*Francis[^.]*?\.|"
    r"This\s+paper\s+is\s+part\s+of[^.]*?\.|"
    r"Conflict\s+of\s+interest[^.]*?\.|"
    r"Funding[^.]*?:\s*[^.]*?\.|"
    r"Acknowledgement[s]?:[^.]*?\.",
    re.IGNORECASE | re.DOTALL,
)

# Sentence boundary: whitespace preceded by . ! or ? and followed by an
# upper-case letter, a double quote, or an opening parenthesis.
SENT_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z\"(])")

# The 25 PAJAIS taxonomy categories that themes are mapped against.
PAJAIS_25 = [
    "IS Strategy and Management", "E-Commerce and E-Business",
    "IT Adoption and Diffusion", "Business Intelligence and Analytics",
    "Social Commerce and Social Media", "Mobile Commerce and Applications",
    "Knowledge Management", "Healthcare Information Systems",
    "Privacy, Security and Trust", "Enterprise Systems and ERP",
    "Digital Platforms and Ecosystems", "Blockchain and Distributed Ledgers",
    "Artificial Intelligence and Machine Learning",
    "Human-Computer Interaction and UX",
    "Digital Transformation and Innovation",
    "Financial Technology and Digital Finance",
    "Supply Chain and Logistics IS", "Smart Systems IoT and Smart Cities",
    "IS Research Methods and Theory",
    "Recommender and Personalization Systems",
    "Digital Marketing and Advertising",
    "Virtual Teams and Online Collaboration",
    "Cloud Computing and SaaS", "Big Data Analytics and Data Science",
    "IS Education and Training",
]
# Module-level cache for the sentence-embedding model (filled on first use).
_EMBED_MODEL = None


def _get_embed_model():
    """Return the shared ``all-MiniLM-L6-v2`` SentenceTransformer.

    The model is constructed the first time this is called and stored in the
    module-level ``_EMBED_MODEL`` so that later calls reuse the same instance.
    """
    global _EMBED_MODEL
    # Imported at call time rather than at module import.
    from sentence_transformers import SentenceTransformer

    cached = _EMBED_MODEL
    if not cached:
        cached = SentenceTransformer("all-MiniLM-L6-v2")
    _EMBED_MODEL = cached
    return cached
def _get_llm():
    """Build a ``ChatMistralAI`` client for ``mistral-small-latest``.

    The API key is read from the ``MISTRAL_API_KEY`` environment variable
    (empty string when unset) and the temperature is fixed at 0.1.
    """
    mistral_key = os.environ.get("MISTRAL_API_KEY", "")
    client = ChatMistralAI(
        model="mistral-small-latest",
        api_key=mistral_key,
        temperature=0.1,
    )
    return client
def _clean(text: str) -> str:
    """Replace every ``BOILERPLATE_RE`` match in *text* with a space and trim the ends.

    *text* is passed through ``str()`` first, so non-string values are accepted.
    """
    as_text = str(text)
    without_boilerplate = BOILERPLATE_RE.sub(" ", as_text)
    return without_boilerplate.strip()
def _split(text: str) -> list:
    """Clean *text* and split it into sentences longer than 30 characters.

    The text is cleaned with ``_clean``, split on ``SENT_RE`` boundaries, and
    each piece is stripped; pieces of 30 characters or fewer are discarded.
    """
    kept = []
    for fragment in SENT_RE.split(_clean(text)):
        sentence = fragment.strip()
        if len(sentence) > 30:
            kept.append(sentence)
    return kept
def _save(data, name: str) -> str:
    """Write *data* as indented UTF-8 JSON to ``OUTPUT_DIR/name`` and return that path.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    """
    target = os.path.join(OUTPUT_DIR, name)
    with open(target, "w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=2, ensure_ascii=False)
    return target
def _load(name: str):
    """Read and return the JSON document stored at ``OUTPUT_DIR/name``."""
    source = os.path.join(OUTPUT_DIR, name)
    with open(source, "r", encoding="utf-8") as handle:
        document = json.load(handle)
    return document
def _opath(name: str) -> str:
    """Return the path of *name* inside ``OUTPUT_DIR``."""
    output_path = os.path.join(OUTPUT_DIR, name)
    return output_path
def _generate_charts(run_key: str, data: list, name_key: str = "cluster_id"):
    """Write the four Plotly HTML charts for a run from the latest data.

    Charts (all written into ``OUTPUT_DIR/{run_key}_charts``):
    ``intertopic_map.html`` (2-D PCA of centroids), ``bar_chart.html``
    (first 30 items), ``treemap.html`` and ``heatmap.html`` (first 20 items
    on a 4x5 grid).

    Args:
        run_key: Run identifier; used in chart titles and the directory name.
        data: List of dicts, each holding a ``"centroid"`` vector and a
            ``"sentence_count"``. Item order decides which items appear in the
            bar chart and the heatmap.
        name_key: Key that holds the display name. ``"cluster_id"`` renders as
            ``C<id>``; any other key falls back to ``C<id>`` when it is absent.

    Returns:
        None. Nothing is written when *data* is empty.
    """
    if not data:
        return

    def get_name(item):
        fallback = f"C{item.get('cluster_id', '?')}"
        if name_key == "cluster_id":
            return fallback
        return item.get(name_key, fallback)

    n_clusters = len(data)
    names = [get_name(item) for item in data]
    sizes = [item["sentence_count"] for item in data]
    centroids = np.array([item["centroid"] for item in data])

    # A 2-component PCA needs at least two samples; a lone item sits at the origin.
    if n_clusters < 2:
        coords = np.zeros((n_clusters, 2))
    else:
        coords = PCA(n_components=2).fit_transform(centroids)

    chart_dir = _opath(f"{run_key}_charts")
    os.makedirs(chart_dir, exist_ok=True)

    def write_chart(fig, filename):
        fig.write_html(os.path.join(chart_dir, filename),
                       include_plotlyjs="cdn", full_html=True)

    fig1 = px.scatter(
        x=coords[:, 0], y=coords[:, 1], size=sizes,
        title=f"Intertopic Distance Map β {run_key.title()}",
        labels={"x": "PC1", "y": "PC2"},
        hover_name=names,
        template="plotly_dark",
    )
    write_chart(fig1, "intertopic_map.html")

    top30 = data[:30]
    fig2 = px.bar(
        x=[get_name(item) for item in top30],
        y=[item["sentence_count"] for item in top30],
        title=f"Top 30 Cluster Sizes β {run_key.title()}",
        labels={"x": "Cluster", "y": "Sentences"},
        template="plotly_dark",
    )
    write_chart(fig2, "bar_chart.html")

    fig3 = px.treemap(
        names=names,
        parents=["clusters"] * n_clusters,
        values=sizes,
        title=f"Topic Treemap β {run_key.title()}",
    )
    write_chart(fig3, "treemap.html")

    # Heatmap: fixed 4x5 grid. Missing cells are padded with size 0 and the
    # literal label "Empty"; the label is added directly so it is never run
    # through get_name (which would turn it into "CEmpty" for cluster ids).
    grid_rows, grid_cols = 4, 5
    shown = data[:grid_rows * grid_cols]
    padding = grid_rows * grid_cols - len(shown)
    cell_values = [item.get("sentence_count", 0) for item in shown] + [0] * padding
    cell_labels = [get_name(item) for item in shown] + ["Empty"] * padding
    heatmap_data = np.array(cell_values).reshape(grid_rows, grid_cols)
    heatmap_text = [cell_labels[r * grid_cols:(r + 1) * grid_cols]
                    for r in range(grid_rows)]
    fig4 = go.Figure(go.Heatmap(
        z=heatmap_data, colorscale="Viridis", text=heatmap_text,
        texttemplate="%{text}", showscale=True,
    ))
    fig4.update_layout(title=f"Topic Size Heatmap β {run_key.title()}", template="plotly_dark")
    write_chart(fig4, "heatmap.html")
| # ─── TOOL 1: LOAD CSV ─── | |
def load_scopus_csv(filepath: str) -> str:
    """Load a Scopus CSV export file and return statistics.
    Phase 1 of Braun & Clarke (2006) - Familiarisation.
    Call this FIRST before any analysis. filepath must be the absolute path to the CSV.

    Args:
        filepath: Path of the CSV file; read as UTF-8 (BOM tolerated), malformed
            rows are skipped.

    Returns:
        A human-readable summary (journal, paper count, year range, columns
        found/missing, sentence counts). The same corpus facts are also saved
        to ``corpus_config.json`` for the later tools.
    """
    df = pd.read_csv(filepath, encoding="utf-8-sig", on_bad_lines="skip")
    required = ["Title", "Abstract", "Authors", "Year", "Cited by",
                "Author Keywords", "Source title"]
    found = [c for c in required if c in df.columns]
    missing = [c for c in required if c not in df.columns]

    def count_sentences(column):
        # A missing column is reported under "Missing" instead of raising KeyError.
        if column not in df.columns:
            return 0
        return sum(len(_split(text)) for text in df[column].fillna("").tolist())

    n_abstract = count_sentences("Abstract")
    n_title = count_sentences("Title")

    # Non-numeric or empty year cells are ignored; 0 is reported when no year is usable.
    if "Year" in df.columns:
        years = pd.to_numeric(df["Year"], errors="coerce").dropna()
    else:
        years = pd.Series(dtype="float64")
    year_min = int(years.min()) if not years.empty else 0
    year_max = int(years.max()) if not years.empty else 0

    # Most frequent source title; "Unknown" when the column is absent or empty.
    journal = "Unknown"
    if "Source title" in df.columns:
        title_counts = df["Source title"].value_counts()
        if not title_counts.empty:
            journal = str(title_counts.index[0])

    _save({"filepath": filepath, "journal": journal,
           "rows": len(df), "year_min": year_min, "year_max": year_max},
          "corpus_config.json")
    return (
        f"β CSV Loaded\nJournal: {journal}\nPapers: {len(df)}\n"
        f"Year Range: {year_min}β{year_max}\n"
        f"Columns Found ({len(found)}/{len(required)}): {found}\nMissing: {missing}\n"
        f"Abstract sentences: {n_abstract:,}\n"
        f"Title sentences: {n_title:,}\n"
        f"Type 'run abstract' to begin Phase 2."
    )
| # ─── TOOL 2: RUN BERTOPIC DISCOVERY ─── | |
def _split_oversized_clusters(emb, labels_arr, target_size, max_size):
    """Re-cluster every cluster larger than *max_size* until none is left.

    *labels_arr* is modified in place and also returned.
    """
    while True:
        u_labels, counts = np.unique(labels_arr, return_counts=True)
        too_big = u_labels[counts > max_size]
        if len(too_big) == 0:
            return labels_arr
        # One vectorised max per pass; ids handed out below stay unique.
        next_id = int(labels_arr.max()) + 1
        for cid in too_big:
            idx = np.where(labels_arr == cid)[0]
            split_k = int(np.ceil(len(idx) / target_size))
            sub_labels = AgglomerativeClustering(
                n_clusters=split_k, metric="euclidean", linkage="ward"
            ).fit_predict(emb[idx])
            # Sub-cluster 0 keeps the parent id; the others get fresh ids.
            for sub_id in range(1, split_k):
                labels_arr[idx[sub_labels == sub_id]] = next_id
                next_id += 1


def _merge_undersized_clusters(emb, labels_arr, min_size, max_size, min_clusters=5):
    """Fold clusters smaller than *min_size* into their most similar neighbour.

    Stops when no cluster is undersized or when only *min_clusters* remain.
    Neighbours that would grow beyond ``1.5 * max_size`` are avoided unless no
    other neighbour exists. *labels_arr* is modified in place and returned.
    """
    while True:
        u_labels, counts = np.unique(labels_arr, return_counts=True)
        too_small = u_labels[counts < min_size]
        if len(too_small) == 0 or len(u_labels) <= min_clusters:
            return labels_arr
        cid = too_small[0]
        members = labels_arr == cid
        own_size = int(members.sum())
        centroid = emb[members].mean(axis=0, keepdims=True)

        is_other = u_labels != cid
        other_ids = u_labels[is_other]
        other_counts = counts[is_other]
        other_centroids = np.vstack(
            [emb[labels_arr == oid].mean(axis=0) for oid in other_ids]
        )
        sims = cosine_similarity(centroid, other_centroids)[0]

        # Prefer neighbours that stay within the size cap; otherwise fall back
        # to the nearest neighbour regardless of size. argmax keeps the first
        # best candidate, and a valid id is always chosen (never -1).
        fits = other_counts + own_size <= max_size * 1.5
        pool = np.where(fits)[0] if fits.any() else np.arange(len(other_ids))
        best = pool[int(np.argmax(sims[pool]))]
        labels_arr[members] = other_ids[best]


def run_bertopic_discovery(run_key: str, target_size: int = 250) -> str:
    """Embed sentences with all-MiniLM-L6-v2 and apply Balanced Agglomerative Clustering.
    Dynamic K selection based on data size (target_size=250 sentences per topic).
    Includes automatic splitting of massive clusters and merging of tiny clusters
    to reduce size disparity across all discovered topics.
    Saves {run_key}_summaries.json + {run_key}_emb.npy. Phase 2 of Braun & Clarke.
    run_key must be 'abstract' or 'title'. target_size guides the dynamic cluster counts.

    Args:
        run_key: ``'abstract'`` selects the Abstract column; any other value
            selects the Title column.
        target_size: Desired sentences per topic. Clusters above
            ``2 * target_size`` are split, clusters below ``target_size // 2``
            are merged.

    Returns:
        A human-readable summary of the run.

    Raises:
        ValueError: If *target_size* is below 1 or fewer than two usable
            sentences are available to cluster.
    """
    if target_size < 1:
        raise ValueError("target_size must be a positive integer")

    cfg = _load("corpus_config.json")
    df = pd.read_csv(cfg["filepath"], encoding="utf-8-sig", on_bad_lines="skip")
    col = "Abstract" if run_key == "abstract" else "Title"
    pairs = [(s, i) for i, t in enumerate(df[col].fillna("").tolist())
             for s in _split(t)]
    sentences = [p[0] for p in pairs]
    paper_ids = [p[1] for p in pairs]

    total_sents = len(sentences)
    if total_sents < 2:
        # Clustering cannot run on fewer than two samples; fail with a clear message.
        raise ValueError(
            f"Need at least 2 usable sentences in column '{col}' to cluster; "
            f"found {total_sents}."
        )

    model = _get_embed_model()
    emb = model.encode(sentences, normalize_embeddings=True,
                       batch_size=64, show_progress_bar=True)
    np.save(_opath(f"{run_key}_emb.npy"), emb)
    _save({"sentences": sentences, "paper_ids": paper_ids},
          f"{run_key}_sentences.json")

    # Dynamic sizing: at least 5 topics, but never more clusters than sentences.
    dynamic_k = min(max(5, total_sents // target_size), total_sents)
    max_size = target_size * 2
    min_size = target_size // 2

    labels_arr = AgglomerativeClustering(
        n_clusters=dynamic_k, metric="euclidean", linkage="ward"
    ).fit_predict(emb)
    labels_arr = _split_oversized_clusters(emb, labels_arr, target_size, max_size)
    labels_arr = _merge_undersized_clusters(emb, labels_arr, min_size, max_size)

    unique_labels = np.unique(labels_arr)
    n_clusters = len(unique_labels)

    # Per-cluster sentence indices as plain Python ints (JSON serialisable).
    cluster_sentence_idx = {int(cid): list(map(int, np.where(labels_arr == cid)[0]))
                            for cid in unique_labels}

    def make_summary(cid):
        idx = cluster_sentence_idx[int(cid)]
        c_emb = emb[idx]
        centroid = c_emb.mean(axis=0, keepdims=True)
        sims = cosine_similarity(centroid, c_emb)[0]
        top_k = min(NEAREST_K, len(idx))
        # Sentences closest to the centroid, most similar first.
        top_local = [int(j) for j in np.argsort(sims)[-top_k:][::-1]]
        top_global = [idx[j] for j in top_local]
        return {
            "cluster_id": int(cid),
            "sentence_count": len(idx),
            "paper_count": len(set(paper_ids[i] for i in idx)),
            "top_sentences": [sentences[i] for i in top_global],
            "centroid": centroid[0].tolist(),
            "sentence_indices": idx,
        }

    summaries = sorted((make_summary(cid) for cid in unique_labels),
                       key=lambda x: x["sentence_count"], reverse=True)
    _save(summaries, f"{run_key}_summaries.json")

    # Four Plotly charts
    _generate_charts(run_key, summaries, name_key="cluster_id")
    chart_dir = _opath(f"{run_key}_charts")
    return (
        f"β BERTopic Discovery Complete ({run_key})\n"
        f"Total sentences: {len(sentences):,}\n"
        f"Topics generated: {n_clusters} (Dynamic via target_size={target_size})\n"
        f"Algorithm: Constrained Agglomerative (Split & Merge Balanced)\n"
        f"Largest cluster: {summaries[0]['sentence_count']} sentences\n"
        f"Smallest cluster: {summaries[-1]['sentence_count']} sentences\n"
        f"Charts saved to {chart_dir}\n"
        f"Now calling label_topics_with_llm..."
    )
| # ─── TOOL 3: LABEL TOPICS WITH LLM ─── | |
def label_topics_with_llm(run_key: str) -> str:
    """Send top 100 clusters to Mistral for labelling.
    Returns topic labels, categories, confidence scores, reasoning, is_niche.
    Saves {run_key}_labels.json. Phase 2 of Braun & Clarke.
    run_key must be 'abstract' or 'title'.

    Args:
        run_key: Run whose ``{run_key}_summaries.json`` is labelled.

    Returns:
        A human-readable summary; the labelled clusters are saved to
        ``{run_key}_labels.json`` and the charts are regenerated with labels.

    Raises:
        json.JSONDecodeError: If a model reply is not valid JSON after the
            optional markdown fence has been removed.
    """
    summaries = _load(f"{run_key}_summaries.json")[:MAX_LABEL_TOPICS]
    llm = _get_llm()
    label_prompt = PromptTemplate.from_template(
        "You are a bibliometric research expert.\n"
        "Label each cluster below with a concise research area name.\n"
        "Return ONLY a JSON array β one object per cluster:\n"
        ' {{"cluster_id": N, "label": "...", "category": "...", '
        '"confidence": 0.0-1.0, "reasoning": "...", "is_niche": true/false}}\n\n'
        "Clusters (ID | sentence_count | top 2 sentences):\n{clusters}\n\n"
        "Return valid JSON array only, no markdown fences."
    )
    chain = label_prompt | llm | StrOutputParser()

    def format_batch(batch):
        return "\n".join(
            f"{s['cluster_id']} | {s['sentence_count']} sents | "
            + " /// ".join(s["top_sentences"][:2])
            for s in batch
        )

    def label_batch(batch):
        raw = chain.invoke({"clusters": format_batch(batch)})
        # Remove an optional ```json ... ``` fence as a prefix/suffix.
        # (str.lstrip/rstrip take a *set of characters*, not a prefix.)
        cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw.strip(),
                         flags=re.IGNORECASE)
        parsed = json.loads(cleaned)
        # A single object instead of an array is treated as a one-item array.
        return parsed if isinstance(parsed, list) else [parsed]

    def cluster_key(value):
        # Match ids regardless of how the model typed them: 3, "3" or 3.0.
        text = str(value).strip()
        return text[:-2] if text.endswith(".0") else text

    batches = [summaries[i:i + BATCH_SIZE]
               for i in range(0, len(summaries), BATCH_SIZE)]
    label_map = {
        cluster_key(item["cluster_id"]): item
        for batch in batches
        for item in label_batch(batch)
        if isinstance(item, dict) and "cluster_id" in item
    }

    def merge(summary):
        cid = summary["cluster_id"]
        defaults = {"label": f"Topic {cid}", "category": "Unknown",
                    "confidence": 0.5, "reasoning": "", "is_niche": False}
        llm_fields = {**defaults, **label_map.get(cluster_key(cid), {})}
        # Fields computed during discovery (cluster_id, counts, centroid, ...)
        # are never overwritten by the model's reply.
        extra = {k: v for k, v in llm_fields.items() if k not in summary}
        return {**summary, **extra}

    labeled = [merge(s) for s in summaries]
    _save(labeled, f"{run_key}_labels.json")
    _generate_charts(run_key, labeled, name_key="label")
    return (
        f"β Labels Generated ({run_key})\n"
        f"Topics labeled: {len(labeled)}\n"
        f"Review table populated. Edit Approve/Rename columns, "
        f"then click Submit Review."
    )
| # ─── TOOL 4: CONSOLIDATE INTO THEMES ─── | |
def consolidate_into_themes(run_key: str, theme_map: str) -> str:
    """Merge researcher-approved topic groups into consolidated themes.
    theme_map: JSON string - array from review table with cluster_id, approve, rename_to fields.
    Recomputes centroids and paper counts from actual embeddings.
    Saves {run_key}_themes.json. Phase 3 of Braun & Clarke.

    Args:
        run_key: Run whose embeddings, sentences and summaries are used.
        theme_map: JSON array of decisions. Rows whose ``approve`` is "YES"
            (case-insensitive) are kept; rows sharing the same ``rename_to``
            (or ``label`` when ``rename_to`` is blank) are merged.

    Returns:
        A human-readable summary listing the final theme names.
    """
    decisions = json.loads(theme_map)
    emb = np.load(_opath(f"{run_key}_emb.npy"))
    paper_ids = _load(f"{run_key}_sentences.json")["paper_ids"]
    summaries = _load(f"{run_key}_summaries.json")
    sum_map = {s["cluster_id"]: s for s in summaries}

    approved = [d for d in decisions
                if str(d.get("approve", "")).strip().upper() == "YES"]

    def theme_name(decision):
        # First non-blank of rename_to / label; whitespace-only counts as blank.
        for key in ("rename_to", "label"):
            text = str(decision.get(key) or "").strip()
            if text:
                return text
        return f"Topic {decision['cluster_id']}"

    # Group cluster ids by theme name (insertion order = review-table order).
    theme_groups: dict = {}
    for decision in approved:
        theme_groups.setdefault(theme_name(decision), []).append(
            int(decision["cluster_id"])
        )

    def build_theme(name, cids):
        known = [sum_map[cid] for cid in cids if cid in sum_map]
        sent_idx = sorted({i for s in known for i in s.get("sentence_indices", [])})
        if sent_idx:
            centroid = emb[sent_idx].mean(axis=0)
            paper_count = len({paper_ids[i] for i in sent_idx})
        elif known:
            # Summaries without stored sentence indices: combine the stored
            # cluster centroids, weighted by cluster size. Paper counts are
            # summed, which is an upper bound when clusters share papers.
            weights = np.array([max(s["sentence_count"], 1) for s in known],
                               dtype=float)
            vectors = np.array([s["centroid"] for s in known], dtype=float)
            centroid = (vectors * weights[:, None]).sum(axis=0) / weights.sum()
            paper_count = sum(s.get("paper_count", 0) for s in known)
        else:
            # None of the cluster ids exist in the summaries: no evidence, so
            # use a neutral centroid instead of borrowing an unrelated sentence.
            centroid = np.zeros(emb.shape[1])
            paper_count = 0
        return {
            "theme_name": name,
            "merged_cluster_ids": cids,
            "sentence_count": sum(s["sentence_count"] for s in known),
            "paper_count": paper_count,
            "top_sentences": known[0]["top_sentences"][:3] if known else [],
            "centroid": centroid.tolist(),
        }

    themes = [build_theme(name, cids) for name, cids in theme_groups.items()]
    themes.sort(key=lambda x: x["sentence_count"], reverse=True)
    _save(themes, f"{run_key}_themes.json")
    _generate_charts(run_key, themes, name_key="theme_name")
    return (
        f"β Themes Consolidated ({run_key})\n"
        f"Approved topics: {len(approved)}\n"
        f"Final themes: {len(themes)}\n"
        f"Theme names: {[t['theme_name'] for t in themes]}\n"
        f"Review consolidated themes. Click Submit Review to confirm."
    )
| # ─── TOOL 5: COMPARE WITH TAXONOMY ─── | |
def compare_with_taxonomy(run_key: str) -> str:
    """Map final themes to the PAJAIS taxonomy (Jiang et al. 2019) - 25 categories.
    Classifies each theme as MAPPED or NOVEL.
    Saves taxonomy_map.json. Phase 5.5 of Braun & Clarke.
    run_key must be 'abstract' or 'title'.

    Args:
        run_key: Run whose ``{run_key}_themes.json`` is used, falling back to
            ``{run_key}_labels.json`` when no themes file exists.

    Returns:
        A human-readable summary of mapped themes, novel themes and the first
        five PAJAIS categories that no theme was mapped to.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON after the
            optional markdown fence has been removed.
    """
    themes_file = (f"{run_key}_themes.json"
                   if os.path.exists(_opath(f"{run_key}_themes.json"))
                   else f"{run_key}_labels.json")
    themes_raw = _load(themes_file)
    theme_names = [t.get("theme_name", t.get("label", "")) for t in themes_raw]
    llm = _get_llm()
    tax_prompt = PromptTemplate.from_template(
        "You are a bibliometric taxonomy expert.\n"
        "Map each theme to the PAJAIS taxonomy (Jiang et al., 2019).\n\n"
        "PAJAIS 25 categories:\n{pajais}\n\n"
        "Themes to classify:\n{themes}\n\n"
        "Return ONLY a JSON array:\n"
        '[{{"theme": "...", "pajais_match": "category or NOVEL", '
        '"match_confidence": 0.0-1.0, "reasoning": "...", "is_novel": true/false}}]\n'
        "If no PAJAIS category fits well, set pajais_match to NOVEL and is_novel to true.\n"
        "No markdown fences, return raw JSON only."
    )
    pajais_str = "\n".join(f"{i+1}. {c}" for i, c in enumerate(PAJAIS_25))
    themes_str = "\n".join(f"- {n}" for n in theme_names)
    raw = (tax_prompt | llm | StrOutputParser()).invoke(
        {"pajais": pajais_str, "themes": themes_str}
    )
    # Remove an optional ```json ... ``` fence as a prefix/suffix.
    # (str.lstrip/rstrip take a *set of characters*, not a prefix.)
    cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw.strip(),
                     flags=re.IGNORECASE)
    parsed = json.loads(cleaned)
    parsed = parsed if isinstance(parsed, list) else [parsed]
    results = [r for r in parsed if isinstance(r, dict)]

    def normalise(category):
        # The prompt numbers the categories ("1. IS Strategy ..."), so a reply
        # may echo the number; strip it and compare case-insensitively.
        return re.sub(r"^\s*\d+[.)]\s*", "", str(category)).strip().casefold()

    def is_novel(result):
        # Either signal marks the theme as novel, so an inconsistent reply
        # (pajais_match "NOVEL" with is_novel false) is not counted as mapped.
        return bool(result.get("is_novel", False)) or (
            normalise(result.get("pajais_match", "")) == "novel"
        )

    mapped = [r for r in results if not is_novel(r)]
    novel = [r for r in results if is_novel(r)]
    covered = {normalise(r.get("pajais_match", "")) for r in mapped}
    gaps = [c for c in PAJAIS_25 if c.casefold() not in covered]
    novel_names = [r.get("theme", "") for r in novel]
    taxonomy_map = {
        "run_key": run_key,
        "taxonomy_mapping": {r.get("theme", ""): r for r in results},
        "novel_themes": novel_names,
        "pajais_gap_categories": gaps,
        "coverage_stats": {
            "total_themes": len(results),
            "mapped": len(mapped),
            "novel": len(novel),
        },
    }
    _save(taxonomy_map, "taxonomy_map.json")
    return (
        f"β PAJAIS Taxonomy Mapped ({run_key})\n"
        f"Themes mapped: {len(mapped)}\n"
        f"NOVEL themes: {len(novel)} β {novel_names}\n"
        f"PAJAIS gaps (top 5): {gaps[:5]}\n"
        f"taxonomy_map.json saved. Review PAJAIS mapping in table. Click Submit Review."
    )
| # ─── TOOL 6: GENERATE COMPARISON CSV ─── | |
def generate_comparison_csv() -> str:
    """Load themes from both abstract and title runs and create a side-by-side comparison.
    Identifies STABLE (convergent), ABSTRACT-ONLY, and TITLE-ONLY themes.
    Saves comparison.csv. Phase 6 of Braun & Clarke.

    Rows pair the i-th abstract theme with the i-th title theme purely by
    position. ``Convergence`` classifies the abstract theme of the row
    (padding rows read TITLE-ONLY); ``Title_Convergence`` classifies the title
    theme of the row, so title themes that share a row with an abstract theme
    are classified too. Names are compared ignoring case and outer whitespace.

    Returns:
        A human-readable summary including the path of ``comparison.csv``.
    """
    def load_themes(key):
        fname = (f"{key}_themes.json"
                 if os.path.exists(_opath(f"{key}_themes.json"))
                 else f"{key}_labels.json")
        return _load(fname)

    def name_of(theme):
        return theme.get("theme_name", theme.get("label", ""))

    def evidence_of(theme):
        # First representative sentence only.
        return " | ".join(theme.get("top_sentences", [""])[:1])

    def match_key(name):
        return str(name).strip().casefold()

    abs_themes = load_themes("abstract")
    ttl_themes = load_themes("title")
    abs_names = [name_of(t) for t in abs_themes]
    ttl_names = [name_of(t) for t in ttl_themes]
    abs_keys = {match_key(n) for n in abs_names}
    ttl_keys = {match_key(n) for n in ttl_names}

    max_len = max(len(abs_themes), len(ttl_themes))

    def pad(values, filler):
        return values + [filler] * (max_len - len(values))

    abs_status = ["STABLE" if match_key(n) in ttl_keys else "ABSTRACT-ONLY"
                  for n in abs_names]
    ttl_status = ["STABLE" if match_key(n) in abs_keys else "TITLE-ONLY"
                  for n in ttl_names]

    df = pd.DataFrame({
        "Abstract_Theme": pad(abs_names, ""),
        "Abstract_Evidence": pad([evidence_of(t) for t in abs_themes], ""),
        "Abstract_Sentences": pad([t.get("sentence_count", 0) for t in abs_themes], 0),
        "Title_Theme": pad(ttl_names, ""),
        "Title_Evidence": pad([evidence_of(t) for t in ttl_themes], ""),
        "Title_Sentences": pad([t.get("sentence_count", 0) for t in ttl_themes], 0),
        "Convergence": pad(abs_status, "TITLE-ONLY"),
        "Title_Convergence": pad(ttl_status, ""),
    })
    path = _opath("comparison.csv")
    df.to_csv(path, index=False)
    return (
        f"β Comparison CSV Generated\n"
        f"Abstract themes: {len(abs_themes)}\n"
        f"Title themes: {len(ttl_themes)}\n"
        f"Rows: {len(df)}\nFile: {path}\n"
        f"Check Download tab for comparison.csv. Click Submit Review to generate narrative."
    )
| # ─── TOOL 7: EXPORT NARRATIVE ─── | |
def export_narrative(run_key: str) -> str:
    """Generate a 500-word Section 7 narrative via Mistral LLM.
    Uses themes + PAJAIS taxonomy mapping as context.
    Saves narrative.txt. Phase 6 of Braun & Clarke.
    run_key must be 'abstract' or 'title'.

    Args:
        run_key: Run whose ``{run_key}_themes.json`` (or, when absent,
            ``{run_key}_labels.json``) supplies the theme names. The run name
            is also passed to the prompt so the themes are attributed to the
            run they actually came from.

    Returns:
        A human-readable summary with the word count and the output path.
    """
    cfg = _load("corpus_config.json")
    theme_file = (f"{run_key}_themes.json"
                  if os.path.exists(_opath(f"{run_key}_themes.json"))
                  else f"{run_key}_labels.json")
    themes = _load(theme_file)
    tax = _load("taxonomy_map.json")
    theme_names = [t.get("theme_name", t.get("label", "")) for t in themes]
    novel_themes = tax.get("novel_themes", [])
    gaps = tax.get("pajais_gap_categories", [])
    mapped = tax.get("coverage_stats", {}).get("mapped", 0)
    llm = _get_llm()
    narr_prompt = PromptTemplate.from_template(
        "Write a 500-word Section 7 for a conference paper on topic modelling.\n"
        "Journal: {journal} | Papers: {papers} | Years: {y_min}β{y_max}\n"
        # The run name is a template variable: it used to be hard-coded as
        # "abstract", which mislabelled the themes of a title run.
        "Stable BERTopic themes ({run_label} run): {themes}\n"
        "NOVEL themes (not in PAJAIS 2019): {novel}\n"
        "PAJAIS gap categories: {gaps}\n"
        "Themes mapped to PAJAIS: {mapped}\n\n"
        "Structure: 7.1 Methodology (LDA + BERTopic, Braun & Clarke 2006), "
        "7.2 RQ4 LDA Findings, 7.3 RQ5 Abstract vs Title Comparison, "
        "7.4 RQ6 PAJAIS Taxonomy Mapping with NOVEL theme justification, "
        "7.5 RQ7 Future Research Agenda.\n"
        "Cite: Braun & Clarke (2006), Jiang et al. (2019), Grootendorst (2022).\n"
        "~500 words, academic tone, no bullet points, paragraph form."
    )
    narrative = (narr_prompt | llm | StrOutputParser()).invoke({
        "journal": cfg.get("journal", "Electronic Markets"),
        "papers": cfg.get("rows", 908),
        "y_min": cfg.get("year_min", 2007),
        "y_max": cfg.get("year_max", 2026),
        "run_label": run_key,
        "themes": ", ".join(theme_names[:10]),
        "novel": ", ".join(novel_themes[:5]),
        "gaps": ", ".join(gaps[:5]),
        "mapped": mapped,
    })
    path = _opath("narrative.txt")
    with open(path, "w", encoding="utf-8") as f:
        f.write(narrative)
    return (
        f"β Narrative Exported\n"
        f"Words: {len(narrative.split())}\n"
        f"File: {path}\n"
        f"π Pipeline complete! Download narrative.txt from the Download tab.\n"
        f"Deliverables: comparison.csv | taxonomy_map.json | narrative.txt"
    )
# --- SET handle_tool_error ON ALL TOOLS (BaseTool property) ---
# langchain-core 0.3.x: handle_tool_error is a BaseTool property,
# not a @tool() decorator argument, so it is assigned after definition.
# NOTE(review): no @tool decorator is visible on the seven functions in this
# view, so as shown this only sets an attribute on plain functions - confirm
# the decorators exist in the real file.
_ALL_TOOLS = [
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    consolidate_into_themes,
    compare_with_taxonomy,
    generate_comparison_csv,
    export_narrative,
]
for _registered_tool in _ALL_TOOLS:
    setattr(_registered_tool, "handle_tool_error", True)
del _registered_tool  # keep the loop variable out of the module namespace