import json import os import re from collections import Counter, defaultdict from datetime import datetime from typing import Any, Dict, List, Tuple import numpy as np import pandas as pd import plotly.express as px from sklearn.cluster import AgglomerativeClustering, KMeans from sklearn.decomposition import PCA, TruncatedSVD from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.metrics import silhouette_score from sklearn.metrics.pairwise import cosine_similarity from sklearn.preprocessing import normalize OUTPUT_DIR = "./outputs" os.makedirs(OUTPUT_DIR, exist_ok=True) TARGET_MIN_CLUSTERS = 15 TARGET_MAX_CLUSTERS = 25 MIN_CLUSTER_SIZE = 5 MAX_CLUSTER_SIZE = 100 LABEL_STOP_WORDS = set(""" abstract available research study studies paper papers article articles journal based using use uses used result results effect effects model models approach analysis data information system systems electronic markets market business """.split()) PAJAIS_25 = [ "IS Strategy and Management", "E-Commerce and E-Business", "IT Adoption and Diffusion", "Business Intelligence and Analytics", "Social Commerce and Social Media", "Mobile Commerce and Applications", "Knowledge Management", "Healthcare Information Systems", "Privacy, Security and Trust", "Enterprise Systems and ERP", "Digital Platforms and Ecosystems", "Blockchain and Distributed Ledgers", "Artificial Intelligence and Machine Learning", "Human-Computer Interaction and UX", "Digital Transformation and Innovation", "Financial Technology and Digital Finance", "Supply Chain and Logistics IS", "Smart Systems IoT and Smart Cities", "IS Research Methods and Theory", "Recommender and Personalization Systems", "Digital Marketing and Advertising", "Virtual Teams and Online Collaboration", "Cloud Computing and SaaS", "Big Data Analytics and Data Science", "IS Education and Training", ] CATEGORY_TERMS = { "IS Strategy and Management": "strategy governance value performance management capability alignment", "E-Commerce and E-Business": "e-commerce marketplace online shopping electronic market platform transaction", "IT Adoption and Diffusion": "adoption acceptance intention use continuance utaut tam diffusion", "Business Intelligence and Analytics": "analytics data mining business intelligence decision support prediction", "Social Commerce and Social Media": "social media social commerce online community influencer live streaming", "Mobile Commerce and Applications": "mobile app smartphone m-commerce location based wearable", "Knowledge Management": "knowledge sharing knowledge management learning collaboration expertise", "Healthcare Information Systems": "health healthcare patient medical telemedicine ehealth", "Privacy, Security and Trust": "privacy security trust risk fraud identity protection", "Enterprise Systems and ERP": "erp enterprise system process integration organization", "Digital Platforms and Ecosystems": "platform ecosystem multi-sided digital platform complementor", "Blockchain and Distributed Ledgers": "blockchain distributed ledger smart contract token cryptocurrency", "Artificial Intelligence and Machine Learning": "artificial intelligence machine learning ai algorithm automation robot", "Human-Computer Interaction and UX": "user experience interface interaction usability design hci", "Digital Transformation and Innovation": "digital transformation innovation digitization disruption business model", "Financial Technology and Digital Finance": "fintech finance payment robo-advisor banking investment", "Supply Chain and Logistics IS": "supply chain logistics procurement inventory operations", "Smart Systems IoT and Smart Cities": "iot internet of things sensor smart city smart service", "IS Research Methods and Theory": "method theory literature review framework model research design", "Recommender and Personalization Systems": "recommendation recommender personalization preference choice", "Digital Marketing and Advertising": "advertising marketing consumer brand customer targeting", "Virtual Teams and Online Collaboration": "virtual team collaboration remote work crowd outsourcing", "Cloud Computing and SaaS": "cloud saas service computing infrastructure platform as a service", "Big Data Analytics and Data Science": "big data data science text mining deep learning analytics", "IS Education and Training": "education training learning student teaching mooc", } THEORY_PATTERNS = [ "technology acceptance model", "tam", "utaut", "diffusion of innovation", "task technology fit", "social exchange theory", "institutional theory", "resource based view", "transaction cost", "information systems success", "expectation confirmation", "trust theory", "planned behavior", ] METHOD_PATTERNS = [ "survey", "experiment", "case study", "structural equation", "sem", "regression", "machine learning", "design science", "literature review", "qualitative", "interview", "content analysis", "simulation", ] COMPUTATIONAL_PATTERNS = [ "machine learning", "deep learning", "neural network", "random forest", "support vector", "svm", "classification", "clustering", "topic model", "lda", "natural language processing", "nlp", "text mining", "sentiment", "recommender", "algorithm", "prediction", "analytics", "optimization", ] def _opath(name: str) -> str: return os.path.join(OUTPUT_DIR, name) def _save_json(data: Any, name: str) -> str: path = _opath(name) with open(path, "w", encoding="utf-8") as f: json.dump(data, f, indent=2, ensure_ascii=False) return path def _clean_text(text: Any) -> str: text = re.sub(r"\s+", " ", str(text or "")).strip() text = re.sub(r"©.*$", "", text).strip() return text def _first_existing(df: pd.DataFrame, candidates: List[str]) -> str: lowered = {c.lower(): c for c in df.columns} for name in candidates: if name.lower() in lowered: return lowered[name.lower()] return "" def load_corpus(filepath: str) -> Tuple[pd.DataFrame, Dict[str, Any]]: df = pd.read_csv(filepath, encoding="utf-8-sig", on_bad_lines="skip") title_col = _first_existing(df, ["Title"]) abstract_col = _first_existing(df, ["Abstract"]) doi_col = _first_existing(df, ["DOI"]) year_col = _first_existing(df, ["Year"]) journal_col = _first_existing(df, ["Source title", "Journal"]) cited_col = _first_existing(df, ["Cited by", "Citations"]) if not title_col or not abstract_col: raise ValueError("CSV must include Title and Abstract columns.") df = df.copy() df["__title"] = df[title_col].map(_clean_text) df["__abstract"] = df[abstract_col].map(_clean_text) df["__doi"] = df[doi_col].fillna("").map(str) if doi_col else "" df["__combined"] = (df["__title"] + ". " + df["__abstract"]).map(_clean_text) df = df[df["__combined"].str.len() > 80].reset_index(drop=True) df["__paper_id"] = np.arange(len(df)) if cited_col: df["__cited_by"] = pd.to_numeric(df[cited_col], errors="coerce").fillna(0) else: df["__cited_by"] = 0 years = pd.to_numeric(df[year_col], errors="coerce") if year_col else pd.Series(dtype=float) journal = df[journal_col].dropna().astype(str).mode().iloc[0] if journal_col and not df[journal_col].dropna().empty else "Unknown" config = { "filepath": filepath, "journal": journal, "rows": int(len(df)), "year_min": int(years.min()) if not years.dropna().empty else None, "year_max": int(years.max()) if not years.dropna().empty else None, "title_column": title_col, "abstract_column": abstract_col, "doi_column": doi_col or "missing", "combined_field": "Title + Abstract; DOI retained as paper identifier", "generated_at": datetime.now().isoformat(timespec="seconds"), } _save_json(config, "corpus_config.json") return df, config def _embed_documents(texts: List[str]) -> Tuple[np.ndarray, Dict[str, Any]]: errors = [] try: import torch from transformers import AutoModel, AutoTokenizer model_name = "allenai/specter2_base" tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModel.from_pretrained(model_name) model.eval() batches = [] with torch.no_grad(): for start in range(0, len(texts), 8): batch = texts[start:start + 8] encoded = tokenizer( batch, padding=True, truncation=True, max_length=512, return_tensors="pt", ) output = model(**encoded) mask = encoded["attention_mask"].unsqueeze(-1) pooled = (output.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1) batches.append(pooled.cpu().numpy()) vectors = normalize(np.vstack(batches)) return np.asarray(vectors, dtype=np.float32), { "embedding_model": "allenai/specter2_base", "embedding_note": "SPECTER2 transformer embeddings from Title + Abstract; DOI retained as paper identifier.", } except Exception as exc: errors.append(f"transformers allenai/specter2_base: {exc.__class__.__name__}: {exc}") for model_name in ["allenai/specter2_base", "allenai-specter"]: try: from sentence_transformers import SentenceTransformer model = SentenceTransformer(model_name) vectors = model.encode( texts, normalize_embeddings=True, batch_size=16, show_progress_bar=True, ) return np.asarray(vectors, dtype=np.float32), { "embedding_model": model_name, "embedding_note": "Transformer embeddings. SPECTER2 attempted first.", } except Exception as exc: errors.append(f"{model_name}: {exc.__class__.__name__}: {exc}") vectorizer = TfidfVectorizer( max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.85, stop_words="english", ) tfidf = vectorizer.fit_transform(texts) n_components = min(256, max(2, min(tfidf.shape) - 1)) svd = TruncatedSVD(n_components=n_components, random_state=42) vectors = normalize(svd.fit_transform(tfidf)) meta = { "embedding_model": "TF-IDF + TruncatedSVD fallback", "embedding_note": "SPECTER2/Transformer loading failed; deterministic fallback kept the app runnable.", "embedding_errors": errors[-3:], } return np.asarray(vectors, dtype=np.float32), meta def _cluster_metrics(labels: np.ndarray, vectors: np.ndarray) -> Dict[str, Any]: valid = labels >= 0 cluster_ids, counts = np.unique(labels[valid], return_counts=True) n_clusters = int(len(cluster_ids)) noise_ratio = float(np.mean(~valid)) if len(labels) else 1.0 too_small = int(np.sum(counts < MIN_CLUSTER_SIZE)) if len(counts) else 999 too_large = int(np.sum(counts > MAX_CLUSTER_SIZE)) if len(counts) else 999 silhouette = -1.0 if n_clusters > 1 and np.sum(valid) > n_clusters: sample_vectors = vectors[valid] sample_labels = labels[valid] if len(sample_vectors) > 800: rng = np.random.default_rng(42) sample_idx = rng.choice(len(sample_vectors), 800, replace=False) sample_vectors = sample_vectors[sample_idx] sample_labels = sample_labels[sample_idx] try: silhouette = float(silhouette_score(sample_vectors, sample_labels, metric="cosine")) except Exception: silhouette = -1.0 range_penalty = 0 if n_clusters < TARGET_MIN_CLUSTERS: range_penalty = (TARGET_MIN_CLUSTERS - n_clusters) * 3 if n_clusters > TARGET_MAX_CLUSTERS: range_penalty = (n_clusters - TARGET_MAX_CLUSTERS) * 3 score = ( range_penalty + too_small * 2 + too_large * 4 + noise_ratio * 8 - silhouette ) return { "n_clusters": n_clusters, "noise_ratio": round(noise_ratio, 4), "min_size": int(counts.min()) if len(counts) else 0, "max_size": int(counts.max()) if len(counts) else 0, "too_small": too_small, "too_large": too_large, "silhouette_cosine": round(silhouette, 4), "score": round(float(score), 4), } def _compact_labels(labels: np.ndarray) -> np.ndarray: labels = np.asarray(labels, dtype=int).copy() positive = [int(x) for x in sorted(np.unique(labels)) if x >= 0] mapping = {old: new for new, old in enumerate(positive)} return np.asarray([mapping.get(int(x), -1) for x in labels], dtype=int) def _repair_labels(labels: np.ndarray, vectors: np.ndarray) -> np.ndarray: labels = _compact_labels(labels) if np.all(labels < 0): k = min(20, max(TARGET_MIN_CLUSTERS, len(vectors) // 35)) return KMeans(n_clusters=k, random_state=42, n_init=20).fit_predict(vectors) noise_idx = np.where(labels < 0)[0] if len(noise_idx): valid_ids = [int(x) for x in sorted(np.unique(labels)) if x >= 0] centroids = np.asarray([vectors[labels == cid].mean(axis=0) for cid in valid_ids]) nearest = cosine_similarity(vectors[noise_idx], centroids).argmax(axis=1) labels[noise_idx] = np.asarray([valid_ids[i] for i in nearest], dtype=int) labels = _compact_labels(labels) next_id = int(labels.max()) + 1 for cid in list(sorted(np.unique(labels))): idx = np.where(labels == cid)[0] if len(idx) <= MAX_CLUSTER_SIZE: continue centroid = vectors[idx].mean(axis=0, keepdims=True) order = cosine_similarity(vectors[idx], centroid).ravel().argsort()[::-1] ordered_idx = idx[order] chunks = [ordered_idx[i:i + MAX_CLUSTER_SIZE] for i in range(0, len(ordered_idx), MAX_CLUSTER_SIZE)] labels[chunks[0]] = cid for chunk in chunks[1:]: labels[chunk] = next_id next_id += 1 labels = _compact_labels(labels) changed = True while changed: changed = False ids, counts = np.unique(labels, return_counts=True) tiny = [int(cid) for cid, count in zip(ids, counts) if count < MIN_CLUSTER_SIZE] if not tiny or len(ids) <= TARGET_MIN_CLUSTERS: break for cid in tiny: idx = np.where(labels == cid)[0] other_ids = [int(x) for x in np.unique(labels) if int(x) != cid] if not other_ids: continue centroid = vectors[idx].mean(axis=0, keepdims=True) other_centroids = np.asarray([vectors[labels == oid].mean(axis=0) for oid in other_ids]) nearest_order = cosine_similarity(centroid, other_centroids).ravel().argsort()[::-1] target = other_ids[int(nearest_order[0])] labels[idx] = target labels = _compact_labels(labels) changed = True break labels = _compact_labels(labels) while len(np.unique(labels)) < TARGET_MIN_CLUSTERS: ids, counts = np.unique(labels, return_counts=True) largest = int(ids[np.argmax(counts)]) idx = np.where(labels == largest)[0] if len(idx) < MIN_CLUSTER_SIZE * 2: break centroid = vectors[idx].mean(axis=0, keepdims=True) order = cosine_similarity(vectors[idx], centroid).ravel().argsort()[::-1] split_at = len(idx) // 2 if split_at < MIN_CLUSTER_SIZE or len(idx) - split_at < MIN_CLUSTER_SIZE: break labels[idx[order[split_at:]]] = int(labels.max()) + 1 labels = _compact_labels(labels) return _compact_labels(labels) def _optimizer_recommendation(metrics: Dict[str, Any]) -> str: if metrics["n_clusters"] < TARGET_MIN_CLUSTERS: return "Increase UMAP n_neighbors separation pressure or lower HDBSCAN min_cluster_size." if metrics["n_clusters"] > TARGET_MAX_CLUSTERS: return "Raise HDBSCAN min_cluster_size or increase UMAP n_neighbors to merge nearby themes." if metrics["max_size"] > MAX_CLUSTER_SIZE: return "Split dominant clusters by lowering min_cluster_size or lowering min_samples." if metrics["noise_ratio"] > 0.25: return "Reduce min_samples and keep UMAP n_components at 5-10 to reduce noise." return "Keep this parameter set; it satisfies the 15-25 cluster target best." def _run_umap_hdbscan(vectors: np.ndarray) -> Tuple[np.ndarray, np.ndarray, Dict[str, Any], List[Dict[str, Any]]]: candidates = [] best = None best_reduced = None best_params = None try: import umap try: import hdbscan as external_hdbscan hdbscan_backend = "external" except Exception: external_hdbscan = None from sklearn.cluster import HDBSCAN as SklearnHDBSCAN hdbscan_backend = "sklearn" for n_neighbors in [10, 20, 35]: for n_components in [5, 10]: reduced = umap.UMAP( n_neighbors=n_neighbors, n_components=n_components, min_dist=0.0, metric="cosine", random_state=42, ).fit_transform(vectors) for min_cluster_size in [5, 8, 12, 16, 25]: for min_samples in [1, 3, None]: if hdbscan_backend == "external": clusterer = external_hdbscan.HDBSCAN( min_cluster_size=min_cluster_size, min_samples=min_samples, metric="euclidean", prediction_data=True, ) else: clusterer = SklearnHDBSCAN( min_cluster_size=min_cluster_size, min_samples=min_samples, metric="euclidean", ) labels = clusterer.fit_predict(reduced) labels = _repair_labels(labels, vectors) metrics = _cluster_metrics(labels, vectors) params = { "algorithm": f"UMAP + HDBSCAN ({hdbscan_backend})", "umap_n_neighbors": n_neighbors, "umap_n_components": n_components, "umap_metric": "cosine", "hdbscan_min_cluster_size": min_cluster_size, "hdbscan_min_samples": min_samples, "hdbscan_metric": "euclidean", } row = {**params, **metrics, "optimizer_recommendation": _optimizer_recommendation(metrics)} candidates.append(row) if best is None or metrics["score"] < best[1]["score"]: best = (labels, metrics, getattr(clusterer, "probabilities_", np.ones(len(labels)))) best_reduced = reduced best_params = params if ( TARGET_MIN_CLUSTERS <= metrics["n_clusters"] <= TARGET_MAX_CLUSTERS and metrics["too_small"] == 0 and metrics["too_large"] == 0 and metrics["noise_ratio"] <= 0.25 ): probs = getattr(clusterer, "probabilities_", np.ones(len(labels))) if len(probs) != len(labels): probs = np.ones(len(labels)) return labels, probs, {**params, **metrics}, candidates except Exception as exc: candidates.append({ "algorithm": "UMAP + HDBSCAN", "error": f"{exc.__class__.__name__}: {exc}", "optimizer_recommendation": "UMAP is unavailable in this Python install; try PCA manifold fallback with HDBSCAN.", }) try: from sklearn.cluster import HDBSCAN as SklearnHDBSCAN for n_components in [5, 10, 20]: pca_components = min(n_components, max(2, min(vectors.shape) - 1)) reduced = PCA(n_components=pca_components, random_state=42).fit_transform(vectors) for min_cluster_size in [5, 8, 12, 16, 25]: for min_samples in [1, 3, None]: clusterer = SklearnHDBSCAN( min_cluster_size=min_cluster_size, min_samples=min_samples, metric="euclidean", ) labels = _repair_labels(clusterer.fit_predict(reduced), vectors) metrics = _cluster_metrics(labels, vectors) params = { "algorithm": "PCA manifold fallback + HDBSCAN (sklearn; UMAP unavailable locally)", "pca_n_components": pca_components, "hdbscan_min_cluster_size": min_cluster_size, "hdbscan_min_samples": min_samples, "hdbscan_metric": "euclidean", } row = {**params, **metrics, "optimizer_recommendation": _optimizer_recommendation(metrics)} candidates.append(row) probs = np.ones(len(labels)) if best is None or metrics["score"] < best[1]["score"]: best = (labels, metrics, probs) best_params = params if ( TARGET_MIN_CLUSTERS <= metrics["n_clusters"] <= TARGET_MAX_CLUSTERS and metrics["too_small"] == 0 and metrics["too_large"] == 0 ): return labels, probs, {**params, **metrics}, candidates except Exception as fallback_exc: candidates.append({ "algorithm": "PCA manifold fallback + HDBSCAN", "error": f"{fallback_exc.__class__.__name__}: {fallback_exc}", "optimizer_recommendation": "Use deterministic KMeans fallback so the pipeline still completes.", }) if best is not None and TARGET_MIN_CLUSTERS <= best[1]["n_clusters"] <= TARGET_MAX_CLUSTERS: return best[0], best[2], {**best_params, **best[1]}, candidates n_clusters = min(20, max(TARGET_MIN_CLUSTERS, len(vectors) // 35)) labels = KMeans(n_clusters=n_clusters, random_state=42, n_init=20).fit_predict(vectors) labels = _repair_labels(labels, vectors) metrics = _cluster_metrics(labels, vectors) params = { "algorithm": "KMeans fallback after UMAP/HDBSCAN optimization", "n_clusters": n_clusters, **metrics, } sims = np.zeros(len(labels), dtype=float) for cid in np.unique(labels): idx = np.where(labels == cid)[0] centroid = vectors[idx].mean(axis=0, keepdims=True) sims[idx] = cosine_similarity(vectors[idx], centroid).ravel() probs = np.clip((sims + 1) / 2, 0, 1) candidates.append({**params, "optimizer_recommendation": "Fallback used to guarantee a crisp 15-25 cluster solution."}) return labels, probs, params, candidates def _top_terms(texts: List[str], top_n: int = 8) -> List[str]: if not texts: return [] vec = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=1, max_features=1200) matrix = vec.fit_transform(texts) scores = np.asarray(matrix.sum(axis=0)).ravel() terms = np.asarray(vec.get_feature_names_out()) order = scores.argsort()[::-1] cleaned = [] for term in terms[order]: pieces = term.split() if ( len(term) > 2 and not term.isdigit() and term not in cleaned and not all(piece in LABEL_STOP_WORDS for piece in pieces) and not any(piece in {"doi", "s12525", "1007"} for piece in pieces) ): cleaned.append(term) if len(cleaned) >= top_n: break return cleaned def _category_for_terms(terms: List[str]) -> Tuple[str, float]: query = " ".join(terms) docs = [query] + list(CATEGORY_TERMS.values()) vec = TfidfVectorizer(stop_words="english").fit_transform(docs) sims = cosine_similarity(vec[0], vec[1:]).ravel() best = int(np.argmax(sims)) return PAJAIS_25[best], float(sims[best]) def _title_from_terms(terms: List[str], category: str) -> str: priority = [ "artificial intelligence", "machine learning", "blockchain", "digital platform", "social commerce", "e-commerce", "privacy", "trust", "fintech", "robo advisor", "analytics", "mobile", "digital transformation", "recommender", ] joined = " ".join(terms) for phrase in priority: if phrase in joined: return phrase.title().replace("Ai", "AI") if terms: return " ".join(t.title() for t in terms[:3]) return category def _optional_mistral_label(cluster: Dict[str, Any]) -> str: api_key = os.environ.get("MISTRAL_API_KEY", "").strip() if not api_key: return "" try: from langchain_core.output_parsers import StrOutputParser from langchain_core.prompts import PromptTemplate from langchain_mistralai import ChatMistralAI prompt = PromptTemplate.from_template( "Name this academic topic cluster in 3-7 words. " "Use only the evidence, no markdown.\nKeywords: {keywords}\nTitles:\n{titles}" ) llm = ChatMistralAI(model="mistral-small-latest", api_key=api_key, temperature=0.1) return (prompt | llm | StrOutputParser()).invoke({ "keywords": ", ".join(cluster["keywords"]), "titles": "\n".join(cluster["top_titles"][:3]), }).strip().strip('"')[:80] except Exception: return "" def _optional_mistral_council(cluster: Dict[str, Any]) -> List[Dict[str, str]]: api_key = os.environ.get("MISTRAL_API_KEY", "").strip() if not api_key: return [] try: from langchain_core.output_parsers import StrOutputParser from langchain_core.prompts import PromptTemplate from langchain_mistralai import ChatMistralAI personas = [ ("LLM Council A - Domain Labeler", "Name the Information Systems research theme."), ("LLM Council B - Methods Skeptic", "Name the theme conservatively using only the three titles and keywords."), ("LLM Council C - Taxonomy Judge", "Name the theme and prefer PAJAIS-style terminology when appropriate."), ] llm = ChatMistralAI(model="mistral-small-latest", api_key=api_key, temperature=0.1) prompt = PromptTemplate.from_template( "{task}\nReturn one concise 3-7 word label only.\n" "Keywords: {keywords}\n" "High-probability paper titles:\n{titles}" ) votes = [] for member, task in personas: label = (prompt | llm | StrOutputParser()).invoke({ "task": task, "keywords": ", ".join(cluster["keywords"]), "titles": "\n".join(cluster["top_titles"][:3]), }).strip().strip('"')[:80] if label: votes.append({"member": member, "label": label, "method": "Mistral LLM council vote using top-3 high-probability paper titles"}) return votes except Exception: return [] def _build_cluster_summaries(df: pd.DataFrame, vectors: np.ndarray, labels: np.ndarray, probabilities: np.ndarray) -> List[Dict[str, Any]]: summaries = [] for cid in sorted([int(x) for x in np.unique(labels) if x >= 0]): idx = np.where(labels == cid)[0] texts = df.loc[idx, "__combined"].tolist() titles = df.loc[idx, "__title"].tolist() keywords = _top_terms(texts, 10) category, category_score = _category_for_terms(keywords + titles[:5]) centroid = vectors[idx].mean(axis=0, keepdims=True) sims = cosine_similarity(vectors[idx], centroid).ravel() rank = np.lexsort((-sims, -probabilities[idx]))[::-1] top_idx = idx[rank[:3]] cluster_draft = { "keywords": keywords, "top_titles": df.loc[top_idx, "__title"].tolist(), } vote_keyword = _title_from_terms(keywords, category) vote_taxonomy = category llm_votes = _optional_mistral_council(cluster_draft) votes = llm_votes or [ {"member": "Council A - Keyword Extractor", "label": vote_keyword, "method": "deterministic cluster TF-IDF terms"}, {"member": "Council B - PAJAIS Mapper", "label": vote_taxonomy, "method": "taxonomy-term cosine validation"}, {"member": "Council C - Local Semantic Judge", "label": vote_keyword, "method": "local fallback; configure MISTRAL_API_KEY for live 3-LLM council"}, ] normalized_votes = [v["label"].strip().lower() for v in votes if v["label"]] agreement = Counter(normalized_votes).most_common(1)[0][1] / max(1, len(normalized_votes)) final_label = Counter([v["label"] for v in votes if v["label"]]).most_common(1)[0][0] if votes else vote_keyword summaries.append({ "cluster_id": cid, "label": final_label, "category": category, "confidence": round(float(np.mean(probabilities[idx]) * 0.65 + agreement * 0.35), 3), "category_confidence": round(category_score, 3), "sentence_count": int(len(idx)), "paper_count": int(len(idx)), "top_sentences": df.loc[top_idx, "__abstract"].str[:350].tolist(), "top_titles": df.loc[top_idx, "__title"].tolist(), "keywords": keywords, "centroid": centroid.ravel().tolist(), "paper_indices": [int(i) for i in idx], "council_votes": votes, "agreement_score": round(float(agreement), 3), "is_niche": bool(len(idx) <= 8), "reasoning": f"{len(idx)} papers; top terms: {', '.join(keywords[:6])}; council agreement {agreement:.2f}.", }) summaries.sort(key=lambda x: x["paper_count"], reverse=True) return summaries def _generate_charts(summaries: List[Dict[str, Any]]) -> None: chart_dir = _opath("combined_charts") os.makedirs(chart_dir, exist_ok=True) if not summaries: return centroids = np.asarray([s["centroid"] for s in summaries]) sizes = [s["paper_count"] for s in summaries] labels = [s["label"] for s in summaries] coords = PCA(n_components=2, random_state=42).fit_transform(centroids) if len(summaries) > 1 else np.zeros((1, 2)) fig = px.scatter( x=coords[:, 0], y=coords[:, 1], size=sizes, color=[s["category"] for s in summaries], hover_name=labels, title="Intertopic Map - Title + Abstract + DOI", template="plotly_dark", ) fig.write_html(os.path.join(chart_dir, "intertopic_map.html"), include_plotlyjs="cdn", full_html=True) bar = px.bar( x=labels, y=sizes, title="Cluster Sizes", labels={"x": "Cluster", "y": "Papers"}, template="plotly_dark", ) bar.write_html(os.path.join(chart_dir, "bar_chart.html"), include_plotlyjs="cdn", full_html=True) tree = px.treemap( names=labels, parents=["clusters"] * len(labels), values=sizes, title="Topic Treemap", ) tree.write_html(os.path.join(chart_dir, "treemap.html"), include_plotlyjs="cdn", full_html=True) def _taxonomy_map(summaries: List[Dict[str, Any]]) -> Dict[str, Any]: mapping = {} for s in summaries: is_novel = s["category_confidence"] < 0.08 mapping[s["label"]] = { "theme": s["label"], "pajais_match": "NOVEL" if is_novel else s["category"], "match_confidence": s["category_confidence"], "reasoning": s["reasoning"], "is_novel": is_novel, } covered = {v["pajais_match"] for v in mapping.values() if not v["is_novel"]} novel = [k for k, v in mapping.items() if v["is_novel"]] return { "run_key": "combined", "taxonomy_mapping": mapping, "novel_themes": novel, "pajais_gap_categories": [c for c in PAJAIS_25 if c not in covered], "coverage_stats": { "total_themes": len(mapping), "mapped": len(mapping) - len(novel), "novel": len(novel), }, } def _write_comparison_csv(summaries: List[Dict[str, Any]]) -> str: rows = [] for s in summaries: rows.append({ "Cluster_ID": s["cluster_id"], "Final_Label": s["label"], "PAJAIS_Category": s["category"], "Papers": s["paper_count"], "Confidence": s["confidence"], "Agreement": s["agreement_score"], "Top_Keywords": "; ".join(s["keywords"][:8]), "Top_3_Paper_Titles": " | ".join(s["top_titles"][:3]), "Validation_Status": "VALIDATED" if s["confidence"] >= 0.55 else "REVIEW", }) path = _opath("comparison.csv") pd.DataFrame(rows).to_csv(path, index=False) return path def _extract_matches(text: str, patterns: List[str]) -> List[str]: lower = text.lower() return sorted({p for p in patterns if re.search(r"\b" + re.escape(p.lower()) + r"\b", lower)}) def _write_tccm_validation(df: pd.DataFrame) -> str: top = df.sort_values("__cited_by", ascending=False).head(100).copy() rows = [] for rank, (_, row) in enumerate(top.iterrows(), start=1): text = f"{row['__title']} {row['__abstract']}" theories = _extract_matches(text, THEORY_PATTERNS) methods = _extract_matches(text, METHOD_PATTERNS) techniques = _extract_matches(text, COMPUTATIONAL_PATTERNS) context = [] for category, terms in CATEGORY_TERMS.items(): if any(term in text.lower() for term in terms.split()[:6]): context.append(category) rows.append({ "Paper ID": rank, "Title": row["__title"], "DOI": row["__doi"], "Cited_By": row["__cited_by"], "Theory_Regex": "; ".join(theories), "Context_Taxonomy": "; ".join(context[:3]), "Characteristics_Constructs": "; ".join(_top_terms([text], 6)), "Method_Regex": "; ".join(methods), "Computational_Techniques_Regex": "; ".join(techniques), "Validation_Method_1": "dictionary/regex extraction", "Validation_Method_2": "cluster/category semantic match", "Validation_Status": "VALIDATED" if (methods or techniques or theories) else "NEEDS_FULL_TEXT_REVIEW", }) path = _opath("tccm_validation.csv") pd.DataFrame(rows).to_csv(path, index=False) return path def parse_notebooklm_tccm_text(raw_text: str) -> str: """Parse NotebookLM's copied table text into a Google-Sheets-ready CSV.""" columns = [ "Paper ID", "Paper Citation", "Study Type", "Dependent Variable(s) (DV)", "Independent Variable(s) (IVs)", "Mediator(s)", "Moderator(s)", "Relationship Direction", "Evidence Snippet", ] skip = {c.lower() for c in columns} skip.update({ "today • 14:09", "ask a question or create something", "notebooklm can be inaccurate; please double-check its responses.", "i want minimum 1 dependent and 1 independent variable", "i want minimum 1 dependernt and 1 independent variable", }) lines = [ re.sub(r"\s+", " ", line.strip()) for line in str(raw_text or "").splitlines() if line.strip() ] rows = [] current_id = None fields = [] def is_id(line: str) -> bool: return bool(re.fullmatch(r"\d{1,3}", line)) and 1 <= int(line) <= 100 def finalise(): if current_id is None or not fields: return cleaned = [f for f in fields if f.lower() not in skip] if len(cleaned) < 2: return fixed = cleaned[:7] evidence = " ".join(cleaned[7:]) if len(cleaned) > 7 else "" while len(fixed) < 7: fixed.append("NA") rows.append({ "Paper ID": int(current_id), "Paper Citation": fixed[0], "Study Type": fixed[1], "Dependent Variable(s) (DV)": fixed[2], "Independent Variable(s) (IVs)": fixed[3], "Mediator(s)": fixed[4], "Moderator(s)": fixed[5], "Relationship Direction": fixed[6], "Evidence Snippet": evidence, }) seen_header = False for line in lines: lower = line.lower() if lower in skip: seen_header = True continue if is_id(line): if seen_header or current_id is not None: finalise() current_id = int(line) fields = [] continue if current_id is not None: fields.append(line) finalise() df = pd.DataFrame(rows, columns=columns) if not df.empty: df = df.sort_values("Paper ID").drop_duplicates("Paper ID", keep="last") path = _opath("notebooklm_extraction.csv") df.to_csv(path, index=False) return path def write_tccm_dual_validation(notebooklm_path: str = "", second_llm_path: str = "") -> str: base_path = _opath("tccm_validation.csv") base = pd.read_csv(base_path) if os.path.exists(base_path) else pd.DataFrame() def load_optional(path: str, prefix: str) -> pd.DataFrame: if not path or not os.path.exists(path): return pd.DataFrame() loaded = pd.read_csv(path, encoding="utf-8-sig", on_bad_lines="skip") rename = {} for col in loaded.columns: low = col.lower().strip() if low in {"paper id", "paper_id", "rank", "id"}: rename[col] = "Paper ID" elif low in {"title", "paper title", "article title"}: rename[col] = "Title" elif low in {"paper citation", "citation"}: rename[col] = f"{prefix}_Citation" elif "study type" in low: rename[col] = f"{prefix}_Study_Type" elif "independent" in low or low == "iv" or "ivs" in low: rename[col] = f"{prefix}_IV" elif "dependent" in low or low == "dv": rename[col] = f"{prefix}_DV" elif "mediator" in low: rename[col] = f"{prefix}_Mediator" elif "moderator" in low: rename[col] = f"{prefix}_Moderator" elif "relationship direction" in low or low == "direction": rename[col] = f"{prefix}_Relationship_Direction" elif "evidence" in low or "snippet" in low: rename[col] = f"{prefix}_Evidence" elif low in {"doi"}: rename[col] = "DOI" elif "theor" in low: rename[col] = f"{prefix}_Theory" elif "context" in low: rename[col] = f"{prefix}_Context" elif "method" in low: rename[col] = f"{prefix}_Method" elif "variable" in low or "construct" in low or "characteristic" in low: rename[col] = f"{prefix}_Variables" elif "technique" in low or "comput" in low: rename[col] = f"{prefix}_Computational_Techniques" loaded = loaded.rename(columns=rename) keep = [c for c in loaded.columns if c in {"Paper ID", "Title", "DOI"} or c.startswith(prefix)] return loaded[keep].copy() notebook = load_optional(notebooklm_path, "NotebookLM") second = load_optional(second_llm_path, "SecondLLM") if base.empty: merged = pd.DataFrame() else: merged = base.copy() if not notebook.empty: key = "Paper ID" if "Paper ID" in notebook.columns and "Paper ID" in merged.columns else ("DOI" if "DOI" in notebook.columns and merged["DOI"].astype(str).str.len().gt(0).any() else "Title") merged = merged.merge(notebook, how="left", on=key, suffixes=("", "_NotebookLM_Input")) if not second.empty: key = "Paper ID" if "Paper ID" in second.columns and "Paper ID" in merged.columns else ("DOI" if "DOI" in second.columns and merged["DOI"].astype(str).str.len().gt(0).any() else "Title") merged = merged.merge(second, how="left", on=key, suffixes=("", "_SecondLLM_Input")) if merged.empty: merged = pd.DataFrame([{ "Compliance_Status": "PENDING", "Required_Action": "Run topic pipeline first, then upload NotebookLM and second-LLM extraction CSV files.", }]) else: has_notebook = any(c.startswith("NotebookLM") for c in merged.columns) has_second = any(c.startswith("SecondLLM") for c in merged.columns) def row_status(row): regex_hit = any( str(row.get(c, "")).strip().lower() not in {"", "nan", "none"} for c in ["Theory_Regex", "Method_Regex", "Computational_Techniques_Regex", "Characteristics_Constructs"] ) notebook_cols = [c for c in merged.columns if c.startswith("NotebookLM_") and c != "NotebookLM_File_Loaded"] second_cols = [c for c in merged.columns if c.startswith("SecondLLM_")] notebook_hit = any(str(row.get(c, "")).strip().lower() not in {"", "nan", "none", "false"} for c in notebook_cols) second_hit = any(str(row.get(c, "")).strip().lower() not in {"", "nan", "none", "false"} for c in second_cols) if notebook_hit and second_hit: return "COMPLIANT_NOTEBOOKLM_PLUS_SECOND_LLM" if notebook_hit and regex_hit: return "PARTIAL_NOTEBOOKLM_PLUS_REGEX" if second_hit and regex_hit: return "PARTIAL_SECOND_LLM_PLUS_REGEX" return "PENDING_NOTEBOOKLM_AND_SECOND_LLM" merged["NotebookLM_File_Loaded"] = has_notebook merged["Second_LLM_File_Loaded"] = has_second merged["Final_TCCM_Compliance_Status"] = merged.apply(row_status, axis=1) merged["Required_Action"] = np.where( merged["Final_TCCM_Compliance_Status"].eq("COMPLIANT_NOTEBOOKLM_PLUS_SECOND_LLM"), "Ready for mentor review with dual AI validation.", "Upload NotebookLM extraction and second LLM extraction from full-text PDFs before claiming final compliance.", ) path = _opath("tccm_dual_validation.csv") merged.to_csv(path, index=False) return path def write_compliance_checklist(params: Dict[str, Any], meta: Dict[str, Any], summaries: List[Dict[str, Any]]) -> str: has_live_llm = any( "Mistral LLM council vote" in vote.get("method", "") for summary in summaries for vote in summary.get("council_votes", []) ) notebook_loaded = os.path.exists(_opath("notebooklm_extraction.csv")) dual_path = _opath("tccm_dual_validation.csv") second_loaded = False if os.path.exists(dual_path): try: dual_df = pd.read_csv(dual_path) second_loaded = bool(dual_df.get("Second_LLM_File_Loaded", pd.Series([False])).astype(bool).any()) except Exception: second_loaded = False rows = [ { "Requirement": "15 to 25 crisp topic clusters", "Status": "PASS" if TARGET_MIN_CLUSTERS <= params.get("n_clusters", 0) <= TARGET_MAX_CLUSTERS else "FAIL", "Evidence": f"{params.get('n_clusters')} clusters generated.", "File": "comparison.csv / cluster_optimization_log.csv", }, { "Requirement": "Minimum 5 and maximum 100 papers per cluster", "Status": "PASS" if params.get("min_size", 0) >= MIN_CLUSTER_SIZE and params.get("max_size", 999) <= MAX_CLUSTER_SIZE else "FAIL", "Evidence": f"min={params.get('min_size')}, max={params.get('max_size')}.", "File": "cluster_optimization_log.csv", }, { "Requirement": "Cluster optimization loop with parameter recommendations", "Status": "PASS" if os.path.exists(_opath("cluster_optimization_log.csv")) else "FAIL", "Evidence": "Optimizer records attempted settings, scores, and recommendations.", "File": "cluster_optimization_log.csv", }, { "Requirement": "Top three high-probability paper titles fed for labels", "Status": "PASS" if all(len(s.get("top_titles", [])) >= 3 for s in summaries) else "REVIEW", "Evidence": "Top_3_Paper_Titles included for every cluster.", "File": "comparison.csv", }, { "Requirement": "LLM council visible in app, not just story text", "Status": "PASS" if os.path.exists(_opath("llm_council_validation.csv")) else "FAIL", "Evidence": "Animated council board and vote table in Council Validation tab.", "File": "llm_council_validation.csv / app.py", }, { "Requirement": "Live 3-LLM council labels", "Status": "PASS" if has_live_llm else "CONFIG_REQUIRED", "Evidence": "Set MISTRAL_API_KEY in Space secrets to switch from local fallback to live Mistral council.", "File": "llm_council_validation.csv", }, { "Requirement": "SPECTER2 paper-level embeddings", "Status": "PASS" if "specter2" in str(meta.get("embedding_model", "")).lower() else "ENV_FALLBACK", "Evidence": meta.get("embedding_note", ""), "File": "run_metadata.json", }, { "Requirement": "UMAP + HDBSCAN density clustering", "Status": "PASS" if str(params.get("algorithm", "")).lower().startswith("umap + hdbscan") else "ENV_FALLBACK", "Evidence": str(params.get("algorithm", "")), "File": "run_metadata.json / cluster_optimization_log.csv", }, { "Requirement": "TCCM corpus loaded and vectorised for computational techniques", "Status": "PASS" if os.path.exists(_opath("tccm_validation.csv")) else "FAIL", "Evidence": "Top-cited 100 papers exported with regex and semantic computational technique extraction.", "File": "tccm_validation.csv", }, { "Requirement": "NotebookLM output plus another LLM method for TCCM", "Status": "PASS" if notebook_loaded and second_loaded else ("PARTIAL" if notebook_loaded else "INPUT_REQUIRED"), "Evidence": ( "NotebookLM extraction loaded; second LLM extraction still required." if notebook_loaded and not second_loaded else ("NotebookLM and second LLM extraction loaded." if second_loaded else "Use TCCM Dual Validation tab to upload NotebookLM CSV and second LLM CSV.") ), "File": "tccm_dual_validation.csv", }, { "Requirement": "Formal mentor approval before final submission", "Status": "MANUAL_REQUIRED", "Evidence": "Cannot be automated; get faculty mentor approval.", "File": "mentor approval evidence", }, ] path = _opath("compliance_checklist.csv") pd.DataFrame(rows).to_csv(path, index=False) _save_json(rows, "compliance_checklist.json") return path def _write_validation_files(summaries: List[Dict[str, Any]], optimizer_log: List[Dict[str, Any]], params: Dict[str, Any], meta: Dict[str, Any]) -> None: council = [] for s in summaries: for vote in s["council_votes"]: council.append({ "cluster_id": s["cluster_id"], "final_label": s["label"], "member": vote["member"], "member_label": vote["label"], "method": vote["method"], "top_3_titles_used": " | ".join(s.get("top_titles", [])[:3]), "agreement_score": s["agreement_score"], "confidence": s["confidence"], }) pd.DataFrame(council).to_csv(_opath("llm_council_validation.csv"), index=False) _save_json(council, "llm_council.json") pd.DataFrame(optimizer_log).sort_values("score", na_position="last").to_csv(_opath("cluster_optimization_log.csv"), index=False) _save_json({"selected_parameters": params, "embedding": meta}, "run_metadata.json") def _write_report(config: Dict[str, Any], summaries: List[Dict[str, Any]], params: Dict[str, Any], meta: Dict[str, Any]) -> str: lines = [ "# Topic Modelling Final Submission Report", "", f"Journal: {config.get('journal')}", f"Papers analysed: {config.get('rows')}", f"Years: {config.get('year_min')} to {config.get('year_max')}", "", "## Method", f"The model uses one vector per paper from {config.get('combined_field')}. " f"Embedding model: {meta.get('embedding_model')}. Clustering: {params.get('algorithm')}. " "The optimizer searches UMAP/HDBSCAN parameters and selects the lowest penalty solution " "against the required 15-25 clusters, 5 minimum papers per cluster, and 100 maximum papers per cluster.", "", "## Selected Parameters", "```json", json.dumps(params, indent=2), "```", "", "## Validated Clusters", ] for s in summaries: lines.append( f"- C{s['cluster_id']}: {s['label']} ({s['paper_count']} papers, " f"confidence {s['confidence']}, PAJAIS: {s['category']}). " f"Evidence titles: {' | '.join(s['top_titles'][:3])}" ) lines.extend([ "", "## Validation", "Labels are validated through the in-app council table: keyword extraction, PAJAIS semantic mapping, " "and an LLM labeler when MISTRAL_API_KEY is configured. Without a key, the third council member " "uses a deterministic local semantic fallback, so the app remains executable end to end.", "", "TCCM and computational technique extraction are exported in `tccm_validation.csv` for the top-cited 100 papers. " "Rows marked `NEEDS_FULL_TEXT_REVIEW` should be checked against PDFs before final academic submission. " "Full TCCM compliance requires uploading NotebookLM extraction and a second LLM extraction in the app's " "TCCM Dual Validation tab to generate `tccm_dual_validation.csv`.", ]) path = _opath("topic_model_report.md") with open(path, "w", encoding="utf-8") as f: f.write("\n".join(lines)) with open(_opath("narrative.txt"), "w", encoding="utf-8") as f: f.write("\n".join(lines[:60])) return path def run_complete_pipeline(filepath: str) -> Dict[str, Any]: df, config = load_corpus(filepath) vectors, meta = _embed_documents(df["__combined"].tolist()) np.save(_opath("combined_emb.npy"), vectors) labels, probabilities, params, optimizer_log = _run_umap_hdbscan(vectors) summaries = _build_cluster_summaries(df, vectors, labels, probabilities) _save_json(summaries, "combined_labels.json") _save_json(summaries, "abstract_labels.json") _save_json(summaries, "title_labels.json") _save_json({"sentences": df["__combined"].tolist(), "paper_ids": df["__paper_id"].astype(int).tolist()}, "combined_sentences.json") taxonomy = _taxonomy_map(summaries) _save_json(taxonomy, "taxonomy_map.json") _generate_charts(summaries) comparison_path = _write_comparison_csv(summaries) tccm_path = _write_tccm_validation(df) _write_validation_files(summaries, optimizer_log, params, meta) dual_tccm_path = write_tccm_dual_validation() checklist_path = write_compliance_checklist(params, meta, summaries) report_path = _write_report(config, summaries, params, meta) deliverables = [ comparison_path, _opath("taxonomy_map.json"), _opath("topic_model_report.md"), _opath("narrative.txt"), _opath("cluster_optimization_log.csv"), _opath("llm_council_validation.csv"), _opath("tccm_validation.csv"), dual_tccm_path, checklist_path, _opath("run_metadata.json"), _opath("combined_labels.json"), ] return { "config": config, "parameters": params, "embedding": meta, "clusters": summaries, "taxonomy": taxonomy, "deliverables": [p for p in deliverables if os.path.exists(p)], }