Spaces:
Sleeping
Sleeping
Daksh C Jain
Initial commit: EIS Topic Intelligence — UMAP+HDBSCAN+Mistral council, dark EIS theme, 23 clusters from Enterprise Information Systems corpus
c91d9b4 | import json | |
| import os | |
| import re | |
| from collections import Counter, defaultdict | |
| from datetime import datetime | |
| from typing import Any, Dict, List, Tuple | |
| import numpy as np | |
| import pandas as pd | |
| import plotly.express as px | |
| from sklearn.cluster import AgglomerativeClustering, KMeans | |
| from sklearn.decomposition import PCA, TruncatedSVD | |
| from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | |
| from sklearn.metrics import silhouette_score | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from sklearn.preprocessing import normalize | |
| OUTPUT_DIR = "./outputs" | |
| os.makedirs(OUTPUT_DIR, exist_ok=True) | |
| TARGET_MIN_CLUSTERS = 15 | |
| TARGET_MAX_CLUSTERS = 25 | |
| MIN_CLUSTER_SIZE = 5 | |
| MAX_CLUSTER_SIZE = 100 | |
| LABEL_STOP_WORDS = set(""" | |
| abstract available research study studies paper papers article articles journal | |
| based using use uses used result results effect effects model models approach | |
| analysis data information system systems electronic markets market business | |
| """.split()) | |
| PAJAIS_25 = [ | |
| "IS Strategy and Management", | |
| "E-Commerce and E-Business", | |
| "IT Adoption and Diffusion", | |
| "Business Intelligence and Analytics", | |
| "Social Commerce and Social Media", | |
| "Mobile Commerce and Applications", | |
| "Knowledge Management", | |
| "Healthcare Information Systems", | |
| "Privacy, Security and Trust", | |
| "Enterprise Systems and ERP", | |
| "Digital Platforms and Ecosystems", | |
| "Blockchain and Distributed Ledgers", | |
| "Artificial Intelligence and Machine Learning", | |
| "Human-Computer Interaction and UX", | |
| "Digital Transformation and Innovation", | |
| "Financial Technology and Digital Finance", | |
| "Supply Chain and Logistics IS", | |
| "Smart Systems IoT and Smart Cities", | |
| "IS Research Methods and Theory", | |
| "Recommender and Personalization Systems", | |
| "Digital Marketing and Advertising", | |
| "Virtual Teams and Online Collaboration", | |
| "Cloud Computing and SaaS", | |
| "Big Data Analytics and Data Science", | |
| "IS Education and Training", | |
| ] | |
| CATEGORY_TERMS = { | |
| "IS Strategy and Management": "strategy governance value performance management capability alignment", | |
| "E-Commerce and E-Business": "e-commerce marketplace online shopping electronic market platform transaction", | |
| "IT Adoption and Diffusion": "adoption acceptance intention use continuance utaut tam diffusion", | |
| "Business Intelligence and Analytics": "analytics data mining business intelligence decision support prediction", | |
| "Social Commerce and Social Media": "social media social commerce online community influencer live streaming", | |
| "Mobile Commerce and Applications": "mobile app smartphone m-commerce location based wearable", | |
| "Knowledge Management": "knowledge sharing knowledge management learning collaboration expertise", | |
| "Healthcare Information Systems": "health healthcare patient medical telemedicine ehealth", | |
| "Privacy, Security and Trust": "privacy security trust risk fraud identity protection", | |
| "Enterprise Systems and ERP": "erp enterprise system process integration organization", | |
| "Digital Platforms and Ecosystems": "platform ecosystem multi-sided digital platform complementor", | |
| "Blockchain and Distributed Ledgers": "blockchain distributed ledger smart contract token cryptocurrency", | |
| "Artificial Intelligence and Machine Learning": "artificial intelligence machine learning ai algorithm automation robot", | |
| "Human-Computer Interaction and UX": "user experience interface interaction usability design hci", | |
| "Digital Transformation and Innovation": "digital transformation innovation digitization disruption business model", | |
| "Financial Technology and Digital Finance": "fintech finance payment robo-advisor banking investment", | |
| "Supply Chain and Logistics IS": "supply chain logistics procurement inventory operations", | |
| "Smart Systems IoT and Smart Cities": "iot internet of things sensor smart city smart service", | |
| "IS Research Methods and Theory": "method theory literature review framework model research design", | |
| "Recommender and Personalization Systems": "recommendation recommender personalization preference choice", | |
| "Digital Marketing and Advertising": "advertising marketing consumer brand customer targeting", | |
| "Virtual Teams and Online Collaboration": "virtual team collaboration remote work crowd outsourcing", | |
| "Cloud Computing and SaaS": "cloud saas service computing infrastructure platform as a service", | |
| "Big Data Analytics and Data Science": "big data data science text mining deep learning analytics", | |
| "IS Education and Training": "education training learning student teaching mooc", | |
| } | |
| THEORY_PATTERNS = [ | |
| "technology acceptance model", "tam", "utaut", "diffusion of innovation", | |
| "task technology fit", "social exchange theory", "institutional theory", | |
| "resource based view", "transaction cost", "information systems success", | |
| "expectation confirmation", "trust theory", "planned behavior", | |
| ] | |
| METHOD_PATTERNS = [ | |
| "survey", "experiment", "case study", "structural equation", "sem", | |
| "regression", "machine learning", "design science", "literature review", | |
| "qualitative", "interview", "content analysis", "simulation", | |
| ] | |
| COMPUTATIONAL_PATTERNS = [ | |
| "machine learning", "deep learning", "neural network", "random forest", | |
| "support vector", "svm", "classification", "clustering", "topic model", | |
| "lda", "natural language processing", "nlp", "text mining", "sentiment", | |
| "recommender", "algorithm", "prediction", "analytics", "optimization", | |
| ] | |
| def _opath(name: str) -> str: | |
| return os.path.join(OUTPUT_DIR, name) | |
| def _save_json(data: Any, name: str) -> str: | |
| path = _opath(name) | |
| with open(path, "w", encoding="utf-8") as f: | |
| json.dump(data, f, indent=2, ensure_ascii=False) | |
| return path | |
| def _clean_text(text: Any) -> str: | |
| text = re.sub(r"\s+", " ", str(text or "")).strip() | |
| text = re.sub(r"©.*$", "", text).strip() | |
| return text | |
| def _first_existing(df: pd.DataFrame, candidates: List[str]) -> str: | |
| lowered = {c.lower(): c for c in df.columns} | |
| for name in candidates: | |
| if name.lower() in lowered: | |
| return lowered[name.lower()] | |
| return "" | |
| def load_corpus(filepath: str) -> Tuple[pd.DataFrame, Dict[str, Any]]: | |
| df = pd.read_csv(filepath, encoding="utf-8-sig", on_bad_lines="skip") | |
| title_col = _first_existing(df, ["Title"]) | |
| abstract_col = _first_existing(df, ["Abstract"]) | |
| doi_col = _first_existing(df, ["DOI"]) | |
| year_col = _first_existing(df, ["Year"]) | |
| journal_col = _first_existing(df, ["Source title", "Journal"]) | |
| cited_col = _first_existing(df, ["Cited by", "Citations"]) | |
| if not title_col or not abstract_col: | |
| raise ValueError("CSV must include Title and Abstract columns.") | |
| df = df.copy() | |
| df["__title"] = df[title_col].map(_clean_text) | |
| df["__abstract"] = df[abstract_col].map(_clean_text) | |
| df["__doi"] = df[doi_col].fillna("").map(str) if doi_col else "" | |
| df["__combined"] = (df["__title"] + ". " + df["__abstract"]).map(_clean_text) | |
| df = df[df["__combined"].str.len() > 80].reset_index(drop=True) | |
| df["__paper_id"] = np.arange(len(df)) | |
| if cited_col: | |
| df["__cited_by"] = pd.to_numeric(df[cited_col], errors="coerce").fillna(0) | |
| else: | |
| df["__cited_by"] = 0 | |
| years = pd.to_numeric(df[year_col], errors="coerce") if year_col else pd.Series(dtype=float) | |
| journal = df[journal_col].dropna().astype(str).mode().iloc[0] if journal_col and not df[journal_col].dropna().empty else "Unknown" | |
| config = { | |
| "filepath": filepath, | |
| "journal": journal, | |
| "rows": int(len(df)), | |
| "year_min": int(years.min()) if not years.dropna().empty else None, | |
| "year_max": int(years.max()) if not years.dropna().empty else None, | |
| "title_column": title_col, | |
| "abstract_column": abstract_col, | |
| "doi_column": doi_col or "missing", | |
| "combined_field": "Title + Abstract; DOI retained as paper identifier", | |
| "generated_at": datetime.now().isoformat(timespec="seconds"), | |
| } | |
| _save_json(config, "corpus_config.json") | |
| return df, config | |
| def _embed_documents(texts: List[str]) -> Tuple[np.ndarray, Dict[str, Any]]: | |
| errors = [] | |
| try: | |
| import torch | |
| from transformers import AutoModel, AutoTokenizer | |
| model_name = "allenai/specter2_base" | |
| tokenizer = AutoTokenizer.from_pretrained(model_name) | |
| model = AutoModel.from_pretrained(model_name) | |
| model.eval() | |
| batches = [] | |
| with torch.no_grad(): | |
| for start in range(0, len(texts), 8): | |
| batch = texts[start:start + 8] | |
| encoded = tokenizer( | |
| batch, | |
| padding=True, | |
| truncation=True, | |
| max_length=512, | |
| return_tensors="pt", | |
| ) | |
| output = model(**encoded) | |
| mask = encoded["attention_mask"].unsqueeze(-1) | |
| pooled = (output.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1) | |
| batches.append(pooled.cpu().numpy()) | |
| vectors = normalize(np.vstack(batches)) | |
| return np.asarray(vectors, dtype=np.float32), { | |
| "embedding_model": "allenai/specter2_base", | |
| "embedding_note": "SPECTER2 transformer embeddings from Title + Abstract; DOI retained as paper identifier.", | |
| } | |
| except Exception as exc: | |
| errors.append(f"transformers allenai/specter2_base: {exc.__class__.__name__}: {exc}") | |
| for model_name in ["allenai/specter2_base", "allenai-specter"]: | |
| try: | |
| from sentence_transformers import SentenceTransformer | |
| model = SentenceTransformer(model_name) | |
| vectors = model.encode( | |
| texts, | |
| normalize_embeddings=True, | |
| batch_size=16, | |
| show_progress_bar=True, | |
| ) | |
| return np.asarray(vectors, dtype=np.float32), { | |
| "embedding_model": model_name, | |
| "embedding_note": "Transformer embeddings. SPECTER2 attempted first.", | |
| } | |
| except Exception as exc: | |
| errors.append(f"{model_name}: {exc.__class__.__name__}: {exc}") | |
| vectorizer = TfidfVectorizer( | |
| max_features=5000, | |
| ngram_range=(1, 2), | |
| min_df=2, | |
| max_df=0.85, | |
| stop_words="english", | |
| ) | |
| tfidf = vectorizer.fit_transform(texts) | |
| n_components = min(256, max(2, min(tfidf.shape) - 1)) | |
| svd = TruncatedSVD(n_components=n_components, random_state=42) | |
| vectors = normalize(svd.fit_transform(tfidf)) | |
| meta = { | |
| "embedding_model": "TF-IDF + TruncatedSVD fallback", | |
| "embedding_note": "SPECTER2/Transformer loading failed; deterministic fallback kept the app runnable.", | |
| "embedding_errors": errors[-3:], | |
| } | |
| return np.asarray(vectors, dtype=np.float32), meta | |
| def _cluster_metrics(labels: np.ndarray, vectors: np.ndarray) -> Dict[str, Any]: | |
| valid = labels >= 0 | |
| cluster_ids, counts = np.unique(labels[valid], return_counts=True) | |
| n_clusters = int(len(cluster_ids)) | |
| noise_ratio = float(np.mean(~valid)) if len(labels) else 1.0 | |
| too_small = int(np.sum(counts < MIN_CLUSTER_SIZE)) if len(counts) else 999 | |
| too_large = int(np.sum(counts > MAX_CLUSTER_SIZE)) if len(counts) else 999 | |
| silhouette = -1.0 | |
| if n_clusters > 1 and np.sum(valid) > n_clusters: | |
| sample_vectors = vectors[valid] | |
| sample_labels = labels[valid] | |
| if len(sample_vectors) > 800: | |
| rng = np.random.default_rng(42) | |
| sample_idx = rng.choice(len(sample_vectors), 800, replace=False) | |
| sample_vectors = sample_vectors[sample_idx] | |
| sample_labels = sample_labels[sample_idx] | |
| try: | |
| silhouette = float(silhouette_score(sample_vectors, sample_labels, metric="cosine")) | |
| except Exception: | |
| silhouette = -1.0 | |
| range_penalty = 0 | |
| if n_clusters < TARGET_MIN_CLUSTERS: | |
| range_penalty = (TARGET_MIN_CLUSTERS - n_clusters) * 3 | |
| if n_clusters > TARGET_MAX_CLUSTERS: | |
| range_penalty = (n_clusters - TARGET_MAX_CLUSTERS) * 3 | |
| score = ( | |
| range_penalty | |
| + too_small * 2 | |
| + too_large * 4 | |
| + noise_ratio * 8 | |
| - silhouette | |
| ) | |
| return { | |
| "n_clusters": n_clusters, | |
| "noise_ratio": round(noise_ratio, 4), | |
| "min_size": int(counts.min()) if len(counts) else 0, | |
| "max_size": int(counts.max()) if len(counts) else 0, | |
| "too_small": too_small, | |
| "too_large": too_large, | |
| "silhouette_cosine": round(silhouette, 4), | |
| "score": round(float(score), 4), | |
| } | |
| def _compact_labels(labels: np.ndarray) -> np.ndarray: | |
| labels = np.asarray(labels, dtype=int).copy() | |
| positive = [int(x) for x in sorted(np.unique(labels)) if x >= 0] | |
| mapping = {old: new for new, old in enumerate(positive)} | |
| return np.asarray([mapping.get(int(x), -1) for x in labels], dtype=int) | |
| def _repair_labels(labels: np.ndarray, vectors: np.ndarray) -> np.ndarray: | |
| labels = _compact_labels(labels) | |
| if np.all(labels < 0): | |
| k = min(20, max(TARGET_MIN_CLUSTERS, len(vectors) // 35)) | |
| return KMeans(n_clusters=k, random_state=42, n_init=20).fit_predict(vectors) | |
| noise_idx = np.where(labels < 0)[0] | |
| if len(noise_idx): | |
| valid_ids = [int(x) for x in sorted(np.unique(labels)) if x >= 0] | |
| centroids = np.asarray([vectors[labels == cid].mean(axis=0) for cid in valid_ids]) | |
| nearest = cosine_similarity(vectors[noise_idx], centroids).argmax(axis=1) | |
| labels[noise_idx] = np.asarray([valid_ids[i] for i in nearest], dtype=int) | |
| labels = _compact_labels(labels) | |
| next_id = int(labels.max()) + 1 | |
| for cid in list(sorted(np.unique(labels))): | |
| idx = np.where(labels == cid)[0] | |
| if len(idx) <= MAX_CLUSTER_SIZE: | |
| continue | |
| centroid = vectors[idx].mean(axis=0, keepdims=True) | |
| order = cosine_similarity(vectors[idx], centroid).ravel().argsort()[::-1] | |
| ordered_idx = idx[order] | |
| chunks = [ordered_idx[i:i + MAX_CLUSTER_SIZE] for i in range(0, len(ordered_idx), MAX_CLUSTER_SIZE)] | |
| labels[chunks[0]] = cid | |
| for chunk in chunks[1:]: | |
| labels[chunk] = next_id | |
| next_id += 1 | |
| labels = _compact_labels(labels) | |
| changed = True | |
| while changed: | |
| changed = False | |
| ids, counts = np.unique(labels, return_counts=True) | |
| tiny = [int(cid) for cid, count in zip(ids, counts) if count < MIN_CLUSTER_SIZE] | |
| if not tiny or len(ids) <= TARGET_MIN_CLUSTERS: | |
| break | |
| for cid in tiny: | |
| idx = np.where(labels == cid)[0] | |
| other_ids = [int(x) for x in np.unique(labels) if int(x) != cid] | |
| if not other_ids: | |
| continue | |
| centroid = vectors[idx].mean(axis=0, keepdims=True) | |
| other_centroids = np.asarray([vectors[labels == oid].mean(axis=0) for oid in other_ids]) | |
| nearest_order = cosine_similarity(centroid, other_centroids).ravel().argsort()[::-1] | |
| target = other_ids[int(nearest_order[0])] | |
| labels[idx] = target | |
| labels = _compact_labels(labels) | |
| changed = True | |
| break | |
| labels = _compact_labels(labels) | |
| while len(np.unique(labels)) < TARGET_MIN_CLUSTERS: | |
| ids, counts = np.unique(labels, return_counts=True) | |
| largest = int(ids[np.argmax(counts)]) | |
| idx = np.where(labels == largest)[0] | |
| if len(idx) < MIN_CLUSTER_SIZE * 2: | |
| break | |
| centroid = vectors[idx].mean(axis=0, keepdims=True) | |
| order = cosine_similarity(vectors[idx], centroid).ravel().argsort()[::-1] | |
| split_at = len(idx) // 2 | |
| if split_at < MIN_CLUSTER_SIZE or len(idx) - split_at < MIN_CLUSTER_SIZE: | |
| break | |
| labels[idx[order[split_at:]]] = int(labels.max()) + 1 | |
| labels = _compact_labels(labels) | |
| return _compact_labels(labels) | |
| def _optimizer_recommendation(metrics: Dict[str, Any]) -> str: | |
| if metrics["n_clusters"] < TARGET_MIN_CLUSTERS: | |
| return "Increase UMAP n_neighbors separation pressure or lower HDBSCAN min_cluster_size." | |
| if metrics["n_clusters"] > TARGET_MAX_CLUSTERS: | |
| return "Raise HDBSCAN min_cluster_size or increase UMAP n_neighbors to merge nearby themes." | |
| if metrics["max_size"] > MAX_CLUSTER_SIZE: | |
| return "Split dominant clusters by lowering min_cluster_size or lowering min_samples." | |
| if metrics["noise_ratio"] > 0.25: | |
| return "Reduce min_samples and keep UMAP n_components at 5-10 to reduce noise." | |
| return "Keep this parameter set; it satisfies the 15-25 cluster target best." | |
| def _run_umap_hdbscan(vectors: np.ndarray) -> Tuple[np.ndarray, np.ndarray, Dict[str, Any], List[Dict[str, Any]]]: | |
| candidates = [] | |
| best = None | |
| best_reduced = None | |
| best_params = None | |
| try: | |
| import umap | |
| try: | |
| import hdbscan as external_hdbscan | |
| hdbscan_backend = "external" | |
| except Exception: | |
| external_hdbscan = None | |
| from sklearn.cluster import HDBSCAN as SklearnHDBSCAN | |
| hdbscan_backend = "sklearn" | |
| for n_neighbors in [10, 20, 35]: | |
| for n_components in [5, 10]: | |
| reduced = umap.UMAP( | |
| n_neighbors=n_neighbors, | |
| n_components=n_components, | |
| min_dist=0.0, | |
| metric="cosine", | |
| random_state=42, | |
| ).fit_transform(vectors) | |
| for min_cluster_size in [5, 8, 12, 16, 25]: | |
| for min_samples in [1, 3, None]: | |
| if hdbscan_backend == "external": | |
| clusterer = external_hdbscan.HDBSCAN( | |
| min_cluster_size=min_cluster_size, | |
| min_samples=min_samples, | |
| metric="euclidean", | |
| prediction_data=True, | |
| ) | |
| else: | |
| clusterer = SklearnHDBSCAN( | |
| min_cluster_size=min_cluster_size, | |
| min_samples=min_samples, | |
| metric="euclidean", | |
| ) | |
| labels = clusterer.fit_predict(reduced) | |
| labels = _repair_labels(labels, vectors) | |
| metrics = _cluster_metrics(labels, vectors) | |
| params = { | |
| "algorithm": f"UMAP + HDBSCAN ({hdbscan_backend})", | |
| "umap_n_neighbors": n_neighbors, | |
| "umap_n_components": n_components, | |
| "umap_metric": "cosine", | |
| "hdbscan_min_cluster_size": min_cluster_size, | |
| "hdbscan_min_samples": min_samples, | |
| "hdbscan_metric": "euclidean", | |
| } | |
| row = {**params, **metrics, "optimizer_recommendation": _optimizer_recommendation(metrics)} | |
| candidates.append(row) | |
| if best is None or metrics["score"] < best[1]["score"]: | |
| best = (labels, metrics, getattr(clusterer, "probabilities_", np.ones(len(labels)))) | |
| best_reduced = reduced | |
| best_params = params | |
| if ( | |
| TARGET_MIN_CLUSTERS <= metrics["n_clusters"] <= TARGET_MAX_CLUSTERS | |
| and metrics["too_small"] == 0 | |
| and metrics["too_large"] == 0 | |
| and metrics["noise_ratio"] <= 0.25 | |
| ): | |
| probs = getattr(clusterer, "probabilities_", np.ones(len(labels))) | |
| if len(probs) != len(labels): | |
| probs = np.ones(len(labels)) | |
| return labels, probs, {**params, **metrics}, candidates | |
| except Exception as exc: | |
| candidates.append({ | |
| "algorithm": "UMAP + HDBSCAN", | |
| "error": f"{exc.__class__.__name__}: {exc}", | |
| "optimizer_recommendation": "UMAP is unavailable in this Python install; try PCA manifold fallback with HDBSCAN.", | |
| }) | |
| try: | |
| from sklearn.cluster import HDBSCAN as SklearnHDBSCAN | |
| for n_components in [5, 10, 20]: | |
| pca_components = min(n_components, max(2, min(vectors.shape) - 1)) | |
| reduced = PCA(n_components=pca_components, random_state=42).fit_transform(vectors) | |
| for min_cluster_size in [5, 8, 12, 16, 25]: | |
| for min_samples in [1, 3, None]: | |
| clusterer = SklearnHDBSCAN( | |
| min_cluster_size=min_cluster_size, | |
| min_samples=min_samples, | |
| metric="euclidean", | |
| ) | |
| labels = _repair_labels(clusterer.fit_predict(reduced), vectors) | |
| metrics = _cluster_metrics(labels, vectors) | |
| params = { | |
| "algorithm": "PCA manifold fallback + HDBSCAN (sklearn; UMAP unavailable locally)", | |
| "pca_n_components": pca_components, | |
| "hdbscan_min_cluster_size": min_cluster_size, | |
| "hdbscan_min_samples": min_samples, | |
| "hdbscan_metric": "euclidean", | |
| } | |
| row = {**params, **metrics, "optimizer_recommendation": _optimizer_recommendation(metrics)} | |
| candidates.append(row) | |
| probs = np.ones(len(labels)) | |
| if best is None or metrics["score"] < best[1]["score"]: | |
| best = (labels, metrics, probs) | |
| best_params = params | |
| if ( | |
| TARGET_MIN_CLUSTERS <= metrics["n_clusters"] <= TARGET_MAX_CLUSTERS | |
| and metrics["too_small"] == 0 | |
| and metrics["too_large"] == 0 | |
| ): | |
| return labels, probs, {**params, **metrics}, candidates | |
| except Exception as fallback_exc: | |
| candidates.append({ | |
| "algorithm": "PCA manifold fallback + HDBSCAN", | |
| "error": f"{fallback_exc.__class__.__name__}: {fallback_exc}", | |
| "optimizer_recommendation": "Use deterministic KMeans fallback so the pipeline still completes.", | |
| }) | |
| if best is not None and TARGET_MIN_CLUSTERS <= best[1]["n_clusters"] <= TARGET_MAX_CLUSTERS: | |
| return best[0], best[2], {**best_params, **best[1]}, candidates | |
| n_clusters = min(20, max(TARGET_MIN_CLUSTERS, len(vectors) // 35)) | |
| labels = KMeans(n_clusters=n_clusters, random_state=42, n_init=20).fit_predict(vectors) | |
| labels = _repair_labels(labels, vectors) | |
| metrics = _cluster_metrics(labels, vectors) | |
| params = { | |
| "algorithm": "KMeans fallback after UMAP/HDBSCAN optimization", | |
| "n_clusters": n_clusters, | |
| **metrics, | |
| } | |
| sims = np.zeros(len(labels), dtype=float) | |
| for cid in np.unique(labels): | |
| idx = np.where(labels == cid)[0] | |
| centroid = vectors[idx].mean(axis=0, keepdims=True) | |
| sims[idx] = cosine_similarity(vectors[idx], centroid).ravel() | |
| probs = np.clip((sims + 1) / 2, 0, 1) | |
| candidates.append({**params, "optimizer_recommendation": "Fallback used to guarantee a crisp 15-25 cluster solution."}) | |
| return labels, probs, params, candidates | |
| def _top_terms(texts: List[str], top_n: int = 8) -> List[str]: | |
| if not texts: | |
| return [] | |
| vec = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=1, max_features=1200) | |
| matrix = vec.fit_transform(texts) | |
| scores = np.asarray(matrix.sum(axis=0)).ravel() | |
| terms = np.asarray(vec.get_feature_names_out()) | |
| order = scores.argsort()[::-1] | |
| cleaned = [] | |
| for term in terms[order]: | |
| pieces = term.split() | |
| if ( | |
| len(term) > 2 | |
| and not term.isdigit() | |
| and term not in cleaned | |
| and not all(piece in LABEL_STOP_WORDS for piece in pieces) | |
| and not any(piece in {"doi", "s12525", "1007"} for piece in pieces) | |
| ): | |
| cleaned.append(term) | |
| if len(cleaned) >= top_n: | |
| break | |
| return cleaned | |
| def _category_for_terms(terms: List[str]) -> Tuple[str, float]: | |
| query = " ".join(terms) | |
| docs = [query] + list(CATEGORY_TERMS.values()) | |
| vec = TfidfVectorizer(stop_words="english").fit_transform(docs) | |
| sims = cosine_similarity(vec[0], vec[1:]).ravel() | |
| best = int(np.argmax(sims)) | |
| return PAJAIS_25[best], float(sims[best]) | |
| def _title_from_terms(terms: List[str], category: str) -> str: | |
| priority = [ | |
| "artificial intelligence", "machine learning", "blockchain", "digital platform", | |
| "social commerce", "e-commerce", "privacy", "trust", "fintech", "robo advisor", | |
| "analytics", "mobile", "digital transformation", "recommender", | |
| ] | |
| joined = " ".join(terms) | |
| for phrase in priority: | |
| if phrase in joined: | |
| return phrase.title().replace("Ai", "AI") | |
| if terms: | |
| return " ".join(t.title() for t in terms[:3]) | |
| return category | |
| def _optional_mistral_label(cluster: Dict[str, Any]) -> str: | |
| api_key = os.environ.get("MISTRAL_API_KEY", "").strip() | |
| if not api_key: | |
| return "" | |
| try: | |
| from langchain_core.output_parsers import StrOutputParser | |
| from langchain_core.prompts import PromptTemplate | |
| from langchain_mistralai import ChatMistralAI | |
| prompt = PromptTemplate.from_template( | |
| "Name this academic topic cluster in 3-7 words. " | |
| "Use only the evidence, no markdown.\nKeywords: {keywords}\nTitles:\n{titles}" | |
| ) | |
| llm = ChatMistralAI(model="mistral-small-latest", api_key=api_key, temperature=0.1) | |
| return (prompt | llm | StrOutputParser()).invoke({ | |
| "keywords": ", ".join(cluster["keywords"]), | |
| "titles": "\n".join(cluster["top_titles"][:3]), | |
| }).strip().strip('"')[:80] | |
| except Exception: | |
| return "" | |
| def _optional_mistral_council(cluster: Dict[str, Any]) -> List[Dict[str, str]]: | |
| api_key = os.environ.get("MISTRAL_API_KEY", "").strip() | |
| if not api_key: | |
| return [] | |
| try: | |
| from langchain_core.output_parsers import StrOutputParser | |
| from langchain_core.prompts import PromptTemplate | |
| from langchain_mistralai import ChatMistralAI | |
| personas = [ | |
| ("LLM Council A - Domain Labeler", "Name the Information Systems research theme."), | |
| ("LLM Council B - Methods Skeptic", "Name the theme conservatively using only the three titles and keywords."), | |
| ("LLM Council C - Taxonomy Judge", "Name the theme and prefer PAJAIS-style terminology when appropriate."), | |
| ] | |
| llm = ChatMistralAI(model="mistral-small-latest", api_key=api_key, temperature=0.1) | |
| prompt = PromptTemplate.from_template( | |
| "{task}\nReturn one concise 3-7 word label only.\n" | |
| "Keywords: {keywords}\n" | |
| "High-probability paper titles:\n{titles}" | |
| ) | |
| votes = [] | |
| for member, task in personas: | |
| label = (prompt | llm | StrOutputParser()).invoke({ | |
| "task": task, | |
| "keywords": ", ".join(cluster["keywords"]), | |
| "titles": "\n".join(cluster["top_titles"][:3]), | |
| }).strip().strip('"')[:80] | |
| if label: | |
| votes.append({"member": member, "label": label, "method": "Mistral LLM council vote using top-3 high-probability paper titles"}) | |
| return votes | |
| except Exception: | |
| return [] | |
| def _build_cluster_summaries(df: pd.DataFrame, vectors: np.ndarray, labels: np.ndarray, probabilities: np.ndarray) -> List[Dict[str, Any]]: | |
| summaries = [] | |
| for cid in sorted([int(x) for x in np.unique(labels) if x >= 0]): | |
| idx = np.where(labels == cid)[0] | |
| texts = df.loc[idx, "__combined"].tolist() | |
| titles = df.loc[idx, "__title"].tolist() | |
| keywords = _top_terms(texts, 10) | |
| category, category_score = _category_for_terms(keywords + titles[:5]) | |
| centroid = vectors[idx].mean(axis=0, keepdims=True) | |
| sims = cosine_similarity(vectors[idx], centroid).ravel() | |
| rank = np.lexsort((-sims, -probabilities[idx]))[::-1] | |
| top_idx = idx[rank[:3]] | |
| cluster_draft = { | |
| "keywords": keywords, | |
| "top_titles": df.loc[top_idx, "__title"].tolist(), | |
| } | |
| vote_keyword = _title_from_terms(keywords, category) | |
| vote_taxonomy = category | |
| llm_votes = _optional_mistral_council(cluster_draft) | |
| votes = llm_votes or [ | |
| {"member": "Council A - Keyword Extractor", "label": vote_keyword, "method": "deterministic cluster TF-IDF terms"}, | |
| {"member": "Council B - PAJAIS Mapper", "label": vote_taxonomy, "method": "taxonomy-term cosine validation"}, | |
| {"member": "Council C - Local Semantic Judge", "label": vote_keyword, "method": "local fallback; configure MISTRAL_API_KEY for live 3-LLM council"}, | |
| ] | |
| normalized_votes = [v["label"].strip().lower() for v in votes if v["label"]] | |
| agreement = Counter(normalized_votes).most_common(1)[0][1] / max(1, len(normalized_votes)) | |
| final_label = Counter([v["label"] for v in votes if v["label"]]).most_common(1)[0][0] if votes else vote_keyword | |
| summaries.append({ | |
| "cluster_id": cid, | |
| "label": final_label, | |
| "category": category, | |
| "confidence": round(float(np.mean(probabilities[idx]) * 0.65 + agreement * 0.35), 3), | |
| "category_confidence": round(category_score, 3), | |
| "sentence_count": int(len(idx)), | |
| "paper_count": int(len(idx)), | |
| "top_sentences": df.loc[top_idx, "__abstract"].str[:350].tolist(), | |
| "top_titles": df.loc[top_idx, "__title"].tolist(), | |
| "keywords": keywords, | |
| "centroid": centroid.ravel().tolist(), | |
| "paper_indices": [int(i) for i in idx], | |
| "council_votes": votes, | |
| "agreement_score": round(float(agreement), 3), | |
| "is_niche": bool(len(idx) <= 8), | |
| "reasoning": f"{len(idx)} papers; top terms: {', '.join(keywords[:6])}; council agreement {agreement:.2f}.", | |
| }) | |
| summaries.sort(key=lambda x: x["paper_count"], reverse=True) | |
| return summaries | |
| def _generate_charts(summaries: List[Dict[str, Any]]) -> None: | |
| chart_dir = _opath("combined_charts") | |
| os.makedirs(chart_dir, exist_ok=True) | |
| if not summaries: | |
| return | |
| centroids = np.asarray([s["centroid"] for s in summaries]) | |
| sizes = [s["paper_count"] for s in summaries] | |
| labels = [s["label"] for s in summaries] | |
| coords = PCA(n_components=2, random_state=42).fit_transform(centroids) if len(summaries) > 1 else np.zeros((1, 2)) | |
| fig = px.scatter( | |
| x=coords[:, 0], | |
| y=coords[:, 1], | |
| size=sizes, | |
| color=[s["category"] for s in summaries], | |
| hover_name=labels, | |
| title="Intertopic Map - Title + Abstract + DOI", | |
| template="plotly_dark", | |
| ) | |
| fig.write_html(os.path.join(chart_dir, "intertopic_map.html"), include_plotlyjs="cdn", full_html=True) | |
| bar = px.bar( | |
| x=labels, | |
| y=sizes, | |
| title="Cluster Sizes", | |
| labels={"x": "Cluster", "y": "Papers"}, | |
| template="plotly_dark", | |
| ) | |
| bar.write_html(os.path.join(chart_dir, "bar_chart.html"), include_plotlyjs="cdn", full_html=True) | |
| tree = px.treemap( | |
| names=labels, | |
| parents=["clusters"] * len(labels), | |
| values=sizes, | |
| title="Topic Treemap", | |
| ) | |
| tree.write_html(os.path.join(chart_dir, "treemap.html"), include_plotlyjs="cdn", full_html=True) | |
| def _taxonomy_map(summaries: List[Dict[str, Any]]) -> Dict[str, Any]: | |
| mapping = {} | |
| for s in summaries: | |
| is_novel = s["category_confidence"] < 0.08 | |
| mapping[s["label"]] = { | |
| "theme": s["label"], | |
| "pajais_match": "NOVEL" if is_novel else s["category"], | |
| "match_confidence": s["category_confidence"], | |
| "reasoning": s["reasoning"], | |
| "is_novel": is_novel, | |
| } | |
| covered = {v["pajais_match"] for v in mapping.values() if not v["is_novel"]} | |
| novel = [k for k, v in mapping.items() if v["is_novel"]] | |
| return { | |
| "run_key": "combined", | |
| "taxonomy_mapping": mapping, | |
| "novel_themes": novel, | |
| "pajais_gap_categories": [c for c in PAJAIS_25 if c not in covered], | |
| "coverage_stats": { | |
| "total_themes": len(mapping), | |
| "mapped": len(mapping) - len(novel), | |
| "novel": len(novel), | |
| }, | |
| } | |
| def _write_comparison_csv(summaries: List[Dict[str, Any]]) -> str: | |
| rows = [] | |
| for s in summaries: | |
| rows.append({ | |
| "Cluster_ID": s["cluster_id"], | |
| "Final_Label": s["label"], | |
| "PAJAIS_Category": s["category"], | |
| "Papers": s["paper_count"], | |
| "Confidence": s["confidence"], | |
| "Agreement": s["agreement_score"], | |
| "Top_Keywords": "; ".join(s["keywords"][:8]), | |
| "Top_3_Paper_Titles": " | ".join(s["top_titles"][:3]), | |
| "Validation_Status": "VALIDATED" if s["confidence"] >= 0.55 else "REVIEW", | |
| }) | |
| path = _opath("comparison.csv") | |
| pd.DataFrame(rows).to_csv(path, index=False) | |
| return path | |
| def _extract_matches(text: str, patterns: List[str]) -> List[str]: | |
| lower = text.lower() | |
| return sorted({p for p in patterns if re.search(r"\b" + re.escape(p.lower()) + r"\b", lower)}) | |
| def _write_tccm_validation(df: pd.DataFrame) -> str: | |
| top = df.sort_values("__cited_by", ascending=False).head(100).copy() | |
| rows = [] | |
| for rank, (_, row) in enumerate(top.iterrows(), start=1): | |
| text = f"{row['__title']} {row['__abstract']}" | |
| theories = _extract_matches(text, THEORY_PATTERNS) | |
| methods = _extract_matches(text, METHOD_PATTERNS) | |
| techniques = _extract_matches(text, COMPUTATIONAL_PATTERNS) | |
| context = [] | |
| for category, terms in CATEGORY_TERMS.items(): | |
| if any(term in text.lower() for term in terms.split()[:6]): | |
| context.append(category) | |
| rows.append({ | |
| "Paper ID": rank, | |
| "Title": row["__title"], | |
| "DOI": row["__doi"], | |
| "Cited_By": row["__cited_by"], | |
| "Theory_Regex": "; ".join(theories), | |
| "Context_Taxonomy": "; ".join(context[:3]), | |
| "Characteristics_Constructs": "; ".join(_top_terms([text], 6)), | |
| "Method_Regex": "; ".join(methods), | |
| "Computational_Techniques_Regex": "; ".join(techniques), | |
| "Validation_Method_1": "dictionary/regex extraction", | |
| "Validation_Method_2": "cluster/category semantic match", | |
| "Validation_Status": "VALIDATED" if (methods or techniques or theories) else "NEEDS_FULL_TEXT_REVIEW", | |
| }) | |
| path = _opath("tccm_validation.csv") | |
| pd.DataFrame(rows).to_csv(path, index=False) | |
| return path | |
| def parse_notebooklm_tccm_text(raw_text: str) -> str: | |
| """Parse NotebookLM's copied table text into a Google-Sheets-ready CSV.""" | |
| columns = [ | |
| "Paper ID", | |
| "Paper Citation", | |
| "Study Type", | |
| "Dependent Variable(s) (DV)", | |
| "Independent Variable(s) (IVs)", | |
| "Mediator(s)", | |
| "Moderator(s)", | |
| "Relationship Direction", | |
| "Evidence Snippet", | |
| ] | |
| skip = {c.lower() for c in columns} | |
| skip.update({ | |
| "today • 14:09", | |
| "ask a question or create something", | |
| "notebooklm can be inaccurate; please double-check its responses.", | |
| "i want minimum 1 dependent and 1 independent variable", | |
| "i want minimum 1 dependernt and 1 independent variable", | |
| }) | |
| lines = [ | |
| re.sub(r"\s+", " ", line.strip()) | |
| for line in str(raw_text or "").splitlines() | |
| if line.strip() | |
| ] | |
| rows = [] | |
| current_id = None | |
| fields = [] | |
| def is_id(line: str) -> bool: | |
| return bool(re.fullmatch(r"\d{1,3}", line)) and 1 <= int(line) <= 100 | |
| def finalise(): | |
| if current_id is None or not fields: | |
| return | |
| cleaned = [f for f in fields if f.lower() not in skip] | |
| if len(cleaned) < 2: | |
| return | |
| fixed = cleaned[:7] | |
| evidence = " ".join(cleaned[7:]) if len(cleaned) > 7 else "" | |
| while len(fixed) < 7: | |
| fixed.append("NA") | |
| rows.append({ | |
| "Paper ID": int(current_id), | |
| "Paper Citation": fixed[0], | |
| "Study Type": fixed[1], | |
| "Dependent Variable(s) (DV)": fixed[2], | |
| "Independent Variable(s) (IVs)": fixed[3], | |
| "Mediator(s)": fixed[4], | |
| "Moderator(s)": fixed[5], | |
| "Relationship Direction": fixed[6], | |
| "Evidence Snippet": evidence, | |
| }) | |
| seen_header = False | |
| for line in lines: | |
| lower = line.lower() | |
| if lower in skip: | |
| seen_header = True | |
| continue | |
| if is_id(line): | |
| if seen_header or current_id is not None: | |
| finalise() | |
| current_id = int(line) | |
| fields = [] | |
| continue | |
| if current_id is not None: | |
| fields.append(line) | |
| finalise() | |
| df = pd.DataFrame(rows, columns=columns) | |
| if not df.empty: | |
| df = df.sort_values("Paper ID").drop_duplicates("Paper ID", keep="last") | |
| path = _opath("notebooklm_extraction.csv") | |
| df.to_csv(path, index=False) | |
| return path | |
| def write_tccm_dual_validation(notebooklm_path: str = "", second_llm_path: str = "") -> str: | |
| base_path = _opath("tccm_validation.csv") | |
| base = pd.read_csv(base_path) if os.path.exists(base_path) else pd.DataFrame() | |
| def load_optional(path: str, prefix: str) -> pd.DataFrame: | |
| if not path or not os.path.exists(path): | |
| return pd.DataFrame() | |
| loaded = pd.read_csv(path, encoding="utf-8-sig", on_bad_lines="skip") | |
| rename = {} | |
| for col in loaded.columns: | |
| low = col.lower().strip() | |
| if low in {"paper id", "paper_id", "rank", "id"}: | |
| rename[col] = "Paper ID" | |
| elif low in {"title", "paper title", "article title"}: | |
| rename[col] = "Title" | |
| elif low in {"paper citation", "citation"}: | |
| rename[col] = f"{prefix}_Citation" | |
| elif "study type" in low: | |
| rename[col] = f"{prefix}_Study_Type" | |
| elif "independent" in low or low == "iv" or "ivs" in low: | |
| rename[col] = f"{prefix}_IV" | |
| elif "dependent" in low or low == "dv": | |
| rename[col] = f"{prefix}_DV" | |
| elif "mediator" in low: | |
| rename[col] = f"{prefix}_Mediator" | |
| elif "moderator" in low: | |
| rename[col] = f"{prefix}_Moderator" | |
| elif "relationship direction" in low or low == "direction": | |
| rename[col] = f"{prefix}_Relationship_Direction" | |
| elif "evidence" in low or "snippet" in low: | |
| rename[col] = f"{prefix}_Evidence" | |
| elif low in {"doi"}: | |
| rename[col] = "DOI" | |
| elif "theor" in low: | |
| rename[col] = f"{prefix}_Theory" | |
| elif "context" in low: | |
| rename[col] = f"{prefix}_Context" | |
| elif "method" in low: | |
| rename[col] = f"{prefix}_Method" | |
| elif "variable" in low or "construct" in low or "characteristic" in low: | |
| rename[col] = f"{prefix}_Variables" | |
| elif "technique" in low or "comput" in low: | |
| rename[col] = f"{prefix}_Computational_Techniques" | |
| loaded = loaded.rename(columns=rename) | |
| keep = [c for c in loaded.columns if c in {"Paper ID", "Title", "DOI"} or c.startswith(prefix)] | |
| return loaded[keep].copy() | |
| notebook = load_optional(notebooklm_path, "NotebookLM") | |
| second = load_optional(second_llm_path, "SecondLLM") | |
| if base.empty: | |
| merged = pd.DataFrame() | |
| else: | |
| merged = base.copy() | |
| if not notebook.empty: | |
| key = "Paper ID" if "Paper ID" in notebook.columns and "Paper ID" in merged.columns else ("DOI" if "DOI" in notebook.columns and merged["DOI"].astype(str).str.len().gt(0).any() else "Title") | |
| merged = merged.merge(notebook, how="left", on=key, suffixes=("", "_NotebookLM_Input")) | |
| if not second.empty: | |
| key = "Paper ID" if "Paper ID" in second.columns and "Paper ID" in merged.columns else ("DOI" if "DOI" in second.columns and merged["DOI"].astype(str).str.len().gt(0).any() else "Title") | |
| merged = merged.merge(second, how="left", on=key, suffixes=("", "_SecondLLM_Input")) | |
| if merged.empty: | |
| merged = pd.DataFrame([{ | |
| "Compliance_Status": "PENDING", | |
| "Required_Action": "Run topic pipeline first, then upload NotebookLM and second-LLM extraction CSV files.", | |
| }]) | |
| else: | |
| has_notebook = any(c.startswith("NotebookLM") for c in merged.columns) | |
| has_second = any(c.startswith("SecondLLM") for c in merged.columns) | |
| def row_status(row): | |
| regex_hit = any( | |
| str(row.get(c, "")).strip().lower() not in {"", "nan", "none"} | |
| for c in ["Theory_Regex", "Method_Regex", "Computational_Techniques_Regex", "Characteristics_Constructs"] | |
| ) | |
| notebook_cols = [c for c in merged.columns if c.startswith("NotebookLM_") and c != "NotebookLM_File_Loaded"] | |
| second_cols = [c for c in merged.columns if c.startswith("SecondLLM_")] | |
| notebook_hit = any(str(row.get(c, "")).strip().lower() not in {"", "nan", "none", "false"} for c in notebook_cols) | |
| second_hit = any(str(row.get(c, "")).strip().lower() not in {"", "nan", "none", "false"} for c in second_cols) | |
| if notebook_hit and second_hit: | |
| return "COMPLIANT_NOTEBOOKLM_PLUS_SECOND_LLM" | |
| if notebook_hit and regex_hit: | |
| return "PARTIAL_NOTEBOOKLM_PLUS_REGEX" | |
| if second_hit and regex_hit: | |
| return "PARTIAL_SECOND_LLM_PLUS_REGEX" | |
| return "PENDING_NOTEBOOKLM_AND_SECOND_LLM" | |
| merged["NotebookLM_File_Loaded"] = has_notebook | |
| merged["Second_LLM_File_Loaded"] = has_second | |
| merged["Final_TCCM_Compliance_Status"] = merged.apply(row_status, axis=1) | |
| merged["Required_Action"] = np.where( | |
| merged["Final_TCCM_Compliance_Status"].eq("COMPLIANT_NOTEBOOKLM_PLUS_SECOND_LLM"), | |
| "Ready for mentor review with dual AI validation.", | |
| "Upload NotebookLM extraction and second LLM extraction from full-text PDFs before claiming final compliance.", | |
| ) | |
| path = _opath("tccm_dual_validation.csv") | |
| merged.to_csv(path, index=False) | |
| return path | |
| def write_compliance_checklist(params: Dict[str, Any], meta: Dict[str, Any], summaries: List[Dict[str, Any]]) -> str: | |
| has_live_llm = any( | |
| "Mistral LLM council vote" in vote.get("method", "") | |
| for summary in summaries | |
| for vote in summary.get("council_votes", []) | |
| ) | |
| notebook_loaded = os.path.exists(_opath("notebooklm_extraction.csv")) | |
| dual_path = _opath("tccm_dual_validation.csv") | |
| second_loaded = False | |
| if os.path.exists(dual_path): | |
| try: | |
| dual_df = pd.read_csv(dual_path) | |
| second_loaded = bool(dual_df.get("Second_LLM_File_Loaded", pd.Series([False])).astype(bool).any()) | |
| except Exception: | |
| second_loaded = False | |
| rows = [ | |
| { | |
| "Requirement": "15 to 25 crisp topic clusters", | |
| "Status": "PASS" if TARGET_MIN_CLUSTERS <= params.get("n_clusters", 0) <= TARGET_MAX_CLUSTERS else "FAIL", | |
| "Evidence": f"{params.get('n_clusters')} clusters generated.", | |
| "File": "comparison.csv / cluster_optimization_log.csv", | |
| }, | |
| { | |
| "Requirement": "Minimum 5 and maximum 100 papers per cluster", | |
| "Status": "PASS" if params.get("min_size", 0) >= MIN_CLUSTER_SIZE and params.get("max_size", 999) <= MAX_CLUSTER_SIZE else "FAIL", | |
| "Evidence": f"min={params.get('min_size')}, max={params.get('max_size')}.", | |
| "File": "cluster_optimization_log.csv", | |
| }, | |
| { | |
| "Requirement": "Cluster optimization loop with parameter recommendations", | |
| "Status": "PASS" if os.path.exists(_opath("cluster_optimization_log.csv")) else "FAIL", | |
| "Evidence": "Optimizer records attempted settings, scores, and recommendations.", | |
| "File": "cluster_optimization_log.csv", | |
| }, | |
| { | |
| "Requirement": "Top three high-probability paper titles fed for labels", | |
| "Status": "PASS" if all(len(s.get("top_titles", [])) >= 3 for s in summaries) else "REVIEW", | |
| "Evidence": "Top_3_Paper_Titles included for every cluster.", | |
| "File": "comparison.csv", | |
| }, | |
| { | |
| "Requirement": "LLM council visible in app, not just story text", | |
| "Status": "PASS" if os.path.exists(_opath("llm_council_validation.csv")) else "FAIL", | |
| "Evidence": "Animated council board and vote table in Council Validation tab.", | |
| "File": "llm_council_validation.csv / app.py", | |
| }, | |
| { | |
| "Requirement": "Live 3-LLM council labels", | |
| "Status": "PASS" if has_live_llm else "CONFIG_REQUIRED", | |
| "Evidence": "Set MISTRAL_API_KEY in Space secrets to switch from local fallback to live Mistral council.", | |
| "File": "llm_council_validation.csv", | |
| }, | |
| { | |
| "Requirement": "SPECTER2 paper-level embeddings", | |
| "Status": "PASS" if "specter2" in str(meta.get("embedding_model", "")).lower() else "ENV_FALLBACK", | |
| "Evidence": meta.get("embedding_note", ""), | |
| "File": "run_metadata.json", | |
| }, | |
| { | |
| "Requirement": "UMAP + HDBSCAN density clustering", | |
| "Status": "PASS" if str(params.get("algorithm", "")).lower().startswith("umap + hdbscan") else "ENV_FALLBACK", | |
| "Evidence": str(params.get("algorithm", "")), | |
| "File": "run_metadata.json / cluster_optimization_log.csv", | |
| }, | |
| { | |
| "Requirement": "TCCM corpus loaded and vectorised for computational techniques", | |
| "Status": "PASS" if os.path.exists(_opath("tccm_validation.csv")) else "FAIL", | |
| "Evidence": "Top-cited 100 papers exported with regex and semantic computational technique extraction.", | |
| "File": "tccm_validation.csv", | |
| }, | |
| { | |
| "Requirement": "NotebookLM output plus another LLM method for TCCM", | |
| "Status": "PASS" if notebook_loaded and second_loaded else ("PARTIAL" if notebook_loaded else "INPUT_REQUIRED"), | |
| "Evidence": ( | |
| "NotebookLM extraction loaded; second LLM extraction still required." | |
| if notebook_loaded and not second_loaded | |
| else ("NotebookLM and second LLM extraction loaded." if second_loaded else "Use TCCM Dual Validation tab to upload NotebookLM CSV and second LLM CSV.") | |
| ), | |
| "File": "tccm_dual_validation.csv", | |
| }, | |
| { | |
| "Requirement": "Formal mentor approval before final submission", | |
| "Status": "MANUAL_REQUIRED", | |
| "Evidence": "Cannot be automated; get faculty mentor approval.", | |
| "File": "mentor approval evidence", | |
| }, | |
| ] | |
| path = _opath("compliance_checklist.csv") | |
| pd.DataFrame(rows).to_csv(path, index=False) | |
| _save_json(rows, "compliance_checklist.json") | |
| return path | |
| def _write_validation_files(summaries: List[Dict[str, Any]], optimizer_log: List[Dict[str, Any]], params: Dict[str, Any], meta: Dict[str, Any]) -> None: | |
| council = [] | |
| for s in summaries: | |
| for vote in s["council_votes"]: | |
| council.append({ | |
| "cluster_id": s["cluster_id"], | |
| "final_label": s["label"], | |
| "member": vote["member"], | |
| "member_label": vote["label"], | |
| "method": vote["method"], | |
| "top_3_titles_used": " | ".join(s.get("top_titles", [])[:3]), | |
| "agreement_score": s["agreement_score"], | |
| "confidence": s["confidence"], | |
| }) | |
| pd.DataFrame(council).to_csv(_opath("llm_council_validation.csv"), index=False) | |
| _save_json(council, "llm_council.json") | |
| pd.DataFrame(optimizer_log).sort_values("score", na_position="last").to_csv(_opath("cluster_optimization_log.csv"), index=False) | |
| _save_json({"selected_parameters": params, "embedding": meta}, "run_metadata.json") | |
| def _write_report(config: Dict[str, Any], summaries: List[Dict[str, Any]], params: Dict[str, Any], meta: Dict[str, Any]) -> str: | |
| lines = [ | |
| "# Topic Modelling Final Submission Report", | |
| "", | |
| f"Journal: {config.get('journal')}", | |
| f"Papers analysed: {config.get('rows')}", | |
| f"Years: {config.get('year_min')} to {config.get('year_max')}", | |
| "", | |
| "## Method", | |
| f"The model uses one vector per paper from {config.get('combined_field')}. " | |
| f"Embedding model: {meta.get('embedding_model')}. Clustering: {params.get('algorithm')}. " | |
| "The optimizer searches UMAP/HDBSCAN parameters and selects the lowest penalty solution " | |
| "against the required 15-25 clusters, 5 minimum papers per cluster, and 100 maximum papers per cluster.", | |
| "", | |
| "## Selected Parameters", | |
| "```json", | |
| json.dumps(params, indent=2), | |
| "```", | |
| "", | |
| "## Validated Clusters", | |
| ] | |
| for s in summaries: | |
| lines.append( | |
| f"- C{s['cluster_id']}: {s['label']} ({s['paper_count']} papers, " | |
| f"confidence {s['confidence']}, PAJAIS: {s['category']}). " | |
| f"Evidence titles: {' | '.join(s['top_titles'][:3])}" | |
| ) | |
| lines.extend([ | |
| "", | |
| "## Validation", | |
| "Labels are validated through the in-app council table: keyword extraction, PAJAIS semantic mapping, " | |
| "and an LLM labeler when MISTRAL_API_KEY is configured. Without a key, the third council member " | |
| "uses a deterministic local semantic fallback, so the app remains executable end to end.", | |
| "", | |
| "TCCM and computational technique extraction are exported in `tccm_validation.csv` for the top-cited 100 papers. " | |
| "Rows marked `NEEDS_FULL_TEXT_REVIEW` should be checked against PDFs before final academic submission. " | |
| "Full TCCM compliance requires uploading NotebookLM extraction and a second LLM extraction in the app's " | |
| "TCCM Dual Validation tab to generate `tccm_dual_validation.csv`.", | |
| ]) | |
| path = _opath("topic_model_report.md") | |
| with open(path, "w", encoding="utf-8") as f: | |
| f.write("\n".join(lines)) | |
| with open(_opath("narrative.txt"), "w", encoding="utf-8") as f: | |
| f.write("\n".join(lines[:60])) | |
| return path | |
| def run_complete_pipeline(filepath: str) -> Dict[str, Any]: | |
| df, config = load_corpus(filepath) | |
| vectors, meta = _embed_documents(df["__combined"].tolist()) | |
| np.save(_opath("combined_emb.npy"), vectors) | |
| labels, probabilities, params, optimizer_log = _run_umap_hdbscan(vectors) | |
| summaries = _build_cluster_summaries(df, vectors, labels, probabilities) | |
| _save_json(summaries, "combined_labels.json") | |
| _save_json(summaries, "abstract_labels.json") | |
| _save_json(summaries, "title_labels.json") | |
| _save_json({"sentences": df["__combined"].tolist(), "paper_ids": df["__paper_id"].astype(int).tolist()}, "combined_sentences.json") | |
| taxonomy = _taxonomy_map(summaries) | |
| _save_json(taxonomy, "taxonomy_map.json") | |
| _generate_charts(summaries) | |
| comparison_path = _write_comparison_csv(summaries) | |
| tccm_path = _write_tccm_validation(df) | |
| _write_validation_files(summaries, optimizer_log, params, meta) | |
| dual_tccm_path = write_tccm_dual_validation() | |
| checklist_path = write_compliance_checklist(params, meta, summaries) | |
| report_path = _write_report(config, summaries, params, meta) | |
| deliverables = [ | |
| comparison_path, | |
| _opath("taxonomy_map.json"), | |
| _opath("topic_model_report.md"), | |
| _opath("narrative.txt"), | |
| _opath("cluster_optimization_log.csv"), | |
| _opath("llm_council_validation.csv"), | |
| _opath("tccm_validation.csv"), | |
| dual_tccm_path, | |
| checklist_path, | |
| _opath("run_metadata.json"), | |
| _opath("combined_labels.json"), | |
| ] | |
| return { | |
| "config": config, | |
| "parameters": params, | |
| "embedding": meta, | |
| "clusters": summaries, | |
| "taxonomy": taxonomy, | |
| "deliverables": [p for p in deliverables if os.path.exists(p)], | |
| } | |