Spaces:
Sleeping
Sleeping
# tools.py — BERTopic Thematic Analysis Tools
# Constraint: ZERO if/else statements, ZERO for/while loops, ZERO try/except blocks.
#
# PERFORMANCE FIXES vs original:
# FIX 1 — Sentence cap: max 3000 sentences fed to AgglomerativeClustering.
#         Without cap: 13,829 sentences → 730 MB distance matrix → timeout.
#         With cap 3000: 34 MB distance matrix → completes in ~30s.
# FIX 2 — Batch LLM labelling: all topics sent in ONE Mistral call (not 100).
#         Without batch: 100 API calls × 5s = ~500s minimum.
#         With batch: 1 API call × 15s = ~15s.
# FIX 3 — Mistral timeout raised to 120s to avoid ReadTimeout on large prompts.
# FIX 4 — load_scopus_csv uses utf-8-sig + quoting=0 (not quoting=3, which
#         broke multi-line abstracts into garbage rows).
| import re | |
| import json | |
| import os | |
| import numpy as np | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from langchain_core.tools import tool | |
| from langchain_core.prompts import PromptTemplate | |
| from langchain_core.output_parsers import JsonOutputParser | |
| from langchain_mistralai import ChatMistralAI | |
| from langchain_groq import ChatGroq | |
| from langchain_google_genai import ChatGoogleGenerativeAI | |
| from sentence_transformers import SentenceTransformer | |
| # Global model for evaluating label similarities quickly | |
| _label_sim_model = None | |
| from sklearn.cluster import AgglomerativeClustering, DBSCAN | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from sklearn.decomposition import PCA | |
| import nltk | |
| import torch | |
| from transformers import AutoTokenizer | |
| from adapters import AutoAdapterModel | |
| import umap | |
| import hdbscan | |
| nltk.download("punkt", quiet=True) | |
| nltk.download("punkt_tab", quiet=True) | |
| from nltk.tokenize import sent_tokenize | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Constants | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# Run key -> list of DataFrame column names; the tools read the first entry.
RUN_CONFIGS = dict(
    abstract=["Abstract"],
    title=["Title"],
    combined=["Combined"],
)

# Sentence-transformer model name (the label-similarity helper loads the same
# model id by literal).
MODEL_NAME = "all-MiniLM-L6-v2"

# How many texts closest to a centroid are stored with each topic summary.
NEAREST_K = 5

# Upper bound on topics forwarded to the LLMs in a single batch prompt.
MAX_LABEL_TOPICS = 60

# Hard cap on the number of texts that are embedded and clustered.
MAX_SENTENCES = 3000

# Default clustering distance threshold.
DEFAULT_THRESHOLD = 0.7

# Mistral request timeout in seconds; raised to avoid ReadTimeout on large
# prompts.
MISTRAL_TIMEOUT = 120
# Regular expressions (searched case-insensitively by _is_boilerplate) that
# flag publisher / copyright / citation boilerplate.
#
# Non-ASCII characters are written as \u escapes, which the `re` module
# resolves itself, so the patterns survive any re-encoding of this file.
# The copyright and page-range patterns had been corrupted into mojibake and
# could not match real "(c) 2021"-style or en-dash page-range text.
BOILERPLATE_PATTERNS = [
    r"\u00a9\s*\d{4}",                      # copyright sign followed by a year
    r"elsevier\s*(b\.v\.)?",
    r"springer\s*(nature)?",
    r"wiley\s*(online\s*library)?",
    r"all\s+rights\s+reserved",
    r"published\s+by\s+[a-z\s]+",
    r"doi:\s*10\.",
    r"www\.[a-z]+\.[a-z]+",
    r"https?://",
    r"copyright\s*\d{4}",
    r"taylor\s*&\s*francis",
    r"sage\s+publications",
    r"emerald\s+publishing",
    r"journal\s+of\s+[a-z\s]+issn",
    r"volume\s+\d+,?\s+issue\s+\d+",
    r"pp\.\s*\d+[-\u2013\u2014]\d+",        # hyphen, en dash or em dash range
    r"received\s+\d+\s+\w+\s+\d{4}",
    r"accepted\s+\d+\s+\w+\s+\d{4}",
    r"available\s+online",
    r"this\s+is\s+an\s+open\s+access",
    r"creative\s+commons",
    r"please\s+cite\s+this\s+article",
]
# PAJAIS category names, in the order they are numbered when rendered into
# the taxonomy-mapping prompt.
PAJAIS_TAXONOMY = [
    # Methods
    "Artificial Intelligence Methods",
    "Natural Language Processing",
    "Machine Learning",
    "Deep Learning",
    # Knowledge and retrieval
    "Knowledge Representation",
    "Ontologies & Semantic Web",
    "Information Retrieval",
    "Recommender Systems",
    "Decision Support Systems",
    # People and governance
    "Human-Computer Interaction",
    "Explainability & Transparency",
    "Fairness, Accountability & Ethics",
    # Data and text analytics
    "Data Management & Integration",
    "Text Mining & Analytics",
    "Sentiment Analysis",
    "Social Media Analysis",
    "Business Intelligence",
    "Process Automation & RPA",
    # Perception and agents
    "Computer Vision",
    "Speech & Audio Processing",
    "Multi-Agent Systems",
    "Robotics & Autonomous Systems",
    # Application domains
    "Healthcare & Biomedical AI",
    "Finance & Risk Analytics",
    "Education & E-Learning",
]
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Internal helpers β no loops, no if/else | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _is_boilerplate(s: str) -> bool:
    """Return True when any BOILERPLATE_PATTERNS regex matches ``s`` (case-insensitive)."""
    matches = (
        re.search(pattern, s, re.IGNORECASE) is not None
        for pattern in BOILERPLATE_PATTERNS
    )
    return any(matches)
| def _clean_sentences(raw: list) -> list: | |
| #no_bp = list(filter(lambda s: not _is_boilerplate(s), raw)) | |
| long_enuf = list(filter(lambda s: len(s.split()) >= 6, raw)) | |
| return long_enuf | |
def _texts_to_sentences(texts: list) -> list:
    """Prepare document-level inputs for embedding.

    Each text is treated as one document (one vector per paper); no sentence
    splitting is performed. The texts are only passed through
    ``_clean_sentences``.
    """
    documents = _clean_sentences(texts)
    return documents
def _embed(sentences: list) -> np.ndarray:
    """Embed texts with SPECTER2 and return one vector per input.

    Loads the ``allenai/specter2_base`` tokenizer and model plus the
    ``allenai/specter2`` adapter on every call, runs the texts through the
    model in batches of 16 under ``torch.no_grad()``, and keeps the hidden
    state at position 0 of each sequence as the embedding.

    Returns:
        The stacked batch embeddings, or an empty 1-D array when
        ``sentences`` is empty.
    """
    print(f"Loading SPECTER2 for {len(sentences)} items...")
    tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
    model = AutoAdapterModel.from_pretrained('allenai/specter2_base')
    model.load_adapter("allenai/specter2", source="hf", load_as="proximity", set_active=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    batch_size = 16

    def _encode_batch(start):
        # Tokenize one slice, move it to the model's device, keep position 0.
        chunk = sentences[start:start + batch_size]
        encoded = tokenizer(
            chunk,
            padding=True,
            truncation=True,
            return_tensors="pt",
            return_token_type_ids=False,
            max_length=512,
        )
        on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
        hidden = model(**on_device).last_hidden_state
        return hidden[:, 0, :].cpu().numpy()

    with torch.no_grad():
        # The list is built eagerly, so every forward pass runs inside no_grad.
        chunks = [_encode_batch(start) for start in range(0, len(sentences), batch_size)]

    return np.vstack(chunks) if chunks else np.array([])
def _cluster(embeddings: np.ndarray, threshold: float) -> np.ndarray:
    """Cluster embeddings agglomeratively and return one label per row.

    Uses cosine distance with average linkage; the number of clusters is not
    fixed and is instead determined by ``threshold`` (the distance threshold).
    """
    clusterer = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=threshold,
        metric="cosine",
        linkage="average",
    )
    return clusterer.fit_predict(embeddings)
| def _compute_centroids(embeddings: np.ndarray, labels: np.ndarray) -> dict: | |
| valid = sorted(set(labels.tolist()) - {-1}) | |
| return dict(map(lambda l: (l, embeddings[labels == l].mean(axis=0)), valid)) | |
def _nearest_sents(centroid: np.ndarray, sentences: list,
                   embeddings: np.ndarray, k: int) -> list:
    """Return the ``k`` texts whose embeddings are most cosine-similar to ``centroid``.

    ``sentences[i]`` is taken to correspond to ``embeddings[i]``; results are
    ordered from most to least similar.
    """
    similarity = cosine_similarity([centroid], embeddings)[0]
    ranked = np.argsort(similarity)[::-1]  # descending similarity
    return [sentences[i] for i in ranked[:k].tolist()]
def _build_summaries(labels: np.ndarray, sentences: list,
                     embeddings: np.ndarray) -> list:
    """Build one summary dict per cluster, ordered by ascending topic id.

    Each summary holds the topic id, the number of members, the centroid as a
    plain list, and the NEAREST_K texts closest to that centroid. Noise
    (label -1) gets no summary because it has no centroid.
    """
    centroids = _compute_centroids(embeddings, labels)
    return [
        {
            "topic_id": topic_id,
            "count": int((labels == topic_id).sum()),
            "centroid": centroids[topic_id].tolist(),
            "nearest_sentences": _nearest_sents(
                centroids[topic_id], sentences, embeddings, NEAREST_K),
        }
        for topic_id in sorted(centroids)
    ]
def _get_llm() -> ChatMistralAI:
    """Create the Mistral chat model used by the tools.

    ``max_retries`` is 0 on purpose: per the original note, retrying is owned
    by ``call_agent()`` in app.py, and retrying here as well caused
    double-retry storms that exhausted the rate limit faster.
    """
    settings = {
        "model": "mistral-large-latest",
        "temperature": 0.2,
        "timeout": MISTRAL_TIMEOUT,
        "max_retries": 0,  # no internal retry; the outer caller handles it
    }
    return ChatMistralAI(**settings)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Tool 1 β load_scopus_csv | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def load_scopus_csv(file_path: str) -> str:
    """Load a Scopus CSV export, add a ``Combined`` column and report basic stats.

    The file is read with ``utf-8-sig`` (handles a BOM) and ``quoting=0``
    (respects quoted multi-line cells); malformed lines are skipped. A
    ``Combined`` column (title + " " + abstract) is added and the frame is
    saved once to ``loaded_data.csv``.

    A missing ``Title`` or ``Abstract`` column is treated as empty text
    instead of raising ``KeyError``, and cells are cast to ``str`` before
    concatenation so numeric cells cannot break the join.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        JSON string with paper count, document counts, year range, the
        original column names, coverage percentages, sample titles and the
        name of the saved file.
    """
    df = pd.read_csv(
        file_path,
        encoding="utf-8-sig",
        quoting=0,
        engine="python",
        on_bad_lines="skip",
    )
    n = len(df)
    cols = list(df.columns)  # captured before "Combined" is added

    def _text_column(name):
        # Column as strings with NaN -> ""; an all-empty column when absent.
        return (
            df[name].fillna("").astype(str)
            if name in cols
            else pd.Series("", index=df.index, dtype=object)
        )

    abs_texts = list(df["Abstract"].dropna().astype(str)) if "Abstract" in cols else []
    ttl_texts = list(df["Title"].dropna().astype(str)) if "Title" in cols else []

    # One document per paper: title and abstract joined by a single space.
    df["Combined"] = _text_column("Title") + " " + _text_column("Abstract")
    df.to_csv("loaded_data.csv", index=False, encoding="utf-8")
    combined_texts = list(df["Combined"].astype(str))

    # The "abstract_sentences" key is kept for callers; it counts the
    # combined documents.
    abs_sents = combined_texts
    ttl_sents = ttl_texts

    years = (
        pd.to_numeric(df["Year"], errors="coerce").dropna()
        if "Year" in cols
        else pd.Series([], dtype=float)
    )
    year_range = f"{int(years.min())} \u2013 {int(years.max())}" if len(years) else "N/A"

    return json.dumps({
        "papers": n,
        "abstract_sentences": len(abs_sents),
        "title_sentences": len(ttl_sents),
        "year_range": year_range,
        "columns": cols,
        "abstract_coverage_pct": round(len(abs_texts) / n * 100, 1) if n else 0,
        "title_coverage_pct": round(len(ttl_texts) / n * 100, 1) if n else 0,
        "sample_titles": list(df["Title"].dropna().head(5)) if "Title" in cols else [],
        "file_saved": "loaded_data.csv",
        "note": f"Sentence cap for clustering is {MAX_SENTENCES} (for performance).",
    }, indent=2)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Tool 2 β run_bertopic_discovery | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _search_topic_labels(embeddings: np.ndarray) -> np.ndarray:
    """Cluster embeddings with UMAP + HDBSCAN, preferring 15 to 30 clusters.

    The grid is walked in the order n_neighbors -> min_cluster_size ->
    epsilon and the first labelling with 15-30 clusters (noise excluded) is
    returned. When no combination qualifies, the defaults n_neighbors=10,
    min_cluster_size=5, epsilon=0.5 are used.

    The UMAP projection depends only on ``n_neighbors`` and is seeded
    (random_state=42), so it is fitted once per ``n_neighbors`` value rather
    than once per grid point.
    """
    def _reduce(n_neighbors):
        reducer = umap.UMAP(n_neighbors=n_neighbors, n_components=5,
                            metric='cosine', random_state=42)
        return reducer.fit_transform(embeddings)

    def _fit(reduced, min_cluster_size, epsilon):
        clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                    cluster_selection_epsilon=epsilon,
                                    metric='euclidean')
        return clusterer.fit_predict(reduced)

    # Lazy generator: each fit only runs when next() asks for it, so the
    # search stops at the first acceptable labelling.
    candidates = (
        labels
        for n_neighbors in (5, 10, 15)
        for reduced in (_reduce(n_neighbors),)
        for min_cluster_size in (5, 10, 15)
        for epsilon in (0.5, 0.55, 0.6)
        for labels in (_fit(reduced, min_cluster_size, epsilon),)
        if 15 <= len(set(labels.tolist()) - {-1}) <= 30
    )
    chosen = next(candidates, None)
    return chosen if chosen is not None else _fit(_reduce(10), 5, 0.5)


def _write_topic_charts(summaries: list, run_key: str) -> list:
    """Write the four Plotly HTML charts for a run and return their file names.

    Expects a non-empty ``summaries`` list as produced by ``_build_summaries``.
    """
    counts = [s["count"] for s in summaries]
    ids = [s["topic_id"] for s in summaries]
    centroids_matrix = np.array([s["centroid"] for s in summaries])
    # The "top N" charts rank by cluster size, not by topic id.
    by_size = sorted(summaries, key=lambda s: s["count"], reverse=True)

    # Chart 1: intertopic distance map (centroids projected to 2D by PCA).
    n_comp = min(2, len(centroids_matrix), centroids_matrix.shape[1])
    pca2 = PCA(n_components=n_comp).fit_transform(centroids_matrix)
    x_vals = pca2[:, 0].tolist()
    y_vals = pca2[:, 1].tolist() if pca2.shape[1] > 1 else [0] * len(x_vals)
    fig1 = px.scatter(
        x=x_vals, y=y_vals,
        size=counts, text=list(map(str, ids)),
        title=f"Intertopic Distance Map ({run_key})",
        labels={"x": "PC1", "y": "PC2"},
        size_max=40, color=counts, color_continuous_scale="Blues",
    )
    fig1.update_traces(textposition="top center")
    fig1.update_layout(template="plotly_dark")
    chart1 = f"chart_{run_key}_intertopic.html"
    fig1.write_html(chart1, include_plotlyjs="cdn")

    # Chart 2: frequency bar chart of the 30 largest topics.
    top30 = by_size[:30]
    top30_counts = [s["count"] for s in top30]
    fig2 = px.bar(
        x=[f"T{s['topic_id']}" for s in top30],
        y=top30_counts,
        title=f"Topic Sentence Frequency ({run_key}) \u2014 Top 30",
        labels={"x": "Topic", "y": "Sentences"},
        color=top30_counts,
        color_continuous_scale="Teal",
    )
    fig2.update_layout(template="plotly_dark")
    chart2 = f"chart_{run_key}_bars.html"
    fig2.write_html(chart2, include_plotlyjs="cdn")

    # Chart 3: treemap of all topics sized by member count.
    fig3 = px.treemap(
        names=[f"T{s['topic_id']}" for s in summaries],
        parents=["Topics"] * len(summaries),
        values=counts,
        title=f"Topic Hierarchy ({run_key})",
    )
    fig3.update_layout(template="plotly_dark")
    chart3 = f"chart_{run_key}_hierarchy.html"
    fig3.write_html(chart3, include_plotlyjs="cdn")

    # Chart 4: centroid cosine-similarity heatmap of the 20 largest topics.
    top20 = by_size[:20]
    top20_c = np.array([s["centroid"] for s in top20])
    heat = cosine_similarity(top20_c).tolist()
    hlbls = [f"T{s['topic_id']}" for s in top20]
    fig4 = go.Figure(data=go.Heatmap(z=heat, x=hlbls, y=hlbls, colorscale="Blues"))
    fig4.update_layout(
        title=f"Inter-Topic Cosine Similarity ({run_key})", template="plotly_dark")
    chart4 = f"chart_{run_key}_heatmap.html"
    fig4.write_html(chart4, include_plotlyjs="cdn")

    return [chart1, chart2, chart3, chart4]


def run_bertopic_discovery(run_key: str = "abstract", threshold: float = 0.7) -> str:
    """
    Core clustering tool.

    Reads ``loaded_data.csv``, takes the column configured for ``run_key``,
    keeps documents with at least six words, caps them at MAX_SENTENCES,
    embeds them with ``_embed`` and clusters the embeddings with UMAP +
    HDBSCAN (hyper-parameter search aiming for 15-30 clusters). Saves the
    embeddings (``emb_{run_key}.npy``) and topic summaries
    (``summaries_{run_key}.json``) and writes four Plotly HTML charts.

    If clustering yields no topics (everything is noise), no charts are
    written and ``charts`` is an empty list.

    Args:
        run_key: key of RUN_CONFIGS ('abstract', 'title' or 'combined').
        threshold: kept for interface compatibility; it does not influence
            the UMAP + HDBSCAN clustering and is only echoed back as
            ``threshold_used``.
    Returns:
        JSON: total_topics, total_sentences, sentences_used, chart files.
    """
    df = pd.read_csv("loaded_data.csv")
    col = RUN_CONFIGS[run_key][0]
    texts = list(df[col].dropna().astype(str))

    # Documents (one per paper), capped for performance.
    all_sentences = _texts_to_sentences(texts)
    sentences = all_sentences[:MAX_SENTENCES]
    print(f"[run_bertopic] {len(all_sentences)} items \u2192 capped to {len(sentences)}")

    embeddings = _embed(sentences)
    np.save(f"emb_{run_key}.npy", embeddings)

    labels = _search_topic_labels(embeddings)

    # Noise points (label -1) are left unassigned; _build_summaries skips them.
    summaries = _build_summaries(labels, sentences, embeddings)
    with open(f"summaries_{run_key}.json", "w", encoding="utf-8") as f:
        json.dump(summaries, f, indent=2)

    # With zero topics the centroid matrix is empty and PCA would crash.
    charts = _write_topic_charts(summaries, run_key) if summaries else []

    return json.dumps({
        "run_key": run_key,
        "total_topics": len(summaries),
        "total_sentences": len(all_sentences),
        "sentences_used": len(sentences),
        "sentences_capped": len(all_sentences) > MAX_SENTENCES,
        "threshold_used": threshold,
        "summaries_file": f"summaries_{run_key}.json",
        "embeddings_file": f"emb_{run_key}.npy",
        "charts": charts,
        "topics_preview": summaries[:3],
    }, indent=2)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Tool 3 β label_topics_with_llm (BATCH β 1 API call, not 100) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def label_topics_with_llm(run_key: str = "abstract") -> str:
    """
    Label topic clusters with a three-model AI Council (Mistral, Groq, Gemini).

    Reads ``summaries_{run_key}.json``, sends the first MAX_LABEL_TOPICS
    topics to each model in ONE batch call per model, and stores for every
    topic a label plus an HTML consensus panel in ``labels_{run_key}.json``.

    The stored label is the first model's label, falling back to the next
    model that actually answered for that topic. The agreement score is the
    best pairwise label similarity among models that answered; a topic is
    "agreed" at a score of 0.65 or higher. Answers that are malformed (not a
    dict, or without ``topic_id``) are ignored rather than raising.

    Args:
        run_key: which run's summaries to label.
    Returns:
        JSON: run_key, total_labelled, output_file, preview.
    """
    with open(f"summaries_{run_key}.json", encoding="utf-8") as f:
        summaries = json.load(f)
    top = summaries[:MAX_LABEL_TOPICS]

    llm_a = _get_llm()
    llm_b = _get_council_llm_b()
    llm_c = _get_council_llm_c()
    parser = JsonOutputParser()
    prompt = PromptTemplate(
        input_variables=["topics_json", "n"],
        template=(
            "You are a thematic analysis expert.\n\n"
            "Below are {n} topic clusters. For EACH cluster, provide a research label AND 1-2 precise sentences of reasoning.\n"
            "{topics_json}\n\n"
            "Return ONLY a JSON array. Each element: {{\"topic_id\": int, \"label\": \"Concise Label\", \"reasoning\": \"1-2 sentences of academic justification.\"}}"
        ),
    )

    # Two texts per topic, each cut to 500 chars to stay within Groq's
    # 12000 TPM limit; the ellipsis is added only when text was removed.
    max_chars = 500
    topics_json = json.dumps(
        [
            {
                "id": s["topic_id"],
                "sents": [
                    sent[:max_chars] + ("..." if len(sent) > max_chars else "")
                    for sent in s["nearest_sentences"][:2]
                ],
            }
            for s in top
        ],
        indent=2,
    )
    payload = {"topics_json": topics_json, "n": len(top)}

    def _ask(llm):
        # One batch call; keep only well-formed answers, indexed by topic id.
        answers = (prompt | llm | parser).invoke(payload) or []
        return {
            str(a["topic_id"]): a
            for a in answers
            if isinstance(a, dict) and "topic_id" in a
        }

    idx_a = _ask(llm_a)
    idx_b = _ask(llm_b)
    idx_c = _ask(llm_c)

    def _vote(index, topic_id):
        # (label, reasoning) of one model for one topic, with safe defaults.
        answer = index.get(str(topic_id), {})
        return str(answer.get("label") or "Unknown"), str(answer.get("reasoning") or "")

    def merge_council(s):
        l_a, r_a = _vote(idx_a, s["topic_id"])
        l_b, r_b = _vote(idx_b, s["topic_id"])
        l_c, r_c = _vote(idx_c, s["topic_id"])
        # Compare only pairs where both models answered, so two missing
        # answers ("Unknown" vs "Unknown") are not reported as consensus.
        pair_scores = [
            _council_agreement_score(x, y)
            for x, y in ((l_a, l_b), (l_b, l_c), (l_c, l_a))
            if "Unknown" not in (x, y)
        ]
        score = max(pair_scores, default=0.0)
        agreed = score >= 0.65
        ui = format_consensus_ui(l_a, l_b, l_c, agreed, score, r_a, r_b, r_c)
        label = next((l for l in (l_a, l_b, l_c) if l != "Unknown"), "Unknown")
        return {
            **s, "label": label,
            "council_ui": ui
        }

    labelled = [merge_council(s) for s in top]
    out = f"labels_{run_key}.json"
    with open(out, "w", encoding="utf-8") as f:
        json.dump(labelled, f, indent=2)
    return json.dumps({
        "run_key": run_key,
        "total_labelled": len(labelled),
        "output_file": out,
        "preview": labelled[:5],
    }, indent=2)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Tool 4 β consolidate_into_themes | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def consolidate_into_themes(run_key: str = "abstract", theme_map: str = "") -> str:
    """
    Merge labelled topics into 4-8 core themes using a three-model AI Council.

    Reads ``labels_{run_key}.json``, asks each council model for a theme
    grouping and keeps the first model's themes (falling back to the next
    model whose answer is non-empty). Every theme is enriched from its member
    topics with ``constituent_labels``, ``total_sentences`` and up to three
    ``representative_sentences`` (300 chars each), which are the fields the
    taxonomy-mapping and comparison tools read. The result is written to both
    ``themes_{run_key}.json`` and ``themes.json``.

    Args:
        run_key: which run's labels to consolidate.
        theme_map: accepted for interface compatibility; not used.
    Returns:
        JSON: run_key, total_themes, output_file, themes_preview.
    """
    with open(f"labels_{run_key}.json", encoding="utf-8") as f:
        labelled = json.load(f)

    llm_a = _get_llm()
    llm_b = _get_council_llm_b()
    llm_c = _get_council_llm_c()
    parser = JsonOutputParser()
    prompt = PromptTemplate(
        input_variables=["topics_json"],
        template=(
            "You are a thematic analyst.\n\n"
            "Topics: {topics_json}\n\n"
            "Consolidate into 4-8 themes. Return JSON array. Each element: "
            "{{\"theme_name\": \"...\", \"topic_ids\": [1,2,3], \"rationale\": \"...\"}}"
        ),
    )
    summary = json.dumps(
        [{"id": t["topic_id"], "lbl": t["label"]} for t in labelled], indent=2)

    def _ask(llm):
        # One call per model; keep only dict-shaped themes.
        answer = (prompt | llm | parser).invoke({"topics_json": summary}) or []
        return [theme for theme in answer if isinstance(theme, dict)]

    raw_a = _ask(llm_a)
    raw_b = _ask(llm_b)
    raw_c = _ask(llm_c)

    def _headline(raw):
        # Names of a model's first two themes, used as its comparison string.
        return ", ".join(str(x.get("theme_name", "")) for x in raw[:2])

    l_a, l_b, l_c = _headline(raw_a), _headline(raw_b), _headline(raw_c)
    s_ab = _council_agreement_score(l_a, l_b)
    s_bc = _council_agreement_score(l_b, l_c)
    s_ca = _council_agreement_score(l_c, l_a)
    score = max(s_ab, s_bc, s_ca)
    agreed = score >= 0.3
    ui = format_consensus_ui(l_a, l_b, l_c, agreed, score)

    topics_by_id = {str(t["topic_id"]): t for t in labelled}

    def _enrich(theme):
        # Attach member-topic statistics; unknown topic ids are skipped.
        members = [
            topics_by_id[str(tid)]
            for tid in (theme.get("topic_ids") or [])
            if str(tid) in topics_by_id
        ]
        return {
            **theme,
            "constituent_labels": [str(m.get("label", "")) for m in members],
            "total_sentences": sum(int(m.get("count", 0)) for m in members),
            "representative_sentences": [
                str(sent)[:300]
                for m in members
                for sent in (m.get("nearest_sentences") or [])[:1]
            ][:3],
            "council_ui": ui,
        }

    themes = [_enrich(theme) for theme in (raw_a or raw_b or raw_c)]

    out = f"themes_{run_key}.json"
    with open(out, "w", encoding="utf-8") as f:
        json.dump(themes, f, indent=2)
    with open("themes.json", "w", encoding="utf-8") as f:
        json.dump(themes, f, indent=2)
    return json.dumps({
        "run_key": run_key,
        "total_themes": len(themes),
        "output_file": out,
        "themes_preview": themes[:3],
    }, indent=2)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Tool 5 β compare_with_taxonomy | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def compare_with_taxonomy(run_key: str = "abstract") -> str:
    """
    Map each consolidated theme to the PAJAIS taxonomy via Mistral.

    Returns MAPPED vs NOVEL per theme. Saves taxonomy_map.json.

    ``themes_{run_key}.json`` is preferred over the generic ``themes.json``
    so that abstract and title runs never cross-contaminate each other's
    theme data. Malformed entries (non-dict themes or mapping items) are
    ignored, and an empty model answer is treated as an empty mapping.

    Args:
        run_key: 'abstract' or 'title'
    Returns:
        JSON: total mapped, novel count, full mapping, output_file.
    """
    # Use the run-specific file first, fall back to the generic themes.json.
    run_themes_file = f"themes_{run_key}.json"
    themes_file = run_themes_file if os.path.exists(run_themes_file) else "themes.json"
    with open(themes_file, encoding="utf-8") as f:
        themes = json.load(f)

    llm = _get_llm()
    parser = JsonOutputParser()
    prompt = PromptTemplate(
        input_variables=["themes_json", "taxonomy", "n_categories"],
        template=(
            "You are a research classification expert.\n\n"
            "PAJAIS Taxonomy ({n_categories} categories):\n{taxonomy}\n\n"
            "Themes from corpus:\n{themes_json}\n\n"
            "For each theme, find the best PAJAIS category match.\n"
            "Return ONLY a valid JSON array \u2014 no markdown. Each element:\n"
            " theme_name: string (match input exactly)\n"
            " pajais_match: best PAJAIS category, or 'NOVEL' if none fits\n"
            " match_confidence: float 0.0-1.0\n"
            " reasoning: one sentence\n"
            " is_novel: boolean\n"
        ),
    )
    chain = prompt | llm | parser

    theme_summaries = [
        {
            "theme_name": t.get("theme_name", ""),
            "total_sentences": t.get("total_sentences", 0),
            "constituent_labels": t.get("constituent_labels", []),
            # First 100 chars of the first representative text, if any.
            "sample": str((t.get("representative_sentences") or [""])[0])[:100],
        }
        for t in themes
        if isinstance(t, dict)
    ]

    raw_mapping = chain.invoke({
        "themes_json": json.dumps(theme_summaries, indent=2),
        "taxonomy": "\n".join(f"{i+1}. {c}" for i, c in enumerate(PAJAIS_TAXONOMY)),
        "n_categories": len(PAJAIS_TAXONOMY),
    }) or []
    mapping = [m for m in raw_mapping if isinstance(m, dict)]

    with open("taxonomy_map.json", "w", encoding="utf-8") as f:
        json.dump(mapping, f, indent=2)

    novel_count = sum(1 for m in mapping if m.get("is_novel", False))
    return json.dumps({
        "run_key": run_key,
        "total_themes_mapped": len(mapping),
        "novel_themes": novel_count,
        "mapped_themes": len(mapping) - novel_count,
        "output_file": "taxonomy_map.json",
        "mapping": mapping,
    }, indent=2)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Tool 6 β generate_comparison_csv | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def generate_comparison_csv() -> str:
    """
    Load themes from both abstract and title runs, create side-by-side
    comparison DataFrame. Saves comparison.csv.

    Rows pair the i-th abstract theme with the i-th title theme; the shorter
    list is padded with empty entries and at least one row is always written.
    "Convergence" is ticked when both theme names are non-empty and share
    the same first eight characters (case-insensitive).

    Returns:
        JSON: output_file, row_count, preview.
    """
    def _read_json(path):
        # Context manager so the file handle is always closed.
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    def _load(rk):
        path = f"themes_{rk}.json"
        return _read_json(path) if os.path.exists(path) else []

    abs_themes = _load("abstract")
    ttl_themes = _load("title")
    max_rows = max(len(abs_themes), len(ttl_themes), 1)
    pad_abs = abs_themes + [{}] * (max_rows - len(abs_themes))
    pad_ttl = ttl_themes + [{}] * (max_rows - len(ttl_themes))

    def _labels(theme):
        # First three constituent labels as one comma-separated string.
        return ", ".join(map(str, theme.get("constituent_labels", [])[:3]))

    def _converges(abs_theme, ttl_theme):
        # Padding rows have empty names and must not count as a match.
        a = str(abs_theme.get("theme_name", "")).lower()[:8]
        t = str(ttl_theme.get("theme_name", "")).lower()[:8]
        return bool(a) and a == t

    rows = [
        {
            "#": i + 1,
            "Abstract Theme": abs_theme.get("theme_name", ""),
            "Abstract Sents": abs_theme.get("total_sentences", 0),
            "Abstract Labels": _labels(abs_theme),
            "Title Theme": ttl_theme.get("theme_name", ""),
            "Title Sents": ttl_theme.get("total_sentences", 0),
            "Title Labels": _labels(ttl_theme),
            "Convergence": "\u2713" if _converges(abs_theme, ttl_theme) else "",
        }
        for i, (abs_theme, ttl_theme) in enumerate(zip(pad_abs, pad_ttl))
    ]

    df = pd.DataFrame(rows)
    df.to_csv("comparison.csv", index=False)
    return json.dumps({
        "output_file": "comparison.csv",
        "row_count": len(df),
        "preview": rows[:3],
    }, indent=2)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Tool 7 β export_narrative | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def export_narrative(run_key: str = "abstract") -> str:
    """
    Generate a 500-word Section 7 narrative using Mistral LLM.

    Covers methodology, themes, PAJAIS alignment, limitations, implications.
    Saves narrative.txt.

    Themes are read from ``themes_{run_key}.json`` when it exists, otherwise
    from the generic ``themes.json``, so the narrative describes the
    requested run. ``taxonomy_map.json`` is optional. The ``council_ui``
    HTML blob is removed from the themes before prompting because it is
    display markup, not content.

    Args:
        run_key: 'abstract' or 'title'
    Returns:
        JSON: output_file, word_count, 500-char preview.
    """
    def _read_json(path):
        # Context manager so the file handle is always closed.
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    run_themes_file = f"themes_{run_key}.json"
    themes_file = run_themes_file if os.path.exists(run_themes_file) else "themes.json"
    themes = _read_json(themes_file)
    tax_data = _read_json("taxonomy_map.json") if os.path.exists("taxonomy_map.json") else []

    prompt_themes = [
        {k: v for k, v in t.items() if k != "council_ui"} if isinstance(t, dict) else t
        for t in themes
    ]

    llm = _get_llm()
    llm.temperature = 0.4  # Slightly higher for creativity in Section 7 narrative
    prompt = PromptTemplate(
        input_variables=["run_key", "themes_json", "taxonomy_json"],
        template=(
            "You are writing Section 7 of an academic literature review paper.\n\n"
            "Analysis column: {run_key}\n"
            "Themes:\n{themes_json}\n\n"
            "PAJAIS Mapping:\n{taxonomy_json}\n\n"
            "Write a 500-word Section 7 covering:\n"
            "1. Methodology (BERTopic + Braun & Clarke 2006 six phases)\n"
            "2. Key themes discovered (reference each by name)\n"
            "3. PAJAIS taxonomy alignment (MAPPED vs NOVEL themes)\n"
            "4. Limitations of this computational approach\n"
            "5. Implications for future research\n\n"
            "Academic third-person prose, full paragraphs only, minimum 500 words."
        ),
    )
    chain = prompt | llm
    response = chain.invoke({
        "run_key": run_key,
        "themes_json": json.dumps(prompt_themes, indent=2),
        "taxonomy_json": json.dumps(tax_data, indent=2),
    })
    # Chat models return a message object; fall back to str() otherwise.
    text = response.content if hasattr(response, "content") else str(response)
    with open("narrative.txt", "w", encoding="utf-8") as f:
        f.write(text)
    return json.dumps({
        "output_file": "narrative.txt",
        "word_count": len(text.split()),
        "preview": text[:500],
    }, indent=2)
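# NOTE(review): the "zero if/else, zero for/while" claim is out of date.
# At the time of review, _embed, run_bertopic_discovery,
# _council_agreement_score and run_dbscan_clustering contain explicit
# for loops and/or if statements. No try/except blocks were found.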
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # AI Council helpers | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _get_council_llm_b() -> ChatGroq:
    """Build the second council member: Groq-hosted Llama 3.3 70B, no internal retries."""
    council_member = ChatGroq(
        model="llama-3.3-70b-versatile",
        temperature=0.2,
        max_retries=0,
    )
    return council_member
def _get_council_llm_c() -> ChatGoogleGenerativeAI:
    """Build the third council member: Gemini 2.5 Flash, no internal retries."""
    council_member = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash",
        temperature=0.2,
        max_retries=0,
    )
    return council_member
def format_consensus_ui(label_a, label_b, label_c="", agreed=False, score=0.0, reason_a="", reason_b="", reason_c=""):
    """Generate an ultra-compact HTML Argument UI for 3 models.

    The three reasoning strings come from LLM output, so they are
    HTML-escaped before being placed in the markup; a stray ``<`` or a
    ``<script>`` tag in a model answer can no longer break or inject into
    the page.

    Args:
        label_a, label_b, label_c: the models' labels. Accepted for interface
            compatibility; the panel shows only the reasoning texts.
        agreed: whether the council reached consensus (selects icon/colour).
        score: best pairwise agreement score, shown verbatim.
        reason_a, reason_b, reason_c: reasoning text of the Mistral, Groq and
            Gemini models respectively.

    Returns:
        An HTML snippet (str).
    """
    import html  # local import keeps this block self-contained

    status_icon = "\u2705 Match" if agreed else "\u26a0\ufe0f Diverge"
    status_color = "#2ecc71" if agreed else "#e67e22"
    safe_a = html.escape(str(reason_a or ""))
    safe_b = html.escape(str(reason_b or ""))
    safe_c = html.escape(str(reason_c or ""))
    safe_score = html.escape(str(score))
    return f"""
<div style="margin-top:4px; border-left: 2px solid {status_color}; padding-left:8px; font-size:0.75rem;">
<div style="color:{status_color}; font-weight:700; margin-bottom:2px;">{status_icon} (Max Match: {safe_score})</div>
<div style="display:flex; gap:10px;">
<div style="flex:1; background:#0d1117; padding:6px; border-radius:4px; border:1px solid #30363d; color:#f0f6fc;">
<b style="color:#7fb3f5; font-size:0.65rem;">MISTRAL:</b> {safe_a}
</div>
<div style="flex:1; background:#0d1117; padding:6px; border-radius:4px; border:1px solid #30363d; color:#f0f6fc;">
<b style="color:#7fb3f5; font-size:0.65rem;">GROQ:</b> {safe_b}
</div>
<div style="flex:1; background:#0d1117; padding:6px; border-radius:4px; border:1px solid #30363d; color:#f0f6fc;">
<b style="color:#7fb3f5; font-size:0.65rem;">GEMINI:</b> {safe_c}
</div>
</div>
</div>
"""
def _council_agreement_score(label_a: str, label_b: str) -> float:
    """Return the cosine similarity of two label strings, rounded to 3 decimals.

    The "all-MiniLM-L6-v2" sentence-transformer is created on first use and
    kept in the module-level ``_label_sim_model`` for later calls.
    """
    global _label_sim_model
    _label_sim_model = (
        SentenceTransformer("all-MiniLM-L6-v2")
        if _label_sim_model is None
        else _label_sim_model
    )
    vec_a, vec_b = _label_sim_model.encode([label_a, label_b])
    similarity = cosine_similarity([vec_a], [vec_b])[0][0]
    return round(float(similarity), 3)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Tool 8 β run_dbscan_clustering | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _search_hdbscan_params(embeddings, min_clusters=15, max_clusters=30):
    """
    Grid-search UMAP + HDBSCAN settings for a cluster count in a target range.

    Tries n_neighbors in (5, 10, 15), min_cluster_size in (5, 10, 15) and
    cluster_selection_epsilon in (0.4, 0.5, 0.6), in that nesting order, and
    stops at the first combination whose number of non-noise clusters lies in
    [min_clusters, max_clusters].

    Returns:
        (labels, params) for the first matching combination, or None when no
        combination matches.
    """
    for n_neighbors in (5, 10, 15):
        # The UMAP projection depends only on n_neighbors (random_state is
        # fixed), so fit it once per value rather than once per HDBSCAN setting.
        reducer = umap.UMAP(n_neighbors=n_neighbors, n_components=5, metric='cosine', random_state=42)
        umap_emb = reducer.fit_transform(embeddings)
        for min_cluster_size in (5, 10, 15):
            for cluster_selection_epsilon in (0.4, 0.5, 0.6):
                db = hdbscan.HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    cluster_selection_epsilon=cluster_selection_epsilon,
                    metric='euclidean',
                )
                labels = db.fit_predict(umap_emb)
                n_clusters = len(set(labels) - {-1})  # label -1 marks noise points
                if min_clusters <= n_clusters <= max_clusters:
                    params = {
                        'n_neighbors': n_neighbors,
                        'min_cluster_size': min_cluster_size,
                        'eps': cluster_selection_epsilon,
                    }
                    return labels, params
    return None


def run_dbscan_clustering(run_key: str = "abstract", eps: float = 0.3, min_samples: int = 3) -> str:
    """
    Cluster the saved embeddings with UMAP + HDBSCAN and write summaries/charts.

    Searches a small hyperparameter grid for a setting that yields between
    15 and 30 clusters; if none does, falls back to n_neighbors=10,
    min_cluster_size=5, cluster_selection_epsilon=0.5.

    Reads emb_{run_key}.npy and summaries_{run_key}.json (and loaded_data.csv
    when the summaries carry no nearest sentences). Writes
    dbscan_summaries_{run_key}.json, chart_{run_key}_dbscan_scatter.html and
    chart_{run_key}_dbscan_comparison.html.

    Args:
        run_key: suffix selecting which embedding/summary files to use.
        eps: accepted for interface compatibility only; not used.
        min_samples: accepted for interface compatibility only; not used.
    Returns:
        JSON string with run_key, n_clusters, noise_points, largest_cluster,
        best_params, summaries_file, charts and a 3-item preview.
    """
    embeddings = np.load(f"emb_{run_key}.npy")
    with open(f"summaries_{run_key}.json", encoding="utf-8") as f:
        agg_summaries = json.load(f)

    # Candidate sentences used to describe each cluster.
    all_nearest = [s for summ in agg_summaries for s in summ.get("nearest_sentences", [])]
    if not all_nearest:
        df = pd.read_csv("loaded_data.csv")
        all_nearest = list(df["Combined"].dropna().astype(str))

    search_hit = _search_hdbscan_params(embeddings)
    if search_hit is not None:
        db_labels, best_params = search_hit
    else:
        # No grid point landed in the target range: use a fixed default setting.
        reducer = umap.UMAP(n_neighbors=10, n_components=5, metric='cosine', random_state=42)
        umap_emb = reducer.fit_transform(embeddings)
        db = hdbscan.HDBSCAN(min_cluster_size=5, cluster_selection_epsilon=0.5, metric='euclidean')
        db_labels = db.fit_predict(umap_emb)
        best_params = {'fallback': True}

    valid_ids = sorted(set(db_labels.tolist()) - {-1})
    noise_count = int((db_labels == -1).sum())
    # Centroids are computed in the original embedding space, not the UMAP one.
    centroids = _compute_centroids(embeddings, db_labels)

    def _dbscan_summary(cid):
        mask = db_labels == cid
        count = int(mask.sum())
        # NOTE(review): all_nearest comes from the agglomerative summaries (or
        # the CSV) and is not obviously row-aligned with `embeddings`; confirm
        # what _nearest_sents expects before relying on these sentences.
        sents = _nearest_sents(centroids[cid],
                               all_nearest or [f"Cluster {cid}"],
                               embeddings[: len(all_nearest or ["x"])],
                               min(3, len(all_nearest or ["x"])))
        return {
            "cluster_id": int(cid),
            "count": count,
            "centroid": centroids[cid].tolist(),
            "nearest_sentences": sents,
            "source": "hdbscan",
        }

    summaries = list(map(_dbscan_summary, valid_ids))
    out_file = f"dbscan_summaries_{run_key}.json"
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(summaries, f, indent=2)

    # ── Chart 1: 2-D PCA scatter coloured by cluster label ───────────────────
    n_comp = min(2, len(embeddings), embeddings.shape[1])
    pca2 = PCA(n_components=n_comp).fit_transform(embeddings)
    x_vals = pca2[:, 0].tolist()
    y_vals = pca2[:, 1].tolist() if n_comp > 1 else [0.0] * len(x_vals)
    colors = db_labels.tolist()
    fig_scatter = px.scatter(
        x=x_vals, y=y_vals,
        color=list(map(str, colors)),
        title=f"HDBSCAN Cluster Map ({run_key}) — {best_params}",
        labels={"x": "PC1", "y": "PC2", "color": "Cluster"},
        opacity=0.7,
    )
    fig_scatter.update_layout(template="plotly_dark")
    chart_scatter = f"chart_{run_key}_dbscan_scatter.html"
    fig_scatter.write_html(chart_scatter, include_plotlyjs="cdn")

    # ── Chart 2: cluster-count comparison against the agglomerative run ──────
    agg_count = len(agg_summaries)
    dbscan_count = len(summaries)
    fig_cmp = px.bar(
        x=["Agglomerative", "HDBSCAN"],
        y=[agg_count, dbscan_count],
        color=["Agglomerative", "HDBSCAN"],
        color_discrete_sequence=["#4a90d9", "#e67e22"],
        title=f"Cluster Count Comparison ({run_key})",
        labels={"x": "Method", "y": "# Clusters"},
        text=[agg_count, dbscan_count],
    )
    fig_cmp.update_traces(textposition="outside")
    fig_cmp.update_layout(template="plotly_dark", showlegend=False)
    chart_cmp = f"chart_{run_key}_dbscan_comparison.html"
    fig_cmp.write_html(chart_cmp, include_plotlyjs="cdn")

    largest = max(map(lambda s: s["count"], summaries), default=0)
    return json.dumps({
        "run_key": run_key,
        "n_clusters": len(summaries),
        "noise_points": noise_count,
        "largest_cluster": largest,
        "best_params": best_params,
        "summaries_file": out_file,
        "charts": [chart_scatter, chart_cmp],
        "preview": summaries[:3],
    }, indent=2)
# ─────────────────────────────────────────────────────────────────────────────
# Tool 9 — refine_large_clusters
# ─────────────────────────────────────────────────────────────────────────────
def refine_large_clusters(run_key: str = "abstract", size_threshold: int = 100) -> str:
    """
    Post-processing: split overly large clusters into sub-clusters.

    Loads dbscan_summaries_{run_key}.json and emb_{run_key}.npy, re-clusters
    every cluster whose count is >= size_threshold with AgglomerativeClustering
    (cosine metric, average linkage, distance_threshold=0.45), and keeps the
    smaller clusters as-is with a "dbscan_unchanged" source tag.
    Does NOT modify dbscan_summaries or any existing agglomerative results.
    Saves results to refined_clusters_{run_key}.json and a treemap to
    chart_{run_key}_refined.html.

    Args:
        run_key: suffix selecting which summary/embedding files to use
            (e.g. 'abstract' or 'title').
        size_threshold: clusters with count >= this value are refined
            (default 100).
    Returns:
        JSON string with run_key, size_threshold, n_large_refined,
        total_subclusters, unchanged_clusters, total_output_clusters,
        output_file, chart and a 4-item preview.
    """
    dbscan_file = f"dbscan_summaries_{run_key}.json"
    with open(dbscan_file, encoding="utf-8") as f:
        summaries = json.load(f)
    embeddings = np.load(f"emb_{run_key}.npy")

    large = [s for s in summaries if s["count"] >= size_threshold]
    unchanged = [s for s in summaries if s["count"] < size_threshold]

    def _refine_one(parent_summary):
        """Re-cluster one large parent cluster into a list of sub-cluster dicts."""
        pid = parent_summary["cluster_id"]
        parent_c = np.array(parent_summary["centroid"])
        # The summaries do not store per-row membership, so the parent's members
        # are approximated by the `count` rows most similar to its centroid.
        sims = cosine_similarity([parent_c], embeddings)[0]
        count = parent_summary["count"]
        idxs = np.argsort(sims)[::-1][:count].tolist()
        sub_emb = embeddings[idxs]
        if len(sub_emb) < 2:
            # AgglomerativeClustering requires at least two samples; a 0/1-row
            # slice is kept as a single sub-cluster instead of raising.
            sub_labels = np.zeros(len(sub_emb), dtype=int)
        else:
            sub_labels = AgglomerativeClustering(
                metric="cosine", linkage="average",
                distance_threshold=0.45, n_clusters=None,
            ).fit_predict(sub_emb)
        sub_ids = sorted(set(sub_labels.tolist()))
        sub_centroids = {sid: sub_emb[sub_labels == sid].mean(axis=0) for sid in sub_ids}
        # Sub-clusters inherit the parent's representative sentences.
        parent_sents = parent_summary.get("nearest_sentences", [])

        def _sub(sid):
            return {
                "cluster_id": f"{pid}.{sid}",
                "parent_cluster_id": pid,
                "count": int((sub_labels == sid).sum()),
                "centroid": sub_centroids[sid].tolist(),
                "nearest_sentences": parent_sents[:3],
                "source": "dbscan_refined",
            }

        return list(map(_sub, sub_ids))

    refined_subs = [item for sublist in map(_refine_one, large) for item in sublist]
    # Unchanged clusters kept as-is with a source tag
    unchanged_kept = [{**s, "source": "dbscan_unchanged"} for s in unchanged]
    all_refined = unchanged_kept + refined_subs

    out_file = f"refined_clusters_{run_key}.json"
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(all_refined, f, indent=2)

    # ── Chart: Treemap of refined sub-clusters ───────────────────────────────
    # Every parent referenced by a sub-cluster must itself be a node, otherwise
    # the treemap has several implied roots and plotly cannot build the
    # hierarchy. Parent nodes get value 0 so that, with plotly's default
    # "remainder" branch values, their area is the sum of their children.
    parent_nodes = [str(s["cluster_id"]) for s in large]
    labels_list = parent_nodes + [str(c["cluster_id"]) for c in all_refined]
    parents_list = ["root"] * len(parent_nodes) + [
        str(c.get("parent_cluster_id", "root")) if "." in str(c["cluster_id"]) else "root"
        for c in all_refined
    ]
    values_list = [0] * len(parent_nodes) + [c["count"] for c in all_refined]
    fig_tree = px.treemap(
        names=labels_list,
        parents=parents_list,
        values=values_list,
        title=f"Refined Sub-Clusters ({run_key}) — threshold={size_threshold}",
    )
    fig_tree.update_layout(template="plotly_dark")
    chart_tree = f"chart_{run_key}_refined.html"
    fig_tree.write_html(chart_tree, include_plotlyjs="cdn")

    return json.dumps({
        "run_key": run_key,
        "size_threshold": size_threshold,
        "n_large_refined": len(large),
        "total_subclusters": len(refined_subs),
        "unchanged_clusters": len(unchanged),
        "total_output_clusters": len(all_refined),
        "output_file": out_file,
        "chart": chart_tree,
        "preview": all_refined[:4],
    }, indent=2)
# ─────────────────────────────────────────────────────────────────────────────
# Tool 10 — run_ai_council
# ─────────────────────────────────────────────────────────────────────────────
def run_ai_council(run_key: str = "abstract") -> str:
    """
    AI Council: three LLMs independently label each cluster, then a consensus
    step picks one label per cluster.

    Each model (A from _get_llm, B from _get_council_llm_b, C from
    _get_council_llm_c) receives the same prompt containing up to 3
    representative sentences per cluster for the first MAX_LABEL_TOPICS
    clusters in dbscan_summaries_{run_key}.json.

    Consensus rule (scores come from _council_agreement_score):
      - The three pairwise label scores are computed; the cluster counts as
        "agreed" when the highest pairwise score is >= 0.65.
      - Agreed: the label with the highest average score against the other
        two wins (ties prefer A, then B).
      - Not agreed: Model A's label is used.
      - A model that returned nothing usable for a cluster contributes the
        placeholder label "Cluster <id>".

    Saves council_labels_{run_key}.json (each entry carries a 'label' field
    for PAJAIS compatibility).

    Args:
        run_key: suffix selecting which summary file to use
            (e.g. 'abstract' or 'title').
    Returns:
        JSON string with run_key, total_labelled, agreed_count,
        agreement_rate, output_file, note and a 4-item preview.
    """
    agreement_threshold = 0.65  # single source of truth for "agreed"

    dbscan_file = f"dbscan_summaries_{run_key}.json"
    with open(dbscan_file, encoding="utf-8") as f:
        summaries = json.load(f)
    top = summaries[:MAX_LABEL_TOPICS]

    # Truncate to 500 chars to fit within Groq's 12000 TPM limit
    topics_for_prompt = [
        {
            "cluster_id": s["cluster_id"],
            "count": s["count"],
            "sentences": [sent[:500] + "..." for sent in s.get("nearest_sentences", [])[:3]],
        }
        for s in top
    ]

    # ── Council members ──────────────────────────────────────────────────────
    llm_a = _get_llm()
    llm_b = _get_council_llm_b()
    llm_c = _get_council_llm_c()

    council_prompt_tmpl = (
        "You are an expert thematic analyst reviewing HDBSCAN-discovered clusters "
        "from an academic corpus.\n\n"
        "Below are cluster IDs with their top-3 representative sentences:\n\n"
        "{topics_json}\n\n"
        "For EACH cluster, propose a concise label (3-6 words).\n"
        "Return ONLY a valid JSON array. Each element must have:\n"
        "  cluster_id: same integer as input\n"
        "  label: concise 3-6 word research area name\n"
        "  reasoning: one sentence explaining your choice\n\n"
        "Return ALL {n} clusters. Do not skip any."
    )
    # All three models get the identical prompt, so one template is shared.
    prompt = PromptTemplate(input_variables=["topics_json", "n"], template=council_prompt_tmpl)
    parser = JsonOutputParser()
    input_data = {
        "topics_json": json.dumps(topics_for_prompt, indent=2),
        "n": len(top),
    }

    def _index_by_cluster(results):
        """Map str(cluster_id) -> result, skipping malformed entries.

        LLM output is not guaranteed to follow the requested schema; entries
        that are not dicts or lack 'cluster_id' are dropped so the per-cluster
        fallbacks in the consensus step apply instead of raising here.
        """
        rows = results if isinstance(results, list) else []
        return {
            str(r["cluster_id"]): r
            for r in rows
            if isinstance(r, dict) and "cluster_id" in r
        }

    idx_a = _index_by_cluster((prompt | llm_a | parser).invoke(input_data) or [])
    idx_b = _index_by_cluster((prompt | llm_b | parser).invoke(input_data) or [])
    idx_c = _index_by_cluster((prompt | llm_c | parser).invoke(input_data) or [])

    # ── Consensus step ───────────────────────────────────────────────────────
    def _consensus(cluster_summary):
        cid = str(cluster_summary["cluster_id"])
        ra = idx_a.get(cid, {})
        rb = idx_b.get(cid, {})
        rc = idx_c.get(cid, {})
        label_a = ra.get("label", f"Cluster {cid}")
        label_b = rb.get("label", f"Cluster {cid}")
        label_c = rc.get("label", f"Cluster {cid}")

        s_ab = _council_agreement_score(label_a, label_b)
        s_bc = _council_agreement_score(label_b, label_c)
        s_ca = _council_agreement_score(label_c, label_a)
        max_score = max(s_ab, s_bc, s_ca)
        agreed = max_score >= agreement_threshold

        # Pick the label with the highest average agreement with the others.
        # max() returns the first maximal entry, so ties prefer A, then B.
        candidates = [
            ((s_ab + s_ca) / 2, label_a, ra),
            ((s_ab + s_bc) / 2, label_b, rb),
            ((s_bc + s_ca) / 2, label_c, rc),
        ]
        _, best_label, best_result = max(candidates, key=lambda cand: cand[0])
        consensus, chosen = (best_label, best_result) if agreed else (label_a, ra)

        ui = format_consensus_ui(label_a, label_b, label_c, agreed, max_score, ra.get("reasoning",""), rb.get("reasoning",""), rc.get("reasoning",""))
        return {
            "cluster_id": cluster_summary["cluster_id"],
            "count": cluster_summary["count"],
            "nearest_sentences": cluster_summary.get("nearest_sentences", [])[:3],
            "label_a": label_a,
            "label_b": label_b,
            "label_c": label_c,
            "consensus_label": consensus,
            "agreement_score": max_score,
            "council_ui": ui,
            "source": "dbscan_ai_council",
            "label": consensus,
            # Reasoning comes from the model whose label was selected, so the
            # label/reasoning pair is always consistent.
            "reasoning": chosen.get("reasoning", ""),
        }

    council_labels = list(map(_consensus, top))
    out_file = f"council_labels_{run_key}.json"
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(council_labels, f, indent=2)

    agreed_count = sum(1 for c in council_labels if c["agreement_score"] >= agreement_threshold)
    # max(..., 1) avoids division by zero when there are no clusters.
    agreement_rate = round(agreed_count / max(len(council_labels), 1) * 100, 1)
    return json.dumps({
        "run_key": run_key,
        "total_labelled": len(council_labels),
        "agreed_count": agreed_count,
        "agreement_rate": f"{agreement_rate}%",
        "output_file": out_file,
        "note": (
            "council_labels contain 'label' field for PAJAIS compatibility. "
            "Model A = Mistral Large. Model B = Groq Llama-3. Model C = Gemini Flash."
        ),
        "preview": council_labels[:4],
    }, indent=2)
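# NOTE: the "zero if/else, zero for/while, zero try/except" constraint stated in
# the file header is not fully met: e.g. run_dbscan_clustering uses for loops
# and if statements.
# (_get_council_llm_b uses a conditional expression, not an if/else block.)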