Spaces:
Running
Running
| # ============================================================================= | |
| # agent.py -- PAJAIS Research Intelligence Agent (v2 — with HDBSCAN clustering + Council) | |
| # Deterministic six-phase orchestration pipeline + Phase 2.5 (SPECTER2 + UMAP + HDBSCAN) + | |
| # Phase 6.5 (Agentic Council) | |
| # ============================================================================= | |
| import logging | |
| import logging.handlers | |
| import json | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from typing import Any, Callable, Dict, List, Optional | |
| import numpy as np | |
| import pandas as pd | |
| from tools import ( | |
| load_journal_csv, | |
| validate_dataframe, | |
| preprocess_corpus, | |
| run_lda_topic_model, | |
| build_topic_dataframe, | |
| auto_label_topic, | |
| map_topics_to_pajais, | |
| generate_taxonomy_map, | |
| compare_abstract_vs_title_themes, | |
| generate_section7_narrative, | |
| export_all_artifacts, | |
| PAJAIS_THEMES, | |
| # New unified pipeline (Groups 0, 8-11 in tools.py) | |
| build_title_abstract_column, | |
| embed_with_specter2, | |
| specter2_hdbscan_cluster_topics, | |
| get_cluster_summary, | |
| label_clusters_3llm, | |
| run_agentic_council, | |
| ) | |
| # Availability flag checked before the optional Phase 6.5 council step runs. | |
| # It is hard-coded to True because the helpers are imported unconditionally above. | |
| _ADDITIONS_AVAILABLE = True # everything is now in tools.py | |
| # --------------------------------------------------------------------------- | |
| # Logging setup | |
| # --------------------------------------------------------------------------- | |
def _setup_logger() -> logging.Logger:
    """Configure the 'PAJAISAgent' logger for console and, if possible, file output.

    The console handler emits INFO and above; the file handler appends DEBUG
    and above to ``outputs/agent.log``. File logging is best-effort: if the
    ``outputs`` directory or the log file cannot be created, the logger falls
    back to console-only output instead of failing at import time.

    Returns:
        The configured logger. Repeated calls return the same logger without
        attaching duplicate handlers.
    """
    log = logging.getLogger('PAJAISAgent')
    if log.handlers:
        # Already configured (e.g. module re-import); avoid duplicate handlers.
        return log

    log.setLevel(logging.DEBUG)
    fmt = logging.Formatter(
        '%(asctime)s - PAJAISAgent - %(levelname)s - %(message)s'
    )

    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(fmt)
    log.addHandler(ch)

    try:
        # Directory creation is inside the try so a read-only working
        # directory degrades to console logging rather than raising.
        Path('outputs').mkdir(exist_ok=True)
        fh = logging.FileHandler('outputs/agent.log', mode='a', encoding='utf-8')
    except OSError as err:
        log.warning(f"File logging disabled: {err}")
    else:
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(fmt)
        log.addHandler(fh)
    return log


logger = _setup_logger()
| # --------------------------------------------------------------------------- | |
| # Configuration | |
| # --------------------------------------------------------------------------- | |
@dataclass
class AnalysisConfig:
    """Configuration for the PAJAIS analysis pipeline.

    Declared as a dataclass so individual settings can be overridden at
    construction time, e.g. ``AnalysisConfig(n_topics=20)``; calling
    ``AnalysisConfig()`` with no arguments yields the defaults below.
    """

    # --- LDA ---
    n_topics: int = 40             # number of topics requested from the topic model
    # NOTE(review): not read anywhere in this module, and larger than n_topics —
    # confirm the intended value with the consumer of this setting.
    min_topics_required: int = 98
    n_lda_passes: int = 15         # passes forwarded to run_lda_topic_model
    random_state: int = 42         # seed shared by LDA, the NMF fallback and clustering
    output_dir: str = "outputs"    # directory that receives every exported artifact

    # --- Taxonomy ---
    pajais_match_threshold: float = 0.15   # not read in this module — presumably used by tools
    publishable_min_docs: int = 5          # a topic needs strictly more documents than this
    publishable_min_coherence: float = 0.3 # ...and strictly higher coherence than this

    # --- SPECTER2 + UMAP + HDBSCAN clustering ---
    specter2_batch_size: int = 8
    specter2_cache_dir: str = "outputs/specter_cache"
    umap_n_components: int = 50
    umap_n_neighbors: int = 15
    hdbscan_min_cluster_size: int = 5
    hdbscan_max_cluster_size: int = 100
    cluster_target_min: int = 15   # lower bound of the desired cluster count
    cluster_target_max: int = 30   # upper bound of the desired cluster count
    cosine_sim_low: float = 0.50
    cosine_sim_high: float = 0.60

    # --- LLM labeling (all free APIs) ---
    llm_label_max_clusters: int = 30

    # --- API keys (populated from env or UI) ---
    mistral_api_key: str = ""
    gemini_api_key: str = ""
    ollama_url: str = "http://localhost:11434"  # Local Ollama URL
| # --------------------------------------------------------------------------- | |
| # Agent class | |
| # --------------------------------------------------------------------------- | |
class PAJAISResearchAgent:
    """Deterministic research gap analysis pipeline + DBSCAN + Agentic Council.

    Every intermediate result is kept on the instance so phases can be run
    all at once (``run_full_pipeline``) or one at a time (``run_phase``).
    Phase 2.5 embeds each paper with SPECTER2 and clusters the embeddings
    with UMAP + HDBSCAN; Phase 6.5 optionally convenes the agentic council.
    """

    def __init__(self, config: Optional["AnalysisConfig"] = None) -> None:
        """Create an agent and make sure its output directory exists.

        Args:
            config: Pipeline settings; a default ``AnalysisConfig`` is built
                when omitted.
        """
        self.config = config or AnalysisConfig()
        Path(self.config.output_dir).mkdir(parents=True, exist_ok=True)

        # Core state (original pipeline)
        self.df: Optional[pd.DataFrame] = None
        self.validation: Optional[Dict[str, Any]] = None
        self.processed_texts: Optional[List[str]] = None
        self.lda_result: Optional[Dict[str, Any]] = None
        self.topic_df: Optional[pd.DataFrame] = None
        self.comparison_df: Optional[pd.DataFrame] = None
        self.taxonomy_map: Optional[Dict[str, Any]] = None
        self.narrative: str = ''
        self.artifacts: Dict[str, str] = {}
        self.supplementary_insights: Dict[str, Any] = {}

        # SPECTER2 + HDBSCAN state
        self.specter2_embeddings: Optional[np.ndarray] = None   # (N, 768)
        self.cluster_df: Optional[pd.DataFrame] = None          # doc-level
        self.cluster_summary_df: Optional[pd.DataFrame] = None  # cluster-level
        self.cluster_labeled_df: Optional[pd.DataFrame] = None  # with LLM labels

        # Agentic council
        self.council_result: Optional[Dict[str, str]] = None

        self._errors: List[str] = []
        self._warnings: List[str] = []
        self._phases_completed: List[int] = []

    # -----------------------------------------------------------------------
    # Public pipeline entry point
    # -----------------------------------------------------------------------
    def run_full_pipeline(
        self,
        file_path: str,
        on_progress: Optional[Callable[[int, str, float], None]] = None,
        run_council: bool = False,
    ) -> Dict[str, Any]:
        """Execute all phases sequentially.

        A failing phase is recorded in the error list and logged, and the
        pipeline carries on with the next phase (best-effort execution).

        Args:
            file_path: Path of the journal CSV handed to Phase 1.
            on_progress: Optional ``(phase, message, percent)`` callback.
                Exceptions raised by the callback are logged and ignored.
            run_council: When True, also run Phase 6.5 (Agentic Council).

        Returns:
            The summary dictionary produced by ``_build_summary``.
        """
        self._errors = []
        self._warnings = []
        self._phases_completed = []

        def _progress(phase: int, msg: str, pct: float) -> None:
            logger.info(f"[Phase {phase}] {msg} ({pct:.0f}%)")
            if on_progress:
                try:
                    on_progress(phase, msg, pct)
                except Exception as cb_err:
                    logger.warning(f"Progress callback error: {cb_err}")

        def _step(
            phase: int,
            label: str,
            action: Callable[[], None],
            start: Optional[tuple],
            done: tuple,
            record: bool = True,
        ) -> None:
            """Run one phase: announce it, execute it, record success or failure."""
            if start is not None:
                _progress(phase, *start)
            try:
                action()
            except Exception as e:
                self._errors.append(f"Phase {label} failed: {e}")
                logger.error(f"Phase {label} error: {e}", exc_info=True)
                return
            if record:
                self._phases_completed.append(phase)
            _progress(phase, *done)

        # Phase 1 — Data Ingestion
        _step(1, "1", lambda: self._phase1_data_ingestion(file_path),
              ("Loading and validating data...", 0.0), ("Data loaded.", 12.0))
        # Phase 2 — LDA Topic Modeling
        _step(2, "2", lambda: self._phase2_topic_modeling(on_progress=on_progress),
              ("Running LDA topic modeling...", 12.0), ("Topic modeling complete.", 28.0))
        # Phase 2.5 — clustering (reported under phase 2, not added to phases_completed)
        _step(2, "2.5", self._phase2_5_dbscan_clustering,
              ("Running DBSCAN clustering...", 28.0), ("DBSCAN clustering complete.", 38.0),
              record=False)
        # Phase 3 — Export Topic Table
        _step(3, "3", self._phase3_export_topic_table,
              ("Exporting topic review table...", 38.0), ("Topic table exported.", 48.0))
        # Phase 4 — Abstract vs Title Comparison
        _step(4, "4", self._phase4_abstract_title_comparison,
              ("Comparing abstracts vs titles...", 48.0), ("Abstract/title comparison done.", 60.0))
        # Phase 5 — PAJAIS Taxonomy Mapping
        _step(5, "5", self._phase5_taxonomy_mapping,
              ("Mapping to PAJAIS taxonomy...", 60.0), ("Taxonomy mapping complete.", 72.0))
        # Phase 5.5 — Mapping display (no start announcement)
        _step(5, "5.5", self._phase5_5_mapping_display,
              None, ("Mapping display saved.", 75.0), record=False)
        # Phase 6 — Narrative
        _step(6, "6", self._phase6_narrative,
              ("Generating narrative...", 75.0), ("Narrative generated.", 85.0))
        # Phase 6.5 — Agentic Council (optional; requires API keys)
        if run_council and _ADDITIONS_AVAILABLE:
            _step(6, "6.5", self._phase6_5_agentic_council,
                  ("Running Agentic Council...", 85.0), ("Council complete.", 93.0),
                  record=False)

        # Export artifacts. The two exports are guarded separately so that a
        # failure in the main export does not also drop the cluster CSVs.
        try:
            self.artifacts = export_all_artifacts(
                topic_df=self.topic_df if self.topic_df is not None else pd.DataFrame(),
                taxonomy_map=self.taxonomy_map or {},
                comparison_df=(
                    self.comparison_df if self.comparison_df is not None else pd.DataFrame()
                ),
                narrative=self.narrative,
                output_dir=self.config.output_dir,
            )
        except Exception as e:
            self._errors.append(f"Artifact export failed: {e}")
            logger.error(f"Artifact export error: {e}", exc_info=True)
        try:
            self._export_dbscan_artifacts()
        except Exception as e:
            self._errors.append(f"Artifact export failed: {e}")
            logger.error(f"Artifact export error: {e}", exc_info=True)

        # Supplementary analytics
        try:
            self._discover_supplementary_insights()
        except Exception as e:
            logger.warning(f"Supplementary insights failed: {e}")

        _progress(6, "Pipeline complete.", 100.0)
        return self._build_summary()

    # -----------------------------------------------------------------------
    # run_phase — single phase execution
    # -----------------------------------------------------------------------
    def run_phase(self, phase_num: int, **kwargs) -> Dict[str, Any]:
        """Run a single phase by number.

        Args:
            phase_num: One of 1, 2, 3, 4, 5, 6, or 25 / 65 for the
                half-phases 2.5 and 6.5.
            **kwargs: ``file_path`` is forwarded to Phase 1.

        Returns:
            ``{'success': True, 'phase': n}`` on success, otherwise a dict
            with ``success`` False and an ``error`` message.
        """
        phase_map = {
            1: lambda: self._phase1_data_ingestion(kwargs.get('file_path', '')),
            2: lambda: self._phase2_topic_modeling(),
            25: lambda: self._phase2_5_dbscan_clustering(),
            3: lambda: self._phase3_export_topic_table(),
            4: lambda: self._phase4_abstract_title_comparison(),
            5: lambda: self._phase5_taxonomy_mapping(),
            6: lambda: self._phase6_narrative(),
            65: lambda: self._phase6_5_agentic_council(),
        }
        if phase_num not in phase_map:
            return {'success': False, 'error': f'Unknown phase: {phase_num}'}
        try:
            phase_map[phase_num]()
            if phase_num not in self._phases_completed:
                self._phases_completed.append(phase_num)
            return {'success': True, 'phase': phase_num}
        except Exception as e:
            logger.error(f"run_phase({phase_num}) failed: {e}", exc_info=True)
            return {'success': False, 'phase': phase_num, 'error': str(e)}

    # -----------------------------------------------------------------------
    # Phase implementations — original
    # -----------------------------------------------------------------------
    def _phase1_data_ingestion(self, file_path: str) -> None:
        """Phase 1: load the CSV, validate it and collect validation warnings."""
        logger.info(f"Phase 1: Loading {file_path}")
        self.df = load_journal_csv(file_path)
        self.validation = validate_dataframe(self.df)
        if self.validation:
            for w in self.validation.get('warnings', []):
                self._warnings.append(w)
                logger.warning(f"Validation warning: {w}")
            row_count = self.validation.get('row_count', 0)
            if row_count < 50:
                logger.warning(f"Small dataset ({row_count} rows). Continuing.")
        logger.info(f"Phase 1 complete: {len(self.df)} rows loaded.")

    def _phase2_topic_modeling(
        self,
        on_progress: Optional[Callable] = None
    ) -> None:
        """Phase 2: preprocess the texts and fit the topic model.

        Abstracts are used when at least five are non-empty; otherwise the
        titles are used instead. If the LDA call raises, an NMF model is
        fitted as a fallback.

        Args:
            on_progress: Optional ``(phase, message, percent)`` callback.
                Exceptions raised by it are logged and do not abort the phase.

        Raises:
            ValueError: If no data has been loaded yet.
        """
        if self.df is None or self.df.empty:
            raise ValueError("Phase 2: No data loaded. Run Phase 1 first.")

        def _notify(msg: str, pct: float) -> None:
            # A broken UI callback must not abort topic modeling.
            if not on_progress:
                return
            try:
                on_progress(2, msg, pct)
            except Exception as cb_err:
                logger.warning(f"Progress callback error: {cb_err}")

        abstracts = self.df.get('abstract', pd.Series(dtype=str)).fillna('').tolist()
        non_empty_abstracts = [t for t in abstracts if isinstance(t, str) and t.strip()]
        if len(non_empty_abstracts) < 5:
            logger.warning("Abstracts mostly empty; falling back to titles.")
            titles = self.df.get('title', pd.Series(dtype=str)).fillna('').tolist()
            texts_to_process = [t for t in titles if isinstance(t, str) and t.strip()]
        else:
            texts_to_process = non_empty_abstracts

        logger.info(f"Phase 2: Preprocessing {len(texts_to_process)} texts...")
        _notify("Preprocessing texts...", 14.0)
        self.processed_texts = preprocess_corpus(texts_to_process, n_jobs=1)
        non_empty_processed = [t for t in self.processed_texts if t.strip()]

        _notify("Running LDA topic modeling...", 20.0)
        logger.info(f"Phase 2: Running LDA with n_topics={self.config.n_topics}")
        try:
            self.lda_result = run_lda_topic_model(
                texts=non_empty_processed,
                n_topics=self.config.n_topics,
                n_passes=self.config.n_lda_passes,
                random_state=self.config.random_state,
            )
        except Exception as gensim_err:
            logger.warning(f"Gensim LDA failed ({gensim_err}), falling back to NMF.")
            self.lda_result = self._fallback_nmf(non_empty_processed)

        _notify("Building topic dataframe...", 26.0)
        self.topic_df = build_topic_dataframe(self.lda_result)
        logger.info(f"Phase 2 complete: {len(self.topic_df)} topics extracted.")

    def _fallback_nmf(self, texts: List[str]) -> Dict[str, Any]:
        """Fit a TF-IDF + NMF topic model as a substitute for LDA.

        Args:
            texts: Preprocessed documents.

        Returns:
            A dict with the same keys as the LDA result (``model``,
            ``corpus``, ``dictionary``, ``topic_words``, ``coherence_score``,
            ``doc_topics``). ``coherence_score`` is a fixed placeholder of
            0.25. If NMF also fails, an empty result with a ``None`` model is
            returned instead of raising.
        """
        logger.info("Attempting NMF fallback topic modeling...")
        try:
            from sklearn.feature_extraction.text import TfidfVectorizer
            from sklearn.decomposition import NMF

            vectorizer = TfidfVectorizer(max_features=3000, min_df=2)
            tfidf = vectorizer.fit_transform(texts)
            feature_names = vectorizer.get_feature_names_out()

            # At most the configured topic count, but no more than roughly one
            # topic per three documents (with a floor of five).
            n_topics = min(self.config.n_topics, max(5, tfidf.shape[0] // 3))
            nmf = NMF(n_components=n_topics, random_state=self.config.random_state)
            nmf.fit(tfidf)

            topic_words = []
            for topic in nmf.components_:
                top_indices = topic.argsort()[:-16:-1]  # 15 heaviest terms, descending
                topic_words.append([
                    (feature_names[i], float(topic[i])) for i in top_indices
                ])

            doc_topic_matrix = nmf.transform(tfidf)
            doc_topics = [
                [(i, float(prob)) for i, prob in enumerate(row)]
                for row in doc_topic_matrix
            ]
            return {
                'model': nmf, 'corpus': tfidf, 'dictionary': None,
                'topic_words': topic_words, 'coherence_score': 0.25,
                'doc_topics': doc_topics,
            }
        except Exception as e:
            logger.error(f"NMF fallback failed: {e}")
            return {
                'model': None, 'corpus': [], 'dictionary': None,
                'topic_words': [], 'coherence_score': 0.0, 'doc_topics': []
            }

    # -----------------------------------------------------------------------
    # Phase 2.5 — DBSCAN Clustering (NEW)
    # -----------------------------------------------------------------------
    def _phase2_5_dbscan_clustering(self) -> None:
        """Phase 2.5: SPECTER2 embeddings → UMAP → HDBSCAN (15-30 clusters).

        Replaces ``self.df`` with the frame returned by
        ``build_title_abstract_column`` and stores the embeddings, the
        document-level cluster frame and the cluster summary on the instance.

        Raises:
            ValueError: If no data has been loaded yet.
        """
        if self.df is None or self.df.empty:
            raise ValueError("Phase 2.5: No data loaded. Run Phase 1 first.")

        logger.info("Phase 2.5: Building title+abstract combined column...")
        df_ta = build_title_abstract_column(self.df)
        # Store back so downstream code can access title_abstract and doi_key
        self.df = df_ta

        logger.info("Phase 2.5: Generating SPECTER2 embeddings (one per paper)...")
        texts = df_ta['title_abstract'].tolist()
        self.specter2_embeddings = embed_with_specter2(
            texts=texts,
            cache_dir=self.config.specter2_cache_dir,
            batch_size=self.config.specter2_batch_size,
        )

        logger.info("Phase 2.5: Running UMAP + HDBSCAN clustering...")
        self.cluster_df = specter2_hdbscan_cluster_topics(
            df=df_ta,
            embeddings=self.specter2_embeddings,
            min_cluster_size=self.config.hdbscan_min_cluster_size,
            max_cluster_size=self.config.hdbscan_max_cluster_size,
            target_min_clusters=self.config.cluster_target_min,
            target_max_clusters=self.config.cluster_target_max,
            cosine_sim_low=self.config.cosine_sim_low,
            cosine_sim_high=self.config.cosine_sim_high,
            umap_n_components=self.config.umap_n_components,
            umap_n_neighbors=self.config.umap_n_neighbors,
            random_state=self.config.random_state,
        )
        self.cluster_summary_df = get_cluster_summary(self.cluster_df)

        # Label -1 is excluded from the cluster count (treated as noise).
        n_clusters = len(set(self.cluster_df['cluster_final']) - {-1})
        n_noise = int(self.cluster_df['is_noise'].sum())
        logger.info(f"Phase 2.5 complete: {n_clusters} clusters, {n_noise} noise docs.")

    def run_llm_cluster_labeling(
        self,
        mistral_key: str = '',
        gemini_key: str = '',
        ollama_url: str = '',
    ) -> Optional[pd.DataFrame]:
        """Label clusters using 3 LLMs: Mistral + Gemini + Ollama.

        Majority vote selects the final label; all 3 candidates stored.
        Can be called independently after phase 2.5.

        Args:
            mistral_key: Mistral API key; falls back to the config value.
            gemini_key: Gemini API key; falls back to the config value.
            ollama_url: Ollama base URL; falls back to the config value.

        Returns:
            The labeled cluster frame (also written to ``cluster_labels.csv``
            in the output directory), or None when clustering results or
            embeddings are not available yet.
        """
        if self.cluster_df is None or self.cluster_summary_df is None:
            logger.warning("LLM labeling: run SPECTER2/HDBSCAN clustering first.")
            return None
        if self.specter2_embeddings is None:
            logger.warning("LLM labeling: specter2_embeddings not available.")
            return None

        self.cluster_labeled_df = label_clusters_3llm(
            cluster_df=self.cluster_df,
            cluster_summary_df=self.cluster_summary_df.copy(),
            embeddings=self.specter2_embeddings,
            mistral_api_key=mistral_key or self.config.mistral_api_key,
            gemini_api_key=gemini_key or self.config.gemini_api_key,
            ollama_url=ollama_url or self.config.ollama_url,
            max_clusters=self.config.llm_label_max_clusters,
        )

        out = Path(self.config.output_dir) / 'cluster_labels.csv'
        try:
            self.cluster_labeled_df.to_csv(out, index=False)
            logger.info(f"Saved cluster_labels.csv ({len(self.cluster_labeled_df)} rows)")
        except OSError as e:
            logger.error(f"Could not save cluster_labels.csv: {e}")
        return self.cluster_labeled_df

    # -----------------------------------------------------------------------
    # Phase implementations — original (3-6)
    # -----------------------------------------------------------------------
    def _phase3_export_topic_table(self) -> None:
        """Phase 3: write ``topic_review_table.csv`` to the output directory.

        Only the review columns that exist in ``topic_df`` are exported. Rows
        are ordered by descending ``doc_count`` when that column is present;
        otherwise the existing row order is kept.

        Raises:
            ValueError: If no topic table is available.
        """
        if self.topic_df is None or self.topic_df.empty:
            raise ValueError("Phase 3: No topic_df available. Run Phase 2 first.")

        out_path = Path(self.config.output_dir) / 'topic_review_table.csv'
        out_path.parent.mkdir(parents=True, exist_ok=True)

        cols = ['topic_id', 'label', 'top_words', 'coherence', 'doc_count']
        available_cols = [c for c in cols if c in self.topic_df.columns]
        export_df = self.topic_df[available_cols]
        # Sorting on a missing column would raise KeyError, defeating the
        # column filtering above.
        if 'doc_count' in available_cols:
            export_df = export_df.sort_values('doc_count', ascending=False)

        export_df.to_csv(out_path, index=False)
        logger.info(f"Phase 3: Saved topic_review_table.csv ({len(export_df)} rows)")

    def _phase4_abstract_title_comparison(self) -> None:
        """Phase 4: compare abstract-derived and title-derived themes.

        Raises:
            ValueError: If no data has been loaded yet.
        """
        if self.df is None or self.df.empty:
            raise ValueError("Phase 4: No data loaded.")
        self.comparison_df = compare_abstract_vs_title_themes(self.df, n_topics_each=20)
        logger.info(f"Phase 4: Comparison complete. {len(self.comparison_df)} rows.")

    def _phase5_taxonomy_mapping(self) -> None:
        """Phase 5: map topics to PAJAIS themes and save ``taxonomy_map.json``.

        A failure to write the JSON file is logged but does not fail the phase.

        Raises:
            ValueError: If no topic table is available.
        """
        if self.topic_df is None or self.topic_df.empty:
            raise ValueError("Phase 5: No topic_df available.")

        self.topic_df = map_topics_to_pajais(self.topic_df, PAJAIS_THEMES)
        self.taxonomy_map = generate_taxonomy_map(self.topic_df)

        out_path = Path(self.config.output_dir) / 'taxonomy_map.json'
        out_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            with open(out_path, 'w', encoding='utf-8') as f:
                json.dump(self.taxonomy_map, f, indent=2, default=str)
            logger.info("Phase 5: Saved taxonomy_map.json")
        except (OSError, TypeError) as e:
            logger.error(f"Phase 5: Could not save taxonomy_map.json: {e}")

    def _phase5_5_mapping_display(self) -> None:
        """Phase 5.5: write ``pajais_mapping.csv`` with a ``publishable`` flag.

        A topic is flagged publishable when its status is ``'NOVEL'`` and its
        document count and coherence both exceed the configured minimums. The
        method returns silently when no topic table is available.
        """
        if self.topic_df is None or self.topic_df.empty:
            return

        display_cols = ['label', 'pajais_theme', 'status', 'match_score', 'doc_count']
        available = [c for c in display_cols if c in self.topic_df.columns]
        display_df = self.topic_df[available].copy()

        if 'coherence' in self.topic_df.columns:
            # .get() with scalar defaults keeps the expression valid when
            # 'status' or 'doc_count' is missing (the flag is then all False).
            display_df['publishable'] = (
                (self.topic_df.get('status', '') == 'NOVEL') &
                (self.topic_df.get('doc_count', 0) > self.config.publishable_min_docs) &
                (self.topic_df.get('coherence', 0.0) > self.config.publishable_min_coherence)
            )
        else:
            display_df['publishable'] = False

        out_path = Path(self.config.output_dir) / 'pajais_mapping.csv'
        display_df.to_csv(out_path, index=False)
        logger.info(f"Phase 5.5: Saved pajais_mapping.csv ({len(display_df)} rows)")

    def _phase6_narrative(self) -> None:
        """Phase 6: generate the narrative from whatever results exist.

        Missing inputs are replaced by an empty dict / empty DataFrame so the
        narrative generator is always called.
        """
        taxonomy_map = self.taxonomy_map or {}
        comparison_df = self.comparison_df if self.comparison_df is not None else pd.DataFrame()
        topic_df = self.topic_df if self.topic_df is not None else pd.DataFrame()
        self.narrative = generate_section7_narrative(
            taxonomy_map=taxonomy_map,
            comparison_df=comparison_df,
            topic_df=topic_df,
        )
        logger.info(f"Phase 6: Narrative generated ({len(self.narrative)} characters).")

    # -----------------------------------------------------------------------
    # Phase 6.5 — Agentic Council (NEW)
    # -----------------------------------------------------------------------
    def _phase6_5_agentic_council(self) -> None:
        """Phase 6.5: run the multi-model agentic council.

        The configured Mistral and Gemini API keys and the Ollama URL are
        handed to ``run_agentic_council``; the result is stored on the
        instance and written to ``council_report.json``. A failure to write
        the report is logged but does not fail the phase.

        Raises:
            ValueError: If taxonomy mapping (Phase 5) has not produced a map.
        """
        if not _ADDITIONS_AVAILABLE:
            logger.warning("Phase 6.5: tools_additions not available; skipping council.")
            return
        if not self.taxonomy_map:
            raise ValueError("Phase 6.5: Run taxonomy mapping first (Phase 5).")

        logger.info("Phase 6.5: Convening Agentic Council…")
        self.council_result = run_agentic_council(
            taxonomy_map=self.taxonomy_map,
            topic_df=self.topic_df,
            mistral_api_key=self.config.mistral_api_key,
            gemini_api_key=self.config.gemini_api_key,
            ollama_url=self.config.ollama_url,
        )

        # Persist council report. Mirrors Phase 5: non-JSON values are
        # stringified and serialization errors are logged, so a save problem
        # never discards a council run that already succeeded.
        out = Path(self.config.output_dir) / "council_report.json"
        try:
            with open(out, 'w', encoding='utf-8') as f:
                json.dump(self.council_result, f, indent=2, ensure_ascii=False, default=str)
            logger.info("Phase 6.5: Saved council_report.json")
        except (OSError, TypeError) as e:
            logger.error(f"Phase 6.5: Could not save council_report.json: {e}")

    # -----------------------------------------------------------------------
    # DBSCAN artifact export helper
    # -----------------------------------------------------------------------
    def _export_dbscan_artifacts(self) -> None:
        """Write the cluster CSVs and register them in ``self.artifacts``.

        Frames that are missing or empty are skipped; write errors are logged.
        """
        out_dir = Path(self.config.output_dir)
        exports = (
            ("cluster_documents", self.cluster_df),
            ("cluster_summary", self.cluster_summary_df),
        )
        for key, frame in exports:
            if frame is None or frame.empty:
                continue
            target = out_dir / f"{key}.csv"
            try:
                frame.to_csv(target, index=False)
                self.artifacts[key] = str(target)
                logger.info(f"Exported {key}.csv")
            except OSError as e:
                logger.error(f"Could not save {key}.csv: {e}")

    # -----------------------------------------------------------------------
    # Supplementary insights
    # -----------------------------------------------------------------------
    def _discover_supplementary_insights(self) -> None:
        """Compute optional summary statistics from the pipeline results.

        Each insight is computed independently; a failure in one is logged
        and does not prevent the others. Results are stored in
        ``self.supplementary_insights`` under the keys ``blind_spot_theme``,
        ``golden_year``, ``iceberg_topics``, ``top_publishable_gap`` and
        ``dbscan_stats`` (each present only when it could be computed).
        """
        insights: Dict[str, Any] = {}

        # Largest NOVEL topic by document count.
        try:
            if self.topic_df is not None and 'status' in self.topic_df.columns:
                novel_df = self.topic_df[self.topic_df['status'] == 'NOVEL']
                if not novel_df.empty:
                    top_novel = novel_df.sort_values('doc_count', ascending=False).iloc[0]
                    insights['blind_spot_theme'] = {
                        'label': top_novel.get('label', ''),
                        'doc_count': int(top_novel.get('doc_count', 0)),
                        'top_words': top_novel.get('top_words', ''),
                    }
        except Exception as e:
            logger.warning(f"blind_spot_theme computation failed: {e}")

        # Year with the highest mean topic-distribution entropy.
        try:
            if self.df is not None and 'year' in self.df.columns:
                years = pd.to_numeric(self.df['year'], errors='coerce').dropna()
                if not years.empty and self.lda_result and self.lda_result.get('doc_topics'):
                    # NOTE(review): years and doc_topics are paired by position
                    # although both were filtered independently (NaN years
                    # dropped here, empty texts dropped in Phase 2) — confirm
                    # the rows actually line up.
                    year_list = years.tolist()
                    doc_topics = self.lda_result['doc_topics']
                    n_docs = min(len(year_list), len(doc_topics))
                    year_entropy: Dict[int, List[float]] = {}
                    for i in range(n_docs):
                        probs = [p for _, p in doc_topics[i]]
                        if not probs:
                            continue
                        probs_arr = np.array(probs)
                        total = probs_arr.sum()
                        # An all-zero (or non-finite) weight vector cannot be
                        # normalised; skipping it keeps NaN out of the averages.
                        if not np.isfinite(total) or total <= 0:
                            continue
                        probs_arr = probs_arr / total
                        entropy = float(-np.sum(probs_arr * np.log(probs_arr + 1e-9)))
                        year_entropy.setdefault(int(year_list[i]), []).append(entropy)
                    if year_entropy:
                        avg_entropy = {
                            yr: float(np.mean(ents)) for yr, ents in year_entropy.items()
                        }
                        golden_year = max(avg_entropy, key=lambda y: avg_entropy[y])
                        insights['golden_year'] = {
                            'year': golden_year,
                            'entropy': round(avg_entropy[golden_year], 4),
                        }
        except Exception as e:
            logger.warning(f"golden_year computation failed: {e}")

        # Labels at least three times more frequent in abstracts than titles.
        try:
            if self.comparison_df is not None and not self.comparison_df.empty:
                ab = self.comparison_df[self.comparison_df['source'] == 'abstract'][
                    ['label', 'doc_count']
                ].rename(columns={'doc_count': 'ab_count'})
                ti = self.comparison_df[self.comparison_df['source'] == 'title'][
                    ['label', 'doc_count']
                ].rename(columns={'doc_count': 'ti_count'})
                merged = ab.merge(ti, on='label', how='inner')
                if not merged.empty:
                    # +1 keeps the ratio defined when a title count is zero.
                    merged['ratio'] = merged['ab_count'] / (merged['ti_count'] + 1)
                    iceberg = merged[merged['ratio'] >= 3.0].sort_values('ratio', ascending=False)
                    insights['iceberg_topics'] = iceberg.to_dict('records')
        except Exception as e:
            logger.warning(f"iceberg_topics computation failed: {e}")

        # Most coherent publishable novel theme.
        try:
            if self.taxonomy_map:
                publishable = self.taxonomy_map.get('publishable_novel_themes', [])
                if publishable:
                    top_pub = max(publishable, key=lambda x: x.get('coherence', 0.0))
                    insights['top_publishable_gap'] = top_pub
        except Exception as e:
            logger.warning(f"top_publishable_gap computation failed: {e}")

        # NEW: DBSCAN stats
        try:
            if self.cluster_df is not None and not self.cluster_df.empty:
                n_clusters = len(set(self.cluster_df["cluster_final"]) - {-1})
                n_noise = int(self.cluster_df["is_noise"].sum())
                largest = self.cluster_df["cluster_final"].value_counts()
                largest = largest[largest.index != -1]
                insights['dbscan_stats'] = {
                    'n_clusters': n_clusters,
                    'n_noise': n_noise,
                    'largest_cluster_size': int(largest.iloc[0]) if not largest.empty else 0,
                }
        except Exception as e:
            logger.warning(f"dbscan_stats failed: {e}")

        self.supplementary_insights = insights
        logger.info(f"Supplementary insights computed: {list(insights.keys())}")

    # -----------------------------------------------------------------------
    # Summary builder
    # -----------------------------------------------------------------------
    def _build_summary(self) -> Dict[str, Any]:
        """Assemble the result dictionary returned by ``run_full_pipeline``.

        ``success`` is True only when no phase recorded an error. The gap
        counts come from the ``gap_analysis`` entry of the taxonomy map and
        default to zero when it is missing.
        """
        gap = {}
        if self.taxonomy_map:
            gap = self.taxonomy_map.get('gap_analysis', {})
        return {
            'success': len(self._errors) == 0,
            'phases_completed': self._phases_completed,
            'topic_count': len(self.topic_df) if self.topic_df is not None else 0,
            'novel_count': gap.get('novel_count', 0),
            'mapped_count': gap.get('mapped_count', 0),
            'pajais_coverage_pct': gap.get('coverage_pct', 0.0),
            'artifacts': self.artifacts,
            'errors': self._errors,
            'warnings': self._warnings,
            'topic_df': self.topic_df,
            'comparison_df': self.comparison_df,
            'taxonomy_map': self.taxonomy_map,
            'narrative': self.narrative,
            'supplementary_insights': self.supplementary_insights,
            'validation': self.validation,
            # NEW
            'cluster_df': self.cluster_df,
            'cluster_summary_df': self.cluster_summary_df,
            'council_result': self.council_result,
        }