Spaces:
Running
Running
| # ============================================================================= | |
| # app.py -- PAJAIS Research Intelligence Agent | |
| # Gradio 4.x web application for HuggingFace Spaces | |
| # FIXES: Light/readable theme + working CSV/JSON exports | |
| # BUGFIXES (v2): | |
| # Bug 1 (tools.py generate_taxonomy_map) - DataFrame.get() -> KeyError in Phase 5 | |
| # Bug 2 (tools.py generate_section7_narrative) - DataFrame.get() -> crash in Phase 6 | |
| # Bug 3 (agent.py _phase5_5_mapping_display) - DataFrame.get() -> pajais_mapping.csv never written | |
| # Bug 4 (app.py handle_mapping) - returned 6 values but outputs= expected 5 | |
| # Bug 5 (app.py DownloadButton) - static value= pointed to nonexistent paths at startup | |
| # ADDITIONS (v3): | |
| # Tab A - 🔵 DBSCAN Clusters (Phase 2.5: Semantic Clustering via DBSCAN) | |
| # Tab B - 🧠 Agentic Council (Phase 6.5: Multi-Model Research Council) | |
| # ============================================================================= | |
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib | |
| matplotlib.use('Agg') # Must appear before pyplot import | |
| import matplotlib.pyplot as plt | |
| import matplotlib.patches as mpatches | |
| import zipfile | |
| import tempfile | |
| import json | |
| import logging | |
| import os | |
| import random | |
| from pathlib import Path | |
| from typing import Optional, Tuple, Dict, Any | |
| from agent import PAJAISResearchAgent, AnalysisConfig | |
| from tools import ( | |
| load_journal_csv, validate_dataframe, | |
| PAJAIS_THEMES, export_all_artifacts, | |
| # Unified clustering pipeline (all now in tools.py) | |
| build_title_abstract_column, | |
| embed_with_specter2, | |
| specter2_hdbscan_cluster_topics, | |
| get_cluster_summary, | |
| label_clusters_3llm, | |
| run_agentic_council, | |
| ) | |
| # Module-level logger, named after this module. | |
| logger = logging.getLogger(__name__) | |
| # --------------------------------------------------------------------------- | |
| # Ensure outputs directory exists at startup | |
| # (relative path: resolved against the process working directory) | |
| # --------------------------------------------------------------------------- | |
| OUTPUTS_DIR = Path("outputs") | |
| OUTPUTS_DIR.mkdir(exist_ok=True) | |
| # --------------------------------------------------------------------------- | |
| # API keys -- loaded from HuggingFace Secrets (environment variables). | |
| # Set these in your Space: Settings -> Variables and Secrets | |
| # MISTRAL_API_KEY -> your Mistral key (sk-...) | |
| # GEMINI_API_KEY -> your Google key (AIza...) | |
| # Each key falls back to an empty string when the variable is unset; | |
| # OLLAMA_URL falls back to a local endpoint on port 11434. | |
| # --------------------------------------------------------------------------- | |
| MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "") | |
| GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "") | |
| OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434") | |
| # --------------------------------------------------------------------------- | |
| # Custom CSS -- light, readable theme that works on HuggingFace Spaces | |
| # --------------------------------------------------------------------------- | |
| CUSTOM_CSS = """ | |
| /* ββ Reset Gradio dark overrides βββββββββββββββββββββββββββββββββββββββ */ | |
| .gradio-container, | |
| .gradio-container *, | |
| body { | |
| color: #1a1a2e !important; | |
| } | |
| /* ββ Page background βββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .gradio-container { | |
| background: #f0f4f8 !important; | |
| font-family: 'Segoe UI', system-ui, sans-serif !important; | |
| max-width: 1200px !important; | |
| margin: 0 auto !important; | |
| } | |
| /* ββ Tabs ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .tab-nav { | |
| background: #ffffff !important; | |
| border-bottom: 2px solid #c9d6e3 !important; | |
| } | |
| .tab-nav button { | |
| background: #ffffff !important; | |
| color: #3a4a5c !important; | |
| border: none !important; | |
| font-weight: 500 !important; | |
| padding: 10px 18px !important; | |
| font-family: 'Segoe UI', system-ui, sans-serif !important; | |
| } | |
| .tab-nav button.selected, | |
| .tab-nav button:focus { | |
| background: #1a56db !important; | |
| color: #ffffff !important; | |
| border-radius: 6px 6px 0 0 !important; | |
| } | |
| /* ββ Buttons βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .gr-button-primary, | |
| button[variant="primary"], | |
| button.primary { | |
| background: #1a56db !important; | |
| color: #ffffff !important; | |
| border: none !important; | |
| border-radius: 8px !important; | |
| font-weight: 600 !important; | |
| padding: 10px 20px !important; | |
| } | |
| .gr-button-primary:hover, | |
| button[variant="primary"]:hover { | |
| background: #1341b0 !important; | |
| } | |
| .gr-button-secondary, | |
| button[variant="secondary"], | |
| button.secondary { | |
| background: #ffffff !important; | |
| color: #1a56db !important; | |
| border: 2px solid #1a56db !important; | |
| border-radius: 8px !important; | |
| font-weight: 500 !important; | |
| padding: 8px 18px !important; | |
| } | |
| .gr-button-secondary:hover { | |
| background: #e8f0fe !important; | |
| } | |
| /* ββ Inputs / Textboxes ββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| input, | |
| textarea, | |
| .gr-textbox, | |
| .gr-input, | |
| .gr-box { | |
| background: #ffffff !important; | |
| color: #1a1a2e !important; | |
| border: 1px solid #c9d6e3 !important; | |
| border-radius: 6px !important; | |
| font-family: 'Courier New', monospace !important; | |
| } | |
| input:focus, | |
| textarea:focus { | |
| border-color: #1a56db !important; | |
| outline: none !important; | |
| box-shadow: 0 0 0 3px rgba(26,86,219,0.15) !important; | |
| } | |
| /* ββ DataFrames / Tables βββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .gr-dataframe, | |
| .gr-dataframe table { | |
| background: #ffffff !important; | |
| color: #1a1a2e !important; | |
| border: 1px solid #c9d6e3 !important; | |
| border-radius: 8px !important; | |
| overflow: hidden !important; | |
| } | |
| .gr-dataframe th { | |
| background: #1a56db !important; | |
| color: #ffffff !important; | |
| font-weight: 600 !important; | |
| padding: 10px 14px !important; | |
| border: none !important; | |
| } | |
| .gr-dataframe td { | |
| background: #ffffff !important; | |
| color: #1a1a2e !important; | |
| border-bottom: 1px solid #e8eef5 !important; | |
| padding: 8px 14px !important; | |
| } | |
| .gr-dataframe tr:nth-child(even) td { | |
| background: #f7fafc !important; | |
| } | |
| .gr-dataframe tr:hover td { | |
| background: #e8f0fe !important; | |
| } | |
| /* ββ Cards / Panels ββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .metric-card { | |
| background: #ffffff; | |
| border: 1px solid #c9d6e3; | |
| border-radius: 12px; | |
| padding: 24px 20px; | |
| text-align: center; | |
| margin: 6px; | |
| box-shadow: 0 2px 8px rgba(0,0,0,0.06); | |
| } | |
| .metric-value { | |
| font-size: 2.4em; | |
| font-weight: 700; | |
| color: #1a56db; | |
| font-family: 'Georgia', serif; | |
| display: block; | |
| } | |
| .metric-label { | |
| color: #5a6a7a; | |
| font-size: 0.9em; | |
| margin-top: 6px; | |
| display: block; | |
| font-weight: 500; | |
| } | |
| /* ββ Status boxes ββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .error-box { | |
| background: #fff0f0; | |
| border: 1px solid #e53e3e; | |
| border-left: 4px solid #e53e3e; | |
| border-radius: 6px; | |
| padding: 12px 16px; | |
| color: #c53030; | |
| font-weight: 500; | |
| } | |
| .success-box { | |
| background: #f0fff4; | |
| border: 1px solid #38a169; | |
| border-left: 4px solid #38a169; | |
| border-radius: 6px; | |
| padding: 12px 16px; | |
| color: #276749; | |
| font-weight: 500; | |
| } | |
| .info-panel { | |
| background: #ebf5fb; | |
| border: 1px solid #bee3f8; | |
| border-left: 4px solid #1a56db; | |
| border-radius: 8px; | |
| padding: 16px; | |
| margin: 10px 0; | |
| color: #1a1a2e; | |
| } | |
| /* ββ Tags ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .novel-tag { | |
| background: #fff0f0; | |
| color: #c53030; | |
| padding: 3px 10px; | |
| border-radius: 12px; | |
| font-size: 0.82em; | |
| font-weight: 600; | |
| border: 1px solid #fed7d7; | |
| } | |
| .mapped-tag { | |
| background: #e6fffa; | |
| color: #234e52; | |
| padding: 3px 10px; | |
| border-radius: 12px; | |
| font-size: 0.82em; | |
| font-weight: 600; | |
| border: 1px solid #b2f5ea; | |
| } | |
| /* ββ Section headings ββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .section-header { | |
| font-family: 'Georgia', serif; | |
| color: #1a1a2e; | |
| border-bottom: 3px solid #1a56db; | |
| padding-bottom: 8px; | |
| margin-bottom: 18px; | |
| } | |
| /* ββ Accordion βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .gr-accordion { | |
| background: #ffffff !important; | |
| border: 1px solid #c9d6e3 !important; | |
| border-radius: 8px !important; | |
| color: #1a1a2e !important; | |
| } | |
| .gr-accordion summary { | |
| color: #1a1a2e !important; | |
| font-weight: 600 !important; | |
| } | |
| /* ββ Markdown prose ββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .gr-markdown, | |
| .prose { | |
| color: #1a1a2e !important; | |
| } | |
| .gr-markdown h1, .gr-markdown h2, .gr-markdown h3 { | |
| color: #1a1a2e !important; | |
| } | |
| .gr-markdown a { | |
| color: #1a56db !important; | |
| } | |
| /* ββ File upload area ββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .gr-file { | |
| background: #ffffff !important; | |
| border: 2px dashed #c9d6e3 !important; | |
| border-radius: 10px !important; | |
| color: #1a1a2e !important; | |
| } | |
| .gr-file:hover { | |
| border-color: #1a56db !important; | |
| background: #f0f6ff !important; | |
| } | |
| /* ββ Plot containers βββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .gr-plot { | |
| background: #ffffff !important; | |
| border: 1px solid #c9d6e3 !important; | |
| border-radius: 8px !important; | |
| padding: 12px !important; | |
| } | |
| /* ββ Print-ready summary βββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .print-ready { | |
| background: #ffffff; | |
| color: #1a1a2e; | |
| font-family: 'Times New Roman', serif; | |
| padding: 28px; | |
| border-radius: 6px; | |
| border: 1px solid #c9d6e3; | |
| } | |
| /* ββ Download buttons ββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .gr-download-button { | |
| background: #f0f6ff !important; | |
| color: #1a56db !important; | |
| border: 1px solid #1a56db !important; | |
| border-radius: 8px !important; | |
| font-weight: 500 !important; | |
| } | |
| .gr-download-button:hover { | |
| background: #1a56db !important; | |
| color: #ffffff !important; | |
| } | |
| /* ββ Labels ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| label, .gr-label { | |
| color: #2d3748 !important; | |
| font-weight: 600 !important; | |
| } | |
| """ | |
| # --------------------------------------------------------------------------- | |
| # Helper functions | |
| # --------------------------------------------------------------------------- | |
def _make_agent() -> PAJAISResearchAgent:
    """Build a new ``PAJAISResearchAgent`` around a default ``AnalysisConfig``."""
    default_config = AnalysisConfig()
    return PAJAISResearchAgent(default_config)
def _ensure_output_dir() -> None:
    """Create ``OUTPUTS_DIR`` if it is missing; do nothing when it already exists."""
    OUTPUTS_DIR.mkdir(exist_ok=True)
def _safe_save_csv(df: pd.DataFrame, filename: str) -> str:
    """Write *df* as CSV (without the index) into the outputs directory.

    Returns:
        The destination path, as a string.
    """
    _ensure_output_dir()
    destination = OUTPUTS_DIR / filename
    df.to_csv(destination, index=False)
    return str(destination)
def _safe_save_json(data: dict, filename: str) -> str:
    """Serialize *data* as indented JSON into the outputs directory.

    Values the ``json`` module cannot encode natively are converted by a
    fallback hook: NumPy scalars become Python ``int``/``float``/``bool``,
    arrays and Series become lists, DataFrames become lists of row dicts,
    sets become lists, and anything else is stringified.

    Args:
        data: Mapping to serialize.
        filename: File name (relative to ``OUTPUTS_DIR``) to write.

    Returns:
        The destination path, as a string.
    """
    _ensure_output_dir()
    path = OUTPUTS_DIR / filename

    def _json_serial(obj):
        # Only called by json for objects it cannot encode on its own.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            # Previously fell through to str() and was written as "True"/"False".
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, pd.DataFrame):
            return obj.to_dict(orient='records')
        if isinstance(obj, pd.Series):
            # Previously stringified into the multi-line Series repr.
            return obj.tolist()
        if isinstance(obj, (set, frozenset)):
            return list(obj)
        return str(obj)

    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, default=_json_serial)
    return str(path)
def _safe_save_text(text: str, filename: str) -> str:
    """Write *text* as UTF-8 into the outputs directory and return the path string."""
    _ensure_output_dir()
    destination = OUTPUTS_DIR / filename
    with open(destination, 'w', encoding='utf-8') as handle:
        handle.write(text)
    return str(destination)
def _plot_topic_distribution(topic_df: pd.DataFrame) -> Optional[plt.Figure]:
    """Draw a horizontal bar chart of document counts for the first 15 topics.

    Bars are red where the row's ``status`` equals ``'NOVEL'`` and blue
    otherwise; when there is no ``status`` column every bar is treated as
    ``'MAPPED'``. Missing ``label`` / ``doc_count`` columns fall back to the
    row positions.

    Args:
        topic_df: Topic table. Only the first 15 rows are plotted; the chart
            title says "Top 15", so this presumably expects rows already
            sorted by frequency -- not verified here.

    Returns:
        The matplotlib figure, or ``None`` when the input is ``None``/empty
        or plotting fails.
    """
    if topic_df is None or topic_df.empty:
        return None
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(10, 5), facecolor='#ffffff')
        ax.set_facecolor('#f7fafc')
        top15 = topic_df.head(15)
        n_bars = len(top15)
        # Size the fallback to the actual bar count instead of a fixed 15.
        if 'status' in top15.columns:
            statuses = top15['status']
        else:
            statuses = ['MAPPED'] * n_bars
        colors = ['#e53e3e' if s == 'NOVEL' else '#1a56db' for s in statuses]
        bar_labels = top15['label'] if 'label' in top15.columns else range(n_bars)
        bar_widths = top15['doc_count'] if 'doc_count' in top15.columns else range(n_bars)
        ax.barh(bar_labels, bar_widths, color=colors, edgecolor='white', linewidth=0.5)
        ax.set_xlabel('Document Count', color='#2d3748', fontsize=11)
        ax.set_title('Top 15 Topics by Document Frequency', color='#1a1a2e',
                     fontsize=13, fontweight='bold', pad=14)
        ax.tick_params(colors='#2d3748', labelsize=9)
        ax.spines['bottom'].set_color('#c9d6e3')
        ax.spines['left'].set_color('#c9d6e3')
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        novel_patch = mpatches.Patch(color='#e53e3e', label='NOVEL')
        mapped_patch = mpatches.Patch(color='#1a56db', label='MAPPED')
        ax.legend(handles=[novel_patch, mapped_patch], facecolor='#ffffff',
                  labelcolor='#2d3748', edgecolor='#c9d6e3')
        fig.tight_layout()
        return fig
    except Exception as e:
        logger.exception("Plot error: %s", e)
        # pyplot keeps every open figure registered; release the half-built one.
        if fig is not None:
            plt.close(fig)
        return None
def _plot_mapped_novel_pie(taxonomy_map: Dict) -> Optional[plt.Figure]:
    """Draw a two-slice pie chart of MAPPED vs NOVEL topic counts.

    Counts are read from ``taxonomy_map['gap_analysis']`` under the keys
    ``mapped_count`` and ``novel_count``; each defaults to 1 when absent.

    Args:
        taxonomy_map: Taxonomy dictionary; a falsy value yields ``None``.

    Returns:
        The matplotlib figure, or ``None`` when there is nothing to plot
        (no map, both counts zero) or plotting fails.
    """
    if not taxonomy_map:
        return None
    fig = None
    try:
        gap = taxonomy_map.get('gap_analysis', {})
        mapped = gap.get('mapped_count', 1)
        novel = gap.get('novel_count', 1)
        # A pie whose slices sum to zero cannot be normalised; bail out early.
        if mapped + novel <= 0:
            return None
        fig, ax = plt.subplots(figsize=(5, 5), facecolor='#ffffff')
        ax.set_facecolor('#ffffff')
        _wedges, _texts, autotexts = ax.pie(
            [mapped, novel],
            labels=['MAPPED', 'NOVEL'],
            colors=['#1a56db', '#e53e3e'],
            autopct='%1.1f%%',
            startangle=90,
            textprops={'color': '#1a1a2e', 'fontsize': 11},
        )
        # Percentage labels sit on the coloured wedges, so make them white/bold.
        for at in autotexts:
            at.set_color('#ffffff')
            at.set_fontweight('bold')
        ax.set_title('Topic Classification', color='#1a1a2e', fontsize=13,
                     fontweight='bold', pad=14)
        fig.tight_layout()
        return fig
    except Exception as e:
        logger.exception("Pie chart error: %s", e)
        # pyplot keeps every open figure registered; release the half-built one.
        if fig is not None:
            plt.close(fig)
        return None
def _plot_cluster_charts(cluster_df: pd.DataFrame):
    """Build the cluster-size histogram and the clustered-vs-noise pie chart.

    Rows whose ``cluster_final`` equals ``-1`` are counted as noise; all
    other rows are grouped by ``cluster_final`` to obtain cluster sizes.

    Args:
        cluster_df: Document-level table with a ``cluster_final`` column.

    Returns:
        ``(fig_sizes, fig_noise_pie)``; ``(None, None)`` when the input is
        ``None``, empty, lacks ``cluster_final``, or plotting fails.
    """
    if (
        cluster_df is None
        or cluster_df.empty
        or "cluster_final" not in cluster_df.columns
    ):
        return None, None
    fig_sz = fig_noise = None
    try:
        is_noise = cluster_df["cluster_final"] == -1
        # Size distribution
        sizes = cluster_df[~is_noise]["cluster_final"].value_counts().values
        fig_sz, ax_sz = plt.subplots(figsize=(9, 4), facecolor="#ffffff")
        ax_sz.set_facecolor("#f7fafc")
        # At least one bin: with zero clusters (all noise) bins=0 is rejected.
        n_bins = max(1, min(30, len(sizes)))
        ax_sz.hist(sizes, bins=n_bins, color="#1a56db", edgecolor="white")
        ax_sz.set_xlabel("Cluster Size (docs)", color="#2d3748", fontsize=10)
        ax_sz.set_ylabel("# Clusters", color="#2d3748", fontsize=10)
        ax_sz.set_title("Cluster Size Distribution", color="#1a1a2e", fontweight="bold")
        ax_sz.spines["top"].set_visible(False)
        ax_sz.spines["right"].set_visible(False)
        fig_sz.tight_layout()
        # Noise pie
        n_noise = int(is_noise.sum())
        n_clustered = int((~is_noise).sum())
        fig_noise, ax_n = plt.subplots(figsize=(4, 4), facecolor="#ffffff")
        _wedges, _texts, autotexts = ax_n.pie(
            [n_clustered, n_noise],
            labels=["Clustered", "Noise"],
            colors=["#1a56db", "#e53e3e"],
            autopct="%1.1f%%", startangle=90,
            textprops={"color": "#1a1a2e", "fontsize": 11},
        )
        for at in autotexts:
            at.set_color("#ffffff")
            at.set_fontweight("bold")
        ax_n.set_title("Clustered vs Noise", color="#1a1a2e", fontweight="bold")
        fig_noise.tight_layout()
        return fig_sz, fig_noise
    except Exception as e:
        logger.exception("Cluster chart error: %s", e)
        # Release whichever figures were created before the failure.
        for fig in (fig_sz, fig_noise):
            if fig is not None:
                plt.close(fig)
        return None, None
def _generate_publication_pitch(novel_label: str) -> str:
    """Compose a one-sentence research pitch around *novel_label*.

    The study context, method and claimed contribution are each picked at
    random from fixed phrase pools, so repeated calls may differ.
    """
    method_pool = (
        "longitudinal survey", "mixed-methods case study",
        "experimental design", "bibliometric analysis",
        "qualitative interview study", "secondary data analysis",
        "systematic literature review", "grounded theory approach",
    )
    claim_pool = (
        "novel theoretical insights into platform dynamics",
        "empirical evidence bridging practice and IS theory",
        "a validated measurement instrument for future research",
        "cross-cultural comparative benchmarks",
        "a mid-range theory applicable to emerging markets",
        "design principles for practitioners and policymakers",
    )
    context_pool = (
        "Southeast Asian enterprise contexts",
        "China and India cross-border settings",
        "ASEAN digital economy ecosystems",
        "Asia-Pacific SME environments",
        "developing country IS adoption contexts",
        "regional fintech and digital payment infrastructures",
    )
    # Draw order (method, claim, context) is kept so seeded runs are reproducible.
    chosen_method, chosen_claim, chosen_context = (
        random.choice(pool) for pool in (method_pool, claim_pool, context_pool)
    )
    opening = f"Investigating **{novel_label}** in {chosen_context} using a {chosen_method} "
    closing = f"could contribute {chosen_claim} to the PAJAIS scope of Asia-Pacific IS scholarship."
    return opening + closing
def _generate_apa_citation(topic_df: pd.DataFrame) -> str:
    """Generate a synthetic, APA-formatted placeholder citation.

    Authors, year, issue, pages and DOI suffix are drawn at random, so the
    result is structurally APA-like but does NOT reference a real article.

    Args:
        topic_df: Optional topic table. When it has a ``label`` column, the
            title stem is picked at random from its first 20 labels;
            otherwise ``'Information Systems Research'`` is used.

    Returns:
        A Markdown-flavoured citation string (journal and volume italicised).
    """
    first_names = ['J.', 'M.', 'L.', 'K.', 'S.', 'R.', 'T.', 'A.', 'C.', 'H.']
    last_names = [
        'Chen', 'Wang', 'Zhang', 'Kumar', 'Sharma', 'Lee', 'Park', 'Tan',
        'Singh', 'Patel', 'Kim', 'Nguyen', 'Lim', 'Wong', 'Choi'
    ]
    year = random.randint(2008, 2024)
    # NOTE(review): presumably maps the year onto the journal's volume number -- confirm.
    volume = year - 2005
    issue = random.randint(1, 4)
    n_authors = random.randint(2, 4)
    authors = [
        f"{random.choice(last_names)}, {random.choice(first_names)}"
        for _ in range(n_authors)
    ]
    # n_authors >= 2, so there is always a final author to attach with "&".
    author_str = ', '.join(authors[:-1]) + f", & {authors[-1]}"
    title_base = 'Information Systems Research'
    if topic_df is not None and not topic_df.empty and 'label' in topic_df.columns:
        title_base = random.choice(topic_df['label'].tolist()[:20])
    pages_start = random.randint(1, 80)
    pages_end = pages_start + random.randint(20, 45)
    # Page range uses an en dash (U+2013); the separator was mis-encoded before.
    return (
        f"{author_str} ({year}). {title_base}: An empirical investigation "
        f"in Asia-Pacific contexts. *Pacific Asia Journal of the Association "
        f"for Information Systems*, *{volume}*({issue}), {pages_start}\u2013{pages_end}. "
        f"https://doi.org/10.17705/1pais.{volume:02d}{issue:02d}0{pages_start:02d}"
    )
def _compute_cooccurrences(topic_df: pd.DataFrame, lda_result: Dict) -> str:
    """Report the five topic pairs whose co-occurrence most exceeds expectation.

    For every document the per-topic probabilities are multiplied pairwise
    and summed over the corpus (observed). The expected value for a pair is
    the product of the two topics' mean probabilities times the number of
    documents. Pairs are ranked by ``observed / expected`` (lift).

    Args:
        topic_df: Topic table; its ``label`` column names the topics. Without
            it, 100 generic ``"Topic i"`` labels are used.
        lda_result: Dict whose ``'doc_topics'`` entry is a sequence of
            per-document ``(topic_id, probability)`` pairs.

    Returns:
        A Markdown list of the top pairs, or an explanatory message when no
        LDA result is available or the computation fails.
    """
    if lda_result is None or not lda_result.get('doc_topics'):
        return "Co-occurrence analysis requires a completed LDA run."
    try:
        doc_topics = lda_result['doc_topics']
        if topic_df is not None and 'label' in topic_df.columns:
            labels = topic_df['label'].tolist()
        else:
            labels = [f"Topic {i}" for i in range(100)]
        n_topics = len(labels)
        cooc = np.zeros((n_topics, n_topics))
        marginals = np.zeros(n_topics)
        for doc_dist in doc_topics:
            doc_probs = np.zeros(n_topics)
            for tid, prob in doc_dist:
                # Ignore ids without a label; the lower bound stops negative
                # ids from silently indexing from the end of the array.
                if 0 <= tid < n_topics:
                    doc_probs[tid] = prob
                    marginals[tid] += prob
            # One vectorised outer product replaces the per-pair Python loop.
            cooc += np.outer(doc_probs, doc_probs)
        n_docs = len(doc_topics)
        marginals /= max(n_docs, 1)
        expected = np.outer(marginals, marginals) * n_docs
        pairs = []
        # Upper triangle only (i < j): each unordered pair once, no self-pairs.
        for i, j in zip(*np.triu_indices(n_topics, k=1)):
            if expected[i, j] > 0:
                pairs.append((cooc[i, j] / expected[i, j], labels[i], labels[j]))
        # Rank on lift alone so tied lifts never fall back to comparing labels.
        pairs.sort(key=lambda pair: pair[0], reverse=True)
        lines = ["**Top 5 Unexpected Topic Co-occurrences:**\n"]
        # NOTE(review): the pair separator was mis-encoded in the source;
        # U+2194 (left-right arrow) is assumed -- confirm the intended glyph.
        for rank, (lift, t1, t2) in enumerate(pairs[:5], 1):
            lines.append(
                f"{rank}. **{t1}** \u2194 **{t2}** (lift = {lift:.2f}x expected)"
            )
        return '\n'.join(lines)
    except Exception as e:
        return f"Co-occurrence computation failed: {e}"
def _compute_iceberg_topics(comparison_df: pd.DataFrame) -> str:
    """Surface topics that are far more frequent in abstracts than in titles.

    Rows are split by ``source`` (``'abstract'`` / ``'title'``) and joined on
    ``label``; a topic qualifies when
    ``abstract doc_count / (title doc_count + 1) >= 3.0``. Only labels
    present in both sources are considered (inner join).

    Args:
        comparison_df: Table with ``source``, ``label`` and ``doc_count``
            columns.

    Returns:
        A Markdown list of up to 10 qualifying topics ordered by ratio, or
        an explanatory message when there is nothing to report or the
        computation fails.
    """
    if comparison_df is None or comparison_df.empty:
        return "Run abstract vs title comparison first."
    try:
        def _counts_for(source: str, count_col: str) -> pd.DataFrame:
            # Label/doc_count rows for one source, with the count column renamed.
            subset = comparison_df[comparison_df['source'] == source]
            return subset[['label', 'doc_count']].rename(columns={'doc_count': count_col})

        ab = _counts_for('abstract', 'ab_count')
        ti = _counts_for('title', 'ti_count')
        merged = ab.merge(ti, on='label', how='inner')
        if merged.empty:
            return "No overlapping topics found between abstracts and titles."
        # +1 keeps the ratio finite when a topic has a title count of zero.
        merged['ratio'] = merged['ab_count'] / (merged['ti_count'] + 1)
        iceberg = merged[merged['ratio'] >= 3.0].sort_values('ratio', ascending=False)
        if iceberg.empty:
            return "No iceberg topics found (ratio \u2265 3.0)."
        # NOTE(review): emoji and dash were mis-encoded in the source; the ice
        # cube (U+1F9CA) and em dash (U+2014) are assumed -- confirm.
        lines = ["**\U0001F9CA Iceberg Topics** \u2014 constructs authors develop but don't headline:\n"]
        for row in iceberg.head(10).itertuples(index=False):
            lines.append(
                f"- **{row.label}**: "
                f"abstract frequency {row.ab_count}x vs title {row.ti_count}x "
                f"(ratio {row.ratio:.1f}x)"
            )
        return '\n'.join(lines)
    except Exception as e:
        return f"Iceberg computation failed: {e}"
def _make_zip(output_dir: str = 'outputs') -> Optional[str]:
    """Compress the regular files directly inside *output_dir* into a ZIP.

    The archive is written as ``pajais_artifacts.zip`` in a freshly created
    temporary directory; sub-directories of *output_dir* are not included.
    Members are added in sorted name order so the archive layout is stable.

    Args:
        output_dir: Directory whose files should be archived.

    Returns:
        Path of the ZIP file as a string, or ``None`` when *output_dir* is
        not an existing directory or the archive cannot be created.
    """
    zip_path = None
    try:
        out_path = Path(output_dir)
        if not out_path.is_dir():
            return None
        members = sorted(entry for entry in out_path.iterdir() if entry.is_file())
        zip_path = Path(tempfile.mkdtemp()) / 'pajais_artifacts.zip'
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
            for member in members:
                # Flatten: store each file under its bare name.
                zf.write(member, arcname=member.name)
        return str(zip_path)
    except Exception as e:
        logger.error(f"ZIP creation failed: {e}")
        if zip_path is not None:
            # Best-effort removal of the partial archive and its temp directory.
            try:
                zip_path.unlink(missing_ok=True)
                zip_path.parent.rmdir()
            except OSError:
                pass
        return None
def _print_ready_summary(topic_df, taxonomy_map) -> str:
    """Format the analysis findings as a Markdown report block.

    Counts and coverage are read from ``taxonomy_map['gap_analysis']``; up to
    five entries of ``taxonomy_map['publishable_novel_themes']`` are listed,
    each prefixed with one to three asterisks according to its coherence.

    Args:
        topic_df: Topic table; only its length is used.
        taxonomy_map: Taxonomy dictionary produced by the analysis.

    Returns:
        The Markdown report, ``"Complete the analysis first."`` when either
        input is missing, or a failure message if formatting raises.
    """
    if topic_df is None or not taxonomy_map:
        return "Complete the analysis first."
    try:
        gap = taxonomy_map.get('gap_analysis') or {}
        coverage = gap.get('coverage_pct') or 0
        novel_count = gap.get('novel_count', 0)
        mapped_count = gap.get('mapped_count', 0)
        pub_themes = taxonomy_map.get('publishable_novel_themes') or []
        lines = [
            "## PAJAIS Research Intelligence Report",
            "---",
            f"**Corpus Size:** {len(topic_df)} topics extracted",
            f"**PAJAIS Coverage:** {coverage:.1f}% of 20 canonical themes",
            f"**Mapped Topics:** {mapped_count}",
            f"**Novel Topics:** {novel_count}",
            "",
            "### Publishable Research Gaps",
        ]
        if not pub_themes:
            # Avoid leaving a bare heading when nothing qualified.
            lines.append("- *No publishable novel themes identified.*")
        for theme in pub_themes[:5]:
            # A theme with missing fields must not abort the whole report.
            coherence = theme.get('coherence') or 0
            if coherence > 0.5:
                sig = '***'
            elif coherence > 0.4:
                sig = '**'
            else:
                sig = '*'
            label = theme.get('label', 'Unlabelled theme')
            doc_count = theme.get('doc_count', 0)
            lines.append(
                f"- {sig} **{label}** "
                f"(n={doc_count}, coherence={coherence:.2f})"
            )
        lines += [
            "",
            "*Significance: * coherence > 0.3 | ** > 0.4 | *** > 0.5*",
            "",
            "---",
            "*Generated by PAJAIS Research Intelligence Agent*",
        ]
        return '\n'.join(lines)
    except Exception as e:
        return f"Summary generation failed: {e}"
| # --------------------------------------------------------------------------- | |
| # Gradio Application | |
| # --------------------------------------------------------------------------- | |
| with gr.Blocks( | |
| theme=gr.themes.Default( | |
| primary_hue="blue", | |
| secondary_hue="slate", | |
| neutral_hue="slate", | |
| font=gr.themes.GoogleFont("Inter"), | |
| ), | |
| css=CUSTOM_CSS, | |
| title="PAJAIS Research Intelligence Agent" | |
| ) as demo: | |
| # ------------------------------------------------------------------ | |
| # State | |
| # ------------------------------------------------------------------ | |
| state_df = gr.State(value=None) | |
| state_agent_result = gr.State(value=None) | |
| state_topic_df = gr.State(value=None) | |
| state_comparison_df = gr.State(value=None) | |
| state_taxonomy_map = gr.State(value=None) | |
| state_lda_result = gr.State(value=None) | |
| # New state for DBSCAN + Council (Tab A & B) | |
| state_cluster_df = gr.State(value=None) # doc-level DBSCAN result | |
| state_cluster_summary = gr.State(value=None) # cluster-level summary | |
| state_council_result = gr.State(value=None) # council dict | |
| # ------------------------------------------------------------------ | |
| # Header | |
| # ------------------------------------------------------------------ | |
| gr.Markdown( | |
| """ | |
| # π PAJAIS Research Intelligence Agent | |
| ### Academic Topic Modeling & Gap Analysis for Information Systems Research | |
| *Pacific Asia Journal of the Association for Information Systems (PAJAIS)* | |
| --- | |
| """ | |
| ) | |
| # ------------------------------------------------------------------ | |
| # Error display (persistent) | |
| # ------------------------------------------------------------------ | |
| error_display = gr.Markdown( | |
| value="", | |
| elem_id="global_error_display", | |
| visible=False | |
| ) | |
| # ================================================================== | |
| # TAB 1 β Upload and Validate | |
| # ================================================================== | |
| with gr.Tab("π Upload & Validate"): | |
| gr.Markdown("## Step 1: Upload Your Journal CSV") | |
| gr.Markdown( | |
| "Upload a CSV file containing PAJAIS publications. " | |
| "The system detects title, abstract, year, authors, and DOI columns automatically." | |
| ) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| file_input = gr.File( | |
| label="Upload Journal CSV", | |
| file_types=['.csv'], | |
| elem_id="csv_upload" | |
| ) | |
| with gr.Row(): | |
| btn_full_run = gr.Button( | |
| "π Run Complete Analysis", | |
| variant="primary", | |
| elem_id="btn_full_run" | |
| ) | |
| btn_init_only = gr.Button( | |
| "π Initialize Only", | |
| variant="secondary", | |
| elem_id="btn_init_only" | |
| ) | |
| with gr.Column(scale=2): | |
| validation_info = gr.Markdown( | |
| value="*Upload a CSV to see dataset statistics.*", | |
| elem_id="validation_info" | |
| ) | |
| preview_df = gr.DataFrame( | |
| label="Data Preview (first 10 rows)", | |
| show_label=True, | |
| elem_id="preview_dataframe", | |
| wrap=True | |
| ) | |
| progress_bar_tab1 = gr.Progress(track_tqdm=True) | |
| # ---- Handlers ---- | |
| def handle_init_only(file): | |
| """Validate and preview the uploaded CSV without running analysis.""" | |
| if file is None: | |
| return ( | |
| "β No file uploaded.", | |
| pd.DataFrame(), | |
| None, | |
| gr.update(visible=True, value="<div class='error-box'>Please upload a CSV file first.</div>"), | |
| ) | |
| try: | |
| df = load_journal_csv(file.name) | |
| val = validate_dataframe(df) | |
| row_count = val.get('row_count', 0) | |
| yr = val.get('year_range') | |
| yr_str = f"{yr[0]}β{yr[1]}" if yr else "Unknown" | |
| has_ab = "β " if val.get('has_abstracts') else "β οΈ" | |
| has_ti = "β " if val.get('has_titles') else "β οΈ" | |
| miss = val.get('missing_abstract_pct', 0) | |
| warns = val.get('warnings', []) | |
| info_md = ( | |
| f"<div class='info-panel'>" | |
| f"<b>π Rows:</b> {row_count} " | |
| f"<b>π Year Range:</b> {yr_str}<br>" | |
| f"<b>Abstracts:</b> {has_ab} " | |
| f"<b>Titles:</b> {has_ti} " | |
| f"<b>Missing Abstracts:</b> {miss:.1f}%<br>" | |
| f"<b>Columns Detected:</b> {', '.join(df.columns.tolist())}" | |
| f"</div>" | |
| ) | |
| if warns: | |
| info_md += "\n\nβ οΈ **Warnings:**\n" + "\n".join(f"- {w}" for w in warns) | |
| preview = df.head(10) | |
| return ( | |
| info_md, | |
| preview, | |
| df, | |
| gr.update(visible=False), | |
| ) | |
| except (FileNotFoundError, ValueError) as e: | |
| return ( | |
| f"Error: {e}", | |
| pd.DataFrame(), | |
| None, | |
| gr.update(visible=True, value=f"<div class='error-box'>β {e}</div>"), | |
| ) | |
| btn_init_only.click( | |
| fn=handle_init_only, | |
| inputs=[file_input], | |
| outputs=[validation_info, preview_df, state_df, error_display] | |
| ) | |
| def handle_full_run(file, progress=gr.Progress(track_tqdm=True)): | |
| """Run the complete six-phase pipeline and persist all outputs.""" | |
| if file is None: | |
| return ( | |
| "β No file uploaded.", | |
| pd.DataFrame(), | |
| None, None, None, None, None, None, | |
| gr.update(visible=True, value="<div class='error-box'>Please upload a CSV file first.</div>"), | |
| # BUG 5 FIX: DownloadButton updates β return no-ops when nothing saved | |
| gr.update(), gr.update(), gr.update(), | |
| gr.update(), gr.update(), gr.update(), gr.update(), | |
| # New: cluster/council states unchanged | |
| gr.update(), gr.update(), gr.update(), | |
| ) | |
| try: | |
| _ensure_output_dir() | |
| progress(0, desc="Starting pipeline...") | |
| agent = _make_agent() | |
| def on_progress(phase, msg, pct): | |
| progress(pct / 100, desc=f"[Phase {phase}] {msg}") | |
| result = agent.run_full_pipeline(file.name, on_progress=on_progress) | |
| progress(0.95, desc="Saving outputs...") | |
| # ---- Persist all artefacts ---- | |
| topic_df = result.get('topic_df') | |
| comparison_df = result.get('comparison_df') | |
| taxonomy_map = result.get('taxonomy_map') | |
| narrative = result.get('narrative', '') | |
| lda_res = getattr(agent, 'lda_result', None) | |
| # BUG 5 FIX: capture actual saved paths to update DownloadButtons | |
| topic_path = None | |
| mapping_path = None | |
| comparison_path = None | |
| taxonomy_path = None | |
| narrative_path = None | |
| if topic_df is not None and not topic_df.empty: | |
| topic_path = _safe_save_csv(topic_df, 'topic_review_table.csv') | |
| if comparison_df is not None and not comparison_df.empty: | |
| comparison_path = _safe_save_csv(comparison_df, 'comparison.csv') | |
| if topic_df is not None and not topic_df.empty: | |
| if 'status' in topic_df.columns: | |
| mapping_path = _safe_save_csv(topic_df, 'pajais_mapping.csv') | |
| else: | |
| mapping_path = _safe_save_csv(topic_df, 'pajais_mapping.csv') | |
| if taxonomy_map: | |
| taxonomy_path = _safe_save_json(taxonomy_map, 'taxonomy_map.json') | |
| if narrative: | |
| narrative_path = _safe_save_text(narrative, 'narrative.txt') | |
| # Pull DBSCAN cluster results from agent if available | |
| cluster_df = getattr(agent, 'cluster_df', None) | |
| cluster_summary = get_cluster_summary(cluster_df) if cluster_df is not None else None | |
| # Pull council result from agent if available | |
| council_result = getattr(agent, 'council_result', None) | |
| # Attempt export via tools helper (best-effort, may duplicate saves β that's fine) | |
| try: | |
| export_all_artifacts( | |
| topic_df=topic_df, | |
| comparison_df=comparison_df, | |
| taxonomy_map=taxonomy_map, | |
| narrative=narrative, | |
| output_dir='outputs' | |
| ) | |
| except Exception as exp_e: | |
| logger.warning(f"export_all_artifacts failed (non-fatal): {exp_e}") | |
| progress(1.0, desc="Complete!") | |
| val = result.get('validation') or {} | |
| row_count = val.get('row_count', len(agent.df) if agent.df is not None else 0) | |
| yr = val.get('year_range') | |
| yr_str = f"{yr[0]}β{yr[1]}" if yr else "Unknown" | |
| coverage = result.get('pajais_coverage_pct', 0) | |
| topic_count = result.get('topic_count', 0) | |
| novel = result.get('novel_count', 0) | |
| saved_files = list(OUTPUTS_DIR.iterdir()) | |
| saved_names = ', '.join(f.name for f in saved_files if f.is_file()) | |
| info_md = ( | |
| f"<div class='success-box'>" | |
| f"β <b>Pipeline Complete!</b><br>" | |
| f"π <b>Rows:</b> {row_count} | " | |
| f"π <b>Years:</b> {yr_str} | " | |
| f"π¬ <b>Topics:</b> {topic_count} | " | |
| f"π <b>Novel:</b> {novel} | " | |
| f"π <b>Coverage:</b> {coverage:.1f}%<br>" | |
| f"πΎ <b>Saved:</b> {saved_names}" | |
| f"</div>" | |
| ) | |
| errors = result.get('errors', []) | |
| if errors: | |
| info_md += "\n\nβ οΈ **Errors:**\n" + "\n".join(f"- {e}" for e in errors) | |
| preview = agent.df.head(10) if agent.df is not None else pd.DataFrame() | |
| return ( | |
| info_md, | |
| preview, | |
| agent.df, | |
| result, | |
| topic_df, | |
| comparison_df, | |
| taxonomy_map, | |
| lda_res, | |
| gr.update(visible=False), | |
| # BUG 5 FIX: update DownloadButton values to real saved paths | |
| gr.update(value=topic_path) if topic_path else gr.update(), | |
| gr.update(value=mapping_path) if mapping_path else gr.update(), | |
| gr.update(value=comparison_path) if comparison_path else gr.update(), | |
| gr.update(value=taxonomy_path) if taxonomy_path else gr.update(), | |
| gr.update(value=narrative_path) if narrative_path else gr.update(), | |
| gr.update(value=topic_path) if topic_path else gr.update(), # Export Center topic dl | |
| gr.update(value=mapping_path) if mapping_path else gr.update(), # Export Center mapping dl | |
| # New: cluster/council state updates | |
| cluster_df, | |
| cluster_summary, | |
| council_result, | |
| ) | |
| except Exception as e: | |
| logger.error(f"Full pipeline error: {e}", exc_info=True) | |
| return ( | |
| f"β Pipeline failed: {e}", | |
| pd.DataFrame(), | |
| None, None, None, None, None, None, | |
| gr.update(visible=True, value=f"<div class='error-box'>β {e}</div>"), | |
| gr.update(), gr.update(), gr.update(), | |
| gr.update(), gr.update(), gr.update(), gr.update(), | |
| # New: cluster/council state unchanged on error | |
| None, None, None, | |
| ) | |
| # ================================================================== | |
| # TAB 2 β Topic Review Table | |
| # ================================================================== | |
| with gr.Tab("π¬ Topic Review Table"): | |
| gr.Markdown("## Phase 2: Extracted Topics") | |
| btn_run_topics = gr.Button( | |
| "βΆ Run Topic Modeling", | |
| variant="primary", | |
| elem_id="btn_run_topics" | |
| ) | |
| topic_status = gr.Markdown( | |
| value="*Run topic modeling or use the full pipeline from Tab 1.*", | |
| elem_id="topic_status" | |
| ) | |
| topic_table = gr.DataFrame( | |
| label="Topic Review Table (β₯98 rows guaranteed)", | |
| show_label=True, | |
| elem_id="topic_review_table", | |
| wrap=True | |
| ) | |
| # BUG 5 FIX: value=None instead of hardcoded path that doesn't exist yet | |
| topic_download = gr.DownloadButton( | |
| label="β¬ Download topic_review_table.csv", | |
| value=None, | |
| elem_id="topic_dl" | |
| ) | |
| with gr.Accordion("π Unexpected Topic Co-occurrences", open=False, | |
| elem_id="cooccurrence_accordion"): | |
| btn_cooccurrence = gr.Button( | |
| "Explore Co-occurrences", | |
| variant="secondary", | |
| elem_id="btn_cooc" | |
| ) | |
| cooccurrence_display = gr.Markdown( | |
| value="*Click the button above to compute topic co-occurrences.*", | |
| elem_id="cooc_display" | |
| ) | |
def handle_run_topics(file, existing_topic_df, progress=gr.Progress(track_tqdm=True)):
    """Produce the topic review table, reusing a previous run when available.

    Args:
        file: Uploaded file object (its ``.name`` is passed to phase 1),
            or None when nothing has been uploaded.
        existing_topic_df: Topic table kept in session state, or None.
        progress: Gradio progress tracker.

    Returns:
        A 4-tuple matching the click outputs
        ``[topic_status, topic_table, state_topic_df, topic_download]``:
        status HTML, table value, new state value, DownloadButton update.
    """
    def _download_update(path):
        # NOTE(review): _safe_save_csv appears to yield a falsy value when the
        # save fails (other call sites guard on it); leave the button untouched
        # in that case instead of pointing it at nothing.
        return gr.update(value=path) if path else gr.update()

    def _failure(message_html):
        return message_html, pd.DataFrame(), None, gr.update()

    # Fast path: a table from an earlier run is simply re-saved and re-shown.
    if existing_topic_df is not None and not existing_topic_df.empty:
        n = len(existing_topic_df)
        saved_path = _safe_save_csv(existing_topic_df, 'topic_review_table.csv')
        return (
            f"<div class='success-box'>β {n} topics loaded from previous run.</div>",
            existing_topic_df,
            existing_topic_df,
            _download_update(saved_path),
        )

    if file is None:
        return _failure("<div class='error-box'>β Upload a CSV file first.</div>")

    try:
        _ensure_output_dir()
        progress(0.1, desc="Loading data...")
        agent = _make_agent()
        agent.run_phase(1, file_path=file.name)
        progress(0.3, desc="Modeling topics...")
        agent.run_phase(2)
        progress(0.9, desc="Building table...")
        agent.run_phase(3)
        progress(1.0, desc="Done!")

        tdf = agent.topic_df
        # Report an explicit error rather than a len(None) TypeError or a
        # misleading "0 topics extracted" success message.
        if tdf is None or tdf.empty:
            return _failure(
                "<div class='error-box'>No topics were extracted from the uploaded file.</div>"
            )

        saved_path = _safe_save_csv(tdf, 'topic_review_table.csv')
        return (
            f"<div class='success-box'>β {len(tdf)} topics extracted.</div>",
            tdf,
            tdf,
            _download_update(saved_path),
        )
    except Exception as e:
        # UI boundary: show the message to the user, keep the traceback in logs.
        logger.error(f"Topic modeling error: {e}", exc_info=True)
        return _failure(f"<div class='error-box'>β {e}</div>")
| btn_run_topics.click( | |
| fn=handle_run_topics, | |
| inputs=[file_input, state_topic_df], | |
| outputs=[topic_status, topic_table, state_topic_df, topic_download] | |
| ) | |
| state_topic_df.change( | |
| fn=lambda df: ( | |
| f"<div class='success-box'>β {len(df)} topics available.</div>" | |
| if df is not None and not df.empty else "", | |
| df if df is not None else pd.DataFrame() | |
| ), | |
| inputs=[state_topic_df], | |
| outputs=[topic_status, topic_table] | |
| ) | |
def handle_cooccurrence(topic_df, lda_result):
    """Return the co-occurrence markdown, or a hint when inputs are missing.

    Args:
        topic_df: Topic table from session state (None or empty before a run).
        lda_result: LDA result from session state, or None.

    Returns:
        Markdown text for ``cooccurrence_display``.
    """
    # An empty table carries no topics, so treat it like a missing one
    # (same guard the other handlers in this file apply).
    if topic_df is None or topic_df.empty or lda_result is None:
        return "Run topic modeling first."
    return _compute_cooccurrences(topic_df, lda_result)
| btn_cooccurrence.click( | |
| fn=handle_cooccurrence, | |
| inputs=[state_topic_df, state_lda_result], | |
| outputs=[cooccurrence_display] | |
| ) | |
| # ================================================================== | |
| # TAB 3 β PAJAIS Taxonomy Mapping | |
| # ================================================================== | |
| with gr.Tab("πΊ PAJAIS Taxonomy Mapping"): | |
| gr.Markdown("## Phase 5: Research Gap Analysis") | |
| btn_run_mapping = gr.Button( | |
| "βΆ Run PAJAIS Mapping", | |
| variant="primary", | |
| elem_id="btn_run_mapping" | |
| ) | |
| mapping_status = gr.Markdown( | |
| value="*Run mapping or use the full pipeline from Tab 1.*", | |
| elem_id="mapping_status" | |
| ) | |
| with gr.Row(): | |
| with gr.Column(): | |
| gr.Markdown("### π΅ MAPPED Themes") | |
| mapped_table = gr.DataFrame( | |
| label="Mapped Topics", | |
| show_label=True, | |
| elem_id="mapped_table", | |
| wrap=True | |
| ) | |
| with gr.Column(): | |
| gr.Markdown("### π΄ NOVEL Themes") | |
| novel_table = gr.DataFrame( | |
| label="Novel Topics", | |
| show_label=True, | |
| elem_id="novel_table", | |
| wrap=True | |
| ) | |
| gap_score = gr.Markdown(elem_id="gap_score") | |
| # BUG 5 FIX: value=None | |
| mapping_download = gr.DownloadButton( | |
| label="β¬ Download pajais_mapping.csv", | |
| value=None, | |
| elem_id="mapping_dl" | |
| ) | |
| gr.Markdown("### π‘ Generate Publication Pitch") | |
| gr.Markdown( | |
| "Select a NOVEL theme label and click below to generate " | |
| "a structured abstract pitch." | |
| ) | |
| novel_label_input = gr.Textbox( | |
| label="NOVEL Theme Label", | |
| placeholder="Paste a novel theme label here...", | |
| show_label=True, | |
| elem_id="novel_label_input" | |
| ) | |
| btn_gen_pitch = gr.Button( | |
| "Generate Publication Pitch", | |
| variant="secondary", | |
| elem_id="btn_gen_pitch" | |
| ) | |
| pitch_output = gr.Markdown(elem_id="pitch_output") | |
def _mapping_outputs(topic_df, taxonomy_map, coverage):
    """Build the five values shown on the taxonomy-mapping tab.

    Args:
        topic_df: Topic table; rows are split on its ``status`` column when
            that column exists.
        taxonomy_map: Taxonomy mapping dict (may be None); passed through
            unchanged as the last element.
        coverage: Coverage percentage rendered with one decimal place.

    Returns:
        ``(status_md, mapped_df, novel_df, gap_md, taxonomy_map)``.
    """
    theme_total = len(PAJAIS_THEMES)

    nothing_to_show = topic_df is None or topic_df.empty
    if nothing_to_show:
        empty_gap_md = f"**Research Gap Score:** 0 of {theme_total} themes covered."
        return (
            "<div class='error-box'>No data.</div>",
            pd.DataFrame(),
            pd.DataFrame(),
            empty_gap_md,
            taxonomy_map,
        )

    # Without a status column both sub-tables stay empty.
    if 'status' in topic_df.columns:
        mapped_rows = topic_df[topic_df['status'] == 'MAPPED']
        novel_rows = topic_df[topic_df['status'] == 'NOVEL']
    else:
        mapped_rows = pd.DataFrame()
        novel_rows = pd.DataFrame()

    gap_info = taxonomy_map.get('gap_analysis', {}) if taxonomy_map else {}
    n_covered = len(gap_info.get('covered_themes', []))

    headline = f"**Research Gap Score: {n_covered} of {theme_total} PAJAIS themes covered** "
    percent = f"({coverage:.1f}%)"
    return (
        "<div class='success-box'>β Mapping complete.</div>",
        mapped_rows,
        novel_rows,
        headline + percent,
        taxonomy_map,
    )
def handle_mapping(topic_df, existing_map, progress=gr.Progress(track_tqdm=True)):
    """Map topics onto PAJAIS themes, reusing an existing taxonomy map if present.

    Args:
        topic_df: Topic table from session state.
        existing_map: Taxonomy map from session state, or None/empty.
        progress: Gradio progress tracker.

    Returns:
        Exactly five values for
        ``[mapping_status, mapped_table, novel_table, gap_score, state_taxonomy_map]``.
    """
    # Truthiness (not `is not None`): an empty dict holds no mapping, so it must
    # not short-circuit into a "Mapping complete" view with nothing behind it.
    if existing_map:
        gap = existing_map.get('gap_analysis', {})
        coverage = gap.get('coverage_pct', 0)
        return _mapping_outputs(topic_df, existing_map, coverage)

    if topic_df is None or topic_df.empty:
        return (
            "<div class='error-box'>β Run topic modeling first.</div>",
            pd.DataFrame(), pd.DataFrame(), "", existing_map
        )

    try:
        from tools import map_topics_to_pajais, generate_taxonomy_map
        _ensure_output_dir()
        progress(0.4, desc="Mapping topics...")
        mapped_df = map_topics_to_pajais(topic_df)
        progress(0.8, desc="Building taxonomy map...")
        taxonomy_map = generate_taxonomy_map(mapped_df)
        progress(1.0, desc="Done!")

        # Persist both artefacts for the export/download widgets.
        _safe_save_csv(mapped_df, 'pajais_mapping.csv')
        _safe_save_json(taxonomy_map, 'taxonomy_map.json')

        gap = taxonomy_map.get('gap_analysis', {})
        coverage = gap.get('coverage_pct', 0)
        # _mapping_outputs already returns all five values, taxonomy_map last.
        return _mapping_outputs(mapped_df, taxonomy_map, coverage)
    except Exception as e:
        # UI boundary: show the message to the user, keep the traceback in logs.
        logger.error(f"PAJAIS mapping error: {e}", exc_info=True)
        return (
            f"<div class='error-box'>β {e}</div>",
            pd.DataFrame(), pd.DataFrame(), "", existing_map
        )
| btn_run_mapping.click( | |
| fn=handle_mapping, | |
| inputs=[state_topic_df, state_taxonomy_map], | |
| outputs=[mapping_status, mapped_table, novel_table, gap_score, state_taxonomy_map] | |
| ) | |
| state_taxonomy_map.change( | |
| fn=lambda tm, td: _mapping_outputs( | |
| td, tm, | |
| tm.get('gap_analysis', {}).get('coverage_pct', 0) if tm else 0 | |
| ), | |
| inputs=[state_taxonomy_map, state_topic_df], | |
| outputs=[mapping_status, mapped_table, novel_table, gap_score, state_taxonomy_map] | |
| ) | |
| btn_gen_pitch.click( | |
| fn=lambda label: _generate_publication_pitch(label) if label.strip() else "Enter a theme label above.", | |
| inputs=[novel_label_input], | |
| outputs=[pitch_output] | |
| ) | |
| # ================================================================== | |
| # TAB 4 β Abstract vs Title Analysis | |
| # ================================================================== | |
| with gr.Tab("π Abstract vs Title Analysis"): | |
| gr.Markdown("## Phase 4: Abstract vs Title Theme Comparison") | |
| btn_run_comparison = gr.Button( | |
| "βΆ Compare Abstracts vs Titles", | |
| variant="primary", | |
| elem_id="btn_run_comparison" | |
| ) | |
| comparison_status = gr.Markdown(elem_id="comparison_status") | |
| with gr.Row(): | |
| with gr.Column(): | |
| gr.Markdown("### π Abstract-Derived Themes") | |
| abstract_table = gr.DataFrame( | |
| label="Abstract Topics", | |
| show_label=True, | |
| elem_id="abstract_table", | |
| wrap=True | |
| ) | |
| with gr.Column(): | |
| gr.Markdown("### π· Title-Derived Themes") | |
| title_table = gr.DataFrame( | |
| label="Title Topics", | |
| show_label=True, | |
| elem_id="title_table", | |
| wrap=True | |
| ) | |
| divergence_md = gr.Markdown(elem_id="divergence_md") | |
| # BUG 5 FIX: value=None | |
| comparison_download = gr.DownloadButton( | |
| label="β¬ Download comparison.csv", | |
| value=None, | |
| elem_id="comparison_dl" | |
| ) | |
| btn_iceberg = gr.Button( | |
| "π§ Show Iceberg Topics", | |
| variant="secondary", | |
| elem_id="btn_iceberg" | |
| ) | |
| iceberg_display = gr.Markdown(elem_id="iceberg_display") | |
| def _split_comparison(comp_df): | |
| if comp_df is None or comp_df.empty: | |
| return "<div class='error-box'>No data.</div>", pd.DataFrame(), pd.DataFrame(), "" | |
| ab = comp_df[comp_df['source'] == 'abstract'] | |
| ti = comp_df[comp_df['source'] == 'title'] | |
| ab_excl = ab[ab['unique_to_source'] == True]['label'].tolist() | |
| ti_excl = ti[ti['unique_to_source'] == True]['label'].tolist() | |
| divergence = "" | |
| if ab_excl: | |
| divergence += f"**Abstract-exclusive topics:** {', '.join(ab_excl[:5])}\n\n" | |
| if ti_excl: | |
| divergence += f"**Title-exclusive topics:** {', '.join(ti_excl[:5])}" | |
| return ( | |
| "<div class='success-box'>β Comparison complete.</div>", | |
| ab, ti, divergence | |
| ) | |
def handle_comparison(df, existing_comp, progress=gr.Progress(track_tqdm=True)):
    """Compare abstract-derived and title-derived themes.

    Args:
        df: Loaded journal data from session state.
        existing_comp: Comparison table from session state, or None.
        progress: Gradio progress tracker.

    Returns:
        Five values for ``[comparison_status, abstract_table, title_table,
        divergence_md, state_comparison_df]``.
    """
    has_cached = existing_comp is not None and not existing_comp.empty
    if not has_cached and (df is None or df.empty):
        return (
            "<div class='error-box'>β Load data first.</div>",
            pd.DataFrame(), pd.DataFrame(), "", None
        )

    try:
        # The cached path sits inside the try as well, so a malformed cached
        # table yields an error box instead of an unhandled exception.
        if has_cached:
            return _split_comparison(existing_comp) + (existing_comp,)

        from tools import compare_abstract_vs_title_themes
        _ensure_output_dir()
        progress(0.2, desc="Running LDA on abstracts...")
        comp_df = compare_abstract_vs_title_themes(df, n_topics_each=15)
        progress(1.0, desc="Done!")
        _safe_save_csv(comp_df, 'comparison.csv')
        return _split_comparison(comp_df) + (comp_df,)
    except Exception as e:
        # UI boundary: show the message to the user, keep the traceback in logs.
        # The state is handed back unchanged so a failure never discards it.
        logger.error(f"Abstract/title comparison error: {e}", exc_info=True)
        return (
            f"<div class='error-box'>β {e}</div>",
            pd.DataFrame(), pd.DataFrame(), "", existing_comp
        )
| btn_run_comparison.click( | |
| fn=handle_comparison, | |
| inputs=[state_df, state_comparison_df], | |
| outputs=[comparison_status, abstract_table, title_table, divergence_md, state_comparison_df] | |
| ) | |
| state_comparison_df.change( | |
| fn=lambda cd: _split_comparison(cd) + (cd,) if cd is not None else ( | |
| "", pd.DataFrame(), pd.DataFrame(), "", None | |
| ), | |
| inputs=[state_comparison_df], | |
| outputs=[comparison_status, abstract_table, title_table, divergence_md, state_comparison_df] | |
| ) | |
| btn_iceberg.click( | |
| fn=lambda cd: _compute_iceberg_topics(cd), | |
| inputs=[state_comparison_df], | |
| outputs=[iceberg_display] | |
| ) | |
| # ================================================================== | |
| # TAB 5 β Section 7 Narrative | |
| # ================================================================== | |
| with gr.Tab("β Section 7 Narrative"): | |
| gr.Markdown("## Phase 6: Generate Academic Narrative Draft") | |
| btn_run_narrative = gr.Button( | |
| "βΆ Generate Narrative", | |
| variant="primary", | |
| elem_id="btn_run_narrative" | |
| ) | |
| narrative_box = gr.Textbox( | |
| label="Section 7 Narrative Draft (~500 words)", | |
| lines=25, | |
| show_label=True, | |
| elem_id="narrative_textbox", | |
| interactive=False | |
| ) | |
| # BUG 5 FIX: value=None | |
| narrative_download = gr.DownloadButton( | |
| label="β¬ Download narrative.txt", | |
| value=None, | |
| elem_id="narrative_dl" | |
| ) | |
| btn_copy = gr.Button( | |
| "π Copy to Clipboard", | |
| variant="secondary", | |
| elem_id="btn_copy_narrative" | |
| ) | |
| copy_status = gr.Markdown(elem_id="copy_status") | |
| gr.Markdown("### π Generate Sample APA Citation") | |
| btn_citation = gr.Button( | |
| "Generate Sample Citation", | |
| variant="secondary", | |
| elem_id="btn_citation" | |
| ) | |
| citation_output = gr.Markdown(elem_id="citation_output") | |
def handle_narrative(taxonomy_map, comparison_df, topic_df, progress=gr.Progress(track_tqdm=True)):
    """Generate the Section 7 narrative draft and save it as narrative.txt.

    Args:
        taxonomy_map: Taxonomy map from session state (may be None/empty).
        comparison_df: Comparison table from session state (may be None).
        topic_df: Topic table from session state (may be None).
        progress: Gradio progress tracker.

    Returns:
        ``(narrative_text, download_button_update)`` for
        ``[narrative_box, narrative_download]``.
    """
    # Nothing to write about unless a taxonomy map or a topic table exists.
    if not taxonomy_map and (topic_df is None or topic_df.empty):
        return "<No analysis results yet. Run the full pipeline first.>", gr.update()

    try:
        from tools import generate_section7_narrative
        _ensure_output_dir()
        progress(0.5, desc="Generating narrative...")
        narrative = generate_section7_narrative(
            taxonomy_map=taxonomy_map or {},
            comparison_df=comparison_df if comparison_df is not None else pd.DataFrame(),
            topic_df=topic_df if topic_df is not None else pd.DataFrame(),
        )
        progress(1.0, desc="Done!")
        saved_path = _safe_save_text(narrative, 'narrative.txt')
        # NOTE(review): _safe_save_text appears to yield a falsy value when the
        # save fails (other call sites guard on it); leave the button untouched
        # in that case.
        download = gr.update(value=saved_path) if saved_path else gr.update()
        return narrative, download
    except Exception as e:
        # UI boundary: show the message to the user, keep the traceback in logs.
        logger.error(f"Narrative generation error: {e}", exc_info=True)
        return f"Narrative generation failed: {e}", gr.update()
| btn_run_narrative.click( | |
| fn=handle_narrative, | |
| inputs=[state_taxonomy_map, state_comparison_df, state_topic_df], | |
| outputs=[narrative_box, narrative_download] | |
| ) | |
| state_agent_result.change( | |
| fn=lambda r: (r.get('narrative', '') if r else '', gr.update()), | |
| inputs=[state_agent_result], | |
| outputs=[narrative_box, narrative_download] | |
| ) | |
| btn_copy.click( | |
| fn=lambda text: "β Copied! (use Ctrl+C if clipboard API unavailable)", | |
| inputs=[narrative_box], | |
| outputs=[copy_status], | |
| js="""(text) => { | |
| navigator.clipboard.writeText(text).then( | |
| () => console.log('Copied'), | |
| () => console.warn('Clipboard API unavailable') | |
| ); | |
| return 'β Copied to clipboard!'; | |
| }""" | |
| ) | |
| btn_citation.click( | |
| fn=lambda td: _generate_apa_citation(td), | |
| inputs=[state_topic_df], | |
| outputs=[citation_output] | |
| ) | |
| # ================================================================== | |
| # TAB 6 β Research Intelligence Dashboard | |
| # ================================================================== | |
| with gr.Tab("π Research Intelligence Dashboard"): | |
| gr.Markdown("## Research Intelligence Dashboard") | |
| gr.Markdown( | |
| "*Dashboard populates automatically after pipeline completion.*" | |
| ) | |
| with gr.Row(): | |
| card_topics = gr.Markdown("**--**\nTotal Topics", elem_id="card_topics") | |
| card_novel = gr.Markdown("**--**\nNovel Themes", elem_id="card_novel") | |
| card_coverage = gr.Markdown("**--**\nPAJAIS Coverage", elem_id="card_coverage") | |
| card_publishable = gr.Markdown("**--**\nPublishable Gaps", elem_id="card_publishable") | |
| with gr.Row(): | |
| plot_dist = gr.Plot(label="Topic Distribution", elem_id="plot_dist") | |
| plot_pie = gr.Plot(label="Mapped vs Novel", elem_id="plot_pie") | |
| plot_top15 = gr.Plot( | |
| label="Top 15 Topics by Document Count", | |
| elem_id="plot_top15" | |
| ) | |
| supplementary_panel = gr.Markdown(elem_id="supplementary_panel") | |
def update_dashboard(result, topic_df, taxonomy_map):
    """Refresh the dashboard cards, plots and supplementary panel.

    Args:
        result: Pipeline result dict from session state, or None.
        topic_df: Topic table used for the distribution plots.
        taxonomy_map: Taxonomy map used for the pie chart and publishable count.

    Returns:
        Eight values: four card markdown strings, three figures (or None),
        and the supplementary-insights markdown.
    """
    if result is None:
        return (
            "**--**\nTotal Topics", "**--**\nNovel Themes",
            "**--**\nPAJAIS Coverage", "**--**\nPublishable Gaps",
            None, None, None, ""
        )

    def _card(value, label):
        return (
            f"<div class='metric-card'><span class='metric-value'>{value}</span>"
            f"<span class='metric-label'>{label}</span></div>"
        )

    try:
        n_topics = result.get('topic_count', 0)
        n_novel = result.get('novel_count', 0)
        # `or` fallbacks: a key present with a None value would otherwise break
        # the numeric format specs / nested .get calls and blank the dashboard.
        coverage = result.get('pajais_coverage_pct') or 0.0
        publishable = (taxonomy_map or {}).get('publishable_novel_themes') or []
        pub_count = len(publishable)

        c1 = _card(n_topics, "Total Topics")
        c2 = _card(n_novel, "Novel Themes")
        c3 = _card(f"{coverage:.0f}%", "PAJAIS Coverage")
        c4 = _card(pub_count, "Publishable Gaps")

        fig_dist = _plot_topic_distribution(topic_df)
        fig_pie = _plot_mapped_novel_pie(taxonomy_map)
        # NOTE(review): the "Top 15" plot is produced by the same helper as the
        # distribution plot, so both panels show the same chart - confirm
        # whether a dedicated top-15 plot was intended.
        fig_top15 = _plot_topic_distribution(topic_df)

        supp = result.get('supplementary_insights') or {}
        blind = supp.get('blind_spot_theme') or {}
        golden = supp.get('golden_year') or {}

        supp_md = ""
        if blind:
            supp_md += (
                f"\n### π― High-Frequency Unaddressed Theme\n"
                f"**{blind.get('label', 'Unknown')}** β "
                f"appears in **{blind.get('doc_count', 0)} documents** "
                f"but has not been formally addressed in PAJAIS.\n\n"
                f"*First-mover publication advantage is estimated at 18β24 months.*\n\n"
                f"**Top words:** {blind.get('top_words', '')}\n"
            )
        if golden:
            entropy = golden.get('entropy') or 0
            supp_md += (
                f"\n### π Peak Research Diversity Year\n"
                f"**{golden.get('year', 'N/A')}** showed the greatest topic diversity "
                f"(Shannon entropy = {entropy:.3f})\n"
            )

        return c1, c2, c3, c4, fig_dist, fig_pie, fig_top15, supp_md
    except Exception as e:
        # Keep the traceback in the logs; the UI only gets the short message.
        logger.error(f"Dashboard update failed: {e}", exc_info=True)
        return (
            "Error", "Error", "Error", "Error",
            None, None, None, f"Dashboard error: {e}"
        )
| state_agent_result.change( | |
| fn=update_dashboard, | |
| inputs=[state_agent_result, state_topic_df, state_taxonomy_map], | |
| outputs=[ | |
| card_topics, card_novel, card_coverage, card_publishable, | |
| plot_dist, plot_pie, plot_top15, supplementary_panel | |
| ] | |
| ) | |
| # ================================================================== | |
| # TAB A — SPECTER2 Clusters (Phase 2.5: SPECTER2 -> UMAP -> HDBSCAN) | |
| # ================================================================== | |
| with gr.Tab("π΅ SPECTER2 Clusters"): | |
| gr.Markdown("## Phase 2.5: Semantic Clustering via SPECTER2 β UMAP β HDBSCAN") | |
| gr.Markdown( | |
| "Each paper is represented by **one 768-dim SPECTER2 vector** computed from its " | |
| "combined Title + Abstract column (DOI-keyed). " | |
| "UMAP reduces dimensions (cosine metric, 50D), then HDBSCAN clusters with an " | |
| "automatic parameter sweep to land in the **15β30 cluster** target range. " | |
| "Clusters with fewer than 5 or more than 100 papers are automatically merged/split. " | |
| "Intra-cluster cosine similarity is kept in the **0.50β0.60** band. " | |
| "The 3 most representative paper titles per cluster are sent to " | |
| "**Mistral + Gemini + HuggingFace** (all free) for labeling β majority vote wins." | |
| ) | |
| with gr.Accordion("βοΈ Clustering Parameters", open=False): | |
| with gr.Row(): | |
| min_cs_slider = gr.Slider( | |
| 2, 20, value=5, step=1, | |
| label="Min Cluster Size (papers)", | |
| info="Papers < this β merged into nearest cluster" | |
| ) | |
| max_cs_slider = gr.Slider( | |
| 20, 200, value=100, step=5, | |
| label="Max Cluster Size (papers)", | |
| info="Papers > this β cluster is split" | |
| ) | |
| with gr.Row(): | |
| target_min_slider = gr.Slider( | |
| 5, 20, value=15, step=1, | |
| label="Target Min Clusters", | |
| info="HDBSCAN sweep lower bound" | |
| ) | |
| target_max_slider = gr.Slider( | |
| 15, 40, value=30, step=1, | |
| label="Target Max Clusters", | |
| info="HDBSCAN sweep upper bound" | |
| ) | |
| with gr.Row(): | |
| sim_low_slider = gr.Slider( | |
| 0.30, 0.70, value=0.50, step=0.01, | |
| label="Min Cosine Similarity (cluster quality)", | |
| info="Clusters below this are dissolved to noise" | |
| ) | |
| umap_neighbors_slider = gr.Slider( | |
| 5, 50, value=15, step=1, | |
| label="UMAP n_neighbors", | |
| info="Controls local vs global structure" | |
| ) | |
| with gr.Row(): | |
| btn_run_dbscan = gr.Button("βΆ Run SPECTER2 β UMAP β HDBSCAN", variant="primary") | |
| btn_llm_label = gr.Button("π€ Label Clusters (3 LLMs)", variant="secondary") | |
| dbscan_status = gr.Markdown("*Run DBSCAN or use the full pipeline from Tab 1.*") | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| gr.Markdown("### π Cluster Summary") | |
| cluster_summary_table = gr.DataFrame( | |
| label="Clusters (sorted by size)", | |
| show_label=True, | |
| wrap=True | |
| ) | |
| with gr.Column(scale=2): | |
| gr.Markdown("### π Document-Level Assignments") | |
| cluster_doc_table = gr.DataFrame( | |
| label="Per-Document Cluster Assignments", | |
| show_label=True, | |
| wrap=True | |
| ) | |
| with gr.Row(): | |
| plot_cluster_sizes = gr.Plot(label="Cluster Size Distribution") | |
| plot_noise_pie = gr.Plot(label="Clustered vs Noise") | |
| with gr.Row(): | |
| dl_cluster_docs = gr.DownloadButton("β¬ cluster_documents.csv", value=None) | |
| dl_cluster_summary = gr.DownloadButton("β¬ cluster_summary.csv", value=None) | |
| dl_cluster_labels = gr.DownloadButton("β¬ cluster_labels.csv", value=None) | |
| # ---- Handlers ---- | |
def handle_run_dbscan(
    df, existing_cluster_df, existing_summary,
    min_cs, max_cs, target_min, target_max, sim_low, umap_n,
    progress=gr.Progress(track_tqdm=True)
):
    """Cluster papers via SPECTER2 embeddings, UMAP and HDBSCAN.

    Args:
        df: Loaded journal data from session state.
        existing_cluster_df: Per-document cluster table from session state;
            when non-empty it is re-used instead of re-clustering.
        existing_summary: Cluster summary from session state; handed back
            unchanged on failure so an error never wipes it.
        min_cs, max_cs: Minimum / maximum cluster size slider values.
        target_min, target_max: Target cluster-count range slider values.
        sim_low: Lower cosine-similarity bound; the upper bound passed on is
            ``sim_low + 0.10``.
        umap_n: UMAP ``n_neighbors`` slider value.
        progress: Gradio progress tracker.

    Returns:
        Nine values for ``[dbscan_status, cluster_summary_table,
        cluster_doc_table, state_cluster_summary, plot_cluster_sizes,
        plot_noise_pie, dl_cluster_docs, dl_cluster_summary, dl_cluster_labels]``.
    """
    def _download_update(path):
        # NOTE(review): _safe_save_csv appears to yield a falsy value when the
        # save fails; leave the button untouched in that case.
        return gr.update(value=path) if path else gr.update()

    def _failure(message_html):
        return (
            message_html,
            pd.DataFrame(), pd.DataFrame(), existing_summary,
            None, None,
            gr.update(), gr.update(), gr.update(),
        )

    # Re-use results already held in session state (e.g. from the full pipeline).
    if existing_cluster_df is not None and not existing_cluster_df.empty:
        summary = get_cluster_summary(existing_cluster_df)
        fig_sz, fig_noise = _plot_cluster_charts(existing_cluster_df)
        saved_docs = _safe_save_csv(existing_cluster_df, "cluster_documents.csv")
        saved_sum = _safe_save_csv(summary, "cluster_summary.csv")
        return (
            "<div class='success-box'>β Loaded existing results.</div>",
            summary, existing_cluster_df, summary,
            fig_sz, fig_noise,
            _download_update(saved_docs), _download_update(saved_sum), gr.update(),
        )

    if df is None or df.empty:
        return _failure("<div class='error-box'>β Upload and load data first.</div>")

    # The two target sliders overlap (5-20 and 15-40), so an inverted range is
    # reachable from the UI; reject it before the expensive embedding step.
    if int(target_min) > int(target_max):
        return _failure(
            "<div class='error-box'>Target Min Clusters must not exceed "
            "Target Max Clusters.</div>"
        )

    try:
        _ensure_output_dir()
        progress(0.05, desc="Building title+abstract columnβ¦")
        df_ta = build_title_abstract_column(df)
        progress(0.15, desc="Generating SPECTER2 embeddings (may take 2-5 min)β¦")
        texts = df_ta['title_abstract'].tolist()
        embs = embed_with_specter2(texts, cache_dir='outputs/specter_cache')
        progress(0.60, desc="UMAP + HDBSCAN clusteringβ¦")
        cdf = specter2_hdbscan_cluster_topics(
            df=df_ta,
            embeddings=embs,
            min_cluster_size=int(min_cs),
            max_cluster_size=int(max_cs),
            target_min_clusters=int(target_min),
            target_max_clusters=int(target_max),
            cosine_sim_low=float(sim_low),
            cosine_sim_high=float(sim_low) + 0.10,
            umap_n_neighbors=int(umap_n),
        )
        progress(0.85, desc="Summarising clustersβ¦")
        summary = get_cluster_summary(cdf)
        progress(1.0, desc="Done!")

        fig_sz, fig_noise = _plot_cluster_charts(cdf)
        saved_docs = _safe_save_csv(cdf, "cluster_documents.csv")
        saved_sum = _safe_save_csv(summary, "cluster_summary.csv")

        # Label -1 is excluded from the cluster count.
        n_c = len(set(cdf['cluster_final']) - {-1})
        n_n = int(cdf['is_noise'].sum())
        return (
            f"<div class='success-box'>β {n_c} clusters found, {n_n} noise docs.</div>",
            summary, cdf, summary,
            fig_sz, fig_noise,
            _download_update(saved_docs), _download_update(saved_sum), gr.update(),
        )
    except Exception as e:
        # UI boundary: show the message to the user, keep the traceback in logs.
        logger.error(f"Clustering error: {e}", exc_info=True)
        return _failure(f"<div class='error-box'>β {e}</div>")
def handle_llm_label(cluster_df, cluster_summary, progress=gr.Progress(track_tqdm=True)):
    """Label clusters with the three-LLM vote using cached SPECTER2 embeddings.

    Args:
        cluster_df: Per-document cluster table from session state.
        cluster_summary: Cluster summary from session state; recomputed from
            ``cluster_df`` when None.
        progress: Gradio progress tracker.

    Returns:
        Three values for ``[dbscan_status, cluster_summary_table,
        dl_cluster_labels]``.
    """
    if cluster_df is None or cluster_df.empty:
        return (
            "<div class='error-box'>β Run clustering first.</div>",
            cluster_summary, gr.update()
        )

    try:
        _ensure_output_dir()
        import glob
        cache_files = glob.glob('outputs/specter_cache/*.npy')
        if not cache_files:
            return (
                "<div class='error-box'>β No SPECTER2 cache found. Run clustering tab first.</div>",
                cluster_summary, gr.update()
            )
        # Pick the most recently written cache. Sorting by filename only gives
        # the lexicographically last name, which need not be the newest file.
        newest_cache = max(cache_files, key=os.path.getmtime)
        embs = np.load(newest_cache)

        progress(0.2, desc="Sending clusters to LLMsβ¦")
        if cluster_summary is not None:
            # Work on a copy so the summary held in session state is not mutated.
            summary_for_labeling = cluster_summary.copy()
        else:
            summary_for_labeling = get_cluster_summary(cluster_df)

        labeled = label_clusters_3llm(
            cluster_df=cluster_df,
            cluster_summary_df=summary_for_labeling,
            embeddings=embs,
            mistral_api_key=MISTRAL_API_KEY,
            gemini_api_key=GEMINI_API_KEY,
            ollama_url=OLLAMA_URL,
            max_clusters=30,
        )
        progress(1.0, desc="Done!")

        saved = _safe_save_csv(labeled, "cluster_labels.csv")
        return (
            "<div class='success-box'>β Clusters labeled by 3 LLMs (majority vote).</div>",
            labeled,
            gr.update(value=saved) if saved else gr.update(),
        )
    except Exception as e:
        # UI boundary: show the message to the user, keep the traceback in logs.
        logger.error(f"LLM labeling error: {e}", exc_info=True)
        return (
            f"<div class='error-box'>β LLM labeling failed: {e}</div>",
            cluster_summary, gr.update()
        )
| btn_run_dbscan.click( | |
| fn=handle_run_dbscan, | |
| inputs=[ | |
| state_df, state_cluster_df, state_cluster_summary, | |
| min_cs_slider, max_cs_slider, | |
| target_min_slider, target_max_slider, | |
| sim_low_slider, umap_neighbors_slider, | |
| ], | |
| outputs=[ | |
| dbscan_status, | |
| cluster_summary_table, cluster_doc_table, state_cluster_summary, | |
| plot_cluster_sizes, plot_noise_pie, | |
| dl_cluster_docs, dl_cluster_summary, dl_cluster_labels, | |
| ] | |
| ) | |
| btn_llm_label.click( | |
| fn=handle_llm_label, | |
| inputs=[state_cluster_df, state_cluster_summary], | |
| outputs=[dbscan_status, cluster_summary_table, dl_cluster_labels] | |
| ) | |
| # Auto-populate when pipeline result loads cluster data | |
| state_cluster_df.change( | |
| fn=lambda cdf: ( | |
| get_cluster_summary(cdf) if cdf is not None and not cdf.empty else pd.DataFrame(), | |
| cdf if cdf is not None else pd.DataFrame(), | |
| ), | |
| inputs=[state_cluster_df], | |
| outputs=[cluster_summary_table, cluster_doc_table] | |
| ) | |
| # ================================================================== | |
| # TAB B β Agentic Council (Phase 6.5) | |
| # ================================================================== | |
| with gr.Tab("π§ Agentic Council"): | |
| gr.Markdown("## Phase 6.5: Dual-Model Research Council") | |
| gr.Markdown( | |
| "Three AI models independently assess the PAJAIS research gap findings:\n" | |
| "- **Mistral** (Panel A) β pragmatic applied IS perspective\n" | |
| "- **Gemini** (Panel B) β broad technology futures perspective\n" | |
| "- **Ollama** (Panel C) β deep analytical synthesis\n\n" | |
| "API keys are loaded automatically from HuggingFace Secrets " | |
| "(`MISTRAL_API_KEY`, `GEMINI_API_KEY`). " | |
| "Configure them in your Space under **Settings β Variables and Secrets**." | |
| ) | |
| # Key-status indicator β shows which secrets are present at load time | |
| _key_lines = ["**π Secret Status (loaded at startup):**"] | |
| for _label, _val in [ | |
| ("MISTRAL_API_KEY", MISTRAL_API_KEY), | |
| ("GEMINI_API_KEY", GEMINI_API_KEY), | |
| ("OLLAMA_URL", OLLAMA_URL), | |
| ]: | |
| _icon = "β present" if _val else "β missing" | |
| _key_lines.append(f"- `{_label}`: {_icon}") | |
| gr.Markdown("\n".join(_key_lines)) | |
| btn_run_council = gr.Button("π Convene Research Council", variant="primary") | |
| council_status = gr.Markdown("*Run taxonomy mapping first (Tab 3 or full pipeline).*") | |
| with gr.Row(): | |
| with gr.Column(): | |
| gr.Markdown("### π’ Panel A β Mistral") | |
| mistral_output = gr.Textbox( | |
| label="Mistral Assessment", | |
| lines=18, | |
| interactive=False, | |
| show_label=True, | |
| ) | |
| with gr.Column(): | |
| gr.Markdown("### π΅ Panel B β Gemini") | |
| gemini_output = gr.Textbox( | |
| label="Gemini Assessment", | |
| lines=18, | |
| interactive=False, | |
| show_label=True, | |
| ) | |
| with gr.Column(): | |
| gr.Markdown("### π£ Panel C β Ollama") | |
| ollama_output = gr.Textbox( | |
| label="Ollama Assessment", | |
| lines=18, | |
| interactive=False, | |
| show_label=True, | |
| ) | |
| dl_council = gr.DownloadButton("β¬ council_report.json", value=None) | |
| # ---- Handler ---- | |
def handle_run_council(
    taxonomy_map, topic_df,
    progress=gr.Progress(track_tqdm=True)
):
    """Convene the research council and populate the council tab.

    The click wiring for this handler declares five outputs, in order:
    status markdown, Mistral text, Gemini text, Ollama text, and the
    council-report download button. Every exit path therefore returns
    exactly five values.

    Args:
        taxonomy_map: Taxonomy mapping held in UI state; a falsy value
            means the mapping phase has not been run yet.
        topic_df: Topic table held in UI state, forwarded unchanged to
            ``run_agentic_council``.
        progress: Gradio progress tracker used to report stage updates.

    Returns:
        A 5-tuple ``(status_html, mistral_text, gemini_text, ollama_text,
        download_update)``.
    """

    def _failure(message):
        # One value per wired output component. The original guard-clause
        # returns only supplied four, which did not match the five outputs.
        return (
            f"<div class='error-box'>{message}</div>",
            "", "", "", gr.update()
        )

    if not taxonomy_map:
        return _failure("β Run taxonomy mapping first (Tab 3 or full pipeline).")

    if not any([MISTRAL_API_KEY, GEMINI_API_KEY]):
        return _failure(
            "β No API keys found. "
            "Add MISTRAL_API_KEY and/or GEMINI_API_KEY "
            "in your Space Settings β Variables and Secrets."
        )

    try:
        _ensure_output_dir()
        progress(0.1, desc="Preparing findingsβ¦")
        result = run_agentic_council(
            taxonomy_map=taxonomy_map,
            topic_df=topic_df,
            mistral_api_key=MISTRAL_API_KEY,
            gemini_api_key=GEMINI_API_KEY,
            ollama_url=OLLAMA_URL,
        )
        progress(0.9, desc="Saving reportβ¦")
        # NOTE(review): presumably returns the saved file's path (or None on
        # failure); it is passed straight to the DownloadButton as its value.
        saved = _safe_save_json(result, "council_report.json")
        progress(1.0, desc="Council complete!")
        status = "<div class='success-box'>β Council complete. See assessments below.</div>"
        return (
            status,
            result.get("mistral", ""),
            result.get("gemini", ""),
            result.get("ollama", ""),
            gr.update(value=saved),
        )
    except Exception as e:
        # UI boundary: keep the full traceback in the log and show a short
        # message in the status panel instead of crashing the event.
        logger.exception("Agentic council run failed")
        return _failure(f"β Council failed: {e}")
| btn_run_council.click( | |
| fn=handle_run_council, | |
| inputs=[state_taxonomy_map, state_topic_df], | |
| outputs=[council_status, mistral_output, gemini_output, ollama_output, dl_council] | |
| ) | |
| # Auto-fill if council already ran (e.g. via full pipeline) | |
| state_council_result.change( | |
| fn=lambda cr: ( | |
| cr.get("mistral", "") if cr else "", | |
| cr.get("gemini", "") if cr else "", | |
| cr.get("ollama", "") if cr else "", | |
| ), | |
| inputs=[state_council_result], | |
| outputs=[mistral_output, gemini_output, ollama_output] | |
| ) | |
| # ================================================================== | |
| # TAB 7 β Export Center | |
| # ================================================================== | |
| with gr.Tab("π¦ Export Center"): | |
| gr.Markdown("## Export Center & Methodology Notes") | |
| with gr.Row(): | |
| # BUG 5 FIX: all value=None β updated dynamically after pipeline | |
| dl_topic = gr.DownloadButton( | |
| "β¬ topic_review_table.csv", | |
| value=None, | |
| elem_id="dl_topic" | |
| ) | |
| dl_mapping = gr.DownloadButton( | |
| "β¬ pajais_mapping.csv", | |
| value=None, | |
| elem_id="dl_mapping" | |
| ) | |
| dl_comparison = gr.DownloadButton( | |
| "β¬ comparison.csv", | |
| value=None, | |
| elem_id="dl_comparison" | |
| ) | |
| with gr.Row(): | |
| dl_taxonomy = gr.DownloadButton( | |
| "β¬ taxonomy_map.json", | |
| value=None, | |
| elem_id="dl_taxonomy" | |
| ) | |
| dl_narrative = gr.DownloadButton( | |
| "β¬ narrative.txt", | |
| value=None, | |
| elem_id="dl_narrative" | |
| ) | |
| dl_log = gr.DownloadButton( | |
| "β¬ agent.log", | |
| value=str(OUTPUTS_DIR / "agent.log"), | |
| elem_id="dl_log" | |
| ) | |
| btn_download_all = gr.Button( | |
| "π¦ Download All as ZIP", | |
| variant="primary", | |
| elem_id="btn_download_all" | |
| ) | |
| zip_output = gr.File( | |
| label="All Artifacts (ZIP)", | |
| elem_id="zip_output", | |
| visible=False | |
| ) | |
def handle_download_all():
    """Build the ZIP of all artifacts and reveal it in the file widget.

    Returns:
        A single Gradio update for ``zip_output``: the archive path with the
        widget made visible when ``_make_zip()`` returns a truthy value,
        otherwise a cleared, hidden widget.
    """
    zip_path = _make_zip()
    if zip_path:
        return gr.update(value=zip_path, visible=True)
    # A File component's value must be a file path, so the "nothing to
    # download" message is shown as a toast instead of being assigned as
    # the value.
    gr.Warning("No files to download yet.")
    return gr.update(value=None, visible=False)

# zip_output is listed once: a single update carries both the value and the
# visibility, instead of wiring the same component as two outputs.
btn_download_all.click(
    fn=handle_download_all,
    inputs=[],
    outputs=[zip_output]
)
| gr.Markdown("---") | |
| btn_print_summary = gr.Button( | |
| "π¨ Print-Ready Summary", | |
| variant="secondary", | |
| elem_id="btn_print_summary" | |
| ) | |
| print_summary_output = gr.Markdown(elem_id="print_summary_output") | |
| btn_print_summary.click( | |
| fn=lambda td, tm: _print_ready_summary(td, tm), | |
| inputs=[state_topic_df, state_taxonomy_map], | |
| outputs=[print_summary_output] | |
| ) | |
| gr.Markdown("---") | |
| gr.Markdown( | |
| """ | |
| ## π Methodology Notes | |
| ### LDA Topic Modeling | |
| This system uses **Latent Dirichlet Allocation (LDA)** implemented via the | |
| [Gensim](https://radimrehurek.com/gensim/) library. LDA is a generative | |
| probabilistic model that discovers latent thematic structures in a text | |
| corpus by modeling each document as a mixture of topics and each topic as | |
| a distribution over words. The pipeline includes bigram phrase detection, | |
| TF-IDF filtering, and UMass coherence scoring to ensure topic quality. | |
| ### PAJAIS Taxonomy (20 Themes) | |
| The 20 canonical PAJAIS themes span IS Strategy, Digital Transformation, | |
| IT Adoption, Knowledge Management, E-Commerce, AI/ML, Blockchain, | |
| Healthcare IS, Social Media, Big Data, Cloud Computing, Cybersecurity, | |
| IS in Asia-Pacific, Mobile Computing, IS Research Methods, Organizational IS, | |
| HCI, IS Education, Sustainability, and FinTech. | |
| ### Coherence Scoring & Publishability | |
| Topic coherence is measured using the UMass metric, which captures semantic | |
| relatedness among top topic words. A topic is deemed **publishable** when | |
| it meets two thresholds: `doc_count > 5` (sufficient scholarly attention) | |
| and `coherence > 0.30` (semantic stability). | |
| ### Abstract vs Title Methodology | |
| Separate LDA models are trained on article abstracts and titles independently. | |
| Topics appearing exclusively in abstracts represent **latent constructs** β | |
| ideas actively studied but not yet positioned as headline contributions. | |
| Topics exclusive to titles signal **positioning keywords** favored by authors | |
| as first-impression signals to reviewers and readers. | |
| ### DBSCAN Semantic Clustering | |
| Papers are embedded using TF-IDF β Truncated SVD (LSA) for both title and | |
| abstract text independently. DBSCAN is applied to each embedding space with | |
| configurable Ξ΅ and min_samples parameters. Cluster assignments are merged | |
| via a weighted vote (configurable abstract weight). Large clusters are | |
| recursively bisected; tiny clusters with fewer than min_membership documents | |
| are reassigned to their nearest valid cluster or marked as noise. | |
| ### Agentic Research Council | |
| The council convenes three independent AI models (Mistral, Gemini, and Ollama) | |
| to assess the gap analysis findings from complementary epistemological | |
| perspectives. Each panel member produces a structured assessment of the | |
| most publishable gaps, methodological recommendations, and regional focus. | |
| Their independent outputs can be compared side-by-side to identify | |
| consensus positions and productive disagreements. | |
| --- | |
| *Built for PAJAIS Research Intelligence* | |
| """ | |
| ) | |
| # ================================================================== | |
| # Wire handle_full_run outputs to all DownloadButtons + new states | |
| # ================================================================== | |
| btn_full_run.click( | |
| fn=handle_full_run, | |
| inputs=[file_input], | |
| outputs=[ | |
| validation_info, preview_df, | |
| state_df, state_agent_result, | |
| state_topic_df, state_comparison_df, state_taxonomy_map, | |
| state_lda_result, | |
| error_display, | |
| # BUG 5 FIX: wire paths back to DownloadButtons across all tabs | |
| topic_download, # Tab 2 | |
| mapping_download, # Tab 3 | |
| comparison_download, # Tab 4 | |
| dl_taxonomy, # Export Center | |
| narrative_download, # Tab 5 | |
| dl_topic, # Export Center duplicate | |
| dl_mapping, # Export Center duplicate | |
| # New: cluster + council states populated if agent ran them | |
| state_cluster_df, | |
| state_cluster_summary, | |
| state_council_result, | |
| ] | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # Launch | |
| # --------------------------------------------------------------------------- | |
if __name__ == "__main__":
    # Start the Gradio server only when this file is run as a script.
    # Listens on all interfaces at port 7860 and surfaces handler errors in
    # the UI (show_error=True).
    # NOTE(review): 7860 is presumably the port the hosting Space expects — confirm.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )