# Source: HuggingFace Space "Topic-Modelling", file app.py
# Revision 7f126af ("Update app.py", uploaded by Shivani-Bhat), 83.1 kB
# =============================================================================
# app.py -- PAJAIS Research Intelligence Agent
# Gradio 4.x web application for HuggingFace Spaces
# FIXES: Light/readable theme + working CSV/JSON exports
# BUGFIXES (v2):
# Bug 1 (tools.py generate_taxonomy_map) - DataFrame.get() -> KeyError in Phase 5
# Bug 2 (tools.py generate_section7_narrative) - DataFrame.get() -> crash in Phase 6
# Bug 3 (agent.py _phase5_5_mapping_display) - DataFrame.get() -> pajais_mapping.csv never written
# Bug 4 (app.py handle_mapping) - returned 6 values but outputs= expected 5
# Bug 5 (app.py DownloadButton) - static value= pointed to nonexistent paths at startup
# ADDITIONS (v3):
# Tab A — 🔵 DBSCAN Clusters (Phase 2.5: Semantic Clustering via DBSCAN)
# Tab B — 🧠 Agentic Council (Phase 6.5: Multi-Model Research Council)
# =============================================================================
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg') # Must appear before pyplot import
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import zipfile
import tempfile
import json
import logging
import os
import random
from pathlib import Path
from typing import Optional, Tuple, Dict, Any
from agent import PAJAISResearchAgent, AnalysisConfig
from tools import (
load_journal_csv, validate_dataframe,
PAJAIS_THEMES, export_all_artifacts,
# Unified clustering pipeline (all now in tools.py)
build_title_abstract_column,
embed_with_specter2,
specter2_hdbscan_cluster_topics,
get_cluster_summary,
label_clusters_3llm,
run_agentic_council,
)
# Module-level logger shared by every helper and handler in this file.
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Ensure outputs directory exists at startup
# (runs at import time; the save helpers write their artefacts here)
# ---------------------------------------------------------------------------
OUTPUTS_DIR = Path("outputs")
OUTPUTS_DIR.mkdir(exist_ok=True)
# ---------------------------------------------------------------------------
# API Keys — loaded from HuggingFace Secrets (Environment Variables)
# Set these in your Space: Settings → Variables and Secrets
# MISTRAL_API_KEY → your Mistral key (sk-...)
# GEMINI_API_KEY → your Google key (AIza...)
# Each value falls back to an empty string when the variable is unset.
# NOTE(review): presumably consumed by the council / labelling tabs further
# down the file — confirm, they are not referenced in this part.
# ---------------------------------------------------------------------------
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
# Base URL of the Ollama server; defaults to a local instance on port 11434.
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
# ---------------------------------------------------------------------------
# Custom CSS — Light, readable theme that works on HuggingFace Spaces
# ---------------------------------------------------------------------------
CUSTOM_CSS = """
/* ── Reset Gradio dark overrides ─────────────────────────────────────── */
.gradio-container,
.gradio-container *,
body {
color: #1a1a2e !important;
}
/* ── Page background ─────────────────────────────────────────────────── */
.gradio-container {
background: #f0f4f8 !important;
font-family: 'Segoe UI', system-ui, sans-serif !important;
max-width: 1200px !important;
margin: 0 auto !important;
}
/* ── Tabs ────────────────────────────────────────────────────────────── */
.tab-nav {
background: #ffffff !important;
border-bottom: 2px solid #c9d6e3 !important;
}
.tab-nav button {
background: #ffffff !important;
color: #3a4a5c !important;
border: none !important;
font-weight: 500 !important;
padding: 10px 18px !important;
font-family: 'Segoe UI', system-ui, sans-serif !important;
}
.tab-nav button.selected,
.tab-nav button:focus {
background: #1a56db !important;
color: #ffffff !important;
border-radius: 6px 6px 0 0 !important;
}
/* ── Buttons ─────────────────────────────────────────────────────────── */
.gr-button-primary,
button[variant="primary"],
button.primary {
background: #1a56db !important;
color: #ffffff !important;
border: none !important;
border-radius: 8px !important;
font-weight: 600 !important;
padding: 10px 20px !important;
}
.gr-button-primary:hover,
button[variant="primary"]:hover {
background: #1341b0 !important;
}
.gr-button-secondary,
button[variant="secondary"],
button.secondary {
background: #ffffff !important;
color: #1a56db !important;
border: 2px solid #1a56db !important;
border-radius: 8px !important;
font-weight: 500 !important;
padding: 8px 18px !important;
}
.gr-button-secondary:hover {
background: #e8f0fe !important;
}
/* ── Inputs / Textboxes ──────────────────────────────────────────────── */
input,
textarea,
.gr-textbox,
.gr-input,
.gr-box {
background: #ffffff !important;
color: #1a1a2e !important;
border: 1px solid #c9d6e3 !important;
border-radius: 6px !important;
font-family: 'Courier New', monospace !important;
}
input:focus,
textarea:focus {
border-color: #1a56db !important;
outline: none !important;
box-shadow: 0 0 0 3px rgba(26,86,219,0.15) !important;
}
/* ── DataFrames / Tables ─────────────────────────────────────────────── */
.gr-dataframe,
.gr-dataframe table {
background: #ffffff !important;
color: #1a1a2e !important;
border: 1px solid #c9d6e3 !important;
border-radius: 8px !important;
overflow: hidden !important;
}
.gr-dataframe th {
background: #1a56db !important;
color: #ffffff !important;
font-weight: 600 !important;
padding: 10px 14px !important;
border: none !important;
}
.gr-dataframe td {
background: #ffffff !important;
color: #1a1a2e !important;
border-bottom: 1px solid #e8eef5 !important;
padding: 8px 14px !important;
}
.gr-dataframe tr:nth-child(even) td {
background: #f7fafc !important;
}
.gr-dataframe tr:hover td {
background: #e8f0fe !important;
}
/* ── Cards / Panels ──────────────────────────────────────────────────── */
.metric-card {
background: #ffffff;
border: 1px solid #c9d6e3;
border-radius: 12px;
padding: 24px 20px;
text-align: center;
margin: 6px;
box-shadow: 0 2px 8px rgba(0,0,0,0.06);
}
.metric-value {
font-size: 2.4em;
font-weight: 700;
color: #1a56db;
font-family: 'Georgia', serif;
display: block;
}
.metric-label {
color: #5a6a7a;
font-size: 0.9em;
margin-top: 6px;
display: block;
font-weight: 500;
}
/* ── Status boxes ────────────────────────────────────────────────────── */
.error-box {
background: #fff0f0;
border: 1px solid #e53e3e;
border-left: 4px solid #e53e3e;
border-radius: 6px;
padding: 12px 16px;
color: #c53030;
font-weight: 500;
}
.success-box {
background: #f0fff4;
border: 1px solid #38a169;
border-left: 4px solid #38a169;
border-radius: 6px;
padding: 12px 16px;
color: #276749;
font-weight: 500;
}
.info-panel {
background: #ebf5fb;
border: 1px solid #bee3f8;
border-left: 4px solid #1a56db;
border-radius: 8px;
padding: 16px;
margin: 10px 0;
color: #1a1a2e;
}
/* ── Tags ────────────────────────────────────────────────────────────── */
.novel-tag {
background: #fff0f0;
color: #c53030;
padding: 3px 10px;
border-radius: 12px;
font-size: 0.82em;
font-weight: 600;
border: 1px solid #fed7d7;
}
.mapped-tag {
background: #e6fffa;
color: #234e52;
padding: 3px 10px;
border-radius: 12px;
font-size: 0.82em;
font-weight: 600;
border: 1px solid #b2f5ea;
}
/* ── Section headings ────────────────────────────────────────────────── */
.section-header {
font-family: 'Georgia', serif;
color: #1a1a2e;
border-bottom: 3px solid #1a56db;
padding-bottom: 8px;
margin-bottom: 18px;
}
/* ── Accordion ───────────────────────────────────────────────────────── */
.gr-accordion {
background: #ffffff !important;
border: 1px solid #c9d6e3 !important;
border-radius: 8px !important;
color: #1a1a2e !important;
}
.gr-accordion summary {
color: #1a1a2e !important;
font-weight: 600 !important;
}
/* ── Markdown prose ──────────────────────────────────────────────────── */
.gr-markdown,
.prose {
color: #1a1a2e !important;
}
.gr-markdown h1, .gr-markdown h2, .gr-markdown h3 {
color: #1a1a2e !important;
}
.gr-markdown a {
color: #1a56db !important;
}
/* ── File upload area ────────────────────────────────────────────────── */
.gr-file {
background: #ffffff !important;
border: 2px dashed #c9d6e3 !important;
border-radius: 10px !important;
color: #1a1a2e !important;
}
.gr-file:hover {
border-color: #1a56db !important;
background: #f0f6ff !important;
}
/* ── Plot containers ─────────────────────────────────────────────────── */
.gr-plot {
background: #ffffff !important;
border: 1px solid #c9d6e3 !important;
border-radius: 8px !important;
padding: 12px !important;
}
/* ── Print-ready summary ─────────────────────────────────────────────── */
.print-ready {
background: #ffffff;
color: #1a1a2e;
font-family: 'Times New Roman', serif;
padding: 28px;
border-radius: 6px;
border: 1px solid #c9d6e3;
}
/* ── Download buttons ────────────────────────────────────────────────── */
.gr-download-button {
background: #f0f6ff !important;
color: #1a56db !important;
border: 1px solid #1a56db !important;
border-radius: 8px !important;
font-weight: 500 !important;
}
.gr-download-button:hover {
background: #1a56db !important;
color: #ffffff !important;
}
/* ── Labels ──────────────────────────────────────────────────────────── */
label, .gr-label {
color: #2d3748 !important;
font-weight: 600 !important;
}
"""
# ---------------------------------------------------------------------------
# Helper functions
# ---------------------------------------------------------------------------
def _make_agent() -> PAJAISResearchAgent:
    """Build a new PAJAISResearchAgent driven by a default AnalysisConfig."""
    default_config = AnalysisConfig()
    return PAJAISResearchAgent(default_config)
def _ensure_output_dir():
    """Create the module-level OUTPUTS_DIR directory if it does not exist yet.

    A no-op when the directory is already present (``exist_ok=True``).
    Parent directories are not created.
    """
    OUTPUTS_DIR.mkdir(exist_ok=True)
def _safe_save_csv(df: pd.DataFrame, filename: str) -> str:
    """Write *df* as CSV (index column omitted) into the outputs directory.

    The outputs directory is created first if needed. Returns the path of
    the written file as a string.
    """
    _ensure_output_dir()
    target = OUTPUTS_DIR / filename
    df.to_csv(target, index=False)
    return str(target)
def _safe_save_json(data: dict, filename: str) -> str:
    """Save *data* as indented JSON in the outputs directory.

    Values the ``json`` module cannot encode natively are converted by a
    fallback: NumPy booleans/integers/floats become Python scalars, arrays
    and Series become lists, DataFrames become lists of row dicts, sets
    become sorted lists, and anything else is stringified.

    Returns the path of the written file as a string.
    """
    _ensure_output_dir()
    path = OUTPUTS_DIR / filename

    def _json_serial(obj):
        # np.bool_ is not a subclass of np.integer; without this branch it
        # would fall through to str() and be written as "True"/"False".
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, pd.DataFrame):
            return obj.to_dict(orient='records')
        if isinstance(obj, pd.Series):
            return obj.tolist()
        if isinstance(obj, (set, frozenset)):
            # Sort by string form so the output is deterministic.
            return sorted(obj, key=str)
        # Last resort: keep the export going rather than raising.
        return str(obj)

    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, default=_json_serial)
    return str(path)
def _safe_save_text(text: str, filename: str) -> str:
    """Write *text* as a UTF-8 file in the outputs directory.

    The outputs directory is created first if needed. Returns the path of
    the written file as a string.
    """
    _ensure_output_dir()
    target = OUTPUTS_DIR / filename
    target.write_text(text, encoding='utf-8')
    return str(target)
def _plot_topic_distribution(topic_df: pd.DataFrame) -> Optional[plt.Figure]:
    """Draw a horizontal bar chart of document counts for the first 15 topics.

    Bars are red for rows whose ``status`` is ``'NOVEL'`` and blue otherwise;
    when there is no ``status`` column every bar is treated as MAPPED.
    Missing ``label`` / ``doc_count`` columns fall back to positional indices.

    Returns the matplotlib Figure, or None when *topic_df* is None/empty or
    plotting fails (the failure is logged).
    """
    if topic_df is None or topic_df.empty:
        return None
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(10, 5), facecolor='#ffffff')
        ax.set_facecolor('#f7fafc')
        top15 = topic_df.head(15)
        n_bars = len(top15)
        # Size the fallback to the real number of bars rather than a fixed 15.
        statuses = (
            top15['status'] if 'status' in top15.columns
            else ['MAPPED'] * n_bars
        )
        colors = ['#e53e3e' if s == 'NOVEL' else '#1a56db' for s in statuses]
        ax.barh(
            top15['label'] if 'label' in top15.columns else range(n_bars),
            top15['doc_count'] if 'doc_count' in top15.columns else range(n_bars),
            color=colors,
            edgecolor='white',
            linewidth=0.5
        )
        ax.set_xlabel('Document Count', color='#2d3748', fontsize=11)
        ax.set_title('Top 15 Topics by Document Frequency', color='#1a1a2e',
                     fontsize=13, fontweight='bold', pad=14)
        ax.tick_params(colors='#2d3748', labelsize=9)
        ax.spines['bottom'].set_color('#c9d6e3')
        ax.spines['left'].set_color('#c9d6e3')
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        novel_patch = mpatches.Patch(color='#e53e3e', label='NOVEL')
        mapped_patch = mpatches.Patch(color='#1a56db', label='MAPPED')
        ax.legend(handles=[novel_patch, mapped_patch], facecolor='#ffffff',
                  labelcolor='#2d3748', edgecolor='#c9d6e3')
        # Lay out this figure explicitly instead of pyplot's global
        # "current figure", which another handler could have changed.
        fig.tight_layout()
        return fig
    except Exception as e:
        logger.error("Plot error: %s", e)
        # pyplot keeps every figure alive until it is closed; release the
        # half-built one so repeated failures do not accumulate memory.
        if fig is not None:
            plt.close(fig)
        return None
def _plot_mapped_novel_pie(taxonomy_map: Dict) -> Optional[plt.Figure]:
    """Draw a pie chart of MAPPED vs NOVEL topic counts.

    Counts are read from ``taxonomy_map['gap_analysis']`` (``mapped_count``
    and ``novel_count``). Missing counts are treated as 0.

    Returns the matplotlib Figure, or None when *taxonomy_map* is empty,
    when there is nothing to plot (both counts are zero/missing), or when
    plotting fails (the failure is logged).
    """
    if not taxonomy_map:
        return None
    fig = None
    try:
        gap = taxonomy_map.get('gap_analysis', {})
        # Default to 0, not 1: a missing count must not be drawn as a
        # made-up 50/50 split.
        mapped = gap.get('mapped_count', 0)
        novel = gap.get('novel_count', 0)
        if mapped + novel <= 0:
            # A pie with a zero total cannot be normalised.
            return None
        fig, ax = plt.subplots(figsize=(5, 5), facecolor='#ffffff')
        ax.set_facecolor('#ffffff')
        wedges, texts, autotexts = ax.pie(
            [mapped, novel],
            labels=['MAPPED', 'NOVEL'],
            colors=['#1a56db', '#e53e3e'],
            autopct='%1.1f%%',
            startangle=90,
            textprops={'color': '#1a1a2e', 'fontsize': 11}
        )
        # Percentages sit on the coloured wedges, so render them in white.
        for at in autotexts:
            at.set_color('#ffffff')
            at.set_fontweight('bold')
        ax.set_title('Topic Classification', color='#1a1a2e', fontsize=13,
                     fontweight='bold', pad=14)
        fig.tight_layout()
        return fig
    except Exception as e:
        logger.error("Pie chart error: %s", e)
        # Release the partially built figure from pyplot's registry.
        if fig is not None:
            plt.close(fig)
        return None
def _plot_cluster_charts(cluster_df: pd.DataFrame):
    """Build the cluster-size histogram and the clustered-vs-noise pie chart.

    Rows whose ``cluster_final`` equals -1 are counted as noise; all other
    rows are counted as clustered.

    Returns ``(fig_sizes, fig_noise_pie)``. Returns ``(None, None)`` when
    *cluster_df* is None, empty, lacks a ``cluster_final`` column, or when
    plotting fails (the failure is logged).
    """
    if (
        cluster_df is None
        or cluster_df.empty
        or "cluster_final" not in cluster_df.columns
    ):
        return None, None
    fig_sz = fig_noise = None
    try:
        is_noise = cluster_df["cluster_final"] == -1
        # Size distribution
        sizes = cluster_df.loc[~is_noise, "cluster_final"].value_counts().values
        fig_sz, ax_sz = plt.subplots(figsize=(9, 4), facecolor="#ffffff")
        ax_sz.set_facecolor("#f7fafc")
        # At least one bin: with no clusters len(sizes) is 0 and hist()
        # rejects bins=0.
        ax_sz.hist(sizes, bins=max(1, min(30, len(sizes))),
                   color="#1a56db", edgecolor="white")
        ax_sz.set_xlabel("Cluster Size (docs)", color="#2d3748", fontsize=10)
        ax_sz.set_ylabel("# Clusters", color="#2d3748", fontsize=10)
        ax_sz.set_title("Cluster Size Distribution", color="#1a1a2e", fontweight="bold")
        ax_sz.spines["top"].set_visible(False)
        ax_sz.spines["right"].set_visible(False)
        fig_sz.tight_layout()
        # Noise pie
        n_clustered = int((~is_noise).sum())
        n_noise = int(is_noise.sum())
        fig_noise, ax_n = plt.subplots(figsize=(4, 4), facecolor="#ffffff")
        wedges, texts, autotexts = ax_n.pie(
            [n_clustered, n_noise],
            labels=["Clustered", "Noise"],
            colors=["#1a56db", "#e53e3e"],
            autopct="%1.1f%%", startangle=90,
            textprops={"color": "#1a1a2e", "fontsize": 11},
        )
        for at in autotexts:
            at.set_color("#ffffff")
            at.set_fontweight("bold")
        ax_n.set_title("Clustered vs Noise", color="#1a1a2e", fontweight="bold")
        fig_noise.tight_layout()
        return fig_sz, fig_noise
    except Exception as e:
        logger.error("Cluster chart error: %s", e)
        # Close whatever was created so failed calls do not leak figures.
        for fig in (fig_sz, fig_noise):
            if fig is not None:
                plt.close(fig)
        return None, None
def _generate_publication_pitch(novel_label: str) -> str:
    """Compose a one-sentence research pitch for a NOVEL theme label.

    A research method, a claimed contribution and a regional context are
    each drawn at random (in that order) from fixed phrase pools and
    slotted into a sentence template around *novel_label*.
    """
    method_pool = (
        "longitudinal survey", "mixed-methods case study",
        "experimental design", "bibliometric analysis",
        "qualitative interview study", "secondary data analysis",
        "systematic literature review", "grounded theory approach",
    )
    claim_pool = (
        "novel theoretical insights into platform dynamics",
        "empirical evidence bridging practice and IS theory",
        "a validated measurement instrument for future research",
        "cross-cultural comparative benchmarks",
        "a mid-range theory applicable to emerging markets",
        "design principles for practitioners and policymakers",
    )
    context_pool = (
        "Southeast Asian enterprise contexts",
        "China and India cross-border settings",
        "ASEAN digital economy ecosystems",
        "Asia-Pacific SME environments",
        "developing country IS adoption contexts",
        "regional fintech and digital payment infrastructures",
    )
    # Draw order (method, claim, context) is kept so a seeded RNG yields
    # the same sentence as before.
    picked_method = random.choice(method_pool)
    picked_claim = random.choice(claim_pool)
    picked_context = random.choice(context_pool)
    opening = f"Investigating **{novel_label}** in {picked_context} using a {picked_method} "
    closing = f"could contribute {picked_claim} to the PAJAIS scope of Asia-Pacific IS scholarship."
    return opening + closing
def _generate_apa_citation(topic_df: pd.DataFrame) -> str:
    """Generate a synthetic, APA-formatted PAJAIS citation string.

    Authors, year, issue and page range are drawn at random, so the result
    is a structurally valid placeholder, not a real reference. The volume
    is derived as ``year - 2005``. The title stem is a random pick from the
    first 20 ``label`` values of *topic_df*, or a generic stem when
    *topic_df* is None, empty, or has no ``label`` column.
    """
    first_names = ['J.', 'M.', 'L.', 'K.', 'S.', 'R.', 'T.', 'A.', 'C.', 'H.']
    last_names = [
        'Chen', 'Wang', 'Zhang', 'Kumar', 'Sharma', 'Lee', 'Park', 'Tan',
        'Singh', 'Patel', 'Kim', 'Nguyen', 'Lim', 'Wong', 'Choi'
    ]
    year = random.randint(2008, 2024)
    volume = year - 2005
    issue = random.randint(1, 4)
    n_authors = random.randint(2, 4)
    authors = [
        f"{random.choice(last_names)}, {random.choice(first_names)}"
        for _ in range(n_authors)
    ]
    # Always at least two authors, so the ", & last" form is always valid.
    author_str = ', '.join(authors[:-1]) + f", & {authors[-1]}"
    title_base = 'Information Systems Research'
    if topic_df is not None and not topic_df.empty and 'label' in topic_df.columns:
        title_base = random.choice(topic_df['label'].tolist()[:20])
    pages_start = random.randint(1, 80)
    pages_end = pages_start + random.randint(20, 45)
    # The page range uses a real en dash (U+2013); the previous literal was
    # a mis-decoded byte sequence.
    return (
        f"{author_str} ({year}). {title_base}: An empirical investigation "
        f"in Asia-Pacific contexts. *Pacific Asia Journal of the Association "
        f"for Information Systems*, *{volume}*({issue}), {pages_start}–{pages_end}. "
        f"https://doi.org/10.17705/1pais.{volume:02d}{issue:02d}0{pages_start:02d}"
    )
def _compute_cooccurrences(topic_df: pd.DataFrame, lda_result: Dict) -> str:
    """Report the top 5 topic pairs that co-occur more than expected.

    ``lda_result['doc_topics']`` is read as one iterable of
    ``(topic_id, probability)`` pairs per document. For each topic pair the
    observed co-occurrence (sum over documents of the product of the two
    probabilities) is divided by the value expected from the topics' mean
    probabilities, giving a lift score.

    Topic names come from ``topic_df['label']``; when that is unavailable,
    100 generic "Topic i" names are used. Topic ids outside the label range
    are ignored.

    Returns a Markdown string, or an explanatory message when no LDA result
    is available or the computation fails.
    """
    if lda_result is None or not lda_result.get('doc_topics'):
        return "Co-occurrence analysis requires a completed LDA run."
    try:
        doc_topics = lda_result['doc_topics']
        labels = (
            topic_df['label'].tolist()
            if topic_df is not None and 'label' in topic_df.columns
            else [f"Topic {i}" for i in range(100)]
        )
        n_topics = len(labels)
        n_docs = len(doc_topics)

        # Dense document x topic probability matrix.
        probs = np.zeros((n_docs, n_topics))
        marginals = np.zeros(n_topics)
        for row, doc_dist in enumerate(doc_topics):
            for tid, prob in doc_dist:
                # Lower bound matters: a negative id would otherwise wrap
                # around and be credited to a topic at the end of the list.
                if 0 <= tid < n_topics:
                    probs[row, tid] = prob
                    marginals[tid] += prob

        # One matrix product gives every pairwise sum of products, replacing
        # a Python-level O(n_topics^2) loop per document.
        cooc = probs.T @ probs
        marginals /= max(n_docs, 1)

        lines = ["**Top 5 Unexpected Topic Co-occurrences:**\n"]
        pairs = []
        for i in range(n_topics):
            for j in range(i + 1, n_topics):
                expected = marginals[i] * marginals[j] * n_docs
                if expected > 0:
                    pairs.append((cooc[i, j] / expected, labels[i], labels[j]))
        # Highest lift first; ties fall back to tuple ordering on the labels.
        pairs.sort(reverse=True)
        for rank, (lift, t1, t2) in enumerate(pairs[:5], 1):
            lines.append(
                f"{rank}. **{t1}** ↔ **{t2}** (lift = {lift:.2f}x expected)"
            )
        return '\n'.join(lines)
    except Exception as e:
        return f"Co-occurrence computation failed: {e}"
def _compute_iceberg_topics(comparison_df: pd.DataFrame) -> str:
    """Surface topics appearing ≥3x more in abstracts than in titles.

    *comparison_df* is read through its ``source`` (``'abstract'`` or
    ``'title'``), ``label`` and ``doc_count`` columns. Abstract and title
    rows are inner-joined on ``label`` and scored as
    ``abstract_count / (title_count + 1)``; the +1 avoids division by zero.
    Topics scoring at least 3.0 are listed, highest first, capped at 10.

    Returns a Markdown string, or an explanatory message when there is no
    data, no overlap, no qualifying topic, or the computation fails.
    """
    if comparison_df is None or comparison_df.empty:
        return "Run abstract vs title comparison first."
    try:
        ab = comparison_df[comparison_df['source'] == 'abstract'][
            ['label', 'doc_count']
        ].rename(columns={'doc_count': 'ab_count'})
        ti = comparison_df[comparison_df['source'] == 'title'][
            ['label', 'doc_count']
        ].rename(columns={'doc_count': 'ti_count'})
        merged = ab.merge(ti, on='label', how='inner')
        if merged.empty:
            return "No overlapping topics found between abstracts and titles."
        merged['ratio'] = merged['ab_count'] / (merged['ti_count'] + 1)
        iceberg = merged[merged['ratio'] >= 3.0].sort_values('ratio', ascending=False)
        if iceberg.empty:
            return "No iceberg topics found (ratio ≥ 3.0)."
        lines = ["**🧊 Iceberg Topics** — constructs authors develop but don't headline:\n"]
        for _, row in iceberg.head(10).iterrows():
            lines.append(
                f"- **{row['label']}**: "
                f"abstract frequency {row['ab_count']}x vs title {row['ti_count']}x "
                f"(ratio {row['ratio']:.1f}x)"
            )
        return '\n'.join(lines)
    except Exception as e:
        return f"Iceberg computation failed: {e}"
def _make_zip(output_dir: str = 'outputs') -> Optional[str]:
    """Bundle the regular files directly inside *output_dir* into a ZIP.

    The archive is written as ``pajais_artifacts.zip`` inside a freshly
    created temporary directory; sub-directories of *output_dir* are not
    included. Returns the archive path as a string, or None when
    *output_dir* does not exist or archiving fails (the failure is logged).
    """
    try:
        source_dir = Path(output_dir)
        if not source_dir.exists():
            return None
        archive = Path(tempfile.mkdtemp()) / 'pajais_artifacts.zip'
        with zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED) as bundle:
            for entry in source_dir.iterdir():
                if not entry.is_file():
                    continue
                # Store under the bare file name so the archive is flat.
                bundle.write(entry, arcname=entry.name)
        return str(archive)
    except Exception as e:
        logger.error(f"ZIP creation failed: {e}")
        return None
def _print_ready_summary(topic_df, taxonomy_map) -> str:
    """Render the analysis results as a Markdown report block.

    Headline numbers come from ``taxonomy_map['gap_analysis']``; up to five
    entries of ``taxonomy_map['publishable_novel_themes']`` are listed,
    each tagged with one to three asterisks according to its coherence.
    Returns a prompt to finish the analysis when either input is missing,
    or a failure message if rendering raises.
    """
    if topic_df is None or not taxonomy_map:
        return "Complete the analysis first."
    try:
        gap_stats = taxonomy_map.get('gap_analysis', {})
        coverage_pct = gap_stats.get('coverage_pct', 0)
        n_novel = gap_stats.get('novel_count', 0)
        n_mapped = gap_stats.get('mapped_count', 0)
        themes = taxonomy_map.get('publishable_novel_themes', [])

        report = [
            "## PAJAIS Research Intelligence Report",
            "---",
            f"**Corpus Size:** {len(topic_df)} topics extracted",
            f"**PAJAIS Coverage:** {coverage_pct:.1f}% of 20 canonical themes",
            f"**Mapped Topics:** {n_mapped}",
            f"**Novel Topics:** {n_novel}",
            "",
            "### Publishable Research Gaps",
        ]
        for theme in themes[:5]:
            score = theme.get('coherence', 0)
            # Star rating by coherence band.
            if score > 0.5:
                stars = '***'
            elif score > 0.4:
                stars = '**'
            else:
                stars = '*'
            report.append(
                f"- {stars} **{theme['label']}** "
                f"(n={theme['doc_count']}, coherence={score:.2f})"
            )
        report.extend([
            "",
            "*Significance: * coherence > 0.3 | ** > 0.4 | *** > 0.5*",
            "",
            "---",
            "*Generated by PAJAIS Research Intelligence Agent*",
        ])
        return '\n'.join(report)
    except Exception as e:
        return f"Summary generation failed: {e}"
# ---------------------------------------------------------------------------
# Gradio Application
# ---------------------------------------------------------------------------
with gr.Blocks(
theme=gr.themes.Default(
primary_hue="blue",
secondary_hue="slate",
neutral_hue="slate",
font=gr.themes.GoogleFont("Inter"),
),
css=CUSTOM_CSS,
title="PAJAIS Research Intelligence Agent"
) as demo:
# ------------------------------------------------------------------
# State
# ------------------------------------------------------------------
state_df = gr.State(value=None)
state_agent_result = gr.State(value=None)
state_topic_df = gr.State(value=None)
state_comparison_df = gr.State(value=None)
state_taxonomy_map = gr.State(value=None)
state_lda_result = gr.State(value=None)
# New state for DBSCAN + Council (Tab A & B)
state_cluster_df = gr.State(value=None) # doc-level DBSCAN result
state_cluster_summary = gr.State(value=None) # cluster-level summary
state_council_result = gr.State(value=None) # council dict
# ------------------------------------------------------------------
# Header
# ------------------------------------------------------------------
gr.Markdown(
"""
# πŸ“Š PAJAIS Research Intelligence Agent
### Academic Topic Modeling & Gap Analysis for Information Systems Research
*Pacific Asia Journal of the Association for Information Systems (PAJAIS)*
---
"""
)
# ------------------------------------------------------------------
# Error display (persistent)
# ------------------------------------------------------------------
error_display = gr.Markdown(
value="",
elem_id="global_error_display",
visible=False
)
# ==================================================================
# TAB 1 — Upload and Validate
# ==================================================================
with gr.Tab("πŸ“ Upload & Validate"):
gr.Markdown("## Step 1: Upload Your Journal CSV")
gr.Markdown(
"Upload a CSV file containing PAJAIS publications. "
"The system detects title, abstract, year, authors, and DOI columns automatically."
)
with gr.Row():
with gr.Column(scale=1):
file_input = gr.File(
label="Upload Journal CSV",
file_types=['.csv'],
elem_id="csv_upload"
)
with gr.Row():
btn_full_run = gr.Button(
"πŸš€ Run Complete Analysis",
variant="primary",
elem_id="btn_full_run"
)
btn_init_only = gr.Button(
"πŸ” Initialize Only",
variant="secondary",
elem_id="btn_init_only"
)
with gr.Column(scale=2):
validation_info = gr.Markdown(
value="*Upload a CSV to see dataset statistics.*",
elem_id="validation_info"
)
preview_df = gr.DataFrame(
label="Data Preview (first 10 rows)",
show_label=True,
elem_id="preview_dataframe",
wrap=True
)
progress_bar_tab1 = gr.Progress(track_tqdm=True)
# ---- Handlers ----
def handle_init_only(file):
    """Validate and preview the uploaded CSV without running analysis.

    Returns a 4-tuple matching the click handler's outputs:
    ``(validation markdown, preview DataFrame, full DataFrame for state,
    error_display update)``. On any failure the preview is an empty
    DataFrame, the state value is None and the error display is made
    visible with the exception message.
    """
    if file is None:
        return (
            "❌ No file uploaded.",
            pd.DataFrame(),
            None,
            gr.update(visible=True, value="<div class='error-box'>Please upload a CSV file first.</div>"),
        )
    try:
        df = load_journal_csv(file.name)
        val = validate_dataframe(df)
        row_count = val.get('row_count', 0)
        yr = val.get('year_range')
        yr_str = f"{yr[0]}–{yr[1]}" if yr else "Unknown"
        has_ab = "✅" if val.get('has_abstracts') else "⚠️"
        has_ti = "✅" if val.get('has_titles') else "⚠️"
        miss = val.get('missing_abstract_pct', 0)
        warns = val.get('warnings', [])
        # str() each column name: join() raises on non-string labels.
        column_list = ', '.join(str(c) for c in df.columns)
        info_md = (
            f"<div class='info-panel'>"
            f"<b>📄 Rows:</b> {row_count} &nbsp;&nbsp; "
            f"<b>📅 Year Range:</b> {yr_str}<br>"
            f"<b>Abstracts:</b> {has_ab} &nbsp;&nbsp; "
            f"<b>Titles:</b> {has_ti} &nbsp;&nbsp; "
            f"<b>Missing Abstracts:</b> {miss:.1f}%<br>"
            f"<b>Columns Detected:</b> {column_list}"
            f"</div>"
        )
        if warns:
            info_md += "\n\n⚠️ **Warnings:**\n" + "\n".join(f"- {w}" for w in warns)
        preview = df.head(10)
        return (
            info_md,
            preview,
            df,
            gr.update(visible=False),
        )
    except Exception as e:
        # UI boundary: report every failure in the error box. Anything
        # other than the expected file/validation errors is also logged
        # with a traceback so it is not silently hidden.
        if not isinstance(e, (FileNotFoundError, ValueError)):
            logger.exception("Unexpected error while validating upload")
        return (
            f"Error: {e}",
            pd.DataFrame(),
            None,
            gr.update(visible=True, value=f"<div class='error-box'>❌ {e}</div>"),
        )
btn_init_only.click(
fn=handle_init_only,
inputs=[file_input],
outputs=[validation_info, preview_df, state_df, error_display]
)
def handle_full_run(file, progress=gr.Progress(track_tqdm=True)):
    """Run the complete six-phase pipeline and persist all outputs.

    Every branch returns a 19-tuple, in this order:
      1-2   status markdown, preview DataFrame
      3-8   state values: df, agent result, topic_df, comparison_df,
            taxonomy_map, lda_result
      9     error_display update
      10-16 seven DownloadButton updates (topic, mapping, comparison,
            taxonomy, narrative, then topic and mapping again for the
            Export Center)
      17-19 cluster_df, cluster_summary, council_result state values
    NOTE(review): the ``.click(outputs=[...])`` wiring for this handler is
    not in this part of the file — confirm the order against it.

    A bare ``gr.update()`` means "leave this output unchanged".
    """
    def _download_update(path):
        # Point a DownloadButton at a saved file, or leave it untouched
        # when nothing was written.
        return gr.update(value=path) if path else gr.update()

    if file is None:
        return (
            "❌ No file uploaded.",
            pd.DataFrame(),
            None, None, None, None, None, None,
            gr.update(visible=True, value="<div class='error-box'>Please upload a CSV file first.</div>"),
            # DownloadButton updates: no-ops when nothing was saved
            gr.update(), gr.update(), gr.update(),
            gr.update(), gr.update(), gr.update(), gr.update(),
            # cluster/council states unchanged
            gr.update(), gr.update(), gr.update(),
        )
    try:
        _ensure_output_dir()
        progress(0, desc="Starting pipeline...")
        agent = _make_agent()

        def on_progress(phase, msg, pct):
            # The agent reports 0-100; gr.Progress expects 0-1.
            progress(pct / 100, desc=f"[Phase {phase}] {msg}")

        result = agent.run_full_pipeline(file.name, on_progress=on_progress)
        progress(0.95, desc="Saving outputs...")

        # ---- Persist all artefacts ----
        topic_df = result.get('topic_df')
        comparison_df = result.get('comparison_df')
        taxonomy_map = result.get('taxonomy_map')
        narrative = result.get('narrative', '')
        lda_res = getattr(agent, 'lda_result', None)

        # Remember the real saved paths so the DownloadButtons can be
        # pointed at files that exist.
        topic_path = None
        mapping_path = None
        comparison_path = None
        taxonomy_path = None
        narrative_path = None
        if topic_df is not None and not topic_df.empty:
            topic_path = _safe_save_csv(topic_df, 'topic_review_table.csv')
            # The mapping export is the same table; the old branch on the
            # 'status' column did the same thing on both sides.
            mapping_path = _safe_save_csv(topic_df, 'pajais_mapping.csv')
        if comparison_df is not None and not comparison_df.empty:
            comparison_path = _safe_save_csv(comparison_df, 'comparison.csv')
        if taxonomy_map:
            taxonomy_path = _safe_save_json(taxonomy_map, 'taxonomy_map.json')
        if narrative:
            narrative_path = _safe_save_text(narrative, 'narrative.txt')

        # Pull DBSCAN cluster results from agent if available
        cluster_df = getattr(agent, 'cluster_df', None)
        cluster_summary = get_cluster_summary(cluster_df) if cluster_df is not None else None
        # Pull council result from agent if available
        council_result = getattr(agent, 'council_result', None)

        # Best-effort export via the tools helper; it may duplicate the
        # saves above, and a failure here must not fail the run.
        try:
            export_all_artifacts(
                topic_df=topic_df,
                comparison_df=comparison_df,
                taxonomy_map=taxonomy_map,
                narrative=narrative,
                output_dir='outputs'
            )
        except Exception as exp_e:
            logger.warning(f"export_all_artifacts failed (non-fatal): {exp_e}")
        progress(1.0, desc="Complete!")

        val = result.get('validation') or {}
        row_count = val.get('row_count', len(agent.df) if agent.df is not None else 0)
        yr = val.get('year_range')
        yr_str = f"{yr[0]}–{yr[1]}" if yr else "Unknown"
        coverage = result.get('pajais_coverage_pct', 0)
        topic_count = result.get('topic_count', 0)
        novel = result.get('novel_count', 0)
        saved_names = ', '.join(f.name for f in OUTPUTS_DIR.iterdir() if f.is_file())
        info_md = (
            f"<div class='success-box'>"
            f"✅ <b>Pipeline Complete!</b><br>"
            f"📄 <b>Rows:</b> {row_count} | "
            f"📅 <b>Years:</b> {yr_str} | "
            f"🔬 <b>Topics:</b> {topic_count} | "
            f"🆕 <b>Novel:</b> {novel} | "
            f"📊 <b>Coverage:</b> {coverage:.1f}%<br>"
            f"💾 <b>Saved:</b> {saved_names}"
            f"</div>"
        )
        errors = result.get('errors', [])
        if errors:
            info_md += "\n\n⚠️ **Errors:**\n" + "\n".join(f"- {e}" for e in errors)
        preview = agent.df.head(10) if agent.df is not None else pd.DataFrame()
        return (
            info_md,
            preview,
            agent.df,
            result,
            topic_df,
            comparison_df,
            taxonomy_map,
            lda_res,
            gr.update(visible=False),
            _download_update(topic_path),
            _download_update(mapping_path),
            _download_update(comparison_path),
            _download_update(taxonomy_path),
            _download_update(narrative_path),
            _download_update(topic_path),    # Export Center topic dl
            _download_update(mapping_path),  # Export Center mapping dl
            # cluster/council state updates
            cluster_df,
            cluster_summary,
            council_result,
        )
    except Exception as e:
        logger.error(f"Full pipeline error: {e}", exc_info=True)
        return (
            f"❌ Pipeline failed: {e}",
            pd.DataFrame(),
            None, None, None, None, None, None,
            gr.update(visible=True, value=f"<div class='error-box'>❌ {e}</div>"),
            gr.update(), gr.update(), gr.update(),
            gr.update(), gr.update(), gr.update(), gr.update(),
            # cluster/council state unchanged on error: use no-op updates
            # (as the no-file branch does) rather than None, which would
            # wipe results kept from an earlier successful run.
            gr.update(), gr.update(), gr.update(),
        )
# ==================================================================
# TAB 2 — Topic Review Table
# ==================================================================
with gr.Tab("πŸ”¬ Topic Review Table"):
gr.Markdown("## Phase 2: Extracted Topics")
btn_run_topics = gr.Button(
"β–Ά Run Topic Modeling",
variant="primary",
elem_id="btn_run_topics"
)
topic_status = gr.Markdown(
value="*Run topic modeling or use the full pipeline from Tab 1.*",
elem_id="topic_status"
)
topic_table = gr.DataFrame(
label="Topic Review Table (β‰₯98 rows guaranteed)",
show_label=True,
elem_id="topic_review_table",
wrap=True
)
# BUG 5 FIX: value=None instead of hardcoded path that doesn't exist yet
topic_download = gr.DownloadButton(
label="⬇ Download topic_review_table.csv",
value=None,
elem_id="topic_dl"
)
with gr.Accordion("πŸ”— Unexpected Topic Co-occurrences", open=False,
elem_id="cooccurrence_accordion"):
btn_cooccurrence = gr.Button(
"Explore Co-occurrences",
variant="secondary",
elem_id="btn_cooc"
)
cooccurrence_display = gr.Markdown(
value="*Click the button above to compute topic co-occurrences.*",
elem_id="cooc_display"
)
def handle_run_topics(file, existing_topic_df, progress=gr.Progress(track_tqdm=True)):
    """Produce the topic review table, reusing a previous result if present.

    If *existing_topic_df* already holds topics it is re-saved and returned
    without re-running the model. Otherwise agent phases 1-3 are run on the
    uploaded file and ``agent.topic_df`` is used.

    Returns a 4-tuple matching the click handler's outputs:
    ``(status markdown, table DataFrame, topic_df state value,
    DownloadButton update)``.
    """
    if existing_topic_df is not None and not existing_topic_df.empty:
        n = len(existing_topic_df)
        saved_path = _safe_save_csv(existing_topic_df, 'topic_review_table.csv')
        return (
            f"<div class='success-box'>✅ {n} topics loaded from previous run.</div>",
            existing_topic_df,
            existing_topic_df,
            gr.update(value=saved_path),
        )
    if file is None:
        return (
            "<div class='error-box'>❌ Upload a CSV file first.</div>",
            pd.DataFrame(),
            None,
            gr.update(),
        )
    try:
        _ensure_output_dir()
        progress(0.1, desc="Loading data...")
        agent = _make_agent()
        agent.run_phase(1, file_path=file.name)
        progress(0.3, desc="Modeling topics...")
        agent.run_phase(2)
        progress(0.9, desc="Building table...")
        agent.run_phase(3)
        progress(1.0, desc="Done!")
        tdf = agent.topic_df
        if tdf is None:
            # Report the missing table clearly instead of letting len(None)
            # below surface as a cryptic TypeError message.
            return (
                "<div class='error-box'>❌ Topic modeling produced no topic table.</div>",
                pd.DataFrame(),
                None,
                gr.update(),
            )
        saved_path = None
        if not tdf.empty:
            saved_path = _safe_save_csv(tdf, 'topic_review_table.csv')
        return (
            f"<div class='success-box'>✅ {len(tdf)} topics extracted.</div>",
            tdf,
            tdf,
            gr.update(value=saved_path) if saved_path else gr.update(),
        )
    except Exception as e:
        # UI boundary: show the message to the user and keep the traceback
        # in the logs.
        logger.exception("Topic modeling failed")
        return (
            f"<div class='error-box'>❌ {e}</div>",
            pd.DataFrame(),
            None,
            gr.update(),
        )
btn_run_topics.click(
fn=handle_run_topics,
inputs=[file_input, state_topic_df],
outputs=[topic_status, topic_table, state_topic_df, topic_download]
)
state_topic_df.change(
fn=lambda df: (
f"<div class='success-box'>βœ… {len(df)} topics available.</div>"
if df is not None and not df.empty else "",
df if df is not None else pd.DataFrame()
),
inputs=[state_topic_df],
outputs=[topic_status, topic_table]
)
def handle_cooccurrence(topic_df, lda_result):
    """Return the co-occurrence report, or a hint when inputs are missing.

    Both the topic table and the LDA result must be present (not ``None``)
    before the computation is delegated to ``_compute_cooccurrences``.
    """
    inputs_ready = not (topic_df is None or lda_result is None)
    if inputs_ready:
        return _compute_cooccurrences(topic_df, lda_result)
    return "Run topic modeling first."
btn_cooccurrence.click(
fn=handle_cooccurrence,
inputs=[state_topic_df, state_lda_result],
outputs=[cooccurrence_display]
)
# ==================================================================
# TAB 3 — PAJAIS Taxonomy Mapping
# ==================================================================
with gr.Tab("πŸ—Ί PAJAIS Taxonomy Mapping"):
gr.Markdown("## Phase 5: Research Gap Analysis")
btn_run_mapping = gr.Button(
"β–Ά Run PAJAIS Mapping",
variant="primary",
elem_id="btn_run_mapping"
)
mapping_status = gr.Markdown(
value="*Run mapping or use the full pipeline from Tab 1.*",
elem_id="mapping_status"
)
with gr.Row():
with gr.Column():
gr.Markdown("### πŸ”΅ MAPPED Themes")
mapped_table = gr.DataFrame(
label="Mapped Topics",
show_label=True,
elem_id="mapped_table",
wrap=True
)
with gr.Column():
gr.Markdown("### πŸ”΄ NOVEL Themes")
novel_table = gr.DataFrame(
label="Novel Topics",
show_label=True,
elem_id="novel_table",
wrap=True
)
gap_score = gr.Markdown(elem_id="gap_score")
# BUG 5 FIX: value=None
mapping_download = gr.DownloadButton(
label="⬇ Download pajais_mapping.csv",
value=None,
elem_id="mapping_dl"
)
gr.Markdown("### πŸ’‘ Generate Publication Pitch")
gr.Markdown(
"Select a NOVEL theme label and click below to generate "
"a structured abstract pitch."
)
novel_label_input = gr.Textbox(
label="NOVEL Theme Label",
placeholder="Paste a novel theme label here...",
show_label=True,
elem_id="novel_label_input"
)
btn_gen_pitch = gr.Button(
"Generate Publication Pitch",
variant="secondary",
elem_id="btn_gen_pitch"
)
pitch_output = gr.Markdown(elem_id="pitch_output")
def _mapping_outputs(topic_df, taxonomy_map, coverage):
    """Build the five values shown on the taxonomy-mapping tab.

    Returns exactly 5 values, in wiring order:
    (status_md, mapped_df, novel_df, gap_md, taxonomy_map).

    ``topic_df`` is split on its ``status`` column into MAPPED and NOVEL
    rows; when that column is absent both tables are empty. ``coverage`` is
    rendered as a percentage with one decimal place.
    """
    theme_total = len(PAJAIS_THEMES)

    has_rows = topic_df is not None and not topic_df.empty
    if not has_rows:
        return (
            "<div class='error-box'>No data.</div>",
            pd.DataFrame(), pd.DataFrame(),
            f"**Research Gap Score:** 0 of {theme_total} themes covered.",
            taxonomy_map
        )

    if 'status' in topic_df.columns:
        mapped_rows = topic_df[topic_df['status'] == 'MAPPED']
        novel_rows = topic_df[topic_df['status'] == 'NOVEL']
    else:
        mapped_rows, novel_rows = pd.DataFrame(), pd.DataFrame()

    gap_info = taxonomy_map.get('gap_analysis', {}) if taxonomy_map else {}
    n_covered = len(gap_info.get('covered_themes', []))

    gap_md = (
        f"**Research Gap Score: {n_covered} of {theme_total} PAJAIS themes covered** "
        f"({coverage:.1f}%)"
    )
    return (
        "<div class='success-box'>βœ… Mapping complete.</div>",
        mapped_rows,
        novel_rows,
        gap_md,
        taxonomy_map,
    )
def handle_mapping(topic_df, existing_map, progress=gr.Progress(track_tqdm=True)):
    """Map topics onto the PAJAIS taxonomy and return the tab's five outputs.

    A taxonomy map already held in state short-circuits the work and is simply
    re-rendered. Otherwise the topic table is mapped, the taxonomy map is
    generated, both artifacts are saved, and the result is rendered through
    ``_mapping_outputs``. Every return path yields exactly 5 values:
    (status, mapped table, novel table, gap score, taxonomy-map state).
    """
    def _blank(message_html):
        # Empty tables and gap score; the state keeps whatever it held before.
        return message_html, pd.DataFrame(), pd.DataFrame(), "", existing_map

    if existing_map is not None:
        cached_pct = existing_map.get('gap_analysis', {}).get('coverage_pct', 0)
        return _mapping_outputs(topic_df, existing_map, cached_pct)

    if topic_df is None or topic_df.empty:
        return _blank("<div class='error-box'>❌ Run topic modeling first.</div>")

    try:
        from tools import map_topics_to_pajais, generate_taxonomy_map
        _ensure_output_dir()
        progress(0.4, desc="Mapping topics...")
        with_status = map_topics_to_pajais(topic_df)
        progress(0.8, desc="Building taxonomy map...")
        built_map = generate_taxonomy_map(with_status)
        progress(1.0, desc="Done!")
        # Persist both artifacts before rendering.
        _safe_save_csv(with_status, 'pajais_mapping.csv')
        _safe_save_json(built_map, 'taxonomy_map.json')
        fresh_pct = built_map.get('gap_analysis', {}).get('coverage_pct', 0)
        # _mapping_outputs already carries the taxonomy map as its 5th value.
        return _mapping_outputs(with_status, built_map, fresh_pct)
    except Exception as e:
        return _blank(f"<div class='error-box'>❌ {e}</div>")
btn_run_mapping.click(
fn=handle_mapping,
inputs=[state_topic_df, state_taxonomy_map],
outputs=[mapping_status, mapped_table, novel_table, gap_score, state_taxonomy_map]
)
state_taxonomy_map.change(
fn=lambda tm, td: _mapping_outputs(
td, tm,
tm.get('gap_analysis', {}).get('coverage_pct', 0) if tm else 0
),
inputs=[state_taxonomy_map, state_topic_df],
outputs=[mapping_status, mapped_table, novel_table, gap_score, state_taxonomy_map]
)
btn_gen_pitch.click(
fn=lambda label: _generate_publication_pitch(label) if label.strip() else "Enter a theme label above.",
inputs=[novel_label_input],
outputs=[pitch_output]
)
# ==================================================================
# TAB 4 — Abstract vs Title Analysis
# ==================================================================
with gr.Tab("πŸ“‘ Abstract vs Title Analysis"):
gr.Markdown("## Phase 4: Abstract vs Title Theme Comparison")
btn_run_comparison = gr.Button(
"β–Ά Compare Abstracts vs Titles",
variant="primary",
elem_id="btn_run_comparison"
)
comparison_status = gr.Markdown(elem_id="comparison_status")
with gr.Row():
with gr.Column():
gr.Markdown("### πŸ“„ Abstract-Derived Themes")
abstract_table = gr.DataFrame(
label="Abstract Topics",
show_label=True,
elem_id="abstract_table",
wrap=True
)
with gr.Column():
gr.Markdown("### 🏷 Title-Derived Themes")
title_table = gr.DataFrame(
label="Title Topics",
show_label=True,
elem_id="title_table",
wrap=True
)
divergence_md = gr.Markdown(elem_id="divergence_md")
# BUG 5 FIX: value=None
comparison_download = gr.DownloadButton(
label="⬇ Download comparison.csv",
value=None,
elem_id="comparison_dl"
)
btn_iceberg = gr.Button(
"🧊 Show Iceberg Topics",
variant="secondary",
elem_id="btn_iceberg"
)
iceberg_display = gr.Markdown(elem_id="iceberg_display")
def _split_comparison(comp_df):
if comp_df is None or comp_df.empty:
return "<div class='error-box'>No data.</div>", pd.DataFrame(), pd.DataFrame(), ""
ab = comp_df[comp_df['source'] == 'abstract']
ti = comp_df[comp_df['source'] == 'title']
ab_excl = ab[ab['unique_to_source'] == True]['label'].tolist()
ti_excl = ti[ti['unique_to_source'] == True]['label'].tolist()
divergence = ""
if ab_excl:
divergence += f"**Abstract-exclusive topics:** {', '.join(ab_excl[:5])}\n\n"
if ti_excl:
divergence += f"**Title-exclusive topics:** {', '.join(ti_excl[:5])}"
return (
"<div class='success-box'>βœ… Comparison complete.</div>",
ab, ti, divergence
)
def handle_comparison(df, existing_comp, progress=gr.Progress(track_tqdm=True)):
    """Compare abstract-derived and title-derived themes for the loaded data.

    Returns 5 values: the four produced by ``_split_comparison`` followed by
    the comparison DataFrame to store in state (``None`` on failure). A
    non-empty comparison already in state is re-rendered without recomputing.
    """
    def _with_state(frame):
        # Append the frame itself as the fifth (state) output.
        return _split_comparison(frame) + (frame,)

    def _error(message_html):
        return message_html, pd.DataFrame(), pd.DataFrame(), "", None

    have_cached = existing_comp is not None and not existing_comp.empty
    if have_cached:
        return _with_state(existing_comp)

    if df is None or df.empty:
        return _error("<div class='error-box'>❌ Load data first.</div>")

    try:
        from tools import compare_abstract_vs_title_themes
        _ensure_output_dir()
        progress(0.2, desc="Running LDA on abstracts...")
        comparison = compare_abstract_vs_title_themes(df, n_topics_each=15)
        progress(1.0, desc="Done!")
        _safe_save_csv(comparison, 'comparison.csv')
        return _with_state(comparison)
    except Exception as e:
        return _error(f"<div class='error-box'>❌ {e}</div>")
btn_run_comparison.click(
fn=handle_comparison,
inputs=[state_df, state_comparison_df],
outputs=[comparison_status, abstract_table, title_table, divergence_md, state_comparison_df]
)
state_comparison_df.change(
fn=lambda cd: _split_comparison(cd) + (cd,) if cd is not None else (
"", pd.DataFrame(), pd.DataFrame(), "", None
),
inputs=[state_comparison_df],
outputs=[comparison_status, abstract_table, title_table, divergence_md, state_comparison_df]
)
btn_iceberg.click(
fn=lambda cd: _compute_iceberg_topics(cd),
inputs=[state_comparison_df],
outputs=[iceberg_display]
)
# ==================================================================
# TAB 5 — Section 7 Narrative
# ==================================================================
with gr.Tab("✍ Section 7 Narrative"):
gr.Markdown("## Phase 6: Generate Academic Narrative Draft")
btn_run_narrative = gr.Button(
"β–Ά Generate Narrative",
variant="primary",
elem_id="btn_run_narrative"
)
narrative_box = gr.Textbox(
label="Section 7 Narrative Draft (~500 words)",
lines=25,
show_label=True,
elem_id="narrative_textbox",
interactive=False
)
# BUG 5 FIX: value=None
narrative_download = gr.DownloadButton(
label="⬇ Download narrative.txt",
value=None,
elem_id="narrative_dl"
)
btn_copy = gr.Button(
"πŸ“‹ Copy to Clipboard",
variant="secondary",
elem_id="btn_copy_narrative"
)
copy_status = gr.Markdown(elem_id="copy_status")
gr.Markdown("### πŸ“š Generate Sample APA Citation")
btn_citation = gr.Button(
"Generate Sample Citation",
variant="secondary",
elem_id="btn_citation"
)
citation_output = gr.Markdown(elem_id="citation_output")
def handle_narrative(taxonomy_map, comparison_df, topic_df, progress=gr.Progress(track_tqdm=True)):
    """Generate the Section 7 narrative and expose it for download.

    Args:
        taxonomy_map: Taxonomy map from state; a falsy value is replaced by
            ``{}`` before generation.
        comparison_df: Comparison table from state; ``None`` becomes an empty
            DataFrame.
        topic_df: Topic table from state; ``None`` becomes an empty DataFrame.
        progress: Gradio progress tracker.

    Returns:
        A 2-tuple (narrative text, DownloadButton update). When there is
        neither a taxonomy map nor any topic rows, or when generation fails,
        a message is returned as the text and the button is left unchanged.
    """
    if not taxonomy_map and (topic_df is None or topic_df.empty):
        return "<No analysis results yet. Run the full pipeline first.>", gr.update()
    try:
        from tools import generate_section7_narrative
        _ensure_output_dir()
        progress(0.5, desc="Generating narrative...")
        narrative = generate_section7_narrative(
            taxonomy_map=taxonomy_map or {},
            comparison_df=comparison_df if comparison_df is not None else pd.DataFrame(),
            topic_df=topic_df if topic_df is not None else pd.DataFrame(),
        )
        progress(1.0, desc="Done!")
        saved_path = _safe_save_text(narrative, 'narrative.txt')
        # Same guard as the topic tab: only point the button at a file when
        # the save actually produced a path.
        download = gr.update(value=saved_path) if saved_path else gr.update()
        return narrative, download
    except Exception as e:
        # UI boundary: show the message, keep the traceback in the log.
        logger.exception("Narrative generation failed")
        return f"Narrative generation failed: {e}", gr.update()
btn_run_narrative.click(
fn=handle_narrative,
inputs=[state_taxonomy_map, state_comparison_df, state_topic_df],
outputs=[narrative_box, narrative_download]
)
state_agent_result.change(
fn=lambda r: (r.get('narrative', '') if r else '', gr.update()),
inputs=[state_agent_result],
outputs=[narrative_box, narrative_download]
)
btn_copy.click(
fn=lambda text: "βœ… Copied! (use Ctrl+C if clipboard API unavailable)",
inputs=[narrative_box],
outputs=[copy_status],
js="""(text) => {
navigator.clipboard.writeText(text).then(
() => console.log('Copied'),
() => console.warn('Clipboard API unavailable')
);
return 'βœ… Copied to clipboard!';
}"""
)
btn_citation.click(
fn=lambda td: _generate_apa_citation(td),
inputs=[state_topic_df],
outputs=[citation_output]
)
# ==================================================================
# TAB 6 — Research Intelligence Dashboard
# ==================================================================
with gr.Tab("πŸ“Š Research Intelligence Dashboard"):
gr.Markdown("## Research Intelligence Dashboard")
gr.Markdown(
"*Dashboard populates automatically after pipeline completion.*"
)
with gr.Row():
card_topics = gr.Markdown("**--**\nTotal Topics", elem_id="card_topics")
card_novel = gr.Markdown("**--**\nNovel Themes", elem_id="card_novel")
card_coverage = gr.Markdown("**--**\nPAJAIS Coverage", elem_id="card_coverage")
card_publishable = gr.Markdown("**--**\nPublishable Gaps", elem_id="card_publishable")
with gr.Row():
plot_dist = gr.Plot(label="Topic Distribution", elem_id="plot_dist")
plot_pie = gr.Plot(label="Mapped vs Novel", elem_id="plot_pie")
plot_top15 = gr.Plot(
label="Top 15 Topics by Document Count",
elem_id="plot_top15"
)
supplementary_panel = gr.Markdown(elem_id="supplementary_panel")
def update_dashboard(result, topic_df, taxonomy_map):
    """Render the dashboard cards, plots and supplementary panel.

    Args:
        result: Pipeline result dict, or ``None`` before any run.
        topic_df: Topic table passed to the plotting helpers.
        taxonomy_map: Taxonomy map dict (may be ``None``).

    Returns:
        An 8-tuple in wiring order: four metric-card strings, three figures
        and the supplementary markdown. With ``result is None`` the
        placeholder cards are returned; on any failure the cards read
        "Error" and the message is shown in the supplementary panel.
    """
    if result is None:
        return (
            "**--**\nTotal Topics", "**--**\nNovel Themes",
            "**--**\nPAJAIS Coverage", "**--**\nPublishable Gaps",
            None, None, None, ""
        )
    try:
        n_topics = result.get('topic_count', 0)
        n_novel = result.get('novel_count', 0)
        # `or` fallbacks cover keys that are present but hold None, which
        # .get(key, default) alone does not.
        coverage = result.get('pajais_coverage_pct') or 0.0
        publishable = (taxonomy_map.get('publishable_novel_themes') or []) if taxonomy_map else []
        pub_count = len(publishable)
        c1 = f"<div class='metric-card'><span class='metric-value'>{n_topics}</span><span class='metric-label'>Total Topics</span></div>"
        c2 = f"<div class='metric-card'><span class='metric-value'>{n_novel}</span><span class='metric-label'>Novel Themes</span></div>"
        c3 = f"<div class='metric-card'><span class='metric-value'>{coverage:.0f}%</span><span class='metric-label'>PAJAIS Coverage</span></div>"
        c4 = f"<div class='metric-card'><span class='metric-value'>{pub_count}</span><span class='metric-label'>Publishable Gaps</span></div>"
        fig_dist = _plot_topic_distribution(topic_df)
        fig_pie = _plot_mapped_novel_pie(taxonomy_map)
        # NOTE(review): the "Top 15" plot is produced by the same helper call
        # as the distribution plot -- confirm whether a dedicated helper was
        # intended.
        fig_top15 = _plot_topic_distribution(topic_df)
        supp = result.get('supplementary_insights') or {}
        blind = supp.get('blind_spot_theme') or {}
        golden = supp.get('golden_year') or {}
        supp_md = ""
        if blind:
            supp_md += (
                f"\n### 🎯 High-Frequency Unaddressed Theme\n"
                f"**{blind.get('label', 'Unknown')}** β€” "
                f"appears in **{blind.get('doc_count', 0)} documents** "
                f"but has not been formally addressed in PAJAIS.\n\n"
                f"*First-mover publication advantage is estimated at 18–24 months.*\n\n"
                f"**Top words:** {blind.get('top_words', '')}\n"
            )
        if golden:
            entropy = golden.get('entropy') or 0
            supp_md += (
                f"\n### πŸ“… Peak Research Diversity Year\n"
                f"**{golden.get('year', 'N/A')}** showed the greatest topic diversity "
                f"(Shannon entropy = {entropy:.3f})\n"
            )
        return c1, c2, c3, c4, fig_dist, fig_pie, fig_top15, supp_md
    except Exception as e:
        # Keep the UI alive; logger.exception records the traceback too.
        logger.exception("Dashboard update failed")
        return (
            "Error", "Error", "Error", "Error",
            None, None, None, f"Dashboard error: {e}"
        )
state_agent_result.change(
fn=update_dashboard,
inputs=[state_agent_result, state_topic_df, state_taxonomy_map],
outputs=[
card_topics, card_novel, card_coverage, card_publishable,
plot_dist, plot_pie, plot_top15, supplementary_panel
]
)
# ==================================================================
# TAB A — SPECTER2 Clusters (Phase 2.5: SPECTER2 → UMAP → HDBSCAN)
# ==================================================================
with gr.Tab("πŸ”΅ SPECTER2 Clusters"):
gr.Markdown("## Phase 2.5: Semantic Clustering via SPECTER2 β†’ UMAP β†’ HDBSCAN")
gr.Markdown(
"Each paper is represented by **one 768-dim SPECTER2 vector** computed from its "
"combined Title + Abstract column (DOI-keyed). "
"UMAP reduces dimensions (cosine metric, 50D), then HDBSCAN clusters with an "
"automatic parameter sweep to land in the **15–30 cluster** target range. "
"Clusters with fewer than 5 or more than 100 papers are automatically merged/split. "
"Intra-cluster cosine similarity is kept in the **0.50–0.60** band. "
"The 3 most representative paper titles per cluster are sent to "
"**Mistral + Gemini + HuggingFace** (all free) for labeling β€” majority vote wins."
)
with gr.Accordion("βš™οΈ Clustering Parameters", open=False):
with gr.Row():
min_cs_slider = gr.Slider(
2, 20, value=5, step=1,
label="Min Cluster Size (papers)",
info="Papers < this β†’ merged into nearest cluster"
)
max_cs_slider = gr.Slider(
20, 200, value=100, step=5,
label="Max Cluster Size (papers)",
info="Papers > this β†’ cluster is split"
)
with gr.Row():
target_min_slider = gr.Slider(
5, 20, value=15, step=1,
label="Target Min Clusters",
info="HDBSCAN sweep lower bound"
)
target_max_slider = gr.Slider(
15, 40, value=30, step=1,
label="Target Max Clusters",
info="HDBSCAN sweep upper bound"
)
with gr.Row():
sim_low_slider = gr.Slider(
0.30, 0.70, value=0.50, step=0.01,
label="Min Cosine Similarity (cluster quality)",
info="Clusters below this are dissolved to noise"
)
umap_neighbors_slider = gr.Slider(
5, 50, value=15, step=1,
label="UMAP n_neighbors",
info="Controls local vs global structure"
)
with gr.Row():
btn_run_dbscan = gr.Button("β–Ά Run SPECTER2 β†’ UMAP β†’ HDBSCAN", variant="primary")
btn_llm_label = gr.Button("πŸ€– Label Clusters (3 LLMs)", variant="secondary")
dbscan_status = gr.Markdown("*Run DBSCAN or use the full pipeline from Tab 1.*")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### πŸ“Š Cluster Summary")
cluster_summary_table = gr.DataFrame(
label="Clusters (sorted by size)",
show_label=True,
wrap=True
)
with gr.Column(scale=2):
gr.Markdown("### πŸ“„ Document-Level Assignments")
cluster_doc_table = gr.DataFrame(
label="Per-Document Cluster Assignments",
show_label=True,
wrap=True
)
with gr.Row():
plot_cluster_sizes = gr.Plot(label="Cluster Size Distribution")
plot_noise_pie = gr.Plot(label="Clustered vs Noise")
with gr.Row():
dl_cluster_docs = gr.DownloadButton("⬇ cluster_documents.csv", value=None)
dl_cluster_summary = gr.DownloadButton("⬇ cluster_summary.csv", value=None)
dl_cluster_labels = gr.DownloadButton("⬇ cluster_labels.csv", value=None)
# ---- Handlers ----
def handle_run_dbscan(
    df, existing_cluster_df, existing_summary,
    min_cs, max_cs, target_min, target_max, sim_low, umap_n,
    progress=gr.Progress(track_tqdm=True)
):
    """Embed, cluster and summarise the loaded papers for the clustering tab.

    Returns 9 values in wiring order: status HTML, summary table, document
    table, summary state, two figures, and updates for the three download
    buttons (the labels button is never changed here).

    A non-empty ``existing_cluster_df`` is re-rendered without recomputation.
    The slider values are cast and forwarded to
    ``specter2_hdbscan_cluster_topics``; the upper cosine bound is
    ``sim_low + 0.10``. ``existing_summary`` is accepted but not read.

    NOTE(review): this handler's click wiring does not write to
    ``state_cluster_df`` -- confirm the labeling handler can see the result
    of a manual run.
    """
    def _failure(message_html):
        # Empty tables, cleared summary state, no plots, buttons untouched.
        return (
            message_html,
            pd.DataFrame(), pd.DataFrame(), None,
            None, None,
            gr.update(), gr.update(), gr.update(),
        )

    def _render(assignments, overview):
        # Plot, save both CSVs, and build the 8 values after the status.
        size_fig, noise_fig = _plot_cluster_charts(assignments)
        docs_csv = _safe_save_csv(assignments, "cluster_documents.csv")
        summary_csv = _safe_save_csv(overview, "cluster_summary.csv")
        return (
            overview, assignments, overview,
            size_fig, noise_fig,
            gr.update(value=docs_csv), gr.update(value=summary_csv), gr.update(),
        )

    have_cached = existing_cluster_df is not None and not existing_cluster_df.empty
    if have_cached:
        cached_overview = get_cluster_summary(existing_cluster_df)
        return (
            "<div class='success-box'>βœ… Loaded existing results.</div>",
        ) + _render(existing_cluster_df, cached_overview)

    if df is None or df.empty:
        return _failure("<div class='error-box'>❌ Upload and load data first.</div>")

    try:
        _ensure_output_dir()
        progress(0.05, desc="Building title+abstract column…")
        papers = build_title_abstract_column(df)
        progress(0.15, desc="Generating SPECTER2 embeddings (may take 2-5 min)…")
        vectors = embed_with_specter2(
            papers['title_abstract'].tolist(),
            cache_dir='outputs/specter_cache',
        )
        progress(0.60, desc="UMAP + HDBSCAN clustering…")
        similarity_floor = float(sim_low)
        clustered = specter2_hdbscan_cluster_topics(
            df=papers,
            embeddings=vectors,
            min_cluster_size=int(min_cs),
            max_cluster_size=int(max_cs),
            target_min_clusters=int(target_min),
            target_max_clusters=int(target_max),
            cosine_sim_low=similarity_floor,
            cosine_sim_high=similarity_floor + 0.10,
            umap_n_neighbors=int(umap_n),
        )
        progress(0.85, desc="Summarising clusters…")
        overview = get_cluster_summary(clustered)
        progress(1.0, desc="Done!")
        rendered = _render(clustered, overview)
        # Label -1 marks noise and is not counted as a cluster.
        n_c = len(set(clustered['cluster_final']) - {-1})
        n_n = int(clustered['is_noise'].sum())
        return (
            f"<div class='success-box'>βœ… {n_c} clusters found, {n_n} noise docs.</div>",
        ) + rendered
    except Exception as e:
        return _failure(f"<div class='error-box'>❌ {e}</div>")
def handle_llm_label(cluster_df, cluster_summary, progress=gr.Progress(track_tqdm=True)):
    """Label clusters via ``label_clusters_3llm`` using cached embeddings.

    Args:
        cluster_df: Per-document cluster assignments from state.
        cluster_summary: Cluster summary from state; recomputed from
            ``cluster_df`` when ``None``.
        progress: Gradio progress tracker.

    Returns:
        A 3-tuple (status HTML, summary-table value, labels DownloadButton
        update). On any failure the existing summary is returned unchanged
        and the button is left as is.
    """
    if cluster_df is None or cluster_df.empty:
        return (
            "<div class='error-box'>❌ Run clustering first.</div>",
            cluster_summary, gr.update()
        )
    try:
        _ensure_output_dir()
        # Load cached embeddings if available
        import glob
        cache_files = glob.glob('outputs/specter_cache/*.npy')
        if not cache_files:
            return (
                "<div class='error-box'>❌ No SPECTER2 cache found. Run clustering tab first.</div>",
                cluster_summary, gr.update()
            )
        # Pick the most recently written cache. Sorting by file name only
        # gives the lexicographically last file, which is not necessarily
        # the newest one.
        newest_cache = max(cache_files, key=os.path.getmtime)
        embs = np.load(newest_cache)
        progress(0.2, desc="Sending clusters to LLMs…")
        labeled = label_clusters_3llm(
            cluster_df=cluster_df,
            cluster_summary_df=cluster_summary.copy() if cluster_summary is not None
            else get_cluster_summary(cluster_df),
            embeddings=embs,
            mistral_api_key=MISTRAL_API_KEY,
            gemini_api_key=GEMINI_API_KEY,
            ollama_url=OLLAMA_URL,
            max_clusters=30,
        )
        progress(1.0, desc="Done!")
        saved = _safe_save_csv(labeled, "cluster_labels.csv")
        return (
            "<div class='success-box'>βœ… Clusters labeled by 3 LLMs (majority vote).</div>",
            labeled,
            gr.update(value=saved),
        )
    except Exception as e:
        return (
            f"<div class='error-box'>❌ LLM labeling failed: {e}</div>",
            cluster_summary, gr.update()
        )
btn_run_dbscan.click(
fn=handle_run_dbscan,
inputs=[
state_df, state_cluster_df, state_cluster_summary,
min_cs_slider, max_cs_slider,
target_min_slider, target_max_slider,
sim_low_slider, umap_neighbors_slider,
],
outputs=[
dbscan_status,
cluster_summary_table, cluster_doc_table, state_cluster_summary,
plot_cluster_sizes, plot_noise_pie,
dl_cluster_docs, dl_cluster_summary, dl_cluster_labels,
]
)
btn_llm_label.click(
fn=handle_llm_label,
inputs=[state_cluster_df, state_cluster_summary],
outputs=[dbscan_status, cluster_summary_table, dl_cluster_labels]
)
# Auto-populate when pipeline result loads cluster data
state_cluster_df.change(
fn=lambda cdf: (
get_cluster_summary(cdf) if cdf is not None and not cdf.empty else pd.DataFrame(),
cdf if cdf is not None else pd.DataFrame(),
),
inputs=[state_cluster_df],
outputs=[cluster_summary_table, cluster_doc_table]
)
# ==================================================================
# TAB B — Agentic Council (Phase 6.5)
# ==================================================================
with gr.Tab("🧠 Agentic Council"):
gr.Markdown("## Phase 6.5: Dual-Model Research Council")
gr.Markdown(
"Three AI models independently assess the PAJAIS research gap findings:\n"
"- **Mistral** (Panel A) β€” pragmatic applied IS perspective\n"
"- **Gemini** (Panel B) β€” broad technology futures perspective\n"
"- **Ollama** (Panel C) β€” deep analytical synthesis\n\n"
"API keys are loaded automatically from HuggingFace Secrets "
"(`MISTRAL_API_KEY`, `GEMINI_API_KEY`). "
"Configure them in your Space under **Settings β†’ Variables and Secrets**."
)
# Key-status indicator β€” shows which secrets are present at load time
_key_lines = ["**πŸ”‘ Secret Status (loaded at startup):**"]
for _label, _val in [
("MISTRAL_API_KEY", MISTRAL_API_KEY),
("GEMINI_API_KEY", GEMINI_API_KEY),
("OLLAMA_URL", OLLAMA_URL),
]:
_icon = "βœ… present" if _val else "❌ missing"
_key_lines.append(f"- `{_label}`: {_icon}")
gr.Markdown("\n".join(_key_lines))
btn_run_council = gr.Button("πŸš€ Convene Research Council", variant="primary")
council_status = gr.Markdown("*Run taxonomy mapping first (Tab 3 or full pipeline).*")
with gr.Row():
with gr.Column():
gr.Markdown("### 🟒 Panel A β€” Mistral")
mistral_output = gr.Textbox(
label="Mistral Assessment",
lines=18,
interactive=False,
show_label=True,
)
with gr.Column():
gr.Markdown("### πŸ”΅ Panel B β€” Gemini")
gemini_output = gr.Textbox(
label="Gemini Assessment",
lines=18,
interactive=False,
show_label=True,
)
with gr.Column():
gr.Markdown("### 🟣 Panel C β€” Ollama")
ollama_output = gr.Textbox(
label="Ollama Assessment",
lines=18,
interactive=False,
show_label=True,
)
dl_council = gr.DownloadButton("⬇ council_report.json", value=None)
# ---- Handler ----
def handle_run_council(
    taxonomy_map, topic_df,
    progress=gr.Progress(track_tqdm=True)
):
    """Run the research council and return its assessments.

    Args:
        taxonomy_map: Taxonomy map from state; required.
        topic_df: Topic table from state, forwarded to the council.
        progress: Gradio progress tracker.

    Returns:
        Always a 5-tuple matching the click wiring: (status HTML, Mistral
        text, Gemini text, Ollama text, DownloadButton update). The guard
        paths previously returned only four values, which does not match the
        five wired outputs.
    """
    def _failure(message_html):
        # One empty string per assessment panel; button left untouched.
        return message_html, "", "", "", gr.update()

    if not taxonomy_map:
        return _failure(
            "<div class='error-box'>❌ Run taxonomy mapping first (Tab 3 or full pipeline).</div>"
        )
    if not any([MISTRAL_API_KEY, GEMINI_API_KEY]):
        return _failure(
            "<div class='error-box'>❌ No API keys found. "
            "Add MISTRAL_API_KEY and/or GEMINI_API_KEY "
            "in your Space Settings β†’ Variables and Secrets.</div>"
        )
    try:
        _ensure_output_dir()
        progress(0.1, desc="Preparing findings…")
        result = run_agentic_council(
            taxonomy_map=taxonomy_map,
            topic_df=topic_df,
            mistral_api_key=MISTRAL_API_KEY,
            gemini_api_key=GEMINI_API_KEY,
            ollama_url=OLLAMA_URL,
        )
        progress(0.9, desc="Saving report…")
        saved = _safe_save_json(result, "council_report.json")
        progress(1.0, desc="Council complete!")
        status = "<div class='success-box'>βœ… Council complete. See assessments below.</div>"
        return (
            status,
            result.get("mistral", ""),
            result.get("gemini", ""),
            result.get("ollama", ""),
            gr.update(value=saved),
        )
    except Exception as e:
        return _failure(f"<div class='error-box'>❌ Council failed: {e}</div>")
btn_run_council.click(
fn=handle_run_council,
inputs=[state_taxonomy_map, state_topic_df],
outputs=[council_status, mistral_output, gemini_output, ollama_output, dl_council]
)
# Auto-fill if council already ran (e.g. via full pipeline)
state_council_result.change(
fn=lambda cr: (
cr.get("mistral", "") if cr else "",
cr.get("gemini", "") if cr else "",
cr.get("ollama", "") if cr else "",
),
inputs=[state_council_result],
outputs=[mistral_output, gemini_output, ollama_output]
)
# ==================================================================
# TAB 7 — Export Center
# ==================================================================
with gr.Tab("πŸ“¦ Export Center"):
gr.Markdown("## Export Center & Methodology Notes")
with gr.Row():
# BUG 5 FIX: all value=None β€” updated dynamically after pipeline
dl_topic = gr.DownloadButton(
"⬇ topic_review_table.csv",
value=None,
elem_id="dl_topic"
)
dl_mapping = gr.DownloadButton(
"⬇ pajais_mapping.csv",
value=None,
elem_id="dl_mapping"
)
dl_comparison = gr.DownloadButton(
"⬇ comparison.csv",
value=None,
elem_id="dl_comparison"
)
with gr.Row():
dl_taxonomy = gr.DownloadButton(
"⬇ taxonomy_map.json",
value=None,
elem_id="dl_taxonomy"
)
dl_narrative = gr.DownloadButton(
"⬇ narrative.txt",
value=None,
elem_id="dl_narrative"
)
dl_log = gr.DownloadButton(
"⬇ agent.log",
value=str(OUTPUTS_DIR / "agent.log"),
elem_id="dl_log"
)
btn_download_all = gr.Button(
"πŸ“¦ Download All as ZIP",
variant="primary",
elem_id="btn_download_all"
)
zip_output = gr.File(
label="All Artifacts (ZIP)",
elem_id="zip_output",
visible=False
)
def handle_download_all():
    """Bundle the exported artifacts into a ZIP and reveal the file widget.

    Returns a single update for ``zip_output``: the archive path with the
    component made visible when ``_make_zip()`` yields a path, otherwise a
    cleared, hidden component.
    """
    zip_path = _make_zip()
    if zip_path:
        return gr.update(value=zip_path, visible=True)
    # A file component's value must be a path or None; a message string here
    # would be treated as a (nonexistent) file path.
    return gr.update(value=None, visible=False)
btn_download_all.click(
    fn=handle_download_all,
    inputs=[],
    # One entry per component: value and visibility travel in one update.
    outputs=[zip_output]
)
gr.Markdown("---")
btn_print_summary = gr.Button(
"πŸ–¨ Print-Ready Summary",
variant="secondary",
elem_id="btn_print_summary"
)
print_summary_output = gr.Markdown(elem_id="print_summary_output")
btn_print_summary.click(
fn=lambda td, tm: _print_ready_summary(td, tm),
inputs=[state_topic_df, state_taxonomy_map],
outputs=[print_summary_output]
)
gr.Markdown("---")
gr.Markdown(
"""
## πŸ“– Methodology Notes
### LDA Topic Modeling
This system uses **Latent Dirichlet Allocation (LDA)** implemented via the
[Gensim](https://radimrehurek.com/gensim/) library. LDA is a generative
probabilistic model that discovers latent thematic structures in a text
corpus by modeling each document as a mixture of topics and each topic as
a distribution over words. The pipeline includes bigram phrase detection,
TF-IDF filtering, and UMass coherence scoring to ensure topic quality.
### PAJAIS Taxonomy (20 Themes)
The 20 canonical PAJAIS themes span IS Strategy, Digital Transformation,
IT Adoption, Knowledge Management, E-Commerce, AI/ML, Blockchain,
Healthcare IS, Social Media, Big Data, Cloud Computing, Cybersecurity,
IS in Asia-Pacific, Mobile Computing, IS Research Methods, Organizational IS,
HCI, IS Education, Sustainability, and FinTech.
### Coherence Scoring & Publishability
Topic coherence is measured using the UMass metric, which captures semantic
relatedness among top topic words. A topic is deemed **publishable** when
it meets two thresholds: `doc_count > 5` (sufficient scholarly attention)
and `coherence > 0.30` (semantic stability).
### Abstract vs Title Methodology
Separate LDA models are trained on article abstracts and titles independently.
Topics appearing exclusively in abstracts represent **latent constructs** β€”
ideas actively studied but not yet positioned as headline contributions.
Topics exclusive to titles signal **positioning keywords** favored by authors
as first-impression signals to reviewers and readers.
### DBSCAN Semantic Clustering
Papers are embedded using TF-IDF β†’ Truncated SVD (LSA) for both title and
abstract text independently. DBSCAN is applied to each embedding space with
configurable Ξ΅ and min_samples parameters. Cluster assignments are merged
via a weighted vote (configurable abstract weight). Large clusters are
recursively bisected; tiny clusters with fewer than min_membership documents
are reassigned to their nearest valid cluster or marked as noise.
### Agentic Research Council
The council convenes two independent AI models (Mistral and Gemini)
to assess the gap analysis findings from complementary epistemological
perspectives. Each panel member produces a structured assessment of the
most publishable gaps, methodological recommendations, and regional focus.
Their independent outputs can be compared side-by-side to identify
consensus positions and productive disagreements.
---
*Built for PAJAIS Research Intelligence*
"""
)
# ==================================================================
# Wire handle_full_run outputs to all DownloadButtons + new states
# ==================================================================
btn_full_run.click(
fn=handle_full_run,
inputs=[file_input],
outputs=[
validation_info, preview_df,
state_df, state_agent_result,
state_topic_df, state_comparison_df, state_taxonomy_map,
state_lda_result,
error_display,
# BUG 5 FIX: wire paths back to DownloadButtons across all tabs
topic_download, # Tab 2
mapping_download, # Tab 3
comparison_download, # Tab 4
dl_taxonomy, # Export Center
narrative_download, # Tab 5
dl_topic, # Export Center duplicate
dl_mapping, # Export Center duplicate
# New: cluster + council states populated if agent ran them
state_cluster_df,
state_cluster_summary,
state_council_result,
]
)
# ---------------------------------------------------------------------------
# Launch
# ---------------------------------------------------------------------------
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
show_error=True,
)