# Spaces: Enayut / topic_modelling_agent (Sleeping)
# File size: 11,033 Bytes
# Revision: ed51280

"""
agent.py — TopicAgent orchestrates the end-to-end topic modeling workflow.

This module defines the TopicAgent class, which:
  1. Loads and validates the CSV dataset.
  2. Preprocesses text for Titles and Abstracts separately.
  3. Runs topic modeling on each corpus (at least `min_topics` topics requested, 100 by default).
  4. Generates human-readable labels for every topic.
  5. Compares dominant themes across Title and Abstract topics.
  6. Produces a taxonomy map (MAPPED / NOVEL classification).
  7. Exports structured outputs: topics table, comparison CSV, taxonomy JSON.

Usage:
    agent = TopicAgent(csv_path="dataset.csv")
    results = agent.run()
"""

import os
import json
import logging
from dataclasses import dataclass, field
from typing import Dict, Any, Optional

import pandas as pd

from tools import (
    load_csv,
    preprocess_text,
    run_topic_modeling,
    generate_labels,
    compare_themes,
    create_taxonomy_map,
)

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Structured result container
# ---------------------------------------------------------------------------
# eq=False: the auto-generated dataclass ``__eq__`` would compare the DataFrame
# fields with ``==`` and then take the truth value of the result, which pandas
# rejects with ``ValueError`` ("truth value of a DataFrame is ambiguous").
# Identity-based equality avoids that and keeps instances hashable.
@dataclass(eq=False)
class AgentResult:
    """
    Container for all outputs produced by the TopicAgent.

    Two results compare equal only if they are the same object; compare the
    individual fields (e.g. with ``DataFrame.equals``) for value equality.

    Attributes
    ----------
    title_topics : pd.DataFrame
        Topics table for the Title corpus (empty until populated).
    abstract_topics : pd.DataFrame
        Topics table for the Abstract corpus (empty until populated).
    combined_topics : pd.DataFrame
        Title and abstract topics concatenated into one table.
    comparison : pd.DataFrame
        Title-vs-abstract theme comparison table.
    taxonomy_map : dict
        Taxonomy map; written to disk as JSON by the agent.
    status : str
        Pipeline status; starts as ``"pending"``.
    steps_completed : list
        Names of the pipeline steps that finished, in execution order.
    errors : list
        Error messages collected when the pipeline fails.
    exported_files : dict
        Maps an artefact name to the path it was written to.
    """
    # Core dataframes
    title_topics: pd.DataFrame = field(default_factory=pd.DataFrame)
    abstract_topics: pd.DataFrame = field(default_factory=pd.DataFrame)
    combined_topics: pd.DataFrame = field(default_factory=pd.DataFrame)
    comparison: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Taxonomy map (dict serialisable to JSON)
    taxonomy_map: Dict[str, Any] = field(default_factory=dict)

    # Execution metadata
    status: str = "pending"
    steps_completed: list = field(default_factory=list)
    errors: list = field(default_factory=list)

    # File paths of exported artefacts
    exported_files: Dict[str, str] = field(default_factory=dict)


# ---------------------------------------------------------------------------
# TopicAgent
# ---------------------------------------------------------------------------
class TopicAgent:
    """
    Orchestrates the research-paper topic modeling pipeline.

    The agent runs nine steps in order (load, preprocess, model titles,
    model abstracts, label, combine, compare, taxonomy, export) and records
    progress on an ``AgentResult``.  Every call to :meth:`run` starts from a
    fresh result, so the same instance can be run repeatedly without step
    names or errors from an earlier run leaking into a later one.

    Parameters
    ----------
    csv_path : str
        Path to the input CSV file.
    output_dir : str
        Directory to write output files.  Created on construction and
        re-checked right before export.
    min_topics : int
        Minimum number of topics to generate per source (default 100).
    use_llm_labels : bool
        Whether to use Groq LLM for label generation.
    groq_api_key : str, optional
        API key for Groq (used only when use_llm_labels is True).
    """

    def __init__(
        self,
        csv_path: str,
        output_dir: str = "outputs",
        min_topics: int = 100,
        use_llm_labels: bool = False,
        groq_api_key: Optional[str] = None,
    ) -> None:
        self.csv_path = csv_path
        self.output_dir = output_dir
        self.min_topics = min_topics
        self.use_llm_labels = use_llm_labels
        self.groq_api_key = groq_api_key

        # Ensure output directory exists
        os.makedirs(self.output_dir, exist_ok=True)

        # Placeholder so the attribute exists before the first run();
        # run() replaces it with a fresh instance each time.
        self._result = AgentResult()

    # -----------------------------------------------------------------
    # Public interface
    # -----------------------------------------------------------------
    def run(self) -> AgentResult:
        """
        Execute the full pipeline step by step.

        Any exception raised by a step is caught, logged with its traceback,
        and recorded on the result (``status == "failed"`` plus the message
        in ``errors``); it is not re-raised.

        Returns
        -------
        AgentResult
            Structured results including all DataFrames, taxonomy, and file paths.
        """
        logger.info("=" * 60)
        logger.info("TopicAgent — Starting pipeline")
        logger.info("=" * 60)

        # Start from a clean result: reusing the previous one would append
        # this run's steps/errors to those of an earlier run and could leave
        # stale frames behind if this run fails part-way.
        self._result = AgentResult()
        result = self._result

        # Steps in execution order; each one depends on state set by the
        # steps before it.
        pipeline = (
            self._step_load_csv,         # Step 1: Load CSV
            self._step_preprocess,       # Step 2: Preprocess text
            self._step_model_titles,     # Step 3: Topic modeling on Titles
            self._step_model_abstracts,  # Step 4: Topic modeling on Abstracts
            self._step_generate_labels,  # Step 5: Generate labels
            self._step_combine_topics,   # Step 6: Build combined topics table
            self._step_compare_themes,   # Step 7: Compare themes
            self._step_taxonomy_map,     # Step 8: Create taxonomy map
            self._step_export,           # Step 9: Export outputs
        )

        try:
            for step in pipeline:
                step()

            result.status = "success"
            logger.info("Pipeline completed successfully.")

        except Exception as exc:
            # Top-level boundary: report the failure through the result
            # object instead of propagating it to the caller.
            result.status = "failed"
            result.errors.append(str(exc))
            logger.error("Pipeline failed: %s", exc, exc_info=True)

        return result

    # -----------------------------------------------------------------
    # Pipeline steps
    # -----------------------------------------------------------------
    def _step_load_csv(self) -> None:
        """Step 1 — Ingest CSV dataset into ``self._df``."""
        logger.info("Step 1/9: Loading CSV …")
        self._df = load_csv(self.csv_path)
        self._result.steps_completed.append("load_csv")
        logger.info("  → %d papers loaded.", len(self._df))

    def _step_preprocess(self) -> None:
        """Step 2 — Preprocess the ``Title`` and ``Abstract`` columns separately."""
        logger.info("Step 2/9: Preprocessing text …")
        self._titles_clean = preprocess_text(self._df["Title"].tolist())
        self._abstracts_clean = preprocess_text(self._df["Abstract"].tolist())
        self._result.steps_completed.append("preprocess_text")
        logger.info("  → Titles preprocessed: %d docs", len(self._titles_clean))
        logger.info("  → Abstracts preprocessed: %d docs", len(self._abstracts_clean))

    def _step_model_titles(self) -> None:
        """Step 3 — Topic modeling on Titles."""
        logger.info("Step 3/9: Topic modeling on Titles …")
        self._title_topics_df, self._title_model = run_topic_modeling(
            self._titles_clean,
            source_label="Titles",
            min_topics=self.min_topics,
        )
        self._result.steps_completed.append("topic_modeling_titles")
        logger.info("  → %d title topics discovered.", len(self._title_topics_df))

    def _step_model_abstracts(self) -> None:
        """Step 4 — Topic modeling on Abstracts."""
        logger.info("Step 4/9: Topic modeling on Abstracts …")
        self._abstract_topics_df, self._abstract_model = run_topic_modeling(
            self._abstracts_clean,
            source_label="Abstracts",
            min_topics=self.min_topics,
        )
        self._result.steps_completed.append("topic_modeling_abstracts")
        logger.info("  → %d abstract topics discovered.", len(self._abstract_topics_df))

    def _step_generate_labels(self) -> None:
        """Step 5 — Generate human-readable labels for both topic tables."""
        logger.info("Step 5/9: Generating topic labels …")
        self._title_topics_df = generate_labels(
            self._title_topics_df,
            use_llm=self.use_llm_labels,
            groq_api_key=self.groq_api_key,
        )
        self._abstract_topics_df = generate_labels(
            self._abstract_topics_df,
            use_llm=self.use_llm_labels,
            groq_api_key=self.groq_api_key,
        )
        # Only the labelled tables are published on the result.
        self._result.title_topics = self._title_topics_df
        self._result.abstract_topics = self._abstract_topics_df
        self._result.steps_completed.append("generate_labels")
        logger.info("  → Labels generated for all topics.")

    def _step_combine_topics(self) -> None:
        """Step 6 — Combine title and abstract topics into one table."""
        logger.info("Step 6/9: Building combined topics table …")
        combined = pd.concat(
            [self._title_topics_df, self._abstract_topics_df],
            ignore_index=True,
        )
        # Sequential id across both sources (titles first, then abstracts).
        combined["global_id"] = range(len(combined))
        self._result.combined_topics = combined
        self._result.steps_completed.append("combine_topics")
        logger.info("  → Combined table: %d topics total.", len(combined))

    def _step_compare_themes(self) -> None:
        """Step 7 — Compare title vs abstract themes."""
        logger.info("Step 7/9: Comparing title vs abstract themes …")
        comparison = compare_themes(self._title_topics_df, self._abstract_topics_df)
        self._result.comparison = comparison
        self._result.steps_completed.append("compare_themes")
        logger.info("  → Comparison table: %d rows.", len(comparison))

    def _step_taxonomy_map(self) -> None:
        """Step 8 — Create taxonomy map (MAPPED / NOVEL) from the combined topics."""
        logger.info("Step 8/9: Building taxonomy map …")
        # Use the combined topics for taxonomy
        taxonomy = create_taxonomy_map(self._result.combined_topics)
        self._result.taxonomy_map = taxonomy
        self._result.steps_completed.append("create_taxonomy_map")
        logger.info(
            "  → MAPPED: %d, NOVEL: %d",
            taxonomy["metadata"]["mapped_count"],
            taxonomy["metadata"]["novel_count"],
        )

    def _export_csv(self, key: str, frame: pd.DataFrame, filename: str) -> None:
        """Write *frame* to ``output_dir/filename`` (no index) and register the path under *key*."""
        path = os.path.join(self.output_dir, filename)
        frame.to_csv(path, index=False)
        self._result.exported_files[key] = path
        logger.info("  → Saved: %s", path)

    def _step_export(self) -> None:
        """Step 9 — Export all outputs to disk and record their paths."""
        logger.info("Step 9/9: Exporting outputs …")

        # The directory was created in __init__, but it may have been removed
        # since then; make sure it is there before writing.
        os.makedirs(self.output_dir, exist_ok=True)

        result = self._result

        # (a) Combined topics table CSV
        self._export_csv("topics_table", result.combined_topics, "topics_table.csv")

        # (b) Comparison CSV
        self._export_csv("comparison", result.comparison, "comparison.csv")

        # (c) Taxonomy map JSON
        taxonomy_path = os.path.join(self.output_dir, "taxonomy_map.json")
        with open(taxonomy_path, "w", encoding="utf-8") as f:
            json.dump(result.taxonomy_map, f, indent=2, ensure_ascii=False)
        result.exported_files["taxonomy_map"] = taxonomy_path
        logger.info("  → Saved: %s", taxonomy_path)

        # (d) Title topics CSV
        self._export_csv("title_topics", result.title_topics, "title_topics.csv")

        # (e) Abstract topics CSV
        self._export_csv("abstract_topics", result.abstract_topics, "abstract_topics.csv")

        result.steps_completed.append("export")
        logger.info("  → All outputs exported successfully.")