Spaces:

reyansh2005
/

topic-model

Sleeping

App Files Files Community

topic-model / agent.py

reyansh2005

Upload 7 files

24ae147 verified about 1 month ago

raw

history blame contribute delete

11.3 kB

	"""
	agent.py — TopicAgent orchestrates the end-to-end topic modeling workflow.

	This module defines the TopicAgent class, which:
	1. Loads and validates the CSV dataset.
	2. Preprocesses text for Titles and Abstracts separately.
	3. Runs topic modeling on each corpus (≥100 topics guaranteed).
	4. Generates human-readable labels for every topic.
	5. Compares dominant themes across Title and Abstract topics.
	6. Produces a taxonomy map (MAPPED / NOVEL classification).
	7. Exports structured outputs: topics table, comparison CSV, taxonomy JSON.

	Usage:
	    agent = TopicAgent(csv_path="dataset.csv")
	    results = agent.run()
	"""

	import os
	import json
	import logging
	from dataclasses import dataclass, field
	from typing import Dict, Any, Optional

	import pandas as pd

	from tools import (
	load_csv,
	preprocess_text,
	run_topic_modeling,
	generate_labels,
	compare_themes,
	create_taxonomy_map,
	)

	# ---------------------------------------------------------------------------
	# Logging
	# ---------------------------------------------------------------------------
	logger = logging.getLogger(__name__)


	# ---------------------------------------------------------------------------
	# Structured result container
	# ---------------------------------------------------------------------------
	@dataclass
	class AgentResult:
	    """Container for all outputs produced by the TopicAgent.

	    Every field has a default, so ``AgentResult()`` describes a pipeline
	    that has not produced anything yet (``status == "pending"``).
	    ``TopicAgent`` fills the fields in as its steps complete.
	    """

	    # Core dataframes: per-source topic tables, their concatenation, and the
	    # title-vs-abstract comparison table.
	    title_topics: pd.DataFrame = field(default_factory=pd.DataFrame)
	    abstract_topics: pd.DataFrame = field(default_factory=pd.DataFrame)
	    combined_topics: pd.DataFrame = field(default_factory=pd.DataFrame)
	    comparison: pd.DataFrame = field(default_factory=pd.DataFrame)

	    # Taxonomy map (dict serialisable to JSON)
	    taxonomy_map: Dict[str, Any] = field(default_factory=dict)

	    # Execution metadata: "pending" until a run finishes, then "success" or
	    # "failed"; step names are appended in completion order and error
	    # messages are stored as strings.
	    status: str = "pending"
	    steps_completed: list = field(default_factory=list)
	    errors: list = field(default_factory=list)

	    # File paths of exported artefacts, keyed by artefact name
	    exported_files: Dict[str, str] = field(default_factory=dict)


	# ---------------------------------------------------------------------------
	# TopicAgent
	# ---------------------------------------------------------------------------
	class TopicAgent:
	    """
	    Orchestrates the research-paper topic modeling pipeline.

	    Parameters
	    ----------
	    csv_path : str
	        Path to the input CSV file.
	    output_dir : str
	        Directory to write output files. An empty string means the current
	        working directory.
	    min_topics : int
	        Minimum number of topics to generate per source (default 100).
	    use_llm_labels : bool
	        Whether to use Groq LLM for label generation.
	    groq_api_key : str, optional
	        API key for Groq (used only when use_llm_labels is True).
	    """

	    def __init__(
	        self,
	        csv_path: str,
	        output_dir: str = "outputs",
	        min_topics: int = 100,
	        use_llm_labels: bool = False,
	        groq_api_key: Optional[str] = None,
	    ):
	        self.csv_path = csv_path
	        self.output_dir = output_dir
	        self.min_topics = min_topics
	        self.use_llm_labels = use_llm_labels
	        self.groq_api_key = groq_api_key

	        # Ensure output directory exists
	        self._ensure_output_dir()

	        self._result = AgentResult()

	    # -----------------------------------------------------------------
	    # Public interface
	    # -----------------------------------------------------------------
	    def run(self) -> AgentResult:
	        """
	        Execute the full pipeline step by step.

	        Each call starts from a fresh ``AgentResult``, so results returned
	        by earlier calls are never modified and steps/errors do not
	        accumulate across runs.

	        Returns
	        -------
	        AgentResult
	            Structured results including all DataFrames, taxonomy, and file
	            paths. If a step raises an ``Exception``, it is logged rather
	            than propagated: ``status`` is ``"failed"``, the message is
	            appended to ``errors`` and ``steps_completed`` lists the steps
	            that finished before the failure.
	        """
	        logger.info("=" * 60)
	        logger.info("TopicAgent — Starting pipeline")
	        logger.info("=" * 60)

	        # Reset state so a re-run does not append to the previous run's
	        # steps_completed / errors or mutate a result the caller still holds.
	        self._result = AgentResult()

	        # Steps 1-9, in execution order; later steps read attributes that
	        # earlier steps store on ``self``.
	        pipeline = (
	            self._step_load_csv,
	            self._step_preprocess,
	            self._step_model_titles,
	            self._step_model_abstracts,
	            self._step_generate_labels,
	            self._step_combine_topics,
	            self._step_compare_themes,
	            self._step_taxonomy_map,
	            self._step_export,
	        )

	        try:
	            for step in pipeline:
	                step()

	            self._result.status = "success"
	            logger.info("Pipeline completed successfully.")

	        except Exception as exc:
	            # Top-level boundary: report the failure through the result
	            # object instead of raising to the caller.
	            self._result.status = "failed"
	            self._result.errors.append(str(exc))
	            logger.error("Pipeline failed: %s", exc, exc_info=True)

	        return self._result

	    # -----------------------------------------------------------------
	    # Internal helpers
	    # -----------------------------------------------------------------
	    def _ensure_output_dir(self) -> None:
	        """Create ``output_dir`` if needed; an empty path is left alone."""
	        # os.makedirs("") raises FileNotFoundError, while os.path.join("", name)
	        # already resolves to the current directory, so skip the empty case.
	        if self.output_dir:
	            os.makedirs(self.output_dir, exist_ok=True)

	    def _export_csv(self, key: str, filename: str, frame: pd.DataFrame) -> None:
	        """Write ``frame`` to ``output_dir/filename`` and record it under ``key``."""
	        path = os.path.join(self.output_dir, filename)
	        frame.to_csv(path, index=False)
	        self._result.exported_files[key] = path
	        logger.info(" → Saved: %s", path)

	    # -----------------------------------------------------------------
	    # Pipeline steps
	    # -----------------------------------------------------------------
	    def _step_load_csv(self) -> None:
	        """Step 1 — Ingest CSV dataset."""
	        logger.info("Step 1/9: Loading CSV …")
	        self._df = load_csv(self.csv_path)
	        self._result.steps_completed.append("load_csv")
	        logger.info(" → %d papers loaded.", len(self._df))

	    def _step_preprocess(self) -> None:
	        """Step 2 — Preprocess Title and Abstract text."""
	        logger.info("Step 2/9: Preprocessing text …")
	        # The loaded frame must expose "Title" and "Abstract" columns.
	        self._titles_clean = preprocess_text(self._df["Title"].tolist())
	        self._abstracts_clean = preprocess_text(self._df["Abstract"].tolist())
	        self._result.steps_completed.append("preprocess_text")
	        logger.info(" → Titles preprocessed: %d docs", len(self._titles_clean))
	        logger.info(" → Abstracts preprocessed: %d docs", len(self._abstracts_clean))

	    def _step_model_titles(self) -> None:
	        """Step 3 — Topic modeling on Titles."""
	        logger.info("Step 3/9: Topic modeling on Titles …")
	        # The helper's return value is unpacked into a topics table and a
	        # model object; only the table is used by later steps.
	        self._title_topics_df, self._title_model = run_topic_modeling(
	            self._titles_clean,
	            source_label="Titles",
	            min_topics=self.min_topics,
	        )
	        self._result.steps_completed.append("topic_modeling_titles")
	        logger.info(" → %d title topics discovered.", len(self._title_topics_df))

	    def _step_model_abstracts(self) -> None:
	        """Step 4 — Topic modeling on Abstracts."""
	        logger.info("Step 4/9: Topic modeling on Abstracts …")
	        self._abstract_topics_df, self._abstract_model = run_topic_modeling(
	            self._abstracts_clean,
	            source_label="Abstracts",
	            min_topics=self.min_topics,
	        )
	        self._result.steps_completed.append("topic_modeling_abstracts")
	        logger.info(" → %d abstract topics discovered.", len(self._abstract_topics_df))

	    def _step_generate_labels(self) -> None:
	        """Step 5 — Generate human-readable labels."""
	        logger.info("Step 5/9: Generating topic labels …")
	        self._title_topics_df = generate_labels(
	            self._title_topics_df,
	            use_llm=self.use_llm_labels,
	            groq_api_key=self.groq_api_key,
	        )
	        self._abstract_topics_df = generate_labels(
	            self._abstract_topics_df,
	            use_llm=self.use_llm_labels,
	            groq_api_key=self.groq_api_key,
	        )
	        # Publish the labelled tables on the result object.
	        self._result.title_topics = self._title_topics_df
	        self._result.abstract_topics = self._abstract_topics_df
	        self._result.steps_completed.append("generate_labels")
	        logger.info(" → Labels generated for all topics.")

	    def _step_combine_topics(self) -> None:
	        """Step 6 — Combine title and abstract topics into one table."""
	        logger.info("Step 6/9: Building combined topics table …")
	        combined = pd.concat(
	            [self._title_topics_df, self._abstract_topics_df],
	            ignore_index=True,
	        )
	        # Sequential id across both sources (title rows first).
	        combined["global_id"] = range(len(combined))
	        self._result.combined_topics = combined
	        self._result.steps_completed.append("combine_topics")
	        logger.info(" → Combined table: %d topics total.", len(combined))

	    def _step_compare_themes(self) -> None:
	        """Step 7 — Compare title vs abstract themes."""
	        logger.info("Step 7/9: Comparing title vs abstract themes …")
	        comparison = compare_themes(self._title_topics_df, self._abstract_topics_df)
	        self._result.comparison = comparison
	        self._result.steps_completed.append("compare_themes")
	        logger.info(" → Comparison table: %d rows.", len(comparison))

	    def _step_taxonomy_map(self) -> None:
	        """Step 8 — Create taxonomy map (MAPPED / NOVEL)."""
	        logger.info("Step 8/9: Building taxonomy map …")
	        # Use the combined topics for taxonomy
	        taxonomy = create_taxonomy_map(self._result.combined_topics)
	        self._result.taxonomy_map = taxonomy
	        self._result.steps_completed.append("create_taxonomy_map")
	        # The returned mapping is expected to carry a "metadata" entry with
	        # "mapped_count" and "novel_count"; a missing key fails the step.
	        logger.info(
	            " → MAPPED: %d, NOVEL: %d",
	            taxonomy["metadata"]["mapped_count"],
	            taxonomy["metadata"]["novel_count"],
	        )

	    def _step_export(self) -> None:
	        """Step 9 — Export all outputs to disk."""
	        logger.info("Step 9/9: Exporting outputs …")

	        # The directory was created in __init__, but may have been removed
	        # since then; re-creating it is a no-op when it still exists.
	        self._ensure_output_dir()

	        # (a) Combined topics table CSV
	        self._export_csv("topics_table", "topics_table.csv", self._result.combined_topics)

	        # (b) Comparison CSV
	        self._export_csv("comparison", "comparison.csv", self._result.comparison)

	        # (c) Taxonomy map JSON
	        taxonomy_path = os.path.join(self.output_dir, "taxonomy_map.json")
	        with open(taxonomy_path, "w", encoding="utf-8") as f:
	            json.dump(self._result.taxonomy_map, f, indent=2, ensure_ascii=False)
	        self._result.exported_files["taxonomy_map"] = taxonomy_path
	        logger.info(" → Saved: %s", taxonomy_path)

	        # (d) Title topics CSV
	        self._export_csv("title_topics", "title_topics.csv", self._result.title_topics)

	        # (e) Abstract topics CSV
	        self._export_csv("abstract_topics", "abstract_topics.csv", self._result.abstract_topics)

	        self._result.steps_completed.append("export")
	        logger.info(" → All outputs exported successfully.")