| from tools import ResearchTools
|
| import pandas as pd
|
| from typing import Dict
|
|
|
| class ResearchAgent:
|
|
|
| def __init__(self):
|
| self.tools = ResearchTools()
|
| self.results = {}
|
|
|
| def plan(self):
|
| self.pipeline = [
|
| "Load and validate data",
|
| "Preprocess text",
|
| "Perform topic modeling",
|
| "Label topics",
|
| "Compare title vs abstract themes",
|
| "Extract unique themes",
|
| "Map themes to taxonomy",
|
| "Generate outputs"
|
| ]
|
| print("π Pipeline planned:")
|
| for i, step in enumerate(self.pipeline, 1):
|
| print(f" {i}. {step}")
|
|
|
| def execute_pipeline(self, csv_path: str) -> Dict:
|
| print("="*60)
|
| print("π€ RESEARCH AGENT - STARTING PIPELINE")
|
| print("="*60)
|
|
|
| try:
|
| self.plan()
|
| print()
|
|
|
|
|
| print("π Loading data...")
|
| df = self.tools.load_csv(csv_path)
|
| if df is None or df.empty:
|
| raise ValueError("DataFrame is empty")
|
|
|
| self.results['num_documents'] = len(df)
|
|
|
|
|
| print("π§Ή Preprocessing...")
|
| df = self.tools.preprocess_corpus(df)
|
|
|
|
|
| print("π― Topic modeling...")
|
| topic_model, topic_info = self.tools.perform_topic_modeling(
|
| df['combined_clean'].tolist(), n_topics=100
|
| )
|
|
|
| self.results['num_topics'] = len(topic_info)
|
|
|
|
|
| print("π·οΈ Labeling topics...")
|
| label_df = self.tools.label_topics(topic_model, topic_info)
|
|
|
| topic_table = pd.merge(
|
| topic_info[['Topic', 'Count']],
|
| label_df,
|
| left_on='Topic',
|
| right_on='topic_id',
|
| how='left'
|
| )
|
|
|
| topic_table = topic_table[['topic_id', 'keywords', 'label', 'Count']]
|
| topic_table = topic_table.rename(columns={'Count': 'document_count'})
|
|
|
|
|
| print("π Comparing...")
|
| comparison_df = self.tools.compare_title_abstract_themes(df, topic_model)
|
|
|
|
|
| print("π Extracting themes...")
|
| all_themes = self.tools.extract_themes(label_df['label'].tolist())
|
|
|
|
|
| print("πΊοΈ Mapping...")
|
| taxonomy_map = self.tools.map_to_taxonomy(all_themes)
|
|
|
|
|
| print("πΎ Saving outputs...")
|
| self.tools.save_outputs(comparison_df, taxonomy_map, topic_table)
|
|
|
|
|
| self.tools.generate_keywords_csv(topic_table, taxonomy_map)
|
|
|
| print("β
DONE")
|
| return self.results
|
|
|
| except Exception as e:
|
| import traceback
|
| traceback.print_exc()
|
| return {"error": str(e)} |