# File size: 3,041 Bytes (4bf6942)
from tools import ResearchTools
import pandas as pd
from typing import Dict
class ResearchAgent:
    """Drive the topic-research pipeline over a CSV corpus.

    Every processing step is delegated to a ``ResearchTools`` instance held
    in ``self.tools``; this class only sequences the steps, prints progress
    messages and records summary counts.

    Attributes:
        tools: The ``ResearchTools`` helper that performs each step.
        results: Summary counts of the most recent run
            (``num_documents`` and ``num_topics``).
        pipeline: Human-readable step names, filled in by :meth:`plan`.
    """

    # NOTE(review): the leading Greek characters in the status messages below
    # ("π", "π€", "π§Ή", ...) look like emoji whose UTF-8 bytes were decoded
    # through a Greek code page. They are kept exactly as found because the
    # original glyphs cannot be recovered reliably from this file; restore
    # them from the upstream copy.

    def __init__(self):
        """Create the tool helper and empty result/plan containers."""
        self.tools = ResearchTools()
        self.results = {}
        # Defined here so the attribute exists even before plan() is called.
        self.pipeline = []

    def plan(self):
        """Store the ordered list of pipeline step names and print it."""
        self.pipeline = [
            "Load and validate data",
            "Preprocess text",
            "Perform topic modeling",
            "Label topics",
            "Compare title vs abstract themes",
            "Extract unique themes",
            "Map themes to taxonomy",
            "Generate outputs",
        ]
        print("π Pipeline planned:")
        for i, step in enumerate(self.pipeline, 1):
            print(f" {i}. {step}")

    def execute_pipeline(self, csv_path: str, n_topics: int = 100) -> Dict:
        """Run every pipeline step on the CSV at ``csv_path``.

        Args:
            csv_path: Path handed to ``self.tools.load_csv``.
            n_topics: Value forwarded as ``n_topics`` to
                ``self.tools.perform_topic_modeling``. Defaults to 100,
                the value that was previously hard-coded.

        Returns:
            ``self.results`` (keys ``num_documents`` and ``num_topics``) on
            success. If any step raises, the traceback is printed and
            ``{"error": <message>}`` is returned instead; the exception is
            not propagated.
        """
        print("=" * 60)
        print("π€ RESEARCH AGENT - STARTING PIPELINE")
        print("=" * 60)

        # Rebind rather than clear: counts from an earlier run must not leak
        # into this one, and a dict already returned to a caller must not be
        # mutated behind its back.
        self.results = {}

        try:
            self.plan()
            print()

            # Load
            print("π Loading data...")
            df = self.tools.load_csv(csv_path)
            if df is None or df.empty:
                raise ValueError("DataFrame is empty")
            self.results['num_documents'] = len(df)

            # Preprocess
            print("π§Ή Preprocessing...")
            df = self.tools.preprocess_corpus(df)

            # Topic modeling
            print("π― Topic modeling...")
            topic_model, topic_info = self.tools.perform_topic_modeling(
                df['combined_clean'].tolist(), n_topics=n_topics
            )
            self.results['num_topics'] = len(topic_info)

            # Label
            print("π·οΈ Labeling topics...")
            label_df = self.tools.label_topics(topic_model, topic_info)

            # Left merge: every row of topic_info is kept even when label_df
            # has no matching topic_id.
            topic_table = pd.merge(
                topic_info[['Topic', 'Count']],
                label_df,
                left_on='Topic',
                right_on='topic_id',
                how='left',
            )
            topic_table = topic_table[['topic_id', 'keywords', 'label', 'Count']]
            topic_table = topic_table.rename(columns={'Count': 'document_count'})

            # Compare
            print("π Comparing...")
            comparison_df = self.tools.compare_title_abstract_themes(df, topic_model)

            # Themes
            print("π Extracting themes...")
            all_themes = self.tools.extract_themes(label_df['label'].tolist())

            # Mapping
            print("πΊοΈ Mapping...")
            taxonomy_map = self.tools.map_to_taxonomy(all_themes)

            # Save outputs
            print("πΎ Saving outputs...")
            self.tools.save_outputs(comparison_df, taxonomy_map, topic_table)

            # Additional output built from the topic table and taxonomy map.
            self.tools.generate_keywords_csv(topic_table, taxonomy_map)

            # This literal was split across two source lines after its first
            # character, which is a syntax error; it is rejoined here.
            # NOTE(review): "β" followed by a line break is consistent with
            # the UTF-8 bytes of U+2705 (E2 9C 85) read through a Greek code
            # page, so the check mark is restored - confirm against upstream.
            print("✅ DONE")
            return self.results
        except Exception as e:
            # Top-level boundary: report the failure and hand the caller an
            # error dict instead of propagating the exception.
            import traceback
            traceback.print_exc()
            return {"error": str(e)}