""" agent.py - Pipeline controller for the Topic Modelling System Orchestrates all phases in sequence and returns structured results. """ import os import logging from pathlib import Path import pandas as pd from tools import ( load_csv, extract_topics, build_review_table, compare_title_abstract, map_pajais, gap_analysis, save_comparison_csv, save_taxonomy_json, generate_narrative, ) logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S" ) log = logging.getLogger(__name__) def run_pipeline(csv_path: str, output_dir: str = ".") -> dict: """ Execute the full topic modelling pipeline. Parameters ---------- csv_path : Path to input CSV file. output_dir : Directory where output files will be written. Returns ------- dict with keys: review_df – DataFrame (topic_id, keyword, frequency) comparison_df – DataFrame (title vs abstract comparison) topics – list of enriched topic dicts gap – dict of gap-analysis statistics narrative – str (narrative text, 480-520 words) output_files – list of written file paths """ os.makedirs(output_dir, exist_ok=True) # ── PHASE 1: Data Loading ──────────────────────────────────────────────── log.info("Phase 1 | Loading data from: %s", csv_path) df = load_csv(csv_path) log.info(" Loaded %d records.", len(df)) # ── PHASE 2: Topic Extraction ──────────────────────────────────────────── log.info("Phase 2 | Extracting topics (target ≥ 98) …") raw_topics = extract_topics(df, n_topics=100) log.info(" Extracted %d topics.", len(raw_topics)) # ── PHASE 3: Review Table ──────────────────────────────────────────────── log.info("Phase 3 | Building review table …") review_df = build_review_table(raw_topics) # ── PHASE 4: Comparison ────────────────────────────────────────────────── log.info("Phase 4 | Comparing title vs abstract themes …") comparison_df = compare_title_abstract(df, raw_topics) # ── PHASE 5: PAJAIS Mapping ────────────────────────────────────────────── log.info("Phase 5 | Mapping topics to PAJAIS taxonomy …") enriched_topics = map_pajais(raw_topics) # ── PHASE 5.5: Gap Analysis ────────────────────────────────────────────── log.info("Phase 5.5 | Running gap analysis …") gap = gap_analysis(enriched_topics) log.info( " MAPPED=%d (%.1f%%) NOVEL=%d (%.1f%%)", gap["mapped_count"], gap["mapped_percent"], gap["novel_count"], gap["novel_percent"] ) # ── PHASE 6: Output Files ──────────────────────────────────────────────── log.info("Phase 6 | Writing output files …") comparison_path = os.path.join(output_dir, "comparison.csv") taxonomy_path = os.path.join(output_dir, "taxonomy_map.json") save_comparison_csv(comparison_df, comparison_path) save_taxonomy_json(enriched_topics, gap, taxonomy_path) log.info(" Saved: %s", comparison_path) log.info(" Saved: %s", taxonomy_path) # ── PHASE 7: Narrative ─────────────────────────────────────────────────── log.info("Phase 7 | Generating narrative (target 480-520 words) …") narrative = generate_narrative(review_df, comparison_df, enriched_topics, gap) word_count = len(narrative.split()) log.info(" Narrative word count: %d", word_count) narrative_path = os.path.join(output_dir, "narrative.txt") with open(narrative_path, "w", encoding="utf-8") as f: f.write(narrative) log.info(" Saved: %s", narrative_path) output_files = [comparison_path, taxonomy_path, narrative_path] log.info("Pipeline complete. All phases executed successfully.") return { "review_df": review_df, "comparison_df": comparison_df, "topics": enriched_topics, "gap": gap, "narrative": narrative, "output_files": output_files, "record_count": len(df) }