# tryH / agent.py
# ronitsonawane24's picture
# Upload 4 files
# b7e9bf6 verified
"""
agent.py - Pipeline controller for the Topic Modelling System
Orchestrates all phases in sequence and returns structured results.
"""
import os
import logging
from pathlib import Path
import pandas as pd
from tools import (
load_csv,
extract_topics,
build_review_table,
compare_title_abstract,
map_pajais,
gap_analysis,
save_comparison_csv,
save_taxonomy_json,
generate_narrative,
)
# Log line layout: "<HH:MM:SS> [<LEVEL>] <message>".
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
_LOG_DATEFMT = "%H:%M:%S"

# Configure the root logger at import time (INFO and above).
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, datefmt=_LOG_DATEFMT)

# Module-level logger named after this module.
log = logging.getLogger(__name__)
def run_pipeline(csv_path: str, output_dir: str = ".") -> dict:
    """
    Execute the full topic modelling pipeline.

    Runs every phase in order (load, extract, review table, title/abstract
    comparison, PAJAIS mapping, gap analysis, file output, narrative) by
    delegating to the helpers imported from ``tools``, logging progress at
    each step.

    Parameters
    ----------
    csv_path : Path to input CSV file.
    output_dir : Directory where output files will be written. It is
        created if it does not already exist.

    Returns
    -------
    dict with keys:
        review_df     - DataFrame (topic_id, keyword, frequency)
        comparison_df - DataFrame (title vs abstract comparison)
        topics        - list of enriched topic dicts
        gap           - dict of gap-analysis statistics
        narrative     - str (narrative text, target 480-520 words)
        output_files  - list of written file paths
        record_count  - int, number of records loaded from the CSV

    Notes
    -----
    Three files are written into ``output_dir``: ``comparison.csv``,
    ``taxonomy_map.json`` and ``narrative.txt``. Missing the topic-count or
    word-count targets is reported as a warning only; the pipeline still
    completes and returns its results.
    """
    # Targets announced in the progress messages; checked below so that a
    # miss is visible in the log instead of passing silently.
    min_topics = 98
    min_words, max_words = 480, 520

    os.makedirs(output_dir, exist_ok=True)

    # ── PHASE 1: Data Loading ────────────────────────────────────────────────
    log.info("Phase 1 | Loading data from: %s", csv_path)
    df = load_csv(csv_path)
    log.info(" Loaded %d records.", len(df))

    # ── PHASE 2: Topic Extraction ────────────────────────────────────────────
    # The original message carried a mis-decoded "≥" (mojibake); restored here.
    log.info("Phase 2 | Extracting topics (target ≥ 98) …")
    raw_topics = extract_topics(df, n_topics=100)
    log.info(" Extracted %d topics.", len(raw_topics))
    if len(raw_topics) < min_topics:
        log.warning(
            " Topic count %d is below the target of %d.",
            len(raw_topics), min_topics
        )

    # ── PHASE 3: Review Table ────────────────────────────────────────────────
    log.info("Phase 3 | Building review table …")
    review_df = build_review_table(raw_topics)

    # ── PHASE 4: Comparison ──────────────────────────────────────────────────
    log.info("Phase 4 | Comparing title vs abstract themes …")
    comparison_df = compare_title_abstract(df, raw_topics)

    # ── PHASE 5: PAJAIS Mapping ──────────────────────────────────────────────
    log.info("Phase 5 | Mapping topics to PAJAIS taxonomy …")
    enriched_topics = map_pajais(raw_topics)

    # ── PHASE 5.5: Gap Analysis ──────────────────────────────────────────────
    log.info("Phase 5.5 | Running gap analysis …")
    gap = gap_analysis(enriched_topics)
    log.info(
        " MAPPED=%d (%.1f%%) NOVEL=%d (%.1f%%)",
        gap["mapped_count"], gap["mapped_percent"],
        gap["novel_count"], gap["novel_percent"]
    )

    # ── PHASE 6: Output Files ────────────────────────────────────────────────
    log.info("Phase 6 | Writing output files …")
    # os.path.join is kept (rather than pathlib) so the returned path strings
    # stay exactly as callers have seen them, e.g. "./comparison.csv".
    comparison_path = os.path.join(output_dir, "comparison.csv")
    taxonomy_path = os.path.join(output_dir, "taxonomy_map.json")
    save_comparison_csv(comparison_df, comparison_path)
    save_taxonomy_json(enriched_topics, gap, taxonomy_path)
    log.info(" Saved: %s", comparison_path)
    log.info(" Saved: %s", taxonomy_path)

    # ── PHASE 7: Narrative ───────────────────────────────────────────────────
    log.info("Phase 7 | Generating narrative (target 480-520 words) …")
    narrative = generate_narrative(review_df, comparison_df, enriched_topics, gap)
    # Word count is a simple whitespace split of the generated text.
    word_count = len(narrative.split())
    log.info(" Narrative word count: %d", word_count)
    if not min_words <= word_count <= max_words:
        log.warning(
            " Narrative word count %d is outside the target range %d-%d.",
            word_count, min_words, max_words
        )
    narrative_path = os.path.join(output_dir, "narrative.txt")
    with open(narrative_path, "w", encoding="utf-8") as f:
        f.write(narrative)
    log.info(" Saved: %s", narrative_path)

    output_files = [comparison_path, taxonomy_path, narrative_path]
    log.info("Pipeline complete. All phases executed successfully.")
    return {
        "review_df": review_df,
        "comparison_df": comparison_df,
        "topics": enriched_topics,
        "gap": gap,
        "narrative": narrative,
        "output_files": output_files,
        "record_count": len(df)
    }