Spaces:

ronitsonawane24
/

tryH

Runtime error

App Files Files Community

tryH / agent.py

ronitsonawane24

Upload 4 files

b7e9bf6 verified about 2 months ago

raw

history blame contribute delete

4.96 kB

	"""
	agent.py - Pipeline controller for the Topic Modelling System
	Orchestrates all phases in sequence and returns structured results.
	"""

	import os
	import logging
	from pathlib import Path

	import pandas as pd

	from tools import (
	load_csv,
	extract_topics,
	build_review_table,
	compare_title_abstract,
	map_pajais,
	gap_analysis,
	save_comparison_csv,
	save_taxonomy_json,
	generate_narrative,
	)

	# Configure root logging when this module is imported: INFO level and
	# above, each line prefixed with an HH:MM:SS timestamp and the level name.
	logging.basicConfig(
	    datefmt="%H:%M:%S",
	    format="%(asctime)s [%(levelname)s] %(message)s",
	    level=logging.INFO,
	)

	# Module-level logger, named after this module.
	log = logging.getLogger(__name__)


	def run_pipeline(csv_path: str, output_dir: str = ".") -> dict:
	    """
	    Execute the full topic modelling pipeline.

	    Runs the phases in a fixed order: load the CSV, extract topics, build
	    the review table, compare title vs abstract themes, map topics to the
	    PAJAIS taxonomy, run the gap analysis, write the output files and
	    generate the narrative. Progress is reported through the module logger.

	    Parameters
	    ----------
	    csv_path : Path to input CSV file.
	    output_dir : Directory where output files will be written. It is
	        created if it does not exist; an empty value is treated as the
	        current directory.

	    Returns
	    -------
	    dict with keys:
	        review_df     – DataFrame (topic_id, keyword, frequency)
	        comparison_df – DataFrame (title vs abstract comparison)
	        topics        – list of enriched topic dicts
	        gap           – dict of gap-analysis statistics
	        narrative     – str (narrative text; target 480-520 words, the
	                        actual word count is only logged, not enforced)
	        output_files  – list of written file paths (comparison.csv,
	                        taxonomy_map.json, narrative.txt)
	        record_count  – int, number of records in the loaded data
	    """
	    # os.makedirs("") raises FileNotFoundError, so fall back to the
	    # current directory when no output directory is given.
	    output_dir = output_dir or "."
	    os.makedirs(output_dir, exist_ok=True)

	    # Phase 1: data loading
	    log.info("Phase 1 | Loading data from: %s", csv_path)
	    df = load_csv(csv_path)
	    log.info(" Loaded %d records.", len(df))

	    # Phase 2: topic extraction
	    log.info("Phase 2 | Extracting topics (target ≥ 98) …")
	    raw_topics = extract_topics(df, n_topics=100)
	    log.info(" Extracted %d topics.", len(raw_topics))

	    # Phase 3: review table
	    log.info("Phase 3 | Building review table …")
	    review_df = build_review_table(raw_topics)

	    # Phase 4: title vs abstract comparison
	    log.info("Phase 4 | Comparing title vs abstract themes …")
	    comparison_df = compare_title_abstract(df, raw_topics)

	    # Phase 5: PAJAIS mapping
	    log.info("Phase 5 | Mapping topics to PAJAIS taxonomy …")
	    enriched_topics = map_pajais(raw_topics)

	    # Phase 5.5: gap analysis on the enriched (mapped) topics
	    log.info("Phase 5.5 | Running gap analysis …")
	    gap = gap_analysis(enriched_topics)
	    log.info(
	        " MAPPED=%d (%.1f%%) NOVEL=%d (%.1f%%)",
	        gap["mapped_count"], gap["mapped_percent"],
	        gap["novel_count"], gap["novel_percent"],
	    )

	    # Phase 6: output files
	    log.info("Phase 6 | Writing output files …")
	    comparison_path = os.path.join(output_dir, "comparison.csv")
	    taxonomy_path = os.path.join(output_dir, "taxonomy_map.json")

	    save_comparison_csv(comparison_df, comparison_path)
	    save_taxonomy_json(enriched_topics, gap, taxonomy_path)
	    log.info(" Saved: %s", comparison_path)
	    log.info(" Saved: %s", taxonomy_path)

	    # Phase 7: narrative
	    log.info("Phase 7 | Generating narrative (target 480-520 words) …")
	    narrative = generate_narrative(review_df, comparison_df, enriched_topics, gap)
	    word_count = len(narrative.split())
	    log.info(" Narrative word count: %d", word_count)

	    narrative_path = os.path.join(output_dir, "narrative.txt")
	    with open(narrative_path, "w", encoding="utf-8") as f:
	        f.write(narrative)
	    log.info(" Saved: %s", narrative_path)

	    output_files = [comparison_path, taxonomy_path, narrative_path]

	    log.info("Pipeline complete. All phases executed successfully.")

	    return {
	        "review_df": review_df,
	        "comparison_df": comparison_df,
	        "topics": enriched_topics,
	        "gap": gap,
	        "narrative": narrative,
	        "output_files": output_files,
	        "record_count": len(df),
	    }