Spaces:
Runtime error
Runtime error
"""
agent.py - Pipeline controller for the Topic Modelling System

Orchestrates all phases in sequence and returns structured results.
"""
| import os | |
| import logging | |
| from pathlib import Path | |
| import pandas as pd | |
| from tools import ( | |
| load_csv, | |
| extract_topics, | |
| build_review_table, | |
| compare_title_abstract, | |
| map_pajais, | |
| gap_analysis, | |
| save_comparison_csv, | |
| save_taxonomy_json, | |
| generate_narrative, | |
| ) | |
# Configure the root logger at import time so every pipeline phase emits
# timestamped INFO-level progress lines without further setup by the caller.
# Note: basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)

# Module-level logger used by run_pipeline for all progress messages.
log = logging.getLogger(__name__)
def run_pipeline(csv_path: str, output_dir: str = ".") -> dict:
    """
    Execute the full topic modelling pipeline.

    Runs the phases in order: data loading, topic extraction, review table,
    title/abstract comparison, PAJAIS mapping, gap analysis, output files and
    narrative generation. Three files are written into ``output_dir``:
    ``comparison.csv``, ``taxonomy_map.json`` and ``narrative.txt``.

    Parameters
    ----------
    csv_path : Path to input CSV file.
    output_dir : Directory where output files will be written. It is created
        if it does not exist; an empty string means the current working
        directory.

    Returns
    -------
    dict with keys:
        review_df     - DataFrame (topic_id, keyword, frequency)
        comparison_df - DataFrame (title vs abstract comparison)
        topics        - list of enriched topic dicts
        gap           - dict of gap-analysis statistics
        narrative     - str (narrative text, target 480-520 words)
        output_files  - list of written file paths
        record_count  - int, len() of the data loaded from ``csv_path``

    Notes
    -----
    The topic-count target (at least 98) and the narrative length target
    (480-520 words) are not enforced; a warning is logged when either is
    missed and the pipeline continues.
    """
    # Targets taken from the phase log messages below.
    min_topics = 98
    min_words, max_words = 480, 520

    # os.makedirs("") raises FileNotFoundError, so an empty output_dir is
    # treated as "write relative to the current working directory".
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # -- PHASE 1: Data Loading ------------------------------------------------
    log.info("Phase 1 | Loading data from: %s", csv_path)
    df = load_csv(csv_path)
    log.info(" Loaded %d records.", len(df))

    # -- PHASE 2: Topic Extraction --------------------------------------------
    log.info("Phase 2 | Extracting topics (target ≥ 98) …")
    raw_topics = extract_topics(df, n_topics=100)
    log.info(" Extracted %d topics.", len(raw_topics))
    if len(raw_topics) < min_topics:
        log.warning(
            " Only %d topics extracted; target is at least %d.",
            len(raw_topics), min_topics,
        )

    # -- PHASE 3: Review Table ------------------------------------------------
    log.info("Phase 3 | Building review table …")
    review_df = build_review_table(raw_topics)

    # -- PHASE 4: Comparison --------------------------------------------------
    log.info("Phase 4 | Comparing title vs abstract themes …")
    comparison_df = compare_title_abstract(df, raw_topics)

    # -- PHASE 5: PAJAIS Mapping ----------------------------------------------
    log.info("Phase 5 | Mapping topics to PAJAIS taxonomy …")
    enriched_topics = map_pajais(raw_topics)

    # -- PHASE 5.5: Gap Analysis ----------------------------------------------
    log.info("Phase 5.5 | Running gap analysis …")
    gap = gap_analysis(enriched_topics)
    log.info(
        " MAPPED=%d (%.1f%%) NOVEL=%d (%.1f%%)",
        gap["mapped_count"], gap["mapped_percent"],
        gap["novel_count"], gap["novel_percent"],
    )

    # -- PHASE 6: Output Files ------------------------------------------------
    log.info("Phase 6 | Writing output files …")
    comparison_path = os.path.join(output_dir, "comparison.csv")
    taxonomy_path = os.path.join(output_dir, "taxonomy_map.json")
    save_comparison_csv(comparison_df, comparison_path)
    save_taxonomy_json(enriched_topics, gap, taxonomy_path)
    log.info(" Saved: %s", comparison_path)
    log.info(" Saved: %s", taxonomy_path)

    # -- PHASE 7: Narrative ---------------------------------------------------
    log.info("Phase 7 | Generating narrative (target 480-520 words) …")
    narrative = generate_narrative(review_df, comparison_df, enriched_topics, gap)
    # Whitespace-separated token count is used as the word count.
    word_count = len(narrative.split())
    log.info(" Narrative word count: %d", word_count)
    if not min_words <= word_count <= max_words:
        log.warning(
            " Narrative word count %d is outside the %d-%d target.",
            word_count, min_words, max_words,
        )

    narrative_path = os.path.join(output_dir, "narrative.txt")
    with open(narrative_path, "w", encoding="utf-8") as f:
        f.write(narrative)
    log.info(" Saved: %s", narrative_path)

    output_files = [comparison_path, taxonomy_path, narrative_path]
    log.info("Pipeline complete. All phases executed successfully.")

    return {
        "review_df": review_df,
        "comparison_df": comparison_df,
        "topics": enriched_topics,
        "gap": gap,
        "narrative": narrative,
        "output_files": output_files,
        "record_count": len(df),
    }