# Spaces: reyansh2005 / bert-topic
# Sleeping
# File size: 26,486 Bytes
# f19d5b6

# tools.py — Scientific Document Topic Analyzer
# Built on the Braun & Clarke (2006) Thematic Analysis Framework.
# Implementation: Zero-loop, zero-exception, functional-first logic.

from dotenv import load_dotenv
load_dotenv()

import re
import json
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from langchain_core.tools import tool
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_mistralai import ChatMistralAI
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import nltk

# Initialize NLP resources
nltk.download("punkt",     quiet=True)
nltk.download("punkt_tab", quiet=True)
from nltk.tokenize import sent_tokenize

# --- Global Configuration & Taxonomy ---

# Maps a run key ("abstract" / "title") to a list of CSV column names.
# run_bertopic_discovery looks up COLUMN_MAP[run_key][0], i.e. only the first
# entry of each list is used by the visible code.
COLUMN_MAP = {
    "abstract": ["Abstract"],
    "title":    ["Title"],
}

# Model identifier handed to SentenceTransformer when embedding sentences.
EMBEDDING_MODEL_ID   = "all-MiniLM-L6-v2"
# How many sentences are reported per cluster under "nearest_sentences".
NEAREST_NEIGHBORS_K  = 5
# Maximum number of topics included in a single LLM prompt.
MAX_TOPIC_BATCH_SIZE = 60
# Maximum number of sentences embedded and clustered in one discovery run.
SENTENCE_HARD_LIMIT  = 3000
# NOTE(review): not referenced anywhere in the visible code; the discovery tool
# declares its own default of 0.7 in its signature.
DEFAULT_CLUSTERING_THRESHOLD = 0.7
# Passed as `timeout` to ChatMistralAI; presumably seconds — TODO confirm.
LLM_GATEWAY_TIMEOUT  = 120

# Regex patterns to filter out standard academic publishing noise.
# _is_unwanted_metadata tests each pattern with re.search(..., re.IGNORECASE),
# so a match anywhere in a sentence marks the whole sentence as boilerplate and
# the lower-case character classes below also match upper-case text.
JUNK_TEXT_REGEXES = [
    r"©\s*\d{4}",
    r"elsevier\s*(b\.v\.)?",
    r"springer\s*(nature)?",
    r"wiley\s*(online\s*library)?",
    r"all\s+rights\s+reserved",
    r"published\s+by\s+[a-z\s]+",
    r"doi:\s*10\.",
    r"www\.[a-z]+\.[a-z]+",
    r"https?://",
    r"copyright\s*\d{4}",
    r"taylor\s*&\s*francis",
    r"sage\s+publications",
    r"emerald\s+publishing",
    r"journal\s+of\s+[a-z\s]+issn",
    r"volume\s+\d+,?\s+issue\s+\d+",
    r"pp\.\s*\d+[-–]\d+",
    r"received\s+\d+\s+\w+\s+\d{4}",
    r"accepted\s+\d+\s+\w+\s+\d{4}",
    r"available\s+online",
    r"this\s+is\s+an\s+open\s+access",
    r"creative\s+commons",
    r"please\s+cite\s+this\s+article",
]

# Reference category names for taxonomy alignment. compare_with_taxonomy
# renders this list as "- <name>" lines and presents it to the LLM as the
# "Official Categories" that discovered themes are mapped onto.
CATEGORY_HIERARCHY_PAJAIS = [
    "Artificial Intelligence Methods",
    "Natural Language Processing",
    "Machine Learning",
    "Deep Learning",
    "Knowledge Representation",
    "Ontologies & Semantic Web",
    "Information Retrieval",
    "Recommender Systems",
    "Decision Support Systems",
    "Human-Computer Interaction",
    "Explainability & Transparency",
    "Fairness, Accountability & Ethics",
    "Data Management & Integration",
    "Text Mining & Analytics",
    "Sentiment Analysis",
    "Social Media Analysis",
    "Business Intelligence",
    "Process Automation & RPA",
    "Computer Vision",
    "Speech & Audio Processing",
    "Multi-Agent Systems",
    "Robotics & Autonomous Systems",
    "Healthcare & Biomedical AI",
    "Finance & Risk Analytics",
    "Education & E-Learning",
]

# --- Internal Utility Logic ---

def _is_unwanted_metadata(text_segment: str) -> bool:
    """Return True when any boilerplate pattern occurs in the segment (case-insensitive)."""
    for junk_pattern in JUNK_TEXT_REGEXES:
        if re.search(junk_pattern, text_segment, re.IGNORECASE):
            return True
    return False

def _refine_sentence_list(raw_list: list) -> list:
    """Keep only sentences that are not boilerplate and have at least six whitespace-separated words."""
    return [
        sentence
        for sentence in raw_list
        if not _is_unwanted_metadata(sentence) and len(sentence.split()) >= 6
    ]

def _extract_sentences_from_corpus(text_blocks: list) -> list:
    """Sentence-split every text block, flatten the result, and return the cleaned sentences."""
    all_sentences = []
    for block_text in text_blocks:
        all_sentences.extend(sent_tokenize(block_text))
    return _refine_sentence_list(all_sentences)

# Loaded SentenceTransformer instances keyed by model id. Instantiating the
# model is by far the most expensive step of embedding, so it is done once per
# process and reused by every subsequent call.
_EMBEDDING_MODEL_CACHE: dict = {}

def _generate_vector_embeddings(sentence_list: list) -> np.ndarray:
    """
    Converts text into normalized vector representations using SBERT.

    Args:
        sentence_list: Sentences to embed.

    Returns:
        The array produced by ``SentenceTransformer.encode`` with
        ``normalize_embeddings=True`` (one row per input sentence).
    """
    vector_engine = _EMBEDDING_MODEL_CACHE.get(EMBEDDING_MODEL_ID)
    if vector_engine is None:
        # First use in this process: load the model and remember it.
        vector_engine = SentenceTransformer(EMBEDDING_MODEL_ID)
        _EMBEDDING_MODEL_CACHE[EMBEDDING_MODEL_ID] = vector_engine
    return vector_engine.encode(sentence_list, normalize_embeddings=True, show_progress_bar=False)

def _perform_hierarchical_clustering(vectors: np.ndarray, distance_cutoff: float) -> np.ndarray:
    """Run average-linkage agglomerative clustering on cosine distance and return one label per vector."""
    # n_clusters=None lets the distance threshold alone decide how many clusters form.
    clusterer = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_cutoff,
        metric="cosine",
        linkage="average",
    )
    return clusterer.fit_predict(vectors)

def _perform_dbscan_clustering(vectors: np.ndarray, eps: float = 0.3, min_samples: int = 5) -> np.ndarray:
    """Run cosine-metric DBSCAN and return one label per vector; noise points are labelled -1."""
    density_model = DBSCAN(metric="cosine", eps=eps, min_samples=min_samples)
    return density_model.fit_predict(vectors)

def _get_cluster_centroids(vectors: np.ndarray, group_labels: np.ndarray) -> dict:
    """Return {cluster id: mean vector} for every label except the noise label -1, in ascending id order."""
    centroids = {}
    for group_id in sorted(set(group_labels.tolist())):
        if group_id == -1:
            # -1 marks unclustered (noise) points; they get no centroid.
            continue
        member_rows = vectors[group_labels == group_id]
        centroids[group_id] = member_rows.mean(axis=0)
    return centroids

def _find_exemplary_sentences(midpoint_vector: np.ndarray, all_texts: list,
                              all_vectors: np.ndarray, top_k: int) -> list:
    """Return the top_k texts whose vectors have the highest cosine similarity to the given centroid."""
    similarity_row = cosine_similarity([midpoint_vector], all_vectors)[0]
    # argsort is ascending, so reverse it to rank the most similar first.
    ranked_positions = np.argsort(similarity_row)[::-1]
    return [all_texts[position] for position in ranked_positions[:top_k].tolist()]

def _assemble_cluster_summaries(group_labels: np.ndarray, text_source: list,
                                vector_source: np.ndarray) -> list:
    """Return one JSON-serializable summary dict per cluster, ordered by cluster id."""
    centroid_by_cluster = _get_cluster_centroids(vector_source, group_labels)

    summaries = []
    for cluster_id in sorted(centroid_by_cluster):
        centroid = centroid_by_cluster[cluster_id]
        summaries.append({
            "topic_id": cluster_id,
            # Number of sentences carrying this cluster label.
            "count":    int((group_labels == cluster_id).sum()),
            "centroid": centroid.tolist(),
            "nearest_sentences": _find_exemplary_sentences(
                centroid, text_source, vector_source, NEAREST_NEIGHBORS_K),
        })
    return summaries

def _initialize_llm_client() -> ChatMistralAI:
    """Build the ChatMistralAI client shared by the labeling, synthesis and narrative tools."""
    client_settings = {
        "model":       "mistral-large-latest",
        "temperature": 0.2,
        "timeout":     LLM_GATEWAY_TIMEOUT,
        # No client-side retries are requested.
        "max_retries": 0,
    }
    return ChatMistralAI(**client_settings)

# --- Primary Analysis Tools ---

@tool
def load_scopus_csv(file_path: str) -> str:
    """
    Ingests a Scopus CSV, cleans the data, and prepares it for analysis.
    Saves 'loaded_data.csv' as a local cache.
    """
    # The docstring above is kept verbatim because @tool exposes it to the
    # agent as the tool description; implementation notes live in comments.
    #
    # Args:    file_path - path of the CSV file to read.
    # Returns: a JSON string with record count, sentence counts, year range,
    #          column names, coverage percentages and up to five sample titles.
    # Side effect: rewrites the parsed table to "loaded_data.csv" in the
    #          current working directory.
    source_df = pd.read_csv(
        file_path,
        encoding="utf-8-sig",   # accepts files with or without a UTF-8 BOM
        quoting=0,              # 0 == csv.QUOTE_MINIMAL
        engine="python",
        on_bad_lines="skip",    # malformed rows are dropped instead of aborting the load
    )
    source_df.to_csv("loaded_data.csv", index=False, encoding="utf-8")

    total_records = len(source_df)
    header_list   = list(source_df.columns)

    def _column_texts(run_key):
        """Return the non-null cells of the column mapped to run_key as strings ([] if absent)."""
        # Column names come from COLUMN_MAP so the loader and the discovery
        # tool always agree on which column a run key refers to.
        column_name = COLUMN_MAP[run_key][0]
        if column_name not in header_list:
            return []
        return list(source_df[column_name].dropna().astype(str))

    def _coverage_pct(texts):
        """Share of records (in percent, one decimal) that have a non-null value."""
        return round(len(texts) / total_records * 100, 1) if total_records else 0

    raw_abstracts = _column_texts("abstract")
    raw_titles    = _column_texts("title")

    processed_abstracts = _extract_sentences_from_corpus(raw_abstracts)
    processed_titles    = _extract_sentences_from_corpus(raw_titles)

    publication_years = pd.to_numeric(source_df["Year"], errors="coerce").dropna() if "Year" in header_list else pd.Series([], dtype=float)
    period_string     = f"{int(publication_years.min())} – {int(publication_years.max())}" if len(publication_years) > 0 else "N/A"

    return json.dumps({
        "papers":               total_records,
        "abstract_sentences":   len(processed_abstracts),
        "title_sentences":      len(processed_titles),
        "year_range":           period_string,
        "columns":              header_list,
        "abstract_coverage_pct": _coverage_pct(raw_abstracts),
        "title_coverage_pct":    _coverage_pct(raw_titles),
        # Taken from the string-cast list so non-string cells (e.g. numpy
        # scalars from a numeric column) cannot break JSON serialization.
        "sample_titles":        raw_titles[:5],
        "file_saved":           "loaded_data.csv",
        "note": f"Clustering cap set to {SENTENCE_HARD_LIMIT} entries for efficiency.",
    }, indent=2)

def _render_discovery_charts(run_key: str, thematic_summaries: list) -> list:
    """
    Write the four Plotly HTML charts for a discovery run.

    Args:
        run_key: Run identifier used in chart titles and file names.
        thematic_summaries: Non-empty list of cluster summary dicts with
            "topic_id", "count" and "centroid" keys.

    Returns:
        The four chart file names, in the order they were written.
    """
    entry_counts     = [node["count"] for node in thematic_summaries]
    node_identifiers = [node["topic_id"] for node in thematic_summaries]
    centroid_stack   = np.array([node["centroid"] for node in thematic_summaries])

    # Visual 1: Inter-topic mapping via PCA
    # PCA cannot produce more components than samples or features, so the
    # target dimensionality shrinks to 1 when there is a single cluster.
    dimension_count = min(2, len(centroid_stack), centroid_stack.shape[1])
    reduced_coords  = PCA(n_components=dimension_count).fit_transform(centroid_stack)
    dimension_x     = reduced_coords[:, 0].tolist()
    dimension_y     = (reduced_coords[:, 1].tolist() if reduced_coords.shape[1] > 1 else [0] * len(dimension_x))

    map_fig = px.scatter(
        x=dimension_x, y=dimension_y,
        size=entry_counts, text=list(map(str, node_identifiers)),
        title=f"Thematic Landscape ({run_key})",
        labels={"x": "Factor 1", "y": "Factor 2"},
        size_max=40, color=entry_counts, color_continuous_scale="Viridis",
    )
    map_fig.update_traces(textposition="top center")
    map_fig.update_layout(template="plotly_white")
    v_file_1 = f"chart_{run_key}_intertopic.html"
    map_fig.write_html(v_file_1, include_plotlyjs="cdn")

    # Visual 2: Sentence distribution bar chart
    # Summaries are ordered by topic id, so rank by sentence count first;
    # otherwise "Top 30" would just be the 30 lowest ids.
    top_nodes = sorted(thematic_summaries, key=lambda n: n["count"], reverse=True)[:30]
    top_counts = [n["count"] for n in top_nodes]
    bar_fig  = px.bar(
        x=[f"Topic {n['topic_id']}" for n in top_nodes],
        y=top_counts,
        title=f"Thematic Weight Distribution ({run_key}) — Top 30",
        labels={"x": "Theme ID", "y": "Sentence Count"},
        color=top_counts,
        color_continuous_scale="Aggrnyl",
    )
    bar_fig.update_layout(template="plotly_white")
    v_file_2 = f"chart_{run_key}_bars.html"
    bar_fig.write_html(v_file_2, include_plotlyjs="cdn")

    # Visual 3: Hierarchical Treemap
    tree_fig = px.treemap(
        names=[f"ID:{n['topic_id']}" for n in thematic_summaries],
        parents=["Corpus"] * len(thematic_summaries),
        values=entry_counts,
        title=f"Topological Hierarchy ({run_key})",
    )
    tree_fig.update_layout(template="plotly_white")
    v_file_3 = f"chart_{run_key}_hierarchy.html"
    tree_fig.write_html(v_file_3, include_plotlyjs="cdn")

    # Visual 4: Semantic Connectivity Matrix (first 20 topics by id)
    preview_nodes   = thematic_summaries[:20]
    preview_vectors = np.array([n["centroid"] for n in preview_nodes])
    similarity_grid = cosine_similarity(preview_vectors).tolist()
    axis_labels     = [f"T{n['topic_id']}" for n in preview_nodes]
    heat_fig        = go.Figure(data=go.Heatmap(z=similarity_grid, x=axis_labels, y=axis_labels, colorscale="YlGnBu"))
    heat_fig.update_layout(title=f"Semantic Proximity Heatmap ({run_key})", template="plotly_white")
    v_file_4 = f"chart_{run_key}_heatmap.html"
    heat_fig.write_html(v_file_4, include_plotlyjs="cdn")

    return [v_file_1, v_file_2, v_file_3, v_file_4]

@tool
def run_bertopic_discovery(run_key: str = "abstract", threshold: float = 0.7, method: str = "hierarchical") -> str:
    """
    Executes the BERTopic discovery logic: Embedding -> Clustering -> Visualization.
    Outputs interactive Plotly charts and cluster summaries.
    Supports both Hierarchical and DBSCAN clustering methods.
    """
    # The docstring above is kept verbatim because @tool exposes it to the
    # agent as the tool description; implementation notes live in comments.
    #
    # Args:
    #   run_key   - key into COLUMN_MAP selecting the text column to analyse.
    #   threshold - distance threshold for hierarchical clustering, or eps
    #               when method == "dbscan".
    #   method    - "dbscan" selects DBSCAN; any other value uses hierarchical.
    # Returns: a JSON string describing the run (topic count, sentence counts,
    #          output file names and the first three topic summaries).
    # Side effects: writes emb_<run_key>.npy, summaries_<run_key>.json and,
    #          when at least one cluster exists, four chart HTML files.
    cached_df           = pd.read_csv("loaded_data.csv")
    target_col          = COLUMN_MAP[run_key][0]
    unstructured_texts  = list(cached_df[target_col].dropna().astype(str))

    global_sentence_pool = _extract_sentences_from_corpus(unstructured_texts)

    # Apply sentence limit to prevent memory overflow
    optimized_sentence_pool = global_sentence_pool[:SENTENCE_HARD_LIMIT]
    print(f"[Core Discovery] Processing {len(optimized_sentence_pool)} sentences from total pool of {len(global_sentence_pool)}.")

    semantic_vectors = _generate_vector_embeddings(optimized_sentence_pool)
    np.save(f"emb_{run_key}.npy", semantic_vectors)

    if method == "dbscan":
        cluster_assignments = _perform_dbscan_clustering(semantic_vectors, eps=threshold, min_samples=5)
    else:
        cluster_assignments = _perform_hierarchical_clustering(semantic_vectors, threshold)
    thematic_summaries  = _assemble_cluster_summaries(cluster_assignments, optimized_sentence_pool, semantic_vectors)

    # Written as UTF-8 to match how the labeling tool reads this file back.
    with open(f"summaries_{run_key}.json", "w", encoding="utf-8") as storage_file:
        json.dump(thematic_summaries, storage_file, indent=2)

    # With no clusters (e.g. DBSCAN marked every sentence as noise) there is
    # nothing to plot, and the chart code would fail on an empty centroid array.
    chart_files = _render_discovery_charts(run_key, thematic_summaries) if thematic_summaries else []

    return json.dumps({
        "run_key":          run_key,
        "total_topics":     len(thematic_summaries),
        "total_sentences":  len(global_sentence_pool),
        "sentences_used":   len(optimized_sentence_pool),
        "sentences_capped": len(global_sentence_pool) > SENTENCE_HARD_LIMIT,
        "threshold_used":   threshold,
        "summaries_file":   f"summaries_{run_key}.json",
        "embeddings_file":  f"emb_{run_key}.npy",
        "charts":           chart_files,
        "topics_preview":   thematic_summaries[:3],
    }, indent=2)

@tool
def label_topics_with_llm(run_key: str = "abstract") -> str:
    """
    Queries Mistral AI to provide human-readable labels and metadata for clusters.
    Uses batch processing to minimize API overhead and latency.
    """
    # The docstring above is kept verbatim because @tool exposes it to the
    # agent as the tool description; implementation notes live in comments.
    #
    # Args:    run_key - selects summaries_<run_key>.json as input.
    # Returns: a JSON string with the run key, number of labelled topics,
    #          the output file name and the first five labelled entries.
    # Side effect: writes labels_<run_key>.json.
    with open(f"summaries_{run_key}.json", encoding="utf-8") as raw_json:
        cluster_list = json.load(raw_json)

    active_subset = cluster_list[:MAX_TOPIC_BATCH_SIZE]

    # Structure data for the LLM's consumption (two sample sentences per topic
    # keep the prompt compact).
    llm_payload = [
        {
            "topic_id": node["topic_id"],
            "count":    node["count"],
            "sentences": node["nearest_sentences"][:2],
        }
        for node in active_subset
    ]

    ai_response = []
    if active_subset:
        # Only contact the LLM when there is something to label.
        llm_handler      = _initialize_llm_client()
        json_interpreter = JsonOutputParser()

        label_prompt = PromptTemplate(
            input_variables=["input_json", "total_count"],
            template=(
                "You are a specialized thematic coder for academic literature.\n\n"
                "Analyze the following clusters discovered through BERTopic. "
                "For each cluster, derive a research-oriented label with AI Council-style reasoning.\n\n"
                "{input_json}\n\n"
                "Respond ONLY with a JSON array containing these keys for each entry:\n"
                "  topic_id (int), label (3-6 words), category (methodology/theory/application/context/empirical), "
                "  confidence (float), reasoning (object with keys: method, data, impact), niche (bool).\n\n"
                "Reasoning structure (use brief, focused explanations):\n"
                "  method: Explain the methodological or theoretical lens applied to this cluster (1-2 sentences)\n"
                "  data: Describe the empirical patterns or evidence supporting this grouping (1-2 sentences)\n"
                "  impact: Articulate the research or practice implications of this theme (1-2 sentences)\n\n"
                "Generate entries for ALL {total_count} topics provided."
            ),
        )

        inference_chain = label_prompt | llm_handler | json_interpreter
        ai_response     = inference_chain.invoke({
            "input_json":  json.dumps(llm_payload, indent=2),
            "total_count": len(active_subset),
        })

    # Map AI results back to the original database. Entries that are not
    # objects or lack a topic_id are ignored so the per-field fallbacks below
    # apply instead of the whole call failing on one malformed item.
    usable_entries = ai_response if isinstance(ai_response, list) else []
    response_directory = {
        str(item["topic_id"]): item
        for item in usable_entries
        if isinstance(item, dict) and "topic_id" in item
    }

    def _format_reasoning(reasoning_obj):
        """Converts multi-part reasoning structure into a readable string."""
        if isinstance(reasoning_obj, dict):
            section_titles = (("method", "Method"), ("data", "Data"), ("impact", "Impact"))
            parts = [
                f"{title}: {reasoning_obj[key]}"
                for key, title in section_titles
                if key in reasoning_obj
            ]
            return " | ".join(parts)
        return str(reasoning_obj) if reasoning_obj else ""

    def _merge_with_suggestion(original):
        """Combine one cluster summary with the LLM's entry, falling back to defaults per field."""
        suggestion = response_directory.get(str(original["topic_id"]), {})
        return {
            "topic_id":          original["topic_id"],
            "count":             original["count"],
            "nearest_sentences": original["nearest_sentences"],
            "label":             suggestion.get("label",    f"Concept Group {original['topic_id']}"),
            "category":          suggestion.get("category", "application"),
            "confidence":        suggestion.get("confidence", 0.5),
            "reasoning":         _format_reasoning(suggestion.get("reasoning", "")),
            "niche":             suggestion.get("niche",    False),
        }

    final_labels = [_merge_with_suggestion(original) for original in active_subset]

    export_path = f"labels_{run_key}.json"
    # Written as UTF-8 to match how the consolidation tool reads this file back.
    with open(export_path, "w", encoding="utf-8") as out_file:
        json.dump(final_labels, out_file, indent=2)

    return json.dumps({
        "run_key":         run_key,
        "total_labelled":  len(final_labels),
        "output_file":    export_path,
        "preview":        final_labels[:5],
    }, indent=2)

@tool
def consolidate_into_themes(run_key: str = "abstract", theme_map: str = "") -> str:
    """
    Groups individual topic clusters into broader research themes.
    Employs LLM-driven synthesis if no manual mapping is provided.
    """
    # The docstring above is kept verbatim because @tool exposes it to the
    # agent as the tool description; implementation notes live in comments.
    #
    # Args:
    #   run_key   - selects labels_<run_key>.json as input.
    #   theme_map - optional JSON object {theme name: [topic ids]}; when blank
    #               the themes are proposed by the LLM instead.
    # Returns: a JSON string with the run key, theme count, output file name
    #          and a name/size preview of every theme.
    # Side effects: writes themes_<run_key>.json and themes.json.
    with open(f"labels_{run_key}.json", encoding="utf-8") as raw_data:
        labeled_topics = json.load(raw_data)

    topic_lookup_table  = {str(t["topic_id"]): t for t in labeled_topics}
    manual_theme_design = json.loads(theme_map) if theme_map.strip() else {}

    def _build_from_manual(name_id_pair):
        """Assemble one theme from a (theme name, topic id list) pair of the manual mapping."""
        theme_title, topic_id_list = name_id_pair
        # Ids are compared as strings so "3" and 3 refer to the same topic.
        wanted_ids      = {str(topic_id) for topic_id in topic_id_list}
        matching_topics = [t for t in labeled_topics if str(t["topic_id"]) in wanted_ids]
        aggregate_docs  = sum(t["count"] for t in matching_topics)
        sample_quotes   = [s for t in matching_topics for s in t.get("nearest_sentences", [])][:5]
        return {
            "theme_name":             theme_title,
            "topic_ids":              list(map(int, topic_id_list)),
            "total_sentences":        aggregate_docs,
            "representative_sentences": sample_quotes,
            "constituent_labels":     [t.get("label", "") for t in matching_topics],
        }

    def _sample_snippet(topic):
        """First 100 characters of the topic's first nearest sentence, or "" when it has none."""
        sentences = topic.get("nearest_sentences") or [""]
        return sentences[0][:100]

    def _build_from_intelligence():
        """Ask the LLM to group the labelled topics and attach counts/labels to each proposed theme."""
        llm_client      = _initialize_llm_client()
        json_output_mod = JsonOutputParser()
        synthesis_prompt = PromptTemplate(
            input_variables=["topic_definitions"],
            template=(
                "You are performing Phase 3 & 4 of thematic analysis (Braun & Clarke).\n\n"
                "Data Clusters:\n{topic_definitions}\n\n"
                "Consolidate these into 4-8 broad research themes.\n"
                "Format: JSON array of objects with theme_name, topic_ids (list), rationale, representative_sentences (list).\n"
            ),
        )
        flow = synthesis_prompt | llm_client | json_output_mod
        compact_definitions = [
            {"topic_id": t["topic_id"], "label": t.get("label", ""), "sample": _sample_snippet(t)}
            for t in labeled_topics[:MAX_TOPIC_BATCH_SIZE]
        ]
        generated_themes = flow.invoke({"topic_definitions": json.dumps(compact_definitions, indent=2)})
        # Only JSON objects can be themes; anything else in the reply is skipped.
        theme_objects = [th for th in generated_themes if isinstance(th, dict)] if isinstance(generated_themes, list) else []
        return [
            {
                **th,
                "total_sentences": sum(topic_lookup_table.get(str(tid), {}).get("count", 0) for tid in th.get("topic_ids", [])),
                "constituent_labels": [topic_lookup_table.get(str(tid), {}).get("label", "") for tid in th.get("topic_ids", [])],
            }
            for th in theme_objects
        ]

    final_thematic_set = (
        [_build_from_manual(pair) for pair in manual_theme_design.items()]
        if manual_theme_design
        else _build_from_intelligence()
    )

    theme_store_1 = f"themes_{run_key}.json"
    with open(theme_store_1, "w", encoding="utf-8") as f1:
        json.dump(final_thematic_set, f1, indent=2)
    # themes.json always mirrors the most recent consolidation run.
    with open("themes.json", "w", encoding="utf-8") as f_canonical:
        json.dump(final_thematic_set, f_canonical, indent=2)

    return json.dumps({
        "run_key":       run_key,
        "total_themes":  len(final_thematic_set),
        "output_file":   theme_store_1,
        # .get guards against an LLM-proposed theme that omitted its name.
        "themes_preview": [{"name": th.get("theme_name", ""), "size": th.get("total_sentences", 0)} for th in final_thematic_set],
    }, indent=2)

@tool
def compare_with_taxonomy(run_key: str = "abstract") -> str:
    """
    Aligns discovered themes with the PAJAIS research taxonomy.
    Flags 'NOVEL' themes that represent potential scientific gaps.
    """
    # Prefer the run-specific themes file; fall back to the shared themes.json.
    run_themes_path = f"themes_{run_key}.json"
    if os.path.exists(run_themes_path):
        chosen_path = run_themes_path
    else:
        chosen_path = "themes.json"

    with open(chosen_path, encoding="utf-8") as themes_handle:
        themes = json.load(themes_handle)

    llm        = _initialize_llm_client()
    parse_json = JsonOutputParser()

    prompt = PromptTemplate(
        input_variables=["theme_input", "taxonomy_str"],
        template=(
            "You are a taxonomy alignment specialist.\n\n"
            "Official Categories:\n{taxonomy_str}\n\n"
            "User Themes:\n{theme_input}\n\n"
            "Map each theme to the closest official category. If it is a completely new direction, mark as NOVEL.\n"
            "Format: JSON array with theme_name, pajais_match, match_confidence, reasoning, is_novel.\n"
        ),
    )
    chain = prompt | llm | parse_json

    def _describe(theme):
        """Reduce a theme to the fields shown to the LLM: name, labels, and a short evidence snippet."""
        sentences = theme.get("representative_sentences")
        return {
            "theme_name":        theme["theme_name"],
            "constituent_labels": theme.get("constituent_labels", []),
            "evidence":          sentences[0][:100] if sentences else "",
        }

    theme_descriptions = [_describe(theme) for theme in themes]
    taxonomy_lines     = [f"- {category}" for category in CATEGORY_HIERARCHY_PAJAIS]

    mapping = chain.invoke({
        "theme_input":  json.dumps(theme_descriptions, indent=2),
        "taxonomy_str": "\n".join(taxonomy_lines),
    })

    with open("taxonomy_map.json", "w", encoding="utf-8") as map_handle:
        json.dump(mapping, map_handle, indent=2)

    # Entries whose is_novel value is truthy count as novel; the rest as standard.
    novel_total = len([entry for entry in mapping if entry.get("is_novel", False)])

    report = {
        "run_key":             run_key,
        "total_mapped":       len(mapping),
        "novel_entries":       novel_total,
        "standard_entries":    len(mapping) - novel_total,
        "mapping_file":        "taxonomy_map.json",
        "detailed_mapping":    mapping,
    }
    return json.dumps(report, indent=2)

@tool
def generate_comparison_csv() -> str:
    """
    Aggregates results from Abstract and Title analyses into a single comparative report.
    """
    # The docstring above is kept verbatim because @tool exposes it to the
    # agent as the tool description; implementation notes live in comments.
    #
    # Returns: a JSON string with the output file name, the row count and the
    #          first three rows.
    # Side effect: writes comparison.csv.
    def _read_theme_data(key):
        """Load themes_<key>.json, or return [] when that file does not exist."""
        path = f"themes_{key}.json"
        if not os.path.exists(path):
            return []
        # Context manager so the handle is closed deterministically.
        with open(path, encoding="utf-8") as theme_file:
            return json.load(theme_file)

    def _name_prefix(theme):
        """Lower-cased first five characters of the theme name, used for the rough match test."""
        return str(theme.get("theme_name", ""))[:5].lower()

    abstract_run_data = _read_theme_data("abstract")
    title_run_data    = _read_theme_data("title")
    # At least one row is always produced, even when neither run has themes.
    max_count         = max(len(abstract_run_data), len(title_run_data), 1)

    # Pad the shorter list with empty themes so rows can be paired by position.
    abs_padded = abstract_run_data + [{}] * (max_count - len(abstract_run_data))
    ttl_padded = title_run_data    + [{}] * (max_count - len(title_run_data))

    comparative_rows = [
        {
            "ID":               row_number,
            "Abstract Theme":  abs_theme.get("theme_name", ""),
            "Abstract Count":  abs_theme.get("total_sentences", 0),
            "Title Theme":     ttl_theme.get("theme_name", ""),
            "Title Count":     ttl_theme.get("total_sentences", 0),
            "Consistency":     "Matched" if _name_prefix(abs_theme) == _name_prefix(ttl_theme) else "Distinct",
        }
        for row_number, (abs_theme, ttl_theme) in enumerate(zip(abs_padded, ttl_padded), start=1)
    ]

    report_df = pd.DataFrame(comparative_rows)
    report_df.to_csv("comparison.csv", index=False)

    return json.dumps({
        "result_file": "comparison.csv",
        "row_count":   len(report_df),
        "data_peek":   comparative_rows[:3],
    }, indent=2)

@tool
def export_narrative(run_key: str = "abstract") -> str:
    """
    Generates a formal research narrative based on the thematic analysis results.
    Produces a 500-word Section 7 draft.
    """
    # The docstring above is kept verbatim because @tool exposes it to the
    # agent as the tool description; implementation notes live in comments.
    #
    # Args:    run_key - analysis run the narrative should describe.
    # Returns: a JSON string with the output file name, the word count and
    #          the first 400 characters of the generated text.
    # Side effect: writes narrative.txt.

    # Use the themes of the requested run when available; themes.json only
    # holds whichever run was consolidated last and may belong to another run.
    specific_themes_file = f"themes_{run_key}.json"
    active_themes_file   = specific_themes_file if os.path.exists(specific_themes_file) else "themes.json"
    with open(active_themes_file, encoding="utf-8") as t_in:
        thematic_data = json.load(t_in)

    # The taxonomy mapping is optional; without it the prompt gets an empty list.
    mapping_data = []
    if os.path.exists("taxonomy_map.json"):
        with open("taxonomy_map.json", encoding="utf-8") as m_in:
            mapping_data = json.load(m_in)

    narrative_llm             = _initialize_llm_client()
    # Overrides the client's default temperature of 0.2 for this tool only.
    narrative_llm.temperature = 0.4

    narrative_prompt = PromptTemplate(
        input_variables=["key", "themes", "mapping"],
        template=(
            "Write a formal academic Section 7 Discussion (approx 500 words).\n"
            "Context: {key} analysis run.\n"
            "Themes Found:\n{themes}\n\n"
            "Taxonomy Alignment:\n{mapping}\n\n"
            "Requirements:\n"
            "1. Discuss the methodology (BERTopic + Braun & Clarke).\n"
            "2. Interpret the key themes and their implications.\n"
            "3. Analyze the NOVEL vs MAPPED categories.\n"
            "4. Suggest future work. Use professional, scholarly language.\n"
        ),
    )
    composition_flow = narrative_prompt | narrative_llm
    story_response   = composition_flow.invoke({
        "key":     run_key,
        "themes":  json.dumps(thematic_data, indent=2),
        "mapping": json.dumps(mapping_data, indent=2),
    })
    # Use the response's .content when it has one, otherwise its string form.
    final_text = story_response.content if hasattr(story_response, "content") else str(story_response)

    with open("narrative.txt", "w", encoding="utf-8") as narrative_io:
        narrative_io.write(final_text)

    return json.dumps({
        "output_file": "narrative.txt",
        "word_stats":  len(final_text.split()),
        "content_start": final_text[:400],
    }, indent=2)