Spaces:
Build error
Build error
| """ | |
| tools.py — 7 @tool functions for BERTopic Agentic AI | |
| Assignment: Text Analysis & Topic Modelling (Prof. Shailaja Jha) | |
| Generated via: Anthropic Claude Sonnet 4.5 | |
| Architecture: LangChain @tool + LangGraph | Model: Mistral Small Latest | |
| Rules: ZERO if/elif/else | ZERO for/while | ZERO try/except | handle_tool_error=True | |
| """ | |
| import os, re, json | |
| import numpy as np | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from sklearn.cluster import AgglomerativeClustering | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from sklearn.decomposition import PCA | |
| from langchain_core.tools import tool | |
| from langchain_mistralai import ChatMistralAI | |
| from langchain_core.prompts import PromptTemplate | |
| from langchain_core.output_parsers import JsonOutputParser, StrOutputParser | |
| # βββ CONSTANTS βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# Directory that receives every artefact written by the tools in this module.
OUTPUT_DIR = "./outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Publisher / front-matter boilerplate that is blanked out before sentences
# are split.  Each alternative runs up to the next full stop.
#   * The copyright sign is written as an escape so the pattern survives any
#     re-encoding of this file.
#   * "JEL" and "Funding" are anchored with \b so they cannot fire inside
#     ordinary words (e.g. "Crowdfunding has two models: ...").
BOILERPLATE_RE = re.compile(
    r"\u00a9\s*\d{4}[^.]*?\.|All\s+rights\s+reserved\.?|"
    r"Published\s+by\s+[A-Z][^.]*?\.|This\s+is\s+an\s+open\s+access[^.]*?\.|"
    r"Correspondence\s+(to|author):[^.]*?\.|E-?mail:[^.]*?\.|"
    r"Received:[^.]*?Accepted:[^.]*?\.|DOI:\S+|doi:\S+|https?://\S+|"
    r"Keywords:[^.]*?\.|\bJEL[^.]*?\.|ISSN[^.]*?\.|ISBN[^.]*?\.|"
    r"Elsevier[^.]*?\.|Springer[^.]*?\.|Emerald[^.]*?\.|"
    r"Wiley[^.]*?\.|Taylor\s*&\s*Francis[^.]*?\.|"
    r"This\s+paper\s+is\s+part\s+of[^.]*?\.|"
    r"Conflict\s+of\s+interest[^.]*?\.|"
    r"\bFunding[^.]*?:\s*[^.]*?\.|"
    r"Acknowledgement[s]?:[^.]*?\.",
    re.IGNORECASE | re.DOTALL,
)

# Sentence boundary: whitespace preceded by . ! or ? and followed by an
# uppercase letter, a double quote or an opening parenthesis.
SENT_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z\"\(])")

# The 25 PAJAIS categories that themes are mapped against in
# compare_with_taxonomy.
PAJAIS_25 = [
    "IS Strategy and Management", "E-Commerce and E-Business",
    "IT Adoption and Diffusion", "Business Intelligence and Analytics",
    "Social Commerce and Social Media", "Mobile Commerce and Applications",
    "Knowledge Management", "Healthcare Information Systems",
    "Privacy, Security and Trust", "Enterprise Systems and ERP",
    "Digital Platforms and Ecosystems", "Blockchain and Distributed Ledgers",
    "Artificial Intelligence and Machine Learning",
    "Human-Computer Interaction and UX",
    "Digital Transformation and Innovation",
    "Financial Technology and Digital Finance",
    "Supply Chain and Logistics IS", "Smart Systems IoT and Smart Cities",
    "IS Research Methods and Theory",
    "Recommender and Personalization Systems",
    "Digital Marketing and Advertising",
    "Virtual Teams and Online Collaboration",
    "Cloud Computing and SaaS", "Big Data Analytics and Data Science",
    "IS Education and Training",
]
# Module-level cache so the sentence encoder is constructed at most once.
_EMBED_MODEL = None


def _get_embed_model():
    """Return the shared "all-MiniLM-L6-v2" SentenceTransformer.

    The encoder is built on the first call and stored in ``_EMBED_MODEL``;
    later calls hand back the stored instance.  ``sentence_transformers`` is
    imported inside the function rather than at module import time.
    """
    global _EMBED_MODEL
    from sentence_transformers import SentenceTransformer

    cached = _EMBED_MODEL
    _EMBED_MODEL = cached if cached else SentenceTransformer("all-MiniLM-L6-v2")
    return _EMBED_MODEL
def _get_llm():
    """Create a ChatMistralAI client for "mistral-small-latest".

    The API key is read from the MISTRAL_API_KEY environment variable (empty
    string when unset) and the sampling temperature is fixed at 0.1.
    """
    mistral_key = os.environ.get("MISTRAL_API_KEY", "")
    llm_settings = {
        "model": "mistral-small-latest",
        "api_key": mistral_key,
        "temperature": 0.1,
    }
    return ChatMistralAI(**llm_settings)
def _clean(text: str) -> str:
    """Replace every BOILERPLATE_RE match in *text* with a space and trim.

    The argument is passed through ``str()`` first, so non-string values are
    cleaned in their string form.
    """
    as_text = str(text)
    without_boilerplate = BOILERPLATE_RE.sub(" ", as_text)
    return without_boilerplate.strip()
def _split(text: str) -> list:
    """Clean *text* and split it into sentences longer than 30 characters.

    Boundaries come from SENT_RE; each piece is stripped and pieces of 30
    characters or fewer are discarded.
    """
    pieces = map(str.strip, SENT_RE.split(_clean(text)))
    return list(filter(lambda sentence: len(sentence) > 30, pieces))
def _save(data, name: str) -> str:
    """Write *data* as indented UTF-8 JSON to OUTPUT_DIR/*name*.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    Returns the path of the file that was written.
    """
    target = os.path.join(OUTPUT_DIR, name)
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(json.dumps(data, indent=2, ensure_ascii=False))
    return target
def _load(name: str):
    """Read OUTPUT_DIR/*name* as UTF-8 JSON and return the parsed object."""
    source = os.path.join(OUTPUT_DIR, name)
    with open(source, "r", encoding="utf-8") as handle:
        return json.loads(handle.read())
| # βββ TOOL 1: LOAD CSV ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def load_scopus_csv(filepath: str) -> str:
    """Load a Scopus CSV export file and return statistics.
    Phase 1 of Braun & Clarke (2006) - Familiarisation.
    Call this FIRST before any analysis.

    Args:
        filepath: Path of the CSV export (read as UTF-8 with optional BOM;
            malformed rows are skipped).

    Returns:
        A multi-line status message with journal, paper count, year range,
        found/missing columns and sentence counts.

    Side effects:
        Writes corpus_config.json (filepath, journal, rows, year_min,
        year_max) through ``_save``.

    Missing or empty columns never raise here: they are listed under
    "Missing" and contribute zero sentences, year 0 or journal "Unknown".
    """
    df = pd.read_csv(filepath, encoding="utf-8-sig", on_bad_lines="skip")
    required = ["Title", "Abstract", "Authors", "Year", "Cited by",
                "Author Keywords", "Source title"]
    found = [c for c in required if c in df.columns]
    missing = [c for c in required if c not in df.columns]

    def sentence_pairs(column):
        # (sentence, row index) pairs; an absent column yields no pairs so
        # the missing-column report below is still produced.
        texts = df[column].fillna("").tolist() if column in df.columns else []
        return [(s, i) for i, t in enumerate(texts) for s in _split(t)]

    pairs_abs = sentence_pairs("Abstract")
    pairs_ttl = sentence_pairs("Title")

    # Coerce to numbers first: blank or textual years become NaN and are
    # dropped instead of crashing int().
    years = (pd.to_numeric(df["Year"], errors="coerce").dropna()
             if "Year" in df.columns else pd.Series(dtype="float64"))
    year_min = int(years.min()) if len(years) else 0
    year_max = int(years.max()) if len(years) else 0

    # Most frequent source title; value_counts() is empty when the column
    # holds no values at all.
    sources = (df["Source title"].value_counts()
               if "Source title" in df.columns else pd.Series(dtype="int64"))
    journal = str(sources.index[0]) if len(sources) else "Unknown"

    _save({"filepath": filepath, "journal": journal,
           "rows": len(df), "year_min": year_min, "year_max": year_max},
          "corpus_config.json")
    return (
        f"β CSV Loaded\nJournal: {journal}\nPapers: {len(df)}\n"
        f"Year Range: {year_min}β{year_max}\n"
        f"Columns Found ({len(found)}/{len(required)}): {found}\nMissing: {missing}\n"
        f"Abstract sentences: {len(pairs_abs):,}\n"
        f"Title sentences: {len(pairs_ttl):,}\n"
        f"Type 'run abstract' to begin Phase 2."
    )
| # βββ TOOL 2: RUN BERTOPIC DISCOVERY ββββββββββββββββββββββββββββββββββββββββββ | |
def _write_discovery_charts(run_key: str, summaries: list) -> None:
    """Write the four Plotly overview charts for one discovery run.

    *summaries* is the size-sorted list of cluster summaries.  Files go to
    OUTPUT_DIR/<run_key>_charts/ as intertopic_map.html, bar_chart.html,
    treemap.html and heatmap.html.
    """
    n_clusters = len(summaries)
    sizes = [s["sentence_count"] for s in summaries]
    short_names = [f"C{s['cluster_id']}" for s in summaries]
    chart_dir = os.path.join(OUTPUT_DIR, f"{run_key}_charts")
    os.makedirs(chart_dir, exist_ok=True)

    def write(fig, filename):
        fig.write_html(os.path.join(chart_dir, filename),
                       include_plotlyjs="cdn", full_html=True)

    # A 2-component PCA needs at least two centroids; a single cluster is
    # simply placed at the origin.
    centroids = np.array([s["centroid"] for s in summaries])
    coords = (PCA(n_components=2).fit_transform(centroids)
              if n_clusters >= 2 else np.zeros((n_clusters, 2)))
    write(px.scatter(x=coords[:, 0], y=coords[:, 1], size=sizes,
                     title=f"Intertopic Distance Map β {run_key.title()} Clusters",
                     labels={"x": "PC1", "y": "PC2"},
                     hover_name=[f"Cluster {s['cluster_id']}" for s in summaries]),
          "intertopic_map.html")

    write(px.bar(x=short_names[:30], y=sizes[:30],
                 title=f"Top 30 Cluster Sizes β {run_key.title()}",
                 labels={"x": "Cluster", "y": "Sentences"}),
          "bar_chart.html")

    write(px.treemap(names=short_names,
                     parents=["clusters"] * n_clusters,
                     values=sizes,
                     title=f"Topic Treemap β {run_key.title()}"),
          "treemap.html")

    # Fixed 4x5 grid of the 20 largest clusters.  Cells beyond the number of
    # clusters are zero-filled with empty labels, so runs that find fewer
    # than 20 clusters still render.
    grid_rows, grid_cols = 4, 5
    cells = grid_rows * grid_cols
    shown = min(n_clusters, cells)
    cell_sizes = np.zeros(cells, dtype=int)
    cell_sizes[:shown] = sizes[:cells]
    cell_text = np.full(cells, "", dtype=object)
    cell_text[:shown] = short_names[:cells]
    fig4 = go.Figure(go.Heatmap(
        z=cell_sizes.reshape(grid_rows, grid_cols),
        colorscale="Viridis",
        text=cell_text.reshape(grid_rows, grid_cols).tolist()))
    fig4.update_layout(title=f"Topic Size Heatmap β {run_key.title()}")
    write(fig4, "heatmap.html")


def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
    """Embed sentences with all-MiniLM-L6-v2 and cluster with AgglomerativeClustering
    (metric=cosine, linkage=average, distance_threshold=threshold).
    NO UMAP - clusters directly in the embedding space.
    Saves summaries.json + emb.npy. Phase 2 of Braun & Clarke.

    Args:
        run_key: "abstract" selects the Abstract column; any other value
            selects Title.  Also used as the prefix of every output file.
        threshold: Cosine-distance threshold passed to the clusterer.

    Returns:
        A multi-line status message (sentence count, topics discovered,
        threshold, largest cluster).

    Side effects:
        Reads corpus_config.json, then writes <run_key>_emb.npy,
        <run_key>_sentences.json (sentences, paper_ids and the per-sentence
        cluster ``labels``), <run_key>_summaries.json and four HTML charts.
        All artefacts are written only after clustering has succeeded so they
        always describe the same run.
    """
    cfg = _load("corpus_config.json")
    df = pd.read_csv(cfg["filepath"], encoding="utf-8-sig", on_bad_lines="skip")
    col = "Abstract" if run_key == "abstract" else "Title"
    pairs = [(s, i) for i, t in enumerate(df[col].fillna("").tolist()) for s in _split(t)]
    sentences = [p[0] for p in pairs]
    paper_ids = [p[1] for p in pairs]

    model = _get_embed_model()
    emb = model.encode(sentences, normalize_embeddings=True,
                       batch_size=64, show_progress_bar=True)
    clusterer = AgglomerativeClustering(
        metric="cosine", linkage="average",
        distance_threshold=threshold, n_clusters=None,
    )
    labels = clusterer.fit_predict(emb)
    unique_labels = np.unique(labels)
    n_clusters = len(unique_labels)

    np.save(os.path.join(OUTPUT_DIR, f"{run_key}_emb.npy"), emb)
    # "labels" records which cluster each sentence belongs to, so later
    # phases can recover exact cluster membership.
    _save({"sentences": sentences, "paper_ids": paper_ids,
           "labels": labels.tolist()},
          f"{run_key}_sentences.json")

    def make_summary(cid):
        mask = labels == cid
        idx = np.where(mask)[0]
        c_emb = emb[mask]
        centroid = c_emb.mean(axis=0, keepdims=True)
        sims = cosine_similarity(centroid, c_emb)[0]
        # Up to five member sentences closest to the centroid, best first.
        top5 = list(np.argsort(sims)[-5:][::-1])
        return {
            "cluster_id": int(cid),
            "sentence_count": int(mask.sum()),
            "paper_count": len(set(paper_ids[i] for i in idx)),
            "top_sentences": [sentences[idx[i]] for i in top5],
            "centroid": centroid[0].tolist(),
        }

    summaries = sorted(map(make_summary, unique_labels),
                       key=lambda s: s["sentence_count"], reverse=True)
    _save(summaries, f"{run_key}_summaries.json")
    _write_discovery_charts(run_key, summaries)

    return (
        f"β BERTopic Discovery Complete ({run_key})\n"
        f"Total sentences: {len(sentences):,}\n"
        f"Topics discovered: {n_clusters}\n"
        f"Threshold: {threshold}\n"
        f"Largest cluster: {summaries[0]['sentence_count']} sentences\n"
        f"Charts saved. Now calling label_topics_with_llmβ¦"
    )
| # βββ TOOL 3: LABEL TOPICS WITH LLM βββββββββββββββββββββββββββββββββββββββββββ | |
def label_topics_with_llm(run_key: str) -> str:
    """Send top 100 clusters to Mistral for labelling.
    Returns topic labels, categories, confidence scores.
    Saves labels.json. Phase 2 of Braun & Clarke.

    Args:
        run_key: Prefix of the run; <run_key>_summaries.json is read and
            <run_key>_labels.json is written.

    Returns:
        A status message with the number of labelled topics.

    Clusters are sent in batches of 20.  Every output record keeps the
    summary's own fields (including its integer cluster_id) and gains
    label / category / confidence / reasoning / is_niche, falling back to
    placeholder values for anything the model did not supply.
    """
    summaries = _load(f"{run_key}_summaries.json")[:100]
    llm = _get_llm()
    label_prompt = PromptTemplate.from_template(
        "You are a bibliometric research expert.\n"
        "Label each cluster below with a concise research area name.\n"
        "Return ONLY a JSON array β one object per cluster:\n"
        ' {{"cluster_id": N, "label": "...", "category": "...", '
        '"confidence": 0.0-1.0, "reasoning": "...", "is_niche": true/false}}\n\n'
        "Clusters (ID | sentence_count | top 2 sentences):\n{clusters}\n\n"
        "Return valid JSON array only, no markdown fences."
    )
    chain = label_prompt | llm | StrOutputParser()
    # Removes a leading ``` / ```json fence and a trailing ``` fence.
    # str.lstrip/rstrip cannot be used for this: they strip a *set* of
    # characters rather than a prefix or suffix.
    fence_re = re.compile(r"^```(?:json)?\s*|\s*```$", re.IGNORECASE)

    def label_batch(batch):
        lines = [
            f"{s['cluster_id']} | {s['sentence_count']} sents | "
            + " /// ".join(s["top_sentences"][:2])
            for s in batch
        ]
        raw = chain.invoke({"clusters": "\n".join(lines)})
        return json.loads(fence_re.sub("", raw.strip()))

    batch_size = 20
    batches = [summaries[i:i + batch_size]
               for i in range(0, len(summaries), batch_size)]
    results = [item for batch in map(label_batch, batches) for item in batch]

    # Keep only objects with a usable id; the id is normalised to int
    # because the model may return it as a string.
    usable = [r for r in results
              if isinstance(r, dict) and str(r.get("cluster_id", "")).strip().isdigit()]
    label_map = {int(str(r["cluster_id"]).strip()): r for r in usable}

    def merge(summary):
        placeholder = {"label": f"Topic {summary['cluster_id']}",
                       "category": "Unknown", "confidence": 0.5,
                       "reasoning": "", "is_niche": False}
        # Placeholder first so partial model output is completed; the
        # summary's own cluster_id is restored last so it is never replaced
        # by whatever the model echoed back.
        return {**summary, **placeholder,
                **label_map.get(summary["cluster_id"], {}),
                "cluster_id": summary["cluster_id"]}

    labeled = list(map(merge, summaries))
    _save(labeled, f"{run_key}_labels.json")
    return (
        f"β Labels Generated ({run_key})\n"
        f"Topics labeled: {len(labeled)}\n"
        f"Review table populated. Edit Approve/Rename columns, "
        f"then click Submit Review."
    )
| # βββ TOOL 4: CONSOLIDATE INTO THEMES βββββββββββββββββββββββββββββββββββββββββ | |
def consolidate_into_themes(run_key: str, theme_map: str) -> str:
    """Merge researcher-approved topic groups into consolidated themes.
    theme_map: JSON array from review table with approve/rename_to fields.
    Recomputes centroids and paper counts. Saves themes.json. Phase 3.

    Args:
        run_key: Prefix of the run whose summaries/sentences are read.
        theme_map: JSON array of review decisions.  Rows whose ``approve``
            equals "YES" (case-insensitive) are used; rows are grouped by
            ``rename_to``, else ``label``, else "Topic <cluster_id>".

    Returns:
        A status message listing the consolidated theme names.

    How the numbers are derived:
        * centroid - mean of the member clusters' centroids weighted by
          their sentence counts (equal to the mean over all member
          sentences, since each cluster centroid is itself a mean).
        * paper_count - distinct papers among the member sentences when
          <run_key>_sentences.json carries per-sentence ``labels``;
          otherwise the sum of the clusters' paper counts capped at the
          number of papers in the run (an upper bound, because a paper can
          appear in several clusters).
        * Approved rows that reference a cluster id absent from
          <run_key>_summaries.json are ignored.
    """
    decisions = json.loads(theme_map)
    sent_data = _load(f"{run_key}_sentences.json")
    paper_ids = np.asarray(sent_data["paper_ids"])
    stored_labels = sent_data.get("labels")
    sentence_labels = (np.asarray(stored_labels)
                       if stored_labels is not None else None)
    summaries = _load(f"{run_key}_summaries.json")
    sum_map = {s["cluster_id"]: s for s in summaries}
    approved = [d for d in decisions
                if str(d.get("approve", "")).strip().upper() == "YES"]

    def text_of(value):
        # Only real strings count as names; blanks and non-strings (for
        # example NaN from an empty table cell) fall through to the next
        # candidate.
        return value.strip() if isinstance(value, str) else ""

    def theme_name(decision):
        return (text_of(decision.get("rename_to"))
                or text_of(decision.get("label"))
                or f"Topic {int(decision['cluster_id'])}")

    named = [(theme_name(d), int(d["cluster_id"])) for d in approved
             if int(d["cluster_id"]) in sum_map]
    # Group cluster ids by theme name, preserving first-seen order and
    # dropping duplicate ids.
    theme_groups = {
        name: list(dict.fromkeys(cid for other, cid in named if other == name))
        for name, _ in named
    }

    def build_theme(name, cids):
        members = [sum_map[cid] for cid in cids]
        counts = np.array([m["sentence_count"] for m in members], dtype=float)
        centroids = np.array([m["centroid"] for m in members], dtype=float)
        centroid = np.average(centroids, axis=0, weights=counts)
        paper_count = (
            len(set(paper_ids[np.isin(sentence_labels, cids)].tolist()))
            if sentence_labels is not None
            else min(sum(m["paper_count"] for m in members),
                     len(set(paper_ids.tolist())))
        )
        return {
            "theme_name": name,
            "merged_cluster_ids": cids,
            "sentence_count": int(counts.sum()),
            "paper_count": int(paper_count),
            "top_sentences": members[0]["top_sentences"][:3],
            "centroid": centroid.tolist(),
        }

    themes = sorted(
        (build_theme(name, cids) for name, cids in theme_groups.items()),
        key=lambda t: t["sentence_count"], reverse=True,
    )
    _save(themes, f"{run_key}_themes.json")
    return (
        f"β Themes Consolidated ({run_key})\n"
        f"Approved topics: {len(approved)}\n"
        f"Final themes: {len(themes)}\n"
        f"Theme names: {[t['theme_name'] for t in themes]}\n"
        f"Review consolidated themes. Click Submit Review to confirm."
    )
| # βββ TOOL 5: COMPARE WITH TAXONOMY βββββββββββββββββββββββββββββββββββββββββββ | |
def compare_with_taxonomy(run_key: str) -> str:
    """Map final themes to PAJAIS taxonomy (Jiang et al. 2019) - 25 categories.
    Classifies themes as MAPPED or NOVEL. Saves taxonomy_map.json. Phase 5.5.

    Args:
        run_key: Prefix of the run.  <run_key>_themes.json is used when it
            exists, otherwise <run_key>_labels.json.

    Returns:
        A status message with mapped / novel counts and up to five PAJAIS
        categories that no theme was mapped to.

    A theme counts as NOVEL when the model sets ``is_novel`` or answers
    "NOVEL" as the category.  Category names are compared ignoring case and
    any leading list number, because the prompt presents the categories as a
    numbered list.

    NOTE(review): the output file name does not include run_key, so each
    call overwrites the mapping of the previous run.
    """
    themes_file = (f"{run_key}_themes.json"
                   if os.path.exists(os.path.join(OUTPUT_DIR, f"{run_key}_themes.json"))
                   else f"{run_key}_labels.json")
    themes_raw = _load(themes_file)
    theme_names = [t.get("theme_name", t.get("label", "")) for t in themes_raw]
    llm = _get_llm()
    tax_prompt = PromptTemplate.from_template(
        "You are a bibliometric taxonomy expert.\n"
        "Map each theme to the PAJAIS taxonomy (Jiang et al., 2019).\n\n"
        "PAJAIS 25 categories:\n{pajais}\n\n"
        "Themes to classify:\n{themes}\n\n"
        "Return ONLY a JSON array:\n"
        '[{{"theme": "...", "pajais_match": "category or NOVEL", '
        '"match_confidence": 0.0-1.0, "reasoning": "...", "is_novel": true/false}}]\n'
        "If no category fits, set pajais_match to NOVEL. No markdown fences."
    )
    pajais_str = "\n".join(f"{i+1}. {c}" for i, c in enumerate(PAJAIS_25))
    themes_str = "\n".join(f"- {n}" for n in theme_names)
    raw = (tax_prompt | llm | StrOutputParser()).invoke(
        {"pajais": pajais_str, "themes": themes_str}
    )
    # Remove an optional ``` / ```json fence.  str.lstrip/rstrip strip a
    # character set, not a prefix, so a regex is used instead.
    cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw.strip(),
                     flags=re.IGNORECASE)
    parsed = json.loads(cleaned)
    results = [r for r in parsed if isinstance(r, dict) and "theme" in r]

    def category_of(record):
        # Category text without a leading "12. " / "12) " list number.
        return re.sub(r"^\d+[.)]\s*", "",
                      str(record.get("pajais_match", "")).strip())

    def is_novel(record):
        return (bool(record.get("is_novel", False))
                or category_of(record).upper() == "NOVEL")

    mapped = [r for r in results if not is_novel(r)]
    novel = [r for r in results if is_novel(r)]
    covered = {category_of(r).casefold() for r in mapped}
    gaps = [c for c in PAJAIS_25 if c.casefold() not in covered]
    taxonomy_map = {
        "taxonomy_mapping": {r["theme"]: r for r in results},
        "novel_themes": [r["theme"] for r in novel],
        "pajais_gap_categories": gaps,
        "coverage_stats": {
            "total_themes": len(results),
            "mapped": len(mapped),
            "novel": len(novel),
        },
    }
    _save(taxonomy_map, "taxonomy_map.json")
    return (
        f"β PAJAIS Taxonomy Mapped ({run_key})\n"
        f"Themes mapped: {len(mapped)}\n"
        f"NOVEL themes: {len(novel)} β {[r['theme'] for r in novel]}\n"
        f"PAJAIS gaps: {gaps[:5]}\n"
        f"Review PAJAIS mapping in table. Click Submit Review."
    )
| # βββ TOOL 6: GENERATE COMPARISON CSV βββββββββββββββββββββββββββββββββββββββββ | |
def generate_comparison_csv() -> str:
    """Load themes from abstract and title runs and create side-by-side comparison.
    Saves comparison.csv. Phase 6 of Braun & Clarke.

    Returns:
        A status message with theme counts, row count and the CSV path.

    Layout: row *i* holds the i-th abstract theme next to the i-th title
    theme; the shorter side is padded with blanks / zeros.  ``Convergence``
    describes the abstract theme of the row ("STABLE" when a title theme has
    the same name, else "ABSTRACT-ONLY").  On rows that only have a title
    theme it describes that title theme ("STABLE" when an abstract theme has
    the same name, else "TITLE-ONLY").  Names are compared ignoring case and
    surrounding whitespace.
    """
    def load_themes(key):
        fname = (f"{key}_themes.json"
                 if os.path.exists(os.path.join(OUTPUT_DIR, f"{key}_themes.json"))
                 else f"{key}_labels.json")
        return _load(fname)

    def name_key(name):
        return str(name).strip().casefold()

    abs_themes = load_themes("abstract")
    ttl_themes = load_themes("title")
    abs_names = [t.get("theme_name", t.get("label", "")) for t in abs_themes]
    ttl_names = [t.get("theme_name", t.get("label", "")) for t in ttl_themes]
    abs_kws = [" | ".join(t.get("top_sentences", [""])[:1]) for t in abs_themes]
    ttl_kws = [" | ".join(t.get("top_sentences", [""])[:1]) for t in ttl_themes]
    max_len = max(len(abs_themes), len(ttl_themes))

    def pad(values, filler):
        return values + [filler] * (max_len - len(values))

    abs_keys = [name_key(n) for n in abs_names]
    ttl_keys = [name_key(n) for n in ttl_names]
    abs_set, ttl_set = set(abs_keys), set(ttl_keys)
    # Rows that carry an abstract theme ...
    abs_status = ["STABLE" if k in ttl_set else "ABSTRACT-ONLY" for k in abs_keys]
    # ... followed by rows that only carry a title theme (empty when there
    # are at least as many abstract themes as title themes).
    tail_status = ["STABLE" if k in abs_set else "TITLE-ONLY"
                   for k in ttl_keys[len(abs_keys):]]

    df = pd.DataFrame({
        "Abstract_Theme": pad(abs_names, ""),
        "Abstract_Evidence": pad(abs_kws, ""),
        "Abstract_Sentences": pad([t.get("sentence_count", 0) for t in abs_themes], 0),
        "Title_Theme": pad(ttl_names, ""),
        "Title_Evidence": pad(ttl_kws, ""),
        "Title_Sentences": pad([t.get("sentence_count", 0) for t in ttl_themes], 0),
        "Convergence": abs_status + tail_status,
    })
    path = os.path.join(OUTPUT_DIR, "comparison.csv")
    df.to_csv(path, index=False)
    return (
        f"β Comparison CSV Generated\n"
        f"Abstract themes: {len(abs_themes)}\n"
        f"Title themes: {len(ttl_themes)}\n"
        f"Rows: {len(df)}\n"
        f"File: {path}\n"
        f"Click Submit Review to generate the narrative."
    )
| # βββ TOOL 7: EXPORT NARRATIVE βββββββββββββββββββββββββββββββββββββββββββββββββ | |
def export_narrative(run_key: str) -> str:
    """Generate a 500-word Section 7 narrative via Mistral LLM.
    Uses themes + taxonomy mapping. Saves narrative.txt. Phase 6.

    Reads corpus_config.json, <run_key>_themes.json (or, when that file does
    not exist, <run_key>_labels.json) and taxonomy_map.json.  The prompt is
    filled with the first ten theme names plus the first five novel themes
    and gap categories.  The model's text is written to
    OUTPUT_DIR/narrative.txt and a status message with its word count is
    returned.
    """
    corpus = _load("corpus_config.json")
    consolidated_name = f"{run_key}_themes.json"
    has_consolidated = os.path.exists(os.path.join(OUTPUT_DIR, consolidated_name))
    theme_records = _load(consolidated_name if has_consolidated
                          else f"{run_key}_labels.json")
    taxonomy = _load("taxonomy_map.json")

    names = list(map(lambda t: t.get("theme_name", t.get("label", "")),
                     theme_records))
    novel_names = taxonomy.get("novel_themes", [])
    gap_categories = taxonomy.get("pajais_gap_categories", [])
    mapped_total = taxonomy.get("coverage_stats", {}).get("mapped", 0)

    llm = _get_llm()
    narr_prompt = PromptTemplate.from_template(
        "Write a 500-word Section 7 for a conference paper on topic modelling.\n"
        "Journal: {journal} | Papers: {papers} | Years: {y_min}β{y_max}\n"
        "Stable BERTopic themes: {themes}\n"
        "NOVEL themes (not in PAJAIS): {novel}\n"
        "PAJAIS gap categories: {gaps}\n"
        "Mapped themes: {mapped}\n\n"
        "Structure: 7.1 Methodology (LDA + BERTopic, Braun & Clarke), "
        "7.2 RQ4 LDA Findings, 7.3 RQ5 Abstract vs Title, "
        "7.4 RQ6 PAJAIS Mapping with NOVEL justification, "
        "7.5 RQ7 Future Research Agenda.\n"
        "Cite: Braun & Clarke (2006), Jiang et al. (2019), Grootendorst (2022).\n"
        "~500 words, academic tone, no bullet points."
    )
    prompt_inputs = {
        "journal": corpus.get("journal", "Electronic Markets"),
        "papers": corpus.get("rows", 908),
        "y_min": corpus.get("year_min", 2007),
        "y_max": corpus.get("year_max", 2026),
        "themes": ", ".join(names[:10]),
        "novel": ", ".join(novel_names[:5]),
        "gaps": ", ".join(gap_categories[:5]),
        "mapped": mapped_total,
    }
    chain = narr_prompt | llm | StrOutputParser()
    narrative = chain.invoke(prompt_inputs)

    out_path = os.path.join(OUTPUT_DIR, "narrative.txt")
    with open(out_path, "w", encoding="utf-8") as handle:
        handle.write(narrative)
    word_total = len(narrative.split())
    return (
        f"β Narrative Exported\nWords: {word_total}\n"
        f"File: {out_path}\nPipeline complete! Download all files from the Download tab."
    )