Spaces:
Sleeping
Sleeping
| """ | |
| tools.py — 7 @tool functions for BERTopic Agentic AI Application | |
| Rules: ZERO if/else, ZERO for/while, ZERO try/except. All decisions by LLM. | |
| """ | |
| import os | |
| import json | |
| import re | |
| import numpy as np | |
| import pandas as pd | |
| import plotly.graph_objects as go | |
| import plotly.express as px | |
| from plotly.subplots import make_subplots | |
| from langchain_core.tools import tool | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.cluster import AgglomerativeClustering | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from langchain_core.prompts import PromptTemplate | |
| from langchain_core.output_parsers import JsonOutputParser | |
| from langchain_mistralai import ChatMistralAI | |
| # ── Constants ──────────────────────────────────────────────────────────────── | |
# Number of sentences closest to a centroid that are kept as a topic's evidence.
NEAREST_K = 5
# Upper bound on how many topics (largest first) are sent to the LLM for labelling.
MAX_LABEL_TOPICS = 100
# Folder holding every intermediate artefact. It is created at import time so
# later writes never fail on a missing directory.
CHECKPOINT_DIR = "checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Maps a run key to the CSV column(s) whose text is analysed in that run.
RUN_CONFIGS = {
    "abstract": ["Abstract"],
    "title": ["Title"],
}

# Regular expressions for publisher boilerplate. They are OR-ed together and
# applied case-insensitively; most end in ``.*`` and therefore drop everything
# from the match to the end of the sentence.
BOILERPLATE_PATTERNS = [
    # The copyright sign is written as an escape so the pattern still matches
    # "© 2021 ..." even if this file is re-saved with a different encoding.
    r"\u00a9\s*\d{4}.*",
    r"All rights reserved.*",
    r"Published by Elsevier.*",
    r"doi:.*",
    r"http[s]?://\S+",
    r"www\.\S+",
    r"This article is.*",
    r"Please cite.*",
    r"Correspondence to.*",
    r"E-mail address.*",
    r"Received \d+.*",
    r"Accepted \d+.*",
    r"Available online.*",
    r"Keywords:.*",
    r"Abstract\.?\s*$",
    r"^\s*\d+\s*$",
    r"Springer.*",
    r"Taylor & Francis.*",
    r"Wiley.*",
    r"IEEE.*",
    r"ACM.*",
    r"Sage Publications.*",
]

# The 25 PAJAIS categories offered to the LLM when themes are mapped to the
# taxonomy. The numbering is part of each category name.
PAJAIS_CATEGORIES = [
    "1. Smart Tourism Technologies",
    "2. AI and Machine Learning in Tourism",
    "3. Big Data Analytics in Hospitality",
    "4. Social Media and User-Generated Content",
    "5. Mobile Technologies and Applications",
    "6. Blockchain in Travel and Tourism",
    "7. Internet of Things in Hospitality",
    "8. Robotics and Automation",
    "9. Augmented and Virtual Reality",
    "10. Revenue Management and Pricing",
    "11. Customer Experience and Satisfaction",
    "12. Online Reviews and Reputation Management",
    "13. Digital Marketing and e-Commerce",
    "14. Sharing Economy Platforms",
    "15. Destination Management Systems",
    "16. Sustainable and Green Technologies",
    "17. Crisis Management and Resilience",
    "18. Human-Computer Interaction",
    "19. Recommendation Systems",
    "20. Natural Language Processing in Tourism",
    "21. Computer Vision in Hospitality",
    "22. Cybersecurity and Privacy",
    "23. Supply Chain and Logistics",
    "24. Accessibility and Inclusive Technology",
    "25. Metaverse and Immersive Experiences",
]

# Normalised UTF-8 copy of the uploaded CSV that later steps read back.
CSV_PATH = os.path.join(CHECKPOINT_DIR, "uploaded.csv")
def _ckpt(name):
    """Return the path of the artefact called ``name`` inside ``CHECKPOINT_DIR``."""
    checkpoint_path = os.path.join(CHECKPOINT_DIR, name)
    return checkpoint_path
def _llm():
    """Create a fresh Mistral chat client for one LLM call.

    The API key is read from the ``MISTRAL_API_KEY`` environment variable; an
    empty string is passed when the variable is not set.
    """
    client_settings = {
        "model": "mistral-small-latest",
        "api_key": os.environ.get("MISTRAL_API_KEY", ""),
        "temperature": 0.1,
    }
    return ChatMistralAI(**client_settings)
def _clean_sentence(s):
    """Remove publisher boilerplate from one piece of text.

    Every entry of ``BOILERPLATE_PATTERNS`` is OR-ed into a single
    case-insensitive expression, all matches are deleted, and surrounding
    whitespace is trimmed before and after the substitution.
    """
    boilerplate = re.compile("|".join(BOILERPLATE_PATTERNS), flags=re.IGNORECASE)
    return boilerplate.sub("", s.strip()).strip()
def _split_sentences(text):
    """Split ``text`` into cleaned sentences longer than 20 characters.

    ``text`` is converted with ``str()`` first, tokenised with NLTK's
    ``sent_tokenize``, and each sentence is passed through ``_clean_sentence``.
    """
    import nltk
    from nltk.tokenize import sent_tokenize

    # Both tokenizer resources are requested on every call, as in the original.
    nltk.download("punkt", quiet=True)
    nltk.download("punkt_tab", quiet=True)

    candidates = (_clean_sentence(raw) for raw in sent_tokenize(str(text)))
    return [sentence for sentence in candidates if len(sentence) > 20]
| # ── Encoding helper ────────────────────────────────────────────────────────── | |
def _try_read_csv(filepath, enc):
    """Return DataFrame if encoding works, else None.

    filepath: path (or buffer) handed to ``pd.read_csv``.
    enc: codec name used to decode the file.

    Malformed rows are skipped (``on_bad_lines="skip"``). Only failures that
    mean "this file cannot be read with this encoding" are swallowed:
    ``ValueError`` covers ``UnicodeDecodeError`` and pandas' ``ParserError`` /
    ``EmptyDataError``; ``OSError`` covers a missing or unreadable file.
    Anything else (a genuine programming error) propagates.
    """
    import contextlib

    frame = None
    # suppress() is used instead of try/except, in line with the module rules.
    with contextlib.suppress(ValueError, OSError):
        frame = pd.read_csv(filepath, encoding=enc, on_bad_lines="skip")
    return frame
| # ── Tool 1: load_scopus_csv ────────────────────────────────────────────────── | |
def _first_readable_frame(filepath, encodings):
    """Return the first non-empty DataFrame obtained by trying ``encodings`` in order.

    The attempts are evaluated lazily, so reading stops at the first encoding
    that yields at least one row. Returns ``None`` when every attempt fails or
    produces an empty frame.
    """
    attempts = (_try_read_csv(filepath, enc) for enc in encodings)
    return next(
        (frame for frame in attempts if frame is not None and len(frame) > 0),
        None,
    )


def load_scopus_csv(filepath: str) -> str:
    """Load a Scopus CSV export, count papers and sentences, apply boilerplate filtering.
    Returns stats string with paper count, abstract sentence count, title sentence count.
    filepath: path to the uploaded CSV file."""
    # NOTE(review): this function is exported through ALL_TOOLS. If its
    # docstring is used as the tool description shown to the LLM, rewording it
    # changes agent behaviour, so it is kept verbatim.
    #
    # Side effects: writes a UTF-8 copy of the data to CSV_PATH and the summary
    # statistics to the ``stats.json`` checkpoint.
    #
    # Encoding order matters: "utf-8-sig" also decodes UTF-8 without a BOM,
    # "cp1252" fails on a few undefined bytes, and "latin-1" accepts every
    # byte, so it must come last (anything listed after it is unreachable).
    encodings = ["utf-8-sig", "cp1252", "latin-1"]
    df = _first_readable_frame(filepath, encodings)
    if df is None:
        return "β Could not read CSV with any supported encoding. Please re-save as UTF-8 and re-upload."

    # Both text columns are indexed below; report a missing one as a message
    # instead of letting a KeyError escape.
    missing = [col for col in ("Abstract", "Title") if col not in df.columns]
    if missing:
        return (
            f"Required column(s) missing from CSV: {', '.join(missing)}. "
            f"Please upload a Scopus export that includes Abstract and Title."
        )

    df.to_csv(CSV_PATH, index=False, encoding="utf-8")
    paper_count = len(df)

    # Flatten in a single pass; sum(list_of_lists, []) copies the accumulator
    # on every step and is quadratic in the number of abstracts.
    abstract_sentences = [
        sentence
        for abstract in df["Abstract"].dropna().tolist()
        for sentence in _split_sentences(abstract)
        if sentence
    ]
    # Titles are atomic units: each non-empty title counts once (no sentence split).
    cleaned_titles = (_clean_sentence(str(title)) for title in df["Title"].dropna().tolist())
    title_sentences = [title for title in cleaned_titles if len(title.strip()) >= 5]

    # Coerce to numbers and drop blanks so an empty or non-numeric Year column
    # yields "N/A" instead of failing on int(NaN).
    years = (
        pd.to_numeric(df["Year"], errors="coerce").dropna()
        if "Year" in df.columns
        else pd.Series(dtype="float64")
    )
    stats = {
        "papers": paper_count,
        "abstract_sentences": len(abstract_sentences),
        "title_sentences": len(title_sentences),
        "columns": list(df.columns),
        "year_range": f"{int(years.min())} β {int(years.max())}" if len(years) else "N/A",
    }
    with open(_ckpt("stats.json"), "w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2)

    return (
        f"β CSV loaded successfully.\n"
        f"π Papers: {paper_count}\n"
        f"π Abstract sentences (after cleaning): {len(abstract_sentences)}\n"
        f"π€ Title records (after cleaning): {len(title_sentences)}\n"
        f"π Year range: {stats['year_range']}\n"
        f"π Columns: {', '.join(stats['columns'])}\n\n"
        f"Data is ready. Please type **'run abstract'** to begin Phase 2 BERTopic analysis on abstracts."
    )
| # ── Tool 2: run_bertopic_discovery ─────────────────────────────────────────── | |
def _collect_run_sentences(run_key):
    """Return the cleaned text units that the ``run_key`` run should cluster.

    Reads the normalised CSV at ``CSV_PATH``. An unknown ``run_key`` selects no
    columns and therefore yields an empty list rather than raising.
    """
    df = pd.read_csv(CSV_PATH, encoding="utf-8")
    columns = RUN_CONFIGS.get(run_key, [])
    # Single-pass flatten (sum(..., []) would be quadratic).
    texts = [value for col in columns for value in df[col].dropna().tolist()]
    # Titles are already single semantic units, so they are cleaned but not
    # split (minimum 5 characters). Abstracts are split into sentences for
    # finer-grained clustering (minimum length above 20 characters).
    return (
        [
            title
            for title in (_clean_sentence(str(raw)) for raw in texts)
            if len(title.strip()) >= 5
        ]
        if run_key == "title"
        else [
            sentence
            for text in texts
            for sentence in _split_sentences(text)
            if len(sentence) > 20
        ]
    )


def _cluster_and_report(run_key, sentences, threshold):
    """Embed ``sentences``, cluster them, persist the artefacts and return the status text.

    Writes ``{run_key}_emb.npy`` and ``{run_key}_summaries.json`` to the
    checkpoint folder and delegates chart creation to ``_generate_charts``.
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(sentences, normalize_embeddings=True, show_progress_bar=False)
    np.save(_ckpt(f"{run_key}_emb.npy"), embeddings)

    clustering = AgglomerativeClustering(
        metric="cosine",
        linkage="average",
        distance_threshold=float(threshold),  # tolerate a numeric string from the caller
        n_clusters=None,  # the number of clusters follows from the threshold
    )
    labels_arr = clustering.fit_predict(embeddings)

    # Sorted so that clusters of equal size keep a reproducible order after
    # the stable sort below.
    unique_labels = sorted(set(labels_arr.tolist()))
    cluster_data = [
        _build_cluster_summary(lbl, labels_arr, sentences, embeddings)
        for lbl in unique_labels
    ]
    cluster_data.sort(key=lambda summary: summary["sentence_count"], reverse=True)
    with open(_ckpt(f"{run_key}_summaries.json"), "w", encoding="utf-8") as f:
        json.dump(cluster_data, f, indent=2)

    _generate_charts(cluster_data, run_key, embeddings, labels_arr)
    return (
        f"β BERTopic discovery complete for **{run_key}** run.\n"
        f"π’ Topics discovered: {len(unique_labels)}\n"
        f"π Sentences clustered: {len(sentences)}\n"
        f"π Saved: {run_key}_summaries.json, {run_key}_emb.npy\n"
        f"π¨ 4 Plotly charts generated.\n\n"
        f"Now calling label_topics_with_llm to label the top {MAX_LABEL_TOPICS} topics..."
    )


def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
    """Embed sentences with all-MiniLM-L6-v2, cluster with AgglomerativeClustering (cosine metric),
    find 5 nearest centroids per cluster, generate 4 Plotly charts. Save summaries.json + emb.npy.
    run_key: 'abstract' or 'title'. threshold: clustering distance threshold (default 0.7)."""
    # NOTE(review): this function is exported through ALL_TOOLS. If its
    # docstring is used as the tool description shown to the LLM, rewording it
    # changes agent behaviour, so it is kept verbatim. What the code actually
    # keeps per cluster is the NEAREST_K sentences closest to that cluster's
    # centroid (see _build_cluster_summary).
    sentences = _collect_run_sentences(run_key)
    # AgglomerativeClustering rejects fewer than two samples, so an unknown run
    # key or an empty column is reported as a message instead of a crash.
    return (
        _cluster_and_report(run_key, sentences, threshold)
        if len(sentences) >= 2
        else (
            f"Not enough text to cluster for run '{run_key}' "
            f"({len(sentences)} usable text unit(s) found). "
            f"Valid run keys: {', '.join(RUN_CONFIGS)}. "
            f"Check the uploaded CSV and try again."
        )
    )
def _build_cluster_summary(lbl, labels_arr, sentences, embeddings):
    """Describe the cluster labelled ``lbl`` as a JSON-serialisable dict.

    The summary holds the member count, the cluster centroid (mean of the
    member embeddings) and up to ``NEAREST_K`` member sentences ranked by
    cosine similarity to that centroid. The labelling fields are filled with
    placeholders.
    """
    member_mask = np.array(labels_arr) == lbl
    members = [sentences[position] for position in np.flatnonzero(member_mask).tolist()]
    member_vectors = embeddings[member_mask]

    # keepdims keeps the centroid 2-D, which cosine_similarity expects.
    center = member_vectors.mean(axis=0, keepdims=True)
    closeness = cosine_similarity(center, member_vectors)[0]
    ranked = np.argsort(closeness)[::-1][:NEAREST_K].tolist()
    representative = [members[position] for position in ranked]

    summary = {}
    summary["topic_id"] = int(lbl)
    summary["sentence_count"] = len(members)
    summary["top_sentences"] = representative
    summary["centroid"] = center[0].tolist()
    summary["label"] = f"Topic_{lbl}"
    summary["category"] = ""
    summary["confidence"] = 0.0
    summary["reasoning"] = ""
    summary["niche"] = False
    return summary
def _save_chart(fig, run_key, suffix):
    """Write ``fig`` as a standalone HTML page to ``{run_key}_chart_{suffix}.html``.

    The file is written as UTF-8 explicitly, because topic labels and titles
    may contain non-ASCII characters that the platform default encoding cannot
    represent.
    """
    with open(_ckpt(f"{run_key}_chart_{suffix}.html"), "w", encoding="utf-8") as f:
        f.write(fig.to_html(include_plotlyjs="cdn", full_html=True))


def _generate_charts(cluster_data, run_key, embeddings, labels_arr):
    """Render the four Plotly charts of a run and save them as HTML checkpoints.

    cluster_data: cluster summaries, already sorted largest first; at least
        one entry is expected.
    run_key: run name used in chart titles and file names.
    embeddings, labels_arr: accepted for interface compatibility; not used.

    Files written: ``{run_key}_chart_bar.html``, ``_chart_map.html``,
    ``_chart_hierarchy.html`` and ``_chart_heatmap.html``.
    """
    from sklearn.decomposition import PCA

    top_n = min(30, len(cluster_data))
    top_clusters = cluster_data[:top_n]
    counts = [cluster["sentence_count"] for cluster in top_clusters]
    topic_labels = [cluster["label"] for cluster in top_clusters]

    # Chart 1: horizontal bar chart of the largest topics by sentence count.
    fig_bar = px.bar(
        x=counts, y=topic_labels, orientation="h",
        title=f"Top {top_n} Topics by Sentence Count ({run_key})",
        labels={"x": "Sentences", "y": "Topic"},
        color=counts, color_continuous_scale="Viridis",
    )
    fig_bar.update_layout(height=700, yaxis=dict(autorange="reversed"))
    _save_chart(fig_bar, run_key, "bar")

    # Chart 2: intertopic map, a 2-D PCA projection of the topic centroids.
    centroids = np.array([cluster["centroid"] for cluster in top_clusters])
    # PCA cannot extract two components from a single sample, so a lone
    # cluster is simply placed at the origin.
    coords = (
        PCA(n_components=2).fit_transform(centroids)
        if len(top_clusters) >= 2
        else np.zeros((len(top_clusters), 2))
    )
    fig_map = px.scatter(
        x=coords[:, 0], y=coords[:, 1],
        text=topic_labels, size=counts,
        title=f"Intertopic Distance Map ({run_key})",
        labels={"x": "PC1", "y": "PC2"},
        color=counts, color_continuous_scale="Plasma",
    )
    fig_map.update_traces(textposition="top center")
    fig_map.update_layout(height=600)
    _save_chart(fig_map, run_key, "map")

    # Chart 3: "hierarchy" view, a bar chart of the same topics sorted by
    # ascending sentence count.
    ascending = sorted(zip(topic_labels, counts), key=lambda pair: pair[1])
    fig_hier = go.Figure(go.Bar(
        x=[count for _, count in ascending],
        y=[label for label, _ in ascending],
        orientation="h",
        marker_color="teal",
    ))
    fig_hier.update_layout(
        title=f"Topic Hierarchy ({run_key})",
        height=700,
        xaxis_title="Sentence Count",
    )
    _save_chart(fig_hier, run_key, "hierarchy")

    # Chart 4: heatmap of pairwise cosine similarity between the centroids of
    # the ten largest topics.
    top10 = cluster_data[:10]
    top10_centroids = np.array([cluster["centroid"] for cluster in top10])
    sim_matrix = cosine_similarity(top10_centroids)
    top10_labels = [cluster["label"] for cluster in top10]
    fig_heat = px.imshow(
        sim_matrix,
        x=top10_labels, y=top10_labels,
        color_continuous_scale="RdBu_r",
        title=f"Topic Similarity Heatmap β Top 10 ({run_key})",
    )
    fig_heat.update_layout(height=500)
    _save_chart(fig_heat, run_key, "heatmap")
| # ── Tool 3: label_topics_with_llm ──────────────────────────────────────────── | |
def _index_llm_labels(labeled):
    """Index the LLM's labelling answer by topic id.

    Keys are the stringified, stripped ``topic_id`` so that ``3`` and ``"3"``
    address the same topic. Only the labelling fields are kept, so the model
    cannot overwrite computed data such as ``sentence_count`` or ``centroid``.
    Elements that are not dicts or lack ``topic_id`` are ignored.
    """
    fields = ("label", "category", "confidence", "reasoning", "niche")
    # The parser may hand back a single object instead of an array.
    items = labeled if isinstance(labeled, list) else [labeled]
    return {
        str(item["topic_id"]).strip(): {key: item[key] for key in fields if key in item}
        for item in items
        if isinstance(item, dict) and "topic_id" in item
    }


def label_topics_with_llm(run_key: str) -> str:
    """Send top MAX_LABEL_TOPICS topics to Mistral for labelling. Each topic gets:
    label, category, confidence, reasoning, niche (true/false).
    Saves labels.json. run_key: 'abstract' or 'title'."""
    # NOTE(review): this function is exported through ALL_TOOLS. If its
    # docstring is used as the tool description shown to the LLM, rewording it
    # changes agent behaviour, so it is kept verbatim.
    #
    # Reads ``{run_key}_summaries.json`` and writes ``{run_key}_labels.json``,
    # which contains every summary; topics the model did not label keep their
    # placeholder fields.
    with open(_ckpt(f"{run_key}_summaries.json"), encoding="utf-8") as f:
        summaries = json.load(f)
    top_topics = summaries[:MAX_LABEL_TOPICS]

    # Each topic is presented with its size and up to three evidence sentences.
    topic_texts = "\n\n".join(
        f"Topic {t['topic_id']} ({t['sentence_count']} sentences):\n"
        + "\n".join(f" - {s}" for s in t["top_sentences"][:3])
        for t in top_topics
    )
    prompt = PromptTemplate.from_template(
        """You are a research labelling expert. For each topic below, provide a JSON array.
Each element must have: topic_id (int), label (research area name, max 6 words),
category (broad domain), confidence (0.0-1.0), reasoning (1 sentence), niche (true/false).
Return ONLY a valid JSON array. No markdown, no explanation.
Topics:
{topics}
JSON array:"""
    )
    parser = JsonOutputParser()
    chain = prompt | _llm() | parser
    labeled = chain.invoke({"topics": topic_texts})

    labels_by_id = _index_llm_labels(labeled)
    result = [
        {**t, **labels_by_id.get(str(t["topic_id"]).strip(), {})}
        for t in summaries
    ]
    with open(_ckpt(f"{run_key}_labels.json"), "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)

    # Count the topics that actually received a label, not the raw number of
    # items in the model's answer.
    labeled_count = sum(1 for t in summaries if str(t["topic_id"]).strip() in labels_by_id)
    return (
        f"β Labelling complete for **{run_key}** run.\n"
        f"π·οΈ Topics labeled: {labeled_count}\n"
        f"π Saved: {run_key}_labels.json\n\n"
        f"The review table has been populated with {labeled_count} labeled topics.\n"
        f"**Please review the table below:** Edit the **Approve**, **Rename To**, and **Reasoning** columns, "
        f"then click **Submit Review** to proceed to Phase 3."
    )
| # ── Tool 4: consolidate_into_themes ────────────────────────────────────────── | |
def consolidate_into_themes(run_key: str, theme_map: str) -> str:
    """Merge researcher-approved topic groups into consolidated themes.
    Recomputes centroids, recounts sentences and papers.
    Saves themes.json.
    run_key: 'abstract' or 'title'.
    theme_map: JSON string mapping theme names to lists of topic_ids,
    e.g. '{"AI Tourism": [0,1,5], "Smart Hotels": [2,3]}'"""
    # NOTE(review): this function is exported through ALL_TOOLS. If its
    # docstring is used as the tool description shown to the LLM, rewording it
    # changes agent behaviour, so it is kept verbatim. The visible code
    # aggregates sentence counts and centroids only; no paper count is computed.
    labels_path = _ckpt(f"{run_key}_labels.json")
    with open(labels_path) as source:
        labelled_topics = json.load(source)
    requested = json.loads(theme_map)

    topics_by_id = dict((entry["topic_id"], entry) for entry in labelled_topics)
    # Largest themes first; sorted() is stable, so ties keep theme_map order.
    themes = sorted(
        (_build_theme(theme_name, ids, topics_by_id) for theme_name, ids in requested.items()),
        key=lambda theme: theme["sentence_count"],
        reverse=True,
    )

    themes_path = _ckpt(f"{run_key}_themes.json")
    with open(themes_path, "w") as target:
        json.dump(themes, target, indent=2)

    header = (
        f"β Themes consolidated for **{run_key}** run.\n"
        f"ποΈ Themes created: {len(themes)}\n"
    )
    bullet_lines = [
        f" β’ **{theme['name']}**: {theme['sentence_count']} sentences, {len(theme['topic_ids'])} topics"
        for theme in themes
    ]
    footer = (
        f"\n\nπ Saved: {run_key}_themes.json\n\n"
        f"**Please review the consolidated themes in the table.** "
        f"Rename or adjust if needed, then click **Submit Review** to proceed to Phase 4."
    )
    return header + "\n".join(bullet_lines) + footer
def _build_theme(name, topic_ids, label_lookup):
    """Merge the topics listed in ``topic_ids`` into one theme record.

    name: theme name chosen by the researcher.
    topic_ids: ids of the topics to merge, stored unchanged in the result.
    label_lookup: mapping whose values are labelled topic dicts.

    Returns a dict with the summed sentence count, the unweighted mean of the
    member centroids (``[]`` when none is available), up to ``NEAREST_K``
    evidence sentences and empty taxonomy placeholders.
    """
    # Compare ids as stripped strings so that 3 and "3" select the same topic;
    # ids produced by an LLM are not reliably typed.
    wanted = {str(topic_id).strip() for topic_id in topic_ids}
    topics = [
        topic for topic in label_lookup.values()
        if str(topic["topic_id"]).strip() in wanted
    ]

    # Interleave the evidence (best sentence of every topic first, then the
    # second best, ...) so the theme is not described by its first topic alone.
    per_topic = [topic.get("top_sentences", []) for topic in topics]
    depth = max((len(sentences) for sentences in per_topic), default=0)
    evidence = [
        sentences[rank]
        for rank in range(depth)
        for sentences in per_topic
        if rank < len(sentences)
    ]

    # Topics without a stored centroid are skipped; mixing [] with real vectors
    # would give np.mean a ragged input.
    centroids = [topic["centroid"] for topic in topics if topic.get("centroid")]
    centroid = np.mean(centroids, axis=0).tolist() if centroids else []

    return {
        "name": name,
        "topic_ids": topic_ids,
        "sentence_count": sum(topic.get("sentence_count", 0) for topic in topics),
        "top_sentences": evidence[:NEAREST_K],
        "centroid": centroid,
        "pajais_match": "",
        "match_confidence": 0.0,
        "reasoning": "",
        "is_novel": False,
    }
| # ── Tool 5: compare_with_taxonomy ──────────────────────────────────────────── | |
def _index_taxonomy_answers(mapped):
    """Index the LLM's taxonomy answer by normalised theme name.

    Names are stripped and case-folded so that small differences in case or
    spacing do not lose the mapping. Only the taxonomy fields are kept;
    elements that are not dicts or lack ``theme_name`` are ignored.
    """
    fields = ("pajais_match", "match_confidence", "reasoning", "is_novel")
    # The parser may hand back a single object instead of an array.
    items = mapped if isinstance(mapped, list) else [mapped]
    return {
        str(item["theme_name"]).strip().casefold(): {key: item[key] for key in fields if key in item}
        for item in items
        if isinstance(item, dict) and "theme_name" in item
    }


def _theme_is_novel(theme):
    """Return True when ``theme`` is flagged novel or its match is the literal NOVEL.

    A textual flag is compared with "true", because a non-empty string such as
    "false" would otherwise be truthy.
    """
    flag = theme.get("is_novel", False)
    flagged = flag.strip().lower() == "true" if isinstance(flag, str) else bool(flag)
    return flagged or str(theme.get("pajais_match", "")).strip().upper() == "NOVEL"


def compare_with_taxonomy(run_key: str) -> str:
    """Map final themes to PAJAIS 25-category taxonomy using Mistral.
    Each theme gets: pajais_match (or NOVEL), match_confidence, reasoning, is_novel.
    Saves taxonomy_map.json. run_key: 'abstract' or 'title'."""
    # NOTE(review): this function is exported through ALL_TOOLS. If its
    # docstring is used as the tool description shown to the LLM, rewording it
    # changes agent behaviour, so it is kept verbatim.
    #
    # Reads ``{run_key}_themes.json`` and writes ``{run_key}_taxonomy_map.json``.
    with open(_ckpt(f"{run_key}_themes.json"), encoding="utf-8") as f:
        themes = json.load(f)

    # Each theme is shown with its name and up to two evidence sentences.
    theme_text = "\n".join(
        f"Theme: {t['name']}\n"
        f"Evidence: {' | '.join(t.get('top_sentences', [])[:2])}"
        for t in themes
    )
    pajais_text = "\n".join(PAJAIS_CATEGORIES)
    prompt = PromptTemplate.from_template(
        """You are a PAJAIS taxonomy expert. Map each research theme to the closest PAJAIS category.
If no category fits well (similarity < 0.6), mark as NOVEL.
PAJAIS Categories:
{pajais}
Themes to map:
{themes}
Return ONLY a JSON array. Each element: theme_name (str), pajais_match (str, exact category name or "NOVEL"),
match_confidence (float 0-1), reasoning (str, 1 sentence), is_novel (bool).
JSON array:"""
    )
    parser = JsonOutputParser()
    chain = prompt | _llm() | parser
    mapped = chain.invoke({"pajais": pajais_text, "themes": theme_text})

    answers = _index_taxonomy_answers(mapped)
    merged = [
        {**t, **answers.get(str(t["name"]).strip().casefold(), {})}
        for t in themes
    ]
    # Store a consistent boolean flag so later steps can rely on it.
    result = [{**t, "is_novel": _theme_is_novel(t)} for t in merged]
    with open(_ckpt(f"{run_key}_taxonomy_map.json"), "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)

    novel_count = sum(1 for t in result if t["is_novel"])
    # A theme counts as mapped only when it has a match; themes the model
    # skipped keep an empty match and are neither mapped nor novel.
    mapped_count = sum(1 for t in result if t.get("pajais_match") and not t["is_novel"])
    return (
        f"β PAJAIS taxonomy mapping complete for **{run_key}** run.\n"
        f"β MAPPED themes: {mapped_count}\n"
        f"π NOVEL themes: {novel_count}\n\n"
        f"The review table now shows PAJAIS matches in the **Top Evidence** column.\n"
        f"**Review the mapping in the table.** Novel themes may represent publishable research gaps. "
        f"Click **Submit Review** to proceed to Phase 6."
    )
| # ── Tool 6: generate_comparison_csv ────────────────────────────────────────── | |
def generate_comparison_csv() -> str:
    """Load themes from both abstract and title runs, create side-by-side comparison DataFrame.
    Saves comparison.csv showing convergence and divergence between runs."""
    # NOTE(review): this function is exported through ALL_TOOLS. If its
    # docstring is used as the tool description shown to the LLM, rewording it
    # changes agent behaviour, so it is kept verbatim. The rows of the two runs
    # are stacked (abstract first, then title) and told apart by the "Run" column.

    def _rows(run_label, themes):
        # One CSV row per theme; missing taxonomy fields fall back to defaults.
        return [
            {
                "Run": run_label,
                "Theme": theme["name"],
                "Sentences": theme.get("sentence_count", 0),
                "PAJAIS Match": theme.get("pajais_match", ""),
                "Confidence": theme.get("match_confidence", 0),
                "Novel": theme.get("is_novel", False),
                "Reasoning": theme.get("reasoning", ""),
            }
            for theme in themes
        ]

    with open(_ckpt("abstract_taxonomy_map.json")) as source:
        abstract_themes = json.load(source)
    with open(_ckpt("title_taxonomy_map.json")) as source:
        title_themes = json.load(source)

    combined = _rows("Abstract", abstract_themes) + _rows("Title", title_themes)
    pd.DataFrame(combined).to_csv(_ckpt("comparison.csv"), index=False)

    message_parts = [
        f"β Comparison CSV generated.\n",
        f"π Abstract themes: {len(abstract_themes)}\n",
        f"π Title themes: {len(title_themes)}\n",
        f"π Saved: comparison.csv\n\n",
        f"Check the **Download** tab for comparison.csv. ",
        f"Click **Submit Review** to confirm and generate the narrative report.",
    ]
    return "".join(message_parts)
| # ── Tool 7: export_narrative ───────────────────────────────────────────────── | |
def export_narrative(run_key: str) -> str:
    """Generate a 500-word Section 7 narrative report for the literature review paper.
    Uses themes and taxonomy mapping via Mistral. Saves narrative.txt.
    run_key: 'abstract' or 'title'."""
    # NOTE(review): this function is exported through ALL_TOOLS. If its
    # docstring is used as the tool description shown to the LLM, rewording it
    # changes agent behaviour, so it is kept verbatim.
    #
    # Reads ``{run_key}_taxonomy_map.json`` and writes ``{run_key}_narrative.txt``.
    # The word count is only requested in the prompt; it is not verified here.
    with open(_ckpt(f"{run_key}_taxonomy_map.json"), encoding="utf-8") as f:
        themes = json.load(f)

    # One summary line per theme: size, taxonomy match and novelty flag.
    themes_summary = "\n".join(
        f"- {t['name']}: {t.get('sentence_count', 0)} sentences, "
        f"PAJAIS: {t.get('pajais_match', 'NOVEL')}, "
        f"Novel: {t.get('is_novel', False)}"
        for t in themes
    )
    prompt = PromptTemplate.from_template(
        """You are an academic writing expert. Write a formal 500-word Section 7 (Thematic Analysis Results)
for a journal literature review paper using the following data.
Reference: Braun & Clarke (2006) six-phase thematic analysis methodology.
Mention: BERTopic clustering, AgglomerativeClustering with cosine metric, Mistral LLM labelling.
Include: key themes, PAJAIS taxonomy mapping, NOVEL themes as research gaps, limitations.
Use academic language. Do not use bullet points β write in paragraphs.
Themes and PAJAIS mapping ({run_key} run):
{themes}
Write Section 7 now (exactly 500 words):"""
    )
    chain = prompt | _llm()
    narrative = chain.invoke({"run_key": run_key, "themes": themes_summary})
    # Chat models return a message object; fall back to str() for anything else.
    text = narrative.content if hasattr(narrative, "content") else str(narrative)

    # Generated prose routinely contains curly quotes and dashes, so the file
    # is written as UTF-8 explicitly. The platform default encoding may be
    # unable to represent them and would raise UnicodeEncodeError.
    with open(_ckpt(f"{run_key}_narrative.txt"), "w", encoding="utf-8") as f:
        f.write(text)

    return (
        f"β Narrative report generated for **{run_key}** run.\n"
        f"π 500-word Section 7 draft saved.\n"
        f"π Saved: {run_key}_narrative.txt\n\n"
        f"Check the **Download** tab for all output files.\n\n"
        f"**Phase 6 complete. Thematic analysis finished.**\n"
        f"Download: comparison.csv, taxonomy_map.json, narrative.txt for your conference paper."
    )
| # ── Exported tool list ─────────────────────────────────────────────────────── | |
# The seven tool callables, listed in the same order as the "Tool 1" to
# "Tool 7" sections of this module.
ALL_TOOLS = [
    load_scopus_csv,          # Tool 1
    run_bertopic_discovery,   # Tool 2
    label_topics_with_llm,    # Tool 3
    consolidate_into_themes,  # Tool 4
    compare_with_taxonomy,    # Tool 5
    generate_comparison_csv,  # Tool 6
    export_narrative,         # Tool 7
]