Spaces:
Sleeping
Sleeping
| from langchain_core.tools import tool | |
| import os | |
| import json | |
| import re | |
| import numpy as np | |
| import pandas as pd | |
# Directory that receives the intermediate artefacts written by the tool
# functions; created at import time so later writes find it in place.
CHECKPOINT_DIR = "/tmp/checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# How many sentences closest to a cluster centroid are kept as evidence.
NEAREST_K = 5

# Sentence boundary: whitespace preceded by '.', '!' or '?' and followed by
# an upper-case ASCII letter.
SENT_SPLIT_RE = r'(?<=[.!?])\s+(?=[A-Z])'

# Fragments whose stripped length is below this are not treated as sentences.
MIN_SENT_LEN = 30

# Run key -> dataset columns whose text is analysed in that run.
RUN_CONFIGS = {
    "abstract": ["Abstract"],
    "title": ["Title"],
}

# Module-level store shared by the tool functions between calls.
_data = {}
def _split_sentences(text):
    """Break *text* into sentences and drop the short fragments.

    The argument is coerced with ``str`` and split on ``SENT_SPLIT_RE``.
    A piece is kept only when its stripped length is at least
    ``MIN_SENT_LEN``; kept pieces are returned exactly as split (they are
    not stripped themselves).
    """
    pieces = re.split(SENT_SPLIT_RE, str(text))
    return [piece for piece in pieces if len(piece.strip()) >= MIN_SENT_LEN]
def load_scopus_csv(filepath: str) -> str:
    """Load and summarize a Scopus CSV dataset.

    The file is read with ``utf-8-sig`` encoding and the resulting frame is
    stored in ``_data["df"]`` for the other tool functions.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A Markdown-formatted summary: paper count, an estimate of the number
        of abstract sentences (extrapolated from the first five rows), the
        number of non-null titles, non-null counts for the key columns, the
        first 15 column names and a three-row sample.

    Raises:
        Whatever ``pandas.read_csv`` raises for an unreadable file.
    """
    df = pd.read_csv(filepath, encoding="utf-8-sig")
    _data["df"] = df

    cols = [c for c in ["Title", "Abstract", "Author Keywords"] if c in df.columns]
    sample = df[cols].head(3).to_string(max_colwidth=80)
    nulls = ", ".join(f"{c}: {df[c].notna().sum()}/{len(df)}" for c in cols)

    # The estimate needs an "Abstract" column and at least one row; otherwise
    # the mean is NaN (or the lookup fails) and int() would raise.
    if "Abstract" in df.columns and len(df) > 0:
        avg_sents = df["Abstract"].head(5).apply(_split_sentences).apply(len).mean()
        est = int(avg_sents * len(df))
    else:
        est = 0

    # Same tolerance for a missing "Title" column as `cols` already has.
    title_count = int(df["Title"].notna().sum()) if "Title" in df.columns else 0

    column_names = ", ".join(map(str, list(df.columns)[:15]))
    return (f"📊 **Dataset Statistics:**\n"
            f"- **Papers:** {len(df)}\n"
            f"- **Abstract sentences:** ~{est}\n"
            f"- **Title sentences:** {title_count}\n"
            f"- **Non-null:** {nulls}\n\n"
            f"Columns: {column_names}\n\n"
            f"Sample:\n{sample}")
def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
    """Run BERTopic clustering on abstracts or titles.

    The text of the columns configured for *run_key* is split into
    sentences, publisher/boilerplate sentences are filtered out, the rest is
    embedded with ``all-MiniLM-L6-v2`` and clustered with agglomerative
    clustering (cosine metric, average linkage) inside BERTopic.

    Side effects:
        * saves the embeddings to ``{CHECKPOINT_DIR}/rq4_{run_key}_emb.npy``;
        * stores model, topic array, embeddings and sentence frame in
          ``_data`` under ``{run_key}_model``, ``{run_key}_topics``,
          ``{run_key}_embeddings`` and ``{run_key}_sent_df``;
        * writes HTML visualisations to ``/tmp`` when enough topics exist;
        * writes per-topic summaries to
          ``{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json``.

    Args:
        run_key: Key of ``RUN_CONFIGS`` selecting the columns to analyse.
        threshold: ``distance_threshold`` handed to the clustering step.

    Returns:
        A text overview with one line per discovered topic.

    Raises:
        KeyError: if no dataset has been loaded or *run_key* is unknown.
        ValueError: if none of the configured columns exist in the dataset
            or fewer than two sentences remain after filtering.
    """
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer
    from sklearn.preprocessing import FunctionTransformer
    from sklearn.cluster import AgglomerativeClustering

    df = _data["df"].copy()
    available = [c for c in RUN_CONFIGS[run_key] if c in df.columns]
    if not available:
        raise ValueError(
            f"None of the columns {RUN_CONFIGS[run_key]} for run '{run_key}' exist in the dataset."
        )

    # One row per sentence, carrying the paper metadata along.
    df["_text"] = df[available].fillna("").agg(" ".join, axis=1)
    df["_paper_id"] = df.index
    df["_sentences"] = df["_text"].apply(_split_sentences)
    meta = [c for c in ["_paper_id", "Title", "Author Keywords", "_sentences"] if c in df.columns]
    sent_df = (
        df[meta]
        .explode("_sentences")
        .rename(columns={"_sentences": "text"})
        .dropna(subset=["text"])
        .reset_index(drop=True)
    )
    sent_df["sent_id"] = sent_df.groupby("_paper_id").cumcount()

    # Drop publisher boilerplate and generic "implications are discussed" sentences.
    patterns = r"Licensee MDPI|Published by Informa|Published by Elsevier|Taylor & Francis|Copyright ©|Creative Commons|open access article|Inderscience Enterprises|All rights reserved|Springer Nature|Emerald Publishing|limitations and (future|implications|discussed)|implications (are|were) (discussed|presented)|concludes with .* implications"
    is_boilerplate = sent_df["text"].str.contains(patterns, case=False, regex=True, na=False)
    sent_df = sent_df[~is_boilerplate].reset_index(drop=True)

    # The clustering step cannot be fitted on fewer than two samples.
    if len(sent_df) < 2:
        raise ValueError(
            f"Run '{run_key}' produced only {len(sent_df)} usable sentence(s); at least 2 are required."
        )

    texts = sent_df["text"].tolist()
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    embs = embedder.encode(texts, show_progress_bar=False, normalize_embeddings=True)
    np.save(f"{CHECKPOINT_DIR}/rq4_{run_key}_emb.npy", embs)

    cluster = AgglomerativeClustering(n_clusters=None, metric="cosine", linkage="average", distance_threshold=threshold)
    # The identity FunctionTransformer stands in for UMAP, so clustering runs
    # on the embeddings as they are.
    model = BERTopic(hdbscan_model=cluster, umap_model=FunctionTransformer())
    topics, _ = model.fit_transform(texts, embs)
    t_arr = np.array(topics)

    _data[f"{run_key}_model"] = model
    _data[f"{run_key}_topics"] = t_arr
    _data[f"{run_key}_embeddings"] = embs
    _data[f"{run_key}_sent_df"] = sent_df

    # Number of real topics (the -1 outlier bucket is not counted).
    n = len(set(topics)) - int(-1 in topics)
    if n >= 3:
        model.visualize_topics().write_html(f"/tmp/rq4_{run_key}_intertopic.html")
    if n >= 1:
        model.visualize_barchart(top_n_topics=min(10, n)).write_html(f"/tmp/rq4_{run_key}_bars.html")
    if n >= 2:
        model.visualize_hierarchy().write_html(f"/tmp/rq4_{run_key}_hierarchy.html")
        model.visualize_heatmap().write_html(f"/tmp/rq4_{run_key}_heatmap.html")

    def _summarize(row):
        """Build the summary record (counts, nearest sentences, titles) of one topic."""
        mask = t_arr == row["Topic"]
        m_idx = np.where(mask)[0]
        m_embs = embs[mask]
        cent = m_embs.mean(axis=0)
        # Cosine distance of every member sentence to the topic centroid.
        dists = 1 - (m_embs @ cent) / (np.linalg.norm(m_embs, axis=1) * np.linalg.norm(cent) + 1e-10)
        near = np.argsort(dists)[:NEAREST_K]

        evidence = []
        for i in near:
            sent_row = sent_df.iloc[m_idx[i]]
            evidence.append({
                "sentence": str(sent_row["text"])[:250],
                "paper_id": int(sent_row["_paper_id"]),
                "title": str(sent_row.get("Title", ""))[:150],
                "keywords": str(sent_row.get("Author Keywords", ""))[:150],
            })

        p_df = sent_df.iloc[m_idx].drop_duplicates(subset=["_paper_id"])
        titles = [str(p_row.get("Title", ""))[:200] for _, p_row in p_df.head(50).iterrows()]
        return {
            "topic_id": int(row["Topic"]),
            "sentence_count": int(row["Count"]),
            "paper_count": len(p_df),
            "top_words": str(row.get("Name", ""))[:100],
            "nearest": evidence,
            "paper_titles": titles,
        }

    sums = [_summarize(r) for r in model.get_topic_info().to_dict("records") if r["Topic"] != -1]
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json", "w", encoding="utf-8") as fh:
        json.dump(sums, fh, indent=2, default=str)

    lines = [f" Topic {s['topic_id']} ({s['sentence_count']} sents, {s['paper_count']} papers): {s['top_words']}" for s in sums]
    return f"[{run_key}] {n} topics from {len(sent_df)} sentences.\n\n" + "\n".join(lines)
def label_topics_with_llm(run_key: str) -> str:
    """Use Mistral to generate academic labels for discovered topics.

    Reads ``{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json``, sends the 100
    topics with the most sentences (with their nearest sentences) to the
    model and merges the returned fields into the topic summaries. The
    result is written to ``{CHECKPOINT_DIR}/rq4_{run_key}_labels.json``.

    Labels are attached to summaries by ``topic_id``. A returned object
    without a usable ``topic_id`` is matched by its position in the list
    that was sent to the model. Topics for which no label came back keep
    their summary unchanged (no ``label``/``category`` keys).

    Args:
        run_key: Run whose summaries should be labelled.

    Returns:
        A Markdown overview with one line per topic.

    Raises:
        FileNotFoundError: if the summaries file does not exist.
    """
    from langchain_mistralai import ChatMistralAI
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import JsonOutputParser

    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json", encoding="utf-8") as fh:
        sums = json.load(fh)

    to_label = sorted(sums, key=lambda s: s.get("sentence_count", 0), reverse=True)[:100]
    block = "\n\n".join([f"Topic {s['topic_id']} ({s['sentence_count']} sents):\n{NEAREST_K} entries:\n" + "\n".join([f"- {e['sentence']}\n Paper: {e['title']}" for e in s["nearest"]]) for s in to_label])
    prompt = PromptTemplate.from_template("Return JSON ARRAY of objects with topic_id, label, category, confidence, reasoning, niche for:\n{topics}")
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0)
    raw = (prompt | llm | JsonOutputParser()).invoke({"topics": block})

    # The parser yields whatever JSON the model produced: tolerate a single
    # object instead of an array and ignore anything that is not an object.
    if isinstance(raw, dict):
        raw = [raw]
    candidates = [item for item in raw if isinstance(item, dict)] if isinstance(raw, list) else []

    def _topic_id(value):
        """Return *value* as an int topic id, or None when it is not one."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    known_ids = {s["topic_id"] for s in sums}
    by_id = {}
    for pos, item in enumerate(candidates):
        tid = _topic_id(item.get("topic_id"))
        if tid is None and pos < len(to_label):
            # No id returned: assume the model answered in the order asked.
            tid = to_label[pos]["topic_id"]
        if tid in known_ids:
            by_id.setdefault(tid, item)

    # Merging by id (not by position in `sums`) keeps counts and evidence
    # attached to the right topic even when the model returns fewer or
    # reordered entries; the summary's own topic_id always wins.
    labeled = [{**s, **by_id.get(s["topic_id"], {}), "topic_id": s["topic_id"]} for s in sums]

    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json", "w", encoding="utf-8") as fh:
        json.dump(labeled, fh, indent=2, default=str)

    lines = [f" **Topic {l.get('topic_id')}: {l.get('label')}** [{l.get('category')}] ({l.get('sentence_count')} sents)" for l in labeled]
    return f"[{run_key}] {len(labeled)} topics labeled.\n\n" + "\n\n".join(lines)
def generate_comparison_csv() -> str:
    """Create a comparison matrix across multiple analysis runs.

    Every run key of ``RUN_CONFIGS`` that already has a
    ``rq4_{key}_labels.json`` file in ``CHECKPOINT_DIR`` contributes one row
    per labelled topic. The table is written to ``/tmp/rq4_comparison.csv``.

    Returns:
        The output path followed by the table rendered as text.
    """
    columns = ["run", "topic_id", "label", "category", "sentences", "papers"]
    rows = []
    for key in RUN_CONFIGS:
        path = f"{CHECKPOINT_DIR}/rq4_{key}_labels.json"
        if not os.path.exists(path):
            continue
        with open(path, encoding="utf-8") as fh:
            labels = json.load(fh)
        rows.extend(
            {
                "run": key,
                "topic_id": l.get("topic_id"),
                "label": l.get("label"),
                "category": l.get("category"),
                "sentences": l.get("sentence_count"),
                "papers": l.get("paper_count"),
            }
            for l in labels
        )

    # Explicit columns keep the CSV header even when no run has finished yet.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv("/tmp/rq4_comparison.csv", index=False)
    return f"Saved to /tmp/rq4_comparison.csv\n\n{df.to_string(index=False)}"
def export_narrative(run_key: str) -> str:
    """Generate a 500-word research narrative for the results section.

    Reads the labelled topics of *run_key* from
    ``{CHECKPOINT_DIR}/rq4_{run_key}_labels.json``, asks the Mistral model
    for the narrative and writes the answer to ``/tmp/rq4_narrative.txt``.

    Args:
        run_key: Run whose labelled topics are described.

    Returns:
        The output path followed by the generated text.

    Raises:
        FileNotFoundError: if the labels file does not exist.
    """
    from langchain_mistralai import ChatMistralAI

    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json", encoding="utf-8") as fh:
        ls = json.load(fh)

    txt = "\n".join([f"- {l.get('label')} ({l.get('sentence_count')} sents)" for l in ls])
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.3)
    res = llm.invoke(f"Write a 500-word Section 7 'Topic Modeling Results' for {run_key} run:\n{txt}")

    # Context manager so the narrative is flushed and the handle closed.
    with open("/tmp/rq4_narrative.txt", "w", encoding="utf-8") as fh:
        fh.write(res.content)
    return f"Saved to /tmp/rq4_narrative.txt\n\n{res.content}"
def consolidate_into_themes(run_key: str, theme_map: dict) -> str:
    """Merge specific topics into broader research themes.

    Uses the topic array, embeddings and sentence frame that
    ``_data`` holds for *run_key*. For every theme the member sentences of
    all listed topics are pooled, counted, and the ``NEAREST_K`` sentences
    closest to the pooled centroid are kept as evidence. The themes are
    written to ``{CHECKPOINT_DIR}/rq4_{run_key}_themes.json``.

    Args:
        run_key: Run whose clustering result is consolidated.
        theme_map: Mapping of theme name to the topic ids it merges (a
            single id is accepted as well as a sequence of ids).

    Returns:
        A Markdown overview with one line per theme.

    Raises:
        KeyError: if ``_data`` holds no clustering result for *run_key*.
    """
    t_arr = _data[f"{run_key}_topics"]
    embs = _data[f"{run_key}_embeddings"]
    s_df = _data[f"{run_key}_sent_df"]

    def _build(name, ids):
        """Assemble the record (counts and nearest sentences) of one theme."""
        if np.isscalar(ids):
            ids = [ids]
        ids = list(ids)
        mask = np.isin(t_arr, ids)
        m_idx = np.where(mask)[0]

        evidence = []
        # A theme that matches no sentence has no centroid; skipping the
        # computation avoids a NaN mean and its RuntimeWarning.
        if m_idx.size:
            m_embs = embs[mask]
            cent = m_embs.mean(axis=0)
            dists = 1 - (m_embs @ cent) / (np.linalg.norm(m_embs, axis=1) * np.linalg.norm(cent) + 1e-10)
            for i in np.argsort(dists)[:NEAREST_K]:
                sent_row = s_df.iloc[m_idx[i]]
                evidence.append({
                    "sentence": str(sent_row["text"])[:250],
                    "title": str(sent_row.get("Title", ""))[:150],
                })

        return {
            "label": name,
            "merged_topics": ids,
            "sentence_count": int(mask.sum()),
            "paper_count": int(s_df.iloc[m_idx]["_paper_id"].nunique()),
            "nearest": evidence,
        }

    themes = [{"topic_id": i, **_build(n, ids)} for i, (n, ids) in enumerate(theme_map.items())]
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json", "w", encoding="utf-8") as fh:
        json.dump(themes, fh, indent=2, default=str)

    lines = [f" **{t['label']}** ({t['sentence_count']} sents)" for t in themes]
    return f"[{run_key}] {len(themes)} themes.\n\n" + "\n".join(lines)
# Category names of the PAJAIS taxonomy that themes are mapped against.
PAJAIS = [
    "Electronic Business",
    "HCI",
    "IS Strategy",
    "Business Intelligence",
    "Design Science",
    "Enterprise Systems",
    "Adoption",
    "Social Media",
    "Cultural Issues",
    "Security",
    "Smart/IoT",
    "Knowledge Management",
    "Digital Platform",
    "Healthcare",
    "Project Management",
    "Service Science",
    "Social/Org Aspects",
    "Research Methods",
    "E-Finance",
    "E-Government",
    "Education",
    "Sustainability",
]
def compare_with_taxonomy(run_key: str) -> str:
    """Map themes to the PAJAIS taxonomy or identify novel contributions.

    The consolidated themes file of *run_key* is used when it exists,
    otherwise the labelled topics file. The labels found there are sent to
    the Mistral model together with the ``PAJAIS`` categories, and the
    parsed answer is written to
    ``{CHECKPOINT_DIR}/rq4_{run_key}_taxonomy_map.json``.

    Args:
        run_key: Run whose themes or labelled topics are mapped.

    Returns:
        A short completion message.

    Raises:
        FileNotFoundError: if neither the themes nor the labels file exists.
    """
    from langchain_mistralai import ChatMistralAI
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import JsonOutputParser

    themes_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json"
    labels_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json"
    src = themes_path if os.path.exists(themes_path) else labels_path
    with open(src, encoding="utf-8") as fh:
        ts = json.load(fh)

    # Entries without a label (e.g. topics the labelling step could not
    # name) have nothing to map, so they are skipped instead of raising.
    theme_names = [str(t["label"]) for t in ts if t.get("label")]

    prompt = PromptTemplate.from_template("Map themes to PAJAIS taxonomy or mark 'NOVEL'. Return JSON array for:\nThemes:\n{ts}\nTaxonomy:\n{tax}")
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0)
    ms = (prompt | llm | JsonOutputParser()).invoke({"ts": "\n".join(theme_names), "tax": "\n".join(PAJAIS)})

    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_taxonomy_map.json", "w", encoding="utf-8") as fh:
        json.dump(ms, fh, indent=2, default=str)
    return f"[{run_key}] Mapping complete."
def get_all_tools():
    """Return the tool functions of this module.

    Each returned object gets its ``handle_tool_error`` attribute set to
    ``True`` before the list is handed back.
    """
    tools = [
        load_scopus_csv,
        run_bertopic_discovery,
        label_topics_with_llm,
        consolidate_into_themes,
        compare_with_taxonomy,
        generate_comparison_csv,
        export_narrative,
    ]
    for tool_fn in tools:
        tool_fn.handle_tool_error = True
    return tools