Dash10107's picture
Upload folder using huggingface_hub
822c198 verified
from langchain_core.tools import tool
import os
import json
import re
import numpy as np
import pandas as pd
# Directory where intermediate artifacts (embeddings, topic summaries, labels,
# themes, taxonomy maps) are written. It is created at import time if missing.
CHECKPOINT_DIR = "/tmp/checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
# Number of sentences closest to a topic/theme centroid that are kept as evidence.
NEAREST_K = 5
# Sentence boundary: whitespace that follows '.', '!' or '?' and precedes an
# uppercase ASCII letter.
SENT_SPLIT_RE = r'(?<=[.!?])\s+(?=[A-Z])'
# Sentences whose stripped length is below this many characters are discarded.
MIN_SENT_LEN = 30
# Maps a run key to the DataFrame columns whose text is analysed for that run.
RUN_CONFIGS = {"abstract": ["Abstract"], "title": ["Title"]}
# In-process cache shared by the tools: the loaded DataFrame under "df" plus,
# per run key, the fitted model, topic assignments, embeddings and sentence table.
_data = {}
def _split_sentences(text):
    """Split *text* into sentences and keep only the sufficiently long ones.

    The value is coerced with ``str`` and split on ``SENT_SPLIT_RE``. Pieces
    whose stripped length is below ``MIN_SENT_LEN`` are dropped; the pieces
    that survive are returned as produced by the split (not stripped).
    """
    pieces = re.split(SENT_SPLIT_RE, str(text))
    return [piece for piece in pieces if len(piece.strip()) >= MIN_SENT_LEN]
# NOTE: the docstring below is kept verbatim on purpose -- LangChain's @tool
# decorator exposes it to the model as the tool description.
#
# Reads the CSV at `filepath` (UTF-8, optional BOM), stores the DataFrame in
# the shared `_data["df"]` cache and returns a Markdown summary string:
# paper count, estimated abstract-sentence count, non-null title count,
# non-null counts for the known text columns, the first 15 column names and
# a 3-row sample.
@tool
def load_scopus_csv(filepath: str) -> str:
    """Load and summarize a Scopus CSV dataset."""
    df = pd.read_csv(filepath, encoding="utf-8-sig")
    _data["df"] = df

    # Only summarize the text columns that this export actually contains.
    cols = [c for c in ["Title", "Abstract", "Author Keywords"] if c in df.columns]
    sample = df[cols].head(3).to_string(max_colwidth=80)
    nulls = ", ".join(f"{c}: {df[c].notna().sum()}/{len(df)}" for c in cols)

    # Estimate the total sentence count by extrapolating the average over the
    # first five abstracts. Guard against a missing "Abstract" column
    # (KeyError) and an empty frame (mean is NaN, and int(NaN) raises).
    est = 0
    if "Abstract" in df.columns and len(df) > 0:
        avg_sents = df["Abstract"].head(5).apply(_split_sentences).apply(len).mean()
        est = int(avg_sents * len(df))

    # Same guard for "Title": report 0 rather than failing on a missing column.
    title_count = int(df["Title"].notna().sum()) if "Title" in df.columns else 0

    return (f"📊 **Dataset Statistics:**\n"
            f"- **Papers:** {len(df)}\n"
            f"- **Abstract sentences:** ~{est}\n"
            f"- **Title sentences:** {title_count}\n"
            f"- **Non-null:** {nulls}\n\n"
            f"Columns: {', '.join(list(df.columns)[:15])}\n\n"
            f"Sample:\n{sample}")
# NOTE: the docstring below is kept verbatim on purpose -- LangChain's @tool
# decorator exposes it to the model as the tool description.
#
# Pipeline for one run key (a key of RUN_CONFIGS):
#   1. split the configured text columns of the cached DataFrame into sentences,
#   2. drop publisher/boilerplate sentences,
#   3. embed the sentences and cluster them with average-linkage agglomerative
#      clustering on cosine distance (cut at `threshold`),
#   4. cache model/topics/embeddings/sentences in `_data`, write HTML charts to
#      /tmp and per-topic summaries to CHECKPOINT_DIR,
#   5. return a one-line-per-topic text overview.
# Raises KeyError if no dataset has been loaded or `run_key` is unknown.
@tool
def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
    """Run BERTopic clustering on abstracts or titles."""
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer
    from sklearn.preprocessing import FunctionTransformer
    from sklearn.cluster import AgglomerativeClustering

    df = _data["df"].copy()
    available = [c for c in RUN_CONFIGS[run_key] if c in df.columns]
    df["_text"] = df[available].fillna("").agg(" ".join, axis=1)
    df["_paper_id"] = df.index
    df["_sentences"] = df["_text"].apply(_split_sentences)

    # One row per sentence, carrying the paper-level metadata along.
    meta = [c for c in ["_paper_id", "Title", "Author Keywords", "_sentences"] if c in df.columns]
    sent_df = (
        df[meta]
        .explode("_sentences")
        .rename(columns={"_sentences": "text"})
        .dropna(subset=["text"])
        .reset_index(drop=True)
    )
    sent_df["sent_id"] = sent_df.groupby("_paper_id").cumcount()

    # Publisher/copyright boilerplate and generic "implications are discussed"
    # sentences are removed before embedding.
    patterns = r"Licensee MDPI|Published by Informa|Published by Elsevier|Taylor & Francis|Copyright ©|Creative Commons|open access article|Inderscience Enterprises|All rights reserved|Springer Nature|Emerald Publishing|limitations and (future|implications|discussed)|implications (are|were) (discussed|presented)|concludes with .* implications"
    is_boilerplate = sent_df["text"].str.contains(patterns, case=False, regex=True, na=False)
    sent_df = sent_df[~is_boilerplate].reset_index(drop=True)

    texts = sent_df["text"].tolist()
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    embs = embedder.encode(texts, show_progress_bar=False, normalize_embeddings=True)
    np.save(f"{CHECKPOINT_DIR}/rq4_{run_key}_emb.npy", embs)

    cluster = AgglomerativeClustering(n_clusters=None, metric="cosine", linkage="average", distance_threshold=threshold)
    # FunctionTransformer() without a function is an identity transform, so the
    # dimensionality-reduction slot passes the embeddings through unchanged.
    model = BERTopic(hdbscan_model=cluster, umap_model=FunctionTransformer())
    topics, _ = model.fit_transform(texts, embs)

    t_arr = np.array(topics)
    _data[f"{run_key}_model"] = model
    _data[f"{run_key}_topics"] = t_arr
    _data[f"{run_key}_embeddings"] = embs
    _data[f"{run_key}_sent_df"] = sent_df

    # Number of real topics; -1 (outliers) is not counted.
    n = len(set(topics)) - int(-1 in topics)

    # Each chart is only written when there are enough topics for it.
    if n >= 3:
        model.visualize_topics().write_html(f"/tmp/rq4_{run_key}_intertopic.html")
    if n >= 1:
        model.visualize_barchart(top_n_topics=min(10, n)).write_html(f"/tmp/rq4_{run_key}_bars.html")
    if n >= 2:
        model.visualize_hierarchy().write_html(f"/tmp/rq4_{run_key}_hierarchy.html")
        model.visualize_heatmap().write_html(f"/tmp/rq4_{run_key}_heatmap.html")

    valid = [r for r in model.get_topic_info().to_dict("records") if r["Topic"] != -1]

    def _summarize_topic(row):
        """Build the summary dict (counts, evidence sentences, titles) for one topic."""
        mask = t_arr == row["Topic"]
        m_idx = np.where(mask)[0]
        m_embs = embs[mask]
        cent = m_embs.mean(axis=0)
        # Cosine distance of every member sentence to the topic centroid.
        dists = 1 - (m_embs @ cent) / (np.linalg.norm(m_embs, axis=1) * np.linalg.norm(cent) + 1e-10)
        near = np.argsort(dists)[:NEAREST_K]

        evidence = []
        for i in near:
            member = sent_df.iloc[m_idx[i]]  # look the row up once, not per field
            evidence.append({
                "sentence": str(member["text"])[:250],
                "paper_id": int(member["_paper_id"]),
                "title": str(member.get("Title", ""))[:150],
                "keywords": str(member.get("Author Keywords", ""))[:150],
            })

        # One row per contributing paper; at most 50 titles are kept.
        p_df = sent_df.iloc[m_idx].drop_duplicates(subset=["_paper_id"])
        titles = [str(p_df.iloc[i].get("Title", ""))[:200] for i in range(min(50, len(p_df)))]
        return {
            "topic_id": int(row["Topic"]),
            "sentence_count": int(row["Count"]),
            "paper_count": len(p_df),
            "top_words": str(row.get("Name", ""))[:100],
            "nearest": evidence,
            "paper_titles": titles,
        }

    sums = [_summarize_topic(row) for row in valid]

    # Use a context manager so the checkpoint is flushed and closed before any
    # later tool reads it back.
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json", "w") as fh:
        json.dump(sums, fh, indent=2, default=str)

    lines = [f" Topic {s['topic_id']} ({s['sentence_count']} sents, {s['paper_count']} papers): {s['top_words']}" for s in sums]
    return f"[{run_key}] {n} topics from {len(sent_df)} sentences.\n\n" + "\n".join(lines)
def _merge_topic_labels(sums, labels):
    """Attach LLM label dicts to topic summaries by matching ``topic_id``.

    ``sums`` is the list of topic summary dicts; ``labels`` is whatever the
    JSON parser returned (expected: a list of dicts, a single dict is also
    tolerated). Entries without a usable integer ``topic_id`` are ignored.
    Every summary is returned exactly once, in its original order; summaries
    with no matching label are returned unchanged.
    """
    if isinstance(labels, dict):
        labels = [labels]
    if not isinstance(labels, list):
        labels = []

    by_id = {}
    for entry in labels:
        if not isinstance(entry, dict):
            continue
        try:
            by_id[int(entry.get("topic_id"))] = entry
        except (TypeError, ValueError):
            continue

    # Label fields override summary fields (as before), but the summary's own
    # topic_id stays authoritative so its type remains an int.
    return [{**s, **by_id.get(s["topic_id"], {}), "topic_id": s["topic_id"]} for s in sums]


# NOTE: the docstring below is kept verbatim on purpose -- LangChain's @tool
# decorator exposes it to the model as the tool description.
#
# Reads the topic summaries checkpoint for `run_key`, asks the LLM to label the
# 100 largest topics, merges the labels back into the summaries, writes the
# result to the labels checkpoint and returns a Markdown listing.
@tool
def label_topics_with_llm(run_key: str) -> str:
    """Use Mistral to generate academic labels for discovered topics."""
    from langchain_mistralai import ChatMistralAI
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import JsonOutputParser

    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json") as fh:
        sums = json.load(fh)

    # Only the 100 topics with the most sentences are sent to the model.
    to_label = sorted(sums, key=lambda s: s.get("sentence_count", 0), reverse=True)[:100]

    sections = []
    for s in to_label:
        header = f"Topic {s['topic_id']} ({s['sentence_count']} sents):\n{NEAREST_K} entries:\n"
        entries = "\n".join([f"- {e['sentence']}\n Paper: {e['title']}" for e in s["nearest"]])
        sections.append(header + entries)
    block = "\n\n".join(sections)

    prompt = PromptTemplate.from_template("Return JSON ARRAY of objects with topic_id, label, category, confidence, reasoning, niche for:\n{topics}")
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0)
    labels = (prompt | llm | JsonOutputParser()).invoke({"topics": block})

    # Join on topic_id instead of list position. The previous positional
    # zip(sums, labels + sums) attached labels to the wrong topic whenever the
    # model skipped or reordered entries, and replaced every topic beyond the
    # returned labels with a copy of an unrelated summary.
    labeled = _merge_topic_labels(sums, labels)

    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json", "w") as fh:
        json.dump(labeled, fh, indent=2, default=str)

    lines = [f" **Topic {l.get('topic_id')}: {l.get('label')}** [{l.get('category')}] ({l.get('sentence_count')} sents)" for l in labeled]
    return f"[{run_key}] {len(labeled)} topics labeled.\n\n" + "\n\n".join(lines)
# NOTE: the docstring below is kept verbatim on purpose -- LangChain's @tool
# decorator exposes it to the model as the tool description.
#
# Collects the label checkpoints of every run key in RUN_CONFIGS that has one,
# flattens them into one table (one row per topic), writes it to
# /tmp/rq4_comparison.csv and returns the table as text.
@tool
def generate_comparison_csv() -> str:
    """Create a comparison matrix across multiple analysis runs."""
    columns = ["run", "topic_id", "label", "category", "sentences", "papers"]
    rows = []
    for run_key in RUN_CONFIGS:
        path = f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json"
        if not os.path.exists(path):
            continue  # this run has not been labeled yet
        # Context manager: the handle is closed as soon as the file is parsed.
        with open(path) as fh:
            topic_labels = json.load(fh)
        rows.extend(
            {
                "run": run_key,
                "topic_id": l.get("topic_id"),
                "label": l.get("label"),
                "category": l.get("category"),
                "sentences": l.get("sentence_count"),
                "papers": l.get("paper_count"),
            }
            for l in topic_labels
        )

    # Passing the column list keeps the CSV header present even when no run
    # has been labeled yet; with data the column order is unchanged.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv("/tmp/rq4_comparison.csv", index=False)
    return f"Saved to /tmp/rq4_comparison.csv\n\n{df.to_string(index=False)}"
# NOTE: the docstring below is kept verbatim on purpose -- LangChain's @tool
# decorator exposes it to the model as the tool description.
#
# Reads the label checkpoint for `run_key`, asks the LLM for a narrative based
# on the topic labels and sentence counts, writes the text to
# /tmp/rq4_narrative.txt and returns it prefixed with the save location.
@tool
def export_narrative(run_key: str) -> str:
    """Generate a 500-word research narrative for the results section."""
    from langchain_mistralai import ChatMistralAI

    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json") as fh:
        ls = json.load(fh)

    txt = "\n".join([f"- {l.get('label')} ({l.get('sentence_count')} sents)" for l in ls])
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.3)
    res = llm.invoke(f"Write a 500-word Section 7 'Topic Modeling Results' for {run_key} run:\n{txt}")

    # Write through a context manager so the narrative is flushed to disk and
    # the handle is closed before the path is reported back to the caller.
    with open("/tmp/rq4_narrative.txt", "w", encoding="utf-8") as out:
        out.write(res.content)
    return f"Saved to /tmp/rq4_narrative.txt\n\n{res.content}"
# NOTE: the docstring below is kept verbatim on purpose -- LangChain's @tool
# decorator exposes it to the model as the tool description.
#
# `theme_map` maps a theme name to the topic ids that are merged into it.
# For each theme the sentences of all its topics are pooled; the sentence and
# paper counts plus the NEAREST_K sentences closest to the pooled centroid are
# recorded. Themes are written to the themes checkpoint for `run_key` and a
# short Markdown listing is returned. Requires run_bertopic_discovery to have
# populated `_data` for this run key (KeyError otherwise).
@tool
def consolidate_into_themes(run_key: str, theme_map: dict) -> str:
    """Merge specific topics into broader research themes."""
    t_arr = _data[f"{run_key}_topics"]
    embs = _data[f"{run_key}_embeddings"]
    s_df = _data[f"{run_key}_sent_df"]

    def _build(name, ids):
        """Summarize the sentences belonging to any topic id in `ids`."""
        mask = np.isin(t_arr, ids)
        m_idx = np.where(mask)[0]

        evidence = []
        # Skip the centroid math when no sentence matches: the mean of an empty
        # selection is NaN (with a RuntimeWarning) and yields no evidence anyway.
        if m_idx.size:
            m_embs = embs[mask]
            cent = m_embs.mean(axis=0)
            # Cosine distance of each pooled sentence to the theme centroid.
            dists = 1 - (m_embs @ cent) / (np.linalg.norm(m_embs, axis=1) * np.linalg.norm(cent) + 1e-10)
            for i in np.argsort(dists)[:NEAREST_K]:
                member = s_df.iloc[m_idx[i]]
                evidence.append({
                    "sentence": str(member["text"])[:250],
                    "title": str(member.get("Title", ""))[:150],
                })

        return {
            "label": name,
            "merged_topics": list(ids),
            "sentence_count": int(mask.sum()),
            "paper_count": int(s_df.iloc[m_idx]["_paper_id"].nunique()),
            "nearest": evidence,
        }

    # Themes get fresh sequential ids in the iteration order of theme_map.
    themes = [{"topic_id": i, **_build(n, ids)} for i, (n, ids) in enumerate(theme_map.items())]

    # Context manager guarantees the checkpoint is flushed and closed.
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json", "w") as fh:
        json.dump(themes, fh, indent=2, default=str)

    lines = [f" **{t['label']}** ({t['sentence_count']} sents)" for t in themes]
    return f"[{run_key}] {len(themes)} themes.\n\n" + "\n".join(lines)
# Reference category names; compare_with_taxonomy joins them into the LLM
# prompt as the taxonomy that themes are mapped against.
# NOTE(review): presumably the Pacific Asia Journal of the AIS classification -- confirm.
PAJAIS = ["Electronic Business", "HCI", "IS Strategy", "Business Intelligence", "Design Science", "Enterprise Systems", "Adoption", "Social Media", "Cultural Issues", "Security", "Smart/IoT", "Knowledge Management", "Digital Platform", "Healthcare", "Project Management", "Service Science", "Social/Org Aspects", "Research Methods", "E-Finance", "E-Government", "Education", "Sustainability"]
# NOTE: the docstring below is kept verbatim on purpose -- LangChain's @tool
# decorator exposes it to the model as the tool description.
#
# Loads the themes checkpoint for `run_key` when it exists, otherwise the
# labels checkpoint, sends the theme names together with the PAJAIS category
# list to the LLM, stores the parsed JSON answer in the taxonomy-map
# checkpoint and returns a short confirmation string.
@tool
def compare_with_taxonomy(run_key: str) -> str:
    """Map themes to the PAJAIS taxonomy or identify novel contributions."""
    from langchain_mistralai import ChatMistralAI
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import JsonOutputParser

    themes_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json"
    labels_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json"
    # Consolidated themes take precedence over raw topic labels.
    src = themes_path if os.path.exists(themes_path) else labels_path

    with open(src) as fh:
        ts = json.load(fh)

    # Entries in the labels checkpoint may lack a "label" (topics the model did
    # not label); fall back to the topic's top words or its id instead of
    # raising KeyError.
    names = [
        str(t.get("label") or t.get("top_words") or f"Topic {t.get('topic_id')}")
        for t in ts
    ]

    prompt = PromptTemplate.from_template("Map themes to PAJAIS taxonomy or mark 'NOVEL'. Return JSON array for:\nThemes:\n{ts}\nTaxonomy:\n{tax}")
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0)
    ms = (prompt | llm | JsonOutputParser()).invoke({"ts": "\n".join(names), "tax": "\n".join(PAJAIS)})

    # Context manager guarantees the mapping is flushed and the handle closed.
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_taxonomy_map.json", "w") as fh:
        json.dump(ms, fh, indent=2, default=str)
    return f"[{run_key}] Mapping complete."
def get_all_tools():
    """Return all tools of this module with ``handle_tool_error`` set to True.

    The flag is set on the module-level tool objects themselves, so the same
    objects are returned (and already flagged) on every call.
    """
    tools = [
        load_scopus_csv,
        run_bertopic_discovery,
        label_topics_with_llm,
        consolidate_into_themes,
        compare_with_taxonomy,
        generate_comparison_csv,
        export_narrative,
    ]
    for entry in tools:
        entry.handle_tool_error = True
    return tools