| import pandas as pd |
| import numpy as np |
| import json |
| import re |
| import nltk |
| from dotenv import load_dotenv |
| from functools import reduce |
| from sentence_transformers import SentenceTransformer |
| from sklearn.cluster import AgglomerativeClustering |
| from sklearn.metrics.pairwise import cosine_similarity |
| from langchain_core.tools import tool |
| from langchain_core.prompts import PromptTemplate |
| from langchain_core.output_parsers import JsonOutputParser |
| from langchain_mistralai import ChatMistralAI |
| import plotly.express as px |
| import plotly.graph_objects as go |
# Load variables from a local .env file into the process environment before
# any client is constructed (presumably this is where the Mistral API key
# comes from -- confirm against deployment docs).
load_dotenv()

# Tokenizer resources needed by nltk.sent_tokenize; quiet=True suppresses the
# download progress output.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource, quiet=True)

# Single chat-model client shared by every LLM-backed tool in this module.
llm = ChatMistralAI(model="mistral-large-latest", temperature=0)
|
|
| |
# Case-insensitive regexes for publisher / licensing boilerplate. Each match is
# replaced with "" by load_scopus_csv, in list order. Most patterns end in
# ".*", so everything from the matched phrase to the end of the line is
# removed ("." does not cross newlines).
#
# The short acronyms are anchored with \b so they cannot fire inside ordinary
# words (an unanchored "acm" truncates "Macmillan"), and the acknowledgement
# pattern accepts both the "acknowledgement" and "acknowledgment" spellings.
BOILERPLATE_PATTERNS = [
    r"(?i)©\s*\d{4}\s*Elsevier.*",
    r"(?i)all rights reserved",
    r"(?i)peer-review under responsibility.*",
    r"(?i)available online.*",
    r"(?i)author keywords.*",
    r"(?i)index terms.*",
    r"(?i)funding details.*",
    r"(?i)conflict of interest.*",
    r"(?i)declaration of competing interest.*",
    r"(?i)data availability.*",
    r"(?i)acknowledge?ment.*",
    r"(?i)open access.*",
    r"(?i)creative commons.*",
    r"(?i)licensee mdpi.*",
    r"(?i)springer nature.*",
    r"(?i)taylor & francis.*",
    r"(?i)wiley & sons.*",
    r"(?i)emerald publishing.*",
    r"(?i)\bieee\b.*",
    r"(?i)\bacm\b.*",
    r"(?i)published by.*",
    r"(?i)copyright.*",
]
|
|
@tool()
def load_scopus_csv(filepath: str) -> str:
    """Loads CSV, splits abstracts/titles into sentences, applies regex noise filters, and reports stats."""
    # NOTE: `@tool` uses the docstring above as the description shown to the
    # agent, so it is kept verbatim; maintainer notes live in comments.
    #
    # Reads `filepath` with pandas, strips every BOILERPLATE_PATTERNS match
    # from the "Abstract" and "Title" columns, sentence-tokenizes the cleaned
    # text with NLTK, writes the enriched frame to processed_data.json
    # (records orientation) and returns a one-line summary of the counts.
    #
    # Raises KeyError when a required column is absent from the CSV.
    df = pd.read_csv(filepath)

    # Fail early with a message that names the file and the missing columns
    # instead of a bare KeyError from the column lookup below.
    missing = [col for col in ("Abstract", "Title") if col not in df.columns]
    if missing:
        raise KeyError(f"{filepath} is missing required column(s): {missing}")

    def strip_boilerplate(text) -> str:
        """Return `text` as a string with every boilerplate match removed."""
        cleaned = str(text)
        for pattern in BOILERPLATE_PATTERNS:
            cleaned = re.sub(pattern, "", cleaned)
        return cleaned

    # Missing cells become "" so they tokenize to an empty sentence list.
    df['clean_abstract'] = df['Abstract'].fillna("").apply(strip_boilerplate)
    df['clean_title'] = df['Title'].fillna("").apply(strip_boilerplate)

    df['abstract_sentences'] = df['clean_abstract'].apply(nltk.sent_tokenize)
    df['title_sentences'] = df['clean_title'].apply(nltk.sent_tokenize)

    # Persisted for the downstream clustering tool, which reloads this file.
    df.to_json("processed_data.json", orient="records")

    total_papers = len(df)
    total_abstract_sents = int(df['abstract_sentences'].apply(len).sum())
    total_title_sents = int(df['title_sentences'].apply(len).sum())

    return f"Data loaded. Papers: {total_papers}, Abstract sentences: {total_abstract_sents}, Title sentences: {total_title_sents}."
|
|
@tool()
def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
    """Embeds text, clusters with AgglomerativeClustering (NO UMAP), finds nearest centroids, saves summaries & charts."""
    # NOTE: `@tool` uses the docstring above as the description shown to the
    # agent, so it is kept verbatim; maintainer notes live in comments.
    #
    # run_key:   "abstract" or "title"; selects which sentence column of
    #            processed_data.json is clustered.
    # threshold: passed to AgglomerativeClustering as distance_threshold
    #            (cosine metric, average linkage).
    #
    # Writes emb.npy (sentence embeddings), summaries.json (per-cluster
    # representative sentences, size, distinct-paper count) and charts.html.
    # The file names do not depend on run_key, so a second run overwrites the
    # first.
    #
    # Raises KeyError for an unknown run_key and ValueError when fewer than
    # two sentences are available to cluster.
    column_map = {"abstract": "abstract_sentences", "title": "title_sentences"}
    if run_key not in column_map:
        raise KeyError(f"Unknown run_key {run_key!r}; expected one of {sorted(column_map)}")
    target_col = column_map[run_key]

    df = pd.read_json("processed_data.json")

    # Flatten to one (paper id, sentence) pair per sentence.
    pairs = [(row['EID'], sent) for _, row in df.iterrows() for sent in row[target_col]]
    paper_ids = [paper_id for paper_id, _ in pairs]
    sentences = [sent for _, sent in pairs]

    # Clustering needs at least two samples; say so in terms of the pipeline
    # instead of letting the estimator complain about array shapes.
    if len(sentences) < 2:
        raise ValueError(
            f"Need at least 2 sentences to cluster for run_key {run_key!r}, found {len(sentences)}."
        )

    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(sentences, normalize_embeddings=True)

    cluster_model = AgglomerativeClustering(n_clusters=None, metric="cosine", linkage="average", distance_threshold=threshold)
    labels = cluster_model.fit_predict(embeddings)

    # Distinct papers per cluster, computed in one grouped pass.
    df_cluster = pd.DataFrame({"sentence": sentences, "label": labels, "paper_id": paper_ids})
    papers_per_label = df_cluster.groupby("label")["paper_id"].nunique()
    unique_labels = np.unique(labels)

    summaries = {}
    for label in unique_labels:
        member_idx = np.flatnonzero(labels == label)
        member_emb = embeddings[member_idx]
        centroid = member_emb.mean(axis=0, keepdims=True)
        # Rank only this cluster's own sentences against its centroid, so a
        # small cluster never reports sentences that belong to other clusters.
        sims = cosine_similarity(member_emb, centroid).ravel()
        ranked = member_idx[np.argsort(sims)[::-1][:5]]  # most similar first
        summaries[str(label)] = {
            "top_sentences": [sentences[idx] for idx in ranked],
            "size": int(member_idx.size),
            "papers_count": int(papers_per_label.loc[label]),
        }

    # Only the bar chart carries data; the other three are titled placeholders.
    fig_bar = px.bar(x=list(summaries.keys()), y=[s['size'] for s in summaries.values()], title="Cluster Sizes")
    fig_map = px.scatter(title="Intertopic Map (Placeholder - No UMAP space)")
    fig_hier = px.line(title="Hierarchy (Placeholder)")
    fig_heat = px.density_heatmap(title="Heatmap (Placeholder)")

    np.save("emb.npy", embeddings)
    with open("summaries.json", "w") as f:
        json.dump(summaries, f)

    # Emit ONE HTML document: each figure is written as a <div> fragment and
    # plotly.js is loaded from the CDN once, by the first fragment only.
    with open("charts.html", "w", encoding="utf-8") as f:
        f.write('<html><head><meta charset="utf-8" /></head><body>\n')
        for position, fig in enumerate((fig_bar, fig_map, fig_hier, fig_heat)):
            f.write(fig.to_html(full_html=False, include_plotlyjs="cdn" if position == 0 else False))
            f.write("\n")
        f.write("</body></html>\n")

    return "Clustering complete. summaries.json, emb.npy, and charts.html saved."
|
|
@tool()
def label_topics_with_llm(run_key: str) -> str:
    """Sends top 100 topics to Mistral to generate labels, categories, and confidence scores."""
    # NOTE: `@tool` uses the docstring above as the description shown to the
    # agent, so it is kept verbatim. `run_key` is accepted but not read here.
    #
    # Reads summaries.json, keeps the 100 largest clusters by "size", sends
    # their top sentences to the shared `llm` through a JSON output parser and
    # stores the parsed reply in labels.json.
    with open("summaries.json", "r") as handle:
        cluster_summaries = json.load(handle)

    # Largest clusters first; ties keep their order from the file.
    ordered_ids = sorted(
        cluster_summaries,
        key=lambda cid: cluster_summaries[cid]['size'],
        reverse=True,
    )
    payload = {}
    for cid in ordered_ids[:100]:
        payload[cid] = cluster_summaries[cid]['top_sentences']

    json_parser = JsonOutputParser()
    labelling_prompt = PromptTemplate(
        template="For each topic, provide: label (research area name), category, confidence, reasoning, niche (true/false).\nData: {data}\n\n{format_instructions}",
        input_variables=["data"],
        partial_variables={"format_instructions": json_parser.get_format_instructions()},
    )

    pipeline = labelling_prompt | llm | json_parser
    parsed_labels = pipeline.invoke({"data": json.dumps(payload)})

    with open("labels.json", "w") as handle:
        json.dump(parsed_labels, handle)
    return "Labels generated. labels.json saved."
|
|
@tool()
def consolidate_into_themes(run_key: str, theme_map: str) -> str:
    """Recomputes centroids based on merged groups passed by the agent (JSON string)."""
    # NOTE: `@tool` uses the docstring above as the description shown to the
    # agent, so it is kept verbatim. No embedding centroids are actually
    # recomputed: themes are assembled from the per-cluster entries stored in
    # summaries.json. `run_key` is accepted but not read here.
    #
    # theme_map: JSON object mapping theme name -> list of cluster ids (a
    #            single id is also accepted). Ids are matched as strings.
    #
    # Writes themes.json with, per theme, up to five representative sentences,
    # the summed cluster size and the summed per-cluster paper counts. The
    # paper count is an upper bound: a paper that appears in several merged
    # clusters is counted once per cluster.
    #
    # Raises KeyError when a theme references a cluster id that is not present
    # in summaries.json.
    from itertools import chain, zip_longest

    mapping = json.loads(theme_map)
    with open("summaries.json", "r") as f:
        summaries = json.load(f)

    gap = object()  # filler for clusters that run out of sentences

    def merge_clusters(cluster_ids):
        """Combine the stored summaries of `cluster_ids` into one theme entry."""
        # A bare id would otherwise be iterated character by character (str)
        # or fail outright (int).
        if isinstance(cluster_ids, (str, int)):
            cluster_ids = [cluster_ids]
        # De-duplicate while keeping order so a repeated id is not counted twice.
        keys = list(dict.fromkeys(str(cid) for cid in cluster_ids))
        unknown = [key for key in keys if key not in summaries]
        if unknown:
            raise KeyError(f"Unknown cluster id(s) in theme_map: {unknown}")
        members = [summaries[key] for key in keys]

        # Round-robin across the merged clusters so every cluster can contribute
        # to the five kept sentences; plain concatenation followed by [:5]
        # would only ever keep sentences from the first cluster.
        rounds = zip_longest(*(member['top_sentences'] for member in members), fillvalue=gap)
        interleaved = [sent for sent in chain.from_iterable(rounds) if sent is not gap]
        return {
            "top_sentences": interleaved[:5],
            "size": sum(member['size'] for member in members),
            "papers_count": sum(member['papers_count'] for member in members),
        }

    themes = {theme_name: merge_clusters(c_ids) for theme_name, c_ids in mapping.items()}

    with open("themes.json", "w") as f:
        json.dump(themes, f)
    return "Themes consolidated. themes.json saved."
|
|
@tool()
def compare_with_taxonomy(run_key: str) -> str:
    """Sends final themes to Mistral to map against the PAJAIS 25-category list."""
    # NOTE: `@tool` uses the docstring above as the description shown to the
    # agent, so it is kept verbatim. `run_key` is accepted but not read here.
    #
    # Loads themes.json, asks the shared `llm` (through a JSON output parser)
    # to map each theme, and stores the parsed reply in taxonomy_map.json.
    # The prompt names the PAJAIS categories but does not include the list.
    with open("themes.json", "r") as handle:
        theme_data = json.load(handle)

    json_parser = JsonOutputParser()
    format_hint = json_parser.get_format_instructions()
    mapping_prompt = PromptTemplate(
        template="Map these themes to PAJAIS 25 categories. For each theme return: pajais_match (or NOVEL), match_confidence, reasoning, is_novel.\nThemes: {themes}\n\n{format_instructions}",
        input_variables=["themes"],
        partial_variables={"format_instructions": format_hint},
    )

    pipeline = mapping_prompt | llm | json_parser
    mapped = pipeline.invoke({"themes": json.dumps(theme_data)})

    with open("taxonomy_map.json", "w") as handle:
        json.dump(mapped, handle)
    return "Taxonomy mapping complete. taxonomy_map.json saved."
|
|
@tool()
def generate_comparison_csv() -> str:
    """Merges abstract and title runs from themes.json into a side-by-side Pandas DataFrame."""
    # NOTE: `@tool` uses the docstring above as the description shown to the
    # agent, so it is kept verbatim. What the code does today: load
    # themes.json, transpose it, and write the result to comparison.csv.
    # No second run is read or merged.
    themes_frame = pd.read_json("themes.json")
    transposed = themes_frame.transpose()
    transposed.to_csv("comparison.csv")

    return "Comparison CSV generated and saved as comparison.csv."
|
|
@tool()
def export_narrative(run_key: str) -> str:
    """Prompts Mistral to write a 500-word Section 7 literature review."""
    # NOTE: `@tool` uses the docstring above as the description shown to the
    # agent, so it is kept verbatim. `run_key` is accepted but not read here.
    #
    # Feeds the contents of themes.json and taxonomy_map.json (re-serialised
    # as JSON text) into the prompt, invokes the shared `llm`, and writes the
    # reply's `.content` to narrative.txt.
    prompt_inputs = {}
    for variable, path in (("themes", "themes.json"), ("taxonomy", "taxonomy_map.json")):
        with open(path, "r") as handle:
            prompt_inputs[variable] = json.dumps(json.load(handle))

    narrative_prompt = PromptTemplate.from_template(
        "Write a 500-word Section 7 for a literature review paper, referencing methodology, B&C phases, key themes, limitations.\nThemes: {themes}\nTaxonomy: {taxonomy}"
    )

    pipeline = narrative_prompt | llm
    reply = pipeline.invoke(prompt_inputs)

    with open("narrative.txt", "w") as handle:
        handle.write(reply.content)
    return "Narrative exported to narrative.txt."
|
|