Spaces:
Running
Running
Update tools.py
Browse files
tools.py
CHANGED
|
@@ -1,14 +1,21 @@
|
|
| 1 |
"""
|
| 2 |
tools.py - 7 LangChain @tool functions for BERTopic Thematic Analysis Agent.
|
| 3 |
Rules: ZERO if/else, ZERO for/while, ZERO try/except, ZERO PromptTemplate.
|
| 4 |
-
All LLM calls use plain HumanMessage strings.
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
import json
|
| 11 |
import re
|
|
|
|
| 12 |
from pathlib import Path
|
| 13 |
|
| 14 |
import numpy as np
|
|
@@ -26,63 +33,62 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
| 26 |
DATA_DIR = Path("data")
|
| 27 |
DATA_DIR.mkdir(exist_ok=True)
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
return {
|
| 34 |
-
"
|
| 35 |
-
"
|
| 36 |
-
"
|
| 37 |
-
"
|
| 38 |
-
"
|
| 39 |
-
"
|
| 40 |
-
"
|
| 41 |
-
"
|
| 42 |
-
"
|
| 43 |
-
"
|
|
|
|
| 44 |
}
|
| 45 |
|
| 46 |
-
# Shared fallback paths (used when run_config not specified)
|
| 47 |
-
SUMMARIES_PATH = DATA_DIR / "abstract" / "summaries.json"
|
| 48 |
-
THEMES_PATH = DATA_DIR / "abstract" / "themes.json"
|
| 49 |
-
TAXONOMY_PATH = DATA_DIR / "abstract" / "taxonomy.json"
|
| 50 |
-
NARRATIVE_PATH = DATA_DIR / "abstract" / "narrative.txt"
|
| 51 |
-
COMPARISON_PATH = DATA_DIR / "abstract" / "comparison.csv"
|
| 52 |
-
EMB_PATH = DATA_DIR / "abstract" / "emb.npy"
|
| 53 |
-
|
| 54 |
RUN_CONFIGS = {
|
| 55 |
"abstract": ["Abstract"],
|
| 56 |
"title": ["Title"],
|
| 57 |
-
"both": ["Abstract", "Title"],
|
| 58 |
}
|
| 59 |
|
|
|
|
| 60 |
BOILERPLATE_PATTERNS = [
|
| 61 |
-
r"\u00a9",
|
| 62 |
-
r"\\u00a9",
|
| 63 |
-
r"copyright\s*\d{4}",
|
| 64 |
-
r"\d{4}\s+john wiley",
|
| 65 |
-
r"john wiley\s*&\s*sons",
|
| 66 |
-
r"blackwell
|
| 67 |
-
r"blackwell\s+pub",
|
| 68 |
r"wiley\s+periodicals",
|
|
|
|
| 69 |
r"all rights reserved",
|
| 70 |
r"doi\s*:\s*\S+",
|
| 71 |
r"published by elsevier",
|
| 72 |
-
r"elsevier\s
|
| 73 |
-
r"springer\s
|
| 74 |
-
r"taylor\s*&\s*francis",
|
| 75 |
r"informa\s+uk",
|
| 76 |
r"sage\s+publications",
|
| 77 |
-
r"information systems journal",
|
|
|
|
| 78 |
r"this article is",
|
| 79 |
r"rights reserved",
|
| 80 |
r"permission from",
|
| 81 |
r"reproduced with",
|
| 82 |
-
r"^\s*abstract\s*$", # lone word "Abstract"
|
| 83 |
]
|
| 84 |
BOILERPLATE_RE = re.compile("|".join(BOILERPLATE_PATTERNS), re.IGNORECASE)
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
PAJAIS_CATEGORIES = [
|
| 87 |
"Information Systems Theory", "IS Strategy & Governance",
|
| 88 |
"Digital Innovation", "Enterprise Systems",
|
|
@@ -101,203 +107,167 @@ PAJAIS_CATEGORIES = [
|
|
| 101 |
|
| 102 |
|
| 103 |
def safe_read_csv(path):
|
| 104 |
-
"""Read CSV with UTF-8
|
| 105 |
try:
|
| 106 |
return pd.read_csv(path, encoding="utf-8")
|
| 107 |
except UnicodeDecodeError:
|
| 108 |
return pd.read_csv(path, encoding="latin-1")
|
| 109 |
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
def _call_llm_json(llm, prompt: str) -> list:
|
| 112 |
-
"""Call LLM with plain HumanMessage
|
| 113 |
response = llm.invoke([HumanMessage(content=prompt)])
|
| 114 |
raw = response.content.strip()
|
| 115 |
raw = raw.split("```json")[-1].split("```")[0].strip() if "```" in raw else raw
|
| 116 |
return json.loads(raw)
|
| 117 |
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
# =============================================================================
|
| 120 |
-
# TOOL 1
|
| 121 |
-
#
|
| 122 |
-
# Each config saves to its own subdirectory so nothing is overwritten
|
| 123 |
# =============================================================================
|
| 124 |
@tool
|
| 125 |
def load_scopus_csv(csv_path: str, run_config: str = "abstract") -> str:
|
| 126 |
-
"""Load a Scopus CSV
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
never overwrite each other.
|
| 131 |
|
| 132 |
Args:
|
| 133 |
csv_path: Path to the uploaded Scopus CSV file.
|
| 134 |
-
run_config: 'abstract'
|
| 135 |
"""
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
list(RUN_CONFIGS.items())
|
| 139 |
-
if run_config == "both"
|
| 140 |
-
else [(run_config, RUN_CONFIGS.get(run_config, ["Abstract"]))]
|
| 141 |
-
)
|
| 142 |
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
-
|
| 146 |
-
cfg_name, columns = config_pair
|
| 147 |
-
p = _paths(cfg_name)
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
lambda c: c in df_raw.columns, columns
|
| 156 |
-
)))
|
| 157 |
-
|
| 158 |
-
def split_sentences(text):
|
| 159 |
-
raw_sents = re.split(r"(?<=[.!?])\s+", str(text))
|
| 160 |
-
return list(filter(
|
| 161 |
-
lambda s: (
|
| 162 |
-
not BOILERPLATE_RE.search(s)
|
| 163 |
-
and len(s.split()) > 6 # at least 7 words
|
| 164 |
-
and len(s.strip()) > 40 # at least 40 chars
|
| 165 |
-
and not s.strip().startswith("©")
|
| 166 |
-
and "wiley" not in s.lower()
|
| 167 |
-
and "elsevier" not in s.lower()
|
| 168 |
-
and "blackwell" not in s.lower()
|
| 169 |
-
and "springer" not in s.lower()
|
| 170 |
-
and "information systems journal" not in s.lower()
|
| 171 |
-
),
|
| 172 |
-
raw_sents
|
| 173 |
-
))
|
| 174 |
-
|
| 175 |
-
sentences_lists = list(map(split_sentences, df[columns[0]].tolist()))
|
| 176 |
-
all_sentences = [s for lst in sentences_lists for s in lst]
|
| 177 |
-
|
| 178 |
-
stats = {
|
| 179 |
-
"papers": int(len(df)),
|
| 180 |
-
"sentences_after_filter": int(len(all_sentences)),
|
| 181 |
-
"columns_used": columns,
|
| 182 |
-
"csv_path": str(csv_path),
|
| 183 |
-
"run_config": cfg_name,
|
| 184 |
-
}
|
| 185 |
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
return stats
|
| 190 |
|
| 191 |
-
|
|
|
|
| 192 |
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
|
|
|
| 201 |
|
| 202 |
|
| 203 |
# =============================================================================
|
| 204 |
-
# TOOL 2
|
| 205 |
-
#
|
| 206 |
-
# distance_threshold=0.35 gives ~100 topics from 2000+ sentences
|
| 207 |
# =============================================================================
|
| 208 |
@tool
|
| 209 |
def run_bertopic_discovery(top_n_topics: int = 100, run_config: str = "abstract") -> str:
|
| 210 |
-
"""Embed sentences, cluster with AgglomerativeClustering
|
| 211 |
-
targeting ~100 topics.
|
| 212 |
|
| 213 |
Args:
|
| 214 |
-
top_n_topics: Target number of
|
| 215 |
-
run_config:
|
| 216 |
"""
|
| 217 |
-
p =
|
| 218 |
sentences = json.loads(p["sentences"].read_text())
|
| 219 |
|
| 220 |
-
model
|
| 221 |
embeddings = model.encode(
|
| 222 |
-
sentences, normalize_embeddings=True,
|
|
|
|
| 223 |
)
|
| 224 |
np.save(p["emb"], embeddings)
|
| 225 |
|
| 226 |
-
# threshold=0.35 produces many fine-grained clusters (~100 for 2000+ sentences)
|
| 227 |
-
# threshold=0.70 produces fewer broad clusters (~40-60)
|
| 228 |
-
# We use 0.35 to get close to the desired 100 topics
|
| 229 |
clustering = AgglomerativeClustering(
|
| 230 |
-
metric="cosine",
|
| 231 |
-
|
| 232 |
-
distance_threshold=0.35,
|
| 233 |
-
n_clusters=None,
|
| 234 |
)
|
| 235 |
labels = clustering.fit_predict(embeddings)
|
| 236 |
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
retained = list(map(lambda x: x[0], label_sizes_filtered[:top_n_topics]))
|
| 244 |
-
|
| 245 |
-
# Extra boilerplate check for individual sentences
|
| 246 |
-
def is_clean_sentence(s):
|
| 247 |
-
return (
|
| 248 |
-
not BOILERPLATE_RE.search(s)
|
| 249 |
-
and len(s.split()) > 6
|
| 250 |
-
and len(s) > 40
|
| 251 |
-
and not s.strip().startswith("©")
|
| 252 |
-
and "wiley" not in s.lower()
|
| 253 |
-
and "elsevier" not in s.lower()
|
| 254 |
-
and "blackwell" not in s.lower()
|
| 255 |
-
and "springer" not in s.lower()
|
| 256 |
-
and "taylor" not in s.lower()
|
| 257 |
-
and "john wiley" not in s.lower()
|
| 258 |
-
)
|
| 259 |
|
| 260 |
-
def
|
| 261 |
-
seq_id,
|
| 262 |
-
mask
|
| 263 |
cluster_embs = embeddings[mask]
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
top_evidence_raw = list(map(lambda i: cluster_sents_raw[i], top5_idx))
|
| 275 |
-
top_evidence = list(filter(lambda s: s in clean_set, top_evidence_raw))[:5]
|
| 276 |
-
top_evidence = top_evidence if top_evidence else top_evidence_raw[:3]
|
| 277 |
return {
|
| 278 |
-
"topic_id": seq_id,
|
| 279 |
-
"raw_label": int(label), # original cluster label kept for internal use
|
| 280 |
"size": int(mask.sum()),
|
| 281 |
"top_evidence": top_evidence,
|
| 282 |
-
"sentences":
|
| 283 |
"centroid": centroid[0].tolist(),
|
| 284 |
"run_config": run_config,
|
| 285 |
}
|
| 286 |
|
| 287 |
-
#
|
| 288 |
-
|
| 289 |
-
summaries = list(map(
|
| 290 |
p["summaries"].write_text(json.dumps(summaries, indent=2, ensure_ascii=False))
|
| 291 |
|
| 292 |
-
sizes = list(map(lambda s: s["size"],
|
| 293 |
ids = list(map(lambda s: s["topic_id"], summaries))
|
| 294 |
|
| 295 |
-
fig1 = px.bar(x=ids, y=sizes,
|
| 296 |
-
labels={"x": "Topic
|
| 297 |
-
|
| 298 |
-
fig2 = px.histogram(x=sizes, nbins=30, title="Cluster Size Histogram ({})".format(run_config),
|
| 299 |
labels={"x": "Cluster Size"})
|
| 300 |
-
|
| 301 |
centroids = np.array(list(map(lambda s: s["centroid"], summaries)))
|
| 302 |
n_comp = min(2, centroids.shape[0], centroids.shape[1])
|
| 303 |
coords = PCA(n_components=n_comp).fit_transform(centroids)
|
|
@@ -305,14 +275,12 @@ def run_bertopic_discovery(top_n_topics: int = 100, run_config: str = "abstract"
|
|
| 305 |
x=coords[:, 0],
|
| 306 |
y=(coords[:, 1] if coords.shape[1] > 1 else [0] * len(coords)),
|
| 307 |
text=list(map(str, ids)),
|
| 308 |
-
title="Topic Centroids PCA
|
| 309 |
labels={"x": "PC1", "y": "PC2"},
|
| 310 |
)
|
| 311 |
fig4 = px.treemap(
|
| 312 |
-
names=list(map(str, ids)),
|
| 313 |
-
|
| 314 |
-
values=sizes,
|
| 315 |
-
title="Topic Treemap ({})".format(run_config),
|
| 316 |
)
|
| 317 |
|
| 318 |
charts = {
|
|
@@ -327,22 +295,22 @@ def run_bertopic_discovery(top_n_topics: int = 100, run_config: str = "abstract"
|
|
| 327 |
"topics_found": len(summaries),
|
| 328 |
"run_config": run_config,
|
| 329 |
"chart_types": list(charts.keys()),
|
| 330 |
-
"note":
|
| 331 |
})
|
| 332 |
|
| 333 |
|
| 334 |
# =============================================================================
|
| 335 |
-
# TOOL 3
|
| 336 |
# =============================================================================
|
| 337 |
@tool
|
| 338 |
def label_topics_with_llm(batch_size: int = 20, run_config: str = "abstract") -> str:
|
| 339 |
-
"""
|
| 340 |
|
| 341 |
Args:
|
| 342 |
-
batch_size: Topics per LLM
|
| 343 |
-
run_config:
|
| 344 |
"""
|
| 345 |
-
p
|
| 346 |
summaries = json.loads(p["summaries"].read_text())
|
| 347 |
top_summaries = summaries[:100]
|
| 348 |
llm = ChatMistralAI(model="mistral-large-latest", temperature=0.2)
|
|
@@ -355,14 +323,13 @@ def label_topics_with_llm(batch_size: int = 20, run_config: str = "abstract") ->
|
|
| 355 |
batch
|
| 356 |
))
|
| 357 |
prompt = (
|
| 358 |
-
"You are a thematic analysis expert
|
| 359 |
-
"
|
| 360 |
-
"
|
| 361 |
"TOPICS:\n" + json.dumps(mini, indent=2) + "\n\n"
|
| 362 |
-
"Return ONLY a
|
| 363 |
-
"Each element
|
| 364 |
-
"
|
| 365 |
-
"No markdown fences, no explanation, just the raw JSON array."
|
| 366 |
)
|
| 367 |
return _call_llm_json(llm, prompt)
|
| 368 |
|
|
@@ -383,32 +350,32 @@ def label_topics_with_llm(batch_size: int = 20, run_config: str = "abstract") ->
|
|
| 383 |
|
| 384 |
|
| 385 |
# =============================================================================
|
| 386 |
-
# TOOL 4
|
| 387 |
# =============================================================================
|
| 388 |
@tool
|
| 389 |
def consolidate_into_themes(approved_groups: str, run_config: str = "abstract") -> str:
|
| 390 |
-
"""Merge
|
| 391 |
|
| 392 |
Args:
|
| 393 |
-
approved_groups: JSON
|
| 394 |
-
run_config:
|
| 395 |
"""
|
| 396 |
-
p =
|
| 397 |
groups = json.loads(approved_groups)
|
| 398 |
summaries = json.loads(p["summaries"].read_text())
|
| 399 |
-
|
| 400 |
|
| 401 |
def build_theme(group):
|
| 402 |
ids = group["topic_ids"]
|
| 403 |
-
members = list(map(lambda tid:
|
| 404 |
-
|
| 405 |
centroids = np.array(list(map(lambda ms: ms["centroid"], members)))
|
| 406 |
return {
|
| 407 |
"theme_name": group["theme_name"],
|
| 408 |
"topic_ids": ids,
|
| 409 |
-
"sentences":
|
| 410 |
"centroid": centroids.mean(axis=0).tolist(),
|
| 411 |
-
"paper_count": len(set(
|
| 412 |
"run_config": run_config,
|
| 413 |
}
|
| 414 |
|
|
@@ -418,20 +385,21 @@ def consolidate_into_themes(approved_groups: str, run_config: str = "abstract")
|
|
| 418 |
"themes_created": len(themes),
|
| 419 |
"theme_names": list(map(lambda t: t["theme_name"], themes)),
|
| 420 |
"run_config": run_config,
|
|
|
|
| 421 |
})
|
| 422 |
|
| 423 |
|
| 424 |
# =============================================================================
|
| 425 |
-
# TOOL 5
|
| 426 |
# =============================================================================
|
| 427 |
@tool
|
| 428 |
def compare_with_taxonomy(run_config: str = "abstract") -> str:
|
| 429 |
-
"""Map
|
| 430 |
|
| 431 |
Args:
|
| 432 |
-
run_config:
|
| 433 |
"""
|
| 434 |
-
p =
|
| 435 |
themes = json.loads(p["themes"].read_text())
|
| 436 |
llm = ChatMistralAI(model="mistral-large-latest", temperature=0.1)
|
| 437 |
|
|
@@ -439,107 +407,152 @@ def compare_with_taxonomy(run_config: str = "abstract") -> str:
|
|
| 439 |
lambda t: {"name": t["theme_name"], "sample": t["sentences"][:2]},
|
| 440 |
themes
|
| 441 |
))
|
| 442 |
-
|
| 443 |
prompt = (
|
| 444 |
"You are a research classification expert in Information Systems.\n\n"
|
| 445 |
-
"Map each
|
| 446 |
-
"PAJAIS (Pacific Asia Journal of the Association for Information Systems) category.\n\n"
|
| 447 |
"THEMES:\n" + json.dumps(theme_mini, indent=2) + "\n\n"
|
| 448 |
"PAJAIS CATEGORIES:\n" + json.dumps(PAJAIS_CATEGORIES, indent=2) + "\n\n"
|
| 449 |
-
"Return ONLY a
|
| 450 |
-
"Each element
|
| 451 |
-
"
|
| 452 |
-
"No markdown fences, no explanation, just the raw JSON array."
|
| 453 |
)
|
| 454 |
-
|
| 455 |
result = _call_llm_json(llm, prompt)
|
| 456 |
p["taxonomy"].write_text(json.dumps(result, indent=2, ensure_ascii=False))
|
| 457 |
-
return json.dumps({
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
|
| 459 |
|
| 460 |
# =============================================================================
|
| 461 |
-
# TOOL 6
|
| 462 |
-
#
|
|
|
|
| 463 |
# =============================================================================
|
| 464 |
@tool
|
| 465 |
-
def generate_comparison_csv(
|
| 466 |
-
"""Generate Title | Abstract | Year | Source
|
| 467 |
|
| 468 |
-
|
| 469 |
-
|
| 470 |
"""
|
| 471 |
-
|
| 472 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
|
| 474 |
-
# Detect columns robustly
|
| 475 |
title_col = next(filter(lambda c: c.strip().lower() == "title", df.columns), None)
|
| 476 |
abstract_col = next(filter(lambda c: c.strip().lower() == "abstract", df.columns), None)
|
| 477 |
year_col = next(filter(lambda c: c.strip().lower() == "year", df.columns), None)
|
| 478 |
journal_col = next(filter(lambda c: "source" in c.lower(), df.columns), None)
|
| 479 |
|
| 480 |
-
# Build ordered column list: Title, Abstract, Year, Source title
|
| 481 |
ordered = [title_col, abstract_col, year_col, journal_col]
|
| 482 |
present = list(filter(lambda c: c is not None and c in df.columns, ordered))
|
|
|
|
| 483 |
|
| 484 |
-
out_df = df[present].copy()
|
| 485 |
-
# Rename columns to clean standard names
|
| 486 |
rename_map = {
|
| 487 |
title_col: "Title",
|
| 488 |
abstract_col: "Abstract",
|
| 489 |
year_col: "Year",
|
| 490 |
journal_col: "Source Journal",
|
| 491 |
}
|
| 492 |
-
out_df = out_df.rename(
|
| 493 |
-
|
| 494 |
-
|
|
|
|
|
|
|
| 495 |
return json.dumps({
|
| 496 |
-
"rows":
|
| 497 |
-
"columns":
|
| 498 |
-
"path":
|
| 499 |
-
"
|
| 500 |
})
|
| 501 |
|
| 502 |
|
| 503 |
# =============================================================================
|
| 504 |
-
# TOOL 7
|
|
|
|
| 505 |
# =============================================================================
|
| 506 |
@tool
|
| 507 |
-
def export_narrative(
|
| 508 |
-
"""
|
| 509 |
|
| 510 |
-
|
| 511 |
-
|
| 512 |
"""
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 522 |
|
| 523 |
prompt = (
|
| 524 |
"You are an academic writing expert in Information Systems.\n\n"
|
| 525 |
"Write Section 7 (Discussion and Thematic Synthesis) of a systematic "
|
| 526 |
-
"literature review paper.
|
| 527 |
-
"Cover
|
| 528 |
-
"(a) Overview of
|
| 529 |
-
"(b)
|
| 530 |
-
"(c)
|
| 531 |
-
"(d)
|
| 532 |
-
"
|
| 533 |
-
"
|
| 534 |
-
"
|
| 535 |
-
"
|
|
|
|
|
|
|
|
|
|
| 536 |
)
|
| 537 |
|
| 538 |
response = llm.invoke([HumanMessage(content=prompt)])
|
| 539 |
narrative_text = response.content
|
| 540 |
-
|
|
|
|
| 541 |
return json.dumps({
|
| 542 |
"word_count": len(narrative_text.split()),
|
| 543 |
-
"path": str(
|
| 544 |
-
"
|
| 545 |
})
|
|
|
|
| 1 |
"""
|
| 2 |
tools.py - 7 LangChain @tool functions for BERTopic Thematic Analysis Agent.
|
| 3 |
Rules: ZERO if/else, ZERO for/while, ZERO try/except, ZERO PromptTemplate.
|
| 4 |
+
All LLM calls use plain HumanMessage strings directly.
|
| 5 |
+
|
| 6 |
+
Workflow:
|
| 7 |
+
- Abstract run saves to data/abstract/
|
| 8 |
+
- Title run saves to data/title/
|
| 9 |
+
- Comparison CSV + narrative only generated when BOTH runs are complete
|
| 10 |
+
- Topic IDs are sequential 1..N (not raw cluster labels)
|
| 11 |
+
- Boilerplate filter catches © symbol, all major publishers
|
| 12 |
"""
|
| 13 |
|
| 14 |
from __future__ import annotations
|
| 15 |
|
| 16 |
import json
|
| 17 |
import re
|
| 18 |
+
import shutil
|
| 19 |
from pathlib import Path
|
| 20 |
|
| 21 |
import numpy as np
|
|
|
|
| 33 |
# Root directory (relative to the working directory) under which the tools
# in this module persist their files; created at import time if missing.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)
|
| 35 |
|
| 36 |
+
def _p(run_config: str) -> dict:
    """Build the table of artefact paths for one run configuration.

    The run's own directory (``DATA_DIR / run_config``) is created on demand,
    including missing parents. Every entry lives inside that directory except
    ``"comparison"``, which points at a single file directly under
    ``DATA_DIR`` and is therefore the same path for every run.

    Args:
        run_config: Name of the run; used verbatim as the sub-directory name.

    Returns:
        Mapping from artefact key to ``Path``; ``"dir"`` is the run directory.
    """
    run_dir = DATA_DIR / run_config
    run_dir.mkdir(parents=True, exist_ok=True)

    # File names of the per-run artefacts, in the order callers see the keys.
    per_run_files = {
        "sentences": "sentences.json",
        "stats": "stats.json",
        "papers": "papers.csv",
        "emb": "emb.npy",
        "summaries": "summaries.json",
        "charts": "charts.json",
        "themes": "themes.json",
        "taxonomy": "taxonomy.json",
        "narrative": "narrative.txt",
    }

    paths = {"dir": run_dir}
    paths.update({key: run_dir / filename for key, filename in per_run_files.items()})
    paths["comparison"] = DATA_DIR / "comparison.csv"  # shared output
    return paths
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
# Maps a run_config name to the list of CSV column names that hold the text
# for that run. load_scopus_csv looks the name up here (falling back to
# ["Abstract"]) and splits the first listed column into sentences.
RUN_CONFIGS = {
    "abstract": ["Abstract"],
    "title": ["Title"],
}
|
| 58 |
|
| 59 |
+
# Comprehensive boilerplate filter — catches © symbol + all major publishers.
# Each entry is a regex fragment; the fragments are OR-ed together below and
# matched case-insensitively anywhere in a sentence (via .search in _is_clean).
BOILERPLATE_PATTERNS = [
    r"\u00a9",                                   # the © character itself
    r"\\u00a9",                                  # the literal text "\u00a9" (backslash + u00a9)
    r"copyright\s*\d{4}",                        # "copyright" followed by a 4-digit year
    r"\d{4}\s+john wiley",
    r"john wiley\s*(&|and)\s*sons",
    r"blackwell\s*(publishing|pub)",
    r"wiley\s+periodicals",
    r"wiley\s+online",
    r"all rights reserved",
    r"doi\s*:\s*\S+",                            # "doi:" plus the identifier that follows
    r"published by elsevier",
    r"elsevier\s*(b\.v|inc|ltd|science)",
    r"springer\s*(nature|verlag|science|link)",
    r"taylor\s*(&|and)\s*francis",
    r"informa\s+uk",
    r"sage\s+publications",
    r"information systems journal\s+published",
    r"emerald\s+(publishing|group)",
    r"this article is",
    r"rights reserved",                          # also covers "all rights reserved" above
    r"permission from",
    r"reproduced with",
]
# Single compiled alternation of every fragment above, case-insensitive.
BOILERPLATE_RE = re.compile("|".join(BOILERPLATE_PATTERNS), re.IGNORECASE)
|
| 85 |
|
| 86 |
+
# Publisher / journal names checked per sentence as plain lowercase
# substrings, in addition to the regex filter.
PUBLISHER_KEYWORDS = frozenset({
    "wiley",
    "elsevier",
    "blackwell",
    "springer",
    "taylor",
    "information systems journal",
    "emerald",
})
|
| 91 |
+
|
| 92 |
PAJAIS_CATEGORIES = [
|
| 93 |
"Information Systems Theory", "IS Strategy & Governance",
|
| 94 |
"Digital Innovation", "Enterprise Systems",
|
|
|
|
| 107 |
|
| 108 |
|
| 109 |
def safe_read_csv(path):
    """Load a CSV into a DataFrame, decoding as UTF-8 and retrying as latin-1.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    try:
        frame = pd.read_csv(path, encoding="utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8: latin-1 maps every byte to a character, so this
        # second attempt cannot fail on decoding.
        frame = pd.read_csv(path, encoding="latin-1")
    return frame
|
| 115 |
|
| 116 |
|
| 117 |
+
def _is_clean(s: str) -> bool:
    """Decide whether a sentence is worth keeping for topic modelling.

    A sentence is kept only when it has more than six whitespace-separated
    words, is longer than 40 characters once stripped, does not match
    ``BOILERPLATE_RE``, does not start with the © sign, and contains none of
    the ``PUBLISHER_KEYWORDS`` (compared in lowercase).
    """
    stripped = s.strip()
    lowered = s.lower().strip()

    long_enough = len(s.split()) > 6 and len(stripped) > 40
    # "\u00a9" is the © character.
    has_boilerplate = bool(BOILERPLATE_RE.search(s)) or stripped.startswith("\u00a9")
    names_publisher = any(keyword in lowered for keyword in PUBLISHER_KEYWORDS)

    return long_enough and not has_boilerplate and not names_publisher
|
| 128 |
+
|
| 129 |
+
|
| 130 |
def _call_llm_json(llm, prompt: str) -> list:
    """Call LLM with plain HumanMessage, strip markdown fences, parse JSON.

    Accepted reply shapes:
      * bare JSON;
      * JSON inside a ```json fence (the text after the LAST such tag is used);
      * JSON inside an untagged ``` fence, with or without a prose preamble;
      * JSON followed by an unrelated fenced block (the leading JSON is used).

    Args:
        llm: Object exposing ``invoke(messages)`` whose result has a string
            ``content`` attribute.
        prompt: Full prompt text, sent as a single HumanMessage.

    Returns:
        The value decoded by ``json.loads`` (callers expect a list).

    Raises:
        json.JSONDecodeError: If the extracted text is not valid JSON.
    """
    response = llm.invoke([HumanMessage(content=prompt)])
    raw = response.content.strip()

    # Split around the last ```json tag when there is one, otherwise around
    # the first plain fence. With no fence at all, partition() returns
    # (raw, "", ""), so `fence` is empty and `head` is the whole reply.
    head, fence, tail = (
        raw.rpartition("```json") if "```json" in raw else raw.partition("```")
    )
    inside = tail.split("```")[0].strip()  # text up to the closing fence

    # Untagged fence only: if the text before it already looks like JSON,
    # keep using that text; otherwise the JSON is inside the fence.
    head_is_json = fence == "```" and head.lstrip().startswith(("[", "{"))
    payload = inside if fence and not head_is_json else head.strip()
    return json.loads(payload)
|
| 136 |
|
| 137 |
|
| 138 |
+
def _both_runs_complete() -> bool:
    """Report whether a themes file exists for the abstract AND the title run.

    Checks the runs in that order and stops at the first missing file.
    """
    return all(_p(run_name)["themes"].exists() for run_name in ("abstract", "title"))
|
| 144 |
+
|
| 145 |
+
|
| 146 |
# =============================================================================
|
| 147 |
+
# TOOL 1 — load_scopus_csv
|
| 148 |
+
# Saves to data/uploaded.csv (permanent copy) AND data/{run_config}/papers.csv
|
|
|
|
| 149 |
# =============================================================================
|
| 150 |
@tool
def load_scopus_csv(csv_path: str, run_config: str = "abstract") -> str:
    """Load a Scopus CSV, filter boilerplate sentences, save per run_config.

    Saves sentences to data/{run_config}/sentences.json.
    Also copies the CSV permanently to data/uploaded.csv.

    Args:
        csv_path: Path to the uploaded Scopus CSV file.
        run_config: 'abstract' or 'title' (default 'abstract').

    Returns:
        JSON string with keys papers, sentences_after_filter, columns_used
        and run_config.
    """
    p = _p(run_config)
    columns = RUN_CONFIGS.get(run_config, ["Abstract"])

    # Copy CSV to permanent location only if it is a different file:
    # shutil.copy raises SameFileError when source and destination coincide.
    dest = DATA_DIR / "uploaded.csv"
    src = Path(csv_path).resolve()
    dst = dest.resolve()
    _ = shutil.copy(str(src), str(dst)) if src != dst else None

    df_raw = safe_read_csv(dest)

    # The text column can also appear in the metadata list (e.g. "Title" for
    # the title run). Selecting the same label twice yields duplicate columns,
    # after which df["Title"] is a DataFrame and .tolist() fails, so
    # de-duplicate while preserving order.
    wanted_cols = list(dict.fromkeys(
        columns + ["Title", "Year", "Source title", "Cited by"]
    ))
    present_cols = list(filter(lambda c: c in df_raw.columns, wanted_cols))
    text_cols = list(filter(lambda c: c in df_raw.columns, columns))
    # Drop papers whose text column is empty.
    df = df_raw[present_cols].dropna(subset=text_cols)

    def split_sentences(text):
        # Split after sentence-ending punctuation followed by whitespace,
        # then keep only sentences that pass the quality filter.
        parts = re.split(r"(?<=[.!?])\s+", str(text))
        return list(filter(_is_clean, parts))

    sentences_lists = list(map(split_sentences, df[columns[0]].tolist()))
    all_sentences = [s for lst in sentences_lists for s in lst]

    stats = {
        "papers": int(len(df)),
        "sentences_after_filter": int(len(all_sentences)),
        "columns_used": columns,
        "run_config": run_config,
    }

    p["sentences"].write_text(json.dumps(all_sentences, ensure_ascii=False))
    p["stats"].write_text(json.dumps(stats, ensure_ascii=False))
    df.to_csv(p["papers"], index=False)

    return json.dumps(stats)
|
| 198 |
|
| 199 |
|
| 200 |
# =============================================================================
|
| 201 |
+
# TOOL 2 — run_bertopic_discovery
|
| 202 |
+
# threshold=0.35 → ~100 fine-grained clusters; IDs renumbered 1..N
|
|
|
|
| 203 |
# =============================================================================
|
| 204 |
@tool
|
| 205 |
def run_bertopic_discovery(top_n_topics: int = 100, run_config: str = "abstract") -> str:
|
| 206 |
+
"""Embed sentences with all-MiniLM-L6-v2, cluster with AgglomerativeClustering
|
| 207 |
+
(cosine, threshold=0.35) targeting ~100 topics. Topic IDs are sequential 1..N.
|
| 208 |
|
| 209 |
Args:
|
| 210 |
+
top_n_topics: Target number of clusters (default 100).
|
| 211 |
+
run_config: 'abstract' or 'title' (default 'abstract').
|
| 212 |
"""
|
| 213 |
+
p = _p(run_config)
|
| 214 |
sentences = json.loads(p["sentences"].read_text())
|
| 215 |
|
| 216 |
+
model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 217 |
embeddings = model.encode(
|
| 218 |
+
sentences, normalize_embeddings=True,
|
| 219 |
+
show_progress_bar=False, batch_size=64
|
| 220 |
)
|
| 221 |
np.save(p["emb"], embeddings)
|
| 222 |
|
|
|
|
|
|
|
|
|
|
| 223 |
clustering = AgglomerativeClustering(
|
| 224 |
+
metric="cosine", linkage="average",
|
| 225 |
+
distance_threshold=0.35, n_clusters=None,
|
|
|
|
|
|
|
| 226 |
)
|
| 227 |
labels = clustering.fit_predict(embeddings)
|
| 228 |
|
| 229 |
+
all_labels = sorted(set(labels.tolist()))
|
| 230 |
+
label_sizes = list(map(lambda lb: (lb, int((labels == lb).sum())), all_labels))
|
| 231 |
+
# Keep clusters with ≥3 sentences, sort by size desc, take top N
|
| 232 |
+
label_filtered = list(filter(lambda x: x[1] >= 3, label_sizes))
|
| 233 |
+
label_sorted = sorted(label_filtered, key=lambda x: -x[1])
|
| 234 |
+
retained = list(map(lambda x: x[0], label_sorted[:top_n_topics]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
|
| 236 |
+
def build_summary(seq_label):
|
| 237 |
+
seq_id, raw_label = seq_label
|
| 238 |
+
mask = labels == raw_label
|
| 239 |
cluster_embs = embeddings[mask]
|
| 240 |
+
raw_sents = [sentences[i] for i, m in enumerate(mask.tolist()) if m]
|
| 241 |
+
clean_sents = list(filter(_is_clean, raw_sents))
|
| 242 |
+
sents = clean_sents if clean_sents else raw_sents[:5]
|
| 243 |
+
centroid = cluster_embs.mean(axis=0, keepdims=True)
|
| 244 |
+
sims = cosine_similarity(centroid, cluster_embs)[0]
|
| 245 |
+
top5_idx = sims.argsort()[-5:][::-1].tolist()
|
| 246 |
+
raw_top = list(map(lambda i: raw_sents[i], top5_idx))
|
| 247 |
+
clean_set = set(sents)
|
| 248 |
+
top_evidence = list(filter(lambda s: s in clean_set, raw_top))[:5]
|
| 249 |
+
top_evidence = top_evidence if top_evidence else raw_top[:3]
|
|
|
|
|
|
|
|
|
|
| 250 |
return {
|
| 251 |
+
"topic_id": seq_id,
|
|
|
|
| 252 |
"size": int(mask.sum()),
|
| 253 |
"top_evidence": top_evidence,
|
| 254 |
+
"sentences": sents,
|
| 255 |
"centroid": centroid[0].tolist(),
|
| 256 |
"run_config": run_config,
|
| 257 |
}
|
| 258 |
|
| 259 |
+
# Sequential IDs starting at 1
|
| 260 |
+
seq_pairs = list(map(lambda x: (x[0] + 1, x[1]), enumerate(retained)))
|
| 261 |
+
summaries = list(map(build_summary, seq_pairs))
|
| 262 |
p["summaries"].write_text(json.dumps(summaries, indent=2, ensure_ascii=False))
|
| 263 |
|
| 264 |
+
sizes = list(map(lambda s: s["size"], summaries))
|
| 265 |
ids = list(map(lambda s: s["topic_id"], summaries))
|
| 266 |
|
| 267 |
+
fig1 = px.bar(x=ids, y=sizes, title="Topic Sizes — {}".format(run_config),
|
| 268 |
+
labels={"x": "Topic #", "y": "Sentences"})
|
| 269 |
+
fig2 = px.histogram(x=sizes, nbins=30, title="Size Distribution — {}".format(run_config),
|
|
|
|
| 270 |
labels={"x": "Cluster Size"})
|
|
|
|
| 271 |
centroids = np.array(list(map(lambda s: s["centroid"], summaries)))
|
| 272 |
n_comp = min(2, centroids.shape[0], centroids.shape[1])
|
| 273 |
coords = PCA(n_components=n_comp).fit_transform(centroids)
|
|
|
|
| 275 |
x=coords[:, 0],
|
| 276 |
y=(coords[:, 1] if coords.shape[1] > 1 else [0] * len(coords)),
|
| 277 |
text=list(map(str, ids)),
|
| 278 |
+
title="Topic Centroids PCA — {}".format(run_config),
|
| 279 |
labels={"x": "PC1", "y": "PC2"},
|
| 280 |
)
|
| 281 |
fig4 = px.treemap(
|
| 282 |
+
names=list(map(str, ids)), parents=["Topics"] * len(ids),
|
| 283 |
+
values=sizes, title="Treemap — {}".format(run_config),
|
|
|
|
|
|
|
| 284 |
)
|
| 285 |
|
| 286 |
charts = {
|
|
|
|
| 295 |
"topics_found": len(summaries),
|
| 296 |
"run_config": run_config,
|
| 297 |
"chart_types": list(charts.keys()),
|
| 298 |
+
"note": "Topics numbered 1..{}, threshold=0.35".format(len(summaries)),
|
| 299 |
})
|
| 300 |
|
| 301 |
|
| 302 |
# =============================================================================
|
| 303 |
+
# TOOL 3 — label_topics_with_llm
|
| 304 |
# =============================================================================
|
| 305 |
@tool
|
| 306 |
def label_topics_with_llm(batch_size: int = 20, run_config: str = "abstract") -> str:
|
| 307 |
+
"""Label topic clusters with human-readable names via Mistral LLM.
|
| 308 |
|
| 309 |
Args:
|
| 310 |
+
batch_size: Topics per LLM call (default 20).
|
| 311 |
+
run_config: 'abstract' or 'title' (default 'abstract').
|
| 312 |
"""
|
| 313 |
+
p = _p(run_config)
|
| 314 |
summaries = json.loads(p["summaries"].read_text())
|
| 315 |
top_summaries = summaries[:100]
|
| 316 |
llm = ChatMistralAI(model="mistral-large-latest", temperature=0.2)
|
|
|
|
| 323 |
batch
|
| 324 |
))
|
| 325 |
prompt = (
|
| 326 |
+
"You are a thematic analysis expert in Information Systems research.\n"
|
| 327 |
+
"Label each topic cluster with a concise 3-6 word academic label "
|
| 328 |
+
"and one-sentence reasoning.\n\n"
|
| 329 |
"TOPICS:\n" + json.dumps(mini, indent=2) + "\n\n"
|
| 330 |
+
"Return ONLY a raw JSON array. "
|
| 331 |
+
"Each element: topic_id (integer), label (string), reasoning (string). "
|
| 332 |
+
"No markdown, no explanation."
|
|
|
|
| 333 |
)
|
| 334 |
return _call_llm_json(llm, prompt)
|
| 335 |
|
|
|
|
| 350 |
|
| 351 |
|
| 352 |
# =============================================================================
|
| 353 |
+
# TOOL 4 — consolidate_into_themes
|
| 354 |
# =============================================================================
|
| 355 |
@tool
|
| 356 |
def consolidate_into_themes(approved_groups: str, run_config: str = "abstract") -> str:
|
| 357 |
+
"""Merge approved topic groups into themes and recompute centroids.
|
| 358 |
|
| 359 |
Args:
|
| 360 |
+
approved_groups: JSON list [{theme_name: str, topic_ids: [int,...]}]
|
| 361 |
+
run_config: 'abstract' or 'title' (default 'abstract').
|
| 362 |
"""
|
| 363 |
+
p = _p(run_config)
|
| 364 |
groups = json.loads(approved_groups)
|
| 365 |
summaries = json.loads(p["summaries"].read_text())
|
| 366 |
+
id_map = {s["topic_id"]: s for s in summaries}
|
| 367 |
|
| 368 |
def build_theme(group):
|
| 369 |
ids = group["topic_ids"]
|
| 370 |
+
members = list(map(lambda tid: id_map[tid], ids))
|
| 371 |
+
sents = [s for ms in members for s in ms.get("sentences", [])]
|
| 372 |
centroids = np.array(list(map(lambda ms: ms["centroid"], members)))
|
| 373 |
return {
|
| 374 |
"theme_name": group["theme_name"],
|
| 375 |
"topic_ids": ids,
|
| 376 |
+
"sentences": sents,
|
| 377 |
"centroid": centroids.mean(axis=0).tolist(),
|
| 378 |
+
"paper_count": len(set(sents)),
|
| 379 |
"run_config": run_config,
|
| 380 |
}
|
| 381 |
|
|
|
|
| 385 |
"themes_created": len(themes),
|
| 386 |
"theme_names": list(map(lambda t: t["theme_name"], themes)),
|
| 387 |
"run_config": run_config,
|
| 388 |
+
"both_complete": _both_runs_complete(),
|
| 389 |
})
|
| 390 |
|
| 391 |
|
| 392 |
# =============================================================================
|
| 393 |
+
# TOOL 5 — compare_with_taxonomy
|
| 394 |
# =============================================================================
|
| 395 |
@tool
|
| 396 |
def compare_with_taxonomy(run_config: str = "abstract") -> str:
|
| 397 |
+
"""Map themes to PAJAIS 25 categories via Mistral LLM.
|
| 398 |
|
| 399 |
Args:
|
| 400 |
+
run_config: 'abstract' or 'title' (default 'abstract').
|
| 401 |
"""
|
| 402 |
+
p = _p(run_config)
|
| 403 |
themes = json.loads(p["themes"].read_text())
|
| 404 |
llm = ChatMistralAI(model="mistral-large-latest", temperature=0.1)
|
| 405 |
|
|
|
|
| 407 |
lambda t: {"name": t["theme_name"], "sample": t["sentences"][:2]},
|
| 408 |
themes
|
| 409 |
))
|
|
|
|
| 410 |
prompt = (
|
| 411 |
"You are a research classification expert in Information Systems.\n\n"
|
| 412 |
+
"Map each theme to the single most relevant PAJAIS category.\n\n"
|
|
|
|
| 413 |
"THEMES:\n" + json.dumps(theme_mini, indent=2) + "\n\n"
|
| 414 |
"PAJAIS CATEGORIES:\n" + json.dumps(PAJAIS_CATEGORIES, indent=2) + "\n\n"
|
| 415 |
+
"Return ONLY a raw JSON array. "
|
| 416 |
+
"Each element: name, pajais_category, confidence, rationale. "
|
| 417 |
+
"No markdown, no explanation."
|
|
|
|
| 418 |
)
|
|
|
|
| 419 |
result = _call_llm_json(llm, prompt)
|
| 420 |
p["taxonomy"].write_text(json.dumps(result, indent=2, ensure_ascii=False))
|
| 421 |
+
return json.dumps({
|
| 422 |
+
"mapped_themes": len(result),
|
| 423 |
+
"run_config": run_config,
|
| 424 |
+
"both_complete": _both_runs_complete(),
|
| 425 |
+
})
|
| 426 |
|
| 427 |
|
| 428 |
# =============================================================================
|
| 429 |
+
# TOOL 6 — generate_comparison_csv
|
| 430 |
+
# ONLY runs when BOTH abstract and title runs are complete
|
| 431 |
+
# Columns: Title | Abstract | Year | Source Journal
|
| 432 |
# =============================================================================
|
| 433 |
@tool
def generate_comparison_csv() -> str:
    """Generate Title | Abstract | Year | Source Journal comparison CSV.

    Only available after BOTH abstract and title runs have completed themes.
    Saves to data/comparison.csv.

    Returns:
        The JSON summary produced by ``_do_generate_comparison_csv`` when the
        themes files of both runs exist; otherwise a plain-text status message
        naming the run(s) that still have to be completed.
    """
    abs_complete = _p("abstract")["themes"].exists()
    title_complete = _p("title")["themes"].exists()

    # Name only the runs that are actually missing, so the hint stays correct
    # when it is the abstract run (not the title run) that has not finished.
    # NOTE(review): assumes 'run abstract' is a valid command mirroring the
    # existing 'run title' hint — confirm against the agent prompt.
    pending = [
        name
        for name, done in (("abstract", abs_complete), ("title", title_complete))
        if not done
    ]
    commands = " and ".join(map("'run {}'".format, pending))
    status_msg = (
        "Abstract complete: {}, Title complete: {}. "
        "Run {} to complete the {} analysis first."
    ).format(abs_complete, title_complete, commands, " and ".join(pending))

    # Conditional expression instead of an if/else statement (file rule);
    # the CSV is only generated when both runs are complete.
    return (
        _do_generate_comparison_csv()
        if (abs_complete and title_complete)
        else status_msg
    )
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
def _do_generate_comparison_csv() -> str:
    """Internal: build data/comparison.csv once both runs are done.

    Reads ``data/uploaded.csv``, locates the title, abstract and year columns
    (case-insensitive exact match after stripping) and the first column whose
    name contains "source", keeps the ones that exist in that order, renames
    them to Title / Abstract / Year / Source Journal and writes the result
    with the ``utf-8-sig`` encoding.

    Returns:
        JSON string with the row count, the output column names, the output
        path and a completion note.
    """
    frame = safe_read_csv(DATA_DIR / "uploaded.csv")

    def first_match(predicate):
        # First column name satisfying the predicate, or None when absent.
        return next(filter(predicate, frame.columns), None)

    def exactly(target):
        return lambda col: col.strip().lower() == target

    # (source column or None, output column name), in output order.
    candidates = [
        (first_match(exactly("title")), "Title"),
        (first_match(exactly("abstract")), "Abstract"),
        (first_match(exactly("year")), "Year"),
        (first_match(lambda col: "source" in col.lower()), "Source Journal"),
    ]
    located = [(source, label) for source, label in candidates if source is not None]

    selected = frame[[source for source, _ in located]].copy()
    selected = selected.rename(columns=dict(located))

    target_path = DATA_DIR / "comparison.csv"
    selected.to_csv(target_path, index=False, encoding="utf-8-sig")

    report = {
        "rows": len(selected),
        "columns": list(selected.columns),
        "path": str(target_path),
        "note": "Both runs complete — comparison CSV generated",
    }
    return json.dumps(report)
|
| 487 |
|
| 488 |
|
| 489 |
# =============================================================================
|
| 490 |
+
# TOOL 7 — export_narrative
|
| 491 |
+
# ONLY runs when BOTH abstract and title runs are complete
|
| 492 |
# =============================================================================
|
| 493 |
@tool
def export_narrative() -> str:
    """Write a 500-word Section 7 narrative using themes from BOTH runs.

    Only available after BOTH abstract and title runs have completed taxonomy mapping.
    Saves to data/narrative.txt.

    Returns:
        The JSON summary from ``_do_export_narrative`` when both taxonomy
        files exist; otherwise a plain-text message reporting which taxonomy
        files are present.
    """
    abstract_ready = _p("abstract")["taxonomy"].exists()
    title_ready = _p("title")["taxonomy"].exists()

    not_ready_template = (
        "Narrative cannot be generated yet. "
        "Abstract taxonomy complete: {}. Title taxonomy complete: {}. "
        "Complete both runs through Phase 5.5 first."
    )

    # Conditional expression keeps the file's "no if/else statements" rule;
    # the narrative is only generated when both taxonomy files are on disk.
    return (
        _do_export_narrative()
        if abstract_ready and title_ready
        else not_ready_template.format(abstract_ready, title_ready)
    )
|
| 517 |
|
| 518 |
+
|
| 519 |
+
def _do_export_narrative() -> str:
    """Internal: generate the narrative once both runs are done.

    Loads the themes and taxonomy JSON files of the abstract and title runs,
    asks the Mistral chat model for a ~500-word Section 7 discussion built
    from theme names, per-theme sentence counts and both PAJAIS mappings, and
    writes the model's reply to data/narrative.txt as UTF-8.

    Returns:
        JSON string with the narrative word count, the output path and a note.
    """

    def load(run, key):
        # Parse the JSON artefact `key` of the given run configuration.
        return json.loads(_p(run)[key].read_text())

    def overview(themes):
        # Reduce each theme to its name and number of sentences for the prompt.
        return [
            {"name": theme["theme_name"], "sentences": len(theme["sentences"])}
            for theme in themes
        ]

    def section(heading, payload):
        return heading + json.dumps(payload, indent=2) + "\n\n"

    abstract_themes = load("abstract", "themes")
    title_themes = load("title", "themes")
    abstract_mapping = load("abstract", "taxonomy")
    title_mapping = load("title", "taxonomy")
    writer = ChatMistralAI(model="mistral-large-latest", temperature=0.4)

    prompt = "".join([
        "You are an academic writing expert in Information Systems.\n\n",
        "Write Section 7 (Discussion and Thematic Synthesis) of a systematic ",
        "literature review paper. Approximately 500 words, formal academic prose.\n",
        "Cover:\n",
        "(a) Overview of themes from abstract analysis\n",
        "(b) Overview of themes from title analysis\n",
        "(c) Comparison: what themes appear in both vs only one\n",
        "(d) PAJAIS taxonomy mapping and implications\n",
        "(e) Implications for IS research and practice\n",
        "(f) Limitations\n\n",
        section("ABSTRACT THEMES:\n", overview(abstract_themes)),
        section("TITLE THEMES:\n", overview(title_themes)),
        section("ABSTRACT PAJAIS MAPPING:\n", abstract_mapping),
        section("TITLE PAJAIS MAPPING:\n", title_mapping),
        "Write in continuous academic paragraphs. No bullet points or headers.",
    ])

    reply = writer.invoke([HumanMessage(content=prompt)])
    text = reply.content

    target_path = DATA_DIR / "narrative.txt"
    target_path.write_text(text, encoding="utf-8")

    report = {
        "word_count": len(text.split()),
        "path": str(target_path),
        "note": "Narrative combines both abstract and title run themes",
    }
    return json.dumps(report)
|