aadisawant2912 committed on
Commit
0b98cfb
·
verified ·
1 Parent(s): 9fd51d0

Update tools.py

Browse files
Files changed (1) hide show
  1. tools.py +271 -258
tools.py CHANGED
@@ -1,14 +1,21 @@
1
  """
2
  tools.py - 7 LangChain @tool functions for BERTopic Thematic Analysis Agent.
3
  Rules: ZERO if/else, ZERO for/while, ZERO try/except, ZERO PromptTemplate.
4
- All LLM calls use plain HumanMessage strings.
5
- Supports BOTH abstract and title run_configs simultaneously without overwriting.
 
 
 
 
 
 
6
  """
7
 
8
  from __future__ import annotations
9
 
10
  import json
11
  import re
 
12
  from pathlib import Path
13
 
14
  import numpy as np
@@ -26,63 +33,62 @@ from sklearn.metrics.pairwise import cosine_similarity
26
  DATA_DIR = Path("data")
27
  DATA_DIR.mkdir(exist_ok=True)
28
 
29
- # Run-config-specific paths — keyed by run_config name so they never overwrite
30
- def _paths(run_config: str) -> dict:
31
- prefix = DATA_DIR / run_config # data/abstract/ or data/title/
32
- prefix.mkdir(exist_ok=True)
33
  return {
34
- "sentences": prefix / "sentences.json",
35
- "stats": prefix / "stats.json",
36
- "papers": prefix / "papers.csv",
37
- "emb": prefix / "emb.npy",
38
- "summaries": prefix / "summaries.json",
39
- "charts": prefix / "charts.json",
40
- "themes": prefix / "themes.json",
41
- "taxonomy": prefix / "taxonomy.json",
42
- "narrative": prefix / "narrative.txt",
43
- "comparison": prefix / "comparison.csv",
 
44
  }
45
 
46
- # Shared fallback paths (used when run_config not specified)
47
- SUMMARIES_PATH = DATA_DIR / "abstract" / "summaries.json"
48
- THEMES_PATH = DATA_DIR / "abstract" / "themes.json"
49
- TAXONOMY_PATH = DATA_DIR / "abstract" / "taxonomy.json"
50
- NARRATIVE_PATH = DATA_DIR / "abstract" / "narrative.txt"
51
- COMPARISON_PATH = DATA_DIR / "abstract" / "comparison.csv"
52
- EMB_PATH = DATA_DIR / "abstract" / "emb.npy"
53
-
54
  RUN_CONFIGS = {
55
  "abstract": ["Abstract"],
56
  "title": ["Title"],
57
- "both": ["Abstract", "Title"],
58
  }
59
 
 
60
  BOILERPLATE_PATTERNS = [
61
- r"\u00a9", # © unicode symbol
62
- r"\\u00a9", # escaped unicode
63
- r"copyright\s*\d{4}", # copyright 2018
64
- r"\d{4}\s+john wiley", # 2018 John Wiley
65
- r"john wiley\s*&\s*sons", # John Wiley & Sons
66
- r"blackwell publishing", # Blackwell Publishing
67
- r"blackwell\s+pub",
68
  r"wiley\s+periodicals",
 
69
  r"all rights reserved",
70
  r"doi\s*:\s*\S+",
71
  r"published by elsevier",
72
- r"elsevier\s+(b\.v|inc|ltd)",
73
- r"springer\s+(nature|verlag|science)",
74
- r"taylor\s*&\s*francis",
75
  r"informa\s+uk",
76
  r"sage\s+publications",
77
- r"information systems journal", # journal boilerplate
 
78
  r"this article is",
79
  r"rights reserved",
80
  r"permission from",
81
  r"reproduced with",
82
- r"^\s*abstract\s*$", # lone word "Abstract"
83
  ]
84
  BOILERPLATE_RE = re.compile("|".join(BOILERPLATE_PATTERNS), re.IGNORECASE)
85
 
 
 
 
 
 
 
86
  PAJAIS_CATEGORIES = [
87
  "Information Systems Theory", "IS Strategy & Governance",
88
  "Digital Innovation", "Enterprise Systems",
@@ -101,203 +107,167 @@ PAJAIS_CATEGORIES = [
101
 
102
 
103
  def safe_read_csv(path):
104
- """Read CSV with UTF-8, fall back to latin-1 on encoding errors."""
105
  try:
106
  return pd.read_csv(path, encoding="utf-8")
107
  except UnicodeDecodeError:
108
  return pd.read_csv(path, encoding="latin-1")
109
 
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  def _call_llm_json(llm, prompt: str) -> list:
112
- """Call LLM with plain HumanMessage and parse JSON response."""
113
  response = llm.invoke([HumanMessage(content=prompt)])
114
  raw = response.content.strip()
115
  raw = raw.split("```json")[-1].split("```")[0].strip() if "```" in raw else raw
116
  return json.loads(raw)
117
 
118
 
 
 
 
 
 
 
 
 
119
  # =============================================================================
120
- # TOOL 1 - load_scopus_csv
121
- # Supports run_config = "abstract", "title", or "both"
122
- # Each config saves to its own subdirectory so nothing is overwritten
123
  # =============================================================================
124
  @tool
125
  def load_scopus_csv(csv_path: str, run_config: str = "abstract") -> str:
126
- """Load a Scopus CSV export, filter boilerplate, save sentences per run_config.
127
 
128
- Supports run_config = 'abstract', 'title', or 'both'.
129
- Each config saves to data/abstract/, data/title/ separately so they
130
- never overwrite each other.
131
 
132
  Args:
133
  csv_path: Path to the uploaded Scopus CSV file.
134
- run_config: 'abstract', 'title', or 'both' (default 'abstract').
135
  """
136
- # Resolve which columns to process
137
- configs_to_run = (
138
- list(RUN_CONFIGS.items())
139
- if run_config == "both"
140
- else [(run_config, RUN_CONFIGS.get(run_config, ["Abstract"]))]
141
- )
142
 
143
- df_raw = safe_read_csv(csv_path)
 
 
 
 
144
 
145
- def process_config(config_pair):
146
- cfg_name, columns = config_pair
147
- p = _paths(cfg_name)
148
 
149
- # Keep only columns that exist in this CSV
150
- present_cols = list(filter(
151
- lambda c: c in df_raw.columns,
152
- columns + ["Title", "Year", "Source title", "Cited by"]
153
- ))
154
- df = df_raw[present_cols].dropna(subset=list(filter(
155
- lambda c: c in df_raw.columns, columns
156
- )))
157
-
158
- def split_sentences(text):
159
- raw_sents = re.split(r"(?<=[.!?])\s+", str(text))
160
- return list(filter(
161
- lambda s: (
162
- not BOILERPLATE_RE.search(s)
163
- and len(s.split()) > 6 # at least 7 words
164
- and len(s.strip()) > 40 # at least 40 chars
165
- and not s.strip().startswith("©")
166
- and "wiley" not in s.lower()
167
- and "elsevier" not in s.lower()
168
- and "blackwell" not in s.lower()
169
- and "springer" not in s.lower()
170
- and "information systems journal" not in s.lower()
171
- ),
172
- raw_sents
173
- ))
174
-
175
- sentences_lists = list(map(split_sentences, df[columns[0]].tolist()))
176
- all_sentences = [s for lst in sentences_lists for s in lst]
177
-
178
- stats = {
179
- "papers": int(len(df)),
180
- "sentences_after_filter": int(len(all_sentences)),
181
- "columns_used": columns,
182
- "csv_path": str(csv_path),
183
- "run_config": cfg_name,
184
- }
185
 
186
- p["sentences"].write_text(json.dumps(all_sentences, ensure_ascii=False))
187
- p["stats"].write_text(json.dumps(stats, ensure_ascii=False))
188
- df.to_csv(p["papers"], index=False)
189
- return stats
190
 
191
- all_stats = list(map(process_config, configs_to_run))
 
192
 
193
- # Also save original CSV as uploaded.csv for reference
194
- import shutil
195
- shutil.copy(csv_path, DATA_DIR / "uploaded.csv")
 
 
 
196
 
197
- return json.dumps({
198
- "configs_processed": list(map(lambda s: s["run_config"], all_stats)),
199
- "results": all_stats,
200
- })
 
201
 
202
 
203
  # =============================================================================
204
- # TOOL 2 - run_bertopic_discovery
205
- # run_config controls which sentences file to use
206
- # distance_threshold=0.35 gives ~100 topics from 2000+ sentences
207
  # =============================================================================
208
  @tool
209
  def run_bertopic_discovery(top_n_topics: int = 100, run_config: str = "abstract") -> str:
210
- """Embed sentences, cluster with AgglomerativeClustering (cosine, threshold=0.35),
211
- targeting ~100 topics. Saves summaries, embeddings, and 4 Plotly charts.
212
 
213
  Args:
214
- top_n_topics: Target number of topic clusters (default 100).
215
- run_config: Which sentences to use: 'abstract' or 'title' (default 'abstract').
216
  """
217
- p = _paths(run_config)
218
  sentences = json.loads(p["sentences"].read_text())
219
 
220
- model = SentenceTransformer("all-MiniLM-L6-v2")
221
  embeddings = model.encode(
222
- sentences, normalize_embeddings=True, show_progress_bar=False, batch_size=64
 
223
  )
224
  np.save(p["emb"], embeddings)
225
 
226
- # threshold=0.35 produces many fine-grained clusters (~100 for 2000+ sentences)
227
- # threshold=0.70 produces fewer broad clusters (~40-60)
228
- # We use 0.35 to get close to the desired 100 topics
229
  clustering = AgglomerativeClustering(
230
- metric="cosine",
231
- linkage="average",
232
- distance_threshold=0.35,
233
- n_clusters=None,
234
  )
235
  labels = clustering.fit_predict(embeddings)
236
 
237
- # Sort clusters by size descending, take top_n_topics largest
238
- all_labels = sorted(set(labels.tolist()))
239
- label_sizes = list(map(lambda lb: (lb, int((labels == lb).sum())), all_labels))
240
- label_sizes_sorted = sorted(label_sizes, key=lambda x: -x[1])
241
- # Only keep clusters with at least 3 sentences (filter noise)
242
- label_sizes_filtered = list(filter(lambda x: x[1] >= 3, label_sizes_sorted))
243
- retained = list(map(lambda x: x[0], label_sizes_filtered[:top_n_topics]))
244
-
245
- # Extra boilerplate check for individual sentences
246
- def is_clean_sentence(s):
247
- return (
248
- not BOILERPLATE_RE.search(s)
249
- and len(s.split()) > 6
250
- and len(s) > 40
251
- and not s.strip().startswith("©")
252
- and "wiley" not in s.lower()
253
- and "elsevier" not in s.lower()
254
- and "blackwell" not in s.lower()
255
- and "springer" not in s.lower()
256
- and "taylor" not in s.lower()
257
- and "john wiley" not in s.lower()
258
- )
259
 
260
- def build_cluster_summary(seq_and_label):
261
- seq_id, label = seq_and_label # seq_id = 1-based sequential number
262
- mask = labels == label
263
  cluster_embs = embeddings[mask]
264
- cluster_sents_raw = [sentences[i] for i, m in enumerate(mask.tolist()) if m]
265
- # Apply extra boilerplate filter to sentences inside each cluster
266
- cluster_sents = list(filter(is_clean_sentence, cluster_sents_raw))
267
- # Fall back to raw if filter removed everything
268
- cluster_sents = cluster_sents if cluster_sents else cluster_sents_raw[:5]
269
- centroid = cluster_embs.mean(axis=0, keepdims=True)
270
- sims = cosine_similarity(centroid, cluster_embs)[0]
271
- top5_idx = sims.argsort()[-5:][::-1].tolist()
272
- # top_evidence: pick from clean sentences preferring highest-similarity ones
273
- clean_set = set(cluster_sents)
274
- top_evidence_raw = list(map(lambda i: cluster_sents_raw[i], top5_idx))
275
- top_evidence = list(filter(lambda s: s in clean_set, top_evidence_raw))[:5]
276
- top_evidence = top_evidence if top_evidence else top_evidence_raw[:3]
277
  return {
278
- "topic_id": seq_id, # sequential 1-based ID shown in table
279
- "raw_label": int(label), # original cluster label kept for internal use
280
  "size": int(mask.sum()),
281
  "top_evidence": top_evidence,
282
- "sentences": cluster_sents,
283
  "centroid": centroid[0].tolist(),
284
  "run_config": run_config,
285
  }
286
 
287
- # Enumerate gives (1-based index, raw_label) pairs
288
- seq_label_pairs = list(map(lambda x: (x[0]+1, x[1]), enumerate(retained)))
289
- summaries = list(map(build_cluster_summary, seq_label_pairs))
290
  p["summaries"].write_text(json.dumps(summaries, indent=2, ensure_ascii=False))
291
 
292
- sizes = list(map(lambda s: s["size"], summaries))
293
  ids = list(map(lambda s: s["topic_id"], summaries))
294
 
295
- fig1 = px.bar(x=ids, y=sizes,
296
- labels={"x": "Topic ID", "y": "Sentence Count"},
297
- title="Topic Size Distribution ({})".format(run_config))
298
- fig2 = px.histogram(x=sizes, nbins=30, title="Cluster Size Histogram ({})".format(run_config),
299
  labels={"x": "Cluster Size"})
300
-
301
  centroids = np.array(list(map(lambda s: s["centroid"], summaries)))
302
  n_comp = min(2, centroids.shape[0], centroids.shape[1])
303
  coords = PCA(n_components=n_comp).fit_transform(centroids)
@@ -305,14 +275,12 @@ def run_bertopic_discovery(top_n_topics: int = 100, run_config: str = "abstract"
305
  x=coords[:, 0],
306
  y=(coords[:, 1] if coords.shape[1] > 1 else [0] * len(coords)),
307
  text=list(map(str, ids)),
308
- title="Topic Centroids PCA ({})".format(run_config),
309
  labels={"x": "PC1", "y": "PC2"},
310
  )
311
  fig4 = px.treemap(
312
- names=list(map(str, ids)),
313
- parents=["Topics"] * len(ids),
314
- values=sizes,
315
- title="Topic Treemap ({})".format(run_config),
316
  )
317
 
318
  charts = {
@@ -327,22 +295,22 @@ def run_bertopic_discovery(top_n_topics: int = 100, run_config: str = "abstract"
327
  "topics_found": len(summaries),
328
  "run_config": run_config,
329
  "chart_types": list(charts.keys()),
330
- "note": "threshold=0.35 used for ~100 fine-grained clusters",
331
  })
332
 
333
 
334
  # =============================================================================
335
- # TOOL 3 - label_topics_with_llm
336
  # =============================================================================
337
  @tool
338
  def label_topics_with_llm(batch_size: int = 20, run_config: str = "abstract") -> str:
339
- """Send top topics to Mistral LLM to generate human-readable labels and reasoning.
340
 
341
  Args:
342
- batch_size: Topics per LLM batch (default 20).
343
- run_config: Which summaries to label: 'abstract' or 'title' (default 'abstract').
344
  """
345
- p = _paths(run_config)
346
  summaries = json.loads(p["summaries"].read_text())
347
  top_summaries = summaries[:100]
348
  llm = ChatMistralAI(model="mistral-large-latest", temperature=0.2)
@@ -355,14 +323,13 @@ def label_topics_with_llm(batch_size: int = 20, run_config: str = "abstract") ->
355
  batch
356
  ))
357
  prompt = (
358
- "You are a thematic analysis expert specialising in Information Systems research.\n"
359
- "Given the following research topic clusters with sample sentences, "
360
- "assign a concise label (3-6 words) and one-sentence reasoning for each.\n\n"
361
  "TOPICS:\n" + json.dumps(mini, indent=2) + "\n\n"
362
- "Return ONLY a valid JSON array. "
363
- "Each element must have exactly three keys: "
364
- "topic_id (integer), label (string), reasoning (string). "
365
- "No markdown fences, no explanation, just the raw JSON array."
366
  )
367
  return _call_llm_json(llm, prompt)
368
 
@@ -383,32 +350,32 @@ def label_topics_with_llm(batch_size: int = 20, run_config: str = "abstract") ->
383
 
384
 
385
  # =============================================================================
386
- # TOOL 4 - consolidate_into_themes
387
  # =============================================================================
388
  @tool
389
  def consolidate_into_themes(approved_groups: str, run_config: str = "abstract") -> str:
390
- """Merge researcher-approved topic groups into themes, recompute centroids.
391
 
392
  Args:
393
- approved_groups: JSON string [{theme_name: str, topic_ids: [int, ...]}]
394
- run_config: Which summaries to use (default 'abstract').
395
  """
396
- p = _paths(run_config)
397
  groups = json.loads(approved_groups)
398
  summaries = json.loads(p["summaries"].read_text())
399
- id_to_summary = {s["topic_id"]: s for s in summaries}
400
 
401
  def build_theme(group):
402
  ids = group["topic_ids"]
403
- members = list(map(lambda tid: id_to_summary[tid], ids))
404
- all_sents = [s for ms in members for s in ms.get("sentences", [])]
405
  centroids = np.array(list(map(lambda ms: ms["centroid"], members)))
406
  return {
407
  "theme_name": group["theme_name"],
408
  "topic_ids": ids,
409
- "sentences": all_sents,
410
  "centroid": centroids.mean(axis=0).tolist(),
411
- "paper_count": len(set(all_sents)),
412
  "run_config": run_config,
413
  }
414
 
@@ -418,20 +385,21 @@ def consolidate_into_themes(approved_groups: str, run_config: str = "abstract")
418
  "themes_created": len(themes),
419
  "theme_names": list(map(lambda t: t["theme_name"], themes)),
420
  "run_config": run_config,
 
421
  })
422
 
423
 
424
  # =============================================================================
425
- # TOOL 5 - compare_with_taxonomy
426
  # =============================================================================
427
  @tool
428
  def compare_with_taxonomy(run_config: str = "abstract") -> str:
429
- """Map consolidated themes to PAJAIS 25 categories via Mistral LLM.
430
 
431
  Args:
432
- run_config: Which themes to map (default 'abstract').
433
  """
434
- p = _paths(run_config)
435
  themes = json.loads(p["themes"].read_text())
436
  llm = ChatMistralAI(model="mistral-large-latest", temperature=0.1)
437
 
@@ -439,107 +407,152 @@ def compare_with_taxonomy(run_config: str = "abstract") -> str:
439
  lambda t: {"name": t["theme_name"], "sample": t["sentences"][:2]},
440
  themes
441
  ))
442
-
443
  prompt = (
444
  "You are a research classification expert in Information Systems.\n\n"
445
- "Map each of the following research themes to the single most relevant "
446
- "PAJAIS (Pacific Asia Journal of the Association for Information Systems) category.\n\n"
447
  "THEMES:\n" + json.dumps(theme_mini, indent=2) + "\n\n"
448
  "PAJAIS CATEGORIES:\n" + json.dumps(PAJAIS_CATEGORIES, indent=2) + "\n\n"
449
- "Return ONLY a valid JSON array. "
450
- "Each element must have exactly four string keys: "
451
- "name, pajais_category, confidence, rationale. "
452
- "No markdown fences, no explanation, just the raw JSON array."
453
  )
454
-
455
  result = _call_llm_json(llm, prompt)
456
  p["taxonomy"].write_text(json.dumps(result, indent=2, ensure_ascii=False))
457
- return json.dumps({"mapped_themes": len(result), "run_config": run_config})
 
 
 
 
458
 
459
 
460
  # =============================================================================
461
- # TOOL 6 - generate_comparison_csv
462
- # Columns: Title | Abstract | Year | Source title (journal)
 
463
  # =============================================================================
464
  @tool
465
- def generate_comparison_csv(run_config: str = "abstract") -> str:
466
- """Generate Title | Abstract | Year | Source title comparison CSV.
467
 
468
- Args:
469
- run_config: Which papers.csv to use (default 'abstract').
470
  """
471
- p = _paths(run_config)
472
- df = safe_read_csv(p["papers"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
 
474
- # Detect columns robustly
475
  title_col = next(filter(lambda c: c.strip().lower() == "title", df.columns), None)
476
  abstract_col = next(filter(lambda c: c.strip().lower() == "abstract", df.columns), None)
477
  year_col = next(filter(lambda c: c.strip().lower() == "year", df.columns), None)
478
  journal_col = next(filter(lambda c: "source" in c.lower(), df.columns), None)
479
 
480
- # Build ordered column list: Title, Abstract, Year, Source title
481
  ordered = [title_col, abstract_col, year_col, journal_col]
482
  present = list(filter(lambda c: c is not None and c in df.columns, ordered))
 
483
 
484
- out_df = df[present].copy()
485
- # Rename columns to clean standard names
486
  rename_map = {
487
  title_col: "Title",
488
  abstract_col: "Abstract",
489
  year_col: "Year",
490
  journal_col: "Source Journal",
491
  }
492
- out_df = out_df.rename(columns={k: v for k, v in rename_map.items() if k in out_df.columns})
493
- out_df.to_csv(p["comparison"], index=False, encoding="utf-8-sig")
494
-
 
 
495
  return json.dumps({
496
- "rows": len(out_df),
497
- "columns": list(out_df.columns),
498
- "path": str(p["comparison"]),
499
- "run_config": run_config,
500
  })
501
 
502
 
503
  # =============================================================================
504
- # TOOL 7 - export_narrative
 
505
  # =============================================================================
506
  @tool
507
- def export_narrative(run_config: str = "abstract") -> str:
508
- """Generate a 500-word Section 7 narrative report via Mistral LLM.
509
 
510
- Args:
511
- run_config: Which themes and taxonomy to use (default 'abstract').
512
  """
513
- p = _paths(run_config)
514
- themes = json.loads(p["themes"].read_text())
515
- taxonomy = json.loads(p["taxonomy"].read_text())
516
- llm = ChatMistralAI(model="mistral-large-latest", temperature=0.4)
 
 
 
 
 
 
 
 
 
 
 
 
 
517
 
518
- theme_summary = list(map(
519
- lambda t: {"name": t["theme_name"], "sentence_count": len(t["sentences"])},
520
- themes
521
- ))
 
 
 
 
 
 
 
 
 
522
 
523
  prompt = (
524
  "You are an academic writing expert in Information Systems.\n\n"
525
  "Write Section 7 (Discussion and Thematic Synthesis) of a systematic "
526
- "literature review paper. Write approximately 500 words in formal academic prose.\n"
527
- "Cover all four of these points:\n"
528
- "(a) Overview of the identified themes and their significance\n"
529
- "(b) How the themes map to the PAJAIS taxonomy categories\n"
530
- "(c) Implications for IS research and practice\n"
531
- "(d) Limitations of the thematic analysis\n\n"
532
- "IDENTIFIED THEMES:\n" + json.dumps(theme_summary, indent=2) + "\n\n"
533
- "PAJAIS TAXONOMY MAPPING:\n" + json.dumps(taxonomy, indent=2) + "\n\n"
534
- "Write the full section now in continuous academic paragraphs. "
535
- "Do not use bullet points, numbered lists, or section headers."
 
 
 
536
  )
537
 
538
  response = llm.invoke([HumanMessage(content=prompt)])
539
  narrative_text = response.content
540
- p["narrative"].write_text(narrative_text, encoding="utf-8")
 
541
  return json.dumps({
542
  "word_count": len(narrative_text.split()),
543
- "path": str(p["narrative"]),
544
- "run_config": run_config,
545
  })
 
1
  """
2
  tools.py - 7 LangChain @tool functions for BERTopic Thematic Analysis Agent.
3
  Rules: ZERO if/else, ZERO for/while, ZERO try/except, ZERO PromptTemplate.
4
+ All LLM calls use plain HumanMessage strings directly.
5
+
6
+ Workflow:
7
+ - Abstract run saves to data/abstract/
8
+ - Title run saves to data/title/
9
+ - Comparison CSV + narrative are generated only when BOTH runs are complete
10
+ - Topic IDs are sequential 1..N (not raw cluster labels)
11
+ - Boilerplate filter catches © symbol, all major publishers
12
  """
13
 
14
  from __future__ import annotations
15
 
16
  import json
17
  import re
18
+ import shutil
19
  from pathlib import Path
20
 
21
  import numpy as np
 
33
  DATA_DIR = Path("data")
34
  DATA_DIR.mkdir(exist_ok=True)
35
 
36
+ def _p(run_config: str) -> dict:
37
+ """Return all file paths for a given run_config, creating subdirectory."""
38
+ d = DATA_DIR / run_config
39
+ d.mkdir(parents=True, exist_ok=True)
40
  return {
41
+ "dir": d,
42
+ "sentences": d / "sentences.json",
43
+ "stats": d / "stats.json",
44
+ "papers": d / "papers.csv",
45
+ "emb": d / "emb.npy",
46
+ "summaries": d / "summaries.json",
47
+ "charts": d / "charts.json",
48
+ "themes": d / "themes.json",
49
+ "taxonomy": d / "taxonomy.json",
50
+ "narrative": d / "narrative.txt",
51
+ "comparison": DATA_DIR / "comparison.csv", # shared output
52
  }
53
 
 
 
 
 
 
 
 
 
54
  RUN_CONFIGS = {
55
  "abstract": ["Abstract"],
56
  "title": ["Title"],
 
57
  }
58
 
59
+ # Comprehensive boilerplate filter — catches © symbol + all major publishers
60
  BOILERPLATE_PATTERNS = [
61
+ r"\u00a9", # © unicode
62
+ r"\\u00a9", # escaped unicode
63
+ r"copyright\s*\d{4}",
64
+ r"\d{4}\s+john wiley",
65
+ r"john wiley\s*(&|and)\s*sons",
66
+ r"blackwell\s*(publishing|pub)",
 
67
  r"wiley\s+periodicals",
68
+ r"wiley\s+online",
69
  r"all rights reserved",
70
  r"doi\s*:\s*\S+",
71
  r"published by elsevier",
72
+ r"elsevier\s*(b\.v|inc|ltd|science)",
73
+ r"springer\s*(nature|verlag|science|link)",
74
+ r"taylor\s*(&|and)\s*francis",
75
  r"informa\s+uk",
76
  r"sage\s+publications",
77
+ r"information systems journal\s+published",
78
+ r"emerald\s+(publishing|group)",
79
  r"this article is",
80
  r"rights reserved",
81
  r"permission from",
82
  r"reproduced with",
 
83
  ]
84
  BOILERPLATE_RE = re.compile("|".join(BOILERPLATE_PATTERNS), re.IGNORECASE)
85
 
86
+ # Extra keyword filter applied per-sentence
87
+ PUBLISHER_KEYWORDS = frozenset([
88
+ "wiley", "elsevier", "blackwell", "springer",
89
+ "taylor", "information systems journal", "emerald"
90
+ ])
91
+
92
  PAJAIS_CATEGORIES = [
93
  "Information Systems Theory", "IS Strategy & Governance",
94
  "Digital Innovation", "Enterprise Systems",
 
107
 
108
 
109
  def safe_read_csv(path):
110
+ """Read CSV with UTF-8 fallback to latin-1."""
111
  try:
112
  return pd.read_csv(path, encoding="utf-8")
113
  except UnicodeDecodeError:
114
  return pd.read_csv(path, encoding="latin-1")
115
 
116
 
117
+ def _is_clean(s: str) -> bool:
118
+ """Return True if sentence passes all quality checks."""
119
+ sl = s.lower().strip()
120
+ return (
121
+ not BOILERPLATE_RE.search(s)
122
+ and not s.strip().startswith("\u00a9")
123
+ and not s.strip().startswith("©")
124
+ and len(s.split()) > 6
125
+ and len(s.strip()) > 40
126
+ and not any(kw in sl for kw in PUBLISHER_KEYWORDS)
127
+ )
128
+
129
+
130
  def _call_llm_json(llm, prompt: str) -> list:
131
+ """Call LLM with plain HumanMessage, strip markdown fences, parse JSON."""
132
  response = llm.invoke([HumanMessage(content=prompt)])
133
  raw = response.content.strip()
134
  raw = raw.split("```json")[-1].split("```")[0].strip() if "```" in raw else raw
135
  return json.loads(raw)
136
 
137
 
138
+ def _both_runs_complete() -> bool:
139
+ """Return True only when BOTH abstract and title runs have themes saved."""
140
+ return (
141
+ (_p("abstract")["themes"]).exists()
142
+ and (_p("title")["themes"]).exists()
143
+ )
144
+
145
+
146
  # =============================================================================
147
+ # TOOL 1 load_scopus_csv
148
+ # Saves to data/uploaded.csv (permanent copy) AND data/{run_config}/papers.csv
 
149
  # =============================================================================
150
  @tool
151
  def load_scopus_csv(csv_path: str, run_config: str = "abstract") -> str:
152
+ """Load a Scopus CSV, filter boilerplate sentences, save per run_config.
153
 
154
+ Saves sentences to data/{run_config}/sentences.json.
155
+ Also copies the CSV permanently to data/uploaded.csv.
 
156
 
157
  Args:
158
  csv_path: Path to the uploaded Scopus CSV file.
159
+ run_config: 'abstract' or 'title' (default 'abstract').
160
  """
161
+ p = _p(run_config)
162
+ columns = RUN_CONFIGS.get(run_config, ["Abstract"])
 
 
 
 
163
 
164
+ # Copy CSV to permanent location only if it is a different file
165
+ dest = DATA_DIR / "uploaded.csv"
166
+ src = Path(csv_path).resolve()
167
+ dst = dest.resolve()
168
+ _ = shutil.copy(str(src), str(dst)) if src != dst else None
169
 
170
+ df_raw = safe_read_csv(dest)
 
 
171
 
172
+ present_cols = list(filter(
173
+ lambda c: c in df_raw.columns,
174
+ columns + ["Title", "Year", "Source title", "Cited by"]
175
+ ))
176
+ text_cols = list(filter(lambda c: c in df_raw.columns, columns))
177
+ df = df_raw[present_cols].dropna(subset=text_cols)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
+ def split_sentences(text):
180
+ parts = re.split(r"(?<=[.!?])\s+", str(text))
181
+ return list(filter(_is_clean, parts))
 
182
 
183
+ sentences_lists = list(map(split_sentences, df[columns[0]].tolist()))
184
+ all_sentences = [s for lst in sentences_lists for s in lst]
185
 
186
+ stats = {
187
+ "papers": int(len(df)),
188
+ "sentences_after_filter": int(len(all_sentences)),
189
+ "columns_used": columns,
190
+ "run_config": run_config,
191
+ }
192
 
193
+ p["sentences"].write_text(json.dumps(all_sentences, ensure_ascii=False))
194
+ p["stats"].write_text(json.dumps(stats, ensure_ascii=False))
195
+ df.to_csv(p["papers"], index=False)
196
+
197
+ return json.dumps(stats)
198
 
199
 
200
  # =============================================================================
201
+ # TOOL 2 run_bertopic_discovery
202
+ # threshold=0.35 yields ~100 fine-grained clusters; IDs renumbered 1..N
 
203
  # =============================================================================
204
  @tool
205
  def run_bertopic_discovery(top_n_topics: int = 100, run_config: str = "abstract") -> str:
206
+ """Embed sentences with all-MiniLM-L6-v2, cluster with AgglomerativeClustering
207
+ (cosine, threshold=0.35) targeting ~100 topics. Topic IDs are sequential 1..N.
208
 
209
  Args:
210
+ top_n_topics: Target number of clusters (default 100).
211
+ run_config: 'abstract' or 'title' (default 'abstract').
212
  """
213
+ p = _p(run_config)
214
  sentences = json.loads(p["sentences"].read_text())
215
 
216
+ model = SentenceTransformer("all-MiniLM-L6-v2")
217
  embeddings = model.encode(
218
+ sentences, normalize_embeddings=True,
219
+ show_progress_bar=False, batch_size=64
220
  )
221
  np.save(p["emb"], embeddings)
222
 
 
 
 
223
  clustering = AgglomerativeClustering(
224
+ metric="cosine", linkage="average",
225
+ distance_threshold=0.35, n_clusters=None,
 
 
226
  )
227
  labels = clustering.fit_predict(embeddings)
228
 
229
+ all_labels = sorted(set(labels.tolist()))
230
+ label_sizes = list(map(lambda lb: (lb, int((labels == lb).sum())), all_labels))
231
+ # Keep clusters with ≥3 sentences, sort by size desc, take top N
232
+ label_filtered = list(filter(lambda x: x[1] >= 3, label_sizes))
233
+ label_sorted = sorted(label_filtered, key=lambda x: -x[1])
234
+ retained = list(map(lambda x: x[0], label_sorted[:top_n_topics]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
 
236
+ def build_summary(seq_label):
237
+ seq_id, raw_label = seq_label
238
+ mask = labels == raw_label
239
  cluster_embs = embeddings[mask]
240
+ raw_sents = [sentences[i] for i, m in enumerate(mask.tolist()) if m]
241
+ clean_sents = list(filter(_is_clean, raw_sents))
242
+ sents = clean_sents if clean_sents else raw_sents[:5]
243
+ centroid = cluster_embs.mean(axis=0, keepdims=True)
244
+ sims = cosine_similarity(centroid, cluster_embs)[0]
245
+ top5_idx = sims.argsort()[-5:][::-1].tolist()
246
+ raw_top = list(map(lambda i: raw_sents[i], top5_idx))
247
+ clean_set = set(sents)
248
+ top_evidence = list(filter(lambda s: s in clean_set, raw_top))[:5]
249
+ top_evidence = top_evidence if top_evidence else raw_top[:3]
 
 
 
250
  return {
251
+ "topic_id": seq_id,
 
252
  "size": int(mask.sum()),
253
  "top_evidence": top_evidence,
254
+ "sentences": sents,
255
  "centroid": centroid[0].tolist(),
256
  "run_config": run_config,
257
  }
258
 
259
+ # Sequential IDs starting at 1
260
+ seq_pairs = list(map(lambda x: (x[0] + 1, x[1]), enumerate(retained)))
261
+ summaries = list(map(build_summary, seq_pairs))
262
  p["summaries"].write_text(json.dumps(summaries, indent=2, ensure_ascii=False))
263
 
264
+ sizes = list(map(lambda s: s["size"], summaries))
265
  ids = list(map(lambda s: s["topic_id"], summaries))
266
 
267
+ fig1 = px.bar(x=ids, y=sizes, title="Topic Sizes — {}".format(run_config),
268
+ labels={"x": "Topic #", "y": "Sentences"})
269
+ fig2 = px.histogram(x=sizes, nbins=30, title="Size Distribution {}".format(run_config),
 
270
  labels={"x": "Cluster Size"})
 
271
  centroids = np.array(list(map(lambda s: s["centroid"], summaries)))
272
  n_comp = min(2, centroids.shape[0], centroids.shape[1])
273
  coords = PCA(n_components=n_comp).fit_transform(centroids)
 
275
  x=coords[:, 0],
276
  y=(coords[:, 1] if coords.shape[1] > 1 else [0] * len(coords)),
277
  text=list(map(str, ids)),
278
+ title="Topic Centroids PCA {}".format(run_config),
279
  labels={"x": "PC1", "y": "PC2"},
280
  )
281
  fig4 = px.treemap(
282
+ names=list(map(str, ids)), parents=["Topics"] * len(ids),
283
+ values=sizes, title="Treemap {}".format(run_config),
 
 
284
  )
285
 
286
  charts = {
 
295
  "topics_found": len(summaries),
296
  "run_config": run_config,
297
  "chart_types": list(charts.keys()),
298
+ "note": "Topics numbered 1..{}, threshold=0.35".format(len(summaries)),
299
  })
300
 
301
 
302
  # =============================================================================
303
+ # TOOL 3 label_topics_with_llm
304
  # =============================================================================
305
  @tool
306
  def label_topics_with_llm(batch_size: int = 20, run_config: str = "abstract") -> str:
307
+ """Label topic clusters with human-readable names via Mistral LLM.
308
 
309
  Args:
310
+ batch_size: Topics per LLM call (default 20).
311
+ run_config: 'abstract' or 'title' (default 'abstract').
312
  """
313
+ p = _p(run_config)
314
  summaries = json.loads(p["summaries"].read_text())
315
  top_summaries = summaries[:100]
316
  llm = ChatMistralAI(model="mistral-large-latest", temperature=0.2)
 
323
  batch
324
  ))
325
  prompt = (
326
+ "You are a thematic analysis expert in Information Systems research.\n"
327
+ "Label each topic cluster with a concise 3-6 word academic label "
328
+ "and one-sentence reasoning.\n\n"
329
  "TOPICS:\n" + json.dumps(mini, indent=2) + "\n\n"
330
+ "Return ONLY a raw JSON array. "
331
+ "Each element: topic_id (integer), label (string), reasoning (string). "
332
+ "No markdown, no explanation."
 
333
  )
334
  return _call_llm_json(llm, prompt)
335
 
 
350
 
351
 
352
  # =============================================================================
353
+ # TOOL 4 consolidate_into_themes
354
  # =============================================================================
355
  @tool
356
  def consolidate_into_themes(approved_groups: str, run_config: str = "abstract") -> str:
357
+ """Merge approved topic groups into themes and recompute centroids.
358
 
359
  Args:
360
+ approved_groups: JSON list [{theme_name: str, topic_ids: [int,...]}]
361
+ run_config: 'abstract' or 'title' (default 'abstract').
362
  """
363
+ p = _p(run_config)
364
  groups = json.loads(approved_groups)
365
  summaries = json.loads(p["summaries"].read_text())
366
+ id_map = {s["topic_id"]: s for s in summaries}
367
 
368
  def build_theme(group):
369
  ids = group["topic_ids"]
370
+ members = list(map(lambda tid: id_map[tid], ids))
371
+ sents = [s for ms in members for s in ms.get("sentences", [])]
372
  centroids = np.array(list(map(lambda ms: ms["centroid"], members)))
373
  return {
374
  "theme_name": group["theme_name"],
375
  "topic_ids": ids,
376
+ "sentences": sents,
377
  "centroid": centroids.mean(axis=0).tolist(),
378
+ "paper_count": len(set(sents)),
379
  "run_config": run_config,
380
  }
381
 
 
385
  "themes_created": len(themes),
386
  "theme_names": list(map(lambda t: t["theme_name"], themes)),
387
  "run_config": run_config,
388
+ "both_complete": _both_runs_complete(),
389
  })
390
 
391
 
392
  # =============================================================================
393
+ # TOOL 5 compare_with_taxonomy
394
  # =============================================================================
395
  @tool
396
  def compare_with_taxonomy(run_config: str = "abstract") -> str:
397
+ """Map themes to PAJAIS 25 categories via Mistral LLM.
398
 
399
  Args:
400
+ run_config: 'abstract' or 'title' (default 'abstract').
401
  """
402
+ p = _p(run_config)
403
  themes = json.loads(p["themes"].read_text())
404
  llm = ChatMistralAI(model="mistral-large-latest", temperature=0.1)
405
 
 
407
  lambda t: {"name": t["theme_name"], "sample": t["sentences"][:2]},
408
  themes
409
  ))
 
410
  prompt = (
411
  "You are a research classification expert in Information Systems.\n\n"
412
+ "Map each theme to the single most relevant PAJAIS category.\n\n"
 
413
  "THEMES:\n" + json.dumps(theme_mini, indent=2) + "\n\n"
414
  "PAJAIS CATEGORIES:\n" + json.dumps(PAJAIS_CATEGORIES, indent=2) + "\n\n"
415
+ "Return ONLY a raw JSON array. "
416
+ "Each element: name, pajais_category, confidence, rationale. "
417
+ "No markdown, no explanation."
 
418
  )
 
419
  result = _call_llm_json(llm, prompt)
420
  p["taxonomy"].write_text(json.dumps(result, indent=2, ensure_ascii=False))
421
+ return json.dumps({
422
+ "mapped_themes": len(result),
423
+ "run_config": run_config,
424
+ "both_complete": _both_runs_complete(),
425
+ })
426
 
427
 
428
  # =============================================================================
429
+ # TOOL 6 generate_comparison_csv
430
+ # ONLY runs when BOTH abstract and title runs are complete
431
+ # Columns: Title | Abstract | Year | Source Journal
432
  # =============================================================================
@tool
def generate_comparison_csv() -> str:
    """Generate Title | Abstract | Year | Source Journal comparison CSV.

    Only available after BOTH abstract and title runs have completed themes.
    Saves to data/comparison.csv.
    """
    # NOTE: the docstring above is left verbatim on purpose -- LangChain's
    # @tool uses it as the tool description shown to the model.

    # A run counts as complete once its themes file exists on disk.
    abs_complete = _p("abstract")["themes"].exists()
    title_complete = _p("title")["themes"].exists()

    # Name only the runs that are actually missing. The advice previously
    # said "run title" even when the abstract run was the incomplete one.
    # NOTE(review): assumes 'run abstract' is a valid command, by symmetry
    # with the existing 'run title' -- confirm against the agent prompt.
    missing = [
        name
        for name, done in (("abstract", abs_complete), ("title", title_complete))
        if not done
    ]

    # Conditional expression (file rule: no if/else statements). The status
    # text is only built when a run is missing; when just the title run is
    # missing it is byte-identical to the previous message.
    return (
        _do_generate_comparison_csv()
        if not missing
        else (
            "Abstract complete: {}, Title complete: {}. "
            "Run {} to complete the {} analysis first."
        ).format(
            abs_complete,
            title_complete,
            " and ".join(map("'run {}'".format, missing)),
            " and ".join(missing),
        )
    )
455
+
456
+
def _do_generate_comparison_csv() -> str:
    """Write the four-column comparison CSV from the uploaded dataset.

    Reads ``DATA_DIR / "uploaded.csv"`` via ``safe_read_csv``, keeps whichever
    of the title / abstract / year / source columns are present (in that
    order), renames them to canonical headers and saves the result to
    ``DATA_DIR / "comparison.csv"``.

    Returns:
        JSON string with the row count, the output column names, the output
        path and a completion note.
    """
    frame = safe_read_csv(DATA_DIR / "uploaded.csv")

    def first_match(predicate):
        # First column name satisfying the predicate, or None when absent.
        return next((col for col in frame.columns if predicate(col)), None)

    def named(target):
        # Exact match on the trimmed, lower-cased column name.
        return lambda col: col.strip().lower() == target

    # (source column, output header) pairs, listed in output column order.
    # The journal column is matched loosely: any name containing "source".
    wanted = [
        (first_match(named("title")), "Title"),
        (first_match(named("abstract")), "Abstract"),
        (first_match(named("year")), "Year"),
        (first_match(lambda col: "source" in col.lower()), "Source Journal"),
    ]
    found = [(src, header) for src, header in wanted if src is not None]

    export = frame[[src for src, _ in found]].copy()
    export = export.rename(columns=dict(found))

    target = DATA_DIR / "comparison.csv"
    # utf-8-sig writes a BOM at the start of the file.
    export.to_csv(target, index=False, encoding="utf-8-sig")

    return json.dumps({
        "rows": len(export),
        "columns": list(export.columns),
        "path": str(target),
        "note": "Both runs complete — comparison CSV generated",
    })
487
 
488
 
489
  # =============================================================================
490
+ # TOOL 7 export_narrative
491
+ # ONLY runs when BOTH abstract and title runs are complete
492
  # =============================================================================
@tool
def export_narrative() -> str:
    """Write a 500-word Section 7 narrative using themes from BOTH runs.

    Only available after BOTH abstract and title runs have completed taxonomy mapping.
    Saves to data/narrative.txt.
    """
    # NOTE: the docstring above is left verbatim on purpose -- LangChain's
    # @tool uses it as the tool description shown to the model.

    # A run is ready for the narrative once its taxonomy file exists.
    abs_ready = _p("abstract")["taxonomy"].exists()
    title_ready = _p("title")["taxonomy"].exists()

    # Status text returned in place of the narrative while a run is missing.
    not_ready_msg = (
        "Narrative cannot be generated yet. "
        "Abstract taxonomy complete: {}. Title taxonomy complete: {}. "
        "Complete both runs through Phase 5.5 first."
    ).format(abs_ready, title_ready)

    # Conditional expression rather than an if/else statement (file rule).
    return _do_export_narrative() if (abs_ready and title_ready) else not_ready_msg
517
 
518
+
def _do_export_narrative() -> str:
    """Ask the LLM for the Section 7 narrative and save it to disk.

    Loads the themes and taxonomy JSON files of both the abstract and title
    runs, builds a single prompt from them, sends it to Mistral as one
    ``HumanMessage`` and writes the reply to ``DATA_DIR / "narrative.txt"``.

    Returns:
        JSON string with the narrative's word count, the output path and a
        note that both runs were combined.
    """

    def load(run, key):
        # Parse one of the per-run JSON artefacts.
        return json.loads(_p(run)[key].read_text())

    def outline(themes):
        # The prompt carries only each theme's name and sentence count,
        # not the sentences themselves.
        return [
            {"name": theme["theme_name"], "sentences": len(theme["sentences"])}
            for theme in themes
        ]

    def section(heading, payload):
        # One labelled JSON block of the prompt.
        return heading + ":\n" + json.dumps(payload, indent=2) + "\n\n"

    # Read order: both themes files first, then both taxonomy files.
    abs_themes = load("abstract", "themes")
    title_themes = load("title", "themes")
    abs_taxonomy = load("abstract", "taxonomy")
    title_taxonomy = load("title", "taxonomy")

    llm = ChatMistralAI(model="mistral-large-latest", temperature=0.4)

    instructions = (
        "You are an academic writing expert in Information Systems.\n\n"
        "Write Section 7 (Discussion and Thematic Synthesis) of a systematic "
        "literature review paper. Approximately 500 words, formal academic prose.\n"
        "Cover:\n"
        "(a) Overview of themes from abstract analysis\n"
        "(b) Overview of themes from title analysis\n"
        "(c) Comparison: what themes appear in both vs only one\n"
        "(d) PAJAIS taxonomy mapping and implications\n"
        "(e) Implications for IS research and practice\n"
        "(f) Limitations\n\n"
    )
    prompt = "".join([
        instructions,
        section("ABSTRACT THEMES", outline(abs_themes)),
        section("TITLE THEMES", outline(title_themes)),
        section("ABSTRACT PAJAIS MAPPING", abs_taxonomy),
        section("TITLE PAJAIS MAPPING", title_taxonomy),
        "Write in continuous academic paragraphs. No bullet points or headers.",
    ])

    reply = llm.invoke([HumanMessage(content=prompt)])
    # NOTE(review): assumes reply.content is a plain string -- confirm.
    narrative_text = reply.content

    out_path = DATA_DIR / "narrative.txt"
    out_path.write_text(narrative_text, encoding="utf-8")

    return json.dumps({
        "word_count": len(narrative_text.split()),
        "path": str(out_path),
        "note": "Narrative combines both abstract and title run themes",
    })