chitsanfei commited on
Commit
d082b18
·
0 Parent(s):

init: init

Browse files
Files changed (15) hide show
  1. .Rhistory +512 -0
  2. .env.example +35 -0
  3. .github/workflows/deploy_to_hf_space.yml +43 -0
  4. .gitignore +181 -0
  5. LICENSE +661 -0
  6. README.md +97 -0
  7. analyzer.py +511 -0
  8. app.py +724 -0
  9. deduplicator.py +183 -0
  10. file_processor.py +407 -0
  11. model_manager.py +528 -0
  12. prompt_manager.py +191 -0
  13. renovate.json +6 -0
  14. requirements.txt +9 -0
  15. result_processor.py +393 -0
.Rhistory ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ b_false_c_false <- sum(!data$B_Decision & !data$C_Decision, na.rm = TRUE)
2
+ b_false_c_na <- sum(!data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
3
+ # C -> Final
4
+ c_true_final_true <- sum(data$C_Decision & data$Final_Decision, na.rm = TRUE)
5
+ c_true_final_false <- sum(data$C_Decision & !data$Final_Decision, na.rm = TRUE)
6
+ c_false_final_true <- sum(!data$C_Decision & data$Final_Decision, na.rm = TRUE)
7
+ c_false_final_false <- sum(!data$C_Decision & !data$Final_Decision, na.rm = TRUE)
8
+ c_na_final_true <- sum(is.na(data$C_Decision) & data$Final_Decision, na.rm = TRUE)
9
+ c_na_final_false <- sum(is.na(data$C_Decision) & !data$Final_Decision, na.rm = TRUE)
10
+ # 准备链接数据
11
+ links <- data.frame(
12
+ source = c(
13
+ # A -> B
14
+ rep(0, 2), rep(1, 2),
15
+ # B -> C
16
+ rep(2, 3), rep(3, 3),
17
+ # C -> Final
18
+ rep(4, 2), rep(5, 2), rep(6, 2)
19
+ ),
20
+ target = c(
21
+ # A -> B
22
+ 2, 3, 2, 3,
23
+ # B -> C
24
+ 4, 5, 6, 4, 5, 6,
25
+ # C -> Final
26
+ 7, 8, 7, 8, 7, 8
27
+ ),
28
+ value = c(
29
+ # A -> B
30
+ a_true_b_true, a_true_b_false, a_false_b_true, a_false_b_false,
31
+ # B -> C
32
+ b_true_c_true, b_true_c_false, b_true_c_na,
33
+ b_false_c_true, b_false_c_false, b_false_c_na,
34
+ # C -> Final
35
+ c_true_final_true, c_true_final_false,
36
+ c_false_final_true, c_false_final_false,
37
+ c_na_final_true, c_na_final_false
38
+ )
39
+ )
40
+ # 创建颜色向量
41
+ my_color <- 'd3.scaleOrdinal()
42
+ .domain(["Model A True", "Model A False",
43
+ "Model B True", "Model B False",
44
+ "Model C True", "Model C False", "Model C NA",
45
+ "Final True", "Final False"])
46
+ .range(["#fbf8cc", "#fde4cf",
47
+ "#FFCFD2", "#F1C0E8",
48
+ "#CFBAF0", "#A3C4F3", "#90DBF4",
49
+ "#98F5E1", "#B9FBC0"])'
50
+ # 绘制桑基图
51
+ sankeyNetwork(Links = links, Nodes = nodes,
52
+ Source = "source", Target = "target",
53
+ Value = "value", NodeID = "name",
54
+ sinksRight = TRUE,
55
+ nodeWidth = 40,
56
+ nodePadding = 20,
57
+ colourScale = my_color,
58
+ fontSize = 12,
59
+ height = 500,
60
+ width = 800)
61
+ # 保存为HTML文件
62
+ saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
63
+ Source = "source", Target = "target",
64
+ Value = "value", NodeID = "name",
65
+ sinksRight = TRUE,
66
+ nodeWidth = 40,
67
+ nodePadding = 20,
68
+ colourScale = my_color,
69
+ fontSize = 12,
70
+ height = 500,
71
+ width = 800),
72
+ "sankey_plot.html")
73
+ setwd("C:/Users/admin/Desktop/article-analyzer")
74
+ # 加载必要的包
75
+ library(networkD3)
76
+ library(dplyr)
77
+ library(readr)
78
+ # 读取数据
79
+ data <- read_csv("data/picos_analysis.csv")
80
+ # 准备节点数据
81
+ nodes <- data.frame(
82
+ name = c(
83
+ "Model A True", "Model A False",
84
+ "Model B True", "Model B False",
85
+ "Model C True", "Model C False", "Model C NA",
86
+ "Final True", "Final False"
87
+ )
88
+ )
89
+ # 计算流向
90
+ # A -> B
91
+ a_true_b_true <- sum(data$A_Decision & data$B_Decision, na.rm = TRUE)
92
+ a_true_b_false <- sum(data$A_Decision & !data$B_Decision, na.rm = TRUE)
93
+ a_false_b_true <- sum(!data$A_Decision & data$B_Decision, na.rm = TRUE)
94
+ a_false_b_false <- sum(!data$A_Decision & !data$B_Decision, na.rm = TRUE)
95
+ # B -> C
96
+ b_true_c_true <- sum(data$B_Decision & data$C_Decision, na.rm = TRUE)
97
+ b_true_c_false <- sum(data$B_Decision & !data$C_Decision, na.rm = TRUE)
98
+ b_true_c_na <- sum(data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
99
+ b_false_c_true <- sum(!data$B_Decision & data$C_Decision, na.rm = TRUE)
100
+ b_false_c_false <- sum(!data$B_Decision & !data$C_Decision, na.rm = TRUE)
101
+ b_false_c_na <- sum(!data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
102
+ # C -> Final
103
+ c_true_final_true <- sum(data$C_Decision & data$Final_Decision, na.rm = TRUE)
104
+ c_true_final_false <- sum(data$C_Decision & !data$Final_Decision, na.rm = TRUE)
105
+ c_false_final_true <- sum(!data$C_Decision & data$Final_Decision, na.rm = TRUE)
106
+ c_false_final_false <- sum(!data$C_Decision & !data$Final_Decision, na.rm = TRUE)
107
+ c_na_final_true <- sum(is.na(data$C_Decision) & data$Final_Decision, na.rm = TRUE)
108
+ c_na_final_false <- sum(is.na(data$C_Decision) & !data$Final_Decision, na.rm = TRUE)
109
+ # 准备链接数据
110
+ links <- data.frame(
111
+ source = c(
112
+ # A -> B
113
+ rep(0, 2), rep(1, 2),
114
+ # B -> C
115
+ rep(2, 3), rep(3, 3),
116
+ # C -> Final
117
+ rep(4, 2), rep(5, 2), rep(6, 2)
118
+ ),
119
+ target = c(
120
+ # A -> B
121
+ 2, 3, 2, 3,
122
+ # B -> C
123
+ 4, 5, 6, 4, 5, 6,
124
+ # C -> Final
125
+ 7, 8, 7, 8, 7, 8
126
+ ),
127
+ value = c(
128
+ # A -> B
129
+ a_true_b_true, a_true_b_false, a_false_b_true, a_false_b_false,
130
+ # B -> C
131
+ b_true_c_true, b_true_c_false, b_true_c_na,
132
+ b_false_c_true, b_false_c_false, b_false_c_na,
133
+ # C -> Final
134
+ c_true_final_true, c_true_final_false,
135
+ c_false_final_true, c_false_final_false,
136
+ c_na_final_true, c_na_final_false
137
+ )
138
+ )
139
+ # 创建颜色向量
140
+ my_color <- 'd3.scaleOrdinal()
141
+ .domain(["Model A True", "Model A False",
142
+ "Model B True", "Model B False",
143
+ "Model C True", "Model C False", "Model C NA",
144
+ "Final True", "Final False"])
145
+ .range(["#fbf8cc", "#fde4cf",
146
+ "#FFCFD2", "#F1C0E8",
147
+ "#CFBAF0", "#A3C4F3", "#90DBF4",
148
+ "#98F5E1", "#B9FBC0"])'
149
+ # 绘制桑基图
150
+ sankeyNetwork(Links = links, Nodes = nodes,
151
+ Source = "source", Target = "target",
152
+ Value = "value", NodeID = "name",
153
+ sinksRight = TRUE,
154
+ nodeWidth = 40,
155
+ nodePadding = 20,
156
+ colourScale = my_color,
157
+ fontSize = 12,
158
+ height = 500,
159
+ width = 800)
160
+ # 保存为HTML文件
161
+ saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
162
+ Source = "source", Target = "target",
163
+ Value = "value", NodeID = "name",
164
+ sinksRight = TRUE,
165
+ nodeWidth = 40,
166
+ nodePadding = 20,
167
+ colourScale = my_color,
168
+ fontSize = 12,
169
+ height = 500,
170
+ width = 800),
171
+ "sankey_plot.html")
172
+ # 保存为HTML文件
173
+ saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
174
+ Source = "source", Target = "target",
175
+ Value = "value", NodeID = "name",
176
+ sinksRight = TRUE,
177
+ nodeWidth = 40,
178
+ nodePadding = 20,
179
+ colourScale = my_color,
180
+ fontSize = 12,
181
+ height = 500,
182
+ width = 800),
183
+ "sankey_plot.html")
184
+ setwd("C:/Users/admin/Desktop/article-analyzer")
185
+ # 加载必要的包
186
+ library(networkD3)
187
+ library(dplyr)
188
+ library(readr)
189
+ # 读取数据
190
+ data <- read_csv("data/picos_analysis.csv")
191
+ # 准备节点数据
192
+ nodes <- data.frame(
193
+ name = c(
194
+ "Model A True", "Model A False",
195
+ "Model B True", "Model B False",
196
+ "Model C True", "Model C False", "Model C NA",
197
+ "Final True", "Final False"
198
+ ),
199
+ group = c(
200
+ "A True", "A False",
201
+ "B True", "B False",
202
+ "C True", "C False", "C NA",
203
+ "F True", "F False"
204
+ )
205
+ )
206
+ # 计算流向
207
+ # A -> B
208
+ a_true_b_true <- sum(data$A_Decision & data$B_Decision, na.rm = TRUE)
209
+ a_true_b_false <- sum(data$A_Decision & !data$B_Decision, na.rm = TRUE)
210
+ a_false_b_true <- sum(!data$A_Decision & data$B_Decision, na.rm = TRUE)
211
+ a_false_b_false <- sum(!data$A_Decision & !data$B_Decision, na.rm = TRUE)
212
+ # B -> C
213
+ b_true_c_true <- sum(data$B_Decision & data$C_Decision, na.rm = TRUE)
214
+ b_true_c_false <- sum(data$B_Decision & !data$C_Decision, na.rm = TRUE)
215
+ b_true_c_na <- sum(data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
216
+ b_false_c_true <- sum(!data$B_Decision & data$C_Decision, na.rm = TRUE)
217
+ b_false_c_false <- sum(!data$B_Decision & !data$C_Decision, na.rm = TRUE)
218
+ b_false_c_na <- sum(!data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
219
+ # C -> Final
220
+ c_true_final_true <- sum(data$C_Decision & data$Final_Decision, na.rm = TRUE)
221
+ c_true_final_false <- sum(data$C_Decision & !data$Final_Decision, na.rm = TRUE)
222
+ c_false_final_true <- sum(!data$C_Decision & data$Final_Decision, na.rm = TRUE)
223
+ c_false_final_false <- sum(!data$C_Decision & !data$Final_Decision, na.rm = TRUE)
224
+ c_na_final_true <- sum(is.na(data$C_Decision) & data$Final_Decision, na.rm = TRUE)
225
+ c_na_final_false <- sum(is.na(data$C_Decision) & !data$Final_Decision, na.rm = TRUE)
226
+ # 准备链接数据
227
+ links <- data.frame(
228
+ source = c(
229
+ # A -> B
230
+ rep(0, 2), rep(1, 2),
231
+ # B -> C
232
+ rep(2, 3), rep(3, 3),
233
+ # C -> Final
234
+ rep(4, 2), rep(5, 2), rep(6, 2)
235
+ ),
236
+ target = c(
237
+ # A -> B
238
+ 2, 3, 2, 3,
239
+ # B -> C
240
+ 4, 5, 6, 4, 5, 6,
241
+ # C -> Final
242
+ 7, 8, 7, 8, 7, 8
243
+ ),
244
+ value = c(
245
+ # A -> B
246
+ a_true_b_true, a_true_b_false, a_false_b_true, a_false_b_false,
247
+ # B -> C
248
+ b_true_c_true, b_true_c_false, b_true_c_na,
249
+ b_false_c_true, b_false_c_false, b_false_c_na,
250
+ # C -> Final
251
+ c_true_final_true, c_true_final_false,
252
+ c_false_final_true, c_false_final_false,
253
+ c_na_final_true, c_na_final_false
254
+ )
255
+ )
256
+ # 创建颜色向量
257
+ my_color <- 'function(d) {
258
+ const colors = {
259
+ "Model A True": "#fbf8cc",
260
+ "Model A False": "#fde4cf",
261
+ "Model B True": "#FFCFD2",
262
+ "Model B False": "#F1C0E8",
263
+ "Model C True": "#CFBAF0",
264
+ "Model C False": "#A3C4F3",
265
+ "Model C NA": "#90DBF4",
266
+ "Final True": "#98F5E1",
267
+ "Final False": "#B9FBC0"
268
+ };
269
+ return colors[d.name] || "#cccccc";
270
+ }'
271
+ # 绘制桑基图
272
+ sankeyNetwork(Links = links, Nodes = nodes,
273
+ Source = "source", Target = "target",
274
+ Value = "value", NodeID = "name",
275
+ sinksRight = TRUE,
276
+ nodeWidth = 40,
277
+ nodePadding = 20,
278
+ colourScale = my_color,
279
+ fontSize = 12,
280
+ height = 500,
281
+ width = 800)
282
+ # 保存为HTML文件
283
+ saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
284
+ Source = "source", Target = "target",
285
+ Value = "value", NodeID = "name",
286
+ sinksRight = TRUE,
287
+ nodeWidth = 40,
288
+ nodePadding = 20,
289
+ colourScale = my_color,
290
+ fontSize = 12,
291
+ height = 500,
292
+ width = 800),
293
+ "sankey_plot.html")
294
+ setwd("C:/Users/admin/Desktop/article-analyzer")
295
+ # 加载必要的包
296
+ library(networkD3)
297
+ library(dplyr)
298
+ library(readr)
299
+ # 读取数据
300
+ data <- read_csv("data/picos_analysis.csv")
301
+ # 准备节点数据
302
+ nodes <- data.frame(
303
+ name = c(
304
+ "Model A True", "Model A False",
305
+ "Model B True", "Model B False",
306
+ "Model C True", "Model C False", "Model C NA",
307
+ "Final True", "Final False"
308
+ ),
309
+ group = c(
310
+ "A True", "A False",
311
+ "B True", "B False",
312
+ "C True", "C False", "C NA",
313
+ "F True", "F False"
314
+ )
315
+ )
316
+ # 计算流向
317
+ # A -> B
318
+ a_true_b_true <- sum(data$A_Decision & data$B_Decision, na.rm = TRUE)
319
+ a_true_b_false <- sum(data$A_Decision & !data$B_Decision, na.rm = TRUE)
320
+ a_false_b_true <- sum(!data$A_Decision & data$B_Decision, na.rm = TRUE)
321
+ a_false_b_false <- sum(!data$A_Decision & !data$B_Decision, na.rm = TRUE)
322
+ # B -> C
323
+ b_true_c_true <- sum(data$B_Decision & data$C_Decision, na.rm = TRUE)
324
+ b_true_c_false <- sum(data$B_Decision & !data$C_Decision, na.rm = TRUE)
325
+ b_true_c_na <- sum(data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
326
+ b_false_c_true <- sum(!data$B_Decision & data$C_Decision, na.rm = TRUE)
327
+ b_false_c_false <- sum(!data$B_Decision & !data$C_Decision, na.rm = TRUE)
328
+ b_false_c_na <- sum(!data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
329
+ # C -> Final
330
+ c_true_final_true <- sum(data$C_Decision & data$Final_Decision, na.rm = TRUE)
331
+ c_true_final_false <- sum(data$C_Decision & !data$Final_Decision, na.rm = TRUE)
332
+ c_false_final_true <- sum(!data$C_Decision & data$Final_Decision, na.rm = TRUE)
333
+ c_false_final_false <- sum(!data$C_Decision & !data$Final_Decision, na.rm = TRUE)
334
+ c_na_final_true <- sum(is.na(data$C_Decision) & data$Final_Decision, na.rm = TRUE)
335
+ c_na_final_false <- sum(is.na(data$C_Decision) & !data$Final_Decision, na.rm = TRUE)
336
+ # 准备链接数据
337
+ links <- data.frame(
338
+ source = c(
339
+ # A -> B
340
+ rep(0, 2), rep(1, 2),
341
+ # B -> C
342
+ rep(2, 3), rep(3, 3),
343
+ # C -> Final
344
+ rep(4, 2), rep(5, 2), rep(6, 2)
345
+ ),
346
+ target = c(
347
+ # A -> B
348
+ 2, 3, 2, 3,
349
+ # B -> C
350
+ 4, 5, 6, 4, 5, 6,
351
+ # C -> Final
352
+ 7, 8, 7, 8, 7, 8
353
+ ),
354
+ value = c(
355
+ # A -> B
356
+ a_true_b_true, a_true_b_false, a_false_b_true, a_false_b_false,
357
+ # B -> C
358
+ b_true_c_true, b_true_c_false, b_true_c_na,
359
+ b_false_c_true, b_false_c_false, b_false_c_na,
360
+ # C -> Final
361
+ c_true_final_true, c_true_final_false,
362
+ c_false_final_true, c_false_final_false,
363
+ c_na_final_true, c_na_final_false
364
+ )
365
+ )
366
+ # 创建颜色向量
367
+ my_color <- 'function(d) {
368
+ const colors = {
369
+ "Model A True": "#fbf8cc",
370
+ "Model A False": "#fde4cf",
371
+ "Model B True": "#FFCFD2",
372
+ "Model B False": "#F1C0E8",
373
+ "Model C True": "#CFBAF0",
374
+ "Model C False": "#A3C4F3",
375
+ "Model C NA": "#90DBF4",
376
+ "Final True": "#98F5E1",
377
+ "Final False": "#B9FBC0"
378
+ };
379
+ return colors[d.name] || "#cccccc";
380
+ }'
381
+ # 绘制桑基图
382
+ sankeyNetwork(Links = links, Nodes = nodes,
383
+ Source = "source", Target = "target",
384
+ Value = "value", NodeID = "name",
385
+ sinksRight = TRUE,
386
+ nodeWidth = 40,
387
+ nodePadding = 20,
388
+ colourScale = my_color,
389
+ fontSize = 12,
390
+ height = 500,
391
+ width = 800)
392
+ # 保存为HTML文件
393
+ saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
394
+ Source = "source", Target = "target",
395
+ Value = "value", NodeID = "name",
396
+ sinksRight = TRUE,
397
+ nodeWidth = 40,
398
+ nodePadding = 20,
399
+ colourScale = my_color,
400
+ fontSize = 12,
401
+ height = 500,
402
+ width = 800),
403
+ "sankey_plot.html")
404
+ setwd("C:/Users/admin/Desktop/article-analyzer")
405
+ # 加载必要的包
406
+ library(networkD3)
407
+ library(dplyr)
408
+ library(readr)
409
+ # 读取数据
410
+ data <- read_csv("data/picos_analysis.csv")
411
+ # 准备节点数据
412
+ nodes <- data.frame(
413
+ name = c(
414
+ "Model A True", "Model A False",
415
+ "Model B True", "Model B False",
416
+ "Model C True", "Model C False", "Model C NA",
417
+ "Final True", "Final False"
418
+ ),
419
+ group = c(
420
+ "A True", "A False",
421
+ "B True", "B False",
422
+ "C True", "C False", "C NA",
423
+ "F True", "F False"
424
+ )
425
+ )
426
+ # 计算流向
427
+ # A -> B
428
+ a_true_b_true <- sum(data$A_Decision & data$B_Decision, na.rm = TRUE)
429
+ a_true_b_false <- sum(data$A_Decision & !data$B_Decision, na.rm = TRUE)
430
+ a_false_b_true <- sum(!data$A_Decision & data$B_Decision, na.rm = TRUE)
431
+ a_false_b_false <- sum(!data$A_Decision & !data$B_Decision, na.rm = TRUE)
432
+ # B -> C
433
+ b_true_c_true <- sum(data$B_Decision & data$C_Decision, na.rm = TRUE)
434
+ b_true_c_false <- sum(data$B_Decision & !data$C_Decision, na.rm = TRUE)
435
+ b_true_c_na <- sum(data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
436
+ b_false_c_true <- sum(!data$B_Decision & data$C_Decision, na.rm = TRUE)
437
+ b_false_c_false <- sum(!data$B_Decision & !data$C_Decision, na.rm = TRUE)
438
+ b_false_c_na <- sum(!data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
439
+ # C -> Final
440
+ c_true_final_true <- sum(data$C_Decision & data$Final_Decision, na.rm = TRUE)
441
+ c_true_final_false <- sum(data$C_Decision & !data$Final_Decision, na.rm = TRUE)
442
+ c_false_final_true <- sum(!data$C_Decision & data$Final_Decision, na.rm = TRUE)
443
+ c_false_final_false <- sum(!data$C_Decision & !data$Final_Decision, na.rm = TRUE)
444
+ c_na_final_true <- sum(is.na(data$C_Decision) & data$Final_Decision, na.rm = TRUE)
445
+ c_na_final_false <- sum(is.na(data$C_Decision) & !data$Final_Decision, na.rm = TRUE)
446
+ # 准备链接数据
447
+ links <- data.frame(
448
+ source = c(
449
+ # A -> B
450
+ rep(0, 2), rep(1, 2),
451
+ # B -> C
452
+ rep(2, 3), rep(3, 3),
453
+ # C -> Final
454
+ rep(4, 2), rep(5, 2), rep(6, 2)
455
+ ),
456
+ target = c(
457
+ # A -> B
458
+ 2, 3, 2, 3,
459
+ # B -> C
460
+ 4, 5, 6, 4, 5, 6,
461
+ # C -> Final
462
+ 7, 8, 7, 8, 7, 8
463
+ ),
464
+ value = c(
465
+ # A -> B
466
+ a_true_b_true, a_true_b_false, a_false_b_true, a_false_b_false,
467
+ # B -> C
468
+ b_true_c_true, b_true_c_false, b_true_c_na,
469
+ b_false_c_true, b_false_c_false, b_false_c_na,
470
+ # C -> Final
471
+ c_true_final_true, c_true_final_false,
472
+ c_false_final_true, c_false_final_false,
473
+ c_na_final_true, c_na_final_false
474
+ )
475
+ )
476
+ # 创建颜色向量
477
+ my_color <- paste0(
478
+ 'd3.scaleOrdinal()
479
+ .domain(["A True", "A False",
480
+ "B True", "B False",
481
+ "C True", "C False", "C NA",
482
+ "F True", "F False"])
483
+ .range(["#fbf8cc", "#fde4cf",
484
+ "#FFCFD2", "#F1C0E8",
485
+ "#CFBAF0", "#A3C4F3", "#90DBF4",
486
+ "#98F5E1", "#B9FBC0"])'
487
+ )
488
+ # 绘制桑基图
489
+ sankeyNetwork(Links = links, Nodes = nodes,
490
+ Source = "source", Target = "target",
491
+ Value = "value", NodeID = "name",
492
+ NodeGroup = "group",
493
+ sinksRight = TRUE,
494
+ nodeWidth = 40,
495
+ nodePadding = 20,
496
+ colourScale = my_color,
497
+ fontSize = 12,
498
+ height = 500,
499
+ width = 800)
500
+ # 保存为HTML文件
501
+ saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
502
+ Source = "source", Target = "target",
503
+ Value = "value", NodeID = "name",
504
+ NodeGroup = "group",
505
+ sinksRight = TRUE,
506
+ nodeWidth = 40,
507
+ nodePadding = 20,
508
+ colourScale = my_color,
509
+ fontSize = 12,
510
+ height = 500,
511
+ width = 800),
512
+ "sankey_plot.html")
.env.example ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Primary Model Configuration (Model A)
2
+ # Used for initial screening and basic PICOS criteria evaluation
3
+ MODEL_A_API_URL=https://api.example.com/v1
4
+ MODEL_A_API_KEY=your_model_a_api_key
5
+ MODEL_A_MODEL_NAME=model-a-name
6
+ MODEL_A_TEMPERATURE=0.3
7
+ MODEL_A_MAX_TOKENS=16384
8
+ MODEL_A_BATCH_SIZE=10
9
+ MODEL_A_THREADS=8
10
+ MODEL_A_TIMEOUT=180
11
+ MODEL_A_IS_INFERENCE=false
12
+
13
+ # Secondary Model Configuration (Model B)
14
+ # Used for detailed analysis and verification of Model A results
15
+ MODEL_B_API_URL=https://api.example.com/v1
16
+ MODEL_B_API_KEY=your_model_b_api_key
17
+ MODEL_B_MODEL_NAME=model-b-name
18
+ MODEL_B_TEMPERATURE=0.3
19
+ MODEL_B_MAX_TOKENS=16384
20
+ MODEL_B_BATCH_SIZE=10
21
+ MODEL_B_THREADS=8
22
+ MODEL_B_TIMEOUT=180
23
+ MODEL_B_IS_INFERENCE=false
24
+
25
+ # Arbitration Model Configuration (Model C)
26
+ # Used to resolve conflicts between Model A and B results
27
+ MODEL_C_API_URL=https://api.example.com/v1
28
+ MODEL_C_API_KEY=your_model_c_api_key
29
+ MODEL_C_MODEL_NAME=model-c-name
30
+ MODEL_C_TEMPERATURE=0.3
31
+ MODEL_C_MAX_TOKENS=16384
32
+ MODEL_C_BATCH_SIZE=10
33
+ MODEL_C_THREADS=8
34
+ MODEL_C_TIMEOUT=180
35
+ MODEL_C_IS_INFERENCE=false
.github/workflows/deploy_to_hf_space.yml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # .github/workflows/deploy_to_hf_space.yml
2
+ name: Deploy Gradio to Hugging Face Spaces
3
+
4
+ on:
5
+ push:
6
+ branches:
7
+ - master
8
+ workflow_dispatch:
9
+
10
+ jobs:
11
+ deploy:
12
+ runs-on: ubuntu-latest
13
+ env:
14
+ HF_USERNAME: chitsanfei
15
+ SPACE_NAME: review-screening-analyzer
16
+
17
+ steps:
18
+ - name: Checkout repository
19
+ uses: actions/checkout@v3
20
+ with:
21
+ fetch-depth: 0
22
+ lfs: true
23
+
24
+ - name: Set up Python 3.8
25
+ uses: actions/setup-python@v4
26
+ with:
27
+ python-version: "3.8"
28
+
29
+ - name: Install dependencies
30
+ run: pip install -r requirements.txt
31
+
32
+ - name: Push to Hugging Face Space
33
+ # HF_TOKEN 需在仓库 Settings → Secrets 中配置
34
+ env:
35
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
36
+ run: |
37
+ # 配置 Git 用户信息
38
+ git config --global user.name "${{ github.actor }}"
39
+ git config --global user.email "${{ github.actor }}@users.noreply.github.com"
40
+ # 强制推送当前 HEAD 到远端 main 分支
41
+ git push -f \
42
+ https://$HF_USERNAME:${{ secrets.HF_TOKEN }}@huggingface.co/spaces/$HF_USERNAME/$SPACE_NAME \
43
+ HEAD:main
.gitignore ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # PyPI configuration file
171
+ .pypirc
172
+
173
+ # Environment variables
174
+ .env
175
+ .env.local
176
+ .env.*.local
177
+
178
+ # For HF
179
+ .static/banner.png
180
+ data/*.xlsx
181
+
LICENSE ADDED
@@ -0,0 +1,661 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GNU AFFERO GENERAL PUBLIC LICENSE
2
+ Version 3, 19 November 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+ Preamble
9
+
10
+ The GNU Affero General Public License is a free, copyleft license for
11
+ software and other kinds of works, specifically designed to ensure
12
+ cooperation with the community in the case of network server software.
13
+
14
+ The licenses for most software and other practical works are designed
15
+ to take away your freedom to share and change the works. By contrast,
16
+ our General Public Licenses are intended to guarantee your freedom to
17
+ share and change all versions of a program--to make sure it remains free
18
+ software for all its users.
19
+
20
+ When we speak of free software, we are referring to freedom, not
21
+ price. Our General Public Licenses are designed to make sure that you
22
+ have the freedom to distribute copies of free software (and charge for
23
+ them if you wish), that you receive source code or can get it if you
24
+ want it, that you can change the software or use pieces of it in new
25
+ free programs, and that you know you can do these things.
26
+
27
+ Developers that use our General Public Licenses protect your rights
28
+ with two steps: (1) assert copyright on the software, and (2) offer
29
+ you this License which gives you legal permission to copy, distribute
30
+ and/or modify the software.
31
+
32
+ A secondary benefit of defending all users' freedom is that
33
+ improvements made in alternate versions of the program, if they
34
+ receive widespread use, become available for other developers to
35
+ incorporate. Many developers of free software are heartened and
36
+ encouraged by the resulting cooperation. However, in the case of
37
+ software used on network servers, this result may fail to come about.
38
+ The GNU General Public License permits making a modified version and
39
+ letting the public access it on a server without ever releasing its
40
+ source code to the public.
41
+
42
+ The GNU Affero General Public License is designed specifically to
43
+ ensure that, in such cases, the modified source code becomes available
44
+ to the community. It requires the operator of a network server to
45
+ provide the source code of the modified version running there to the
46
+ users of that server. Therefore, public use of a modified version, on
47
+ a publicly accessible server, gives the public access to the source
48
+ code of the modified version.
49
+
50
+ An older license, called the Affero General Public License and
51
+ published by Affero, was designed to accomplish similar goals. This is
52
+ a different license, not a version of the Affero GPL, but Affero has
53
+ released a new version of the Affero GPL which permits relicensing under
54
+ this license.
55
+
56
+ The precise terms and conditions for copying, distribution and
57
+ modification follow.
58
+
59
+ TERMS AND CONDITIONS
60
+
61
+ 0. Definitions.
62
+
63
+ "This License" refers to version 3 of the GNU Affero General Public License.
64
+
65
+ "Copyright" also means copyright-like laws that apply to other kinds of
66
+ works, such as semiconductor masks.
67
+
68
+ "The Program" refers to any copyrightable work licensed under this
69
+ License. Each licensee is addressed as "you". "Licensees" and
70
+ "recipients" may be individuals or organizations.
71
+
72
+ To "modify" a work means to copy from or adapt all or part of the work
73
+ in a fashion requiring copyright permission, other than the making of an
74
+ exact copy. The resulting work is called a "modified version" of the
75
+ earlier work or a work "based on" the earlier work.
76
+
77
+ A "covered work" means either the unmodified Program or a work based
78
+ on the Program.
79
+
80
+ To "propagate" a work means to do anything with it that, without
81
+ permission, would make you directly or secondarily liable for
82
+ infringement under applicable copyright law, except executing it on a
83
+ computer or modifying a private copy. Propagation includes copying,
84
+ distribution (with or without modification), making available to the
85
+ public, and in some countries other activities as well.
86
+
87
+ To "convey" a work means any kind of propagation that enables other
88
+ parties to make or receive copies. Mere interaction with a user through
89
+ a computer network, with no transfer of a copy, is not conveying.
90
+
91
+ An interactive user interface displays "Appropriate Legal Notices"
92
+ to the extent that it includes a convenient and prominently visible
93
+ feature that (1) displays an appropriate copyright notice, and (2)
94
+ tells the user that there is no warranty for the work (except to the
95
+ extent that warranties are provided), that licensees may convey the
96
+ work under this License, and how to view a copy of this License. If
97
+ the interface presents a list of user commands or options, such as a
98
+ menu, a prominent item in the list meets this criterion.
99
+
100
+ 1. Source Code.
101
+
102
+ The "source code" for a work means the preferred form of the work
103
+ for making modifications to it. "Object code" means any non-source
104
+ form of a work.
105
+
106
+ A "Standard Interface" means an interface that either is an official
107
+ standard defined by a recognized standards body, or, in the case of
108
+ interfaces specified for a particular programming language, one that
109
+ is widely used among developers working in that language.
110
+
111
+ The "System Libraries" of an executable work include anything, other
112
+ than the work as a whole, that (a) is included in the normal form of
113
+ packaging a Major Component, but which is not part of that Major
114
+ Component, and (b) serves only to enable use of the work with that
115
+ Major Component, or to implement a Standard Interface for which an
116
+ implementation is available to the public in source code form. A
117
+ "Major Component", in this context, means a major essential component
118
+ (kernel, window system, and so on) of the specific operating system
119
+ (if any) on which the executable work runs, or a compiler used to
120
+ produce the work, or an object code interpreter used to run it.
121
+
122
+ The "Corresponding Source" for a work in object code form means all
123
+ the source code needed to generate, install, and (for an executable
124
+ work) run the object code and to modify the work, including scripts to
125
+ control those activities. However, it does not include the work's
126
+ System Libraries, or general-purpose tools or generally available free
127
+ programs which are used unmodified in performing those activities but
128
+ which are not part of the work. For example, Corresponding Source
129
+ includes interface definition files associated with source files for
130
+ the work, and the source code for shared libraries and dynamically
131
+ linked subprograms that the work is specifically designed to require,
132
+ such as by intimate data communication or control flow between those
133
+ subprograms and other parts of the work.
134
+
135
+ The Corresponding Source need not include anything that users
136
+ can regenerate automatically from other parts of the Corresponding
137
+ Source.
138
+
139
+ The Corresponding Source for a work in source code form is that
140
+ same work.
141
+
142
+ 2. Basic Permissions.
143
+
144
+ All rights granted under this License are granted for the term of
145
+ copyright on the Program, and are irrevocable provided the stated
146
+ conditions are met. This License explicitly affirms your unlimited
147
+ permission to run the unmodified Program. The output from running a
148
+ covered work is covered by this License only if the output, given its
149
+ content, constitutes a covered work. This License acknowledges your
150
+ rights of fair use or other equivalent, as provided by copyright law.
151
+
152
+ You may make, run and propagate covered works that you do not
153
+ convey, without conditions so long as your license otherwise remains
154
+ in force. You may convey covered works to others for the sole purpose
155
+ of having them make modifications exclusively for you, or provide you
156
+ with facilities for running those works, provided that you comply with
157
+ the terms of this License in conveying all material for which you do
158
+ not control copyright. Those thus making or running the covered works
159
+ for you must do so exclusively on your behalf, under your direction
160
+ and control, on terms that prohibit them from making any copies of
161
+ your copyrighted material outside their relationship with you.
162
+
163
+ Conveying under any other circumstances is permitted solely under
164
+ the conditions stated below. Sublicensing is not allowed; section 10
165
+ makes it unnecessary.
166
+
167
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168
+
169
+ No covered work shall be deemed part of an effective technological
170
+ measure under any applicable law fulfilling obligations under article
171
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172
+ similar laws prohibiting or restricting circumvention of such
173
+ measures.
174
+
175
+ When you convey a covered work, you waive any legal power to forbid
176
+ circumvention of technological measures to the extent such circumvention
177
+ is effected by exercising rights under this License with respect to
178
+ the covered work, and you disclaim any intention to limit operation or
179
+ modification of the work as a means of enforcing, against the work's
180
+ users, your or third parties' legal rights to forbid circumvention of
181
+ technological measures.
182
+
183
+ 4. Conveying Verbatim Copies.
184
+
185
+ You may convey verbatim copies of the Program's source code as you
186
+ receive it, in any medium, provided that you conspicuously and
187
+ appropriately publish on each copy an appropriate copyright notice;
188
+ keep intact all notices stating that this License and any
189
+ non-permissive terms added in accord with section 7 apply to the code;
190
+ keep intact all notices of the absence of any warranty; and give all
191
+ recipients a copy of this License along with the Program.
192
+
193
+ You may charge any price or no price for each copy that you convey,
194
+ and you may offer support or warranty protection for a fee.
195
+
196
+ 5. Conveying Modified Source Versions.
197
+
198
+ You may convey a work based on the Program, or the modifications to
199
+ produce it from the Program, in the form of source code under the
200
+ terms of section 4, provided that you also meet all of these conditions:
201
+
202
+ a) The work must carry prominent notices stating that you modified
203
+ it, and giving a relevant date.
204
+
205
+ b) The work must carry prominent notices stating that it is
206
+ released under this License and any conditions added under section
207
+ 7. This requirement modifies the requirement in section 4 to
208
+ "keep intact all notices".
209
+
210
+ c) You must license the entire work, as a whole, under this
211
+ License to anyone who comes into possession of a copy. This
212
+ License will therefore apply, along with any applicable section 7
213
+ additional terms, to the whole of the work, and all its parts,
214
+ regardless of how they are packaged. This License gives no
215
+ permission to license the work in any other way, but it does not
216
+ invalidate such permission if you have separately received it.
217
+
218
+ d) If the work has interactive user interfaces, each must display
219
+ Appropriate Legal Notices; however, if the Program has interactive
220
+ interfaces that do not display Appropriate Legal Notices, your
221
+ work need not make them do so.
222
+
223
+ A compilation of a covered work with other separate and independent
224
+ works, which are not by their nature extensions of the covered work,
225
+ and which are not combined with it such as to form a larger program,
226
+ in or on a volume of a storage or distribution medium, is called an
227
+ "aggregate" if the compilation and its resulting copyright are not
228
+ used to limit the access or legal rights of the compilation's users
229
+ beyond what the individual works permit. Inclusion of a covered work
230
+ in an aggregate does not cause this License to apply to the other
231
+ parts of the aggregate.
232
+
233
+ 6. Conveying Non-Source Forms.
234
+
235
+ You may convey a covered work in object code form under the terms
236
+ of sections 4 and 5, provided that you also convey the
237
+ machine-readable Corresponding Source under the terms of this License,
238
+ in one of these ways:
239
+
240
+ a) Convey the object code in, or embodied in, a physical product
241
+ (including a physical distribution medium), accompanied by the
242
+ Corresponding Source fixed on a durable physical medium
243
+ customarily used for software interchange.
244
+
245
+ b) Convey the object code in, or embodied in, a physical product
246
+ (including a physical distribution medium), accompanied by a
247
+ written offer, valid for at least three years and valid for as
248
+ long as you offer spare parts or customer support for that product
249
+ model, to give anyone who possesses the object code either (1) a
250
+ copy of the Corresponding Source for all the software in the
251
+ product that is covered by this License, on a durable physical
252
+ medium customarily used for software interchange, for a price no
253
+ more than your reasonable cost of physically performing this
254
+ conveying of source, or (2) access to copy the
255
+ Corresponding Source from a network server at no charge.
256
+
257
+ c) Convey individual copies of the object code with a copy of the
258
+ written offer to provide the Corresponding Source. This
259
+ alternative is allowed only occasionally and noncommercially, and
260
+ only if you received the object code with such an offer, in accord
261
+ with subsection 6b.
262
+
263
+ d) Convey the object code by offering access from a designated
264
+ place (gratis or for a charge), and offer equivalent access to the
265
+ Corresponding Source in the same way through the same place at no
266
+ further charge. You need not require recipients to copy the
267
+ Corresponding Source along with the object code. If the place to
268
+ copy the object code is a network server, the Corresponding Source
269
+ may be on a different server (operated by you or a third party)
270
+ that supports equivalent copying facilities, provided you maintain
271
+ clear directions next to the object code saying where to find the
272
+ Corresponding Source. Regardless of what server hosts the
273
+ Corresponding Source, you remain obligated to ensure that it is
274
+ available for as long as needed to satisfy these requirements.
275
+
276
+ e) Convey the object code using peer-to-peer transmission, provided
277
+ you inform other peers where the object code and Corresponding
278
+ Source of the work are being offered to the general public at no
279
+ charge under subsection 6d.
280
+
281
+ A separable portion of the object code, whose source code is excluded
282
+ from the Corresponding Source as a System Library, need not be
283
+ included in conveying the object code work.
284
+
285
+ A "User Product" is either (1) a "consumer product", which means any
286
+ tangible personal property which is normally used for personal, family,
287
+ or household purposes, or (2) anything designed or sold for incorporation
288
+ into a dwelling. In determining whether a product is a consumer product,
289
+ doubtful cases shall be resolved in favor of coverage. For a particular
290
+ product received by a particular user, "normally used" refers to a
291
+ typical or common use of that class of product, regardless of the status
292
+ of the particular user or of the way in which the particular user
293
+ actually uses, or expects or is expected to use, the product. A product
294
+ is a consumer product regardless of whether the product has substantial
295
+ commercial, industrial or non-consumer uses, unless such uses represent
296
+ the only significant mode of use of the product.
297
+
298
+ "Installation Information" for a User Product means any methods,
299
+ procedures, authorization keys, or other information required to install
300
+ and execute modified versions of a covered work in that User Product from
301
+ a modified version of its Corresponding Source. The information must
302
+ suffice to ensure that the continued functioning of the modified object
303
+ code is in no case prevented or interfered with solely because
304
+ modification has been made.
305
+
306
+ If you convey an object code work under this section in, or with, or
307
+ specifically for use in, a User Product, and the conveying occurs as
308
+ part of a transaction in which the right of possession and use of the
309
+ User Product is transferred to the recipient in perpetuity or for a
310
+ fixed term (regardless of how the transaction is characterized), the
311
+ Corresponding Source conveyed under this section must be accompanied
312
+ by the Installation Information. But this requirement does not apply
313
+ if neither you nor any third party retains the ability to install
314
+ modified object code on the User Product (for example, the work has
315
+ been installed in ROM).
316
+
317
+ The requirement to provide Installation Information does not include a
318
+ requirement to continue to provide support service, warranty, or updates
319
+ for a work that has been modified or installed by the recipient, or for
320
+ the User Product in which it has been modified or installed. Access to a
321
+ network may be denied when the modification itself materially and
322
+ adversely affects the operation of the network or violates the rules and
323
+ protocols for communication across the network.
324
+
325
+ Corresponding Source conveyed, and Installation Information provided,
326
+ in accord with this section must be in a format that is publicly
327
+ documented (and with an implementation available to the public in
328
+ source code form), and must require no special password or key for
329
+ unpacking, reading or copying.
330
+
331
+ 7. Additional Terms.
332
+
333
+ "Additional permissions" are terms that supplement the terms of this
334
+ License by making exceptions from one or more of its conditions.
335
+ Additional permissions that are applicable to the entire Program shall
336
+ be treated as though they were included in this License, to the extent
337
+ that they are valid under applicable law. If additional permissions
338
+ apply only to part of the Program, that part may be used separately
339
+ under those permissions, but the entire Program remains governed by
340
+ this License without regard to the additional permissions.
341
+
342
+ When you convey a copy of a covered work, you may at your option
343
+ remove any additional permissions from that copy, or from any part of
344
+ it. (Additional permissions may be written to require their own
345
+ removal in certain cases when you modify the work.) You may place
346
+ additional permissions on material, added by you to a covered work,
347
+ for which you have or can give appropriate copyright permission.
348
+
349
+ Notwithstanding any other provision of this License, for material you
350
+ add to a covered work, you may (if authorized by the copyright holders of
351
+ that material) supplement the terms of this License with terms:
352
+
353
+ a) Disclaiming warranty or limiting liability differently from the
354
+ terms of sections 15 and 16 of this License; or
355
+
356
+ b) Requiring preservation of specified reasonable legal notices or
357
+ author attributions in that material or in the Appropriate Legal
358
+ Notices displayed by works containing it; or
359
+
360
+ c) Prohibiting misrepresentation of the origin of that material, or
361
+ requiring that modified versions of such material be marked in
362
+ reasonable ways as different from the original version; or
363
+
364
+ d) Limiting the use for publicity purposes of names of licensors or
365
+ authors of the material; or
366
+
367
+ e) Declining to grant rights under trademark law for use of some
368
+ trade names, trademarks, or service marks; or
369
+
370
+ f) Requiring indemnification of licensors and authors of that
371
+ material by anyone who conveys the material (or modified versions of
372
+ it) with contractual assumptions of liability to the recipient, for
373
+ any liability that these contractual assumptions directly impose on
374
+ those licensors and authors.
375
+
376
+ All other non-permissive additional terms are considered "further
377
+ restrictions" within the meaning of section 10. If the Program as you
378
+ received it, or any part of it, contains a notice stating that it is
379
+ governed by this License along with a term that is a further
380
+ restriction, you may remove that term. If a license document contains
381
+ a further restriction but permits relicensing or conveying under this
382
+ License, you may add to a covered work material governed by the terms
383
+ of that license document, provided that the further restriction does
384
+ not survive such relicensing or conveying.
385
+
386
+ If you add terms to a covered work in accord with this section, you
387
+ must place, in the relevant source files, a statement of the
388
+ additional terms that apply to those files, or a notice indicating
389
+ where to find the applicable terms.
390
+
391
+ Additional terms, permissive or non-permissive, may be stated in the
392
+ form of a separately written license, or stated as exceptions;
393
+ the above requirements apply either way.
394
+
395
+ 8. Termination.
396
+
397
+ You may not propagate or modify a covered work except as expressly
398
+ provided under this License. Any attempt otherwise to propagate or
399
+ modify it is void, and will automatically terminate your rights under
400
+ this License (including any patent licenses granted under the third
401
+ paragraph of section 11).
402
+
403
+ However, if you cease all violation of this License, then your
404
+ license from a particular copyright holder is reinstated (a)
405
+ provisionally, unless and until the copyright holder explicitly and
406
+ finally terminates your license, and (b) permanently, if the copyright
407
+ holder fails to notify you of the violation by some reasonable means
408
+ prior to 60 days after the cessation.
409
+
410
+ Moreover, your license from a particular copyright holder is
411
+ reinstated permanently if the copyright holder notifies you of the
412
+ violation by some reasonable means, this is the first time you have
413
+ received notice of violation of this License (for any work) from that
414
+ copyright holder, and you cure the violation prior to 30 days after
415
+ your receipt of the notice.
416
+
417
+ Termination of your rights under this section does not terminate the
418
+ licenses of parties who have received copies or rights from you under
419
+ this License. If your rights have been terminated and not permanently
420
+ reinstated, you do not qualify to receive new licenses for the same
421
+ material under section 10.
422
+
423
+ 9. Acceptance Not Required for Having Copies.
424
+
425
+ You are not required to accept this License in order to receive or
426
+ run a copy of the Program. Ancillary propagation of a covered work
427
+ occurring solely as a consequence of using peer-to-peer transmission
428
+ to receive a copy likewise does not require acceptance. However,
429
+ nothing other than this License grants you permission to propagate or
430
+ modify any covered work. These actions infringe copyright if you do
431
+ not accept this License. Therefore, by modifying or propagating a
432
+ covered work, you indicate your acceptance of this License to do so.
433
+
434
+ 10. Automatic Licensing of Downstream Recipients.
435
+
436
+ Each time you convey a covered work, the recipient automatically
437
+ receives a license from the original licensors, to run, modify and
438
+ propagate that work, subject to this License. You are not responsible
439
+ for enforcing compliance by third parties with this License.
440
+
441
+ An "entity transaction" is a transaction transferring control of an
442
+ organization, or substantially all assets of one, or subdividing an
443
+ organization, or merging organizations. If propagation of a covered
444
+ work results from an entity transaction, each party to that
445
+ transaction who receives a copy of the work also receives whatever
446
+ licenses to the work the party's predecessor in interest had or could
447
+ give under the previous paragraph, plus a right to possession of the
448
+ Corresponding Source of the work from the predecessor in interest, if
449
+ the predecessor has it or can get it with reasonable efforts.
450
+
451
+ You may not impose any further restrictions on the exercise of the
452
+ rights granted or affirmed under this License. For example, you may
453
+ not impose a license fee, royalty, or other charge for exercise of
454
+ rights granted under this License, and you may not initiate litigation
455
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
456
+ any patent claim is infringed by making, using, selling, offering for
457
+ sale, or importing the Program or any portion of it.
458
+
459
+ 11. Patents.
460
+
461
+ A "contributor" is a copyright holder who authorizes use under this
462
+ License of the Program or a work on which the Program is based. The
463
+ work thus licensed is called the contributor's "contributor version".
464
+
465
+ A contributor's "essential patent claims" are all patent claims
466
+ owned or controlled by the contributor, whether already acquired or
467
+ hereafter acquired, that would be infringed by some manner, permitted
468
+ by this License, of making, using, or selling its contributor version,
469
+ but do not include claims that would be infringed only as a
470
+ consequence of further modification of the contributor version. For
471
+ purposes of this definition, "control" includes the right to grant
472
+ patent sublicenses in a manner consistent with the requirements of
473
+ this License.
474
+
475
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
476
+ patent license under the contributor's essential patent claims, to
477
+ make, use, sell, offer for sale, import and otherwise run, modify and
478
+ propagate the contents of its contributor version.
479
+
480
+ In the following three paragraphs, a "patent license" is any express
481
+ agreement or commitment, however denominated, not to enforce a patent
482
+ (such as an express permission to practice a patent or covenant not to
483
+ sue for patent infringement). To "grant" such a patent license to a
484
+ party means to make such an agreement or commitment not to enforce a
485
+ patent against the party.
486
+
487
+ If you convey a covered work, knowingly relying on a patent license,
488
+ and the Corresponding Source of the work is not available for anyone
489
+ to copy, free of charge and under the terms of this License, through a
490
+ publicly available network server or other readily accessible means,
491
+ then you must either (1) cause the Corresponding Source to be so
492
+ available, or (2) arrange to deprive yourself of the benefit of the
493
+ patent license for this particular work, or (3) arrange, in a manner
494
+ consistent with the requirements of this License, to extend the patent
495
+ license to downstream recipients. "Knowingly relying" means you have
496
+ actual knowledge that, but for the patent license, your conveying the
497
+ covered work in a country, or your recipient's use of the covered work
498
+ in a country, would infringe one or more identifiable patents in that
499
+ country that you have reason to believe are valid.
500
+
501
+ If, pursuant to or in connection with a single transaction or
502
+ arrangement, you convey, or propagate by procuring conveyance of, a
503
+ covered work, and grant a patent license to some of the parties
504
+ receiving the covered work authorizing them to use, propagate, modify
505
+ or convey a specific copy of the covered work, then the patent license
506
+ you grant is automatically extended to all recipients of the covered
507
+ work and works based on it.
508
+
509
+ A patent license is "discriminatory" if it does not include within
510
+ the scope of its coverage, prohibits the exercise of, or is
511
+ conditioned on the non-exercise of one or more of the rights that are
512
+ specifically granted under this License. You may not convey a covered
513
+ work if you are a party to an arrangement with a third party that is
514
+ in the business of distributing software, under which you make payment
515
+ to the third party based on the extent of your activity of conveying
516
+ the work, and under which the third party grants, to any of the
517
+ parties who would receive the covered work from you, a discriminatory
518
+ patent license (a) in connection with copies of the covered work
519
+ conveyed by you (or copies made from those copies), or (b) primarily
520
+ for and in connection with specific products or compilations that
521
+ contain the covered work, unless you entered into that arrangement,
522
+ or that patent license was granted, prior to 28 March 2007.
523
+
524
+ Nothing in this License shall be construed as excluding or limiting
525
+ any implied license or other defenses to infringement that may
526
+ otherwise be available to you under applicable patent law.
527
+
528
+ 12. No Surrender of Others' Freedom.
529
+
530
+ If conditions are imposed on you (whether by court order, agreement or
531
+ otherwise) that contradict the conditions of this License, they do not
532
+ excuse you from the conditions of this License. If you cannot convey a
533
+ covered work so as to satisfy simultaneously your obligations under this
534
+ License and any other pertinent obligations, then as a consequence you may
535
+ not convey it at all. For example, if you agree to terms that obligate you
536
+ to collect a royalty for further conveying from those to whom you convey
537
+ the Program, the only way you could satisfy both those terms and this
538
+ License would be to refrain entirely from conveying the Program.
539
+
540
+ 13. Remote Network Interaction; Use with the GNU General Public License.
541
+
542
+ Notwithstanding any other provision of this License, if you modify the
543
+ Program, your modified version must prominently offer all users
544
+ interacting with it remotely through a computer network (if your version
545
+ supports such interaction) an opportunity to receive the Corresponding
546
+ Source of your version by providing access to the Corresponding Source
547
+ from a network server at no charge, through some standard or customary
548
+ means of facilitating copying of software. This Corresponding Source
549
+ shall include the Corresponding Source for any work covered by version 3
550
+ of the GNU General Public License that is incorporated pursuant to the
551
+ following paragraph.
552
+
553
+ Notwithstanding any other provision of this License, you have
554
+ permission to link or combine any covered work with a work licensed
555
+ under version 3 of the GNU General Public License into a single
556
+ combined work, and to convey the resulting work. The terms of this
557
+ License will continue to apply to the part which is the covered work,
558
+ but the work with which it is combined will remain governed by version
559
+ 3 of the GNU General Public License.
560
+
561
+ 14. Revised Versions of this License.
562
+
563
+ The Free Software Foundation may publish revised and/or new versions of
564
+ the GNU Affero General Public License from time to time. Such new versions
565
+ will be similar in spirit to the present version, but may differ in detail to
566
+ address new problems or concerns.
567
+
568
+ Each version is given a distinguishing version number. If the
569
+ Program specifies that a certain numbered version of the GNU Affero General
570
+ Public License "or any later version" applies to it, you have the
571
+ option of following the terms and conditions either of that numbered
572
+ version or of any later version published by the Free Software
573
+ Foundation. If the Program does not specify a version number of the
574
+ GNU Affero General Public License, you may choose any version ever published
575
+ by the Free Software Foundation.
576
+
577
+ If the Program specifies that a proxy can decide which future
578
+ versions of the GNU Affero General Public License can be used, that proxy's
579
+ public statement of acceptance of a version permanently authorizes you
580
+ to choose that version for the Program.
581
+
582
+ Later license versions may give you additional or different
583
+ permissions. However, no additional obligations are imposed on any
584
+ author or copyright holder as a result of your choosing to follow a
585
+ later version.
586
+
587
+ 15. Disclaimer of Warranty.
588
+
589
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597
+
598
+ 16. Limitation of Liability.
599
+
600
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608
+ SUCH DAMAGES.
609
+
610
+ 17. Interpretation of Sections 15 and 16.
611
+
612
+ If the disclaimer of warranty and limitation of liability provided
613
+ above cannot be given local legal effect according to their terms,
614
+ reviewing courts shall apply local law that most closely approximates
615
+ an absolute waiver of all civil liability in connection with the
616
+ Program, unless a warranty or assumption of liability accompanies a
617
+ copy of the Program in return for a fee.
618
+
619
+ END OF TERMS AND CONDITIONS
620
+
621
+ How to Apply These Terms to Your New Programs
622
+
623
+ If you develop a new program, and you want it to be of the greatest
624
+ possible use to the public, the best way to achieve this is to make it
625
+ free software which everyone can redistribute and change under these terms.
626
+
627
+ To do so, attach the following notices to the program. It is safest
628
+ to attach them to the start of each source file to most effectively
629
+ state the exclusion of warranty; and each file should have at least
630
+ the "copyright" line and a pointer to where the full notice is found.
631
+
632
+ <one line to give the program's name and a brief idea of what it does.>
633
+ Copyright (C) <year> <name of author>
634
+
635
+ This program is free software: you can redistribute it and/or modify
636
+ it under the terms of the GNU Affero General Public License as published
637
+ by the Free Software Foundation, either version 3 of the License, or
638
+ (at your option) any later version.
639
+
640
+ This program is distributed in the hope that it will be useful,
641
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
642
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643
+ GNU Affero General Public License for more details.
644
+
645
+ You should have received a copy of the GNU Affero General Public License
646
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
647
+
648
+ Also add information on how to contact you by electronic and paper mail.
649
+
650
+ If your software can interact with users remotely through a computer
651
+ network, you should also make sure that it provides a way for users to
652
+ get its source. For example, if your program is a web application, its
653
+ interface could display a "Source" link that leads users to an archive
654
+ of the code. There are many ways you could offer source, and different
655
+ solutions will be better for different programs; see section 13 for the
656
+ specific requirements.
657
+
658
+ You should also get your employer (if you work as a programmer) or school,
659
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
660
+ For more information on this, and how to apply and follow the GNU AGPL, see
661
+ <https://www.gnu.org/licenses/>.
README.md ADDED
@@ -0,0 +1,97 @@
1
+ ---
2
+ title: Review Screening Analyzer
3
+ emoji: 📚
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: "5.39.0"
8
+ app_file: app.py
9
+ pinned: true
10
+ ---
11
+
12
+ <div align="center">
13
+ <hr>
14
+ <h1>Review Screening Analyzer</h1>
15
+ <b>A Simple Literature Filtering Tool</b>
16
+ </div>
17
+
18
+ ---
19
+
20
+ > [!important]
21
+ > This project is currently under development and marked as research in progress; do not use it without the authors' permission.
22
+
23
+ > [!important]
24
+ > This is the demo code for the paper "Automated Literature Screening for Hepatocellular Carcinoma Treatment: Integrating Three Large Language Models", published in the Journal of Medical Internet Research Medical Informatics.
25
+
26
+ ## Table of Contents
27
+
28
+ - [Introduction](#introduction)
29
+ - [File Structure](#file-structure)
+ - [Usage](#usage)
30
+ - [License](#license)
31
+ - [Contact Information](#contact-information)
32
+
33
+ ---
34
+
35
+ ## Introduction
36
+
37
+ Review Screening Analyzer is a literature screening tool that combines three large language models to decide the inclusion or exclusion of studies in systematic reviews against PICOS criteria.
38
+
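+ As a reference for how the screening criteria are expressed, the default PICOS criteria in `analyzer.py` are a plain Python dictionary; the values below are the defaults shipped with this demo and can be replaced from the PICOS Criteria tab of the web interface:
+ 
+ ```python
+ picos_criteria = {
+     "population": "patients with non-alcoholic fatty liver disease (NAFLD)",
+     "intervention": "observation or management of NAFLD",
+     "comparison": "patients without NAFLD or general population",
+     "outcome": "incidence of various types of extra-hepatic cancers, such as colorectal cancer, stomach cancer, breast cancer, etc.",
+     "study_design": "retrospective cohort studies"
+ }
+ ```
+ 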
39
+ This is a demonstration project, not a production application. If you find any bugs, please report them in the Issues.
40
+
41
+
42
+ ## File Structure
43
+ ```
44
+ review-screening-analyzer/
45
+
46
+ ├── analyzer.py
47
+ ├── deduplicator.py
48
+ ├── file_processor.py
+ ├── model_manager.py
+ ├── prompt_manager.py
+ ├── result_processor.py
49
+ ├── LICENSE
50
+ ├── README.md
51
+ ├── requirements.txt
52
+ └── app.py # Gradio Entry Point
53
+ ```
54
+
55
+ ## Usage
56
+
57
+ > [!warning]
58
+ > The following describes a temporary setup for local deployment.
59
+
60
+ Please ensure that [Python](https://www.python.org/) and [pip](https://pip.pypa.io/en/stable/) are installed on your system.
61
+
62
+ Create the environment variable file `.env` in the project root directory:
63
+ ```
64
+ # API Keys
65
+ DEEPSEEK_API_KEY=
66
+ QWEN_API_KEY=
67
+ GPTGE_API_KEY=
68
+ ```
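+ 
+ The variable names suggest the three model backends (DeepSeek, Qwen, GPTGE); the keys can also be entered or overridden later in the Model Settings tab of the web interface.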
69
+
70
+ Then run the following commands:
71
+ ```bash
73
+ git clone https://github.com/chitsanfei/review-screening-analyzer.git
74
+ cd review-screening-analyzer
75
+ pip install -r requirements.txt
76
+ python3 app.py
77
+ ```
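+ 
+ Once the app starts, open the local URL Gradio prints to the console (by default http://127.0.0.1:7860, unless configured otherwise) to reach the web interface.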
78
+
79
+ ## License
80
+
81
+ This project is licensed under the GNU Affero General Public License v3.0 (AGPL-3.0); see the [LICENSE](LICENSE) file.
82
+ You are free to use, modify, and distribute this software, provided that derivative works remain open source under the same terms.
83
+ For more details, see the full [GNU AGPL v3.0 license text](https://www.gnu.org/licenses/agpl-3.0.html).
87
+
88
+ ## Contact Information
89
+
90
+ If you have any questions or suggestions, please contact us through the following methods:
91
+
92
+ - Email: chitsanfei@emu.ac.cn
93
+ - GitHub: [chitsanfei](https://github.com/chitsanfei)
94
+
95
+ ---
96
+
97
+ Thank you for using and supporting this project! 🌟
analyzer.py ADDED
@@ -0,0 +1,511 @@
1
+ import pandas as pd
2
+ import logging
3
+ import json
4
+ from typing import Dict, List, Optional, Tuple
5
+ from model_manager import ModelManager
6
+ from prompt_manager import PromptManager
7
+ from result_processor import ResultProcessor
8
+ import re
9
+ import time
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+
12
+
13
+ class PICOSAnalyzer:
14
+ def __init__(self):
15
+ # Initialize managers for models, prompts, and result processing
16
+ self.model_manager = ModelManager()
17
+ self.prompt_manager = PromptManager()
18
+ self.result_processor = ResultProcessor()
19
+ # Example PICOS filtering criteria
20
+ self.picos_criteria = {
21
+ "population": "patients with non-alcoholic fatty liver disease (NAFLD)",
22
+ "intervention": "observation or management of NAFLD",
23
+ "comparison": "patients without NAFLD or general population",
24
+ "outcome": "incidence of various types of extra-hepatic cancers, such as colorectal cancer, stomach cancer, breast cancer, etc.",
25
+ "study_design": "retrospective cohort studies"
26
+ }
27
+
28
+ def update_picos_criteria(self, criteria: Dict[str, str]) -> None:
29
+ """Update the PICOS criteria with a given dictionary of criteria."""
30
+ self.picos_criteria.update(criteria)
31
+
32
+ def update_model_config(self, model_key: str, config: Dict) -> None:
33
+ """Update configuration settings for a specific model."""
34
+ self.model_manager.update_model_config(model_key, config)
35
+
36
+ def update_prompt(self, model_key: str, prompt: str) -> None:
37
+ """Update the prompt template for a specific model."""
38
+ self.prompt_manager.update_prompt(model_key, prompt)
39
+
40
+ def test_api_connection(self, model_key: str) -> str:
41
+ """Test the API connection for the specified model."""
42
+ return self.model_manager.test_api_connection(model_key)
43
+
44
+ def _validate_data(self, idx: str, row: pd.Series, model_key: str, previous_results: Dict) -> Tuple[bool, bool]:
45
+ """
46
+ Validate the completeness of a single data item.
47
+
48
+ Returns:
49
+ Tuple[bool, bool]: (is_valid, is_empty_abstract)
50
+ """
51
+ try:
52
+ # Check if abstract exists and is not empty
53
+ if not pd.notna(row.get("Abstract")):
54
+ logging.warning(f"Empty abstract for index {idx}")
55
+ return False, True # Second value indicates empty abstract
56
+
57
+ # For Model B and C, validate Model A results
58
+ if model_key in ["model_b", "model_c"]:
59
+ if not previous_results or "model_a" not in previous_results:
60
+ logging.warning(f"Missing Model A results for {model_key}")
61
+ return False, False
62
+ if idx not in previous_results["model_a"].index:
63
+ logging.warning(f"Index {idx} not found in Model A results")
64
+ return False, False
65
+
66
+ # For Model C, validate Model B results
67
+ if model_key == "model_c":
68
+ if "model_b" not in previous_results:
69
+ logging.warning("Missing Model B results")
70
+ return False, False
71
+ if idx not in previous_results["model_b"].index:
72
+ logging.warning(f"Index {idx} not found in Model B results")
73
+ return False, False
74
+
75
+ return True, False
76
+ except Exception as e:
77
+ logging.error(f"Validation error for index {idx}: {str(e)}")
78
+ return False, False
79
+
80
+ def _process_single_item(self, idx: str, row: pd.Series, model_key: str, previous_results: Dict) -> Optional[Dict]:
81
+ """
82
+ Process a single data item and prepare it for API call.
83
+ """
84
+ try:
85
+ # Prepare base result with abstract
86
+ result = {
87
+ "Index": idx,
88
+ "abstract": str(row["Abstract"]).strip()
89
+ }
90
+
91
+ # Add Model A results for Model B and C
92
+ if model_key in ["model_b", "model_c"]:
93
+ a_result = previous_results["model_a"].loc[idx]
94
+ result["model_a_analysis"] = {
95
+ "A_Decision": bool(a_result["A_Decision"]),
96
+ "A_Reason": str(a_result["A_Reason"]),
97
+ "A_P": str(a_result["A_P"]),
98
+ "A_I": str(a_result["A_I"]),
99
+ "A_C": str(a_result["A_C"]),
100
+ "A_O": str(a_result["A_O"]),
101
+ "A_S": str(a_result["A_S"])
102
+ }
103
+
104
+ # Add Model B results for Model C
105
+ if model_key == "model_c":
106
+ b_result = previous_results["model_b"].loc[idx]
107
+ result["model_b_analysis"] = {
108
+ "B_Decision": bool(b_result["B_Decision"]),
109
+ "B_Reason": str(b_result["B_Reason"]),
110
+ "B_P": str(b_result["B_P"]),
111
+ "B_I": str(b_result["B_I"]),
112
+ "B_C": str(b_result["B_C"]),
113
+ "B_O": str(b_result["B_O"]),
114
+ "B_S": str(b_result["B_S"])
115
+ }
116
+
117
+ return result
118
+ except Exception as e:
119
+ logging.error(f"Processing error for index {idx}: {str(e)}")
120
+ return None
121
+
122
+ def _process_api_response(self, response: Dict, model_key: str) -> List[Dict]:
123
+ """
124
+ Process API response and extract results.
125
+ """
126
+ try:
127
+ if not response or not isinstance(response, dict):
128
+ logging.error(f"Invalid response format from {model_key}")
129
+ return []
130
+
131
+ # Extract results from response
132
+ if "results" not in response:
133
+ # For inference mode, try to parse from content directly (model_c only)
134
+ if model_key == "model_c" and self.model_manager.get_config(model_key).get("is_inference"):
135
+ try:
136
+ content = response.get("choices", [{}])[0].get("message", {}).get("content", "")
137
+ json_match = re.search(r"```json\s*(\{.*?\})\s*```", content, re.DOTALL)
138
+ if json_match:
139
+ content = json_match.group(1)
140
+ parsed_response = json.loads(content)
141
+ if "results" not in parsed_response:
142
+ logging.error(f"No results found in {model_key} inference response")
143
+ return []
144
+ response = parsed_response
145
+ except Exception as e:
146
+ logging.error(f"Failed to parse inference response from {model_key}: {str(e)}")
147
+ return []
148
+ else:
149
+ logging.error(f"No results found in {model_key} response")
150
+ return []
151
+
152
+ results = response["results"]
153
+ if not isinstance(results, list):
154
+ logging.error(f"Results from {model_key} is not a list")
155
+ return []
156
+
157
+ # Validate each result
158
+ valid_results = []
159
+ for result in results:
160
+ if not isinstance(result, dict) or "Index" not in result:
161
+ logging.warning(f"Invalid result format in {model_key} response: {result}")
162
+ continue
163
+
164
+ # Ensure all required fields are present based on model type
165
+ if model_key == "model_a":
166
+ required_fields = ["A_P", "A_I", "A_C", "A_O", "A_S", "A_Decision", "A_Reason"]
167
+ elif model_key == "model_b":
168
+ required_fields = ["B_P", "B_I", "B_C", "B_O", "B_S", "B_Decision", "B_Reason"]
169
+ else: # model_c
170
+ required_fields = ["C_Decision", "C_Reason"]
171
+
172
+ missing_fields = [field for field in required_fields if field not in result]
173
+ if missing_fields:
174
+ logging.warning(f"Missing fields {missing_fields} in {model_key} result for Index {result['Index']}")
175
+ continue
176
+
177
+ # Convert decision to boolean if it's a string
178
+ if model_key == "model_c" and isinstance(result.get("C_Decision"), str):
179
+ result["C_Decision"] = result["C_Decision"].lower() == "true"
180
+
181
+ valid_results.append(result)
182
+
183
+ return valid_results
184
+
185
+ except Exception as e:
186
+ logging.error(f"Error processing {model_key} response: {str(e)}")
187
+ return []
188
+
189
+ def process_batch(self, df: pd.DataFrame, model_key: str, previous_results: Dict = None, progress_callback=None) -> pd.DataFrame:
190
+ """
191
+ Process a batch of data with improved data flow and validation.
192
+ """
193
+ # Get model configuration
194
+ config = self.model_manager.get_config(model_key)
195
+ batch_size = config["batch_size"]
196
+ threads = config["threads"]
197
+ results_dict = {} # Use dictionary to prevent duplicate indices
198
+ failed_indices = set()
199
+ total_rows = len(df)
200
+ start_time = time.time()
201
+ processed_count = 0
202
+ skipped_count = 0
203
+
204
+ # Ensure consistent index type
205
+ df.index = df.index.astype(str)
206
+ if previous_results:
207
+ for key in previous_results:
208
+ previous_results[key].index = previous_results[key].index.astype(str)
209
+
210
+ # For Model C, first identify indices where A and B disagree
211
+ if model_key == "model_c":
212
+ disagreement_indices = []
213
+ for idx in df.index:
214
+ try:
215
+ if not self._validate_previous_results(idx, model_key, previous_results):
216
+ empty_result = self._create_empty_result(idx, model_key, "Invalid or missing previous results")
217
+ results_dict[str(idx)] = empty_result
218
+ failed_indices.add(str(idx))
219
+ if progress_callback:
220
+ progress_callback(idx, True, False)
221
+ continue
222
+
223
+ if self._check_disagreement(idx, previous_results):
224
+ disagreement_indices.append(idx)
225
+ else:
226
+ # If no disagreement, use Model A's decision
227
+ no_disagreement_result = self._create_no_disagreement_result(idx, previous_results)
228
+ results_dict[str(idx)] = no_disagreement_result
229
+ skipped_count += 1
230
+ if progress_callback:
231
+ progress_callback(idx, False, False)
232
+ except Exception as e:
233
+ logging.error(f"Error checking disagreement for index {idx}: {str(e)}")
234
+ empty_result = self._create_empty_result(idx, model_key, f"Error: {str(e)}")
235
+ results_dict[str(idx)] = empty_result
236
+ failed_indices.add(str(idx))
237
+ if progress_callback:
238
+ progress_callback(idx, True, False)
239
+
240
+ # Update df to only include disagreement cases for Model C
241
+ if disagreement_indices:
242
+ df = df.loc[disagreement_indices]
243
+ else:
244
+ # If no disagreements, return results with default values
245
+ results = list(results_dict.values())
246
+ results_df = pd.DataFrame(results)
247
+ results_df.set_index("Index", inplace=True)
248
+ results_df.index = results_df.index.astype(str)
249
+ return results_df
250
+
251
+ def process_batch_data(batch_df: pd.DataFrame) -> List[Dict]:
252
+ nonlocal processed_count, skipped_count
253
+ batch_results = []
254
+ empty_results = []
255
+
256
+ # Process each item in the batch
257
+ for idx, row in batch_df.iterrows():
258
+ try:
259
+ # Skip if already processed (for Model C)
260
+ if str(idx) in results_dict:
261
+ skipped_count += 1
262
+ continue
263
+
264
+ # Validate data completeness
265
+ is_valid, is_empty = self._validate_data(idx, row, model_key, previous_results)
266
+ if not is_valid:
267
+ empty_result = self._create_empty_result(idx, model_key, "Not processed - Empty abstract" if is_empty else "Not processed - Invalid data")
268
+ empty_results.append(empty_result)
269
+ failed_indices.add(idx)
270
+ if progress_callback:
271
+ progress_callback(idx, True, is_empty)
272
+ continue
273
+
274
+ # Prepare data for API call
275
+ abstract_text = row.get("Abstract", "").strip()
276
+ if not abstract_text:
277
+ empty_result = self._create_empty_result(idx, model_key, "Not processed - Empty abstract")
278
+ empty_results.append(empty_result)
279
+ failed_indices.add(idx)
280
+ if progress_callback:
281
+ progress_callback(idx, True, True)
282
+ continue
283
+
284
+ # Add to batch for processing
285
+ batch_item = self._process_single_item(idx, row, model_key, previous_results)
286
+ if batch_item:
287
+ batch_results.append(batch_item)
288
+ else:
289
+ empty_result = self._create_empty_result(idx, model_key, "Error preparing batch data")
290
+ empty_results.append(empty_result)
291
+ failed_indices.add(idx)
292
+ if progress_callback:
293
+ progress_callback(idx, True, False)
294
+
295
+ except Exception as e:
296
+ logging.error(f"Error preparing data for index {idx}: {str(e)}")
297
+ empty_result = self._create_empty_result(idx, model_key, f"Error: {str(e)}")
298
+ empty_results.append(empty_result)
299
+ failed_indices.add(idx)
300
+ if progress_callback:
301
+ progress_callback(idx, True, False)
302
+
303
+ # Process batch with API if there are valid entries
304
+ if batch_results:
305
+ try:
306
+ # Prepare prompt with PICOS criteria and batch data
307
+ prompt = self.prompt_manager.get_prompt(model_key).format(
308
+ **{
309
+ **self.picos_criteria,
310
+ "abstracts_json": json.dumps(batch_results, ensure_ascii=False, indent=2)
311
+ }
312
+ )
313
+
314
+ # Call API and process response
315
+ response = self.model_manager.call_api(model_key, prompt)
316
+ api_results = self._process_api_response(response, model_key)
317
+
318
+ # If API call failed or returned no results, create empty results for all items
319
+ if not api_results:
320
+ for item in batch_results:
321
+ empty_result = self._create_empty_result(item["Index"], model_key, "API call failed or returned no results")
322
+ empty_results.append(empty_result)
323
+ if progress_callback:
324
+ progress_callback(item["Index"], True, False)
325
+ else:
326
+ # Update progress for successfully processed items
327
+ for result in api_results:
328
+ if progress_callback:
329
+ progress_callback(result["Index"], False, False)
330
+ # Add result to the batch results
331
+ results_dict[str(result["Index"])] = result
332
+ processed_count += 1
333
+
334
+ # Calculate time statistics
335
+ elapsed_time = time.time() - start_time
336
+ if processed_count > 0:
337
+ avg_time_per_item = elapsed_time / processed_count
338
+ remaining_items = total_rows - (processed_count + len(failed_indices) + skipped_count)
339
+ estimated_remaining_time = avg_time_per_item * remaining_items
340
+
341
+ # Log detailed progress information
342
+ logging.info(
343
+ f"{model_key.upper()} Progress: "
344
+ f"Processed: {processed_count} - "
345
+ f"Remaining: {remaining_items} - "
346
+ f"Skipped: {skipped_count} - "
347
+ f"Elapsed Time: {elapsed_time:.1f}s - "
348
+ f"Est. Remaining: {estimated_remaining_time:.1f}s"
349
+ )
350
+
351
+ return api_results + empty_results
352
+
353
+ except Exception as e:
354
+ error_msg = f"Error processing batch: {str(e)}"
355
+ logging.error(error_msg)
356
+ for item in batch_results:
357
+ empty_result = self._create_empty_result(item["Index"], model_key, error_msg)
358
+ empty_results.append(empty_result)
359
+ failed_indices.add(item["Index"])
360
+ if progress_callback:
361
+ progress_callback(item["Index"], True, False)
362
+
363
+ return empty_results
364
+
365
+ # Process batches using thread pool
366
+ with ThreadPoolExecutor(max_workers=threads) as executor:
367
+ futures = []
368
+ for i in range(0, len(df), batch_size):
369
+ batch_df = df.iloc[i:i + batch_size]
370
+ futures.append(executor.submit(process_batch_data, batch_df))
371
+
372
+ # Collect results
373
+ for future in as_completed(futures):
374
+ try:
375
+ batch_results = future.result()
376
+ # Store results in dictionary to handle potential duplicates
377
+ for result in batch_results:
378
+ idx = str(result["Index"])
379
+ results_dict[idx] = result
380
+ except Exception as e:
381
+ error_msg = f"Error collecting batch results: {str(e)}"
382
+ logging.error(error_msg)
383
+
384
+ # Convert results dictionary to DataFrame
385
+ results = list(results_dict.values())
386
+ results_df = pd.DataFrame(results)
387
+
388
+ if not results_df.empty:
389
+ # Set index properly
390
+ results_df.set_index("Index", inplace=True)
391
+ results_df.index = results_df.index.astype(str)
392
+
393
+ # Ensure all required columns exist with default values
394
+ for col in self._get_model_columns(model_key):
395
+ if col not in results_df.columns:
396
+ if col.endswith("_Decision"):
397
+ results_df[col] = False
398
+ elif col.endswith("_Reason"):
399
+ results_df[col] = "Not provided"
400
+ else:
401
+ results_df[col] = "not applicable"
402
+
403
+ # Convert boolean columns
404
+ decision_columns = [col for col in results_df.columns if col.endswith("_Decision")]
405
+ for col in decision_columns:
406
+ results_df[col] = results_df[col].astype(bool)
407
+ else:
408
+ # Create empty DataFrame with required columns
409
+ results_df = pd.DataFrame(columns=self._get_model_columns(model_key))
410
+ results_df.index.name = "Index"
411
+
412
+ # Log final statistics
413
+ total_time = time.time() - start_time
414
+ success_rate = ((total_rows - len(failed_indices)) / total_rows) * 100
415
+ logging.info(f"{model_key.upper()} completed in {total_time:.1f}s - "
416
+ f"Success rate: {success_rate:.1f}% ({total_rows - len(failed_indices)}/{total_rows})")
417
+
418
+ return results_df
419
+
420
+ def merge_results(self, df: pd.DataFrame, model_results: Dict) -> pd.DataFrame:
421
+ """Merge results from all models into a single DataFrame."""
422
+ return self.result_processor.merge_results(df, model_results)
423
+
424
+ def _create_empty_result(self, idx: str, model_key: str, reason: Optional[str] = None) -> Dict:
425
+ """
426
+ Create a default empty result entry for cases where the abstract is empty
427
+ or previous results are missing. The default reason is 'Not applicable' if not provided.
428
+ """
429
+ default_reason = reason if reason is not None else "Not applicable - Empty or invalid data"
430
+ result = {"Index": str(idx)}
431
+ if model_key == "model_a":
432
+ result.update({
433
+ "A_P": "not applicable",
434
+ "A_I": "not applicable",
435
+ "A_C": "not applicable",
436
+ "A_O": "not applicable",
437
+ "A_S": "not applicable",
438
+ "A_Decision": False,
439
+ "A_Reason": default_reason
440
+ })
441
+ elif model_key == "model_b":
442
+ result.update({
443
+ "B_P": "not applicable",
444
+ "B_I": "not applicable",
445
+ "B_C": "not applicable",
446
+ "B_O": "not applicable",
447
+ "B_S": "not applicable",
448
+ "B_Decision": False,
449
+ "B_Reason": default_reason
450
+ })
451
+ else: # For model_c
452
+ result.update({
453
+ "C_Decision": False,
454
+ "C_Reason": default_reason
455
+ })
456
+ return result
457
+
458
+ def _create_no_disagreement_result(self, idx: str, previous_results: Dict) -> Dict:
459
+ """
460
+ When Model A and Model B agree on the decision,
461
+ directly return Model A's result with a note indicating no disagreement.
462
+ """
463
+ str_idx = str(idx)
464
+ a_result = previous_results["model_a"].loc[str_idx]
465
+ return {
466
+ "Index": str_idx,
467
+ "C_Decision": a_result["A_Decision"],
468
+ "C_Reason": "No disagreement between Model A and B"
469
+ }
470
+
471
+ def _validate_previous_results(self, idx: str, model_key: str, previous_results: Dict) -> bool:
472
+ """
473
+ Validate if previous model results exist for a given index.
474
+ Returns False if any required result is missing.
475
+ """
476
+ str_idx = str(idx)
477
+ if "model_a" not in previous_results:
478
+ raise Exception("Model A results required")
479
+ model_a_data = previous_results["model_a"]
480
+ if str_idx not in model_a_data.index.astype(str).values:
481
+ logging.warning(f"Missing Model A result for index {idx}")
482
+ return False
483
+
484
+ if model_key == "model_c":
485
+ if "model_b" not in previous_results:
486
+ raise Exception("Model B results required")
487
+ model_b_data = previous_results["model_b"]
488
+ if str_idx not in model_b_data.index.astype(str).values:
489
+ logging.warning(f"Missing Model B result for index {idx}")
490
+ return False
491
+
492
+ return True
493
+
494
+ def _check_disagreement(self, idx: str, previous_results: Dict) -> bool:
495
+ """
496
+ Check whether there is a disagreement between Model A and Model B for a given index.
497
+ Returns True if the decisions differ, otherwise False.
498
+ """
499
+ str_idx = str(idx)
500
+ a_result = previous_results["model_a"].loc[str_idx]
501
+ b_result = previous_results["model_b"].loc[str_idx]
502
+ return a_result["A_Decision"] != b_result["B_Decision"]
503
+
504
+ def _get_model_columns(self, model_key: str) -> List[str]:
505
+ """Get the expected columns for a specific model's output."""
506
+ if model_key == "model_a":
507
+ return ["A_Decision", "A_Reason", "A_P", "A_I", "A_C", "A_O", "A_S"]
508
+ elif model_key == "model_b":
509
+ return ["B_Decision", "B_Reason", "B_P", "B_I", "B_C", "B_O", "B_S"]
510
+ else: # model_c
511
+ return ["C_Decision", "C_Reason"]
app.py ADDED
@@ -0,0 +1,724 @@
1
+ import os
2
+ from dotenv import load_dotenv
3
+ import time
4
+ import logging
5
+ from datetime import datetime
6
+ import gradio as gr
7
+ from file_processor import FileProcessor
8
+ from analyzer import PICOSAnalyzer
9
+ from deduplicator import Deduplicator
10
+ from result_processor import ResultProcessor
11
+
12
+ # Configuration of directories
13
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
14
+ DATA_DIR = os.path.join(BASE_DIR, "data")
15
+ LOG_DIR = os.path.join(BASE_DIR, "logs")
16
+
17
+ # Load .env file if it exists
18
+ dotenv_path = os.path.join(os.path.dirname(__file__), '.env')
19
+ if os.path.exists(dotenv_path):
20
+ load_dotenv(dotenv_path)
21
+ else:
22
+ print("Warning: .env file not found.")
23
+
24
+ # Initialize components for analysis, file processing, deduplication, and result processing
25
+ analyzer = PICOSAnalyzer()
26
+ file_processor = FileProcessor(DATA_DIR)
27
+ model_results = {}
28
+ deduplicator = Deduplicator()
29
+ result_processor = ResultProcessor()
30
+
31
+ # Ensure required directories exist
32
+ for directory in [DATA_DIR, LOG_DIR]:
33
+ try:
34
+ os.makedirs(directory, exist_ok=True)
35
+ except Exception as e:
36
+ raise RuntimeError(f"Failed to create directory {directory}: {str(e)}")
37
+
38
+ # Configure logging: log to both a file and the console
39
+ try:
40
+ log_file = os.path.join(LOG_DIR, f"picos_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
41
+
42
+ # File handler for logging to a file
43
+ file_handler = logging.FileHandler(log_file, encoding='utf-8')
44
+ file_handler.setLevel(logging.INFO)
45
+
46
+ # Console handler for logging to the terminal
47
+ console_handler = logging.StreamHandler()
48
+ console_handler.setLevel(logging.INFO)
49
+
50
+ # Formatter for log messages
51
+ formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
52
+ file_handler.setFormatter(formatter)
53
+ console_handler.setFormatter(formatter)
54
+
55
+ # Configure the root logger
56
+ root_logger = logging.getLogger()
57
+
58
+ root_logger.setLevel(logging.INFO)
59
+ root_logger.addHandler(file_handler)
60
+ root_logger.addHandler(console_handler)
61
+ except Exception as e:
62
+ print(f"Failed to initialize logging: {str(e)}")
63
+ raise
64
+
65
+ def create_gradio_interface():
66
+ """Create and return the Gradio interface for the PICOS Analysis System."""
67
+
68
+ def parse_nbib(file) -> tuple:
69
+ """
70
+ Parse a citation file in NBIB format.
71
+ Returns a tuple containing the Excel output path and a preview text.
72
+ """
73
+ try:
74
+ if not file:
75
+ return None, "No file uploaded"
76
+
77
+ # Determine file type based on extension
78
+ file_extension = os.path.splitext(file.name)[1].lower()
79
+
80
+ if file_extension == '.nbib':
81
+ output_path, preview = file_processor.parse_nbib(file.name)
82
+ elif file_extension == '.ris':
83
+ # Read file content to determine RIS format (Embase or Web of Science)
84
+ with open(file.name, 'r', encoding='utf-8') as f:
85
+ content = f.read()
86
+ if 'T1 - ' in content: # Embase RIS format
87
+ output_path, preview = file_processor.parse_embase_ris(file.name)
88
+ else: # Assume Web of Science RIS format
89
+ output_path, preview = file_processor.parse_wos_ris(file.name)
90
+ else:
91
+ return None, "Unsupported file format. Please upload a .nbib or .ris file"
92
+
93
+ if not output_path:
94
+ return None, "Failed to parse file"
95
+
96
+ return output_path, preview
97
+
98
+ except Exception as e:
99
+ error_msg = f"Error parsing file: {str(e)}"
100
+ logging.error(error_msg)
101
+ return None, error_msg
102
+
103
+ def parse_scopus(file) -> tuple:
104
+ """
105
+ Parse a Scopus RIS file.
106
+ Returns a tuple containing the Excel output path and a preview text.
107
+ """
108
+ try:
109
+ if not file:
110
+ return None, "No file uploaded"
111
+ output_path, preview = file_processor.parse_scopus_ris(file.name)
112
+ if not output_path:
113
+ return None, "Failed to parse file"
114
+ return output_path, preview
115
+ except Exception as e:
116
+ error_msg = f"Error parsing Scopus file: {str(e)}"
117
+ logging.error(error_msg)
118
+ return None, error_msg
119
+
120
+ def update_picos_criteria(p, i, c, o, s):
121
+ """Update the PICOS criteria used for analysis."""
122
+ try:
123
+ analyzer.update_picos_criteria({
124
+ "population": p.strip(),
125
+ "intervention": i.strip(),
126
+ "comparison": c.strip(),
127
+ "outcome": o.strip(),
128
+ "study_design": s.strip()
129
+ })
130
+ return "✓ PICOS criteria updated successfully"
131
+ except Exception as e:
132
+ return f"❌ Error updating PICOS criteria: {str(e)}"
133
+
134
+ def update_model_settings(model_key, api_url, api_key, model_name, temperature, max_tokens, batch_size, threads, prompt, is_inference, timeout):
135
+ """Update the settings for a specified model."""
136
+ try:
137
+ analyzer.update_model_config(model_key, {
138
+ "api_url": api_url.strip(),
139
+ "api_key": api_key.strip(),
140
+ "model": model_name.strip(),
141
+ "temperature": float(temperature),
142
+ "max_tokens": int(max_tokens),
143
+ "batch_size": int(batch_size),
144
+ "threads": int(threads),
145
+ "is_inference": bool(is_inference),
146
+ "timeout": float(timeout),
147
+ "updated": True # mark as manually updated
148
+ })
149
+ analyzer.update_prompt(model_key, prompt.strip())
150
+ return "✓ Settings updated successfully"
151
+ except Exception as e:
152
+ return f"❌ Error updating settings: {str(e)}"
153
+
154
+ def test_connection(model_key):
155
+ """Test the API connection for a specified model."""
156
+ try:
157
+ result = analyzer.test_api_connection(model_key)
158
+ return result
159
+ except Exception as e:
160
+ return f"❌ Error testing connection: {str(e)}"
161
+
162
+ def process_model(input_file, model_key, model_a_input=None, model_b_input=None):
163
+ """
164
+ Process analysis for a single model and return the results.
165
+ For Model B and C, the required previous results files must be provided.
166
+ """
167
+ try:
168
+ logging.info(f"Loading input file for {model_key.upper()}...")
169
+ df = file_processor.load_excel(input_file.name)
170
+ if df is None:
171
+ return None, "Failed to load Excel file"
172
+
173
+ # For Model B, require Model A results; for Model C, require both Model A and B results
174
+ if model_key == "model_b":
175
+ if model_a_input is None or not os.path.exists(model_a_input.name):
176
+ return None, "Model A results file required for MODEL_B"
177
+ model_results["model_a"] = file_processor.load_excel(model_a_input.name)
178
+ elif model_key == "model_c":
179
+ logging.info("Loading Model A and B results for Model C analysis...")
180
+ if model_a_input is None or not os.path.exists(model_a_input.name) or \
181
+ model_b_input is None or not os.path.exists(model_b_input.name):
182
+ return None, "Both Model A and B results files required for MODEL_C"
183
+ model_results["model_a"] = file_processor.load_excel(model_a_input.name)
184
+ model_results["model_b"] = file_processor.load_excel(model_b_input.name)
185
+
186
+ # Process the model
187
+ logging.info(f"Starting {model_key.upper()} analysis...")
188
+ total_rows = len(df)
189
+ processed_rows = 0
190
+ errors = 0
191
+ empty_abstracts = 0
192
+ start_time = time.time()
193
+
194
+ def progress_callback(row_index, error=False, is_empty=False):
195
+ nonlocal processed_rows, errors, empty_abstracts
196
+ # Increase the count only when the actual processing is complete
197
+ if not error:
198
+ processed_rows += 1
199
+ elif is_empty:
200
+ empty_abstracts += 1
201
+ else:
202
+ errors += 1
203
+
204
+ # Calculate progress and time estimates
205
+ elapsed_time = time.time() - start_time
206
+ progress = processed_rows / total_rows
207
+ if progress > 0:
208
+ # Use moving averages to smooth time estimates
209
+ avg_time_per_item = elapsed_time / (processed_rows + errors + empty_abstracts)
210
+ remaining_items = total_rows - (processed_rows + errors + empty_abstracts)
211
+ remaining_time = avg_time_per_item * remaining_items
212
+
213
+ # Use the batch size of the model to control the log output frequency
214
+ batch_size = analyzer.model_manager.get_config(model_key)["batch_size"]
215
+ if (processed_rows + errors + empty_abstracts) % batch_size == 0:
216
+ logging.info(f"{model_key.upper()} Progress: {processed_rows + errors + empty_abstracts}/{total_rows} rows "
217
+ f"({(processed_rows + errors + empty_abstracts) / total_rows:.1%}) - "
218
+ f"Processed: {processed_rows}, Errors: {errors}, Empty: {empty_abstracts} - "
219
+ f"Elapsed: {elapsed_time:.1f}s, Remaining: {remaining_time:.1f}s")
220
+
221
+ results_df = analyzer.process_batch(df, model_key, model_results, progress_callback)
222
+
223
+ if results_df is None:
224
+ return None, f"{model_key.upper()} failed to process results"
225
+
226
+ # Save results immediately with fixed path in DATA_DIR
227
+ output_file = os.path.join(DATA_DIR, f"{model_key}_results.xlsx")
228
+ if model_key == "model_c":
229
+ # For Model C, merge all results before saving
230
+ merged_df = analyzer.merge_results(df, {
231
+ "model_a": model_results["model_a"],
232
+ "model_b": model_results["model_b"],
233
+ "model_c": results_df
234
+ })
235
+ if not file_processor.save_excel(merged_df, output_file):
236
+ return None, f"Failed to save {model_key.upper()} results"
237
+ else:
238
+ # For Model A and B, save individual results
239
+ if not file_processor.save_excel(results_df, output_file):
240
+ return None, f"Failed to save {model_key.upper()} results"
241
+
242
+ total_time = time.time() - start_time
243
+ completion_msg = (f"{model_key.upper()} analysis completed in {total_time:.1f}s - "
244
+ f"Processed {processed_rows} rows with {errors} errors")
245
+ logging.info(completion_msg)
246
+
247
+ # Return the full path to the saved file with gr.update
248
+ if os.path.exists(output_file):
249
+ return gr.update(value=output_file), completion_msg
250
+ else:
251
+ return None, f"Failed to verify {model_key.upper()} results file"
252
+
253
+ except Exception as e:
254
+ error_msg = f"Error in {model_key.upper()} analysis: {str(e)}"
255
+ logging.error(error_msg)
256
+ return None, error_msg
257
+
258
+ def merge_results_with_files(input_file, model_a_file, model_b_file, model_c_file):
259
+ """
260
+ Merge all model results from the provided files and export the merged results as an Excel file.
261
+ """
262
+ if not all([input_file, model_a_file, model_b_file]):
263
+ return None, "Original file, Model A and B results are required"
264
+
265
+ try:
266
+ df = file_processor.load_excel(input_file.name)
267
+ model_a_results = file_processor.load_excel(model_a_file.name)
268
+ model_b_results = file_processor.load_excel(model_b_file.name)
269
+ model_c_results = file_processor.load_excel(model_c_file.name) if model_c_file else None
270
+
271
+ if any(result is None for result in [df, model_a_results, model_b_results]):
272
+ return None, "Failed to load one or more required files"
273
+
274
+ model_results["model_a"] = model_a_results
275
+ model_results["model_b"] = model_b_results
276
+ if model_c_results is not None:
277
+ model_results["model_c"] = model_c_results
278
+
279
+ merged_df = analyzer.merge_results(df, model_results)
280
+
281
+ final_filename = os.path.join(DATA_DIR, "final_results.xlsx")
282
+ result_processor.export_to_excel(merged_df, final_filename)
283
+
284
+ return final_filename, "Results merged successfully"
285
+ except Exception as e:
286
+ return None, f"Error merging results: {str(e)}"
287
+
288
+ def run_all_models(input_file):
289
+ """Run analysis pipeline for all models with streaming updates"""
290
+ try:
291
+ # Read Excel file using file processor
292
+ df = file_processor.load_excel(input_file.name)
293
+ if df is None:
294
+ yield [None, None, None, None, "Failed to load input file"]
295
+ return
296
+
297
+ # --- Process Model A ---
298
+ logging.info("Starting Model A analysis...")
299
+ model_a_results = analyzer.process_batch(df, "model_a")
300
+ if model_a_results is None:
301
+ yield [None, None, None, None, "Model A failed to process results"]
302
+ return
303
+
304
+ # Save Model A results with fixed path
305
+ model_a_path = os.path.join(DATA_DIR, "model_a_results.xlsx")
306
+ if not file_processor.save_excel(model_a_results, model_a_path):
307
+ yield [None, None, None, None, "Failed to save Model A results"]
308
+ return
309
+ model_results["model_a"] = model_a_results
310
+ status_msg = "Model A completed successfully"
311
+ # Yield update: Model A result available
312
+ yield [gr.update(value=model_a_path), None, None, None, status_msg]
313
+
314
+ # --- Process Model B ---
315
+ logging.info("Starting Model B analysis...")
316
+ model_b_results = analyzer.process_batch(df, "model_b", {"model_a": model_a_results})
317
+ if model_b_results is None:
318
+ yield [gr.update(value=model_a_path), None, None, None, "Model B failed to process results"]
319
+ return
320
+
321
+ # Save Model B results with fixed path
322
+ model_b_path = os.path.join(DATA_DIR, "model_b_results.xlsx")
323
+ if not file_processor.save_excel(model_b_results, model_b_path):
324
+ yield [gr.update(value=model_a_path), None, None, None, "Failed to save Model B results"]
325
+ return
326
+ model_results["model_b"] = model_b_results
327
+ status_msg = "Model B completed successfully"
328
+ # Yield update: Both Model A and B results available
329
+ yield [gr.update(value=model_a_path), gr.update(value=model_b_path), None, None, status_msg]
330
+
331
+ # --- Process Model C ---
332
+ logging.info("Starting Model C analysis...")
333
+ model_c_results = analyzer.process_batch(df, "model_c", {
334
+ "model_a": model_a_results,
335
+ "model_b": model_b_results
336
+ })
337
+
338
+ model_c_path = None
339
+ if model_c_results is not None:
340
+ # Save Model C results with fixed path
341
+ model_c_path = os.path.join(DATA_DIR, "model_c_results.xlsx")
342
+ if not file_processor.save_excel(model_c_results, model_c_path):
343
+ yield [gr.update(value=model_a_path), gr.update(value=model_b_path), None, None, "Failed to save Model C results"]
344
+ return
345
+ model_results["model_c"] = model_c_results
346
+ status_msg = "Model C completed successfully"
347
+ # Yield update: Model A, B and C results available
348
+ yield [gr.update(value=model_a_path), gr.update(value=model_b_path), gr.update(value=model_c_path), None, status_msg]
349
+
350
+ # Merge results
351
+ logging.info("Merging results...")
352
+ merged_df = analyzer.merge_results(df, model_results)
353
+
354
+ # Save final results with fixed path
355
+ final_path = os.path.join(DATA_DIR, "final_results.xlsx")
356
+ if not file_processor.save_excel(merged_df, final_path):
357
+ yield [gr.update(value=model_a_path), gr.update(value=model_b_path), gr.update(value=model_c_path), None, "Failed to save final results"]
358
+ return
359
+
360
+ completion_msg = "All models completed successfully"
361
+ # Yield final update with all results available
362
+ yield [gr.update(value=model_a_path), gr.update(value=model_b_path), gr.update(value=model_c_path), gr.update(value=final_path), completion_msg]
363
+
364
+ except Exception as e:
365
+ error_msg = f"Error in pipeline: {str(e)}"
366
+ logging.error(error_msg)
367
+ yield [None, None, None, None, error_msg]
368
+
369
+ def process_deduplication(files, threshold):
370
+ """
371
+ Process deduplication for multiple Excel files.
372
+ The function identifies duplicate entries based on a similarity threshold.
373
+ """
374
+ try:
375
+ if not files:
376
+ return None, None, "No files uploaded"
377
+
378
+ dataframes = []
379
+ for file in files:
380
+ if not file:
381
+ continue
382
+ df = file_processor.load_excel(file.name)
383
+ if df is None:
384
+ return None, None, f"Failed to load file: {file.name}"
385
+ dataframes.append(df)
386
+
387
+ if not dataframes:
388
+ return None, None, "No valid files to process"
389
+
390
+ unique_df, clusters_df = deduplicator.process_dataframes(dataframes, threshold)
391
+
392
+ unique_path = file_processor.save_excel(unique_df, "deduplicated_data.xlsx")
393
+ clusters_path = file_processor.save_excel(clusters_df, "duplicate_clusters.xlsx")
394
+
395
+ if not unique_path or not clusters_path:
396
+ return None, None, "Failed to save results"
397
+
398
+ status_msg = f"Deduplication completed successfully:\n"
399
+ status_msg += f"Original entries: {sum(len(df) for df in dataframes)}\n"
400
+ status_msg += f"Unique entries: {len(unique_df)}\n"
401
+ status_msg += f"Duplicate clusters: {len(clusters_df['Cluster_ID'].unique()) if len(clusters_df) > 0 else 0}"
402
+
403
+ return unique_path, clusters_path, status_msg
404
+
405
+ except Exception as e:
406
+ error_msg = f"Error in deduplication: {str(e)}"
407
+ logging.error(error_msg)
408
+ return None, None, error_msg
409
+
410
+ # Build the Gradio interface
411
+ interface = gr.Blocks(title="PICOS Analysis System")
412
+
413
+ with interface:
414
+ gr.Markdown("""
415
+ <div style="text-align: center;">
416
+ <h1>PICOS Literature Analysis System</h1>
417
+ <p>This system uses a multi-model approach to analyze medical literature abstracts.</p>
418
+ </div>
419
+ """)
420
+
421
+ with gr.Tab("Instructions"):
422
+ gr.Markdown("""
423
+ ## System Overview
424
+ This system helps researchers analyze medical literature by providing tools for citation management,
425
+ deduplication, and automated PICOS analysis using multiple language models.
426
+
427
+ ## Workflow Steps
428
+ **Citation Processing** -> **Deduplication** (Optional) -> **PICOS Analysis Setup** -> **Analysis Execution**
429
+
430
+ ## File Format Requirements
431
+ ### Input Files
432
+ - **Pubmed**: NBIB format (.nbib)
433
+ - **Embase**: RIS format (.ris)
434
+ - **Web of Science**: RIS format (.ris)
435
+ - **Scopus**: RIS format (.ris)
436
+
437
+ ### Processed Format
438
+ The system will generate standardized Excel files (XLSX format) with these columns:
439
+ - **Index**: Unique identifier for each abstract
440
+ - **Title**: Article title
441
+ - **Authors**: Author list (semicolon-separated)
442
+ - **Abstract**: Full abstract text
443
+ - **DOI**: Digital Object Identifier (when available)
444
+
445
+ ### Analysis Results
446
+ Each model will generate an Excel file containing:
447
+ - All original citation data
448
+ - PICOS analysis results
449
+ - Inclusion/exclusion decisions
450
+ - Reasoning for decisions
451
+ """)
452
+
453
+ with gr.Tab("Citation File Processing"):
454
+ with gr.Tab("Pubmed"):
455
+ gr.Markdown("""
456
+ ## Pubmed NBIB Processing
457
+ Upload a .nbib file from Pubmed to extract and convert it to Excel format. The extracted data will include:
458
+ - DOI
459
+ - Title
460
+ - Authors
461
+ - Abstract
462
+ """)
463
+
464
+ with gr.Row():
465
+ nbib_file = gr.File(label="Upload NBIB File", file_types=[".nbib"])
466
+ process_nbib_btn = gr.Button("Process NBIB File")
467
+
468
+ with gr.Row():
469
+ nbib_preview = gr.Textbox(label="Preview", lines=20)
470
+ nbib_output = gr.File(label="Download Excel")
471
+
472
+ process_nbib_btn.click(
473
+ parse_nbib,
474
+ inputs=[nbib_file],
475
+ outputs=[nbib_output, nbib_preview]
476
+ )
477
+
478
+ with gr.Tab("Embase"):
479
+ gr.Markdown("""
480
+ ## Embase RIS Processing
481
+ Upload a .ris file from Embase to extract and convert it to Excel format. The extracted data will include:
482
+ - DOI
483
+ - Title
484
+ - Authors
485
+ - Abstract
486
+ """)
487
+
488
+ with gr.Row():
489
+ embase_file = gr.File(label="Upload Embase RIS File", file_types=[".ris"])
490
+ process_embase_btn = gr.Button("Process Embase RIS File")
491
+
492
+ with gr.Row():
493
+ embase_preview = gr.Textbox(label="Preview", lines=20)
494
+ embase_output = gr.File(label="Download Excel")
495
+
496
+ process_embase_btn.click(
497
+ parse_nbib,
498
+ inputs=[embase_file],
499
+ outputs=[embase_output, embase_preview]
500
+ )
501
+
502
+ with gr.Tab("Web of Science"):
503
+ gr.Markdown("""
504
+ ## Web of Science RIS Processing
505
+ Upload a .ris file from Web of Science to extract and convert it to Excel format. The extracted data will include:
506
+ - DOI
507
+ - Title
508
+ - Authors
509
+ - Abstract
510
+ """)
511
+
512
+ with gr.Row():
513
+ wos_file = gr.File(label="Upload WOS RIS File", file_types=[".ris"])
514
+ process_wos_btn = gr.Button("Process WOS RIS File")
515
+
516
+ with gr.Row():
517
+ wos_preview = gr.Textbox(label="Preview", lines=20)
518
+ wos_output = gr.File(label="Download Excel")
519
+
520
+ process_wos_btn.click(
521
+ lambda file: parse_nbib(file) if file else (None, "No file uploaded"),
522
+ inputs=[wos_file],
523
+ outputs=[wos_output, wos_preview]
524
+ )
525
+
526
+ with gr.Tab("Scopus"):
527
+ gr.Markdown("""
528
+ ## Scopus RIS Processing
529
+ Upload a .ris file from Scopus to extract and convert it to Excel format. The extracted data will include:
530
+ - DOI
531
+ - Title
532
+ - Authors
533
+ - Abstract
534
+ """)
535
+
536
+ with gr.Row():
537
+ scopus_file = gr.File(label="Upload Scopus RIS File", file_types=[".ris"])
538
+ process_scopus_btn = gr.Button("Process Scopus RIS File")
539
+
540
+ with gr.Row():
541
+ scopus_preview = gr.Textbox(label="Preview", lines=20)
542
+ scopus_output = gr.File(label="Download Excel")
543
+
544
+ process_scopus_btn.click(
545
+ parse_scopus,
546
+ inputs=[scopus_file],
547
+ outputs=[scopus_output, scopus_preview]
548
+ )
549
+
550
+ with gr.Tab("Deduplication"):
551
+ gr.Markdown("""
552
+ ## Citation Deduplication
553
+ Upload multiple Excel files to remove duplicate entries across different citation sources.
554
+ The system will identify similar entries based on title and author information.
555
+
556
+ ### Features:
557
+ - Support for multiple Excel files
558
+ - Adjustable similarity threshold
559
+ - Detailed duplicate clusters report
560
+ - Standardized output format
561
+ """)
562
+
563
+ with gr.Row():
564
+ input_files = gr.File(
565
+ label="Upload Excel Files",
566
+ file_types=[".xlsx", ".xls"],
567
+ file_count="multiple"
568
+ )
569
+ threshold = gr.Slider(
570
+ label="Similarity Threshold",
571
+ minimum=0.1,
572
+ maximum=1.0,
573
+ value=0.8,
574
+ step=0.05,
575
+ info="Higher values mean stricter matching (0.8 recommended)"
576
+ )
577
+
578
+ with gr.Row():
579
+ process_btn = gr.Button("Process Deduplication")
580
+
581
+ with gr.Row():
582
+ status = gr.Textbox(label="Status", lines=5)
583
+
584
+ with gr.Row():
585
+ unique_output = gr.File(label="Download Deduplicated Data")
586
+ clusters_output = gr.File(label="Download Duplicate Clusters")
587
+
588
+ process_btn.click(
589
+ process_deduplication,
590
+ inputs=[input_files, threshold],
591
+ outputs=[unique_output, clusters_output, status]
592
+ )
593
+
594
+ with gr.Tab("LLM Analysis"):
595
+ with gr.Tab("PICOS Criteria"):
596
+ gr.Markdown("""
597
+ ## PICOS Criteria Settings
598
+ Define the standard PICOS criteria shared by all three models.
+ Each article will be evaluated against these criteria.
600
+ """)
601
+
602
+ with gr.Group("Standard PICOS Criteria"):
603
+ population = gr.Textbox(label="Population", value=analyzer.picos_criteria["population"],
604
+ placeholder="e.g., patients with hepatocellular carcinoma")
605
+ intervention = gr.Textbox(label="Intervention", value=analyzer.picos_criteria["intervention"],
606
+ placeholder="e.g., immunotherapy or targeted therapy")
607
+ comparison = gr.Textbox(label="Comparison", value=analyzer.picos_criteria["comparison"],
608
+ placeholder="e.g., standard therapy or placebo")
609
+ outcome = gr.Textbox(label="Outcome", value=analyzer.picos_criteria["outcome"],
610
+ placeholder="e.g., survival or response rate")
611
+ study_design = gr.Textbox(label="Study Design", value=analyzer.picos_criteria["study_design"],
612
+ placeholder="e.g., randomized controlled trial")
613
+
614
+ update_picos_btn = gr.Button("Update PICOS Criteria")
615
+ picos_status = gr.Textbox(label="Status")
616
+
617
+ update_picos_btn.click(
618
+ update_picos_criteria,
619
+ inputs=[population, intervention, comparison, outcome, study_design],
620
+ outputs=picos_status
621
+ )
622
+
623
+ with gr.Tab("Model Settings"):
624
+ for model_key in ["model_a", "model_b", "model_c"]:
625
+ with gr.Group(f"{model_key.upper()} Settings"):
626
+ config = analyzer.model_manager.get_config(model_key)
627
+ api_url = gr.Textbox(label="API URL", value=config["api_url"])
628
+ api_key = gr.Textbox(label="API Key", value=config["api_key"])
629
+ model_name = gr.Textbox(label="Model", value=config["model"])
630
+ is_inference = gr.Checkbox(
631
+ label="Inference Model",
632
+ value=config.get("is_inference", False),
633
+ info="Enable inference compatibility mode for models that return reasoning process"
634
+ )
635
+ temperature = gr.Slider(label="Temperature", minimum=0, maximum=10, value=config["temperature"])
636
+ max_tokens = gr.Number(label="Max Tokens", value=config["max_tokens"])
637
+ batch_size = gr.Number(label="Batch Size", value=config["batch_size"])
638
+ threads = gr.Slider(label="Threads", minimum=1, maximum=32, step=1, value=config["threads"])
639
+ timeout = gr.Number(label="Timeout (seconds)", value=config.get("timeout", 180))
640
+ prompt = gr.Textbox(label="Prompt Template", value=analyzer.prompt_manager.get_prompt(model_key), lines=10)
641
+
642
+ update_btn = gr.Button(f"Update {model_key.upper().replace('_', ' ')} Settings")
643
+ test_btn = gr.Button(f"Test {model_key.upper().replace('_', ' ')} Connection")
644
+ status = gr.Textbox(label="Status", lines=10)
645
+
646
+ update_btn.click(
647
+ update_model_settings,
648
+ inputs=[gr.Textbox(value=model_key, visible=False),
649
+ api_url,
650
+ api_key,
651
+ model_name,
652
+ temperature,
653
+ max_tokens,
654
+ batch_size,
655
+ threads,
656
+ prompt,
657
+ is_inference,
658
+ timeout],
659
+ outputs=status
660
+ )
661
+ test_btn.click(
662
+ test_connection,
663
+ inputs=[gr.Textbox(value=model_key, visible=False)],
664
+ outputs=status
665
+ )
666
+
667
+ with gr.Tab("Analysis"):
668
+ with gr.Row():
669
+ input_file = gr.File(label="Original Excel File")
670
+ model_a_input = gr.File(label="Model A Results")
671
+ model_b_input = gr.File(label="Model B Results")
672
+ model_c_input = gr.File(label="Model C Results")
673
+
674
+ with gr.Row():
675
+ model_a_btn = gr.Button("Run Model A")
676
+ model_b_btn = gr.Button("Run Model B")
677
+ model_c_btn = gr.Button("Run Model C")
678
+ merge_btn = gr.Button("Merge Results")
679
+ # Run All triggers the full pipeline; intermediate updates are streamed back as each model finishes
680
+ run_all_btn = gr.Button("Run All", variant="primary")
681
+
682
+ status = gr.Textbox(label="Status")
683
+
684
+ with gr.Row():
685
+ model_a_output = gr.File(label="Model A Results", interactive=True)
686
+ model_b_output = gr.File(label="Model B Results", interactive=True)
687
+ model_c_output = gr.File(label="Model C Results", interactive=True)
688
+ final_output = gr.File(label="Final Results", interactive=True)
689
+
690
+ # Individual model runs
691
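+ # Each later model also receives the earlier models' result files: Model B reviews Model A's output, and Model C arbitrates using both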
+ model_a_btn.click(
692
+ lambda x: process_model(x, "model_a"),
693
+ inputs=[input_file],
694
+ outputs=[model_a_output, status]
695
+ )
696
+ model_b_btn.click(
697
+ lambda x, y: process_model(x, "model_b", y),
698
+ inputs=[input_file, model_a_input],
699
+ outputs=[model_b_output, status]
700
+ )
701
+ model_c_btn.click(
702
+ lambda x, y, z: process_model(x, "model_c", y, z),
703
+ inputs=[input_file, model_a_input, model_b_input],
704
+ outputs=[model_c_output, status]
705
+ )
706
+ merge_btn.click(
707
+ merge_results_with_files,
708
+ inputs=[input_file, model_a_input, model_b_input, model_c_input],
709
+ outputs=[final_output, status]
710
+ )
711
+ run_all_btn.click(
712
+ fn=run_all_models,
713
+ inputs=[input_file],
714
+ outputs=[model_a_output, model_b_output, model_c_output, final_output, status]
715
+ )
716
+
717
+ return interface
718
+
719
+ if __name__ == "__main__":
720
+ interface = create_gradio_interface()
721
+ if interface:
722
+ interface.launch(server_name="0.0.0.0", server_port=7860, pwa=True)
723
+ else:
724
+ print("Error: Failed to create Gradio interface")
deduplicator.py ADDED
@@ -0,0 +1,183 @@
1
+ import pandas as pd
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ import logging
5
+
6
+ class Deduplicator:
7
+ def __init__(self):
8
+ """Initialize Deduplicator with required columns for processing"""
9
+ self.required_columns = ['Title', 'Authors', 'Abstract', 'DOI']
10
+
11
+ def validate_dataframe(self, df):
12
+ """
13
+ Validate if dataframe has required columns
14
+
15
+ Args:
16
+ df: DataFrame to validate
17
+
18
+ Returns:
19
+ bool: True if validation passes
20
+
21
+ Raises:
22
+ ValueError: If required columns are missing
23
+ """
24
+ missing_cols = [col for col in self.required_columns if col not in df.columns]
25
+ if missing_cols:
26
+ raise ValueError(f"Missing required columns: {', '.join(missing_cols)}")
27
+ return True
28
+
29
+ def process_dataframes(self, dataframes, threshold=0.8):
30
+ """
31
+ Process multiple dataframes and remove duplicates
32
+
33
+ Args:
34
+ dataframes: List of DataFrames to process
35
+ threshold: Similarity threshold for duplicate detection (default: 0.8)
36
+
37
+ Returns:
38
+ tuple: (unique_df, clusters_df) where:
39
+ - unique_df: DataFrame containing unique entries
40
+ - clusters_df: DataFrame containing duplicate clusters
41
+
42
+ Raises:
43
+ Exception: If deduplication process fails
44
+ """
45
+ try:
46
+ # Validate and combine dataframes
47
+ for df in dataframes:
48
+ self.validate_dataframe(df)
49
+
50
+ combined_df = pd.concat(dataframes, ignore_index=True)
51
+
52
+ # Create Title_Author column for similarity comparison
53
+ combined_df['Title_Author'] = combined_df['Title'].fillna('') + ' ' + combined_df['Authors'].fillna('')
54
+
55
+ # Find duplicate clusters
56
+ clusters_df, unique_df = self.find_duplicate_clusters(combined_df, threshold)
57
+
58
+ # Ensure output format consistency
59
+ unique_df = self.standardize_output(unique_df)
60
+ clusters_df = self.standardize_clusters(clusters_df)
61
+
62
+ return unique_df, clusters_df
63
+
64
+ except Exception as e:
65
+ logging.error(f"Error in deduplication process: {str(e)}")
66
+ raise
67
+
68
+ def find_duplicate_clusters(self, df, threshold):
69
+ """
70
+ Find duplicate clusters using TF-IDF and cosine similarity
71
+
72
+ Args:
73
+ df: DataFrame to process
74
+ threshold: Similarity threshold for duplicate detection
75
+
76
+ Returns:
77
+ tuple: (clusters_df, unique_df) where:
78
+ - clusters_df: DataFrame containing duplicate clusters
79
+ - unique_df: DataFrame containing unique entries
80
+ """
81
+ # Create TF-IDF vectors for similarity comparison
82
+ vectorizer = TfidfVectorizer().fit_transform(df['Title_Author'])
83
+ cosine_sim = cosine_similarity(vectorizer)
84
+
85
+ n = cosine_sim.shape[0]
86
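+ # Union-find (disjoint set): parent[i] points toward the representative of row i's duplicate cluster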
+ parent = list(range(n))
87
+
88
+ def find(x):
89
+ """Find the root of a cluster using path compression"""
90
+ if parent[x] != x:
91
+ parent[x] = find(parent[x])
92
+ return parent[x]
93
+
94
+ def union(x, y):
95
+ """Union two clusters by rank"""
96
+ rootX = find(x)
97
+ rootY = find(y)
98
+ if rootX != rootY:
99
+ parent[rootY] = rootX
100
+
101
+ # Build clusters using union-find
102
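+ # Pairwise comparison is O(n^2); any pair whose similarity exceeds the threshold is merged into the same cluster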
+ for i in range(n):
103
+ for j in range(i + 1, n):
104
+ if cosine_sim[i, j] > threshold:
105
+ union(i, j)
106
+
107
+ # Collect clusters and prepare output
108
+ clusters = {}
109
+ for i in range(n):
110
+ root = find(i)
111
+ if root not in clusters:
112
+ clusters[root] = []
113
+ clusters[root].append(i)
114
+
115
+ # Prepare output dataframes
116
+ cluster_data = []
117
+ unique_indices = []
118
+
119
+ for cluster_id, indices in clusters.items():
120
+ if len(indices) > 1:
121
+ for index in indices:
122
+ cluster_data.append({
123
+ "Cluster_ID": cluster_id,
124
+ "Index": index,
125
+ "Title": df.iloc[index]["Title"],
126
+ "Authors": df.iloc[index]["Authors"],
127
+ "DOI": df.iloc[index]["DOI"],
128
+ "Abstract": df.iloc[index]["Abstract"]
129
+ })
130
+ unique_indices.append(indices[0]) # Keep first occurrence
131
+ else:
132
+ unique_indices.extend(indices)
133
+
134
+ clusters_df = pd.DataFrame(cluster_data) if cluster_data else pd.DataFrame(columns=["Cluster_ID", "Index", "Title", "Authors", "DOI", "Abstract"])
135
+ unique_df = df.iloc[unique_indices].copy()
136
+
137
+ # Reset index to ensure it starts from 0
138
+ unique_df = unique_df.reset_index(drop=True)
139
+ # Name the index 'Index' to match the NBIB/RIS extraction output format
140
+ unique_df.index.name = 'Index'
141
+
142
+ return clusters_df, unique_df
143
+
144
+ def standardize_output(self, df):
145
+ """
146
+ Ensure output dataframe has consistent format
147
+
148
+ Args:
149
+ df: DataFrame to standardize
150
+
151
+ Returns:
152
+ DataFrame with standardized format
153
+ """
154
+ # Make sure the index is named 'Index' (df.index.name may be None at this point)
+ if df.index.name != 'Index':
+ df = df.reset_index(drop=True)
+ df.index.name = 'Index'
158
+
159
+ # Ensure all required columns exist
160
+ required_columns = ['Title', 'Authors', 'Abstract', 'DOI']
161
+ for col in required_columns:
162
+ if col not in df.columns:
163
+ df[col] = ''
164
+
165
+ # Select and order columns while preserving the index
166
+ df = df[required_columns]
167
+ return df
168
+
169
+ def standardize_clusters(self, df):
170
+ """
171
+ Ensure clusters dataframe has consistent format
172
+
173
+ Args:
174
+ df: DataFrame containing cluster information
175
+
176
+ Returns:
177
+ DataFrame with standardized cluster format
178
+ """
179
+ required_columns = ['Cluster_ID', 'Index', 'Title', 'Authors', 'DOI', 'Abstract']
180
+ for col in required_columns:
181
+ if col not in df.columns:
182
+ df[col] = ''
183
+ return df[required_columns]
file_processor.py ADDED
@@ -0,0 +1,407 @@
1
+ import os
2
+ import pandas as pd
3
+ import logging
4
+ import re
5
+ from typing import Tuple, Optional
6
+
7
+ class FileProcessor:
8
+ def __init__(self, data_dir: str):
9
+ """
10
+ Initialize FileProcessor
11
+
12
+ Args:
13
+ data_dir: Directory path for storing processed data
14
+ """
15
+ self.data_dir = data_dir
16
+
17
+ def parse_nbib(self, file_path: str) -> Tuple[Optional[str], str]:
18
+ """
19
+ Parse NBIB file and return Excel output path and preview text
20
+
21
+ Args:
22
+ file_path: Path to the NBIB file to parse
23
+
24
+ Returns:
25
+ tuple: (output_path, preview_text) where:
26
+ - output_path: Path to the generated Excel file (None if parsing fails)
27
+ - preview_text: Preview of the parsed data or error message
28
+ """
29
+ if not file_path or not os.path.exists(file_path):
30
+ return None, "Invalid file"
31
+
32
+ try:
33
+ records = []
34
+ record = {}
35
+ authors = []
36
+ current_field = None
37
+
38
+ with open(file_path, 'r', encoding='utf-8') as f:
39
+ lines = f.readlines()
40
+
41
+ if not lines:
42
+ return None, "Empty file"
43
+
44
+ # Process each line in the NBIB file
45
+ for line in lines:
46
+ if line.startswith('TI - '):
47
+ record['Title'] = line.replace('TI - ', '').strip()
48
+ current_field = 'Title'
49
+ elif line.startswith('AB - '):
50
+ record['Abstract'] = line.replace('AB - ', '').strip()
51
+ current_field = 'Abstract'
52
+ elif line.startswith('AU - '):
53
+ authors.append(line.replace('AU - ', '').strip())
54
+ current_field = None
55
+ elif line.startswith('LID - '):
56
+ if '[doi]' in line:
57
+ doi_part = line.replace('LID - ', '').strip()
58
+ record['DOI'] = doi_part.replace(' [doi]', '').strip()
59
+ current_field = None
60
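+ # A new PMID tag marks the start of the next record, so flush the record collected so far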
+ elif line.startswith('PMID- '):
61
+ if record: # Save the previous record
62
+ record['Authors'] = '; '.join(authors)
63
+ records.append(record)
64
+ record = {}
65
+ authors = []
66
+ current_field = None
67
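+ # Indented lines are wrapped continuations of the field currently being read (Title or Abstract)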
+ elif line.startswith(' ') and current_field in ['Abstract', 'Title']:
68
+ record[current_field] += ' ' + line.strip()
69
+
70
+ # Save the last record if exists
71
+ if record:
72
+ record['Authors'] = '; '.join(authors)
73
+ records.append(record)
74
+
75
+ # Create DataFrame and save to Excel
76
+ df = pd.DataFrame(records)
77
+ df.index.name = 'Index'
78
+ output_path = os.path.join(self.data_dir, "extracted_data.xlsx")
79
+ df.to_excel(output_path, index=True)
80
+ preview = self._generate_preview(records)
81
+
82
+ return output_path, preview
83
+
84
+ except Exception as e:
85
+ return None, f"Error processing NBIB file: {str(e)}"
86
+
87
+ def parse_wos_ris(self, file_path: str) -> Tuple[Optional[str], str]:
88
+ """
89
+ Parse Web of Science RIS file and return Excel output path and preview text
90
+
91
+ Args:
92
+ file_path: Path to the WOS RIS file to parse
93
+
94
+ Returns:
95
+ tuple: (output_path, preview_text) where:
96
+ - output_path: Path to the generated Excel file (None if parsing fails)
97
+ - preview_text: Preview of the parsed data or error message
98
+ """
99
+ if not file_path or not os.path.exists(file_path):
100
+ return None, "Invalid file"
101
+
102
+ try:
103
+ records = []
104
+ record = {}
105
+ authors = []
106
+ current_field = None
107
+
108
+ with open(file_path, 'r', encoding='utf-8') as f:
109
+ content = f.read()
110
+
111
+ if not content:
112
+ return None, "Empty file"
113
+
114
+ # Split content into individual articles
115
+ articles = content.split("\nER -")
116
+
117
+ for article in articles:
118
+ if not article.strip():
119
+ continue
120
+
121
+ record = {}
122
+ authors = []
123
+
124
+ # Process each line in the article
125
+ lines = article.strip().split('\n')
126
+ for line in lines:
127
+ if not line.strip():
128
+ continue
129
+ if line.startswith('TI - '):
130
+ record['Title'] = line.replace('TI - ', '').strip()
131
+ elif line.startswith('AB - '):
132
+ record['Abstract'] = line.replace('AB - ', '').strip()
133
+ elif line.startswith('AU - '):
134
+ authors.append(line.replace('AU - ', '').strip())
135
+ elif line.startswith('DO - '):
136
+ record['DOI'] = line.replace('DO - ', '').strip()
137
+ elif line.startswith(' '):
138
+ if 'Abstract' in record:
139
+ record['Abstract'] += ' ' + line.strip()
140
+ elif 'Title' in record:
141
+ record['Title'] += ' ' + line.strip()
142
+
143
+ if record:
144
+ record['Authors'] = '; '.join(authors)
145
+ records.append(record)
146
+
147
+ # Create DataFrame with required columns
148
+ df = pd.DataFrame(records)
149
+ required_columns = ['Title', 'Abstract', 'Authors', 'DOI']
150
+ for col in required_columns:
151
+ if col not in df.columns:
152
+ df[col] = ''
153
+ df.index.name = 'Index'
154
+ output_path = os.path.join(self.data_dir, "extracted_data.xlsx")
155
+ df.to_excel(output_path, index=True)
156
+ preview = self._generate_preview(records)
157
+
158
+ return output_path, preview
159
+
160
+ except Exception as e:
161
+ return None, f"Error processing WOS RIS file: {str(e)}"
162
+
163
+ def parse_embase_ris(self, file_path: str) -> Tuple[Optional[str], str]:
164
+ """
165
+ Parse Embase RIS file and return Excel output path and preview text
166
+
167
+ Args:
168
+ file_path: Path to the Embase RIS file to parse
169
+
170
+ Returns:
171
+ tuple: (output_path, preview_text) where:
172
+ - output_path: Path to the generated Excel file (None if parsing fails)
173
+ - preview_text: Preview of the parsed data or error message
174
+ """
175
+ if not file_path or not os.path.exists(file_path):
176
+ return None, "Invalid file"
177
+
178
+ try:
179
+ records = []
180
+ record = {}
181
+ authors = []
182
+ current_field = None
183
+
184
+ with open(file_path, 'r', encoding='utf-8') as f:
185
+ content = f.read()
186
+
187
+ if not content:
188
+ return None, "Empty file"
189
+
190
+ # Split content into individual articles
191
+ articles = content.split("\n\n")
192
+
193
+ for article in articles:
194
+ if not article.strip():
195
+ continue
196
+
197
+ record = {}
198
+ authors = []
199
+
200
+ # Process each line in the article
201
+ lines = article.strip().split('\n')
202
+ for line in lines:
203
+ if not line.strip():
204
+ continue
205
+ if line.startswith('T1 - '): # Title field
206
+ record['Title'] = line.replace('T1 - ', '').strip()
207
+ elif line.startswith('N2 - '): # Abstract field
208
+ record['Abstract'] = line.replace('N2 - ', '').strip()
209
+ elif line.startswith('A1 - '): # Authors field
210
+ authors.append(line.replace('A1 - ', '').strip())
211
+ elif line.startswith('DO - '): # DOI field
212
+ record['DOI'] = line.replace('DO - ', '').strip()
213
+ elif line.startswith(' '): # Handle multi-line fields
214
+ if 'Abstract' in record:
215
+ record['Abstract'] += ' ' + line.strip()
216
+ elif 'Title' in record:
217
+ record['Title'] += ' ' + line.strip()
218
+
219
+ if record:
220
+ record['Authors'] = '; '.join(authors) if authors else ''
221
+ records.append(record)
222
+
223
+ # Create DataFrame with required columns
224
+ df = pd.DataFrame(records)
225
+ required_columns = ['Title', 'Abstract', 'Authors', 'DOI']
226
+ for col in required_columns:
227
+ if col not in df.columns:
228
+ df[col] = ''
229
+ df.index.name = 'Index'
230
+ output_path = os.path.join(self.data_dir, "extracted_data.xlsx")
231
+ df.to_excel(output_path, index=True)
232
+ preview = self._generate_preview(records)
233
+
234
+ return output_path, preview
235
+
236
+ except Exception as e:
237
+ return None, f"Error processing Embase RIS file: {str(e)}"
238
+
239
+ def parse_scopus_ris(self, file_path: str) -> Tuple[Optional[str], str]:
240
+ """
241
+ Parse Scopus RIS file and return Excel output path and preview text
242
+
243
+ Args:
244
+ file_path: Path to the Scopus RIS file to parse
245
+
246
+ Returns:
247
+ tuple: (output_path, preview_text) where:
248
+ - output_path: Path to the generated Excel file (None if parsing fails)
249
+ - preview_text: Preview of the parsed data or error message
250
+ """
251
+ if not file_path or not os.path.exists(file_path):
252
+ return None, "Invalid file"
253
+
254
+ try:
255
+ records = []
256
+ with open(file_path, 'r', encoding='utf-8') as f:
257
+ content = f.read()
258
+ if not content:
259
+ return None, "Empty file"
260
+
261
+ # Split records on the "ER  -" end-of-record tag, allowing variable whitespace around the dash
262
+ articles = re.split(r'\nER\s*-\s*', content)
263
+
264
+ for article in articles:
265
+ if not article.strip():
266
+ continue
267
+ record = {}
268
+ authors = []
269
+ lines = article.strip().split('\n')
270
+ for line in lines:
271
+ line = line.strip()
272
+ if not line:
273
+ continue
274
+ if line.startswith('TI - '):
275
+ record['Title'] = line.replace('TI - ', '').strip()
276
+ elif line.startswith('AB - '):
277
+ record['Abstract'] = line.replace('AB - ', '').strip()
278
+ elif line.startswith('AU - '):
279
+ authors.append(line.replace('AU - ', '').strip())
280
+ elif line.startswith('DO - '):
281
+ record['DOI'] = line.replace('DO - ', '').strip()
282
+ elif line.startswith(' '):
283
+ if 'Abstract' in record:
284
+ record['Abstract'] += ' ' + line.strip()
285
+ elif 'Title' in record:
286
+ record['Title'] += ' ' + line.strip()
287
+ record['Authors'] = '; '.join(authors)
288
+ records.append(record)
289
+
290
+ # Create DataFrame with required columns
291
+ df = pd.DataFrame(records)
292
+ required_columns = ['Title', 'Abstract', 'Authors', 'DOI']
293
+ for col in required_columns:
294
+ if col not in df.columns:
295
+ df[col] = ''
296
+ df.index.name = 'Index'
297
+ output_path = os.path.join(self.data_dir, "extracted_data.xlsx")
298
+ df.to_excel(output_path, index=True)
299
+ preview = self._generate_preview(records)
300
+
301
+ return output_path, preview
302
+
303
+ except Exception as e:
304
+ return None, f"Error processing Scopus RIS file: {str(e)}"
305
+
306
+ def _generate_preview(self, records: list) -> str:
307
+ """
308
+ Generate a preview text for the first few parsed records
309
+
310
+ Args:
311
+ records: List of parsed records
312
+
313
+ Returns:
314
+ str: Formatted preview text showing sample records
315
+ """
316
+ preview = ""
317
+ for i, record in enumerate(records[:3], 0):
318
+ preview += f"\nRecord {i}:\n"
319
+ preview += f"DOI: {record.get('DOI', '')[:50]}\n"
320
+ preview += f"Title: {record.get('Title', '')[:100]}...\n"
321
+ preview += f"Authors: {record.get('Authors', '')[:100]}...\n"
322
+ preview += f"Abstract: {record.get('Abstract', '')[:200]}...\n"
323
+ preview += "-" * 80 + "\n"
324
+
325
+ preview += f"\nTotal records extracted: {len(records)}"
326
+ return preview
327
+
328
+ def load_excel(self, file_path: str) -> Optional[pd.DataFrame]:
329
+ """
330
+ Load Excel file and ensure the index is set correctly
331
+
332
+ Args:
333
+ file_path: Path to the Excel file to load
334
+
335
+ Returns:
336
+ DataFrame or None if loading fails
337
+ """
338
+ try:
339
+ # First try to read with index_col=0
340
+ df = pd.read_excel(file_path, index_col=0)
341
+
342
+ # If Index is still in columns, it means it wasn't properly set as index
343
+ if "Index" in df.columns:
344
+ df.set_index("Index", inplace=True)
345
+ elif df.index.name != "Index":
346
+ df.index.name = "Index"
347
+
348
+ # Ensure index is string type and handle any potential NaN values
349
+ df.index = df.index.astype(str)
350
+ df.index = df.index.str.strip()
351
+
352
+ # Remove any duplicate indices by keeping the first occurrence
353
+ if df.index.duplicated().any():
354
+ logging.warning(f"Found duplicate indices in {file_path}")
355
+ df = df[~df.index.duplicated(keep='first')]
356
+
357
+ logging.debug(f"Loaded DataFrame from {file_path}")
358
+ logging.debug(f"Shape: {df.shape}")
359
+ logging.debug(f"Columns: {df.columns.tolist()}")
360
+ logging.debug(f"Index name: {df.index.name}")
361
+ logging.debug(f"First few indices: {df.index.tolist()[:5]}")
362
+
363
+ return df
364
+ except Exception as e:
365
+ logging.error(f"Error loading Excel file: {str(e)}")
366
+ return None
367
+
368
+ def save_excel(self, df: pd.DataFrame, filename: str) -> str:
369
+ """
370
+ Save a DataFrame to an Excel file
371
+
372
+ Args:
373
+ df: DataFrame to save
374
+ filename: Target filename
375
+
376
+ Returns:
377
+ str: Path to the saved file or empty string if saving fails
378
+ """
379
+ try:
380
+ # Ensure we have a copy to avoid modifying the original
381
+ df = df.copy()
382
+
383
+ # Ensure index is properly named
384
+ if df.index.name != "Index":
385
+ df.index.name = "Index"
386
+
387
+ # Ensure index is string type
388
+ df.index = df.index.astype(str)
389
+
390
+ # Remove any duplicate indices
391
+ if df.index.duplicated().any():
392
+ logging.warning(f"Found duplicate indices when saving {filename}")
393
+ df = df[~df.index.duplicated(keep='first')]
394
+
395
+ output_path = os.path.join(self.data_dir, filename)
396
+
397
+ # Save with index
398
+ df.to_excel(output_path, index=True)
399
+
400
+ logging.debug(f"Saved DataFrame to {output_path}")
401
+ logging.debug(f"Shape: {df.shape}")
402
+ logging.debug(f"Columns: {df.columns.tolist()}")
403
+
404
+ return output_path
405
+ except Exception as e:
406
+ logging.error(f"Error saving Excel file: {str(e)}")
407
+ return ""
model_manager.py ADDED
@@ -0,0 +1,528 @@
1
+ import os
2
+ import json
3
+ import requests
4
+ import logging
5
+ import time
6
+ import re
7
+ from typing import Dict, Any
8
+ from dotenv import load_dotenv
9
+
10
+ # Ensure .env file is loaded (with override enabled to pick up any modifications)
11
+ load_dotenv(override=True)
12
+
13
+ class ModelManager:
14
+ def __init__(self):
15
+ # Load base configuration from environment variables
16
+ self.model_configs = {
17
+ "model_a": {
18
+ "api_key": os.getenv("MODEL_A_API_KEY", ""),
19
+ "api_url": os.getenv("MODEL_A_API_URL", ""),
20
+ "model": os.getenv("MODEL_A_MODEL_NAME", ""),
21
+ "name": "Model A (Primary Analyzer)",
22
+ "temperature": float(os.getenv("MODEL_A_TEMPERATURE", "0.3")),
23
+ "max_tokens": int(os.getenv("MODEL_A_MAX_TOKENS", "4096")),
24
+ "batch_size": int(os.getenv("MODEL_A_BATCH_SIZE", "10")),
25
+ "threads": int(os.getenv("MODEL_A_THREADS", "8")),
26
+ "timeout": int(os.getenv("MODEL_A_TIMEOUT", "180")),
27
+ "is_inference": os.getenv("MODEL_A_IS_INFERENCE", "").lower() == "true",
28
+ "updated": False # flag to indicate if manually updated
29
+ },
30
+ "model_b": {
31
+ "api_key": os.getenv("MODEL_B_API_KEY", ""),
32
+ "api_url": os.getenv("MODEL_B_API_URL", ""),
33
+ "model": os.getenv("MODEL_B_MODEL_NAME", ""),
34
+ "name": "Model B (Critical Reviewer)",
35
+ "temperature": float(os.getenv("MODEL_B_TEMPERATURE", "0.3")),
36
+ "max_tokens": int(os.getenv("MODEL_B_MAX_TOKENS", "4096")),
37
+ "batch_size": int(os.getenv("MODEL_B_BATCH_SIZE", "10")),
38
+ "threads": int(os.getenv("MODEL_B_THREADS", "8")),
39
+ "timeout": int(os.getenv("MODEL_B_TIMEOUT", "180")),
40
+ "is_inference": os.getenv("MODEL_B_IS_INFERENCE", "").lower() == "true",
41
+ "updated": False
42
+ },
43
+ "model_c": {
44
+ "api_key": os.getenv("MODEL_C_API_KEY", ""),
45
+ "api_url": os.getenv("MODEL_C_API_URL", ""),
46
+ "model": os.getenv("MODEL_C_MODEL_NAME", ""),
47
+ "name": "Model C (Final Arbitrator)",
48
+ "temperature": float(os.getenv("MODEL_C_TEMPERATURE", "0.3")),
49
+ "max_tokens": int(os.getenv("MODEL_C_MAX_TOKENS", "4096")),
50
+ "batch_size": int(os.getenv("MODEL_C_BATCH_SIZE", "10")),
51
+ "threads": int(os.getenv("MODEL_C_THREADS", "8")),
52
+ "timeout": int(os.getenv("MODEL_C_TIMEOUT", "180")),
53
+ "is_inference": os.getenv("MODEL_C_IS_INFERENCE", "").lower() == "true",
54
+ "updated": False
55
+ }
56
+ }
57
+
58
+ # Validate API keys
59
+ for model_key, config in self.model_configs.items():
60
+ if not config["api_key"]:
61
+ logging.warning(f"API key not found for {config['name']}")
62
+
63
+ def update_model_config(self, model_key: str, config: Dict[str, Any]) -> None:
64
+ """Update model configuration."""
65
+ if model_key not in self.model_configs:
66
+ raise ValueError(f"Invalid model key: {model_key}")
67
+ self.model_configs[model_key].update(config)
68
+
69
+ def process_model_response(self, model_key: str, response: str) -> Dict:
70
+ """Process response based on model type."""
71
+ try:
72
+ logging.debug(f"Raw response from {model_key}: {response}")
73
+ logging.debug(f"Response type: {type(response)}")
74
+
75
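+ # The API reply is JSON whose message content is itself JSON, so it is decoded in two passes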
+ # Parse outer JSON
76
+ response_obj = json.loads(response) if isinstance(response, str) else response
77
+ logging.debug(f"Parsed response object: {json.dumps(response_obj, indent=2)}")
78
+
79
+ # Process based on mode
80
+ if self.model_configs[model_key].get("is_inference", False):
81
+ logging.debug(f"Processing {model_key} response in inference mode")
82
+ logging.debug(f"Model config: {json.dumps(self.model_configs[model_key], indent=2)}")
83
+ return self.process_inference_result(response_obj, model_key)
84
+
85
+ # Get content from response
86
+ if not isinstance(response_obj, dict):
87
+ logging.error(f"Invalid response format from {model_key}: {response_obj}")
88
+ return self.get_default_response(model_key)
89
+
90
+ if "choices" not in response_obj:
91
+ logging.error(f"No choices in response: {response_obj}")
92
+ return self.get_default_response(model_key)
93
+
94
+ if not response_obj["choices"]:
95
+ logging.error(f"Empty choices in response: {response_obj}")
96
+ return self.get_default_response(model_key)
97
+
98
+ content = response_obj["choices"][0].get("message", {}).get("content", "")
99
+ logging.debug(f"Extracted content: {content}")
100
+
101
+ if not content:
102
+ logging.error(f"Empty content in {model_key} response")
103
+ return self.get_default_response(model_key)
104
+
105
+ # Handle markdown code blocks
106
+ if "```json" in content:
107
+ pattern = r"```json\s*(.*?)\s*```"
108
+ match = re.search(pattern, content, re.DOTALL)
109
+ if match:
110
+ content = match.group(1).strip()
111
+ logging.debug(f"Extracted JSON from markdown: {content}")
112
+
113
+ # Parse inner JSON
114
+ try:
115
+ result = json.loads(content)
116
+ logging.debug(f"Parsed content result: {json.dumps(result, indent=2)}")
117
+
118
+ # Validate results field
119
+ if "results" not in result:
120
+ logging.error(f"Missing 'results' field in {model_key} response")
121
+ return self.get_default_response(model_key)
122
+
123
+ # Validate each result item
124
+ valid_results = []
125
+ for item in result.get("results", []):
126
+ logging.debug(f"Processing result item: {json.dumps(item, indent=2)}")
127
+ if not isinstance(item, dict):
128
+ logging.error(f"Invalid result item format: {item}")
129
+ continue
130
+ if "Index" not in item:
131
+ logging.error(f"Missing Index in result item: {item}")
132
+ continue
133
+ valid_results.append(item)
134
+
135
+ if not valid_results:
136
+ logging.error(f"No valid results found in {model_key} response")
137
+ return self.get_default_response(model_key)
138
+
139
+ result["results"] = valid_results
140
+ return result
141
+
142
+ except json.JSONDecodeError as e:
143
+ logging.error(f"JSON parse error for {model_key}: {str(e)}")
144
+ logging.error(f"Content causing error: {content}")
145
+ return self.get_default_response(model_key)
146
+
147
+ except Exception as e:
148
+ logging.error(f"Error processing {model_key} response: {str(e)}")
149
+ logging.error("Full traceback:", exc_info=True)
150
+ return self.get_default_response(model_key)
151
+
152
+ def get_default_response(self, model_key: str) -> Dict:
153
+ """
154
+ Return default response format for each model type.
155
+
156
+ Args:
157
+ model_key: Identifier of the model.
158
+
159
+ Returns:
160
+ Dict containing default response structure.
161
+ """
162
+ if model_key == "model_a":
163
+ return {
164
+ "results": [{
165
+ "Index": "0",
166
+ "A_P": "not applicable",
167
+ "A_I": "not applicable",
168
+ "A_C": "not applicable",
169
+ "A_O": "not applicable",
170
+ "A_S": "not applicable",
171
+ "A_Decision": False,
172
+ "A_Reason": "API call failed or returned no results"
173
+ }]
174
+ }
175
+ elif model_key == "model_b":
176
+ return {
177
+ "results": [{
178
+ "Index": "0",
179
+ "B_P": "not applicable",
180
+ "B_I": "not applicable",
181
+ "B_C": "not applicable",
182
+ "B_O": "not applicable",
183
+ "B_S": "not applicable",
184
+ "B_Decision": False,
185
+ "B_Reason": "API call failed or returned no results"
186
+ }]
187
+ }
188
+ else: # model_c
189
+ return {
190
+ "results": [{
191
+ "Index": "0",
192
+ "C_Decision": False,
193
+ "C_Reason": "API call failed or returned no results"
194
+ }]
195
+ }
196
+
197
+ def process_inference_result(self, result: Dict, model_key: str) -> Dict:
198
+ """
199
+ Process inference model results.
200
+
201
+ Args:
202
+ result: Raw inference result.
203
+ model_key: Identifier of the model.
204
+
205
+ Returns:
206
+ Dict containing processed inference results.
207
+ """
208
+ try:
209
+ if not isinstance(result, dict) or "choices" not in result:
210
+ logging.error(f"Invalid inference result format from {model_key}")
211
+ return self.get_default_response(model_key)
212
+
213
+ for choice in result["choices"]:
214
+ if "message" not in choice:
215
+ logging.warning(f"Missing message in choice: {choice}")
216
+ continue
217
+
218
+ content = choice["message"].get("content", "")
219
+ if not content:
220
+ logging.warning(f"Empty content in {model_key} choice")
221
+ choice["message"]["content"] = json.dumps(self.get_default_response(model_key))
222
+ continue
223
+
224
+ # Handle markdown code blocks
225
+ if "```json" in content:
226
+ pattern = r"```json\s*(.*?)\s*```"
227
+ match = re.search(pattern, content, re.DOTALL)
228
+ if match:
229
+ content = match.group(1).strip()
230
+ logging.debug(f"Extracted JSON from markdown in inference result: {content}")
231
+
232
+ try:
233
+ content_data = json.loads(content)
234
+ logging.debug(f"Parsed inference content: {json.dumps(content_data, indent=2, ensure_ascii=False)}")
235
+
236
+ # Return the parsed content data directly, not the original response
237
+ return content_data
238
+
239
+ except json.JSONDecodeError as e:
240
+ logging.error(f"Failed to parse {model_key} inference content: {str(e)}")
241
+ logging.error(f"Content was: {content}")
242
+ return self.get_default_response(model_key)
243
+
244
+ return self.get_default_response(model_key)
245
+
246
+ except Exception as e:
247
+ logging.error(f"Error processing {model_key} inference result: {str(e)}")
248
+ return self.get_default_response(model_key)
249
+
250
+ def process_reviews(self, result: Dict, model_key: str) -> Dict:
251
+ """
252
+ Process reviews format response.
253
+
254
+ Args:
255
+ result: Raw review data.
256
+ model_key: Identifier of the model.
257
+
258
+ Returns:
259
+ Dict containing processed reviews.
260
+ """
261
+ try:
262
+ if not isinstance(result.get("reviews", []), list):
263
+ logging.error("Invalid reviews format")
264
+ return {"reviews": []}
265
+
266
+ field_name = "B_Reason" if model_key == "model_b" else "C_Reason"
267
+ for review in result["reviews"]:
268
+ if field_name in review:
269
+ # Remove duplicate Reason fields
270
+ if isinstance(review[field_name], list):
271
+ review[field_name] = review[field_name][-1]
272
+
273
+ # Process inference content (remove think tags etc.)
274
+ review[field_name] = self.process_inference_response(review[field_name])
275
+
276
+ return result
277
+ except Exception as e:
278
+ logging.error(f"Error processing reviews: {str(e)}")
279
+ return {"reviews": []}
280
+
281
+ def process_inference_response(self, response: str) -> str:
282
+ """
283
+ Process special markers in inference response.
284
+
285
+ Args:
286
+ response: Raw inference response string.
287
+
288
+ Returns:
289
+ Processed response string with special markers removed.
290
+ """
291
+ try:
292
+ if not isinstance(response, str):
293
+ return response
294
+
295
+ # Remove thinking process
296
+ response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
297
+
298
+ # Remove HTML tags
299
+ response = re.sub(r'<[^>]+>', '', response)
300
+
301
+ # Clean extra whitespace
302
+ response = re.sub(r'\n\s*\n', '\n\n', response.strip())
303
+
304
+ return response
305
+
306
+ except Exception as e:
307
+ logging.error(f"Error processing inference response: {str(e)}")
308
+ return response
309
+
310
+ def test_api_connection(self, model_key: str) -> str:
311
+ """
312
+ Test API connection for a specific model.
313
+
314
+ Args:
315
+ model_key: Identifier of the model to test.
316
+
317
+ Returns:
318
+ String indicating connection status.
319
+ """
320
+ config = self.model_configs.get(model_key)
321
+ if not config:
322
+ return f"❌ Configuration not found for {model_key}"
323
+
324
+ try:
325
+ headers = {
326
+ "Content-Type": "application/json",
327
+ "Authorization": f"Bearer {config['api_key']}"
328
+ }
329
+
330
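+ # Minimal probe request: a single "test" message capped at 10 tokens, just to verify the endpoint and key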
+ data = {
331
+ "model": config["model"],
332
+ "messages": [{"role": "user", "content": "test"}],
333
+ "temperature": config["temperature"],
334
+ "max_tokens": 10
335
+ }
336
+
337
+ response = requests.post(
338
+ config["api_url"],
339
+ headers=headers,
340
+ json=data,
341
+ timeout=10
342
+ )
343
+
344
+ if response.status_code == 200:
345
+ return f"✓ {config['name']} connection successful"
346
+ else:
347
+ return f"❌ {config['name']} connection failed: {response.status_code}"
348
+
349
+ except Exception as e:
350
+ return f"❌ {config['name']} connection error: {str(e)}"
351
+
352
+ def call_api(self, model_key: str, prompt: str) -> Dict:
353
+ """Call API with retry mechanism and improved error handling."""
354
+ try:
355
+ config = self.model_configs.get(model_key)
356
+ if not config:
357
+ logging.error(f"Configuration not found for {model_key}")
358
+ raise Exception(f"Configuration not found for {model_key}")
359
+
360
+ logging.debug(f"API call config for {model_key}: {json.dumps({k:v for k,v in config.items() if k != 'api_key'}, indent=2)}")
361
+
362
+ headers = {
363
+ "Content-Type": "application/json",
364
+ "Authorization": f"Bearer {config['api_key']}"
365
+ }
366
+ logging.debug(f"Request headers: {json.dumps({k:v for k,v in headers.items() if k != 'Authorization'}, indent=2)}")
367
+
368
+ data = {
369
+ "model": config["model"],
370
+ "messages": [
371
+ {"role": "system", "content": "You are a helpful assistant specialized in analyzing medical literature based on PICOS criteria."},
372
+ {"role": "user", "content": prompt}
373
+ ],
374
+ "temperature": config["temperature"],
375
+ "max_tokens": config["max_tokens"]
376
+ }
377
+ logging.debug(f"Request data: {json.dumps(data, indent=2)}")
378
+
379
+ max_retries = 3
380
+ retry_delay = 1
381
+
382
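+ # Retry failed or timed-out requests up to max_retries times, sleeping between attempts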
+ for attempt in range(max_retries):
383
+ try:
384
+ logging.debug(f"Attempt {attempt + 1} of {max_retries}")
385
+ response = requests.post(
386
+ config["api_url"],
387
+ headers=headers,
388
+ json=data,
389
+ timeout=config["timeout"]
390
+ )
391
+
392
+ logging.debug(f"API Response status: {response.status_code}")
393
+ logging.debug(f"API Response headers: {dict(response.headers)}")
394
+
395
+ if response.status_code != 200:
396
+ error_msg = f"API call failed for {config.get('name', model_key)}: {response.status_code} {response.reason}"
397
+ if response.text:
398
+ error_msg += f"\nResponse: {response.text}"
399
+ logging.error(error_msg)
400
+ if attempt < max_retries - 1:
401
+ time.sleep(retry_delay * (attempt + 1))
402
+ continue
403
+ raise Exception(error_msg)
404
+
405
+ return self.process_model_response(model_key, response.text)
406
+
407
+ except requests.Timeout:
408
+ logging.error(f"Timeout on attempt {attempt + 1}/{max_retries}")
409
+ if attempt < max_retries - 1:
410
+ time.sleep(retry_delay * (attempt + 1))
411
+ continue
412
+ raise Exception(f"API call timed out after {max_retries} attempts")
413
+
414
+ except Exception as e:
415
+ logging.error(f"API call error for {config.get('name', model_key)}: {str(e)}")
416
+ logging.error("Full traceback:", exc_info=True)
417
+ if attempt < max_retries - 1:
418
+ time.sleep(retry_delay)
419
+ continue
420
+ raise
421
+
422
+ raise Exception(f"API call failed after {max_retries} attempts")
423
+
424
+ except Exception as e:
425
+ logging.error(f"Fatal error in API call: {str(e)}")
426
+ logging.error("Full traceback:", exc_info=True)
427
+ raise
428
+
429
+ def get_config(self, model_key: str) -> Dict[str, Any]:
430
+ """
431
+ Get model configuration.
432
+ This method re-reads environment variables for models that haven't been manually updated.
433
+ """
434
+ # Reload environment variables from .env file to capture any modifications
435
+ load_dotenv(override=True)
436
+ if model_key not in self.model_configs:
437
+ return {}
438
+ config = self.model_configs[model_key]
439
+ if not config.get("updated", False):
440
+ # For models not manually updated, refresh config from environment variables
441
+ if model_key == "model_a":
442
+ refreshed_config = {
443
+ "api_key": os.getenv("MODEL_A_API_KEY", ""),
444
+ "api_url": os.getenv("MODEL_A_API_URL", ""),
445
+ "model": os.getenv("MODEL_A_MODEL_NAME", ""),
446
+ "name": "Model A (Primary Analyzer)",
447
+ "temperature": float(os.getenv("MODEL_A_TEMPERATURE", "0.3")),
448
+ "max_tokens": int(os.getenv("MODEL_A_MAX_TOKENS", "4096")),
449
+ "batch_size": int(os.getenv("MODEL_A_BATCH_SIZE", "10")),
450
+ "threads": int(os.getenv("MODEL_A_THREADS", "8")),
451
+ "timeout": int(os.getenv("MODEL_A_TIMEOUT", "180")),
452
+ "is_inference": os.getenv("MODEL_A_IS_INFERENCE", "").lower() == "true",
453
+ "updated": False
454
+ }
455
+ elif model_key == "model_b":
456
+ refreshed_config = {
457
+ "api_key": os.getenv("MODEL_B_API_KEY", ""),
458
+ "api_url": os.getenv("MODEL_B_API_URL", ""),
459
+ "model": os.getenv("MODEL_B_MODEL_NAME", ""),
460
+ "name": "Model B (Critical Reviewer)",
461
+ "temperature": float(os.getenv("MODEL_B_TEMPERATURE", "0.3")),
462
+ "max_tokens": int(os.getenv("MODEL_B_MAX_TOKENS", "4096")),
463
+ "batch_size": int(os.getenv("MODEL_B_BATCH_SIZE", "10")),
464
+ "threads": int(os.getenv("MODEL_B_THREADS", "8")),
465
+ "timeout": int(os.getenv("MODEL_B_TIMEOUT", "180")),
466
+ "is_inference": os.getenv("MODEL_B_IS_INFERENCE", "").lower() == "true",
467
+ "updated": False
468
+ }
469
+ elif model_key == "model_c":
470
+ refreshed_config = {
471
+ "api_key": os.getenv("MODEL_C_API_KEY", ""),
472
+ "api_url": os.getenv("MODEL_C_API_URL", ""),
473
+ "model": os.getenv("MODEL_C_MODEL_NAME", ""),
474
+ "name": "Model C (Final Arbitrator)",
475
+ "temperature": float(os.getenv("MODEL_C_TEMPERATURE", "0.3")),
476
+ "max_tokens": int(os.getenv("MODEL_C_MAX_TOKENS", "4096")),
477
+ "batch_size": int(os.getenv("MODEL_C_BATCH_SIZE", "10")),
478
+ "threads": int(os.getenv("MODEL_C_THREADS", "8")),
479
+ "timeout": int(os.getenv("MODEL_C_TIMEOUT", "180")),
480
+ "is_inference": os.getenv("MODEL_C_IS_INFERENCE", "").lower() == "true",
481
+ "updated": False
482
+ }
483
+ else:
484
+ refreshed_config = {}
485
+ self.model_configs[model_key] = refreshed_config
486
+ config = refreshed_config
487
+ return config
488
+
489
+ def process_analysis(self, result: Dict, model_key: str) -> Dict:
490
+ """
491
+ Process analysis format response.
492
+
493
+ Args:
494
+ result: Raw analysis data.
495
+ model_key: Identifier of the model.
496
+
497
+ Returns:
498
+ Dict containing processed analysis.
499
+ """
500
+ try:
501
+ if not isinstance(result.get("analysis", []), list):
502
+ logging.error("Invalid analysis format")
503
+ return {"analysis": []}
504
+
505
+ # Process each analysis item
506
+ for analysis in result["analysis"]:
507
+ if "A_Reason" in analysis:
508
+ # Remove duplicate Reason fields
509
+ if isinstance(analysis["A_Reason"], list):
510
+ analysis["A_Reason"] = analysis["A_Reason"][-1]
511
+
512
+ # Process inference content (remove think tags etc.)
513
+ analysis["A_Reason"] = self.process_inference_response(analysis["A_Reason"])
514
+
515
+ # Ensure boolean fields are proper booleans
516
+ if "A_Decision" in analysis:
517
+ analysis["A_Decision"] = bool(analysis["A_Decision"])
518
+
519
+ # Ensure all PICOS fields are strings
520
+ for field in ["A_P", "A_I", "A_C", "A_O", "A_S"]:
521
+ if field in analysis:
522
+ analysis[field] = str(analysis[field])
523
+
524
+ return result
525
+
526
+ except Exception as e:
527
+ logging.error(f"Error processing analysis: {str(e)}")
528
+ return {"analysis": []}
prompt_manager.py ADDED
@@ -0,0 +1,191 @@
1
+ from typing import Dict
2
+
3
+ class PromptManager:
4
+ def __init__(self):
5
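+ # Placeholders such as {population} are substituted later, so literal JSON braces are doubled as {{ and }}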
+ self.prompts = {
6
+ "model_a": """You are a medical research expert analyzing clinical trial abstracts.
7
+ Your task is to analyze each abstract and determine if it matches the PICOS criteria.
8
+
9
+ Target PICOS criteria:
10
+ - Population: {population}
11
+ - Intervention: {intervention}
12
+ - Comparison: {comparison}
13
+ - Outcome: {outcome}
14
+ - Study Design: {study_design}
15
+
16
+ Input abstracts:
17
+ {abstracts_json}
18
+
19
+ Each article in the input contains:
20
+ - index: article identifier
21
+ - abstract: the text to analyze
22
+
23
+ IMPORTANT: You must follow these strict JSON formatting rules:
24
+ 1. Use double quotes for all strings
25
+ 2. Ensure all strings are properly terminated
26
+ 3. Use commas between array items and object properties
27
+ 4. Do not use trailing commas
28
+ 5. Keep the response concise and avoid unnecessary whitespace
29
+ 6. Escape any special characters in strings
30
+ 7. Use true/false (not True/False) for boolean values
31
+
32
+ Provide your analysis in this exact JSON format:
33
+ {{
34
+ "results": [
35
+ {{
36
+ "Index": "ARTICLE_INDEX",
37
+ "A_P": "brief population description",
38
+ "A_I": "brief intervention description",
39
+ "A_C": "brief comparison description",
40
+ "A_O": "brief outcome description",
41
+ "A_S": "brief study design description",
42
+ "A_Decision": true/false,
43
+ "A_Reason": "brief reasoning for match/mismatch"
44
+ }},
45
+ ...
46
+ ]
47
+ }}
48
+
49
+ Keep all descriptions brief and focused. Do not include line breaks or special characters in the text fields.
50
+ If any field is not found in the abstract, use "not specified" as the value.
51
+ Be strict in your evaluation and ensure the output is valid JSON.""",
52
+
53
+ "model_b": """You are a critical reviewer in a systematic review team.
54
+ Your task is to rigorously scrutinize Model A's analysis and provide your own assessment.
55
+ You should actively look for potential flaws or oversights in Model A's analysis, while maintaining a high standard of evidence-based evaluation.
56
+
57
+ Target PICOS criteria:
58
+ - Population: {population}
59
+ - Intervention: {intervention}
60
+ - Comparison: {comparison}
61
+ - Outcome: {outcome}
62
+ - Study Design: {study_design}
63
+
64
+ Input abstracts:
65
+ {abstracts_json}
66
+
67
+ Each article in the input contains:
68
+ - Index: article identifier
69
+ - abstract: original article abstract
70
+ - model_a_analysis:
71
+ - A_P: Model A's population description
72
+ - A_I: Model A's intervention description
73
+ - A_C: Model A's comparison description
74
+ - A_O: Model A's outcome description
75
+ - A_S: Model A's study design description
76
+ - A_Decision: Model A's inclusion decision
77
+ - A_Reason: Model A's explanation
78
+
79
+ Your task is to:
80
+ 1. Thoroughly examine the original abstract
81
+ 2. Critically review Model A's PICOS extraction, actively seeking potential issues:
82
+ - Look for missing details or nuances in population characteristics
83
+ - Check for precise intervention specifications
84
+ - Verify completeness of comparison group description
85
+ - Examine outcome measurements and their relevance
86
+ - Scrutinize study design classification
87
+ 3. Provide corrections with evidence from the abstract:
88
+ - B_P: Your corrected population description (use "-" only if A_P is completely accurate)
89
+ - B_I: Your corrected intervention description (use "-" only if A_I is completely accurate)
90
+ - B_C: Your corrected comparison description (use "-" only if A_C is completely accurate)
91
+ - B_O: Your corrected outcome description (use "-" only if A_O is completely accurate)
92
+ - B_S: Your corrected study design description (use "-" only if A_S is completely accurate)
93
+ 4. Make your own independent inclusion decision (B_Decision)
94
+ 5. Provide detailed reasoning (B_Reason) that:
95
+ - Points out any oversights or inaccuracies in Model A's analysis
96
+ - Cites specific evidence from the abstract
97
+ - Explains why your corrections or agreements are justified
98
+
99
+ IMPORTANT: You must follow these strict JSON formatting rules:
100
+ 1. Use double quotes for all strings
101
+ 2. Ensure all strings are properly terminated
102
+ 3. Use commas between array items and object properties
103
+ 4. Do not use trailing commas
104
+ 5. Keep the response concise and avoid unnecessary whitespace
105
+ 6. Escape any special characters in strings
106
+ 7. Use true/false for B_Decision (true means the article should be included)
107
+ 8. ALL fields (B_P, B_I, B_C, B_O, B_S) must be provided for each review
108
+ 9. NEVER omit any field, even if you agree with Model A's analysis
109
+ 10. For B_S specifically, you must either provide a corrected study design description or use "-" if you agree with A_S
110
+
111
+ Return your analysis in this exact JSON format:
112
+ {{
113
+ "results": [
114
+ {{
115
+ "Index": "ARTICLE_INDEX",
116
+ "B_Decision": true/false,
117
+ "B_Reason": "detailed reasoning with evidence from abstract",
118
+ "B_P": "-" or "corrected population description with evidence",
119
+ "B_I": "-" or "corrected intervention description with evidence",
120
+ "B_C": "-" or "corrected comparison description with evidence",
121
+ "B_O": "-" or "corrected outcome description with evidence",
122
+ "B_S": "-" or "corrected study design description with evidence"
123
+ }},
124
+ ...
125
+ ]
126
+ }}
127
+
128
+ Keep descriptions focused and evidence-based. Do not include line breaks or special characters.
129
+ Use "-" only when you are completely certain that Model A's extraction is accurate and complete.
130
+ Your B_Decision should be based on whether the article meets all PICOS criteria.
131
+ Remember to be thorough in your critique while maintaining objectivity and evidence-based reasoning.
132
+
133
+ CRITICAL: You MUST include ALL fields in your response, especially B_S. If you agree with Model A's study design analysis, use "-" for B_S, but NEVER omit it.""",
134
+
135
+ "model_c": """You are the final arbitrator in a systematic review team.
136
+ Your task is to analyze the assessments from Model A and Model B, and make a final decision.
137
+
138
+ Target PICOS criteria:
139
+ - Population: {population}
140
+ - Intervention: {intervention}
141
+ - Comparison: {comparison}
142
+ - Outcome: {outcome}
143
+ - Study Design: {study_design}
144
+
145
+ Input abstracts:
146
+ {abstracts_json}
147
+
148
+ Each article in the input contains:
149
+ - Index: article identifier
150
+ - abstract: original article abstract
151
+ - model_a_analysis: Model A's assessment
152
+ - model_b_analysis: Model B's assessment
153
+
154
+ Your task is to:
155
+ 1. Review the original abstract
156
+ 2. Compare Model A and Model B's assessments
157
+ 3. Make a final decision considering:
158
+ - Accuracy of PICOS criteria matching
159
+ - Validity of reasoning from both models
160
+ - Evidence from the abstract
161
+ 4. Provide your final assessment:
162
+ - C_Decision: final inclusion decision
163
+ - C_Reason: detailed explanation of your decision
164
+ - Note any disagreements between models and how you resolved them
165
+
166
+ Return your analysis in this exact JSON format:
167
+ {{
168
+ "results": [
169
+ {{
170
+ "Index": "ARTICLE_INDEX",
171
+ "C_Decision": true/false,
172
+ "C_Reason": "detailed reasoning with evidence"
173
+ }},
174
+ ...
175
+ ]
176
+ }}
177
+
178
+ Keep your reasoning focused and evidence-based.
179
+ Your C_Decision should be based on whether the article truly meets all PICOS criteria.
180
+ Be thorough in your analysis while maintaining objectivity."""
181
+ }
182
+
183
+ def update_prompt(self, model_key: str, prompt: str) -> None:
184
+ """Update model prompt"""
185
+ if model_key not in self.prompts:
186
+ raise ValueError(f"Invalid model key: {model_key}")
187
+ self.prompts[model_key] = prompt
188
+
189
+ def get_prompt(self, model_key: str) -> str:
190
+ """Get model prompt"""
191
+ return self.prompts.get(model_key, "")
renovate.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "$schema": "https://docs.renovatebot.com/renovate-schema.json",
+ "extends": [
+ "config:recommended"
+ ]
+ }
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ pandas>=1.5.0
+ requests>=2.31.0
+ python-dotenv>=1.0.0
+ tqdm>=4.66.0
+ tabulate>=0.9.0
+ gradio>=4.19.0
+ xlrd
+ scikit-learn>=1.3.0
+ openpyxl>=3.1.2
result_processor.py ADDED
@@ -0,0 +1,393 @@
+ import pandas as pd
+ import logging
+ from typing import Dict
+ import json
+ import re
+
+ class ResultProcessor:
+     def __init__(self):
+         """Initialize ResultProcessor with required column definitions for each model"""
+         # Define required columns for each model's output
+         self.required_columns = {
+             "model_a": ["A_Decision", "A_Reason", "A_P", "A_I", "A_C", "A_O", "A_S"],
+             "model_b": ["B_Decision", "B_Reason", "B_P", "B_I", "B_C", "B_O", "B_S"],
+             "model_c": ["C_Decision", "C_Reason"]
+         }
+
+         # Define the order of columns in the final Excel output
+         self.output_columns = [
+             "Index",
+             "A_Decision", "A_Reason", "A_P", "A_I", "A_C", "A_O", "A_S",
+             "B_Decision", "B_Reason", "B_P", "B_I", "B_C", "B_O", "B_S",
+             "C_Decision", "C_Reason"
+         ]
+
+     def validate_model_response(self, result: Dict, model_key: str) -> None:
+         """
+         Validate the response format from each model
+
+         Args:
+             result: The model's response to validate
+             model_key: The identifier of the model ('model_a', 'model_b', or 'model_c')
+
+         Raises:
+             Exception: If the response format is invalid
+         """
+         # Log validation start
+         logging.debug(f"Starting validation for {model_key}")
+         logging.debug(f"Raw result type: {type(result)}")
+
+         if model_key == "model_a":
+             # Check if response is in completion format
+             if "choices" in result and len(result["choices"]) > 0:
+                 content = result["choices"][0].get("message", {}).get("content", "")
+                 if content:
+                     try:
+                         # Handle markdown-wrapped JSON content
+                         json_content = content
+                         if "```json" in content:
+                             pattern = r"```json\s*(.*?)\s*```"
+                             match = re.search(pattern, content, re.DOTALL)
+                             if match:
+                                 json_content = match.group(1)
+                                 logging.debug(f"Extracted JSON content: {json_content}")
+
+                         # Parse JSON content
+                         parsed = json.loads(json_content)
+                         if isinstance(parsed, dict) and "results" in parsed:
+                             result.clear()
+                             result.update(parsed)
+                             logging.debug("Successfully parsed Model A response")
+                     except json.JSONDecodeError as e:
+                         raise Exception(f"Invalid JSON in Model A response content: {content}. Error: {str(e)}")
+
+             # Validate Model A specific format
+             if not isinstance(result, dict):
+                 raise Exception("Invalid Model A response format: result is not a dictionary")
+             if "results" not in result:
+                 raise Exception("Invalid Model A response format: missing 'results' field")
+             if not isinstance(result["results"], list):
+                 raise Exception("Invalid Model A response format: 'results' is not a list")
+             if not result["results"]:
+                 raise Exception("Empty results array in Model A response")
+
+             # Validate each result item
+             for item in result["results"]:
+                 if not isinstance(item, dict):
+                     raise Exception(f"Invalid result item format: {item}")
+                 if "Index" not in item:
+                     raise Exception(f"Missing 'Index' in result item: {item}")
+                 missing_fields = [field for field in self.required_columns[model_key] if field not in item]
+                 if missing_fields:
+                     raise Exception(f"Missing fields in result item: {missing_fields}")
+
+         elif model_key == "model_b":
+             # Handle Model B's response format
+             if "choices" in result and len(result["choices"]) > 0:
+                 content = result["choices"][0].get("message", {}).get("content", "")
+                 if content:
+                     try:
+                         json_content = content
+                         if "```json" in content:
+                             pattern = r"```json\s*(.*?)\s*```"
+                             match = re.search(pattern, content, re.DOTALL)
+                             if match:
+                                 json_content = match.group(1)
+                                 logging.debug(f"Extracted JSON content for Model B: {json_content}")
+
+                         parsed = json.loads(json_content)
+                         if isinstance(parsed, dict) and "results" in parsed:
+                             result.clear()
+                             result.update(parsed)
+                             logging.debug("Successfully parsed Model B response")
+                     except json.JSONDecodeError as e:
+                         raise Exception(f"Invalid JSON in Model B response content: {content}. Error: {str(e)}")
+
+             # Validate Model B specific format
+             if not isinstance(result, dict):
+                 raise Exception("Invalid Model B response format: result is not a dictionary")
+             if "results" not in result:
+                 raise Exception("Invalid Model B response format: missing 'results' field")
+             if not isinstance(result["results"], list):
+                 raise Exception("Invalid Model B response format: 'results' is not a list")
+             if not result["results"]:
+                 raise Exception("Empty results array in Model B response")
+
+             # Validate each result item
+             for item in result["results"]:
+                 if not isinstance(item, dict):
+                     raise Exception(f"Invalid result item format: {item}")
+                 if "Index" not in item:
+                     raise Exception(f"Missing 'Index' in result item: {item}")
+                 missing_fields = [field for field in self.required_columns[model_key] if field not in item]
+                 if missing_fields:
+                     raise Exception(f"Missing fields in Model B result: {missing_fields}")
+
+         else:  # model_c
+             # Handle Model C's response format
+             if "choices" in result and len(result["choices"]) > 0:
+                 content = result["choices"][0].get("message", {}).get("content", "")
+                 if content:
+                     try:
+                         json_content = content
+                         if "```json" in content:
+                             pattern = r"```json\s*(.*?)\s*```"
+                             match = re.search(pattern, content, re.DOTALL)
+                             if match:
+                                 json_content = match.group(1)
+                                 logging.debug(f"Extracted JSON content for Model C: {json_content}")
+
+                         parsed = json.loads(json_content)
+                         if isinstance(parsed, dict) and "results" in parsed:
+                             result.clear()
+                             result.update(parsed)
+                             logging.debug("Successfully parsed Model C response")
+                     except json.JSONDecodeError as e:
+                         raise Exception(f"Invalid JSON in Model C response content: {content}. Error: {str(e)}")
+
+             # Validate Model C specific format
+             if not isinstance(result, dict):
+                 raise Exception("Invalid Model C response format: result is not a dictionary")
+             if "results" not in result:
+                 raise Exception("Invalid Model C response format: missing 'results' field")
+             if not isinstance(result["results"], list):
+                 raise Exception("Invalid Model C response format: 'results' is not a list")
+             if not result["results"]:
+                 raise Exception("Empty results array in Model C response")
+
+             # Validate each result item
+             for item in result["results"]:
+                 if not isinstance(item, dict):
+                     raise Exception(f"Invalid result item format: {item}")
+                 if "Index" not in item:
+                     raise Exception(f"Missing 'Index' in result item: {item}")
+                 missing_fields = [field for field in self.required_columns[model_key] if field not in item]
+                 if missing_fields:
+                     raise Exception(f"Missing fields in Model C result: {missing_fields}")
+                 try:
+                     str(item["Index"])
+                     bool(item["C_Decision"])
+                     str(item["C_Reason"])
+                 except (ValueError, TypeError) as e:
+                     raise Exception(f"Invalid data type in Model C result: {str(e)}")
+
+         # Log successful validation
+         logging.debug(f"Validation completed successfully for {model_key}")
+
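To make the unwrapping above concrete, here is a minimal sketch (assuming the class is imported from result_processor.py) of a chat-completion payload whose content arrives wrapped in a markdown json code fence: validate_model_response extracts and parses the JSON, rewrites the dict in place, and the caller can then read result["results"] directly.

    from result_processor import ResultProcessor

    processor = ResultProcessor()
    response = {
        "choices": [{
            "message": {
                "content": '```json\n{"results": [{"Index": "7", '
                           '"C_Decision": false, "C_Reason": "Wrong study design"}]}\n```'
            }
        }]
    }
    processor.validate_model_response(response, "model_c")
    print(response["results"][0]["C_Decision"])  # prints False; the payload dict was rewritten in place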
+     def merge_results(self, df: pd.DataFrame, model_results: Dict[str, pd.DataFrame]) -> pd.DataFrame:
+         """
+         Merge all model results with correct column alignment and compute final decision
+
+         Args:
+             df: Original DataFrame with abstracts
+             model_results: Dictionary containing results from each model
+
+         Returns:
+             DataFrame with merged results from all models
+         """
+         try:
+             # Copy and clean the original DataFrame's index (remove potential whitespace)
+             df = df.copy()
+             df.index = df.index.astype(str).str.strip()
+
+             # Handle missing values and clean base columns
+             for col in ["Abstract", "DOI", "Title", "Authors"]:
+                 if col in df.columns:
+                     df[col] = df[col].fillna("").astype(str)
+                     df[col] = df[col].apply(lambda x: x.strip() if isinstance(x, str) else "")
+                     df[col] = df[col].replace(r'^[\s-]*$', "", regex=True)
+
+             # Create base DataFrame for merging model results
+             merged_df = df.copy()
+
+             def join_model_results(base_df: pd.DataFrame, model_key: str) -> pd.DataFrame:
+                 """
+                 Merge results from a specific model, ensuring data alignment and cleaning
+
+                 Args:
+                     base_df: Base DataFrame to merge with
+                     model_key: Identifier of the model
+
+                 Returns:
+                     DataFrame with merged model results
+                 """
+                 if model_key not in model_results:
+                     logging.warning(f"{model_key} results not found")
+                     # Create default values for all rows
+                     for col in self.required_columns[model_key]:
+                         if col.endswith('_Decision'):
+                             base_df[col] = False
+                         elif col.endswith('_Reason'):
+                             base_df[col] = "Not applicable - No model result"
+                         else:
+                             base_df[col] = "not applicable"
+                     return base_df
+
+                 try:
+                     model_df = model_results[model_key].copy()
+                     # Ensure model result indices and column names are strings without whitespace
+                     model_df.index = model_df.index.astype(str).str.strip()
+                     model_df.columns = model_df.columns.astype(str).str.strip()
+
+                     # Ensure all required columns exist
+                     for col in self.required_columns[model_key]:
+                         if col not in model_df.columns:
+                             if col.endswith('_Decision'):
+                                 model_df[col] = False
+                             elif col.endswith('_Reason'):
+                                 model_df[col] = "Not applicable - Missing column"
+                             else:
+                                 model_df[col] = "not applicable"
+
+                     # Add default values for indices present in original data but missing in model results
+                     missing_indices = set(base_df.index) - set(model_df.index)
+                     if missing_indices:
+                         logging.info(f"Found {len(missing_indices)} missing entries in {model_key}")
+                         default_values = pd.DataFrame(
+                             index=list(missing_indices),
+                             columns=self.required_columns[model_key]
+                         )
+                         for col in self.required_columns[model_key]:
+                             if col.endswith('_Decision'):
+                                 default_values[col] = False
+                             elif col.endswith('_Reason'):
+                                 default_values[col] = "Not applicable - No result"
+                             else:
+                                 default_values[col] = "not applicable"
+                         model_df = pd.concat([model_df, default_values])
+
+                     # Select only required columns
+                     model_df = model_df[self.required_columns[model_key]]
+
+                     # Use left join to preserve all original data indices
+                     result = pd.merge(
+                         base_df,
+                         model_df,
+                         left_index=True,
+                         right_index=True,
+                         how='left'
+                     )
+
+                     # Fill potential NaN values
+                     for col in self.required_columns[model_key]:
+                         if col in result.columns:
+                             if col.endswith('_Decision'):
+                                 result[col] = result[col].fillna(False)
+                             elif col.endswith('_Reason'):
+                                 result[col] = result[col].fillna("Not applicable - Missing value")
+                             else:
+                                 result[col] = result[col].fillna("not applicable")
+
+                     return result
+
+                 except Exception as e:
+                     logging.error(f"Error processing {model_key} results: {str(e)}")
+                     # Return base DataFrame with default values
+                     for col in self.required_columns[model_key]:
+                         if col.endswith('_Decision'):
+                             base_df[col] = False
+                         elif col.endswith('_Reason'):
+                             base_df[col] = f"Error processing {model_key} results: {str(e)}"
+                         else:
+                             base_df[col] = "not applicable"
+                     return base_df
+
+             # Merge results from each model in sequence
+             merged_df = join_model_results(merged_df, "model_a")
+             merged_df = join_model_results(merged_df, "model_b")
+
+             # Merge Model C results or generate default values
+             if "model_c" in model_results:
+                 merged_df = join_model_results(merged_df, "model_c")
+             else:
+                 merged_df["C_Decision"] = False
+                 merged_df["C_Reason"] = merged_df.apply(
+                     lambda row: "No disagreement between Model A and B"
+                     if pd.notna(row.get("A_Decision")) and pd.notna(row.get("B_Decision")) and row["A_Decision"] == row["B_Decision"]
+                     else "Not applicable - No Model C result",
+                     axis=1
+                 )
+
+             # Compute final decision based on model results
+             def compute_final_decision(row):
+                 """
+                 Compute final decision based on available model decisions
+                 Priority: Model C > Agreement between A&B > Model B > Model A > False
+                 """
+                 try:
+                     if pd.notna(row.get("C_Decision")):
+                         return bool(row["C_Decision"])
+                     elif pd.notna(row.get("A_Decision")) and pd.notna(row.get("B_Decision")):
+                         if bool(row["A_Decision"]) == bool(row["B_Decision"]):
+                             return bool(row["A_Decision"])
+                         else:
+                             return bool(row["B_Decision"])  # Use Model B's result in case of disagreement
+                     elif pd.notna(row.get("B_Decision")):
+                         return bool(row["B_Decision"])
+                     elif pd.notna(row.get("A_Decision")):
+                         return bool(row["A_Decision"])
+                 except Exception as e:
+                     logging.error(f"Error computing final decision: {str(e)}")
+                 return False
+
+             merged_df["Final_Decision"] = merged_df.apply(compute_final_decision, axis=1)
+
+             # Define final output columns and their order
+             output_cols = [
+                 "Title", "DOI", "Abstract", "Authors",
+                 *self.required_columns.get("model_a", []),
+                 *self.required_columns.get("model_b", []),
+                 *self.required_columns.get("model_c", []),
+                 "Final_Decision"
+             ]
+
+             # Ensure all required columns exist (assign default values if missing)
+             for col in output_cols:
+                 if col not in merged_df.columns:
+                     if col.endswith('Decision'):
+                         merged_df[col] = False
+                     elif col.endswith('Reason'):
+                         merged_df[col] = "Not applicable - Missing column"
+                     else:
+                         merged_df[col] = ""
+
+             # Select existing columns in the specified order
+             existing_cols = [col for col in output_cols if col in merged_df.columns]
+             merged_df = merged_df[existing_cols]
+
+             # Final cleaning of all column values
+             for col in merged_df.columns:
+                 if col.endswith('Decision'):
+                     merged_df[col] = merged_df[col].fillna(False).astype(bool)
+                 elif col.endswith('Reason'):
+                     merged_df[col] = merged_df[col].fillna("Not applicable - Missing value")
+                 elif col in ["Title", "DOI", "Abstract", "Authors"]:
+                     merged_df[col] = merged_df[col].fillna("").astype(str)
+                 else:
+                     merged_df[col] = merged_df[col].fillna("not applicable")
+
+             # Add index as a column in the final result
+             merged_df.insert(0, "Index", merged_df.index)
+
+             return merged_df
+
+         except Exception as e:
+             logging.error(f"Error merging results: {str(e)}")
+             # Return a minimal DataFrame with error information
+             error_df = pd.DataFrame(index=df.index)
+             error_df["Error"] = f"Failed to merge results: {str(e)}"
+             return error_df
+
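A short usage sketch of the merge (invented rows and decisions, again assuming the class is imported from result_processor.py). Columns a model did not supply, such as A_P through A_S here, are back-filled with "not applicable", and because Model C has ruled on both rows, Final_Decision takes its values per the priority documented in compute_final_decision:

    import pandas as pd
    from result_processor import ResultProcessor

    articles = pd.DataFrame(
        {"Title": ["Trial X", "Review Y"], "Abstract": ["...", "..."],
         "DOI": ["", ""], "Authors": ["", ""]},
        index=["0", "1"],
    )
    model_results = {
        "model_a": pd.DataFrame({"A_Decision": [True, False],
                                 "A_Reason": ["meets PICOS", "no comparator"]}, index=["0", "1"]),
        "model_b": pd.DataFrame({"B_Decision": [True, True],
                                 "B_Reason": ["meets PICOS", "eligible"]}, index=["0", "1"]),
        "model_c": pd.DataFrame({"C_Decision": [True, False],
                                 "C_Reason": ["confirms inclusion", "comparator missing"]}, index=["0", "1"]),
    }
    merged = ResultProcessor().merge_results(articles, model_results)
    print(merged[["Index", "A_Decision", "B_Decision", "C_Decision", "Final_Decision"]])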
+     def export_to_excel(self, df: pd.DataFrame, filename: str) -> None:
+         """
+         Export DataFrame to Excel file
+
+         Args:
+             df: DataFrame to export
+             filename: Target Excel file path
+         """
+         try:
+             df.to_excel(filename, index=False)
+             logging.info(f"Exported results to {filename} successfully.")
+         except Exception as e:
+             logging.error(f"Error exporting to Excel: {str(e)}")