chitsanfei committed
Commit · d082b18
0 Parent(s)
init: init
Browse files
- .Rhistory +512 -0
- .env.example +35 -0
- .github/workflows/deploy_to_hf_space.yml +43 -0
- .gitignore +181 -0
- LICENSE +661 -0
- README.md +97 -0
- analyzer.py +511 -0
- app.py +724 -0
- deduplicator.py +183 -0
- file_processor.py +407 -0
- model_manager.py +528 -0
- prompt_manager.py +191 -0
- renovate.json +6 -0
- requirements.txt +9 -0
- result_processor.py +393 -0
.Rhistory ADDED
@@ -0,0 +1,512 @@
b_false_c_false <- sum(!data$B_Decision & !data$C_Decision, na.rm = TRUE)
b_false_c_na <- sum(!data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
# C -> Final
c_true_final_true <- sum(data$C_Decision & data$Final_Decision, na.rm = TRUE)
c_true_final_false <- sum(data$C_Decision & !data$Final_Decision, na.rm = TRUE)
c_false_final_true <- sum(!data$C_Decision & data$Final_Decision, na.rm = TRUE)
c_false_final_false <- sum(!data$C_Decision & !data$Final_Decision, na.rm = TRUE)
c_na_final_true <- sum(is.na(data$C_Decision) & data$Final_Decision, na.rm = TRUE)
c_na_final_false <- sum(is.na(data$C_Decision) & !data$Final_Decision, na.rm = TRUE)
# Prepare the link data
links <- data.frame(
  source = c(
    # A -> B
    rep(0, 2), rep(1, 2),
    # B -> C
    rep(2, 3), rep(3, 3),
    # C -> Final
    rep(4, 2), rep(5, 2), rep(6, 2)
  ),
  target = c(
    # A -> B
    2, 3, 2, 3,
    # B -> C
    4, 5, 6, 4, 5, 6,
    # C -> Final
    7, 8, 7, 8, 7, 8
  ),
  value = c(
    # A -> B
    a_true_b_true, a_true_b_false, a_false_b_true, a_false_b_false,
    # B -> C
    b_true_c_true, b_true_c_false, b_true_c_na,
    b_false_c_true, b_false_c_false, b_false_c_na,
    # C -> Final
    c_true_final_true, c_true_final_false,
    c_false_final_true, c_false_final_false,
    c_na_final_true, c_na_final_false
  )
)
# Build the colour vector
my_color <- 'd3.scaleOrdinal()
  .domain(["Model A True", "Model A False",
           "Model B True", "Model B False",
           "Model C True", "Model C False", "Model C NA",
           "Final True", "Final False"])
  .range(["#fbf8cc", "#fde4cf",
          "#FFCFD2", "#F1C0E8",
          "#CFBAF0", "#A3C4F3", "#90DBF4",
          "#98F5E1", "#B9FBC0"])'
# Draw the Sankey diagram
sankeyNetwork(Links = links, Nodes = nodes,
              Source = "source", Target = "target",
              Value = "value", NodeID = "name",
              sinksRight = TRUE,
              nodeWidth = 40,
              nodePadding = 20,
              colourScale = my_color,
              fontSize = 12,
              height = 500,
              width = 800)
# Save as an HTML file
saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
                          Source = "source", Target = "target",
                          Value = "value", NodeID = "name",
                          sinksRight = TRUE,
                          nodeWidth = 40,
                          nodePadding = 20,
                          colourScale = my_color,
                          fontSize = 12,
                          height = 500,
                          width = 800),
            "sankey_plot.html")
setwd("C:/Users/admin/Desktop/article-analyzer")
# Load the required packages
library(networkD3)
library(dplyr)
library(readr)
# Read the data
data <- read_csv("data/picos_analysis.csv")
# Prepare the node data
nodes <- data.frame(
  name = c(
    "Model A True", "Model A False",
    "Model B True", "Model B False",
    "Model C True", "Model C False", "Model C NA",
    "Final True", "Final False"
  )
)
# Compute the flows
# A -> B
a_true_b_true <- sum(data$A_Decision & data$B_Decision, na.rm = TRUE)
a_true_b_false <- sum(data$A_Decision & !data$B_Decision, na.rm = TRUE)
a_false_b_true <- sum(!data$A_Decision & data$B_Decision, na.rm = TRUE)
a_false_b_false <- sum(!data$A_Decision & !data$B_Decision, na.rm = TRUE)
# B -> C
b_true_c_true <- sum(data$B_Decision & data$C_Decision, na.rm = TRUE)
b_true_c_false <- sum(data$B_Decision & !data$C_Decision, na.rm = TRUE)
b_true_c_na <- sum(data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
b_false_c_true <- sum(!data$B_Decision & data$C_Decision, na.rm = TRUE)
b_false_c_false <- sum(!data$B_Decision & !data$C_Decision, na.rm = TRUE)
b_false_c_na <- sum(!data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
# C -> Final
c_true_final_true <- sum(data$C_Decision & data$Final_Decision, na.rm = TRUE)
c_true_final_false <- sum(data$C_Decision & !data$Final_Decision, na.rm = TRUE)
c_false_final_true <- sum(!data$C_Decision & data$Final_Decision, na.rm = TRUE)
c_false_final_false <- sum(!data$C_Decision & !data$Final_Decision, na.rm = TRUE)
c_na_final_true <- sum(is.na(data$C_Decision) & data$Final_Decision, na.rm = TRUE)
c_na_final_false <- sum(is.na(data$C_Decision) & !data$Final_Decision, na.rm = TRUE)
# Prepare the link data
links <- data.frame(
  source = c(
    # A -> B
    rep(0, 2), rep(1, 2),
    # B -> C
    rep(2, 3), rep(3, 3),
    # C -> Final
    rep(4, 2), rep(5, 2), rep(6, 2)
  ),
  target = c(
    # A -> B
    2, 3, 2, 3,
    # B -> C
    4, 5, 6, 4, 5, 6,
    # C -> Final
    7, 8, 7, 8, 7, 8
  ),
  value = c(
    # A -> B
    a_true_b_true, a_true_b_false, a_false_b_true, a_false_b_false,
    # B -> C
    b_true_c_true, b_true_c_false, b_true_c_na,
    b_false_c_true, b_false_c_false, b_false_c_na,
    # C -> Final
    c_true_final_true, c_true_final_false,
    c_false_final_true, c_false_final_false,
    c_na_final_true, c_na_final_false
  )
)
# Build the colour vector
my_color <- 'd3.scaleOrdinal()
  .domain(["Model A True", "Model A False",
           "Model B True", "Model B False",
           "Model C True", "Model C False", "Model C NA",
           "Final True", "Final False"])
  .range(["#fbf8cc", "#fde4cf",
          "#FFCFD2", "#F1C0E8",
          "#CFBAF0", "#A3C4F3", "#90DBF4",
          "#98F5E1", "#B9FBC0"])'
# Draw the Sankey diagram
sankeyNetwork(Links = links, Nodes = nodes,
              Source = "source", Target = "target",
              Value = "value", NodeID = "name",
              sinksRight = TRUE,
              nodeWidth = 40,
              nodePadding = 20,
              colourScale = my_color,
              fontSize = 12,
              height = 500,
              width = 800)
# Save as an HTML file
saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
                          Source = "source", Target = "target",
                          Value = "value", NodeID = "name",
                          sinksRight = TRUE,
                          nodeWidth = 40,
                          nodePadding = 20,
                          colourScale = my_color,
                          fontSize = 12,
                          height = 500,
                          width = 800),
            "sankey_plot.html")
setwd("C:/Users/admin/Desktop/article-analyzer")
# Load the required packages
library(networkD3)
library(dplyr)
library(readr)
# Read the data
data <- read_csv("data/picos_analysis.csv")
# Prepare the node data
nodes <- data.frame(
  name = c(
    "Model A True", "Model A False",
    "Model B True", "Model B False",
    "Model C True", "Model C False", "Model C NA",
    "Final True", "Final False"
  ),
  group = c(
    "A True", "A False",
    "B True", "B False",
    "C True", "C False", "C NA",
    "F True", "F False"
  )
)
# Compute the flows
# A -> B
a_true_b_true <- sum(data$A_Decision & data$B_Decision, na.rm = TRUE)
a_true_b_false <- sum(data$A_Decision & !data$B_Decision, na.rm = TRUE)
a_false_b_true <- sum(!data$A_Decision & data$B_Decision, na.rm = TRUE)
a_false_b_false <- sum(!data$A_Decision & !data$B_Decision, na.rm = TRUE)
# B -> C
b_true_c_true <- sum(data$B_Decision & data$C_Decision, na.rm = TRUE)
b_true_c_false <- sum(data$B_Decision & !data$C_Decision, na.rm = TRUE)
b_true_c_na <- sum(data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
b_false_c_true <- sum(!data$B_Decision & data$C_Decision, na.rm = TRUE)
b_false_c_false <- sum(!data$B_Decision & !data$C_Decision, na.rm = TRUE)
b_false_c_na <- sum(!data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
# C -> Final
c_true_final_true <- sum(data$C_Decision & data$Final_Decision, na.rm = TRUE)
c_true_final_false <- sum(data$C_Decision & !data$Final_Decision, na.rm = TRUE)
c_false_final_true <- sum(!data$C_Decision & data$Final_Decision, na.rm = TRUE)
c_false_final_false <- sum(!data$C_Decision & !data$Final_Decision, na.rm = TRUE)
c_na_final_true <- sum(is.na(data$C_Decision) & data$Final_Decision, na.rm = TRUE)
c_na_final_false <- sum(is.na(data$C_Decision) & !data$Final_Decision, na.rm = TRUE)
# Prepare the link data
links <- data.frame(
  source = c(
    # A -> B
    rep(0, 2), rep(1, 2),
    # B -> C
    rep(2, 3), rep(3, 3),
    # C -> Final
    rep(4, 2), rep(5, 2), rep(6, 2)
  ),
  target = c(
    # A -> B
    2, 3, 2, 3,
    # B -> C
    4, 5, 6, 4, 5, 6,
    # C -> Final
    7, 8, 7, 8, 7, 8
  ),
  value = c(
    # A -> B
    a_true_b_true, a_true_b_false, a_false_b_true, a_false_b_false,
    # B -> C
    b_true_c_true, b_true_c_false, b_true_c_na,
    b_false_c_true, b_false_c_false, b_false_c_na,
    # C -> Final
    c_true_final_true, c_true_final_false,
    c_false_final_true, c_false_final_false,
    c_na_final_true, c_na_final_false
  )
)
# Build the colour vector
my_color <- 'function(d) {
  const colors = {
    "Model A True": "#fbf8cc",
    "Model A False": "#fde4cf",
    "Model B True": "#FFCFD2",
    "Model B False": "#F1C0E8",
    "Model C True": "#CFBAF0",
    "Model C False": "#A3C4F3",
    "Model C NA": "#90DBF4",
    "Final True": "#98F5E1",
    "Final False": "#B9FBC0"
  };
  return colors[d.name] || "#cccccc";
}'
# Draw the Sankey diagram
sankeyNetwork(Links = links, Nodes = nodes,
              Source = "source", Target = "target",
              Value = "value", NodeID = "name",
              sinksRight = TRUE,
              nodeWidth = 40,
              nodePadding = 20,
              colourScale = my_color,
              fontSize = 12,
              height = 500,
              width = 800)
# Save as an HTML file
saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
                          Source = "source", Target = "target",
                          Value = "value", NodeID = "name",
                          sinksRight = TRUE,
                          nodeWidth = 40,
                          nodePadding = 20,
                          colourScale = my_color,
                          fontSize = 12,
                          height = 500,
                          width = 800),
            "sankey_plot.html")
setwd("C:/Users/admin/Desktop/article-analyzer")
# Load the required packages
library(networkD3)
library(dplyr)
library(readr)
# Read the data
data <- read_csv("data/picos_analysis.csv")
# Prepare the node data
nodes <- data.frame(
  name = c(
    "Model A True", "Model A False",
    "Model B True", "Model B False",
    "Model C True", "Model C False", "Model C NA",
    "Final True", "Final False"
  ),
  group = c(
    "A True", "A False",
    "B True", "B False",
    "C True", "C False", "C NA",
    "F True", "F False"
  )
)
# Compute the flows
# A -> B
a_true_b_true <- sum(data$A_Decision & data$B_Decision, na.rm = TRUE)
a_true_b_false <- sum(data$A_Decision & !data$B_Decision, na.rm = TRUE)
a_false_b_true <- sum(!data$A_Decision & data$B_Decision, na.rm = TRUE)
a_false_b_false <- sum(!data$A_Decision & !data$B_Decision, na.rm = TRUE)
# B -> C
b_true_c_true <- sum(data$B_Decision & data$C_Decision, na.rm = TRUE)
b_true_c_false <- sum(data$B_Decision & !data$C_Decision, na.rm = TRUE)
b_true_c_na <- sum(data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
b_false_c_true <- sum(!data$B_Decision & data$C_Decision, na.rm = TRUE)
b_false_c_false <- sum(!data$B_Decision & !data$C_Decision, na.rm = TRUE)
b_false_c_na <- sum(!data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
# C -> Final
c_true_final_true <- sum(data$C_Decision & data$Final_Decision, na.rm = TRUE)
c_true_final_false <- sum(data$C_Decision & !data$Final_Decision, na.rm = TRUE)
c_false_final_true <- sum(!data$C_Decision & data$Final_Decision, na.rm = TRUE)
c_false_final_false <- sum(!data$C_Decision & !data$Final_Decision, na.rm = TRUE)
c_na_final_true <- sum(is.na(data$C_Decision) & data$Final_Decision, na.rm = TRUE)
c_na_final_false <- sum(is.na(data$C_Decision) & !data$Final_Decision, na.rm = TRUE)
# Prepare the link data
links <- data.frame(
  source = c(
    # A -> B
    rep(0, 2), rep(1, 2),
    # B -> C
    rep(2, 3), rep(3, 3),
    # C -> Final
    rep(4, 2), rep(5, 2), rep(6, 2)
  ),
  target = c(
    # A -> B
    2, 3, 2, 3,
    # B -> C
    4, 5, 6, 4, 5, 6,
    # C -> Final
    7, 8, 7, 8, 7, 8
  ),
  value = c(
    # A -> B
    a_true_b_true, a_true_b_false, a_false_b_true, a_false_b_false,
    # B -> C
    b_true_c_true, b_true_c_false, b_true_c_na,
    b_false_c_true, b_false_c_false, b_false_c_na,
    # C -> Final
    c_true_final_true, c_true_final_false,
    c_false_final_true, c_false_final_false,
    c_na_final_true, c_na_final_false
  )
)
# Build the colour vector
my_color <- paste0(
  'd3.scaleOrdinal()
   .domain(["A True", "A False",
            "B True", "B False",
            "C True", "C False", "C NA",
            "F True", "F False"])
   .range(["#fbf8cc", "#fde4cf",
           "#FFCFD2", "#F1C0E8",
           "#CFBAF0", "#A3C4F3", "#90DBF4",
           "#98F5E1", "#B9FBC0"])'
)
# Draw the Sankey diagram
sankeyNetwork(Links = links, Nodes = nodes,
              Source = "source", Target = "target",
              Value = "value", NodeID = "name",
              NodeGroup = "group",
              sinksRight = TRUE,
              nodeWidth = 40,
              nodePadding = 20,
              colourScale = my_color,
              fontSize = 12,
              height = 500,
              width = 800)
# Save as an HTML file
saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
                          Source = "source", Target = "target",
                          Value = "value", NodeID = "name",
                          NodeGroup = "group",
                          sinksRight = TRUE,
                          nodeWidth = 40,
                          nodePadding = 20,
                          colourScale = my_color,
                          fontSize = 12,
                          height = 500,
                          width = 800),
            "sankey_plot.html")
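The link values in this history are plain contingency counts over the decision columns. For comparison, the same counting step can be sketched in Python/pandas (column names are taken from the script above; the nullable-boolean cast is an assumption about how data/picos_analysis.csv parses):

import pandas as pd

data = pd.read_csv("data/picos_analysis.csv")

# Nullable booleans reproduce R's three-valued TRUE/FALSE/NA logic.
A = data["A_Decision"].astype("boolean")
B = data["B_Decision"].astype("boolean")
C = data["C_Decision"].astype("boolean")
F = data["Final_Decision"].astype("boolean")

# .sum() skips NA by default, mirroring na.rm = TRUE in the R sums.
a_true_b_true   = int((A & B).sum())
a_true_b_false  = int((A & ~B).sum())
b_true_c_na     = int((B & C.isna()).sum())
c_na_final_true = int((C.isna() & F).sum())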
.env.example ADDED
@@ -0,0 +1,35 @@
# Primary Model Configuration (Model A)
# Used for initial screening and basic PICOS criteria evaluation
MODEL_A_API_URL=https://api.example.com/v1
MODEL_A_API_KEY=your_model_a_api_key
MODEL_A_MODEL_NAME=model-a-name
MODEL_A_TEMPERATURE=0.3
MODEL_A_MAX_TOKENS=16384
MODEL_A_BATCH_SIZE=10
MODEL_A_THREADS=8
MODEL_A_TIMEOUT=180
MODEL_A_IS_INFERENCE=false

# Secondary Model Configuration (Model B)
# Used for detailed analysis and verification of Model A results
MODEL_B_API_URL=https://api.example.com/v1
MODEL_B_API_KEY=your_model_b_api_key
MODEL_B_MODEL_NAME=model-b-name
MODEL_B_TEMPERATURE=0.3
MODEL_B_MAX_TOKENS=16384
MODEL_B_BATCH_SIZE=10
MODEL_B_THREADS=8
MODEL_B_TIMEOUT=180
MODEL_B_IS_INFERENCE=false

# Arbitration Model Configuration (Model C)
# Used to resolve conflicts between Model A and B results
MODEL_C_API_URL=https://api.example.com/v1
MODEL_C_API_KEY=your_model_c_api_key
MODEL_C_MODEL_NAME=model-c-name
MODEL_C_TEMPERATURE=0.3
MODEL_C_MAX_TOKENS=16384
MODEL_C_BATCH_SIZE=10
MODEL_C_THREADS=8
MODEL_C_TIMEOUT=180
MODEL_C_IS_INFERENCE=false
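The three MODEL_*_ blocks describe the pipeline's screening, verification, and arbitration models. A minimal sketch of reading one such block in Python (variable names come from the file above; ModelConfig and load_model_config are illustrative helpers, not part of this repo):

import os
from dataclasses import dataclass

@dataclass
class ModelConfig:
    # Hypothetical container; fields mirror the MODEL_<X>_* variables above.
    api_url: str
    api_key: str
    model_name: str
    temperature: float
    max_tokens: int
    batch_size: int
    threads: int
    timeout: int
    is_inference: bool

def load_model_config(prefix: str) -> ModelConfig:
    """Read one block from the environment, e.g. load_model_config("MODEL_A")."""
    def get(key: str, default: str = "") -> str:
        return os.getenv(f"{prefix}_{key}", default)
    return ModelConfig(
        api_url=get("API_URL"),
        api_key=get("API_KEY"),
        model_name=get("MODEL_NAME"),
        temperature=float(get("TEMPERATURE", "0.3")),
        max_tokens=int(get("MAX_TOKENS", "16384")),
        batch_size=int(get("BATCH_SIZE", "10")),
        threads=int(get("THREADS", "8")),
        timeout=int(get("TIMEOUT", "180")),
        is_inference=get("IS_INFERENCE", "false").lower() == "true",
    )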
.github/workflows/deploy_to_hf_space.yml ADDED
@@ -0,0 +1,43 @@
# .github/workflows/deploy_to_hf_space.yml
name: Deploy Gradio to Hugging Face Spaces

on:
  push:
    branches:
      - master
  workflow_dispatch:

jobs:
  deploy:
    runs-on: ubuntu-latest
    env:
      HF_USERNAME: chitsanfei
      SPACE_NAME: review-screening-analyzer

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
          lfs: true

      - name: Set up Python 3.8
        uses: actions/setup-python@v4
        with:
          python-version: "3.8"

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Push to Hugging Face Space
        # HF_TOKEN must be configured under the repository's Settings → Secrets
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Configure the Git user identity
          git config --global user.name "${{ github.actor }}"
          git config --global user.email "${{ github.actor }}@users.noreply.github.com"
          # Force-push the current HEAD to the remote main branch
          git push -f \
            https://$HF_USERNAME:${{ secrets.HF_TOKEN }}@huggingface.co/spaces/$HF_USERNAME/$SPACE_NAME \
            HEAD:main
.gitignore ADDED
@@ -0,0 +1,181 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# PyPI configuration file
.pypirc

# Environment variables
.env
.env.local
.env.*.local

# For HF
.static/banner.png
data/*.xlsx
LICENSE ADDED
@@ -0,0 +1,661 @@
                    GNU AFFERO GENERAL PUBLIC LICENSE
                       Version 3, 19 November 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU Affero General Public License is a free, copyleft license for
software and other kinds of works, specifically designed to ensure
cooperation with the community in the case of network server software.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works.  By contrast,
our General Public Licenses are intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.

  When we speak of free software, we are referring to freedom, not
price.  Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  Developers that use our General Public Licenses protect your rights
with two steps: (1) assert copyright on the software, and (2) offer
you this License which gives you legal permission to copy, distribute
and/or modify the software.

  A secondary benefit of defending all users' freedom is that
improvements made in alternate versions of the program, if they
receive widespread use, become available for other developers to
incorporate.  Many developers of free software are heartened and
encouraged by the resulting cooperation.  However, in the case of
software used on network servers, this result may fail to come about.
The GNU General Public License permits making a modified version and
letting the public access it on a server without ever releasing its
source code to the public.

  The GNU Affero General Public License is designed specifically to
ensure that, in such cases, the modified source code becomes available
to the community.  It requires the operator of a network server to
provide the source code of the modified version running there to the
users of that server.  Therefore, public use of a modified version, on
a publicly accessible server, gives the public access to the source
code of the modified version.

  An older license, called the Affero General Public License and
published by Affero, was designed to accomplish similar goals.  This is
a different license, not a version of the Affero GPL, but Affero has
released a new version of the Affero GPL which permits relicensing under
this license.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU Affero General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License.  Each licensee is addressed as "you".  "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy.  The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy.  Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies.  Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License.  If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it.  "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form.  A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities.  However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work.  For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met.  This License explicitly affirms your unlimited
permission to run the unmodified Program.  The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work.  This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force.  You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright.  Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below.  Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit.  Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.

  6. Conveying Non-Source Forms.

  You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:

    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source.  This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.

    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge.  You need not require recipients to copy the
    Corresponding Source along with the object code.  If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source.  Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.

  A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.

  A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling.  In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage.  For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product.  A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.

  "Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source.  The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.

  If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information.  But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).

  The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed.  Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.

  Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.

  7. Additional Terms.

  "Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law.  If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.

  When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it.  (Additional permissions may be written to require their own
removal in certain cases when you modify the work.)  You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.

  Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or
|
| 359 |
+
|
| 360 |
+
c) Prohibiting misrepresentation of the origin of that material, or
|
| 361 |
+
requiring that modified versions of such material be marked in
|
| 362 |
+
reasonable ways as different from the original version; or
|
| 363 |
+
|
| 364 |
+
d) Limiting the use for publicity purposes of names of licensors or
|
| 365 |
+
authors of the material; or
|
| 366 |
+
|
| 367 |
+
e) Declining to grant rights under trademark law for use of some
|
| 368 |
+
trade names, trademarks, or service marks; or
|
| 369 |
+
|
| 370 |
+
f) Requiring indemnification of licensors and authors of that
|
| 371 |
+
material by anyone who conveys the material (or modified versions of
|
| 372 |
+
it) with contractual assumptions of liability to the recipient, for
|
| 373 |
+
any liability that these contractual assumptions directly impose on
|
| 374 |
+
those licensors and authors.
|
| 375 |
+
|
| 376 |
+
All other non-permissive additional terms are considered "further
|
| 377 |
+
restrictions" within the meaning of section 10. If the Program as you
|
| 378 |
+
received it, or any part of it, contains a notice stating that it is
|
| 379 |
+
governed by this License along with a term that is a further
|
| 380 |
+
restriction, you may remove that term. If a license document contains
|
| 381 |
+
a further restriction but permits relicensing or conveying under this
|
| 382 |
+
License, you may add to a covered work material governed by the terms
|
| 383 |
+
of that license document, provided that the further restriction does
|
| 384 |
+
not survive such relicensing or conveying.
|
| 385 |
+
|
| 386 |
+
If you add terms to a covered work in accord with this section, you
|
| 387 |
+
must place, in the relevant source files, a statement of the
|
| 388 |
+
additional terms that apply to those files, or a notice indicating
|
| 389 |
+
where to find the applicable terms.
|
| 390 |
+
|
| 391 |
+
Additional terms, permissive or non-permissive, may be stated in the
|
| 392 |
+
form of a separately written license, or stated as exceptions;
|
| 393 |
+
the above requirements apply either way.
|
| 394 |
+
|
| 395 |
+
8. Termination.
|
| 396 |
+
|
| 397 |
+
You may not propagate or modify a covered work except as expressly
|
| 398 |
+
provided under this License. Any attempt otherwise to propagate or
|
| 399 |
+
modify it is void, and will automatically terminate your rights under
|
| 400 |
+
this License (including any patent licenses granted under the third
|
| 401 |
+
paragraph of section 11).
|
| 402 |
+
|
| 403 |
+
However, if you cease all violation of this License, then your
|
| 404 |
+
license from a particular copyright holder is reinstated (a)
|
| 405 |
+
provisionally, unless and until the copyright holder explicitly and
|
| 406 |
+
finally terminates your license, and (b) permanently, if the copyright
|
| 407 |
+
holder fails to notify you of the violation by some reasonable means
|
| 408 |
+
prior to 60 days after the cessation.
|
| 409 |
+
|
| 410 |
+
Moreover, your license from a particular copyright holder is
|
| 411 |
+
reinstated permanently if the copyright holder notifies you of the
|
| 412 |
+
violation by some reasonable means, this is the first time you have
|
| 413 |
+
received notice of violation of this License (for any work) from that
|
| 414 |
+
copyright holder, and you cure the violation prior to 30 days after
|
| 415 |
+
your receipt of the notice.
|
| 416 |
+
|
| 417 |
+
Termination of your rights under this section does not terminate the
|
| 418 |
+
licenses of parties who have received copies or rights from you under
|
| 419 |
+
this License. If your rights have been terminated and not permanently
|
| 420 |
+
reinstated, you do not qualify to receive new licenses for the same
|
| 421 |
+
material under section 10.
|
| 422 |
+
|
| 423 |
+
9. Acceptance Not Required for Having Copies.
|
| 424 |
+
|
| 425 |
+
You are not required to accept this License in order to receive or
|
| 426 |
+
run a copy of the Program. Ancillary propagation of a covered work
|
| 427 |
+
occurring solely as a consequence of using peer-to-peer transmission
|
| 428 |
+
to receive a copy likewise does not require acceptance. However,
|
| 429 |
+
nothing other than this License grants you permission to propagate or
|
| 430 |
+
modify any covered work. These actions infringe copyright if you do
|
| 431 |
+
not accept this License. Therefore, by modifying or propagating a
|
| 432 |
+
covered work, you indicate your acceptance of this License to do so.
|
| 433 |
+
|
| 434 |
+
10. Automatic Licensing of Downstream Recipients.
|
| 435 |
+
|
| 436 |
+
Each time you convey a covered work, the recipient automatically
|
| 437 |
+
receives a license from the original licensors, to run, modify and
|
| 438 |
+
propagate that work, subject to this License. You are not responsible
|
| 439 |
+
for enforcing compliance by third parties with this License.
|
| 440 |
+
|
| 441 |
+
An "entity transaction" is a transaction transferring control of an
|
| 442 |
+
organization, or substantially all assets of one, or subdividing an
|
| 443 |
+
organization, or merging organizations. If propagation of a covered
|
| 444 |
+
work results from an entity transaction, each party to that
|
| 445 |
+
transaction who receives a copy of the work also receives whatever
|
| 446 |
+
licenses to the work the party's predecessor in interest had or could
|
| 447 |
+
give under the previous paragraph, plus a right to possession of the
|
| 448 |
+
Corresponding Source of the work from the predecessor in interest, if
|
| 449 |
+
the predecessor has it or can get it with reasonable efforts.
|
| 450 |
+
|
| 451 |
+
You may not impose any further restrictions on the exercise of the
|
| 452 |
+
rights granted or affirmed under this License. For example, you may
|
| 453 |
+
not impose a license fee, royalty, or other charge for exercise of
|
| 454 |
+
rights granted under this License, and you may not initiate litigation
|
| 455 |
+
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
| 456 |
+
any patent claim is infringed by making, using, selling, offering for
|
| 457 |
+
sale, or importing the Program or any portion of it.
|
| 458 |
+
|
| 459 |
+
11. Patents.
|
| 460 |
+
|
| 461 |
+
A "contributor" is a copyright holder who authorizes use under this
|
| 462 |
+
License of the Program or a work on which the Program is based. The
|
| 463 |
+
work thus licensed is called the contributor's "contributor version".
|
| 464 |
+
|
| 465 |
+
A contributor's "essential patent claims" are all patent claims
|
| 466 |
+
owned or controlled by the contributor, whether already acquired or
|
| 467 |
+
hereafter acquired, that would be infringed by some manner, permitted
|
| 468 |
+
by this License, of making, using, or selling its contributor version,
|
| 469 |
+
but do not include claims that would be infringed only as a
|
| 470 |
+
consequence of further modification of the contributor version. For
|
| 471 |
+
purposes of this definition, "control" includes the right to grant
|
| 472 |
+
patent sublicenses in a manner consistent with the requirements of
|
| 473 |
+
this License.
|
| 474 |
+
|
| 475 |
+
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
| 476 |
+
patent license under the contributor's essential patent claims, to
|
| 477 |
+
make, use, sell, offer for sale, import and otherwise run, modify and
|
| 478 |
+
propagate the contents of its contributor version.
|
| 479 |
+
|
| 480 |
+
In the following three paragraphs, a "patent license" is any express
|
| 481 |
+
agreement or commitment, however denominated, not to enforce a patent
|
| 482 |
+
(such as an express permission to practice a patent or covenant not to
|
| 483 |
+
sue for patent infringement). To "grant" such a patent license to a
|
| 484 |
+
party means to make such an agreement or commitment not to enforce a
|
| 485 |
+
patent against the party.
|
| 486 |
+
|
| 487 |
+
If you convey a covered work, knowingly relying on a patent license,
|
| 488 |
+
and the Corresponding Source of the work is not available for anyone
|
| 489 |
+
to copy, free of charge and under the terms of this License, through a
|
| 490 |
+
publicly available network server or other readily accessible means,
|
| 491 |
+
then you must either (1) cause the Corresponding Source to be so
|
| 492 |
+
available, or (2) arrange to deprive yourself of the benefit of the
|
| 493 |
+
patent license for this particular work, or (3) arrange, in a manner
|
| 494 |
+
consistent with the requirements of this License, to extend the patent
|
| 495 |
+
license to downstream recipients. "Knowingly relying" means you have
|
| 496 |
+
actual knowledge that, but for the patent license, your conveying the
|
| 497 |
+
covered work in a country, or your recipient's use of the covered work
|
| 498 |
+
in a country, would infringe one or more identifiable patents in that
|
| 499 |
+
country that you have reason to believe are valid.
|
| 500 |
+
|
| 501 |
+
If, pursuant to or in connection with a single transaction or
|
| 502 |
+
arrangement, you convey, or propagate by procuring conveyance of, a
|
| 503 |
+
covered work, and grant a patent license to some of the parties
|
| 504 |
+
receiving the covered work authorizing them to use, propagate, modify
|
| 505 |
+
or convey a specific copy of the covered work, then the patent license
|
| 506 |
+
you grant is automatically extended to all recipients of the covered
|
| 507 |
+
work and works based on it.
|
| 508 |
+
|
| 509 |
+
A patent license is "discriminatory" if it does not include within
|
| 510 |
+
the scope of its coverage, prohibits the exercise of, or is
|
| 511 |
+
conditioned on the non-exercise of one or more of the rights that are
|
| 512 |
+
specifically granted under this License. You may not convey a covered
|
| 513 |
+
work if you are a party to an arrangement with a third party that is
|
| 514 |
+
in the business of distributing software, under which you make payment
|
| 515 |
+
to the third party based on the extent of your activity of conveying
|
| 516 |
+
the work, and under which the third party grants, to any of the
|
| 517 |
+
parties who would receive the covered work from you, a discriminatory
|
| 518 |
+
patent license (a) in connection with copies of the covered work
|
| 519 |
+
conveyed by you (or copies made from those copies), or (b) primarily
|
| 520 |
+
for and in connection with specific products or compilations that
|
| 521 |
+
contain the covered work, unless you entered into that arrangement,
|
| 522 |
+
or that patent license was granted, prior to 28 March 2007.
|
| 523 |
+
|
| 524 |
+
Nothing in this License shall be construed as excluding or limiting
|
| 525 |
+
any implied license or other defenses to infringement that may
|
| 526 |
+
otherwise be available to you under applicable patent law.
|
| 527 |
+
|
| 528 |
+
12. No Surrender of Others' Freedom.
|
| 529 |
+
|
| 530 |
+
If conditions are imposed on you (whether by court order, agreement or
|
| 531 |
+
otherwise) that contradict the conditions of this License, they do not
|
| 532 |
+
excuse you from the conditions of this License. If you cannot convey a
|
| 533 |
+
covered work so as to satisfy simultaneously your obligations under this
|
| 534 |
+
License and any other pertinent obligations, then as a consequence you may
|
| 535 |
+
not convey it at all. For example, if you agree to terms that obligate you
|
| 536 |
+
to collect a royalty for further conveying from those to whom you convey
|
| 537 |
+
the Program, the only way you could satisfy both those terms and this
|
| 538 |
+
License would be to refrain entirely from conveying the Program.
|
| 539 |
+
|
| 540 |
+
13. Remote Network Interaction; Use with the GNU General Public License.
|
| 541 |
+
|
| 542 |
+
Notwithstanding any other provision of this License, if you modify the
|
| 543 |
+
Program, your modified version must prominently offer all users
|
| 544 |
+
interacting with it remotely through a computer network (if your version
|
| 545 |
+
supports such interaction) an opportunity to receive the Corresponding
|
| 546 |
+
Source of your version by providing access to the Corresponding Source
|
| 547 |
+
from a network server at no charge, through some standard or customary
|
| 548 |
+
means of facilitating copying of software. This Corresponding Source
|
| 549 |
+
shall include the Corresponding Source for any work covered by version 3
|
| 550 |
+
of the GNU General Public License that is incorporated pursuant to the
|
| 551 |
+
following paragraph.
|
| 552 |
+
|
| 553 |
+
Notwithstanding any other provision of this License, you have
|
| 554 |
+
permission to link or combine any covered work with a work licensed
|
| 555 |
+
under version 3 of the GNU General Public License into a single
|
| 556 |
+
combined work, and to convey the resulting work. The terms of this
|
| 557 |
+
License will continue to apply to the part which is the covered work,
|
| 558 |
+
but the work with which it is combined will remain governed by version
|
| 559 |
+
3 of the GNU General Public License.
|
| 560 |
+
|
| 561 |
+
14. Revised Versions of this License.
|
| 562 |
+
|
| 563 |
+
The Free Software Foundation may publish revised and/or new versions of
|
| 564 |
+
the GNU Affero General Public License from time to time. Such new versions
|
| 565 |
+
will be similar in spirit to the present version, but may differ in detail to
|
| 566 |
+
address new problems or concerns.
|
| 567 |
+
|
| 568 |
+
Each version is given a distinguishing version number. If the
|
| 569 |
+
Program specifies that a certain numbered version of the GNU Affero General
|
| 570 |
+
Public License "or any later version" applies to it, you have the
|
| 571 |
+
option of following the terms and conditions either of that numbered
|
| 572 |
+
version or of any later version published by the Free Software
|
| 573 |
+
Foundation. If the Program does not specify a version number of the
|
| 574 |
+
GNU Affero General Public License, you may choose any version ever published
|
| 575 |
+
by the Free Software Foundation.
|
| 576 |
+
|
| 577 |
+
If the Program specifies that a proxy can decide which future
|
| 578 |
+
versions of the GNU Affero General Public License can be used, that proxy's
|
| 579 |
+
public statement of acceptance of a version permanently authorizes you
|
| 580 |
+
to choose that version for the Program.
|
| 581 |
+
|
| 582 |
+
Later license versions may give you additional or different
|
| 583 |
+
permissions. However, no additional obligations are imposed on any
|
| 584 |
+
author or copyright holder as a result of your choosing to follow a
|
| 585 |
+
later version.
|
| 586 |
+
|
| 587 |
+
15. Disclaimer of Warranty.
|
| 588 |
+
|
| 589 |
+
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
| 590 |
+
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
| 591 |
+
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
| 592 |
+
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
| 593 |
+
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 594 |
+
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
| 595 |
+
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
| 596 |
+
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
| 597 |
+
|
| 598 |
+
16. Limitation of Liability.
|
| 599 |
+
|
| 600 |
+
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
| 601 |
+
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
| 602 |
+
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
| 603 |
+
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
| 604 |
+
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
| 605 |
+
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
| 606 |
+
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
| 607 |
+
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
| 608 |
+
SUCH DAMAGES.
|
| 609 |
+
|
| 610 |
+
17. Interpretation of Sections 15 and 16.
|
| 611 |
+
|
| 612 |
+
If the disclaimer of warranty and limitation of liability provided
|
| 613 |
+
above cannot be given local legal effect according to their terms,
|
| 614 |
+
reviewing courts shall apply local law that most closely approximates
|
| 615 |
+
an absolute waiver of all civil liability in connection with the
|
| 616 |
+
Program, unless a warranty or assumption of liability accompanies a
|
| 617 |
+
copy of the Program in return for a fee.
|
| 618 |
+
|
| 619 |
+
END OF TERMS AND CONDITIONS
|
| 620 |
+
|
| 621 |
+
How to Apply These Terms to Your New Programs
|
| 622 |
+
|
| 623 |
+
If you develop a new program, and you want it to be of the greatest
|
| 624 |
+
possible use to the public, the best way to achieve this is to make it
|
| 625 |
+
free software which everyone can redistribute and change under these terms.
|
| 626 |
+
|
| 627 |
+
To do so, attach the following notices to the program. It is safest
|
| 628 |
+
to attach them to the start of each source file to most effectively
|
| 629 |
+
state the exclusion of warranty; and each file should have at least
|
| 630 |
+
the "copyright" line and a pointer to where the full notice is found.
|
| 631 |
+
|
| 632 |
+
<one line to give the program's name and a brief idea of what it does.>
|
| 633 |
+
Copyright (C) <year> <name of author>
|
| 634 |
+
|
| 635 |
+
This program is free software: you can redistribute it and/or modify
|
| 636 |
+
it under the terms of the GNU Affero General Public License as published
|
| 637 |
+
by the Free Software Foundation, either version 3 of the License, or
|
| 638 |
+
(at your option) any later version.
|
| 639 |
+
|
| 640 |
+
This program is distributed in the hope that it will be useful,
|
| 641 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 642 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 643 |
+
GNU Affero General Public License for more details.
|
| 644 |
+
|
| 645 |
+
You should have received a copy of the GNU Affero General Public License
|
| 646 |
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
| 647 |
+
|
| 648 |
+
Also add information on how to contact you by electronic and paper mail.
|
| 649 |
+
|
| 650 |
+
If your software can interact with users remotely through a computer
|
| 651 |
+
network, you should also make sure that it provides a way for users to
|
| 652 |
+
get its source. For example, if your program is a web application, its
|
| 653 |
+
interface could display a "Source" link that leads users to an archive
|
| 654 |
+
of the code. There are many ways you could offer source, and different
|
| 655 |
+
solutions will be better for different programs; see section 13 for the
|
| 656 |
+
specific requirements.
|
| 657 |
+
|
| 658 |
+
You should also get your employer (if you work as a programmer) or school,
|
| 659 |
+
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
| 660 |
+
For more information on this, and how to apply and follow the GNU AGPL, see
|
| 661 |
+
<https://www.gnu.org/licenses/>.
|
README.md
ADDED
@@ -0,0 +1,97 @@
---
title: Review Screening Analyzer
emoji: 📚
colorFrom: indigo
colorTo: purple
sdk: gradio
sdk_version: "5.39.0"
app_file: app.py
pinned: true
---

<div align="center">
<hr>
<h1>Review Screening Analyzer</h1>
<b>A Simple Literature Filtering Tool</b>
</div>

---

> [!important]
> This project is under active development and marked as research in progress; do not use it without the authors' permission.

> [!important]
> This is demo code for the paper "Automated Literature Screening for Hepatocellular Carcinoma Treatment: Integrating Three Large Language Models" published in the Journal of Medical Internet Research Medical Informatics.

## Table of Contents

- [Introduction](#introduction)
- [Usage](#usage)
- [License](#license)
- [Contact Information](#contact-information)

---

## Introduction

Review Screening Analyzer is a literature screening tool that combines three large language models to decide the inclusion or exclusion of studies in systematic reviews based on PICOS criteria; the control flow is sketched below.

This is a demo project for demonstration purposes, not a production application. If you find any bugs, please report them via the Issues page.

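The screening logic in `analyzer.py` treats Model A and Model B as two reviewers and Model C as a tie-breaker consulted only when they disagree. The sketch below illustrates that flow; the `screen_a`, `screen_b`, and `arbitrate_c` functions are hypothetical stand-ins for the real model API calls.

```python
# Minimal sketch of the three-model flow in analyzer.py; the screen_* and
# arbitrate_c functions are placeholder stand-ins for the real model calls.

def screen_a(abstract: str) -> bool:
    return "NAFLD" in abstract  # placeholder decision rule

def screen_b(abstract: str) -> bool:
    return "cohort" in abstract  # placeholder decision rule

def arbitrate_c(abstract: str, a_decision: bool, b_decision: bool) -> bool:
    return a_decision  # placeholder: the real Model C re-reads the abstract

def screen_record(abstract: str) -> bool:
    a_decision = screen_a(abstract)  # Model A screens the abstract
    b_decision = screen_b(abstract)  # Model B screens it as a second reviewer
    if a_decision == b_decision:
        # Agreement: Model C is skipped and Model A's decision is kept,
        # mirroring _create_no_disagreement_result in analyzer.py.
        return a_decision
    # Disagreement: only these records are sent to Model C.
    return arbitrate_c(abstract, a_decision, b_decision)

print(screen_record("Retrospective cohort of NAFLD patients..."))
```

In the real pipeline, agreement cases reuse Model A's decision directly, so Model C is only called for the disagreement subset.
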
## File Structure
```
review-screening-analyzer/
│
├── analyzer.py
├── deduplicator.py
├── file_processor.py
├── model_manager.py
├── prompt_manager.py
├── result_processor.py
├── LICENSE
├── README.md
├── requirements.txt
└── app.py # Gradio entry point
```

## Usage

> [!warning]
> The following content is a temporary solution for local deployment.

Please ensure that [Python](https://www.python.org/) and [pip](https://pip.pypa.io/en/stable/) are installed on your system.

Create the environment variable file `.env` in the project root:
```
# API Keys
DEEPSEEK_API_KEY=
QWEN_API_KEY=
GPTGE_API_KEY=
```

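`app.py` loads these variables at startup via python-dotenv. A quick sanity check that the keys were picked up (key names taken from the template above):

```python
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory
for key in ("DEEPSEEK_API_KEY", "QWEN_API_KEY", "GPTGE_API_KEY"):
    print(key, "is set" if os.getenv(key) else "is missing")
```
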
Then run the following commands:
```bash
git clone https://github.com/chitsanfei/review-screening-analyzer.git
cd review-screening-analyzer
pip install -r requirements.txt
python3 app.py
```

## License

This project is licensed under the [GNU Affero General Public License v3.0](LICENSE) (AGPL-3.0).
You are free to use, modify, and distribute this software, provided that you keep it open source and license it under the same terms.
For more details, see the full [GNU AGPL v3.0 license text](https://www.gnu.org/licenses/agpl-3.0.html).

## Contact Information

If you have any questions or suggestions, please contact us through the following channels:

- Email: chitsanfei@emu.ac.cn
- GitHub: [chitsanfei](https://github.com/chitsanfei)

---

Thank you for using and supporting this project! 🌟

analyzer.py
ADDED
@@ -0,0 +1,511 @@
import pandas as pd
import logging
import json
from typing import Dict, List, Optional, Tuple
from model_manager import ModelManager
from prompt_manager import PromptManager
from result_processor import ResultProcessor
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed


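# PICOSAnalyzer coordinates the three-model screening pipeline: model_a
# screens each abstract against the PICOS criteria, model_b re-screens it
# with model_a's analysis attached, and model_c is consulted only where the
# two decisions disagree (see process_batch).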
class PICOSAnalyzer:
    def __init__(self):
        # Initialize managers for models, prompts, and result processing
        self.model_manager = ModelManager()
        self.prompt_manager = PromptManager()
        self.result_processor = ResultProcessor()
        # Example PICOS filtering criteria
        self.picos_criteria = {
            "population": "patients with non-alcoholic fatty liver disease (NAFLD)",
            "intervention": "observation or management of NAFLD",
            "comparison": "patients without NAFLD or general population",
            "outcome": "incidence of various types of extra-hepatic cancers, such as colorectal cancer, stomach cancer, breast cancer, etc.",
            "study_design": "retrospective cohort studies"
        }

    def update_picos_criteria(self, criteria: Dict[str, str]) -> None:
        """Update the PICOS criteria with a given dictionary of criteria."""
        self.picos_criteria.update(criteria)

    def update_model_config(self, model_key: str, config: Dict) -> None:
        """Update configuration settings for a specific model."""
        self.model_manager.update_model_config(model_key, config)

    def update_prompt(self, model_key: str, prompt: str) -> None:
        """Update the prompt template for a specific model."""
        self.prompt_manager.update_prompt(model_key, prompt)

    def test_api_connection(self, model_key: str) -> str:
        """Test the API connection for the specified model."""
        return self.model_manager.test_api_connection(model_key)

    def _validate_data(self, idx: str, row: pd.Series, model_key: str, previous_results: Dict) -> Tuple[bool, bool]:
        """
        Validate the completeness of a single data item.

        Returns:
            Tuple[bool, bool]: (is_valid, is_empty_abstract)
        """
        try:
            # Check if abstract exists and is not empty
            if not pd.notna(row.get("Abstract")):
                logging.warning(f"Empty abstract for index {idx}")
                return False, True  # Second value indicates empty abstract

            # For Model B and C, validate Model A results
            if model_key in ["model_b", "model_c"]:
                if not previous_results or "model_a" not in previous_results:
                    logging.warning(f"Missing Model A results for {model_key}")
                    return False, False
                if idx not in previous_results["model_a"].index:
                    logging.warning(f"Index {idx} not found in Model A results")
                    return False, False

            # For Model C, validate Model B results
            if model_key == "model_c":
                if "model_b" not in previous_results:
                    logging.warning("Missing Model B results")
                    return False, False
                if idx not in previous_results["model_b"].index:
                    logging.warning(f"Index {idx} not found in Model B results")
                    return False, False

            return True, False
        except Exception as e:
            logging.error(f"Validation error for index {idx}: {str(e)}")
            return False, False

    def _process_single_item(self, idx: str, row: pd.Series, model_key: str, previous_results: Dict) -> Optional[Dict]:
        """
        Process a single data item and prepare it for API call.
        """
        try:
            # Prepare base result with abstract
            result = {
                "Index": idx,
                "abstract": str(row["Abstract"]).strip()
            }

            # Add Model A results for Model B and C
            if model_key in ["model_b", "model_c"]:
                a_result = previous_results["model_a"].loc[idx]
                result["model_a_analysis"] = {
                    "A_Decision": bool(a_result["A_Decision"]),
                    "A_Reason": str(a_result["A_Reason"]),
                    "A_P": str(a_result["A_P"]),
                    "A_I": str(a_result["A_I"]),
                    "A_C": str(a_result["A_C"]),
                    "A_O": str(a_result["A_O"]),
                    "A_S": str(a_result["A_S"])
                }

            # Add Model B results for Model C
            if model_key == "model_c":
                b_result = previous_results["model_b"].loc[idx]
                result["model_b_analysis"] = {
                    "B_Decision": bool(b_result["B_Decision"]),
                    "B_Reason": str(b_result["B_Reason"]),
                    "B_P": str(b_result["B_P"]),
                    "B_I": str(b_result["B_I"]),
                    "B_C": str(b_result["B_C"]),
                    "B_O": str(b_result["B_O"]),
                    "B_S": str(b_result["B_S"])
                }

            return result
        except Exception as e:
            logging.error(f"Processing error for index {idx}: {str(e)}")
            return None

    def _process_api_response(self, response: Dict, model_key: str) -> List[Dict]:
        """
        Process API response and extract results.
        """
        try:
            if not response or not isinstance(response, dict):
                logging.error(f"Invalid response format from {model_key}")
                return []

            # Extract results from response
            if "results" not in response:
                # For inference mode, try to parse from content directly (model_c only)
                if model_key == "model_c" and self.model_manager.get_config(model_key).get("is_inference"):
                    try:
                        content = response.get("choices", [{}])[0].get("message", {}).get("content", "")
                        json_match = re.search(r"```json\s*(\{.*?\})\s*```", content, re.DOTALL)
                        if json_match:
                            content = json_match.group(1)
                        parsed_response = json.loads(content)
                        if "results" not in parsed_response:
                            logging.error(f"No results found in {model_key} inference response")
                            return []
                        response = parsed_response
                    except Exception as e:
                        logging.error(f"Failed to parse inference response from {model_key}: {str(e)}")
                        return []
                else:
                    logging.error(f"No results found in {model_key} response")
                    return []

            results = response["results"]
            if not isinstance(results, list):
                logging.error(f"Results from {model_key} is not a list")
                return []

            # Validate each result
            valid_results = []
            for result in results:
                if not isinstance(result, dict) or "Index" not in result:
                    logging.warning(f"Invalid result format in {model_key} response: {result}")
                    continue

                # Ensure all required fields are present based on model type
                if model_key == "model_a":
                    required_fields = ["A_P", "A_I", "A_C", "A_O", "A_S", "A_Decision", "A_Reason"]
                elif model_key == "model_b":
                    required_fields = ["B_P", "B_I", "B_C", "B_O", "B_S", "B_Decision", "B_Reason"]
                else:  # model_c
                    required_fields = ["C_Decision", "C_Reason"]

                missing_fields = [field for field in required_fields if field not in result]
                if missing_fields:
                    logging.warning(f"Missing fields {missing_fields} in {model_key} result for Index {result['Index']}")
                    continue

                # Convert decision to boolean if it's a string
                if model_key == "model_c" and isinstance(result.get("C_Decision"), str):
                    result["C_Decision"] = result["C_Decision"].lower() == "true"

                valid_results.append(result)

            return valid_results

        except Exception as e:
            logging.error(f"Error processing {model_key} response: {str(e)}")
            return []

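    # process_batch below builds one prompt per batch from the PICOS criteria
    # plus the batch's abstracts, and expects the model to answer with a
    # {"results": [...]} payload as parsed by _process_api_response above.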
    def process_batch(self, df: pd.DataFrame, model_key: str, previous_results: Dict = None, progress_callback=None) -> pd.DataFrame:
        """
        Screen a DataFrame of abstracts with the given model, validating inputs
        and dispatching batched API calls across a thread pool.
        """
        # Get model configuration
        config = self.model_manager.get_config(model_key)
        batch_size = config["batch_size"]
        threads = config["threads"]
        results_dict = {}  # Use dictionary to prevent duplicate indices
        failed_indices = set()
        total_rows = len(df)
        start_time = time.time()
        processed_count = 0
        skipped_count = 0

        # Ensure consistent index type
        df.index = df.index.astype(str)
        if previous_results:
            for key in previous_results:
                previous_results[key].index = previous_results[key].index.astype(str)

        # For Model C, first identify indices where A and B disagree
        if model_key == "model_c":
            disagreement_indices = []
            for idx in df.index:
                try:
                    if not self._validate_previous_results(idx, model_key, previous_results):
                        empty_result = self._create_empty_result(idx, model_key, "Invalid or missing previous results")
                        results_dict[str(idx)] = empty_result
                        failed_indices.add(str(idx))
                        if progress_callback:
                            progress_callback(idx, True, False)
                        continue

                    if self._check_disagreement(idx, previous_results):
                        disagreement_indices.append(idx)
                    else:
                        # If no disagreement, use Model A's decision
                        no_disagreement_result = self._create_no_disagreement_result(idx, previous_results)
                        results_dict[str(idx)] = no_disagreement_result
                        skipped_count += 1
                        if progress_callback:
                            progress_callback(idx, False, False)
                except Exception as e:
                    logging.error(f"Error checking disagreement for index {idx}: {str(e)}")
                    empty_result = self._create_empty_result(idx, model_key, f"Error: {str(e)}")
                    results_dict[str(idx)] = empty_result
                    failed_indices.add(str(idx))
                    if progress_callback:
                        progress_callback(idx, True, False)

            # Update df to only include disagreement cases for Model C
            if disagreement_indices:
                df = df.loc[disagreement_indices]
            else:
                # If no disagreements, return results with default values
                results = list(results_dict.values())
                results_df = pd.DataFrame(results)
                results_df.set_index("Index", inplace=True)
                results_df.index = results_df.index.astype(str)
                return results_df

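        # Inner worker executed per batch by the thread pool below; it closes
        # over results_dict, failed_indices and the progress counters above.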
        def process_batch_data(batch_df: pd.DataFrame) -> List[Dict]:
            nonlocal processed_count, skipped_count
            batch_results = []
            empty_results = []

            # Process each item in the batch
            for idx, row in batch_df.iterrows():
                try:
                    # Skip if already processed (for Model C)
                    if str(idx) in results_dict:
                        skipped_count += 1
                        continue

                    # Validate data completeness
                    is_valid, is_empty = self._validate_data(idx, row, model_key, previous_results)
                    if not is_valid:
                        empty_result = self._create_empty_result(idx, model_key, "Not processed - Empty abstract" if is_empty else "Not processed - Invalid data")
                        empty_results.append(empty_result)
                        failed_indices.add(idx)
                        if progress_callback:
                            progress_callback(idx, True, is_empty)
                        continue

                    # Prepare data for API call; coerce to str so non-string cells cannot break strip()
                    abstract_text = str(row.get("Abstract", "")).strip()
                    if not abstract_text:
                        empty_result = self._create_empty_result(idx, model_key, "Not processed - Empty abstract")
                        empty_results.append(empty_result)
                        failed_indices.add(idx)
                        if progress_callback:
                            progress_callback(idx, True, True)
                        continue

                    # Add to batch for processing
                    batch_item = self._process_single_item(idx, row, model_key, previous_results)
                    if batch_item:
                        batch_results.append(batch_item)
                    else:
                        empty_result = self._create_empty_result(idx, model_key, "Error preparing batch data")
                        empty_results.append(empty_result)
                        failed_indices.add(idx)
                        if progress_callback:
                            progress_callback(idx, True, False)

                except Exception as e:
                    logging.error(f"Error preparing data for index {idx}: {str(e)}")
                    empty_result = self._create_empty_result(idx, model_key, f"Error: {str(e)}")
                    empty_results.append(empty_result)
                    failed_indices.add(idx)
                    if progress_callback:
                        progress_callback(idx, True, False)

            # Process batch with API if there are valid entries
            if batch_results:
                try:
                    # Prepare prompt with PICOS criteria and batch data
                    prompt = self.prompt_manager.get_prompt(model_key).format(
                        **{
                            **self.picos_criteria,
                            "abstracts_json": json.dumps(batch_results, ensure_ascii=False, indent=2)
                        }
                    )

                    # Call API and process response
                    response = self.model_manager.call_api(model_key, prompt)
                    api_results = self._process_api_response(response, model_key)

                    # If API call failed or returned no results, create empty results for all items
                    if not api_results:
                        for item in batch_results:
                            empty_result = self._create_empty_result(item["Index"], model_key, "API call failed or returned no results")
                            empty_results.append(empty_result)
                            if progress_callback:
                                progress_callback(item["Index"], True, False)
                    else:
                        # Update progress for successfully processed items
                        for result in api_results:
                            if progress_callback:
                                progress_callback(result["Index"], False, False)
                            # Add result to the batch results
                            results_dict[str(result["Index"])] = result
                            processed_count += 1

                    # Calculate time statistics
                    elapsed_time = time.time() - start_time
                    if processed_count > 0:
                        avg_time_per_item = elapsed_time / processed_count
                        remaining_items = total_rows - (processed_count + len(failed_indices) + skipped_count)
                        estimated_remaining_time = avg_time_per_item * remaining_items

                        # Log detailed progress information
                        logging.info(
                            f"{model_key.upper()} Progress: "
                            f"Processed: {processed_count} - "
                            f"Remaining: {remaining_items} - "
                            f"Skipped: {skipped_count} - "
                            f"Elapsed Time: {elapsed_time:.1f}s - "
                            f"Est. Remaining: {estimated_remaining_time:.1f}s"
                        )

                    return api_results + empty_results

                except Exception as e:
                    error_msg = f"Error processing batch: {str(e)}"
                    logging.error(error_msg)
                    for item in batch_results:
                        empty_result = self._create_empty_result(item["Index"], model_key, error_msg)
                        empty_results.append(empty_result)
                        failed_indices.add(item["Index"])
                        if progress_callback:
                            progress_callback(item["Index"], True, False)

            return empty_results

        # Process batches using thread pool
        with ThreadPoolExecutor(max_workers=threads) as executor:
            futures = []
            for i in range(0, len(df), batch_size):
                batch_df = df.iloc[i:i + batch_size]
                futures.append(executor.submit(process_batch_data, batch_df))

            # Collect results
            for future in as_completed(futures):
                try:
                    batch_results = future.result()
                    # Store results in dictionary to handle potential duplicates
                    for result in batch_results:
                        idx = str(result["Index"])
                        results_dict[idx] = result
                except Exception as e:
                    error_msg = f"Error collecting batch results: {str(e)}"
                    logging.error(error_msg)

        # Convert results dictionary to DataFrame
        results = list(results_dict.values())
        results_df = pd.DataFrame(results)

        if not results_df.empty:
            # Set index properly
            results_df.set_index("Index", inplace=True)
            results_df.index = results_df.index.astype(str)

            # Ensure all required columns exist with default values
            for col in self._get_model_columns(model_key):
                if col not in results_df.columns:
                    if col.endswith("_Decision"):
                        results_df[col] = False
                    elif col.endswith("_Reason"):
                        results_df[col] = "Not provided"
                    else:
                        results_df[col] = "not applicable"

            # Convert boolean columns
            decision_columns = [col for col in results_df.columns if col.endswith("_Decision")]
            for col in decision_columns:
                results_df[col] = results_df[col].astype(bool)
        else:
            # Create empty DataFrame with required columns
            results_df = pd.DataFrame(columns=self._get_model_columns(model_key))
            results_df.index.name = "Index"

        # Log final statistics (guard against division by zero on an empty input)
        total_time = time.time() - start_time
        success_rate = ((total_rows - len(failed_indices)) / total_rows) * 100 if total_rows else 100.0
        logging.info(f"{model_key.upper()} completed in {total_time:.1f}s - "
                     f"Success rate: {success_rate:.1f}% ({total_rows - len(failed_indices)}/{total_rows})")

        return results_df

    def merge_results(self, df: pd.DataFrame, model_results: Dict) -> pd.DataFrame:
        """Merge results from all models into a single DataFrame."""
        return self.result_processor.merge_results(df, model_results)

    def _create_empty_result(self, idx: str, model_key: str, reason: Optional[str] = None) -> Dict:
        """
        Create a default empty result entry for cases where the abstract is empty
        or previous results are missing. The default reason is 'Not applicable' if not provided.
        """
        default_reason = reason if reason is not None else "Not applicable - Empty or invalid data"
        result = {"Index": str(idx)}
        if model_key == "model_a":
            result.update({
                "A_P": "not applicable",
                "A_I": "not applicable",
                "A_C": "not applicable",
                "A_O": "not applicable",
                "A_S": "not applicable",
                "A_Decision": False,
                "A_Reason": default_reason
            })
        elif model_key == "model_b":
            result.update({
                "B_P": "not applicable",
                "B_I": "not applicable",
                "B_C": "not applicable",
                "B_O": "not applicable",
                "B_S": "not applicable",
                "B_Decision": False,
                "B_Reason": default_reason
            })
        else:  # For model_c
            result.update({
                "C_Decision": False,
                "C_Reason": default_reason
            })
        return result

    def _create_no_disagreement_result(self, idx: str, previous_results: Dict) -> Dict:
        """
        When Model A and Model B agree on the decision,
        directly return Model A's result with a note indicating no disagreement.
        """
        str_idx = str(idx)
        a_result = previous_results["model_a"].loc[str_idx]
        return {
            "Index": str_idx,
            "C_Decision": a_result["A_Decision"],
            "C_Reason": "No disagreement between Model A and B"
        }

    def _validate_previous_results(self, idx: str, model_key: str, previous_results: Dict) -> bool:
        """
        Validate that previous model results exist for a given index.
        Raises if a required result set is absent entirely; returns False
        if the index is missing from an available result set.
        """
        str_idx = str(idx)
        if "model_a" not in previous_results:
            raise Exception("Model A results required")
        model_a_data = previous_results["model_a"]
        if str_idx not in model_a_data.index.astype(str).values:
            logging.warning(f"Missing Model A result for index {idx}")
            return False

        if model_key == "model_c":
            if "model_b" not in previous_results:
                raise Exception("Model B results required")
            model_b_data = previous_results["model_b"]
            if str_idx not in model_b_data.index.astype(str).values:
                logging.warning(f"Missing Model B result for index {idx}")
                return False

        return True

    def _check_disagreement(self, idx: str, previous_results: Dict) -> bool:
        """
        Check whether there is a disagreement between Model A and Model B for a given index.
        Returns True if the decisions differ, otherwise False.
        """
        str_idx = str(idx)
        a_result = previous_results["model_a"].loc[str_idx]
        b_result = previous_results["model_b"].loc[str_idx]
        return a_result["A_Decision"] != b_result["B_Decision"]

    def _get_model_columns(self, model_key: str) -> List[str]:
        """Get the expected columns for a specific model's output."""
        if model_key == "model_a":
            return ["A_Decision", "A_Reason", "A_P", "A_I", "A_C", "A_O", "A_S"]
        elif model_key == "model_b":
            return ["B_Decision", "B_Reason", "B_P", "B_I", "B_C", "B_O", "B_S"]
        else:  # model_c
            return ["C_Decision", "C_Reason"]
app.py
ADDED
|
@@ -0,0 +1,724 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
from dotenv import load_dotenv
import time
import logging
from datetime import datetime
import gradio as gr
from file_processor import FileProcessor
from analyzer import PICOSAnalyzer
from deduplicator import Deduplicator
from result_processor import ResultProcessor

# Configuration of directories
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")
LOG_DIR = os.path.join(BASE_DIR, "logs")

# Load .env file if it exists
dotenv_path = os.path.join(os.path.dirname(__file__), '.env')
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path)
else:
    print("Warning: .env file not found.")

# Initialize components for analysis, file processing, deduplication, and result processing
analyzer = PICOSAnalyzer()
file_processor = FileProcessor(DATA_DIR)
model_results = {}
deduplicator = Deduplicator()
result_processor = ResultProcessor()

# Ensure required directories exist
for directory in [DATA_DIR, LOG_DIR]:
    try:
        os.makedirs(directory, exist_ok=True)
    except Exception as e:
        raise RuntimeError(f"Failed to create directory {directory}: {str(e)}")

# Configure logging: log to both a file and the console
try:
    log_file = os.path.join(LOG_DIR, f"picos_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")

    # File handler for logging to a file
    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setLevel(logging.INFO)

    # Console handler for logging to the terminal
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)

    # Formatter for log messages
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    # Configure the root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    root_logger.addHandler(file_handler)
    root_logger.addHandler(console_handler)
except Exception as e:
    print(f"Failed to initialize logging: {str(e)}")
    raise

def create_gradio_interface():
    """Create and return the Gradio interface for the PICOS Analysis System."""

    def parse_nbib(file) -> tuple:
        """
        Parse a citation file in NBIB or RIS format.
        Returns a tuple containing the Excel output path and a preview text.
        """
        try:
            if not file:
                return None, "No file uploaded"

            # Determine file type based on extension
            file_extension = os.path.splitext(file.name)[1].lower()

            if file_extension == '.nbib':
                output_path, preview = file_processor.parse_nbib(file.name)
            elif file_extension == '.ris':
                # Read file content to determine RIS format (Embase or Web of Science)
                with open(file.name, 'r', encoding='utf-8') as f:
                    content = f.read()
                if 'T1  - ' in content:  # Embase RIS format
                    output_path, preview = file_processor.parse_embase_ris(file.name)
                else:  # Assume Web of Science RIS format
                    output_path, preview = file_processor.parse_wos_ris(file.name)
            else:
                return None, "Unsupported file format. Please upload a .nbib or .ris file"

            if not output_path:
                return None, "Failed to parse file"

            return output_path, preview

        except Exception as e:
            error_msg = f"Error parsing file: {str(e)}"
            logging.error(error_msg)
            return None, error_msg
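The dispatcher above routes on file extension and, for .ris uploads, sniffs the RIS flavor by looking for the Embase-style "T1  - " title tag. A minimal standalone sketch of the same routing rule, handy for testing outside the UI; the helper name detect_citation_format is ours, not part of the app:

import os

def detect_citation_format(path: str) -> str:
    """Mirror parse_nbib's routing rule (illustrative helper, not in the app)."""
    ext = os.path.splitext(path)[1].lower()
    if ext == '.nbib':
        return 'pubmed-nbib'
    if ext == '.ris':
        with open(path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Embase RIS exports title lines as "T1  - "; Web of Science uses "TI  - "
        return 'embase-ris' if 'T1  - ' in content else 'wos-ris'
    raise ValueError(f"Unsupported citation file: {path}")

# Example: detect_citation_format('export.ris') -> 'embase-ris' or 'wos-ris'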
+
def parse_scopus(file) -> tuple:
|
| 104 |
+
"""
|
| 105 |
+
Parse a Scopus RIS file.
|
| 106 |
+
Returns a tuple containing the Excel output path and a preview text.
|
| 107 |
+
"""
|
| 108 |
+
try:
|
| 109 |
+
if not file:
|
| 110 |
+
return None, "No file uploaded"
|
| 111 |
+
output_path, preview = file_processor.parse_scopus_ris(file.name)
|
| 112 |
+
if not output_path:
|
| 113 |
+
return None, "Failed to parse file"
|
| 114 |
+
return output_path, preview
|
| 115 |
+
except Exception as e:
|
| 116 |
+
error_msg = f"Error parsing Scopus file: {str(e)}"
|
| 117 |
+
logging.error(error_msg)
|
| 118 |
+
return None, error_msg
|
| 119 |
+
|
| 120 |
+
def update_picos_criteria(p, i, c, o, s):
|
| 121 |
+
"""Update the PICOS criteria used for analysis."""
|
| 122 |
+
try:
|
| 123 |
+
analyzer.update_picos_criteria({
|
| 124 |
+
"population": p.strip(),
|
| 125 |
+
"intervention": i.strip(),
|
| 126 |
+
"comparison": c.strip(),
|
| 127 |
+
"outcome": o.strip(),
|
| 128 |
+
"study_design": s.strip()
|
| 129 |
+
})
|
| 130 |
+
return "✓ PICOS criteria updated successfully"
|
| 131 |
+
except Exception as e:
|
| 132 |
+
return f"❌ Error updating PICOS criteria: {str(e)}"
|
| 133 |
+
|
| 134 |
+
def update_model_settings(model_key, api_url, api_key, model_name, temperature, max_tokens, batch_size, threads, prompt, is_inference, timeout):
|
| 135 |
+
"""Update the settings for a specified model."""
|
| 136 |
+
try:
|
| 137 |
+
analyzer.update_model_config(model_key, {
|
| 138 |
+
"api_url": api_url.strip(),
|
| 139 |
+
"api_key": api_key.strip(),
|
| 140 |
+
"model": model_name.strip(),
|
| 141 |
+
"temperature": float(temperature),
|
| 142 |
+
"max_tokens": int(max_tokens),
|
| 143 |
+
"batch_size": int(batch_size),
|
| 144 |
+
"threads": int(threads),
|
| 145 |
+
"is_inference": bool(is_inference),
|
| 146 |
+
"timeout": float(timeout),
|
| 147 |
+
"updated": True # mark as manually updated
|
| 148 |
+
})
|
| 149 |
+
analyzer.update_prompt(model_key, prompt.strip())
|
| 150 |
+
return "✓ Settings updated successfully"
|
| 151 |
+
except Exception as e:
|
| 152 |
+
return f"❌ Error updating settings: {str(e)}"
|
| 153 |
+
|
| 154 |
+
def test_connection(model_key):
|
| 155 |
+
"""Test the API connection for a specified model."""
|
| 156 |
+
try:
|
| 157 |
+
result = analyzer.test_api_connection(model_key)
|
| 158 |
+
return result
|
| 159 |
+
except Exception as e:
|
| 160 |
+
return f"❌ Error testing connection: {str(e)}"
|
| 161 |
+
|
| 162 |
+
def process_model(input_file, model_key, model_a_input=None, model_b_input=None):
|
| 163 |
+
"""
|
| 164 |
+
Process analysis for a single model and return the results.
|
| 165 |
+
For Model B and C, the required previous results files must be provided.
|
| 166 |
+
"""
|
| 167 |
+
try:
|
| 168 |
+
logging.info(f"Loading input file for {model_key.upper()}...")
|
| 169 |
+
df = file_processor.load_excel(input_file.name)
|
| 170 |
+
if df is None:
|
| 171 |
+
return None, "Failed to load Excel file"
|
| 172 |
+
|
| 173 |
+
# For Model B, require Model A results; for Model C, require both Model A and B results
|
| 174 |
+
if model_key == "model_b":
|
| 175 |
+
if model_a_input is None or not os.path.exists(model_a_input.name):
|
| 176 |
+
return None, "Model A results file required for MODEL_B"
|
| 177 |
+
model_results["model_a"] = file_processor.load_excel(model_a_input.name)
|
| 178 |
+
elif model_key == "model_c":
|
| 179 |
+
logging.info("Loading Model A and B results for Model C analysis...")
|
| 180 |
+
if model_a_input is None or not os.path.exists(model_a_input.name) or \
|
| 181 |
+
model_b_input is None or not os.path.exists(model_b_input.name):
|
| 182 |
+
return None, "Both Model A and B results files required for MODEL_C"
|
| 183 |
+
model_results["model_a"] = file_processor.load_excel(model_a_input.name)
|
| 184 |
+
model_results["model_b"] = file_processor.load_excel(model_b_input.name)
|
| 185 |
+
|
| 186 |
+
# Process the model
|
| 187 |
+
logging.info(f"Starting {model_key.upper()} analysis...")
|
| 188 |
+
total_rows = len(df)
|
| 189 |
+
processed_rows = 0
|
| 190 |
+
errors = 0
|
| 191 |
+
empty_abstracts = 0
|
| 192 |
+
start_time = time.time()
|
| 193 |
+
|
| 194 |
+
def progress_callback(row_index, error=False, is_empty=False):
|
| 195 |
+
nonlocal processed_rows, errors, empty_abstracts
|
| 196 |
+
# Increase the count only when the actual processing is complete
|
| 197 |
+
if not error:
|
| 198 |
+
processed_rows += 1
|
| 199 |
+
elif is_empty:
|
| 200 |
+
empty_abstracts += 1
|
| 201 |
+
else:
|
| 202 |
+
errors += 1
|
| 203 |
+
|
| 204 |
+
# Calculate progress and time estimates
|
| 205 |
+
elapsed_time = time.time() - start_time
|
| 206 |
+
progress = processed_rows / total_rows
|
| 207 |
+
if progress > 0:
|
| 208 |
+
# Use moving averages to smooth time estimates
|
| 209 |
+
avg_time_per_item = elapsed_time / (processed_rows + errors + empty_abstracts)
|
| 210 |
+
remaining_items = total_rows - (processed_rows + errors + empty_abstracts)
|
| 211 |
+
remaining_time = avg_time_per_item * remaining_items
|
| 212 |
+
|
| 213 |
+
# Use the batch size of the model to control the log output frequency
|
| 214 |
+
batch_size = analyzer.model_manager.get_config(model_key)["batch_size"]
|
| 215 |
+
if (processed_rows + errors + empty_abstracts) % batch_size == 0:
|
| 216 |
+
logging.info(f"{model_key.upper()} Progress: {processed_rows + errors + empty_abstracts}/{total_rows} rows "
|
| 217 |
+
f"({(processed_rows + errors + empty_abstracts) / total_rows:.1%}) - "
|
| 218 |
+
f"Processed: {processed_rows}, Errors: {errors}, Empty: {empty_abstracts} - "
|
| 219 |
+
f"Elapsed: {elapsed_time:.1f}s, Remaining: {remaining_time:.1f}s")
|
| 220 |
+
|
| 221 |
+
results_df = analyzer.process_batch(df, model_key, model_results, progress_callback)
|
| 222 |
+
|
| 223 |
+
if results_df is None:
|
| 224 |
+
return None, f"{model_key.upper()} failed to process results"
|
| 225 |
+
|
| 226 |
+
# Save results immediately with fixed path in DATA_DIR
|
| 227 |
+
output_file = os.path.join(DATA_DIR, f"{model_key}_results.xlsx")
|
| 228 |
+
if model_key == "model_c":
|
| 229 |
+
# For Model C, merge all results before saving
|
| 230 |
+
merged_df = analyzer.merge_results(df, {
|
| 231 |
+
"model_a": model_results["model_a"],
|
| 232 |
+
"model_b": model_results["model_b"],
|
| 233 |
+
"model_c": results_df
|
| 234 |
+
})
|
| 235 |
+
if not file_processor.save_excel(merged_df, output_file):
|
| 236 |
+
return None, f"Failed to save {model_key.upper()} results"
|
| 237 |
+
else:
|
| 238 |
+
# For Model A and B, save individual results
|
| 239 |
+
if not file_processor.save_excel(results_df, output_file):
|
| 240 |
+
return None, f"Failed to save {model_key.upper()} results"
|
| 241 |
+
|
| 242 |
+
total_time = time.time() - start_time
|
| 243 |
+
completion_msg = (f"{model_key.upper()} analysis completed in {total_time:.1f}s - "
|
| 244 |
+
f"Processed {processed_rows} rows with {errors} errors")
|
| 245 |
+
logging.info(completion_msg)
|
| 246 |
+
|
| 247 |
+
# Return the full path to the saved file with gr.update
|
| 248 |
+
if os.path.exists(output_file):
|
| 249 |
+
return gr.update(value=output_file), completion_msg
|
| 250 |
+
else:
|
| 251 |
+
return None, f"Failed to verify {model_key.upper()} results file"
|
| 252 |
+
|
| 253 |
+
except Exception as e:
|
| 254 |
+
error_msg = f"Error in {model_key.upper()} analysis: {str(e)}"
|
| 255 |
+
logging.error(error_msg)
|
| 256 |
+
return None, error_msg
|
| 257 |
+
|
| 258 |
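The progress callback above estimates time remaining with a plain running average: remaining ≈ (elapsed / items finished) × items left, logged once per batch. A self-contained sketch of just that arithmetic; EtaTracker is a hypothetical helper, not part of the app:

import time

class EtaTracker:
    """Hypothetical helper mirroring the ETA arithmetic in progress_callback."""
    def __init__(self, total: int):
        self.total = total
        self.done = 0
        self.start = time.time()

    def step(self) -> float:
        """Record one finished item and return the estimated seconds remaining."""
        self.done += 1
        elapsed = time.time() - self.start
        avg_per_item = elapsed / self.done          # running average so far
        return avg_per_item * (self.total - self.done)

# Example: after 50 of 200 rows in 25s, step() reports roughly 75s remaining.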
+
def merge_results_with_files(input_file, model_a_file, model_b_file, model_c_file):
|
| 259 |
+
"""
|
| 260 |
+
Merge all model results from the provided files and export the merged results as an Excel file.
|
| 261 |
+
"""
|
| 262 |
+
if not all([input_file, model_a_file, model_b_file]):
|
| 263 |
+
return None, "Original file, Model A and B results are required"
|
| 264 |
+
|
| 265 |
+
try:
|
| 266 |
+
df = file_processor.load_excel(input_file.name)
|
| 267 |
+
model_a_results = file_processor.load_excel(model_a_file.name)
|
| 268 |
+
model_b_results = file_processor.load_excel(model_b_file.name)
|
| 269 |
+
model_c_results = file_processor.load_excel(model_c_file.name) if model_c_file else None
|
| 270 |
+
|
| 271 |
+
if any(result is None for result in [df, model_a_results, model_b_results]):
|
| 272 |
+
return None, "Failed to load one or more required files"
|
| 273 |
+
|
| 274 |
+
model_results["model_a"] = model_a_results
|
| 275 |
+
model_results["model_b"] = model_b_results
|
| 276 |
+
if model_c_results is not None:
|
| 277 |
+
model_results["model_c"] = model_c_results
|
| 278 |
+
|
| 279 |
+
merged_df = analyzer.merge_results(df, model_results)
|
| 280 |
+
|
| 281 |
+
final_filename = os.path.join(DATA_DIR, "final_results.xlsx")
|
| 282 |
+
result_processor.export_to_excel(merged_df, final_filename)
|
| 283 |
+
|
| 284 |
+
return final_filename, "Results merged successfully"
|
| 285 |
+
except Exception as e:
|
| 286 |
+
return None, f"Error merging results: {str(e)}"
|
| 287 |
+
|
| 288 |
+
def run_all_models(input_file):
|
| 289 |
+
"""Run analysis pipeline for all models with streaming updates"""
|
| 290 |
+
try:
|
| 291 |
+
# Read Excel file using file processor
|
| 292 |
+
df = file_processor.load_excel(input_file.name)
|
| 293 |
+
if df is None:
|
| 294 |
+
yield [None, None, None, None, "Failed to load input file"]
|
| 295 |
+
return
|
| 296 |
+
|
| 297 |
+
# --- Process Model A ---
|
| 298 |
+
logging.info("Starting Model A analysis...")
|
| 299 |
+
model_a_results = analyzer.process_batch(df, "model_a")
|
| 300 |
+
if model_a_results is None:
|
| 301 |
+
yield [None, None, None, None, "Model A failed to process results"]
|
| 302 |
+
return
|
| 303 |
+
|
| 304 |
+
# Save Model A results with fixed path
|
| 305 |
+
model_a_path = os.path.join(DATA_DIR, "model_a_results.xlsx")
|
| 306 |
+
if not file_processor.save_excel(model_a_results, model_a_path):
|
| 307 |
+
yield [None, None, None, None, "Failed to save Model A results"]
|
| 308 |
+
return
|
| 309 |
+
model_results["model_a"] = model_a_results
|
| 310 |
+
status_msg = "Model A completed successfully"
|
| 311 |
+
# Yield update: Model A result available
|
| 312 |
+
yield [gr.update(value=model_a_path), None, None, None, status_msg]
|
| 313 |
+
|
| 314 |
+
# --- Process Model B ---
|
| 315 |
+
logging.info("Starting Model B analysis...")
|
| 316 |
+
model_b_results = analyzer.process_batch(df, "model_b", {"model_a": model_a_results})
|
| 317 |
+
if model_b_results is None:
|
| 318 |
+
yield [gr.update(value=model_a_path), None, None, None, "Model B failed to process results"]
|
| 319 |
+
return
|
| 320 |
+
|
| 321 |
+
# Save Model B results with fixed path
|
| 322 |
+
model_b_path = os.path.join(DATA_DIR, "model_b_results.xlsx")
|
| 323 |
+
if not file_processor.save_excel(model_b_results, model_b_path):
|
| 324 |
+
yield [gr.update(value=model_a_path), None, None, None, "Failed to save Model B results"]
|
| 325 |
+
return
|
| 326 |
+
model_results["model_b"] = model_b_results
|
| 327 |
+
status_msg = "Model B completed successfully"
|
| 328 |
+
# Yield update: Both Model A and B results available
|
| 329 |
+
yield [gr.update(value=model_a_path), gr.update(value=model_b_path), None, None, status_msg]
|
| 330 |
+
|
| 331 |
+
# --- Process Model C ---
|
| 332 |
+
logging.info("Starting Model C analysis...")
|
| 333 |
+
model_c_results = analyzer.process_batch(df, "model_c", {
|
| 334 |
+
"model_a": model_a_results,
|
| 335 |
+
"model_b": model_b_results
|
| 336 |
+
})
|
| 337 |
+
|
| 338 |
+
model_c_path = None
|
| 339 |
+
if model_c_results is not None:
|
| 340 |
+
# Save Model C results with fixed path
|
| 341 |
+
model_c_path = os.path.join(DATA_DIR, "model_c_results.xlsx")
|
| 342 |
+
if not file_processor.save_excel(model_c_results, model_c_path):
|
| 343 |
+
yield [gr.update(value=model_a_path), gr.update(value=model_b_path), None, None, "Failed to save Model C results"]
|
| 344 |
+
return
|
| 345 |
+
model_results["model_c"] = model_c_results
|
| 346 |
+
status_msg = "Model C completed successfully"
|
| 347 |
+
# Yield update: Model A, B and C results available
|
| 348 |
+
yield [gr.update(value=model_a_path), gr.update(value=model_b_path), gr.update(value=model_c_path), None, status_msg]
|
| 349 |
+
|
| 350 |
+
# Merge results
|
| 351 |
+
logging.info("Merging results...")
|
| 352 |
+
merged_df = analyzer.merge_results(df, model_results)
|
| 353 |
+
|
| 354 |
+
# Save final results with fixed path
|
| 355 |
+
final_path = os.path.join(DATA_DIR, "final_results.xlsx")
|
| 356 |
+
if not file_processor.save_excel(merged_df, final_path):
|
| 357 |
+
yield [gr.update(value=model_a_path), gr.update(value=model_b_path), gr.update(value=model_c_path), None, "Failed to save final results"]
|
| 358 |
+
return
|
| 359 |
+
|
| 360 |
+
completion_msg = "All models completed successfully"
|
| 361 |
+
# Yield final update with all results available
|
| 362 |
+
yield [gr.update(value=model_a_path), gr.update(value=model_b_path), gr.update(value=model_c_path), gr.update(value=final_path), completion_msg]
|
| 363 |
+
|
| 364 |
+
except Exception as e:
|
| 365 |
+
error_msg = f"Error in pipeline: {str(e)}"
|
| 366 |
+
logging.error(error_msg)
|
| 367 |
+
yield [None, None, None, None, error_msg]
|
| 368 |
+
|
| 369 |
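run_all_models is a generator, and Gradio treats a generator event handler as a stream: each yield overwrites the components bound to outputs, which is how the Model A/B/C download links appear one by one. A minimal sketch of the pattern under that assumption; the component names and staged_pipeline function are illustrative, not part of the app:

import time
import gradio as gr

def staged_pipeline(name):
    # Each yield streams an intermediate update to (result, status)
    yield None, f"Stage 1 running for {name}..."
    time.sleep(1)
    yield None, "Stage 2 running..."
    time.sleep(1)
    yield f"done: {name}", "All stages completed"

with gr.Blocks() as demo:
    name = gr.Textbox(label="Name")
    result = gr.Textbox(label="Result")
    status = gr.Textbox(label="Status")
    gr.Button("Run").click(staged_pipeline, inputs=name, outputs=[result, status])

# demo.launch()  # uncomment to serve; intermediate yields appear live in the UI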
+
def process_deduplication(files, threshold):
|
| 370 |
+
"""
|
| 371 |
+
Process deduplication for multiple Excel files.
|
| 372 |
+
The function identifies duplicate entries based on a similarity threshold.
|
| 373 |
+
"""
|
| 374 |
+
try:
|
| 375 |
+
if not files:
|
| 376 |
+
return None, None, "No files uploaded"
|
| 377 |
+
|
| 378 |
+
dataframes = []
|
| 379 |
+
for file in files:
|
| 380 |
+
if not file:
|
| 381 |
+
continue
|
| 382 |
+
df = file_processor.load_excel(file.name)
|
| 383 |
+
if df is None:
|
| 384 |
+
return None, None, f"Failed to load file: {file.name}"
|
| 385 |
+
dataframes.append(df)
|
| 386 |
+
|
| 387 |
+
if not dataframes:
|
| 388 |
+
return None, None, "No valid files to process"
|
| 389 |
+
|
| 390 |
+
unique_df, clusters_df = deduplicator.process_dataframes(dataframes, threshold)
|
| 391 |
+
|
| 392 |
+
unique_path = file_processor.save_excel(unique_df, "deduplicated_data.xlsx")
|
| 393 |
+
clusters_path = file_processor.save_excel(clusters_df, "duplicate_clusters.xlsx")
|
| 394 |
+
|
| 395 |
+
if not unique_path or not clusters_path:
|
| 396 |
+
return None, None, "Failed to save results"
|
| 397 |
+
|
| 398 |
+
status_msg = f"Deduplication completed successfully:\n"
|
| 399 |
+
status_msg += f"Original entries: {sum(len(df) for df in dataframes)}\n"
|
| 400 |
+
status_msg += f"Unique entries: {len(unique_df)}\n"
|
| 401 |
+
status_msg += f"Duplicate clusters: {len(clusters_df['Cluster_ID'].unique()) if len(clusters_df) > 0 else 0}"
|
| 402 |
+
|
| 403 |
+
return unique_path, clusters_path, status_msg
|
| 404 |
+
|
| 405 |
+
except Exception as e:
|
| 406 |
+
error_msg = f"Error in deduplication: {str(e)}"
|
| 407 |
+
logging.error(error_msg)
|
| 408 |
+
return None, None, error_msg
|
| 409 |
+
|
| 410 |
+
# Build the Gradio interface
|
| 411 |
+
interface = gr.Blocks(title="PICOS Analysis System")
|
| 412 |
+
|
| 413 |
+
with interface:
|
| 414 |
+
gr.Markdown("""
|
| 415 |
+
<div style="text-align: center;">
|
| 416 |
+
<h1>PICOS Literature Analysis System</h1>
|
| 417 |
+
<p>This system uses a multi-model approach to analyze medical literature abstracts.</p>
|
| 418 |
+
</div>
|
| 419 |
+
""")
|
| 420 |
+
|
| 421 |
+
with gr.Tab("Instructions"):
|
| 422 |
+
gr.Markdown("""
|
| 423 |
+
## System Overview
|
| 424 |
+
This system helps researchers analyze medical literature by providing tools for citation management,
|
| 425 |
+
deduplication, and automated PICOS analysis using multiple language models.
|
| 426 |
+
|
| 427 |
+
## Workflow Steps
|
| 428 |
+
**Citation Processing** -> **Deduplication** (Optional) -> **PICOS Analysis Setup** -> **Analysis Execution**
|
| 429 |
+
|
| 430 |
+
## File Format Requirements
|
| 431 |
+
### Input Files
|
| 432 |
+
- **Pubmed**: NBIB format (.nbib)
|
| 433 |
+
- **Embase**: RIS format (.ris)
|
| 434 |
+
- **Web of Science**: RIS format (.ris)
|
| 435 |
+
- **Scopus**: RIS format (.ris)
|
| 436 |
+
|
| 437 |
+
### Processed Format
|
| 438 |
+
The system will generate standardized Excel files (XLSX format) with these columns:
|
| 439 |
+
- **Index**: Unique identifier for each abstract
|
| 440 |
+
- **Title**: Article title
|
| 441 |
+
- **Authors**: Author list (semicolon-separated)
|
| 442 |
+
- **Abstract**: Full abstract text
|
| 443 |
+
- **DOI**: Digital Object Identifier (when available)
|
| 444 |
+
|
| 445 |
+
### Analysis Results
|
| 446 |
+
Each model will generate an Excel file containing:
|
| 447 |
+
- All original citation data
|
| 448 |
+
- PICOS analysis results
|
| 449 |
+
- Inclusion/exclusion decisions
|
| 450 |
+
- Reasoning for decisions
|
| 451 |
+
""")
|
| 452 |
+
|
| 453 |
+
with gr.Tab("Citation File Processing"):
|
| 454 |
+
with gr.Tab("Pubmed"):
|
| 455 |
+
gr.Markdown("""
|
| 456 |
+
## Pubmed NBIB Processing
|
| 457 |
+
Upload a .nbib file from Pubmed to extract and convert it to Excel format. The extracted data will include:
|
| 458 |
+
- DOI
|
| 459 |
+
- Title
|
| 460 |
+
- Authors
|
| 461 |
+
- Abstract
|
| 462 |
+
""")
|
| 463 |
+
|
| 464 |
+
with gr.Row():
|
| 465 |
+
nbib_file = gr.File(label="Upload NBIB File", file_types=[".nbib"])
|
| 466 |
+
process_nbib_btn = gr.Button("Process NBIB File")
|
| 467 |
+
|
| 468 |
+
with gr.Row():
|
| 469 |
+
nbib_preview = gr.Textbox(label="Preview", lines=20)
|
| 470 |
+
nbib_output = gr.File(label="Download Excel")
|
| 471 |
+
|
| 472 |
+
process_nbib_btn.click(
|
| 473 |
+
parse_nbib,
|
| 474 |
+
inputs=[nbib_file],
|
| 475 |
+
outputs=[nbib_output, nbib_preview]
|
| 476 |
+
)
|
| 477 |
+
|
| 478 |
+
with gr.Tab("Embase"):
|
| 479 |
+
gr.Markdown("""
|
| 480 |
+
## Embase RIS Processing
|
| 481 |
+
Upload a .ris file from Embase to extract and convert it to Excel format. The extracted data will include:
|
| 482 |
+
- DOI
|
| 483 |
+
- Title
|
| 484 |
+
- Authors
|
| 485 |
+
- Abstract
|
| 486 |
+
""")
|
| 487 |
+
|
| 488 |
+
with gr.Row():
|
| 489 |
+
embase_file = gr.File(label="Upload Embase RIS File", file_types=[".ris"])
|
| 490 |
+
process_embase_btn = gr.Button("Process Embase RIS File")
|
| 491 |
+
|
| 492 |
+
with gr.Row():
|
| 493 |
+
embase_preview = gr.Textbox(label="Preview", lines=20)
|
| 494 |
+
embase_output = gr.File(label="Download Excel")
|
| 495 |
+
|
| 496 |
+
process_embase_btn.click(
|
| 497 |
+
parse_nbib,
|
| 498 |
+
inputs=[embase_file],
|
| 499 |
+
outputs=[embase_output, embase_preview]
|
| 500 |
+
)
|
| 501 |
+
|
| 502 |
+
with gr.Tab("Web of Science"):
|
| 503 |
+
gr.Markdown("""
|
| 504 |
+
## Web of Science RIS Processing
|
| 505 |
+
Upload a .ris file from Web of Science to extract and convert it to Excel format. The extracted data will include:
|
| 506 |
+
- DOI
|
| 507 |
+
- Title
|
| 508 |
+
- Authors
|
| 509 |
+
- Abstract
|
| 510 |
+
""")
|
| 511 |
+
|
| 512 |
+
with gr.Row():
|
| 513 |
+
wos_file = gr.File(label="Upload WOS RIS File", file_types=[".ris"])
|
| 514 |
+
process_wos_btn = gr.Button("Process WOS RIS File")
|
| 515 |
+
|
| 516 |
+
with gr.Row():
|
| 517 |
+
wos_preview = gr.Textbox(label="Preview", lines=20)
|
| 518 |
+
wos_output = gr.File(label="Download Excel")
|
| 519 |
+
|
| 520 |
+
process_wos_btn.click(
|
| 521 |
+
lambda file: parse_nbib(file) if file else (None, "No file uploaded"),
|
| 522 |
+
inputs=[wos_file],
|
| 523 |
+
outputs=[wos_output, wos_preview]
|
| 524 |
+
)
|
| 525 |
+
|
| 526 |
+
with gr.Tab("Scopus"):
|
| 527 |
+
gr.Markdown("""
|
| 528 |
+
## Scopus RIS Processing
|
| 529 |
+
Upload a .ris file from Scopus to extract and convert it to Excel format. The extracted data will include:
|
| 530 |
+
- DOI
|
| 531 |
+
- Title
|
| 532 |
+
- Authors
|
| 533 |
+
- Abstract
|
| 534 |
+
""")
|
| 535 |
+
|
| 536 |
+
with gr.Row():
|
| 537 |
+
scopus_file = gr.File(label="Upload Scopus RIS File", file_types=[".ris"])
|
| 538 |
+
process_scopus_btn = gr.Button("Process Scopus RIS File")
|
| 539 |
+
|
| 540 |
+
with gr.Row():
|
| 541 |
+
scopus_preview = gr.Textbox(label="Preview", lines=20)
|
| 542 |
+
scopus_output = gr.File(label="Download Excel")
|
| 543 |
+
|
| 544 |
+
process_scopus_btn.click(
|
| 545 |
+
parse_scopus,
|
| 546 |
+
inputs=[scopus_file],
|
| 547 |
+
outputs=[scopus_output, scopus_preview]
|
| 548 |
+
)
|
| 549 |
+
|
| 550 |
+
with gr.Tab("Deduplication"):
|
| 551 |
+
gr.Markdown("""
|
| 552 |
+
## Citation Deduplication
|
| 553 |
+
Upload multiple Excel files to remove duplicate entries across different citation sources.
|
| 554 |
+
The system will identify similar entries based on title and author information.
|
| 555 |
+
|
| 556 |
+
### Features:
|
| 557 |
+
- Support for multiple Excel files
|
| 558 |
+
- Adjustable similarity threshold
|
| 559 |
+
- Detailed duplicate clusters report
|
| 560 |
+
- Standardized output format
|
| 561 |
+
""")
|
| 562 |
+
|
| 563 |
+
with gr.Row():
|
| 564 |
+
input_files = gr.File(
|
| 565 |
+
label="Upload Excel Files",
|
| 566 |
+
file_types=[".xlsx", ".xls"],
|
| 567 |
+
file_count="multiple"
|
| 568 |
+
)
|
| 569 |
+
threshold = gr.Slider(
|
| 570 |
+
label="Similarity Threshold",
|
| 571 |
+
minimum=0.1,
|
| 572 |
+
maximum=1.0,
|
| 573 |
+
value=0.8,
|
| 574 |
+
step=0.05,
|
| 575 |
+
info="Higher values mean stricter matching (0.8 recommended)"
|
| 576 |
+
)
|
| 577 |
+
|
| 578 |
+
with gr.Row():
|
| 579 |
+
process_btn = gr.Button("Process Deduplication")
|
| 580 |
+
|
| 581 |
+
with gr.Row():
|
| 582 |
+
status = gr.Textbox(label="Status", lines=5)
|
| 583 |
+
|
| 584 |
+
with gr.Row():
|
| 585 |
+
unique_output = gr.File(label="Download Deduplicated Data")
|
| 586 |
+
clusters_output = gr.File(label="Download Duplicate Clusters")
|
| 587 |
+
|
| 588 |
+
process_btn.click(
|
| 589 |
+
process_deduplication,
|
| 590 |
+
inputs=[input_files, threshold],
|
| 591 |
+
outputs=[unique_output, clusters_output, status]
|
| 592 |
+
)
|
| 593 |
+
|
| 594 |
+
with gr.Tab("LLM Analysis"):
|
| 595 |
+
with gr.Tab("PICOS Criteria"):
|
| 596 |
+
gr.Markdown("""
|
| 597 |
+
## PICOS Criteria Settings
|
| 598 |
+
Define the standard PICOS criteria that will be used by all models.
|
| 599 |
+
These criteria will be used to evaluate whether each article meets the requirements.
|
| 600 |
+
""")
|
| 601 |
+
|
| 602 |
+
with gr.Group("Standard PICOS Criteria"):
|
| 603 |
+
population = gr.Textbox(label="Population", value=analyzer.picos_criteria["population"],
|
| 604 |
+
placeholder="e.g., patients with hepatocellular carcinoma")
|
| 605 |
+
intervention = gr.Textbox(label="Intervention", value=analyzer.picos_criteria["intervention"],
|
| 606 |
+
placeholder="e.g., immunotherapy or targeted therapy")
|
| 607 |
+
comparison = gr.Textbox(label="Comparison", value=analyzer.picos_criteria["comparison"],
|
| 608 |
+
placeholder="e.g., standard therapy or placebo")
|
| 609 |
+
outcome = gr.Textbox(label="Outcome", value=analyzer.picos_criteria["outcome"],
|
| 610 |
+
placeholder="e.g., survival or response rate")
|
| 611 |
+
study_design = gr.Textbox(label="Study Design", value=analyzer.picos_criteria["study_design"],
|
| 612 |
+
placeholder="e.g., randomized controlled trial")
|
| 613 |
+
|
| 614 |
+
update_picos_btn = gr.Button("Update PICOS Criteria")
|
| 615 |
+
picos_status = gr.Textbox(label="Status")
|
| 616 |
+
|
| 617 |
+
update_picos_btn.click(
|
| 618 |
+
update_picos_criteria,
|
| 619 |
+
inputs=[population, intervention, comparison, outcome, study_design],
|
| 620 |
+
outputs=picos_status
|
| 621 |
+
)
|
| 622 |
+
|
| 623 |
+
with gr.Tab("Model Settings"):
|
| 624 |
+
for model_key in ["model_a", "model_b", "model_c"]:
|
| 625 |
+
with gr.Group(f"{model_key.upper()} Settings"):
|
| 626 |
+
config = analyzer.model_manager.get_config(model_key)
|
| 627 |
+
api_url = gr.Textbox(label="API URL", value=config["api_url"])
|
| 628 |
+
api_key = gr.Textbox(label="API Key", value=config["api_key"])
|
| 629 |
+
model_name = gr.Textbox(label="Model", value=config["model"])
|
| 630 |
+
is_inference = gr.Checkbox(
|
| 631 |
+
label="Inference Model",
|
| 632 |
+
value=config.get("is_inference", False),
|
| 633 |
+
info="Enable inference compatibility mode for models that return reasoning process"
|
| 634 |
+
)
|
| 635 |
+
temperature = gr.Slider(label="Temperature", minimum=0, maximum=10, value=config["temperature"])
|
| 636 |
+
max_tokens = gr.Number(label="Max Tokens", value=config["max_tokens"])
|
| 637 |
+
batch_size = gr.Number(label="Batch Size", value=config["batch_size"])
|
| 638 |
+
threads = gr.Slider(label="Threads", minimum=1, maximum=32, step=1, value=config["threads"])
|
| 639 |
+
timeout = gr.Number(label="Timeout (seconds)", value=config.get("timeout", 180))
|
| 640 |
+
prompt = gr.Textbox(label="Prompt Template", value=analyzer.prompt_manager.get_prompt(model_key), lines=10)
|
| 641 |
+
|
| 642 |
+
update_btn = gr.Button(f"Update {model_key.upper().replace('_', ' ')} Settings")
|
| 643 |
+
test_btn = gr.Button(f"Test {model_key.upper().replace('_', ' ')} Connection")
|
| 644 |
+
status = gr.Textbox(label="Status", lines=10)
|
| 645 |
+
|
| 646 |
+
update_btn.click(
|
| 647 |
+
update_model_settings,
|
| 648 |
+
inputs=[gr.Textbox(value=model_key, visible=False),
|
| 649 |
+
api_url,
|
| 650 |
+
api_key,
|
| 651 |
+
model_name,
|
| 652 |
+
temperature,
|
| 653 |
+
max_tokens,
|
| 654 |
+
batch_size,
|
| 655 |
+
threads,
|
| 656 |
+
prompt,
|
| 657 |
+
is_inference,
|
| 658 |
+
timeout],
|
| 659 |
+
outputs=status
|
| 660 |
+
)
|
| 661 |
+
test_btn.click(
|
| 662 |
+
test_connection,
|
| 663 |
+
inputs=[gr.Textbox(value=model_key, visible=False)],
|
| 664 |
+
outputs=status
|
| 665 |
+
)
|
| 666 |
+
|
| 667 |
+
with gr.Tab("Analysis"):
|
| 668 |
+
with gr.Row():
|
| 669 |
+
input_file = gr.File(label="Original Excel File")
|
| 670 |
+
model_a_input = gr.File(label="Model A Results")
|
| 671 |
+
model_b_input = gr.File(label="Model B Results")
|
| 672 |
+
model_c_input = gr.File(label="Model C Results")
|
| 673 |
+
|
| 674 |
+
with gr.Row():
|
| 675 |
+
model_a_btn = gr.Button("Run Model A")
|
| 676 |
+
model_b_btn = gr.Button("Run Model B")
|
| 677 |
+
model_c_btn = gr.Button("Run Model C")
|
| 678 |
+
merge_btn = gr.Button("Merge Results")
|
| 679 |
+
# Register run_all_btn with streaming enabled for intermediate updates
|
| 680 |
+
run_all_btn = gr.Button("Run All", variant="primary")
|
| 681 |
+
|
| 682 |
+
status = gr.Textbox(label="Status")
|
| 683 |
+
|
| 684 |
+
with gr.Row():
|
| 685 |
+
model_a_output = gr.File(label="Model A Results", interactive=True)
|
| 686 |
+
model_b_output = gr.File(label="Model B Results", interactive=True)
|
| 687 |
+
model_c_output = gr.File(label="Model C Results", interactive=True)
|
| 688 |
+
final_output = gr.File(label="Final Results", interactive=True)
|
| 689 |
+
|
| 690 |
+
# Individual model runs
|
| 691 |
+
model_a_btn.click(
|
| 692 |
+
lambda x: process_model(x, "model_a"),
|
| 693 |
+
inputs=[input_file],
|
| 694 |
+
outputs=[model_a_output, status]
|
| 695 |
+
)
|
| 696 |
+
model_b_btn.click(
|
| 697 |
+
lambda x, y: process_model(x, "model_b", y),
|
| 698 |
+
inputs=[input_file, model_a_input],
|
| 699 |
+
outputs=[model_b_output, status]
|
| 700 |
+
)
|
| 701 |
+
model_c_btn.click(
|
| 702 |
+
lambda x, y, z: process_model(x, "model_c", y, z),
|
| 703 |
+
inputs=[input_file, model_a_input, model_b_input],
|
| 704 |
+
outputs=[model_c_output, status]
|
| 705 |
+
)
|
| 706 |
+
merge_btn.click(
|
| 707 |
+
merge_results_with_files,
|
| 708 |
+
inputs=[input_file, model_a_input, model_b_input, model_c_input],
|
| 709 |
+
outputs=[final_output, status]
|
| 710 |
+
)
|
| 711 |
+
run_all_btn.click(
|
| 712 |
+
fn=run_all_models,
|
| 713 |
+
inputs=[input_file],
|
| 714 |
+
outputs=[model_a_output, model_b_output, model_c_output, final_output, status]
|
| 715 |
+
)
|
| 716 |
+
|
| 717 |
+
return interface
|
| 718 |
+
|
| 719 |
+
if __name__ == "__main__":
|
| 720 |
+
interface = create_gradio_interface()
|
| 721 |
+
if interface:
|
| 722 |
+
interface.launch(server_name="0.0.0.0", server_port=7860, pwa=True)
|
| 723 |
+
else:
|
| 724 |
+
print("Error: Failed to create Gradio interface")
|
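One pattern in the Model Settings tab worth noting: .click inputs must be Gradio components, so the loop passes the per-iteration model_key to its callbacks through an invisible gr.Textbox. A minimal sketch of that trick under the same assumption; the labels and the tag_message callback are illustrative:

import gradio as gr

def tag_message(which_model, text):
    return f"[{which_model}] {text}"

with gr.Blocks() as demo:
    msg = gr.Textbox(label="Message")
    out = gr.Textbox(label="Tagged")
    for key in ["model_a", "model_b", "model_c"]:
        btn = gr.Button(f"Tag as {key}")
        # The invisible Textbox smuggles the loop constant into the callback,
        # since plain Python values cannot be passed as .click inputs directly.
        btn.click(tag_message,
                  inputs=[gr.Textbox(value=key, visible=False), msg],
                  outputs=out)

# demo.launch()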
deduplicator.py
ADDED
|
@@ -0,0 +1,183 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import logging

class Deduplicator:
    def __init__(self):
        """Initialize Deduplicator with required columns for processing"""
        self.required_columns = ['Title', 'Authors', 'Abstract', 'DOI']

    def validate_dataframe(self, df):
        """
        Validate if dataframe has required columns

        Args:
            df: DataFrame to validate

        Returns:
            bool: True if validation passes

        Raises:
            ValueError: If required columns are missing
        """
        missing_cols = [col for col in self.required_columns if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {', '.join(missing_cols)}")
        return True

    def process_dataframes(self, dataframes, threshold=0.8):
        """
        Process multiple dataframes and remove duplicates

        Args:
            dataframes: List of DataFrames to process
            threshold: Similarity threshold for duplicate detection (default: 0.8)

        Returns:
            tuple: (unique_df, clusters_df) where:
                - unique_df: DataFrame containing unique entries
                - clusters_df: DataFrame containing duplicate clusters

        Raises:
            Exception: If deduplication process fails
        """
        try:
            # Validate and combine dataframes
            for df in dataframes:
                self.validate_dataframe(df)

            combined_df = pd.concat(dataframes, ignore_index=True)

            # Create Title_Author column for similarity comparison
            combined_df['Title_Author'] = combined_df['Title'].fillna('') + ' ' + combined_df['Authors'].fillna('')

            # Find duplicate clusters
            clusters_df, unique_df = self.find_duplicate_clusters(combined_df, threshold)

            # Ensure output format consistency
            unique_df = self.standardize_output(unique_df)
            clusters_df = self.standardize_clusters(clusters_df)

            return unique_df, clusters_df

        except Exception as e:
            logging.error(f"Error in deduplication process: {str(e)}")
            raise

    def find_duplicate_clusters(self, df, threshold):
        """
        Find duplicate clusters using TF-IDF and cosine similarity

        Args:
            df: DataFrame to process
            threshold: Similarity threshold for duplicate detection

        Returns:
            tuple: (clusters_df, unique_df) where:
                - clusters_df: DataFrame containing duplicate clusters
                - unique_df: DataFrame containing unique entries
        """
        # Create TF-IDF vectors for similarity comparison
        vectorizer = TfidfVectorizer().fit_transform(df['Title_Author'])
        cosine_sim = cosine_similarity(vectorizer)

        n = cosine_sim.shape[0]
        parent = list(range(n))

        def find(x):
            """Find the root of a cluster using path compression"""
            if parent[x] != x:
                parent[x] = find(parent[x])
            return parent[x]

        def union(x, y):
            """Merge two clusters by attaching rootY's tree under rootX"""
            rootX = find(x)
            rootY = find(y)
            if rootX != rootY:
                parent[rootY] = rootX

        # Build clusters using union-find
        for i in range(n):
            for j in range(i + 1, n):
                if cosine_sim[i, j] > threshold:
                    union(i, j)

        # Collect clusters and prepare output
        clusters = {}
        for i in range(n):
            root = find(i)
            if root not in clusters:
                clusters[root] = []
            clusters[root].append(i)

        # Prepare output dataframes
        cluster_data = []
        unique_indices = []

        for cluster_id, indices in clusters.items():
            if len(indices) > 1:
                for index in indices:
                    cluster_data.append({
                        "Cluster_ID": cluster_id,
                        "Index": index,
                        "Title": df.iloc[index]["Title"],
                        "Authors": df.iloc[index]["Authors"],
                        "DOI": df.iloc[index]["DOI"],
                        "Abstract": df.iloc[index]["Abstract"]
                    })
                unique_indices.append(indices[0])  # Keep first occurrence
            else:
                unique_indices.extend(indices)

        clusters_df = pd.DataFrame(cluster_data) if cluster_data else pd.DataFrame(columns=["Cluster_ID", "Index", "Title", "Authors", "DOI", "Abstract"])
        unique_df = df.iloc[unique_indices].copy()

        # Reset index to ensure it starts from 0
        unique_df = unique_df.reset_index(drop=True)
        # Add Index column that matches NBIB/RIS format
        unique_df.index.name = 'Index'

        return clusters_df, unique_df

    def standardize_output(self, df):
        """
        Ensure output dataframe has consistent format

        Args:
            df: DataFrame to standardize

        Returns:
            DataFrame with standardized format
        """
        # Make sure Index is properly set (the index name may be None)
        if df.index.name != 'Index':
            df = df.reset_index(drop=True)
            df.index.name = 'Index'

        # Ensure all required columns exist
        required_columns = ['Title', 'Authors', 'Abstract', 'DOI']
        for col in required_columns:
            if col not in df.columns:
                df[col] = ''

        # Select and order columns while preserving the index
        df = df[required_columns]
        return df

    def standardize_clusters(self, df):
        """
        Ensure clusters dataframe has consistent format

        Args:
            df: DataFrame containing cluster information

        Returns:
            DataFrame with standardized cluster format
        """
        required_columns = ['Cluster_ID', 'Index', 'Title', 'Authors', 'DOI', 'Abstract']
        for col in required_columns:
            if col not in df.columns:
                df[col] = ''
        return df[required_columns]
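A quick sanity check of the deduplicator on made-up records: two frames share one identical Title/Authors pair, so at threshold 0.8 the TF-IDF/cosine step links them and union-find collapses the pair to its first occurrence. A sketch, assuming the fabricated toy data below:

import pandas as pd
from deduplicator import Deduplicator

a = pd.DataFrame({
    'Title': ['Immunotherapy in HCC', 'A different study'],
    'Authors': ['Smith J; Lee K', 'Chan W'],
    'Abstract': ['...', '...'],
    'DOI': ['10.1000/x1', '10.1000/x2'],
})
b = pd.DataFrame({
    'Title': ['Immunotherapy in HCC'],   # duplicate of the first record in `a`
    'Authors': ['Smith J; Lee K'],
    'Abstract': ['...'],
    'DOI': ['10.1000/x1'],
})

unique_df, clusters_df = Deduplicator().process_dataframes([a, b], threshold=0.8)
print(len(unique_df))    # 2: the duplicate pair is collapsed to its first occurrence
print(len(clusters_df))  # 2: both members of the duplicate cluster are reported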
file_processor.py
ADDED
|
@@ -0,0 +1,407 @@
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import logging
|
| 4 |
+
import re
|
| 5 |
+
from typing import Tuple, Optional
|
| 6 |
+
|
| 7 |
+
class FileProcessor:
|
| 8 |
+
def __init__(self, data_dir: str):
|
| 9 |
+
"""
|
| 10 |
+
Initialize FileProcessor
|
| 11 |
+
|
| 12 |
+
Args:
|
| 13 |
+
data_dir: Directory path for storing processed data
|
| 14 |
+
"""
|
| 15 |
+
self.data_dir = data_dir
|
| 16 |
+
|
| 17 |
+
def parse_nbib(self, file_path: str) -> Tuple[Optional[str], str]:
|
| 18 |
+
"""
|
| 19 |
+
Parse NBIB file and return Excel output path and preview text
|
| 20 |
+
|
| 21 |
+
Args:
|
| 22 |
+
file_path: Path to the NBIB file to parse
|
| 23 |
+
|
| 24 |
+
Returns:
|
| 25 |
+
tuple: (output_path, preview_text) where:
|
| 26 |
+
- output_path: Path to the generated Excel file (None if parsing fails)
|
| 27 |
+
- preview_text: Preview of the parsed data or error message
|
| 28 |
+
"""
|
| 29 |
+
if not file_path or not os.path.exists(file_path):
|
| 30 |
+
return None, "Invalid file"
|
| 31 |
+
|
| 32 |
+
try:
|
| 33 |
+
records = []
|
| 34 |
+
record = {}
|
| 35 |
+
authors = []
|
| 36 |
+
current_field = None
|
| 37 |
+
|
| 38 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 39 |
+
lines = f.readlines()
|
| 40 |
+
|
| 41 |
+
if not lines:
|
| 42 |
+
return None, "Empty file"
|
| 43 |
+
|
| 44 |
+
# Process each line in the NBIB file
|
| 45 |
+
for line in lines:
|
| 46 |
+
if line.startswith('TI - '):
|
| 47 |
+
record['Title'] = line.replace('TI - ', '').strip()
|
| 48 |
+
current_field = 'Title'
|
| 49 |
+
elif line.startswith('AB - '):
|
| 50 |
+
record['Abstract'] = line.replace('AB - ', '').strip()
|
| 51 |
+
current_field = 'Abstract'
|
| 52 |
+
elif line.startswith('AU - '):
|
| 53 |
+
authors.append(line.replace('AU - ', '').strip())
|
| 54 |
+
current_field = None
|
| 55 |
+
elif line.startswith('LID - '):
|
| 56 |
+
if '[doi]' in line:
|
| 57 |
+
doi_part = line.replace('LID - ', '').strip()
|
| 58 |
+
record['DOI'] = doi_part.replace(' [doi]', '').strip()
|
| 59 |
+
current_field = None
|
| 60 |
+
elif line.startswith('PMID- '):
|
| 61 |
+
if record: # Save the previous record
|
| 62 |
+
record['Authors'] = '; '.join(authors)
|
| 63 |
+
records.append(record)
|
| 64 |
+
record = {}
|
| 65 |
+
authors = []
|
| 66 |
+
current_field = None
|
| 67 |
+
elif line.startswith(' ') and current_field in ['Abstract', 'Title']:
|
| 68 |
+
record[current_field] += ' ' + line.strip()
|
| 69 |
+
|
| 70 |
+
# Save the last record if exists
|
| 71 |
+
if record:
|
| 72 |
+
record['Authors'] = '; '.join(authors)
|
| 73 |
+
records.append(record)
|
| 74 |
+
|
| 75 |
+
# Create DataFrame and save to Excel
|
| 76 |
+
df = pd.DataFrame(records)
|
| 77 |
+
df.index.name = 'Index'
|
| 78 |
+
output_path = os.path.join(self.data_dir, "extracted_data.xlsx")
|
| 79 |
+
df.to_excel(output_path, index=True)
|
| 80 |
+
preview = self._generate_preview(records)
|
| 81 |
+
|
| 82 |
+
return output_path, preview
|
| 83 |
+
|
| 84 |
+
except Exception as e:
|
| 85 |
+
return None, f"Error processing NBIB file: {str(e)}"
|
| 86 |
+
|
| 87 |
+
def parse_wos_ris(self, file_path: str) -> Tuple[Optional[str], str]:
|
| 88 |
+
"""
|
| 89 |
+
Parse Web of Science RIS file and return Excel output path and preview text
|
| 90 |
+
|
| 91 |
+
Args:
|
| 92 |
+
file_path: Path to the WOS RIS file to parse
|
| 93 |
+
|
| 94 |
+
Returns:
|
| 95 |
+
tuple: (output_path, preview_text) where:
|
| 96 |
+
- output_path: Path to the generated Excel file (None if parsing fails)
|
| 97 |
+
- preview_text: Preview of the parsed data or error message
|
| 98 |
+
"""
|
| 99 |
+
if not file_path or not os.path.exists(file_path):
|
| 100 |
+
return None, "Invalid file"
|
| 101 |
+
|
| 102 |
+
try:
|
| 103 |
+
records = []
|
| 104 |
+
record = {}
|
| 105 |
+
authors = []
|
| 106 |
+
current_field = None
|
| 107 |
+
|
| 108 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 109 |
+
content = f.read()
|
| 110 |
+
|
| 111 |
+
if not content:
|
| 112 |
+
return None, "Empty file"
|
| 113 |
+
|
| 114 |
+
# Split content into individual articles
|
| 115 |
+
articles = content.split("\nER -")
|
| 116 |
+
|
| 117 |
+
for article in articles:
|
| 118 |
+
if not article.strip():
|
| 119 |
+
continue
|
| 120 |
+
|
| 121 |
+
record = {}
|
| 122 |
+
authors = []
|
| 123 |
+
|
| 124 |
+
# Process each line in the article
|
| 125 |
+
lines = article.strip().split('\n')
|
| 126 |
+
for line in lines:
|
| 127 |
+
if not line.strip():
|
| 128 |
+
continue
|
| 129 |
+
if line.startswith('TI - '):
|
| 130 |
+
record['Title'] = line.replace('TI - ', '').strip()
|
| 131 |
+
elif line.startswith('AB - '):
|
| 132 |
+
record['Abstract'] = line.replace('AB - ', '').strip()
|
| 133 |
+
elif line.startswith('AU - '):
|
| 134 |
+
authors.append(line.replace('AU - ', '').strip())
|
| 135 |
+
elif line.startswith('DO - '):
|
| 136 |
+
record['DOI'] = line.replace('DO - ', '').strip()
|
| 137 |
+
elif line.startswith(' '):
|
| 138 |
+
if 'Abstract' in record:
|
| 139 |
+
record['Abstract'] += ' ' + line.strip()
|
| 140 |
+
elif 'Title' in record:
|
| 141 |
+
record['Title'] += ' ' + line.strip()
|
| 142 |
+
|
| 143 |
+
if record:
|
| 144 |
+
record['Authors'] = '; '.join(authors)
|
| 145 |
+
records.append(record)
|
| 146 |
+
|
| 147 |
+
# Create DataFrame with required columns
|
| 148 |
+
df = pd.DataFrame(records)
|
| 149 |
+
required_columns = ['Title', 'Abstract', 'Authors', 'DOI']
|
| 150 |
+
for col in required_columns:
|
| 151 |
+
if col not in df.columns:
|
| 152 |
+
df[col] = ''
|
| 153 |
+
df.index.name = 'Index'
|
| 154 |
+
output_path = os.path.join(self.data_dir, "extracted_data.xlsx")
|
| 155 |
+
df.to_excel(output_path, index=True)
|
| 156 |
+
preview = self._generate_preview(records)
|
| 157 |
+
|
| 158 |
+
return output_path, preview
|
| 159 |
+
|
| 160 |
+
except Exception as e:
|
| 161 |
+
return None, f"Error processing WOS RIS file: {str(e)}"
|
| 162 |
+
|
| 163 |
+
def parse_embase_ris(self, file_path: str) -> Tuple[Optional[str], str]:
|
| 164 |
+
"""
|
| 165 |
+
Parse Embase RIS file and return Excel output path and preview text
|
| 166 |
+
|
| 167 |
+
Args:
|
| 168 |
+
file_path: Path to the Embase RIS file to parse
|
| 169 |
+
|
| 170 |
+
Returns:
|
| 171 |
+
tuple: (output_path, preview_text) where:
|
| 172 |
+
- output_path: Path to the generated Excel file (None if parsing fails)
|
| 173 |
+
- preview_text: Preview of the parsed data or error message
|
| 174 |
+
"""
|
| 175 |
+
if not file_path or not os.path.exists(file_path):
|
| 176 |
+
return None, "Invalid file"
|
| 177 |
+
|
| 178 |
+
try:
|
| 179 |
+
records = []
|
| 180 |
+
record = {}
|
| 181 |
+
authors = []
|
| 182 |
+
current_field = None
|
| 183 |
+
|
| 184 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 185 |
+
content = f.read()
|
| 186 |
+
|
| 187 |
+
if not content:
|
| 188 |
+
return None, "Empty file"
|
| 189 |
+
|
| 190 |
+
# Split content into individual articles
|
| 191 |
+
articles = content.split("\n\n")
|
| 192 |
+
|
| 193 |
+
for article in articles:
|
| 194 |
+
if not article.strip():
|
| 195 |
+
continue
|
| 196 |
+
|
| 197 |
+
record = {}
|
| 198 |
+
authors = []
|
| 199 |
+
|
| 200 |
+
# Process each line in the article
|
| 201 |
+
lines = article.strip().split('\n')
|
| 202 |
+
for line in lines:
|
| 203 |
+
if not line.strip():
|
| 204 |
+
continue
|
| 205 |
+
if line.startswith('T1 - '): # Title field
|
| 206 |
+
record['Title'] = line.replace('T1 - ', '').strip()
|
| 207 |
+
elif line.startswith('N2 - '): # Abstract field
|
| 208 |
+
record['Abstract'] = line.replace('N2 - ', '').strip()
|
| 209 |
+
elif line.startswith('A1 - '): # Authors field
|
| 210 |
+
authors.append(line.replace('A1 - ', '').strip())
|
| 211 |
+
elif line.startswith('DO - '): # DOI field
|
| 212 |
+
record['DOI'] = line.replace('DO - ', '').strip()
|
| 213 |
+
elif line.startswith(' '): # Handle multi-line fields
|
| 214 |
+
if 'Abstract' in record:
|
| 215 |
+
record['Abstract'] += ' ' + line.strip()
|
| 216 |
+
elif 'Title' in record:
|
| 217 |
+
record['Title'] += ' ' + line.strip()
|
| 218 |
+
|
| 219 |
+
if record:
|
| 220 |
+
record['Authors'] = '; '.join(authors) if authors else ''
|
| 221 |
+
records.append(record)
|
| 222 |
+
|
| 223 |
+
# Create DataFrame with required columns
|
| 224 |
+
df = pd.DataFrame(records)
|
| 225 |
+
required_columns = ['Title', 'Abstract', 'Authors', 'DOI']
|
| 226 |
+
for col in required_columns:
|
| 227 |
+
if col not in df.columns:
|
| 228 |
+
df[col] = ''
|
| 229 |
+
df.index.name = 'Index'
|
| 230 |
+
output_path = os.path.join(self.data_dir, "extracted_data.xlsx")
|
| 231 |
+
df.to_excel(output_path, index=True)
|
| 232 |
+
preview = self._generate_preview(records)
|
| 233 |
+
|
| 234 |
+
return output_path, preview
|
| 235 |
+
|
| 236 |
+
except Exception as e:
|
| 237 |
+
return None, f"Error processing Embase RIS file: {str(e)}"
|
| 238 |
+
|
| 239 |
+
def parse_scopus_ris(self, file_path: str) -> Tuple[Optional[str], str]:
|
| 240 |
+
"""
|
| 241 |
+
Parse Scopus RIS file and return Excel output path and preview text
|
| 242 |
+
|
| 243 |
+
Args:
|
| 244 |
+
file_path: Path to the Scopus RIS file to parse
|
| 245 |
+
|
| 246 |
+
Returns:
|
| 247 |
+
tuple: (output_path, preview_text) where:
|
| 248 |
+
- output_path: Path to the generated Excel file (None if parsing fails)
|
| 249 |
+
- preview_text: Preview of the parsed data or error message
|
| 250 |
+
"""
|
| 251 |
+
if not file_path or not os.path.exists(file_path):
|
| 252 |
+
return None, "Invalid file"
|
| 253 |
+
|
| 254 |
+
try:
|
| 255 |
+
records = []
|
| 256 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 257 |
+
content = f.read()
|
| 258 |
+
if not content:
|
| 259 |
+
return None, "Empty file"
|
| 260 |
+
            # Use regex to split records by "ER  - " (note the double space)
            articles = re.split(r'\nER\s*-\s*', content)

            for article in articles:
                if not article.strip():
                    continue
                record = {}
                authors = []
                lines = article.strip().split('\n')
                for raw_line in lines:
                    line = raw_line.strip()
                    if not line:
                        continue
                    if line.startswith('TI  - '):
                        record['Title'] = line.replace('TI  - ', '').strip()
                    elif line.startswith('AB  - '):
                        record['Abstract'] = line.replace('AB  - ', '').strip()
                    elif line.startswith('AU  - '):
                        authors.append(line.replace('AU  - ', '').strip())
                    elif line.startswith('DO  - '):
                        record['DOI'] = line.replace('DO  - ', '').strip()
                    elif raw_line.startswith('  '):
                        # Continuation lines in RIS exports are indented, so
                        # test the raw (unstripped) line here.
                        if 'Abstract' in record:
                            record['Abstract'] += ' ' + line
                        elif 'Title' in record:
                            record['Title'] += ' ' + line
                record['Authors'] = '; '.join(authors)
                records.append(record)

            # Create DataFrame with required columns
            df = pd.DataFrame(records)
            required_columns = ['Title', 'Abstract', 'Authors', 'DOI']
            for col in required_columns:
                if col not in df.columns:
                    df[col] = ''
            df.index.name = 'Index'
            output_path = os.path.join(self.data_dir, "extracted_data.xlsx")
            df.to_excel(output_path, index=True)
            preview = self._generate_preview(records)

            return output_path, preview

        except Exception as e:
            return None, f"Error processing Scopus RIS file: {str(e)}"

    def _generate_preview(self, records: list) -> str:
        """
        Generate a preview text for the first few parsed records

        Args:
            records: List of parsed records

        Returns:
            str: Formatted preview text showing sample records
        """
        preview = ""
        for i, record in enumerate(records[:3], 0):
            preview += f"\nRecord {i}:\n"
            preview += f"DOI: {record.get('DOI', '')[:50]}\n"
            preview += f"Title: {record.get('Title', '')[:100]}...\n"
            preview += f"Authors: {record.get('Authors', '')[:100]}...\n"
            preview += f"Abstract: {record.get('Abstract', '')[:200]}...\n"
            preview += "-" * 80 + "\n"

        preview += f"\nTotal records extracted: {len(records)}"
        return preview

    def load_excel(self, file_path: str) -> Optional[pd.DataFrame]:
        """
        Load Excel file and ensure the index is set correctly

        Args:
            file_path: Path to the Excel file to load

        Returns:
            DataFrame or None if loading fails
        """
        try:
            # First try to read with index_col=0
            df = pd.read_excel(file_path, index_col=0)

            # If Index is still in columns, it means it wasn't properly set as index
            if "Index" in df.columns:
                df.set_index("Index", inplace=True)
            elif df.index.name != "Index":
                df.index.name = "Index"

            # Ensure index is string type and handle any potential NaN values
            df.index = df.index.astype(str)
            df.index = df.index.str.strip()

            # Remove any duplicate indices by keeping the first occurrence
            if df.index.duplicated().any():
                logging.warning(f"Found duplicate indices in {file_path}")
                df = df[~df.index.duplicated(keep='first')]

            logging.debug(f"Loaded DataFrame from {file_path}")
            logging.debug(f"Shape: {df.shape}")
            logging.debug(f"Columns: {df.columns.tolist()}")
            logging.debug(f"Index name: {df.index.name}")
            logging.debug(f"First few indices: {df.index.tolist()[:5]}")

            return df
        except Exception as e:
            logging.error(f"Error loading Excel file: {str(e)}")
            return None

    def save_excel(self, df: pd.DataFrame, filename: str) -> str:
        """
        Save a DataFrame to an Excel file

        Args:
            df: DataFrame to save
            filename: Target filename

        Returns:
            str: Path to the saved file or empty string if saving fails
        """
        try:
            # Ensure we have a copy to avoid modifying the original
            df = df.copy()

            # Ensure index is properly named
            if df.index.name != "Index":
                df.index.name = "Index"

            # Ensure index is string type
            df.index = df.index.astype(str)

            # Remove any duplicate indices
            if df.index.duplicated().any():
                logging.warning(f"Found duplicate indices when saving {filename}")
                df = df[~df.index.duplicated(keep='first')]

            output_path = os.path.join(self.data_dir, filename)

            # Save with index
            df.to_excel(output_path, index=True)

            logging.debug(f"Saved DataFrame to {output_path}")
            logging.debug(f"Shape: {df.shape}")
            logging.debug(f"Columns: {df.columns.tolist()}")

            return output_path
        except Exception as e:
            logging.error(f"Error saving Excel file: {str(e)}")
            return ""
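
A minimal round-trip sketch of the Excel helpers above. The owning class and its constructor are not visible in this hunk, so the FileProcessor name, the import path, and the data_dir argument are assumptions; only load_excel and save_excel are confirmed by the code.

# Hypothetical usage sketch: FileProcessor and its data_dir argument are
# assumed; load_excel/save_excel match the methods shown above.
import pandas as pd
from file_processor import FileProcessor  # assumed import path

fp = FileProcessor(data_dir="data")  # assumed constructor

df = pd.DataFrame(
    {"Title": ["A trial"], "Abstract": ["..."],
     "Authors": ["Doe, J."], "DOI": ["10.1000/xyz"]},
    index=["0"],
)
df.index.name = "Index"

path = fp.save_excel(df, "extracted_data.xlsx")  # returns "" on failure
df2 = fp.load_excel(path)                        # returns None on failure
assert df2 is not None and df2.index.name == "Index"
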
model_manager.py
ADDED
@@ -0,0 +1,528 @@
import os
import json
import requests
import logging
import time
import re
from typing import Dict, Any
from dotenv import load_dotenv

# Ensure .env file is loaded (with override enabled to pick up any modifications)
load_dotenv(override=True)

class ModelManager:
    def __init__(self):
        # Load base configuration from environment variables
        self.model_configs = {
            "model_a": {
                "api_key": os.getenv("MODEL_A_API_KEY", ""),
                "api_url": os.getenv("MODEL_A_API_URL", ""),
                "model": os.getenv("MODEL_A_MODEL_NAME", ""),
                "name": "Model A (Primary Analyzer)",
                "temperature": float(os.getenv("MODEL_A_TEMPERATURE", "0.3")),
                "max_tokens": int(os.getenv("MODEL_A_MAX_TOKENS", "4096")),
                "batch_size": int(os.getenv("MODEL_A_BATCH_SIZE", "10")),
                "threads": int(os.getenv("MODEL_A_THREADS", "8")),
                "timeout": int(os.getenv("MODEL_A_TIMEOUT", "180")),
                "is_inference": os.getenv("MODEL_A_IS_INFERENCE", "").lower() == "true",
                "updated": False  # flag to indicate if manually updated
            },
            "model_b": {
                "api_key": os.getenv("MODEL_B_API_KEY", ""),
                "api_url": os.getenv("MODEL_B_API_URL", ""),
                "model": os.getenv("MODEL_B_MODEL_NAME", ""),
                "name": "Model B (Critical Reviewer)",
                "temperature": float(os.getenv("MODEL_B_TEMPERATURE", "0.3")),
                "max_tokens": int(os.getenv("MODEL_B_MAX_TOKENS", "4096")),
                "batch_size": int(os.getenv("MODEL_B_BATCH_SIZE", "10")),
                "threads": int(os.getenv("MODEL_B_THREADS", "8")),
                "timeout": int(os.getenv("MODEL_B_TIMEOUT", "180")),
                "is_inference": os.getenv("MODEL_B_IS_INFERENCE", "").lower() == "true",
                "updated": False
            },
            "model_c": {
                "api_key": os.getenv("MODEL_C_API_KEY", ""),
                "api_url": os.getenv("MODEL_C_API_URL", ""),
                "model": os.getenv("MODEL_C_MODEL_NAME", ""),
                "name": "Model C (Final Arbitrator)",
                "temperature": float(os.getenv("MODEL_C_TEMPERATURE", "0.3")),
                "max_tokens": int(os.getenv("MODEL_C_MAX_TOKENS", "4096")),
                "batch_size": int(os.getenv("MODEL_C_BATCH_SIZE", "10")),
                "threads": int(os.getenv("MODEL_C_THREADS", "8")),
                "timeout": int(os.getenv("MODEL_C_TIMEOUT", "180")),
                "is_inference": os.getenv("MODEL_C_IS_INFERENCE", "").lower() == "true",
                "updated": False
            }
        }

        # Validate API keys
        for model_key, config in self.model_configs.items():
            if not config["api_key"]:
                logging.warning(f"API key not found for {config['name']}")

    def update_model_config(self, model_key: str, config: Dict[str, Any]) -> None:
        """Update model configuration."""
        if model_key not in self.model_configs:
            raise ValueError(f"Invalid model key: {model_key}")
        self.model_configs[model_key].update(config)

    def process_model_response(self, model_key: str, response: str) -> Dict:
        """Process response based on model type."""
        try:
            logging.debug(f"Raw response from {model_key}: {response}")
            logging.debug(f"Response type: {type(response)}")

            # Parse outer JSON
            response_obj = json.loads(response) if isinstance(response, str) else response
            logging.debug(f"Parsed response object: {json.dumps(response_obj, indent=2)}")

            # Process based on mode
            if self.model_configs[model_key].get("is_inference", False):
                logging.debug(f"Processing {model_key} response in inference mode")
                logging.debug(f"Model config: {json.dumps(self.model_configs[model_key], indent=2)}")
                return self.process_inference_result(response_obj, model_key)

            # Get content from response
            if not isinstance(response_obj, dict):
                logging.error(f"Invalid response format from {model_key}: {response_obj}")
                return self.get_default_response(model_key)

            if "choices" not in response_obj:
                logging.error(f"No choices in response: {response_obj}")
                return self.get_default_response(model_key)

            if not response_obj["choices"]:
                logging.error(f"Empty choices in response: {response_obj}")
                return self.get_default_response(model_key)

            content = response_obj["choices"][0].get("message", {}).get("content", "")
            logging.debug(f"Extracted content: {content}")

            if not content:
                logging.error(f"Empty content in {model_key} response")
                return self.get_default_response(model_key)

            # Handle markdown code blocks
            if "```json" in content:
                pattern = r"```json\s*(.*?)\s*```"
                match = re.search(pattern, content, re.DOTALL)
                if match:
                    content = match.group(1).strip()
                    logging.debug(f"Extracted JSON from markdown: {content}")

            # Parse inner JSON
            try:
                result = json.loads(content)
                logging.debug(f"Parsed content result: {json.dumps(result, indent=2)}")

                # Validate results field
                if "results" not in result:
                    logging.error(f"Missing 'results' field in {model_key} response")
                    return self.get_default_response(model_key)

                # Validate each result item
                valid_results = []
                for item in result.get("results", []):
                    logging.debug(f"Processing result item: {json.dumps(item, indent=2)}")
                    if not isinstance(item, dict):
                        logging.error(f"Invalid result item format: {item}")
                        continue
                    if "Index" not in item:
                        logging.error(f"Missing Index in result item: {item}")
                        continue
                    valid_results.append(item)

                if not valid_results:
                    logging.error(f"No valid results found in {model_key} response")
                    return self.get_default_response(model_key)

                result["results"] = valid_results
                return result

            except json.JSONDecodeError as e:
                logging.error(f"JSON parse error for {model_key}: {str(e)}")
                logging.error(f"Content causing error: {content}")
                return self.get_default_response(model_key)

        except Exception as e:
            logging.error(f"Error processing {model_key} response: {str(e)}")
            logging.error("Full traceback:", exc_info=True)
            return self.get_default_response(model_key)

    def get_default_response(self, model_key: str) -> Dict:
        """
        Return default response format for each model type.

        Args:
            model_key: Identifier of the model.

        Returns:
            Dict containing default response structure.
        """
        if model_key == "model_a":
            return {
                "results": [{
                    "Index": "0",
                    "A_P": "not applicable",
                    "A_I": "not applicable",
                    "A_C": "not applicable",
                    "A_O": "not applicable",
                    "A_S": "not applicable",
                    "A_Decision": False,
                    "A_Reason": "API call failed or returned no results"
                }]
            }
        elif model_key == "model_b":
            return {
                "results": [{
                    "Index": "0",
                    "B_P": "not applicable",
                    "B_I": "not applicable",
                    "B_C": "not applicable",
                    "B_O": "not applicable",
                    "B_S": "not applicable",
                    "B_Decision": False,
                    "B_Reason": "API call failed or returned no results"
                }]
            }
        else:  # model_c
            return {
                "results": [{
                    "Index": "0",
                    "C_Decision": False,
                    "C_Reason": "API call failed or returned no results"
                }]
            }

    def process_inference_result(self, result: Dict, model_key: str) -> Dict:
        """
        Process inference model results.

        Args:
            result: Raw inference result.
            model_key: Identifier of the model.

        Returns:
            Dict containing processed inference results.
        """
        try:
            if not isinstance(result, dict) or "choices" not in result:
                logging.error(f"Invalid inference result format from {model_key}")
                return self.get_default_response(model_key)

            for choice in result["choices"]:
                if "message" not in choice:
                    logging.warning(f"Missing message in choice: {choice}")
                    continue

                content = choice["message"].get("content", "")
                if not content:
                    logging.warning(f"Empty content in {model_key} choice")
                    choice["message"]["content"] = json.dumps(self.get_default_response(model_key))
                    continue

                # Handle markdown code blocks
                if "```json" in content:
                    pattern = r"```json\s*(.*?)\s*```"
                    match = re.search(pattern, content, re.DOTALL)
                    if match:
                        content = match.group(1).strip()
                        logging.debug(f"Extracted JSON from markdown in inference result: {content}")

                try:
                    content_data = json.loads(content)
                    logging.debug(f"Parsed inference content: {json.dumps(content_data, indent=2, ensure_ascii=False)}")

                    # Return the parsed content data directly, not the original response
                    return content_data

                except json.JSONDecodeError as e:
                    logging.error(f"Failed to parse {model_key} inference content: {str(e)}")
                    logging.error(f"Content was: {content}")
                    return self.get_default_response(model_key)

            return self.get_default_response(model_key)

        except Exception as e:
            logging.error(f"Error processing {model_key} inference result: {str(e)}")
            return self.get_default_response(model_key)

    def process_reviews(self, result: Dict, model_key: str) -> Dict:
        """
        Process reviews format response.

        Args:
            result: Raw review data.
            model_key: Identifier of the model.

        Returns:
            Dict containing processed reviews.
        """
        try:
            if not isinstance(result.get("reviews", []), list):
                logging.error("Invalid reviews format")
                return {"reviews": []}

            field_name = "B_Reason" if model_key == "model_b" else "C_Reason"
            for review in result["reviews"]:
                if field_name in review:
                    # Remove duplicate Reason fields
                    if isinstance(review[field_name], list):
                        review[field_name] = review[field_name][-1]

                    # Process inference content (remove think tags etc.)
                    review[field_name] = self.process_inference_response(review[field_name])

            return result
        except Exception as e:
            logging.error(f"Error processing reviews: {str(e)}")
            return {"reviews": []}

    def process_inference_response(self, response: str) -> str:
        """
        Process special markers in inference response.

        Args:
            response: Raw inference response string.

        Returns:
            Processed response string with special markers removed.
        """
        try:
            if not isinstance(response, str):
                return response

            # Remove thinking process
            response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)

            # Remove HTML tags
            response = re.sub(r'<[^>]+>', '', response)

            # Clean extra whitespace
            response = re.sub(r'\n\s*\n', '\n\n', response.strip())

            return response

        except Exception as e:
            logging.error(f"Error processing inference response: {str(e)}")
            return response

    def test_api_connection(self, model_key: str) -> str:
        """
        Test API connection for a specific model.

        Args:
            model_key: Identifier of the model to test.

        Returns:
            String indicating connection status.
        """
        config = self.model_configs.get(model_key)
        if not config:
            return f"❌ Configuration not found for {model_key}"

        try:
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {config['api_key']}"
            }

            data = {
                "model": config["model"],
                "messages": [{"role": "user", "content": "test"}],
                "temperature": config["temperature"],
                "max_tokens": 10
            }

            response = requests.post(
                config["api_url"],
                headers=headers,
                json=data,
                timeout=10
            )

            if response.status_code == 200:
                return f"✓ {config['name']} connection successful"
            else:
                return f"❌ {config['name']} connection failed: {response.status_code}"

        except Exception as e:
            return f"❌ {config['name']} connection error: {str(e)}"

    def call_api(self, model_key: str, prompt: str) -> Dict:
        """Call API with retry mechanism and improved error handling."""
        try:
            config = self.model_configs.get(model_key)
            if not config:
                logging.error(f"Configuration not found for {model_key}")
                raise Exception(f"Configuration not found for {model_key}")

            logging.debug(f"API call config for {model_key}: {json.dumps({k:v for k,v in config.items() if k != 'api_key'}, indent=2)}")

            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {config['api_key']}"
            }
            logging.debug(f"Request headers: {json.dumps({k:v for k,v in headers.items() if k != 'Authorization'}, indent=2)}")

            data = {
                "model": config["model"],
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant specialized in analyzing medical literature based on PICOS criteria."},
                    {"role": "user", "content": prompt}
                ],
                "temperature": config["temperature"],
                "max_tokens": config["max_tokens"]
            }
            logging.debug(f"Request data: {json.dumps(data, indent=2)}")

            max_retries = 3
            retry_delay = 1

            for attempt in range(max_retries):
                try:
                    logging.debug(f"Attempt {attempt + 1} of {max_retries}")
                    response = requests.post(
                        config["api_url"],
                        headers=headers,
                        json=data,
                        timeout=config["timeout"]
                    )

                    logging.debug(f"API Response status: {response.status_code}")
                    logging.debug(f"API Response headers: {dict(response.headers)}")

                    if response.status_code != 200:
                        error_msg = f"API call failed for {config.get('name', model_key)}: {response.status_code} {response.reason}"
                        if response.text:
                            error_msg += f"\nResponse: {response.text}"
                        logging.error(error_msg)
                        if attempt < max_retries - 1:
                            time.sleep(retry_delay * (attempt + 1))
                            continue
                        raise Exception(error_msg)

                    return self.process_model_response(model_key, response.text)

                except requests.Timeout:
                    logging.error(f"Timeout on attempt {attempt + 1}/{max_retries}")
                    if attempt < max_retries - 1:
                        time.sleep(retry_delay * (attempt + 1))
                        continue
                    raise Exception(f"API call timed out after {max_retries} attempts")

                except Exception as e:
                    logging.error(f"API call error for {config.get('name', model_key)}: {str(e)}")
                    logging.error("Full traceback:", exc_info=True)
                    if attempt < max_retries - 1:
                        time.sleep(retry_delay)
                        continue
                    raise

            raise Exception(f"API call failed after {max_retries} attempts")

        except Exception as e:
            logging.error(f"Fatal error in API call: {str(e)}")
            logging.error("Full traceback:", exc_info=True)
            raise

    def get_config(self, model_key: str) -> Dict[str, Any]:
        """
        Get model configuration.
        This method re-reads environment variables for models that haven't been manually updated.
        """
        # Reload environment variables from .env file to capture any modifications
        load_dotenv(override=True)
        if model_key not in self.model_configs:
            return {}
        config = self.model_configs[model_key]
        if not config.get("updated", False):
            # For models not manually updated, refresh config from environment variables
            if model_key == "model_a":
                refreshed_config = {
                    "api_key": os.getenv("MODEL_A_API_KEY", ""),
                    "api_url": os.getenv("MODEL_A_API_URL", ""),
                    "model": os.getenv("MODEL_A_MODEL_NAME", ""),
                    "name": "Model A (Primary Analyzer)",
                    "temperature": float(os.getenv("MODEL_A_TEMPERATURE", "0.3")),
                    "max_tokens": int(os.getenv("MODEL_A_MAX_TOKENS", "4096")),
                    "batch_size": int(os.getenv("MODEL_A_BATCH_SIZE", "10")),
                    "threads": int(os.getenv("MODEL_A_THREADS", "8")),
                    "timeout": int(os.getenv("MODEL_A_TIMEOUT", "180")),
                    "is_inference": os.getenv("MODEL_A_IS_INFERENCE", "").lower() == "true",
                    "updated": False
                }
            elif model_key == "model_b":
                refreshed_config = {
                    "api_key": os.getenv("MODEL_B_API_KEY", ""),
                    "api_url": os.getenv("MODEL_B_API_URL", ""),
                    "model": os.getenv("MODEL_B_MODEL_NAME", ""),
                    "name": "Model B (Critical Reviewer)",
                    "temperature": float(os.getenv("MODEL_B_TEMPERATURE", "0.3")),
                    "max_tokens": int(os.getenv("MODEL_B_MAX_TOKENS", "4096")),
                    "batch_size": int(os.getenv("MODEL_B_BATCH_SIZE", "10")),
                    "threads": int(os.getenv("MODEL_B_THREADS", "8")),
                    "timeout": int(os.getenv("MODEL_B_TIMEOUT", "180")),
                    "is_inference": os.getenv("MODEL_B_IS_INFERENCE", "").lower() == "true",
                    "updated": False
                }
            elif model_key == "model_c":
                refreshed_config = {
                    "api_key": os.getenv("MODEL_C_API_KEY", ""),
                    "api_url": os.getenv("MODEL_C_API_URL", ""),
                    "model": os.getenv("MODEL_C_MODEL_NAME", ""),
                    "name": "Model C (Final Arbitrator)",
                    "temperature": float(os.getenv("MODEL_C_TEMPERATURE", "0.3")),
                    "max_tokens": int(os.getenv("MODEL_C_MAX_TOKENS", "4096")),
                    "batch_size": int(os.getenv("MODEL_C_BATCH_SIZE", "10")),
                    "threads": int(os.getenv("MODEL_C_THREADS", "8")),
                    "timeout": int(os.getenv("MODEL_C_TIMEOUT", "180")),
                    "is_inference": os.getenv("MODEL_C_IS_INFERENCE", "").lower() == "true",
                    "updated": False
                }
            else:
                refreshed_config = {}
            self.model_configs[model_key] = refreshed_config
            config = refreshed_config
        return config

    def process_analysis(self, result: Dict, model_key: str) -> Dict:
        """
        Process analysis format response.

        Args:
            result: Raw analysis data.
            model_key: Identifier of the model.

        Returns:
            Dict containing processed analysis.
        """
        try:
            if not isinstance(result.get("analysis", []), list):
                logging.error("Invalid analysis format")
                return {"analysis": []}

            # Process each analysis item
            for analysis in result["analysis"]:
                if "A_Reason" in analysis:
                    # Remove duplicate Reason fields
                    if isinstance(analysis["A_Reason"], list):
                        analysis["A_Reason"] = analysis["A_Reason"][-1]

                    # Process inference content (remove think tags etc.)
                    analysis["A_Reason"] = self.process_inference_response(analysis["A_Reason"])

                # Ensure boolean fields are proper booleans
                if "A_Decision" in analysis:
                    analysis["A_Decision"] = bool(analysis["A_Decision"])

                # Ensure all PICOS fields are strings
                for field in ["A_P", "A_I", "A_C", "A_O", "A_S"]:
                    if field in analysis:
                        analysis[field] = str(analysis[field])

            return result

        except Exception as e:
            logging.error(f"Error processing analysis: {str(e)}")
            return {"analysis": []}
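
A minimal sketch of how process_model_response unwraps a chat-completion payload, assuming the OpenAI-compatible response shape the parser above expects; the inner JSON and its markdown fence are invented for illustration.

import json
from model_manager import ModelManager

mm = ModelManager()  # reads MODEL_*_ env vars; missing keys only log warnings

# Invented chat-completion payload, with the inner JSON wrapped in a
# markdown fence the way many models emit it.
inner = {"results": [{"Index": "0", "A_P": "adults", "A_I": "drug X",
                      "A_C": "placebo", "A_O": "mortality", "A_S": "RCT",
                      "A_Decision": True, "A_Reason": "matches all criteria"}]}
payload = {"choices": [{"message": {"content": "```json\n" + json.dumps(inner) + "\n```"}}]}

parsed = mm.process_model_response("model_a", json.dumps(payload))
print(parsed["results"][0]["A_Decision"])  # True
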
prompt_manager.py
ADDED
@@ -0,0 +1,191 @@
from typing import Dict

class PromptManager:
    def __init__(self):
        self.prompts = {
            "model_a": """You are a medical research expert analyzing clinical trial abstracts.
Your task is to analyze each abstract and determine if it matches the PICOS criteria.

Target PICOS criteria:
- Population: {population}
- Intervention: {intervention}
- Comparison: {comparison}
- Outcome: {outcome}
- Study Design: {study_design}

Input abstracts:
{abstracts_json}

Each article in the input contains:
- index: article identifier
- abstract: the text to analyze

IMPORTANT: You must follow these strict JSON formatting rules:
1. Use double quotes for all strings
2. Ensure all strings are properly terminated
3. Use commas between array items and object properties
4. Do not use trailing commas
5. Keep the response concise and avoid unnecessary whitespace
6. Escape any special characters in strings
7. Use true/false (not True/False) for boolean values

Provide your analysis in this exact JSON format:
{{
    "results": [
        {{
            "Index": "ARTICLE_INDEX",
            "A_P": "brief population description",
            "A_I": "brief intervention description",
            "A_C": "brief comparison description",
            "A_O": "brief outcome description",
            "A_S": "brief study design description",
            "A_Decision": true/false,
            "A_Reason": "brief reasoning for match/mismatch"
        }},
        ...
    ]
}}

Keep all descriptions brief and focused. Do not include line breaks or special characters in the text fields.
If any field is not found in the abstract, use "not specified" as the value.
Be strict in your evaluation and ensure the output is valid JSON format.""",

            "model_b": """You are a critical reviewer in a systematic review team.
Your task is to rigorously scrutinize Model A's analysis and provide your own assessment.
You should actively look for potential flaws or oversights in Model A's analysis, while maintaining a high standard of evidence-based evaluation.

Target PICOS criteria:
- Population: {population}
- Intervention: {intervention}
- Comparison: {comparison}
- Outcome: {outcome}
- Study Design: {study_design}

Input abstracts:
{abstracts_json}

Each article in the input contains:
- Index: article identifier
- abstract: original article abstract
- model_a_analysis:
  - A_P: Model A's population description
  - A_I: Model A's intervention description
  - A_C: Model A's comparison description
  - A_O: Model A's outcome description
  - A_S: Model A's study design description
  - A_Decision: Model A's inclusion decision
  - A_Reason: Model A's explanation

Your task is to:
1. Thoroughly examine the original abstract
2. Critically review Model A's PICOS extraction, actively seeking potential issues:
   - Look for missing details or nuances in population characteristics
   - Check for precise intervention specifications
   - Verify completeness of comparison group description
   - Examine outcome measurements and their relevance
   - Scrutinize study design classification
3. Provide corrections with evidence from the abstract:
   - B_P: Your corrected population description (use "-" only if A_P is completely accurate)
   - B_I: Your corrected intervention description (use "-" only if A_I is completely accurate)
   - B_C: Your corrected comparison description (use "-" only if A_C is completely accurate)
   - B_O: Your corrected outcome description (use "-" only if A_O is completely accurate)
   - B_S: Your corrected study design description (use "-" only if A_S is completely accurate)
4. Make your own independent inclusion decision (B_Decision)
5. Provide detailed reasoning (B_Reason) that:
   - Points out any oversights or inaccuracies in Model A's analysis
   - Cites specific evidence from the abstract
   - Explains why your corrections or agreements are justified

IMPORTANT: You must follow these strict JSON formatting rules:
1. Use double quotes for all strings
2. Ensure all strings are properly terminated
3. Use commas between array items and object properties
4. Do not use trailing commas
5. Keep the response concise and avoid unnecessary whitespace
6. Escape any special characters in strings
7. Use true/false for B_Decision (true means the article should be included)
8. ALL fields (B_P, B_I, B_C, B_O, B_S) must be provided for each review
9. NEVER omit any field, even if you agree with Model A's analysis
10. For B_S specifically, you must either provide a corrected study design description or use "-" if you agree with A_S

Return your analysis in this exact JSON format:
{{
    "results": [
        {{
            "Index": "ARTICLE_INDEX",
            "B_Decision": true/false,
            "B_Reason": "detailed reasoning with evidence from abstract",
            "B_P": "-" or "corrected population description with evidence",
            "B_I": "-" or "corrected intervention description with evidence",
            "B_C": "-" or "corrected comparison description with evidence",
            "B_O": "-" or "corrected outcome description with evidence",
            "B_S": "-" or "corrected study design description with evidence"
        }},
        ...
    ]
}}

Keep descriptions focused and evidence-based. Do not include line breaks or special characters.
Use "-" only when you are completely certain that Model A's extraction is accurate and complete.
Your B_Decision should be based on whether the article meets all PICOS criteria.
Remember to be thorough in your critique while maintaining objectivity and evidence-based reasoning.

CRITICAL: You MUST include ALL fields in your response, especially B_S. If you agree with Model A's study design analysis, use "-" for B_S, but NEVER omit it.""",

            "model_c": """You are the final arbitrator in a systematic review team.
Your task is to analyze the assessments from Model A and Model B, and make a final decision.

Target PICOS criteria:
- Population: {population}
- Intervention: {intervention}
- Comparison: {comparison}
- Outcome: {outcome}
- Study Design: {study_design}

Input abstracts:
{abstracts_json}

Each article in the input contains:
- Index: article identifier
- abstract: original article abstract
- model_a_analysis: Model A's assessment
- model_b_analysis: Model B's assessment

Your task is to:
1. Review the original abstract
2. Compare Model A and Model B's assessments
3. Make a final decision considering:
   - Accuracy of PICOS criteria matching
   - Validity of reasoning from both models
   - Evidence from the abstract
4. Provide your final assessment:
   - C_Decision: final inclusion decision
   - C_Reason: detailed explanation of your decision
   - Note any disagreements between models and how you resolved them

Return your analysis in this exact JSON format:
{{
    "results": [
        {{
            "Index": "ARTICLE_INDEX",
            "C_Decision": true/false,
            "C_Reason": "detailed reasoning with evidence"
        }},
        ...
    ]
}}

Keep your reasoning focused and evidence-based.
Your C_Decision should be based on whether the article truly meets all PICOS criteria.
Be thorough in your analysis while maintaining objectivity."""
        }

    def update_prompt(self, model_key: str, prompt: str) -> None:
        """Update model prompt"""
        if model_key not in self.prompts:
            raise ValueError(f"Invalid model key: {model_key}")
        self.prompts[model_key] = prompt

    def get_prompt(self, model_key: str) -> str:
        """Get model prompt"""
        return self.prompts.get(model_key, "")
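
A minimal sketch of filling one of the templates above via str.format; the PICOS values and abstract JSON are invented. The literal JSON braces in the templates are escaped as {{ }} so format leaves them intact.

from prompt_manager import PromptManager

pm = PromptManager()

# Invented PICOS criteria and input batch for illustration.
prompt = pm.get_prompt("model_a").format(
    population="adults with type 2 diabetes",
    intervention="SGLT2 inhibitors",
    comparison="placebo",
    outcome="HbA1c reduction",
    study_design="randomized controlled trial",
    abstracts_json='[{"index": "0", "abstract": "..."}]',
)
print(prompt[:200])
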
renovate.json
ADDED
@@ -0,0 +1,6 @@
{
  "$schema": "https://docs.renovatebot.com/renovate-schema.json",
  "extends": [
    "config:recommended"
  ]
}

requirements.txt
ADDED
@@ -0,0 +1,9 @@
pandas>=1.5.0
requests>=2.31.0
python-dotenv>=1.0.0
tqdm>=4.66.0
tabulate>=0.9.0
gradio>=4.19.0
xlrd
scikit-learn>=1.3.0
openpyxl>=3.1.2

result_processor.py
ADDED
@@ -0,0 +1,393 @@
import pandas as pd
import logging
from typing import Dict
import json
import re

class ResultProcessor:
    def __init__(self):
        """Initialize ResultProcessor with required column definitions for each model"""
        # Define required columns for each model's output
        self.required_columns = {
            "model_a": ["A_Decision", "A_Reason", "A_P", "A_I", "A_C", "A_O", "A_S"],
            "model_b": ["B_Decision", "B_Reason", "B_P", "B_I", "B_C", "B_O", "B_S"],
            "model_c": ["C_Decision", "C_Reason"]
        }

        # Define the order of columns in the final Excel output
        self.output_columns = [
            "Index",
            "A_Decision", "A_Reason", "A_P", "A_I", "A_C", "A_O", "A_S",
            "B_Decision", "B_Reason", "B_P", "B_I", "B_C", "B_O", "B_S",
            "C_Decision", "C_Reason"
        ]

    def validate_model_response(self, result: Dict, model_key: str) -> None:
        """
        Validate the response format from each model

        Args:
            result: The model's response to validate
            model_key: The identifier of the model ('model_a', 'model_b', or 'model_c')

        Raises:
            Exception: If the response format is invalid
        """
        # Log validation start
        logging.debug(f"Starting validation for {model_key}")
        logging.debug(f"Raw result type: {type(result)}")

        if model_key == "model_a":
            # Check if response is in completion format
            if "choices" in result and len(result["choices"]) > 0:
                content = result["choices"][0].get("message", {}).get("content", "")
                if content:
                    try:
                        # Handle markdown-wrapped JSON content
                        json_content = content
                        if "```json" in content:
                            pattern = r"```json\s*(.*?)\s*```"
                            match = re.search(pattern, content, re.DOTALL)
                            if match:
                                json_content = match.group(1)
                                logging.debug(f"Extracted JSON content: {json_content}")

                        # Parse JSON content
                        parsed = json.loads(json_content)
                        if isinstance(parsed, dict) and "results" in parsed:
                            result.clear()
                            result.update(parsed)
                            logging.debug("Successfully parsed Model A response")
                    except json.JSONDecodeError as e:
                        raise Exception(f"Invalid JSON in Model A response content: {content}. Error: {str(e)}")

            # Validate Model A specific format
            if not isinstance(result, dict):
                raise Exception("Invalid Model A response format: result is not a dictionary")
            if "results" not in result:
                raise Exception("Invalid Model A response format: missing 'results' field")
            if not isinstance(result["results"], list):
                raise Exception("Invalid Model A response format: 'results' is not a list")
            if not result["results"]:
                raise Exception("Empty results array in Model A response")

            # Validate each result item
            for item in result["results"]:
                if not isinstance(item, dict):
                    raise Exception(f"Invalid result item format: {item}")
                if "Index" not in item:
                    raise Exception(f"Missing 'Index' in result item: {item}")
                missing_fields = [field for field in self.required_columns[model_key] if field not in item]
                if missing_fields:
                    raise Exception(f"Missing fields in result item: {missing_fields}")

        elif model_key == "model_b":
            # Handle Model B's response format
            if "choices" in result and len(result["choices"]) > 0:
                content = result["choices"][0].get("message", {}).get("content", "")
                if content:
                    try:
                        json_content = content
                        if "```json" in content:
                            pattern = r"```json\s*(.*?)\s*```"
                            match = re.search(pattern, content, re.DOTALL)
                            if match:
                                json_content = match.group(1)
                                logging.debug(f"Extracted JSON content for Model B: {json_content}")

                        parsed = json.loads(json_content)
                        if isinstance(parsed, dict) and "results" in parsed:
                            result.clear()
                            result.update(parsed)
                            logging.debug("Successfully parsed Model B response")
                    except json.JSONDecodeError as e:
                        raise Exception(f"Invalid JSON in Model B response content: {content}. Error: {str(e)}")

            # Validate Model B specific format
            if not isinstance(result, dict):
                raise Exception("Invalid Model B response format: result is not a dictionary")
            if "results" not in result:
                raise Exception("Invalid Model B response format: missing 'results' field")
            if not isinstance(result["results"], list):
                raise Exception("Invalid Model B response format: 'results' is not a list")
            if not result["results"]:
                raise Exception("Empty results array in Model B response")

            # Validate each result item
            for item in result["results"]:
                if not isinstance(item, dict):
                    raise Exception(f"Invalid result item format: {item}")
                if "Index" not in item:
                    raise Exception(f"Missing 'Index' in result item: {item}")
                missing_fields = [field for field in self.required_columns[model_key] if field not in item]
                if missing_fields:
                    raise Exception(f"Missing fields in Model B result: {missing_fields}")

        else:  # model_c
            # Handle Model C's response format
            if "choices" in result and len(result["choices"]) > 0:
                content = result["choices"][0].get("message", {}).get("content", "")
                if content:
                    try:
                        json_content = content
                        if "```json" in content:
                            pattern = r"```json\s*(.*?)\s*```"
                            match = re.search(pattern, content, re.DOTALL)
                            if match:
                                json_content = match.group(1)
                                logging.debug(f"Extracted JSON content for Model C: {json_content}")

                        parsed = json.loads(json_content)
                        if isinstance(parsed, dict) and "results" in parsed:
                            result.clear()
                            result.update(parsed)
                            logging.debug("Successfully parsed Model C response")
                    except json.JSONDecodeError as e:
                        raise Exception(f"Invalid JSON in Model C response content: {content}. Error: {str(e)}")

            # Validate Model C specific format
            if not isinstance(result, dict):
                raise Exception("Invalid Model C response format: result is not a dictionary")
            if "results" not in result:
                raise Exception("Invalid Model C response format: missing 'results' field")
            if not isinstance(result["results"], list):
                raise Exception("Invalid Model C response format: 'results' is not a list")
            if not result["results"]:
                raise Exception("Empty results array in Model C response")

            # Validate each result item
            for item in result["results"]:
                if not isinstance(item, dict):
                    raise Exception(f"Invalid result item format: {item}")
                if "Index" not in item:
                    raise Exception(f"Missing 'Index' in result item: {item}")
                missing_fields = [field for field in self.required_columns[model_key] if field not in item]
                if missing_fields:
                    raise Exception(f"Missing fields in Model C result: {missing_fields}")
                try:
                    str(item["Index"])
                    bool(item["C_Decision"])
                    str(item["C_Reason"])
                except (ValueError, TypeError) as e:
                    raise Exception(f"Invalid data type in Model C result: {str(e)}")

        # Log successful validation
        logging.debug(f"Validation completed successfully for {model_key}")

    def merge_results(self, df: pd.DataFrame, model_results: Dict[str, pd.DataFrame]) -> pd.DataFrame:
        """
        Merge all model results with correct column alignment and compute final decision

        Args:
            df: Original DataFrame with abstracts
            model_results: Dictionary containing results from each model

        Returns:
            DataFrame with merged results from all models
        """
        try:
            # Copy and clean the original DataFrame's index (remove potential whitespace)
            df = df.copy()
            df.index = df.index.astype(str).str.strip()

            # Handle missing values and clean base columns
            for col in ["Abstract", "DOI", "Title", "Authors"]:
                if col in df.columns:
                    df[col] = df[col].fillna("").astype(str)
                    df[col] = df[col].apply(lambda x: x.strip() if isinstance(x, str) else "")
                    df[col] = df[col].replace(r'^[\s-]*$', "", regex=True)

            # Create base DataFrame for merging model results
            merged_df = df.copy()

            def join_model_results(base_df: pd.DataFrame, model_key: str) -> pd.DataFrame:
                """
                Merge results from a specific model, ensuring data alignment and cleaning

                Args:
                    base_df: Base DataFrame to merge with
                    model_key: Identifier of the model

                Returns:
                    DataFrame with merged model results
                """
                if model_key not in model_results:
                    logging.warning(f"{model_key} results not found")
                    # Create default values for all rows
                    for col in self.required_columns[model_key]:
                        if col.endswith('_Decision'):
                            base_df[col] = False
                        elif col.endswith('_Reason'):
                            base_df[col] = "Not applicable - No model result"
                        else:
                            base_df[col] = "not applicable"
                    return base_df

                try:
                    model_df = model_results[model_key].copy()
                    # Ensure model result indices and column names are strings without whitespace
                    model_df.index = model_df.index.astype(str).str.strip()
                    model_df.columns = model_df.columns.astype(str).str.strip()

                    # Ensure all required columns exist
                    for col in self.required_columns[model_key]:
                        if col not in model_df.columns:
                            if col.endswith('_Decision'):
                                model_df[col] = False
                            elif col.endswith('_Reason'):
                                model_df[col] = "Not applicable - Missing column"
                            else:
                                model_df[col] = "not applicable"

                    # Add default values for indices present in original data but missing in model results
                    missing_indices = set(base_df.index) - set(model_df.index)
                    if missing_indices:
                        logging.info(f"Found {len(missing_indices)} missing entries in {model_key}")
                        default_values = pd.DataFrame(
                            index=list(missing_indices),
                            columns=self.required_columns[model_key]
                        )
                        for col in self.required_columns[model_key]:
                            if col.endswith('_Decision'):
                                default_values[col] = False
                            elif col.endswith('_Reason'):
                                default_values[col] = "Not applicable - No result"
                            else:
                                default_values[col] = "not applicable"
                        model_df = pd.concat([model_df, default_values])

                    # Select only required columns
                    model_df = model_df[self.required_columns[model_key]]

                    # Use left join to preserve all original data indices
                    result = pd.merge(
                        base_df,
                        model_df,
                        left_index=True,
                        right_index=True,
                        how='left'
                    )

                    # Fill potential NaN values
                    for col in self.required_columns[model_key]:
                        if col in result.columns:
                            if col.endswith('_Decision'):
                                result[col] = result[col].fillna(False)
                            elif col.endswith('_Reason'):
                                result[col] = result[col].fillna("Not applicable - Missing value")
                            else:
                                result[col] = result[col].fillna("not applicable")

                    return result

                except Exception as e:
                    logging.error(f"Error processing {model_key} results: {str(e)}")
                    # Return base DataFrame with default values
                    for col in self.required_columns[model_key]:
                        if col.endswith('_Decision'):
                            base_df[col] = False
                        elif col.endswith('_Reason'):
                            base_df[col] = f"Error processing {model_key} results: {str(e)}"
                        else:
                            base_df[col] = "not applicable"
                    return base_df

            # Merge results from each model in sequence
            merged_df = join_model_results(merged_df, "model_a")
            merged_df = join_model_results(merged_df, "model_b")

            # Merge Model C results or generate default values
            if "model_c" in model_results:
                merged_df = join_model_results(merged_df, "model_c")
            else:
                merged_df["C_Decision"] = False
                merged_df["C_Reason"] = merged_df.apply(
                    lambda row: "No disagreement between Model A and B"
                    if pd.notna(row.get("A_Decision")) and pd.notna(row.get("B_Decision")) and row["A_Decision"] == row["B_Decision"]
                    else "Not applicable - No Model C result",
                    axis=1
                )

            # Compute final decision based on model results
            def compute_final_decision(row):
                """
                Compute final decision based on available model decisions
                Priority: Model C > Agreement between A&B > Model B > Model A > False
                """
                try:
                    if pd.notna(row.get("C_Decision")):
                        return bool(row["C_Decision"])
                    elif pd.notna(row.get("A_Decision")) and pd.notna(row.get("B_Decision")):
                        if bool(row["A_Decision"]) == bool(row["B_Decision"]):
                            return bool(row["A_Decision"])
                        else:
                            return bool(row["B_Decision"])  # Use Model B's result in case of disagreement
                    elif pd.notna(row.get("B_Decision")):
                        return bool(row["B_Decision"])
                    elif pd.notna(row.get("A_Decision")):
                        return bool(row["A_Decision"])
                    # No decision available from any model: default to exclusion
                    return False
                except Exception as e:
                    logging.error(f"Error computing final decision: {str(e)}")
                    return False

            merged_df["Final_Decision"] = merged_df.apply(compute_final_decision, axis=1)

            # Define final output columns and their order
            output_cols = [
                "Title", "DOI", "Abstract", "Authors",
                *self.required_columns.get("model_a", []),
                *self.required_columns.get("model_b", []),
                *self.required_columns.get("model_c", []),
                "Final_Decision"
            ]

            # Ensure all required columns exist (assign default values if missing)
            for col in output_cols:
                if col not in merged_df.columns:
                    if col.endswith('Decision'):
                        merged_df[col] = False
                    elif col.endswith('Reason'):
                        merged_df[col] = "Not applicable - Missing column"
                    else:
                        merged_df[col] = ""

            # Select existing columns in the specified order
            existing_cols = [col for col in output_cols if col in merged_df.columns]
            merged_df = merged_df[existing_cols]

            # Final cleaning of all column values
            for col in merged_df.columns:
                if col.endswith('Decision'):
                    merged_df[col] = merged_df[col].fillna(False).astype(bool)
                elif col.endswith('Reason'):
                    merged_df[col] = merged_df[col].fillna("Not applicable - Missing value")
                elif col in ["Title", "DOI", "Abstract", "Authors"]:
                    merged_df[col] = merged_df[col].fillna("").astype(str)
                else:
                    merged_df[col] = merged_df[col].fillna("not applicable")

            # Add index as a column in the final result
            merged_df.insert(0, "Index", merged_df.index)

            return merged_df

        except Exception as e:
            logging.error(f"Error merging results: {str(e)}")
            # Return a minimal DataFrame with error information
            error_df = pd.DataFrame(index=df.index)
            error_df["Error"] = f"Failed to merge results: {str(e)}"
            return error_df

    def export_to_excel(self, df: pd.DataFrame, filename: str) -> None:
        """
        Export DataFrame to Excel file

        Args:
            df: DataFrame to export
            filename: Target Excel file path
        """
        try:
            df.to_excel(filename, index=False)
            logging.info(f"Exported results to {filename} successfully.")
        except Exception as e:
            logging.error(f"Error exporting to Excel: {str(e)}")
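
A minimal end-to-end sketch of merge_results and export_to_excel on invented single-row inputs, keyed by the same string index the pipeline uses; when a model_c frame is supplied, its C_Decision takes priority in compute_final_decision.

import pandas as pd
from result_processor import ResultProcessor

rp = ResultProcessor()

# Invented one-abstract inputs for illustration.
df = pd.DataFrame({"Title": ["A trial"], "DOI": ["10.1000/xyz"],
                   "Abstract": ["..."], "Authors": ["Doe, J."]}, index=["0"])
model_results = {
    "model_a": pd.DataFrame({"A_Decision": [True], "A_Reason": ["matches"],
                             "A_P": ["adults"], "A_I": ["drug X"], "A_C": ["placebo"],
                             "A_O": ["mortality"], "A_S": ["RCT"]}, index=["0"]),
    "model_b": pd.DataFrame({"B_Decision": [True], "B_Reason": ["agree"],
                             "B_P": ["-"], "B_I": ["-"], "B_C": ["-"],
                             "B_O": ["-"], "B_S": ["-"]}, index=["0"]),
    "model_c": pd.DataFrame({"C_Decision": [True],
                             "C_Reason": ["final: include"]}, index=["0"]),
}

merged = rp.merge_results(df, model_results)
print(merged[["Index", "Final_Decision"]])  # Final_Decision follows Model C
rp.export_to_excel(merged, "final_results.xlsx")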