chitsanfei commited on
Commit
d082b18
·
0 Parent(s):

init: init

Browse files
Files changed (15) hide show
  1. .Rhistory +512 -0
  2. .env.example +35 -0
  3. .github/workflows/deploy_to_hf_space.yml +43 -0
  4. .gitignore +181 -0
  5. LICENSE +661 -0
  6. README.md +97 -0
  7. analyzer.py +511 -0
  8. app.py +724 -0
  9. deduplicator.py +183 -0
  10. file_processor.py +407 -0
  11. model_manager.py +528 -0
  12. prompt_manager.py +191 -0
  13. renovate.json +6 -0
  14. requirements.txt +9 -0
  15. result_processor.py +393 -0
.Rhistory ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ b_false_c_false <- sum(!data$B_Decision & !data$C_Decision, na.rm = TRUE)
2
+ b_false_c_na <- sum(!data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
3
+ # C -> Final
4
+ c_true_final_true <- sum(data$C_Decision & data$Final_Decision, na.rm = TRUE)
5
+ c_true_final_false <- sum(data$C_Decision & !data$Final_Decision, na.rm = TRUE)
6
+ c_false_final_true <- sum(!data$C_Decision & data$Final_Decision, na.rm = TRUE)
7
+ c_false_final_false <- sum(!data$C_Decision & !data$Final_Decision, na.rm = TRUE)
8
+ c_na_final_true <- sum(is.na(data$C_Decision) & data$Final_Decision, na.rm = TRUE)
9
+ c_na_final_false <- sum(is.na(data$C_Decision) & !data$Final_Decision, na.rm = TRUE)
10
+ # 准备链接数据
11
+ links <- data.frame(
12
+ source = c(
13
+ # A -> B
14
+ rep(0, 2), rep(1, 2),
15
+ # B -> C
16
+ rep(2, 3), rep(3, 3),
17
+ # C -> Final
18
+ rep(4, 2), rep(5, 2), rep(6, 2)
19
+ ),
20
+ target = c(
21
+ # A -> B
22
+ 2, 3, 2, 3,
23
+ # B -> C
24
+ 4, 5, 6, 4, 5, 6,
25
+ # C -> Final
26
+ 7, 8, 7, 8, 7, 8
27
+ ),
28
+ value = c(
29
+ # A -> B
30
+ a_true_b_true, a_true_b_false, a_false_b_true, a_false_b_false,
31
+ # B -> C
32
+ b_true_c_true, b_true_c_false, b_true_c_na,
33
+ b_false_c_true, b_false_c_false, b_false_c_na,
34
+ # C -> Final
35
+ c_true_final_true, c_true_final_false,
36
+ c_false_final_true, c_false_final_false,
37
+ c_na_final_true, c_na_final_false
38
+ )
39
+ )
40
+ # 创建颜色向量
41
+ my_color <- 'd3.scaleOrdinal()
42
+ .domain(["Model A True", "Model A False",
43
+ "Model B True", "Model B False",
44
+ "Model C True", "Model C False", "Model C NA",
45
+ "Final True", "Final False"])
46
+ .range(["#fbf8cc", "#fde4cf",
47
+ "#FFCFD2", "#F1C0E8",
48
+ "#CFBAF0", "#A3C4F3", "#90DBF4",
49
+ "#98F5E1", "#B9FBC0"])'
50
+ # 绘制桑基图
51
+ sankeyNetwork(Links = links, Nodes = nodes,
52
+ Source = "source", Target = "target",
53
+ Value = "value", NodeID = "name",
54
+ sinksRight = TRUE,
55
+ nodeWidth = 40,
56
+ nodePadding = 20,
57
+ colourScale = my_color,
58
+ fontSize = 12,
59
+ height = 500,
60
+ width = 800)
61
+ # 保存为HTML文件
62
+ saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
63
+ Source = "source", Target = "target",
64
+ Value = "value", NodeID = "name",
65
+ sinksRight = TRUE,
66
+ nodeWidth = 40,
67
+ nodePadding = 20,
68
+ colourScale = my_color,
69
+ fontSize = 12,
70
+ height = 500,
71
+ width = 800),
72
+ "sankey_plot.html")
73
+ setwd("C:/Users/admin/Desktop/article-analyzer")
74
+ # 加载必要的包
75
+ library(networkD3)
76
+ library(dplyr)
77
+ library(readr)
78
+ # 读取数据
79
+ data <- read_csv("data/picos_analysis.csv")
80
+ # 准备节点数据
81
+ nodes <- data.frame(
82
+ name = c(
83
+ "Model A True", "Model A False",
84
+ "Model B True", "Model B False",
85
+ "Model C True", "Model C False", "Model C NA",
86
+ "Final True", "Final False"
87
+ )
88
+ )
89
+ # 计算流向
90
+ # A -> B
91
+ a_true_b_true <- sum(data$A_Decision & data$B_Decision, na.rm = TRUE)
92
+ a_true_b_false <- sum(data$A_Decision & !data$B_Decision, na.rm = TRUE)
93
+ a_false_b_true <- sum(!data$A_Decision & data$B_Decision, na.rm = TRUE)
94
+ a_false_b_false <- sum(!data$A_Decision & !data$B_Decision, na.rm = TRUE)
95
+ # B -> C
96
+ b_true_c_true <- sum(data$B_Decision & data$C_Decision, na.rm = TRUE)
97
+ b_true_c_false <- sum(data$B_Decision & !data$C_Decision, na.rm = TRUE)
98
+ b_true_c_na <- sum(data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
99
+ b_false_c_true <- sum(!data$B_Decision & data$C_Decision, na.rm = TRUE)
100
+ b_false_c_false <- sum(!data$B_Decision & !data$C_Decision, na.rm = TRUE)
101
+ b_false_c_na <- sum(!data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
102
+ # C -> Final
103
+ c_true_final_true <- sum(data$C_Decision & data$Final_Decision, na.rm = TRUE)
104
+ c_true_final_false <- sum(data$C_Decision & !data$Final_Decision, na.rm = TRUE)
105
+ c_false_final_true <- sum(!data$C_Decision & data$Final_Decision, na.rm = TRUE)
106
+ c_false_final_false <- sum(!data$C_Decision & !data$Final_Decision, na.rm = TRUE)
107
+ c_na_final_true <- sum(is.na(data$C_Decision) & data$Final_Decision, na.rm = TRUE)
108
+ c_na_final_false <- sum(is.na(data$C_Decision) & !data$Final_Decision, na.rm = TRUE)
109
+ # 准备链接数据
110
+ links <- data.frame(
111
+ source = c(
112
+ # A -> B
113
+ rep(0, 2), rep(1, 2),
114
+ # B -> C
115
+ rep(2, 3), rep(3, 3),
116
+ # C -> Final
117
+ rep(4, 2), rep(5, 2), rep(6, 2)
118
+ ),
119
+ target = c(
120
+ # A -> B
121
+ 2, 3, 2, 3,
122
+ # B -> C
123
+ 4, 5, 6, 4, 5, 6,
124
+ # C -> Final
125
+ 7, 8, 7, 8, 7, 8
126
+ ),
127
+ value = c(
128
+ # A -> B
129
+ a_true_b_true, a_true_b_false, a_false_b_true, a_false_b_false,
130
+ # B -> C
131
+ b_true_c_true, b_true_c_false, b_true_c_na,
132
+ b_false_c_true, b_false_c_false, b_false_c_na,
133
+ # C -> Final
134
+ c_true_final_true, c_true_final_false,
135
+ c_false_final_true, c_false_final_false,
136
+ c_na_final_true, c_na_final_false
137
+ )
138
+ )
139
+ # 创建颜色向量
140
+ my_color <- 'd3.scaleOrdinal()
141
+ .domain(["Model A True", "Model A False",
142
+ "Model B True", "Model B False",
143
+ "Model C True", "Model C False", "Model C NA",
144
+ "Final True", "Final False"])
145
+ .range(["#fbf8cc", "#fde4cf",
146
+ "#FFCFD2", "#F1C0E8",
147
+ "#CFBAF0", "#A3C4F3", "#90DBF4",
148
+ "#98F5E1", "#B9FBC0"])'
149
+ # 绘制桑基图
150
+ sankeyNetwork(Links = links, Nodes = nodes,
151
+ Source = "source", Target = "target",
152
+ Value = "value", NodeID = "name",
153
+ sinksRight = TRUE,
154
+ nodeWidth = 40,
155
+ nodePadding = 20,
156
+ colourScale = my_color,
157
+ fontSize = 12,
158
+ height = 500,
159
+ width = 800)
160
+ # 保存为HTML文件
161
+ saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
162
+ Source = "source", Target = "target",
163
+ Value = "value", NodeID = "name",
164
+ sinksRight = TRUE,
165
+ nodeWidth = 40,
166
+ nodePadding = 20,
167
+ colourScale = my_color,
168
+ fontSize = 12,
169
+ height = 500,
170
+ width = 800),
171
+ "sankey_plot.html")
172
+ # 保存为HTML文件
173
+ saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
174
+ Source = "source", Target = "target",
175
+ Value = "value", NodeID = "name",
176
+ sinksRight = TRUE,
177
+ nodeWidth = 40,
178
+ nodePadding = 20,
179
+ colourScale = my_color,
180
+ fontSize = 12,
181
+ height = 500,
182
+ width = 800),
183
+ "sankey_plot.html")
184
+ setwd("C:/Users/admin/Desktop/article-analyzer")
185
+ # 加载必要的包
186
+ library(networkD3)
187
+ library(dplyr)
188
+ library(readr)
189
+ # 读取数据
190
+ data <- read_csv("data/picos_analysis.csv")
191
+ # 准备节点数据
192
+ nodes <- data.frame(
193
+ name = c(
194
+ "Model A True", "Model A False",
195
+ "Model B True", "Model B False",
196
+ "Model C True", "Model C False", "Model C NA",
197
+ "Final True", "Final False"
198
+ ),
199
+ group = c(
200
+ "A True", "A False",
201
+ "B True", "B False",
202
+ "C True", "C False", "C NA",
203
+ "F True", "F False"
204
+ )
205
+ )
206
+ # 计算流向
207
+ # A -> B
208
+ a_true_b_true <- sum(data$A_Decision & data$B_Decision, na.rm = TRUE)
209
+ a_true_b_false <- sum(data$A_Decision & !data$B_Decision, na.rm = TRUE)
210
+ a_false_b_true <- sum(!data$A_Decision & data$B_Decision, na.rm = TRUE)
211
+ a_false_b_false <- sum(!data$A_Decision & !data$B_Decision, na.rm = TRUE)
212
+ # B -> C
213
+ b_true_c_true <- sum(data$B_Decision & data$C_Decision, na.rm = TRUE)
214
+ b_true_c_false <- sum(data$B_Decision & !data$C_Decision, na.rm = TRUE)
215
+ b_true_c_na <- sum(data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
216
+ b_false_c_true <- sum(!data$B_Decision & data$C_Decision, na.rm = TRUE)
217
+ b_false_c_false <- sum(!data$B_Decision & !data$C_Decision, na.rm = TRUE)
218
+ b_false_c_na <- sum(!data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
219
+ # C -> Final
220
+ c_true_final_true <- sum(data$C_Decision & data$Final_Decision, na.rm = TRUE)
221
+ c_true_final_false <- sum(data$C_Decision & !data$Final_Decision, na.rm = TRUE)
222
+ c_false_final_true <- sum(!data$C_Decision & data$Final_Decision, na.rm = TRUE)
223
+ c_false_final_false <- sum(!data$C_Decision & !data$Final_Decision, na.rm = TRUE)
224
+ c_na_final_true <- sum(is.na(data$C_Decision) & data$Final_Decision, na.rm = TRUE)
225
+ c_na_final_false <- sum(is.na(data$C_Decision) & !data$Final_Decision, na.rm = TRUE)
226
+ # 准备链接数据
227
+ links <- data.frame(
228
+ source = c(
229
+ # A -> B
230
+ rep(0, 2), rep(1, 2),
231
+ # B -> C
232
+ rep(2, 3), rep(3, 3),
233
+ # C -> Final
234
+ rep(4, 2), rep(5, 2), rep(6, 2)
235
+ ),
236
+ target = c(
237
+ # A -> B
238
+ 2, 3, 2, 3,
239
+ # B -> C
240
+ 4, 5, 6, 4, 5, 6,
241
+ # C -> Final
242
+ 7, 8, 7, 8, 7, 8
243
+ ),
244
+ value = c(
245
+ # A -> B
246
+ a_true_b_true, a_true_b_false, a_false_b_true, a_false_b_false,
247
+ # B -> C
248
+ b_true_c_true, b_true_c_false, b_true_c_na,
249
+ b_false_c_true, b_false_c_false, b_false_c_na,
250
+ # C -> Final
251
+ c_true_final_true, c_true_final_false,
252
+ c_false_final_true, c_false_final_false,
253
+ c_na_final_true, c_na_final_false
254
+ )
255
+ )
256
+ # 创建颜色向量
257
+ my_color <- 'function(d) {
258
+ const colors = {
259
+ "Model A True": "#fbf8cc",
260
+ "Model A False": "#fde4cf",
261
+ "Model B True": "#FFCFD2",
262
+ "Model B False": "#F1C0E8",
263
+ "Model C True": "#CFBAF0",
264
+ "Model C False": "#A3C4F3",
265
+ "Model C NA": "#90DBF4",
266
+ "Final True": "#98F5E1",
267
+ "Final False": "#B9FBC0"
268
+ };
269
+ return colors[d.name] || "#cccccc";
270
+ }'
271
+ # 绘制桑基图
272
+ sankeyNetwork(Links = links, Nodes = nodes,
273
+ Source = "source", Target = "target",
274
+ Value = "value", NodeID = "name",
275
+ sinksRight = TRUE,
276
+ nodeWidth = 40,
277
+ nodePadding = 20,
278
+ colourScale = my_color,
279
+ fontSize = 12,
280
+ height = 500,
281
+ width = 800)
282
+ # 保存为HTML文件
283
+ saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
284
+ Source = "source", Target = "target",
285
+ Value = "value", NodeID = "name",
286
+ sinksRight = TRUE,
287
+ nodeWidth = 40,
288
+ nodePadding = 20,
289
+ colourScale = my_color,
290
+ fontSize = 12,
291
+ height = 500,
292
+ width = 800),
293
+ "sankey_plot.html")
294
+ setwd("C:/Users/admin/Desktop/article-analyzer")
295
+ # 加载必要的包
296
+ library(networkD3)
297
+ library(dplyr)
298
+ library(readr)
299
+ # 读取数据
300
+ data <- read_csv("data/picos_analysis.csv")
301
+ # 准备节点数据
302
+ nodes <- data.frame(
303
+ name = c(
304
+ "Model A True", "Model A False",
305
+ "Model B True", "Model B False",
306
+ "Model C True", "Model C False", "Model C NA",
307
+ "Final True", "Final False"
308
+ ),
309
+ group = c(
310
+ "A True", "A False",
311
+ "B True", "B False",
312
+ "C True", "C False", "C NA",
313
+ "F True", "F False"
314
+ )
315
+ )
316
+ # 计算流向
317
+ # A -> B
318
+ a_true_b_true <- sum(data$A_Decision & data$B_Decision, na.rm = TRUE)
319
+ a_true_b_false <- sum(data$A_Decision & !data$B_Decision, na.rm = TRUE)
320
+ a_false_b_true <- sum(!data$A_Decision & data$B_Decision, na.rm = TRUE)
321
+ a_false_b_false <- sum(!data$A_Decision & !data$B_Decision, na.rm = TRUE)
322
+ # B -> C
323
+ b_true_c_true <- sum(data$B_Decision & data$C_Decision, na.rm = TRUE)
324
+ b_true_c_false <- sum(data$B_Decision & !data$C_Decision, na.rm = TRUE)
325
+ b_true_c_na <- sum(data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
326
+ b_false_c_true <- sum(!data$B_Decision & data$C_Decision, na.rm = TRUE)
327
+ b_false_c_false <- sum(!data$B_Decision & !data$C_Decision, na.rm = TRUE)
328
+ b_false_c_na <- sum(!data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
329
+ # C -> Final
330
+ c_true_final_true <- sum(data$C_Decision & data$Final_Decision, na.rm = TRUE)
331
+ c_true_final_false <- sum(data$C_Decision & !data$Final_Decision, na.rm = TRUE)
332
+ c_false_final_true <- sum(!data$C_Decision & data$Final_Decision, na.rm = TRUE)
333
+ c_false_final_false <- sum(!data$C_Decision & !data$Final_Decision, na.rm = TRUE)
334
+ c_na_final_true <- sum(is.na(data$C_Decision) & data$Final_Decision, na.rm = TRUE)
335
+ c_na_final_false <- sum(is.na(data$C_Decision) & !data$Final_Decision, na.rm = TRUE)
336
+ # 准备链接数据
337
+ links <- data.frame(
338
+ source = c(
339
+ # A -> B
340
+ rep(0, 2), rep(1, 2),
341
+ # B -> C
342
+ rep(2, 3), rep(3, 3),
343
+ # C -> Final
344
+ rep(4, 2), rep(5, 2), rep(6, 2)
345
+ ),
346
+ target = c(
347
+ # A -> B
348
+ 2, 3, 2, 3,
349
+ # B -> C
350
+ 4, 5, 6, 4, 5, 6,
351
+ # C -> Final
352
+ 7, 8, 7, 8, 7, 8
353
+ ),
354
+ value = c(
355
+ # A -> B
356
+ a_true_b_true, a_true_b_false, a_false_b_true, a_false_b_false,
357
+ # B -> C
358
+ b_true_c_true, b_true_c_false, b_true_c_na,
359
+ b_false_c_true, b_false_c_false, b_false_c_na,
360
+ # C -> Final
361
+ c_true_final_true, c_true_final_false,
362
+ c_false_final_true, c_false_final_false,
363
+ c_na_final_true, c_na_final_false
364
+ )
365
+ )
366
+ # 创建颜色向量
367
+ my_color <- 'function(d) {
368
+ const colors = {
369
+ "Model A True": "#fbf8cc",
370
+ "Model A False": "#fde4cf",
371
+ "Model B True": "#FFCFD2",
372
+ "Model B False": "#F1C0E8",
373
+ "Model C True": "#CFBAF0",
374
+ "Model C False": "#A3C4F3",
375
+ "Model C NA": "#90DBF4",
376
+ "Final True": "#98F5E1",
377
+ "Final False": "#B9FBC0"
378
+ };
379
+ return colors[d.name] || "#cccccc";
380
+ }'
381
+ # 绘制桑基图
382
+ sankeyNetwork(Links = links, Nodes = nodes,
383
+ Source = "source", Target = "target",
384
+ Value = "value", NodeID = "name",
385
+ sinksRight = TRUE,
386
+ nodeWidth = 40,
387
+ nodePadding = 20,
388
+ colourScale = my_color,
389
+ fontSize = 12,
390
+ height = 500,
391
+ width = 800)
392
+ # 保存为HTML文件
393
+ saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
394
+ Source = "source", Target = "target",
395
+ Value = "value", NodeID = "name",
396
+ sinksRight = TRUE,
397
+ nodeWidth = 40,
398
+ nodePadding = 20,
399
+ colourScale = my_color,
400
+ fontSize = 12,
401
+ height = 500,
402
+ width = 800),
403
+ "sankey_plot.html")
404
+ setwd("C:/Users/admin/Desktop/article-analyzer")
405
+ # 加载必要的包
406
+ library(networkD3)
407
+ library(dplyr)
408
+ library(readr)
409
+ # 读取数据
410
+ data <- read_csv("data/picos_analysis.csv")
411
+ # 准备节点数据
412
+ nodes <- data.frame(
413
+ name = c(
414
+ "Model A True", "Model A False",
415
+ "Model B True", "Model B False",
416
+ "Model C True", "Model C False", "Model C NA",
417
+ "Final True", "Final False"
418
+ ),
419
+ group = c(
420
+ "A True", "A False",
421
+ "B True", "B False",
422
+ "C True", "C False", "C NA",
423
+ "F True", "F False"
424
+ )
425
+ )
426
+ # 计算流向
427
+ # A -> B
428
+ a_true_b_true <- sum(data$A_Decision & data$B_Decision, na.rm = TRUE)
429
+ a_true_b_false <- sum(data$A_Decision & !data$B_Decision, na.rm = TRUE)
430
+ a_false_b_true <- sum(!data$A_Decision & data$B_Decision, na.rm = TRUE)
431
+ a_false_b_false <- sum(!data$A_Decision & !data$B_Decision, na.rm = TRUE)
432
+ # B -> C
433
+ b_true_c_true <- sum(data$B_Decision & data$C_Decision, na.rm = TRUE)
434
+ b_true_c_false <- sum(data$B_Decision & !data$C_Decision, na.rm = TRUE)
435
+ b_true_c_na <- sum(data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
436
+ b_false_c_true <- sum(!data$B_Decision & data$C_Decision, na.rm = TRUE)
437
+ b_false_c_false <- sum(!data$B_Decision & !data$C_Decision, na.rm = TRUE)
438
+ b_false_c_na <- sum(!data$B_Decision & is.na(data$C_Decision), na.rm = TRUE)
439
+ # C -> Final
440
+ c_true_final_true <- sum(data$C_Decision & data$Final_Decision, na.rm = TRUE)
441
+ c_true_final_false <- sum(data$C_Decision & !data$Final_Decision, na.rm = TRUE)
442
+ c_false_final_true <- sum(!data$C_Decision & data$Final_Decision, na.rm = TRUE)
443
+ c_false_final_false <- sum(!data$C_Decision & !data$Final_Decision, na.rm = TRUE)
444
+ c_na_final_true <- sum(is.na(data$C_Decision) & data$Final_Decision, na.rm = TRUE)
445
+ c_na_final_false <- sum(is.na(data$C_Decision) & !data$Final_Decision, na.rm = TRUE)
446
+ # 准备链接数据
447
+ links <- data.frame(
448
+ source = c(
449
+ # A -> B
450
+ rep(0, 2), rep(1, 2),
451
+ # B -> C
452
+ rep(2, 3), rep(3, 3),
453
+ # C -> Final
454
+ rep(4, 2), rep(5, 2), rep(6, 2)
455
+ ),
456
+ target = c(
457
+ # A -> B
458
+ 2, 3, 2, 3,
459
+ # B -> C
460
+ 4, 5, 6, 4, 5, 6,
461
+ # C -> Final
462
+ 7, 8, 7, 8, 7, 8
463
+ ),
464
+ value = c(
465
+ # A -> B
466
+ a_true_b_true, a_true_b_false, a_false_b_true, a_false_b_false,
467
+ # B -> C
468
+ b_true_c_true, b_true_c_false, b_true_c_na,
469
+ b_false_c_true, b_false_c_false, b_false_c_na,
470
+ # C -> Final
471
+ c_true_final_true, c_true_final_false,
472
+ c_false_final_true, c_false_final_false,
473
+ c_na_final_true, c_na_final_false
474
+ )
475
+ )
476
+ # 创建颜色向量
477
+ my_color <- paste0(
478
+ 'd3.scaleOrdinal()
479
+ .domain(["A True", "A False",
480
+ "B True", "B False",
481
+ "C True", "C False", "C NA",
482
+ "F True", "F False"])
483
+ .range(["#fbf8cc", "#fde4cf",
484
+ "#FFCFD2", "#F1C0E8",
485
+ "#CFBAF0", "#A3C4F3", "#90DBF4",
486
+ "#98F5E1", "#B9FBC0"])'
487
+ )
488
+ # 绘制桑基图
489
+ sankeyNetwork(Links = links, Nodes = nodes,
490
+ Source = "source", Target = "target",
491
+ Value = "value", NodeID = "name",
492
+ NodeGroup = "group",
493
+ sinksRight = TRUE,
494
+ nodeWidth = 40,
495
+ nodePadding = 20,
496
+ colourScale = my_color,
497
+ fontSize = 12,
498
+ height = 500,
499
+ width = 800)
500
+ # 保存为HTML文件
501
+ saveNetwork(sankeyNetwork(Links = links, Nodes = nodes,
502
+ Source = "source", Target = "target",
503
+ Value = "value", NodeID = "name",
504
+ NodeGroup = "group",
505
+ sinksRight = TRUE,
506
+ nodeWidth = 40,
507
+ nodePadding = 20,
508
+ colourScale = my_color,
509
+ fontSize = 12,
510
+ height = 500,
511
+ width = 800),
512
+ "sankey_plot.html")
.env.example ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Primary Model Configuration (Model A)
2
+ # Used for initial screening and basic PICOS criteria evaluation
3
+ MODEL_A_API_URL=https://api.example.com/v1
4
+ MODEL_A_API_KEY=your_model_a_api_key
5
+ MODEL_A_MODEL_NAME=model-a-name
6
+ MODEL_A_TEMPERATURE=0.3
7
+ MODEL_A_MAX_TOKENS=16384
8
+ MODEL_A_BATCH_SIZE=10
9
+ MODEL_A_THREADS=8
10
+ MODEL_A_TIMEOUT=180
11
+ MODEL_A_IS_INFERENCE=false
12
+
13
+ # Secondary Model Configuration (Model B)
14
+ # Used for detailed analysis and verification of Model A results
15
+ MODEL_B_API_URL=https://api.example.com/v1
16
+ MODEL_B_API_KEY=your_model_b_api_key
17
+ MODEL_B_MODEL_NAME=model-b-name
18
+ MODEL_B_TEMPERATURE=0.3
19
+ MODEL_B_MAX_TOKENS=16384
20
+ MODEL_B_BATCH_SIZE=10
21
+ MODEL_B_THREADS=8
22
+ MODEL_B_TIMEOUT=180
23
+ MODEL_B_IS_INFERENCE=false
24
+
25
+ # Arbitration Model Configuration (Model C)
26
+ # Used to resolve conflicts between Model A and B results
27
+ MODEL_C_API_URL=https://api.example.com/v1
28
+ MODEL_C_API_KEY=your_model_c_api_key
29
+ MODEL_C_MODEL_NAME=model-c-name
30
+ MODEL_C_TEMPERATURE=0.3
31
+ MODEL_C_MAX_TOKENS=16384
32
+ MODEL_C_BATCH_SIZE=10
33
+ MODEL_C_THREADS=8
34
+ MODEL_C_TIMEOUT=180
35
+ MODEL_C_IS_INFERENCE=false
.github/workflows/deploy_to_hf_space.yml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # .github/workflows/deploy_to_hf_space.yml
2
+ name: Deploy Gradio to Hugging Face Spaces
3
+
4
+ on:
5
+ push:
6
+ branches:
7
+ - master
8
+ workflow_dispatch:
9
+
10
+ jobs:
11
+ deploy:
12
+ runs-on: ubuntu-latest
13
+ env:
14
+ HF_USERNAME: chitsanfei
15
+ SPACE_NAME: review-screening-analyzer
16
+
17
+ steps:
18
+ - name: Checkout repository
19
+ uses: actions/checkout@v3
20
+ with:
21
+ fetch-depth: 0
22
+ lfs: true
23
+
24
+ - name: Set up Python 3.8
25
+ uses: actions/setup-python@v4
26
+ with:
27
+ python-version: "3.8"
28
+
29
+ - name: Install dependencies
30
+ run: pip install -r requirements.txt
31
+
32
+ - name: Push to Hugging Face Space
33
+ # HF_TOKEN 需在仓库 Settings → Secrets 中配置
34
+ env:
35
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
36
+ run: |
37
+ # 配置 Git 用户信息
38
+ git config --global user.name "${{ github.actor }}"
39
+ git config --global user.email "${{ github.actor }}@users.noreply.github.com"
40
+ # 强制推送当前 HEAD 到远端 main 分支
41
+ git push -f \
42
+ https://$HF_USERNAME:${{ secrets.HF_TOKEN }}@huggingface.co/spaces/$HF_USERNAME/$SPACE_NAME \
43
+ HEAD:main
.gitignore ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # PyPI configuration file
171
+ .pypirc
172
+
173
+ # Environment variables
174
+ .env
175
+ .env.local
176
+ .env.*.local
177
+
178
+ # For HF
179
+ .static/banner.png
180
+ data/*.xlsx
181
+
LICENSE ADDED
@@ -0,0 +1,661 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GNU AFFERO GENERAL PUBLIC LICENSE
2
+ Version 3, 19 November 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+ Preamble
9
+
10
+ The GNU Affero General Public License is a free, copyleft license for
11
+ software and other kinds of works, specifically designed to ensure
12
+ cooperation with the community in the case of network server software.
13
+
14
+ The licenses for most software and other practical works are designed
15
+ to take away your freedom to share and change the works. By contrast,
16
+ our General Public Licenses are intended to guarantee your freedom to
17
+ share and change all versions of a program--to make sure it remains free
18
+ software for all its users.
19
+
20
+ When we speak of free software, we are referring to freedom, not
21
+ price. Our General Public Licenses are designed to make sure that you
22
+ have the freedom to distribute copies of free software (and charge for
23
+ them if you wish), that you receive source code or can get it if you
24
+ want it, that you can change the software or use pieces of it in new
25
+ free programs, and that you know you can do these things.
26
+
27
+ Developers that use our General Public Licenses protect your rights
28
+ with two steps: (1) assert copyright on the software, and (2) offer
29
+ you this License which gives you legal permission to copy, distribute
30
+ and/or modify the software.
31
+
32
+ A secondary benefit of defending all users' freedom is that
33
+ improvements made in alternate versions of the program, if they
34
+ receive widespread use, become available for other developers to
35
+ incorporate. Many developers of free software are heartened and
36
+ encouraged by the resulting cooperation. However, in the case of
37
+ software used on network servers, this result may fail to come about.
38
+ The GNU General Public License permits making a modified version and
39
+ letting the public access it on a server without ever releasing its
40
+ source code to the public.
41
+
42
+ The GNU Affero General Public License is designed specifically to
43
+ ensure that, in such cases, the modified source code becomes available
44
+ to the community. It requires the operator of a network server to
45
+ provide the source code of the modified version running there to the
46
+ users of that server. Therefore, public use of a modified version, on
47
+ a publicly accessible server, gives the public access to the source
48
+ code of the modified version.
49
+
50
+ An older license, called the Affero General Public License and
51
+ published by Affero, was designed to accomplish similar goals. This is
52
+ a different license, not a version of the Affero GPL, but Affero has
53
+ released a new version of the Affero GPL which permits relicensing under
54
+ this license.
55
+
56
+ The precise terms and conditions for copying, distribution and
57
+ modification follow.
58
+
59
+ TERMS AND CONDITIONS
60
+
61
+ 0. Definitions.
62
+
63
+ "This License" refers to version 3 of the GNU Affero General Public License.
64
+
65
+ "Copyright" also means copyright-like laws that apply to other kinds of
66
+ works, such as semiconductor masks.
67
+
68
+ "The Program" refers to any copyrightable work licensed under this
69
+ License. Each licensee is addressed as "you". "Licensees" and
70
+ "recipients" may be individuals or organizations.
71
+
72
+ To "modify" a work means to copy from or adapt all or part of the work
73
+ in a fashion requiring copyright permission, other than the making of an
74
+ exact copy. The resulting work is called a "modified version" of the
75
+ earlier work or a work "based on" the earlier work.
76
+
77
+ A "covered work" means either the unmodified Program or a work based
78
+ on the Program.
79
+
80
+ To "propagate" a work means to do anything with it that, without
81
+ permission, would make you directly or secondarily liable for
82
+ infringement under applicable copyright law, except executing it on a
83
+ computer or modifying a private copy. Propagation includes copying,
84
+ distribution (with or without modification), making available to the
85
+ public, and in some countries other activities as well.
86
+
87
+ To "convey" a work means any kind of propagation that enables other
88
+ parties to make or receive copies. Mere interaction with a user through
89
+ a computer network, with no transfer of a copy, is not conveying.
90
+
91
+ An interactive user interface displays "Appropriate Legal Notices"
92
+ to the extent that it includes a convenient and prominently visible
93
+ feature that (1) displays an appropriate copyright notice, and (2)
94
+ tells the user that there is no warranty for the work (except to the
95
+ extent that warranties are provided), that licensees may convey the
96
+ work under this License, and how to view a copy of this License. If
97
+ the interface presents a list of user commands or options, such as a
98
+ menu, a prominent item in the list meets this criterion.
99
+
100
+ 1. Source Code.
101
+
102
+ The "source code" for a work means the preferred form of the work
103
+ for making modifications to it. "Object code" means any non-source
104
+ form of a work.
105
+
106
+ A "Standard Interface" means an interface that either is an official
107
+ standard defined by a recognized standards body, or, in the case of
108
+ interfaces specified for a particular programming language, one that
109
+ is widely used among developers working in that language.
110
+
111
+ The "System Libraries" of an executable work include anything, other
112
+ than the work as a whole, that (a) is included in the normal form of
113
+ packaging a Major Component, but which is not part of that Major
114
+ Component, and (b) serves only to enable use of the work with that
115
+ Major Component, or to implement a Standard Interface for which an
116
+ implementation is available to the public in source code form. A
117
+ "Major Component", in this context, means a major essential component
118
+ (kernel, window system, and so on) of the specific operating system
119
+ (if any) on which the executable work runs, or a compiler used to
120
+ produce the work, or an object code interpreter used to run it.
121
+
122
+ The "Corresponding Source" for a work in object code form means all
123
+ the source code needed to generate, install, and (for an executable
124
+ work) run the object code and to modify the work, including scripts to
125
+ control those activities. However, it does not include the work's
126
+ System Libraries, or general-purpose tools or generally available free
127
+ programs which are used unmodified in performing those activities but
128
+ which are not part of the work. For example, Corresponding Source
129
+ includes interface definition files associated with source files for
130
+ the work, and the source code for shared libraries and dynamically
131
+ linked subprograms that the work is specifically designed to require,
132
+ such as by intimate data communication or control flow between those
133
+ subprograms and other parts of the work.
134
+
135
+ The Corresponding Source need not include anything that users
136
+ can regenerate automatically from other parts of the Corresponding
137
+ Source.
138
+
139
+ The Corresponding Source for a work in source code form is that
140
+ same work.
141
+
142
+ 2. Basic Permissions.
143
+
144
+ All rights granted under this License are granted for the term of
145
+ copyright on the Program, and are irrevocable provided the stated
146
+ conditions are met. This License explicitly affirms your unlimited
147
+ permission to run the unmodified Program. The output from running a
148
+ covered work is covered by this License only if the output, given its
149
+ content, constitutes a covered work. This License acknowledges your
150
+ rights of fair use or other equivalent, as provided by copyright law.
151
+
152
+ You may make, run and propagate covered works that you do not
153
+ convey, without conditions so long as your license otherwise remains
154
+ in force. You may convey covered works to others for the sole purpose
155
+ of having them make modifications exclusively for you, or provide you
156
+ with facilities for running those works, provided that you comply with
157
+ the terms of this License in conveying all material for which you do
158
+ not control copyright. Those thus making or running the covered works
159
+ for you must do so exclusively on your behalf, under your direction
160
+ and control, on terms that prohibit them from making any copies of
161
+ your copyrighted material outside their relationship with you.
162
+
163
+ Conveying under any other circumstances is permitted solely under
164
+ the conditions stated below. Sublicensing is not allowed; section 10
165
+ makes it unnecessary.
166
+
167
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168
+
169
+ No covered work shall be deemed part of an effective technological
170
+ measure under any applicable law fulfilling obligations under article
171
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172
+ similar laws prohibiting or restricting circumvention of such
173
+ measures.
174
+
175
+ When you convey a covered work, you waive any legal power to forbid
176
+ circumvention of technological measures to the extent such circumvention
177
+ is effected by exercising rights under this License with respect to
178
+ the covered work, and you disclaim any intention to limit operation or
179
+ modification of the work as a means of enforcing, against the work's
180
+ users, your or third parties' legal rights to forbid circumvention of
181
+ technological measures.
182
+
183
+ 4. Conveying Verbatim Copies.
184
+
185
+ You may convey verbatim copies of the Program's source code as you
186
+ receive it, in any medium, provided that you conspicuously and
187
+ appropriately publish on each copy an appropriate copyright notice;
188
+ keep intact all notices stating that this License and any
189
+ non-permissive terms added in accord with section 7 apply to the code;
190
+ keep intact all notices of the absence of any warranty; and give all
191
+ recipients a copy of this License along with the Program.
192
+
193
+ You may charge any price or no price for each copy that you convey,
194
+ and you may offer support or warranty protection for a fee.
195
+
196
+ 5. Conveying Modified Source Versions.
197
+
198
+ You may convey a work based on the Program, or the modifications to
199
+ produce it from the Program, in the form of source code under the
200
+ terms of section 4, provided that you also meet all of these conditions:
201
+
202
+ a) The work must carry prominent notices stating that you modified
203
+ it, and giving a relevant date.
204
+
205
+ b) The work must carry prominent notices stating that it is
206
+ released under this License and any conditions added under section
207
+ 7. This requirement modifies the requirement in section 4 to
208
+ "keep intact all notices".
209
+
210
+ c) You must license the entire work, as a whole, under this
211
+ License to anyone who comes into possession of a copy. This
212
+ License will therefore apply, along with any applicable section 7
213
+ additional terms, to the whole of the work, and all its parts,
214
+ regardless of how they are packaged. This License gives no
215
+ permission to license the work in any other way, but it does not
216
+ invalidate such permission if you have separately received it.
217
+
218
+ d) If the work has interactive user interfaces, each must display
219
+ Appropriate Legal Notices; however, if the Program has interactive
220
+ interfaces that do not display Appropriate Legal Notices, your
221
+ work need not make them do so.
222
+
223
+ A compilation of a covered work with other separate and independent
224
+ works, which are not by their nature extensions of the covered work,
225
+ and which are not combined with it such as to form a larger program,
226
+ in or on a volume of a storage or distribution medium, is called an
227
+ "aggregate" if the compilation and its resulting copyright are not
228
+ used to limit the access or legal rights of the compilation's users
229
+ beyond what the individual works permit. Inclusion of a covered work
230
+ in an aggregate does not cause this License to apply to the other
231
+ parts of the aggregate.
232
+
233
+ 6. Conveying Non-Source Forms.
234
+
235
+ You may convey a covered work in object code form under the terms
236
+ of sections 4 and 5, provided that you also convey the
237
+ machine-readable Corresponding Source under the terms of this License,
238
+ in one of these ways:
239
+
240
+ a) Convey the object code in, or embodied in, a physical product
241
+ (including a physical distribution medium), accompanied by the
242
+ Corresponding Source fixed on a durable physical medium
243
+ customarily used for software interchange.
244
+
245
+ b) Convey the object code in, or embodied in, a physical product
246
+ (including a physical distribution medium), accompanied by a
247
+ written offer, valid for at least three years and valid for as
248
+ long as you offer spare parts or customer support for that product
249
+ model, to give anyone who possesses the object code either (1) a
250
+ copy of the Corresponding Source for all the software in the
251
+ product that is covered by this License, on a durable physical
252
+ medium customarily used for software interchange, for a price no
253
+ more than your reasonable cost of physically performing this
254
+ conveying of source, or (2) access to copy the
255
+ Corresponding Source from a network server at no charge.
256
+
257
+ c) Convey individual copies of the object code with a copy of the
258
+ written offer to provide the Corresponding Source. This
259
+ alternative is allowed only occasionally and noncommercially, and
260
+ only if you received the object code with such an offer, in accord
261
+ with subsection 6b.
262
+
263
+ d) Convey the object code by offering access from a designated
264
+ place (gratis or for a charge), and offer equivalent access to the
265
+ Corresponding Source in the same way through the same place at no
266
+ further charge. You need not require recipients to copy the
267
+ Corresponding Source along with the object code. If the place to
268
+ copy the object code is a network server, the Corresponding Source
269
+ may be on a different server (operated by you or a third party)
270
+ that supports equivalent copying facilities, provided you maintain
271
+ clear directions next to the object code saying where to find the
272
+ Corresponding Source. Regardless of what server hosts the
273
+ Corresponding Source, you remain obligated to ensure that it is
274
+ available for as long as needed to satisfy these requirements.
275
+
276
+ e) Convey the object code using peer-to-peer transmission, provided
277
+ you inform other peers where the object code and Corresponding
278
+ Source of the work are being offered to the general public at no
279
+ charge under subsection 6d.
280
+
281
+ A separable portion of the object code, whose source code is excluded
282
+ from the Corresponding Source as a System Library, need not be
283
+ included in conveying the object code work.
284
+
285
+ A "User Product" is either (1) a "consumer product", which means any
286
+ tangible personal property which is normally used for personal, family,
287
+ or household purposes, or (2) anything designed or sold for incorporation
288
+ into a dwelling. In determining whether a product is a consumer product,
289
+ doubtful cases shall be resolved in favor of coverage. For a particular
290
+ product received by a particular user, "normally used" refers to a
291
+ typical or common use of that class of product, regardless of the status
292
+ of the particular user or of the way in which the particular user
293
+ actually uses, or expects or is expected to use, the product. A product
294
+ is a consumer product regardless of whether the product has substantial
295
+ commercial, industrial or non-consumer uses, unless such uses represent
296
+ the only significant mode of use of the product.
297
+
298
+ "Installation Information" for a User Product means any methods,
299
+ procedures, authorization keys, or other information required to install
300
+ and execute modified versions of a covered work in that User Product from
301
+ a modified version of its Corresponding Source. The information must
302
+ suffice to ensure that the continued functioning of the modified object
303
+ code is in no case prevented or interfered with solely because
304
+ modification has been made.
305
+
306
+ If you convey an object code work under this section in, or with, or
307
+ specifically for use in, a User Product, and the conveying occurs as
308
+ part of a transaction in which the right of possession and use of the
309
+ User Product is transferred to the recipient in perpetuity or for a
310
+ fixed term (regardless of how the transaction is characterized), the
311
+ Corresponding Source conveyed under this section must be accompanied
312
+ by the Installation Information. But this requirement does not apply
313
+ if neither you nor any third party retains the ability to install
314
+ modified object code on the User Product (for example, the work has
315
+ been installed in ROM).
316
+
317
+ The requirement to provide Installation Information does not include a
318
+ requirement to continue to provide support service, warranty, or updates
319
+ for a work that has been modified or installed by the recipient, or for
320
+ the User Product in which it has been modified or installed. Access to a
321
+ network may be denied when the modification itself materially and
322
+ adversely affects the operation of the network or violates the rules and
323
+ protocols for communication across the network.
324
+
325
+ Corresponding Source conveyed, and Installation Information provided,
326
+ in accord with this section must be in a format that is publicly
327
+ documented (and with an implementation available to the public in
328
+ source code form), and must require no special password or key for
329
+ unpacking, reading or copying.
330
+
331
+ 7. Additional Terms.
332
+
333
+ "Additional permissions" are terms that supplement the terms of this
334
+ License by making exceptions from one or more of its conditions.
335
+ Additional permissions that are applicable to the entire Program shall
336
+ be treated as though they were included in this License, to the extent
337
+ that they are valid under applicable law. If additional permissions
338
+ apply only to part of the Program, that part may be used separately
339
+ under those permissions, but the entire Program remains governed by
340
+ this License without regard to the additional permissions.
341
+
342
+ When you convey a copy of a covered work, you may at your option
343
+ remove any additional permissions from that copy, or from any part of
344
+ it. (Additional permissions may be written to require their own
345
+ removal in certain cases when you modify the work.) You may place
346
+ additional permissions on material, added by you to a covered work,
347
+ for which you have or can give appropriate copyright permission.
348
+
349
+ Notwithstanding any other provision of this License, for material you
350
+ add to a covered work, you may (if authorized by the copyright holders of
351
+ that material) supplement the terms of this License with terms:
352
+
353
+ a) Disclaiming warranty or limiting liability differently from the
354
+ terms of sections 15 and 16 of this License; or
355
+
356
+ b) Requiring preservation of specified reasonable legal notices or
357
+ author attributions in that material or in the Appropriate Legal
358
+ Notices displayed by works containing it; or
359
+
360
+ c) Prohibiting misrepresentation of the origin of that material, or
361
+ requiring that modified versions of such material be marked in
362
+ reasonable ways as different from the original version; or
363
+
364
+ d) Limiting the use for publicity purposes of names of licensors or
365
+ authors of the material; or
366
+
367
+ e) Declining to grant rights under trademark law for use of some
368
+ trade names, trademarks, or service marks; or
369
+
370
+ f) Requiring indemnification of licensors and authors of that
371
+ material by anyone who conveys the material (or modified versions of
372
+ it) with contractual assumptions of liability to the recipient, for
373
+ any liability that these contractual assumptions directly impose on
374
+ those licensors and authors.
375
+
376
+ All other non-permissive additional terms are considered "further
377
+ restrictions" within the meaning of section 10. If the Program as you
378
+ received it, or any part of it, contains a notice stating that it is
379
+ governed by this License along with a term that is a further
380
+ restriction, you may remove that term. If a license document contains
381
+ a further restriction but permits relicensing or conveying under this
382
+ License, you may add to a covered work material governed by the terms
383
+ of that license document, provided that the further restriction does
384
+ not survive such relicensing or conveying.
385
+
386
+ If you add terms to a covered work in accord with this section, you
387
+ must place, in the relevant source files, a statement of the
388
+ additional terms that apply to those files, or a notice indicating
389
+ where to find the applicable terms.
390
+
391
+ Additional terms, permissive or non-permissive, may be stated in the
392
+ form of a separately written license, or stated as exceptions;
393
+ the above requirements apply either way.
394
+
395
+ 8. Termination.
396
+
397
+ You may not propagate or modify a covered work except as expressly
398
+ provided under this License. Any attempt otherwise to propagate or
399
+ modify it is void, and will automatically terminate your rights under
400
+ this License (including any patent licenses granted under the third
401
+ paragraph of section 11).
402
+
403
+ However, if you cease all violation of this License, then your
404
+ license from a particular copyright holder is reinstated (a)
405
+ provisionally, unless and until the copyright holder explicitly and
406
+ finally terminates your license, and (b) permanently, if the copyright
407
+ holder fails to notify you of the violation by some reasonable means
408
+ prior to 60 days after the cessation.
409
+
410
+ Moreover, your license from a particular copyright holder is
411
+ reinstated permanently if the copyright holder notifies you of the
412
+ violation by some reasonable means, this is the first time you have
413
+ received notice of violation of this License (for any work) from that
414
+ copyright holder, and you cure the violation prior to 30 days after
415
+ your receipt of the notice.
416
+
417
+ Termination of your rights under this section does not terminate the
418
+ licenses of parties who have received copies or rights from you under
419
+ this License. If your rights have been terminated and not permanently
420
+ reinstated, you do not qualify to receive new licenses for the same
421
+ material under section 10.
422
+
423
+ 9. Acceptance Not Required for Having Copies.
424
+
425
+ You are not required to accept this License in order to receive or
426
+ run a copy of the Program. Ancillary propagation of a covered work
427
+ occurring solely as a consequence of using peer-to-peer transmission
428
+ to receive a copy likewise does not require acceptance. However,
429
+ nothing other than this License grants you permission to propagate or
430
+ modify any covered work. These actions infringe copyright if you do
431
+ not accept this License. Therefore, by modifying or propagating a
432
+ covered work, you indicate your acceptance of this License to do so.
433
+
434
+ 10. Automatic Licensing of Downstream Recipients.
435
+
436
+ Each time you convey a covered work, the recipient automatically
437
+ receives a license from the original licensors, to run, modify and
438
+ propagate that work, subject to this License. You are not responsible
439
+ for enforcing compliance by third parties with this License.
440
+
441
+ An "entity transaction" is a transaction transferring control of an
442
+ organization, or substantially all assets of one, or subdividing an
443
+ organization, or merging organizations. If propagation of a covered
444
+ work results from an entity transaction, each party to that
445
+ transaction who receives a copy of the work also receives whatever
446
+ licenses to the work the party's predecessor in interest had or could
447
+ give under the previous paragraph, plus a right to possession of the
448
+ Corresponding Source of the work from the predecessor in interest, if
449
+ the predecessor has it or can get it with reasonable efforts.
450
+
451
+ You may not impose any further restrictions on the exercise of the
452
+ rights granted or affirmed under this License. For example, you may
453
+ not impose a license fee, royalty, or other charge for exercise of
454
+ rights granted under this License, and you may not initiate litigation
455
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
456
+ any patent claim is infringed by making, using, selling, offering for
457
+ sale, or importing the Program or any portion of it.
458
+
459
+ 11. Patents.
460
+
461
+ A "contributor" is a copyright holder who authorizes use under this
462
+ License of the Program or a work on which the Program is based. The
463
+ work thus licensed is called the contributor's "contributor version".
464
+
465
+ A contributor's "essential patent claims" are all patent claims
466
+ owned or controlled by the contributor, whether already acquired or
467
+ hereafter acquired, that would be infringed by some manner, permitted
468
+ by this License, of making, using, or selling its contributor version,
469
+ but do not include claims that would be infringed only as a
470
+ consequence of further modification of the contributor version. For
471
+ purposes of this definition, "control" includes the right to grant
472
+ patent sublicenses in a manner consistent with the requirements of
473
+ this License.
474
+
475
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
476
+ patent license under the contributor's essential patent claims, to
477
+ make, use, sell, offer for sale, import and otherwise run, modify and
478
+ propagate the contents of its contributor version.
479
+
480
+ In the following three paragraphs, a "patent license" is any express
481
+ agreement or commitment, however denominated, not to enforce a patent
482
+ (such as an express permission to practice a patent or covenant not to
483
+ sue for patent infringement). To "grant" such a patent license to a
484
+ party means to make such an agreement or commitment not to enforce a
485
+ patent against the party.
486
+
487
+ If you convey a covered work, knowingly relying on a patent license,
488
+ and the Corresponding Source of the work is not available for anyone
489
+ to copy, free of charge and under the terms of this License, through a
490
+ publicly available network server or other readily accessible means,
491
+ then you must either (1) cause the Corresponding Source to be so
492
+ available, or (2) arrange to deprive yourself of the benefit of the
493
+ patent license for this particular work, or (3) arrange, in a manner
494
+ consistent with the requirements of this License, to extend the patent
495
+ license to downstream recipients. "Knowingly relying" means you have
496
+ actual knowledge that, but for the patent license, your conveying the
497
+ covered work in a country, or your recipient's use of the covered work
498
+ in a country, would infringe one or more identifiable patents in that
499
+ country that you have reason to believe are valid.
500
+
501
+ If, pursuant to or in connection with a single transaction or
502
+ arrangement, you convey, or propagate by procuring conveyance of, a
503
+ covered work, and grant a patent license to some of the parties
504
+ receiving the covered work authorizing them to use, propagate, modify
505
+ or convey a specific copy of the covered work, then the patent license
506
+ you grant is automatically extended to all recipients of the covered
507
+ work and works based on it.
508
+
509
+ A patent license is "discriminatory" if it does not include within
510
+ the scope of its coverage, prohibits the exercise of, or is
511
+ conditioned on the non-exercise of one or more of the rights that are
512
+ specifically granted under this License. You may not convey a covered
513
+ work if you are a party to an arrangement with a third party that is
514
+ in the business of distributing software, under which you make payment
515
+ to the third party based on the extent of your activity of conveying
516
+ the work, and under which the third party grants, to any of the
517
+ parties who would receive the covered work from you, a discriminatory
518
+ patent license (a) in connection with copies of the covered work
519
+ conveyed by you (or copies made from those copies), or (b) primarily
520
+ for and in connection with specific products or compilations that
521
+ contain the covered work, unless you entered into that arrangement,
522
+ or that patent license was granted, prior to 28 March 2007.
523
+
524
+ Nothing in this License shall be construed as excluding or limiting
525
+ any implied license or other defenses to infringement that may
526
+ otherwise be available to you under applicable patent law.
527
+
528
+ 12. No Surrender of Others' Freedom.
529
+
530
+ If conditions are imposed on you (whether by court order, agreement or
531
+ otherwise) that contradict the conditions of this License, they do not
532
+ excuse you from the conditions of this License. If you cannot convey a
533
+ covered work so as to satisfy simultaneously your obligations under this
534
+ License and any other pertinent obligations, then as a consequence you may
535
+ not convey it at all. For example, if you agree to terms that obligate you
536
+ to collect a royalty for further conveying from those to whom you convey
537
+ the Program, the only way you could satisfy both those terms and this
538
+ License would be to refrain entirely from conveying the Program.
539
+
540
+ 13. Remote Network Interaction; Use with the GNU General Public License.
541
+
542
+ Notwithstanding any other provision of this License, if you modify the
543
+ Program, your modified version must prominently offer all users
544
+ interacting with it remotely through a computer network (if your version
545
+ supports such interaction) an opportunity to receive the Corresponding
546
+ Source of your version by providing access to the Corresponding Source
547
+ from a network server at no charge, through some standard or customary
548
+ means of facilitating copying of software. This Corresponding Source
549
+ shall include the Corresponding Source for any work covered by version 3
550
+ of the GNU General Public License that is incorporated pursuant to the
551
+ following paragraph.
552
+
553
+ Notwithstanding any other provision of this License, you have
554
+ permission to link or combine any covered work with a work licensed
555
+ under version 3 of the GNU General Public License into a single
556
+ combined work, and to convey the resulting work. The terms of this
557
+ License will continue to apply to the part which is the covered work,
558
+ but the work with which it is combined will remain governed by version
559
+ 3 of the GNU General Public License.
560
+
561
+ 14. Revised Versions of this License.
562
+
563
+ The Free Software Foundation may publish revised and/or new versions of
564
+ the GNU Affero General Public License from time to time. Such new versions
565
+ will be similar in spirit to the present version, but may differ in detail to
566
+ address new problems or concerns.
567
+
568
+ Each version is given a distinguishing version number. If the
569
+ Program specifies that a certain numbered version of the GNU Affero General
570
+ Public License "or any later version" applies to it, you have the
571
+ option of following the terms and conditions either of that numbered
572
+ version or of any later version published by the Free Software
573
+ Foundation. If the Program does not specify a version number of the
574
+ GNU Affero General Public License, you may choose any version ever published
575
+ by the Free Software Foundation.
576
+
577
+ If the Program specifies that a proxy can decide which future
578
+ versions of the GNU Affero General Public License can be used, that proxy's
579
+ public statement of acceptance of a version permanently authorizes you
580
+ to choose that version for the Program.
581
+
582
+ Later license versions may give you additional or different
583
+ permissions. However, no additional obligations are imposed on any
584
+ author or copyright holder as a result of your choosing to follow a
585
+ later version.
586
+
587
+ 15. Disclaimer of Warranty.
588
+
589
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597
+
598
+ 16. Limitation of Liability.
599
+
600
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608
+ SUCH DAMAGES.
609
+
610
+ 17. Interpretation of Sections 15 and 16.
611
+
612
+ If the disclaimer of warranty and limitation of liability provided
613
+ above cannot be given local legal effect according to their terms,
614
+ reviewing courts shall apply local law that most closely approximates
615
+ an absolute waiver of all civil liability in connection with the
616
+ Program, unless a warranty or assumption of liability accompanies a
617
+ copy of the Program in return for a fee.
618
+
619
+ END OF TERMS AND CONDITIONS
620
+
621
+ How to Apply These Terms to Your New Programs
622
+
623
+ If you develop a new program, and you want it to be of the greatest
624
+ possible use to the public, the best way to achieve this is to make it
625
+ free software which everyone can redistribute and change under these terms.
626
+
627
+ To do so, attach the following notices to the program. It is safest
628
+ to attach them to the start of each source file to most effectively
629
+ state the exclusion of warranty; and each file should have at least
630
+ the "copyright" line and a pointer to where the full notice is found.
631
+
632
+ <one line to give the program's name and a brief idea of what it does.>
633
+ Copyright (C) <year> <name of author>
634
+
635
+ This program is free software: you can redistribute it and/or modify
636
+ it under the terms of the GNU Affero General Public License as published
637
+ by the Free Software Foundation, either version 3 of the License, or
638
+ (at your option) any later version.
639
+
640
+ This program is distributed in the hope that it will be useful,
641
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
642
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643
+ GNU Affero General Public License for more details.
644
+
645
+ You should have received a copy of the GNU Affero General Public License
646
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
647
+
648
+ Also add information on how to contact you by electronic and paper mail.
649
+
650
+ If your software can interact with users remotely through a computer
651
+ network, you should also make sure that it provides a way for users to
652
+ get its source. For example, if your program is a web application, its
653
+ interface could display a "Source" link that leads users to an archive
654
+ of the code. There are many ways you could offer source, and different
655
+ solutions will be better for different programs; see section 13 for the
656
+ specific requirements.
657
+
658
+ You should also get your employer (if you work as a programmer) or school,
659
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
660
+ For more information on this, and how to apply and follow the GNU AGPL, see
661
+ <https://www.gnu.org/licenses/>.
README.md ADDED
@@ -0,0 +1,97 @@
1
+ ---
2
+ title: Review Screening Analyzer
3
+ emoji: 📚
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: "5.39.0"
8
+ app_file: app.py
9
+ pinned: true
10
+ ---
11
+
12
+ <div align="center">
13
+ <hr>
14
+ <h1>Review Screening Analyzer</h1>
15
+ <b>A Simple Literature Filtering Tool</b>
16
+ </div>
17
+
18
+ ---
19
+
20
+ > [!important]
21
+ > This project is currently under development and marked as research in progress; do not use it without the authors' permission.
22
+
23
+ > [!important]
24
+ > This is the demo code for the paper "Automated Literature Screening for Hepatocellular Carcinoma Treatment: Integrating Three Large Language Models", published in the Journal of Medical Internet Research Medical Informatics.
25
+
26
+ ## Table of Contents
27
+
28
+ - [Introduction](#introduction)
29
+ - [File Structure](#file-structure)
+ - [Usage](#usage)
30
+ - [License](#license)
31
+ - [Contact Information](#contact-information)
32
+
33
+ ---
34
+
35
+ ## Introduction
36
+
37
+ Review Screening Analyzer is a literature screening tool that combines three large language models to decide the inclusion or exclusion of studies in systematic reviews against PICOS criteria.
38
+
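+ As a reference for how the screening criteria are expressed, the default PICOS criteria in `analyzer.py` are a plain Python dictionary; the values below are the defaults shipped with this demo and can be replaced from the PICOS Criteria tab of the web interface:
+ 
+ ```python
+ picos_criteria = {
+     "population": "patients with non-alcoholic fatty liver disease (NAFLD)",
+     "intervention": "observation or management of NAFLD",
+     "comparison": "patients without NAFLD or general population",
+     "outcome": "incidence of various types of extra-hepatic cancers, such as colorectal cancer, stomach cancer, breast cancer, etc.",
+     "study_design": "retrospective cohort studies"
+ }
+ ```
+ 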
39
+ This is a demonstration project, not a production application. If you find any bugs, please report them in the Issues.
40
+
41
+
42
+ ## File Structure
43
+ ```
44
+ review-screening-analyzer/
45
+
46
+ ├── analyzer.py
47
+ ├── deduplicator.py
48
+ ├── file_processor.py
+ ├── model_manager.py
+ ├── prompt_manager.py
+ ├── result_processor.py
49
+ ├── LICENSE
50
+ ├── README.md
51
+ ├── requirements.txt
52
+ └── app.py # Gradio Entry Point
53
+ ```
54
+
55
+ ## Usage
56
+
57
+ > [!warning]
58
+ > The following describes a temporary setup for local deployment.
59
+
60
+ Please ensure that [Python](https://www.python.org/) and [pip](https://pip.pypa.io/en/stable/) are installed on your system.
61
+
62
+ Create the environment variable file `.env` in the project root directory:
63
+ ```
64
+ # API Keys
65
+ DEEPSEEK_API_KEY=
66
+ QWEN_API_KEY=
67
+ GPTGE_API_KEY=
68
+ ```
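+ 
+ The variable names suggest the three model backends (DeepSeek, Qwen, GPTGE); the keys can also be entered or overridden later in the Model Settings tab of the web interface.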
69
+
70
+ Then run the following commands:
71
+ ```bash
73
+ git clone https://github.com/chitsanfei/review-screening-analyzer.git
74
+ cd review-screening-analyzer
75
+ pip install -r requirements.txt
76
+ python3 app.py
77
+ ```
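+ 
+ Once the app starts, open the local URL Gradio prints to the console (by default http://127.0.0.1:7860, unless configured otherwise) to reach the web interface.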
78
+
79
+ ## License
80
+
81
+ This project is licensed under the GNU Affero General Public License v3.0 (AGPL-3.0); see the [LICENSE](LICENSE) file.
82
+ You are free to use, modify, and distribute this software, provided that derivative works remain open source under the same terms.
83
+ For more details, see the full [GNU AGPL v3.0 license text](https://www.gnu.org/licenses/agpl-3.0.html).
87
+
88
+ ## Contact Information
89
+
90
+ If you have any questions or suggestions, please contact us through the following methods:
91
+
92
+ - Email: chitsanfei@emu.ac.cn
93
+ - GitHub: [chitsanfei](https://github.com/chitsanfei)
94
+
95
+ ---
96
+
97
+ Thank you for using and supporting this project! 🌟
analyzer.py ADDED
@@ -0,0 +1,511 @@
1
+ import pandas as pd
2
+ import logging
3
+ import json
4
+ from typing import Dict, List, Optional, Tuple
5
+ from model_manager import ModelManager
6
+ from prompt_manager import PromptManager
7
+ from result_processor import ResultProcessor
8
+ import re
9
+ import time
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+
12
+
13
+ class PICOSAnalyzer:
14
+ def __init__(self):
15
+ # Initialize managers for models, prompts, and result processing
16
+ self.model_manager = ModelManager()
17
+ self.prompt_manager = PromptManager()
18
+ self.result_processor = ResultProcessor()
19
+ # Example PICOS filtering criteria
20
+ self.picos_criteria = {
21
+ "population": "patients with non-alcoholic fatty liver disease (NAFLD)",
22
+ "intervention": "observation or management of NAFLD",
23
+ "comparison": "patients without NAFLD or general population",
24
+ "outcome": "incidence of various types of extra-hepatic cancers, such as colorectal cancer, stomach cancer, breast cancer, etc.",
25
+ "study_design": "retrospective cohort studies"
26
+ }
27
+
28
+ def update_picos_criteria(self, criteria: Dict[str, str]) -> None:
29
+ """Update the PICOS criteria with a given dictionary of criteria."""
30
+ self.picos_criteria.update(criteria)
31
+
32
+ def update_model_config(self, model_key: str, config: Dict) -> None:
33
+ """Update configuration settings for a specific model."""
34
+ self.model_manager.update_model_config(model_key, config)
35
+
36
+ def update_prompt(self, model_key: str, prompt: str) -> None:
37
+ """Update the prompt template for a specific model."""
38
+ self.prompt_manager.update_prompt(model_key, prompt)
39
+
40
+ def test_api_connection(self, model_key: str) -> str:
41
+ """Test the API connection for the specified model."""
42
+ return self.model_manager.test_api_connection(model_key)
43
+
44
+ def _validate_data(self, idx: str, row: pd.Series, model_key: str, previous_results: Dict) -> Tuple[bool, bool]:
45
+ """
46
+ Validate the completeness of a single data item.
47
+
48
+ Returns:
49
+ Tuple[bool, bool]: (is_valid, is_empty_abstract)
50
+ """
51
+ try:
52
+ # Check if abstract exists and is not empty
53
+ if not pd.notna(row.get("Abstract")):
54
+ logging.warning(f"Empty abstract for index {idx}")
55
+ return False, True # Second value indicates empty abstract
56
+
57
+ # For Model B and C, validate Model A results
58
+ if model_key in ["model_b", "model_c"]:
59
+ if not previous_results or "model_a" not in previous_results:
60
+ logging.warning(f"Missing Model A results for {model_key}")
61
+ return False, False
62
+ if idx not in previous_results["model_a"].index:
63
+ logging.warning(f"Index {idx} not found in Model A results")
64
+ return False, False
65
+
66
+ # For Model C, validate Model B results
67
+ if model_key == "model_c":
68
+ if "model_b" not in previous_results:
69
+ logging.warning("Missing Model B results")
70
+ return False, False
71
+ if idx not in previous_results["model_b"].index:
72
+ logging.warning(f"Index {idx} not found in Model B results")
73
+ return False, False
74
+
75
+ return True, False
76
+ except Exception as e:
77
+ logging.error(f"Validation error for index {idx}: {str(e)}")
78
+ return False, False
79
+
80
+ def _process_single_item(self, idx: str, row: pd.Series, model_key: str, previous_results: Dict) -> Optional[Dict]:
81
+ """
82
+ Process a single data item and prepare it for API call.
83
+ """
84
+ try:
85
+ # Prepare base result with abstract
86
+ result = {
87
+ "Index": idx,
88
+ "abstract": str(row["Abstract"]).strip()
89
+ }
90
+
91
+ # Add Model A results for Model B and C
92
+ if model_key in ["model_b", "model_c"]:
93
+ a_result = previous_results["model_a"].loc[idx]
94
+ result["model_a_analysis"] = {
95
+ "A_Decision": bool(a_result["A_Decision"]),
96
+ "A_Reason": str(a_result["A_Reason"]),
97
+ "A_P": str(a_result["A_P"]),
98
+ "A_I": str(a_result["A_I"]),
99
+ "A_C": str(a_result["A_C"]),
100
+ "A_O": str(a_result["A_O"]),
101
+ "A_S": str(a_result["A_S"])
102
+ }
103
+
104
+ # Add Model B results for Model C
105
+ if model_key == "model_c":
106
+ b_result = previous_results["model_b"].loc[idx]
107
+ result["model_b_analysis"] = {
108
+ "B_Decision": bool(b_result["B_Decision"]),
109
+ "B_Reason": str(b_result["B_Reason"]),
110
+ "B_P": str(b_result["B_P"]),
111
+ "B_I": str(b_result["B_I"]),
112
+ "B_C": str(b_result["B_C"]),
113
+ "B_O": str(b_result["B_O"]),
114
+ "B_S": str(b_result["B_S"])
115
+ }
116
+
117
+ return result
118
+ except Exception as e:
119
+ logging.error(f"Processing error for index {idx}: {str(e)}")
120
+ return None
121
+
122
+ def _process_api_response(self, response: Dict, model_key: str) -> List[Dict]:
123
+ """
124
+ Process API response and extract results.
125
+ """
126
+ try:
127
+ if not response or not isinstance(response, dict):
128
+ logging.error(f"Invalid response format from {model_key}")
129
+ return []
130
+
131
+ # Extract results from response
132
+ if "results" not in response:
133
+ # For inference mode, try to parse from content directly (model_c only)
134
+ if model_key == "model_c" and self.model_manager.get_config(model_key).get("is_inference"):
135
+ try:
136
+ content = response.get("choices", [{}])[0].get("message", {}).get("content", "")
137
+ json_match = re.search(r"```json\s*(\{.*?\})\s*```", content, re.DOTALL)
138
+ if json_match:
139
+ content = json_match.group(1)
140
+ parsed_response = json.loads(content)
141
+ if "results" not in parsed_response:
142
+ logging.error(f"No results found in {model_key} inference response")
143
+ return []
144
+ response = parsed_response
145
+ except Exception as e:
146
+ logging.error(f"Failed to parse inference response from {model_key}: {str(e)}")
147
+ return []
148
+ else:
149
+ logging.error(f"No results found in {model_key} response")
150
+ return []
151
+
152
+ results = response["results"]
153
+ if not isinstance(results, list):
154
+ logging.error(f"Results from {model_key} is not a list")
155
+ return []
156
+
157
+ # Validate each result
158
+ valid_results = []
159
+ for result in results:
160
+ if not isinstance(result, dict) or "Index" not in result:
161
+ logging.warning(f"Invalid result format in {model_key} response: {result}")
162
+ continue
163
+
164
+ # Ensure all required fields are present based on model type
165
+ if model_key == "model_a":
166
+ required_fields = ["A_P", "A_I", "A_C", "A_O", "A_S", "A_Decision", "A_Reason"]
167
+ elif model_key == "model_b":
168
+ required_fields = ["B_P", "B_I", "B_C", "B_O", "B_S", "B_Decision", "B_Reason"]
169
+ else: # model_c
170
+ required_fields = ["C_Decision", "C_Reason"]
171
+
172
+ missing_fields = [field for field in required_fields if field not in result]
173
+ if missing_fields:
174
+ logging.warning(f"Missing fields {missing_fields} in {model_key} result for Index {result['Index']}")
175
+ continue
176
+
177
+ # Convert decision to boolean if it's a string
178
+ if model_key == "model_c" and isinstance(result.get("C_Decision"), str):
179
+ result["C_Decision"] = result["C_Decision"].lower() == "true"
180
+
181
+ valid_results.append(result)
182
+
183
+ return valid_results
184
+
185
+ except Exception as e:
186
+ logging.error(f"Error processing {model_key} response: {str(e)}")
187
+ return []
188
+
189
+ def process_batch(self, df: pd.DataFrame, model_key: str, previous_results: Dict = None, progress_callback=None) -> pd.DataFrame:
190
+ """
191
+ Process a batch of data with improved data flow and validation.
192
+ """
193
+ # Get model configuration
194
+ config = self.model_manager.get_config(model_key)
195
+ batch_size = config["batch_size"]
196
+ threads = config["threads"]
197
+ results_dict = {} # Use dictionary to prevent duplicate indices
198
+ failed_indices = set()
199
+ total_rows = len(df)
200
+ start_time = time.time()
201
+ processed_count = 0
202
+ skipped_count = 0
203
+
204
+ # Ensure consistent index type
205
+ df.index = df.index.astype(str)
206
+ if previous_results:
207
+ for key in previous_results:
208
+ previous_results[key].index = previous_results[key].index.astype(str)
209
+
210
+ # For Model C, first identify indices where A and B disagree
211
+ if model_key == "model_c":
212
+ disagreement_indices = []
213
+ for idx in df.index:
214
+ try:
215
+ if not self._validate_previous_results(idx, model_key, previous_results):
216
+ empty_result = self._create_empty_result(idx, model_key, "Invalid or missing previous results")
217
+ results_dict[str(idx)] = empty_result
218
+ failed_indices.add(str(idx))
219
+ if progress_callback:
220
+ progress_callback(idx, True, False)
221
+ continue
222
+
223
+ if self._check_disagreement(idx, previous_results):
224
+ disagreement_indices.append(idx)
225
+ else:
226
+ # If no disagreement, use Model A's decision
227
+ no_disagreement_result = self._create_no_disagreement_result(idx, previous_results)
228
+ results_dict[str(idx)] = no_disagreement_result
229
+ skipped_count += 1
230
+ if progress_callback:
231
+ progress_callback(idx, False, False)
232
+ except Exception as e:
233
+ logging.error(f"Error checking disagreement for index {idx}: {str(e)}")
234
+ empty_result = self._create_empty_result(idx, model_key, f"Error: {str(e)}")
235
+ results_dict[str(idx)] = empty_result
236
+ failed_indices.add(str(idx))
237
+ if progress_callback:
238
+ progress_callback(idx, True, False)
239
+
240
+ # Update df to only include disagreement cases for Model C
241
+ if disagreement_indices:
242
+ df = df.loc[disagreement_indices]
243
+ else:
244
+ # If no disagreements, return results with default values
245
+ results = list(results_dict.values())
246
+ results_df = pd.DataFrame(results)
247
+ results_df.set_index("Index", inplace=True)
248
+ results_df.index = results_df.index.astype(str)
249
+ return results_df
250
+
251
+ def process_batch_data(batch_df: pd.DataFrame) -> List[Dict]:
252
+ nonlocal processed_count, skipped_count
253
+ batch_results = []
254
+ empty_results = []
255
+
256
+ # Process each item in the batch
257
+ for idx, row in batch_df.iterrows():
258
+ try:
259
+ # Skip if already processed (for Model C)
260
+ if str(idx) in results_dict:
261
+ skipped_count += 1
262
+ continue
263
+
264
+ # Validate data completeness
265
+ is_valid, is_empty = self._validate_data(idx, row, model_key, previous_results)
266
+ if not is_valid:
267
+ empty_result = self._create_empty_result(idx, model_key, "Not processed - Empty abstract" if is_empty else "Not processed - Invalid data")
268
+ empty_results.append(empty_result)
269
+ failed_indices.add(idx)
270
+ if progress_callback:
271
+ progress_callback(idx, True, is_empty)
272
+ continue
273
+
274
+ # Prepare data for API call
275
+ abstract_text = row.get("Abstract", "").strip()
276
+ if not abstract_text:
277
+ empty_result = self._create_empty_result(idx, model_key, "Not processed - Empty abstract")
278
+ empty_results.append(empty_result)
279
+ failed_indices.add(idx)
280
+ if progress_callback:
281
+ progress_callback(idx, True, True)
282
+ continue
283
+
284
+ # Add to batch for processing
285
+ batch_item = self._process_single_item(idx, row, model_key, previous_results)
286
+ if batch_item:
287
+ batch_results.append(batch_item)
288
+ else:
289
+ empty_result = self._create_empty_result(idx, model_key, "Error preparing batch data")
290
+ empty_results.append(empty_result)
291
+ failed_indices.add(idx)
292
+ if progress_callback:
293
+ progress_callback(idx, True, False)
294
+
295
+ except Exception as e:
296
+ logging.error(f"Error preparing data for index {idx}: {str(e)}")
297
+ empty_result = self._create_empty_result(idx, model_key, f"Error: {str(e)}")
298
+ empty_results.append(empty_result)
299
+ failed_indices.add(idx)
300
+ if progress_callback:
301
+ progress_callback(idx, True, False)
302
+
303
+ # Process batch with API if there are valid entries
304
+ if batch_results:
305
+ try:
306
+ # Prepare prompt with PICOS criteria and batch data
307
+ prompt = self.prompt_manager.get_prompt(model_key).format(
308
+ **{
309
+ **self.picos_criteria,
310
+ "abstracts_json": json.dumps(batch_results, ensure_ascii=False, indent=2)
311
+ }
312
+ )
313
+
314
+ # Call API and process response
315
+ response = self.model_manager.call_api(model_key, prompt)
316
+ api_results = self._process_api_response(response, model_key)
317
+
318
+ # If API call failed or returned no results, create empty results for all items
319
+ if not api_results:
320
+ for item in batch_results:
321
+ empty_result = self._create_empty_result(item["Index"], model_key, "API call failed or returned no results")
322
+ empty_results.append(empty_result)
323
+ if progress_callback:
324
+ progress_callback(item["Index"], True, False)
325
+ else:
326
+ # Update progress for successfully processed items
327
+ for result in api_results:
328
+ if progress_callback:
329
+ progress_callback(result["Index"], False, False)
330
+ # Add result to the batch results
331
+ results_dict[str(result["Index"])] = result
332
+ processed_count += 1
333
+
334
+ # Calculate time statistics
335
+ elapsed_time = time.time() - start_time
336
+ if processed_count > 0:
337
+ avg_time_per_item = elapsed_time / processed_count
338
+ remaining_items = total_rows - (processed_count + len(failed_indices) + skipped_count)
339
+ estimated_remaining_time = avg_time_per_item * remaining_items
340
+
341
+ # Log detailed progress information
342
+ logging.info(
343
+ f"{model_key.upper()} Progress: "
344
+ f"Processed: {processed_count} - "
345
+ f"Remaining: {remaining_items} - "
346
+ f"Skipped: {skipped_count} - "
347
+ f"Elapsed Time: {elapsed_time:.1f}s - "
348
+ f"Est. Remaining: {estimated_remaining_time:.1f}s"
349
+ )
350
+
351
+ return api_results + empty_results
352
+
353
+ except Exception as e:
354
+ error_msg = f"Error processing batch: {str(e)}"
355
+ logging.error(error_msg)
356
+ for item in batch_results:
357
+ empty_result = self._create_empty_result(item["Index"], model_key, error_msg)
358
+ empty_results.append(empty_result)
359
+ failed_indices.add(item["Index"])
360
+ if progress_callback:
361
+ progress_callback(item["Index"], True, False)
362
+
363
+ return empty_results
364
+
365
+ # Process batches using thread pool
366
+ with ThreadPoolExecutor(max_workers=threads) as executor:
367
+ futures = []
368
+ for i in range(0, len(df), batch_size):
369
+ batch_df = df.iloc[i:i + batch_size]
370
+ futures.append(executor.submit(process_batch_data, batch_df))
371
+
372
+ # Collect results
373
+ for future in as_completed(futures):
374
+ try:
375
+ batch_results = future.result()
376
+ # Store results in dictionary to handle potential duplicates
377
+ for result in batch_results:
378
+ idx = str(result["Index"])
379
+ results_dict[idx] = result
380
+ except Exception as e:
381
+ error_msg = f"Error collecting batch results: {str(e)}"
382
+ logging.error(error_msg)
383
+
384
+ # Convert results dictionary to DataFrame
385
+ results = list(results_dict.values())
386
+ results_df = pd.DataFrame(results)
387
+
388
+ if not results_df.empty:
389
+ # Set index properly
390
+ results_df.set_index("Index", inplace=True)
391
+ results_df.index = results_df.index.astype(str)
392
+
393
+ # Ensure all required columns exist with default values
394
+ for col in self._get_model_columns(model_key):
395
+ if col not in results_df.columns:
396
+ if col.endswith("_Decision"):
397
+ results_df[col] = False
398
+ elif col.endswith("_Reason"):
399
+ results_df[col] = "Not provided"
400
+ else:
401
+ results_df[col] = "not applicable"
402
+
403
+ # Convert boolean columns
404
+ decision_columns = [col for col in results_df.columns if col.endswith("_Decision")]
405
+ for col in decision_columns:
406
+ results_df[col] = results_df[col].astype(bool)
407
+ else:
408
+ # Create empty DataFrame with required columns
409
+ results_df = pd.DataFrame(columns=self._get_model_columns(model_key))
410
+ results_df.index.name = "Index"
411
+
412
+ # Log final statistics
413
+ total_time = time.time() - start_time
414
+ success_rate = ((total_rows - len(failed_indices)) / total_rows) * 100
415
+ logging.info(f"{model_key.upper()} completed in {total_time:.1f}s - "
416
+ f"Success rate: {success_rate:.1f}% ({total_rows - len(failed_indices)}/{total_rows})")
417
+
418
+ return results_df
419
+
420
+ def merge_results(self, df: pd.DataFrame, model_results: Dict) -> pd.DataFrame:
421
+ """Merge results from all models into a single DataFrame."""
422
+ return self.result_processor.merge_results(df, model_results)
423
+
424
+ def _create_empty_result(self, idx: str, model_key: str, reason: Optional[str] = None) -> Dict:
425
+ """
426
+ Create a default empty result entry for cases where the abstract is empty
427
+ or previous results are missing. The default reason is 'Not applicable' if not provided.
428
+ """
429
+ default_reason = reason if reason is not None else "Not applicable - Empty or invalid data"
430
+ result = {"Index": str(idx)}
431
+ if model_key == "model_a":
432
+ result.update({
433
+ "A_P": "not applicable",
434
+ "A_I": "not applicable",
435
+ "A_C": "not applicable",
436
+ "A_O": "not applicable",
437
+ "A_S": "not applicable",
438
+ "A_Decision": False,
439
+ "A_Reason": default_reason
440
+ })
441
+ elif model_key == "model_b":
442
+ result.update({
443
+ "B_P": "not applicable",
444
+ "B_I": "not applicable",
445
+ "B_C": "not applicable",
446
+ "B_O": "not applicable",
447
+ "B_S": "not applicable",
448
+ "B_Decision": False,
449
+ "B_Reason": default_reason
450
+ })
451
+ else: # For model_c
452
+ result.update({
453
+ "C_Decision": False,
454
+ "C_Reason": default_reason
455
+ })
456
+ return result
457
+
458
+ def _create_no_disagreement_result(self, idx: str, previous_results: Dict) -> Dict:
459
+ """
460
+ When Model A and Model B agree on the decision,
461
+ directly return Model A's result with a note indicating no disagreement.
462
+ """
463
+ str_idx = str(idx)
464
+ a_result = previous_results["model_a"].loc[str_idx]
465
+ return {
466
+ "Index": str_idx,
467
+ "C_Decision": a_result["A_Decision"],
468
+ "C_Reason": "No disagreement between Model A and B"
469
+ }
470
+
471
+ def _validate_previous_results(self, idx: str, model_key: str, previous_results: Dict) -> bool:
472
+ """
473
+ Validate if previous model results exist for a given index.
474
+ Returns False if any required result is missing.
475
+ """
476
+ str_idx = str(idx)
477
+ if "model_a" not in previous_results:
478
+ raise Exception("Model A results required")
479
+ model_a_data = previous_results["model_a"]
480
+ if str_idx not in model_a_data.index.astype(str).values:
481
+ logging.warning(f"Missing Model A result for index {idx}")
482
+ return False
483
+
484
+ if model_key == "model_c":
485
+ if "model_b" not in previous_results:
486
+ raise Exception("Model B results required")
487
+ model_b_data = previous_results["model_b"]
488
+ if str_idx not in model_b_data.index.astype(str).values:
489
+ logging.warning(f"Missing Model B result for index {idx}")
490
+ return False
491
+
492
+ return True
493
+
494
+ def _check_disagreement(self, idx: str, previous_results: Dict) -> bool:
495
+ """
496
+ Check whether there is a disagreement between Model A and Model B for a given index.
497
+ Returns True if the decisions differ, otherwise False.
498
+ """
499
+ str_idx = str(idx)
500
+ a_result = previous_results["model_a"].loc[str_idx]
501
+ b_result = previous_results["model_b"].loc[str_idx]
502
+ return a_result["A_Decision"] != b_result["B_Decision"]
503
+
504
+ def _get_model_columns(self, model_key: str) -> List[str]:
505
+ """Get the expected columns for a specific model's output."""
506
+ if model_key == "model_a":
507
+ return ["A_Decision", "A_Reason", "A_P", "A_I", "A_C", "A_O", "A_S"]
508
+ elif model_key == "model_b":
509
+ return ["B_Decision", "B_Reason", "B_P", "B_I", "B_C", "B_O", "B_S"]
510
+ else: # model_c
511
+ return ["C_Decision", "C_Reason"]
app.py ADDED
@@ -0,0 +1,724 @@
1
+ import os
2
+ from dotenv import load_dotenv
3
+ import time
4
+ import logging
5
+ from datetime import datetime
6
+ import gradio as gr
7
+ from file_processor import FileProcessor
8
+ from analyzer import PICOSAnalyzer
9
+ from deduplicator import Deduplicator
10
+ from result_processor import ResultProcessor
11
+
12
+ # Configuration of directories
13
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
14
+ DATA_DIR = os.path.join(BASE_DIR, "data")
15
+ LOG_DIR = os.path.join(BASE_DIR, "logs")
16
+
17
+ # Load .env file if it exists
18
+ dotenv_path = os.path.join(os.path.dirname(__file__), '.env')
19
+ if os.path.exists(dotenv_path):
20
+ load_dotenv(dotenv_path)
21
+ else:
22
+ print("Warning: .env file not found.")
23
+
24
+ # Initialize components for analysis, file processing, deduplication, and result processing
25
+ analyzer = PICOSAnalyzer()
26
+ file_processor = FileProcessor(DATA_DIR)
27
+ model_results = {}
28
+ deduplicator = Deduplicator()
29
+ result_processor = ResultProcessor()
30
+
31
+ # Ensure required directories exist
32
+ for directory in [DATA_DIR, LOG_DIR]:
33
+ try:
34
+ os.makedirs(directory, exist_ok=True)
35
+ except Exception as e:
36
+ raise RuntimeError(f"Failed to create directory {directory}: {str(e)}")
37
+
38
+ # Configure logging: log to both a file and the console
39
+ try:
40
+ log_file = os.path.join(LOG_DIR, f"picos_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
41
+
42
+ # File handler for logging to a file
43
+ file_handler = logging.FileHandler(log_file, encoding='utf-8')
44
+ file_handler.setLevel(logging.INFO)
45
+
46
+ # Console handler for logging to the terminal
47
+ console_handler = logging.StreamHandler()
48
+ console_handler.setLevel(logging.INFO)
49
+
50
+ # Formatter for log messages
51
+ formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
52
+ file_handler.setFormatter(formatter)
53
+ console_handler.setFormatter(formatter)
54
+
55
+ # Configure the root logger
56
+ root_logger = logging.getLogger()
57
+
58
+ root_logger.setLevel(logging.INFO)
59
+ root_logger.addHandler(file_handler)
60
+ root_logger.addHandler(console_handler)
61
+ except Exception as e:
62
+ print(f"Failed to initialize logging: {str(e)}")
63
+ raise
64
+
65
+ def create_gradio_interface():
66
+ """Create and return the Gradio interface for the PICOS Analysis System."""
67
+
68
+ def parse_nbib(file) -> tuple:
69
+ """
70
+ Parse a citation file in NBIB format.
71
+ Returns a tuple containing the Excel output path and a preview text.
72
+ """
73
+ try:
74
+ if not file:
75
+ return None, "No file uploaded"
76
+
77
+ # Determine file type based on extension
78
+ file_extension = os.path.splitext(file.name)[1].lower()
79
+
80
+ if file_extension == '.nbib':
81
+ output_path, preview = file_processor.parse_nbib(file.name)
82
+ elif file_extension == '.ris':
83
+ # Read file content to determine RIS format (Embase or Web of Science)
84
+ with open(file.name, 'r', encoding='utf-8') as f:
85
+ content = f.read()
86
+ if 'T1 - ' in content: # Embase RIS format
87
+ output_path, preview = file_processor.parse_embase_ris(file.name)
88
+ else: # Assume Web of Science RIS format
89
+ output_path, preview = file_processor.parse_wos_ris(file.name)
90
+ else:
91
+ return None, "Unsupported file format. Please upload a .nbib or .ris file"
92
+
93
+ if not output_path:
94
+ return None, "Failed to parse file"
95
+
96
+ return output_path, preview
97
+
98
+ except Exception as e:
99
+ error_msg = f"Error parsing file: {str(e)}"
100
+ logging.error(error_msg)
101
+ return None, error_msg
102
+
103
+ def parse_scopus(file) -> tuple:
104
+ """
105
+ Parse a Scopus RIS file.
106
+ Returns a tuple containing the Excel output path and a preview text.
107
+ """
108
+ try:
109
+ if not file:
110
+ return None, "No file uploaded"
111
+ output_path, preview = file_processor.parse_scopus_ris(file.name)
112
+ if not output_path:
113
+ return None, "Failed to parse file"
114
+ return output_path, preview
115
+ except Exception as e:
116
+ error_msg = f"Error parsing Scopus file: {str(e)}"
117
+ logging.error(error_msg)
118
+ return None, error_msg
119
+
120
+ def update_picos_criteria(p, i, c, o, s):
121
+ """Update the PICOS criteria used for analysis."""
122
+ try:
123
+ analyzer.update_picos_criteria({
124
+ "population": p.strip(),
125
+ "intervention": i.strip(),
126
+ "comparison": c.strip(),
127
+ "outcome": o.strip(),
128
+ "study_design": s.strip()
129
+ })
130
+ return "✓ PICOS criteria updated successfully"
131
+ except Exception as e:
132
+ return f"❌ Error updating PICOS criteria: {str(e)}"
133
+
134
+ def update_model_settings(model_key, api_url, api_key, model_name, temperature, max_tokens, batch_size, threads, prompt, is_inference, timeout):
135
+ """Update the settings for a specified model."""
136
+ try:
137
+ analyzer.update_model_config(model_key, {
138
+ "api_url": api_url.strip(),
139
+ "api_key": api_key.strip(),
140
+ "model": model_name.strip(),
141
+ "temperature": float(temperature),
142
+ "max_tokens": int(max_tokens),
143
+ "batch_size": int(batch_size),
144
+ "threads": int(threads),
145
+ "is_inference": bool(is_inference),
146
+ "timeout": float(timeout),
147
+ "updated": True # mark as manually updated
148
+ })
149
+ analyzer.update_prompt(model_key, prompt.strip())
150
+ return "✓ Settings updated successfully"
151
+ except Exception as e:
152
+ return f"❌ Error updating settings: {str(e)}"
153
+
154
+ def test_connection(model_key):
155
+ """Test the API connection for a specified model."""
156
+ try:
157
+ result = analyzer.test_api_connection(model_key)
158
+ return result
159
+ except Exception as e:
160
+ return f"❌ Error testing connection: {str(e)}"
161
+
162
+ def process_model(input_file, model_key, model_a_input=None, model_b_input=None):
163
+ """
164
+ Process analysis for a single model and return the results.
165
+ For Model B and C, the required previous results files must be provided.
166
+ """
167
+ try:
168
+ logging.info(f"Loading input file for {model_key.upper()}...")
169
+ df = file_processor.load_excel(input_file.name)
170
+ if df is None:
171
+ return None, "Failed to load Excel file"
172
+
173
+ # For Model B, require Model A results; for Model C, require both Model A and B results
174
+ if model_key == "model_b":
175
+ if model_a_input is None or not os.path.exists(model_a_input.name):
176
+ return None, "Model A results file required for MODEL_B"
177
+ model_results["model_a"] = file_processor.load_excel(model_a_input.name)
178
+ elif model_key == "model_c":
179
+ logging.info("Loading Model A and B results for Model C analysis...")
180
+ if model_a_input is None or not os.path.exists(model_a_input.name) or \
181
+ model_b_input is None or not os.path.exists(model_b_input.name):
182
+ return None, "Both Model A and B results files required for MODEL_C"
183
+ model_results["model_a"] = file_processor.load_excel(model_a_input.name)
184
+ model_results["model_b"] = file_processor.load_excel(model_b_input.name)
185
+
186
+ # Process the model
187
+ logging.info(f"Starting {model_key.upper()} analysis...")
188
+ total_rows = len(df)
189
+ processed_rows = 0
190
+ errors = 0
191
+ empty_abstracts = 0
192
+ start_time = time.time()
193
+
194
+ def progress_callback(row_index, error=False, is_empty=False):
195
+ nonlocal processed_rows, errors, empty_abstracts
196
+ # Increase the count only when the actual processing is complete
197
+ if not error:
198
+ processed_rows += 1
199
+ elif is_empty:
200
+ empty_abstracts += 1
201
+ else:
202
+ errors += 1
203
+
204
+ # Calculate progress and time estimates
205
+ elapsed_time = time.time() - start_time
206
+ progress = processed_rows / total_rows
207
+ if progress > 0:
208
+ # Use moving averages to smooth time estimates
209
+ avg_time_per_item = elapsed_time / (processed_rows + errors + empty_abstracts)
210
+ remaining_items = total_rows - (processed_rows + errors + empty_abstracts)
211
+ remaining_time = avg_time_per_item * remaining_items
212
+
213
+ # Use the batch size of the model to control the log output frequency
214
+ batch_size = analyzer.model_manager.get_config(model_key)["batch_size"]
215
+ if (processed_rows + errors + empty_abstracts) % batch_size == 0:
216
+ logging.info(f"{model_key.upper()} Progress: {processed_rows + errors + empty_abstracts}/{total_rows} rows "
217
+ f"({(processed_rows + errors + empty_abstracts) / total_rows:.1%}) - "
218
+ f"Processed: {processed_rows}, Errors: {errors}, Empty: {empty_abstracts} - "
219
+ f"Elapsed: {elapsed_time:.1f}s, Remaining: {remaining_time:.1f}s")
220
+
221
+ results_df = analyzer.process_batch(df, model_key, model_results, progress_callback)
222
+
223
+ if results_df is None:
224
+ return None, f"{model_key.upper()} failed to process results"
225
+
226
+ # Save results immediately with fixed path in DATA_DIR
227
+ output_file = os.path.join(DATA_DIR, f"{model_key}_results.xlsx")
228
+ if model_key == "model_c":
229
+ # For Model C, merge all results before saving
230
+ merged_df = analyzer.merge_results(df, {
231
+ "model_a": model_results["model_a"],
232
+ "model_b": model_results["model_b"],
233
+ "model_c": results_df
234
+ })
235
+ if not file_processor.save_excel(merged_df, output_file):
236
+ return None, f"Failed to save {model_key.upper()} results"
237
+ else:
238
+ # For Model A and B, save individual results
239
+ if not file_processor.save_excel(results_df, output_file):
240
+ return None, f"Failed to save {model_key.upper()} results"
241
+
242
+ total_time = time.time() - start_time
243
+ completion_msg = (f"{model_key.upper()} analysis completed in {total_time:.1f}s - "
244
+ f"Processed {processed_rows} rows with {errors} errors")
245
+ logging.info(completion_msg)
246
+
247
+ # Return the full path to the saved file with gr.update
248
+ if os.path.exists(output_file):
249
+ return gr.update(value=output_file), completion_msg
250
+ else:
251
+ return None, f"Failed to verify {model_key.upper()} results file"
252
+
253
+ except Exception as e:
254
+ error_msg = f"Error in {model_key.upper()} analysis: {str(e)}"
255
+ logging.error(error_msg)
256
+ return None, error_msg
257
+
258
+ def merge_results_with_files(input_file, model_a_file, model_b_file, model_c_file):
259
+ """
260
+ Merge all model results from the provided files and export the merged results as an Excel file.
261
+ """
262
+ if not all([input_file, model_a_file, model_b_file]):
263
+ return None, "Original file, Model A and B results are required"
264
+
265
+ try:
266
+ df = file_processor.load_excel(input_file.name)
267
+ model_a_results = file_processor.load_excel(model_a_file.name)
268
+ model_b_results = file_processor.load_excel(model_b_file.name)
269
+ model_c_results = file_processor.load_excel(model_c_file.name) if model_c_file else None
270
+
271
+ if any(result is None for result in [df, model_a_results, model_b_results]):
272
+ return None, "Failed to load one or more required files"
273
+
274
+ model_results["model_a"] = model_a_results
275
+ model_results["model_b"] = model_b_results
276
+ if model_c_results is not None:
277
+ model_results["model_c"] = model_c_results
278
+
279
+ merged_df = analyzer.merge_results(df, model_results)
280
+
281
+ final_filename = os.path.join(DATA_DIR, "final_results.xlsx")
282
+ result_processor.export_to_excel(merged_df, final_filename)
283
+
284
+ return final_filename, "Results merged successfully"
285
+ except Exception as e:
286
+ return None, f"Error merging results: {str(e)}"
287
+
288
+ def run_all_models(input_file):
289
+ """Run analysis pipeline for all models with streaming updates"""
290
+ try:
291
+ # Read Excel file using file processor
292
+ df = file_processor.load_excel(input_file.name)
293
+ if df is None:
294
+ yield [None, None, None, None, "Failed to load input file"]
295
+ return
296
+
297
+ # --- Process Model A ---
298
+ logging.info("Starting Model A analysis...")
299
+ model_a_results = analyzer.process_batch(df, "model_a")
300
+ if model_a_results is None:
301
+ yield [None, None, None, None, "Model A failed to process results"]
302
+ return
303
+
304
+ # Save Model A results with fixed path
305
+ model_a_path = os.path.join(DATA_DIR, "model_a_results.xlsx")
306
+ if not file_processor.save_excel(model_a_results, model_a_path):
307
+ yield [None, None, None, None, "Failed to save Model A results"]
308
+ return
309
+ model_results["model_a"] = model_a_results
310
+ status_msg = "Model A completed successfully"
311
+ # Yield update: Model A result available
312
+ yield [gr.update(value=model_a_path), None, None, None, status_msg]
313
+
314
+ # --- Process Model B ---
315
+ logging.info("Starting Model B analysis...")
316
+ model_b_results = analyzer.process_batch(df, "model_b", {"model_a": model_a_results})
317
+ if model_b_results is None:
318
+ yield [gr.update(value=model_a_path), None, None, None, "Model B failed to process results"]
319
+ return
320
+
321
+ # Save Model B results with fixed path
322
+ model_b_path = os.path.join(DATA_DIR, "model_b_results.xlsx")
323
+ if not file_processor.save_excel(model_b_results, model_b_path):
324
+ yield [gr.update(value=model_a_path), None, None, None, "Failed to save Model B results"]
325
+ return
326
+ model_results["model_b"] = model_b_results
327
+ status_msg = "Model B completed successfully"
328
+ # Yield update: Both Model A and B results available
329
+ yield [gr.update(value=model_a_path), gr.update(value=model_b_path), None, None, status_msg]
330
+
331
+ # --- Process Model C ---
332
+ logging.info("Starting Model C analysis...")
333
+ model_c_results = analyzer.process_batch(df, "model_c", {
334
+ "model_a": model_a_results,
335
+ "model_b": model_b_results
336
+ })
337
+
338
+ model_c_path = None
339
+ if model_c_results is not None:
340
+ # Save Model C results with fixed path
341
+ model_c_path = os.path.join(DATA_DIR, "model_c_results.xlsx")
342
+ if not file_processor.save_excel(model_c_results, model_c_path):
343
+ yield [gr.update(value=model_a_path), gr.update(value=model_b_path), None, None, "Failed to save Model C results"]
344
+ return
345
+ model_results["model_c"] = model_c_results
346
+ status_msg = "Model C completed successfully"
347
+ # Yield update: Model A, B and C results available
348
+ yield [gr.update(value=model_a_path), gr.update(value=model_b_path), gr.update(value=model_c_path), None, status_msg]
349
+
350
+ # Merge results
351
+ logging.info("Merging results...")
352
+ merged_df = analyzer.merge_results(df, model_results)
353
+
354
+ # Save final results with fixed path
355
+ final_path = os.path.join(DATA_DIR, "final_results.xlsx")
356
+ if not file_processor.save_excel(merged_df, final_path):
357
+ yield [gr.update(value=model_a_path), gr.update(value=model_b_path), gr.update(value=model_c_path), None, "Failed to save final results"]
358
+ return
359
+
360
+ completion_msg = "All models completed successfully"
361
+ # Yield final update with all results available
362
+ yield [gr.update(value=model_a_path), gr.update(value=model_b_path), gr.update(value=model_c_path), gr.update(value=final_path), completion_msg]
363
+
364
+ except Exception as e:
365
+ error_msg = f"Error in pipeline: {str(e)}"
366
+ logging.error(error_msg)
367
+ yield [None, None, None, None, error_msg]
368
+
369
+ def process_deduplication(files, threshold):
370
+ """
371
+ Process deduplication for multiple Excel files.
372
+ The function identifies duplicate entries based on a similarity threshold.
373
+ """
374
+ try:
375
+ if not files:
376
+ return None, None, "No files uploaded"
377
+
378
+ dataframes = []
379
+ for file in files:
380
+ if not file:
381
+ continue
382
+ df = file_processor.load_excel(file.name)
383
+ if df is None:
384
+ return None, None, f"Failed to load file: {file.name}"
385
+ dataframes.append(df)
386
+
387
+ if not dataframes:
388
+ return None, None, "No valid files to process"
389
+
390
+ unique_df, clusters_df = deduplicator.process_dataframes(dataframes, threshold)
391
+
392
+ unique_path = file_processor.save_excel(unique_df, "deduplicated_data.xlsx")
393
+ clusters_path = file_processor.save_excel(clusters_df, "duplicate_clusters.xlsx")
394
+
395
+ if not unique_path or not clusters_path:
396
+ return None, None, "Failed to save results"
397
+
398
+ status_msg = f"Deduplication completed successfully:\n"
399
+ status_msg += f"Original entries: {sum(len(df) for df in dataframes)}\n"
400
+ status_msg += f"Unique entries: {len(unique_df)}\n"
401
+ status_msg += f"Duplicate clusters: {len(clusters_df['Cluster_ID'].unique()) if len(clusters_df) > 0 else 0}"
402
+
403
+ return unique_path, clusters_path, status_msg
404
+
405
+ except Exception as e:
406
+ error_msg = f"Error in deduplication: {str(e)}"
407
+ logging.error(error_msg)
408
+ return None, None, error_msg
409
+
410
+ # Build the Gradio interface
411
+ interface = gr.Blocks(title="PICOS Analysis System")
412
+
413
+ with interface:
414
+ gr.Markdown("""
415
+ <div style="text-align: center;">
416
+ <h1>PICOS Literature Analysis System</h1>
417
+ <p>This system uses a multi-model approach to analyze medical literature abstracts.</p>
418
+ </div>
419
+ """)
420
+
421
+ with gr.Tab("Instructions"):
422
+ gr.Markdown("""
423
+ ## System Overview
424
+ This system helps researchers analyze medical literature by providing tools for citation management,
425
+ deduplication, and automated PICOS analysis using multiple language models.
426
+
427
+ ## Workflow Steps
428
+ **Citation Processing** -> **Deduplication** (Optional) -> **PICOS Analysis Setup** -> **Analysis Execution**
429
+
430
+ ## File Format Requirements
431
+ ### Input Files
432
+ - **Pubmed**: NBIB format (.nbib)
433
+ - **Embase**: RIS format (.ris)
434
+ - **Web of Science**: RIS format (.ris)
435
+ - **Scopus**: RIS format (.ris)
436
+
437
+ ### Processed Format
438
+ The system will generate standardized Excel files (XLSX format) with these columns:
439
+ - **Index**: Unique identifier for each abstract
440
+ - **Title**: Article title
441
+ - **Authors**: Author list (semicolon-separated)
442
+ - **Abstract**: Full abstract text
443
+ - **DOI**: Digital Object Identifier (when available)
444
+
445
+ ### Analysis Results
446
+ Each model will generate an Excel file containing:
447
+ - All original citation data
448
+ - PICOS analysis results
449
+ - Inclusion/exclusion decisions
450
+ - Reasoning for decisions
451
+ """)
452
+
453
+ with gr.Tab("Citation File Processing"):
454
+ with gr.Tab("Pubmed"):
455
+ gr.Markdown("""
456
+ ## Pubmed NBIB Processing
457
+ Upload a .nbib file from Pubmed to extract and convert it to Excel format. The extracted data will include:
458
+ - DOI
459
+ - Title
460
+ - Authors
461
+ - Abstract
462
+ """)
463
+
464
+ with gr.Row():
465
+ nbib_file = gr.File(label="Upload NBIB File", file_types=[".nbib"])
466
+ process_nbib_btn = gr.Button("Process NBIB File")
467
+
468
+ with gr.Row():
469
+ nbib_preview = gr.Textbox(label="Preview", lines=20)
470
+ nbib_output = gr.File(label="Download Excel")
471
+
472
+ process_nbib_btn.click(
473
+ parse_nbib,
474
+ inputs=[nbib_file],
475
+ outputs=[nbib_output, nbib_preview]
476
+ )
477
+
478
+ with gr.Tab("Embase"):
479
+ gr.Markdown("""
480
+ ## Embase RIS Processing
481
+ Upload a .ris file from Embase to extract and convert it to Excel format. The extracted data will include:
482
+ - DOI
483
+ - Title
484
+ - Authors
485
+ - Abstract
486
+ """)
487
+
488
+ with gr.Row():
489
+ embase_file = gr.File(label="Upload Embase RIS File", file_types=[".ris"])
490
+ process_embase_btn = gr.Button("Process Embase RIS File")
491
+
492
+ with gr.Row():
493
+ embase_preview = gr.Textbox(label="Preview", lines=20)
494
+ embase_output = gr.File(label="Download Excel")
495
+
496
+ process_embase_btn.click(
497
+ parse_nbib,
498
+ inputs=[embase_file],
499
+ outputs=[embase_output, embase_preview]
500
+ )
501
+
502
+ with gr.Tab("Web of Science"):
503
+ gr.Markdown("""
504
+ ## Web of Science RIS Processing
505
+ Upload a .ris file from Web of Science to extract and convert it to Excel format. The extracted data will include:
506
+ - DOI
507
+ - Title
508
+ - Authors
509
+ - Abstract
510
+ """)
511
+
512
+ with gr.Row():
513
+ wos_file = gr.File(label="Upload WOS RIS File", file_types=[".ris"])
514
+ process_wos_btn = gr.Button("Process WOS RIS File")
515
+
516
+ with gr.Row():
517
+ wos_preview = gr.Textbox(label="Preview", lines=20)
518
+ wos_output = gr.File(label="Download Excel")
519
+
520
+ process_wos_btn.click(
521
+ lambda file: parse_nbib(file) if file else (None, "No file uploaded"),
522
+ inputs=[wos_file],
523
+ outputs=[wos_output, wos_preview]
524
+ )
525
+
526
+ with gr.Tab("Scopus"):
527
+ gr.Markdown("""
528
+ ## Scopus RIS Processing
529
+ Upload a .ris file from Scopus to extract and convert it to Excel format. The extracted data will include:
530
+ - DOI
531
+ - Title
532
+ - Authors
533
+ - Abstract
534
+ """)
535
+
536
+ with gr.Row():
537
+ scopus_file = gr.File(label="Upload Scopus RIS File", file_types=[".ris"])
538
+ process_scopus_btn = gr.Button("Process Scopus RIS File")
539
+
540
+ with gr.Row():
541
+ scopus_preview = gr.Textbox(label="Preview", lines=20)
542
+ scopus_output = gr.File(label="Download Excel")
543
+
544
+ process_scopus_btn.click(
545
+ parse_scopus,
546
+ inputs=[scopus_file],
547
+ outputs=[scopus_output, scopus_preview]
548
+ )
549
+
550
+ with gr.Tab("Deduplication"):
551
+ gr.Markdown("""
552
+ ## Citation Deduplication
553
+ Upload multiple Excel files to remove duplicate entries across different citation sources.
554
+ The system will identify similar entries based on title and author information.
555
+
556
+ ### Features:
557
+ - Support for multiple Excel files
558
+ - Adjustable similarity threshold
559
+ - Detailed duplicate clusters report
560
+ - Standardized output format
561
+ """)
562
+
563
+ with gr.Row():
564
+ input_files = gr.File(
565
+ label="Upload Excel Files",
566
+ file_types=[".xlsx", ".xls"],
567
+ file_count="multiple"
568
+ )
569
+ threshold = gr.Slider(
570
+ label="Similarity Threshold",
571
+ minimum=0.1,
572
+ maximum=1.0,
573
+ value=0.8,
574
+ step=0.05,
575
+ info="Higher values mean stricter matching (0.8 recommended)"
576
+ )
577
+
578
+ with gr.Row():
579
+ process_btn = gr.Button("Process Deduplication")
580
+
581
+ with gr.Row():
582
+ status = gr.Textbox(label="Status", lines=5)
583
+
584
+ with gr.Row():
585
+ unique_output = gr.File(label="Download Deduplicated Data")
586
+ clusters_output = gr.File(label="Download Duplicate Clusters")
587
+
588
+ process_btn.click(
589
+ process_deduplication,
590
+ inputs=[input_files, threshold],
591
+ outputs=[unique_output, clusters_output, status]
592
+ )
593
+
594
+ with gr.Tab("LLM Analysis"):
595
+ with gr.Tab("PICOS Criteria"):
596
+ gr.Markdown("""
597
+ ## PICOS Criteria Settings
598
+ Define the standard PICOS criteria shared by all three models.
+ Each article will be evaluated against these criteria.
600
+ """)
601
+
602
+ with gr.Group("Standard PICOS Criteria"):
603
+ population = gr.Textbox(label="Population", value=analyzer.picos_criteria["population"],
604
+ placeholder="e.g., patients with hepatocellular carcinoma")
605
+ intervention = gr.Textbox(label="Intervention", value=analyzer.picos_criteria["intervention"],
606
+ placeholder="e.g., immunotherapy or targeted therapy")
607
+ comparison = gr.Textbox(label="Comparison", value=analyzer.picos_criteria["comparison"],
608
+ placeholder="e.g., standard therapy or placebo")
609
+ outcome = gr.Textbox(label="Outcome", value=analyzer.picos_criteria["outcome"],
610
+ placeholder="e.g., survival or response rate")
611
+ study_design = gr.Textbox(label="Study Design", value=analyzer.picos_criteria["study_design"],
612
+ placeholder="e.g., randomized controlled trial")
613
+
614
+ update_picos_btn = gr.Button("Update PICOS Criteria")
615
+ picos_status = gr.Textbox(label="Status")
616
+
617
+ update_picos_btn.click(
618
+ update_picos_criteria,
619
+ inputs=[population, intervention, comparison, outcome, study_design],
620
+ outputs=picos_status
621
+ )
622
+
623
+ with gr.Tab("Model Settings"):
624
+ for model_key in ["model_a", "model_b", "model_c"]:
625
+ with gr.Group(f"{model_key.upper()} Settings"):
626
+ config = analyzer.model_manager.get_config(model_key)
627
+ api_url = gr.Textbox(label="API URL", value=config["api_url"])
628
+ api_key = gr.Textbox(label="API Key", value=config["api_key"])
629
+ model_name = gr.Textbox(label="Model", value=config["model"])
630
+ is_inference = gr.Checkbox(
631
+ label="Inference Model",
632
+ value=config.get("is_inference", False),
633
+ info="Enable inference compatibility mode for models that return reasoning process"
634
+ )
635
+ temperature = gr.Slider(label="Temperature", minimum=0, maximum=10, value=config["temperature"])
636
+ max_tokens = gr.Number(label="Max Tokens", value=config["max_tokens"])
637
+ batch_size = gr.Number(label="Batch Size", value=config["batch_size"])
638
+ threads = gr.Slider(label="Threads", minimum=1, maximum=32, step=1, value=config["threads"])
639
+ timeout = gr.Number(label="Timeout (seconds)", value=config.get("timeout", 180))
640
+ prompt = gr.Textbox(label="Prompt Template", value=analyzer.prompt_manager.get_prompt(model_key), lines=10)
641
+
642
+ update_btn = gr.Button(f"Update {model_key.upper().replace('_', ' ')} Settings")
643
+ test_btn = gr.Button(f"Test {model_key.upper().replace('_', ' ')} Connection")
644
+ status = gr.Textbox(label="Status", lines=10)
645
+
646
+ update_btn.click(
647
+ update_model_settings,
648
+ inputs=[gr.Textbox(value=model_key, visible=False),
649
+ api_url,
650
+ api_key,
651
+ model_name,
652
+ temperature,
653
+ max_tokens,
654
+ batch_size,
655
+ threads,
656
+ prompt,
657
+ is_inference,
658
+ timeout],
659
+ outputs=status
660
+ )
661
+ test_btn.click(
662
+ test_connection,
663
+ inputs=[gr.Textbox(value=model_key, visible=False)],
664
+ outputs=status
665
+ )
666
+
667
+ with gr.Tab("Analysis"):
668
+ with gr.Row():
669
+ input_file = gr.File(label="Original Excel File")
670
+ model_a_input = gr.File(label="Model A Results")
671
+ model_b_input = gr.File(label="Model B Results")
672
+ model_c_input = gr.File(label="Model C Results")
673
+
674
+ with gr.Row():
675
+ model_a_btn = gr.Button("Run Model A")
676
+ model_b_btn = gr.Button("Run Model B")
677
+ model_c_btn = gr.Button("Run Model C")
678
+ merge_btn = gr.Button("Merge Results")
679
+ # Run All triggers the full pipeline; intermediate updates are streamed back as each model finishes
680
+ run_all_btn = gr.Button("Run All", variant="primary")
681
+
682
+ status = gr.Textbox(label="Status")
683
+
684
+ with gr.Row():
685
+ model_a_output = gr.File(label="Model A Results", interactive=True)
686
+ model_b_output = gr.File(label="Model B Results", interactive=True)
687
+ model_c_output = gr.File(label="Model C Results", interactive=True)
688
+ final_output = gr.File(label="Final Results", interactive=True)
689
+
690
+ # Individual model runs
691
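+ # Each later model also receives the earlier models' result files: Model B reviews Model A's output, and Model C arbitrates using both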
+ model_a_btn.click(
692
+ lambda x: process_model(x, "model_a"),
693
+ inputs=[input_file],
694
+ outputs=[model_a_output, status]
695
+ )
696
+ model_b_btn.click(
697
+ lambda x, y: process_model(x, "model_b", y),
698
+ inputs=[input_file, model_a_input],
699
+ outputs=[model_b_output, status]
700
+ )
701
+ model_c_btn.click(
702
+ lambda x, y, z: process_model(x, "model_c", y, z),
703
+ inputs=[input_file, model_a_input, model_b_input],
704
+ outputs=[model_c_output, status]
705
+ )
706
+ merge_btn.click(
707
+ merge_results_with_files,
708
+ inputs=[input_file, model_a_input, model_b_input, model_c_input],
709
+ outputs=[final_output, status]
710
+ )
711
+ run_all_btn.click(
712
+ fn=run_all_models,
713
+ inputs=[input_file],
714
+ outputs=[model_a_output, model_b_output, model_c_output, final_output, status]
715
+ )
716
+
717
+ return interface
718
+
719
+ if __name__ == "__main__":
720
+ interface = create_gradio_interface()
721
+ if interface:
722
+ interface.launch(server_name="0.0.0.0", server_port=7860, pwa=True)
723
+ else:
724
+ print("Error: Failed to create Gradio interface")
deduplicator.py ADDED
@@ -0,0 +1,183 @@
1
+ import pandas as pd
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ import logging
5
+
6
+ class Deduplicator:
7
+ def __init__(self):
8
+ """Initialize Deduplicator with required columns for processing"""
9
+ self.required_columns = ['Title', 'Authors', 'Abstract', 'DOI']
10
+
11
+ def validate_dataframe(self, df):
12
+ """
13
+ Validate if dataframe has required columns
14
+
15
+ Args:
16
+ df: DataFrame to validate
17
+
18
+ Returns:
19
+ bool: True if validation passes
20
+
21
+ Raises:
22
+ ValueError: If required columns are missing
23
+ """
24
+ missing_cols = [col for col in self.required_columns if col not in df.columns]
25
+ if missing_cols:
26
+ raise ValueError(f"Missing required columns: {', '.join(missing_cols)}")
27
+ return True
28
+
29
+ def process_dataframes(self, dataframes, threshold=0.8):
30
+ """
31
+ Process multiple dataframes and remove duplicates
32
+
33
+ Args:
34
+ dataframes: List of DataFrames to process
35
+ threshold: Similarity threshold for duplicate detection (default: 0.8)
36
+
37
+ Returns:
38
+ tuple: (unique_df, clusters_df) where:
39
+ - unique_df: DataFrame containing unique entries
40
+ - clusters_df: DataFrame containing duplicate clusters
41
+
42
+ Raises:
43
+ Exception: If deduplication process fails
44
+ """
45
+ try:
46
+ # Validate and combine dataframes
47
+ for df in dataframes:
48
+ self.validate_dataframe(df)
49
+
50
+ combined_df = pd.concat(dataframes, ignore_index=True)
51
+
52
+ # Create Title_Author column for similarity comparison
53
+ combined_df['Title_Author'] = combined_df['Title'].fillna('') + ' ' + combined_df['Authors'].fillna('')
54
+
55
+ # Find duplicate clusters
56
+ clusters_df, unique_df = self.find_duplicate_clusters(combined_df, threshold)
57
+
58
+ # Ensure output format consistency
59
+ unique_df = self.standardize_output(unique_df)
60
+ clusters_df = self.standardize_clusters(clusters_df)
61
+
62
+ return unique_df, clusters_df
63
+
64
+ except Exception as e:
65
+ logging.error(f"Error in deduplication process: {str(e)}")
66
+ raise
67
+
68
+ def find_duplicate_clusters(self, df, threshold):
69
+ """
70
+ Find duplicate clusters using TF-IDF and cosine similarity
71
+
72
+ Args:
73
+ df: DataFrame to process
74
+ threshold: Similarity threshold for duplicate detection
75
+
76
+ Returns:
77
+ tuple: (clusters_df, unique_df) where:
78
+ - clusters_df: DataFrame containing duplicate clusters
79
+ - unique_df: DataFrame containing unique entries
80
+ """
81
+ # Create TF-IDF vectors for similarity comparison
82
+ vectorizer = TfidfVectorizer().fit_transform(df['Title_Author'])
83
+ cosine_sim = cosine_similarity(vectorizer)
84
+
85
+ n = cosine_sim.shape[0]
86
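+ # Union-find (disjoint set): parent[i] points toward the representative of row i's duplicate cluster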
+ parent = list(range(n))
87
+
88
+ def find(x):
89
+ """Find the root of a cluster using path compression"""
90
+ if parent[x] != x:
91
+ parent[x] = find(parent[x])
92
+ return parent[x]
93
+
94
+ def union(x, y):
95
+ """Union two clusters by rank"""
96
+ rootX = find(x)
97
+ rootY = find(y)
98
+ if rootX != rootY:
99
+ parent[rootY] = rootX
100
+
101
+ # Build clusters using union-find
102
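+ # Pairwise comparison is O(n^2); any pair whose similarity exceeds the threshold is merged into the same cluster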
+ for i in range(n):
103
+ for j in range(i + 1, n):
104
+ if cosine_sim[i, j] > threshold:
105
+ union(i, j)
106
+
107
+ # Collect clusters and prepare output
108
+ clusters = {}
109
+ for i in range(n):
110
+ root = find(i)
111
+ if root not in clusters:
112
+ clusters[root] = []
113
+ clusters[root].append(i)
114
+
115
+ # Prepare output dataframes
116
+ cluster_data = []
117
+ unique_indices = []
118
+
119
+ for cluster_id, indices in clusters.items():
120
+ if len(indices) > 1:
121
+ for index in indices:
122
+ cluster_data.append({
123
+ "Cluster_ID": cluster_id,
124
+ "Index": index,
125
+ "Title": df.iloc[index]["Title"],
126
+ "Authors": df.iloc[index]["Authors"],
127
+ "DOI": df.iloc[index]["DOI"],
128
+ "Abstract": df.iloc[index]["Abstract"]
129
+ })
130
+ unique_indices.append(indices[0]) # Keep first occurrence
131
+ else:
132
+ unique_indices.extend(indices)
133
+
134
+ clusters_df = pd.DataFrame(cluster_data) if cluster_data else pd.DataFrame(columns=["Cluster_ID", "Index", "Title", "Authors", "DOI", "Abstract"])
135
+ unique_df = df.iloc[unique_indices].copy()
136
+
137
+ # Reset index to ensure it starts from 0
138
+ unique_df = unique_df.reset_index(drop=True)
139
+ # Name the index 'Index' to match the NBIB/RIS extraction output format
140
+ unique_df.index.name = 'Index'
141
+
142
+ return clusters_df, unique_df
143
+
144
+ def standardize_output(self, df):
145
+ """
146
+ Ensure output dataframe has consistent format
147
+
148
+ Args:
149
+ df: DataFrame to standardize
150
+
151
+ Returns:
152
+ DataFrame with standardized format
153
+ """
154
+ # Make sure the index is named 'Index' (df.index.name may be None at this point)
+ if df.index.name != 'Index':
+ df = df.reset_index(drop=True)
+ df.index.name = 'Index'
158
+
159
+ # Ensure all required columns exist
160
+ required_columns = ['Title', 'Authors', 'Abstract', 'DOI']
161
+ for col in required_columns:
162
+ if col not in df.columns:
163
+ df[col] = ''
164
+
165
+ # Select and order columns while preserving the index
166
+ df = df[required_columns]
167
+ return df
168
+
169
+ def standardize_clusters(self, df):
170
+ """
171
+ Ensure clusters dataframe has consistent format
172
+
173
+ Args:
174
+ df: DataFrame containing cluster information
175
+
176
+ Returns:
177
+ DataFrame with standardized cluster format
178
+ """
179
+ required_columns = ['Cluster_ID', 'Index', 'Title', 'Authors', 'DOI', 'Abstract']
180
+ for col in required_columns:
181
+ if col not in df.columns:
182
+ df[col] = ''
183
+ return df[required_columns]
file_processor.py ADDED
@@ -0,0 +1,407 @@
1
+ import os
2
+ import pandas as pd
3
+ import logging
4
+ import re
5
+ from typing import Tuple, Optional
6
+
7
+ class FileProcessor:
8
+ def __init__(self, data_dir: str):
9
+ """
10
+ Initialize FileProcessor
11
+
12
+ Args:
13
+ data_dir: Directory path for storing processed data
14
+ """
15
+ self.data_dir = data_dir
16
+
17
+ def parse_nbib(self, file_path: str) -> Tuple[Optional[str], str]:
18
+ """
19
+ Parse NBIB file and return Excel output path and preview text
20
+
21
+ Args:
22
+ file_path: Path to the NBIB file to parse
23
+
24
+ Returns:
25
+ tuple: (output_path, preview_text) where:
26
+ - output_path: Path to the generated Excel file (None if parsing fails)
27
+ - preview_text: Preview of the parsed data or error message
28
+ """
29
+ if not file_path or not os.path.exists(file_path):
30
+ return None, "Invalid file"
31
+
32
+ try:
33
+ records = []
34
+ record = {}
35
+ authors = []
36
+ current_field = None
37
+
38
+ with open(file_path, 'r', encoding='utf-8') as f:
39
+ lines = f.readlines()
40
+
41
+ if not lines:
42
+ return None, "Empty file"
43
+
44
+ # Process each line in the NBIB file
45
+ for line in lines:
46
+ if line.startswith('TI - '):
47
+ record['Title'] = line.replace('TI - ', '').strip()
48
+ current_field = 'Title'
49
+ elif line.startswith('AB - '):
50
+ record['Abstract'] = line.replace('AB - ', '').strip()
51
+ current_field = 'Abstract'
52
+ elif line.startswith('AU - '):
53
+ authors.append(line.replace('AU - ', '').strip())
54
+ current_field = None
55
+ elif line.startswith('LID - '):
56
+ if '[doi]' in line:
57
+ doi_part = line.replace('LID - ', '').strip()
58
+ record['DOI'] = doi_part.replace(' [doi]', '').strip()
59
+ current_field = None
60
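+ # A new PMID tag marks the start of the next record, so flush the record collected so far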
+ elif line.startswith('PMID- '):
61
+ if record: # Save the previous record
62
+ record['Authors'] = '; '.join(authors)
63
+ records.append(record)
64
+ record = {}
65
+ authors = []
66
+ current_field = None
67
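+ # Indented lines are wrapped continuations of the field currently being read (Title or Abstract)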
+ elif line.startswith(' ') and current_field in ['Abstract', 'Title']:
68
+ record[current_field] += ' ' + line.strip()
69
+
70
+ # Save the last record if exists
71
+ if record:
72
+ record['Authors'] = '; '.join(authors)
73
+ records.append(record)
74
+
75
+ # Create DataFrame and save to Excel
76
+ df = pd.DataFrame(records)
77
+ df.index.name = 'Index'
78
+ output_path = os.path.join(self.data_dir, "extracted_data.xlsx")
79
+ df.to_excel(output_path, index=True)
80
+ preview = self._generate_preview(records)
81
+
82
+ return output_path, preview
83
+
84
+ except Exception as e:
85
+ return None, f"Error processing NBIB file: {str(e)}"
86
+
87
+ def parse_wos_ris(self, file_path: str) -> Tuple[Optional[str], str]:
88
+ """
89
+ Parse Web of Science RIS file and return Excel output path and preview text
90
+
91
+ Args:
92
+ file_path: Path to the WOS RIS file to parse
93
+
94
+ Returns:
95
+ tuple: (output_path, preview_text) where:
96
+ - output_path: Path to the generated Excel file (None if parsing fails)
97
+ - preview_text: Preview of the parsed data or error message
98
+ """
99
+ if not file_path or not os.path.exists(file_path):
100
+ return None, "Invalid file"
101
+
102
+ try:
103
+ records = []
104
+ record = {}
105
+ authors = []
106
+ current_field = None
107
+
108
+ with open(file_path, 'r', encoding='utf-8') as f:
109
+ content = f.read()
110
+
111
+ if not content:
112
+ return None, "Empty file"
113
+
114
+ # Split content into individual articles
115
+ articles = content.split("\nER -")
116
+
117
+ for article in articles:
118
+ if not article.strip():
119
+ continue
120
+
121
+ record = {}
122
+ authors = []
123
+
124
+ # Process each line in the article
125
+ lines = article.strip().split('\n')
126
+ for line in lines:
127
+ if not line.strip():
128
+ continue
129
+ if line.startswith('TI - '):
130
+ record['Title'] = line.replace('TI - ', '').strip()
131
+ elif line.startswith('AB - '):
132
+ record['Abstract'] = line.replace('AB - ', '').strip()
133
+ elif line.startswith('AU - '):
134
+ authors.append(line.replace('AU - ', '').strip())
135
+ elif line.startswith('DO - '):
136
+ record['DOI'] = line.replace('DO - ', '').strip()
137
+ elif line.startswith(' '):
138
+ if 'Abstract' in record:
139
+ record['Abstract'] += ' ' + line.strip()
140
+ elif 'Title' in record:
141
+ record['Title'] += ' ' + line.strip()
142
+
143
+ if record:
144
+ record['Authors'] = '; '.join(authors)
145
+ records.append(record)
146
+
147
+ # Create DataFrame with required columns
148
+ df = pd.DataFrame(records)
149
+ required_columns = ['Title', 'Abstract', 'Authors', 'DOI']
150
+ for col in required_columns:
151
+ if col not in df.columns:
152
+ df[col] = ''
153
+ df.index.name = 'Index'
154
+ output_path = os.path.join(self.data_dir, "extracted_data.xlsx")
155
+ df.to_excel(output_path, index=True)
156
+ preview = self._generate_preview(records)
157
+
158
+ return output_path, preview
159
+
160
+ except Exception as e:
161
+ return None, f"Error processing WOS RIS file: {str(e)}"
162
+
163
+ def parse_embase_ris(self, file_path: str) -> Tuple[Optional[str], str]:
164
+ """
165
+ Parse Embase RIS file and return Excel output path and preview text
166
+
167
+ Args:
168
+ file_path: Path to the Embase RIS file to parse
169
+
170
+ Returns:
171
+ tuple: (output_path, preview_text) where:
172
+ - output_path: Path to the generated Excel file (None if parsing fails)
173
+ - preview_text: Preview of the parsed data or error message
174
+ """
175
+ if not file_path or not os.path.exists(file_path):
176
+ return None, "Invalid file"
177
+
178
+ try:
179
+ records = []
180
+ record = {}
181
+ authors = []
182
+ current_field = None
183
+
184
+ with open(file_path, 'r', encoding='utf-8') as f:
185
+ content = f.read()
186
+
187
+ if not content:
188
+ return None, "Empty file"
189
+
190
+ # Split content into individual articles
191
+ articles = content.split("\n\n")
192
+
193
+ for article in articles:
194
+ if not article.strip():
195
+ continue
196
+
197
+ record = {}
198
+ authors = []
199
+
200
+ # Process each line in the article
201
+ lines = article.strip().split('\n')
202
+ for line in lines:
203
+ if not line.strip():
204
+ continue
205
+ if line.startswith('T1 - '): # Title field
206
+ record['Title'] = line.replace('T1 - ', '').strip()
207
+ elif line.startswith('N2 - '): # Abstract field
208
+ record['Abstract'] = line.replace('N2 - ', '').strip()
209
+ elif line.startswith('A1 - '): # Authors field
210
+ authors.append(line.replace('A1 - ', '').strip())
211
+ elif line.startswith('DO - '): # DOI field
212
+ record['DOI'] = line.replace('DO - ', '').strip()
213
+ elif line.startswith(' '): # Handle multi-line fields
214
+ if 'Abstract' in record:
215
+ record['Abstract'] += ' ' + line.strip()
216
+ elif 'Title' in record:
217
+ record['Title'] += ' ' + line.strip()
218
+
219
+ if record:
220
+ record['Authors'] = '; '.join(authors) if authors else ''
221
+ records.append(record)
222
+
223
+ # Create DataFrame with required columns
224
+ df = pd.DataFrame(records)
225
+ required_columns = ['Title', 'Abstract', 'Authors', 'DOI']
226
+ for col in required_columns:
227
+ if col not in df.columns:
228
+ df[col] = ''
229
+ df.index.name = 'Index'
230
+ output_path = os.path.join(self.data_dir, "extracted_data.xlsx")
231
+ df.to_excel(output_path, index=True)
232
+ preview = self._generate_preview(records)
233
+
234
+ return output_path, preview
235
+
236
+ except Exception as e:
237
+ return None, f"Error processing Embase RIS file: {str(e)}"
238
+
239
+ def parse_scopus_ris(self, file_path: str) -> Tuple[Optional[str], str]:
240
+ """
241
+ Parse Scopus RIS file and return Excel output path and preview text
242
+
243
+ Args:
244
+ file_path: Path to the Scopus RIS file to parse
245
+
246
+ Returns:
247
+ tuple: (output_path, preview_text) where:
248
+ - output_path: Path to the generated Excel file (None if parsing fails)
249
+ - preview_text: Preview of the parsed data or error message
250
+ """
251
+ if not file_path or not os.path.exists(file_path):
252
+ return None, "Invalid file"
253
+
254
+ try:
255
+ records = []
256
+ with open(file_path, 'r', encoding='utf-8') as f:
257
+ content = f.read()
258
+ if not content:
259
+ return None, "Empty file"
260
+
261
+ # Split records on the "ER  -" end-of-record tag, allowing variable whitespace around the dash
262
+ articles = re.split(r'\nER\s*-\s*', content)
263
+
264
+ for article in articles:
265
+ if not article.strip():
266
+ continue
267
+ record = {}
268
+ authors = []
269
+ lines = article.strip().split('\n')
270
+ for line in lines:
271
+ line = line.strip()
272
+ if not line:
273
+ continue
274
+ if line.startswith('TI - '):
275
+ record['Title'] = line.replace('TI - ', '').strip()
276
+ elif line.startswith('AB - '):
277
+ record['Abstract'] = line.replace('AB - ', '').strip()
278
+ elif line.startswith('AU - '):
279
+ authors.append(line.replace('AU - ', '').strip())
280
+ elif line.startswith('DO - '):
281
+ record['DOI'] = line.replace('DO - ', '').strip()
282
+ elif line.startswith(' '):
283
+ if 'Abstract' in record:
284
+ record['Abstract'] += ' ' + line.strip()
285
+ elif 'Title' in record:
286
+ record['Title'] += ' ' + line.strip()
287
+ record['Authors'] = '; '.join(authors)
288
+ records.append(record)
289
+
290
+ # Create DataFrame with required columns
291
+ df = pd.DataFrame(records)
292
+ required_columns = ['Title', 'Abstract', 'Authors', 'DOI']
293
+ for col in required_columns:
294
+ if col not in df.columns:
295
+ df[col] = ''
296
+ df.index.name = 'Index'
297
+ output_path = os.path.join(self.data_dir, "extracted_data.xlsx")
298
+ df.to_excel(output_path, index=True)
299
+ preview = self._generate_preview(records)
300
+
301
+ return output_path, preview
302
+
303
+ except Exception as e:
304
+ return None, f"Error processing Scopus RIS file: {str(e)}"
305
+
306
+ def _generate_preview(self, records: list) -> str:
307
+ """
308
+ Generate a preview text for the first few parsed records
309
+
310
+ Args:
311
+ records: List of parsed records
312
+
313
+ Returns:
314
+ str: Formatted preview text showing sample records
315
+ """
316
+ preview = ""
317
+ for i, record in enumerate(records[:3], 0):
318
+ preview += f"\nRecord {i}:\n"
319
+ preview += f"DOI: {record.get('DOI', '')[:50]}\n"
320
+ preview += f"Title: {record.get('Title', '')[:100]}...\n"
321
+ preview += f"Authors: {record.get('Authors', '')[:100]}...\n"
322
+ preview += f"Abstract: {record.get('Abstract', '')[:200]}...\n"
323
+ preview += "-" * 80 + "\n"
324
+
325
+ preview += f"\nTotal records extracted: {len(records)}"
326
+ return preview
327
+
328
+ def load_excel(self, file_path: str) -> Optional[pd.DataFrame]:
329
+ """
330
+ Load Excel file and ensure the index is set correctly
331
+
332
+ Args:
333
+ file_path: Path to the Excel file to load
334
+
335
+ Returns:
336
+ DataFrame or None if loading fails
337
+ """
338
+ try:
339
+ # First try to read with index_col=0
340
+ df = pd.read_excel(file_path, index_col=0)
341
+
342
+ # If Index is still in columns, it means it wasn't properly set as index
343
+ if "Index" in df.columns:
344
+ df.set_index("Index", inplace=True)
345
+ elif df.index.name != "Index":
346
+ df.index.name = "Index"
347
+
348
+ # Ensure index is string type and handle any potential NaN values
349
+ df.index = df.index.astype(str)
350
+ df.index = df.index.str.strip()
351
+
352
+ # Remove any duplicate indices by keeping the first occurrence
353
+ if df.index.duplicated().any():
354
+ logging.warning(f"Found duplicate indices in {file_path}")
355
+ df = df[~df.index.duplicated(keep='first')]
356
+
357
+ logging.debug(f"Loaded DataFrame from {file_path}")
358
+ logging.debug(f"Shape: {df.shape}")
359
+ logging.debug(f"Columns: {df.columns.tolist()}")
360
+ logging.debug(f"Index name: {df.index.name}")
361
+ logging.debug(f"First few indices: {df.index.tolist()[:5]}")
362
+
363
+ return df
364
+ except Exception as e:
365
+ logging.error(f"Error loading Excel file: {str(e)}")
366
+ return None
367
+
368
+ def save_excel(self, df: pd.DataFrame, filename: str) -> str:
369
+ """
370
+ Save a DataFrame to an Excel file
371
+
372
+ Args:
373
+ df: DataFrame to save
374
+ filename: Target filename
375
+
376
+ Returns:
377
+ str: Path to the saved file or empty string if saving fails
378
+ """
379
+ try:
380
+ # Ensure we have a copy to avoid modifying the original
381
+ df = df.copy()
382
+
383
+ # Ensure index is properly named
384
+ if df.index.name != "Index":
385
+ df.index.name = "Index"
386
+
387
+ # Ensure index is string type
388
+ df.index = df.index.astype(str)
389
+
390
+ # Remove any duplicate indices
391
+ if df.index.duplicated().any():
392
+ logging.warning(f"Found duplicate indices when saving {filename}")
393
+ df = df[~df.index.duplicated(keep='first')]
394
+
395
+ output_path = os.path.join(self.data_dir, filename)
396
+
397
+ # Save with index
398
+ df.to_excel(output_path, index=True)
399
+
400
+ logging.debug(f"Saved DataFrame to {output_path}")
401
+ logging.debug(f"Shape: {df.shape}")
402
+ logging.debug(f"Columns: {df.columns.tolist()}")
403
+
404
+ return output_path
405
+ except Exception as e:
406
+ logging.error(f"Error saving Excel file: {str(e)}")
407
+ return ""
model_manager.py ADDED
@@ -0,0 +1,528 @@
1
+ import os
2
+ import json
3
+ import requests
4
+ import logging
5
+ import time
6
+ import re
7
+ from typing import Dict, Any
8
+ from dotenv import load_dotenv
9
+
10
+ # Ensure .env file is loaded (with override enabled to pick up any modifications)
11
+ load_dotenv(override=True)
12
+
13
+ class ModelManager:
14
+ def __init__(self):
15
+ # Load base configuration from environment variables
16
+ self.model_configs = {
17
+ "model_a": {
18
+ "api_key": os.getenv("MODEL_A_API_KEY", ""),
19
+ "api_url": os.getenv("MODEL_A_API_URL", ""),
20
+ "model": os.getenv("MODEL_A_MODEL_NAME", ""),
21
+ "name": "Model A (Primary Analyzer)",
22
+ "temperature": float(os.getenv("MODEL_A_TEMPERATURE", "0.3")),
23
+ "max_tokens": int(os.getenv("MODEL_A_MAX_TOKENS", "4096")),
24
+ "batch_size": int(os.getenv("MODEL_A_BATCH_SIZE", "10")),
25
+ "threads": int(os.getenv("MODEL_A_THREADS", "8")),
26
+ "timeout": int(os.getenv("MODEL_A_TIMEOUT", "180")),
27
+ "is_inference": os.getenv("MODEL_A_IS_INFERENCE", "").lower() == "true",
28
+ "updated": False # flag to indicate if manually updated
29
+ },
30
+ "model_b": {
31
+ "api_key": os.getenv("MODEL_B_API_KEY", ""),
32
+ "api_url": os.getenv("MODEL_B_API_URL", ""),
33
+ "model": os.getenv("MODEL_B_MODEL_NAME", ""),
34
+ "name": "Model B (Critical Reviewer)",
35
+ "temperature": float(os.getenv("MODEL_B_TEMPERATURE", "0.3")),
36
+ "max_tokens": int(os.getenv("MODEL_B_MAX_TOKENS", "4096")),
37
+ "batch_size": int(os.getenv("MODEL_B_BATCH_SIZE", "10")),
38
+ "threads": int(os.getenv("MODEL_B_THREADS", "8")),
39
+ "timeout": int(os.getenv("MODEL_B_TIMEOUT", "180")),
40
+ "is_inference": os.getenv("MODEL_B_IS_INFERENCE", "").lower() == "true",
41
+ "updated": False
42
+ },
43
+ "model_c": {
44
+ "api_key": os.getenv("MODEL_C_API_KEY", ""),
45
+ "api_url": os.getenv("MODEL_C_API_URL", ""),
46
+ "model": os.getenv("MODEL_C_MODEL_NAME", ""),
47
+ "name": "Model C (Final Arbitrator)",
48
+ "temperature": float(os.getenv("MODEL_C_TEMPERATURE", "0.3")),
49
+ "max_tokens": int(os.getenv("MODEL_C_MAX_TOKENS", "4096")),
50
+ "batch_size": int(os.getenv("MODEL_C_BATCH_SIZE", "10")),
51
+ "threads": int(os.getenv("MODEL_C_THREADS", "8")),
52
+ "timeout": int(os.getenv("MODEL_C_TIMEOUT", "180")),
53
+ "is_inference": os.getenv("MODEL_C_IS_INFERENCE", "").lower() == "true",
54
+ "updated": False
55
+ }
56
+ }
57
+
58
+ # Validate API keys
59
+ for model_key, config in self.model_configs.items():
60
+ if not config["api_key"]:
61
+ logging.warning(f"API key not found for {config['name']}")
62
+
63
+ def update_model_config(self, model_key: str, config: Dict[str, Any]) -> None:
64
+ """Update model configuration."""
65
+ if model_key not in self.model_configs:
66
+ raise ValueError(f"Invalid model key: {model_key}")
67
+ self.model_configs[model_key].update(config)
68
+
69
+ def process_model_response(self, model_key: str, response: str) -> Dict:
70
+ """Process response based on model type."""
71
+ try:
72
+ logging.debug(f"Raw response from {model_key}: {response}")
73
+ logging.debug(f"Response type: {type(response)}")
74
+
75
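+ # The API reply is JSON whose message content is itself JSON, so it is decoded in two passes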
+ # Parse outer JSON
76
+ response_obj = json.loads(response) if isinstance(response, str) else response
77
+ logging.debug(f"Parsed response object: {json.dumps(response_obj, indent=2)}")
78
+
79
+ # Process based on mode
80
+ if self.model_configs[model_key].get("is_inference", False):
81
+ logging.debug(f"Processing {model_key} response in inference mode")
82
+ logging.debug(f"Model config: {json.dumps(self.model_configs[model_key], indent=2)}")
83
+ return self.process_inference_result(response_obj, model_key)
84
+
85
+ # Get content from response
86
+ if not isinstance(response_obj, dict):
87
+ logging.error(f"Invalid response format from {model_key}: {response_obj}")
88
+ return self.get_default_response(model_key)
89
+
90
+ if "choices" not in response_obj:
91
+ logging.error(f"No choices in response: {response_obj}")
92
+ return self.get_default_response(model_key)
93
+
94
+ if not response_obj["choices"]:
95
+ logging.error(f"Empty choices in response: {response_obj}")
96
+ return self.get_default_response(model_key)
97
+
98
+ content = response_obj["choices"][0].get("message", {}).get("content", "")
99
+ logging.debug(f"Extracted content: {content}")
100
+
101
+ if not content:
102
+ logging.error(f"Empty content in {model_key} response")
103
+ return self.get_default_response(model_key)
104
+
105
+ # Handle markdown code blocks
106
+ if "```json" in content:
107
+ pattern = r"```json\s*(.*?)\s*```"
108
+ match = re.search(pattern, content, re.DOTALL)
109
+ if match:
110
+ content = match.group(1).strip()
111
+ logging.debug(f"Extracted JSON from markdown: {content}")
112
+
113
+ # Parse inner JSON
114
+ try:
115
+ result = json.loads(content)
116
+ logging.debug(f"Parsed content result: {json.dumps(result, indent=2)}")
117
+
118
+ # Validate results field
119
+ if "results" not in result:
120
+ logging.error(f"Missing 'results' field in {model_key} response")
121
+ return self.get_default_response(model_key)
122
+
123
+ # Validate each result item
124
+ valid_results = []
125
+ for item in result.get("results", []):
126
+ logging.debug(f"Processing result item: {json.dumps(item, indent=2)}")
127
+ if not isinstance(item, dict):
128
+ logging.error(f"Invalid result item format: {item}")
129
+ continue
130
+ if "Index" not in item:
131
+ logging.error(f"Missing Index in result item: {item}")
132
+ continue
133
+ valid_results.append(item)
134
+
135
+ if not valid_results:
136
+ logging.error(f"No valid results found in {model_key} response")
137
+ return self.get_default_response(model_key)
138
+
139
+ result["results"] = valid_results
140
+ return result
141
+
142
+ except json.JSONDecodeError as e:
143
+ logging.error(f"JSON parse error for {model_key}: {str(e)}")
144
+ logging.error(f"Content causing error: {content}")
145
+ return self.get_default_response(model_key)
146
+
147
+ except Exception as e:
148
+ logging.error(f"Error processing {model_key} response: {str(e)}")
149
+ logging.error("Full traceback:", exc_info=True)
150
+ return self.get_default_response(model_key)
151
+
152
+ def get_default_response(self, model_key: str) -> Dict:
153
+ """
154
+ Return default response format for each model type.
155
+
156
+ Args:
157
+ model_key: Identifier of the model.
158
+
159
+ Returns:
160
+ Dict containing default response structure.
161
+ """
162
+ if model_key == "model_a":
163
+ return {
164
+ "results": [{
165
+ "Index": "0",
166
+ "A_P": "not applicable",
167
+ "A_I": "not applicable",
168
+ "A_C": "not applicable",
169
+ "A_O": "not applicable",
170
+ "A_S": "not applicable",
171
+ "A_Decision": False,
172
+ "A_Reason": "API call failed or returned no results"
173
+ }]
174
+ }
175
+ elif model_key == "model_b":
176
+ return {
177
+ "results": [{
178
+ "Index": "0",
179
+ "B_P": "not applicable",
180
+ "B_I": "not applicable",
181
+ "B_C": "not applicable",
182
+ "B_O": "not applicable",
183
+ "B_S": "not applicable",
184
+ "B_Decision": False,
185
+ "B_Reason": "API call failed or returned no results"
186
+ }]
187
+ }
188
+ else: # model_c
189
+ return {
190
+ "results": [{
191
+ "Index": "0",
192
+ "C_Decision": False,
193
+ "C_Reason": "API call failed or returned no results"
194
+ }]
195
+ }
196
+
197
+ def process_inference_result(self, result: Dict, model_key: str) -> Dict:
198
+ """
199
+ Process inference model results.
200
+
201
+ Args:
202
+ result: Raw inference result.
203
+ model_key: Identifier of the model.
204
+
205
+ Returns:
206
+ Dict containing processed inference results.
207
+ """
208
+ try:
209
+ if not isinstance(result, dict) or "choices" not in result:
210
+ logging.error(f"Invalid inference result format from {model_key}")
211
+ return self.get_default_response(model_key)
212
+
213
+ for choice in result["choices"]:
214
+ if "message" not in choice:
215
+ logging.warning(f"Missing message in choice: {choice}")
216
+ continue
217
+
218
+ content = choice["message"].get("content", "")
219
+ if not content:
220
+ logging.warning(f"Empty content in {model_key} choice")
221
+ choice["message"]["content"] = json.dumps(self.get_default_response(model_key))
222
+ continue
223
+
224
+ # Handle markdown code blocks
225
+ if "```json" in content:
226
+ pattern = r"```json\s*(.*?)\s*```"
227
+ match = re.search(pattern, content, re.DOTALL)
228
+ if match:
229
+ content = match.group(1).strip()
230
+ logging.debug(f"Extracted JSON from markdown in inference result: {content}")
231
+
232
+ try:
233
+ content_data = json.loads(content)
234
+ logging.debug(f"Parsed inference content: {json.dumps(content_data, indent=2, ensure_ascii=False)}")
235
+
236
+ # Return the parsed content data directly, not the original response
237
+ return content_data
238
+
239
+ except json.JSONDecodeError as e:
240
+ logging.error(f"Failed to parse {model_key} inference content: {str(e)}")
241
+ logging.error(f"Content was: {content}")
242
+ return self.get_default_response(model_key)
243
+
244
+ return self.get_default_response(model_key)
245
+
246
+ except Exception as e:
247
+ logging.error(f"Error processing {model_key} inference result: {str(e)}")
248
+ return self.get_default_response(model_key)
249
+
250
+ def process_reviews(self, result: Dict, model_key: str) -> Dict:
251
+ """
252
+ Process reviews format response.
253
+
254
+ Args:
255
+ result: Raw review data.
256
+ model_key: Identifier of the model.
257
+
258
+ Returns:
259
+ Dict containing processed reviews.
260
+ """
261
+ try:
262
+ if not isinstance(result.get("reviews", []), list):
263
+ logging.error("Invalid reviews format")
264
+ return {"reviews": []}
265
+
266
+ field_name = "B_Reason" if model_key == "model_b" else "C_Reason"
267
+ for review in result["reviews"]:
268
+ if field_name in review:
269
+ # Remove duplicate Reason fields
270
+ if isinstance(review[field_name], list):
271
+ review[field_name] = review[field_name][-1]
272
+
273
+ # Process inference content (remove think tags etc.)
274
+ review[field_name] = self.process_inference_response(review[field_name])
275
+
276
+ return result
277
+ except Exception as e:
278
+ logging.error(f"Error processing reviews: {str(e)}")
279
+ return {"reviews": []}
280
+
281
+ def process_inference_response(self, response: str) -> str:
282
+ """
283
+ Process special markers in inference response.
284
+
285
+ Args:
286
+ response: Raw inference response string.
287
+
288
+ Returns:
289
+ Processed response string with special markers removed.
290
+ """
291
+ try:
292
+ if not isinstance(response, str):
293
+ return response
294
+
295
+ # Remove thinking process
296
+ response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
297
+
298
+ # Remove HTML tags
299
+ response = re.sub(r'<[^>]+>', '', response)
300
+
301
+ # Clean extra whitespace
302
+ response = re.sub(r'\n\s*\n', '\n\n', response.strip())
303
+
304
+ return response
305
+
306
+ except Exception as e:
307
+ logging.error(f"Error processing inference response: {str(e)}")
308
+ return response
309
+
310
+ def test_api_connection(self, model_key: str) -> str:
311
+ """
312
+ Test API connection for a specific model.
313
+
314
+ Args:
315
+ model_key: Identifier of the model to test.
316
+
317
+ Returns:
318
+ String indicating connection status.
319
+ """
320
+ config = self.model_configs.get(model_key)
321
+ if not config:
322
+ return f"❌ Configuration not found for {model_key}"
323
+
324
+ try:
325
+ headers = {
326
+ "Content-Type": "application/json",
327
+ "Authorization": f"Bearer {config['api_key']}"
328
+ }
329
+
330
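+ # Minimal probe request: a single "test" message capped at 10 tokens, just to verify the endpoint and key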
+ data = {
331
+ "model": config["model"],
332
+ "messages": [{"role": "user", "content": "test"}],
333
+ "temperature": config["temperature"],
334
+ "max_tokens": 10
335
+ }
336
+
337
+ response = requests.post(
338
+ config["api_url"],
339
+ headers=headers,
340
+ json=data,
341
+ timeout=10
342
+ )
343
+
344
+ if response.status_code == 200:
345
+ return f"✓ {config['name']} connection successful"
346
+ else:
347
+ return f"❌ {config['name']} connection failed: {response.status_code}"
348
+
349
+ except Exception as e:
350
+ return f"❌ {config['name']} connection error: {str(e)}"
351
+
352
+ def call_api(self, model_key: str, prompt: str) -> Dict:
353
+ """Call API with retry mechanism and improved error handling."""
354
+ try:
355
+ config = self.model_configs.get(model_key)
356
+ if not config:
357
+ logging.error(f"Configuration not found for {model_key}")
358
+ raise Exception(f"Configuration not found for {model_key}")
359
+
360
+ logging.debug(f"API call config for {model_key}: {json.dumps({k:v for k,v in config.items() if k != 'api_key'}, indent=2)}")
361
+
362
+ headers = {
363
+ "Content-Type": "application/json",
364
+ "Authorization": f"Bearer {config['api_key']}"
365
+ }
366
+ logging.debug(f"Request headers: {json.dumps({k:v for k,v in headers.items() if k != 'Authorization'}, indent=2)}")
367
+
368
+ data = {
369
+ "model": config["model"],
370
+ "messages": [
371
+ {"role": "system", "content": "You are a helpful assistant specialized in analyzing medical literature based on PICOS criteria."},
372
+ {"role": "user", "content": prompt}
373
+ ],
374
+ "temperature": config["temperature"],
375
+ "max_tokens": config["max_tokens"]
376
+ }
377
+ logging.debug(f"Request data: {json.dumps(data, indent=2)}")
378
+
379
+ max_retries = 3
380
+ retry_delay = 1
381
+
382
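+ # Retry failed or timed-out requests up to max_retries times, sleeping between attempts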
+ for attempt in range(max_retries):
383
+ try:
384
+ logging.debug(f"Attempt {attempt + 1} of {max_retries}")
385
+ response = requests.post(
386
+ config["api_url"],
387
+ headers=headers,
388
+ json=data,
389
+ timeout=config["timeout"]
390
+ )
391
+
392
+ logging.debug(f"API Response status: {response.status_code}")
393
+ logging.debug(f"API Response headers: {dict(response.headers)}")
394
+
395
+ if response.status_code != 200:
396
+ error_msg = f"API call failed for {config.get('name', model_key)}: {response.status_code} {response.reason}"
397
+ if response.text:
398
+ error_msg += f"\nResponse: {response.text}"
399
+ logging.error(error_msg)
400
+ if attempt < max_retries - 1:
401
+ time.sleep(retry_delay * (attempt + 1))
402
+ continue
403
+ raise Exception(error_msg)
404
+
405
+ return self.process_model_response(model_key, response.text)
406
+
407
+ except requests.Timeout:
408
+ logging.error(f"Timeout on attempt {attempt + 1}/{max_retries}")
409
+ if attempt < max_retries - 1:
410
+ time.sleep(retry_delay * (attempt + 1))
411
+ continue
412
+ raise Exception(f"API call timed out after {max_retries} attempts")
413
+
414
+ except Exception as e:
415
+ logging.error(f"API call error for {config.get('name', model_key)}: {str(e)}")
416
+ logging.error("Full traceback:", exc_info=True)
417
+ if attempt < max_retries - 1:
418
+ time.sleep(retry_delay)
419
+ continue
420
+ raise
421
+
422
+ raise Exception(f"API call failed after {max_retries} attempts")
423
+
424
+ except Exception as e:
425
+ logging.error(f"Fatal error in API call: {str(e)}")
426
+ logging.error("Full traceback:", exc_info=True)
427
+ raise
428
+
429
+ def get_config(self, model_key: str) -> Dict[str, Any]:
430
+ """
431
+ Get model configuration.
432
+ This method re-reads environment variables for models that haven't been manually updated.
433
+ """
434
+ # Reload environment variables from .env file to capture any modifications
435
+ load_dotenv(override=True)
436
+ if model_key not in self.model_configs:
437
+ return {}
438
+ config = self.model_configs[model_key]
439
+ if not config.get("updated", False):
440
+ # For models not manually updated, refresh config from environment variables
441
+ if model_key == "model_a":
442
+ refreshed_config = {
443
+ "api_key": os.getenv("MODEL_A_API_KEY", ""),
444
+ "api_url": os.getenv("MODEL_A_API_URL", ""),
445
+ "model": os.getenv("MODEL_A_MODEL_NAME", ""),
446
+ "name": "Model A (Primary Analyzer)",
447
+ "temperature": float(os.getenv("MODEL_A_TEMPERATURE", "0.3")),
448
+ "max_tokens": int(os.getenv("MODEL_A_MAX_TOKENS", "4096")),
449
+ "batch_size": int(os.getenv("MODEL_A_BATCH_SIZE", "10")),
450
+ "threads": int(os.getenv("MODEL_A_THREADS", "8")),
451
+ "timeout": int(os.getenv("MODEL_A_TIMEOUT", "180")),
452
+ "is_inference": os.getenv("MODEL_A_IS_INFERENCE", "").lower() == "true",
453
+ "updated": False
454
+ }
455
+ elif model_key == "model_b":
456
+ refreshed_config = {
457
+ "api_key": os.getenv("MODEL_B_API_KEY", ""),
458
+ "api_url": os.getenv("MODEL_B_API_URL", ""),
459
+ "model": os.getenv("MODEL_B_MODEL_NAME", ""),
460
+ "name": "Model B (Critical Reviewer)",
461
+ "temperature": float(os.getenv("MODEL_B_TEMPERATURE", "0.3")),
462
+ "max_tokens": int(os.getenv("MODEL_B_MAX_TOKENS", "4096")),
463
+ "batch_size": int(os.getenv("MODEL_B_BATCH_SIZE", "10")),
464
+ "threads": int(os.getenv("MODEL_B_THREADS", "8")),
465
+ "timeout": int(os.getenv("MODEL_B_TIMEOUT", "180")),
466
+ "is_inference": os.getenv("MODEL_B_IS_INFERENCE", "").lower() == "true",
467
+ "updated": False
468
+ }
469
+ elif model_key == "model_c":
470
+ refreshed_config = {
471
+ "api_key": os.getenv("MODEL_C_API_KEY", ""),
472
+ "api_url": os.getenv("MODEL_C_API_URL", ""),
473
+ "model": os.getenv("MODEL_C_MODEL_NAME", ""),
474
+ "name": "Model C (Final Arbitrator)",
475
+ "temperature": float(os.getenv("MODEL_C_TEMPERATURE", "0.3")),
476
+ "max_tokens": int(os.getenv("MODEL_C_MAX_TOKENS", "4096")),
477
+ "batch_size": int(os.getenv("MODEL_C_BATCH_SIZE", "10")),
478
+ "threads": int(os.getenv("MODEL_C_THREADS", "8")),
479
+ "timeout": int(os.getenv("MODEL_C_TIMEOUT", "180")),
480
+ "is_inference": os.getenv("MODEL_C_IS_INFERENCE", "").lower() == "true",
481
+ "updated": False
482
+ }
483
+ else:
484
+ refreshed_config = {}
485
+ self.model_configs[model_key] = refreshed_config
486
+ config = refreshed_config
487
+ return config
488
+
489
+ def process_analysis(self, result: Dict, model_key: str) -> Dict:
490
+ """
491
+ Process analysis format response.
492
+
493
+ Args:
494
+ result: Raw analysis data.
495
+ model_key: Identifier of the model.
496
+
497
+ Returns:
498
+ Dict containing processed analysis.
499
+ """
500
+ try:
501
+ if not isinstance(result.get("analysis", []), list):
502
+ logging.error("Invalid analysis format")
503
+ return {"analysis": []}
504
+
505
+ # Process each analysis item
506
+ for analysis in result["analysis"]:
507
+ if "A_Reason" in analysis:
508
+ # Remove duplicate Reason fields
509
+ if isinstance(analysis["A_Reason"], list):
510
+ analysis["A_Reason"] = analysis["A_Reason"][-1]
511
+
512
+ # Process inference content (remove think tags etc.)
513
+ analysis["A_Reason"] = self.process_inference_response(analysis["A_Reason"])
514
+
515
+ # Ensure boolean fields are proper booleans
516
+ if "A_Decision" in analysis:
517
+ analysis["A_Decision"] = bool(analysis["A_Decision"])
518
+
519
+ # Ensure all PICOS fields are strings
520
+ for field in ["A_P", "A_I", "A_C", "A_O", "A_S"]:
521
+ if field in analysis:
522
+ analysis[field] = str(analysis[field])
523
+
524
+ return result
525
+
526
+ except Exception as e:
527
+ logging.error(f"Error processing analysis: {str(e)}")
528
+ return {"analysis": []}
prompt_manager.py ADDED
@@ -0,0 +1,191 @@
1
+ from typing import Dict
2
+
3
+ class PromptManager:
4
+ def __init__(self):
5
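+ # Placeholders such as {population} are substituted later, so literal JSON braces are doubled as {{ and }}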
+ self.prompts = {
6
+ "model_a": """You are a medical research expert analyzing clinical trial abstracts.
7
+ Your task is to analyze each abstract and determine if it matches the PICOS criteria.
8
+
9
+ Target PICOS criteria:
10
+ - Population: {population}
11
+ - Intervention: {intervention}
12
+ - Comparison: {comparison}
13
+ - Outcome: {outcome}
14
+ - Study Design: {study_design}
15
+
16
+ Input abstracts:
17
+ {abstracts_json}
18
+
19
+ Each article in the input contains:
20
+ - index: article identifier
21
+ - abstract: the text to analyze
22
+
23
+ IMPORTANT: You must follow these strict JSON formatting rules:
24
+ 1. Use double quotes for all strings
25
+ 2. Ensure all strings are properly terminated
26
+ 3. Use commas between array items and object properties
27
+ 4. Do not use trailing commas
28
+ 5. Keep the response concise and avoid unnecessary whitespace
29
+ 6. Escape any special characters in strings
30
+ 7. Use true/false (not True/False) for boolean values
31
+
32
+ Provide your analysis in this exact JSON format:
33
+ {{
34
+ "results": [
35
+ {{
36
+ "Index": "ARTICLE_INDEX",
37
+ "A_P": "brief population description",
38
+ "A_I": "brief intervention description",
39
+ "A_C": "brief comparison description",
40
+ "A_O": "brief outcome description",
41
+ "A_S": "brief study design description",
42
+ "A_Decision": true/false,
43
+ "A_Reason": "brief reasoning for match/mismatch"
44
+ }},
45
+ ...
46
+ ]
47
+ }}
48
+
49
+ Keep all descriptions brief and focused. Do not include line breaks or special characters in the text fields.
50
+ If any field is not found in the abstract, use "not specified" as the value.
51
+ Be strict in your evaluation and ensure the output is valid JSON.""",
52
+
53
+ "model_b": """You are a critical reviewer in a systematic review team.
54
+ Your task is to rigorously scrutinize Model A's analysis and provide your own assessment.
55
+ You should actively look for potential flaws or oversights in Model A's analysis, while maintaining a high standard of evidence-based evaluation.
56
+
57
+ Target PICOS criteria:
58
+ - Population: {population}
59
+ - Intervention: {intervention}
60
+ - Comparison: {comparison}
61
+ - Outcome: {outcome}
62
+ - Study Design: {study_design}
63
+
64
+ Input abstracts:
65
+ {abstracts_json}
66
+
67
+ Each article in the input contains:
68
+ - Index: article identifier
69
+ - abstract: original article abstract
70
+ - model_a_analysis:
71
+ - A_P: Model A's population description
72
+ - A_I: Model A's intervention description
73
+ - A_C: Model A's comparison description
74
+ - A_O: Model A's outcome description
75
+ - A_S: Model A's study design description
76
+ - A_Decision: Model A's inclusion decision
77
+ - A_Reason: Model A's explanation
78
+
79
+ Your task is to:
80
+ 1. Thoroughly examine the original abstract
81
+ 2. Critically review Model A's PICOS extraction, actively seeking potential issues:
82
+ - Look for missing details or nuances in population characteristics
83
+ - Check for precise intervention specifications
84
+ - Verify completeness of comparison group description
85
+ - Examine outcome measurements and their relevance
86
+ - Scrutinize study design classification
87
+ 3. Provide corrections with evidence from the abstract:
88
+ - B_P: Your corrected population description (use "-" only if A_P is completely accurate)
89
+ - B_I: Your corrected intervention description (use "-" only if A_I is completely accurate)
90
+ - B_C: Your corrected comparison description (use "-" only if A_C is completely accurate)
91
+ - B_O: Your corrected outcome description (use "-" only if A_O is completely accurate)
92
+ - B_S: Your corrected study design description (use "-" only if A_S is completely accurate)
93
+ 4. Make your own independent inclusion decision (B_Decision)
94
+ 5. Provide detailed reasoning (B_Reason) that:
95
+ - Points out any oversights or inaccuracies in Model A's analysis
96
+ - Cites specific evidence from the abstract
97
+ - Explains why your corrections or agreements are justified
98
+
99
+ IMPORTANT: You must follow these strict JSON formatting rules:
100
+ 1. Use double quotes for all strings
101
+ 2. Ensure all strings are properly terminated
102
+ 3. Use commas between array items and object properties
103
+ 4. Do not use trailing commas
104
+ 5. Keep the response concise and avoid unnecessary whitespace
105
+ 6. Escape any special characters in strings
106
+ 7. Use true/false for B_Decision (true means the article should be included)
107
+ 8. ALL fields (B_P, B_I, B_C, B_O, B_S) must be provided for each review
108
+ 9. NEVER omit any field, even if you agree with Model A's analysis
109
+ 10. For B_S specifically, you must either provide a corrected study design description or use "-" if you agree with A_S
110
+
111
+ Return your analysis in this exact JSON format:
112
+ {{
113
+ "results": [
114
+ {{
115
+ "Index": "ARTICLE_INDEX",
116
+ "B_Decision": true/false,
117
+ "B_Reason": "detailed reasoning with evidence from abstract",
118
+ "B_P": "-" or "corrected population description with evidence",
119
+ "B_I": "-" or "corrected intervention description with evidence",
120
+ "B_C": "-" or "corrected comparison description with evidence",
121
+ "B_O": "-" or "corrected outcome description with evidence",
122
+ "B_S": "-" or "corrected study design description with evidence"
123
+ }},
124
+ ...
125
+ ]
126
+ }}
127
+
128
+ Keep descriptions focused and evidence-based. Do not include line breaks or special characters.
129
+ Use "-" only when you are completely certain that Model A's extraction is accurate and complete.
130
+ Your B_Decision should be based on whether the article meets all PICOS criteria.
131
+ Remember to be thorough in your critique while maintaining objectivity and evidence-based reasoning.
132
+
133
+ CRITICAL: You MUST include ALL fields in your response, especially B_S. If you agree with Model A's study design analysis, use "-" for B_S, but NEVER omit it.""",
134
+
135
+ "model_c": """You are the final arbitrator in a systematic review team.
136
+ Your task is to analyze the assessments from Model A and Model B, and make a final decision.
137
+
138
+ Target PICOS criteria:
139
+ - Population: {population}
140
+ - Intervention: {intervention}
141
+ - Comparison: {comparison}
142
+ - Outcome: {outcome}
143
+ - Study Design: {study_design}
144
+
145
+ Input abstracts:
146
+ {abstracts_json}
147
+
148
+ Each article in the input contains:
149
+ - Index: article identifier
150
+ - abstract: original article abstract
151
+ - model_a_analysis: Model A's assessment
152
+ - model_b_analysis: Model B's assessment
153
+
154
+ Your task is to:
155
+ 1. Review the original abstract
156
+ 2. Compare Model A and Model B's assessments
157
+ 3. Make a final decision considering:
158
+ - Accuracy of PICOS criteria matching
159
+ - Validity of reasoning from both models
160
+ - Evidence from the abstract
161
+ 4. Provide your final assessment:
162
+ - C_Decision: final inclusion decision
163
+ - C_Reason: detailed explanation of your decision
164
+ - Note any disagreements between models and how you resolved them
165
+
166
+ Return your analysis in this exact JSON format:
167
+ {{
168
+ "results": [
169
+ {{
170
+ "Index": "ARTICLE_INDEX",
171
+ "C_Decision": true/false,
172
+ "C_Reason": "detailed reasoning with evidence"
173
+ }},
174
+ ...
175
+ ]
176
+ }}
177
+
178
+ Keep your reasoning focused and evidence-based.
179
+ Your C_Decision should be based on whether the article truly meets all PICOS criteria.
180
+ Be thorough in your analysis while maintaining objectivity."""
181
+ }
182
+
183
+ def update_prompt(self, model_key: str, prompt: str) -> None:
184
+ """Update model prompt"""
185
+ if model_key not in self.prompts:
186
+ raise ValueError(f"Invalid model key: {model_key}")
187
+ self.prompts[model_key] = prompt
188
+
189
+ def get_prompt(self, model_key: str) -> str:
190
+ """Get model prompt"""
191
+ return self.prompts.get(model_key, "")
renovate.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "$schema": "https://docs.renovatebot.com/renovate-schema.json",
+ "extends": [
+ "config:recommended"
+ ]
+ }
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ pandas>=1.5.0
+ requests>=2.31.0
+ python-dotenv>=1.0.0
+ tqdm>=4.66.0
+ tabulate>=0.9.0
+ gradio>=4.19.0
+ xlrd
+ scikit-learn>=1.3.0
+ openpyxl>=3.1.2
result_processor.py ADDED
@@ -0,0 +1,393 @@
+ import pandas as pd
+ import logging
+ from typing import Dict
+ import json
+ import re
+
+ class ResultProcessor:
+     def __init__(self):
+         """Initialize ResultProcessor with required column definitions for each model"""
+         # Define required columns for each model's output
+         self.required_columns = {
+             "model_a": ["A_Decision", "A_Reason", "A_P", "A_I", "A_C", "A_O", "A_S"],
+             "model_b": ["B_Decision", "B_Reason", "B_P", "B_I", "B_C", "B_O", "B_S"],
+             "model_c": ["C_Decision", "C_Reason"]
+         }
+
+         # Define the order of columns in the final Excel output
+         self.output_columns = [
+             "Index",
+             "A_Decision", "A_Reason", "A_P", "A_I", "A_C", "A_O", "A_S",
+             "B_Decision", "B_Reason", "B_P", "B_I", "B_C", "B_O", "B_S",
+             "C_Decision", "C_Reason"
+         ]
+
+     def validate_model_response(self, result: Dict, model_key: str) -> None:
+         """
+         Validate the response format from each model
+
+         Args:
+             result: The model's response to validate
+             model_key: The identifier of the model ('model_a', 'model_b', or 'model_c')
+
+         Raises:
+             Exception: If the response format is invalid
+         """
+         # Log validation start
+         logging.debug(f"Starting validation for {model_key}")
+         logging.debug(f"Raw result type: {type(result)}")
+
+         if model_key == "model_a":
+             # Check if response is in completion format
+             if "choices" in result and len(result["choices"]) > 0:
+                 content = result["choices"][0].get("message", {}).get("content", "")
+                 if content:
+                     try:
+                         # Handle markdown-wrapped JSON content
+                         json_content = content
+                         if "```json" in content:
+                             pattern = r"```json\s*(.*?)\s*```"
+                             match = re.search(pattern, content, re.DOTALL)
+                             if match:
+                                 json_content = match.group(1)
+                                 logging.debug(f"Extracted JSON content: {json_content}")
+
+                         # Parse JSON content
+                         parsed = json.loads(json_content)
+                         if isinstance(parsed, dict) and "results" in parsed:
+                             result.clear()
+                             result.update(parsed)
+                             logging.debug("Successfully parsed Model A response")
+                     except json.JSONDecodeError as e:
+                         raise Exception(f"Invalid JSON in Model A response content: {content}. Error: {str(e)}")
+
+             # Validate Model A specific format
+             if not isinstance(result, dict):
+                 raise Exception("Invalid Model A response format: result is not a dictionary")
+             if "results" not in result:
+                 raise Exception("Invalid Model A response format: missing 'results' field")
+             if not isinstance(result["results"], list):
+                 raise Exception("Invalid Model A response format: 'results' is not a list")
+             if not result["results"]:
+                 raise Exception("Empty results array in Model A response")
+
+             # Validate each result item
+             for item in result["results"]:
+                 if not isinstance(item, dict):
+                     raise Exception(f"Invalid result item format: {item}")
+                 if "Index" not in item:
+                     raise Exception(f"Missing 'Index' in result item: {item}")
+                 missing_fields = [field for field in self.required_columns[model_key] if field not in item]
+                 if missing_fields:
+                     raise Exception(f"Missing fields in result item: {missing_fields}")
+
+         elif model_key == "model_b":
+             # Handle Model B's response format
+             if "choices" in result and len(result["choices"]) > 0:
+                 content = result["choices"][0].get("message", {}).get("content", "")
+                 if content:
+                     try:
+                         json_content = content
+                         if "```json" in content:
+                             pattern = r"```json\s*(.*?)\s*```"
+                             match = re.search(pattern, content, re.DOTALL)
+                             if match:
+                                 json_content = match.group(1)
+                                 logging.debug(f"Extracted JSON content for Model B: {json_content}")
+
+                         parsed = json.loads(json_content)
+                         if isinstance(parsed, dict) and "results" in parsed:
+                             result.clear()
+                             result.update(parsed)
+                             logging.debug("Successfully parsed Model B response")
+                     except json.JSONDecodeError as e:
+                         raise Exception(f"Invalid JSON in Model B response content: {content}. Error: {str(e)}")
+
+             # Validate Model B specific format
+             if not isinstance(result, dict):
+                 raise Exception("Invalid Model B response format: result is not a dictionary")
+             if "results" not in result:
+                 raise Exception("Invalid Model B response format: missing 'results' field")
+             if not isinstance(result["results"], list):
+                 raise Exception("Invalid Model B response format: 'results' is not a list")
+             if not result["results"]:
+                 raise Exception("Empty results array in Model B response")
+
+             # Validate each result item
+             for item in result["results"]:
+                 if not isinstance(item, dict):
+                     raise Exception(f"Invalid result item format: {item}")
+                 if "Index" not in item:
+                     raise Exception(f"Missing 'Index' in result item: {item}")
+                 missing_fields = [field for field in self.required_columns[model_key] if field not in item]
+                 if missing_fields:
+                     raise Exception(f"Missing fields in Model B result: {missing_fields}")
+
+         else:  # model_c
+             # Handle Model C's response format
+             if "choices" in result and len(result["choices"]) > 0:
+                 content = result["choices"][0].get("message", {}).get("content", "")
+                 if content:
+                     try:
+                         json_content = content
+                         if "```json" in content:
+                             pattern = r"```json\s*(.*?)\s*```"
+                             match = re.search(pattern, content, re.DOTALL)
+                             if match:
+                                 json_content = match.group(1)
+                                 logging.debug(f"Extracted JSON content for Model C: {json_content}")
+
+                         parsed = json.loads(json_content)
+                         if isinstance(parsed, dict) and "results" in parsed:
+                             result.clear()
+                             result.update(parsed)
+                             logging.debug("Successfully parsed Model C response")
+                     except json.JSONDecodeError as e:
+                         raise Exception(f"Invalid JSON in Model C response content: {content}. Error: {str(e)}")
+
+             # Validate Model C specific format
+             if not isinstance(result, dict):
+                 raise Exception("Invalid Model C response format: result is not a dictionary")
+             if "results" not in result:
+                 raise Exception("Invalid Model C response format: missing 'results' field")
+             if not isinstance(result["results"], list):
+                 raise Exception("Invalid Model C response format: 'results' is not a list")
+             if not result["results"]:
+                 raise Exception("Empty results array in Model C response")
+
+             # Validate each result item
+             for item in result["results"]:
+                 if not isinstance(item, dict):
+                     raise Exception(f"Invalid result item format: {item}")
+                 if "Index" not in item:
+                     raise Exception(f"Missing 'Index' in result item: {item}")
+                 missing_fields = [field for field in self.required_columns[model_key] if field not in item]
+                 if missing_fields:
+                     raise Exception(f"Missing fields in Model C result: {missing_fields}")
+                 try:
+                     str(item["Index"])
+                     bool(item["C_Decision"])
+                     str(item["C_Reason"])
+                 except (ValueError, TypeError) as e:
+                     raise Exception(f"Invalid data type in Model C result: {str(e)}")
+
+         # Log successful validation
+         logging.debug(f"Validation completed successfully for {model_key}")
+
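To make the unwrapping above concrete, here is a minimal sketch (assuming the class is imported from result_processor.py) of a chat-completion payload whose content arrives wrapped in a markdown json code fence: validate_model_response extracts and parses the JSON, rewrites the dict in place, and the caller can then read result["results"] directly.

    from result_processor import ResultProcessor

    processor = ResultProcessor()
    response = {
        "choices": [{
            "message": {
                "content": '```json\n{"results": [{"Index": "7", '
                           '"C_Decision": false, "C_Reason": "Wrong study design"}]}\n```'
            }
        }]
    }
    processor.validate_model_response(response, "model_c")
    print(response["results"][0]["C_Decision"])  # prints False; the payload dict was rewritten in place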
+     def merge_results(self, df: pd.DataFrame, model_results: Dict[str, pd.DataFrame]) -> pd.DataFrame:
+         """
+         Merge all model results with correct column alignment and compute final decision
+
+         Args:
+             df: Original DataFrame with abstracts
+             model_results: Dictionary containing results from each model
+
+         Returns:
+             DataFrame with merged results from all models
+         """
+         try:
+             # Copy and clean the original DataFrame's index (remove potential whitespace)
+             df = df.copy()
+             df.index = df.index.astype(str).str.strip()
+
+             # Handle missing values and clean base columns
+             for col in ["Abstract", "DOI", "Title", "Authors"]:
+                 if col in df.columns:
+                     df[col] = df[col].fillna("").astype(str)
+                     df[col] = df[col].apply(lambda x: x.strip() if isinstance(x, str) else "")
+                     df[col] = df[col].replace(r'^[\s-]*$', "", regex=True)
+
+             # Create base DataFrame for merging model results
+             merged_df = df.copy()
+
+             def join_model_results(base_df: pd.DataFrame, model_key: str) -> pd.DataFrame:
+                 """
+                 Merge results from a specific model, ensuring data alignment and cleaning
+
+                 Args:
+                     base_df: Base DataFrame to merge with
+                     model_key: Identifier of the model
+
+                 Returns:
+                     DataFrame with merged model results
+                 """
+                 if model_key not in model_results:
+                     logging.warning(f"{model_key} results not found")
+                     # Create default values for all rows
+                     for col in self.required_columns[model_key]:
+                         if col.endswith('_Decision'):
+                             base_df[col] = False
+                         elif col.endswith('_Reason'):
+                             base_df[col] = "Not applicable - No model result"
+                         else:
+                             base_df[col] = "not applicable"
+                     return base_df
+
+                 try:
+                     model_df = model_results[model_key].copy()
+                     # Ensure model result indices and column names are strings without whitespace
+                     model_df.index = model_df.index.astype(str).str.strip()
+                     model_df.columns = model_df.columns.astype(str).str.strip()
+
+                     # Ensure all required columns exist
+                     for col in self.required_columns[model_key]:
+                         if col not in model_df.columns:
+                             if col.endswith('_Decision'):
+                                 model_df[col] = False
+                             elif col.endswith('_Reason'):
+                                 model_df[col] = "Not applicable - Missing column"
+                             else:
+                                 model_df[col] = "not applicable"
+
+                     # Add default values for indices present in original data but missing in model results
+                     missing_indices = set(base_df.index) - set(model_df.index)
+                     if missing_indices:
+                         logging.info(f"Found {len(missing_indices)} missing entries in {model_key}")
+                         default_values = pd.DataFrame(
+                             index=list(missing_indices),
+                             columns=self.required_columns[model_key]
+                         )
+                         for col in self.required_columns[model_key]:
+                             if col.endswith('_Decision'):
+                                 default_values[col] = False
+                             elif col.endswith('_Reason'):
+                                 default_values[col] = "Not applicable - No result"
+                             else:
+                                 default_values[col] = "not applicable"
+                         model_df = pd.concat([model_df, default_values])
+
+                     # Select only required columns
+                     model_df = model_df[self.required_columns[model_key]]
+
+                     # Use left join to preserve all original data indices
+                     result = pd.merge(
+                         base_df,
+                         model_df,
+                         left_index=True,
+                         right_index=True,
+                         how='left'
+                     )
+
+                     # Fill potential NaN values
+                     for col in self.required_columns[model_key]:
+                         if col in result.columns:
+                             if col.endswith('_Decision'):
+                                 result[col] = result[col].fillna(False)
+                             elif col.endswith('_Reason'):
+                                 result[col] = result[col].fillna("Not applicable - Missing value")
+                             else:
+                                 result[col] = result[col].fillna("not applicable")
+
+                     return result
+
+                 except Exception as e:
+                     logging.error(f"Error processing {model_key} results: {str(e)}")
+                     # Return base DataFrame with default values
+                     for col in self.required_columns[model_key]:
+                         if col.endswith('_Decision'):
+                             base_df[col] = False
+                         elif col.endswith('_Reason'):
+                             base_df[col] = f"Error processing {model_key} results: {str(e)}"
+                         else:
+                             base_df[col] = "not applicable"
+                     return base_df
+
+             # Merge results from each model in sequence
+             merged_df = join_model_results(merged_df, "model_a")
+             merged_df = join_model_results(merged_df, "model_b")
+
+             # Merge Model C results or generate default values
+             if "model_c" in model_results:
+                 merged_df = join_model_results(merged_df, "model_c")
+             else:
+                 merged_df["C_Decision"] = False
+                 merged_df["C_Reason"] = merged_df.apply(
+                     lambda row: "No disagreement between Model A and B"
+                     if pd.notna(row.get("A_Decision")) and pd.notna(row.get("B_Decision")) and row["A_Decision"] == row["B_Decision"]
+                     else "Not applicable - No Model C result",
+                     axis=1
+                 )
+
+             # Compute final decision based on model results
+             def compute_final_decision(row):
+                 """
+                 Compute final decision based on available model decisions
+                 Priority: Model C > Agreement between A&B > Model B > Model A > False
+                 """
+                 try:
+                     if pd.notna(row.get("C_Decision")):
+                         return bool(row["C_Decision"])
+                     elif pd.notna(row.get("A_Decision")) and pd.notna(row.get("B_Decision")):
+                         if bool(row["A_Decision"]) == bool(row["B_Decision"]):
+                             return bool(row["A_Decision"])
+                         else:
+                             return bool(row["B_Decision"])  # Use Model B's result in case of disagreement
+                     elif pd.notna(row.get("B_Decision")):
+                         return bool(row["B_Decision"])
+                     elif pd.notna(row.get("A_Decision")):
+                         return bool(row["A_Decision"])
+                 except Exception as e:
+                     logging.error(f"Error computing final decision: {str(e)}")
+                 return False
+
+             merged_df["Final_Decision"] = merged_df.apply(compute_final_decision, axis=1)
+
+             # Define final output columns and their order
+             output_cols = [
+                 "Title", "DOI", "Abstract", "Authors",
+                 *self.required_columns.get("model_a", []),
+                 *self.required_columns.get("model_b", []),
+                 *self.required_columns.get("model_c", []),
+                 "Final_Decision"
+             ]
+
+             # Ensure all required columns exist (assign default values if missing)
+             for col in output_cols:
+                 if col not in merged_df.columns:
+                     if col.endswith('Decision'):
+                         merged_df[col] = False
+                     elif col.endswith('Reason'):
+                         merged_df[col] = "Not applicable - Missing column"
+                     else:
+                         merged_df[col] = ""
+
+             # Select existing columns in the specified order
+             existing_cols = [col for col in output_cols if col in merged_df.columns]
+             merged_df = merged_df[existing_cols]
+
+             # Final cleaning of all column values
+             for col in merged_df.columns:
+                 if col.endswith('Decision'):
+                     merged_df[col] = merged_df[col].fillna(False).astype(bool)
+                 elif col.endswith('Reason'):
+                     merged_df[col] = merged_df[col].fillna("Not applicable - Missing value")
+                 elif col in ["Title", "DOI", "Abstract", "Authors"]:
+                     merged_df[col] = merged_df[col].fillna("").astype(str)
+                 else:
+                     merged_df[col] = merged_df[col].fillna("not applicable")
+
+             # Add index as a column in the final result
+             merged_df.insert(0, "Index", merged_df.index)
+
+             return merged_df
+
+         except Exception as e:
+             logging.error(f"Error merging results: {str(e)}")
+             # Return a minimal DataFrame with error information
+             error_df = pd.DataFrame(index=df.index)
+             error_df["Error"] = f"Failed to merge results: {str(e)}"
+             return error_df
+
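A short usage sketch of the merge (invented rows and decisions, again assuming the class is imported from result_processor.py). Columns a model did not supply, such as A_P through A_S here, are back-filled with "not applicable", and because Model C has ruled on both rows, Final_Decision takes its values per the priority documented in compute_final_decision:

    import pandas as pd
    from result_processor import ResultProcessor

    articles = pd.DataFrame(
        {"Title": ["Trial X", "Review Y"], "Abstract": ["...", "..."],
         "DOI": ["", ""], "Authors": ["", ""]},
        index=["0", "1"],
    )
    model_results = {
        "model_a": pd.DataFrame({"A_Decision": [True, False],
                                 "A_Reason": ["meets PICOS", "no comparator"]}, index=["0", "1"]),
        "model_b": pd.DataFrame({"B_Decision": [True, True],
                                 "B_Reason": ["meets PICOS", "eligible"]}, index=["0", "1"]),
        "model_c": pd.DataFrame({"C_Decision": [True, False],
                                 "C_Reason": ["confirms inclusion", "comparator missing"]}, index=["0", "1"]),
    }
    merged = ResultProcessor().merge_results(articles, model_results)
    print(merged[["Index", "A_Decision", "B_Decision", "C_Decision", "Final_Decision"]])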
+     def export_to_excel(self, df: pd.DataFrame, filename: str) -> None:
+         """
+         Export DataFrame to Excel file
+
+         Args:
+             df: DataFrame to export
+             filename: Target Excel file path
+         """
+         try:
+             df.to_excel(filename, index=False)
+             logging.info(f"Exported results to {filename} successfully.")
+         except Exception as e:
+             logging.error(f"Error exporting to Excel: {str(e)}")