Add reviewer_recommendation module
Browse files- reviewer_recommendation/__init__.py +23 -0
- reviewer_recommendation/__pycache__/__init__.cpython-310.pyc +0 -0
- reviewer_recommendation/__pycache__/__init__.cpython-312.pyc +0 -0
- reviewer_recommendation/__pycache__/__init__.cpython-313.pyc +0 -0
- reviewer_recommendation/__pycache__/engine.cpython-310.pyc +0 -0
- reviewer_recommendation/__pycache__/engine.cpython-312.pyc +0 -0
- reviewer_recommendation/__pycache__/models.cpython-310.pyc +0 -0
- reviewer_recommendation/__pycache__/models.cpython-312.pyc +0 -0
- reviewer_recommendation/__pycache__/models.cpython-313.pyc +0 -0
- reviewer_recommendation/__pycache__/searcher.cpython-310.pyc +0 -0
- reviewer_recommendation/__pycache__/searcher.cpython-312.pyc +0 -0
- reviewer_recommendation/engine copy.py +609 -0
- reviewer_recommendation/engine.py +389 -0
- reviewer_recommendation/enginecomplex.py +609 -0
- reviewer_recommendation/models.py +59 -0
- reviewer_recommendation/searcher copy.py +666 -0
- reviewer_recommendation/searcher.py +1128 -0
- reviewer_recommendation/utils.py +153 -0
reviewer_recommendation/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
审稿人推荐系统
|
| 3 |
+
基于论文信息自动推荐合适的审稿人
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
__version__ = "1.0.0"
|
| 7 |
+
__author__ = "AI Assistant"
|
| 8 |
+
|
| 9 |
+
from .models import PaperInfo, Reviewer, RecommendationRequest, RecommendationResponse, AppState
|
| 10 |
+
from .searcher import AcademicSearcher, DynamicAcademicSearcher, OpenAlexSearcher
|
| 11 |
+
from .engine import LLMRecommendationEngine
|
| 12 |
+
|
| 13 |
+
__all__ = [
|
| 14 |
+
"PaperInfo",
|
| 15 |
+
"Reviewer",
|
| 16 |
+
"RecommendationRequest",
|
| 17 |
+
"RecommendationResponse",
|
| 18 |
+
"AppState",
|
| 19 |
+
"AcademicSearcher",
|
| 20 |
+
"DynamicAcademicSearcher",
|
| 21 |
+
"OpenAlexSearcher",
|
| 22 |
+
"LLMRecommendationEngine"
|
| 23 |
+
]
|
reviewer_recommendation/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (666 Bytes). View file
|
|
|
reviewer_recommendation/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (692 Bytes). View file
|
|
|
reviewer_recommendation/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (697 Bytes). View file
|
|
|
reviewer_recommendation/__pycache__/engine.cpython-310.pyc
ADDED
|
Binary file (11.9 kB). View file
|
|
|
reviewer_recommendation/__pycache__/engine.cpython-312.pyc
ADDED
|
Binary file (14.1 kB). View file
|
|
|
reviewer_recommendation/__pycache__/models.cpython-310.pyc
ADDED
|
Binary file (3.36 kB). View file
|
|
|
reviewer_recommendation/__pycache__/models.cpython-312.pyc
ADDED
|
Binary file (4.2 kB). View file
|
|
|
reviewer_recommendation/__pycache__/models.cpython-313.pyc
ADDED
|
Binary file (4.25 kB). View file
|
|
|
reviewer_recommendation/__pycache__/searcher.cpython-310.pyc
ADDED
|
Binary file (28.1 kB). View file
|
|
|
reviewer_recommendation/__pycache__/searcher.cpython-312.pyc
ADDED
|
Binary file (15.1 kB). View file
|
|
|
reviewer_recommendation/engine copy.py
ADDED
|
@@ -0,0 +1,609 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
推荐引擎模块
|
| 3 |
+
使用LLM分析候选者并推荐合适的审稿人
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import time
|
| 9 |
+
from typing import List, Dict, Any, Optional
|
| 10 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 11 |
+
|
| 12 |
+
from .models import PaperInfo, Reviewer
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# 配置部分
|
| 16 |
+
DASHSCOPE_API_KEY = "sk-564d51ee5ddd4693a86f34750b46b02e"
|
| 17 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class LLMRecommendationEngine:
    """Reviewer recommendation engine driven entirely by a large language model."""

    def __init__(self):
        # Stateless by design: all configuration (API keys) is read from
        # module-level constants at call time.
        pass
|
| 25 |
+
|
| 26 |
+
def analyze_candidates(self, paper: PaperInfo, candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Ask the LLM to rate candidate publications' authors as reviewers for *paper*.

    Returns a list of reviewer dicts (name, affiliation, email, reason,
    relevance_score, expertise_areas, citation_count), enriched with real
    citation counts, sorted by a combined citation/relevance score and
    filtered for conflicts of interest.  Falls back to a heuristic list
    when the LLM reply cannot be parsed as a JSON array.
    """
    system_msg = "你是学术领域专家,擅长评估研究人员与特定论文的匹配度"
    candidates_str = json.dumps(candidates, ensure_ascii=False, indent=2)

    # Build the author / affiliation context passed to the LLM.
    authors_info = ""
    if paper.authors:
        authors_info = f"作者: {', '.join(paper.authors)}"
    if paper.affiliations:
        authors_info += f"\n作者机构: {', '.join(paper.affiliations)}"

    # NOTE(review): the numbered requirements below contain "7." twice —
    # the prompt is a runtime string and is kept verbatim here; renumbering
    # should be done as a deliberate prompt change, not silently.
    prompt = f"""
请分析以下候选文献的作者是否适合评审目标论文,并按适合度排序:

目标论文:
标题: {paper.title}
摘要: {paper.abstract}
关键词: {', '.join(paper.keywords)}
{authors_info}

候选文献列表:
{candidates_str}

分析要求:
1. 为每位通讯作者评估适合度,给出0-1的相关性评分
2. 提取作者的专业领域和研究方向
3. 说明推荐理由(中文 重点介绍作者本人的研究方向)
4. 排除重复作者
5. 严格排除与目标论文作者相同或来自同一机构的人员
6. 按适合度从高到低排序,优先考虑引用量和知名程度
7. 必须返回至少5-10个审稿人,确保有足够的候选人数
7. 如果相关性评分全部低于0.6 则重新再进行一次分析
8. 估算作者的学术论文引用总量(基于机构声誉和研究领域)

请返回JSON数组,每个元素包含:
- name: 作者姓名
- affiliation: 单位
- email: 邮箱(从数据中提取)
- reason: 推荐理由(中文 作者本人的研究方向与目标论文的适配度)
- relevance_score: 相关性评分(0-1)
- expertise_areas: 专业领域列表
- citation_count: 估算的学术论文引用总量

确保输出是纯JSON,不要包含其他内容
"""

    response = self._call_llm_with_retry(prompt.strip(), system_msg, json_output=True)
    if not response:
        return []

    # Clean and parse the JSON response (strips markdown fences etc.).
    cleaned_response = self._clean_json_response(response)
    if not cleaned_response:
        return []

    try:
        result = json.loads(cleaned_response)
        if isinstance(result, list):
            # Fetch real citation counts for every reviewer in parallel.
            enhanced_result = self._add_citations_parallel(result)

            # Combined ranking: 60% weight on citations (normalized by
            # 10,000), 40% on the LLM relevance score.
            def sort_key(x):
                citation_count = x.get('citation_count', '0')
                if isinstance(citation_count, str) and citation_count == "未查询到":
                    citation_score = 0
                else:
                    try:
                        citation_score = int(citation_count) / 10000 * 0.6
                    except (ValueError, TypeError):
                        citation_score = 0
                relevance_score = x.get('relevance_score', 0) * 0.4
                return citation_score + relevance_score

            enhanced_result.sort(key=sort_key, reverse=True)

            # Drop reviewers who share an author or institution with the paper.
            filtered_result = self._filter_reviewers(enhanced_result, paper)
            return filtered_result
        else:
            print("大模型返回的不是JSON数组")
            return self._generate_fallback_reviewers(candidates, paper)
    except json.JSONDecodeError:
        print("无法解析大模型返回的JSON")
        return self._generate_fallback_reviewers(candidates, paper)
|
| 112 |
+
|
| 113 |
+
def _clean_json_response(self, response: str) -> str:
|
| 114 |
+
"""清理大模型返回的JSON响应"""
|
| 115 |
+
if not response:
|
| 116 |
+
return ""
|
| 117 |
+
|
| 118 |
+
# 移除markdown代码块
|
| 119 |
+
if "```json" in response:
|
| 120 |
+
start = response.find("```json") + 7
|
| 121 |
+
end = response.find("```", start)
|
| 122 |
+
if end != -1:
|
| 123 |
+
response = response[start:end]
|
| 124 |
+
elif "```" in response:
|
| 125 |
+
start = response.find("```") + 3
|
| 126 |
+
end = response.find("```", start)
|
| 127 |
+
if end != -1:
|
| 128 |
+
response = response[start:end]
|
| 129 |
+
|
| 130 |
+
# 清理空白字符
|
| 131 |
+
response = response.strip()
|
| 132 |
+
|
| 133 |
+
# 处理多个独立JSON对象的情况
|
| 134 |
+
if response.count('{') > 1:
|
| 135 |
+
# 尝试将多个JSON对象合并为数组
|
| 136 |
+
try:
|
| 137 |
+
# 分割多个JSON对象
|
| 138 |
+
objects = []
|
| 139 |
+
brace_count = 0
|
| 140 |
+
current_obj = ""
|
| 141 |
+
|
| 142 |
+
for char in response:
|
| 143 |
+
current_obj += char
|
| 144 |
+
if char == '{':
|
| 145 |
+
brace_count += 1
|
| 146 |
+
elif char == '}':
|
| 147 |
+
brace_count -= 1
|
| 148 |
+
if brace_count == 0:
|
| 149 |
+
# 一个完整的JSON对象
|
| 150 |
+
obj_str = current_obj.strip()
|
| 151 |
+
if obj_str.startswith('{') and obj_str.endswith('}'):
|
| 152 |
+
try:
|
| 153 |
+
json.loads(obj_str) # 验证JSON格式
|
| 154 |
+
objects.append(obj_str)
|
| 155 |
+
except:
|
| 156 |
+
pass
|
| 157 |
+
current_obj = ""
|
| 158 |
+
|
| 159 |
+
if len(objects) > 1:
|
| 160 |
+
# 合并为JSON数组
|
| 161 |
+
return "[" + ",".join(objects) + "]"
|
| 162 |
+
elif len(objects) == 1:
|
| 163 |
+
return "[" + objects[0] + "]"
|
| 164 |
+
except:
|
| 165 |
+
pass
|
| 166 |
+
|
| 167 |
+
return response
|
| 168 |
+
|
| 169 |
+
def _filter_reviewers(self, reviewers: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
    """Conflict-of-interest filter: drop reviewers who appear to be one of the
    paper's authors or who belong to one of the paper's institutions.

    Matching is fuzzy (see _similar_names / _similar_institutions), so this
    deliberately over-filters rather than risk recommending a co-author.
    """
    filtered_reviewers = []

    # Normalized (stripped, lower-cased) author and affiliation lists.
    paper_authors = [author.strip().lower() for author in paper.authors if author.strip()]
    paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]

    print(f"论文作者: {paper.authors}")
    print(f"论文机构: {paper.affiliations}")
    print(f"开始过滤 {len(reviewers)} 个审稿人...")

    for reviewer in reviewers:
        reviewer_name = reviewer.get("name", "").strip().lower()
        reviewer_affiliation = reviewer.get("affiliation", "").strip().lower()

        print(f"检查审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')})")

        # Same person as any listed paper author?
        is_same_author = any(self._similar_names(reviewer_name, author) for author in paper_authors)

        # Same institution as any listed paper affiliation?
        is_same_institution = any(self._similar_institutions(reviewer_affiliation, aff) for aff in paper_affiliations)

        # Keep the reviewer only when neither conflict applies.
        if not is_same_author and not is_same_institution:
            filtered_reviewers.append(reviewer)
            print(f"保留审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')})")
        else:
            reason = "作者相同" if is_same_author else "机构相同"
            print(f"过滤掉审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')}) - {reason}")

    print(f"过滤完成,保留 {len(filtered_reviewers)} 个审稿人")
    return filtered_reviewers
|
| 203 |
+
|
| 204 |
+
def _similar_names(self, name1: str, name2: str) -> bool:
|
| 205 |
+
"""检查两个姓名是否相似(可能是同一人)"""
|
| 206 |
+
# 简单的相似性检查
|
| 207 |
+
if name1 == name2:
|
| 208 |
+
print(f"姓名完全匹配: '{name1}' == '{name2}'")
|
| 209 |
+
return True
|
| 210 |
+
|
| 211 |
+
# 检查是否包含相同的姓氏
|
| 212 |
+
name1_parts = name1.split()
|
| 213 |
+
name2_parts = name2.split()
|
| 214 |
+
|
| 215 |
+
if name1_parts and name2_parts:
|
| 216 |
+
# 检查姓氏是否相同
|
| 217 |
+
if name1_parts[0] == name2_parts[0]:
|
| 218 |
+
print(f"姓氏匹配: '{name1}' vs '{name2}' - 共同姓氏: {name1_parts[0]}")
|
| 219 |
+
return True
|
| 220 |
+
|
| 221 |
+
return False
|
| 222 |
+
|
| 223 |
+
def _similar_institutions(self, inst1: str, inst2: str) -> bool:
|
| 224 |
+
"""检查两个机构是否相似(可能是同一机构的不同表述)"""
|
| 225 |
+
if inst1 == inst2:
|
| 226 |
+
return True
|
| 227 |
+
|
| 228 |
+
# 过滤掉通用词汇,只保留有意义的机构名称关键词
|
| 229 |
+
def filter_common_words(words):
|
| 230 |
+
common_words = {
|
| 231 |
+
'university', 'college', 'institute', 'department', 'school',
|
| 232 |
+
'center', 'centre', 'laboratory', 'lab', 'of', 'the', 'and',
|
| 233 |
+
'at', 'in', 'for', 'medical', 'medicine', 'science', 'technology'
|
| 234 |
+
}
|
| 235 |
+
return {word for word in words if word not in common_words and len(word) > 2}
|
| 236 |
+
|
| 237 |
+
# 获取有意义的关键词
|
| 238 |
+
inst1_words = filter_common_words(set(inst1.lower().split()))
|
| 239 |
+
inst2_words = filter_common_words(set(inst2.lower().split()))
|
| 240 |
+
|
| 241 |
+
# 如果过滤后没有关键词,使用原始词汇但提高阈值
|
| 242 |
+
if not inst1_words or not inst2_words:
|
| 243 |
+
inst1_words = set(inst1.lower().split())
|
| 244 |
+
inst2_words = set(inst2.lower().split())
|
| 245 |
+
# 提高阈值到80%,减少误判
|
| 246 |
+
threshold = 0.8
|
| 247 |
+
else:
|
| 248 |
+
# 使用有意义关键词,阈值可以相对宽松
|
| 249 |
+
threshold = 0.6
|
| 250 |
+
|
| 251 |
+
# 计算共同词汇比例
|
| 252 |
+
common_words = inst1_words.intersection(inst2_words)
|
| 253 |
+
if not common_words:
|
| 254 |
+
return False
|
| 255 |
+
|
| 256 |
+
similarity_ratio = len(common_words) / min(len(inst1_words), len(inst2_words))
|
| 257 |
+
|
| 258 |
+
# 添加调试日志
|
| 259 |
+
if similarity_ratio >= threshold:
|
| 260 |
+
print(f"机构匹配: '{inst1}' vs '{inst2}' - 相似度: {similarity_ratio:.2f}, 共同词汇: {common_words}")
|
| 261 |
+
|
| 262 |
+
return similarity_ratio >= threshold
|
| 263 |
+
|
| 264 |
+
def _generate_fallback_reviewers(self, candidates: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
    """Build a basic recommendation list when the LLM reply could not be parsed.

    Takes the first 20 candidate publications, keeps each distinct
    corresponding author that passes the conflict-of-interest checks, and
    assigns a fixed relevance score of 0.7 plus a real citation count
    (one network lookup per candidate — can be slow).
    """
    fallback_reviewers = []

    for candidate in candidates[:20]:  # cap at the first 20 candidates
        author = candidate.get("corresponding_author")
        institution = candidate.get("corresponding_institution")

        # Skip missing authors and de-duplicate by name (O(n^2) scan is fine
        # for at most 20 entries).
        if author and author not in [r.get("name") for r in fallback_reviewers]:
            # Conflict-of-interest check, mirroring _filter_reviewers.
            author_lower = author.strip().lower()
            institution_lower = (institution or "").strip().lower()

            paper_authors = [a.strip().lower() for a in paper.authors if a.strip()]
            paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]

            is_same_author = any(self._similar_names(author_lower, pa) for pa in paper_authors)
            is_same_institution = any(self._similar_institutions(institution_lower, pa) for pa in paper_affiliations)

            if not is_same_author and not is_same_institution:
                # Look up the real citation count (network call).
                citation_count = self._get_real_citation_count(author, institution or "未知单位")

                fallback_reviewers.append({
                    "name": author,
                    "affiliation": institution or "未知单位",
                    "email": "未知邮箱",
                    "reason": "基于文献相关性自动推荐",
                    "relevance_score": 0.7,
                    "expertise_areas": ["相关研究领域"],
                    "citation_count": citation_count
                })

    return fallback_reviewers
|
| 298 |
+
|
| 299 |
+
def _call_llm_with_retry(self, prompt: str, system_msg: str, json_output: bool = False, max_retries: int = 3) -> Any:
    """Call the configured LLM with exponential-backoff retries.

    Prefers DashScope when DASHSCOPE_API_KEY is set, otherwise OpenAI when
    OPENAI_API_KEY is set.  Returns the model's text output, or None when
    no key is configured or every attempt fails.

    NOTE(review): when the DashScope call fails its inner try/except, the
    loop falls through and retries immediately (no sleep — backoff only
    applies to exceptions caught by the *outer* handler), and the OpenAI
    branch is never reached while DASHSCOPE_API_KEY is set.  After the loop
    exhausts, the implicit return value is None.
    """
    for attempt in range(max_retries):
        try:
            if DASHSCOPE_API_KEY:
                import dashscope
                dashscope.api_key = DASHSCOPE_API_KEY

                # Inner try gives DashScope-specific error handling.
                try:
                    response = dashscope.Generation.call(
                        model="qwen-turbo",  # chosen for stability
                        messages=[
                            {"role": "system", "content": system_msg},
                            {"role": "user", "content": prompt}
                        ],
                        result_format="json" if json_output else "text",
                        timeout=60  # generous timeout for long prompts
                    )
                    if response.status_code == 200:
                        return response.output.text
                    else:
                        print(f"DashScope API错误: {response.message}")

                except Exception as api_error:
                    print(f"DashScope API调用异常: {str(api_error)}")
                    if "SSL" in str(api_error) or "EOF" in str(api_error):
                        print("检测到SSL连接问题,尝试使用备用方案")
                        # Placeholder: a backup API call could be added here.

            elif OPENAI_API_KEY:
                from openai import OpenAI
                client = OpenAI(api_key=OPENAI_API_KEY)
                response = client.chat.completions.create(
                    model="gpt-3.5-turbo",  # chosen for stability
                    messages=[
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": prompt}
                    ],
                    response_format={"type": "json_object"} if json_output else None,
                    timeout=60
                )
                return response.choices[0].message.content

            else:
                print("未配置API密钥,使用备用方案")
                return None

        except Exception as e:
            print(f"第{attempt + 1}次调用失败: {str(e)}")
            if attempt < max_retries - 1:
                print(f"等待 {2 ** attempt} 秒后重试...")
                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, ...
            else:
                print(f"所有重试都失败了,将使用备用推荐方案")
                return None
|
| 355 |
+
|
| 356 |
+
def _get_real_citation_count(self, name: str, affiliation: str) -> str:
    """Return the author's total citation count as a string.

    Tries OpenAlex first, then Semantic Scholar; returns the Chinese
    sentinel "未查询到" ("not found") when neither source yields a
    positive count or a lookup raises.
    """
    try:
        # Primary source: OpenAlex.
        citation_count = self._get_citation_from_openalex(name, affiliation)
        if citation_count > 0:
            return str(citation_count)

        # Fallback source: Semantic Scholar.
        citation_count = self._get_citation_from_semantic_scholar(name, affiliation)
        if citation_count > 0:
            return str(citation_count)

        # No real data found; callers treat this sentinel specially.
        print(f"未找到 {name} 的引用量数据")
        return "未查询到"

    except Exception as e:
        print(f"获取引用量失败: {str(e)}")
        return "未查询到"
|
| 376 |
+
|
| 377 |
+
def _add_citations_parallel(self, reviewers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Fetch and attach a 'citation_count' for every reviewer, in parallel.

    Uses a 5-worker thread pool (lookups are network-bound).  NOTE: results
    are collected with as_completed, so the returned list's order follows
    task completion, not the input order — the caller re-sorts afterwards.
    """
    print(f"开始并行获取 {len(reviewers)} 个审稿人的引用量...")

    enhanced_reviewers = []

    with ThreadPoolExecutor(max_workers=5) as executor:
        # Submit one citation lookup per reviewer.
        future_to_reviewer = {}
        for reviewer in reviewers:
            name = reviewer.get('name', '')
            affiliation = reviewer.get('affiliation', '')
            future = executor.submit(self._get_real_citation_count, name, affiliation)
            future_to_reviewer[future] = reviewer

        # Collect results as they finish.
        for future in as_completed(future_to_reviewer):
            reviewer = future_to_reviewer[future]
            try:
                citation_count = future.result(timeout=15)  # per-lookup timeout
                reviewer['citation_count'] = citation_count
                enhanced_reviewers.append(reviewer)
                print(f"获取引用量完成: {reviewer.get('name')} - {citation_count}")
            except Exception as e:
                # On failure keep the reviewer but mark the count as unknown.
                print(f"获取引用量失败: {reviewer.get('name')} - {str(e)}")
                reviewer['citation_count'] = "未查询到"
                enhanced_reviewers.append(reviewer)

    print(f"并行引用量获取完成,处理了 {len(enhanced_reviewers)} 个审稿人")
    return enhanced_reviewers
|
| 408 |
+
|
| 409 |
+
def _get_citation_from_openalex(self, name: str, affiliation: str) -> int:
    """Query the OpenAlex authors API for the author's total citation count.

    Tries each name variant in turn; returns 0 when nothing matches or on
    any error.  NOTE(review): loop/return indentation was reconstructed from
    a flattened source — confirm the final `return 0` sits *after* the
    variant loop so all variants are attempted.
    """
    try:
        import requests
        import urllib.parse

        # Try several spellings of the name (middle names dropped, etc.).
        name_variants = self._generate_name_variants(name)

        for variant in name_variants:
            # Keep the query simple: name only.
            query = f'display_name:"{variant}"'
            print(f"OpenAlex查询: {query}")

            url = "https://api.openalex.org/authors"
            params = {
                'search': query,
                'per-page': 5,  # fetch a few results to pick the best match
                'select': 'id,display_name,cited_by_count,affiliations'
            }

            response = requests.get(url, params=params, timeout=15)
            response.raise_for_status()

            data = response.json()
            if data.get('results'):
                # Pick the result that best matches name + affiliation.
                best_match = self._find_best_author_match(data['results'], name, affiliation)
                if best_match:
                    cited_by_count = best_match.get('cited_by_count', 0)
                    print(f"OpenAlex API: {name} 引用量: {cited_by_count}")
                    return cited_by_count

            print(f"OpenAlex API: 未找到 {variant} 的数据")

        return 0

    except Exception as e:
        print(f"OpenAlex API调用失败: {str(e)}")
        return 0
|
| 450 |
+
|
| 451 |
+
def _get_citation_from_semantic_scholar(self, name: str, affiliation: str) -> int:
    """Query the Semantic Scholar author-search API for a citation count.

    Fallback for _get_citation_from_openalex; same variant-loop structure
    and the same "0 on failure" contract.  NOTE(review): indentation was
    reconstructed from a flattened source — confirm the final `return 0`
    follows the variant loop.
    """
    try:
        import requests
        import urllib.parse

        # Try several spellings of the name.
        name_variants = self._generate_name_variants(name)

        for variant in name_variants:
            # Keep the query simple: name only.
            query = variant
            print(f"Semantic Scholar查询: {query}")

            url = "https://api.semanticscholar.org/graph/v1/author/search"
            params = {
                'query': query,
                'limit': 5,  # fetch a few results to pick the best match
                'fields': 'authorId,name,citationCount,affiliations'
            }

            headers = {
                'User-Agent': 'Academic-Reviewer-System/1.0'
            }

            response = requests.get(url, params=params, headers=headers, timeout=15)
            response.raise_for_status()

            data = response.json()
            if data.get('data'):
                # Pick the result that best matches name + affiliation.
                best_match = self._find_best_semantic_author_match(data['data'], name, affiliation)
                if best_match:
                    citation_count = best_match.get('citationCount', 0)
                    print(f"Semantic Scholar API: {name} 引用量: {citation_count}")
                    return citation_count

            print(f"Semantic Scholar API: 未找到 {variant} 的数据")

        return 0

    except Exception as e:
        print(f"Semantic Scholar API调用失败: {str(e)}")
        return 0
|
| 496 |
+
|
| 497 |
+
def _generate_name_variants(self, name: str) -> List[str]:
|
| 498 |
+
"""生成姓名的多种变体"""
|
| 499 |
+
variants = [name] # 原始姓名
|
| 500 |
+
|
| 501 |
+
# 如果包含中间名,尝试不同的组合
|
| 502 |
+
name_parts = name.split()
|
| 503 |
+
if len(name_parts) >= 2:
|
| 504 |
+
# 只使用姓和名
|
| 505 |
+
variants.append(f"{name_parts[0]} {name_parts[-1]}")
|
| 506 |
+
|
| 507 |
+
# 如果有多于2个部分,尝试不同的组合
|
| 508 |
+
if len(name_parts) == 3:
|
| 509 |
+
variants.append(f"{name_parts[0]} {name_parts[1]}")
|
| 510 |
+
variants.append(f"{name_parts[1]} {name_parts[2]}")
|
| 511 |
+
elif len(name_parts) > 3:
|
| 512 |
+
# 对于更复杂的姓名,尝试简化
|
| 513 |
+
variants.append(f"{name_parts[0]} {name_parts[1]}")
|
| 514 |
+
variants.append(f"{name_parts[0]} {name_parts[-1]}")
|
| 515 |
+
|
| 516 |
+
# 去重并保持顺序
|
| 517 |
+
seen = set()
|
| 518 |
+
unique_variants = []
|
| 519 |
+
for variant in variants:
|
| 520 |
+
if variant not in seen:
|
| 521 |
+
seen.add(variant)
|
| 522 |
+
unique_variants.append(variant)
|
| 523 |
+
|
| 524 |
+
return unique_variants
|
| 525 |
+
|
| 526 |
+
def _find_best_author_match(self, authors: List[Dict], target_name: str, target_affiliation: str) -> Optional[Dict]:
|
| 527 |
+
"""在OpenAlex结果中找到最佳匹配的作者"""
|
| 528 |
+
if not authors:
|
| 529 |
+
return None
|
| 530 |
+
|
| 531 |
+
# 如果只有一个结果,直接返回
|
| 532 |
+
if len(authors) == 1:
|
| 533 |
+
return authors[0]
|
| 534 |
+
|
| 535 |
+
# 计算每个作者的匹配分数
|
| 536 |
+
best_match = None
|
| 537 |
+
best_score = 0
|
| 538 |
+
|
| 539 |
+
for author in authors:
|
| 540 |
+
score = 0
|
| 541 |
+
author_name = author.get('display_name', '').lower()
|
| 542 |
+
target_name_lower = target_name.lower()
|
| 543 |
+
|
| 544 |
+
# 姓名匹配分数
|
| 545 |
+
if target_name_lower in author_name or author_name in target_name_lower:
|
| 546 |
+
score += 10
|
| 547 |
+
|
| 548 |
+
# 检查机构匹配
|
| 549 |
+
affiliations = author.get('affiliations', [])
|
| 550 |
+
if affiliations and target_affiliation and target_affiliation != "未知单位":
|
| 551 |
+
for aff in affiliations:
|
| 552 |
+
aff_name = aff.get('display_name', '').lower()
|
| 553 |
+
if target_affiliation.lower() in aff_name:
|
| 554 |
+
score += 5
|
| 555 |
+
break
|
| 556 |
+
|
| 557 |
+
# 引用量作为权重
|
| 558 |
+
citation_count = author.get('cited_by_count', 0)
|
| 559 |
+
if citation_count > 0:
|
| 560 |
+
score += 1
|
| 561 |
+
|
| 562 |
+
if score > best_score:
|
| 563 |
+
best_score = score
|
| 564 |
+
best_match = author
|
| 565 |
+
|
| 566 |
+
return best_match if best_score > 0 else authors[0]
|
| 567 |
+
|
| 568 |
+
def _find_best_semantic_author_match(self, authors: List[Dict], target_name: str, target_affiliation: str) -> Optional[Dict]:
|
| 569 |
+
"""在Semantic Scholar结果中找到最佳匹配的作者"""
|
| 570 |
+
if not authors:
|
| 571 |
+
return None
|
| 572 |
+
|
| 573 |
+
# 如果只有一个结果,直接返回
|
| 574 |
+
if len(authors) == 1:
|
| 575 |
+
return authors[0]
|
| 576 |
+
|
| 577 |
+
# 计算每个作者的匹配分数
|
| 578 |
+
best_match = None
|
| 579 |
+
best_score = 0
|
| 580 |
+
|
| 581 |
+
for author in authors:
|
| 582 |
+
score = 0
|
| 583 |
+
author_name = author.get('name', '').lower()
|
| 584 |
+
target_name_lower = target_name.lower()
|
| 585 |
+
|
| 586 |
+
# 姓名匹配分数
|
| 587 |
+
if target_name_lower in author_name or author_name in target_name_lower:
|
| 588 |
+
score += 10
|
| 589 |
+
|
| 590 |
+
# 检查机构匹配
|
| 591 |
+
affiliations = author.get('affiliations', [])
|
| 592 |
+
if affiliations and target_affiliation and target_affiliation != "未知单位":
|
| 593 |
+
for aff in affiliations:
|
| 594 |
+
aff_name = aff.get('name', '').lower()
|
| 595 |
+
if target_affiliation.lower() in aff_name:
|
| 596 |
+
score += 5
|
| 597 |
+
break
|
| 598 |
+
|
| 599 |
+
# 引用量作为权重
|
| 600 |
+
citation_count = author.get('citationCount', 0)
|
| 601 |
+
if citation_count > 0:
|
| 602 |
+
score += 1
|
| 603 |
+
|
| 604 |
+
if score > best_score:
|
| 605 |
+
best_score = score
|
| 606 |
+
best_match = author
|
| 607 |
+
|
| 608 |
+
return best_match if best_score > 0 else authors[0]
|
| 609 |
+
|
reviewer_recommendation/engine.py
ADDED
|
@@ -0,0 +1,389 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
推荐引擎模块
|
| 3 |
+
使用LLM分析候选者并推荐合适的审稿人
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import time
|
| 9 |
+
from typing import List, Dict, Any, Optional
|
| 10 |
+
|
| 11 |
+
from .models import PaperInfo, Reviewer
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# --- Configuration ---
# SECURITY FIX: a live DashScope key was previously hard-coded on this line and
# committed to version control; that key is compromised and must be rotated.
# Both keys are now read from the environment; "" means "not configured" and
# makes _call_llm_with_retry fall through to the next provider / fallback path.
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class LLMRecommendationEngine:
|
| 20 |
+
"""完全由大模型驱动的审稿人推荐引擎"""
|
| 21 |
+
|
| 22 |
+
def __init__(self):
    # The engine is stateless: API keys and model selection live at module
    # level, so there is nothing to initialise per instance.
    pass
|
| 24 |
+
|
| 25 |
+
def analyze_candidates(self, paper: PaperInfo, candidates: List[Dict[str, Any]], num_reviewers: int = 8) -> List[Dict[str, Any]]:
    """Public entry point: recommend up to *num_reviewers* reviewers for *paper*.

    Logs the incoming candidate count and delegates to the single unified
    prompt-based analysis strategy.
    """
    print(f"候选审稿人: {len(candidates)} 人")
    recommendations = self._analyze_candidates_unified(paper, candidates, num_reviewers)
    return recommendations
|
| 31 |
+
|
| 32 |
+
def _analyze_candidates_unified(self, paper: PaperInfo, candidates: List[Dict[str, Any]], num_reviewers: int = 8) -> List[Dict[str, Any]]:
|
| 33 |
+
"""使用统一提示词分析候选文献"""
|
| 34 |
+
if not candidates:
|
| 35 |
+
return []
|
| 36 |
+
|
| 37 |
+
# 动态传入前端要求的审稿人数量
|
| 38 |
+
print(f"要求返回 {num_reviewers} 个审稿人")
|
| 39 |
+
|
| 40 |
+
print(f"开始过滤全部 {len(candidates)} 个候选审稿人...")
|
| 41 |
+
filtered_candidates = self._filter_all_candidates(candidates, paper)
|
| 42 |
+
print(f"过滤完成,保留 {len(filtered_candidates)} 个候选审稿人")
|
| 43 |
+
|
| 44 |
+
if not filtered_candidates:
|
| 45 |
+
print("过滤后没有候选审稿人,返回空列表")
|
| 46 |
+
return []
|
| 47 |
+
|
| 48 |
+
system_msg = "你是学术领域专家,擅长评估研究人员与特定论文的相关性"
|
| 49 |
+
|
| 50 |
+
# 提取关键字段,只保留审稿人相关信息
|
| 51 |
+
simplified_candidates = [
|
| 52 |
+
{
|
| 53 |
+
"author": candidate.get("corresponding_author", ""),
|
| 54 |
+
"institution": candidate.get("corresponding_institution", ""),
|
| 55 |
+
"title": candidate.get("title", "")
|
| 56 |
+
}
|
| 57 |
+
for candidate in filtered_candidates
|
| 58 |
+
]
|
| 59 |
+
|
| 60 |
+
# 构建过滤后的候选审稿人列表字符串
|
| 61 |
+
candidates_str = json.dumps(simplified_candidates, ensure_ascii=False, indent=2)
|
| 62 |
+
print(f"过滤后的候选审稿人列表: {candidates_str}")
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
prompt = f"""
|
| 66 |
+
你是学术领域专家,擅长评估研究人员与特定论文的相关性
|
| 67 |
+
|
| 68 |
+
请分析以下候选审稿人是否适合评审目标论文,评估相关性:
|
| 69 |
+
|
| 70 |
+
目标论文信息:
|
| 71 |
+
标题: {paper.title}
|
| 72 |
+
摘要: {paper.abstract}
|
| 73 |
+
作者: {', '.join(paper.authors)}
|
| 74 |
+
机构: {', '.join(paper.affiliations)}
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
候选审稿人列表:
|
| 78 |
+
{candidates_str}
|
| 79 |
+
|
| 80 |
+
分析要求:
|
| 81 |
+
1. 为每位审稿人评估与目标论文的相关性,给出0-1的相关性评分
|
| 82 |
+
2. 提取审稿人的专业领域和研究方向
|
| 83 |
+
3. 按relevance_score从高到低排序(desc)
|
| 84 |
+
4. 排除与目标论文作者为合作关系的审稿人
|
| 85 |
+
5. 必须返回至少{num_reviewers}个审稿人
|
| 86 |
+
|
| 87 |
+
请返回JSON数组,每个元素包含:
|
| 88 |
+
- name: 作者姓名
|
| 89 |
+
- affiliation: 单位
|
| 90 |
+
- email: 邮箱(根据作者姓名和单位邮箱后缀构建)
|
| 91 |
+
- reason: 推荐理由(中文 只介绍作者本人的研究方向与目标论文的适配度)
|
| 92 |
+
- relevance_score: 最终评分(0-1)
|
| 93 |
+
- expertise_areas: 专业领域列表
|
| 94 |
+
|
| 95 |
+
确保输出是纯JSON,不要包含其他内容
|
| 96 |
+
"""
|
| 97 |
+
|
| 98 |
+
response = self._call_llm_with_retry(prompt.strip(), system_msg, json_output=True)
|
| 99 |
+
if not response:
|
| 100 |
+
return []
|
| 101 |
+
|
| 102 |
+
# 清理和解析JSON响应
|
| 103 |
+
cleaned_response = self._clean_json_response(response)
|
| 104 |
+
if not cleaned_response:
|
| 105 |
+
return []
|
| 106 |
+
|
| 107 |
+
try:
|
| 108 |
+
result = json.loads(cleaned_response)
|
| 109 |
+
if isinstance(result, list):
|
| 110 |
+
# 按最终评分排序,确保数据类型转换
|
| 111 |
+
def get_score(x):
|
| 112 |
+
score = x.get('relevance_score', 0)
|
| 113 |
+
try:
|
| 114 |
+
return float(score) if score is not None else 0.0
|
| 115 |
+
except (ValueError, TypeError):
|
| 116 |
+
return 0.0
|
| 117 |
+
|
| 118 |
+
result.sort(key=get_score, reverse=True)
|
| 119 |
+
|
| 120 |
+
# 清理机构信息和邮箱信息,确保不为None
|
| 121 |
+
for reviewer in result:
|
| 122 |
+
if reviewer.get("affiliation") is None:
|
| 123 |
+
reviewer["affiliation"] = "未知单位"
|
| 124 |
+
if reviewer.get("email") is None:
|
| 125 |
+
reviewer["email"] = "unknown@example.com"
|
| 126 |
+
|
| 127 |
+
# 候选审稿人已经在前面过滤过了,直接返回LLM分析结果
|
| 128 |
+
print(f"统一分析完成,推荐 {len(result)} 个审稿人")
|
| 129 |
+
return result
|
| 130 |
+
else:
|
| 131 |
+
print("大模型返回的不是JSON数组")
|
| 132 |
+
return []
|
| 133 |
+
except json.JSONDecodeError:
|
| 134 |
+
print("无法解析大模型返回的JSON")
|
| 135 |
+
return []
|
| 136 |
+
|
| 137 |
+
def _clean_json_response(self, response: str) -> str:
|
| 138 |
+
"""清理大模型返回的JSON响应"""
|
| 139 |
+
if not response:
|
| 140 |
+
return ""
|
| 141 |
+
|
| 142 |
+
# 移除markdown代码块
|
| 143 |
+
if "```json" in response:
|
| 144 |
+
start = response.find("```json") + 7
|
| 145 |
+
end = response.find("```", start)
|
| 146 |
+
if end != -1:
|
| 147 |
+
response = response[start:end]
|
| 148 |
+
elif "```" in response:
|
| 149 |
+
start = response.find("```") + 3
|
| 150 |
+
end = response.find("```", start)
|
| 151 |
+
if end != -1:
|
| 152 |
+
response = response[start:end]
|
| 153 |
+
|
| 154 |
+
# 清理空白字符
|
| 155 |
+
response = response.strip()
|
| 156 |
+
|
| 157 |
+
# 处理多个独立JSON对象的情况
|
| 158 |
+
if response.count('{') > 1:
|
| 159 |
+
# 尝试将多个JSON对象合并为数组
|
| 160 |
+
try:
|
| 161 |
+
# 分割多个JSON对象
|
| 162 |
+
objects = []
|
| 163 |
+
brace_count = 0
|
| 164 |
+
current_obj = ""
|
| 165 |
+
|
| 166 |
+
for char in response:
|
| 167 |
+
current_obj += char
|
| 168 |
+
if char == '{':
|
| 169 |
+
brace_count += 1
|
| 170 |
+
elif char == '}':
|
| 171 |
+
brace_count -= 1
|
| 172 |
+
if brace_count == 0:
|
| 173 |
+
# 一个完整的JSON对象
|
| 174 |
+
obj_str = current_obj.strip()
|
| 175 |
+
if obj_str.startswith('{') and obj_str.endswith('}'):
|
| 176 |
+
try:
|
| 177 |
+
json.loads(obj_str) # 验证JSON格式
|
| 178 |
+
objects.append(obj_str)
|
| 179 |
+
except:
|
| 180 |
+
pass
|
| 181 |
+
current_obj = ""
|
| 182 |
+
|
| 183 |
+
if len(objects) > 1:
|
| 184 |
+
# 合并为JSON数组
|
| 185 |
+
return "[" + ",".join(objects) + "]"
|
| 186 |
+
elif len(objects) == 1:
|
| 187 |
+
return "[" + objects[0] + "]"
|
| 188 |
+
except:
|
| 189 |
+
pass
|
| 190 |
+
|
| 191 |
+
return response
|
| 192 |
+
|
| 193 |
+
def _filter_all_candidates(self, candidates: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
|
| 194 |
+
"""过滤所有候选审稿人,排除相同作者和机构,并进行去重"""
|
| 195 |
+
filtered_candidates = []
|
| 196 |
+
seen_reviewers = set() # 用于去重的集合
|
| 197 |
+
|
| 198 |
+
# 获取论文作者和机构的标准化列表
|
| 199 |
+
paper_authors = [author.strip().lower() for author in paper.authors if author.strip()]
|
| 200 |
+
paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]
|
| 201 |
+
|
| 202 |
+
print(f"论文作者: {paper.authors}")
|
| 203 |
+
print(f"论文机构: {paper.affiliations}")
|
| 204 |
+
|
| 205 |
+
for candidate in candidates:
|
| 206 |
+
# 提取关键字段,只处理必要信息
|
| 207 |
+
author = candidate.get('corresponding_author', '')
|
| 208 |
+
institution = candidate.get('corresponding_institution', '')
|
| 209 |
+
|
| 210 |
+
if not author:
|
| 211 |
+
continue
|
| 212 |
+
|
| 213 |
+
reviewer_name = author.strip().lower()
|
| 214 |
+
reviewer_affiliation = (institution or "").strip().lower()
|
| 215 |
+
|
| 216 |
+
# print(f"检查候选审稿人: {author} ({institution})")
|
| 217 |
+
|
| 218 |
+
# 检查是否与论文作者相同
|
| 219 |
+
is_same_author = any(self._similar_names(reviewer_name, author) for author in paper_authors)
|
| 220 |
+
|
| 221 |
+
# 检查是否来自同一机构
|
| 222 |
+
is_same_institution = any(self._similar_institutions(reviewer_affiliation, aff) for aff in paper_affiliations)
|
| 223 |
+
|
| 224 |
+
# 如果既不是相同作者也不是同一机构,则进行去重检查
|
| 225 |
+
if not is_same_author and not is_same_institution:
|
| 226 |
+
# 创建审稿人标识符用于去重
|
| 227 |
+
reviewer_key = f"{reviewer_name}_{reviewer_affiliation}"
|
| 228 |
+
|
| 229 |
+
if reviewer_key not in seen_reviewers:
|
| 230 |
+
seen_reviewers.add(reviewer_key)
|
| 231 |
+
filtered_candidates.append(candidate)
|
| 232 |
+
# print(f"保留候选审稿人: {author} ({institution})")
|
| 233 |
+
else:
|
| 234 |
+
print(f"跳过重复候选审稿人: {author} ({institution})")
|
| 235 |
+
else:
|
| 236 |
+
reason = "作者相同" if is_same_author else "机构相同"
|
| 237 |
+
print(f"过滤掉候选审稿人: {author} ({institution}) - {reason}")
|
| 238 |
+
|
| 239 |
+
print(f"去重完成,最终保留 {len(filtered_candidates)} 个候选审稿人")
|
| 240 |
+
return filtered_candidates
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def _similar_names(self, name1: str, name2: str) -> bool:
|
| 244 |
+
"""检查两个姓名是否相似(可能是同一人)"""
|
| 245 |
+
# 简单的相似性检查
|
| 246 |
+
if name1 == name2:
|
| 247 |
+
print(f"姓名完全匹配: '{name1}' == '{name2}'")
|
| 248 |
+
return True
|
| 249 |
+
|
| 250 |
+
# 检查是否包含相同的姓氏
|
| 251 |
+
name1_parts = name1.split()
|
| 252 |
+
name2_parts = name2.split()
|
| 253 |
+
|
| 254 |
+
if name1_parts and name2_parts:
|
| 255 |
+
# 检查姓氏是否相同
|
| 256 |
+
if name1_parts[0] == name2_parts[0]:
|
| 257 |
+
print(f"姓氏匹配: '{name1}' vs '{name2}' - 共同姓氏: {name1_parts[0]}")
|
| 258 |
+
return True
|
| 259 |
+
|
| 260 |
+
return False
|
| 261 |
+
|
| 262 |
+
def _similar_institutions(self, inst1: str, inst2: str) -> bool:
|
| 263 |
+
"""检查两个机构是否相似(可能是同一机构的不同表述)"""
|
| 264 |
+
if inst1 == inst2:
|
| 265 |
+
return True
|
| 266 |
+
|
| 267 |
+
# 过滤掉通用词汇,只保留有意义的机构名称关键词
|
| 268 |
+
def filter_common_words(words):
|
| 269 |
+
common_words = {
|
| 270 |
+
'university', 'college', 'institute', 'department', 'school',
|
| 271 |
+
'center', 'centre', 'laboratory', 'lab', 'of', 'the', 'and',
|
| 272 |
+
'at', 'in', 'for', 'medical', 'medicine', 'science', 'technology'
|
| 273 |
+
}
|
| 274 |
+
return {word for word in words if word not in common_words and len(word) > 2}
|
| 275 |
+
|
| 276 |
+
# 获取有意义的关键词
|
| 277 |
+
inst1_words = filter_common_words(set(inst1.lower().split()))
|
| 278 |
+
inst2_words = filter_common_words(set(inst2.lower().split()))
|
| 279 |
+
|
| 280 |
+
# 如果过滤后没有关键词,使用原始词汇但提高阈值
|
| 281 |
+
if not inst1_words or not inst2_words:
|
| 282 |
+
inst1_words = set(inst1.lower().split())
|
| 283 |
+
inst2_words = set(inst2.lower().split())
|
| 284 |
+
# 提高阈值到80%,减少误判
|
| 285 |
+
threshold = 0.8
|
| 286 |
+
else:
|
| 287 |
+
# 使用有意义关键词,阈值可以相对宽松
|
| 288 |
+
threshold = 0.6
|
| 289 |
+
|
| 290 |
+
# 计算共同词汇比例
|
| 291 |
+
common_words = inst1_words.intersection(inst2_words)
|
| 292 |
+
if not common_words:
|
| 293 |
+
return False
|
| 294 |
+
|
| 295 |
+
similarity_ratio = len(common_words) / min(len(inst1_words), len(inst2_words))
|
| 296 |
+
|
| 297 |
+
# 添加调试日志
|
| 298 |
+
if similarity_ratio >= threshold:
|
| 299 |
+
print(f"机构匹配: '{inst1}' vs '{inst2}' - 相似度: {similarity_ratio:.2f}, 共同词汇: {common_words}")
|
| 300 |
+
|
| 301 |
+
return similarity_ratio >= threshold
|
| 302 |
+
|
| 303 |
+
def _generate_fallback_reviewers(self, candidates: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
|
| 304 |
+
"""当LLM解析失败时,生成基础推荐"""
|
| 305 |
+
fallback_reviewers = []
|
| 306 |
+
|
| 307 |
+
for candidate in candidates[:20]: # 取前20个候选
|
| 308 |
+
author = candidate.get("corresponding_author")
|
| 309 |
+
institution = candidate.get("corresponding_institution")
|
| 310 |
+
|
| 311 |
+
if author and author not in [r.get("name") for r in fallback_reviewers]:
|
| 312 |
+
# 检查是否与论文作者或机构相同
|
| 313 |
+
author_lower = (author or "").strip().lower()
|
| 314 |
+
institution_lower = (institution or "").strip().lower()
|
| 315 |
+
|
| 316 |
+
paper_authors = [a.strip().lower() for a in paper.authors if a.strip()]
|
| 317 |
+
paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]
|
| 318 |
+
|
| 319 |
+
is_same_author = any(self._similar_names(author_lower, pa) for pa in paper_authors)
|
| 320 |
+
is_same_institution = any(self._similar_institutions(institution_lower, pa) for pa in paper_affiliations)
|
| 321 |
+
|
| 322 |
+
if not is_same_author and not is_same_institution:
|
| 323 |
+
fallback_reviewers.append({
|
| 324 |
+
"name": author,
|
| 325 |
+
"affiliation": institution or "未知单位",
|
| 326 |
+
"email": "未知邮箱",
|
| 327 |
+
"reason": "基于文献相关性自动推荐",
|
| 328 |
+
"relevance_score": 0.7,
|
| 329 |
+
"expertise_areas": ["相关研究领域"]
|
| 330 |
+
})
|
| 331 |
+
|
| 332 |
+
return fallback_reviewers
|
| 333 |
+
|
| 334 |
+
def _call_llm_with_retry(self, prompt: str, system_msg: str, json_output: bool = False, max_retries: int = 3) -> Any:
    """Call the configured LLM with exponential-backoff retries.

    Prefers DashScope when DASHSCOPE_API_KEY is set, otherwise OpenAI; returns
    the raw text reply, or None when no key is configured or all attempts fail.
    """
    for attempt in range(max_retries):
        try:
            if DASHSCOPE_API_KEY:
                import dashscope
                dashscope.api_key = DASHSCOPE_API_KEY

                # Longer timeout plus explicit handling of API-level failures.
                try:
                    response = dashscope.Generation.call(
                        model="qwen-turbo-latest",  # more stable model choice
                        messages=[
                            {"role": "system", "content": system_msg},
                            {"role": "user", "content": prompt}
                        ],
                        result_format="json" if json_output else "text",
                        timeout=60  # extended timeout
                    )
                    if response.status_code == 200:
                        return response.output.text
                    else:
                        # Non-200: log and let the outer loop retry.
                        # NOTE(review): this path retries immediately, without
                        # the backoff sleep used on exceptions — confirm intended.
                        print(f"DashScope API错误: {response.message}")

                except Exception as api_error:
                    print(f"DashScope API调用异常: {str(api_error)}")
                    if "SSL" in str(api_error) or "EOF" in str(api_error):
                        print("检测到SSL连接问题,尝试使用备用方案")
                        # A secondary/backup API call could be added here.

            elif OPENAI_API_KEY:
                from openai import OpenAI
                client = OpenAI(api_key=OPENAI_API_KEY)
                response = client.chat.completions.create(
                    model="gpt-3.5-turbo",  # more stable model choice
                    messages=[
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": prompt}
                    ],
                    response_format={"type": "json_object"} if json_output else None,
                    timeout=60
                )
                return response.choices[0].message.content

            else:
                # No provider configured; caller falls back to rule-based output.
                print("未配置API密钥,使用备用方案")
                return None

        except Exception as e:
            print(f"第{attempt + 1}次调用失败: {str(e)}")
            if attempt < max_retries - 1:
                print(f"等待 {2 ** attempt} 秒后重试...")
                time.sleep(2 ** attempt)  # exponential backoff
            else:
                print(f"所有重试都失败了,将使用备用推荐方案")
                return None
|
reviewer_recommendation/enginecomplex.py
ADDED
|
@@ -0,0 +1,609 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
推荐引擎模块
|
| 3 |
+
使用LLM分析候选者并推荐合适的审稿人
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import time
|
| 9 |
+
from typing import List, Dict, Any, Optional
|
| 10 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 11 |
+
|
| 12 |
+
from .models import PaperInfo, Reviewer
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# --- Configuration ---
# SECURITY FIX: a live DashScope key was previously hard-coded on this line and
# committed to version control; that key is compromised and must be rotated.
# Both keys are now read from the environment; "" means "not configured" and
# makes _call_llm_with_retry fall through to the next provider / fallback path.
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class LLMRecommendationEngine:
|
| 21 |
+
"""完全由大模型驱动的审稿人推荐引擎"""
|
| 22 |
+
|
| 23 |
+
def __init__(self):
    # The engine is stateless: API keys and model selection live at module
    # level, so there is nothing to initialise per instance.
    pass
|
| 25 |
+
|
| 26 |
+
def analyze_candidates(self, paper: PaperInfo, candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Score candidate papers' corresponding authors as reviewers for *paper*.

    Asks the LLM for scored recommendations, enriches them with real citation
    counts (fetched in parallel), sorts by a weighted citation+relevance key,
    and filters out conflicts of interest.  Falls back to a rule-based list
    when the LLM reply cannot be parsed.

    FIX: the original prompt numbered two different requirements both as "7.",
    giving the model an ambiguous instruction list; renumbered to 7/8/9.
    """
    system_msg = "你是学术领域专家,擅长评估研究人员与特定论文的匹配度"
    candidates_str = json.dumps(candidates, ensure_ascii=False, indent=2)

    # Author/affiliation context for the conflict-of-interest instruction.
    authors_info = ""
    if paper.authors:
        authors_info = f"作者: {', '.join(paper.authors)}"
    if paper.affiliations:
        authors_info += f"\n作者机构: {', '.join(paper.affiliations)}"

    prompt = f"""
请分析以下候选文献的作者是否适合评审目标论文,并按适合度排序:

目标论文:
标题: {paper.title}
摘要: {paper.abstract}
关键词: {', '.join(paper.keywords)}
{authors_info}

候选文献列表:
{candidates_str}

分析要求:
1. 为每位通讯作者评估适合度,给出0-1的相关性评分
2. 提取作者的专业领域和研究方向
3. 说明推荐理由(中文 重点介绍作者本人的研究方向)
4. 排除重复作者
5. 严格排除与目标论文作者相同或来自同一机构的人员
6. 按适合度从高到低排序,优先考虑引用量和知名程度
7. 必须返回至少5-10个审稿人,确保有足够的候选人数
8. 如果相关性评分全部低于0.6 则重新再进行一次分析
9. 估算作者的学术论文引用总量(基于机构声誉和研究领域)

请返回JSON数组,每个元素包含:
- name: 作者姓名
- affiliation: 单位
- email: 邮箱(从数据中提取)
- reason: 推荐理由(中文 作者本人的研究方向与目标论文的适配度)
- relevance_score: 相关性评分(0-1)
- expertise_areas: 专业领域列表
- citation_count: 估算的学术论文引用总量

确保输出是纯JSON,不要包含其他内容
"""

    response = self._call_llm_with_retry(prompt.strip(), system_msg, json_output=True)
    if not response:
        return []

    cleaned_response = self._clean_json_response(response)
    if not cleaned_response:
        return []

    try:
        result = json.loads(cleaned_response)
        if isinstance(result, list):
            # Attach real citation counts (parallel lookups).
            enhanced_result = self._add_citations_parallel(result)

            # Weighted ranking: 60% citations (scaled by 10k), 40% relevance.
            def sort_key(x):
                citation_count = x.get('citation_count', '0')
                if isinstance(citation_count, str) and citation_count == "未查询到":
                    citation_score = 0
                else:
                    try:
                        citation_score = int(citation_count) / 10000 * 0.6
                    except (ValueError, TypeError):
                        citation_score = 0
                relevance_score = x.get('relevance_score', 0) * 0.4
                return citation_score + relevance_score

            enhanced_result.sort(key=sort_key, reverse=True)

            # Remove same-author / same-institution conflicts.
            return self._filter_reviewers(enhanced_result, paper)
        else:
            print("大模型返回的不是JSON数组")
            return self._generate_fallback_reviewers(candidates, paper)
    except json.JSONDecodeError:
        print("无法解析大模型返回的JSON")
        return self._generate_fallback_reviewers(candidates, paper)
|
| 112 |
+
|
| 113 |
+
def _clean_json_response(self, response: str) -> str:
|
| 114 |
+
"""清理大模型返回的JSON响应"""
|
| 115 |
+
if not response:
|
| 116 |
+
return ""
|
| 117 |
+
|
| 118 |
+
# 移除markdown代码块
|
| 119 |
+
if "```json" in response:
|
| 120 |
+
start = response.find("```json") + 7
|
| 121 |
+
end = response.find("```", start)
|
| 122 |
+
if end != -1:
|
| 123 |
+
response = response[start:end]
|
| 124 |
+
elif "```" in response:
|
| 125 |
+
start = response.find("```") + 3
|
| 126 |
+
end = response.find("```", start)
|
| 127 |
+
if end != -1:
|
| 128 |
+
response = response[start:end]
|
| 129 |
+
|
| 130 |
+
# 清理空白字符
|
| 131 |
+
response = response.strip()
|
| 132 |
+
|
| 133 |
+
# 处理多个独立JSON对象的情况
|
| 134 |
+
if response.count('{') > 1:
|
| 135 |
+
# 尝试将多个JSON对象合并为数组
|
| 136 |
+
try:
|
| 137 |
+
# 分割多个JSON对象
|
| 138 |
+
objects = []
|
| 139 |
+
brace_count = 0
|
| 140 |
+
current_obj = ""
|
| 141 |
+
|
| 142 |
+
for char in response:
|
| 143 |
+
current_obj += char
|
| 144 |
+
if char == '{':
|
| 145 |
+
brace_count += 1
|
| 146 |
+
elif char == '}':
|
| 147 |
+
brace_count -= 1
|
| 148 |
+
if brace_count == 0:
|
| 149 |
+
# 一个完整的JSON对象
|
| 150 |
+
obj_str = current_obj.strip()
|
| 151 |
+
if obj_str.startswith('{') and obj_str.endswith('}'):
|
| 152 |
+
try:
|
| 153 |
+
json.loads(obj_str) # 验证JSON格式
|
| 154 |
+
objects.append(obj_str)
|
| 155 |
+
except:
|
| 156 |
+
pass
|
| 157 |
+
current_obj = ""
|
| 158 |
+
|
| 159 |
+
if len(objects) > 1:
|
| 160 |
+
# 合并为JSON数组
|
| 161 |
+
return "[" + ",".join(objects) + "]"
|
| 162 |
+
elif len(objects) == 1:
|
| 163 |
+
return "[" + objects[0] + "]"
|
| 164 |
+
except:
|
| 165 |
+
pass
|
| 166 |
+
|
| 167 |
+
return response
|
| 168 |
+
|
| 169 |
+
def _filter_reviewers(self, reviewers: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
|
| 170 |
+
"""过滤掉与论文作者相同或来自同一机构的审稿人"""
|
| 171 |
+
filtered_reviewers = []
|
| 172 |
+
|
| 173 |
+
# 获取论文作者和机构的标准化列表
|
| 174 |
+
paper_authors = [author.strip().lower() for author in paper.authors if author.strip()]
|
| 175 |
+
paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]
|
| 176 |
+
|
| 177 |
+
print(f"论文作者: {paper.authors}")
|
| 178 |
+
print(f"论文机构: {paper.affiliations}")
|
| 179 |
+
print(f"开始过滤 {len(reviewers)} 个审稿人...")
|
| 180 |
+
|
| 181 |
+
for reviewer in reviewers:
|
| 182 |
+
reviewer_name = reviewer.get("name", "").strip().lower()
|
| 183 |
+
reviewer_affiliation = reviewer.get("affiliation", "").strip().lower()
|
| 184 |
+
|
| 185 |
+
print(f"检查审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')})")
|
| 186 |
+
|
| 187 |
+
# 检查是否与论文作者相同
|
| 188 |
+
is_same_author = any(self._similar_names(reviewer_name, author) for author in paper_authors)
|
| 189 |
+
|
| 190 |
+
# 检查是否来自同一机构
|
| 191 |
+
is_same_institution = any(self._similar_institutions(reviewer_affiliation, aff) for aff in paper_affiliations)
|
| 192 |
+
|
| 193 |
+
# 如果既不是相同作者也不是同一机构,则保留
|
| 194 |
+
if not is_same_author and not is_same_institution:
|
| 195 |
+
filtered_reviewers.append(reviewer)
|
| 196 |
+
print(f"保留审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')})")
|
| 197 |
+
else:
|
| 198 |
+
reason = "作者相同" if is_same_author else "机构相同"
|
| 199 |
+
print(f"过滤掉审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')}) - {reason}")
|
| 200 |
+
|
| 201 |
+
print(f"过滤完成,保留 {len(filtered_reviewers)} 个审稿人")
|
| 202 |
+
return filtered_reviewers
|
| 203 |
+
|
| 204 |
+
def _similar_names(self, name1: str, name2: str) -> bool:
|
| 205 |
+
"""检查两个姓名是否相似(可能是同一人)"""
|
| 206 |
+
# 简单的相似性检查
|
| 207 |
+
if name1 == name2:
|
| 208 |
+
print(f"姓名完全匹配: '{name1}' == '{name2}'")
|
| 209 |
+
return True
|
| 210 |
+
|
| 211 |
+
# 检查是否包含相同的姓氏
|
| 212 |
+
name1_parts = name1.split()
|
| 213 |
+
name2_parts = name2.split()
|
| 214 |
+
|
| 215 |
+
if name1_parts and name2_parts:
|
| 216 |
+
# 检查姓氏是否相同
|
| 217 |
+
if name1_parts[0] == name2_parts[0]:
|
| 218 |
+
print(f"姓氏匹配: '{name1}' vs '{name2}' - 共同姓氏: {name1_parts[0]}")
|
| 219 |
+
return True
|
| 220 |
+
|
| 221 |
+
return False
|
| 222 |
+
|
| 223 |
+
def _similar_institutions(self, inst1: str, inst2: str) -> bool:
|
| 224 |
+
"""检查两个机构是否相似(可能是同一机构的不同表述)"""
|
| 225 |
+
if inst1 == inst2:
|
| 226 |
+
return True
|
| 227 |
+
|
| 228 |
+
# 过滤掉通用词汇,只保留有意义的机构名称关键词
|
| 229 |
+
def filter_common_words(words):
|
| 230 |
+
common_words = {
|
| 231 |
+
'university', 'college', 'institute', 'department', 'school',
|
| 232 |
+
'center', 'centre', 'laboratory', 'lab', 'of', 'the', 'and',
|
| 233 |
+
'at', 'in', 'for', 'medical', 'medicine', 'science', 'technology'
|
| 234 |
+
}
|
| 235 |
+
return {word for word in words if word not in common_words and len(word) > 2}
|
| 236 |
+
|
| 237 |
+
# 获取有意义的关键词
|
| 238 |
+
inst1_words = filter_common_words(set(inst1.lower().split()))
|
| 239 |
+
inst2_words = filter_common_words(set(inst2.lower().split()))
|
| 240 |
+
|
| 241 |
+
# 如果过滤后没有关键词,使用原始词汇但提高阈值
|
| 242 |
+
if not inst1_words or not inst2_words:
|
| 243 |
+
inst1_words = set(inst1.lower().split())
|
| 244 |
+
inst2_words = set(inst2.lower().split())
|
| 245 |
+
# 提高阈值到80%,减少误判
|
| 246 |
+
threshold = 0.8
|
| 247 |
+
else:
|
| 248 |
+
# 使用有意义关键词,阈值可以相对宽松
|
| 249 |
+
threshold = 0.6
|
| 250 |
+
|
| 251 |
+
# 计算共同词汇比例
|
| 252 |
+
common_words = inst1_words.intersection(inst2_words)
|
| 253 |
+
if not common_words:
|
| 254 |
+
return False
|
| 255 |
+
|
| 256 |
+
similarity_ratio = len(common_words) / min(len(inst1_words), len(inst2_words))
|
| 257 |
+
|
| 258 |
+
# 添加调试日志
|
| 259 |
+
if similarity_ratio >= threshold:
|
| 260 |
+
print(f"机构匹配: '{inst1}' vs '{inst2}' - 相似度: {similarity_ratio:.2f}, 共同词汇: {common_words}")
|
| 261 |
+
|
| 262 |
+
return similarity_ratio >= threshold
|
| 263 |
+
|
| 264 |
+
def _generate_fallback_reviewers(self, candidates: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
|
| 265 |
+
"""当LLM解析失败时,生成基础推荐"""
|
| 266 |
+
fallback_reviewers = []
|
| 267 |
+
|
| 268 |
+
for candidate in candidates[:20]: # 取前20个候选
|
| 269 |
+
author = candidate.get("corresponding_author")
|
| 270 |
+
institution = candidate.get("corresponding_institution")
|
| 271 |
+
|
| 272 |
+
if author and author not in [r.get("name") for r in fallback_reviewers]:
|
| 273 |
+
# 检查是否与论文作者或机构相同
|
| 274 |
+
author_lower = author.strip().lower()
|
| 275 |
+
institution_lower = (institution or "").strip().lower()
|
| 276 |
+
|
| 277 |
+
paper_authors = [a.strip().lower() for a in paper.authors if a.strip()]
|
| 278 |
+
paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]
|
| 279 |
+
|
| 280 |
+
is_same_author = any(self._similar_names(author_lower, pa) for pa in paper_authors)
|
| 281 |
+
is_same_institution = any(self._similar_institutions(institution_lower, pa) for pa in paper_affiliations)
|
| 282 |
+
|
| 283 |
+
if not is_same_author and not is_same_institution:
|
| 284 |
+
# 获取真实引用量
|
| 285 |
+
citation_count = self._get_real_citation_count(author, institution or "未知单位")
|
| 286 |
+
|
| 287 |
+
fallback_reviewers.append({
|
| 288 |
+
"name": author,
|
| 289 |
+
"affiliation": institution or "未知单位",
|
| 290 |
+
"email": "未知邮箱",
|
| 291 |
+
"reason": "基于文献相关性自动推荐",
|
| 292 |
+
"relevance_score": 0.7,
|
| 293 |
+
"expertise_areas": ["相关研究领域"],
|
| 294 |
+
"citation_count": citation_count
|
| 295 |
+
})
|
| 296 |
+
|
| 297 |
+
return fallback_reviewers
|
| 298 |
+
|
| 299 |
+
def _call_llm_with_retry(self, prompt: str, system_msg: str, json_output: bool = False, max_retries: int = 3) -> Any:
    """Call the configured LLM with exponential-backoff retries.

    Prefers DashScope when DASHSCOPE_API_KEY is set, otherwise OpenAI; returns
    the raw text reply, or None when no key is configured or all attempts fail.
    """
    for attempt in range(max_retries):
        try:
            if DASHSCOPE_API_KEY:
                import dashscope
                dashscope.api_key = DASHSCOPE_API_KEY

                # Longer timeout plus explicit handling of API-level failures.
                try:
                    response = dashscope.Generation.call(
                        model="qwen-turbo",  # more stable model choice
                        messages=[
                            {"role": "system", "content": system_msg},
                            {"role": "user", "content": prompt}
                        ],
                        result_format="json" if json_output else "text",
                        timeout=60  # extended timeout
                    )
                    if response.status_code == 200:
                        return response.output.text
                    else:
                        # Non-200: log and let the outer loop retry.
                        # NOTE(review): this path retries immediately, without
                        # the backoff sleep used on exceptions — confirm intended.
                        print(f"DashScope API错误: {response.message}")

                except Exception as api_error:
                    print(f"DashScope API调用异常: {str(api_error)}")
                    if "SSL" in str(api_error) or "EOF" in str(api_error):
                        print("检测到SSL连接问题,尝试使用备用方案")
                        # A secondary/backup API call could be added here.

            elif OPENAI_API_KEY:
                from openai import OpenAI
                client = OpenAI(api_key=OPENAI_API_KEY)
                response = client.chat.completions.create(
                    model="gpt-3.5-turbo",  # more stable model choice
                    messages=[
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": prompt}
                    ],
                    response_format={"type": "json_object"} if json_output else None,
                    timeout=60
                )
                return response.choices[0].message.content

            else:
                # No provider configured; caller falls back to rule-based output.
                print("未配置API密钥,使用备用方案")
                return None

        except Exception as e:
            print(f"第{attempt + 1}次调用失败: {str(e)}")
            if attempt < max_retries - 1:
                print(f"等待 {2 ** attempt} 秒后重试...")
                time.sleep(2 ** attempt)  # exponential backoff
            else:
                print(f"所有重试都失败了,将使用备用推荐方案")
                return None
|
| 355 |
+
|
| 356 |
+
def _get_real_citation_count(self, name: str, affiliation: str) -> str:
|
| 357 |
+
"""获取作者的真实学术论文引用总量"""
|
| 358 |
+
try:
|
| 359 |
+
# 首先尝试OpenAlex API
|
| 360 |
+
citation_count = self._get_citation_from_openalex(name, affiliation)
|
| 361 |
+
if citation_count > 0:
|
| 362 |
+
return str(citation_count)
|
| 363 |
+
|
| 364 |
+
# 备用方案:Semantic Scholar API
|
| 365 |
+
citation_count = self._get_citation_from_semantic_scholar(name, affiliation)
|
| 366 |
+
if citation_count > 0:
|
| 367 |
+
return str(citation_count)
|
| 368 |
+
|
| 369 |
+
# 如果没有找到真实数据,返回"未查询到"
|
| 370 |
+
print(f"未找到 {name} 的引用量数据")
|
| 371 |
+
return "未查询到"
|
| 372 |
+
|
| 373 |
+
except Exception as e:
|
| 374 |
+
print(f"获取引用量失败: {str(e)}")
|
| 375 |
+
return "未查询到"
|
| 376 |
+
|
| 377 |
+
def _add_citations_parallel(self, reviewers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Fetch citation counts for all reviewers in parallel.

    Each reviewer dict gets a 'citation_count' key set in place (the value
    "未查询到" on failure/timeout).

    Fix: the previous implementation appended results in *completion* order
    (via as_completed), which scrambled the reviewer ranking produced
    upstream. Results are now collected in submission order, so the returned
    list preserves the input order.
    """
    print(f"开始并行获取 {len(reviewers)} 个审稿人的引用量...")

    futures = []
    # Thread pool because the lookups are network-bound.
    with ThreadPoolExecutor(max_workers=5) as executor:
        # Submit one citation lookup per reviewer, keeping submission order.
        for reviewer in reviewers:
            name = reviewer.get('name', '')
            affiliation = reviewer.get('affiliation', '')
            futures.append(
                executor.submit(self._get_real_citation_count, name, affiliation)
            )

        # Collect results in submission order so the ranking is preserved.
        for reviewer, future in zip(reviewers, futures):
            try:
                citation_count = future.result(timeout=15)  # 15s per lookup
                reviewer['citation_count'] = citation_count
                print(f"获取引用量完成: {reviewer.get('name')} - {citation_count}")
            except Exception as e:
                print(f"获取引用量失败: {reviewer.get('name')} - {str(e)}")
                reviewer['citation_count'] = "未查询到"

    print(f"并行引用量获取完成,处理了 {len(reviewers)} 个审稿人")
    return reviewers
|
| 408 |
+
|
| 409 |
+
def _get_citation_from_openalex(self, name: str, affiliation: str) -> int:
    """Look up an author's total citation count via the OpenAlex API.

    Iterates the name variants produced by _generate_name_variants; for the
    first variant whose search returns candidates, the best candidate
    (matched on name and, when available, affiliation) supplies
    ``cited_by_count``. Returns 0 when no match is found or on any error.
    """
    try:
        import requests
        import urllib.parse

        # Generate several name spellings to improve the hit rate.
        name_variants = self._generate_name_variants(name)

        for variant in name_variants:
            # Keep the query simple: search by display name only.
            query = f'display_name:"{variant}"'
            print(f"OpenAlex查询: {query}")

            # OpenAlex authors endpoint.
            url = "https://api.openalex.org/authors"
            params = {
                'search': query,
                'per-page': 5,  # fetch a few candidates for disambiguation
                'select': 'id,display_name,cited_by_count,affiliations'
            }

            response = requests.get(url, params=params, timeout=15)
            response.raise_for_status()

            data = response.json()
            if data.get('results'):
                # Pick the candidate that best matches name + affiliation.
                best_match = self._find_best_author_match(data['results'], name, affiliation)
                if best_match:
                    cited_by_count = best_match.get('cited_by_count', 0)
                    print(f"OpenAlex API: {name} 引用量: {cited_by_count}")
                    return cited_by_count

            print(f"OpenAlex API: 未找到 {variant} 的数据")

        # NOTE(review): any request error aborts the remaining variants
        # because the try wraps the whole loop — confirm that is intended.
        return 0

    except Exception as e:
        print(f"OpenAlex API调用失败: {str(e)}")
        return 0
|
| 450 |
+
|
| 451 |
+
def _get_citation_from_semantic_scholar(self, name: str, affiliation: str) -> int:
    """Look up an author's total citation count via the Semantic Scholar API.

    Mirrors _get_citation_from_openalex: iterates name variants, queries the
    author-search endpoint, and scores candidates on name/affiliation to
    pick the best ``citationCount``. Returns 0 when nothing matches or a
    request fails.
    """
    try:
        import requests
        import urllib.parse

        # Generate several name spellings to improve the hit rate.
        name_variants = self._generate_name_variants(name)

        for variant in name_variants:
            # Keep the query simple: just the author name.
            query = variant
            print(f"Semantic Scholar查询: {query}")

            # Semantic Scholar Graph API author-search endpoint.
            url = "https://api.semanticscholar.org/graph/v1/author/search"
            params = {
                'query': query,
                'limit': 5,  # fetch a few candidates for disambiguation
                'fields': 'authorId,name,citationCount,affiliations'
            }

            headers = {
                'User-Agent': 'Academic-Reviewer-System/1.0'
            }

            response = requests.get(url, params=params, headers=headers, timeout=15)
            response.raise_for_status()

            data = response.json()
            if data.get('data'):
                # Pick the candidate that best matches name + affiliation.
                best_match = self._find_best_semantic_author_match(data['data'], name, affiliation)
                if best_match:
                    citation_count = best_match.get('citationCount', 0)
                    print(f"Semantic Scholar API: {name} 引用量: {citation_count}")
                    return citation_count

            print(f"Semantic Scholar API: 未找到 {variant} 的数据")

        # NOTE(review): any request error aborts the remaining variants
        # because the try wraps the whole loop — confirm that is intended.
        return 0

    except Exception as e:
        print(f"Semantic Scholar API调用失败: {str(e)}")
        return 0
|
| 496 |
+
|
| 497 |
+
def _generate_name_variants(self, name: str) -> List[str]:
    """Build an ordered, de-duplicated list of plausible spellings of a name.

    The raw name always comes first; multi-part names additionally get
    "first last" and, depending on length, adjacent two-token combinations.
    """
    parts = name.split()
    candidates = [name]  # always try the raw name first

    if len(parts) >= 2:
        # Drop any middle names: first + last token.
        candidates.append(f"{parts[0]} {parts[-1]}")

        if len(parts) == 3:
            # Adjacent two-token combinations for three-part names.
            candidates.append(f"{parts[0]} {parts[1]}")
            candidates.append(f"{parts[1]} {parts[2]}")
        elif len(parts) > 3:
            # Longer names: first two tokens, then first + last.
            candidates.append(f"{parts[0]} {parts[1]}")
            candidates.append(f"{parts[0]} {parts[-1]}")

    # De-duplicate while keeping first-seen order.
    return list(dict.fromkeys(candidates))
|
| 525 |
+
|
| 526 |
+
def _find_best_author_match(self, authors: List[Dict], target_name: str, target_affiliation: str) -> Optional[Dict]:
    """Pick the OpenAlex author record that best matches name/affiliation.

    Scoring: +10 for a (substring) name match, +5 for an affiliation match,
    +1 for having any citations. Ties resolve to the earliest record; if
    nothing scores, the first record is returned as a fallback.
    """
    if not authors:
        return None

    # Single candidate: nothing to disambiguate.
    if len(authors) == 1:
        return authors[0]

    wanted_name = target_name.lower()

    def _score(record: Dict) -> int:
        points = 0
        record_name = record.get('display_name', '').lower()

        # Substring match in either direction counts as a name hit.
        if wanted_name in record_name or record_name in wanted_name:
            points += 10

        # Affiliation match adds a smaller bonus ("未知单位" means unknown).
        if target_affiliation and target_affiliation != "未知单位":
            for aff in record.get('affiliations', []):
                if target_affiliation.lower() in aff.get('display_name', '').lower():
                    points += 5
                    break

        # Any citations at all act as a small tiebreaker.
        if record.get('cited_by_count', 0) > 0:
            points += 1
        return points

    # max() returns the first maximal element, matching the original
    # strictly-greater comparison semantics.
    best = max(authors, key=_score)
    return best if _score(best) > 0 else authors[0]
|
| 567 |
+
|
| 568 |
+
def _find_best_semantic_author_match(self, authors: List[Dict], target_name: str, target_affiliation: str) -> Optional[Dict]:
    """Pick the Semantic Scholar author record that best matches name/affiliation.

    Same scoring as _find_best_author_match (+10 name, +5 affiliation,
    +1 any citations) but using Semantic Scholar's field names
    ('name', 'citationCount', affiliation 'name').
    """
    if not authors:
        return None

    # Single candidate: nothing to disambiguate.
    if len(authors) == 1:
        return authors[0]

    wanted_name = target_name.lower()

    def _score(record: Dict) -> int:
        points = 0
        record_name = record.get('name', '').lower()

        # Substring match in either direction counts as a name hit.
        if wanted_name in record_name or record_name in wanted_name:
            points += 10

        # Affiliation match adds a smaller bonus ("未知单位" means unknown).
        if target_affiliation and target_affiliation != "未知单位":
            for aff in record.get('affiliations', []):
                if target_affiliation.lower() in aff.get('name', '').lower():
                    points += 5
                    break

        # Any citations at all act as a small tiebreaker.
        if record.get('citationCount', 0) > 0:
            points += 1
        return points

    # max() returns the first maximal element, matching the original
    # strictly-greater comparison semantics.
    best = max(authors, key=_score)
    return best if _score(best) > 0 else authors[0]
|
| 609 |
+
|
reviewer_recommendation/models.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
数据模型定义
|
| 3 |
+
定义审稿人推荐系统使用的核心数据结构
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import List, Dict, Any, Optional
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class PaperInfo(BaseModel):
    """Metadata of the paper submitted for reviewer recommendation."""
    title: str = Field(..., description="论文标题")
    abstract: str = Field(..., description="论文摘要")
    # Optional lists default to empty rather than None.
    keywords: List[str] = Field(default_factory=list, description="论文关键词")
    authors: List[str] = Field(default_factory=list, description="作者姓名列表")
    affiliations: List[str] = Field(default_factory=list, description="作者所属机构列表")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class Reviewer(BaseModel):
    """A recommended reviewer: contact details plus the recommendation rationale."""
    name: str = Field(..., description="审稿人姓名")
    affiliation: str = Field(default="Unknown", description="所属机构")
    email: str = Field(default="unknown@example.com", description="邮箱地址")
    reason: str = Field(..., description="推荐理由")
    # Relevance is validated by pydantic to lie in [0.0, 1.0].
    relevance_score: float = Field(..., ge=0.0, le=1.0, description="相关性评分")
    expertise_areas: List[str] = Field(default_factory=list, description="专业领域")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class SearchResult(BaseModel):
    """One literature hit returned by the academic search layer."""
    doi: Optional[str] = Field(None, description="DOI")
    title: str = Field(..., description="论文标题")
    abstract: str = Field(..., description="论文摘要")
    # Corresponding-author fields may be absent in the source record.
    corresponding_author: Optional[str] = Field(None, description="通讯作者")
    corresponding_institution: Optional[str] = Field(None, description="通讯作者机构")
    # The query string that produced this hit (for traceability).
    query_used: str = Field(..., description="使用的查询词")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class RecommendationRequest(BaseModel):
    """Input to the recommendation engine: the paper and how many reviewers to return."""
    paper: PaperInfo
    # Between 1 and 10 reviewers may be requested (enforced by pydantic).
    reviewer_count: int = Field(..., ge=1, le=10, description="推荐审稿人数量")
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class RecommendationResponse(BaseModel):
    """Result of one recommendation run, including timing and error info."""
    reviewers: List[Reviewer] = Field(default_factory=list, description="推荐的审稿人列表")
    search_time: float = Field(..., description="搜索耗时(秒)")
    total_candidates: int = Field(..., description="候选者总数")
    success: bool = Field(..., description="是否成功")
    # Populated only when success is False.
    error_message: Optional[str] = Field(None, description="错误信息")
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class AppState(BaseModel):
    """Mutable application/session state for the recommendation UI."""
    current_request: Optional[RecommendationRequest] = None
    current_response: Optional[RecommendationResponse] = None
    # True while a recommendation run is in flight.
    is_processing: bool = False
    # Message from the most recent failure, if any.
    last_error: Optional[str] = None
|
reviewer_recommendation/searcher copy.py
ADDED
|
@@ -0,0 +1,666 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
学术检索模块
|
| 3 |
+
提供基于EPMC和bioRxiv的学术文献检索功能
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import time
|
| 9 |
+
import urllib.parse
|
| 10 |
+
import requests
|
| 11 |
+
import warnings
|
| 12 |
+
import ssl
|
| 13 |
+
from typing import List, Dict, Any, Optional
|
| 14 |
+
from itertools import combinations
|
| 15 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 16 |
+
import threading
|
| 17 |
+
|
| 18 |
+
# 抑制SSL警告
|
| 19 |
+
warnings.filterwarnings('ignore', message='Unverified HTTPS request')
|
| 20 |
+
|
| 21 |
+
from .models import PaperInfo, SearchResult
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Configuration: API keys are read from the environment.
# SECURITY FIX: the DashScope key used to be hard-coded in this file;
# committing secrets to source control leaks them. The old key should be
# rotated, and DASHSCOPE_API_KEY exported in the deployment environment
# instead (matching how OPENAI_API_KEY was already handled).
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 27 |
+
|
| 28 |
+
# Biology keyword taxonomy: major research areas mapped to representative
# techniques/tools. Used by DynamicAcademicSearcher to build and validate
# two-level search queries (category AND keyword).
BIOLOGY_KEYWORDS = {
    "Molecular & Structural Biology": [
        "Cryo-EM",
        "X-ray crystallography",
        "NMR spectroscopy",
        "Single-particle analysis",
        "Biolayer interferometry (BLI)",
        "Surface plasmon resonance (SPR)",
        "Confocal microscopy",
        "CRISPR-Cas9",
        "TALEN",
        "ZFN",
        "RNA interference (RNAi)",
        "Single-molecule imaging",
        "FRET",
        "Optogenetics"
    ],

    "Cell & Single-Cell Technologies": [
        "Single-cell RNA-seq (scRNA-seq)",
        "Single-cell ATAC-seq",
        "Spatial transcriptomics",
        "FISH (Fluorescence in situ hybridization)",
        "Immunofluorescence",
        "Tissue clearing (CLARITY)",
        "Flow cytometry (FACS)",
        "CyTOF (Mass cytometry)",
        "High-throughput screening",
        "Organoids",
        "3D cell culture",
        "Microfluidics"
    ],

    "Neuroscience Tools": [
        "Optogenetics",
        "DREADDs (Designer Receptors Exclusively Activated by Designer Drugs)",
        "GCaMP calcium imaging",
        "Two-photon microscopy",
        "Neural tracing",
        "Patch-seq",
        "Lineage tracing",
        "Spatial multi-omics"
    ],

    "Omics & Systems Biology": [
        "RNA sequencing (RNA-seq)",
        "Proteomics (LC-MS/MS)",
        "Metabolomics",
        "Epigenomics",
        "10x Genomics",
        "SMART-seq",
        "Nanopore sequencing",
        "Illumina HiSeq",
        "WGCNA",
        "Machine learning in omics",
        "scVelo"
    ],

    "Microbiome & Immunology": [
        "16S rRNA sequencing",
        "Metagenomics",
        "Gut-brain axis",
        "VDJ-seq",
        "TCR/BCR lineage tracing",
        "Immune checkpoints (PD-1, CTLA-4)",
        "mRNA vaccines",
        "DNA vaccines",
        "Nanoparticle vaccines",
        "Antigen presentation systems"
    ],

    "Development & Regeneration": [
        "Induced pluripotent stem cells (iPSCs)",
        "Embryonic stem cells (ESCs)",
        "Cellular reprogramming",
        "Wnt signaling",
        "Hippo pathway",
        "Notch signaling",
        "Zebrafish models",
        "C. elegans",
        "Mouse embryonic sections"
    ],

    "Ecology & Environmental Biology": [
        "Environmental DNA (eDNA)",
        "Remote sensing ecology",
        "Biosensors",
        "Ecological niche modeling (ENM)",
        "Genetic diversity analysis",
        "Captive breeding technologies"
    ],

    "Bioinformatics & AI Tools": [
        "Seurat",
        "Scanpy",
        "Monocle",
        "CIBERSORT",
        "GSEA",
        "AlphaFold",
        "RoseTTAFold",
        "Molecular docking",
        "STRING",
        "Cytoscape",
        "Gene Ontology (GO)",
        "KEGG pathway analysis"
    ]
}
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
class AcademicSearcher:
    """Low-level academic searcher: fetches raw literature data, no analysis.

    Pipeline: Europe PMC preprint search (bioRxiv DOIs only, citation-sorted)
    followed by parallel bioRxiv detail lookups for each DOI.
    """

    EPMC_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    BIORXIV_URL = "https://api.biorxiv.org/details/biorxiv/{doi}/na/json"

    def __init__(self, limit=50, sleep=0.1, timeout=30):
        # limit: maximum number of detailed records returned by search()
        # sleep: politeness delay between requests (seconds)
        # timeout: per-HTTP-request timeout (seconds)
        self.limit = limit
        self.sleep = sleep
        self.timeout = timeout
        # Custom SSL context with verification disabled.
        # NOTE(review): this bypasses certificate checking — presumably a
        # workaround for flaky endpoints; confirm this is acceptable for
        # this deployment (public, non-sensitive data only).
        self.ssl_context = ssl.create_default_context()
        self.ssl_context.check_hostname = False
        self.ssl_context.verify_mode = ssl.CERT_NONE

    def search(self, query: str) -> List[Dict[str, Any]]:
        """Run a search and return raw article dicts ([] on any error)."""
        try:
            # 1. Get the DOI list from Europe PMC.
            epmc_results = self._epmc_search(query)

            # 2. Fetch bioRxiv details for each DOI in parallel.
            detailed_results = self._get_details_parallel(epmc_results, query)

            return detailed_results
        except Exception as e:
            print(f"检索错误: {str(e)}")
            return []

    def _get_details_parallel(self, epmc_results: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
        """Fetch bioRxiv details for each EPMC hit using a small thread pool."""
        detailed_results = []

        # Nothing to do for an empty hit list.
        if not epmc_results:
            return detailed_results

        # Cap concurrency to avoid flooding the API with requests.
        max_workers = min(5, len(epmc_results))

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit one detail lookup per DOI-bearing hit.
            future_to_item = {}
            for item in epmc_results:
                doi = item.get("doi")
                if doi:
                    future = executor.submit(self._get_biorxiv_detail, doi)
                    future_to_item[future] = item

            # Collect results as they finish, tagging each with the query
            # that produced it, and stop once the limit is reached.
            for future in as_completed(future_to_item):
                item = future_to_item[future]
                try:
                    detail = future.result(timeout=10)  # 10-second timeout per lookup
                    if detail:
                        detail["query_used"] = query
                        detailed_results.append(detail)

                    if len(detailed_results) >= self.limit:
                        break

                except Exception as e:
                    print(f"获取详情失败: {item.get('doi')} - {str(e)}")
                    continue

        return detailed_results

    def _epmc_search(self, query: str) -> List[Dict[str, Any]]:
        """Query Europe PMC for bioRxiv preprints; retries up to 3 times.

        Restricts to preprints (SRC:PPR) with bioRxiv DOIs (10.1101*),
        over-fetches for later filtering, and sorts by citation count.
        Returns the raw result list, or [] after exhausting retries.
        """
        params = {
            "query": f'(SRC:PPR) AND (DOI:10.1101*) AND ({query})',
            "resultType": "core",
            "pageSize": str(min(100, self.limit * 2)),  # over-fetch for filtering
            "format": "json",
            "sortby": "cited",  # most-cited first
        }

        # Retry with exponential backoff (1s, 2s between attempts).
        for attempt in range(3):
            try:
                response = requests.get(
                    self.EPMC_URL,
                    params=params,
                    timeout=self.timeout,
                    verify=False,  # SSL verification disabled (see __init__ note)
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                    }
                )
                response.raise_for_status()
                data = response.json()
                results = data.get("resultList", {}).get("result", [])

                # Debug output: hit count and citation counts of the top hits.
                if results:
                    print(f"EPMC检索到 {len(results)} 篇文献,按引用量排序")
                    for i, result in enumerate(results[:3]):
                        cited_count = result.get('citedByCount', 0)
                        title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
                        print(f" 文献 {i+1}: {title} (引用量: {cited_count})")

                return results

            except requests.exceptions.SSLError as e:
                print(f"EPMC SSL错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC SSL连接失败,返回空结果")
                    return []
                time.sleep(2 ** attempt)

            except requests.exceptions.RequestException as e:
                print(f"EPMC请求错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC请求失败,返回空结果")
                    return []
                time.sleep(2 ** attempt)

            except Exception as e:
                print(f"EPMC未知错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC未知错误,返回空结果")
                    return []
                time.sleep(2 ** attempt)

        return []

    def _get_biorxiv_detail(self, doi: str) -> Dict[str, Any]:
        """Fetch the latest bioRxiv record for one DOI; None when unavailable.

        Uses the newest version in the record collection, rejects records
        hosted on servers other than bioRxiv, and retries up to 3 times
        with exponential backoff on network/SSL errors.
        """
        url = self.BIORXIV_URL.format(doi=urllib.parse.quote(doi))

        # Retry with exponential backoff; tolerate transient SSL problems.
        for attempt in range(3):
            try:
                # Relaxed SSL and a longer timeout to survive flaky endpoints.
                response = requests.get(
                    url,
                    timeout=self.timeout,
                    verify=False,  # SSL verification disabled (see __init__ note)
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                    }
                )
                response.raise_for_status()
                data = response.json()

                records = data.get("collection") or data.get("records") or []
                if not records:
                    return None

                # Use the most recent version; keep only bioRxiv-hosted papers.
                latest_record = records[-1]
                if latest_record.get("server") and latest_record["server"].lower() != "biorxiv":
                    return None

                version = latest_record.get("version") or 1
                return {
                    "doi": latest_record.get("doi"),
                    "title": latest_record.get("title"),
                    "abstract": latest_record.get("abstract"),
                    "corresponding_author": latest_record.get("author_corresponding"),
                    "corresponding_institution": latest_record.get("author_corresponding_institution"),
                    "url": f"https://www.biorxiv.org/content/{latest_record['doi']}v{version}"
                }

            except requests.exceptions.SSLError as e:
                print(f"bioRxiv SSL错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # last attempt
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

            except requests.exceptions.RequestException as e:
                print(f"bioRxiv请求错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # last attempt
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

            except Exception as e:
                print(f"bioRxiv未知错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # last attempt
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

        return None
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
class DynamicAcademicSearcher:
|
| 327 |
+
"""动态学术检索器,包含智能查询生成、动态处理和扩展功能"""
|
| 328 |
+
|
| 329 |
+
def __init__(self, base_searcher: AcademicSearcher):
    """Wrap the base AcademicSearcher that performs the raw queries."""
    self.base_searcher = base_searcher
|
| 331 |
+
|
| 332 |
+
def search_with_dynamic_queries(self, paper: PaperInfo, num_queries: int = 2) -> List[Dict[str, Any]]:
    """Run LLM-generated queries in parallel and return de-duplicated articles.

    Steps: generate queries for the paper, execute each via
    _execute_single_query on a small thread pool, then de-duplicate the
    merged hits by DOI.

    Fix: the de-duplicated result is now materialized into a list once;
    previously it was built twice (once for the log line, once for the
    return value).
    """
    # 1. Generate the search queries.
    queries = self.generate_search_queries(paper, num_queries)
    print("生成的检索查询:")
    for i, query in enumerate(queries, 1):
        print(f"查询 {i}: {query}")

    # Record how the queries were generated.
    self._log_query_generation(paper, queries)

    # 2. Execute the queries in parallel.
    all_candidates = []
    with ThreadPoolExecutor(max_workers=3) as executor:
        # Submit every query as its own task.
        future_to_query = {
            executor.submit(self._execute_single_query, query): query
            for query in queries
        }

        # Collect results as each query finishes.
        for future in as_completed(future_to_query):
            query = future_to_query[future]
            try:
                results = future.result(timeout=30)  # 30-second cap per query
                if results:
                    print(f"查询 '{query}' 完成,找到 {len(results)} 篇文献")
                    all_candidates.extend(results)
                else:
                    print(f"查询 '{query}' 未找到文献")
            except Exception as e:
                print(f"查询 '{query}' 执行失败: {str(e)}")

    # 3. De-duplicate by DOI (later hits win), materializing the list once.
    unique_candidates = list(
        {item['doi']: item for item in all_candidates if item.get('doi')}.values()
    )

    print(f"并行检索完成,总共找到 {len(unique_candidates)} 篇唯一文献")
    return unique_candidates
|
| 372 |
+
|
| 373 |
+
def _execute_single_query(self, query: str) -> List[Dict[str, Any]]:
    """Run one query, walking its expanded variants until one yields hits.

    Intended for use as a thread-pool task. Returns the hits of the first
    variant that finds anything, or [] when every variant comes up empty.
    """
    print(f"开始执行查询: {query}")

    # Expand the query into progressively broader variants and try each.
    for candidate in self._process_query_dynamically(query):
        print(f" 尝试查询: {candidate}")
        hits = self.base_searcher.search(candidate)
        if hits:
            print(f" 找到 {len(hits)} 篇文献")
            return hits
        print(f" 未找到文献,尝试扩展查询...")

    print(f" 所有扩展查询都未找到文献")
    return []
|
| 392 |
+
|
| 393 |
+
def generate_search_queries(self, paper: PaperInfo, num_queries: int = 2) -> List[str]:
    """Ask the LLM for two-level search queries built from the biology taxonomy.

    Query 1: major category AND sub-category keyword.
    Query 2: sub-category keyword AND a paper-specific keyword.
    Falls back to _generate_backup_queries when the LLM call fails or no
    query survives validation.
    """
    system_msg = "你是生物学检索策略专家,擅长分析论文的研究领域和技术方法,并生成精准的检索查询"

    # Render the keyword taxonomy into the prompt.
    keywords_str = ""
    for category, keywords in BIOLOGY_KEYWORDS.items():
        keywords_str += f"\n{category}:\n"
        for keyword in keywords:
            keywords_str += f" - {keyword}\n"

    prompt = f"""
请分析以下论文,按照以下步骤生成2个检索查询:

论文信息:
标题: {paper.title}
摘要: {paper.abstract}
关键词: {', '.join(paper.keywords)}

生物学关键词分类体系:
{keywords_str}

步骤1: 确定第一个检索查询(大类 + 子类)
1. 从上述分类体系中选择最匹配的1个大类(如"Molecular & Structural Biology")
2. 从该大类下选择最匹配的1个子类关键词(如"Cryo-EM")
3. 生成查询:大类 AND 子类关键词
4. 格式示例:Molecular & Structural Biology AND Cryo-EM

步骤2: 确定第二个检索查询(子类 + 论文特定关键词)
1. 使用步骤1中确定的子类关键词
2. 从论文标题、摘要或关键词中提取1个最核心的特定关键词
3. 生成查询:子类关键词 AND 论文特定关键词
4. 格式示例:Cryo-EM AND Nav1.7

要求:
1. 每个查询只使用2个关键词,用AND连接
2. 第一个查询:大类 AND 子类
3. 第二个查询:子类 AND 论文特定关键词
4. 论文特定关键词要简洁明确,适合学术数据库检索
5. 仅返回查询语句,每行一个,不添加编号或其他内容

输出格式示例:
Molecular & Structural Biology AND Cryo-EM
Cryo-EM AND Nav1.7
"""

    response = self._call_llm(prompt.strip(), system_msg)
    if not response:
        return self._generate_backup_queries(paper, num_queries)

    # Split the response into one query per non-empty line.
    queries = [q.strip() for q in response.strip().split('\n') if q.strip()]

    # Keep only well-formed queries.
    validated_queries = self._validate_queries(queries)

    return validated_queries[:num_queries] if validated_queries else self._generate_backup_queries(paper, num_queries)
|
| 450 |
+
|
| 451 |
+
def _validate_queries(self, queries: List[str]) -> List[str]:
    """Filter LLM-generated queries down to well-formed taxonomy queries.

    A query passes only if it is non-trivial, has exactly two non-empty
    parts joined by a single ' AND ', references the biology keyword
    taxonomy (category name or keyword), and is not overly long.

    Args:
        queries: Raw query strings parsed from the LLM response.

    Returns:
        The queries that passed every check (stripped), in input order.
    """
    validated_queries = []

    for query in queries:
        # Basic shape check: non-empty and long enough to be meaningful.
        if not query or len(query.strip()) < 5:
            print(f"查询太短,跳过: {query}")
            continue

        # Must contain the AND connector.
        if ' AND ' not in query:
            print(f"查询缺少AND连接符,跳过: {query}")
            continue

        # Exactly two keywords joined by a single AND.
        parts = query.split(' AND ')
        if len(parts) != 2:
            print(f"查询格式不正确,跳过: {query}")
            continue

        part1 = parts[0].strip()
        part2 = parts[1].strip()

        if not part1 or not part2:
            print(f"查询包含空部分,跳过: {query}")
            continue

        if part1.upper() == 'AND' or part2.upper() == 'AND':
            print(f"查询包含无效AND,跳过: {query}")
            continue

        # Taxonomy membership check. The lowercased query is computed once
        # here; the original recomputed query.lower() for every single
        # category/keyword comparison inside the nested loops.
        query_lower = query.lower()
        has_biology_keyword = any(
            category.lower() in query_lower
            or any(keyword.lower() in query_lower for keyword in keywords)
            for category, keywords in BIOLOGY_KEYWORDS.items()
        )

        if not has_biology_keyword:
            print(f"查询不包含生物学关键词分类,跳过: {query}")
            continue

        # Overly long queries tend to over-constrain the search.
        if len(query) > 100:
            print(f"查询过长,跳过: {query}")
            continue

        validated_queries.append(query.strip())
        print(f"查询验证通过: {query}")

    return validated_queries
|
| 510 |
+
|
| 511 |
+
def _process_query_dynamically(self, query: str) -> List[str]:
|
| 512 |
+
"""动态处理查询,生成多个变体"""
|
| 513 |
+
# 基础查询
|
| 514 |
+
queries = [query]
|
| 515 |
+
|
| 516 |
+
# 检查查询格式是否正确
|
| 517 |
+
if ' AND ' not in query:
|
| 518 |
+
return queries
|
| 519 |
+
|
| 520 |
+
# 按AND分割查询
|
| 521 |
+
parts = query.split(' AND ')
|
| 522 |
+
if len(parts) != 2:
|
| 523 |
+
return queries
|
| 524 |
+
|
| 525 |
+
# 清理每个部分
|
| 526 |
+
part1 = parts[0].strip()
|
| 527 |
+
part2 = parts[1].strip()
|
| 528 |
+
|
| 529 |
+
# 如果某个部分为空或只包含AND,跳过
|
| 530 |
+
if not part1 or not part2 or part1.upper() == 'AND' or part2.upper() == 'AND':
|
| 531 |
+
return queries
|
| 532 |
+
|
| 533 |
+
# 简化查询(只保留主要关键词)
|
| 534 |
+
part1_words = part1.split()
|
| 535 |
+
part2_words = part2.split()
|
| 536 |
+
|
| 537 |
+
if len(part1_words) > 1:
|
| 538 |
+
# 取第一个部分的主要关键词
|
| 539 |
+
simplified_part1 = part1_words[0]
|
| 540 |
+
queries.append(f"{simplified_part1} AND {part2}")
|
| 541 |
+
|
| 542 |
+
if len(part2_words) > 1:
|
| 543 |
+
# 取第二个部分的主要关键词
|
| 544 |
+
simplified_part2 = part2_words[0]
|
| 545 |
+
queries.append(f"{part1} AND {simplified_part2}")
|
| 546 |
+
|
| 547 |
+
# 单个关键词查询
|
| 548 |
+
queries.append(part1)
|
| 549 |
+
queries.append(part2)
|
| 550 |
+
|
| 551 |
+
return list(set(queries)) # 去重
|
| 552 |
+
|
| 553 |
+
def _generate_backup_queries(self, paper: PaperInfo, num_queries: int) -> List[str]:
|
| 554 |
+
"""生成备用查询,基于生物学关键词分类体系"""
|
| 555 |
+
queries = []
|
| 556 |
+
|
| 557 |
+
# 尝试从论文内容推断最相关的生物学分类
|
| 558 |
+
best_category, best_keyword = self._infer_biology_keywords(paper)
|
| 559 |
+
|
| 560 |
+
if not best_category or not best_keyword:
|
| 561 |
+
# 如果没有推断出,使用默认分类
|
| 562 |
+
best_category = "Molecular & Structural Biology"
|
| 563 |
+
best_keyword = "Cryo-EM"
|
| 564 |
+
|
| 565 |
+
# 生成第一个查询:大类 AND 子类
|
| 566 |
+
queries.append(f"{best_category} AND {best_keyword}")
|
| 567 |
+
|
| 568 |
+
# 从论文标题中提取特定关键词
|
| 569 |
+
title_words = paper.title.split()
|
| 570 |
+
specific_keyword = None
|
| 571 |
+
for word in title_words:
|
| 572 |
+
if len(word) > 3 and word.lower() not in ['the', 'and', 'for', 'with', 'from', 'this', 'that', 'structures', 'human', 'channel', 'complex', 'with', 'auxiliary', 'subunits', 'animal', 'toxins']:
|
| 573 |
+
specific_keyword = word
|
| 574 |
+
break
|
| 575 |
+
|
| 576 |
+
if specific_keyword:
|
| 577 |
+
# 生成第二个查询:子类 AND 论文特定关键词
|
| 578 |
+
queries.append(f"{best_keyword} AND {specific_keyword}")
|
| 579 |
+
else:
|
| 580 |
+
# 如果没有找到特定关键词,使用第一个查询的变体
|
| 581 |
+
queries.append(f"{best_category} AND structure")
|
| 582 |
+
|
| 583 |
+
return queries[:num_queries]
|
| 584 |
+
|
| 585 |
+
def _infer_biology_keywords(self, paper: PaperInfo) -> tuple:
    """Pick the taxonomy (category, keyword) pair best matching the paper.

    Each category is scored by substring matches against the lowercased
    concatenation of title, abstract and keywords: +2 when the category
    name itself appears, +1 per matching keyword. The highest-scoring
    category wins; its first matching keyword (or its first keyword when
    only the category name matched) is returned alongside it.

    Returns:
        (category, keyword) of the best match, or (None, None) when
        nothing in the taxonomy matches the paper text.
    """
    haystack = f"{paper.title} {paper.abstract} {' '.join(paper.keywords)}".lower()

    best_category = None
    best_keyword = None
    max_matches = 0

    for category, keywords in BIOLOGY_KEYWORDS.items():
        # Category-name hit weighs double.
        score = 2 if category.lower() in haystack else 0

        # Count every keyword hit; remember the first one seen.
        matched = [kw for kw in keywords if kw.lower() in haystack]
        score += len(matched)

        # Strictly-greater comparison keeps the earliest category on ties.
        if score > max_matches:
            max_matches = score
            best_category = category
            best_keyword = matched[0] if matched else keywords[0]

    return best_category, best_keyword
|
| 615 |
+
|
| 616 |
+
def _log_query_generation(self, paper: PaperInfo, queries: List[str]):
|
| 617 |
+
"""记录查询生成日志"""
|
| 618 |
+
log_info = {
|
| 619 |
+
"paper_title": paper.title[:100] + "..." if len(paper.title) > 100 else paper.title,
|
| 620 |
+
"paper_keywords": paper.keywords,
|
| 621 |
+
"generated_queries": queries,
|
| 622 |
+
"query_count": len(queries),
|
| 623 |
+
"timestamp": time.time()
|
| 624 |
+
}
|
| 625 |
+
print(f"查询生成日志: {log_info}")
|
| 626 |
+
|
| 627 |
+
def _call_llm(self, prompt: str, system_msg: str) -> Optional[str]:
    """Send a chat request to whichever LLM provider is configured.

    Prefers DashScope (qwen-turbo) when DASHSCOPE_API_KEY is set, else
    falls back to OpenAI (gpt-3.5-turbo) when OPENAI_API_KEY is set.

    Args:
        prompt: User-role message content.
        system_msg: System-role instruction.

    Returns:
        The model's text response, or None when no key is configured,
        the provider reports an error, or the call raises.
    """
    try:
        if DASHSCOPE_API_KEY:
            # Lazy import: dashscope is only needed on this branch.
            import dashscope
            dashscope.api_key = DASHSCOPE_API_KEY

            response = dashscope.Generation.call(
                model="qwen-turbo",
                messages=[
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": prompt}
                ],
                timeout=30
            )
            if response.status_code == 200:
                return response.output.text
            else:
                # Non-200: log and fall through (implicitly returns None).
                print(f"DashScope API错误: {response.message}")

        elif OPENAI_API_KEY:
            # Lazy import: openai is only needed on this branch.
            from openai import OpenAI
            client = OpenAI(api_key=OPENAI_API_KEY)
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": prompt}
                ],
                timeout=30
            )
            return response.choices[0].message.content

        else:
            # Neither provider key is configured.
            print("未配置API密钥")
            return None

    except Exception as e:
        # Any provider/network failure degrades to None; callers fall back
        # to rule-based query generation.
        print(f"大模型调用错误: {str(e)}")
        return None
|
reviewer_recommendation/searcher.py
ADDED
|
@@ -0,0 +1,1128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
学术检索模块
|
| 3 |
+
提供基于EPMC、bioRxiv和OpenAlex的学术文献检索功能
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import time
|
| 9 |
+
import urllib.parse
|
| 10 |
+
import requests
|
| 11 |
+
import warnings
|
| 12 |
+
import ssl
|
| 13 |
+
from typing import List, Dict, Any, Optional
|
| 14 |
+
from itertools import combinations
|
| 15 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 16 |
+
import threading
|
| 17 |
+
|
| 18 |
+
# 抑制SSL警告
|
| 19 |
+
warnings.filterwarnings('ignore', message='Unverified HTTPS request')
|
| 20 |
+
|
| 21 |
+
from .models import PaperInfo, SearchResult
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class OpenAlexSearcher:
    """OpenAlex literature searcher.

    Queries the OpenAlex Works API (which carries reliable citation-count
    data) and normalizes hits into the same dict schema used by the EPMC
    path (notably the 'citedByCount' key).
    """

    # OpenAlex Works endpoint; no API key required.
    OPENALEX_URL = "https://api.openalex.org/works"

    def __init__(self, limit=50, sleep=0.1, timeout=30):
        # limit: default cap on records when no target_count is supplied.
        # sleep: inter-request delay budget (currently unused by search()).
        # timeout: per-request timeout in seconds.
        self.limit = limit
        self.sleep = sleep
        self.timeout = timeout
        # A mailto User-Agent opts the client into OpenAlex's polite pool.
        self.headers = {
            'User-Agent': 'AcademicReviewerSystem/1.0 (mailto:moahgzantony@gmail.com)'
        }

    def search(self, query: str, sort_by_citations: bool = False, years_after: int = None, target_count: int = None) -> List[Dict[str, Any]]:
        """Run an OpenAlex search and return normalized paper dicts.

        Args:
            query: Raw query; ' AND ', '&' and brackets are stripped since
                OpenAlex full-text search treats terms as implicit AND.
            sort_by_citations: Sort by cited_by_count desc instead of
                relevance_score desc.
            years_after: If set, restrict to papers published in the last
                ``years_after`` years (from_publication_date filter).
            target_count: Desired record count (capped at the API's
                200-per-page max); defaults to min(self.limit, 20).

        Returns:
            List of dicts from ``_convert_openalex_item``; [] on error.
        """
        try:
            # Simplify the query: remove connectors/brackets that can break
            # OpenAlex's search parser, then collapse whitespace.
            clean_query = query.replace(' AND ', ' ').replace('&', 'and').replace('(', '').replace(')', '')
            clean_query = ' '.join(clean_query.split())

            # Decide how many records to request.
            if target_count is not None:
                per_page = min(target_count, 200)  # OpenAlex per-page max
            else:
                per_page = min(self.limit, 20)  # keep candidate pool small

            # Minimal parameter set (the 'select' parameter caused issues).
            params = {
                "search": clean_query,
                "per-page": per_page
            }

            # Optional publication-date filter.
            if years_after is not None:
                from datetime import datetime
                current_year = datetime.now().year
                target_year = current_year - years_after
                # Correct filter format: from_publication_date:YYYY-MM-DD
                params["filter"] = f"from_publication_date:{target_year}-01-01"
                print(f"年份过滤: 只检索{target_year}年及以后发表的论文")
                print(f"日期过滤: {params['filter']}")

            # Sorting strategy.
            if sort_by_citations:
                params["sort"] = "cited_by_count:desc"
            else:
                params["sort"] = "relevance_score:desc"

            print(f"OpenAlex检索查询: {query} -> {clean_query}")
            print(f"排序方式: {'按引用量' if sort_by_citations else '按相关性'}")

            # Build the URL by hand so the ':' inside sort/filter values is
            # NOT percent-encoded (the API expects a literal colon there).
            import urllib.parse

            query_parts = []
            for key, value in params.items():
                if (key == "sort" or key == "filter") and ":" in str(value):
                    # Keep sort/filter values verbatim (colon unencoded).
                    query_parts.append(f"{key}={value}")
                else:
                    query_parts.append(f"{key}={urllib.parse.quote(str(value))}")

            query_string = "&".join(query_parts)
            full_url = f"{self.OPENALEX_URL}?{query_string}"

            print(f"完整URL: {full_url}")

            response = requests.get(
                full_url,
                headers=self.headers,
                timeout=self.timeout
            )
            response.raise_for_status()
            data = response.json()

            items = data.get("results", [])
            total_results = data.get("meta", {}).get("count", 0)

            print(f"OpenAlex检索到 {len(items)} 篇文献,总命中数: {total_results}")

            # Normalize each hit; drop records that fail conversion.
            results = []
            for item in items:
                result = self._convert_openalex_item(item, query)
                if result:
                    results.append(result)

            # Debug: show citation counts for the top few hits.
            if results:
                print(f"OpenAlex检索结果(按引用量排序):")
                for i, result in enumerate(results[:3]):
                    cited_count = result.get('citedByCount', 0)
                    title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
                    print(f" 文献 {i+1}: {title} (引用量: {cited_count})")

            return results

        except Exception as e:
            print(f"OpenAlex检索失败: {str(e)}")
            return []

    def _convert_openalex_item(self, item: Dict[str, Any], query: str) -> Optional[Dict[str, Any]]:
        """Convert one OpenAlex work record into the project's paper schema.

        Returns:
            Normalized dict, or None when the record has no title or any
            parse error occurs.
        """
        try:
            title = item.get('title', '')
            if not title:
                return None

            # OpenAlex stores abstracts as an inverted index
            # (word -> list of positions); rebuild plain text by position.
            abstract = ""
            abstract_inverted = item.get('abstract_inverted_index', {})
            if abstract_inverted:
                abstract_words = []
                for word, positions in abstract_inverted.items():
                    for pos in positions:
                        abstract_words.append((pos, word))
                abstract_words.sort(key=lambda x: x[0])
                abstract = ' '.join([word for pos, word in abstract_words])

            # Authors; also pick a corresponding author and institution.
            authorships = item.get('authorships', [])
            authors = []
            corresponding_author = None
            corresponding_institution = None

            for authorship in authorships:
                author = authorship.get('author', {})
                if author:
                    author_name = author.get('display_name', '')
                    if author_name:
                        authors.append(author_name)

                        # Flagged corresponding author, or — as a fallback —
                        # the first author in the list.
                        if authorship.get('is_corresponding', False) or len(authors) == 1:
                            corresponding_author = author_name

                            # First listed institution of that author.
                            institutions = authorship.get('institutions', [])
                            if institutions:
                                institution = institutions[0].get('display_name', '')
                                if institution:
                                    corresponding_institution = institution

            # Journal / venue name.
            primary_location = item.get('primary_location', {})
            source = primary_location.get('source', {})
            journal = source.get('display_name', '') if source else ''

            # Publication year.
            pub_year = item.get('publication_year', '')

            # Citation metrics.
            cited_by_count = item.get('cited_by_count', 0)
            citation_count = item.get('citation_count', 0)
            referenced_works_count = item.get('referenced_works_count', 0)

            # DOI: strip the https://doi.org/ prefix OpenAlex uses.
            doi = ""
            external_ids = item.get('ids', {})
            if external_ids:
                doi = external_ids.get('doi', '')
                if doi and doi.startswith('https://doi.org/'):
                    doi = doi.replace('https://doi.org/', '')

            # Assemble the normalized record.
            result = {
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'corresponding_author': corresponding_author,
                'corresponding_institution': corresponding_institution,
                'journal': journal,
                'publication_year': pub_year,
                'doi': doi,
                'citedByCount': cited_by_count,  # same field name as EPMC
                'citation_count': citation_count,
                'referenced_works_count': referenced_works_count,
                'query_used': query,
                'source': 'openalex',
                'openalex_id': item.get('id', ''),
                'type': item.get('type', ''),
                'open_access': item.get('open_access', {}).get('is_oa', False)
            }

            return result

        except Exception as e:
            print(f"转换OpenAlex数据失败: {str(e)}")
            return None
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
# --- Configuration ---
# SECURITY FIX: the original file committed a live DashScope API key in
# source. Credentials must never live in version control; both provider
# keys are now read from the environment (empty string disables the
# corresponding provider in _call_llm).
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")

# Biology keyword taxonomy: category name -> representative method/topic
# keywords. Used for LLM query generation and rule-based fallback matching.
BIOLOGY_KEYWORDS = {
    "Molecular & Structural Biology": [
        "Cryo-EM",
        "X-ray crystallography",
        "NMR spectroscopy",
        "Single-particle analysis",
        "Biolayer interferometry (BLI)",
        "Surface plasmon resonance (SPR)",
        "Confocal microscopy",
        "CRISPR-Cas9",
        "TALEN",
        "ZFN",
        "RNA interference (RNAi)",
        "Single-molecule imaging",
        "FRET",
        "Optogenetics"
    ],

    "Cell & Single-Cell Technologies": [
        "Single-cell RNA-seq (scRNA-seq)",
        "Single-cell ATAC-seq",
        "Spatial transcriptomics",
        "FISH (Fluorescence in situ hybridization)",
        "Immunofluorescence",
        "Tissue clearing (CLARITY)",
        "Flow cytometry (FACS)",
        "CyTOF (Mass cytometry)",
        "High-throughput screening",
        "Organoids",
        "3D cell culture",
        "Microfluidics"
    ],

    "Neuroscience Tools": [
        "Optogenetics",
        "DREADDs (Designer Receptors Exclusively Activated by Designer Drugs)",
        "GCaMP calcium imaging",
        "Two-photon microscopy",
        "Neural tracing",
        "Patch-seq",
        "Lineage tracing",
        "Spatial multi-omics"
    ],

    "Omics & Systems Biology": [
        "RNA sequencing (RNA-seq)",
        "Proteomics (LC-MS/MS)",
        "Metabolomics",
        "Epigenomics",
        "10x Genomics",
        "SMART-seq",
        "Nanopore sequencing",
        "Illumina HiSeq",
        "WGCNA",
        "Machine learning in omics",
        "scVelo"
    ],

    "Microbiome & Immunology": [
        "16S rRNA sequencing",
        "Metagenomics",
        "Gut-brain axis",
        "VDJ-seq",
        "TCR/BCR lineage tracing",
        "Immune checkpoints (PD-1, CTLA-4)",
        "mRNA vaccines",
        "DNA vaccines",
        "Nanoparticle vaccines",
        "Antigen presentation systems"
    ],

    "Development & Regeneration": [
        "Induced pluripotent stem cells (iPSCs)",
        "Embryonic stem cells (ESCs)",
        "Cellular reprogramming",
        "Wnt signaling",
        "Hippo pathway",
        "Notch signaling",
        "Zebrafish models",
        "C. elegans",
        "Mouse embryonic sections"
    ],

    "Ecology & Environmental Biology": [
        "Environmental DNA (eDNA)",
        "Remote sensing ecology",
        "Biosensors",
        "Ecological niche modeling (ENM)",
        "Genetic diversity analysis",
        "Captive breeding technologies"
    ],

    "Bioinformatics & AI Tools": [
        "Seurat",
        "Scanpy",
        "Monocle",
        "CIBERSORT",
        "GSEA",
        "AlphaFold",
        "RoseTTAFold",
        "Molecular docking",
        "STRING",
        "Cytoscape",
        "Gene Ontology (GO)",
        "KEGG pathway analysis"
    ]
}
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
class AcademicSearcher:
    """Base academic searcher: fetches raw records only, no analysis.

    Pipeline: Europe PMC search (DOIs + metadata, citation-sorted) ->
    parallel bioRxiv detail lookups keyed by DOI.
    """

    # Europe PMC REST search endpoint.
    EPMC_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    # bioRxiv details endpoint; the DOI is interpolated into the path.
    BIORXIV_URL = "https://api.biorxiv.org/details/biorxiv/{doi}/na/json"

    def __init__(self, limit=50, sleep=0.1, timeout=30):
        # limit: max number of detailed records to return.
        # sleep: inter-request delay budget (currently unused).
        # timeout: per-request timeout in seconds.
        self.limit = limit
        self.sleep = sleep
        self.timeout = timeout
        # Relaxed SSL context for endpoints that fail strict verification.
        # NOTE(review): disabling certificate verification is insecure and
        # should only be tolerated for read-only, non-sensitive API calls.
        self.ssl_context = ssl.create_default_context()
        self.ssl_context.check_hostname = False
        self.ssl_context.verify_mode = ssl.CERT_NONE

    def search(self, query: str, search_preprints: bool = True) -> List[Dict[str, Any]]:
        """Run the two-stage search and return raw paper dicts.

        Args:
            query: EPMC query expression.
            search_preprints: Restrict results to bioRxiv preprints when True.

        Returns:
            Detail dicts (see _get_biorxiv_detail), each tagged with
            'query_used'; empty list on error.
        """
        try:
            # 1. Get the DOI/metadata list from Europe PMC.
            epmc_results = self._epmc_search(query, search_preprints)

            # 2. Fetch per-DOI details in parallel.
            detailed_results = self._get_details_parallel(epmc_results, query)

            return detailed_results
        except Exception as e:
            print(f"检索错误: {str(e)}")
            return []

    def _get_details_parallel(self, epmc_results: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
        """Fetch bioRxiv details for each EPMC hit via a thread pool.

        Collection stops once ``self.limit`` details have been gathered;
        individual lookup failures are logged and skipped.
        """
        detailed_results = []

        # Nothing to do for an empty hit list.
        if not epmc_results:
            return detailed_results

        # Cap concurrency to avoid hammering the API.
        max_workers = min(5, len(epmc_results))

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit one detail-lookup task per item that has a DOI.
            future_to_item = {}
            for item in epmc_results:
                doi = item.get("doi")
                if doi:
                    future = executor.submit(self._get_biorxiv_detail, doi)
                    future_to_item[future] = item

            # Collect results as they complete (arrival order, not submit order).
            for future in as_completed(future_to_item):
                item = future_to_item[future]
                try:
                    detail = future.result(timeout=10)  # per-future 10s cap
                    if detail:
                        detail["query_used"] = query
                        detailed_results.append(detail)

                    if len(detailed_results) >= self.limit:
                        break

                except Exception as e:
                    print(f"获取详情失败: {item.get('doi')} - {str(e)}")
                    continue

        return detailed_results

    def _epmc_search(self, query: str, search_preprints: bool = True) -> List[Dict[str, Any]]:
        """Query Europe PMC and return its raw result list.

        Retries up to 3 times with exponential backoff; returns [] when
        all attempts fail.
        """
        if search_preprints:
            # Preprints only: bioRxiv DOIs all start with 10.1101.
            query_str = f'(SRC:PPR) AND (DOI:10.1101*) AND ({query})'
        else:
            # Published papers (these carry citation-count data).
            query_str = f'({query})'

        params = {
            "query": query_str,
            "resultType": "core",
            "pageSize": str(min(50, self.limit)),  # cap candidate pool
            "format": "json",
            "sortby": "CITED+desc",  # most-cited first
        }

        # Retry loop with exponential backoff (1s, 2s).
        for attempt in range(3):
            try:
                response = requests.get(
                    self.EPMC_URL,
                    params=params,
                    timeout=self.timeout,
                    verify=False,  # skip SSL verification (see __init__ note)
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                    }
                )
                response.raise_for_status()
                data = response.json()
                results = data.get("resultList", {}).get("result", [])

                # Debug: show hit count and top citation counts.
                if results:
                    print(f"EPMC检索到 {len(results)} 篇文献,按引用量排序")
                    # Show citation info for the first few hits.
                    for i, result in enumerate(results[:3]):
                        cited_count = result.get('citedByCount', 0)
                        title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
                        print(f" 文献 {i+1}: {title} (引用量: {cited_count})")

                return results

            except requests.exceptions.SSLError as e:
                print(f"EPMC SSL错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC SSL连接失败,返回空结果")
                    return []
                time.sleep(2 ** attempt)

            except requests.exceptions.RequestException as e:
                print(f"EPMC请求错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC请求失败,返回空结果")
                    return []
                time.sleep(2 ** attempt)

            except Exception as e:
                print(f"EPMC未知错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC未知错误,返回空结果")
                    return []
                time.sleep(2 ** attempt)

        return []

    def _get_biorxiv_detail(self, doi: str) -> Dict[str, Any]:
        """Fetch one bioRxiv record's details by DOI.

        Returns:
            Dict with title/abstract/corresponding-author fields and a
            versioned bioRxiv URL, or None when the DOI is not a bioRxiv
            record, the record list is empty, or all retries fail.
        """
        url = self.BIORXIV_URL.format(doi=urllib.parse.quote(doi))

        # Retry with exponential backoff and tolerant error handling.
        for attempt in range(3):
            try:
                # Lenient SSL and a generous timeout for this endpoint.
                response = requests.get(
                    url,
                    timeout=self.timeout,
                    verify=False,  # skip SSL verification to dodge SSL errors
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                    }
                )
                response.raise_for_status()
                data = response.json()

                records = data.get("collection") or data.get("records") or []
                if not records:
                    return None

                # The latest version is the last record in the list.
                latest_record = records[-1]
                if latest_record.get("server") and latest_record["server"].lower() != "biorxiv":
                    return None

                version = latest_record.get("version") or 1
                return {
                    "doi": latest_record.get("doi"),
                    "title": latest_record.get("title"),
                    "abstract": latest_record.get("abstract"),
                    "corresponding_author": latest_record.get("author_corresponding"),
                    "corresponding_institution": latest_record.get("author_corresponding_institution"),
                    "url": f"https://www.biorxiv.org/content/{latest_record['doi']}v{version}"
                }

            except requests.exceptions.SSLError as e:
                print(f"bioRxiv SSL错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # last attempt
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

            except requests.exceptions.RequestException as e:
                print(f"bioRxiv请求错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # last attempt
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

            except Exception as e:
                print(f"bioRxiv未知错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # last attempt
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

        return None
|
| 525 |
+
|
| 526 |
+
|
| 527 |
+
class DynamicAcademicSearcher:
    """Dynamic academic searcher: smart query generation, dynamic query
    relaxation, and multi-backend (EPMC / OpenAlex) retrieval."""

    def __init__(self, base_searcher: AcademicSearcher = None, openalex_searcher: OpenAlexSearcher = None):
        """Wire up the retrieval backends.

        Args:
            base_searcher: optional Europe PMC backend.
            openalex_searcher: optional OpenAlex backend.

        When neither backend is supplied, a default OpenAlexSearcher is
        created so the instance is always usable.
        """
        self.base_searcher = base_searcher
        self.openalex_searcher = openalex_searcher
        # No backend at all -> fall back to a default OpenAlex backend.
        if not self.base_searcher and not self.openalex_searcher:
            self.openalex_searcher = OpenAlexSearcher()
|
| 536 |
+
|
| 537 |
+
def search_with_dynamic_queries(self, paper: PaperInfo, num_reviewers: int = 8, years_after: int = None) -> tuple:
|
| 538 |
+
"""使用动态查询进行单通道检索"""
|
| 539 |
+
# 1. 生成检索查询(只生成一个查询)
|
| 540 |
+
queries = self.generate_search_queries(paper, 1)
|
| 541 |
+
print("生成的检索查询:")
|
| 542 |
+
for i, query in enumerate(queries, 1):
|
| 543 |
+
print(f"查询 {i}: {query}")
|
| 544 |
+
|
| 545 |
+
# 记录查询生成日志
|
| 546 |
+
self._log_query_generation(paper, queries)
|
| 547 |
+
|
| 548 |
+
# 2. 根据可用的检索器选择检索策略
|
| 549 |
+
if self.openalex_searcher:
|
| 550 |
+
return self._search_with_openalex_single_channel(queries[0], num_reviewers, years_after)
|
| 551 |
+
elif self.base_searcher:
|
| 552 |
+
return self._search_with_epmc_single_channel(queries[0], num_reviewers)
|
| 553 |
+
else:
|
| 554 |
+
print("错误:没有可用的检索器")
|
| 555 |
+
return [], []
|
| 556 |
+
|
| 557 |
+
def _search_with_openalex_single_channel(self, query: str, num_reviewers: int, years_after: int = None) -> tuple:
|
| 558 |
+
"""使用OpenAlex进行单通道检索"""
|
| 559 |
+
# 计算需要检索的文献数量(目标审稿人数量的5倍)
|
| 560 |
+
target_count = num_reviewers * 5
|
| 561 |
+
print(f"单通道检索:目标审稿人数量 {num_reviewers},检索文献数量 {target_count}")
|
| 562 |
+
|
| 563 |
+
# 执行检索
|
| 564 |
+
candidates = self._execute_openalex_query(query, sort_by_citations=False, years_after=years_after, target_count=target_count)
|
| 565 |
+
|
| 566 |
+
print(f"单通道检索完成,获得 {len(candidates)} 个候选审稿人")
|
| 567 |
+
|
| 568 |
+
# 返回单个通道的结果(为了保持兼容性,返回三个相同的通道)
|
| 569 |
+
return candidates, candidates, candidates
|
| 570 |
+
|
| 571 |
+
def _search_with_epmc_single_channel(self, query: str, num_reviewers: int) -> tuple:
|
| 572 |
+
"""使用EPMC进行单通道检索"""
|
| 573 |
+
# 计算需要检索的文献数量(目标审稿人数量的5倍)
|
| 574 |
+
target_count = num_reviewers * 5
|
| 575 |
+
print(f"单通道检索:目标审稿人数量 {num_reviewers},检索文献数量 {target_count}")
|
| 576 |
+
|
| 577 |
+
# 执行检索
|
| 578 |
+
candidates = self._execute_single_query(query, search_preprints=True, target_count=target_count)
|
| 579 |
+
|
| 580 |
+
print(f"单通道检索完成,获得 {len(candidates)} 个候选审稿人")
|
| 581 |
+
|
| 582 |
+
# 返回单个通道的结果(为了保持兼容性,返回三个相同的通道)
|
| 583 |
+
return candidates, candidates, candidates
|
| 584 |
+
|
| 585 |
+
def _search_with_openalex(self, queries: List[str], years_after: int = None) -> tuple:
|
| 586 |
+
"""使用OpenAlex进行三通道检索"""
|
| 587 |
+
channel1_candidates = [] # 高引用量论文
|
| 588 |
+
channel2_candidates = [] # 相关性论文
|
| 589 |
+
channel3_candidates = [] # 相关性论文
|
| 590 |
+
|
| 591 |
+
# 确保至少有3个查询
|
| 592 |
+
if len(queries) < 3:
|
| 593 |
+
print(f"警告:查询数量不足({len(queries)}/3),将使用备用查询")
|
| 594 |
+
queries = queries + ["cryo-em structure", "cryo-em structure analysis"] * (3 - len(queries))
|
| 595 |
+
|
| 596 |
+
# 使用线程池并行执行查询
|
| 597 |
+
with ThreadPoolExecutor(max_workers=6) as executor:
|
| 598 |
+
# 提交查询任务
|
| 599 |
+
future_to_query = {}
|
| 600 |
+
|
| 601 |
+
if len(queries) >= 1:
|
| 602 |
+
future1 = executor.submit(self._execute_openalex_query, queries[0], sort_by_citations=False, years_after=years_after)
|
| 603 |
+
future_to_query[future1] = (queries[0], "高引用量")
|
| 604 |
+
|
| 605 |
+
# 通道2:使用查询2按相关性排序
|
| 606 |
+
if len(queries) >= 2:
|
| 607 |
+
future2 = executor.submit(self._execute_openalex_query, queries[1], sort_by_citations=False, years_after=years_after)
|
| 608 |
+
future_to_query[future2] = (queries[1], "相关性2")
|
| 609 |
+
|
| 610 |
+
# 通道3:使用查询3按相关性排序
|
| 611 |
+
if len(queries) >= 3:
|
| 612 |
+
future3 = executor.submit(self._execute_openalex_query, queries[2], sort_by_citations=False, years_after=years_after)
|
| 613 |
+
future_to_query[future3] = (queries[2], "相关性3")
|
| 614 |
+
|
| 615 |
+
# 收集结果
|
| 616 |
+
for future in as_completed(future_to_query):
|
| 617 |
+
query, search_type = future_to_query[future]
|
| 618 |
+
try:
|
| 619 |
+
results = future.result()
|
| 620 |
+
if search_type == "高引用量":
|
| 621 |
+
channel1_candidates.extend(results)
|
| 622 |
+
elif search_type == "相关性2":
|
| 623 |
+
channel2_candidates.extend(results)
|
| 624 |
+
elif search_type == "相关性3":
|
| 625 |
+
channel3_candidates.extend(results)
|
| 626 |
+
except Exception as e:
|
| 627 |
+
print(f"查询失败 {query} ({search_type}): {str(e)}")
|
| 628 |
+
|
| 629 |
+
# 显示检索结果
|
| 630 |
+
print(f"\n通道1(高引用量排序)的检索结果:")
|
| 631 |
+
if channel1_candidates and len(queries) >= 1:
|
| 632 |
+
print(f"查询: \"{queries[0]}\" (按引用量)")
|
| 633 |
+
# 显示前3篇文献
|
| 634 |
+
for j, result in enumerate(channel1_candidates[:3], 1):
|
| 635 |
+
cited_count = result.get('citedByCount', 0)
|
| 636 |
+
title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
|
| 637 |
+
print(f"文献{j}: {title} (引用量: {cited_count})")
|
| 638 |
+
|
| 639 |
+
print(f"\n通道2(相关性排序)的检索结果:")
|
| 640 |
+
if channel2_candidates and len(queries) >= 2:
|
| 641 |
+
print(f"查询: \"{queries[1]}\" (按相关性)")
|
| 642 |
+
# 显示前3篇文献
|
| 643 |
+
for j, result in enumerate(channel2_candidates[:3], 1):
|
| 644 |
+
cited_count = result.get('citedByCount', 0)
|
| 645 |
+
title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
|
| 646 |
+
print(f"文献{j}: {title} (引用量: {cited_count})")
|
| 647 |
+
|
| 648 |
+
print(f"\n通道3(相关性排序)的检索结果:")
|
| 649 |
+
if channel3_candidates and len(queries) >= 3:
|
| 650 |
+
print(f"查询: \"{queries[2]}\" (按相关性)")
|
| 651 |
+
# 显示前3篇文献
|
| 652 |
+
for j, result in enumerate(channel3_candidates[:3], 1):
|
| 653 |
+
cited_count = result.get('citedByCount', 0)
|
| 654 |
+
title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
|
| 655 |
+
print(f"文献{j}: {title} (引用量: {cited_count})")
|
| 656 |
+
|
| 657 |
+
# 打印每个通道的候选审稿人列表
|
| 658 |
+
self._print_candidate_reviewers("通道1(高引用量)", channel1_candidates)
|
| 659 |
+
self._print_candidate_reviewers("通道2(相关性)", channel2_candidates)
|
| 660 |
+
self._print_candidate_reviewers("通道3(高相关性)", channel3_candidates)
|
| 661 |
+
|
| 662 |
+
print(f"\nOpenAlex检索完成 - 通道1: {len(channel1_candidates)} 篇, 通道2: {len(channel2_candidates)} 篇, 通道3: {len(channel3_candidates)} 篇")
|
| 663 |
+
return channel1_candidates, channel2_candidates, channel3_candidates
|
| 664 |
+
|
| 665 |
+
def _print_candidate_reviewers(self, channel_name: str, candidates: List[Dict[str, Any]]):
|
| 666 |
+
"""打印候选审稿人列表"""
|
| 667 |
+
if not candidates:
|
| 668 |
+
print(f"\n{channel_name}候选审稿人: 无")
|
| 669 |
+
return
|
| 670 |
+
|
| 671 |
+
print(f"\n{channel_name}候选审稿人:")
|
| 672 |
+
seen_reviewers = set()
|
| 673 |
+
|
| 674 |
+
for i, candidate in enumerate(candidates, 1):
|
| 675 |
+
corresponding_author = candidate.get('corresponding_author', '')
|
| 676 |
+
corresponding_institution = candidate.get('corresponding_institution', '')
|
| 677 |
+
title = candidate.get('title', '')
|
| 678 |
+
|
| 679 |
+
if corresponding_author:
|
| 680 |
+
# 创建审稿人标识符用于去重
|
| 681 |
+
author_lower = corresponding_author.lower()
|
| 682 |
+
institution_lower = (corresponding_institution or "未知机构").lower()
|
| 683 |
+
reviewer_key = f"{author_lower}_{institution_lower}"
|
| 684 |
+
|
| 685 |
+
if reviewer_key not in seen_reviewers:
|
| 686 |
+
seen_reviewers.add(reviewer_key)
|
| 687 |
+
title_short = title[:60] + '...' if len(title) > 60 else title
|
| 688 |
+
print(f" {len(seen_reviewers)}. {corresponding_author} ({corresponding_institution})")
|
| 689 |
+
print(f" 论文: {title_short}")
|
| 690 |
+
|
| 691 |
+
print(f" 总计: {len(seen_reviewers)} 位候选审稿人")
|
| 692 |
+
|
| 693 |
+
def _execute_openalex_query(self, query: str, sort_by_citations: bool = False, years_after: int = None, target_count: int = None) -> List[Dict[str, Any]]:
|
| 694 |
+
"""执行单个OpenAlex查询"""
|
| 695 |
+
try:
|
| 696 |
+
return self.openalex_searcher.search(query, sort_by_citations=sort_by_citations, years_after=years_after, target_count=target_count)
|
| 697 |
+
except Exception as e:
|
| 698 |
+
print(f"OpenAlex查询执行失败: {str(e)}")
|
| 699 |
+
return []
|
| 700 |
+
|
| 701 |
+
def _search_with_epmc(self, queries: List[str]) -> tuple:
|
| 702 |
+
"""使用EPMC进行双数据源检索"""
|
| 703 |
+
channel1_candidates = [] # 已发表论文(高引用量)
|
| 704 |
+
channel2_candidates = [] # 预印本(最新研��)
|
| 705 |
+
|
| 706 |
+
# 使用线程池并行执行查询
|
| 707 |
+
with ThreadPoolExecutor(max_workers=4) as executor:
|
| 708 |
+
# 提交所有查询任务
|
| 709 |
+
future_to_query = {}
|
| 710 |
+
for i, query in enumerate(queries):
|
| 711 |
+
# 通道1:检索已发表论文
|
| 712 |
+
future1 = executor.submit(self._execute_single_query, query, search_preprints=False)
|
| 713 |
+
future_to_query[future1] = (query, "已发表论文")
|
| 714 |
+
|
| 715 |
+
# 通道2:检索预印本
|
| 716 |
+
future2 = executor.submit(self._execute_single_query, query, search_preprints=True)
|
| 717 |
+
future_to_query[future2] = (query, "预印本")
|
| 718 |
+
|
| 719 |
+
# 收集结果
|
| 720 |
+
for future in as_completed(future_to_query):
|
| 721 |
+
query, data_source = future_to_query[future]
|
| 722 |
+
try:
|
| 723 |
+
results = future.result(timeout=30) # 30秒超时
|
| 724 |
+
if results:
|
| 725 |
+
print(f"查询 '{query}' ({data_source}) 完成,找到 {len(results)} 篇文献")
|
| 726 |
+
if data_source == "已发表论文":
|
| 727 |
+
channel1_candidates.extend(results)
|
| 728 |
+
else:
|
| 729 |
+
channel2_candidates.extend(results)
|
| 730 |
+
else:
|
| 731 |
+
print(f"查询 '{query}' ({data_source}) 未找到文献")
|
| 732 |
+
except Exception as e:
|
| 733 |
+
print(f"查询 '{query}' ({data_source}) 执行失败: {str(e)}")
|
| 734 |
+
|
| 735 |
+
# 3. 去重相同文献
|
| 736 |
+
unique_channel1 = {item['doi']: item for item in channel1_candidates if item.get('doi')}.values()
|
| 737 |
+
unique_channel2 = {item['doi']: item for item in channel2_candidates if item.get('doi')}.values()
|
| 738 |
+
|
| 739 |
+
print(f"双数据源检索完成:已发表论文 {len(list(unique_channel1))} 篇,预印本 {len(list(unique_channel2))} 篇")
|
| 740 |
+
return list(unique_channel1), list(unique_channel2)
|
| 741 |
+
|
| 742 |
+
def _execute_single_query(self, query: str, search_preprints: bool = True, target_count: int = None) -> List[Dict[str, Any]]:
|
| 743 |
+
"""执行单个查询(用于并行处理)"""
|
| 744 |
+
data_source = "预印本" if search_preprints else "已发表论文"
|
| 745 |
+
print(f"开始执行查询: {query} ({data_source})")
|
| 746 |
+
|
| 747 |
+
# 动态查询处理
|
| 748 |
+
processed_queries = self._process_query_dynamically(query)
|
| 749 |
+
|
| 750 |
+
for processed_query in processed_queries:
|
| 751 |
+
print(f" 尝试查询: {processed_query}")
|
| 752 |
+
results = self.base_searcher.search(processed_query, search_preprints)
|
| 753 |
+
|
| 754 |
+
if results:
|
| 755 |
+
print(f" 找到 {len(results)} 篇文献")
|
| 756 |
+
return results
|
| 757 |
+
else:
|
| 758 |
+
print(f" 未找到文献,尝试扩展查询...")
|
| 759 |
+
|
| 760 |
+
print(f" 所有扩展查询都未找到文献")
|
| 761 |
+
return []
|
| 762 |
+
|
| 763 |
+
    def generate_search_queries(self, paper: PaperInfo, num_queries: int = 2) -> List[str]:
        """Generate search queries for *paper* via the LLM.

        Asks the LLM for a single boolean query of the form
        ``(subject) AND (components) AND (methods)``, validates its format,
        and falls back to heuristic backup queries when the LLM call fails
        or validation rejects the result.  The one validated query is
        duplicated to satisfy ``num_queries``.

        Args:
            paper: the submitted paper (title, abstract, keywords are used).
            num_queries: how many query strings the caller expects back.

        Returns:
            A list of ``num_queries`` query strings.
        """
        system_msg = "你是学术检索专家,擅长从论文中提取出多个维度的关键词"

        prompt = f"""
请分析以下论文,提取关键信息并生成1个精准的检索查询:

论文信息:
标题: {paper.title}
摘要: {paper.abstract}
关键词: {', '.join(paper.keywords)}

分析任务:
请从论文中识别以下三个维度的关键信息:

1. 一个研究主体 (Research Subject)
- 论文研究的核心对象、分子、蛋白质、疾病等
- 例如:Nav1.7、COVID-19、dopamine、insulin等

2. 关键组分 (Key Components)
- 与研究主体相关的亚单位、配体、抑制剂、调节因子等
- 例如:β1亚单位、Protoxin-II、receptor、agonist等

3. 研究方法 (Research Method)
- 论文使用的核心技术、实验方法、分析手段等
- 例如:Cryo-EM、CRISPR、NMR、patch-clamp等

查询生成规则:
- 使用布尔运算符AND和OR构建精确查询
- 每个维度内使用OR连接同义词或相关术语
- 不同维度间使用AND连接
- 优先选择最核心、最特异的术语
- 避免过于宽泛的通用词汇

输出要求:
请生成1个检索查询,格式如下:
(研究主体) AND (关键组分1 OR 关键组分2) AND (研究方法1 OR 研究方法2)

示例输出:
(Nav1.7) AND (β1 OR Protoxin-II) AND (cryo-EM OR cryo-electron microscopy)
"""

        # Ask the LLM; fall back to heuristic queries on any failure.
        response = self._call_llm(prompt.strip(), system_msg)
        if not response:
            return self._generate_backup_queries(paper, num_queries)

        # Parse the reply (only a single query is generated now).
        query = response.strip()
        print(f"LLM原始返回的查询: {query}")

        # Validate the boolean query format (AND/OR/parentheses/length/biology terms).
        validated_queries = self._validate_new_queries([query])
        print(f"验证后的查询数量: {len(validated_queries)}")

        # Validation failed -> use the heuristic backup queries instead.
        if len(validated_queries) == 0:
            print(f"查询验证失败,使用备用查询")
            backup_queries = self._generate_backup_queries(paper, num_queries)
            print(f"备用查询: {backup_queries}")
            return backup_queries

        # Return the validated query; when num_queries > 1 the same query is
        # repeated to satisfy the requested count.
        result_queries = validated_queries[:1]  # keep only the first query
        if num_queries > 1:
            # Reuse the single query to reach the requested count.
            result_queries = result_queries * num_queries
            print(f"重复使用查询以满足数量要求: {result_queries}")

        return result_queries
|
| 832 |
+
|
| 833 |
+
def _validate_new_queries(self, queries: List[str]) -> List[str]:
|
| 834 |
+
"""验证新格式查询(单查询格式)"""
|
| 835 |
+
validated_queries = []
|
| 836 |
+
|
| 837 |
+
for query in queries:
|
| 838 |
+
# 基本格式检查
|
| 839 |
+
if not query or len(query.strip()) < 10:
|
| 840 |
+
print(f"查询太短,跳过: {query}")
|
| 841 |
+
continue
|
| 842 |
+
|
| 843 |
+
# 检查是否包含AND操作符(新格式要求)
|
| 844 |
+
if ' AND ' not in query:
|
| 845 |
+
print(f"查询缺少AND操作符,跳过: {query}")
|
| 846 |
+
continue
|
| 847 |
+
|
| 848 |
+
# 检查是否包含括号(新格式要求)
|
| 849 |
+
if '(' not in query or ')' not in query:
|
| 850 |
+
print(f"查询缺少括号,跳过: {query}")
|
| 851 |
+
continue
|
| 852 |
+
|
| 853 |
+
# 检查是否包含OR操作符(新格式要求)
|
| 854 |
+
if ' OR ' not in query:
|
| 855 |
+
print(f"查询缺少OR操作符,跳过: {query}")
|
| 856 |
+
continue
|
| 857 |
+
|
| 858 |
+
# 检查查询长度合理性
|
| 859 |
+
if len(query) > 200: # 查询过长
|
| 860 |
+
print(f"查询过长,跳过: {query}")
|
| 861 |
+
continue
|
| 862 |
+
|
| 863 |
+
# 检查是否包含生物学关键词分类
|
| 864 |
+
has_biology_keyword = False
|
| 865 |
+
for category, keywords in BIOLOGY_KEYWORDS.items():
|
| 866 |
+
if category.lower() in query.lower():
|
| 867 |
+
has_biology_keyword = True
|
| 868 |
+
break
|
| 869 |
+
for keyword in keywords:
|
| 870 |
+
if keyword.lower() in query.lower():
|
| 871 |
+
has_biology_keyword = True
|
| 872 |
+
break
|
| 873 |
+
if has_biology_keyword:
|
| 874 |
+
break
|
| 875 |
+
|
| 876 |
+
if not has_biology_keyword:
|
| 877 |
+
print(f"查询不包含生物学关键词分类,跳过: {query}")
|
| 878 |
+
continue
|
| 879 |
+
|
| 880 |
+
validated_queries.append(query.strip())
|
| 881 |
+
print(f"查询验证通过: {query}")
|
| 882 |
+
|
| 883 |
+
return validated_queries
|
| 884 |
+
|
| 885 |
+
def _validate_queries(self, queries: List[str]) -> List[str]:
|
| 886 |
+
"""验证查询格式和质量"""
|
| 887 |
+
validated_queries = []
|
| 888 |
+
|
| 889 |
+
for query in queries:
|
| 890 |
+
# 基本格式检查
|
| 891 |
+
if not query or len(query.strip()) < 5:
|
| 892 |
+
print(f"查询太短,跳过: {query}")
|
| 893 |
+
continue
|
| 894 |
+
|
| 895 |
+
# 检查是否包含AND连接符
|
| 896 |
+
if ' AND ' not in query:
|
| 897 |
+
print(f"查询缺少AND连接符,跳过: {query}")
|
| 898 |
+
continue
|
| 899 |
+
|
| 900 |
+
# 检查是否只包含两个关键词(主要学科 AND 研究层面关键词)
|
| 901 |
+
parts = query.split(' AND ')
|
| 902 |
+
if len(parts) != 2:
|
| 903 |
+
print(f"查询格式不正确,跳过: {query}")
|
| 904 |
+
continue
|
| 905 |
+
|
| 906 |
+
# 检查每个部分是否有效
|
| 907 |
+
part1 = parts[0].strip()
|
| 908 |
+
part2 = parts[1].strip()
|
| 909 |
+
|
| 910 |
+
if not part1 or not part2:
|
| 911 |
+
print(f"查询包含空部分,跳过: {query}")
|
| 912 |
+
continue
|
| 913 |
+
|
| 914 |
+
if part1.upper() == 'AND' or part2.upper() == 'AND':
|
| 915 |
+
print(f"查询包含无效AND,跳过: {query}")
|
| 916 |
+
continue
|
| 917 |
+
|
| 918 |
+
# 检查是否包含生物学关键词分类
|
| 919 |
+
has_biology_keyword = False
|
| 920 |
+
for category, keywords in BIOLOGY_KEYWORDS.items():
|
| 921 |
+
if category.lower() in query.lower():
|
| 922 |
+
has_biology_keyword = True
|
| 923 |
+
break
|
| 924 |
+
for keyword in keywords:
|
| 925 |
+
if keyword.lower() in query.lower():
|
| 926 |
+
has_biology_keyword = True
|
| 927 |
+
break
|
| 928 |
+
if has_biology_keyword:
|
| 929 |
+
break
|
| 930 |
+
|
| 931 |
+
if not has_biology_keyword:
|
| 932 |
+
print(f"查询不包含生物学关键词分类,跳过: {query}")
|
| 933 |
+
continue
|
| 934 |
+
|
| 935 |
+
# 检查查询长度合理性
|
| 936 |
+
if len(query) > 100: # 查询过长
|
| 937 |
+
print(f"查询过长,跳过: {query}")
|
| 938 |
+
continue
|
| 939 |
+
|
| 940 |
+
validated_queries.append(query.strip())
|
| 941 |
+
print(f"查询验证通过: {query}")
|
| 942 |
+
|
| 943 |
+
return validated_queries
|
| 944 |
+
|
| 945 |
+
def _process_query_dynamically(self, query: str) -> List[str]:
|
| 946 |
+
"""动态处理查询,生成多个变体"""
|
| 947 |
+
# 基础查询
|
| 948 |
+
queries = [query]
|
| 949 |
+
|
| 950 |
+
# 检查查询格式是否正确
|
| 951 |
+
if ' AND ' not in query:
|
| 952 |
+
return queries
|
| 953 |
+
|
| 954 |
+
# 按AND分割查询
|
| 955 |
+
parts = query.split(' AND ')
|
| 956 |
+
if len(parts) != 2:
|
| 957 |
+
return queries
|
| 958 |
+
|
| 959 |
+
# 清理每个部分
|
| 960 |
+
part1 = parts[0].strip()
|
| 961 |
+
part2 = parts[1].strip()
|
| 962 |
+
|
| 963 |
+
# 如果某个部分为空或只包含AND,跳过
|
| 964 |
+
if not part1 or not part2 or part1.upper() == 'AND' or part2.upper() == 'AND':
|
| 965 |
+
return queries
|
| 966 |
+
|
| 967 |
+
# 简化查询(只保留主要关键词)
|
| 968 |
+
part1_words = part1.split()
|
| 969 |
+
part2_words = part2.split()
|
| 970 |
+
|
| 971 |
+
if len(part1_words) > 1:
|
| 972 |
+
# 取第一个部分的主要关键词
|
| 973 |
+
simplified_part1 = part1_words[0]
|
| 974 |
+
queries.append(f"{simplified_part1} AND {part2}")
|
| 975 |
+
|
| 976 |
+
if len(part2_words) > 1:
|
| 977 |
+
# 取第二个部分的主要关键词
|
| 978 |
+
simplified_part2 = part2_words[0]
|
| 979 |
+
queries.append(f"{part1} AND {simplified_part2}")
|
| 980 |
+
|
| 981 |
+
# 单个关键词查询
|
| 982 |
+
queries.append(part1)
|
| 983 |
+
queries.append(part2)
|
| 984 |
+
|
| 985 |
+
return list(set(queries)) # 去重
|
| 986 |
+
|
| 987 |
+
def _generate_backup_queries(self, paper: PaperInfo, num_queries: int) -> List[str]:
|
| 988 |
+
"""生成备用查询,使用新格式"""
|
| 989 |
+
queries = []
|
| 990 |
+
|
| 991 |
+
# 从论文标题和摘要中提取关键词
|
| 992 |
+
text = f"{paper.title} {paper.abstract}".lower()
|
| 993 |
+
|
| 994 |
+
# 常见技术关键词列表
|
| 995 |
+
tech_keywords = [
|
| 996 |
+
"cryo-em", "cryoem", "x-ray", "xray", "nmr", "crispr", "pcr", "western blot",
|
| 997 |
+
"immunofluorescence", "confocal", "flow cytometry", "mass spectrometry",
|
| 998 |
+
"chromatography", "electrophoresis", "microscopy", "spectroscopy"
|
| 999 |
+
]
|
| 1000 |
+
|
| 1001 |
+
# 查找技术关键词
|
| 1002 |
+
found_tech_keyword = None
|
| 1003 |
+
for keyword in tech_keywords:
|
| 1004 |
+
if keyword in text:
|
| 1005 |
+
found_tech_keyword = keyword
|
| 1006 |
+
break
|
| 1007 |
+
|
| 1008 |
+
# 如果没有找到技术关键词,使用默认值
|
| 1009 |
+
if not found_tech_keyword:
|
| 1010 |
+
found_tech_keyword = "cryo-em"
|
| 1011 |
+
|
| 1012 |
+
# 查询1:纯子类关键词
|
| 1013 |
+
queries.append(found_tech_keyword)
|
| 1014 |
+
|
| 1015 |
+
# 查询2:子类关键词 + 子子类关键词
|
| 1016 |
+
queries.append(f"{found_tech_keyword} structure")
|
| 1017 |
+
|
| 1018 |
+
# 从标题中提取特定术语
|
| 1019 |
+
title_words = paper.title.split()
|
| 1020 |
+
specific_term = None
|
| 1021 |
+
for word in title_words:
|
| 1022 |
+
# 过滤掉常见词汇,寻找有意义的术语
|
| 1023 |
+
if (len(word) > 3 and
|
| 1024 |
+
word.lower() not in ['the', 'and', 'for', 'with', 'from', 'this', 'that',
|
| 1025 |
+
'structures', 'human', 'channel', 'complex', 'with',
|
| 1026 |
+
'auxiliary', 'subunits', 'animal', 'toxins', 'analysis',
|
| 1027 |
+
'study', 'research', 'investigation', 'characterization']):
|
| 1028 |
+
specific_term = word
|
| 1029 |
+
break
|
| 1030 |
+
|
| 1031 |
+
if specific_term:
|
| 1032 |
+
# 查询3:子类关键词 + 子子类关键词 + 论文特定术语
|
| 1033 |
+
queries.append(f"{found_tech_keyword} structure {specific_term}")
|
| 1034 |
+
else:
|
| 1035 |
+
# 如果没有找到特定术语,使用第一个查询的变体
|
| 1036 |
+
queries.append(f"{found_tech_keyword} structure analysis")
|
| 1037 |
+
|
| 1038 |
+
# 确保总是返回所需数量的查询
|
| 1039 |
+
while len(queries) < num_queries:
|
| 1040 |
+
# 如果还需要更多查询,添加变体
|
| 1041 |
+
variant_num = len(queries) + 1
|
| 1042 |
+
queries.append(f"{found_tech_keyword} analysis")
|
| 1043 |
+
|
| 1044 |
+
print(f"备用查询生成完成,共 {len(queries)} 个查询")
|
| 1045 |
+
return queries[:num_queries]
|
| 1046 |
+
|
| 1047 |
+
def _infer_biology_keywords(self, paper: PaperInfo) -> tuple:
|
| 1048 |
+
"""从论文内容推断最相关的生物学分类和关键词"""
|
| 1049 |
+
text = f"{paper.title} {paper.abstract} {' '.join(paper.keywords)}".lower()
|
| 1050 |
+
|
| 1051 |
+
best_category = None
|
| 1052 |
+
best_keyword = None
|
| 1053 |
+
max_matches = 0
|
| 1054 |
+
|
| 1055 |
+
for category, keywords in BIOLOGY_KEYWORDS.items():
|
| 1056 |
+
category_matches = 0
|
| 1057 |
+
best_keyword_in_category = None
|
| 1058 |
+
|
| 1059 |
+
# 检查类别名称匹配
|
| 1060 |
+
if category.lower() in text:
|
| 1061 |
+
category_matches += 2
|
| 1062 |
+
|
| 1063 |
+
# 检查关键词匹配
|
| 1064 |
+
for keyword in keywords:
|
| 1065 |
+
if keyword.lower() in text:
|
| 1066 |
+
category_matches += 1
|
| 1067 |
+
if not best_keyword_in_category:
|
| 1068 |
+
best_keyword_in_category = keyword
|
| 1069 |
+
|
| 1070 |
+
# ���新最佳匹配
|
| 1071 |
+
if category_matches > max_matches:
|
| 1072 |
+
max_matches = category_matches
|
| 1073 |
+
best_category = category
|
| 1074 |
+
best_keyword = best_keyword_in_category or keywords[0]
|
| 1075 |
+
|
| 1076 |
+
return best_category, best_keyword
|
| 1077 |
+
|
| 1078 |
+
def _log_query_generation(self, paper: PaperInfo, queries: List[str]):
|
| 1079 |
+
"""记录查询生成日志"""
|
| 1080 |
+
log_info = {
|
| 1081 |
+
"paper_title": paper.title[:100] + "..." if len(paper.title) > 100 else paper.title,
|
| 1082 |
+
"paper_keywords": paper.keywords,
|
| 1083 |
+
"generated_queries": queries,
|
| 1084 |
+
"query_count": len(queries),
|
| 1085 |
+
"timestamp": time.time()
|
| 1086 |
+
}
|
| 1087 |
+
print(f"查询生成日志: {log_info}")
|
| 1088 |
+
|
| 1089 |
+
    def _call_llm(self, prompt: str, system_msg: str) -> Optional[str]:
        """Call the configured LLM backend and return its raw text reply.

        Prefers DashScope ("qwen-turbo-latest") when DASHSCOPE_API_KEY is
        set, otherwise OpenAI ("gpt-3.5-turbo") when OPENAI_API_KEY is set.

        Returns:
            The model's text output, or None when no key is configured,
            DashScope returns a non-200 status, or any exception occurs.
        """
        try:
            if DASHSCOPE_API_KEY:
                # Imported lazily so the module loads without the SDK installed.
                import dashscope
                dashscope.api_key = DASHSCOPE_API_KEY

                response = dashscope.Generation.call(
                    model="qwen-turbo-latest",
                    messages=[
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": prompt}
                    ],
                    timeout=30
                )
                if response.status_code == 200:
                    return response.output.text
                else:
                    # Non-200: log and fall through (returns None implicitly).
                    print(f"DashScope API错误: {response.message}")

            elif OPENAI_API_KEY:
                # OpenAI fallback path; also imported lazily.
                from openai import OpenAI
                client = OpenAI(api_key=OPENAI_API_KEY)
                response = client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": prompt}
                    ],
                    timeout=30
                )
                return response.choices[0].message.content

            else:
                print("未配置API密钥")
                return None

        except Exception as e:
            # Any SDK/network failure degrades to None; callers fall back
            # to heuristic query generation.
            print(f"大模型调用错误: {str(e)}")
            return None
|
reviewer_recommendation/utils.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
工具函数模块
|
| 3 |
+
提供错误处理、状态管理和通用工具函数
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import time
|
| 7 |
+
import logging
|
| 8 |
+
from typing import Optional, Dict, Any
|
| 9 |
+
from functools import wraps
|
| 10 |
+
|
| 11 |
+
from .models import AppState, RecommendationResponse
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# 配置日志
|
| 15 |
+
logging.basicConfig(
|
| 16 |
+
level=logging.INFO,
|
| 17 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 18 |
+
)
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def handle_api_errors(func):
    """Decorator that logs any exception raised by *func* and re-raises it."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as exc:
            logger.error(f"API调用错误 in {func.__name__}: {str(exc)}")
            raise
    return wrapper
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def validate_paper_info(title: str, abstract: str, keywords: str) -> tuple[bool, str]:
    """Validate the user-supplied paper fields.

    *keywords* is accepted for interface compatibility but not validated.

    Returns:
        (True, "") when valid, otherwise (False, <Chinese reason>).
    """
    # Title must be non-empty, abstract must be non-empty, then minimum lengths.
    if not (title and title.strip()):
        return False, "论文标题不能为空"
    if not (abstract and abstract.strip()):
        return False, "论文摘要不能为空"
    if len(abstract.strip()) < 50:
        return False, "论文摘要至少需要50个字符"
    if len(title.strip()) < 10:
        return False, "论文标题至少需要10个字符"
    return True, ""
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def validate_reviewer_count(count: int) -> tuple[bool, str]:
    """Check that the requested reviewer count lies within [1, 10].

    Returns:
        (True, "") when in range, otherwise (False, <Chinese reason>).
    """
    if count < 1:
        return (False, "推荐审稿人数量至少为1")
    return (False, "推荐审稿人数量不能超过10") if count > 10 else (True, "")
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def format_error_message(error: Exception) -> str:
    """Map an exception to a user-facing Chinese message.

    Substring matching is case-insensitive and checked in priority order:
    timeout, then api, then json; anything else gets a generic message.
    """
    message = str(error)
    lowered = message.lower()

    if "timeout" in lowered:
        return "请求超时,请稍后重试"
    if "api" in lowered:
        return "API调用失败,请检查网络连接"
    if "json" in lowered:
        return "数据解析错误,请重试"
    return f"系统错误 ({type(error).__name__}): {message}"
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def create_error_response(error: Exception, search_time: float = 0.0) -> RecommendationResponse:
    """Build a failed RecommendationResponse carrying a user-facing message."""
    message = format_error_message(error)
    return RecommendationResponse(
        reviewers=[],
        search_time=search_time,
        total_candidates=0,
        success=False,
        error_message=message,
    )
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def update_app_state(state: AppState, **kwargs) -> AppState:
    """Copy the given keyword values onto *state*.

    Only attributes that already exist on *state* are updated; unknown
    names are silently ignored.  Returns the (mutated) state.
    """
    for name, value in kwargs.items():
        if hasattr(state, name):
            setattr(state, name, value)
    return state
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def log_operation(operation: str, **kwargs):
    """Emit a structured info-level log entry for *operation*."""
    entry = {"operation": operation, "timestamp": time.time(), **kwargs}
    logger.info(f"操作日志: {entry}")
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def sanitize_input(text: str) -> str:
    """Collapse runs of whitespace and cap the text at 10000 characters.

    Empty/None input yields "".  Truncated text gets a "..." suffix.
    """
    if not text:
        return ""

    collapsed = " ".join(text.split())
    if len(collapsed) <= 10000:
        return collapsed
    return collapsed[:10000] + "..."
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def extract_keywords(text: str) -> list[str]:
    """Split a comma-separated keyword string into a clean, unique list.

    Keywords are stripped, those shorter than 2 characters are dropped,
    and duplicates are removed.

    BUG FIX: deduplication used ``list(set(...))``, which returns the
    keywords in arbitrary order; ``dict.fromkeys`` preserves the original
    order deterministically.
    """
    if not text:
        return []

    # Simple extraction: split on commas, trim, drop empties.
    keywords = [kw.strip() for kw in text.split(',') if kw.strip()]

    # Filter keywords that are too short.
    keywords = [kw for kw in keywords if len(kw) >= 2]

    # Order-preserving dedupe.
    return list(dict.fromkeys(keywords))
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def format_search_progress(current: int, total: int, step: str) -> str:
    """Render a progress line like '搜索进度: 2/5 (40.0%) - step'.

    A zero (or negative) total is reported as 0.0% to avoid division by zero.
    """
    percentage = current / total * 100 if total > 0 else 0
    return f"搜索进度: {current}/{total} ({percentage:.1f}%) - {step}"
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def validate_api_keys() -> tuple[bool, str]:
    """Check that at least one LLM API key is configured.

    SECURITY FIX: the DashScope key used to be hard-coded in this function
    (a secret committed to source control); it is now read from the
    DASHSCOPE_API_KEY environment variable, like the OpenAI key.  Any key
    that was previously committed should be rotated.

    Returns:
        (True, <ok message>) when at least one key is set, else
        (False, <error message>).
    """
    import os

    openai_key = os.getenv("OPENAI_API_KEY")
    dashscope_key = os.getenv("DASHSCOPE_API_KEY")

    if not openai_key and not dashscope_key:
        return False, "未配置任何API密钥"

    return True, "API密钥配置正常"
|