wujian123 commited on
Commit
9d225e3
·
1 Parent(s): 5080c72

Add reviewer_recommendation module

Browse files
reviewer_recommendation/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Reviewer recommendation system.

Automatically recommends suitable reviewers based on paper information.
"""

__version__ = "1.0.0"
__author__ = "AI Assistant"

# Re-export the package's main data models, searchers and the LLM engine so
# callers can import everything from the package root.
from .models import PaperInfo, Reviewer, RecommendationRequest, RecommendationResponse, AppState
from .searcher import AcademicSearcher, DynamicAcademicSearcher, OpenAlexSearcher
from .engine import LLMRecommendationEngine

# Explicit public API of the package.
__all__ = [
    "PaperInfo",
    "Reviewer",
    "RecommendationRequest",
    "RecommendationResponse",
    "AppState",
    "AcademicSearcher",
    "DynamicAcademicSearcher",
    "OpenAlexSearcher",
    "LLMRecommendationEngine"
]
reviewer_recommendation/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (666 Bytes). View file
 
reviewer_recommendation/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (692 Bytes). View file
 
reviewer_recommendation/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (697 Bytes). View file
 
reviewer_recommendation/__pycache__/engine.cpython-310.pyc ADDED
Binary file (11.9 kB). View file
 
reviewer_recommendation/__pycache__/engine.cpython-312.pyc ADDED
Binary file (14.1 kB). View file
 
reviewer_recommendation/__pycache__/models.cpython-310.pyc ADDED
Binary file (3.36 kB). View file
 
reviewer_recommendation/__pycache__/models.cpython-312.pyc ADDED
Binary file (4.2 kB). View file
 
reviewer_recommendation/__pycache__/models.cpython-313.pyc ADDED
Binary file (4.25 kB). View file
 
reviewer_recommendation/__pycache__/searcher.cpython-310.pyc ADDED
Binary file (28.1 kB). View file
 
reviewer_recommendation/__pycache__/searcher.cpython-312.pyc ADDED
Binary file (15.1 kB). View file
 
reviewer_recommendation/engine copy.py ADDED
@@ -0,0 +1,609 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Recommendation engine module.

Uses an LLM to analyze candidates and recommend suitable reviewers.
"""

import json
import os
import time
from typing import List, Dict, Any, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

from .models import PaperInfo, Reviewer


# Configuration.
# SECURITY: never commit a literal API key to source control — the previously
# hard-coded DashScope key was leaked in this file and must be revoked.
# Both keys are now read from the environment; empty string means "not configured".
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
20
+ class LLMRecommendationEngine:
21
+ """完全由大模型驱动的审稿人推荐引擎"""
22
+
23
+ def __init__(self):
24
+ pass
25
+
26
+ def analyze_candidates(self, paper: PaperInfo, candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
27
+ """分析候选文献,评估适合度"""
28
+ system_msg = "你是学术领域专家,擅长评估研究人员与特定论文的匹配度"
29
+ candidates_str = json.dumps(candidates, ensure_ascii=False, indent=2)
30
+
31
+ # 构建作者和机构信息
32
+ authors_info = ""
33
+ if paper.authors:
34
+ authors_info = f"作者: {', '.join(paper.authors)}"
35
+ if paper.affiliations:
36
+ authors_info += f"\n作者机构: {', '.join(paper.affiliations)}"
37
+
38
+ prompt = f"""
39
+ 请分析以下候选文献的作者是否适合评审目标论文,并按适合度排序:
40
+
41
+ 目标论文:
42
+ 标题: {paper.title}
43
+ 摘要: {paper.abstract}
44
+ 关键词: {', '.join(paper.keywords)}
45
+ {authors_info}
46
+
47
+ 候选文献列表:
48
+ {candidates_str}
49
+
50
+ 分析要求:
51
+ 1. 为每位通讯作者评估适合度,给出0-1的相关性评分
52
+ 2. 提取作者的专业领域和研究方向
53
+ 3. 说明推荐理由(中文 重点介绍作者本人的研究方向)
54
+ 4. 排除重复作者
55
+ 5. 严格排除与目标论文作者相同或来自同一机构的人员
56
+ 6. 按适合度从高到低排序,优先考虑引用量和知名程度
57
+ 7. 必须返回至少5-10个审稿人,确保有足够的候选人数
58
+ 7. 如果相关性评分全部低于0.6 则重新再进行一次分析
59
+ 8. 估算作者的学术论文引用总量(基于机构声誉和研究领域)
60
+
61
+ 请返回JSON数组,每个元素包含:
62
+ - name: 作者姓名
63
+ - affiliation: 单位
64
+ - email: 邮箱(从数据中提取)
65
+ - reason: 推荐理由(中文 作者本人的研究方向与目标论文的适配度)
66
+ - relevance_score: 相关性评分(0-1)
67
+ - expertise_areas: 专业领域列表
68
+ - citation_count: 估算的学术论文引用总量
69
+
70
+ 确保输出是纯JSON,不要包含其他内容
71
+ """
72
+
73
+ response = self._call_llm_with_retry(prompt.strip(), system_msg, json_output=True)
74
+ if not response:
75
+ return []
76
+
77
+ # 清理和解析JSON响应
78
+ cleaned_response = self._clean_json_response(response)
79
+ if not cleaned_response:
80
+ return []
81
+
82
+ try:
83
+ result = json.loads(cleaned_response)
84
+ if isinstance(result, list):
85
+ # 并行为每个审稿人添加引用量
86
+ enhanced_result = self._add_citations_parallel(result)
87
+
88
+ # 按引用量和相关性评分综合排序
89
+ def sort_key(x):
90
+ citation_count = x.get('citation_count', '0')
91
+ if isinstance(citation_count, str) and citation_count == "未查询到":
92
+ citation_score = 0
93
+ else:
94
+ try:
95
+ citation_score = int(citation_count) / 10000 * 0.6
96
+ except (ValueError, TypeError):
97
+ citation_score = 0
98
+ relevance_score = x.get('relevance_score', 0) * 0.4
99
+ return citation_score + relevance_score
100
+
101
+ enhanced_result.sort(key=sort_key, reverse=True)
102
+
103
+ # 过滤掉相同作者和机构
104
+ filtered_result = self._filter_reviewers(enhanced_result, paper)
105
+ return filtered_result
106
+ else:
107
+ print("大模型返回的不是JSON数组")
108
+ return self._generate_fallback_reviewers(candidates, paper)
109
+ except json.JSONDecodeError:
110
+ print("无法解析大模型返回的JSON")
111
+ return self._generate_fallback_reviewers(candidates, paper)
112
+
113
+ def _clean_json_response(self, response: str) -> str:
114
+ """清理大模型返回的JSON响应"""
115
+ if not response:
116
+ return ""
117
+
118
+ # 移除markdown代码块
119
+ if "```json" in response:
120
+ start = response.find("```json") + 7
121
+ end = response.find("```", start)
122
+ if end != -1:
123
+ response = response[start:end]
124
+ elif "```" in response:
125
+ start = response.find("```") + 3
126
+ end = response.find("```", start)
127
+ if end != -1:
128
+ response = response[start:end]
129
+
130
+ # 清理空白字符
131
+ response = response.strip()
132
+
133
+ # 处理多个独立JSON对象的情况
134
+ if response.count('{') > 1:
135
+ # 尝试将多个JSON对象合并为数组
136
+ try:
137
+ # 分割多个JSON对象
138
+ objects = []
139
+ brace_count = 0
140
+ current_obj = ""
141
+
142
+ for char in response:
143
+ current_obj += char
144
+ if char == '{':
145
+ brace_count += 1
146
+ elif char == '}':
147
+ brace_count -= 1
148
+ if brace_count == 0:
149
+ # 一个完整的JSON对象
150
+ obj_str = current_obj.strip()
151
+ if obj_str.startswith('{') and obj_str.endswith('}'):
152
+ try:
153
+ json.loads(obj_str) # 验证JSON格式
154
+ objects.append(obj_str)
155
+ except:
156
+ pass
157
+ current_obj = ""
158
+
159
+ if len(objects) > 1:
160
+ # 合并为JSON数组
161
+ return "[" + ",".join(objects) + "]"
162
+ elif len(objects) == 1:
163
+ return "[" + objects[0] + "]"
164
+ except:
165
+ pass
166
+
167
+ return response
168
+
169
+ def _filter_reviewers(self, reviewers: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
170
+ """过滤掉与论文作者相同或来自同一机构的审稿人"""
171
+ filtered_reviewers = []
172
+
173
+ # 获取论文作者和机构的标准化列表
174
+ paper_authors = [author.strip().lower() for author in paper.authors if author.strip()]
175
+ paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]
176
+
177
+ print(f"论文作者: {paper.authors}")
178
+ print(f"论文机构: {paper.affiliations}")
179
+ print(f"开始过滤 {len(reviewers)} 个审稿人...")
180
+
181
+ for reviewer in reviewers:
182
+ reviewer_name = reviewer.get("name", "").strip().lower()
183
+ reviewer_affiliation = reviewer.get("affiliation", "").strip().lower()
184
+
185
+ print(f"检查审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')})")
186
+
187
+ # 检查是否与论文作者相同
188
+ is_same_author = any(self._similar_names(reviewer_name, author) for author in paper_authors)
189
+
190
+ # 检查是否来自同一机构
191
+ is_same_institution = any(self._similar_institutions(reviewer_affiliation, aff) for aff in paper_affiliations)
192
+
193
+ # 如果既不是相同作者也不是同一机构,则保留
194
+ if not is_same_author and not is_same_institution:
195
+ filtered_reviewers.append(reviewer)
196
+ print(f"保留审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')})")
197
+ else:
198
+ reason = "作者相同" if is_same_author else "机构相同"
199
+ print(f"过滤掉审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')}) - {reason}")
200
+
201
+ print(f"过滤完成,保留 {len(filtered_reviewers)} 个审稿人")
202
+ return filtered_reviewers
203
+
204
+ def _similar_names(self, name1: str, name2: str) -> bool:
205
+ """检查两个姓名是否相似(可能是同一人)"""
206
+ # 简单的相似性检查
207
+ if name1 == name2:
208
+ print(f"姓名完全匹配: '{name1}' == '{name2}'")
209
+ return True
210
+
211
+ # 检查是否包含相同的姓氏
212
+ name1_parts = name1.split()
213
+ name2_parts = name2.split()
214
+
215
+ if name1_parts and name2_parts:
216
+ # 检查姓氏是否相同
217
+ if name1_parts[0] == name2_parts[0]:
218
+ print(f"姓氏匹配: '{name1}' vs '{name2}' - 共同姓氏: {name1_parts[0]}")
219
+ return True
220
+
221
+ return False
222
+
223
+ def _similar_institutions(self, inst1: str, inst2: str) -> bool:
224
+ """检查两个机构是否相似(可能是同一机构的不同表述)"""
225
+ if inst1 == inst2:
226
+ return True
227
+
228
+ # 过滤掉通用词汇,只保留有意义的机构名称关键词
229
+ def filter_common_words(words):
230
+ common_words = {
231
+ 'university', 'college', 'institute', 'department', 'school',
232
+ 'center', 'centre', 'laboratory', 'lab', 'of', 'the', 'and',
233
+ 'at', 'in', 'for', 'medical', 'medicine', 'science', 'technology'
234
+ }
235
+ return {word for word in words if word not in common_words and len(word) > 2}
236
+
237
+ # 获取有意义的关键词
238
+ inst1_words = filter_common_words(set(inst1.lower().split()))
239
+ inst2_words = filter_common_words(set(inst2.lower().split()))
240
+
241
+ # 如果过滤后没有关键词,使用原始词汇但提高阈值
242
+ if not inst1_words or not inst2_words:
243
+ inst1_words = set(inst1.lower().split())
244
+ inst2_words = set(inst2.lower().split())
245
+ # 提高阈值到80%,减少误判
246
+ threshold = 0.8
247
+ else:
248
+ # 使用有意义关键词,阈值可以相对宽松
249
+ threshold = 0.6
250
+
251
+ # 计算共同词汇比例
252
+ common_words = inst1_words.intersection(inst2_words)
253
+ if not common_words:
254
+ return False
255
+
256
+ similarity_ratio = len(common_words) / min(len(inst1_words), len(inst2_words))
257
+
258
+ # 添加调试日志
259
+ if similarity_ratio >= threshold:
260
+ print(f"机构匹配: '{inst1}' vs '{inst2}' - 相似度: {similarity_ratio:.2f}, 共同词汇: {common_words}")
261
+
262
+ return similarity_ratio >= threshold
263
+
264
+ def _generate_fallback_reviewers(self, candidates: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
265
+ """当LLM解析失败时,生成基础推荐"""
266
+ fallback_reviewers = []
267
+
268
+ for candidate in candidates[:20]: # 取前20个候选
269
+ author = candidate.get("corresponding_author")
270
+ institution = candidate.get("corresponding_institution")
271
+
272
+ if author and author not in [r.get("name") for r in fallback_reviewers]:
273
+ # 检查是否与论文作者或机构相同
274
+ author_lower = author.strip().lower()
275
+ institution_lower = (institution or "").strip().lower()
276
+
277
+ paper_authors = [a.strip().lower() for a in paper.authors if a.strip()]
278
+ paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]
279
+
280
+ is_same_author = any(self._similar_names(author_lower, pa) for pa in paper_authors)
281
+ is_same_institution = any(self._similar_institutions(institution_lower, pa) for pa in paper_affiliations)
282
+
283
+ if not is_same_author and not is_same_institution:
284
+ # 获取真实引用量
285
+ citation_count = self._get_real_citation_count(author, institution or "未知单位")
286
+
287
+ fallback_reviewers.append({
288
+ "name": author,
289
+ "affiliation": institution or "未知单位",
290
+ "email": "未知邮箱",
291
+ "reason": "基于文献相关性自动推荐",
292
+ "relevance_score": 0.7,
293
+ "expertise_areas": ["相关研究领域"],
294
+ "citation_count": citation_count
295
+ })
296
+
297
+ return fallback_reviewers
298
+
299
+ def _call_llm_with_retry(self, prompt: str, system_msg: str, json_output: bool = False, max_retries: int = 3) -> Any:
300
+ """带重试机制的LLM调用"""
301
+ for attempt in range(max_retries):
302
+ try:
303
+ if DASHSCOPE_API_KEY:
304
+ import dashscope
305
+ dashscope.api_key = DASHSCOPE_API_KEY
306
+
307
+ # 设置更长的超时时间和更好的错误处理
308
+ try:
309
+ response = dashscope.Generation.call(
310
+ model="qwen-turbo", # 使用更稳定的模型
311
+ messages=[
312
+ {"role": "system", "content": system_msg},
313
+ {"role": "user", "content": prompt}
314
+ ],
315
+ result_format="json" if json_output else "text",
316
+ timeout=60 # 增加超时时间
317
+ )
318
+ if response.status_code == 200:
319
+ return response.output.text
320
+ else:
321
+ print(f"DashScope API错误: {response.message}")
322
+
323
+ except Exception as api_error:
324
+ print(f"DashScope API调用异常: {str(api_error)}")
325
+ if "SSL" in str(api_error) or "EOF" in str(api_error):
326
+ print("检测到SSL连接问题,尝试使用备用方案")
327
+ # 可以在这里添加备用API调用
328
+
329
+ elif OPENAI_API_KEY:
330
+ from openai import OpenAI
331
+ client = OpenAI(api_key=OPENAI_API_KEY)
332
+ response = client.chat.completions.create(
333
+ model="gpt-3.5-turbo", # 使用更稳定的模型
334
+ messages=[
335
+ {"role": "system", "content": system_msg},
336
+ {"role": "user", "content": prompt}
337
+ ],
338
+ response_format={"type": "json_object"} if json_output else None,
339
+ timeout=60
340
+ )
341
+ return response.choices[0].message.content
342
+
343
+ else:
344
+ print("未配置API密钥,使用备用方案")
345
+ return None
346
+
347
+ except Exception as e:
348
+ print(f"第{attempt + 1}次调用失败: {str(e)}")
349
+ if attempt < max_retries - 1:
350
+ print(f"等待 {2 ** attempt} 秒后重试...")
351
+ time.sleep(2 ** attempt) # 指数退避
352
+ else:
353
+ print(f"所有重试都失败了,将使用备用推荐方案")
354
+ return None
355
+
356
+ def _get_real_citation_count(self, name: str, affiliation: str) -> str:
357
+ """获取作者的真实学术论文引用总量"""
358
+ try:
359
+ # 首先尝试OpenAlex API
360
+ citation_count = self._get_citation_from_openalex(name, affiliation)
361
+ if citation_count > 0:
362
+ return str(citation_count)
363
+
364
+ # 备用方案:Semantic Scholar API
365
+ citation_count = self._get_citation_from_semantic_scholar(name, affiliation)
366
+ if citation_count > 0:
367
+ return str(citation_count)
368
+
369
+ # 如果没有找到真实数据,返回"未查询到"
370
+ print(f"未找到 {name} 的引用量数据")
371
+ return "未查询到"
372
+
373
+ except Exception as e:
374
+ print(f"获取引用量失败: {str(e)}")
375
+ return "未查询到"
376
+
377
+ def _add_citations_parallel(self, reviewers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
378
+ """并行为审稿人添加引用量"""
379
+ print(f"开始并行获取 {len(reviewers)} 个审稿人的引用量...")
380
+
381
+ enhanced_reviewers = []
382
+
383
+ # 使用线程池并行获取引用量
384
+ with ThreadPoolExecutor(max_workers=5) as executor:
385
+ # 提交所有引用量获取任务
386
+ future_to_reviewer = {}
387
+ for reviewer in reviewers:
388
+ name = reviewer.get('name', '')
389
+ affiliation = reviewer.get('affiliation', '')
390
+ future = executor.submit(self._get_real_citation_count, name, affiliation)
391
+ future_to_reviewer[future] = reviewer
392
+
393
+ # 收集结果
394
+ for future in as_completed(future_to_reviewer):
395
+ reviewer = future_to_reviewer[future]
396
+ try:
397
+ citation_count = future.result(timeout=15) # 15秒超时
398
+ reviewer['citation_count'] = citation_count
399
+ enhanced_reviewers.append(reviewer)
400
+ print(f"获取引用量完成: {reviewer.get('name')} - {citation_count}")
401
+ except Exception as e:
402
+ print(f"获取引用量失败: {reviewer.get('name')} - {str(e)}")
403
+ reviewer['citation_count'] = "未查询到"
404
+ enhanced_reviewers.append(reviewer)
405
+
406
+ print(f"并行引用量获取完成,处理了 {len(enhanced_reviewers)} 个审稿人")
407
+ return enhanced_reviewers
408
+
409
+ def _get_citation_from_openalex(self, name: str, affiliation: str) -> int:
410
+ """从OpenAlex API获取作者引用量"""
411
+ try:
412
+ import requests
413
+ import urllib.parse
414
+
415
+ # 生成多种查询变体
416
+ name_variants = self._generate_name_variants(name)
417
+
418
+ for variant in name_variants:
419
+ # 简化查询,只使用姓名
420
+ query = f'display_name:"{variant}"'
421
+ print(f"OpenAlex查询: {query}")
422
+
423
+ # OpenAlex API请求
424
+ url = "https://api.openalex.org/authors"
425
+ params = {
426
+ 'search': query,
427
+ 'per-page': 5, # 增加结果数量
428
+ 'select': 'id,display_name,cited_by_count,affiliations'
429
+ }
430
+
431
+ response = requests.get(url, params=params, timeout=15)
432
+ response.raise_for_status()
433
+
434
+ data = response.json()
435
+ if data.get('results'):
436
+ # 尝试匹配最佳结果
437
+ best_match = self._find_best_author_match(data['results'], name, affiliation)
438
+ if best_match:
439
+ cited_by_count = best_match.get('cited_by_count', 0)
440
+ print(f"OpenAlex API: {name} 引用量: {cited_by_count}")
441
+ return cited_by_count
442
+
443
+ print(f"OpenAlex API: 未找到 {variant} 的数据")
444
+
445
+ return 0
446
+
447
+ except Exception as e:
448
+ print(f"OpenAlex API调用失败: {str(e)}")
449
+ return 0
450
+
451
+ def _get_citation_from_semantic_scholar(self, name: str, affiliation: str) -> int:
452
+ """从Semantic Scholar API获取作者引用量"""
453
+ try:
454
+ import requests
455
+ import urllib.parse
456
+
457
+ # 生成多种查询变体
458
+ name_variants = self._generate_name_variants(name)
459
+
460
+ for variant in name_variants:
461
+ # 简化查询,只使用姓名
462
+ query = variant
463
+ print(f"Semantic Scholar查询: {query}")
464
+
465
+ # Semantic Scholar API请求
466
+ url = "https://api.semanticscholar.org/graph/v1/author/search"
467
+ params = {
468
+ 'query': query,
469
+ 'limit': 5, # 增加结果数量
470
+ 'fields': 'authorId,name,citationCount,affiliations'
471
+ }
472
+
473
+ headers = {
474
+ 'User-Agent': 'Academic-Reviewer-System/1.0'
475
+ }
476
+
477
+ response = requests.get(url, params=params, headers=headers, timeout=15)
478
+ response.raise_for_status()
479
+
480
+ data = response.json()
481
+ if data.get('data'):
482
+ # 尝试匹配最佳结果
483
+ best_match = self._find_best_semantic_author_match(data['data'], name, affiliation)
484
+ if best_match:
485
+ citation_count = best_match.get('citationCount', 0)
486
+ print(f"Semantic Scholar API: {name} 引用量: {citation_count}")
487
+ return citation_count
488
+
489
+ print(f"Semantic Scholar API: 未找到 {variant} 的数据")
490
+
491
+ return 0
492
+
493
+ except Exception as e:
494
+ print(f"Semantic Scholar API调用失败: {str(e)}")
495
+ return 0
496
+
497
+ def _generate_name_variants(self, name: str) -> List[str]:
498
+ """生成姓名的多种变体"""
499
+ variants = [name] # 原始姓名
500
+
501
+ # 如果包含中间名,尝试不同的组合
502
+ name_parts = name.split()
503
+ if len(name_parts) >= 2:
504
+ # 只使用姓和名
505
+ variants.append(f"{name_parts[0]} {name_parts[-1]}")
506
+
507
+ # 如果有多于2个部分,尝试不同的组合
508
+ if len(name_parts) == 3:
509
+ variants.append(f"{name_parts[0]} {name_parts[1]}")
510
+ variants.append(f"{name_parts[1]} {name_parts[2]}")
511
+ elif len(name_parts) > 3:
512
+ # 对于更复杂的姓名,尝试简化
513
+ variants.append(f"{name_parts[0]} {name_parts[1]}")
514
+ variants.append(f"{name_parts[0]} {name_parts[-1]}")
515
+
516
+ # 去重并保持顺序
517
+ seen = set()
518
+ unique_variants = []
519
+ for variant in variants:
520
+ if variant not in seen:
521
+ seen.add(variant)
522
+ unique_variants.append(variant)
523
+
524
+ return unique_variants
525
+
526
+ def _find_best_author_match(self, authors: List[Dict], target_name: str, target_affiliation: str) -> Optional[Dict]:
527
+ """在OpenAlex结果中找到最佳匹配的作者"""
528
+ if not authors:
529
+ return None
530
+
531
+ # 如果只有一个结果,直接返回
532
+ if len(authors) == 1:
533
+ return authors[0]
534
+
535
+ # 计算每个作者的匹配分数
536
+ best_match = None
537
+ best_score = 0
538
+
539
+ for author in authors:
540
+ score = 0
541
+ author_name = author.get('display_name', '').lower()
542
+ target_name_lower = target_name.lower()
543
+
544
+ # 姓名匹配分数
545
+ if target_name_lower in author_name or author_name in target_name_lower:
546
+ score += 10
547
+
548
+ # 检查机构匹配
549
+ affiliations = author.get('affiliations', [])
550
+ if affiliations and target_affiliation and target_affiliation != "未知单位":
551
+ for aff in affiliations:
552
+ aff_name = aff.get('display_name', '').lower()
553
+ if target_affiliation.lower() in aff_name:
554
+ score += 5
555
+ break
556
+
557
+ # 引用量作为权重
558
+ citation_count = author.get('cited_by_count', 0)
559
+ if citation_count > 0:
560
+ score += 1
561
+
562
+ if score > best_score:
563
+ best_score = score
564
+ best_match = author
565
+
566
+ return best_match if best_score > 0 else authors[0]
567
+
568
+ def _find_best_semantic_author_match(self, authors: List[Dict], target_name: str, target_affiliation: str) -> Optional[Dict]:
569
+ """在Semantic Scholar结果中找到最佳匹配的作者"""
570
+ if not authors:
571
+ return None
572
+
573
+ # 如果只有一个结果,直接返回
574
+ if len(authors) == 1:
575
+ return authors[0]
576
+
577
+ # 计算每个作者的匹配分数
578
+ best_match = None
579
+ best_score = 0
580
+
581
+ for author in authors:
582
+ score = 0
583
+ author_name = author.get('name', '').lower()
584
+ target_name_lower = target_name.lower()
585
+
586
+ # 姓名匹配分数
587
+ if target_name_lower in author_name or author_name in target_name_lower:
588
+ score += 10
589
+
590
+ # 检查机构匹配
591
+ affiliations = author.get('affiliations', [])
592
+ if affiliations and target_affiliation and target_affiliation != "未知单位":
593
+ for aff in affiliations:
594
+ aff_name = aff.get('name', '').lower()
595
+ if target_affiliation.lower() in aff_name:
596
+ score += 5
597
+ break
598
+
599
+ # 引用量作为权重
600
+ citation_count = author.get('citationCount', 0)
601
+ if citation_count > 0:
602
+ score += 1
603
+
604
+ if score > best_score:
605
+ best_score = score
606
+ best_match = author
607
+
608
+ return best_match if best_score > 0 else authors[0]
609
+
reviewer_recommendation/engine.py ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Recommendation engine module.

Uses an LLM to analyze candidates and recommend suitable reviewers.
"""

import json
import os
import time
from typing import List, Dict, Any, Optional

from .models import PaperInfo, Reviewer


# Configuration.
# SECURITY: never commit a literal API key to source control — the previously
# hard-coded DashScope key was leaked in this file and must be revoked.
# Both keys are now read from the environment; empty string means "not configured".
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
19
class LLMRecommendationEngine:
    """Reviewer recommendation engine driven entirely by an LLM.

    Pipeline: pre-filter candidates for conflicts of interest and duplicates,
    ask the LLM to rate each remaining candidate's relevance to the paper,
    and return the reviewers sorted by relevance score.
    """

    def __init__(self):
        # Stateless engine; API keys are module-level configuration.
        pass

    def analyze_candidates(self, paper: PaperInfo, candidates: List[Dict[str, Any]], num_reviewers: int = 8) -> List[Dict[str, Any]]:
        """Analyze candidate reviewers with the unified recommendation strategy."""
        print(f"候选审稿人: {len(candidates)} 人")

        # Delegate to the single unified prompt-based analysis.
        return self._analyze_candidates_unified(paper, candidates, num_reviewers)

    def _analyze_candidates_unified(self, paper: PaperInfo, candidates: List[Dict[str, Any]], num_reviewers: int = 8) -> List[Dict[str, Any]]:
        """Filter candidates, prompt the LLM once, and return ranked reviewers."""
        if not candidates:
            return []

        # The requested reviewer count comes from the frontend.
        print(f"要求返回 {num_reviewers} 个审稿人")

        print(f"开始过滤全部 {len(candidates)} 个候选审稿人...")
        filtered_candidates = self._filter_all_candidates(candidates, paper)
        print(f"过滤完成,保留 {len(filtered_candidates)} 个候选审稿人")

        if not filtered_candidates:
            print("过滤后没有候选审稿人,返回空列表")
            return []

        system_msg = "你是学术领域专家,擅长评估研究人员与特定论文的相关性"

        # Keep only the fields the LLM needs, to shrink the prompt.
        simplified_candidates = [
            {
                "author": candidate.get("corresponding_author", ""),
                "institution": candidate.get("corresponding_institution", ""),
                "title": candidate.get("title", "")
            }
            for candidate in filtered_candidates
        ]

        # Serialize the filtered candidate list for the prompt.
        candidates_str = json.dumps(simplified_candidates, ensure_ascii=False, indent=2)
        print(f"过滤后的候选审稿人列表: {candidates_str}")

        prompt = f"""
你是学术领域专家,擅长评估研究人员与特定论文的相关性

请分析以下候选审稿人是否适合评审目标论文,评估相关性:

目标论文信息:
标题: {paper.title}
摘要: {paper.abstract}
作者: {', '.join(paper.authors)}
机构: {', '.join(paper.affiliations)}


候选审稿人列表:
{candidates_str}

分析要求:
1. 为每位审稿人评估与目标论文的相关性,给出0-1的相关性评分
2. 提取审稿人的专业领域和研究方向
3. 按relevance_score从高到低排序(desc)
4. 排除与目标论文作者为合作关系的审稿人
5. 必须返回至少{num_reviewers}个审稿人

请返回JSON数组,每个元素包含:
- name: 作者姓名
- affiliation: 单位
- email: 邮箱(根据作者姓名和单位邮箱后缀构建)
- reason: 推荐理由(中文 只介绍作者本人的研究方向与目标论文的适配度)
- relevance_score: 最终评分(0-1)
- expertise_areas: 专业领域列表

确保输出是纯JSON,不要包含其他内容
"""

        response = self._call_llm_with_retry(prompt.strip(), system_msg, json_output=True)
        if not response:
            return []

        # Clean and parse the JSON response.
        cleaned_response = self._clean_json_response(response)
        if not cleaned_response:
            return []

        try:
            result = json.loads(cleaned_response)
            if isinstance(result, list):
                # Sort by the LLM score, coercing to float defensively.
                def get_score(x):
                    score = x.get('relevance_score', 0)
                    try:
                        return float(score) if score is not None else 0.0
                    except (ValueError, TypeError):
                        return 0.0

                result.sort(key=get_score, reverse=True)

                # Normalize None affiliation/email so downstream code is safe.
                for reviewer in result:
                    if reviewer.get("affiliation") is None:
                        reviewer["affiliation"] = "未知单位"
                    if reviewer.get("email") is None:
                        reviewer["email"] = "unknown@example.com"

                # Candidates were already conflict-filtered before prompting.
                print(f"统一分析完成,推荐 {len(result)} 个审稿人")
                return result
            else:
                print("大模型返回的不是JSON数组")
                return []
        except json.JSONDecodeError:
            print("无法解析大模型返回的JSON")
            return []

    def _clean_json_response(self, response: str) -> str:
        """Normalize an LLM reply into parseable JSON text.

        Strips markdown code fences and, when the reply contains several
        stand-alone JSON objects, merges them into a single JSON array.
        """
        if not response:
            return ""

        # Remove markdown code fences (``` / ```json).
        if "```json" in response:
            start = response.find("```json") + 7
            end = response.find("```", start)
            if end != -1:
                response = response[start:end]
        elif "```" in response:
            start = response.find("```") + 3
            end = response.find("```", start)
            if end != -1:
                response = response[start:end]

        response = response.strip()

        # Handle replies consisting of multiple independent JSON objects.
        if response.count('{') > 1:
            try:
                # Split the text into balanced-brace object candidates.
                objects = []
                brace_count = 0
                current_obj = ""

                for char in response:
                    current_obj += char
                    if char == '{':
                        brace_count += 1
                    elif char == '}':
                        brace_count -= 1
                        if brace_count == 0:
                            # A complete top-level object; keep it only if valid JSON.
                            obj_str = current_obj.strip()
                            if obj_str.startswith('{') and obj_str.endswith('}'):
                                try:
                                    json.loads(obj_str)  # validate JSON
                                    objects.append(obj_str)
                                except json.JSONDecodeError:
                                    # FIX: was a bare except — only JSON errors are expected here.
                                    pass
                            current_obj = ""

                if len(objects) > 1:
                    # Merge into a JSON array.
                    return "[" + ",".join(objects) + "]"
                elif len(objects) == 1:
                    return "[" + objects[0] + "]"
            except Exception:
                # FIX: was a bare except — best-effort merge; fall back to raw text.
                pass

        return response

    def _filter_all_candidates(self, candidates: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
        """Remove conflicted (same author/institution) and duplicate candidates."""
        filtered_candidates = []
        seen_reviewers = set()  # de-duplication keys: "name_affiliation"

        # Normalized (lowercased, stripped) author/affiliation lists.
        paper_authors = [author.strip().lower() for author in paper.authors if author.strip()]
        paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]

        print(f"论文作者: {paper.authors}")
        print(f"论文机构: {paper.affiliations}")

        for candidate in candidates:
            # Only the corresponding author/institution fields matter here.
            author = candidate.get('corresponding_author', '')
            institution = candidate.get('corresponding_institution', '')

            if not author:
                continue

            reviewer_name = author.strip().lower()
            reviewer_affiliation = (institution or "").strip().lower()

            # Conflict check 1: same person as a paper author?
            # (loop variable renamed so it does not shadow the candidate's `author`)
            is_same_author = any(self._similar_names(reviewer_name, pa) for pa in paper_authors)

            # Conflict check 2: same institution as the paper?
            is_same_institution = any(self._similar_institutions(reviewer_affiliation, aff) for aff in paper_affiliations)

            # Conflict-free candidates still go through de-duplication.
            if not is_same_author and not is_same_institution:
                reviewer_key = f"{reviewer_name}_{reviewer_affiliation}"

                if reviewer_key not in seen_reviewers:
                    seen_reviewers.add(reviewer_key)
                    filtered_candidates.append(candidate)
                else:
                    print(f"跳过重复候选审稿人: {author} ({institution})")
            else:
                reason = "作者相同" if is_same_author else "机构相同"
                print(f"过滤掉候选审稿人: {author} ({institution}) - {reason}")

        print(f"去重完成,最终保留 {len(filtered_candidates)} 个候选审稿人")
        return filtered_candidates

    def _similar_names(self, name1: str, name2: str) -> bool:
        """Heuristic: do two (lowercased) names plausibly refer to the same person?

        NOTE(review): matching on the first token alone is aggressive — it
        treats any two people sharing a first token as the same person.
        """
        if name1 == name2:
            print(f"姓名完全匹配: '{name1}' == '{name2}'")
            return True

        # Compare the leading name token (treated as the family name).
        name1_parts = name1.split()
        name2_parts = name2.split()

        if name1_parts and name2_parts:
            if name1_parts[0] == name2_parts[0]:
                print(f"姓氏匹配: '{name1}' vs '{name2}' - 共同姓氏: {name1_parts[0]}")
                return True

        return False

    def _similar_institutions(self, inst1: str, inst2: str) -> bool:
        """Heuristic: are two institution strings likely the same institution?

        Compares meaningful keywords (generic words like "university" are
        removed); the similarity threshold is stricter (0.8) when only
        generic words remain.
        """
        if inst1 == inst2:
            return True

        # Keep only meaningful institution-name keywords.
        def filter_common_words(words):
            common_words = {
                'university', 'college', 'institute', 'department', 'school',
                'center', 'centre', 'laboratory', 'lab', 'of', 'the', 'and',
                'at', 'in', 'for', 'medical', 'medicine', 'science', 'technology'
            }
            return {word for word in words if word not in common_words and len(word) > 2}

        inst1_words = filter_common_words(set(inst1.lower().split()))
        inst2_words = filter_common_words(set(inst2.lower().split()))

        # If filtering removed everything, fall back to raw words with a
        # stricter threshold to avoid false positives.
        if not inst1_words or not inst2_words:
            inst1_words = set(inst1.lower().split())
            inst2_words = set(inst2.lower().split())
            threshold = 0.8
        else:
            threshold = 0.6

        # Ratio of shared words relative to the smaller keyword set.
        common_words = inst1_words.intersection(inst2_words)
        if not common_words:
            return False

        similarity_ratio = len(common_words) / min(len(inst1_words), len(inst2_words))

        if similarity_ratio >= threshold:
            print(f"机构匹配: '{inst1}' vs '{inst2}' - 相似度: {similarity_ratio:.2f}, 共同词汇: {common_words}")

        return similarity_ratio >= threshold

    def _generate_fallback_reviewers(self, candidates: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
        """Heuristic recommendation used when the LLM output cannot be parsed.

        Takes corresponding authors from the first 20 candidates and applies
        the same conflict-of-interest filtering.
        """
        fallback_reviewers = []

        for candidate in candidates[:20]:  # cap at the first 20 candidates
            author = candidate.get("corresponding_author")
            institution = candidate.get("corresponding_institution")

            if author and author not in [r.get("name") for r in fallback_reviewers]:
                # Conflict-of-interest check against paper authors/affiliations.
                author_lower = (author or "").strip().lower()
                institution_lower = (institution or "").strip().lower()

                paper_authors = [a.strip().lower() for a in paper.authors if a.strip()]
                paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]

                is_same_author = any(self._similar_names(author_lower, pa) for pa in paper_authors)
                is_same_institution = any(self._similar_institutions(institution_lower, pa) for pa in paper_affiliations)

                if not is_same_author and not is_same_institution:
                    fallback_reviewers.append({
                        "name": author,
                        "affiliation": institution or "未知单位",
                        "email": "未知邮箱",
                        "reason": "基于文献相关性自动推荐",
                        "relevance_score": 0.7,
                        "expertise_areas": ["相关研究领域"]
                    })

        return fallback_reviewers

    def _call_llm_with_retry(self, prompt: str, system_msg: str, json_output: bool = False, max_retries: int = 3) -> Any:
        """Call the configured LLM (DashScope preferred, OpenAI fallback).

        Retries up to ``max_retries`` times with exponential backoff; returns
        the raw text response, or None when no provider is configured or all
        attempts fail.
        """
        for attempt in range(max_retries):
            try:
                if DASHSCOPE_API_KEY:
                    import dashscope
                    dashscope.api_key = DASHSCOPE_API_KEY

                    # Longer timeout plus provider-specific error handling.
                    try:
                        response = dashscope.Generation.call(
                            model="qwen-turbo-latest",  # stable model choice
                            messages=[
                                {"role": "system", "content": system_msg},
                                {"role": "user", "content": prompt}
                            ],
                            result_format="json" if json_output else "text",
                            timeout=60  # generous timeout for long prompts
                        )
                        if response.status_code == 200:
                            return response.output.text
                        else:
                            print(f"DashScope API错误: {response.message}")

                    except Exception as api_error:
                        print(f"DashScope API调用异常: {str(api_error)}")
                        if "SSL" in str(api_error) or "EOF" in str(api_error):
                            print("检测到SSL连接问题,尝试使用备用方案")
                            # A secondary provider call could be added here.

                elif OPENAI_API_KEY:
                    from openai import OpenAI
                    client = OpenAI(api_key=OPENAI_API_KEY)
                    response = client.chat.completions.create(
                        model="gpt-3.5-turbo",  # stable model choice
                        messages=[
                            {"role": "system", "content": system_msg},
                            {"role": "user", "content": prompt}
                        ],
                        response_format={"type": "json_object"} if json_output else None,
                        timeout=60
                    )
                    return response.choices[0].message.content

                else:
                    print("未配置API密钥,使用备用方案")
                    return None

            except Exception as e:
                print(f"第{attempt + 1}次调用失败: {str(e)}")
                if attempt < max_retries - 1:
                    print(f"等待 {2 ** attempt} 秒后重试...")
                    time.sleep(2 ** attempt)  # exponential backoff
                else:
                    print(f"所有重试都失败了,将使用备用推荐方案")
                    return None
reviewer_recommendation/enginecomplex.py ADDED
@@ -0,0 +1,609 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 推荐引擎模块
3
+ 使用LLM分析候选者并推荐合适的审稿人
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import time
9
+ from typing import List, Dict, Any, Optional
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+
12
+ from .models import PaperInfo, Reviewer
13
+
14
+
15
# Configuration: API keys are read from the environment so that no secret is
# committed to source control (the original hard-coded a live DashScope key).
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
18
+
19
+
20
+ class LLMRecommendationEngine:
21
+ """完全由大模型驱动的审稿人推荐引擎"""
22
+
23
    def __init__(self):
        # Stateless engine: all configuration comes from the module-level
        # API-key constants; nothing to initialise per instance.
        pass
25
+
26
+ def analyze_candidates(self, paper: PaperInfo, candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
27
+ """分析候选文献,评估适合度"""
28
+ system_msg = "你是学术领域专家,擅长评估研究人员与特定论文的匹配度"
29
+ candidates_str = json.dumps(candidates, ensure_ascii=False, indent=2)
30
+
31
+ # 构建作者和机构信息
32
+ authors_info = ""
33
+ if paper.authors:
34
+ authors_info = f"作者: {', '.join(paper.authors)}"
35
+ if paper.affiliations:
36
+ authors_info += f"\n作者机构: {', '.join(paper.affiliations)}"
37
+
38
+ prompt = f"""
39
+ 请分析以下候选文献的作者是否适合评审目标论文,并按适合度排序:
40
+
41
+ 目标论文:
42
+ 标题: {paper.title}
43
+ 摘要: {paper.abstract}
44
+ 关键词: {', '.join(paper.keywords)}
45
+ {authors_info}
46
+
47
+ 候选文献列表:
48
+ {candidates_str}
49
+
50
+ 分析要求:
51
+ 1. 为每位通讯作者评估适合度,给出0-1的相关性评分
52
+ 2. 提取作者的专业领域和研究方向
53
+ 3. 说明推荐理由(中文 重点介绍作者本人的研究方向)
54
+ 4. 排除重复作者
55
+ 5. 严格排除与目标论文作者相同或来自同一机构的人员
56
+ 6. 按适合度从高到低排序,优先考虑引用量和知名程度
57
+ 7. 必须返回至少5-10个审稿人,确保有足够的候选人数
58
+ 7. 如果相关性评分全部低于0.6 则重新再进行一次分析
59
+ 8. 估算作者的学术论文引用总量(基于机构声誉和研究领域)
60
+
61
+ 请返回JSON数组,每个元素包含:
62
+ - name: 作者姓名
63
+ - affiliation: 单位
64
+ - email: 邮箱(从数据中提取)
65
+ - reason: 推荐理由(中文 作者本人的研究方向与目标论文的适配度)
66
+ - relevance_score: 相关性评分(0-1)
67
+ - expertise_areas: 专业领域列表
68
+ - citation_count: 估算的学术论文引用总量
69
+
70
+ 确保输出是纯JSON,不要包含其他内容
71
+ """
72
+
73
+ response = self._call_llm_with_retry(prompt.strip(), system_msg, json_output=True)
74
+ if not response:
75
+ return []
76
+
77
+ # 清理和解析JSON响应
78
+ cleaned_response = self._clean_json_response(response)
79
+ if not cleaned_response:
80
+ return []
81
+
82
+ try:
83
+ result = json.loads(cleaned_response)
84
+ if isinstance(result, list):
85
+ # 并行为每个审稿人添加引用量
86
+ enhanced_result = self._add_citations_parallel(result)
87
+
88
+ # 按引用量和相关性评分综合排序
89
+ def sort_key(x):
90
+ citation_count = x.get('citation_count', '0')
91
+ if isinstance(citation_count, str) and citation_count == "未查询到":
92
+ citation_score = 0
93
+ else:
94
+ try:
95
+ citation_score = int(citation_count) / 10000 * 0.6
96
+ except (ValueError, TypeError):
97
+ citation_score = 0
98
+ relevance_score = x.get('relevance_score', 0) * 0.4
99
+ return citation_score + relevance_score
100
+
101
+ enhanced_result.sort(key=sort_key, reverse=True)
102
+
103
+ # 过滤掉相同作者和机构
104
+ filtered_result = self._filter_reviewers(enhanced_result, paper)
105
+ return filtered_result
106
+ else:
107
+ print("大模型返回的不是JSON数组")
108
+ return self._generate_fallback_reviewers(candidates, paper)
109
+ except json.JSONDecodeError:
110
+ print("无法解析大模型返回的JSON")
111
+ return self._generate_fallback_reviewers(candidates, paper)
112
+
113
+ def _clean_json_response(self, response: str) -> str:
114
+ """清理大模型返回的JSON响应"""
115
+ if not response:
116
+ return ""
117
+
118
+ # 移除markdown代码块
119
+ if "```json" in response:
120
+ start = response.find("```json") + 7
121
+ end = response.find("```", start)
122
+ if end != -1:
123
+ response = response[start:end]
124
+ elif "```" in response:
125
+ start = response.find("```") + 3
126
+ end = response.find("```", start)
127
+ if end != -1:
128
+ response = response[start:end]
129
+
130
+ # 清理空白字符
131
+ response = response.strip()
132
+
133
+ # 处理多个独立JSON对象的情况
134
+ if response.count('{') > 1:
135
+ # 尝试将多个JSON对象合并为数组
136
+ try:
137
+ # 分割多个JSON对象
138
+ objects = []
139
+ brace_count = 0
140
+ current_obj = ""
141
+
142
+ for char in response:
143
+ current_obj += char
144
+ if char == '{':
145
+ brace_count += 1
146
+ elif char == '}':
147
+ brace_count -= 1
148
+ if brace_count == 0:
149
+ # 一个完整的JSON对象
150
+ obj_str = current_obj.strip()
151
+ if obj_str.startswith('{') and obj_str.endswith('}'):
152
+ try:
153
+ json.loads(obj_str) # 验证JSON格式
154
+ objects.append(obj_str)
155
+ except:
156
+ pass
157
+ current_obj = ""
158
+
159
+ if len(objects) > 1:
160
+ # 合并为JSON数组
161
+ return "[" + ",".join(objects) + "]"
162
+ elif len(objects) == 1:
163
+ return "[" + objects[0] + "]"
164
+ except:
165
+ pass
166
+
167
+ return response
168
+
169
    def _filter_reviewers(self, reviewers: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
        """Drop candidate reviewers who conflict with the paper's authors.

        A candidate is removed when their name matches any paper author
        (per _similar_names) or their affiliation matches any author
        institution (per _similar_institutions).  Each decision is printed
        for traceability.  Returns survivors in their original order.
        """
        filtered_reviewers = []

        # Normalised (stripped, lower-cased) author/institution lists.
        paper_authors = [author.strip().lower() for author in paper.authors if author.strip()]
        paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]

        print(f"论文作者: {paper.authors}")
        print(f"论文机构: {paper.affiliations}")
        print(f"开始过滤 {len(reviewers)} 个审稿人...")

        for reviewer in reviewers:
            reviewer_name = reviewer.get("name", "").strip().lower()
            reviewer_affiliation = reviewer.get("affiliation", "").strip().lower()

            print(f"检查审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')})")

            # Conflict of interest: same person as one of the paper authors?
            is_same_author = any(self._similar_names(reviewer_name, author) for author in paper_authors)

            # Conflict of interest: same institution as any author affiliation?
            is_same_institution = any(self._similar_institutions(reviewer_affiliation, aff) for aff in paper_affiliations)

            # Keep only candidates with neither conflict.
            if not is_same_author and not is_same_institution:
                filtered_reviewers.append(reviewer)
                print(f"保留审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')})")
            else:
                reason = "作者相同" if is_same_author else "机构相同"
                print(f"过滤掉审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')}) - {reason}")

        print(f"过滤完成,保留 {len(filtered_reviewers)} 个审稿人")
        return filtered_reviewers
203
+
204
+ def _similar_names(self, name1: str, name2: str) -> bool:
205
+ """检查两个姓名是否相似(可能是同一人)"""
206
+ # 简单的相似性检查
207
+ if name1 == name2:
208
+ print(f"姓名完全匹配: '{name1}' == '{name2}'")
209
+ return True
210
+
211
+ # 检查是否包含相同的姓氏
212
+ name1_parts = name1.split()
213
+ name2_parts = name2.split()
214
+
215
+ if name1_parts and name2_parts:
216
+ # 检查姓氏是否相同
217
+ if name1_parts[0] == name2_parts[0]:
218
+ print(f"姓氏匹配: '{name1}' vs '{name2}' - 共同姓氏: {name1_parts[0]}")
219
+ return True
220
+
221
+ return False
222
+
223
+ def _similar_institutions(self, inst1: str, inst2: str) -> bool:
224
+ """检查两个机构是否相似(可能是同一机构的不同表述)"""
225
+ if inst1 == inst2:
226
+ return True
227
+
228
+ # 过滤掉通用词汇,只保留有意义的机构名称关键词
229
+ def filter_common_words(words):
230
+ common_words = {
231
+ 'university', 'college', 'institute', 'department', 'school',
232
+ 'center', 'centre', 'laboratory', 'lab', 'of', 'the', 'and',
233
+ 'at', 'in', 'for', 'medical', 'medicine', 'science', 'technology'
234
+ }
235
+ return {word for word in words if word not in common_words and len(word) > 2}
236
+
237
+ # 获取有意义的关键词
238
+ inst1_words = filter_common_words(set(inst1.lower().split()))
239
+ inst2_words = filter_common_words(set(inst2.lower().split()))
240
+
241
+ # 如果过滤后没有关键词,使用原始词汇但提高阈值
242
+ if not inst1_words or not inst2_words:
243
+ inst1_words = set(inst1.lower().split())
244
+ inst2_words = set(inst2.lower().split())
245
+ # 提高阈值到80%,减少误判
246
+ threshold = 0.8
247
+ else:
248
+ # 使用有意义关键词,阈值可以相对宽松
249
+ threshold = 0.6
250
+
251
+ # 计算共同词汇比例
252
+ common_words = inst1_words.intersection(inst2_words)
253
+ if not common_words:
254
+ return False
255
+
256
+ similarity_ratio = len(common_words) / min(len(inst1_words), len(inst2_words))
257
+
258
+ # 添加调试日志
259
+ if similarity_ratio >= threshold:
260
+ print(f"机构匹配: '{inst1}' vs '{inst2}' - 相似度: {similarity_ratio:.2f}, 共同词汇: {common_words}")
261
+
262
+ return similarity_ratio >= threshold
263
+
264
    def _generate_fallback_reviewers(self, candidates: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
        """Build a basic reviewer list when the LLM output cannot be parsed.

        Walks (at most) the first 20 candidate papers, takes each unique
        corresponding author, applies the same conflict-of-interest checks
        as _filter_reviewers, attaches a real citation count, and fills the
        remaining fields with placeholder values (fixed 0.7 relevance).
        """
        fallback_reviewers = []

        for candidate in candidates[:20]:  # cap work: first 20 candidates only
            author = candidate.get("corresponding_author")
            institution = candidate.get("corresponding_institution")

            # Skip missing authors and duplicates already collected.
            if author and author not in [r.get("name") for r in fallback_reviewers]:
                # Normalise for comparison against the paper's author list.
                author_lower = author.strip().lower()
                institution_lower = (institution or "").strip().lower()

                paper_authors = [a.strip().lower() for a in paper.authors if a.strip()]
                paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]

                is_same_author = any(self._similar_names(author_lower, pa) for pa in paper_authors)
                is_same_institution = any(self._similar_institutions(institution_lower, pa) for pa in paper_affiliations)

                if not is_same_author and not is_same_institution:
                    # Real citation lookup (OpenAlex, then Semantic Scholar).
                    citation_count = self._get_real_citation_count(author, institution or "未知单位")

                    fallback_reviewers.append({
                        "name": author,
                        "affiliation": institution or "未知单位",
                        "email": "未知邮箱",
                        "reason": "基于文献相关性自动推荐",
                        "relevance_score": 0.7,  # neutral default score
                        "expertise_areas": ["相关研究领域"],
                        "citation_count": citation_count
                    })

        return fallback_reviewers
298
+
299
    def _call_llm_with_retry(self, prompt: str, system_msg: str, json_output: bool = False, max_retries: int = 3) -> Any:
        """Call an LLM backend with retries and exponential backoff.

        Prefers DashScope when DASHSCOPE_API_KEY is set, otherwise OpenAI
        when OPENAI_API_KEY is set; returns the raw response text, or None
        when no key is configured or every attempt fails.

        NOTE(review): a non-200 DashScope status (or the inner except) makes
        the loop retry immediately — the backoff sleep only runs for
        exceptions reaching the outer handler — and falling off the loop
        returns None implicitly.
        """
        for attempt in range(max_retries):
            try:
                if DASHSCOPE_API_KEY:
                    import dashscope
                    dashscope.api_key = DASHSCOPE_API_KEY

                    # Inner try isolates DashScope errors so an SSL hint can be
                    # printed without aborting the whole retry loop.
                    try:
                        response = dashscope.Generation.call(
                            model="qwen-turbo",  # chosen as the more stable variant
                            messages=[
                                {"role": "system", "content": system_msg},
                                {"role": "user", "content": prompt}
                            ],
                            result_format="json" if json_output else "text",
                            timeout=60  # generous timeout for long prompts
                        )
                        if response.status_code == 200:
                            return response.output.text
                        else:
                            print(f"DashScope API错误: {response.message}")

                    except Exception as api_error:
                        print(f"DashScope API调用异常: {str(api_error)}")
                        if "SSL" in str(api_error) or "EOF" in str(api_error):
                            print("检测到SSL连接问题,尝试使用备用方案")
                            # Placeholder: a secondary API call could go here.

                elif OPENAI_API_KEY:
                    from openai import OpenAI
                    client = OpenAI(api_key=OPENAI_API_KEY)
                    response = client.chat.completions.create(
                        model="gpt-3.5-turbo",  # stable, widely available model
                        messages=[
                            {"role": "system", "content": system_msg},
                            {"role": "user", "content": prompt}
                        ],
                        response_format={"type": "json_object"} if json_output else None,
                        timeout=60
                    )
                    return response.choices[0].message.content

                else:
                    # No credentials at all: caller falls back to heuristics.
                    print("未配置API密钥,使用备用方案")
                    return None

            except Exception as e:
                print(f"第{attempt + 1}次调用失败: {str(e)}")
                if attempt < max_retries - 1:
                    print(f"等待 {2 ** attempt} 秒后重试...")
                    time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s...
                else:
                    print(f"所有重试都失败了,将使用备用推荐方案")
                    return None
355
+
356
+ def _get_real_citation_count(self, name: str, affiliation: str) -> str:
357
+ """获取作者的真实学术论文引用总量"""
358
+ try:
359
+ # 首先尝试OpenAlex API
360
+ citation_count = self._get_citation_from_openalex(name, affiliation)
361
+ if citation_count > 0:
362
+ return str(citation_count)
363
+
364
+ # 备用方案:Semantic Scholar API
365
+ citation_count = self._get_citation_from_semantic_scholar(name, affiliation)
366
+ if citation_count > 0:
367
+ return str(citation_count)
368
+
369
+ # 如果没有找到真实数据,返回"未查询到"
370
+ print(f"未找到 {name} 的引用量数据")
371
+ return "未查询到"
372
+
373
+ except Exception as e:
374
+ print(f"获取引用量失败: {str(e)}")
375
+ return "未查询到"
376
+
377
    def _add_citations_parallel(self, reviewers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Fetch real citation counts for all reviewers concurrently.

        Each reviewer dict gains a 'citation_count' key ("未查询到" on
        failure or timeout).  Note: results are appended in completion
        order, so the returned list's ordering may differ from the input —
        the caller re-sorts afterwards.
        """
        print(f"开始并行获取 {len(reviewers)} 个审稿人的引用量...")

        enhanced_reviewers = []

        # Up to five concurrent lookups against the citation APIs.
        with ThreadPoolExecutor(max_workers=5) as executor:
            # Submit one citation lookup per reviewer.
            future_to_reviewer = {}
            for reviewer in reviewers:
                name = reviewer.get('name', '')
                affiliation = reviewer.get('affiliation', '')
                future = executor.submit(self._get_real_citation_count, name, affiliation)
                future_to_reviewer[future] = reviewer

            # Collect results as they finish; failures get the placeholder.
            for future in as_completed(future_to_reviewer):
                reviewer = future_to_reviewer[future]
                try:
                    citation_count = future.result(timeout=15)  # per-lookup cap
                    reviewer['citation_count'] = citation_count
                    enhanced_reviewers.append(reviewer)
                    print(f"获取引用量完成: {reviewer.get('name')} - {citation_count}")
                except Exception as e:
                    print(f"获取引用量失败: {reviewer.get('name')} - {str(e)}")
                    reviewer['citation_count'] = "未查询到"
                    enhanced_reviewers.append(reviewer)

        print(f"并行引用量获取完成,处理了 {len(enhanced_reviewers)} 个审稿人")
        return enhanced_reviewers
408
+
409
+ def _get_citation_from_openalex(self, name: str, affiliation: str) -> int:
410
+ """从OpenAlex API获取作者引用量"""
411
+ try:
412
+ import requests
413
+ import urllib.parse
414
+
415
+ # 生成多种查询变体
416
+ name_variants = self._generate_name_variants(name)
417
+
418
+ for variant in name_variants:
419
+ # 简化查询,只使用姓名
420
+ query = f'display_name:"{variant}"'
421
+ print(f"OpenAlex查询: {query}")
422
+
423
+ # OpenAlex API请求
424
+ url = "https://api.openalex.org/authors"
425
+ params = {
426
+ 'search': query,
427
+ 'per-page': 5, # 增加结果数量
428
+ 'select': 'id,display_name,cited_by_count,affiliations'
429
+ }
430
+
431
+ response = requests.get(url, params=params, timeout=15)
432
+ response.raise_for_status()
433
+
434
+ data = response.json()
435
+ if data.get('results'):
436
+ # 尝试匹配最佳结果
437
+ best_match = self._find_best_author_match(data['results'], name, affiliation)
438
+ if best_match:
439
+ cited_by_count = best_match.get('cited_by_count', 0)
440
+ print(f"OpenAlex API: {name} 引用量: {cited_by_count}")
441
+ return cited_by_count
442
+
443
+ print(f"OpenAlex API: 未找到 {variant} 的数据")
444
+
445
+ return 0
446
+
447
+ except Exception as e:
448
+ print(f"OpenAlex API调用失败: {str(e)}")
449
+ return 0
450
+
451
    def _get_citation_from_semantic_scholar(self, name: str, affiliation: str) -> int:
        """Look up a total citation count via the Semantic Scholar Graph API.

        Tries each generated name variant until one returns author data,
        then picks the best-matching record.  Returns 0 when no variant
        matches or the request fails.
        """
        try:
            import requests
            import urllib.parse  # NOTE(review): imported but unused in this method

            # Several spellings of the name improve the hit rate.
            name_variants = self._generate_name_variants(name)

            for variant in name_variants:
                # Plain-name query; affiliation is only used for ranking below.
                query = variant
                print(f"Semantic Scholar查询: {query}")

                url = "https://api.semanticscholar.org/graph/v1/author/search"
                params = {
                    'query': query,
                    'limit': 5,  # a few candidates for disambiguation
                    'fields': 'authorId,name,citationCount,affiliations'
                }

                headers = {
                    'User-Agent': 'Academic-Reviewer-System/1.0'
                }

                response = requests.get(url, params=params, headers=headers, timeout=15)
                response.raise_for_status()

                data = response.json()
                if data.get('data'):
                    # Rank candidates by name/affiliation agreement.
                    best_match = self._find_best_semantic_author_match(data['data'], name, affiliation)
                    if best_match:
                        citation_count = best_match.get('citationCount', 0)
                        print(f"Semantic Scholar API: {name} 引用量: {citation_count}")
                        return citation_count

                print(f"Semantic Scholar API: 未找到 {variant} 的数据")

            return 0

        except Exception as e:
            print(f"Semantic Scholar API调用失败: {str(e)}")
            return 0
496
+
497
+ def _generate_name_variants(self, name: str) -> List[str]:
498
+ """生成姓名的多种变体"""
499
+ variants = [name] # 原始姓名
500
+
501
+ # 如果包含中间名,尝试不同的组合
502
+ name_parts = name.split()
503
+ if len(name_parts) >= 2:
504
+ # 只使用姓和名
505
+ variants.append(f"{name_parts[0]} {name_parts[-1]}")
506
+
507
+ # 如果有多于2个部分,尝试不同的组合
508
+ if len(name_parts) == 3:
509
+ variants.append(f"{name_parts[0]} {name_parts[1]}")
510
+ variants.append(f"{name_parts[1]} {name_parts[2]}")
511
+ elif len(name_parts) > 3:
512
+ # 对于更复杂的姓名,尝试简化
513
+ variants.append(f"{name_parts[0]} {name_parts[1]}")
514
+ variants.append(f"{name_parts[0]} {name_parts[-1]}")
515
+
516
+ # 去重并保持顺序
517
+ seen = set()
518
+ unique_variants = []
519
+ for variant in variants:
520
+ if variant not in seen:
521
+ seen.add(variant)
522
+ unique_variants.append(variant)
523
+
524
+ return unique_variants
525
+
526
    def _find_best_author_match(self, authors: List[Dict], target_name: str, target_affiliation: str) -> Optional[Dict]:
        """Choose the OpenAlex author record that best matches the target.

        Scoring: +10 for a (sub)string name match in either direction,
        +5 when any affiliation contains the target affiliation, +1 for
        having any citations.  Falls back to the first record when nothing
        scores above zero; returns None only for an empty input list.
        """
        if not authors:
            return None

        # Single hit: nothing to disambiguate.
        if len(authors) == 1:
            return authors[0]

        best_match = None
        best_score = 0

        for author in authors:
            score = 0
            author_name = author.get('display_name', '').lower()
            target_name_lower = target_name.lower()

            # Name match in either direction covers initials/shortened forms.
            if target_name_lower in author_name or author_name in target_name_lower:
                score += 10

            # Affiliation agreement (skipped for the "unknown" placeholder).
            affiliations = author.get('affiliations', [])
            if affiliations and target_affiliation and target_affiliation != "未知单位":
                for aff in affiliations:
                    aff_name = aff.get('display_name', '').lower()
                    if target_affiliation.lower() in aff_name:
                        score += 5
                        break

            # Mild preference for authors with any citations at all.
            citation_count = author.get('cited_by_count', 0)
            if citation_count > 0:
                score += 1

            if score > best_score:
                best_score = score
                best_match = author

        return best_match if best_score > 0 else authors[0]
567
+
568
+ def _find_best_semantic_author_match(self, authors: List[Dict], target_name: str, target_affiliation: str) -> Optional[Dict]:
569
+ """在Semantic Scholar结果中找到最佳匹配的作者"""
570
+ if not authors:
571
+ return None
572
+
573
+ # 如果只有一个结果,直接返回
574
+ if len(authors) == 1:
575
+ return authors[0]
576
+
577
+ # 计算每个作者的匹配分数
578
+ best_match = None
579
+ best_score = 0
580
+
581
+ for author in authors:
582
+ score = 0
583
+ author_name = author.get('name', '').lower()
584
+ target_name_lower = target_name.lower()
585
+
586
+ # 姓名匹配分数
587
+ if target_name_lower in author_name or author_name in target_name_lower:
588
+ score += 10
589
+
590
+ # 检查机构匹配
591
+ affiliations = author.get('affiliations', [])
592
+ if affiliations and target_affiliation and target_affiliation != "未知单位":
593
+ for aff in affiliations:
594
+ aff_name = aff.get('name', '').lower()
595
+ if target_affiliation.lower() in aff_name:
596
+ score += 5
597
+ break
598
+
599
+ # 引用量作为权重
600
+ citation_count = author.get('citationCount', 0)
601
+ if citation_count > 0:
602
+ score += 1
603
+
604
+ if score > best_score:
605
+ best_score = score
606
+ best_match = author
607
+
608
+ return best_match if best_score > 0 else authors[0]
609
+
reviewer_recommendation/models.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 数据模型定义
3
+ 定义审稿人推荐系统使用的核心数据结构
4
+ """
5
+
6
+ from typing import List, Dict, Any, Optional
7
+ from pydantic import BaseModel, Field
8
+
9
+
10
class PaperInfo(BaseModel):
    """The paper seeking reviewers: title/abstract are mandatory; keywords,
    authors and affiliations default to empty lists (the latter two drive
    conflict-of-interest filtering)."""
    title: str = Field(..., description="论文标题")
    abstract: str = Field(..., description="论文摘要")
    keywords: List[str] = Field(default_factory=list, description="论文关键词")
    authors: List[str] = Field(default_factory=list, description="作者姓名列表")
    affiliations: List[str] = Field(default_factory=list, description="作者所属机构列表")
17
+
18
+
19
class Reviewer(BaseModel):
    """A recommended reviewer; relevance_score is validated to [0, 1], and
    affiliation/email fall back to placeholder defaults when unknown."""
    name: str = Field(..., description="审稿人姓名")
    affiliation: str = Field(default="Unknown", description="所属机构")
    email: str = Field(default="unknown@example.com", description="邮箱地址")
    reason: str = Field(..., description="推荐理由")
    relevance_score: float = Field(..., ge=0.0, le=1.0, description="相关性评分")
    expertise_areas: List[str] = Field(default_factory=list, description="专业领域")
27
+
28
+
29
class SearchResult(BaseModel):
    """One retrieved preprint: bibliographic fields plus the query string
    that produced it (for provenance)."""
    doi: Optional[str] = Field(None, description="DOI")
    title: str = Field(..., description="论文标题")
    abstract: str = Field(..., description="论文摘要")
    corresponding_author: Optional[str] = Field(None, description="通讯作者")
    corresponding_institution: Optional[str] = Field(None, description="通讯作者机构")
    query_used: str = Field(..., description="使用的查询词")
37
+
38
+
39
class RecommendationRequest(BaseModel):
    """Inbound request: the paper plus how many reviewers to return
    (validated to 1-10)."""
    paper: PaperInfo
    reviewer_count: int = Field(..., ge=1, le=10, description="推荐审稿人数量")
43
+
44
+
45
class RecommendationResponse(BaseModel):
    """Outbound result: the reviewer list with timing/volume metadata and a
    success flag; error_message is populated only on failure."""
    reviewers: List[Reviewer] = Field(default_factory=list, description="推荐的审稿人列表")
    search_time: float = Field(..., description="搜索耗时(秒)")
    total_candidates: int = Field(..., description="候选者总数")
    success: bool = Field(..., description="是否成功")
    error_message: Optional[str] = Field(None, description="错误信息")
52
+
53
+
54
class AppState(BaseModel):
    """Mutable UI/session state: the request/response in flight, a busy
    flag, and the most recent error (if any)."""
    current_request: Optional[RecommendationRequest] = None
    current_response: Optional[RecommendationResponse] = None
    is_processing: bool = False
    last_error: Optional[str] = None
reviewer_recommendation/searcher copy.py ADDED
@@ -0,0 +1,666 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 学术检索模块
3
+ 提供基于EPMC和bioRxiv的学术文献检索功能
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import time
9
+ import urllib.parse
10
+ import requests
11
+ import warnings
12
+ import ssl
13
+ from typing import List, Dict, Any, Optional
14
+ from itertools import combinations
15
+ from concurrent.futures import ThreadPoolExecutor, as_completed
16
+ import threading
17
+
18
+ # 抑制SSL警告
19
+ warnings.filterwarnings('ignore', message='Unverified HTTPS request')
20
+
21
+ from .models import PaperInfo, SearchResult
22
+
23
+
24
# Configuration: API keys are read from the environment so that no secret is
# committed to source control (the original hard-coded a live DashScope key).
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
27
+
28
+ # 生物学关键词分类体系
29
+ BIOLOGY_KEYWORDS = {
30
+ "Molecular & Structural Biology": [
31
+ "Cryo-EM",
32
+ "X-ray crystallography",
33
+ "NMR spectroscopy",
34
+ "Single-particle analysis",
35
+ "Biolayer interferometry (BLI)",
36
+ "Surface plasmon resonance (SPR)",
37
+ "Confocal microscopy",
38
+ "CRISPR-Cas9",
39
+ "TALEN",
40
+ "ZFN",
41
+ "RNA interference (RNAi)",
42
+ "Single-molecule imaging",
43
+ "FRET",
44
+ "Optogenetics"
45
+ ],
46
+
47
+ "Cell & Single-Cell Technologies": [
48
+ "Single-cell RNA-seq (scRNA-seq)",
49
+ "Single-cell ATAC-seq",
50
+ "Spatial transcriptomics",
51
+ "FISH (Fluorescence in situ hybridization)",
52
+ "Immunofluorescence",
53
+ "Tissue clearing (CLARITY)",
54
+ "Flow cytometry (FACS)",
55
+ "CyTOF (Mass cytometry)",
56
+ "High-throughput screening",
57
+ "Organoids",
58
+ "3D cell culture",
59
+ "Microfluidics"
60
+ ],
61
+
62
+ "Neuroscience Tools": [
63
+ "Optogenetics",
64
+ "DREADDs (Designer Receptors Exclusively Activated by Designer Drugs)",
65
+ "GCaMP calcium imaging",
66
+ "Two-photon microscopy",
67
+ "Neural tracing",
68
+ "Patch-seq",
69
+ "Lineage tracing",
70
+ "Spatial multi-omics"
71
+ ],
72
+
73
+ "Omics & Systems Biology": [
74
+ "RNA sequencing (RNA-seq)",
75
+ "Proteomics (LC-MS/MS)",
76
+ "Metabolomics",
77
+ "Epigenomics",
78
+ "10x Genomics",
79
+ "SMART-seq",
80
+ "Nanopore sequencing",
81
+ "Illumina HiSeq",
82
+ "WGCNA",
83
+ "Machine learning in omics",
84
+ "scVelo"
85
+ ],
86
+
87
+ "Microbiome & Immunology": [
88
+ "16S rRNA sequencing",
89
+ "Metagenomics",
90
+ "Gut-brain axis",
91
+ "VDJ-seq",
92
+ "TCR/BCR lineage tracing",
93
+ "Immune checkpoints (PD-1, CTLA-4)",
94
+ "mRNA vaccines",
95
+ "DNA vaccines",
96
+ "Nanoparticle vaccines",
97
+ "Antigen presentation systems"
98
+ ],
99
+
100
+ "Development & Regeneration": [
101
+ "Induced pluripotent stem cells (iPSCs)",
102
+ "Embryonic stem cells (ESCs)",
103
+ "Cellular reprogramming",
104
+ "Wnt signaling",
105
+ "Hippo pathway",
106
+ "Notch signaling",
107
+ "Zebrafish models",
108
+ "C. elegans",
109
+ "Mouse embryonic sections"
110
+ ],
111
+
112
+ "Ecology & Environmental Biology": [
113
+ "Environmental DNA (eDNA)",
114
+ "Remote sensing ecology",
115
+ "Biosensors",
116
+ "Ecological niche modeling (ENM)",
117
+ "Genetic diversity analysis",
118
+ "Captive breeding technologies"
119
+ ],
120
+
121
+ "Bioinformatics & AI Tools": [
122
+ "Seurat",
123
+ "Scanpy",
124
+ "Monocle",
125
+ "CIBERSORT",
126
+ "GSEA",
127
+ "AlphaFold",
128
+ "RoseTTAFold",
129
+ "Molecular docking",
130
+ "STRING",
131
+ "Cytoscape",
132
+ "Gene Ontology (GO)",
133
+ "KEGG pathway analysis"
134
+ ]
135
+ }
136
+
137
+
138
+ class AcademicSearcher:
139
+ """基础学术检索器,仅负责数据获取,不做任何分析"""
140
+
141
+ EPMC_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
142
+ BIORXIV_URL = "https://api.biorxiv.org/details/biorxiv/{doi}/na/json"
143
+
144
+ def __init__(self, limit=50, sleep=0.1, timeout=30):
145
+ self.limit = limit
146
+ self.sleep = sleep
147
+ self.timeout = timeout
148
+ # 创建自定义SSL上下文
149
+ self.ssl_context = ssl.create_default_context()
150
+ self.ssl_context.check_hostname = False
151
+ self.ssl_context.verify_mode = ssl.CERT_NONE
152
+
153
+ def search(self, query: str) -> List[Dict[str, Any]]:
154
+ """执行检索并返回原始文献数据"""
155
+ try:
156
+ # 1. 获取DOI列表
157
+ epmc_results = self._epmc_search(query)
158
+
159
+ # 2. 并行获取详细信息
160
+ detailed_results = self._get_details_parallel(epmc_results, query)
161
+
162
+ return detailed_results
163
+ except Exception as e:
164
+ print(f"检索错误: {str(e)}")
165
+ return []
166
+
167
    def _get_details_parallel(self, epmc_results: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
        """Fetch bioRxiv detail records for each EPMC hit concurrently.

        Returns up to self.limit detail dicts, each tagged with the query
        that produced it ('query_used').  Failures for individual DOIs are
        printed and skipped.
        """
        detailed_results = []

        # No hits: nothing to fetch.
        if not epmc_results:
            return detailed_results

        # Cap concurrency to avoid hammering the bioRxiv API.
        max_workers = min(5, len(epmc_results))

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit one detail fetch per DOI-bearing result.
            future_to_item = {}
            for item in epmc_results:
                doi = item.get("doi")
                if doi:
                    future = executor.submit(self._get_biorxiv_detail, doi)
                    future_to_item[future] = item

            # Collect in completion order; stop once the limit is reached
            # (futures already submitted still complete in the background).
            for future in as_completed(future_to_item):
                item = future_to_item[future]
                try:
                    detail = future.result(timeout=10)  # per-future timeout
                    if detail:
                        detail["query_used"] = query
                        detailed_results.append(detail)

                        if len(detailed_results) >= self.limit:
                            break

                except Exception as e:
                    print(f"获取详情失败: {item.get('doi')} - {str(e)}")
                    continue

        return detailed_results
204
+
205
    def _epmc_search(self, query: str) -> List[Dict[str, Any]]:
        """Search Europe PMC for bioRxiv-style preprints matching `query`.

        Restricts to preprints (SRC:PPR) with a 10.1101* DOI prefix, sorted
        by citation count, over-fetching up to 2x the limit for later
        filtering.  Retries up to three times with exponential backoff;
        returns [] on persistent failure.
        """
        params = {
            "query": f'(SRC:PPR) AND (DOI:10.1101*) AND ({query})',
            "resultType": "core",
            "pageSize": str(min(100, self.limit * 2)),  # over-fetch for filtering
            "format": "json",
            "sortby": "cited",  # most-cited first
        }

        # Retry loop; error classes are separated only for distinct logging,
        # all back off exponentially (1s, 2s) before the next attempt.
        for attempt in range(3):
            try:
                response = requests.get(
                    self.EPMC_URL,
                    params=params,
                    timeout=self.timeout,
                    verify=False,  # NOTE(review): SSL verification deliberately disabled
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                    }
                )
                response.raise_for_status()
                data = response.json()
                results = data.get("resultList", {}).get("result", [])

                # Debug output: hit count plus citation info for the top hits.
                if results:
                    print(f"EPMC检索到 {len(results)} 篇文献,按引用量排序")
                    for i, result in enumerate(results[:3]):
                        cited_count = result.get('citedByCount', 0)
                        title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
                        print(f" 文献 {i+1}: {title} (引用量: {cited_count})")

                return results

            except requests.exceptions.SSLError as e:
                print(f"EPMC SSL错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC SSL连接失败,返回空结果")
                    return []
                time.sleep(2 ** attempt)

            except requests.exceptions.RequestException as e:
                print(f"EPMC请求错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC请求失败,返回空结果")
                    return []
                time.sleep(2 ** attempt)

            except Exception as e:
                print(f"EPMC未知错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC未知错误,返回空结果")
                    return []
                time.sleep(2 ** attempt)

        return []
264
+
265
    def _get_biorxiv_detail(self, doi: str) -> Optional[Dict[str, Any]]:
        """Fetch title/abstract/corresponding-author metadata for a DOI from
        the bioRxiv details API.

        Uses the newest record version.  Returns None for empty responses,
        non-bioRxiv records, or persistent request failures.  (Return
        annotation widened to Optional: every failure path yields None.)
        """
        url = self.BIORXIV_URL.format(doi=urllib.parse.quote(doi))

        # Three attempts with exponential backoff; error classes separated
        # only for distinct log messages.
        for attempt in range(3):
            try:
                response = requests.get(
                    url,
                    timeout=self.timeout,
                    verify=False,  # NOTE(review): SSL verification deliberately disabled
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                    }
                )
                response.raise_for_status()
                data = response.json()

                records = data.get("collection") or data.get("records") or []
                if not records:
                    return None

                # The last record is the most recent version of the preprint.
                latest_record = records[-1]
                if latest_record.get("server") and latest_record["server"].lower() != "biorxiv":
                    return None

                version = latest_record.get("version") or 1
                return {
                    "doi": latest_record.get("doi"),
                    "title": latest_record.get("title"),
                    "abstract": latest_record.get("abstract"),
                    "corresponding_author": latest_record.get("author_corresponding"),
                    "corresponding_institution": latest_record.get("author_corresponding_institution"),
                    "url": f"https://www.biorxiv.org/content/{latest_record['doi']}v{version}"
                }

            except requests.exceptions.SSLError as e:
                print(f"bioRxiv SSL错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # final attempt exhausted
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

            except requests.exceptions.RequestException as e:
                print(f"bioRxiv请求错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # final attempt exhausted
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

            except Exception as e:
                print(f"bioRxiv未知错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # final attempt exhausted
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

        return None
324
+
325
+
326
class DynamicAcademicSearcher:
    """Dynamic academic searcher: LLM-driven query generation, dynamic query
    expansion, and parallel execution on top of a base :class:`AcademicSearcher`.
    """

    def __init__(self, base_searcher: AcademicSearcher):
        # All raw retrieval is delegated to this base searcher.
        self.base_searcher = base_searcher

    def search_with_dynamic_queries(self, paper: PaperInfo, num_queries: int = 2) -> List[Dict[str, Any]]:
        """Generate queries for *paper* and run them in parallel.

        Returns the union of all query results, deduplicated by DOI
        (results without a DOI are dropped).
        """
        # 1. Generate the search queries.
        queries = self.generate_search_queries(paper, num_queries)
        print("生成的检索查询:")
        for i, query in enumerate(queries, 1):
            print(f"查询 {i}: {query}")

        # Record what was generated.
        self._log_query_generation(paper, queries)

        # 2. Run the dynamic searches in parallel (I/O-bound HTTP work).
        all_candidates = []

        with ThreadPoolExecutor(max_workers=3) as executor:
            # One task per query.
            future_to_query = {}
            for query in queries:
                future = executor.submit(self._execute_single_query, query)
                future_to_query[future] = query

            # Collect results as they complete.
            for future in as_completed(future_to_query):
                query = future_to_query[future]
                try:
                    results = future.result(timeout=30)  # 30 s per query
                    if results:
                        print(f"查询 '{query}' 完成,找到 {len(results)} 篇文献")
                        all_candidates.extend(results)
                    else:
                        print(f"查询 '{query}' 未找到文献")
                except Exception as e:
                    print(f"查询 '{query}' 执行失败: {str(e)}")

        # 3. Deduplicate identical papers by DOI.
        unique_candidates = {item['doi']: item for item in all_candidates if item.get('doi')}.values()

        print(f"并行检索完成,总共找到 {len(list(unique_candidates))} 篇唯一文献")
        return list(unique_candidates)

    def _execute_single_query(self, query: str) -> List[Dict[str, Any]]:
        """Worker for the parallel search: expand *query* into variants and
        return the first non-empty result set, or ``[]`` if all variants miss.
        """
        print(f"开始执行查询: {query}")

        # Expand the query into progressively simpler fallback variants.
        processed_queries = self._process_query_dynamically(query)

        for processed_query in processed_queries:
            print(f" 尝试查询: {processed_query}")
            results = self.base_searcher.search(processed_query)

            if results:
                print(f" 找到 {len(results)} 篇文献")
                return results
            else:
                print(f" 未找到文献,尝试扩展查询...")

        print(f" 所有扩展查询都未找到文献")
        return []

    def generate_search_queries(self, paper: PaperInfo, num_queries: int = 2) -> List[str]:
        """Generate two-level search queries via the LLM, grounded in the
        biology keyword taxonomy; falls back to heuristics on LLM failure.

        NOTE(review): the prompt always asks for exactly 2 queries regardless
        of *num_queries*; only the final slice honors the parameter — confirm.
        """
        system_msg = "你是生物学检索策略专家,擅长分析论文的研究领域和技术方法,并生成精准的检索查询"

        # Render the keyword taxonomy for the prompt.
        keywords_str = ""
        for category, keywords in BIOLOGY_KEYWORDS.items():
            keywords_str += f"\n{category}:\n"
            for keyword in keywords:
                keywords_str += f" - {keyword}\n"

        prompt = f"""
请分析以下论文,按照以下步骤生成2个检索查询:

论文信息:
标题: {paper.title}
摘要: {paper.abstract}
关键词: {', '.join(paper.keywords)}

生物学关键词分类体系:
{keywords_str}

步骤1: 确定第一个检索查询(大类 + 子类)
1. 从上述分类体系中选择最匹配的1个大类(如"Molecular & Structural Biology")
2. 从该大类下选择最匹配的1个子类关键词(如"Cryo-EM")
3. 生成查询:大类 AND 子类关键词
4. 格式示例:Molecular & Structural Biology AND Cryo-EM

步骤2: 确定第二个检索查询(子类 + 论文特定关键词)
1. 使用步骤1中确定的子类关键词
2. 从论文标题、摘要或关键词中提取1个最核心的特定关键词
3. 生成查询:子类关键词 AND 论文特定关键词
4. 格式示例:Cryo-EM AND Nav1.7

要求:
1. 每个查询只使用2个关键词,用AND连接
2. 第一个查询:大类 AND 子类
3. 第二个查询:子类 AND 论文特定关键词
4. 论文特定关键词要简洁明确,适合学术数据库检索
5. 仅返回查询语句,每行一个,不添加编号或其他内容

输出格式示例:
Molecular & Structural Biology AND Cryo-EM
Cryo-EM AND Nav1.7
"""

        response = self._call_llm(prompt.strip(), system_msg)
        if not response:
            # LLM unavailable or failed: use heuristic backup queries.
            return self._generate_backup_queries(paper, num_queries)

        # One query per non-empty response line.
        queries = [q.strip() for q in response.strip().split('\n') if q.strip()]

        # Keep only well-formed "<A> AND <B>" queries.
        validated_queries = self._validate_queries(queries)

        return validated_queries[:num_queries] if validated_queries else self._generate_backup_queries(paper, num_queries)

    def _validate_queries(self, queries: List[str]) -> List[str]:
        """Filter *queries* down to well-formed ``'<part> AND <part>'`` strings
        that reference the biology keyword taxonomy and are not overly long."""
        validated_queries = []

        for query in queries:
            # Basic length check.
            if not query or len(query.strip()) < 5:
                print(f"查询太短,跳过: {query}")
                continue

            # Must contain the AND connector.
            if ' AND ' not in query:
                print(f"查询缺少AND连接符,跳过: {query}")
                continue

            # Exactly two keyword parts (discipline AND research-level keyword).
            parts = query.split(' AND ')
            if len(parts) != 2:
                print(f"查询格式不正确,跳过: {query}")
                continue

            # Each part must be non-empty and not the literal "AND".
            part1 = parts[0].strip()
            part2 = parts[1].strip()

            if not part1 or not part2:
                print(f"查询包含空部分,跳过: {query}")
                continue

            if part1.upper() == 'AND' or part2.upper() == 'AND':
                print(f"查询包含无效AND,跳过: {query}")
                continue

            # Must mention a taxonomy category or one of its keywords.
            has_biology_keyword = False
            for category, keywords in BIOLOGY_KEYWORDS.items():
                if category.lower() in query.lower():
                    has_biology_keyword = True
                    break
                for keyword in keywords:
                    if keyword.lower() in query.lower():
                        has_biology_keyword = True
                        break
                if has_biology_keyword:
                    break

            if not has_biology_keyword:
                print(f"查询不包含生物学关键词分类,跳过: {query}")
                continue

            # Reject queries that are unreasonably long.
            if len(query) > 100:
                print(f"查询过长,跳过: {query}")
                continue

            validated_queries.append(query.strip())
            print(f"查询验证通过: {query}")

        return validated_queries

    def _process_query_dynamically(self, query: str) -> List[str]:
        """Expand one query into simpler fallback variants (deduplicated).

        NOTE(review): ``set()`` does not preserve insertion order, so the
        order in which variants are tried is nondeterministic — confirm this
        is intended.
        """
        # Always include the original query.
        queries = [query]

        # Only well-formed "A AND B" queries are expanded further.
        if ' AND ' not in query:
            return queries

        parts = query.split(' AND ')
        if len(parts) != 2:
            return queries

        part1 = parts[0].strip()
        part2 = parts[1].strip()

        # Skip degenerate parts.
        if not part1 or not part2 or part1.upper() == 'AND' or part2.upper() == 'AND':
            return queries

        # Simplified variants keep only the leading word of a multi-word part.
        part1_words = part1.split()
        part2_words = part2.split()

        if len(part1_words) > 1:
            # Leading keyword of part 1.
            simplified_part1 = part1_words[0]
            queries.append(f"{simplified_part1} AND {part2}")

        if len(part2_words) > 1:
            # Leading keyword of part 2.
            simplified_part2 = part2_words[0]
            queries.append(f"{part1} AND {simplified_part2}")

        # Single-keyword fallbacks.
        queries.append(part1)
        queries.append(part2)

        return list(set(queries))  # deduplicate

    def _generate_backup_queries(self, paper: PaperInfo, num_queries: int) -> List[str]:
        """Heuristic fallback queries built from the biology keyword taxonomy,
        used when the LLM yields nothing usable."""
        queries = []

        # Infer the best-matching taxonomy (category, keyword) from the paper.
        best_category, best_keyword = self._infer_biology_keywords(paper)

        if not best_category or not best_keyword:
            # Nothing matched: fall back to a fixed default.
            best_category = "Molecular & Structural Biology"
            best_keyword = "Cryo-EM"

        # Query 1: category AND sub-keyword.
        queries.append(f"{best_category} AND {best_keyword}")

        # Pull one distinctive word from the title for query 2.
        title_words = paper.title.split()
        specific_keyword = None
        for word in title_words:
            if len(word) > 3 and word.lower() not in ['the', 'and', 'for', 'with', 'from', 'this', 'that', 'structures', 'human', 'channel', 'complex', 'with', 'auxiliary', 'subunits', 'animal', 'toxins']:
                specific_keyword = word
                break

        if specific_keyword:
            # Query 2: sub-keyword AND paper-specific keyword.
            queries.append(f"{best_keyword} AND {specific_keyword}")
        else:
            # No distinctive word found: use a generic variant of query 1.
            queries.append(f"{best_category} AND structure")

        return queries[:num_queries]

    def _infer_biology_keywords(self, paper: PaperInfo) -> tuple:
        """Score each taxonomy category against the paper text and return the
        best ``(category, keyword)`` pair, or ``(None, None)`` if nothing
        matches at all."""
        text = f"{paper.title} {paper.abstract} {' '.join(paper.keywords)}".lower()

        best_category = None
        best_keyword = None
        max_matches = 0

        for category, keywords in BIOLOGY_KEYWORDS.items():
            category_matches = 0
            best_keyword_in_category = None

            # A category-name hit weighs double.
            if category.lower() in text:
                category_matches += 2

            # Each keyword hit weighs one; remember the first hit.
            for keyword in keywords:
                if keyword.lower() in text:
                    category_matches += 1
                    if not best_keyword_in_category:
                        best_keyword_in_category = keyword

            # Keep the best-scoring category so far.
            if category_matches > max_matches:
                max_matches = category_matches
                best_category = category
                best_keyword = best_keyword_in_category or keywords[0]

        return best_category, best_keyword

    def _log_query_generation(self, paper: PaperInfo, queries: List[str]):
        """Print a one-shot log record of the generated queries."""
        log_info = {
            "paper_title": paper.title[:100] + "..." if len(paper.title) > 100 else paper.title,
            "paper_keywords": paper.keywords,
            "generated_queries": queries,
            "query_count": len(queries),
            "timestamp": time.time()
        }
        print(f"查询生成日志: {log_info}")

    def _call_llm(self, prompt: str, system_msg: str) -> Optional[str]:
        """Call the configured LLM (DashScope preferred, OpenAI as fallback).

        Returns the raw completion text, or ``None`` on any failure. Note the
        DashScope branch falls through to an implicit ``None`` when the API
        reports a non-200 status.
        """
        try:
            if DASHSCOPE_API_KEY:
                import dashscope
                dashscope.api_key = DASHSCOPE_API_KEY

                response = dashscope.Generation.call(
                    model="qwen-turbo",
                    messages=[
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": prompt}
                    ],
                    timeout=30
                )
                if response.status_code == 200:
                    return response.output.text
                else:
                    print(f"DashScope API错误: {response.message}")

            elif OPENAI_API_KEY:
                from openai import OpenAI
                client = OpenAI(api_key=OPENAI_API_KEY)
                response = client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": prompt}
                    ],
                    timeout=30
                )
                return response.choices[0].message.content

            else:
                print("未配置API密钥")
                return None

        except Exception as e:
            print(f"大模型调用错误: {str(e)}")
            return None
reviewer_recommendation/searcher.py ADDED
@@ -0,0 +1,1128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 学术检索模块
3
+ 提供基于EPMC、bioRxiv和OpenAlex的学术文献检索功能
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import time
9
+ import urllib.parse
10
+ import requests
11
+ import warnings
12
+ import ssl
13
+ from typing import List, Dict, Any, Optional
14
+ from itertools import combinations
15
+ from concurrent.futures import ThreadPoolExecutor, as_completed
16
+ import threading
17
+
18
+ # 抑制SSL警告
19
+ warnings.filterwarnings('ignore', message='Unverified HTTPS request')
20
+
21
+ from .models import PaperInfo, SearchResult
22
+
23
+
24
class OpenAlexSearcher:
    """OpenAlex academic searcher; provides high-quality citation-count data."""

    OPENALEX_URL = "https://api.openalex.org/works"

    def __init__(self, limit=50, sleep=0.1, timeout=30):
        # limit: default cap on returned candidate papers
        # sleep: politeness delay between requests (seconds); not used in search()
        # timeout: per-request timeout in seconds
        self.limit = limit
        self.sleep = sleep
        self.timeout = timeout
        # OpenAlex asks clients to self-identify with a mailto User-Agent.
        self.headers = {
            'User-Agent': 'AcademicReviewerSystem/1.0 (mailto:moahgzantony@gmail.com)'
        }

    def search(self, query: str, sort_by_citations: bool = False, years_after: int = None, target_count: int = None) -> List[Dict[str, Any]]:
        """Run an OpenAlex search and return normalized paper records.

        sort_by_citations: sort by cited_by_count instead of relevance.
        years_after: if given, only papers published in the last N years.
        target_count: if given, request that many records (capped at 200).
        Returns [] on any failure.
        """
        try:
            # Simplify the query: strip characters that can break the API.
            clean_query = query.replace(' AND ', ' ').replace('&', 'and').replace('(', '').replace(')', '')
            clean_query = ' '.join(clean_query.split())  # collapse whitespace

            # Decide how many records to request.
            if target_count is not None:
                per_page = min(target_count, 200)  # OpenAlex caps one page at 200
            else:
                per_page = min(self.limit, 20)  # default cap on candidates

            # Build query params (simplified; avoids `select` parameter issues).
            params = {
                "search": clean_query,
                "per-page": per_page
            }

            # Optional publication-year filter.
            if years_after is not None:
                from datetime import datetime
                current_year = datetime.now().year
                target_year = current_year - years_after
                # Must use OpenAlex's from_publication_date filter syntax.
                params["filter"] = f"from_publication_date:{target_year}-01-01"
                print(f"年份过滤: 只检索{target_year}年及以后发表的论文")
                print(f"日期过滤: {params['filter']}")

            # Choose the sort order.
            if sort_by_citations:
                params["sort"] = "cited_by_count:desc"
            else:
                params["sort"] = "relevance_score:desc"

            print(f"OpenAlex检索查询: {query} -> {clean_query}")
            print(f"排序方式: {'按引用量' if sort_by_citations else '按相关性'}")

            # Build the URL by hand so the ':' inside sort/filter values is
            # not percent-encoded (OpenAlex expects it literal).
            import urllib.parse

            query_parts = []
            for key, value in params.items():
                if (key == "sort" or key == "filter") and ":" in str(value):
                    # sort/filter keep their colon unescaped
                    query_parts.append(f"{key}={value}")
                else:
                    query_parts.append(f"{key}={urllib.parse.quote(str(value))}")

            query_string = "&".join(query_parts)
            full_url = f"{self.OPENALEX_URL}?{query_string}"

            print(f"完整URL: {full_url}")

            response = requests.get(
                full_url,
                headers=self.headers,
                timeout=self.timeout
            )
            response.raise_for_status()
            data = response.json()

            items = data.get("results", [])
            total_results = data.get("meta", {}).get("count", 0)

            print(f"OpenAlex检索到 {len(items)} 篇文献,总命中数: {total_results}")

            # Convert each raw record to the shared result format.
            results = []
            for item in items:
                result = self._convert_openalex_item(item, query)
                if result:
                    results.append(result)

            # Debug: show citation counts for the first few hits.
            if results:
                print(f"OpenAlex检索结果(按引用量排序):")
                for i, result in enumerate(results[:3]):
                    cited_count = result.get('citedByCount', 0)
                    title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
                    print(f" 文献 {i+1}: {title} (引用量: {cited_count})")

            return results

        except Exception as e:
            print(f"OpenAlex检索失败: {str(e)}")
            return []

    def _convert_openalex_item(self, item: Dict[str, Any], query: str) -> Optional[Dict[str, Any]]:
        """Convert one raw OpenAlex work into the standard result dict.

        Returns ``None`` for records without a title or on conversion errors.
        """
        try:
            # A title is mandatory.
            title = item.get('title', '')
            if not title:
                return None

            # Rebuild the abstract text from OpenAlex's inverted index
            # (word -> list of positions).
            abstract = ""
            abstract_inverted = item.get('abstract_inverted_index', {})
            if abstract_inverted:
                abstract_words = []
                for word, positions in abstract_inverted.items():
                    for pos in positions:
                        abstract_words.append((pos, word))
                abstract_words.sort(key=lambda x: x[0])
                abstract = ' '.join([word for pos, word in abstract_words])

            # Authors plus corresponding author/institution.
            authorships = item.get('authorships', [])
            authors = []
            corresponding_author = None
            corresponding_institution = None

            for authorship in authorships:
                author = authorship.get('author', {})
                if author:
                    author_name = author.get('display_name', '')
                    if author_name:
                        authors.append(author_name)

                    # Corresponding author: flagged as such, or the first author.
                    if authorship.get('is_corresponding', False) or len(authors) == 1:
                        corresponding_author = author_name

                        # First listed institution of that authorship.
                        # NOTE(review): nesting reconstructed from a mangled
                        # paste — confirm this block belongs inside the
                        # corresponding-author branch.
                        institutions = authorship.get('institutions', [])
                        if institutions:
                            institution = institutions[0].get('display_name', '')
                            if institution:
                                corresponding_institution = institution

            # Journal / venue name.
            primary_location = item.get('primary_location', {})
            source = primary_location.get('source', {})
            journal = source.get('display_name', '') if source else ''

            # Publication year.
            pub_year = item.get('publication_year', '')

            # Citation metrics.
            cited_by_count = item.get('cited_by_count', 0)
            citation_count = item.get('citation_count', 0)
            referenced_works_count = item.get('referenced_works_count', 0)

            # DOI, with the https://doi.org/ prefix stripped.
            doi = ""
            external_ids = item.get('ids', {})
            if external_ids:
                doi = external_ids.get('doi', '')
                if doi and doi.startswith('https://doi.org/'):
                    doi = doi.replace('https://doi.org/', '')

            # Assemble the normalized record.
            result = {
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'corresponding_author': corresponding_author,
                'corresponding_institution': corresponding_institution,
                'journal': journal,
                'publication_year': pub_year,
                'doi': doi,
                'citedByCount': cited_by_count,  # same field name EPMC uses
                'citation_count': citation_count,
                'referenced_works_count': referenced_works_count,
                'query_used': query,
                'source': 'openalex',
                'openalex_id': item.get('id', ''),
                'type': item.get('type', ''),
                'open_access': item.get('open_access', {}).get('is_oa', False)
            }

            return result

        except Exception as e:
            print(f"转换OpenAlex数据失败: {str(e)}")
            return None
216
+
217
+
218
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# SECURITY FIX: the DashScope API key was previously hard-coded in this file.
# It is now read from the environment (matching OPENAI_API_KEY below); the
# previously committed key must be revoked and rotated.
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")

# Biology keyword taxonomy: category name -> representative technique
# keywords. Used both for LLM prompt construction and heuristic matching.
BIOLOGY_KEYWORDS = {
    "Molecular & Structural Biology": [
        "Cryo-EM",
        "X-ray crystallography",
        "NMR spectroscopy",
        "Single-particle analysis",
        "Biolayer interferometry (BLI)",
        "Surface plasmon resonance (SPR)",
        "Confocal microscopy",
        "CRISPR-Cas9",
        "TALEN",
        "ZFN",
        "RNA interference (RNAi)",
        "Single-molecule imaging",
        "FRET",
        "Optogenetics"
    ],

    "Cell & Single-Cell Technologies": [
        "Single-cell RNA-seq (scRNA-seq)",
        "Single-cell ATAC-seq",
        "Spatial transcriptomics",
        "FISH (Fluorescence in situ hybridization)",
        "Immunofluorescence",
        "Tissue clearing (CLARITY)",
        "Flow cytometry (FACS)",
        "CyTOF (Mass cytometry)",
        "High-throughput screening",
        "Organoids",
        "3D cell culture",
        "Microfluidics"
    ],

    "Neuroscience Tools": [
        "Optogenetics",
        "DREADDs (Designer Receptors Exclusively Activated by Designer Drugs)",
        "GCaMP calcium imaging",
        "Two-photon microscopy",
        "Neural tracing",
        "Patch-seq",
        "Lineage tracing",
        "Spatial multi-omics"
    ],

    "Omics & Systems Biology": [
        "RNA sequencing (RNA-seq)",
        "Proteomics (LC-MS/MS)",
        "Metabolomics",
        "Epigenomics",
        "10x Genomics",
        "SMART-seq",
        "Nanopore sequencing",
        "Illumina HiSeq",
        "WGCNA",
        "Machine learning in omics",
        "scVelo"
    ],

    "Microbiome & Immunology": [
        "16S rRNA sequencing",
        "Metagenomics",
        "Gut-brain axis",
        "VDJ-seq",
        "TCR/BCR lineage tracing",
        "Immune checkpoints (PD-1, CTLA-4)",
        "mRNA vaccines",
        "DNA vaccines",
        "Nanoparticle vaccines",
        "Antigen presentation systems"
    ],

    "Development & Regeneration": [
        "Induced pluripotent stem cells (iPSCs)",
        "Embryonic stem cells (ESCs)",
        "Cellular reprogramming",
        "Wnt signaling",
        "Hippo pathway",
        "Notch signaling",
        "Zebrafish models",
        "C. elegans",
        "Mouse embryonic sections"
    ],

    "Ecology & Environmental Biology": [
        "Environmental DNA (eDNA)",
        "Remote sensing ecology",
        "Biosensors",
        "Ecological niche modeling (ENM)",
        "Genetic diversity analysis",
        "Captive breeding technologies"
    ],

    "Bioinformatics & AI Tools": [
        "Seurat",
        "Scanpy",
        "Monocle",
        "CIBERSORT",
        "GSEA",
        "AlphaFold",
        "RoseTTAFold",
        "Molecular docking",
        "STRING",
        "Cytoscape",
        "Gene Ontology (GO)",
        "KEGG pathway analysis"
    ]
}
330
+
331
+
332
class AcademicSearcher:
    """Base academic searcher: fetches raw records only, performs no analysis."""

    EPMC_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    BIORXIV_URL = "https://api.biorxiv.org/details/biorxiv/{doi}/na/json"

    def __init__(self, limit=50, sleep=0.1, timeout=30):
        # limit: cap on detailed results; sleep: politeness delay (seconds);
        # timeout: per-request timeout in seconds.
        self.limit = limit
        self.sleep = sleep
        self.timeout = timeout
        # Custom SSL context with verification disabled.
        # NOTE(review): this context is never passed to the requests calls
        # below (they use verify=False instead) — confirm it is still needed.
        self.ssl_context = ssl.create_default_context()
        self.ssl_context.check_hostname = False
        self.ssl_context.verify_mode = ssl.CERT_NONE

    def search(self, query: str, search_preprints: bool = True) -> List[Dict[str, Any]]:
        """Run a search and return raw paper records (``[]`` on failure)."""
        try:
            # 1. Get the hit list (with DOIs) from Europe PMC.
            epmc_results = self._epmc_search(query, search_preprints)

            # 2. Fetch per-DOI details from bioRxiv in parallel.
            detailed_results = self._get_details_parallel(epmc_results, query)

            return detailed_results
        except Exception as e:
            print(f"检索错误: {str(e)}")
            return []

    def _get_details_parallel(self, epmc_results: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
        """Fetch bioRxiv details for the EPMC hits in parallel."""
        detailed_results = []

        # Nothing to do for an empty hit list.
        if not epmc_results:
            return detailed_results

        # Cap concurrency to avoid hammering the API.
        max_workers = min(5, len(epmc_results))

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # One detail-fetch task per item that has a DOI.
            future_to_item = {}
            for item in epmc_results:
                doi = item.get("doi")
                if doi:
                    future = executor.submit(self._get_biorxiv_detail, doi)
                    future_to_item[future] = item

            # Collect results as they complete.
            for future in as_completed(future_to_item):
                item = future_to_item[future]
                try:
                    detail = future.result(timeout=10)  # 10 s per detail fetch
                    if detail:
                        detail["query_used"] = query
                        detailed_results.append(detail)

                        # Stop once the configured limit is reached.
                        if len(detailed_results) >= self.limit:
                            break

                except Exception as e:
                    print(f"获取详情失败: {item.get('doi')} - {str(e)}")
                    continue

        return detailed_results

    def _epmc_search(self, query: str, search_preprints: bool = True) -> List[Dict[str, Any]]:
        """Query Europe PMC and return the raw result list, citation-sorted."""
        if search_preprints:
            # Preprints only (bioRxiv DOIs start with 10.1101).
            query_str = f'(SRC:PPR) AND (DOI:10.1101*) AND ({query})'
        else:
            # Published papers (these carry citation-count data).
            query_str = f'({query})'

        params = {
            "query": query_str,
            "resultType": "core",
            "pageSize": str(min(50, self.limit)),  # cap the candidate count
            "format": "json",
            "sortby": "CITED+desc",  # most-cited first
        }

        # Up to 3 attempts with exponential backoff.
        for attempt in range(3):
            try:
                response = requests.get(
                    self.EPMC_URL,
                    params=params,
                    timeout=self.timeout,
                    verify=False,  # SSL verification disabled
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                    }
                )
                response.raise_for_status()
                data = response.json()
                results = data.get("resultList", {}).get("result", [])

                # Debug: show hit count and citation counts of the top hits.
                if results:
                    print(f"EPMC检索到 {len(results)} 篇文献,按引用量排序")
                    for i, result in enumerate(results[:3]):
                        cited_count = result.get('citedByCount', 0)
                        title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
                        print(f" 文献 {i+1}: {title} (引用量: {cited_count})")

                return results

            except requests.exceptions.SSLError as e:
                print(f"EPMC SSL错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC SSL连接失败,返回空结果")
                    return []
                time.sleep(2 ** attempt)

            except requests.exceptions.RequestException as e:
                print(f"EPMC请求错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC请求失败,返回空结果")
                    return []
                time.sleep(2 ** attempt)

            except Exception as e:
                print(f"EPMC未知错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC未知错误,返回空结果")
                    return []
                time.sleep(2 ** attempt)

        return []

    def _get_biorxiv_detail(self, doi: str) -> Optional[Dict[str, Any]]:
        """Fetch bioRxiv details for *doi*; ``None`` if unavailable."""
        url = self.BIORXIV_URL.format(doi=urllib.parse.quote(doi))

        # Up to 3 attempts with exponential backoff.
        for attempt in range(3):
            try:
                # Lenient SSL handling plus a generous timeout.
                response = requests.get(
                    url,
                    timeout=self.timeout,
                    verify=False,  # SSL verification disabled to dodge SSL errors
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                    }
                )
                response.raise_for_status()
                data = response.json()

                # The API answers under either "collection" or "records".
                records = data.get("collection") or data.get("records") or []
                if not records:
                    return None

                # The last record is the most recent version of the preprint.
                latest_record = records[-1]
                if latest_record.get("server") and latest_record["server"].lower() != "biorxiv":
                    return None

                version = latest_record.get("version") or 1
                return {
                    "doi": latest_record.get("doi"),
                    "title": latest_record.get("title"),
                    "abstract": latest_record.get("abstract"),
                    "corresponding_author": latest_record.get("author_corresponding"),
                    "corresponding_institution": latest_record.get("author_corresponding_institution"),
                    "url": f"https://www.biorxiv.org/content/{latest_record['doi']}v{version}"
                }

            except requests.exceptions.SSLError as e:
                print(f"bioRxiv SSL错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # last attempt
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

            except requests.exceptions.RequestException as e:
                print(f"bioRxiv请求错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # last attempt
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

            except Exception as e:
                print(f"bioRxiv未知错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # last attempt
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

        return None
525
+
526
+
527
+ class DynamicAcademicSearcher:
528
+ """动态学术检索器,包含智能查询生成、动态处理和扩展功能"""
529
+
530
+ def __init__(self, base_searcher: AcademicSearcher = None, openalex_searcher: OpenAlexSearcher = None):
531
+ self.base_searcher = base_searcher
532
+ self.openalex_searcher = openalex_searcher
533
+ # 如果没有提供任何检索器,创建默认的
534
+ if not self.base_searcher and not self.openalex_searcher:
535
+ self.openalex_searcher = OpenAlexSearcher()
536
+
537
+ def search_with_dynamic_queries(self, paper: PaperInfo, num_reviewers: int = 8, years_after: int = None) -> tuple:
538
+ """使用动态查询进行单通道检索"""
539
+ # 1. 生成检索查询(只生成一个查询)
540
+ queries = self.generate_search_queries(paper, 1)
541
+ print("生成的检索查询:")
542
+ for i, query in enumerate(queries, 1):
543
+ print(f"查询 {i}: {query}")
544
+
545
+ # 记录查询生成日志
546
+ self._log_query_generation(paper, queries)
547
+
548
+ # 2. 根据可用的检索器选择检索策略
549
+ if self.openalex_searcher:
550
+ return self._search_with_openalex_single_channel(queries[0], num_reviewers, years_after)
551
+ elif self.base_searcher:
552
+ return self._search_with_epmc_single_channel(queries[0], num_reviewers)
553
+ else:
554
+ print("错误:没有可用的检索器")
555
+ return [], []
556
+
557
+ def _search_with_openalex_single_channel(self, query: str, num_reviewers: int, years_after: int = None) -> tuple:
558
+ """使用OpenAlex进行单通道检索"""
559
+ # 计算需要检索的文献数量(目标审稿人数量的5倍)
560
+ target_count = num_reviewers * 5
561
+ print(f"单通道检索:目标审稿人数量 {num_reviewers},检索文献数量 {target_count}")
562
+
563
+ # 执行检索
564
+ candidates = self._execute_openalex_query(query, sort_by_citations=False, years_after=years_after, target_count=target_count)
565
+
566
+ print(f"单通道检索完成,获得 {len(candidates)} 个候选审稿人")
567
+
568
+ # 返回单个通道的结果(为了保持兼容性,返回三个相同的通道)
569
+ return candidates, candidates, candidates
570
+
571
+ def _search_with_epmc_single_channel(self, query: str, num_reviewers: int) -> tuple:
572
+ """使用EPMC进行单通道检索"""
573
+ # 计算需要检索的文献数量(目标审稿人数量的5倍)
574
+ target_count = num_reviewers * 5
575
+ print(f"单通道检索:目标审稿人数量 {num_reviewers},检索文献数量 {target_count}")
576
+
577
+ # 执行检索
578
+ candidates = self._execute_single_query(query, search_preprints=True, target_count=target_count)
579
+
580
+ print(f"单通道检索完成,获得 {len(candidates)} 个候选审稿人")
581
+
582
+ # 返回单个通道的结果(为了保持兼容性,返回三个相同的通道)
583
+ return candidates, candidates, candidates
584
+
585
    def _search_with_openalex(self, queries: List[str], years_after: int = None) -> tuple:
        """Three-channel OpenAlex search.

        Runs up to three queries in parallel (padding with generic
        fallback queries when fewer than three are supplied), prints a
        summary of each channel, and returns the three candidate lists.

        NOTE(review): every channel calls _execute_openalex_query with
        sort_by_citations=False, yet channel 1 is labelled "高引用量"
        (high citation) — confirm whether channel 1 should sort by
        citations instead.
        """
        channel1_candidates = []  # channel labelled high-citation (see NOTE above)
        channel2_candidates = []  # relevance-sorted channel
        channel3_candidates = []  # relevance-sorted channel

        # Ensure at least 3 queries by padding with generic fallbacks.
        if len(queries) < 3:
            print(f"警告:查询数量不足({len(queries)}/3),将使用备用查询")
            queries = queries + ["cryo-em structure", "cryo-em structure analysis"] * (3 - len(queries))

        # Execute the queries in parallel.
        with ThreadPoolExecutor(max_workers=6) as executor:
            # Map each future back to its (query, channel label).
            future_to_query = {}

            if len(queries) >= 1:
                future1 = executor.submit(self._execute_openalex_query, queries[0], sort_by_citations=False, years_after=years_after)
                future_to_query[future1] = (queries[0], "高引用量")

            # Channel 2: second query, relevance-sorted.
            if len(queries) >= 2:
                future2 = executor.submit(self._execute_openalex_query, queries[1], sort_by_citations=False, years_after=years_after)
                future_to_query[future2] = (queries[1], "相关性2")

            # Channel 3: third query, relevance-sorted.
            if len(queries) >= 3:
                future3 = executor.submit(self._execute_openalex_query, queries[2], sort_by_citations=False, years_after=years_after)
                future_to_query[future3] = (queries[2], "相关性3")

            # Collect results as they complete; a failed query only loses
            # its own channel's results.
            for future in as_completed(future_to_query):
                query, search_type = future_to_query[future]
                try:
                    results = future.result()
                    if search_type == "高引用量":
                        channel1_candidates.extend(results)
                    elif search_type == "相关性2":
                        channel2_candidates.extend(results)
                    elif search_type == "相关性3":
                        channel3_candidates.extend(results)
                except Exception as e:
                    print(f"查询失败 {query} ({search_type}): {str(e)}")

        # Display channel 1 results.
        print(f"\n通道1(高引用量排序)的检索结果:")
        if channel1_candidates and len(queries) >= 1:
            print(f"查询: \"{queries[0]}\" (按引用量)")
            # Show the first 3 papers.
            for j, result in enumerate(channel1_candidates[:3], 1):
                cited_count = result.get('citedByCount', 0)
                title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
                print(f"文献{j}: {title} (引用量: {cited_count})")

        # Display channel 2 results.
        print(f"\n通道2(相关性排序)的检索结果:")
        if channel2_candidates and len(queries) >= 2:
            print(f"查询: \"{queries[1]}\" (按相关性)")
            # Show the first 3 papers.
            for j, result in enumerate(channel2_candidates[:3], 1):
                cited_count = result.get('citedByCount', 0)
                title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
                print(f"文献{j}: {title} (引用量: {cited_count})")

        # Display channel 3 results.
        print(f"\n通道3(相关性排序)的检索结果:")
        if channel3_candidates and len(queries) >= 3:
            print(f"查询: \"{queries[2]}\" (按相关性)")
            # Show the first 3 papers.
            for j, result in enumerate(channel3_candidates[:3], 1):
                cited_count = result.get('citedByCount', 0)
                title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
                print(f"文献{j}: {title} (引用量: {cited_count})")

        # Print the de-duplicated candidate reviewer list of each channel.
        self._print_candidate_reviewers("通道1(高引用量)", channel1_candidates)
        self._print_candidate_reviewers("通道2(相关性)", channel2_candidates)
        self._print_candidate_reviewers("通道3(高相关性)", channel3_candidates)

        print(f"\nOpenAlex检索完成 - 通道1: {len(channel1_candidates)} 篇, 通道2: {len(channel2_candidates)} 篇, 通道3: {len(channel3_candidates)} 篇")
        return channel1_candidates, channel2_candidates, channel3_candidates
+ def _print_candidate_reviewers(self, channel_name: str, candidates: List[Dict[str, Any]]):
666
+ """打印候选审稿人列表"""
667
+ if not candidates:
668
+ print(f"\n{channel_name}候选审稿人: 无")
669
+ return
670
+
671
+ print(f"\n{channel_name}候选审稿人:")
672
+ seen_reviewers = set()
673
+
674
+ for i, candidate in enumerate(candidates, 1):
675
+ corresponding_author = candidate.get('corresponding_author', '')
676
+ corresponding_institution = candidate.get('corresponding_institution', '')
677
+ title = candidate.get('title', '')
678
+
679
+ if corresponding_author:
680
+ # 创建审稿人标识符用于去重
681
+ author_lower = corresponding_author.lower()
682
+ institution_lower = (corresponding_institution or "未知机构").lower()
683
+ reviewer_key = f"{author_lower}_{institution_lower}"
684
+
685
+ if reviewer_key not in seen_reviewers:
686
+ seen_reviewers.add(reviewer_key)
687
+ title_short = title[:60] + '...' if len(title) > 60 else title
688
+ print(f" {len(seen_reviewers)}. {corresponding_author} ({corresponding_institution})")
689
+ print(f" 论文: {title_short}")
690
+
691
+ print(f" 总计: {len(seen_reviewers)} 位候选审稿人")
692
+
693
+ def _execute_openalex_query(self, query: str, sort_by_citations: bool = False, years_after: int = None, target_count: int = None) -> List[Dict[str, Any]]:
694
+ """执行单个OpenAlex查询"""
695
+ try:
696
+ return self.openalex_searcher.search(query, sort_by_citations=sort_by_citations, years_after=years_after, target_count=target_count)
697
+ except Exception as e:
698
+ print(f"OpenAlex查询执行失败: {str(e)}")
699
+ return []
700
+
701
+ def _search_with_epmc(self, queries: List[str]) -> tuple:
702
+ """使用EPMC进行双数据源检索"""
703
+ channel1_candidates = [] # 已发表论文(高引用量)
704
+ channel2_candidates = [] # 预印本(最新研��)
705
+
706
+ # 使用线程池并行执行查询
707
+ with ThreadPoolExecutor(max_workers=4) as executor:
708
+ # 提交所有查询任务
709
+ future_to_query = {}
710
+ for i, query in enumerate(queries):
711
+ # 通道1:检索已发表论文
712
+ future1 = executor.submit(self._execute_single_query, query, search_preprints=False)
713
+ future_to_query[future1] = (query, "已发表论文")
714
+
715
+ # 通道2:检索预印本
716
+ future2 = executor.submit(self._execute_single_query, query, search_preprints=True)
717
+ future_to_query[future2] = (query, "预印本")
718
+
719
+ # 收集结果
720
+ for future in as_completed(future_to_query):
721
+ query, data_source = future_to_query[future]
722
+ try:
723
+ results = future.result(timeout=30) # 30秒超时
724
+ if results:
725
+ print(f"查询 '{query}' ({data_source}) 完成,找到 {len(results)} 篇文献")
726
+ if data_source == "已发表论文":
727
+ channel1_candidates.extend(results)
728
+ else:
729
+ channel2_candidates.extend(results)
730
+ else:
731
+ print(f"查询 '{query}' ({data_source}) 未找到文献")
732
+ except Exception as e:
733
+ print(f"查询 '{query}' ({data_source}) 执行失败: {str(e)}")
734
+
735
+ # 3. 去重相同文献
736
+ unique_channel1 = {item['doi']: item for item in channel1_candidates if item.get('doi')}.values()
737
+ unique_channel2 = {item['doi']: item for item in channel2_candidates if item.get('doi')}.values()
738
+
739
+ print(f"双数据源检索完成:已发表论文 {len(list(unique_channel1))} 篇,预印本 {len(list(unique_channel2))} 篇")
740
+ return list(unique_channel1), list(unique_channel2)
741
+
742
+ def _execute_single_query(self, query: str, search_preprints: bool = True, target_count: int = None) -> List[Dict[str, Any]]:
743
+ """执行单个查询(用于并行处理)"""
744
+ data_source = "预印本" if search_preprints else "已发表论文"
745
+ print(f"开始执行查询: {query} ({data_source})")
746
+
747
+ # 动态查询处理
748
+ processed_queries = self._process_query_dynamically(query)
749
+
750
+ for processed_query in processed_queries:
751
+ print(f" 尝试查询: {processed_query}")
752
+ results = self.base_searcher.search(processed_query, search_preprints)
753
+
754
+ if results:
755
+ print(f" 找到 {len(results)} 篇文献")
756
+ return results
757
+ else:
758
+ print(f" 未找到文献,尝试扩展查询...")
759
+
760
+ print(f" 所有扩展查询都未找到文献")
761
+ return []
762
+
763
    def generate_search_queries(self, paper: PaperInfo, num_queries: int = 2) -> List[str]:
        """Generate retrieval queries for the paper via the LLM.

        Prompts the LLM for a single boolean query of the form
        "(subject) AND (components) AND (methods)", validates it with
        _validate_new_queries, and repeats the validated query to reach
        num_queries entries.  Falls back to _generate_backup_queries when
        the LLM call fails or validation rejects the query.

        Args:
            paper: Paper whose title/abstract/keywords seed the prompt.
            num_queries: Number of query strings to return.

        Returns:
            List of num_queries query strings (possibly repeated).
        """
        system_msg = "你是学术检索专家,擅长从论文中提取出多个维度的关键词"

        prompt = f"""
        请分析以下论文,提取关键信息并生成1个精准的检索查询:

        论文信息:
        标题: {paper.title}
        摘要: {paper.abstract}
        关键词: {', '.join(paper.keywords)}

        分析任务:
        请从论文中识别以下三个维度的关键信息:

        1. 一个研究主体 (Research Subject)
        - 论文研究的核心对象、分子、蛋白质、疾病等
        - 例如:Nav1.7、COVID-19、dopamine、insulin等

        2. 关键组分 (Key Components)
        - 与研究主体相关的亚单位、配体、抑制剂、调节因子等
        - 例如:β1亚单位、Protoxin-II、receptor、agonist等

        3. 研究方法 (Research Method)
        - 论文使用的核心技术、实验方法、分析手段等
        - 例如:Cryo-EM、CRISPR、NMR、patch-clamp等

        查询生成规则:
        - 使用布尔运算符AND和OR构建精确查询
        - 每个维度内使用OR连接同义词或相关术语
        - 不同维度间使用AND连接
        - 优先选择最核心、最特异的术语
        - 避免过于宽泛的通用词汇

        输出要求:
        请生成1个检索查询,格式如下:
        (研究主体) AND (关键组分1 OR 关键组分2) AND (研究方法1 OR 研究方法2)

        示例输出:
        (Nav1.7) AND (β1 OR Protoxin-II) AND (cryo-EM OR cryo-electron microscopy)
        """

        response = self._call_llm(prompt.strip(), system_msg)
        if not response:
            return self._generate_backup_queries(paper, num_queries)

        # Parse the (single) query returned by the LLM.
        query = response.strip()
        print(f"LLM原始返回的查询: {query}")

        # Validate the query format.
        validated_queries = self._validate_new_queries([query])
        print(f"验证后的查询数量: {len(validated_queries)}")

        # Fall back to backup queries when validation rejects the query.
        if len(validated_queries) == 0:
            print(f"查询验证失败,使用备用查询")
            backup_queries = self._generate_backup_queries(paper, num_queries)
            print(f"备用查询: {backup_queries}")
            return backup_queries

        # Return the validated query, repeated to satisfy num_queries.
        result_queries = validated_queries[:1]  # keep only the first query
        if num_queries > 1:
            # Several channels expect distinct queries; reuse the same one.
            result_queries = result_queries * num_queries
            print(f"重复使用查询以满足数量要求: {result_queries}")

        return result_queries
+ def _validate_new_queries(self, queries: List[str]) -> List[str]:
834
+ """验证新格式查询(单查询格式)"""
835
+ validated_queries = []
836
+
837
+ for query in queries:
838
+ # 基本格式检查
839
+ if not query or len(query.strip()) < 10:
840
+ print(f"查询太短,跳过: {query}")
841
+ continue
842
+
843
+ # 检查是否包含AND操作符(新格式要求)
844
+ if ' AND ' not in query:
845
+ print(f"查询缺少AND操作符,跳过: {query}")
846
+ continue
847
+
848
+ # 检查是否包含括号(新格式要求)
849
+ if '(' not in query or ')' not in query:
850
+ print(f"查询缺少括号,跳过: {query}")
851
+ continue
852
+
853
+ # 检查是否包含OR操作符(新格式要求)
854
+ if ' OR ' not in query:
855
+ print(f"查询缺少OR操作符,跳过: {query}")
856
+ continue
857
+
858
+ # 检查查询长度合理性
859
+ if len(query) > 200: # 查询过长
860
+ print(f"查询过长,跳过: {query}")
861
+ continue
862
+
863
+ # 检查是否包含生物学关键词分类
864
+ has_biology_keyword = False
865
+ for category, keywords in BIOLOGY_KEYWORDS.items():
866
+ if category.lower() in query.lower():
867
+ has_biology_keyword = True
868
+ break
869
+ for keyword in keywords:
870
+ if keyword.lower() in query.lower():
871
+ has_biology_keyword = True
872
+ break
873
+ if has_biology_keyword:
874
+ break
875
+
876
+ if not has_biology_keyword:
877
+ print(f"查询不包含生物学关键词分类,跳过: {query}")
878
+ continue
879
+
880
+ validated_queries.append(query.strip())
881
+ print(f"查询验证通过: {query}")
882
+
883
+ return validated_queries
884
+
885
+ def _validate_queries(self, queries: List[str]) -> List[str]:
886
+ """验证查询格式和质量"""
887
+ validated_queries = []
888
+
889
+ for query in queries:
890
+ # 基本格式检查
891
+ if not query or len(query.strip()) < 5:
892
+ print(f"查询太短,跳过: {query}")
893
+ continue
894
+
895
+ # 检查是否包含AND连接符
896
+ if ' AND ' not in query:
897
+ print(f"查询缺少AND连接符,跳过: {query}")
898
+ continue
899
+
900
+ # 检查是否只包含两个关键词(主要学科 AND 研究层面关键词)
901
+ parts = query.split(' AND ')
902
+ if len(parts) != 2:
903
+ print(f"查询格式不正确,跳过: {query}")
904
+ continue
905
+
906
+ # 检查每个部分是否有效
907
+ part1 = parts[0].strip()
908
+ part2 = parts[1].strip()
909
+
910
+ if not part1 or not part2:
911
+ print(f"查询包含空部分,跳过: {query}")
912
+ continue
913
+
914
+ if part1.upper() == 'AND' or part2.upper() == 'AND':
915
+ print(f"查询包含无效AND,跳过: {query}")
916
+ continue
917
+
918
+ # 检查是否包含生物学关键词分类
919
+ has_biology_keyword = False
920
+ for category, keywords in BIOLOGY_KEYWORDS.items():
921
+ if category.lower() in query.lower():
922
+ has_biology_keyword = True
923
+ break
924
+ for keyword in keywords:
925
+ if keyword.lower() in query.lower():
926
+ has_biology_keyword = True
927
+ break
928
+ if has_biology_keyword:
929
+ break
930
+
931
+ if not has_biology_keyword:
932
+ print(f"查询不包含生物学关键词分类,跳过: {query}")
933
+ continue
934
+
935
+ # 检查查询长度合理性
936
+ if len(query) > 100: # 查询过长
937
+ print(f"查询过长,跳过: {query}")
938
+ continue
939
+
940
+ validated_queries.append(query.strip())
941
+ print(f"查询验证通过: {query}")
942
+
943
+ return validated_queries
944
+
945
+ def _process_query_dynamically(self, query: str) -> List[str]:
946
+ """动态处理查询,生成多个变体"""
947
+ # 基础查询
948
+ queries = [query]
949
+
950
+ # 检查查询格式是否正确
951
+ if ' AND ' not in query:
952
+ return queries
953
+
954
+ # 按AND分割查询
955
+ parts = query.split(' AND ')
956
+ if len(parts) != 2:
957
+ return queries
958
+
959
+ # 清理每个部分
960
+ part1 = parts[0].strip()
961
+ part2 = parts[1].strip()
962
+
963
+ # 如果某个部分为空或只包含AND,跳过
964
+ if not part1 or not part2 or part1.upper() == 'AND' or part2.upper() == 'AND':
965
+ return queries
966
+
967
+ # 简化查询(只保留主要关键词)
968
+ part1_words = part1.split()
969
+ part2_words = part2.split()
970
+
971
+ if len(part1_words) > 1:
972
+ # 取第一个部分的主要关键词
973
+ simplified_part1 = part1_words[0]
974
+ queries.append(f"{simplified_part1} AND {part2}")
975
+
976
+ if len(part2_words) > 1:
977
+ # 取第二个部分的主要关键词
978
+ simplified_part2 = part2_words[0]
979
+ queries.append(f"{part1} AND {simplified_part2}")
980
+
981
+ # 单个关键词查询
982
+ queries.append(part1)
983
+ queries.append(part2)
984
+
985
+ return list(set(queries)) # 去重
986
+
987
+ def _generate_backup_queries(self, paper: PaperInfo, num_queries: int) -> List[str]:
988
+ """生成备用查询,使用新格式"""
989
+ queries = []
990
+
991
+ # 从论文标题和摘要中提取关键词
992
+ text = f"{paper.title} {paper.abstract}".lower()
993
+
994
+ # 常见技术关键词列表
995
+ tech_keywords = [
996
+ "cryo-em", "cryoem", "x-ray", "xray", "nmr", "crispr", "pcr", "western blot",
997
+ "immunofluorescence", "confocal", "flow cytometry", "mass spectrometry",
998
+ "chromatography", "electrophoresis", "microscopy", "spectroscopy"
999
+ ]
1000
+
1001
+ # 查找技术关键词
1002
+ found_tech_keyword = None
1003
+ for keyword in tech_keywords:
1004
+ if keyword in text:
1005
+ found_tech_keyword = keyword
1006
+ break
1007
+
1008
+ # 如果没有找到技术关键词,使用默认值
1009
+ if not found_tech_keyword:
1010
+ found_tech_keyword = "cryo-em"
1011
+
1012
+ # 查询1:纯子类关键词
1013
+ queries.append(found_tech_keyword)
1014
+
1015
+ # 查询2:子类关键词 + 子子类关键词
1016
+ queries.append(f"{found_tech_keyword} structure")
1017
+
1018
+ # 从标题中提取特定术语
1019
+ title_words = paper.title.split()
1020
+ specific_term = None
1021
+ for word in title_words:
1022
+ # 过滤掉常见词汇,寻找有意义的术语
1023
+ if (len(word) > 3 and
1024
+ word.lower() not in ['the', 'and', 'for', 'with', 'from', 'this', 'that',
1025
+ 'structures', 'human', 'channel', 'complex', 'with',
1026
+ 'auxiliary', 'subunits', 'animal', 'toxins', 'analysis',
1027
+ 'study', 'research', 'investigation', 'characterization']):
1028
+ specific_term = word
1029
+ break
1030
+
1031
+ if specific_term:
1032
+ # 查询3:子类关键词 + 子子类关键词 + 论文特定术语
1033
+ queries.append(f"{found_tech_keyword} structure {specific_term}")
1034
+ else:
1035
+ # 如果没有找到特定术语,使用第一个查询的变体
1036
+ queries.append(f"{found_tech_keyword} structure analysis")
1037
+
1038
+ # 确保总是返回所需数量的查询
1039
+ while len(queries) < num_queries:
1040
+ # 如果还需要更多查询,添加变体
1041
+ variant_num = len(queries) + 1
1042
+ queries.append(f"{found_tech_keyword} analysis")
1043
+
1044
+ print(f"备用查询生成完成,共 {len(queries)} 个查询")
1045
+ return queries[:num_queries]
1046
+
1047
+ def _infer_biology_keywords(self, paper: PaperInfo) -> tuple:
1048
+ """从论文内容推断最相关的生物学分类和关键词"""
1049
+ text = f"{paper.title} {paper.abstract} {' '.join(paper.keywords)}".lower()
1050
+
1051
+ best_category = None
1052
+ best_keyword = None
1053
+ max_matches = 0
1054
+
1055
+ for category, keywords in BIOLOGY_KEYWORDS.items():
1056
+ category_matches = 0
1057
+ best_keyword_in_category = None
1058
+
1059
+ # 检查类别名称匹配
1060
+ if category.lower() in text:
1061
+ category_matches += 2
1062
+
1063
+ # 检查关键词匹配
1064
+ for keyword in keywords:
1065
+ if keyword.lower() in text:
1066
+ category_matches += 1
1067
+ if not best_keyword_in_category:
1068
+ best_keyword_in_category = keyword
1069
+
1070
+ # ���新最佳匹配
1071
+ if category_matches > max_matches:
1072
+ max_matches = category_matches
1073
+ best_category = category
1074
+ best_keyword = best_keyword_in_category or keywords[0]
1075
+
1076
+ return best_category, best_keyword
1077
+
1078
+ def _log_query_generation(self, paper: PaperInfo, queries: List[str]):
1079
+ """记录查询生成日志"""
1080
+ log_info = {
1081
+ "paper_title": paper.title[:100] + "..." if len(paper.title) > 100 else paper.title,
1082
+ "paper_keywords": paper.keywords,
1083
+ "generated_queries": queries,
1084
+ "query_count": len(queries),
1085
+ "timestamp": time.time()
1086
+ }
1087
+ print(f"查询生成日志: {log_info}")
1088
+
1089
    def _call_llm(self, prompt: str, system_msg: str) -> Optional[str]:
        """Call the configured LLM provider (DashScope first, then OpenAI).

        Args:
            prompt: User-turn content.
            system_msg: System-turn content.

        Returns:
            The raw completion text, or None when no API key is
            configured, the provider returns an error status, or the
            call raises.  NOTE: a non-200 DashScope response prints the
            error and falls through, implicitly returning None.
        """
        try:
            if DASHSCOPE_API_KEY:
                # Imported lazily so the dependency is only needed when used.
                import dashscope
                dashscope.api_key = DASHSCOPE_API_KEY

                response = dashscope.Generation.call(
                    model="qwen-turbo-latest",
                    messages=[
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": prompt}
                    ],
                    timeout=30
                )
                if response.status_code == 200:
                    return response.output.text
                else:
                    print(f"DashScope API错误: {response.message}")

            elif OPENAI_API_KEY:
                # OpenAI is the fallback provider.
                from openai import OpenAI
                client = OpenAI(api_key=OPENAI_API_KEY)
                response = client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": prompt}
                    ],
                    timeout=30
                )
                return response.choices[0].message.content

            else:
                print("未配置API密钥")
                return None

        except Exception as e:
            # Any provider/network failure is reported and mapped to None.
            print(f"大模型调用错误: {str(e)}")
            return None
reviewer_recommendation/utils.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 工具函数模块
3
+ 提供错误处理、状态管理和通用工具函数
4
+ """
5
+
6
+ import time
7
+ import logging
8
+ from typing import Optional, Dict, Any
9
+ from functools import wraps
10
+
11
+ from .models import AppState, RecommendationResponse
12
+
13
+
14
+ # 配置日志
15
+ logging.basicConfig(
16
+ level=logging.INFO,
17
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
18
+ )
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def handle_api_errors(func):
    """Decorator that logs any exception from an API call, then re-raises it."""
    @wraps(func)
    def wrapped(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as exc:
            # Record the failure with the wrapped function's name, then propagate.
            logger.error(f"API调用错误 in {func.__name__}: {str(exc)}")
            raise
    return wrapped
def validate_paper_info(title: str, abstract: str, keywords: str) -> tuple[bool, str]:
    """Validate user-supplied paper metadata.

    Checks, in order: non-blank title, non-blank abstract, abstract of at
    least 50 characters, title of at least 10 characters.  The keywords
    argument is accepted for interface compatibility but not validated.

    Returns:
        (ok, message) -- message is "" when validation passes.
    """
    clean_title = (title or "").strip()
    clean_abstract = (abstract or "").strip()

    if not clean_title:
        return False, "论文标题不能为空"
    if not clean_abstract:
        return False, "论文摘要不能为空"
    if len(clean_abstract) < 50:
        return False, "论文摘要至少需要50个字符"
    if len(clean_title) < 10:
        return False, "论文标题至少需要10个字符"
    return True, ""
def validate_reviewer_count(count: int) -> tuple[bool, str]:
    """Check that the requested reviewer count lies in the allowed 1-10 range.

    Returns:
        (ok, message) -- message is "" when the count is valid.
    """
    if not 1 <= count <= 10:
        message = "推荐审稿人数量至少为1" if count < 1 else "推荐审稿人数量不能超过10"
        return False, message
    return True, ""
def format_error_message(error: Exception) -> str:
    """Map an exception to a user-facing error string.

    Well-known failure classes (timeout / API / JSON, matched on the
    lower-cased message, in that order) get friendly messages; everything
    else falls back to a generic message with the exception type.
    """
    message = str(error)
    lowered = message.lower()

    if "timeout" in lowered:
        return "请求超时,请稍后重试"
    if "api" in lowered:
        return "API调用失败,请检查网络连接"
    if "json" in lowered:
        return "数据解析错误,请重试"
    return f"系统错误 ({type(error).__name__}): {message}"
def create_error_response(error: Exception, search_time: float = 0.0) -> RecommendationResponse:
    """Build a failed RecommendationResponse carrying a formatted error message."""
    message = format_error_message(error)
    return RecommendationResponse(
        reviewers=[],
        search_time=search_time,
        total_candidates=0,
        success=False,
        error_message=message,
    )
def update_app_state(state: AppState, **kwargs) -> AppState:
    """Copy the given keyword values onto *state*, skipping unknown attributes.

    Only attributes that already exist on the state object are updated;
    the (mutated) state is returned for convenience.
    """
    for name, value in kwargs.items():
        if hasattr(state, name):
            setattr(state, name, value)
    return state
def log_operation(operation: str, **kwargs):
    """Emit an info-level structured log entry for *operation*.

    Any extra keyword arguments are merged into the logged record.
    """
    entry = {
        "operation": operation,
        "timestamp": time.time(),
        **kwargs
    }
    logger.info(f"操作日志: {entry}")
def sanitize_input(text: str) -> str:
    """Collapse whitespace and cap the input at 10 000 characters.

    Falsy input yields "".  Inputs longer than the cap are truncated and
    suffixed with "...".
    """
    if not text:
        return ""

    # Collapse every run of whitespace to a single space.
    normalized = " ".join(text.split())

    # Truncate overly long input, marking the cut with an ellipsis.
    return normalized if len(normalized) <= 10000 else normalized[:10000] + "..."
def extract_keywords(text: str) -> list[str]:
    """Split a comma-separated keyword string into a clean keyword list.

    Entries are stripped, keywords shorter than 2 characters dropped, and
    duplicates removed.

    Fix: the original de-duplicated with list(set(...)), which returned
    the keywords in arbitrary (hash-dependent) order; dict.fromkeys keeps
    first-seen order so the result is deterministic.

    Returns:
        Ordered list of unique keywords; [] for falsy input.
    """
    if not text:
        return []

    # Split on commas and strip surrounding whitespace, dropping blanks.
    keywords = [kw.strip() for kw in text.split(',') if kw.strip()]

    # Drop keywords too short to be meaningful.
    keywords = [kw for kw in keywords if len(kw) >= 2]

    # De-duplicate while preserving first-seen order (bug fix).
    return list(dict.fromkeys(keywords))
def format_search_progress(current: int, total: int, step: str) -> str:
    """Render a human-readable progress line, guarding against total == 0."""
    pct = current / total * 100 if total > 0 else 0
    return f"搜索进度: {current}/{total} ({pct:.1f}%) - {step}"
def validate_api_keys() -> tuple[bool, str]:
    """Check that at least one LLM API key is configured.

    Keys are read from environment variables only.  Security fix: the
    previous revision hard-coded a live DashScope key in source, which
    both leaked the secret and made the check always succeed; secrets
    must come from the environment (OPENAI_API_KEY / DASHSCOPE_API_KEY).

    Returns:
        (ok, message) -- ok is True when at least one key is set.
    """
    import os

    openai_key = os.getenv("OPENAI_API_KEY")
    dashscope_key = os.getenv("DASHSCOPE_API_KEY")

    if not openai_key and not dashscope_key:
        return False, "未配置任何API密钥"

    return True, "API密钥配置正常"