wujian123 commited on
Commit
9d225e3
·
1 Parent(s): 5080c72

Add reviewer_recommendation module

Browse files
reviewer_recommendation/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Reviewer recommendation system.

Automatically recommends suitable reviewers based on paper information.
"""

__version__ = "1.0.0"
__author__ = "AI Assistant"

# Re-export the package's main data models, searchers and the LLM engine so
# callers can import everything from the package root.
from .models import PaperInfo, Reviewer, RecommendationRequest, RecommendationResponse, AppState
from .searcher import AcademicSearcher, DynamicAcademicSearcher, OpenAlexSearcher
from .engine import LLMRecommendationEngine

# Explicit public API of the package.
__all__ = [
    "PaperInfo",
    "Reviewer",
    "RecommendationRequest",
    "RecommendationResponse",
    "AppState",
    "AcademicSearcher",
    "DynamicAcademicSearcher",
    "OpenAlexSearcher",
    "LLMRecommendationEngine"
]
reviewer_recommendation/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (666 Bytes). View file
 
reviewer_recommendation/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (692 Bytes). View file
 
reviewer_recommendation/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (697 Bytes). View file
 
reviewer_recommendation/__pycache__/engine.cpython-310.pyc ADDED
Binary file (11.9 kB). View file
 
reviewer_recommendation/__pycache__/engine.cpython-312.pyc ADDED
Binary file (14.1 kB). View file
 
reviewer_recommendation/__pycache__/models.cpython-310.pyc ADDED
Binary file (3.36 kB). View file
 
reviewer_recommendation/__pycache__/models.cpython-312.pyc ADDED
Binary file (4.2 kB). View file
 
reviewer_recommendation/__pycache__/models.cpython-313.pyc ADDED
Binary file (4.25 kB). View file
 
reviewer_recommendation/__pycache__/searcher.cpython-310.pyc ADDED
Binary file (28.1 kB). View file
 
reviewer_recommendation/__pycache__/searcher.cpython-312.pyc ADDED
Binary file (15.1 kB). View file
 
reviewer_recommendation/engine copy.py ADDED
@@ -0,0 +1,609 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Recommendation engine module.

Uses an LLM to analyze candidates and recommend suitable reviewers.
"""

import json
import os
import time
from typing import List, Dict, Any, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

from .models import PaperInfo, Reviewer


# Configuration.
# SECURITY: never commit a literal API key to source control — the previously
# hard-coded DashScope key was leaked in this file and must be revoked.
# Both keys are now read from the environment; empty string means "not configured".
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
20
+ class LLMRecommendationEngine:
21
+ """完全由大模型驱动的审稿人推荐引擎"""
22
+
23
+ def __init__(self):
24
+ pass
25
+
26
+ def analyze_candidates(self, paper: PaperInfo, candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
27
+ """分析候选文献,评估适合度"""
28
+ system_msg = "你是学术领域专家,擅长评估研究人员与特定论文的匹配度"
29
+ candidates_str = json.dumps(candidates, ensure_ascii=False, indent=2)
30
+
31
+ # 构建作者和机构信息
32
+ authors_info = ""
33
+ if paper.authors:
34
+ authors_info = f"作者: {', '.join(paper.authors)}"
35
+ if paper.affiliations:
36
+ authors_info += f"\n作者机构: {', '.join(paper.affiliations)}"
37
+
38
+ prompt = f"""
39
+ 请分析以下候选文献的作者是否适合评审目标论文,并按适合度排序:
40
+
41
+ 目标论文:
42
+ 标题: {paper.title}
43
+ 摘要: {paper.abstract}
44
+ 关键词: {', '.join(paper.keywords)}
45
+ {authors_info}
46
+
47
+ 候选文献列表:
48
+ {candidates_str}
49
+
50
+ 分析要求:
51
+ 1. 为每位通讯作者评估适合度,给出0-1的相关性评分
52
+ 2. 提取作者的专业领域和研究方向
53
+ 3. 说明推荐理由(中文 重点介绍作者本人的研究方向)
54
+ 4. 排除重复作者
55
+ 5. 严格排除与目标论文作者相同或来自同一机构的人员
56
+ 6. 按适合度从高到低排序,优先考虑引用量和知名程度
57
+ 7. 必须返回至少5-10个审稿人,确保有足够的候选人数
58
+ 7. 如果相关性评分全部低于0.6 则重新再进行一次分析
59
+ 8. 估算作者的学术论文引用总量(基于机构声誉和研究领域)
60
+
61
+ 请返回JSON数组,每个元素包含:
62
+ - name: 作者姓名
63
+ - affiliation: 单位
64
+ - email: 邮箱(从数据中提取)
65
+ - reason: 推荐理由(中文 作者本人的研究方向与目标论文的适配度)
66
+ - relevance_score: 相关性评分(0-1)
67
+ - expertise_areas: 专业领域列表
68
+ - citation_count: 估算的学术论文引用总量
69
+
70
+ 确保输出是纯JSON,不要包含其他内容
71
+ """
72
+
73
+ response = self._call_llm_with_retry(prompt.strip(), system_msg, json_output=True)
74
+ if not response:
75
+ return []
76
+
77
+ # 清理和解析JSON响应
78
+ cleaned_response = self._clean_json_response(response)
79
+ if not cleaned_response:
80
+ return []
81
+
82
+ try:
83
+ result = json.loads(cleaned_response)
84
+ if isinstance(result, list):
85
+ # 并行为每个审稿人添加引用量
86
+ enhanced_result = self._add_citations_parallel(result)
87
+
88
+ # 按引用量和相关性评分综合排序
89
+ def sort_key(x):
90
+ citation_count = x.get('citation_count', '0')
91
+ if isinstance(citation_count, str) and citation_count == "未查询到":
92
+ citation_score = 0
93
+ else:
94
+ try:
95
+ citation_score = int(citation_count) / 10000 * 0.6
96
+ except (ValueError, TypeError):
97
+ citation_score = 0
98
+ relevance_score = x.get('relevance_score', 0) * 0.4
99
+ return citation_score + relevance_score
100
+
101
+ enhanced_result.sort(key=sort_key, reverse=True)
102
+
103
+ # 过滤掉相同作者和机构
104
+ filtered_result = self._filter_reviewers(enhanced_result, paper)
105
+ return filtered_result
106
+ else:
107
+ print("大模型返回的不是JSON数组")
108
+ return self._generate_fallback_reviewers(candidates, paper)
109
+ except json.JSONDecodeError:
110
+ print("无法解析大模型返回的JSON")
111
+ return self._generate_fallback_reviewers(candidates, paper)
112
+
113
+ def _clean_json_response(self, response: str) -> str:
114
+ """清理大模型返回的JSON响应"""
115
+ if not response:
116
+ return ""
117
+
118
+ # 移除markdown代码块
119
+ if "```json" in response:
120
+ start = response.find("```json") + 7
121
+ end = response.find("```", start)
122
+ if end != -1:
123
+ response = response[start:end]
124
+ elif "```" in response:
125
+ start = response.find("```") + 3
126
+ end = response.find("```", start)
127
+ if end != -1:
128
+ response = response[start:end]
129
+
130
+ # 清理空白字符
131
+ response = response.strip()
132
+
133
+ # 处理多个独立JSON对象的情况
134
+ if response.count('{') > 1:
135
+ # 尝试将多个JSON对象合并为数组
136
+ try:
137
+ # 分割多个JSON对象
138
+ objects = []
139
+ brace_count = 0
140
+ current_obj = ""
141
+
142
+ for char in response:
143
+ current_obj += char
144
+ if char == '{':
145
+ brace_count += 1
146
+ elif char == '}':
147
+ brace_count -= 1
148
+ if brace_count == 0:
149
+ # 一个完整的JSON对象
150
+ obj_str = current_obj.strip()
151
+ if obj_str.startswith('{') and obj_str.endswith('}'):
152
+ try:
153
+ json.loads(obj_str) # 验证JSON格式
154
+ objects.append(obj_str)
155
+ except:
156
+ pass
157
+ current_obj = ""
158
+
159
+ if len(objects) > 1:
160
+ # 合并为JSON数组
161
+ return "[" + ",".join(objects) + "]"
162
+ elif len(objects) == 1:
163
+ return "[" + objects[0] + "]"
164
+ except:
165
+ pass
166
+
167
+ return response
168
+
169
+ def _filter_reviewers(self, reviewers: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
170
+ """过滤掉与论文作者相同或来自同一机构的审稿人"""
171
+ filtered_reviewers = []
172
+
173
+ # 获取论文作者和机构的标准化列表
174
+ paper_authors = [author.strip().lower() for author in paper.authors if author.strip()]
175
+ paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]
176
+
177
+ print(f"论文作者: {paper.authors}")
178
+ print(f"论文机构: {paper.affiliations}")
179
+ print(f"开始过滤 {len(reviewers)} 个审稿人...")
180
+
181
+ for reviewer in reviewers:
182
+ reviewer_name = reviewer.get("name", "").strip().lower()
183
+ reviewer_affiliation = reviewer.get("affiliation", "").strip().lower()
184
+
185
+ print(f"检查审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')})")
186
+
187
+ # 检查是否与论文作者相同
188
+ is_same_author = any(self._similar_names(reviewer_name, author) for author in paper_authors)
189
+
190
+ # 检查是否来自同一机构
191
+ is_same_institution = any(self._similar_institutions(reviewer_affiliation, aff) for aff in paper_affiliations)
192
+
193
+ # 如果既不是相同作者也不是同一机构,则保留
194
+ if not is_same_author and not is_same_institution:
195
+ filtered_reviewers.append(reviewer)
196
+ print(f"保留审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')})")
197
+ else:
198
+ reason = "作者相同" if is_same_author else "机构相同"
199
+ print(f"过滤掉审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')}) - {reason}")
200
+
201
+ print(f"过滤完成,保留 {len(filtered_reviewers)} 个审稿人")
202
+ return filtered_reviewers
203
+
204
+ def _similar_names(self, name1: str, name2: str) -> bool:
205
+ """检查两个姓名是否相似(可能是同一人)"""
206
+ # 简单的相似性检查
207
+ if name1 == name2:
208
+ print(f"姓名完全匹配: '{name1}' == '{name2}'")
209
+ return True
210
+
211
+ # 检查是否包含相同的姓氏
212
+ name1_parts = name1.split()
213
+ name2_parts = name2.split()
214
+
215
+ if name1_parts and name2_parts:
216
+ # 检查姓氏是否相同
217
+ if name1_parts[0] == name2_parts[0]:
218
+ print(f"姓氏匹配: '{name1}' vs '{name2}' - 共同姓氏: {name1_parts[0]}")
219
+ return True
220
+
221
+ return False
222
+
223
+ def _similar_institutions(self, inst1: str, inst2: str) -> bool:
224
+ """检查两个机构是否相似(可能是同一机构的不同表述)"""
225
+ if inst1 == inst2:
226
+ return True
227
+
228
+ # 过滤掉通用词汇,只保留有意义的机构名称关键词
229
+ def filter_common_words(words):
230
+ common_words = {
231
+ 'university', 'college', 'institute', 'department', 'school',
232
+ 'center', 'centre', 'laboratory', 'lab', 'of', 'the', 'and',
233
+ 'at', 'in', 'for', 'medical', 'medicine', 'science', 'technology'
234
+ }
235
+ return {word for word in words if word not in common_words and len(word) > 2}
236
+
237
+ # 获取有意义的关键词
238
+ inst1_words = filter_common_words(set(inst1.lower().split()))
239
+ inst2_words = filter_common_words(set(inst2.lower().split()))
240
+
241
+ # 如果过滤后没有关键词,使用原始词汇但提高阈值
242
+ if not inst1_words or not inst2_words:
243
+ inst1_words = set(inst1.lower().split())
244
+ inst2_words = set(inst2.lower().split())
245
+ # 提高阈值到80%,减少误判
246
+ threshold = 0.8
247
+ else:
248
+ # 使用有意义关键词,阈值可以相对宽松
249
+ threshold = 0.6
250
+
251
+ # 计算共同词汇比例
252
+ common_words = inst1_words.intersection(inst2_words)
253
+ if not common_words:
254
+ return False
255
+
256
+ similarity_ratio = len(common_words) / min(len(inst1_words), len(inst2_words))
257
+
258
+ # 添加调试日志
259
+ if similarity_ratio >= threshold:
260
+ print(f"机构匹配: '{inst1}' vs '{inst2}' - 相似度: {similarity_ratio:.2f}, 共同词汇: {common_words}")
261
+
262
+ return similarity_ratio >= threshold
263
+
264
+ def _generate_fallback_reviewers(self, candidates: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
265
+ """当LLM解析失败时,生成基础推荐"""
266
+ fallback_reviewers = []
267
+
268
+ for candidate in candidates[:20]: # 取前20个候选
269
+ author = candidate.get("corresponding_author")
270
+ institution = candidate.get("corresponding_institution")
271
+
272
+ if author and author not in [r.get("name") for r in fallback_reviewers]:
273
+ # 检查是否与论文作者或机构相同
274
+ author_lower = author.strip().lower()
275
+ institution_lower = (institution or "").strip().lower()
276
+
277
+ paper_authors = [a.strip().lower() for a in paper.authors if a.strip()]
278
+ paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]
279
+
280
+ is_same_author = any(self._similar_names(author_lower, pa) for pa in paper_authors)
281
+ is_same_institution = any(self._similar_institutions(institution_lower, pa) for pa in paper_affiliations)
282
+
283
+ if not is_same_author and not is_same_institution:
284
+ # 获取真实引用量
285
+ citation_count = self._get_real_citation_count(author, institution or "未知单位")
286
+
287
+ fallback_reviewers.append({
288
+ "name": author,
289
+ "affiliation": institution or "未知单位",
290
+ "email": "未知邮箱",
291
+ "reason": "基于文献相关性自动推荐",
292
+ "relevance_score": 0.7,
293
+ "expertise_areas": ["相关研究领域"],
294
+ "citation_count": citation_count
295
+ })
296
+
297
+ return fallback_reviewers
298
+
299
+ def _call_llm_with_retry(self, prompt: str, system_msg: str, json_output: bool = False, max_retries: int = 3) -> Any:
300
+ """带重试机制的LLM调用"""
301
+ for attempt in range(max_retries):
302
+ try:
303
+ if DASHSCOPE_API_KEY:
304
+ import dashscope
305
+ dashscope.api_key = DASHSCOPE_API_KEY
306
+
307
+ # 设置更长的超时时间和更好的错误处理
308
+ try:
309
+ response = dashscope.Generation.call(
310
+ model="qwen-turbo", # 使用更稳定的模型
311
+ messages=[
312
+ {"role": "system", "content": system_msg},
313
+ {"role": "user", "content": prompt}
314
+ ],
315
+ result_format="json" if json_output else "text",
316
+ timeout=60 # 增加超时时间
317
+ )
318
+ if response.status_code == 200:
319
+ return response.output.text
320
+ else:
321
+ print(f"DashScope API错误: {response.message}")
322
+
323
+ except Exception as api_error:
324
+ print(f"DashScope API调用异常: {str(api_error)}")
325
+ if "SSL" in str(api_error) or "EOF" in str(api_error):
326
+ print("检测到SSL连接问题,尝试使用备用方案")
327
+ # 可以在这里添加备用API调用
328
+
329
+ elif OPENAI_API_KEY:
330
+ from openai import OpenAI
331
+ client = OpenAI(api_key=OPENAI_API_KEY)
332
+ response = client.chat.completions.create(
333
+ model="gpt-3.5-turbo", # 使用更稳定的模型
334
+ messages=[
335
+ {"role": "system", "content": system_msg},
336
+ {"role": "user", "content": prompt}
337
+ ],
338
+ response_format={"type": "json_object"} if json_output else None,
339
+ timeout=60
340
+ )
341
+ return response.choices[0].message.content
342
+
343
+ else:
344
+ print("未配置API密钥,使用备用方案")
345
+ return None
346
+
347
+ except Exception as e:
348
+ print(f"第{attempt + 1}次调用失败: {str(e)}")
349
+ if attempt < max_retries - 1:
350
+ print(f"等待 {2 ** attempt} 秒后重试...")
351
+ time.sleep(2 ** attempt) # 指数退避
352
+ else:
353
+ print(f"所有重试都失败了,将使用备用推荐方案")
354
+ return None
355
+
356
+ def _get_real_citation_count(self, name: str, affiliation: str) -> str:
357
+ """获取作者的真实学术论文引用总量"""
358
+ try:
359
+ # 首先尝试OpenAlex API
360
+ citation_count = self._get_citation_from_openalex(name, affiliation)
361
+ if citation_count > 0:
362
+ return str(citation_count)
363
+
364
+ # 备用方案:Semantic Scholar API
365
+ citation_count = self._get_citation_from_semantic_scholar(name, affiliation)
366
+ if citation_count > 0:
367
+ return str(citation_count)
368
+
369
+ # 如果没有找到真实数据,返回"未查询到"
370
+ print(f"未找到 {name} 的引用量数据")
371
+ return "未查询到"
372
+
373
+ except Exception as e:
374
+ print(f"获取引用量失败: {str(e)}")
375
+ return "未查询到"
376
+
377
+ def _add_citations_parallel(self, reviewers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
378
+ """并行为审稿人添加引用量"""
379
+ print(f"开始并行获取 {len(reviewers)} 个审稿人的引用量...")
380
+
381
+ enhanced_reviewers = []
382
+
383
+ # 使用线程池并行获取引用量
384
+ with ThreadPoolExecutor(max_workers=5) as executor:
385
+ # 提交所有引用量获取任务
386
+ future_to_reviewer = {}
387
+ for reviewer in reviewers:
388
+ name = reviewer.get('name', '')
389
+ affiliation = reviewer.get('affiliation', '')
390
+ future = executor.submit(self._get_real_citation_count, name, affiliation)
391
+ future_to_reviewer[future] = reviewer
392
+
393
+ # 收集结果
394
+ for future in as_completed(future_to_reviewer):
395
+ reviewer = future_to_reviewer[future]
396
+ try:
397
+ citation_count = future.result(timeout=15) # 15秒超时
398
+ reviewer['citation_count'] = citation_count
399
+ enhanced_reviewers.append(reviewer)
400
+ print(f"获取引用量完成: {reviewer.get('name')} - {citation_count}")
401
+ except Exception as e:
402
+ print(f"获取引用量失败: {reviewer.get('name')} - {str(e)}")
403
+ reviewer['citation_count'] = "未查询到"
404
+ enhanced_reviewers.append(reviewer)
405
+
406
+ print(f"并行引用量获取完成,处理了 {len(enhanced_reviewers)} 个审稿人")
407
+ return enhanced_reviewers
408
+
409
+ def _get_citation_from_openalex(self, name: str, affiliation: str) -> int:
410
+ """从OpenAlex API获取作者引用量"""
411
+ try:
412
+ import requests
413
+ import urllib.parse
414
+
415
+ # 生成多种查询变体
416
+ name_variants = self._generate_name_variants(name)
417
+
418
+ for variant in name_variants:
419
+ # 简化查询,只使用姓名
420
+ query = f'display_name:"{variant}"'
421
+ print(f"OpenAlex查询: {query}")
422
+
423
+ # OpenAlex API请求
424
+ url = "https://api.openalex.org/authors"
425
+ params = {
426
+ 'search': query,
427
+ 'per-page': 5, # 增加结果数量
428
+ 'select': 'id,display_name,cited_by_count,affiliations'
429
+ }
430
+
431
+ response = requests.get(url, params=params, timeout=15)
432
+ response.raise_for_status()
433
+
434
+ data = response.json()
435
+ if data.get('results'):
436
+ # 尝试匹配最佳结果
437
+ best_match = self._find_best_author_match(data['results'], name, affiliation)
438
+ if best_match:
439
+ cited_by_count = best_match.get('cited_by_count', 0)
440
+ print(f"OpenAlex API: {name} 引用量: {cited_by_count}")
441
+ return cited_by_count
442
+
443
+ print(f"OpenAlex API: 未找到 {variant} 的数据")
444
+
445
+ return 0
446
+
447
+ except Exception as e:
448
+ print(f"OpenAlex API调用失败: {str(e)}")
449
+ return 0
450
+
451
+ def _get_citation_from_semantic_scholar(self, name: str, affiliation: str) -> int:
452
+ """从Semantic Scholar API获取作者引用量"""
453
+ try:
454
+ import requests
455
+ import urllib.parse
456
+
457
+ # 生成多种查询变体
458
+ name_variants = self._generate_name_variants(name)
459
+
460
+ for variant in name_variants:
461
+ # 简化查询,只使用姓名
462
+ query = variant
463
+ print(f"Semantic Scholar查询: {query}")
464
+
465
+ # Semantic Scholar API请求
466
+ url = "https://api.semanticscholar.org/graph/v1/author/search"
467
+ params = {
468
+ 'query': query,
469
+ 'limit': 5, # 增加结果数量
470
+ 'fields': 'authorId,name,citationCount,affiliations'
471
+ }
472
+
473
+ headers = {
474
+ 'User-Agent': 'Academic-Reviewer-System/1.0'
475
+ }
476
+
477
+ response = requests.get(url, params=params, headers=headers, timeout=15)
478
+ response.raise_for_status()
479
+
480
+ data = response.json()
481
+ if data.get('data'):
482
+ # 尝试匹配最佳结果
483
+ best_match = self._find_best_semantic_author_match(data['data'], name, affiliation)
484
+ if best_match:
485
+ citation_count = best_match.get('citationCount', 0)
486
+ print(f"Semantic Scholar API: {name} 引用量: {citation_count}")
487
+ return citation_count
488
+
489
+ print(f"Semantic Scholar API: 未找到 {variant} 的数据")
490
+
491
+ return 0
492
+
493
+ except Exception as e:
494
+ print(f"Semantic Scholar API调用失败: {str(e)}")
495
+ return 0
496
+
497
+ def _generate_name_variants(self, name: str) -> List[str]:
498
+ """生成姓名的多种变体"""
499
+ variants = [name] # 原始姓名
500
+
501
+ # 如果包含中间名,尝试不同的组合
502
+ name_parts = name.split()
503
+ if len(name_parts) >= 2:
504
+ # 只使用姓和名
505
+ variants.append(f"{name_parts[0]} {name_parts[-1]}")
506
+
507
+ # 如果有多于2个部分,尝试不同的组合
508
+ if len(name_parts) == 3:
509
+ variants.append(f"{name_parts[0]} {name_parts[1]}")
510
+ variants.append(f"{name_parts[1]} {name_parts[2]}")
511
+ elif len(name_parts) > 3:
512
+ # 对于更复杂的姓名,尝试简化
513
+ variants.append(f"{name_parts[0]} {name_parts[1]}")
514
+ variants.append(f"{name_parts[0]} {name_parts[-1]}")
515
+
516
+ # 去重并保持顺序
517
+ seen = set()
518
+ unique_variants = []
519
+ for variant in variants:
520
+ if variant not in seen:
521
+ seen.add(variant)
522
+ unique_variants.append(variant)
523
+
524
+ return unique_variants
525
+
526
+ def _find_best_author_match(self, authors: List[Dict], target_name: str, target_affiliation: str) -> Optional[Dict]:
527
+ """在OpenAlex结果中找到最佳匹配的作者"""
528
+ if not authors:
529
+ return None
530
+
531
+ # 如果只有一个结果,直接返回
532
+ if len(authors) == 1:
533
+ return authors[0]
534
+
535
+ # 计算每个作者的匹配分数
536
+ best_match = None
537
+ best_score = 0
538
+
539
+ for author in authors:
540
+ score = 0
541
+ author_name = author.get('display_name', '').lower()
542
+ target_name_lower = target_name.lower()
543
+
544
+ # 姓名匹配分数
545
+ if target_name_lower in author_name or author_name in target_name_lower:
546
+ score += 10
547
+
548
+ # 检查机构匹配
549
+ affiliations = author.get('affiliations', [])
550
+ if affiliations and target_affiliation and target_affiliation != "未知单位":
551
+ for aff in affiliations:
552
+ aff_name = aff.get('display_name', '').lower()
553
+ if target_affiliation.lower() in aff_name:
554
+ score += 5
555
+ break
556
+
557
+ # 引用量作为权重
558
+ citation_count = author.get('cited_by_count', 0)
559
+ if citation_count > 0:
560
+ score += 1
561
+
562
+ if score > best_score:
563
+ best_score = score
564
+ best_match = author
565
+
566
+ return best_match if best_score > 0 else authors[0]
567
+
568
+ def _find_best_semantic_author_match(self, authors: List[Dict], target_name: str, target_affiliation: str) -> Optional[Dict]:
569
+ """在Semantic Scholar结果中找到最佳匹配的作者"""
570
+ if not authors:
571
+ return None
572
+
573
+ # 如果只有一个结果,直接返回
574
+ if len(authors) == 1:
575
+ return authors[0]
576
+
577
+ # 计算每个作者的匹配分数
578
+ best_match = None
579
+ best_score = 0
580
+
581
+ for author in authors:
582
+ score = 0
583
+ author_name = author.get('name', '').lower()
584
+ target_name_lower = target_name.lower()
585
+
586
+ # 姓名匹配分数
587
+ if target_name_lower in author_name or author_name in target_name_lower:
588
+ score += 10
589
+
590
+ # 检查机构匹配
591
+ affiliations = author.get('affiliations', [])
592
+ if affiliations and target_affiliation and target_affiliation != "未知单位":
593
+ for aff in affiliations:
594
+ aff_name = aff.get('name', '').lower()
595
+ if target_affiliation.lower() in aff_name:
596
+ score += 5
597
+ break
598
+
599
+ # 引用量作为权重
600
+ citation_count = author.get('citationCount', 0)
601
+ if citation_count > 0:
602
+ score += 1
603
+
604
+ if score > best_score:
605
+ best_score = score
606
+ best_match = author
607
+
608
+ return best_match if best_score > 0 else authors[0]
609
+
reviewer_recommendation/engine.py ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Recommendation engine module.

Uses an LLM to analyze candidates and recommend suitable reviewers.
"""

import json
import os
import time
from typing import List, Dict, Any, Optional

from .models import PaperInfo, Reviewer


# Configuration.
# SECURITY: never commit a literal API key to source control — the previously
# hard-coded DashScope key was leaked in this file and must be revoked.
# Both keys are now read from the environment; empty string means "not configured".
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
19
class LLMRecommendationEngine:
    """Reviewer recommendation engine driven entirely by an LLM.

    Pipeline: pre-filter candidates for conflicts of interest and duplicates,
    ask the LLM to rate each remaining candidate's relevance to the paper,
    and return the reviewers sorted by relevance score.
    """

    def __init__(self):
        # Stateless engine; API keys are module-level configuration.
        pass

    def analyze_candidates(self, paper: PaperInfo, candidates: List[Dict[str, Any]], num_reviewers: int = 8) -> List[Dict[str, Any]]:
        """Analyze candidate reviewers with the unified recommendation strategy."""
        print(f"候选审稿人: {len(candidates)} 人")

        # Delegate to the single unified prompt-based analysis.
        return self._analyze_candidates_unified(paper, candidates, num_reviewers)

    def _analyze_candidates_unified(self, paper: PaperInfo, candidates: List[Dict[str, Any]], num_reviewers: int = 8) -> List[Dict[str, Any]]:
        """Filter candidates, prompt the LLM once, and return ranked reviewers."""
        if not candidates:
            return []

        # The requested reviewer count comes from the frontend.
        print(f"要求返回 {num_reviewers} 个审稿人")

        print(f"开始过滤全部 {len(candidates)} 个候选审稿人...")
        filtered_candidates = self._filter_all_candidates(candidates, paper)
        print(f"过滤完成,保留 {len(filtered_candidates)} 个候选审稿人")

        if not filtered_candidates:
            print("过滤后没有候选审稿人,返回空列表")
            return []

        system_msg = "你是学术领域专家,擅长评估研究人员与特定论文的相关性"

        # Keep only the fields the LLM needs, to shrink the prompt.
        simplified_candidates = [
            {
                "author": candidate.get("corresponding_author", ""),
                "institution": candidate.get("corresponding_institution", ""),
                "title": candidate.get("title", "")
            }
            for candidate in filtered_candidates
        ]

        # Serialize the filtered candidate list for the prompt.
        candidates_str = json.dumps(simplified_candidates, ensure_ascii=False, indent=2)
        print(f"过滤后的候选审稿人列表: {candidates_str}")

        prompt = f"""
你是学术领域专家,擅长评估研究人员与特定论文的相关性

请分析以下候选审稿人是否适合评审目标论文,评估相关性:

目标论文信息:
标题: {paper.title}
摘要: {paper.abstract}
作者: {', '.join(paper.authors)}
机构: {', '.join(paper.affiliations)}


候选审稿人列表:
{candidates_str}

分析要求:
1. 为每位审稿人评估与目标论文的相关性,给出0-1的相关性评分
2. 提取审稿人的专业领域和研究方向
3. 按relevance_score从高到低排序(desc)
4. 排除与目标论文作者为合作关系的审稿人
5. 必须返回至少{num_reviewers}个审稿人

请返回JSON数组,每个元素包含:
- name: 作者姓名
- affiliation: 单位
- email: 邮箱(根据作者姓名和单位邮箱后缀构建)
- reason: 推荐理由(中文 只介绍作者本人的研究方向与目标论文的适配度)
- relevance_score: 最终评分(0-1)
- expertise_areas: 专业领域列表

确保输出是纯JSON,不要包含其他内容
"""

        response = self._call_llm_with_retry(prompt.strip(), system_msg, json_output=True)
        if not response:
            return []

        # Clean and parse the JSON response.
        cleaned_response = self._clean_json_response(response)
        if not cleaned_response:
            return []

        try:
            result = json.loads(cleaned_response)
            if isinstance(result, list):
                # Sort by the LLM score, coercing to float defensively.
                def get_score(x):
                    score = x.get('relevance_score', 0)
                    try:
                        return float(score) if score is not None else 0.0
                    except (ValueError, TypeError):
                        return 0.0

                result.sort(key=get_score, reverse=True)

                # Normalize None affiliation/email so downstream code is safe.
                for reviewer in result:
                    if reviewer.get("affiliation") is None:
                        reviewer["affiliation"] = "未知单位"
                    if reviewer.get("email") is None:
                        reviewer["email"] = "unknown@example.com"

                # Candidates were already conflict-filtered before prompting.
                print(f"统一分析完成,推荐 {len(result)} 个审稿人")
                return result
            else:
                print("大模型返回的不是JSON数组")
                return []
        except json.JSONDecodeError:
            print("无法解析大模型返回的JSON")
            return []

    def _clean_json_response(self, response: str) -> str:
        """Normalize an LLM reply into parseable JSON text.

        Strips markdown code fences and, when the reply contains several
        stand-alone JSON objects, merges them into a single JSON array.
        """
        if not response:
            return ""

        # Remove markdown code fences (``` / ```json).
        if "```json" in response:
            start = response.find("```json") + 7
            end = response.find("```", start)
            if end != -1:
                response = response[start:end]
        elif "```" in response:
            start = response.find("```") + 3
            end = response.find("```", start)
            if end != -1:
                response = response[start:end]

        response = response.strip()

        # Handle replies consisting of multiple independent JSON objects.
        if response.count('{') > 1:
            try:
                # Split the text into balanced-brace object candidates.
                objects = []
                brace_count = 0
                current_obj = ""

                for char in response:
                    current_obj += char
                    if char == '{':
                        brace_count += 1
                    elif char == '}':
                        brace_count -= 1
                        if brace_count == 0:
                            # A complete top-level object; keep it only if valid JSON.
                            obj_str = current_obj.strip()
                            if obj_str.startswith('{') and obj_str.endswith('}'):
                                try:
                                    json.loads(obj_str)  # validate JSON
                                    objects.append(obj_str)
                                except json.JSONDecodeError:
                                    # FIX: was a bare except — only JSON errors are expected here.
                                    pass
                            current_obj = ""

                if len(objects) > 1:
                    # Merge into a JSON array.
                    return "[" + ",".join(objects) + "]"
                elif len(objects) == 1:
                    return "[" + objects[0] + "]"
            except Exception:
                # FIX: was a bare except — best-effort merge; fall back to raw text.
                pass

        return response

    def _filter_all_candidates(self, candidates: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
        """Remove conflicted (same author/institution) and duplicate candidates."""
        filtered_candidates = []
        seen_reviewers = set()  # de-duplication keys: "name_affiliation"

        # Normalized (lowercased, stripped) author/affiliation lists.
        paper_authors = [author.strip().lower() for author in paper.authors if author.strip()]
        paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]

        print(f"论文作者: {paper.authors}")
        print(f"论文机构: {paper.affiliations}")

        for candidate in candidates:
            # Only the corresponding author/institution fields matter here.
            author = candidate.get('corresponding_author', '')
            institution = candidate.get('corresponding_institution', '')

            if not author:
                continue

            reviewer_name = author.strip().lower()
            reviewer_affiliation = (institution or "").strip().lower()

            # Conflict check 1: same person as a paper author?
            # (loop variable renamed so it does not shadow the candidate's `author`)
            is_same_author = any(self._similar_names(reviewer_name, pa) for pa in paper_authors)

            # Conflict check 2: same institution as the paper?
            is_same_institution = any(self._similar_institutions(reviewer_affiliation, aff) for aff in paper_affiliations)

            # Conflict-free candidates still go through de-duplication.
            if not is_same_author and not is_same_institution:
                reviewer_key = f"{reviewer_name}_{reviewer_affiliation}"

                if reviewer_key not in seen_reviewers:
                    seen_reviewers.add(reviewer_key)
                    filtered_candidates.append(candidate)
                else:
                    print(f"跳过重复候选审稿人: {author} ({institution})")
            else:
                reason = "作者相同" if is_same_author else "机构相同"
                print(f"过滤掉候选审稿人: {author} ({institution}) - {reason}")

        print(f"去重完成,最终保留 {len(filtered_candidates)} 个候选审稿人")
        return filtered_candidates

    def _similar_names(self, name1: str, name2: str) -> bool:
        """Heuristic: do two (lowercased) names plausibly refer to the same person?

        NOTE(review): matching on the first token alone is aggressive — it
        treats any two people sharing a first token as the same person.
        """
        if name1 == name2:
            print(f"姓名完全匹配: '{name1}' == '{name2}'")
            return True

        # Compare the leading name token (treated as the family name).
        name1_parts = name1.split()
        name2_parts = name2.split()

        if name1_parts and name2_parts:
            if name1_parts[0] == name2_parts[0]:
                print(f"姓氏匹配: '{name1}' vs '{name2}' - 共同姓氏: {name1_parts[0]}")
                return True

        return False

    def _similar_institutions(self, inst1: str, inst2: str) -> bool:
        """Heuristic: are two institution strings likely the same institution?

        Compares meaningful keywords (generic words like "university" are
        removed); the similarity threshold is stricter (0.8) when only
        generic words remain.
        """
        if inst1 == inst2:
            return True

        # Keep only meaningful institution-name keywords.
        def filter_common_words(words):
            common_words = {
                'university', 'college', 'institute', 'department', 'school',
                'center', 'centre', 'laboratory', 'lab', 'of', 'the', 'and',
                'at', 'in', 'for', 'medical', 'medicine', 'science', 'technology'
            }
            return {word for word in words if word not in common_words and len(word) > 2}

        inst1_words = filter_common_words(set(inst1.lower().split()))
        inst2_words = filter_common_words(set(inst2.lower().split()))

        # If filtering removed everything, fall back to raw words with a
        # stricter threshold to avoid false positives.
        if not inst1_words or not inst2_words:
            inst1_words = set(inst1.lower().split())
            inst2_words = set(inst2.lower().split())
            threshold = 0.8
        else:
            threshold = 0.6

        # Ratio of shared words relative to the smaller keyword set.
        common_words = inst1_words.intersection(inst2_words)
        if not common_words:
            return False

        similarity_ratio = len(common_words) / min(len(inst1_words), len(inst2_words))

        if similarity_ratio >= threshold:
            print(f"机构匹配: '{inst1}' vs '{inst2}' - 相似度: {similarity_ratio:.2f}, 共同词汇: {common_words}")

        return similarity_ratio >= threshold

    def _generate_fallback_reviewers(self, candidates: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
        """Heuristic recommendation used when the LLM output cannot be parsed.

        Takes corresponding authors from the first 20 candidates and applies
        the same conflict-of-interest filtering.
        """
        fallback_reviewers = []

        for candidate in candidates[:20]:  # cap at the first 20 candidates
            author = candidate.get("corresponding_author")
            institution = candidate.get("corresponding_institution")

            if author and author not in [r.get("name") for r in fallback_reviewers]:
                # Conflict-of-interest check against paper authors/affiliations.
                author_lower = (author or "").strip().lower()
                institution_lower = (institution or "").strip().lower()

                paper_authors = [a.strip().lower() for a in paper.authors if a.strip()]
                paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]

                is_same_author = any(self._similar_names(author_lower, pa) for pa in paper_authors)
                is_same_institution = any(self._similar_institutions(institution_lower, pa) for pa in paper_affiliations)

                if not is_same_author and not is_same_institution:
                    fallback_reviewers.append({
                        "name": author,
                        "affiliation": institution or "未知单位",
                        "email": "未知邮箱",
                        "reason": "基于文献相关性自动推荐",
                        "relevance_score": 0.7,
                        "expertise_areas": ["相关研究领域"]
                    })

        return fallback_reviewers

    def _call_llm_with_retry(self, prompt: str, system_msg: str, json_output: bool = False, max_retries: int = 3) -> Any:
        """Call the configured LLM (DashScope preferred, OpenAI fallback).

        Retries up to ``max_retries`` times with exponential backoff; returns
        the raw text response, or None when no provider is configured or all
        attempts fail.
        """
        for attempt in range(max_retries):
            try:
                if DASHSCOPE_API_KEY:
                    import dashscope
                    dashscope.api_key = DASHSCOPE_API_KEY

                    # Longer timeout plus provider-specific error handling.
                    try:
                        response = dashscope.Generation.call(
                            model="qwen-turbo-latest",  # stable model choice
                            messages=[
                                {"role": "system", "content": system_msg},
                                {"role": "user", "content": prompt}
                            ],
                            result_format="json" if json_output else "text",
                            timeout=60  # generous timeout for long prompts
                        )
                        if response.status_code == 200:
                            return response.output.text
                        else:
                            print(f"DashScope API错误: {response.message}")

                    except Exception as api_error:
                        print(f"DashScope API调用异常: {str(api_error)}")
                        if "SSL" in str(api_error) or "EOF" in str(api_error):
                            print("检测到SSL连接问题,尝试使用备用方案")
                            # A secondary provider call could be added here.

                elif OPENAI_API_KEY:
                    from openai import OpenAI
                    client = OpenAI(api_key=OPENAI_API_KEY)
                    response = client.chat.completions.create(
                        model="gpt-3.5-turbo",  # stable model choice
                        messages=[
                            {"role": "system", "content": system_msg},
                            {"role": "user", "content": prompt}
                        ],
                        response_format={"type": "json_object"} if json_output else None,
                        timeout=60
                    )
                    return response.choices[0].message.content

                else:
                    print("未配置API密钥,使用备用方案")
                    return None

            except Exception as e:
                print(f"第{attempt + 1}次调用失败: {str(e)}")
                if attempt < max_retries - 1:
                    print(f"等待 {2 ** attempt} 秒后重试...")
                    time.sleep(2 ** attempt)  # exponential backoff
                else:
                    print(f"所有重试都失败了,将使用备用推荐方案")
                    return None
reviewer_recommendation/enginecomplex.py ADDED
@@ -0,0 +1,609 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 推荐引擎模块
3
+ 使用LLM分析候选者并推荐合适的审稿人
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import time
9
+ from typing import List, Dict, Any, Optional
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+
12
+ from .models import PaperInfo, Reviewer
13
+
14
+
15
# Configuration: API keys are read from the environment so that no secret is
# committed to source control (the original hard-coded a live DashScope key).
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
18
+
19
+
20
+ class LLMRecommendationEngine:
21
+ """完全由大模型驱动的审稿人推荐引擎"""
22
+
23
    def __init__(self):
        # Stateless engine: all configuration comes from the module-level
        # API-key constants; nothing to initialise per instance.
        pass
25
+
26
+ def analyze_candidates(self, paper: PaperInfo, candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
27
+ """分析候选文献,评估适合度"""
28
+ system_msg = "你是学术领域专家,擅长评估研究人员与特定论文的匹配度"
29
+ candidates_str = json.dumps(candidates, ensure_ascii=False, indent=2)
30
+
31
+ # 构建作者和机构信息
32
+ authors_info = ""
33
+ if paper.authors:
34
+ authors_info = f"作者: {', '.join(paper.authors)}"
35
+ if paper.affiliations:
36
+ authors_info += f"\n作者机构: {', '.join(paper.affiliations)}"
37
+
38
+ prompt = f"""
39
+ 请分析以下候选文献的作者是否适合评审目标论文,并按适合度排序:
40
+
41
+ 目标论文:
42
+ 标题: {paper.title}
43
+ 摘要: {paper.abstract}
44
+ 关键词: {', '.join(paper.keywords)}
45
+ {authors_info}
46
+
47
+ 候选文献列表:
48
+ {candidates_str}
49
+
50
+ 分析要求:
51
+ 1. 为每位通讯作者评估适合度,给出0-1的相关性评分
52
+ 2. 提取作者的专业领域和研究方向
53
+ 3. 说明推荐理由(中文 重点介绍作者本人的研究方向)
54
+ 4. 排除重复作者
55
+ 5. 严格排除与目标论文作者相同或来自同一机构的人员
56
+ 6. 按适合度从高到低排序,优先考虑引用量和知名程度
57
+ 7. 必须返回至少5-10个审稿人,确保有足够的候选人数
58
+ 7. 如果相关性评分全部低于0.6 则重新再进行一次分析
59
+ 8. 估算作者的学术论文引用总量(基于机构声誉和研究领域)
60
+
61
+ 请返回JSON数组,每个元素包含:
62
+ - name: 作者姓名
63
+ - affiliation: 单位
64
+ - email: 邮箱(从数据中提取)
65
+ - reason: 推荐理由(中文 作者本人的研究方向与目标论文的适配度)
66
+ - relevance_score: 相关性评分(0-1)
67
+ - expertise_areas: 专业领域列表
68
+ - citation_count: 估算的学术论文引用总量
69
+
70
+ 确保输出是纯JSON,不要包含其他内容
71
+ """
72
+
73
+ response = self._call_llm_with_retry(prompt.strip(), system_msg, json_output=True)
74
+ if not response:
75
+ return []
76
+
77
+ # 清理和解析JSON响应
78
+ cleaned_response = self._clean_json_response(response)
79
+ if not cleaned_response:
80
+ return []
81
+
82
+ try:
83
+ result = json.loads(cleaned_response)
84
+ if isinstance(result, list):
85
+ # 并行为每个审稿人添加引用量
86
+ enhanced_result = self._add_citations_parallel(result)
87
+
88
+ # 按引用量和相关性评分综合排序
89
+ def sort_key(x):
90
+ citation_count = x.get('citation_count', '0')
91
+ if isinstance(citation_count, str) and citation_count == "未查询到":
92
+ citation_score = 0
93
+ else:
94
+ try:
95
+ citation_score = int(citation_count) / 10000 * 0.6
96
+ except (ValueError, TypeError):
97
+ citation_score = 0
98
+ relevance_score = x.get('relevance_score', 0) * 0.4
99
+ return citation_score + relevance_score
100
+
101
+ enhanced_result.sort(key=sort_key, reverse=True)
102
+
103
+ # 过滤掉相同作者和机构
104
+ filtered_result = self._filter_reviewers(enhanced_result, paper)
105
+ return filtered_result
106
+ else:
107
+ print("大模型返回的不是JSON数组")
108
+ return self._generate_fallback_reviewers(candidates, paper)
109
+ except json.JSONDecodeError:
110
+ print("无法解析大模型返回的JSON")
111
+ return self._generate_fallback_reviewers(candidates, paper)
112
+
113
+ def _clean_json_response(self, response: str) -> str:
114
+ """清理大模型返回的JSON响应"""
115
+ if not response:
116
+ return ""
117
+
118
+ # 移除markdown代码块
119
+ if "```json" in response:
120
+ start = response.find("```json") + 7
121
+ end = response.find("```", start)
122
+ if end != -1:
123
+ response = response[start:end]
124
+ elif "```" in response:
125
+ start = response.find("```") + 3
126
+ end = response.find("```", start)
127
+ if end != -1:
128
+ response = response[start:end]
129
+
130
+ # 清理空白字符
131
+ response = response.strip()
132
+
133
+ # 处理多个独立JSON对象的情况
134
+ if response.count('{') > 1:
135
+ # 尝试将多个JSON对象合并为数组
136
+ try:
137
+ # 分割多个JSON对象
138
+ objects = []
139
+ brace_count = 0
140
+ current_obj = ""
141
+
142
+ for char in response:
143
+ current_obj += char
144
+ if char == '{':
145
+ brace_count += 1
146
+ elif char == '}':
147
+ brace_count -= 1
148
+ if brace_count == 0:
149
+ # 一个完整的JSON对象
150
+ obj_str = current_obj.strip()
151
+ if obj_str.startswith('{') and obj_str.endswith('}'):
152
+ try:
153
+ json.loads(obj_str) # 验证JSON格式
154
+ objects.append(obj_str)
155
+ except:
156
+ pass
157
+ current_obj = ""
158
+
159
+ if len(objects) > 1:
160
+ # 合并为JSON数组
161
+ return "[" + ",".join(objects) + "]"
162
+ elif len(objects) == 1:
163
+ return "[" + objects[0] + "]"
164
+ except:
165
+ pass
166
+
167
+ return response
168
+
169
    def _filter_reviewers(self, reviewers: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
        """Drop candidate reviewers who conflict with the paper's authors.

        A candidate is removed when their name matches any paper author
        (per _similar_names) or their affiliation matches any author
        institution (per _similar_institutions).  Each decision is printed
        for traceability.  Returns survivors in their original order.
        """
        filtered_reviewers = []

        # Normalised (stripped, lower-cased) author/institution lists.
        paper_authors = [author.strip().lower() for author in paper.authors if author.strip()]
        paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]

        print(f"论文作者: {paper.authors}")
        print(f"论文机构: {paper.affiliations}")
        print(f"开始过滤 {len(reviewers)} 个审稿人...")

        for reviewer in reviewers:
            reviewer_name = reviewer.get("name", "").strip().lower()
            reviewer_affiliation = reviewer.get("affiliation", "").strip().lower()

            print(f"检查审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')})")

            # Conflict of interest: same person as one of the paper authors?
            is_same_author = any(self._similar_names(reviewer_name, author) for author in paper_authors)

            # Conflict of interest: same institution as any author affiliation?
            is_same_institution = any(self._similar_institutions(reviewer_affiliation, aff) for aff in paper_affiliations)

            # Keep only candidates with neither conflict.
            if not is_same_author and not is_same_institution:
                filtered_reviewers.append(reviewer)
                print(f"保留审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')})")
            else:
                reason = "作者相同" if is_same_author else "机构相同"
                print(f"过滤掉审稿人: {reviewer.get('name')} ({reviewer.get('affiliation')}) - {reason}")

        print(f"过滤完成,保留 {len(filtered_reviewers)} 个审稿人")
        return filtered_reviewers
203
+
204
+ def _similar_names(self, name1: str, name2: str) -> bool:
205
+ """检查两个姓名是否相似(可能是同一人)"""
206
+ # 简单的相似性检查
207
+ if name1 == name2:
208
+ print(f"姓名完全匹配: '{name1}' == '{name2}'")
209
+ return True
210
+
211
+ # 检查是否包含相同的姓氏
212
+ name1_parts = name1.split()
213
+ name2_parts = name2.split()
214
+
215
+ if name1_parts and name2_parts:
216
+ # 检查姓氏是否相同
217
+ if name1_parts[0] == name2_parts[0]:
218
+ print(f"姓氏匹配: '{name1}' vs '{name2}' - 共同姓氏: {name1_parts[0]}")
219
+ return True
220
+
221
+ return False
222
+
223
+ def _similar_institutions(self, inst1: str, inst2: str) -> bool:
224
+ """检查两个机构是否相似(可能是同一机构的不同表述)"""
225
+ if inst1 == inst2:
226
+ return True
227
+
228
+ # 过滤掉通用词汇,只保留有意义的机构名称关键词
229
+ def filter_common_words(words):
230
+ common_words = {
231
+ 'university', 'college', 'institute', 'department', 'school',
232
+ 'center', 'centre', 'laboratory', 'lab', 'of', 'the', 'and',
233
+ 'at', 'in', 'for', 'medical', 'medicine', 'science', 'technology'
234
+ }
235
+ return {word for word in words if word not in common_words and len(word) > 2}
236
+
237
+ # 获取有意义的关键词
238
+ inst1_words = filter_common_words(set(inst1.lower().split()))
239
+ inst2_words = filter_common_words(set(inst2.lower().split()))
240
+
241
+ # 如果过滤后没有关键词,使用原始词汇但提高阈值
242
+ if not inst1_words or not inst2_words:
243
+ inst1_words = set(inst1.lower().split())
244
+ inst2_words = set(inst2.lower().split())
245
+ # 提高阈值到80%,减少误判
246
+ threshold = 0.8
247
+ else:
248
+ # 使用有意义关键词,阈值可以相对宽松
249
+ threshold = 0.6
250
+
251
+ # 计算共同词汇比例
252
+ common_words = inst1_words.intersection(inst2_words)
253
+ if not common_words:
254
+ return False
255
+
256
+ similarity_ratio = len(common_words) / min(len(inst1_words), len(inst2_words))
257
+
258
+ # 添加调试日志
259
+ if similarity_ratio >= threshold:
260
+ print(f"机构匹配: '{inst1}' vs '{inst2}' - 相似度: {similarity_ratio:.2f}, 共同词汇: {common_words}")
261
+
262
+ return similarity_ratio >= threshold
263
+
264
    def _generate_fallback_reviewers(self, candidates: List[Dict[str, Any]], paper: PaperInfo) -> List[Dict[str, Any]]:
        """Build a basic reviewer list when the LLM output cannot be parsed.

        Walks (at most) the first 20 candidate papers, takes each unique
        corresponding author, applies the same conflict-of-interest checks
        as _filter_reviewers, attaches a real citation count, and fills the
        remaining fields with placeholder values (fixed 0.7 relevance).
        """
        fallback_reviewers = []

        for candidate in candidates[:20]:  # cap work: first 20 candidates only
            author = candidate.get("corresponding_author")
            institution = candidate.get("corresponding_institution")

            # Skip missing authors and duplicates already collected.
            if author and author not in [r.get("name") for r in fallback_reviewers]:
                # Normalise for comparison against the paper's author list.
                author_lower = author.strip().lower()
                institution_lower = (institution or "").strip().lower()

                paper_authors = [a.strip().lower() for a in paper.authors if a.strip()]
                paper_affiliations = [aff.strip().lower() for aff in paper.affiliations if aff.strip()]

                is_same_author = any(self._similar_names(author_lower, pa) for pa in paper_authors)
                is_same_institution = any(self._similar_institutions(institution_lower, pa) for pa in paper_affiliations)

                if not is_same_author and not is_same_institution:
                    # Real citation lookup (OpenAlex, then Semantic Scholar).
                    citation_count = self._get_real_citation_count(author, institution or "未知单位")

                    fallback_reviewers.append({
                        "name": author,
                        "affiliation": institution or "未知单位",
                        "email": "未知邮箱",
                        "reason": "基于文献相关性自动推荐",
                        "relevance_score": 0.7,  # neutral default score
                        "expertise_areas": ["相关研究领域"],
                        "citation_count": citation_count
                    })

        return fallback_reviewers
298
+
299
    def _call_llm_with_retry(self, prompt: str, system_msg: str, json_output: bool = False, max_retries: int = 3) -> Any:
        """Call an LLM backend with retries and exponential backoff.

        Prefers DashScope when DASHSCOPE_API_KEY is set, otherwise OpenAI
        when OPENAI_API_KEY is set; returns the raw response text, or None
        when no key is configured or every attempt fails.

        NOTE(review): a non-200 DashScope status (or the inner except) makes
        the loop retry immediately — the backoff sleep only runs for
        exceptions reaching the outer handler — and falling off the loop
        returns None implicitly.
        """
        for attempt in range(max_retries):
            try:
                if DASHSCOPE_API_KEY:
                    import dashscope
                    dashscope.api_key = DASHSCOPE_API_KEY

                    # Inner try isolates DashScope errors so an SSL hint can be
                    # printed without aborting the whole retry loop.
                    try:
                        response = dashscope.Generation.call(
                            model="qwen-turbo",  # chosen as the more stable variant
                            messages=[
                                {"role": "system", "content": system_msg},
                                {"role": "user", "content": prompt}
                            ],
                            result_format="json" if json_output else "text",
                            timeout=60  # generous timeout for long prompts
                        )
                        if response.status_code == 200:
                            return response.output.text
                        else:
                            print(f"DashScope API错误: {response.message}")

                    except Exception as api_error:
                        print(f"DashScope API调用异常: {str(api_error)}")
                        if "SSL" in str(api_error) or "EOF" in str(api_error):
                            print("检测到SSL连接问题,尝试使用备用方案")
                            # Placeholder: a secondary API call could go here.

                elif OPENAI_API_KEY:
                    from openai import OpenAI
                    client = OpenAI(api_key=OPENAI_API_KEY)
                    response = client.chat.completions.create(
                        model="gpt-3.5-turbo",  # stable, widely available model
                        messages=[
                            {"role": "system", "content": system_msg},
                            {"role": "user", "content": prompt}
                        ],
                        response_format={"type": "json_object"} if json_output else None,
                        timeout=60
                    )
                    return response.choices[0].message.content

                else:
                    # No credentials at all: caller falls back to heuristics.
                    print("未配置API密钥,使用备用方案")
                    return None

            except Exception as e:
                print(f"第{attempt + 1}次调用失败: {str(e)}")
                if attempt < max_retries - 1:
                    print(f"等待 {2 ** attempt} 秒后重试...")
                    time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s...
                else:
                    print(f"所有重试都失败了,将使用备用推荐方案")
                    return None
355
+
356
+ def _get_real_citation_count(self, name: str, affiliation: str) -> str:
357
+ """获取作者的真实学术论文引用总量"""
358
+ try:
359
+ # 首先尝试OpenAlex API
360
+ citation_count = self._get_citation_from_openalex(name, affiliation)
361
+ if citation_count > 0:
362
+ return str(citation_count)
363
+
364
+ # 备用方案:Semantic Scholar API
365
+ citation_count = self._get_citation_from_semantic_scholar(name, affiliation)
366
+ if citation_count > 0:
367
+ return str(citation_count)
368
+
369
+ # 如果没有找到真实数据,返回"未查询到"
370
+ print(f"未找到 {name} 的引用量数据")
371
+ return "未查询到"
372
+
373
+ except Exception as e:
374
+ print(f"获取引用量失败: {str(e)}")
375
+ return "未查询到"
376
+
377
    def _add_citations_parallel(self, reviewers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Fetch real citation counts for all reviewers concurrently.

        Each reviewer dict gains a 'citation_count' key ("未查询到" on
        failure or timeout).  Note: results are appended in completion
        order, so the returned list's ordering may differ from the input —
        the caller re-sorts afterwards.
        """
        print(f"开始并行获取 {len(reviewers)} 个审稿人的引用量...")

        enhanced_reviewers = []

        # Up to five concurrent lookups against the citation APIs.
        with ThreadPoolExecutor(max_workers=5) as executor:
            # Submit one citation lookup per reviewer.
            future_to_reviewer = {}
            for reviewer in reviewers:
                name = reviewer.get('name', '')
                affiliation = reviewer.get('affiliation', '')
                future = executor.submit(self._get_real_citation_count, name, affiliation)
                future_to_reviewer[future] = reviewer

            # Collect results as they finish; failures get the placeholder.
            for future in as_completed(future_to_reviewer):
                reviewer = future_to_reviewer[future]
                try:
                    citation_count = future.result(timeout=15)  # per-lookup cap
                    reviewer['citation_count'] = citation_count
                    enhanced_reviewers.append(reviewer)
                    print(f"获取引用量完成: {reviewer.get('name')} - {citation_count}")
                except Exception as e:
                    print(f"获取引用量失败: {reviewer.get('name')} - {str(e)}")
                    reviewer['citation_count'] = "未查询到"
                    enhanced_reviewers.append(reviewer)

        print(f"并行引用量获取完成,处理了 {len(enhanced_reviewers)} 个审稿人")
        return enhanced_reviewers
408
+
409
+ def _get_citation_from_openalex(self, name: str, affiliation: str) -> int:
410
+ """从OpenAlex API获取作者引用量"""
411
+ try:
412
+ import requests
413
+ import urllib.parse
414
+
415
+ # 生成多种查询变体
416
+ name_variants = self._generate_name_variants(name)
417
+
418
+ for variant in name_variants:
419
+ # 简化查询,只使用姓名
420
+ query = f'display_name:"{variant}"'
421
+ print(f"OpenAlex查询: {query}")
422
+
423
+ # OpenAlex API请求
424
+ url = "https://api.openalex.org/authors"
425
+ params = {
426
+ 'search': query,
427
+ 'per-page': 5, # 增加结果数量
428
+ 'select': 'id,display_name,cited_by_count,affiliations'
429
+ }
430
+
431
+ response = requests.get(url, params=params, timeout=15)
432
+ response.raise_for_status()
433
+
434
+ data = response.json()
435
+ if data.get('results'):
436
+ # 尝试匹配最佳结果
437
+ best_match = self._find_best_author_match(data['results'], name, affiliation)
438
+ if best_match:
439
+ cited_by_count = best_match.get('cited_by_count', 0)
440
+ print(f"OpenAlex API: {name} 引用量: {cited_by_count}")
441
+ return cited_by_count
442
+
443
+ print(f"OpenAlex API: 未找到 {variant} 的数据")
444
+
445
+ return 0
446
+
447
+ except Exception as e:
448
+ print(f"OpenAlex API调用失败: {str(e)}")
449
+ return 0
450
+
451
    def _get_citation_from_semantic_scholar(self, name: str, affiliation: str) -> int:
        """Look up a total citation count via the Semantic Scholar Graph API.

        Tries each generated name variant until one returns author data,
        then picks the best-matching record.  Returns 0 when no variant
        matches or the request fails.
        """
        try:
            import requests
            import urllib.parse  # NOTE(review): imported but unused in this method

            # Several spellings of the name improve the hit rate.
            name_variants = self._generate_name_variants(name)

            for variant in name_variants:
                # Plain-name query; affiliation is only used for ranking below.
                query = variant
                print(f"Semantic Scholar查询: {query}")

                url = "https://api.semanticscholar.org/graph/v1/author/search"
                params = {
                    'query': query,
                    'limit': 5,  # a few candidates for disambiguation
                    'fields': 'authorId,name,citationCount,affiliations'
                }

                headers = {
                    'User-Agent': 'Academic-Reviewer-System/1.0'
                }

                response = requests.get(url, params=params, headers=headers, timeout=15)
                response.raise_for_status()

                data = response.json()
                if data.get('data'):
                    # Rank candidates by name/affiliation agreement.
                    best_match = self._find_best_semantic_author_match(data['data'], name, affiliation)
                    if best_match:
                        citation_count = best_match.get('citationCount', 0)
                        print(f"Semantic Scholar API: {name} 引用量: {citation_count}")
                        return citation_count

                print(f"Semantic Scholar API: 未找到 {variant} 的数据")

            return 0

        except Exception as e:
            print(f"Semantic Scholar API调用失败: {str(e)}")
            return 0
496
+
497
+ def _generate_name_variants(self, name: str) -> List[str]:
498
+ """生成姓名的多种变体"""
499
+ variants = [name] # 原始姓名
500
+
501
+ # 如果包含中间名,尝试不同的组合
502
+ name_parts = name.split()
503
+ if len(name_parts) >= 2:
504
+ # 只使用姓和名
505
+ variants.append(f"{name_parts[0]} {name_parts[-1]}")
506
+
507
+ # 如果有多于2个部分,尝试不同的组合
508
+ if len(name_parts) == 3:
509
+ variants.append(f"{name_parts[0]} {name_parts[1]}")
510
+ variants.append(f"{name_parts[1]} {name_parts[2]}")
511
+ elif len(name_parts) > 3:
512
+ # 对于更复杂的姓名,尝试简化
513
+ variants.append(f"{name_parts[0]} {name_parts[1]}")
514
+ variants.append(f"{name_parts[0]} {name_parts[-1]}")
515
+
516
+ # 去重并保持顺序
517
+ seen = set()
518
+ unique_variants = []
519
+ for variant in variants:
520
+ if variant not in seen:
521
+ seen.add(variant)
522
+ unique_variants.append(variant)
523
+
524
+ return unique_variants
525
+
526
    def _find_best_author_match(self, authors: List[Dict], target_name: str, target_affiliation: str) -> Optional[Dict]:
        """Choose the OpenAlex author record that best matches the target.

        Scoring: +10 for a (sub)string name match in either direction,
        +5 when any affiliation contains the target affiliation, +1 for
        having any citations.  Falls back to the first record when nothing
        scores above zero; returns None only for an empty input list.
        """
        if not authors:
            return None

        # Single hit: nothing to disambiguate.
        if len(authors) == 1:
            return authors[0]

        best_match = None
        best_score = 0

        for author in authors:
            score = 0
            author_name = author.get('display_name', '').lower()
            target_name_lower = target_name.lower()

            # Name match in either direction covers initials/shortened forms.
            if target_name_lower in author_name or author_name in target_name_lower:
                score += 10

            # Affiliation agreement (skipped for the "unknown" placeholder).
            affiliations = author.get('affiliations', [])
            if affiliations and target_affiliation and target_affiliation != "未知单位":
                for aff in affiliations:
                    aff_name = aff.get('display_name', '').lower()
                    if target_affiliation.lower() in aff_name:
                        score += 5
                        break

            # Mild preference for authors with any citations at all.
            citation_count = author.get('cited_by_count', 0)
            if citation_count > 0:
                score += 1

            if score > best_score:
                best_score = score
                best_match = author

        return best_match if best_score > 0 else authors[0]
567
+
568
+ def _find_best_semantic_author_match(self, authors: List[Dict], target_name: str, target_affiliation: str) -> Optional[Dict]:
569
+ """在Semantic Scholar结果中找到最佳匹配的作者"""
570
+ if not authors:
571
+ return None
572
+
573
+ # 如果只有一个结果,直接返回
574
+ if len(authors) == 1:
575
+ return authors[0]
576
+
577
+ # 计算每个作者的匹配分数
578
+ best_match = None
579
+ best_score = 0
580
+
581
+ for author in authors:
582
+ score = 0
583
+ author_name = author.get('name', '').lower()
584
+ target_name_lower = target_name.lower()
585
+
586
+ # 姓名匹配分数
587
+ if target_name_lower in author_name or author_name in target_name_lower:
588
+ score += 10
589
+
590
+ # 检查机构匹配
591
+ affiliations = author.get('affiliations', [])
592
+ if affiliations and target_affiliation and target_affiliation != "未知单位":
593
+ for aff in affiliations:
594
+ aff_name = aff.get('name', '').lower()
595
+ if target_affiliation.lower() in aff_name:
596
+ score += 5
597
+ break
598
+
599
+ # 引用量作为权重
600
+ citation_count = author.get('citationCount', 0)
601
+ if citation_count > 0:
602
+ score += 1
603
+
604
+ if score > best_score:
605
+ best_score = score
606
+ best_match = author
607
+
608
+ return best_match if best_score > 0 else authors[0]
609
+
reviewer_recommendation/models.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 数据模型定义
3
+ 定义审稿人推荐系统使用的核心数据结构
4
+ """
5
+
6
+ from typing import List, Dict, Any, Optional
7
+ from pydantic import BaseModel, Field
8
+
9
+
10
class PaperInfo(BaseModel):
    """The paper seeking reviewers: title/abstract are mandatory; keywords,
    authors and affiliations default to empty lists (the latter two drive
    conflict-of-interest filtering)."""
    title: str = Field(..., description="论文标题")
    abstract: str = Field(..., description="论文摘要")
    keywords: List[str] = Field(default_factory=list, description="论文关键词")
    authors: List[str] = Field(default_factory=list, description="作者姓名列表")
    affiliations: List[str] = Field(default_factory=list, description="作者所属机构列表")
17
+
18
+
19
class Reviewer(BaseModel):
    """A recommended reviewer; relevance_score is validated to [0, 1], and
    affiliation/email fall back to placeholder defaults when unknown."""
    name: str = Field(..., description="审稿人姓名")
    affiliation: str = Field(default="Unknown", description="所属机构")
    email: str = Field(default="unknown@example.com", description="邮箱地址")
    reason: str = Field(..., description="推荐理由")
    relevance_score: float = Field(..., ge=0.0, le=1.0, description="相关性评分")
    expertise_areas: List[str] = Field(default_factory=list, description="专业领域")
27
+
28
+
29
class SearchResult(BaseModel):
    """One retrieved preprint: bibliographic fields plus the query string
    that produced it (for provenance)."""
    doi: Optional[str] = Field(None, description="DOI")
    title: str = Field(..., description="论文标题")
    abstract: str = Field(..., description="论文摘要")
    corresponding_author: Optional[str] = Field(None, description="通讯作者")
    corresponding_institution: Optional[str] = Field(None, description="通讯作者机构")
    query_used: str = Field(..., description="使用的查询词")
37
+
38
+
39
class RecommendationRequest(BaseModel):
    """Inbound request: the paper plus how many reviewers to return
    (validated to 1-10)."""
    paper: PaperInfo
    reviewer_count: int = Field(..., ge=1, le=10, description="推荐审稿人数量")
43
+
44
+
45
class RecommendationResponse(BaseModel):
    """Outbound result: the reviewer list with timing/volume metadata and a
    success flag; error_message is populated only on failure."""
    reviewers: List[Reviewer] = Field(default_factory=list, description="推荐的审稿人列表")
    search_time: float = Field(..., description="搜索耗时(秒)")
    total_candidates: int = Field(..., description="候选者总数")
    success: bool = Field(..., description="是否成功")
    error_message: Optional[str] = Field(None, description="错误信息")
52
+
53
+
54
class AppState(BaseModel):
    """Mutable UI/session state: the request/response in flight, a busy
    flag, and the most recent error (if any)."""
    current_request: Optional[RecommendationRequest] = None
    current_response: Optional[RecommendationResponse] = None
    is_processing: bool = False
    last_error: Optional[str] = None
reviewer_recommendation/searcher copy.py ADDED
@@ -0,0 +1,666 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 学术检索模块
3
+ 提供基于EPMC和bioRxiv的学术文献检索功能
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import time
9
+ import urllib.parse
10
+ import requests
11
+ import warnings
12
+ import ssl
13
+ from typing import List, Dict, Any, Optional
14
+ from itertools import combinations
15
+ from concurrent.futures import ThreadPoolExecutor, as_completed
16
+ import threading
17
+
18
+ # 抑制SSL警告
19
+ warnings.filterwarnings('ignore', message='Unverified HTTPS request')
20
+
21
+ from .models import PaperInfo, SearchResult
22
+
23
+
24
# Configuration: API keys are read from the environment so that no secret is
# committed to source control (the original hard-coded a live DashScope key).
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
27
+
28
+ # 生物学关键词分类体系
29
+ BIOLOGY_KEYWORDS = {
30
+ "Molecular & Structural Biology": [
31
+ "Cryo-EM",
32
+ "X-ray crystallography",
33
+ "NMR spectroscopy",
34
+ "Single-particle analysis",
35
+ "Biolayer interferometry (BLI)",
36
+ "Surface plasmon resonance (SPR)",
37
+ "Confocal microscopy",
38
+ "CRISPR-Cas9",
39
+ "TALEN",
40
+ "ZFN",
41
+ "RNA interference (RNAi)",
42
+ "Single-molecule imaging",
43
+ "FRET",
44
+ "Optogenetics"
45
+ ],
46
+
47
+ "Cell & Single-Cell Technologies": [
48
+ "Single-cell RNA-seq (scRNA-seq)",
49
+ "Single-cell ATAC-seq",
50
+ "Spatial transcriptomics",
51
+ "FISH (Fluorescence in situ hybridization)",
52
+ "Immunofluorescence",
53
+ "Tissue clearing (CLARITY)",
54
+ "Flow cytometry (FACS)",
55
+ "CyTOF (Mass cytometry)",
56
+ "High-throughput screening",
57
+ "Organoids",
58
+ "3D cell culture",
59
+ "Microfluidics"
60
+ ],
61
+
62
+ "Neuroscience Tools": [
63
+ "Optogenetics",
64
+ "DREADDs (Designer Receptors Exclusively Activated by Designer Drugs)",
65
+ "GCaMP calcium imaging",
66
+ "Two-photon microscopy",
67
+ "Neural tracing",
68
+ "Patch-seq",
69
+ "Lineage tracing",
70
+ "Spatial multi-omics"
71
+ ],
72
+
73
+ "Omics & Systems Biology": [
74
+ "RNA sequencing (RNA-seq)",
75
+ "Proteomics (LC-MS/MS)",
76
+ "Metabolomics",
77
+ "Epigenomics",
78
+ "10x Genomics",
79
+ "SMART-seq",
80
+ "Nanopore sequencing",
81
+ "Illumina HiSeq",
82
+ "WGCNA",
83
+ "Machine learning in omics",
84
+ "scVelo"
85
+ ],
86
+
87
+ "Microbiome & Immunology": [
88
+ "16S rRNA sequencing",
89
+ "Metagenomics",
90
+ "Gut-brain axis",
91
+ "VDJ-seq",
92
+ "TCR/BCR lineage tracing",
93
+ "Immune checkpoints (PD-1, CTLA-4)",
94
+ "mRNA vaccines",
95
+ "DNA vaccines",
96
+ "Nanoparticle vaccines",
97
+ "Antigen presentation systems"
98
+ ],
99
+
100
+ "Development & Regeneration": [
101
+ "Induced pluripotent stem cells (iPSCs)",
102
+ "Embryonic stem cells (ESCs)",
103
+ "Cellular reprogramming",
104
+ "Wnt signaling",
105
+ "Hippo pathway",
106
+ "Notch signaling",
107
+ "Zebrafish models",
108
+ "C. elegans",
109
+ "Mouse embryonic sections"
110
+ ],
111
+
112
+ "Ecology & Environmental Biology": [
113
+ "Environmental DNA (eDNA)",
114
+ "Remote sensing ecology",
115
+ "Biosensors",
116
+ "Ecological niche modeling (ENM)",
117
+ "Genetic diversity analysis",
118
+ "Captive breeding technologies"
119
+ ],
120
+
121
+ "Bioinformatics & AI Tools": [
122
+ "Seurat",
123
+ "Scanpy",
124
+ "Monocle",
125
+ "CIBERSORT",
126
+ "GSEA",
127
+ "AlphaFold",
128
+ "RoseTTAFold",
129
+ "Molecular docking",
130
+ "STRING",
131
+ "Cytoscape",
132
+ "Gene Ontology (GO)",
133
+ "KEGG pathway analysis"
134
+ ]
135
+ }
136
+
137
+
138
+ class AcademicSearcher:
139
+ """基础学术检索器,仅负责数据获取,不做任何分析"""
140
+
141
+ EPMC_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
142
+ BIORXIV_URL = "https://api.biorxiv.org/details/biorxiv/{doi}/na/json"
143
+
144
+ def __init__(self, limit=50, sleep=0.1, timeout=30):
145
+ self.limit = limit
146
+ self.sleep = sleep
147
+ self.timeout = timeout
148
+ # 创建自定义SSL上下文
149
+ self.ssl_context = ssl.create_default_context()
150
+ self.ssl_context.check_hostname = False
151
+ self.ssl_context.verify_mode = ssl.CERT_NONE
152
+
153
+ def search(self, query: str) -> List[Dict[str, Any]]:
154
+ """执行检索并返回原始文献数据"""
155
+ try:
156
+ # 1. 获取DOI列表
157
+ epmc_results = self._epmc_search(query)
158
+
159
+ # 2. 并行获取详细信息
160
+ detailed_results = self._get_details_parallel(epmc_results, query)
161
+
162
+ return detailed_results
163
+ except Exception as e:
164
+ print(f"检索错误: {str(e)}")
165
+ return []
166
+
167
    def _get_details_parallel(self, epmc_results: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
        """Fetch bioRxiv detail records for each EPMC hit concurrently.

        Returns up to self.limit detail dicts, each tagged with the query
        that produced it ('query_used').  Failures for individual DOIs are
        printed and skipped.
        """
        detailed_results = []

        # No hits: nothing to fetch.
        if not epmc_results:
            return detailed_results

        # Cap concurrency to avoid hammering the bioRxiv API.
        max_workers = min(5, len(epmc_results))

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit one detail fetch per DOI-bearing result.
            future_to_item = {}
            for item in epmc_results:
                doi = item.get("doi")
                if doi:
                    future = executor.submit(self._get_biorxiv_detail, doi)
                    future_to_item[future] = item

            # Collect in completion order; stop once the limit is reached
            # (futures already submitted still complete in the background).
            for future in as_completed(future_to_item):
                item = future_to_item[future]
                try:
                    detail = future.result(timeout=10)  # per-future timeout
                    if detail:
                        detail["query_used"] = query
                        detailed_results.append(detail)

                        if len(detailed_results) >= self.limit:
                            break

                except Exception as e:
                    print(f"获取详情失败: {item.get('doi')} - {str(e)}")
                    continue

        return detailed_results
204
+
205
    def _epmc_search(self, query: str) -> List[Dict[str, Any]]:
        """Search Europe PMC for bioRxiv-style preprints matching `query`.

        Restricts to preprints (SRC:PPR) with a 10.1101* DOI prefix, sorted
        by citation count, over-fetching up to 2x the limit for later
        filtering.  Retries up to three times with exponential backoff;
        returns [] on persistent failure.
        """
        params = {
            "query": f'(SRC:PPR) AND (DOI:10.1101*) AND ({query})',
            "resultType": "core",
            "pageSize": str(min(100, self.limit * 2)),  # over-fetch for filtering
            "format": "json",
            "sortby": "cited",  # most-cited first
        }

        # Retry loop; error classes are separated only for distinct logging,
        # all back off exponentially (1s, 2s) before the next attempt.
        for attempt in range(3):
            try:
                response = requests.get(
                    self.EPMC_URL,
                    params=params,
                    timeout=self.timeout,
                    verify=False,  # NOTE(review): SSL verification deliberately disabled
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                    }
                )
                response.raise_for_status()
                data = response.json()
                results = data.get("resultList", {}).get("result", [])

                # Debug output: hit count plus citation info for the top hits.
                if results:
                    print(f"EPMC检索到 {len(results)} 篇文献,按引用量排序")
                    for i, result in enumerate(results[:3]):
                        cited_count = result.get('citedByCount', 0)
                        title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
                        print(f" 文献 {i+1}: {title} (引用量: {cited_count})")

                return results

            except requests.exceptions.SSLError as e:
                print(f"EPMC SSL错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC SSL连接失败,返回空结果")
                    return []
                time.sleep(2 ** attempt)

            except requests.exceptions.RequestException as e:
                print(f"EPMC请求错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC请求失败,返回空结果")
                    return []
                time.sleep(2 ** attempt)

            except Exception as e:
                print(f"EPMC未知错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC未知错误,返回空结果")
                    return []
                time.sleep(2 ** attempt)

        return []
264
+
265
    def _get_biorxiv_detail(self, doi: str) -> Optional[Dict[str, Any]]:
        """Fetch title/abstract/corresponding-author metadata for a DOI from
        the bioRxiv details API.

        Uses the newest record version.  Returns None for empty responses,
        non-bioRxiv records, or persistent request failures.  (Return
        annotation widened to Optional: every failure path yields None.)
        """
        url = self.BIORXIV_URL.format(doi=urllib.parse.quote(doi))

        # Three attempts with exponential backoff; error classes separated
        # only for distinct log messages.
        for attempt in range(3):
            try:
                response = requests.get(
                    url,
                    timeout=self.timeout,
                    verify=False,  # NOTE(review): SSL verification deliberately disabled
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                    }
                )
                response.raise_for_status()
                data = response.json()

                records = data.get("collection") or data.get("records") or []
                if not records:
                    return None

                # The last record is the most recent version of the preprint.
                latest_record = records[-1]
                if latest_record.get("server") and latest_record["server"].lower() != "biorxiv":
                    return None

                version = latest_record.get("version") or 1
                return {
                    "doi": latest_record.get("doi"),
                    "title": latest_record.get("title"),
                    "abstract": latest_record.get("abstract"),
                    "corresponding_author": latest_record.get("author_corresponding"),
                    "corresponding_institution": latest_record.get("author_corresponding_institution"),
                    "url": f"https://www.biorxiv.org/content/{latest_record['doi']}v{version}"
                }

            except requests.exceptions.SSLError as e:
                print(f"bioRxiv SSL错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # final attempt exhausted
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

            except requests.exceptions.RequestException as e:
                print(f"bioRxiv请求错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # final attempt exhausted
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

            except Exception as e:
                print(f"bioRxiv未知错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # final attempt exhausted
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

        return None
324
+
325
+
326
class DynamicAcademicSearcher:
    """Dynamic academic searcher: LLM-driven query generation, dynamic query
    expansion, and parallel execution on top of a base :class:`AcademicSearcher`.
    """

    def __init__(self, base_searcher: AcademicSearcher):
        # All raw retrieval is delegated to this base searcher.
        self.base_searcher = base_searcher

    def search_with_dynamic_queries(self, paper: PaperInfo, num_queries: int = 2) -> List[Dict[str, Any]]:
        """Generate queries for *paper* and run them in parallel.

        Returns the union of all query results, deduplicated by DOI
        (results without a DOI are dropped).
        """
        # 1. Generate the search queries.
        queries = self.generate_search_queries(paper, num_queries)
        print("生成的检索查询:")
        for i, query in enumerate(queries, 1):
            print(f"查询 {i}: {query}")

        # Record what was generated.
        self._log_query_generation(paper, queries)

        # 2. Run the dynamic searches in parallel (I/O-bound HTTP work).
        all_candidates = []

        with ThreadPoolExecutor(max_workers=3) as executor:
            # One task per query.
            future_to_query = {}
            for query in queries:
                future = executor.submit(self._execute_single_query, query)
                future_to_query[future] = query

            # Collect results as they complete.
            for future in as_completed(future_to_query):
                query = future_to_query[future]
                try:
                    results = future.result(timeout=30)  # 30 s per query
                    if results:
                        print(f"查询 '{query}' 完成,找到 {len(results)} 篇文献")
                        all_candidates.extend(results)
                    else:
                        print(f"查询 '{query}' 未找到文献")
                except Exception as e:
                    print(f"查询 '{query}' 执行失败: {str(e)}")

        # 3. Deduplicate identical papers by DOI.
        unique_candidates = {item['doi']: item for item in all_candidates if item.get('doi')}.values()

        print(f"并行检索完成,总共找到 {len(list(unique_candidates))} 篇唯一文献")
        return list(unique_candidates)

    def _execute_single_query(self, query: str) -> List[Dict[str, Any]]:
        """Worker for the parallel search: expand *query* into variants and
        return the first non-empty result set, or ``[]`` if all variants miss.
        """
        print(f"开始执行查询: {query}")

        # Expand the query into progressively simpler fallback variants.
        processed_queries = self._process_query_dynamically(query)

        for processed_query in processed_queries:
            print(f" 尝试查询: {processed_query}")
            results = self.base_searcher.search(processed_query)

            if results:
                print(f" 找到 {len(results)} 篇文献")
                return results
            else:
                print(f" 未找到文献,尝试扩展查询...")

        print(f" 所有扩展查询都未找到文献")
        return []

    def generate_search_queries(self, paper: PaperInfo, num_queries: int = 2) -> List[str]:
        """Generate two-level search queries via the LLM, grounded in the
        biology keyword taxonomy; falls back to heuristics on LLM failure.

        NOTE(review): the prompt always asks for exactly 2 queries regardless
        of *num_queries*; only the final slice honors the parameter — confirm.
        """
        system_msg = "你是生物学检索策略专家,擅长分析论文的研究领域和技术方法,并生成精准的检索查询"

        # Render the keyword taxonomy for the prompt.
        keywords_str = ""
        for category, keywords in BIOLOGY_KEYWORDS.items():
            keywords_str += f"\n{category}:\n"
            for keyword in keywords:
                keywords_str += f" - {keyword}\n"

        prompt = f"""
请分析以下论文,按照以下步骤生成2个检索查询:

论文信息:
标题: {paper.title}
摘要: {paper.abstract}
关键词: {', '.join(paper.keywords)}

生物学关键词分类体系:
{keywords_str}

步骤1: 确定第一个检索查询(大类 + 子类)
1. 从上述分类体系中选择最匹配的1个大类(如"Molecular & Structural Biology")
2. 从该大类下选择最匹配的1个子类关键词(如"Cryo-EM")
3. 生成查询:大类 AND 子类关键词
4. 格式示例:Molecular & Structural Biology AND Cryo-EM

步骤2: 确定第二个检索查询(子类 + 论文特定关键词)
1. 使用步骤1中确定的子类关键词
2. 从论文标题、摘要或关键词中提取1个最核心的特定关键词
3. 生成查询:子类关键词 AND 论文特定关键词
4. 格式示例:Cryo-EM AND Nav1.7

要求:
1. 每个查询只使用2个关键词,用AND连接
2. 第一个查询:大类 AND 子类
3. 第二个查询:子类 AND 论文特定关键词
4. 论文特定关键词要简洁明确,适合学术数据库检索
5. 仅返回查询语句,每行一个,不添加编号或其他内容

输出格式示例:
Molecular & Structural Biology AND Cryo-EM
Cryo-EM AND Nav1.7
"""

        response = self._call_llm(prompt.strip(), system_msg)
        if not response:
            # LLM unavailable or failed: use heuristic backup queries.
            return self._generate_backup_queries(paper, num_queries)

        # One query per non-empty response line.
        queries = [q.strip() for q in response.strip().split('\n') if q.strip()]

        # Keep only well-formed "<A> AND <B>" queries.
        validated_queries = self._validate_queries(queries)

        return validated_queries[:num_queries] if validated_queries else self._generate_backup_queries(paper, num_queries)

    def _validate_queries(self, queries: List[str]) -> List[str]:
        """Filter *queries* down to well-formed ``'<part> AND <part>'`` strings
        that reference the biology keyword taxonomy and are not overly long."""
        validated_queries = []

        for query in queries:
            # Basic length check.
            if not query or len(query.strip()) < 5:
                print(f"查询太短,跳过: {query}")
                continue

            # Must contain the AND connector.
            if ' AND ' not in query:
                print(f"查询缺少AND连接符,跳过: {query}")
                continue

            # Exactly two keyword parts (discipline AND research-level keyword).
            parts = query.split(' AND ')
            if len(parts) != 2:
                print(f"查询格式不正确,跳过: {query}")
                continue

            # Each part must be non-empty and not the literal "AND".
            part1 = parts[0].strip()
            part2 = parts[1].strip()

            if not part1 or not part2:
                print(f"查询包含空部分,跳过: {query}")
                continue

            if part1.upper() == 'AND' or part2.upper() == 'AND':
                print(f"查询包含无效AND,跳过: {query}")
                continue

            # Must mention a taxonomy category or one of its keywords.
            has_biology_keyword = False
            for category, keywords in BIOLOGY_KEYWORDS.items():
                if category.lower() in query.lower():
                    has_biology_keyword = True
                    break
                for keyword in keywords:
                    if keyword.lower() in query.lower():
                        has_biology_keyword = True
                        break
                if has_biology_keyword:
                    break

            if not has_biology_keyword:
                print(f"查询不包含生物学关键词分类,跳过: {query}")
                continue

            # Reject queries that are unreasonably long.
            if len(query) > 100:
                print(f"查询过长,跳过: {query}")
                continue

            validated_queries.append(query.strip())
            print(f"查询验证通过: {query}")

        return validated_queries

    def _process_query_dynamically(self, query: str) -> List[str]:
        """Expand one query into simpler fallback variants (deduplicated).

        NOTE(review): ``set()`` does not preserve insertion order, so the
        order in which variants are tried is nondeterministic — confirm this
        is intended.
        """
        # Always include the original query.
        queries = [query]

        # Only well-formed "A AND B" queries are expanded further.
        if ' AND ' not in query:
            return queries

        parts = query.split(' AND ')
        if len(parts) != 2:
            return queries

        part1 = parts[0].strip()
        part2 = parts[1].strip()

        # Skip degenerate parts.
        if not part1 or not part2 or part1.upper() == 'AND' or part2.upper() == 'AND':
            return queries

        # Simplified variants keep only the leading word of a multi-word part.
        part1_words = part1.split()
        part2_words = part2.split()

        if len(part1_words) > 1:
            # Leading keyword of part 1.
            simplified_part1 = part1_words[0]
            queries.append(f"{simplified_part1} AND {part2}")

        if len(part2_words) > 1:
            # Leading keyword of part 2.
            simplified_part2 = part2_words[0]
            queries.append(f"{part1} AND {simplified_part2}")

        # Single-keyword fallbacks.
        queries.append(part1)
        queries.append(part2)

        return list(set(queries))  # deduplicate

    def _generate_backup_queries(self, paper: PaperInfo, num_queries: int) -> List[str]:
        """Heuristic fallback queries built from the biology keyword taxonomy,
        used when the LLM yields nothing usable."""
        queries = []

        # Infer the best-matching taxonomy (category, keyword) from the paper.
        best_category, best_keyword = self._infer_biology_keywords(paper)

        if not best_category or not best_keyword:
            # Nothing matched: fall back to a fixed default.
            best_category = "Molecular & Structural Biology"
            best_keyword = "Cryo-EM"

        # Query 1: category AND sub-keyword.
        queries.append(f"{best_category} AND {best_keyword}")

        # Pull one distinctive word from the title for query 2.
        title_words = paper.title.split()
        specific_keyword = None
        for word in title_words:
            if len(word) > 3 and word.lower() not in ['the', 'and', 'for', 'with', 'from', 'this', 'that', 'structures', 'human', 'channel', 'complex', 'with', 'auxiliary', 'subunits', 'animal', 'toxins']:
                specific_keyword = word
                break

        if specific_keyword:
            # Query 2: sub-keyword AND paper-specific keyword.
            queries.append(f"{best_keyword} AND {specific_keyword}")
        else:
            # No distinctive word found: use a generic variant of query 1.
            queries.append(f"{best_category} AND structure")

        return queries[:num_queries]

    def _infer_biology_keywords(self, paper: PaperInfo) -> tuple:
        """Score each taxonomy category against the paper text and return the
        best ``(category, keyword)`` pair, or ``(None, None)`` if nothing
        matches at all."""
        text = f"{paper.title} {paper.abstract} {' '.join(paper.keywords)}".lower()

        best_category = None
        best_keyword = None
        max_matches = 0

        for category, keywords in BIOLOGY_KEYWORDS.items():
            category_matches = 0
            best_keyword_in_category = None

            # A category-name hit weighs double.
            if category.lower() in text:
                category_matches += 2

            # Each keyword hit weighs one; remember the first hit.
            for keyword in keywords:
                if keyword.lower() in text:
                    category_matches += 1
                    if not best_keyword_in_category:
                        best_keyword_in_category = keyword

            # Keep the best-scoring category so far.
            if category_matches > max_matches:
                max_matches = category_matches
                best_category = category
                best_keyword = best_keyword_in_category or keywords[0]

        return best_category, best_keyword

    def _log_query_generation(self, paper: PaperInfo, queries: List[str]):
        """Print a one-shot log record of the generated queries."""
        log_info = {
            "paper_title": paper.title[:100] + "..." if len(paper.title) > 100 else paper.title,
            "paper_keywords": paper.keywords,
            "generated_queries": queries,
            "query_count": len(queries),
            "timestamp": time.time()
        }
        print(f"查询生成日志: {log_info}")

    def _call_llm(self, prompt: str, system_msg: str) -> Optional[str]:
        """Call the configured LLM (DashScope preferred, OpenAI as fallback).

        Returns the raw completion text, or ``None`` on any failure. Note the
        DashScope branch falls through to an implicit ``None`` when the API
        reports a non-200 status.
        """
        try:
            if DASHSCOPE_API_KEY:
                import dashscope
                dashscope.api_key = DASHSCOPE_API_KEY

                response = dashscope.Generation.call(
                    model="qwen-turbo",
                    messages=[
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": prompt}
                    ],
                    timeout=30
                )
                if response.status_code == 200:
                    return response.output.text
                else:
                    print(f"DashScope API错误: {response.message}")

            elif OPENAI_API_KEY:
                from openai import OpenAI
                client = OpenAI(api_key=OPENAI_API_KEY)
                response = client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": prompt}
                    ],
                    timeout=30
                )
                return response.choices[0].message.content

            else:
                print("未配置API密钥")
                return None

        except Exception as e:
            print(f"大模型调用错误: {str(e)}")
            return None
reviewer_recommendation/searcher.py ADDED
@@ -0,0 +1,1128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 学术检索模块
3
+ 提供基于EPMC、bioRxiv和OpenAlex的学术文献检索功能
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import time
9
+ import urllib.parse
10
+ import requests
11
+ import warnings
12
+ import ssl
13
+ from typing import List, Dict, Any, Optional
14
+ from itertools import combinations
15
+ from concurrent.futures import ThreadPoolExecutor, as_completed
16
+ import threading
17
+
18
+ # 抑制SSL警告
19
+ warnings.filterwarnings('ignore', message='Unverified HTTPS request')
20
+
21
+ from .models import PaperInfo, SearchResult
22
+
23
+
24
class OpenAlexSearcher:
    """OpenAlex academic searcher; provides high-quality citation-count data."""

    OPENALEX_URL = "https://api.openalex.org/works"

    def __init__(self, limit=50, sleep=0.1, timeout=30):
        # limit: default cap on returned candidate papers
        # sleep: politeness delay between requests (seconds); not used in search()
        # timeout: per-request timeout in seconds
        self.limit = limit
        self.sleep = sleep
        self.timeout = timeout
        # OpenAlex asks clients to self-identify with a mailto User-Agent.
        self.headers = {
            'User-Agent': 'AcademicReviewerSystem/1.0 (mailto:moahgzantony@gmail.com)'
        }

    def search(self, query: str, sort_by_citations: bool = False, years_after: int = None, target_count: int = None) -> List[Dict[str, Any]]:
        """Run an OpenAlex search and return normalized paper records.

        sort_by_citations: sort by cited_by_count instead of relevance.
        years_after: if given, only papers published in the last N years.
        target_count: if given, request that many records (capped at 200).
        Returns [] on any failure.
        """
        try:
            # Simplify the query: strip characters that can break the API.
            clean_query = query.replace(' AND ', ' ').replace('&', 'and').replace('(', '').replace(')', '')
            clean_query = ' '.join(clean_query.split())  # collapse whitespace

            # Decide how many records to request.
            if target_count is not None:
                per_page = min(target_count, 200)  # OpenAlex caps one page at 200
            else:
                per_page = min(self.limit, 20)  # default cap on candidates

            # Build query params (simplified; avoids `select` parameter issues).
            params = {
                "search": clean_query,
                "per-page": per_page
            }

            # Optional publication-year filter.
            if years_after is not None:
                from datetime import datetime
                current_year = datetime.now().year
                target_year = current_year - years_after
                # Must use OpenAlex's from_publication_date filter syntax.
                params["filter"] = f"from_publication_date:{target_year}-01-01"
                print(f"年份过滤: 只检索{target_year}年及以后发表的论文")
                print(f"日期过滤: {params['filter']}")

            # Choose the sort order.
            if sort_by_citations:
                params["sort"] = "cited_by_count:desc"
            else:
                params["sort"] = "relevance_score:desc"

            print(f"OpenAlex检索查询: {query} -> {clean_query}")
            print(f"排序方式: {'按引用量' if sort_by_citations else '按相关性'}")

            # Build the URL by hand so the ':' inside sort/filter values is
            # not percent-encoded (OpenAlex expects it literal).
            import urllib.parse

            query_parts = []
            for key, value in params.items():
                if (key == "sort" or key == "filter") and ":" in str(value):
                    # sort/filter keep their colon unescaped
                    query_parts.append(f"{key}={value}")
                else:
                    query_parts.append(f"{key}={urllib.parse.quote(str(value))}")

            query_string = "&".join(query_parts)
            full_url = f"{self.OPENALEX_URL}?{query_string}"

            print(f"完整URL: {full_url}")

            response = requests.get(
                full_url,
                headers=self.headers,
                timeout=self.timeout
            )
            response.raise_for_status()
            data = response.json()

            items = data.get("results", [])
            total_results = data.get("meta", {}).get("count", 0)

            print(f"OpenAlex检索到 {len(items)} 篇文献,总命中数: {total_results}")

            # Convert each raw record to the shared result format.
            results = []
            for item in items:
                result = self._convert_openalex_item(item, query)
                if result:
                    results.append(result)

            # Debug: show citation counts for the first few hits.
            if results:
                print(f"OpenAlex检索结果(按引用量排序):")
                for i, result in enumerate(results[:3]):
                    cited_count = result.get('citedByCount', 0)
                    title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
                    print(f" 文献 {i+1}: {title} (引用量: {cited_count})")

            return results

        except Exception as e:
            print(f"OpenAlex检索失败: {str(e)}")
            return []

    def _convert_openalex_item(self, item: Dict[str, Any], query: str) -> Optional[Dict[str, Any]]:
        """Convert one raw OpenAlex work into the standard result dict.

        Returns ``None`` for records without a title or on conversion errors.
        """
        try:
            # A title is mandatory.
            title = item.get('title', '')
            if not title:
                return None

            # Rebuild the abstract text from OpenAlex's inverted index
            # (word -> list of positions).
            abstract = ""
            abstract_inverted = item.get('abstract_inverted_index', {})
            if abstract_inverted:
                abstract_words = []
                for word, positions in abstract_inverted.items():
                    for pos in positions:
                        abstract_words.append((pos, word))
                abstract_words.sort(key=lambda x: x[0])
                abstract = ' '.join([word for pos, word in abstract_words])

            # Authors plus corresponding author/institution.
            authorships = item.get('authorships', [])
            authors = []
            corresponding_author = None
            corresponding_institution = None

            for authorship in authorships:
                author = authorship.get('author', {})
                if author:
                    author_name = author.get('display_name', '')
                    if author_name:
                        authors.append(author_name)

                    # Corresponding author: flagged as such, or the first author.
                    if authorship.get('is_corresponding', False) or len(authors) == 1:
                        corresponding_author = author_name

                        # First listed institution of that authorship.
                        # NOTE(review): nesting reconstructed from a mangled
                        # paste — confirm this block belongs inside the
                        # corresponding-author branch.
                        institutions = authorship.get('institutions', [])
                        if institutions:
                            institution = institutions[0].get('display_name', '')
                            if institution:
                                corresponding_institution = institution

            # Journal / venue name.
            primary_location = item.get('primary_location', {})
            source = primary_location.get('source', {})
            journal = source.get('display_name', '') if source else ''

            # Publication year.
            pub_year = item.get('publication_year', '')

            # Citation metrics.
            cited_by_count = item.get('cited_by_count', 0)
            citation_count = item.get('citation_count', 0)
            referenced_works_count = item.get('referenced_works_count', 0)

            # DOI, with the https://doi.org/ prefix stripped.
            doi = ""
            external_ids = item.get('ids', {})
            if external_ids:
                doi = external_ids.get('doi', '')
                if doi and doi.startswith('https://doi.org/'):
                    doi = doi.replace('https://doi.org/', '')

            # Assemble the normalized record.
            result = {
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'corresponding_author': corresponding_author,
                'corresponding_institution': corresponding_institution,
                'journal': journal,
                'publication_year': pub_year,
                'doi': doi,
                'citedByCount': cited_by_count,  # same field name EPMC uses
                'citation_count': citation_count,
                'referenced_works_count': referenced_works_count,
                'query_used': query,
                'source': 'openalex',
                'openalex_id': item.get('id', ''),
                'type': item.get('type', ''),
                'open_access': item.get('open_access', {}).get('is_oa', False)
            }

            return result

        except Exception as e:
            print(f"转换OpenAlex数据失败: {str(e)}")
            return None
216
+
217
+
218
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# SECURITY FIX: the DashScope API key was previously hard-coded in this file.
# It is now read from the environment (matching OPENAI_API_KEY below); the
# previously committed key must be revoked and rotated.
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")

# Biology keyword taxonomy: category name -> representative technique
# keywords. Used both for LLM prompt construction and heuristic matching.
BIOLOGY_KEYWORDS = {
    "Molecular & Structural Biology": [
        "Cryo-EM",
        "X-ray crystallography",
        "NMR spectroscopy",
        "Single-particle analysis",
        "Biolayer interferometry (BLI)",
        "Surface plasmon resonance (SPR)",
        "Confocal microscopy",
        "CRISPR-Cas9",
        "TALEN",
        "ZFN",
        "RNA interference (RNAi)",
        "Single-molecule imaging",
        "FRET",
        "Optogenetics"
    ],

    "Cell & Single-Cell Technologies": [
        "Single-cell RNA-seq (scRNA-seq)",
        "Single-cell ATAC-seq",
        "Spatial transcriptomics",
        "FISH (Fluorescence in situ hybridization)",
        "Immunofluorescence",
        "Tissue clearing (CLARITY)",
        "Flow cytometry (FACS)",
        "CyTOF (Mass cytometry)",
        "High-throughput screening",
        "Organoids",
        "3D cell culture",
        "Microfluidics"
    ],

    "Neuroscience Tools": [
        "Optogenetics",
        "DREADDs (Designer Receptors Exclusively Activated by Designer Drugs)",
        "GCaMP calcium imaging",
        "Two-photon microscopy",
        "Neural tracing",
        "Patch-seq",
        "Lineage tracing",
        "Spatial multi-omics"
    ],

    "Omics & Systems Biology": [
        "RNA sequencing (RNA-seq)",
        "Proteomics (LC-MS/MS)",
        "Metabolomics",
        "Epigenomics",
        "10x Genomics",
        "SMART-seq",
        "Nanopore sequencing",
        "Illumina HiSeq",
        "WGCNA",
        "Machine learning in omics",
        "scVelo"
    ],

    "Microbiome & Immunology": [
        "16S rRNA sequencing",
        "Metagenomics",
        "Gut-brain axis",
        "VDJ-seq",
        "TCR/BCR lineage tracing",
        "Immune checkpoints (PD-1, CTLA-4)",
        "mRNA vaccines",
        "DNA vaccines",
        "Nanoparticle vaccines",
        "Antigen presentation systems"
    ],

    "Development & Regeneration": [
        "Induced pluripotent stem cells (iPSCs)",
        "Embryonic stem cells (ESCs)",
        "Cellular reprogramming",
        "Wnt signaling",
        "Hippo pathway",
        "Notch signaling",
        "Zebrafish models",
        "C. elegans",
        "Mouse embryonic sections"
    ],

    "Ecology & Environmental Biology": [
        "Environmental DNA (eDNA)",
        "Remote sensing ecology",
        "Biosensors",
        "Ecological niche modeling (ENM)",
        "Genetic diversity analysis",
        "Captive breeding technologies"
    ],

    "Bioinformatics & AI Tools": [
        "Seurat",
        "Scanpy",
        "Monocle",
        "CIBERSORT",
        "GSEA",
        "AlphaFold",
        "RoseTTAFold",
        "Molecular docking",
        "STRING",
        "Cytoscape",
        "Gene Ontology (GO)",
        "KEGG pathway analysis"
    ]
}
330
+
331
+
332
class AcademicSearcher:
    """Base academic searcher: fetches raw records only, performs no analysis."""

    EPMC_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    BIORXIV_URL = "https://api.biorxiv.org/details/biorxiv/{doi}/na/json"

    def __init__(self, limit=50, sleep=0.1, timeout=30):
        # limit: cap on detailed results; sleep: politeness delay (seconds);
        # timeout: per-request timeout in seconds.
        self.limit = limit
        self.sleep = sleep
        self.timeout = timeout
        # Custom SSL context with verification disabled.
        # NOTE(review): this context is never passed to the requests calls
        # below (they use verify=False instead) — confirm it is still needed.
        self.ssl_context = ssl.create_default_context()
        self.ssl_context.check_hostname = False
        self.ssl_context.verify_mode = ssl.CERT_NONE

    def search(self, query: str, search_preprints: bool = True) -> List[Dict[str, Any]]:
        """Run a search and return raw paper records (``[]`` on failure)."""
        try:
            # 1. Get the hit list (with DOIs) from Europe PMC.
            epmc_results = self._epmc_search(query, search_preprints)

            # 2. Fetch per-DOI details from bioRxiv in parallel.
            detailed_results = self._get_details_parallel(epmc_results, query)

            return detailed_results
        except Exception as e:
            print(f"检索错误: {str(e)}")
            return []

    def _get_details_parallel(self, epmc_results: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
        """Fetch bioRxiv details for the EPMC hits in parallel."""
        detailed_results = []

        # Nothing to do for an empty hit list.
        if not epmc_results:
            return detailed_results

        # Cap concurrency to avoid hammering the API.
        max_workers = min(5, len(epmc_results))

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # One detail-fetch task per item that has a DOI.
            future_to_item = {}
            for item in epmc_results:
                doi = item.get("doi")
                if doi:
                    future = executor.submit(self._get_biorxiv_detail, doi)
                    future_to_item[future] = item

            # Collect results as they complete.
            for future in as_completed(future_to_item):
                item = future_to_item[future]
                try:
                    detail = future.result(timeout=10)  # 10 s per detail fetch
                    if detail:
                        detail["query_used"] = query
                        detailed_results.append(detail)

                        # Stop once the configured limit is reached.
                        if len(detailed_results) >= self.limit:
                            break

                except Exception as e:
                    print(f"获取详情失败: {item.get('doi')} - {str(e)}")
                    continue

        return detailed_results

    def _epmc_search(self, query: str, search_preprints: bool = True) -> List[Dict[str, Any]]:
        """Query Europe PMC and return the raw result list, citation-sorted."""
        if search_preprints:
            # Preprints only (bioRxiv DOIs start with 10.1101).
            query_str = f'(SRC:PPR) AND (DOI:10.1101*) AND ({query})'
        else:
            # Published papers (these carry citation-count data).
            query_str = f'({query})'

        params = {
            "query": query_str,
            "resultType": "core",
            "pageSize": str(min(50, self.limit)),  # cap the candidate count
            "format": "json",
            "sortby": "CITED+desc",  # most-cited first
        }

        # Up to 3 attempts with exponential backoff.
        for attempt in range(3):
            try:
                response = requests.get(
                    self.EPMC_URL,
                    params=params,
                    timeout=self.timeout,
                    verify=False,  # SSL verification disabled
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                    }
                )
                response.raise_for_status()
                data = response.json()
                results = data.get("resultList", {}).get("result", [])

                # Debug: show hit count and citation counts of the top hits.
                if results:
                    print(f"EPMC检索到 {len(results)} 篇文献,按引用量排序")
                    for i, result in enumerate(results[:3]):
                        cited_count = result.get('citedByCount', 0)
                        title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
                        print(f" 文献 {i+1}: {title} (引用量: {cited_count})")

                return results

            except requests.exceptions.SSLError as e:
                print(f"EPMC SSL错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC SSL连接失败,返回空结果")
                    return []
                time.sleep(2 ** attempt)

            except requests.exceptions.RequestException as e:
                print(f"EPMC请求错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC请求失败,返回空结果")
                    return []
                time.sleep(2 ** attempt)

            except Exception as e:
                print(f"EPMC未知错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:
                    print("EPMC未知错误,返回空结果")
                    return []
                time.sleep(2 ** attempt)

        return []

    def _get_biorxiv_detail(self, doi: str) -> Optional[Dict[str, Any]]:
        """Fetch bioRxiv details for *doi*; ``None`` if unavailable."""
        url = self.BIORXIV_URL.format(doi=urllib.parse.quote(doi))

        # Up to 3 attempts with exponential backoff.
        for attempt in range(3):
            try:
                # Lenient SSL handling plus a generous timeout.
                response = requests.get(
                    url,
                    timeout=self.timeout,
                    verify=False,  # SSL verification disabled to dodge SSL errors
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                    }
                )
                response.raise_for_status()
                data = response.json()

                # The API answers under either "collection" or "records".
                records = data.get("collection") or data.get("records") or []
                if not records:
                    return None

                # The last record is the most recent version of the preprint.
                latest_record = records[-1]
                if latest_record.get("server") and latest_record["server"].lower() != "biorxiv":
                    return None

                version = latest_record.get("version") or 1
                return {
                    "doi": latest_record.get("doi"),
                    "title": latest_record.get("title"),
                    "abstract": latest_record.get("abstract"),
                    "corresponding_author": latest_record.get("author_corresponding"),
                    "corresponding_institution": latest_record.get("author_corresponding_institution"),
                    "url": f"https://www.biorxiv.org/content/{latest_record['doi']}v{version}"
                }

            except requests.exceptions.SSLError as e:
                print(f"bioRxiv SSL错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # last attempt
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

            except requests.exceptions.RequestException as e:
                print(f"bioRxiv请求错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # last attempt
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

            except Exception as e:
                print(f"bioRxiv未知错误 (尝试 {attempt + 1}/3): {str(e)}")
                if attempt == 2:  # last attempt
                    print(f"跳过DOI {doi} 的详细信息获取")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff

        return None
525
+
526
+
527
+ class DynamicAcademicSearcher:
528
+ """动态学术检索器,包含智能查询生成、动态处理和扩展功能"""
529
+
530
+ def __init__(self, base_searcher: AcademicSearcher = None, openalex_searcher: OpenAlexSearcher = None):
531
+ self.base_searcher = base_searcher
532
+ self.openalex_searcher = openalex_searcher
533
+ # 如果没有提供任何检索器,创建默认的
534
+ if not self.base_searcher and not self.openalex_searcher:
535
+ self.openalex_searcher = OpenAlexSearcher()
536
+
537
+ def search_with_dynamic_queries(self, paper: PaperInfo, num_reviewers: int = 8, years_after: int = None) -> tuple:
538
+ """使用动态查询进行单通道检索"""
539
+ # 1. 生成检索查询(只生成一个查询)
540
+ queries = self.generate_search_queries(paper, 1)
541
+ print("生成的检索查询:")
542
+ for i, query in enumerate(queries, 1):
543
+ print(f"查询 {i}: {query}")
544
+
545
+ # 记录查询生成日志
546
+ self._log_query_generation(paper, queries)
547
+
548
+ # 2. 根据可用的检索器选择检索策略
549
+ if self.openalex_searcher:
550
+ return self._search_with_openalex_single_channel(queries[0], num_reviewers, years_after)
551
+ elif self.base_searcher:
552
+ return self._search_with_epmc_single_channel(queries[0], num_reviewers)
553
+ else:
554
+ print("错误:没有可用的检索器")
555
+ return [], []
556
+
557
+ def _search_with_openalex_single_channel(self, query: str, num_reviewers: int, years_after: int = None) -> tuple:
558
+ """使用OpenAlex进行单通道检索"""
559
+ # 计算需要检索的文献数量(目标审稿人数量的5倍)
560
+ target_count = num_reviewers * 5
561
+ print(f"单通道检索:目标审稿人数量 {num_reviewers},检索文献数量 {target_count}")
562
+
563
+ # 执行检索
564
+ candidates = self._execute_openalex_query(query, sort_by_citations=False, years_after=years_after, target_count=target_count)
565
+
566
+ print(f"单通道检索完成,获得 {len(candidates)} 个候选审稿人")
567
+
568
+ # 返回单个通道的结果(为了保持兼容性,返回三个相同的通道)
569
+ return candidates, candidates, candidates
570
+
571
+ def _search_with_epmc_single_channel(self, query: str, num_reviewers: int) -> tuple:
572
+ """使用EPMC进行单通道检索"""
573
+ # 计算需要检索的文献数量(目标审稿人数量的5倍)
574
+ target_count = num_reviewers * 5
575
+ print(f"单通道检索:目标审稿人数量 {num_reviewers},检索文献数量 {target_count}")
576
+
577
+ # 执行检索
578
+ candidates = self._execute_single_query(query, search_preprints=True, target_count=target_count)
579
+
580
+ print(f"单通道检索完成,获得 {len(candidates)} 个候选审稿人")
581
+
582
+ # 返回单个通道的结果(为了保持兼容性,返回三个相同的通道)
583
+ return candidates, candidates, candidates
584
+
585
    def _search_with_openalex(self, queries: List[str], years_after: int = None) -> tuple:
        """Three-channel OpenAlex search.

        Runs up to three queries in parallel (padding with generic
        fallback queries when fewer than three are supplied), prints a
        summary of each channel, and returns the three candidate lists.

        NOTE(review): every channel calls _execute_openalex_query with
        sort_by_citations=False, yet channel 1 is labelled "高引用量"
        (high citation) — confirm whether channel 1 should sort by
        citations instead.
        """
        channel1_candidates = []  # channel labelled high-citation (see NOTE above)
        channel2_candidates = []  # relevance-sorted channel
        channel3_candidates = []  # relevance-sorted channel

        # Ensure at least 3 queries by padding with generic fallbacks.
        if len(queries) < 3:
            print(f"警告:查询数量不足({len(queries)}/3),将使用备用查询")
            queries = queries + ["cryo-em structure", "cryo-em structure analysis"] * (3 - len(queries))

        # Execute the queries in parallel.
        with ThreadPoolExecutor(max_workers=6) as executor:
            # Map each future back to its (query, channel label).
            future_to_query = {}

            if len(queries) >= 1:
                future1 = executor.submit(self._execute_openalex_query, queries[0], sort_by_citations=False, years_after=years_after)
                future_to_query[future1] = (queries[0], "高引用量")

            # Channel 2: second query, relevance-sorted.
            if len(queries) >= 2:
                future2 = executor.submit(self._execute_openalex_query, queries[1], sort_by_citations=False, years_after=years_after)
                future_to_query[future2] = (queries[1], "相关性2")

            # Channel 3: third query, relevance-sorted.
            if len(queries) >= 3:
                future3 = executor.submit(self._execute_openalex_query, queries[2], sort_by_citations=False, years_after=years_after)
                future_to_query[future3] = (queries[2], "相关性3")

            # Collect results as they complete; a failed query only loses
            # its own channel's results.
            for future in as_completed(future_to_query):
                query, search_type = future_to_query[future]
                try:
                    results = future.result()
                    if search_type == "高引用量":
                        channel1_candidates.extend(results)
                    elif search_type == "相关性2":
                        channel2_candidates.extend(results)
                    elif search_type == "相关性3":
                        channel3_candidates.extend(results)
                except Exception as e:
                    print(f"查询失败 {query} ({search_type}): {str(e)}")

        # Display channel 1 results.
        print(f"\n通道1(高引用量排序)的检索结果:")
        if channel1_candidates and len(queries) >= 1:
            print(f"查询: \"{queries[0]}\" (按引用量)")
            # Show the first 3 papers.
            for j, result in enumerate(channel1_candidates[:3], 1):
                cited_count = result.get('citedByCount', 0)
                title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
                print(f"文献{j}: {title} (引用量: {cited_count})")

        # Display channel 2 results.
        print(f"\n通道2(相关性排序)的检索结果:")
        if channel2_candidates and len(queries) >= 2:
            print(f"查询: \"{queries[1]}\" (按相关性)")
            # Show the first 3 papers.
            for j, result in enumerate(channel2_candidates[:3], 1):
                cited_count = result.get('citedByCount', 0)
                title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
                print(f"文献{j}: {title} (引用量: {cited_count})")

        # Display channel 3 results.
        print(f"\n通道3(相关性排序)的检索结果:")
        if channel3_candidates and len(queries) >= 3:
            print(f"查询: \"{queries[2]}\" (按相关性)")
            # Show the first 3 papers.
            for j, result in enumerate(channel3_candidates[:3], 1):
                cited_count = result.get('citedByCount', 0)
                title = result.get('title', 'N/A')[:50] + '...' if len(result.get('title', '')) > 50 else result.get('title', 'N/A')
                print(f"文献{j}: {title} (引用量: {cited_count})")

        # Print the de-duplicated candidate reviewer list of each channel.
        self._print_candidate_reviewers("通道1(高引用量)", channel1_candidates)
        self._print_candidate_reviewers("通道2(相关性)", channel2_candidates)
        self._print_candidate_reviewers("通道3(高相关性)", channel3_candidates)

        print(f"\nOpenAlex检索完成 - 通道1: {len(channel1_candidates)} 篇, 通道2: {len(channel2_candidates)} 篇, 通道3: {len(channel3_candidates)} 篇")
        return channel1_candidates, channel2_candidates, channel3_candidates
+ def _print_candidate_reviewers(self, channel_name: str, candidates: List[Dict[str, Any]]):
666
+ """打印候选审稿人列表"""
667
+ if not candidates:
668
+ print(f"\n{channel_name}候选审稿人: 无")
669
+ return
670
+
671
+ print(f"\n{channel_name}候选审稿人:")
672
+ seen_reviewers = set()
673
+
674
+ for i, candidate in enumerate(candidates, 1):
675
+ corresponding_author = candidate.get('corresponding_author', '')
676
+ corresponding_institution = candidate.get('corresponding_institution', '')
677
+ title = candidate.get('title', '')
678
+
679
+ if corresponding_author:
680
+ # 创建审稿人标识符用于去重
681
+ author_lower = corresponding_author.lower()
682
+ institution_lower = (corresponding_institution or "未知机构").lower()
683
+ reviewer_key = f"{author_lower}_{institution_lower}"
684
+
685
+ if reviewer_key not in seen_reviewers:
686
+ seen_reviewers.add(reviewer_key)
687
+ title_short = title[:60] + '...' if len(title) > 60 else title
688
+ print(f" {len(seen_reviewers)}. {corresponding_author} ({corresponding_institution})")
689
+ print(f" 论文: {title_short}")
690
+
691
+ print(f" 总计: {len(seen_reviewers)} 位候选审稿人")
692
+
693
+ def _execute_openalex_query(self, query: str, sort_by_citations: bool = False, years_after: int = None, target_count: int = None) -> List[Dict[str, Any]]:
694
+ """执行单个OpenAlex查询"""
695
+ try:
696
+ return self.openalex_searcher.search(query, sort_by_citations=sort_by_citations, years_after=years_after, target_count=target_count)
697
+ except Exception as e:
698
+ print(f"OpenAlex查询执行失败: {str(e)}")
699
+ return []
700
+
701
+ def _search_with_epmc(self, queries: List[str]) -> tuple:
702
+ """使用EPMC进行双数据源检索"""
703
+ channel1_candidates = [] # 已发表论文(高引用量)
704
+ channel2_candidates = [] # 预印本(最新研��)
705
+
706
+ # 使用线程池并行执行查询
707
+ with ThreadPoolExecutor(max_workers=4) as executor:
708
+ # 提交所有查询任务
709
+ future_to_query = {}
710
+ for i, query in enumerate(queries):
711
+ # 通道1:检索已发表论文
712
+ future1 = executor.submit(self._execute_single_query, query, search_preprints=False)
713
+ future_to_query[future1] = (query, "已发表论文")
714
+
715
+ # 通道2:检索预印本
716
+ future2 = executor.submit(self._execute_single_query, query, search_preprints=True)
717
+ future_to_query[future2] = (query, "预印本")
718
+
719
+ # 收集结果
720
+ for future in as_completed(future_to_query):
721
+ query, data_source = future_to_query[future]
722
+ try:
723
+ results = future.result(timeout=30) # 30秒超时
724
+ if results:
725
+ print(f"查询 '{query}' ({data_source}) 完成,找到 {len(results)} 篇文献")
726
+ if data_source == "已发表论文":
727
+ channel1_candidates.extend(results)
728
+ else:
729
+ channel2_candidates.extend(results)
730
+ else:
731
+ print(f"查询 '{query}' ({data_source}) 未找到文献")
732
+ except Exception as e:
733
+ print(f"查询 '{query}' ({data_source}) 执行失败: {str(e)}")
734
+
735
+ # 3. 去重相同文献
736
+ unique_channel1 = {item['doi']: item for item in channel1_candidates if item.get('doi')}.values()
737
+ unique_channel2 = {item['doi']: item for item in channel2_candidates if item.get('doi')}.values()
738
+
739
+ print(f"双数据源检索完成:已发表论文 {len(list(unique_channel1))} 篇,预印本 {len(list(unique_channel2))} 篇")
740
+ return list(unique_channel1), list(unique_channel2)
741
+
742
+ def _execute_single_query(self, query: str, search_preprints: bool = True, target_count: int = None) -> List[Dict[str, Any]]:
743
+ """执行单个查询(用于并行处理)"""
744
+ data_source = "预印本" if search_preprints else "已发表论文"
745
+ print(f"开始执行查询: {query} ({data_source})")
746
+
747
+ # 动态查询处理
748
+ processed_queries = self._process_query_dynamically(query)
749
+
750
+ for processed_query in processed_queries:
751
+ print(f" 尝试查询: {processed_query}")
752
+ results = self.base_searcher.search(processed_query, search_preprints)
753
+
754
+ if results:
755
+ print(f" 找到 {len(results)} 篇文献")
756
+ return results
757
+ else:
758
+ print(f" 未找到文献,尝试扩展查询...")
759
+
760
+ print(f" 所有扩展查询都未找到文献")
761
+ return []
762
+
763
    def generate_search_queries(self, paper: PaperInfo, num_queries: int = 2) -> List[str]:
        """Generate retrieval queries for the paper via the LLM.

        Prompts the LLM for a single boolean query of the form
        "(subject) AND (components) AND (methods)", validates it with
        _validate_new_queries, and repeats the validated query to reach
        num_queries entries.  Falls back to _generate_backup_queries when
        the LLM call fails or validation rejects the query.

        Args:
            paper: Paper whose title/abstract/keywords seed the prompt.
            num_queries: Number of query strings to return.

        Returns:
            List of num_queries query strings (possibly repeated).
        """
        system_msg = "你是学术检索专家,擅长从论文中提取出多个维度的关键词"

        prompt = f"""
        请分析以下论文,提取关键信息并生成1个精准的检索查询:

        论文信息:
        标题: {paper.title}
        摘要: {paper.abstract}
        关键词: {', '.join(paper.keywords)}

        分析任务:
        请从论文中识别以下三个维度的关键信息:

        1. 一个研究主体 (Research Subject)
        - 论文研究的核心对象、分子、蛋白质、疾病等
        - 例如:Nav1.7、COVID-19、dopamine、insulin等

        2. 关键组分 (Key Components)
        - 与研究主体相关的亚单位、配体、抑制剂、调节因子等
        - 例如:β1亚单位、Protoxin-II、receptor、agonist等

        3. 研究方法 (Research Method)
        - 论文使用的核心技术、实验方法、分析手段等
        - 例如:Cryo-EM、CRISPR、NMR、patch-clamp等

        查询生成规则:
        - 使用布尔运算符AND和OR构建精确查询
        - 每个维度内使用OR连接同义词或相关术语
        - 不同维度间使用AND连接
        - 优先选择最核心、最特异的术语
        - 避免过于宽泛的通用词汇

        输出要求:
        请生成1个检索查询,格式如下:
        (研究主体) AND (关键组分1 OR 关键组分2) AND (研究方法1 OR 研究方法2)

        示例输出:
        (Nav1.7) AND (β1 OR Protoxin-II) AND (cryo-EM OR cryo-electron microscopy)
        """

        response = self._call_llm(prompt.strip(), system_msg)
        if not response:
            return self._generate_backup_queries(paper, num_queries)

        # Parse the (single) query returned by the LLM.
        query = response.strip()
        print(f"LLM原始返回的查询: {query}")

        # Validate the query format.
        validated_queries = self._validate_new_queries([query])
        print(f"验证后的查询数量: {len(validated_queries)}")

        # Fall back to backup queries when validation rejects the query.
        if len(validated_queries) == 0:
            print(f"查询验证失败,使用备用查询")
            backup_queries = self._generate_backup_queries(paper, num_queries)
            print(f"备用查询: {backup_queries}")
            return backup_queries

        # Return the validated query, repeated to satisfy num_queries.
        result_queries = validated_queries[:1]  # keep only the first query
        if num_queries > 1:
            # Several channels expect distinct queries; reuse the same one.
            result_queries = result_queries * num_queries
            print(f"重复使用查询以满足数量要求: {result_queries}")

        return result_queries
+ def _validate_new_queries(self, queries: List[str]) -> List[str]:
834
+ """验证新格式查询(单查询格式)"""
835
+ validated_queries = []
836
+
837
+ for query in queries:
838
+ # 基本格式检查
839
+ if not query or len(query.strip()) < 10:
840
+ print(f"查询太短,跳过: {query}")
841
+ continue
842
+
843
+ # 检查是否包含AND操作符(新格式要求)
844
+ if ' AND ' not in query:
845
+ print(f"查询缺少AND操作符,跳过: {query}")
846
+ continue
847
+
848
+ # 检查是否包含括号(新格式要求)
849
+ if '(' not in query or ')' not in query:
850
+ print(f"查询缺少括号,跳过: {query}")
851
+ continue
852
+
853
+ # 检查是否包含OR操作符(新格式要求)
854
+ if ' OR ' not in query:
855
+ print(f"查询缺少OR操作符,跳过: {query}")
856
+ continue
857
+
858
+ # 检查查询长度合理性
859
+ if len(query) > 200: # 查询过长
860
+ print(f"查询过长,跳过: {query}")
861
+ continue
862
+
863
+ # 检查是否包含生物学关键词分类
864
+ has_biology_keyword = False
865
+ for category, keywords in BIOLOGY_KEYWORDS.items():
866
+ if category.lower() in query.lower():
867
+ has_biology_keyword = True
868
+ break
869
+ for keyword in keywords:
870
+ if keyword.lower() in query.lower():
871
+ has_biology_keyword = True
872
+ break
873
+ if has_biology_keyword:
874
+ break
875
+
876
+ if not has_biology_keyword:
877
+ print(f"查询不包含生物学关键词分类,跳过: {query}")
878
+ continue
879
+
880
+ validated_queries.append(query.strip())
881
+ print(f"查询验证通过: {query}")
882
+
883
+ return validated_queries
884
+
885
+ def _validate_queries(self, queries: List[str]) -> List[str]:
886
+ """验证查询格式和质量"""
887
+ validated_queries = []
888
+
889
+ for query in queries:
890
+ # 基本格式检查
891
+ if not query or len(query.strip()) < 5:
892
+ print(f"查询太短,跳过: {query}")
893
+ continue
894
+
895
+ # 检查是否包含AND连接符
896
+ if ' AND ' not in query:
897
+ print(f"查询缺少AND连接符,跳过: {query}")
898
+ continue
899
+
900
+ # 检查是否只包含两个关键词(主要学科 AND 研究层面关键词)
901
+ parts = query.split(' AND ')
902
+ if len(parts) != 2:
903
+ print(f"查询格式不正确,跳过: {query}")
904
+ continue
905
+
906
+ # 检查每个部分是否有效
907
+ part1 = parts[0].strip()
908
+ part2 = parts[1].strip()
909
+
910
+ if not part1 or not part2:
911
+ print(f"查询包含空部分,跳过: {query}")
912
+ continue
913
+
914
+ if part1.upper() == 'AND' or part2.upper() == 'AND':
915
+ print(f"查询包含无效AND,跳过: {query}")
916
+ continue
917
+
918
+ # 检查是否包含生物学关键词分类
919
+ has_biology_keyword = False
920
+ for category, keywords in BIOLOGY_KEYWORDS.items():
921
+ if category.lower() in query.lower():
922
+ has_biology_keyword = True
923
+ break
924
+ for keyword in keywords:
925
+ if keyword.lower() in query.lower():
926
+ has_biology_keyword = True
927
+ break
928
+ if has_biology_keyword:
929
+ break
930
+
931
+ if not has_biology_keyword:
932
+ print(f"查询不包含生物学关键词分类,跳过: {query}")
933
+ continue
934
+
935
+ # 检查查询长度合理性
936
+ if len(query) > 100: # 查询过长
937
+ print(f"查询过长,跳过: {query}")
938
+ continue
939
+
940
+ validated_queries.append(query.strip())
941
+ print(f"查询验证通过: {query}")
942
+
943
+ return validated_queries
944
+
945
+ def _process_query_dynamically(self, query: str) -> List[str]:
946
+ """动态处理查询,生成多个变体"""
947
+ # 基础查询
948
+ queries = [query]
949
+
950
+ # 检查查询格式是否正确
951
+ if ' AND ' not in query:
952
+ return queries
953
+
954
+ # 按AND分割查询
955
+ parts = query.split(' AND ')
956
+ if len(parts) != 2:
957
+ return queries
958
+
959
+ # 清理每个部分
960
+ part1 = parts[0].strip()
961
+ part2 = parts[1].strip()
962
+
963
+ # 如果某个部分为空或只包含AND,跳过
964
+ if not part1 or not part2 or part1.upper() == 'AND' or part2.upper() == 'AND':
965
+ return queries
966
+
967
+ # 简化查询(只保留主要关键词)
968
+ part1_words = part1.split()
969
+ part2_words = part2.split()
970
+
971
+ if len(part1_words) > 1:
972
+ # 取第一个部分的主要关键词
973
+ simplified_part1 = part1_words[0]
974
+ queries.append(f"{simplified_part1} AND {part2}")
975
+
976
+ if len(part2_words) > 1:
977
+ # 取第二个部分的主要关键词
978
+ simplified_part2 = part2_words[0]
979
+ queries.append(f"{part1} AND {simplified_part2}")
980
+
981
+ # 单个关键词查询
982
+ queries.append(part1)
983
+ queries.append(part2)
984
+
985
+ return list(set(queries)) # 去重
986
+
987
+ def _generate_backup_queries(self, paper: PaperInfo, num_queries: int) -> List[str]:
988
+ """生成备用查询,使用新格式"""
989
+ queries = []
990
+
991
+ # 从论文标题和摘要中提取关键词
992
+ text = f"{paper.title} {paper.abstract}".lower()
993
+
994
+ # 常见技术关键词列表
995
+ tech_keywords = [
996
+ "cryo-em", "cryoem", "x-ray", "xray", "nmr", "crispr", "pcr", "western blot",
997
+ "immunofluorescence", "confocal", "flow cytometry", "mass spectrometry",
998
+ "chromatography", "electrophoresis", "microscopy", "spectroscopy"
999
+ ]
1000
+
1001
+ # 查找技术关键词
1002
+ found_tech_keyword = None
1003
+ for keyword in tech_keywords:
1004
+ if keyword in text:
1005
+ found_tech_keyword = keyword
1006
+ break
1007
+
1008
+ # 如果没有找到技术关键词,使用默认值
1009
+ if not found_tech_keyword:
1010
+ found_tech_keyword = "cryo-em"
1011
+
1012
+ # 查询1:纯子类关键词
1013
+ queries.append(found_tech_keyword)
1014
+
1015
+ # 查询2:子类关键词 + 子子类关键词
1016
+ queries.append(f"{found_tech_keyword} structure")
1017
+
1018
+ # 从标题中提取特定术语
1019
+ title_words = paper.title.split()
1020
+ specific_term = None
1021
+ for word in title_words:
1022
+ # 过滤掉常见词汇,寻找有意义的术语
1023
+ if (len(word) > 3 and
1024
+ word.lower() not in ['the', 'and', 'for', 'with', 'from', 'this', 'that',
1025
+ 'structures', 'human', 'channel', 'complex', 'with',
1026
+ 'auxiliary', 'subunits', 'animal', 'toxins', 'analysis',
1027
+ 'study', 'research', 'investigation', 'characterization']):
1028
+ specific_term = word
1029
+ break
1030
+
1031
+ if specific_term:
1032
+ # 查询3:子类关键词 + 子子类关键词 + 论文特定术语
1033
+ queries.append(f"{found_tech_keyword} structure {specific_term}")
1034
+ else:
1035
+ # 如果没有找到特定术语,使用第一个查询的变体
1036
+ queries.append(f"{found_tech_keyword} structure analysis")
1037
+
1038
+ # 确保总是返回所需数量的查询
1039
+ while len(queries) < num_queries:
1040
+ # 如果还需要更多查询,添加变体
1041
+ variant_num = len(queries) + 1
1042
+ queries.append(f"{found_tech_keyword} analysis")
1043
+
1044
+ print(f"备用查询生成完成,共 {len(queries)} 个查询")
1045
+ return queries[:num_queries]
1046
+
1047
+ def _infer_biology_keywords(self, paper: PaperInfo) -> tuple:
1048
+ """从论文内容推断最相关的生物学分类和关键词"""
1049
+ text = f"{paper.title} {paper.abstract} {' '.join(paper.keywords)}".lower()
1050
+
1051
+ best_category = None
1052
+ best_keyword = None
1053
+ max_matches = 0
1054
+
1055
+ for category, keywords in BIOLOGY_KEYWORDS.items():
1056
+ category_matches = 0
1057
+ best_keyword_in_category = None
1058
+
1059
+ # 检查类别名称匹配
1060
+ if category.lower() in text:
1061
+ category_matches += 2
1062
+
1063
+ # 检查关键词匹配
1064
+ for keyword in keywords:
1065
+ if keyword.lower() in text:
1066
+ category_matches += 1
1067
+ if not best_keyword_in_category:
1068
+ best_keyword_in_category = keyword
1069
+
1070
+ # ���新最佳匹配
1071
+ if category_matches > max_matches:
1072
+ max_matches = category_matches
1073
+ best_category = category
1074
+ best_keyword = best_keyword_in_category or keywords[0]
1075
+
1076
+ return best_category, best_keyword
1077
+
1078
+ def _log_query_generation(self, paper: PaperInfo, queries: List[str]):
1079
+ """记录查询生成日志"""
1080
+ log_info = {
1081
+ "paper_title": paper.title[:100] + "..." if len(paper.title) > 100 else paper.title,
1082
+ "paper_keywords": paper.keywords,
1083
+ "generated_queries": queries,
1084
+ "query_count": len(queries),
1085
+ "timestamp": time.time()
1086
+ }
1087
+ print(f"查询生成日志: {log_info}")
1088
+
1089
    def _call_llm(self, prompt: str, system_msg: str) -> Optional[str]:
        """Call the configured LLM provider (DashScope first, then OpenAI).

        Args:
            prompt: User-turn content.
            system_msg: System-turn content.

        Returns:
            The raw completion text, or None when no API key is
            configured, the provider returns an error status, or the
            call raises.  NOTE: a non-200 DashScope response prints the
            error and falls through, implicitly returning None.
        """
        try:
            if DASHSCOPE_API_KEY:
                # Imported lazily so the dependency is only needed when used.
                import dashscope
                dashscope.api_key = DASHSCOPE_API_KEY

                response = dashscope.Generation.call(
                    model="qwen-turbo-latest",
                    messages=[
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": prompt}
                    ],
                    timeout=30
                )
                if response.status_code == 200:
                    return response.output.text
                else:
                    print(f"DashScope API错误: {response.message}")

            elif OPENAI_API_KEY:
                # OpenAI is the fallback provider.
                from openai import OpenAI
                client = OpenAI(api_key=OPENAI_API_KEY)
                response = client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": prompt}
                    ],
                    timeout=30
                )
                return response.choices[0].message.content

            else:
                print("未配置API密钥")
                return None

        except Exception as e:
            # Any provider/network failure is reported and mapped to None.
            print(f"大模型调用错误: {str(e)}")
            return None
reviewer_recommendation/utils.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 工具函数模块
3
+ 提供错误处理、状态管理和通用工具函数
4
+ """
5
+
6
+ import time
7
+ import logging
8
+ from typing import Optional, Dict, Any
9
+ from functools import wraps
10
+
11
+ from .models import AppState, RecommendationResponse
12
+
13
+
14
+ # 配置日志
15
+ logging.basicConfig(
16
+ level=logging.INFO,
17
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
18
+ )
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def handle_api_errors(func):
    """Decorator that logs any exception from an API call, then re-raises it."""
    @wraps(func)
    def wrapped(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as exc:
            # Record the failure with the wrapped function's name, then propagate.
            logger.error(f"API调用错误 in {func.__name__}: {str(exc)}")
            raise
    return wrapped
def validate_paper_info(title: str, abstract: str, keywords: str) -> tuple[bool, str]:
    """Validate user-supplied paper metadata.

    Checks, in order: non-blank title, non-blank abstract, abstract of at
    least 50 characters, title of at least 10 characters.  The keywords
    argument is accepted for interface compatibility but not validated.

    Returns:
        (ok, message) -- message is "" when validation passes.
    """
    clean_title = (title or "").strip()
    clean_abstract = (abstract or "").strip()

    if not clean_title:
        return False, "论文标题不能为空"
    if not clean_abstract:
        return False, "论文摘要不能为空"
    if len(clean_abstract) < 50:
        return False, "论文摘要至少需要50个字符"
    if len(clean_title) < 10:
        return False, "论文标题至少需要10个字符"
    return True, ""
def validate_reviewer_count(count: int) -> tuple[bool, str]:
    """Check that the requested reviewer count lies in the allowed 1-10 range.

    Returns:
        (ok, message) -- message is "" when the count is valid.
    """
    if not 1 <= count <= 10:
        message = "推荐审稿人数量至少为1" if count < 1 else "推荐审稿人数量不能超过10"
        return False, message
    return True, ""
def format_error_message(error: Exception) -> str:
    """Map an exception to a user-facing error string.

    Well-known failure classes (timeout / API / JSON, matched on the
    lower-cased message, in that order) get friendly messages; everything
    else falls back to a generic message with the exception type.
    """
    message = str(error)
    lowered = message.lower()

    if "timeout" in lowered:
        return "请求超时,请稍后重试"
    if "api" in lowered:
        return "API调用失败,请检查网络连接"
    if "json" in lowered:
        return "数据解析错误,请重试"
    return f"系统错误 ({type(error).__name__}): {message}"
def create_error_response(error: Exception, search_time: float = 0.0) -> RecommendationResponse:
    """Build a failed RecommendationResponse carrying a formatted error message."""
    message = format_error_message(error)
    return RecommendationResponse(
        reviewers=[],
        search_time=search_time,
        total_candidates=0,
        success=False,
        error_message=message,
    )
def update_app_state(state: AppState, **kwargs) -> AppState:
    """Copy the given keyword values onto *state*, skipping unknown attributes.

    Only attributes that already exist on the state object are updated;
    the (mutated) state is returned for convenience.
    """
    for name, value in kwargs.items():
        if hasattr(state, name):
            setattr(state, name, value)
    return state
def log_operation(operation: str, **kwargs):
    """Emit an info-level structured log entry for *operation*.

    Any extra keyword arguments are merged into the logged record.
    """
    entry = {
        "operation": operation,
        "timestamp": time.time(),
        **kwargs
    }
    logger.info(f"操作日志: {entry}")
def sanitize_input(text: str) -> str:
    """Collapse whitespace and cap the input at 10 000 characters.

    Falsy input yields "".  Inputs longer than the cap are truncated and
    suffixed with "...".
    """
    if not text:
        return ""

    # Collapse every run of whitespace to a single space.
    normalized = " ".join(text.split())

    # Truncate overly long input, marking the cut with an ellipsis.
    return normalized if len(normalized) <= 10000 else normalized[:10000] + "..."
def extract_keywords(text: str) -> list[str]:
    """Split a comma-separated keyword string into a clean keyword list.

    Entries are stripped, keywords shorter than 2 characters dropped, and
    duplicates removed.

    Fix: the original de-duplicated with list(set(...)), which returned
    the keywords in arbitrary (hash-dependent) order; dict.fromkeys keeps
    first-seen order so the result is deterministic.

    Returns:
        Ordered list of unique keywords; [] for falsy input.
    """
    if not text:
        return []

    # Split on commas and strip surrounding whitespace, dropping blanks.
    keywords = [kw.strip() for kw in text.split(',') if kw.strip()]

    # Drop keywords too short to be meaningful.
    keywords = [kw for kw in keywords if len(kw) >= 2]

    # De-duplicate while preserving first-seen order (bug fix).
    return list(dict.fromkeys(keywords))
def format_search_progress(current: int, total: int, step: str) -> str:
    """Render a human-readable progress line, guarding against total == 0."""
    pct = current / total * 100 if total > 0 else 0
    return f"搜索进度: {current}/{total} ({pct:.1f}%) - {step}"
def validate_api_keys() -> tuple[bool, str]:
    """Check that at least one LLM API key is configured.

    Keys are read from environment variables only.  Security fix: the
    previous revision hard-coded a live DashScope key in source, which
    both leaked the secret and made the check always succeed; secrets
    must come from the environment (OPENAI_API_KEY / DASHSCOPE_API_KEY).

    Returns:
        (ok, message) -- ok is True when at least one key is set.
    """
    import os

    openai_key = os.getenv("OPENAI_API_KEY")
    dashscope_key = os.getenv("DASHSCOPE_API_KEY")

    if not openai_key and not dashscope_key:
        return False, "未配置任何API密钥"

    return True, "API密钥配置正常"