Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
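| Check the accuracy of the Crossref API ``is-referenced-by-count`` field. | |
| 1. Search by relevance, then sort the returned works locally by is-referenced-by-count. | |
| 2. Look each work up again by DOI and compare the citation count reported there. | |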
| """ | |
import json
import time
from typing import Any, Dict, List, Optional
from urllib.parse import quote

import requests
class CrossrefCitationVerifier:
    """Cross-check Crossref ``is-referenced-by-count`` values.

    A free-text search is run against the Crossref ``/works`` endpoint, the
    hits are ordered locally by citation count, and every hit is then fetched
    again by DOI so the two reported citation counts can be compared.
    All progress and statistics are printed to stdout.
    """

    # Seconds to pause after each per-DOI lookup and between whole queries,
    # so the public API is not hit in a tight loop.
    DOI_LOOKUP_DELAY = 0.5
    QUERY_DELAY = 1.0
    # Upper bound used for the ``rows`` request parameter.
    MAX_ROWS = 1000
    # Queries used by ``test_multiple_queries`` when none are supplied.
    DEFAULT_QUERIES = (
        "machine learning",
        "CRISPR",
        "cryo-electron microscopy",
        "artificial intelligence",
        "deep learning",
    )

    def __init__(self):
        self.base_url = "https://api.crossref.org/works"
        self.headers = {
            'User-Agent': 'Academic-Reviewer-System/1.0 (mailto:test@example.com)'
        }
        self.timeout = 30

    # ------------------------------------------------------------------
    # Pure helpers for reading fields out of a Crossref work record
    # ------------------------------------------------------------------
    @staticmethod
    def _citation_count(record: Dict[str, Any]) -> int:
        """Return the record's citation count, or 0 if missing or not an int."""
        count = record.get('is-referenced-by-count')
        return count if isinstance(count, int) else 0

    @staticmethod
    def _short_title(record: Dict[str, Any], max_len: int) -> str:
        """Return the first title, truncated to ``max_len`` chars plus ``...``."""
        titles = record.get('title')
        title = (titles[0] if titles else None) or 'N/A'
        if len(title) > max_len:
            title = title[:max_len] + "..."
        return title

    @staticmethod
    def _publication_year(record: Dict[str, Any]) -> Any:
        """Return the print (else online) publication year, or ``'N/A'``.

        Empty or null ``date-parts`` entries are skipped instead of raising
        ``IndexError`` or yielding ``None``.
        """
        for field in ('published-print', 'published-online'):
            parts = (record.get(field) or {}).get('date-parts') or []
            if parts and parts[0] and parts[0][0] is not None:
                return parts[0][0]
        return 'N/A'

    @staticmethod
    def _print_match_stats(results: List[Dict[str, Any]], rate_label: str) -> int:
        """Print total/matched/mismatched counts and return the mismatch count."""
        total = len(results)
        matched = sum(1 for r in results if r['match'])
        mismatched = total - matched
        print(f"总验证数: {total}")
        print(f"匹配数: {matched}")
        print(f"不匹配数: {mismatched}")
        rate = f"{matched / total * 100:.1f}%" if total > 0 else "0%"
        print(f"{rate_label}: {rate}")
        return mismatched

    @staticmethod
    def _print_citation_stats(header: str, counts: List[int], total: int) -> None:
        """Print mean / max / non-zero share for a list of positive counts."""
        if not counts:
            return
        print(f"\n{header}:")
        print(f" 平均: {sum(counts) / len(counts):.2f}")
        print(f" 最大: {max(counts)}")
        print(f" 非零数量: {len(counts)}/{total}")

    # ------------------------------------------------------------------
    # Network access
    # ------------------------------------------------------------------
    def search_by_citation_count(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Search by relevance and return the ``limit`` most-cited hits.

        Args:
            query: Free-text search string.
            limit: Maximum number of records to return.

        Returns:
            Records sorted by descending ``is-referenced-by-count``; an empty
            list if the request or response handling fails (the error is
            printed, not raised).
        """
        print(f"=== 搜索查询: {query} ===")
        # Over-fetch (3x, at most 50) so the local sort has candidates to pick
        # from, but never request fewer rows than the caller asked for.
        rows = min(max(limit, min(limit * 3, 50)), self.MAX_ROWS)
        params = {
            "query": query,
            "rows": rows,
            "sort": "relevance",
            "order": "desc",
            "select": "DOI,title,author,container-title,published-print,published-online,is-referenced-by-count"
        }
        try:
            response = requests.get(self.base_url, params=params, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            message = response.json().get("message") or {}
            items = message.get("items") or []
            print(f"总命中数: {message.get('total-results', 0)}")
            print(f"获取结果数: {len(items)}")
            ranked = sorted(items, key=self._citation_count, reverse=True)
            return ranked[:limit]
        except Exception as e:  # best-effort tool: report and carry on
            print(f"搜索失败: {str(e)}")
            return []

    def get_doi_details(self, doi: str) -> Dict[str, Any]:
        """Fetch the full work record for ``doi``.

        Returns:
            The ``message`` object of the response, or ``{}`` on any failure
            (the error is printed, not raised).
        """
        # Percent-encode the DOI: characters such as '#' or '?' would
        # otherwise be parsed as a URL fragment / query string.
        url = f"{self.base_url}/{quote(doi, safe='/')}"
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            return response.json().get("message") or {}
        except Exception as e:  # best-effort tool: report and carry on
            print(f"DOI查找失败 {doi}: {str(e)}")
            return {}

    # ------------------------------------------------------------------
    # Verification workflow
    # ------------------------------------------------------------------
    def _verify_record(self, item: Dict[str, Any], doi: str) -> Dict[str, Any]:
        """Look ``doi`` up, print the comparison, and return one result row.

        A failed lookup is recorded with ``doi_citation == -1`` and
        ``match == False``.
        """
        search_citation = self._citation_count(item)
        doi_details = self.get_doi_details(doi)
        if not doi_details:
            print(" ❌ 无法获取DOI详细信息")
            return {
                'doi': doi,
                'title': 'N/A',
                'search_citation': search_citation,
                'doi_citation': -1,
                'reference_count': 0,
                'match': False
            }

        doi_citation = self._citation_count(doi_details)
        reference_count = doi_details.get('reference-count', 0)
        title = self._short_title(doi_details, 50)
        is_match = search_citation == doi_citation
        print(f" 标题: {title}")
        print(f" 搜索中的引用量: {search_citation}")
        print(f" DOI查询的引用量: {doi_citation}")
        print(f" 参考文献数: {reference_count}")
        print(f" 引用量匹配: {'✅' if is_match else '❌'}")
        return {
            'doi': doi,
            'title': title,
            'search_citation': search_citation,
            'doi_citation': doi_citation,
            'reference_count': reference_count,
            'match': is_match
        }

    def verify_citations(self, query: str, limit: int = 5) -> List[Dict[str, Any]]:
        """Search for ``query`` and verify each hit's citation count by DOI.

        Args:
            query: Free-text search string.
            limit: Number of top-cited hits to verify.

        Returns:
            One dict per verified DOI with keys ``doi``, ``title``,
            ``search_citation``, ``doi_citation``, ``reference_count`` and
            ``match``. Always a list; empty when the search returned nothing,
            so callers can safely ``extend`` with it.
        """
        banner = '=' * 80
        print(f"\n{banner}")
        print(f"验证查询: {query}")
        print(f"验证数量: {limit}")
        print(banner)

        # 1. Search, ordered by citation count.
        search_results = self.search_by_citation_count(query, limit)
        if not search_results:
            print("没有找到搜索结果")
            return []

        print("\n--- 搜索结果(按is-referenced-by-count排序)---")
        for i, item in enumerate(search_results, 1):
            print(f"{i}. {self._short_title(item, 60)}")
            print(f" DOI: {item.get('DOI', 'N/A')}")
            print(f" 搜索中的引用量: {self._citation_count(item)}")
            print(f" 发表年份: {self._publication_year(item)}")
            print()

        # 2. Re-fetch every hit by DOI and compare.
        print("\n--- DOI验证结果 ---")
        verification_results = []
        for i, item in enumerate(search_results, 1):
            doi = item.get('DOI')
            if not doi:
                print(f"{i}. 无DOI,跳过验证")
                continue
            print(f"{i}. 验证DOI: {doi}")
            verification_results.append(self._verify_record(item, doi))
            print()
            time.sleep(self.DOI_LOOKUP_DELAY)  # throttle requests

        # 3. Summary statistics.
        print("\n--- 验证统计 ---")
        mismatched = self._print_match_stats(verification_results, "匹配率")

        if mismatched > 0:
            print("\n--- 不匹配详情 ---")
            for result in verification_results:
                if result['match']:
                    continue
                print(f"DOI: {result['doi']}")
                print(f" 搜索引用量: {result['search_citation']}")
                print(f" DOI引用量: {result['doi_citation']}")
                if result['doi_citation'] < 0:
                    # -1 is the "lookup failed" sentinel, not a real count,
                    # so a numeric difference would be meaningless.
                    print(" 差异: N/A")
                else:
                    print(f" 差异: {abs(result['search_citation'] - result['doi_citation'])}")
                print()
        return verification_results

    def test_multiple_queries(self, queries: Optional[List[str]] = None) -> None:
        """Run ``verify_citations`` for several queries and print totals.

        Args:
            queries: Search strings to verify (3 hits each). Defaults to
                ``DEFAULT_QUERIES``.
        """
        test_queries = list(self.DEFAULT_QUERIES if queries is None else queries)
        banner = '=' * 80
        print(f"\n{banner}")
        print("多查询验证测试")
        print(banner)

        all_results = []
        for query in test_queries:
            all_results.extend(self.verify_citations(query, limit=3))
            time.sleep(self.QUERY_DELAY)  # throttle requests

        # Overall statistics.
        print(f"\n{banner}")
        print("总体验证统计")
        print(banner)
        self._print_match_stats(all_results, "总体匹配率")

        # Citation-count distribution (zero counts and the -1 sentinel excluded).
        total_verified = len(all_results)
        search_citations = [r['search_citation'] for r in all_results if r['search_citation'] > 0]
        doi_citations = [r['doi_citation'] for r in all_results if r['doi_citation'] > 0]
        self._print_citation_stats("搜索引用量统计", search_citations, total_verified)
        self._print_citation_stats("DOI引用量统计", doi_citations, total_verified)
| def main(): | |
| """主函数""" | |
| verifier = CrossrefCitationVerifier() | |
| print("Crossref引用量验证工具") | |
| print("=" * 80) | |
| while True: | |
| print("\n请选择验证选项:") | |
| print("1. 验证单个查询") | |
| print("2. 验证多个查询") | |
| print("3. 退出") | |
| choice = input("\n请输入选项 (1-3): ").strip() | |
| if choice == "1": | |
| query = input("请输入查询内容: ").strip() | |
| if query: | |
| limit = input("请输入验证数量 (默认5): ").strip() | |
| limit = int(limit) if limit.isdigit() else 5 | |
| verifier.verify_citations(query, limit) | |
| elif choice == "2": | |
| verifier.test_multiple_queries() | |
| elif choice == "3": | |
| print("退出验证工具") | |
| break | |
| else: | |
| print("无效选项,请重新选择") | |
# Start the interactive tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()