new_recommendation / verify_crossref_citations.py
wujian123's picture
Upload all project files
3c6b551
#!/usr/bin/env python3
"""
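Verify the accuracy of the Crossref API's is-referenced-by-count field.
1. Search by relevance, then rank the returned hits locally by is-referenced-by-count.
2. Look each hit up again by DOI and compare the citation count reported there.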
"""
import json
import time
from typing import List, Dict, Any
from urllib.parse import quote

import requests
class CrossrefCitationVerifier:
    """Cross-check Crossref's ``is-referenced-by-count`` between two endpoints.

    A keyword search is run against the ``/works`` endpoint, the hits are
    ranked locally by ``is-referenced-by-count``, and every hit is then
    fetched again by DOI so the two reported counts can be compared.
    Progress and results are printed to stdout.
    """

    # Value stored in ``doi_citation`` when the per-DOI lookup failed.
    LOOKUP_FAILED = -1

    def __init__(self, request_delay: float = 0.5, query_delay: float = 1.0) -> None:
        """Set up endpoint, headers and throttling.

        Args:
            request_delay: Seconds to sleep after each per-DOI verification.
            query_delay: Seconds to sleep between the queries issued by
                ``test_multiple_queries``.
        """
        self.base_url = "https://api.crossref.org/works"
        self.headers = {
            'User-Agent': 'Academic-Reviewer-System/1.0 (mailto:test@example.com)'
        }
        self.timeout = 30
        self.request_delay = request_delay
        self.query_delay = query_delay

    def _fetch_message(self, url: str, params=None) -> Dict[str, Any]:
        """GET ``url`` and return the ``message`` object of the JSON reply.

        Returns an empty dict when the reply carries no ``message`` object.

        Raises:
            requests.RequestException: On connection errors, timeouts or a
                non-2xx status.
            ValueError: When the response body is not valid JSON.
        """
        response = requests.get(url, params=params, headers=self.headers, timeout=self.timeout)
        response.raise_for_status()
        payload = response.json()
        message = payload.get("message") if isinstance(payload, dict) else None
        return message if isinstance(message, dict) else {}

    @staticmethod
    def _citation_count(record: Dict[str, Any]) -> int:
        """Return ``is-referenced-by-count`` of ``record``, treating missing/null as 0."""
        return record.get('is-referenced-by-count') or 0

    @staticmethod
    def _short_title(record: Dict[str, Any], max_len: int) -> str:
        """Return the first title of ``record`` cut to ``max_len`` chars, or ``'N/A'``."""
        titles = record.get('title')
        title = titles[0] if titles else 'N/A'
        if len(title) > max_len:
            title = title[:max_len] + "..."
        return title

    @staticmethod
    def _publication_year(record: Dict[str, Any]) -> Any:
        """Return the print year, else the online year, else ``'N/A'``."""
        for key in ('published-print', 'published-online'):
            date_parts = (record.get(key) or {}).get('date-parts') or []
            # Guard against empty or null-filled date-parts such as [[]] / [[None]].
            if date_parts and date_parts[0] and date_parts[0][0] is not None:
                return date_parts[0][0]
        return 'N/A'

    @staticmethod
    def _print_match_summary(results: List[Dict[str, Any]], rate_label: str) -> int:
        """Print total/matched/mismatched counts and the match rate.

        Returns:
            The number of mismatched entries in ``results``.
        """
        total_verified = len(results)
        matched = sum(1 for r in results if r['match'])
        mismatched = total_verified - matched
        print(f"总验证数: {total_verified}")
        print(f"匹配数: {matched}")
        print(f"不匹配数: {mismatched}")
        rate = f"{matched / total_verified * 100:.1f}%" if total_verified > 0 else "0%"
        print(f"{rate_label}: {rate}")
        return mismatched

    def search_by_citation_count(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Search Crossref by relevance and return the most-cited hits.

        Up to ``min(limit * 3, 50)`` candidates are requested and sorted
        locally by ``is-referenced-by-count`` (descending), so at most 50
        results can ever be returned.

        Args:
            query: Free-text search query.
            limit: Maximum number of items to return.

        Returns:
            The top ``limit`` items, or ``[]`` when ``limit`` is not positive
            or the request fails (the failure is printed).
        """
        print(f"=== 搜索查询: {query} ===")
        if limit <= 0:
            # Nothing to fetch; also avoids a negative slice bound below.
            return []
        params = {
            "query": query,
            "rows": min(limit * 3, 50),  # over-fetch so local ranking has candidates
            "sort": "relevance",
            "order": "desc",
            "select": "DOI,title,author,container-title,published-print,published-online,is-referenced-by-count"
        }
        try:
            message = self._fetch_message(self.base_url, params)
        except (requests.RequestException, ValueError) as exc:
            print(f"搜索失败: {str(exc)}")
            return []
        items = message.get("items") or []
        total_results = message.get("total-results", 0)
        print(f"总命中数: {total_results}")
        print(f"获取结果数: {len(items)}")
        ranked = sorted(items, key=self._citation_count, reverse=True)
        return ranked[:limit]

    def get_doi_details(self, doi: str) -> Dict[str, Any]:
        """Fetch the Crossref work record for ``doi``.

        Returns:
            The ``message`` object of the reply, or ``{}`` when the lookup
            fails (the failure is printed).
        """
        # DOIs may contain URL-reserved characters ('#', '?', '<', ...); escape
        # everything except '/' so the path is not truncated or mis-parsed.
        url = f"{self.base_url}/{quote(doi, safe='/')}"
        try:
            return self._fetch_message(url)
        except (requests.RequestException, ValueError) as exc:
            print(f"DOI查找失败 {doi}: {str(exc)}")
            return {}

    def verify_citations(self, query: str, limit: int = 5) -> List[Dict[str, Any]]:
        """Compare search-reported and DOI-reported citation counts.

        Args:
            query: Free-text search query.
            limit: Number of top-cited hits to verify.

        Returns:
            One dict per verified DOI with keys ``doi``, ``title``,
            ``search_citation``, ``doi_citation`` (``LOOKUP_FAILED`` when the
            DOI lookup failed), ``reference_count`` and ``match``. An empty
            list is returned when the search yields nothing, so callers can
            always iterate over or extend with the result.
        """
        print(f"\n{'='*80}")
        print(f"验证查询: {query}")
        print(f"验证数量: {limit}")
        print(f"{'='*80}")

        # 1. Search and rank by citation count.
        search_results = self.search_by_citation_count(query, limit)
        if not search_results:
            print("没有找到搜索结果")
            return []

        print("\n--- 搜索结果(按is-referenced-by-count排序)---")
        for i, item in enumerate(search_results, 1):
            print(f"{i}. {self._short_title(item, 60)}")
            print(f" DOI: {item.get('DOI', 'N/A')}")
            print(f" 搜索中的引用量: {self._citation_count(item)}")
            print(f" 发表年份: {self._publication_year(item)}")
            print()

        # 2. Re-fetch every hit by DOI and compare the counts.
        print("\n--- DOI验证结果 ---")
        verification_results = []
        for i, item in enumerate(search_results, 1):
            doi = item.get('DOI')
            if not doi:
                print(f"{i}. 无DOI,跳过验证")
                continue
            print(f"{i}. 验证DOI: {doi}")
            search_citation = self._citation_count(item)
            doi_details = self.get_doi_details(doi)
            if doi_details:
                doi_citation = self._citation_count(doi_details)
                reference_count = doi_details.get('reference-count', 0)
                title = self._short_title(doi_details, 50)
                is_match = search_citation == doi_citation
                print(f" 标题: {title}")
                print(f" 搜索中的引用量: {search_citation}")
                print(f" DOI查询的引用量: {doi_citation}")
                print(f" 参考文献数: {reference_count}")
                print(f" 引用量匹配: {'✅' if is_match else '❌'}")
                verification_results.append({
                    'doi': doi,
                    'title': title,
                    'search_citation': search_citation,
                    'doi_citation': doi_citation,
                    'reference_count': reference_count,
                    'match': is_match
                })
            else:
                print(" ❌ 无法获取DOI详细信息")
                verification_results.append({
                    'doi': doi,
                    'title': 'N/A',
                    'search_citation': search_citation,
                    'doi_citation': self.LOOKUP_FAILED,
                    'reference_count': 0,
                    'match': False
                })
            print()
            time.sleep(self.request_delay)  # throttle requests to the API

        # 3. Summarise.
        print("\n--- 验证统计 ---")
        mismatched = self._print_match_summary(verification_results, "匹配率")
        if mismatched > 0:
            print("\n--- 不匹配详情 ---")
            for result in verification_results:
                if result['match']:
                    continue
                print(f"DOI: {result['doi']}")
                print(f" 搜索引用量: {result['search_citation']}")
                print(f" DOI引用量: {result['doi_citation']}")
                if result['doi_citation'] == self.LOOKUP_FAILED:
                    # No real count to compare against; a numeric diff
                    # against the sentinel would be meaningless.
                    print(" 差异: N/A")
                else:
                    print(f" 差异: {abs(result['search_citation'] - result['doi_citation'])}")
                print()
        return verification_results

    def test_multiple_queries(self) -> None:
        """Run ``verify_citations`` for a fixed set of queries and print totals."""
        test_queries = [
            "machine learning",
            "CRISPR",
            "cryo-electron microscopy",
            "artificial intelligence",
            "deep learning"
        ]
        print(f"\n{'='*80}")
        print("多查询验证测试")
        print(f"{'='*80}")
        all_results = []
        for query in test_queries:
            all_results.extend(self.verify_citations(query, limit=3))
            time.sleep(self.query_delay)  # throttle between queries

        # Overall statistics.
        print(f"\n{'='*80}")
        print("总体验证统计")
        print(f"{'='*80}")
        total_verified = len(all_results)
        self._print_match_summary(all_results, "总体匹配率")

        # Citation-count distribution; zero counts and the failure sentinel
        # are excluded by the ``> 0`` filter.
        search_citations = [r['search_citation'] for r in all_results if r['search_citation'] > 0]
        doi_citations = [r['doi_citation'] for r in all_results if r['doi_citation'] > 0]
        if search_citations:
            print("\n搜索引用量统计:")
            print(f" 平均: {sum(search_citations) / len(search_citations):.2f}")
            print(f" 最大: {max(search_citations)}")
            print(f" 非零数量: {len(search_citations)}/{total_verified}")
        if doi_citations:
            print("\nDOI引用量统计:")
            print(f" 平均: {sum(doi_citations) / len(doi_citations):.2f}")
            print(f" 最大: {max(doi_citations)}")
            print(f" 非零数量: {len(doi_citations)}/{total_verified}")
def main():
    """Run the interactive menu of the citation verification tool.

    Loops until the user picks option 3 or stdin is closed. Option 1 asks for
    a query and an optional result count (default 5) and verifies that single
    query; option 2 runs the built-in multi-query test.
    """
    verifier = CrossrefCitationVerifier()
    print("Crossref引用量验证工具")
    print("=" * 80)
    try:
        while True:
            print("\n请选择验证选项:")
            print("1. 验证单个查询")
            print("2. 验证多个查询")
            print("3. 退出")
            choice = input("\n请输入选项 (1-3): ").strip()
            if choice == "1":
                query = input("请输入查询内容: ").strip()
                if query:
                    raw_limit = input("请输入验证数量 (默认5): ").strip()
                    # isdecimal(), unlike isdigit(), only accepts characters
                    # that int() can parse (isdigit() is also true for e.g. "²").
                    limit = int(raw_limit) if raw_limit.isdecimal() else 5
                    verifier.verify_citations(query, limit)
            elif choice == "2":
                verifier.test_multiple_queries()
            elif choice == "3":
                print("退出验证工具")
                break
            else:
                print("无效选项,请重新选择")
    except EOFError:
        # stdin was closed (e.g. piped input ran out): leave without a traceback.
        print("\n退出验证工具")
# Start the interactive tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()