|
|
|
|
|
""" |
|
|
Script to retrieve papers from the arXiv API |
|
|
Optimized for natural representation of scientific domains |
|
|
""" |
|
|
|
|
|
import json
import os
import random
import time
import xml.etree.ElementTree as ET
from collections import Counter
from datetime import datetime, timedelta, timezone
from urllib.parse import quote

import requests
|
|
|
|
|
class ArxivFetcher:
    """Small client for the arXiv export API.

    Every paper is returned as a plain dict with the keys ``id``, ``title``,
    ``summary``, ``authors``, ``categories``, ``primary_category`` and
    ``published``.

    Attributes:
        base_url: Query endpoint of the arXiv API.
        delay: Seconds to wait between two consecutive API requests.
    """

    # XML namespaces used by the Atom feed the API returns.
    _NS = {
        'atom': 'http://www.w3.org/2005/Atom',
        'arxiv': 'http://arxiv.org/schemas/atom',
    }
    # Upper bound on the number of entries requested per HTTP call.
    _MAX_BATCH_SIZE = 1000
    # Abstracts are truncated to this many characters.
    _SUMMARY_MAX_CHARS = 500

    def __init__(self):
        self.base_url = "http://export.arxiv.org/api/query"
        self.delay = 3

    def fetch_by_category(self, categories, max_per_category=500, total_max=15000):
        """Retrieve papers category by category, bounded by a global limit.

        Args:
            categories: Sequence of arXiv category codes (e.g. ``'cs.LG'``).
            max_per_category: Maximum number of papers requested per category.
            total_max: Maximum number of papers returned overall.

        Returns:
            A list of at most ``total_max`` paper dicts. A paper that comes
            back under several categories is kept only once (first hit wins).
        """
        print(f"🔍 Retrieval by category (max {max_per_category} per cat, {total_max} total)")

        all_papers = []
        seen_ids = set()

        for i, category in enumerate(categories):
            remaining = total_max - len(all_papers)
            if remaining <= 0:
                break

            # Rate limiting: pause before every category request but the first.
            if i > 0:
                time.sleep(self.delay)

            print(f" [{i+1}/{len(categories)}] {category}...")

            papers = self._fetch_category(category, min(max_per_category, remaining))

            # The same paper can be returned for several categories; without
            # this filter it would be stored (and counted) more than once.
            for paper in papers:
                paper_id = paper.get('id')
                if paper_id in seen_ids:
                    continue
                seen_ids.add(paper_id)
                all_papers.append(paper)

            print(f" ✅ {len(papers)} papers retrieved (total: {len(all_papers)})")

        return all_papers[:total_max]

    def fetch_recent_papers(self, days_back=30, max_results=15000):
        """Retrieve the papers submitted during the last ``days_back`` days.

        Args:
            days_back: Width of the submission window, in days, ending now.
            max_results: Maximum number of papers returned.

        Returns:
            A list of at most ``max_results`` paper dicts, newest first.
        """
        print(f"📅 Retrieving papers from the last {days_back} days")

        # The arXiv API documents submittedDate bounds as GMT, so the window
        # is built in UTC rather than in the machine's local time.
        end_date = datetime.now(timezone.utc)
        start_date = end_date - timedelta(days=days_back)

        stamp = '%Y%m%d%H%M'
        date_query = f"submittedDate:[{start_date.strftime(stamp)} TO {end_date.strftime(stamp)}]"

        return self._fetch_with_query(date_query, max_results)

    def _fetch_category(self, category, max_results):
        """Retrieve up to ``max_results`` papers from one category."""
        return self._fetch_with_query(f"cat:{category}", max_results)

    def _fetch_with_query(self, query, max_results):
        """Page through the results of ``query`` until ``max_results`` is reached.

        Retrieval is best effort: on an HTTP/network error, an empty page or
        an unparseable response, the papers collected so far are returned.

        Args:
            query: Value of the API ``search_query`` parameter.
            max_results: Maximum number of papers returned.

        Returns:
            A list of at most ``max_results`` paper dicts, sorted by
            submission date in descending order.
        """
        papers = []
        start = 0
        batch_size = min(self._MAX_BATCH_SIZE, max_results)

        while len(papers) < max_results:
            current_batch = min(batch_size, max_results - len(papers))

            params = {
                'search_query': query,
                'start': start,
                'max_results': current_batch,
                'sortBy': 'submittedDate',
                'sortOrder': 'descending',
            }

            try:
                response = requests.get(self.base_url, params=params, timeout=30)
                response.raise_for_status()
            except requests.RequestException as e:
                print(f" ❌ Error: {e}")
                break

            batch_papers, entry_count = self._parse_feed(response.text)
            if entry_count == 0:
                print(f" ⚠️ No results for start={start}")
                break

            papers.extend(batch_papers)
            # Advance by the number of entries the server sent, not by the
            # number successfully parsed; otherwise a skipped entry would
            # make the next page overlap with this one.
            start += entry_count

            print(f" 📄 Batch {len(batch_papers)} papers (total: {len(papers)})")

            # Only wait when another request is actually going to follow.
            if len(papers) < max_results:
                time.sleep(self.delay)

        return papers[:max_results]

    def _parse_response(self, xml_content):
        """Parse an arXiv Atom response and return the list of paper dicts.

        Malformed XML and API error feeds yield an empty list; entries that
        lack an id, title or summary are skipped with a warning.
        """
        return self._parse_feed(xml_content)[0]

    def _parse_feed(self, xml_content):
        """Parse an Atom feed into ``(papers, entry_count)``.

        ``entry_count`` is the number of ``<entry>`` elements in the feed,
        including those that could not be converted into a paper. It is 0 for
        malformed XML and for API error feeds.
        """
        try:
            root = ET.fromstring(xml_content)
        except ET.ParseError as e:
            print(f"❌ XML parsing error: {e}")
            return [], 0

        ns = self._NS
        entries = root.findall('atom:entry', ns)
        papers = []

        for entry in entries:
            # The API reports errors as a regular feed whose entry id points
            # to .../api/errors#...; that is a failure, not a paper.
            entry_id = entry.findtext('atom:id', default='', namespaces=ns)
            if '/api/errors' in entry_id:
                message = ' '.join(
                    entry.findtext('atom:summary', default='', namespaces=ns).split()
                )
                print(f" ❌ Error: {message}")
                return [], 0

            try:
                papers.append(self._parse_entry(entry))
            except ValueError as e:
                print(f" ⚠️ Error parsing entry: {e}")

        return papers, len(entries)

    def _parse_entry(self, entry):
        """Convert one Atom ``<entry>`` element into a paper dict.

        Raises:
            ValueError: If the entry has no id, title or summary text.
        """
        ns = self._NS

        arxiv_id = self._extract_arxiv_id(self._required_text(entry, 'atom:id', ns))

        # Titles and abstracts are wrapped over several lines in the feed;
        # collapse every whitespace run into a single space.
        title = ' '.join(self._required_text(entry, 'atom:title', ns).split())
        summary = ' '.join(self._required_text(entry, 'atom:summary', ns).split())
        summary = summary[:self._SUMMARY_MAX_CHARS]

        authors = []
        for author in entry.findall('atom:author', ns):
            name = author.find('atom:name', ns)
            # An empty <name/> only drops that author, not the whole paper.
            if name is not None and name.text:
                authors.append(name.text.strip())

        categories = []
        for category in entry.findall('atom:category', ns):
            term = category.get('term')
            if term:
                categories.append(term)

        primary_category = None
        primary_cat = entry.find('arxiv:primary_category', ns)
        if primary_cat is not None:
            primary_category = primary_cat.get('term')
        # Fall back to the first listed category when no primary one is given.
        if not primary_category and categories:
            primary_category = categories[0]

        published = entry.find('atom:published', ns)
        published_date = published.text if published is not None else None

        return {
            'id': arxiv_id,
            'title': title,
            'summary': summary,
            'authors': authors,
            'categories': categories,
            'primary_category': primary_category,
            'published': published_date,
        }

    @staticmethod
    def _required_text(entry, tag, ns):
        """Return the stripped text of child ``tag`` or raise ``ValueError``."""
        element = entry.find(tag, ns)
        if element is None or element.text is None:
            raise ValueError(f"missing <{tag}> text")
        return element.text.strip()

    @staticmethod
    def _extract_arxiv_id(id_url):
        """Extract the arXiv identifier from an entry id URL.

        Old-style identifiers contain a slash (``hep-th/9901001v1``), so the
        part after ``/abs/`` is taken instead of the last path segment.
        """
        marker = '/abs/'
        if marker in id_url:
            return id_url.split(marker, 1)[1]
        return id_url.rsplit('/', 1)[-1]
|
|
|
|
|
def save_papers(papers, filename):
    """Save papers to a pretty-printed UTF-8 JSON file.

    Missing parent directories of ``filename`` are created. After writing,
    a one-line summary (paper count and file size in MB) is printed.

    Args:
        papers: JSON-serializable list of paper dicts.
        filename: Destination path of the JSON file.
    """
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # ensure_ascii=False keeps accented names and titles readable in the file.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(papers, f, indent=2, ensure_ascii=False)

    size_mb = os.path.getsize(filename) / 1024 / 1024
    print(f"💾 Saved: {filename} ({len(papers)} papers, {size_mb:.1f} MB)")
|
|
|
|
|
def _fetch_recent_with_widening(fetcher, windows=(30, 60, 90, 120),
                                min_papers=1000, max_results=15000):
    """Try progressively wider date windows and return the largest result.

    Stops at the first window that yields more than ``min_papers`` papers.
    Returns an empty list when every attempt comes back empty.
    """
    best = []
    for days in windows:
        print(f"\n🔍 Attempt: {days} days...")
        papers = fetcher.fetch_recent_papers(days_back=days, max_results=max_results)

        # Keep the richest result so that a later failed or empty attempt
        # cannot throw away papers that were already retrieved.
        if papers and len(papers) > len(best):
            best = papers

        if papers and len(papers) > min_papers:
            print(f"✅ {len(papers)} papers found over {days} days")
            break
        if papers:
            print(f"⚠️ Only {len(papers)} papers over {days} days")
        else:
            print(f"❌ No papers found over {days} days")
    return best


def _count_domains(papers):
    """Count papers per top-level domain (category text before the first '.').

    The primary category is used when present; otherwise the first entry of
    the paper's category list. Papers without any category are not counted.
    """
    counts = Counter()
    for paper in papers:
        cat = paper.get('primary_category') or paper.get('categories')
        if isinstance(cat, list):
            cat = cat[0] if cat else None
        if isinstance(cat, str) and cat:
            counts[cat.split('.')[0]] += 1
    return counts


def main():
    """Main arXiv data retrieval.

    Fetches recent papers (widening the date window when too few are found),
    falls back to a fixed list of popular categories when nothing could be
    retrieved, saves the result to ``arxiv_monthly_papers.json`` and prints
    the distribution of papers per domain.
    """
    output_file = "arxiv_monthly_papers.json"

    print("🚀 ArXiv Data Fetcher - Version Optimisée")
    print("=" * 50)

    fetcher = ArxivFetcher()

    print("\n📅 SIMPLE APPROACH: 1 month of recent data")
    print("🎯 Objective: retrieve everything available from the last month")
    print("⚡ Without representativeness constraint - just natural data")

    monthly_papers = _fetch_recent_with_widening(fetcher)

    if not monthly_papers:
        print("\n🔄 Fallback: retrieval by popular categories")

        popular_categories = [
            'cs.LG', 'cs.AI', 'cs.CV', 'cs.CL', 'cs.CR', 'cs.RO', 'cs.HC',
            'physics.comp-ph', 'physics.data-an', 'physics.optics',
            'math.ST', 'math.NA', 'math.OC', 'math.PR',
            'stat.ML', 'stat.ME', 'stat.AP',
            'eess.AS', 'eess.IV', 'eess.SP',
            'q-bio.QM', 'q-bio.BM', 'astro-ph.CO',
        ]

        monthly_papers = fetcher.fetch_by_category(
            categories=popular_categories,
            max_per_category=500,
            total_max=15000,
        )

    if not monthly_papers:
        # Nothing was written, so do not announce a completed retrieval.
        print("❌ Complete retrieval failure")
        return

    save_papers(monthly_papers, output_file)

    domain_counts = _count_domains(monthly_papers)

    print(f"\n📊 Natural distribution ({len(monthly_papers)} papers):")
    for domain, count in domain_counts.most_common():
        percentage = count / len(monthly_papers) * 100
        print(f" {domain}: {count} papers ({percentage:.1f}%)")

    print("\n🎉 Retrieval completed!")
    print("📁 Files created:")
    if os.path.exists(output_file):
        size = os.path.getsize(output_file) / 1024 / 1024
        print(f" - {output_file} ({size:.1f} MB)")
|
|
|
|
|
# Run the retrieval only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()