# Vortex-13b-V1 — data/scraper.py
# Hugging Face upload metadata: user Zandy-Wandy, "Upload Vortex model",
# commit 5c43f61 (verified). Kept as a comment so the module parses.
"""
VortexScienceScraper: Scrapes scientific content from open access sources.
Respects robots.txt and rate limits.
"""
import time
import requests
from typing import List, Dict, Optional
from urllib.robotparser import RobotFileParser
from pathlib import Path
import json
class VortexScienceScraper:
    """
    Scrapes scientific content from open access sources.

    Sources: arXiv, PubMed Central, Wikipedia, NIST WebBook, NASA NTRS.
    Honors robots.txt (optional) and enforces a per-source minimum delay
    between requests.
    """

    # Per-source configuration: endpoints, minimum seconds between
    # requests, and the robots.txt location.
    SOURCES = {
        "arxiv": {
            "base_url": "https://arxiv.org",
            "search_url": "https://arxiv.org/search/",
            "rate_limit": 1.0,  # seconds between requests
            "robots": "https://arxiv.org/robots.txt",
        },
        "pubmed": {
            "base_url": "https://www.ncbi.nlm.nih.gov/pmc",
            "search_url": "https://www.ncbi.nlm.nih.gov/pmc/articles/",
            "rate_limit": 0.5,
            "robots": "https://www.ncbi.nlm.nih.gov/robots.txt",
        },
        "wikipedia": {
            "base_url": "https://en.wikipedia.org",
            "search_url": "https://en.wikipedia.org/w/api.php",
            "rate_limit": 0.1,
            "robots": "https://en.wikipedia.org/robots.txt",
        },
        "nist": {
            "base_url": "https://webbook.nist.gov",
            "search_url": "https://webbook.nist.gov/cgi/cbook.cgi",
            "rate_limit": 1.0,
            "robots": "https://webbook.nist.gov/robots.txt",
        },
        "nasa": {
            "base_url": "https://ntrs.nasa.gov",
            "search_url": "https://ntrs.nasa.gov/api/citations/search",
            "rate_limit": 1.0,
            "robots": "https://ntrs.nasa.gov/robots.txt",
        },
    }

    def __init__(
        self,
        output_dir: str = "./data/scraped",
        respect_robots: bool = True,
        user_agent: str = "VortexScientificBot/1.0",
        timeout: float = 30.0,
    ):
        """
        Initialize scraper.

        Args:
            output_dir: Directory to save scraped data
            respect_robots: Whether to respect robots.txt
            user_agent: User agent string for requests
            timeout: Per-request timeout in seconds. (Fix: the original
                issued requests with no timeout, so a stalled server
                could hang the scraper indefinitely.)
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.respect_robots = respect_robots
        self.user_agent = user_agent
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        # Cache of parsed robots.txt parsers, keyed by "scheme://netloc".
        self.robots_cache = {}
        # Timestamp of the last request per source, for rate limiting.
        self.last_request_time = {}

    def _check_robots_allowed(self, url: str) -> bool:
        """Return True if robots.txt permits fetching ``url``.

        A failure to read robots.txt is treated conservatively as
        "disallowed"; the failure is not cached, so a later call will
        retry the fetch.
        """
        if not self.respect_robots:
            return True
        # Local import: urlparse is only needed here, and the top-level
        # import block is outside this class.
        from urllib.parse import urlparse

        parsed = urlparse(url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"
        if base_url not in self.robots_cache:
            rp = RobotFileParser()
            rp.set_url(base_url + "/robots.txt")
            try:
                rp.read()
                self.robots_cache[base_url] = rp
            except Exception as e:
                print(f"Could not read robots.txt for {base_url}: {e}")
                return False
        rp = self.robots_cache[base_url]
        return rp.can_fetch(self.user_agent, url)

    def _rate_limit(self, source: str):
        """Sleep so consecutive requests to ``source`` are spaced at least
        ``SOURCES[source]["rate_limit"]`` seconds apart, then record now
        as the last request time."""
        now = time.time()
        last = self.last_request_time.get(source, 0)
        delay = self.SOURCES[source]["rate_limit"]
        if now - last < delay:
            time.sleep(delay - (now - last))
        self.last_request_time[source] = time.time()

    def scrape_arxiv(
        self,
        query: str,
        max_results: int = 100,
        categories: Optional[List[str]] = None,
    ) -> List[Dict]:
        """
        Scrape arXiv papers.

        Args:
            query: Search query
            max_results: Maximum number of results
            categories: Optional list of arXiv categories (e.g., ['physics', 'math'])

        Returns:
            List of paper metadata and abstracts (currently placeholder
            records — HTML parsing is not yet implemented)
        """
        papers = []
        params = {
            "query": query,
            "searchtype": "all",
            "abstracts": "show",
            "size": min(max_results, 200),  # arXiv max per page
            "order": "-announced_date_first",
        }
        if categories:
            params["filter"] = "categories:" + "+OR+".join(categories)
        url = self.SOURCES["arxiv"]["search_url"]
        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows scraping {url}")
            return papers
        try:
            self._rate_limit("arxiv")
            # timeout added so a hung connection cannot block forever.
            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
            # Parse HTML (simplified - would use BeautifulSoup in practice)
            # For now, return placeholder
            print(f"Scraped arXiv query '{query}' - got response status {response.status_code}")
            # Placeholder: would extract paper titles, abstracts, PDF links
            for i in range(min(10, max_results)):
                papers.append({
                    "source": "arxiv",
                    "title": f"Paper {i}",
                    "abstract": "Abstract placeholder...",
                    "pdf_url": f"https://arxiv.org/pdf/{i}.pdf",
                })
        except Exception as e:
            print(f"Error scraping arXiv: {e}")
        return papers

    def scrape_pubmed(
        self,
        query: str,
        max_results: int = 100,
    ) -> List[Dict]:
        """Scrape PubMed Central article IDs via the NCBI E-utilities
        esearch endpoint.

        Args:
            query: Search term passed to esearch
            max_results: ``retmax`` — maximum IDs to request

        Returns:
            List of dicts with PMC IDs and article URLs (first 10 only)
        """
        articles = []
        # NCBI E-utilities esearch endpoint (JSON mode).
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        params = {
            "db": "pmc",
            "term": query,
            "retmax": max_results,
            "retmode": "json",
        }
        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return articles
        try:
            self._rate_limit("pubmed")
            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
            pmc_ids = data.get("esearchresult", {}).get("idlist", [])
            for pmc_id in pmc_ids[:10]:  # Limit for demo
                articles.append({
                    "source": "pubmed",
                    "pmc_id": pmc_id,
                    "url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/",
                })
            print(f"Found {len(pmc_ids)} PubMed articles")
        except Exception as e:
            print(f"Error scraping PubMed: {e}")
        return articles

    def scrape_wikipedia(
        self,
        topic: str,
        max_pages: int = 10,
    ) -> List[Dict]:
        """Scrape the intro extract of a Wikipedia article via the API.

        Args:
            topic: Article title to fetch
            max_pages: NOTE(review): currently unused — a ``titles``
                query returns at most one page. Kept for interface
                compatibility.

        Returns:
            List of dicts with title and intro text
        """
        pages = []
        # MediaWiki API endpoint.
        url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "format": "json",
            "prop": "extracts",
            "exintro": True,
            "titles": topic,
            "redirects": True,
        }
        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return pages
        try:
            self._rate_limit("wikipedia")
            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
            pages_data = data.get("query", {}).get("pages", {})
            # Page IDs (the dict keys) are not needed; iterate values.
            for page in pages_data.values():
                if "extract" in page:
                    pages.append({
                        "source": "wikipedia",
                        "title": page.get("title", ""),
                        "text": page.get("extract", ""),
                    })
        except Exception as e:
            print(f"Error scraping Wikipedia: {e}")
        return pages

    def scrape_nist(
        self,
        element: str,
    ) -> List[Dict]:
        """Scrape NIST chemistry webbook for element data.

        Args:
            element: Chemical formula passed to the webbook query

        Returns:
            Single-item list with a truncated raw-HTML snapshot
            (table parsing is not yet implemented)
        """
        data = []
        url = "https://webbook.nist.gov/cgi/cbook.cgi"
        params = {
            "Formula": element,
            "Units": "SI",
            "Submit": "Submit",
        }
        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return data
        try:
            self._rate_limit("nist")
            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
            # Placeholder - would parse HTML tables
            data.append({
                "source": "nist",
                "element": element,
                "html": response.text[:1000],
            })
        except Exception as e:
            print(f"Error scraping NIST: {e}")
        return data

    def scrape_nasa(
        self,
        query: str,
        max_results: int = 50,
    ) -> List[Dict]:
        """Scrape NASA technical reports from the NTRS citations API.

        Args:
            query: Free-text search query
            max_results: Requested page size

        Returns:
            List of dicts with title, abstract, and PDF download URL
            (first 10 only)
        """
        reports = []
        url = "https://ntrs.nasa.gov/api/citations/search"
        params = {
            "q": query,
            "page[size]": max_results,
        }
        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return reports
        try:
            self._rate_limit("nasa")
            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
            for item in data.get("data", [])[:10]:
                attrs = item.get("attributes", {})
                reports.append({
                    "source": "nasa",
                    "title": attrs.get("title", ""),
                    "abstract": attrs.get("abstract", ""),
                    "download_url": attrs.get("downloads", {}).get("pdf", ""),
                })
        except Exception as e:
            print(f"Error scraping NASA: {e}")
        return reports

    def save_results(
        self,
        results: List[Dict],
        filename: str,
    ):
        """Save scraped results to JSON under ``self.output_dir``."""
        output_path = self.output_dir / filename
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(results)} results to {output_path}")

    def scrape_all_sources(
        self,
        queries: Dict[str, str],
        max_per_source: int = 50,
    ) -> Dict[str, List[Dict]]:
        """
        Scrape all sources with given queries.

        Args:
            queries: Dict mapping source name to query string
            max_per_source: Max results per source

        Returns:
            Dict mapping source to list of results. Each source's
            results are also saved to ``{source}_results.json``.
        """
        all_results = {}
        for source, query in queries.items():
            if source not in self.SOURCES:
                print(f"Unknown source: {source}")
                continue
            print(f"Scraping {source} with query: {query}")
            if source == "arxiv":
                results = self.scrape_arxiv(query, max_results=max_per_source)
            elif source == "pubmed":
                results = self.scrape_pubmed(query, max_results=max_per_source)
            elif source == "wikipedia":
                results = self.scrape_wikipedia(query, max_pages=max_per_source)
            elif source == "nist":
                results = self.scrape_nist(query)
            elif source == "nasa":
                results = self.scrape_nasa(query, max_results=max_per_source)
            else:
                results = []
            all_results[source] = results
            # Save intermediate results
            self.save_results(results, f"{source}_results.json")
        return all_results
def test_scraper():
    """Run a small live smoke test: one Wikipedia fetch, one arXiv fetch."""
    scraper = VortexScienceScraper()

    # Wikipedia is the cheapest source, so try it first.
    print("Testing Wikipedia scrape...")
    wiki_pages = scraper.scrape_wikipedia("quantum mechanics", max_pages=2)
    print(f"Got {len(wiki_pages)} Wikipedia pages")

    # arXiv is rate-limited to one request per second.
    print("Testing arXiv scrape...")
    arxiv_papers = scraper.scrape_arxiv("quantum", max_results=5)
    print(f"Got {len(arxiv_papers)} arXiv papers")

    print("Scraper test passed!")
# Run the network smoke test only when executed as a script.
if __name__ == "__main__":
    test_scraper()