# Vortex-13b-V1 — data/scraper.py
# Hugging Face upload metadata: user Zandy-Wandy, "Upload Vortex model",
# commit 5c43f61 (verified). Kept as a comment so the module parses.
"""
VortexScienceScraper: Scrapes scientific content from open access sources.
Respects robots.txt and rate limits.
"""
import time
import requests
from typing import List, Dict, Optional
from urllib.robotparser import RobotFileParser
from pathlib import Path
import json
class VortexScienceScraper:
    """
    Scrapes scientific content from open access sources.

    Sources: arXiv, PubMed Central, Wikipedia, NIST WebBook, NASA NTRS.
    Honors robots.txt (optional) and enforces a per-source minimum delay
    between requests.
    """

    # Per-source configuration: endpoints, minimum seconds between
    # requests, and the robots.txt location.
    SOURCES = {
        "arxiv": {
            "base_url": "https://arxiv.org",
            "search_url": "https://arxiv.org/search/",
            "rate_limit": 1.0,  # seconds between requests
            "robots": "https://arxiv.org/robots.txt",
        },
        "pubmed": {
            "base_url": "https://www.ncbi.nlm.nih.gov/pmc",
            "search_url": "https://www.ncbi.nlm.nih.gov/pmc/articles/",
            "rate_limit": 0.5,
            "robots": "https://www.ncbi.nlm.nih.gov/robots.txt",
        },
        "wikipedia": {
            "base_url": "https://en.wikipedia.org",
            "search_url": "https://en.wikipedia.org/w/api.php",
            "rate_limit": 0.1,
            "robots": "https://en.wikipedia.org/robots.txt",
        },
        "nist": {
            "base_url": "https://webbook.nist.gov",
            "search_url": "https://webbook.nist.gov/cgi/cbook.cgi",
            "rate_limit": 1.0,
            "robots": "https://webbook.nist.gov/robots.txt",
        },
        "nasa": {
            "base_url": "https://ntrs.nasa.gov",
            "search_url": "https://ntrs.nasa.gov/api/citations/search",
            "rate_limit": 1.0,
            "robots": "https://ntrs.nasa.gov/robots.txt",
        },
    }

    def __init__(
        self,
        output_dir: str = "./data/scraped",
        respect_robots: bool = True,
        user_agent: str = "VortexScientificBot/1.0",
        timeout: float = 30.0,
    ):
        """
        Initialize scraper.

        Args:
            output_dir: Directory to save scraped data
            respect_robots: Whether to respect robots.txt
            user_agent: User agent string for requests
            timeout: Per-request timeout in seconds. (Fix: the original
                issued requests with no timeout, so a stalled server
                could hang the scraper indefinitely.)
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.respect_robots = respect_robots
        self.user_agent = user_agent
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        # Cache of parsed robots.txt parsers, keyed by "scheme://netloc".
        self.robots_cache = {}
        # Timestamp of the last request per source, for rate limiting.
        self.last_request_time = {}

    def _check_robots_allowed(self, url: str) -> bool:
        """Return True if robots.txt permits fetching ``url``.

        A failure to read robots.txt is treated conservatively as
        "disallowed"; the failure is not cached, so a later call will
        retry the fetch.
        """
        if not self.respect_robots:
            return True
        # Local import: urlparse is only needed here, and the top-level
        # import block is outside this class.
        from urllib.parse import urlparse

        parsed = urlparse(url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"
        if base_url not in self.robots_cache:
            rp = RobotFileParser()
            rp.set_url(base_url + "/robots.txt")
            try:
                rp.read()
                self.robots_cache[base_url] = rp
            except Exception as e:
                print(f"Could not read robots.txt for {base_url}: {e}")
                return False
        rp = self.robots_cache[base_url]
        return rp.can_fetch(self.user_agent, url)

    def _rate_limit(self, source: str):
        """Sleep so consecutive requests to ``source`` are spaced at least
        ``SOURCES[source]["rate_limit"]`` seconds apart, then record now
        as the last request time."""
        now = time.time()
        last = self.last_request_time.get(source, 0)
        delay = self.SOURCES[source]["rate_limit"]
        if now - last < delay:
            time.sleep(delay - (now - last))
        self.last_request_time[source] = time.time()

    def scrape_arxiv(
        self,
        query: str,
        max_results: int = 100,
        categories: Optional[List[str]] = None,
    ) -> List[Dict]:
        """
        Scrape arXiv papers.

        Args:
            query: Search query
            max_results: Maximum number of results
            categories: Optional list of arXiv categories (e.g., ['physics', 'math'])

        Returns:
            List of paper metadata and abstracts (currently placeholder
            records — HTML parsing is not yet implemented)
        """
        papers = []
        params = {
            "query": query,
            "searchtype": "all",
            "abstracts": "show",
            "size": min(max_results, 200),  # arXiv max per page
            "order": "-announced_date_first",
        }
        if categories:
            params["filter"] = "categories:" + "+OR+".join(categories)
        url = self.SOURCES["arxiv"]["search_url"]
        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows scraping {url}")
            return papers
        try:
            self._rate_limit("arxiv")
            # timeout added so a hung connection cannot block forever.
            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
            # Parse HTML (simplified - would use BeautifulSoup in practice)
            # For now, return placeholder
            print(f"Scraped arXiv query '{query}' - got response status {response.status_code}")
            # Placeholder: would extract paper titles, abstracts, PDF links
            for i in range(min(10, max_results)):
                papers.append({
                    "source": "arxiv",
                    "title": f"Paper {i}",
                    "abstract": "Abstract placeholder...",
                    "pdf_url": f"https://arxiv.org/pdf/{i}.pdf",
                })
        except Exception as e:
            print(f"Error scraping arXiv: {e}")
        return papers

    def scrape_pubmed(
        self,
        query: str,
        max_results: int = 100,
    ) -> List[Dict]:
        """Scrape PubMed Central article IDs via the NCBI E-utilities
        esearch endpoint.

        Args:
            query: Search term passed to esearch
            max_results: ``retmax`` — maximum IDs to request

        Returns:
            List of dicts with PMC IDs and article URLs (first 10 only)
        """
        articles = []
        # NCBI E-utilities esearch endpoint (JSON mode).
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        params = {
            "db": "pmc",
            "term": query,
            "retmax": max_results,
            "retmode": "json",
        }
        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return articles
        try:
            self._rate_limit("pubmed")
            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
            pmc_ids = data.get("esearchresult", {}).get("idlist", [])
            for pmc_id in pmc_ids[:10]:  # Limit for demo
                articles.append({
                    "source": "pubmed",
                    "pmc_id": pmc_id,
                    "url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/",
                })
            print(f"Found {len(pmc_ids)} PubMed articles")
        except Exception as e:
            print(f"Error scraping PubMed: {e}")
        return articles

    def scrape_wikipedia(
        self,
        topic: str,
        max_pages: int = 10,
    ) -> List[Dict]:
        """Scrape the intro extract of a Wikipedia article via the API.

        Args:
            topic: Article title to fetch
            max_pages: NOTE(review): currently unused — a ``titles``
                query returns at most one page. Kept for interface
                compatibility.

        Returns:
            List of dicts with title and intro text
        """
        pages = []
        # MediaWiki API endpoint.
        url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "format": "json",
            "prop": "extracts",
            "exintro": True,
            "titles": topic,
            "redirects": True,
        }
        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return pages
        try:
            self._rate_limit("wikipedia")
            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
            pages_data = data.get("query", {}).get("pages", {})
            # Page IDs (the dict keys) are not needed; iterate values.
            for page in pages_data.values():
                if "extract" in page:
                    pages.append({
                        "source": "wikipedia",
                        "title": page.get("title", ""),
                        "text": page.get("extract", ""),
                    })
        except Exception as e:
            print(f"Error scraping Wikipedia: {e}")
        return pages

    def scrape_nist(
        self,
        element: str,
    ) -> List[Dict]:
        """Scrape NIST chemistry webbook for element data.

        Args:
            element: Chemical formula passed to the webbook query

        Returns:
            Single-item list with a truncated raw-HTML snapshot
            (table parsing is not yet implemented)
        """
        data = []
        url = "https://webbook.nist.gov/cgi/cbook.cgi"
        params = {
            "Formula": element,
            "Units": "SI",
            "Submit": "Submit",
        }
        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return data
        try:
            self._rate_limit("nist")
            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
            # Placeholder - would parse HTML tables
            data.append({
                "source": "nist",
                "element": element,
                "html": response.text[:1000],
            })
        except Exception as e:
            print(f"Error scraping NIST: {e}")
        return data

    def scrape_nasa(
        self,
        query: str,
        max_results: int = 50,
    ) -> List[Dict]:
        """Scrape NASA technical reports from the NTRS citations API.

        Args:
            query: Free-text search query
            max_results: Requested page size

        Returns:
            List of dicts with title, abstract, and PDF download URL
            (first 10 only)
        """
        reports = []
        url = "https://ntrs.nasa.gov/api/citations/search"
        params = {
            "q": query,
            "page[size]": max_results,
        }
        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return reports
        try:
            self._rate_limit("nasa")
            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
            for item in data.get("data", [])[:10]:
                attrs = item.get("attributes", {})
                reports.append({
                    "source": "nasa",
                    "title": attrs.get("title", ""),
                    "abstract": attrs.get("abstract", ""),
                    "download_url": attrs.get("downloads", {}).get("pdf", ""),
                })
        except Exception as e:
            print(f"Error scraping NASA: {e}")
        return reports

    def save_results(
        self,
        results: List[Dict],
        filename: str,
    ):
        """Save scraped results to JSON under ``self.output_dir``."""
        output_path = self.output_dir / filename
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(results)} results to {output_path}")

    def scrape_all_sources(
        self,
        queries: Dict[str, str],
        max_per_source: int = 50,
    ) -> Dict[str, List[Dict]]:
        """
        Scrape all sources with given queries.

        Args:
            queries: Dict mapping source name to query string
            max_per_source: Max results per source

        Returns:
            Dict mapping source to list of results. Each source's
            results are also saved to ``{source}_results.json``.
        """
        all_results = {}
        for source, query in queries.items():
            if source not in self.SOURCES:
                print(f"Unknown source: {source}")
                continue
            print(f"Scraping {source} with query: {query}")
            if source == "arxiv":
                results = self.scrape_arxiv(query, max_results=max_per_source)
            elif source == "pubmed":
                results = self.scrape_pubmed(query, max_results=max_per_source)
            elif source == "wikipedia":
                results = self.scrape_wikipedia(query, max_pages=max_per_source)
            elif source == "nist":
                results = self.scrape_nist(query)
            elif source == "nasa":
                results = self.scrape_nasa(query, max_results=max_per_source)
            else:
                results = []
            all_results[source] = results
            # Save intermediate results
            self.save_results(results, f"{source}_results.json")
        return all_results
def test_scraper():
    """Run a small live smoke test: one Wikipedia fetch, one arXiv fetch."""
    scraper = VortexScienceScraper()

    # Wikipedia is the cheapest source, so try it first.
    print("Testing Wikipedia scrape...")
    wiki_pages = scraper.scrape_wikipedia("quantum mechanics", max_pages=2)
    print(f"Got {len(wiki_pages)} Wikipedia pages")

    # arXiv is rate-limited to one request per second.
    print("Testing arXiv scrape...")
    arxiv_papers = scraper.scrape_arxiv("quantum", max_results=5)
    print(f"Got {len(arxiv_papers)} arXiv papers")

    print("Scraper test passed!")
# Run the network smoke test only when executed as a script.
if __name__ == "__main__":
    test_scraper()