"""
VortexScienceScraper: Scrapes scientific content from open access sources.

Respects robots.txt and rate limits.
"""
|
| |
|
| | import time
|
| | import requests
|
| | from typing import List, Dict, Optional
|
| | from urllib.robotparser import RobotFileParser
|
| | from pathlib import Path
|
| | import json
|
| |
|
| |
|
class VortexScienceScraper:
    """Scrape scientific content from open access sources.

    Sources: arXiv, PubMed Central, Wikipedia, NIST, NASA.
    Optionally respects robots.txt and enforces a per-source
    minimum delay between requests.
    """

    # Per-source endpoints and the minimum delay (seconds) between
    # consecutive requests to that source.
    SOURCES = {
        "arxiv": {
            "base_url": "https://arxiv.org",
            "search_url": "https://arxiv.org/search/",
            "rate_limit": 1.0,
            "robots": "https://arxiv.org/robots.txt",
        },
        "pubmed": {
            "base_url": "https://www.ncbi.nlm.nih.gov/pmc",
            "search_url": "https://www.ncbi.nlm.nih.gov/pmc/articles/",
            "rate_limit": 0.5,
            "robots": "https://www.ncbi.nlm.nih.gov/robots.txt",
        },
        "wikipedia": {
            "base_url": "https://en.wikipedia.org",
            "search_url": "https://en.wikipedia.org/w/api.php",
            "rate_limit": 0.1,
            "robots": "https://en.wikipedia.org/robots.txt",
        },
        "nist": {
            "base_url": "https://webbook.nist.gov",
            "search_url": "https://webbook.nist.gov/cgi/cbook.cgi",
            "rate_limit": 1.0,
            "robots": "https://webbook.nist.gov/robots.txt",
        },
        "nasa": {
            "base_url": "https://ntrs.nasa.gov",
            "search_url": "https://ntrs.nasa.gov/api/citations/search",
            "rate_limit": 1.0,
            "robots": "https://ntrs.nasa.gov/robots.txt",
        },
    }

    # Upper bound (seconds) on any single HTTP request so a stalled
    # server cannot hang the scraper indefinitely.
    REQUEST_TIMEOUT = 30.0

    def __init__(
        self,
        output_dir: str = "./data/scraped",
        respect_robots: bool = True,
        user_agent: str = "VortexScientificBot/1.0",
    ):
        """
        Initialize scraper.

        Args:
            output_dir: Directory to save scraped data (created if missing)
            respect_robots: Whether to respect robots.txt
            user_agent: User agent string for requests
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.respect_robots = respect_robots
        self.user_agent = user_agent

        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})

        # base_url -> RobotFileParser, so each host's robots.txt is fetched once.
        self.robots_cache: Dict[str, RobotFileParser] = {}

        # source name -> time.time() of the last request, for rate limiting.
        self.last_request_time: Dict[str, float] = {}

    def _check_robots_allowed(self, url: str) -> bool:
        """Check if robots.txt allows scraping the URL.

        Returns True unconditionally when respect_robots is off.
        Fails closed (returns False) when robots.txt cannot be read.
        """
        if not self.respect_robots:
            return True

        from urllib.parse import urlparse
        parsed = urlparse(url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"

        if base_url not in self.robots_cache:
            rp = RobotFileParser()
            rp.set_url(base_url + "/robots.txt")
            try:
                rp.read()
                self.robots_cache[base_url] = rp
            except Exception as e:
                # NOTE(review): failures are not cached, so a flaky host is
                # retried on the next call — deliberate best-effort behavior.
                print(f"Could not read robots.txt for {base_url}: {e}")
                return False

        rp = self.robots_cache[base_url]
        return rp.can_fetch(self.user_agent, url)

    def _rate_limit(self, source: str):
        """Sleep just long enough to honor the source's rate_limit delay."""
        now = time.time()
        last = self.last_request_time.get(source, 0)
        delay = self.SOURCES[source]["rate_limit"]
        if now - last < delay:
            time.sleep(delay - (now - last))
        self.last_request_time[source] = time.time()

    def scrape_arxiv(
        self,
        query: str,
        max_results: int = 100,
        categories: Optional[List[str]] = None,
    ) -> List[Dict]:
        """
        Scrape arXiv papers.

        Args:
            query: Search query
            max_results: Maximum number of results
            categories: Optional list of arXiv categories (e.g., ['physics', 'math'])

        Returns:
            List of paper metadata and abstracts
        """
        papers: List[Dict] = []

        params = {
            "query": query,
            "searchtype": "all",
            "abstracts": "show",
            # arXiv's search UI caps the page size at 200.
            "size": min(max_results, 200),
            "order": "-announced_date_first",
        }

        if categories:
            params["filter"] = "categories:" + "+OR+".join(categories)

        url = self.SOURCES["arxiv"]["search_url"]

        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows scraping {url}")
            return papers

        try:
            self._rate_limit("arxiv")
            response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            print(f"Scraped arXiv query '{query}' - got response status {response.status_code}")

            # TODO: parse the HTML response. The entries below are
            # placeholders; titles/abstracts are not real results.
            for i in range(min(10, max_results)):
                papers.append({
                    "source": "arxiv",
                    "title": f"Paper {i}",
                    "abstract": "Abstract placeholder...",
                    "pdf_url": f"https://arxiv.org/pdf/{i}.pdf",
                })

        except Exception as e:
            print(f"Error scraping arXiv: {e}")

        return papers

    def scrape_pubmed(
        self,
        query: str,
        max_results: int = 100,
    ) -> List[Dict]:
        """Scrape PubMed Central article IDs via the NCBI E-utilities API.

        Args:
            query: Search term passed to esearch
            max_results: Maximum number of article records to return

        Returns:
            List of dicts with the PMC id and article URL.
        """
        articles: List[Dict] = []

        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        params = {
            "db": "pmc",
            "term": query,
            "retmax": max_results,
            "retmode": "json",
        }

        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return articles

        try:
            self._rate_limit("pubmed")
            response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            data = response.json()
            pmc_ids = data.get("esearchresult", {}).get("idlist", [])

            # Was hard-capped at [:10], which silently ignored max_results;
            # honor the caller's limit instead.
            for pmc_id in pmc_ids[:max_results]:
                articles.append({
                    "source": "pubmed",
                    "pmc_id": pmc_id,
                    "url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/",
                })

            print(f"Found {len(pmc_ids)} PubMed articles")

        except Exception as e:
            print(f"Error scraping PubMed: {e}")

        return articles

    def scrape_wikipedia(
        self,
        topic: str,
        max_pages: int = 10,
    ) -> List[Dict]:
        """Scrape Wikipedia intro extracts for a topic via the MediaWiki API.

        Args:
            topic: Page title(s) to query
            max_pages: Accepted for API symmetry; the query itself returns
                whatever pages match the title lookup.

        Returns:
            List of dicts with page title and intro text.
        """
        pages: List[Dict] = []

        url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "format": "json",
            "prop": "extracts",
            "exintro": True,
            "titles": topic,
            "redirects": True,
        }

        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return pages

        try:
            self._rate_limit("wikipedia")
            response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            data = response.json()
            pages_data = data.get("query", {}).get("pages", {})

            for page_id, page in pages_data.items():
                # Pages with id -1 (missing) have no "extract" key.
                if "extract" in page:
                    pages.append({
                        "source": "wikipedia",
                        "title": page.get("title", ""),
                        "text": page.get("extract", ""),
                    })

        except Exception as e:
            print(f"Error scraping Wikipedia: {e}")

        return pages

    def scrape_nist(
        self,
        element: str,
    ) -> List[Dict]:
        """Scrape NIST chemistry webbook for element data.

        Args:
            element: Chemical formula to look up

        Returns:
            List with a single dict holding a truncated HTML snapshot.
        """
        data: List[Dict] = []

        url = "https://webbook.nist.gov/cgi/cbook.cgi"
        params = {
            "Formula": element,
            "Units": "SI",
            "Submit": "Submit",
        }

        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return data

        try:
            self._rate_limit("nist")
            response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            # Only the first 1000 chars of HTML are kept; real parsing is
            # left to a downstream consumer.
            data.append({
                "source": "nist",
                "element": element,
                "html": response.text[:1000],
            })

        except Exception as e:
            print(f"Error scraping NIST: {e}")

        return data

    def scrape_nasa(
        self,
        query: str,
        max_results: int = 50,
    ) -> List[Dict]:
        """Scrape NASA technical reports from the NTRS citations API.

        Args:
            query: Full-text search query
            max_results: Maximum number of reports to return

        Returns:
            List of dicts with title, abstract, and PDF download URL.
        """
        reports: List[Dict] = []

        url = "https://ntrs.nasa.gov/api/citations/search"
        params = {
            "q": query,
            "page[size]": max_results,
        }

        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return reports

        try:
            self._rate_limit("nasa")
            response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            data = response.json()
            # Was hard-capped at [:10], which silently ignored max_results;
            # honor the caller's limit instead.
            for item in data.get("data", [])[:max_results]:
                attrs = item.get("attributes", {})
                reports.append({
                    "source": "nasa",
                    "title": attrs.get("title", ""),
                    "abstract": attrs.get("abstract", ""),
                    "download_url": attrs.get("downloads", {}).get("pdf", ""),
                })

        except Exception as e:
            print(f"Error scraping NASA: {e}")

        return reports

    def save_results(
        self,
        results: List[Dict],
        filename: str,
    ):
        """Save scraped results to JSON under the configured output_dir."""
        output_path = self.output_dir / filename
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(results)} results to {output_path}")

    def scrape_all_sources(
        self,
        queries: Dict[str, str],
        max_per_source: int = 50,
    ) -> Dict[str, List[Dict]]:
        """
        Scrape all sources with given queries.

        Args:
            queries: Dict mapping source name to query string
            max_per_source: Max results per source

        Returns:
            Dict mapping source to list of results
        """
        all_results: Dict[str, List[Dict]] = {}

        for source, query in queries.items():
            if source not in self.SOURCES:
                print(f"Unknown source: {source}")
                continue

            print(f"Scraping {source} with query: {query}")

            if source == "arxiv":
                results = self.scrape_arxiv(query, max_results=max_per_source)
            elif source == "pubmed":
                results = self.scrape_pubmed(query, max_results=max_per_source)
            elif source == "wikipedia":
                results = self.scrape_wikipedia(query, max_pages=max_per_source)
            elif source == "nist":
                # NIST takes a formula, not a paged query; no limit applies.
                results = self.scrape_nist(query)
            elif source == "nasa":
                results = self.scrape_nasa(query, max_results=max_per_source)
            else:
                results = []

            all_results[source] = results

            # Persist each source's results as soon as they are fetched.
            self.save_results(results, f"{source}_results.json")

        return all_results
|
| |
|
| |
|
def test_scraper():
    """Run a limited, live-network smoke test of the scraper."""
    scraper = VortexScienceScraper()

    # Wikipedia is the cheapest source to hit, so check it first.
    print("Testing Wikipedia scrape...")
    results = scraper.scrape_wikipedia("quantum mechanics", max_pages=2)
    print(f"Got {len(results)} Wikipedia pages")

    # A tiny arXiv query exercises the rate limiter and robots check.
    print("Testing arXiv scrape...")
    results = scraper.scrape_arxiv("quantum", max_results=5)
    print(f"Got {len(results)} arXiv papers")

    print("Scraper test passed!")
|
| |
|
| |
|
if __name__ == "__main__":
    # Run the limited live-network smoke test when executed directly.
    test_scraper()
|
| |
|