""" VortexScienceScraper: Scrapes scientific content from open access sources. Respects robots.txt and rate limits. """ import time import requests from typing import List, Dict, Optional from urllib.robotparser import RobotFileParser from pathlib import Path import json class VortexScienceScraper: """ Scrapes scientific content from open access sources. Sources: arXiv, PubMed Central, Wikipedia, NIST, NASA. """ SOURCES = { "arxiv": { "base_url": "https://arxiv.org", "search_url": "https://arxiv.org/search/", "rate_limit": 1.0, # seconds between requests "robots": "https://arxiv.org/robots.txt", }, "pubmed": { "base_url": "https://www.ncbi.nlm.nih.gov/pmc", "search_url": "https://www.ncbi.nlm.nih.gov/pmc/articles/", "rate_limit": 0.5, "robots": "https://www.ncbi.nlm.nih.gov/robots.txt", }, "wikipedia": { "base_url": "https://en.wikipedia.org", "search_url": "https://en.wikipedia.org/w/api.php", "rate_limit": 0.1, "robots": "https://en.wikipedia.org/robots.txt", }, "nist": { "base_url": "https://webbook.nist.gov", "search_url": "https://webbook.nist.gov/cgi/cbook.cgi", "rate_limit": 1.0, "robots": "https://webbook.nist.gov/robots.txt", }, "nasa": { "base_url": "https://ntrs.nasa.gov", "search_url": "https://ntrs.nasa.gov/api/citations/search", "rate_limit": 1.0, "robots": "https://ntrs.nasa.gov/robots.txt", }, } def __init__( self, output_dir: str = "./data/scraped", respect_robots: bool = True, user_agent: str = "VortexScientificBot/1.0", ): """ Initialize scraper. Args: output_dir: Directory to save scraped data respect_robots: Whether to respect robots.txt user_agent: User agent string for requests """ self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) self.respect_robots = respect_robots self.user_agent = user_agent self.session = requests.Session() self.session.headers.update({"User-Agent": user_agent}) # Cache for robots.txt self.robots_cache = {} # Rate limit tracking self.last_request_time = {} def _check_robots_allowed(self, url: str) -> bool: """Check if robots.txt allows scraping the URL.""" if not self.respect_robots: return True # Extract base domain from urllib.parse import urlparse parsed = urlparse(url) base_url = f"{parsed.scheme}://{parsed.netloc}" if base_url not in self.robots_cache: rp = RobotFileParser() rp.set_url(base_url + "/robots.txt") try: rp.read() self.robots_cache[base_url] = rp except Exception as e: print(f"Could not read robots.txt for {base_url}: {e}") return False rp = self.robots_cache[base_url] return rp.can_fetch(self.user_agent, url) def _rate_limit(self, source: str): """Enforce rate limiting for a source.""" now = time.time() last = self.last_request_time.get(source, 0) delay = self.SOURCES[source]["rate_limit"] if now - last < delay: time.sleep(delay - (now - last)) self.last_request_time[source] = time.time() def scrape_arxiv( self, query: str, max_results: int = 100, categories: Optional[List[str]] = None, ) -> List[Dict]: """ Scrape arXiv papers. Args: query: Search query max_results: Maximum number of results categories: Optional list of arXiv categories (e.g., ['physics', 'math']) Returns: List of paper metadata and abstracts """ papers = [] params = { "query": query, "searchtype": "all", "abstracts": "show", "size": min(max_results, 200), # arXiv max per page "order": "-announced_date_first", } if categories: params["filter"] = "categories:" + "+OR+".join(categories) url = self.SOURCES["arxiv"]["search_url"] if not self._check_robots_allowed(url): print(f"Robots.txt disallows scraping {url}") return papers try: self._rate_limit("arxiv") response = self.session.get(url, params=params) response.raise_for_status() # Parse HTML (simplified - would use BeautifulSoup in practice) # For now, return placeholder print(f"Scraped arXiv query '{query}' - got response status {response.status_code}") # Placeholder: would extract paper titles, abstracts, PDF links for i in range(min(10, max_results)): papers.append({ "source": "arxiv", "title": f"Paper {i}", "abstract": "Abstract placeholder...", "pdf_url": f"https://arxiv.org/pdf/{i}.pdf", }) except Exception as e: print(f"Error scraping arXiv: {e}") return papers def scrape_pubmed( self, query: str, max_results: int = 100, ) -> List[Dict]: """Scrape PubMed Central articles.""" articles = [] # PubMed API endpoint url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" params = { "db": "pmc", "term": query, "retmax": max_results, "retmode": "json", } if not self._check_robots_allowed(url): print(f"Robots.txt disallows {url}") return articles try: self._rate_limit("pubmed") response = self.session.get(url, params=params) response.raise_for_status() data = response.json() pmc_ids = data.get("esearchresult", {}).get("idlist", []) for pmc_id in pmc_ids[:10]: # Limit for demo articles.append({ "source": "pubmed", "pmc_id": pmc_id, "url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/", }) print(f"Found {len(pmc_ids)} PubMed articles") except Exception as e: print(f"Error scraping PubMed: {e}") return articles def scrape_wikipedia( self, topic: str, max_pages: int = 10, ) -> List[Dict]: """Scrape Wikipedia science articles.""" pages = [] # Wikipedia API url = "https://en.wikipedia.org/w/api.php" params = { "action": "query", "format": "json", "prop": "extracts", "exintro": True, "titles": topic, "redirects": True, } if not self._check_robots_allowed(url): print(f"Robots.txt disallows {url}") return pages try: self._rate_limit("wikipedia") response = self.session.get(url, params=params) response.raise_for_status() data = response.json() pages_data = data.get("query", {}).get("pages", {}) for page_id, page in pages_data.items(): if "extract" in page: pages.append({ "source": "wikipedia", "title": page.get("title", ""), "text": page.get("extract", ""), }) except Exception as e: print(f"Error scraping Wikipedia: {e}") return pages def scrape_nist( self, element: str, ) -> List[Dict]: """Scrape NIST chemistry webbook for element data.""" data = [] url = "https://webbook.nist.gov/cgi/cbook.cgi" params = { "Formula": element, "Units": "SI", "Submit": "Submit", } if not self._check_robots_allowed(url): print(f"Robots.txt disallows {url}") return data try: self._rate_limit("nist") response = self.session.get(url, params=params) response.raise_for_status() # Placeholder - would parse HTML tables data.append({ "source": "nist", "element": element, "html": response.text[:1000], }) except Exception as e: print(f"Error scraping NIST: {e}") return data def scrape_nasa( self, query: str, max_results: int = 50, ) -> List[Dict]: """Scrape NASA technical reports.""" reports = [] url = "https://ntrs.nasa.gov/api/citations/search" params = { "q": query, "page[size]": max_results, } if not self._check_robots_allowed(url): print(f"Robots.txt disallows {url}") return reports try: self._rate_limit("nasa") response = self.session.get(url, params=params) response.raise_for_status() data = response.json() for item in data.get("data", [])[:10]: reports.append({ "source": "nasa", "title": item.get("attributes", {}).get("title", ""), "abstract": item.get("attributes", {}).get("abstract", ""), "download_url": item.get("attributes", {}).get("downloads", {}).get("pdf", ""), }) except Exception as e: print(f"Error scraping NASA: {e}") return reports def save_results( self, results: List[Dict], filename: str, ): """Save scraped results to JSON.""" output_path = self.output_dir / filename with open(output_path, "w", encoding="utf-8") as f: json.dump(results, f, indent=2, ensure_ascii=False) print(f"Saved {len(results)} results to {output_path}") def scrape_all_sources( self, queries: Dict[str, str], max_per_source: int = 50, ) -> Dict[str, List[Dict]]: """ Scrape all sources with given queries. Args: queries: Dict mapping source name to query string max_per_source: Max results per source Returns: Dict mapping source to list of results """ all_results = {} for source, query in queries.items(): if source not in self.SOURCES: print(f"Unknown source: {source}") continue print(f"Scraping {source} with query: {query}") if source == "arxiv": results = self.scrape_arxiv(query, max_results=max_per_source) elif source == "pubmed": results = self.scrape_pubmed(query, max_results=max_per_source) elif source == "wikipedia": results = self.scrape_wikipedia(query, max_pages=max_per_source) elif source == "nist": results = self.scrape_nist(query) elif source == "nasa": results = self.scrape_nasa(query, max_results=max_per_source) else: results = [] all_results[source] = results # Save intermediate results self.save_results(results, f"{source}_results.json") return all_results def test_scraper(): """Test the scraper (limited).""" scraper = VortexScienceScraper() # Test Wikipedia (lightweight) print("Testing Wikipedia scrape...") results = scraper.scrape_wikipedia("quantum mechanics", max_pages=2) print(f"Got {len(results)} Wikipedia pages") # Test arXiv (rate limited) print("Testing arXiv scrape...") results = scraper.scrape_arxiv("quantum", max_results=5) print(f"Got {len(results)} arXiv papers") print("Scraper test passed!") if __name__ == "__main__": test_scraper()