data/scraper.py · Matrix-Corp/Vortex-7b-V1 at main

File size: 12,847 Bytes

bf64b03

"""

VortexScienceScraper: Scrapes scientific content from open access sources.

Respects robots.txt and rate limits.

"""

import time
import requests
from typing import List, Dict, Optional
from urllib.robotparser import RobotFileParser
from pathlib import Path
import json


class VortexScienceScraper:
    """

    Scrapes scientific content from open access sources.

    Sources: arXiv, PubMed Central, Wikipedia, NIST, NASA.

    """

    SOURCES = {
        "arxiv": {
            "base_url": "https://arxiv.org",
            "search_url": "https://arxiv.org/search/",
            "rate_limit": 1.0,  # seconds between requests
            "robots": "https://arxiv.org/robots.txt",
        },
        "pubmed": {
            "base_url": "https://www.ncbi.nlm.nih.gov/pmc",
            "search_url": "https://www.ncbi.nlm.nih.gov/pmc/articles/",
            "rate_limit": 0.5,
            "robots": "https://www.ncbi.nlm.nih.gov/robots.txt",
        },
        "wikipedia": {
            "base_url": "https://en.wikipedia.org",
            "search_url": "https://en.wikipedia.org/w/api.php",
            "rate_limit": 0.1,
            "robots": "https://en.wikipedia.org/robots.txt",
        },
        "nist": {
            "base_url": "https://webbook.nist.gov",
            "search_url": "https://webbook.nist.gov/cgi/cbook.cgi",
            "rate_limit": 1.0,
            "robots": "https://webbook.nist.gov/robots.txt",
        },
        "nasa": {
            "base_url": "https://ntrs.nasa.gov",
            "search_url": "https://ntrs.nasa.gov/api/citations/search",
            "rate_limit": 1.0,
            "robots": "https://ntrs.nasa.gov/robots.txt",
        },
    }

    def __init__(

        self,

        output_dir: str = "./data/scraped",

        respect_robots: bool = True,

        user_agent: str = "VortexScientificBot/1.0",

    ):
        """

        Initialize scraper.



        Args:

            output_dir: Directory to save scraped data

            respect_robots: Whether to respect robots.txt

            user_agent: User agent string for requests

        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.respect_robots = respect_robots
        self.user_agent = user_agent

        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})

        # Cache for robots.txt
        self.robots_cache = {}

        # Rate limit tracking
        self.last_request_time = {}

    def _check_robots_allowed(self, url: str) -> bool:
        """Check if robots.txt allows scraping the URL."""
        if not self.respect_robots:
            return True

        # Extract base domain
        from urllib.parse import urlparse
        parsed = urlparse(url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"

        if base_url not in self.robots_cache:
            rp = RobotFileParser()
            rp.set_url(base_url + "/robots.txt")
            try:
                rp.read()
                self.robots_cache[base_url] = rp
            except Exception as e:
                print(f"Could not read robots.txt for {base_url}: {e}")
                return False

        rp = self.robots_cache[base_url]
        return rp.can_fetch(self.user_agent, url)

    def _rate_limit(self, source: str):
        """Enforce rate limiting for a source."""
        now = time.time()
        last = self.last_request_time.get(source, 0)
        delay = self.SOURCES[source]["rate_limit"]
        if now - last < delay:
            time.sleep(delay - (now - last))
        self.last_request_time[source] = time.time()

    def scrape_arxiv(

        self,

        query: str,

        max_results: int = 100,

        categories: Optional[List[str]] = None,

    ) -> List[Dict]:
        """

        Scrape arXiv papers.



        Args:

            query: Search query

            max_results: Maximum number of results

            categories: Optional list of arXiv categories (e.g., ['physics', 'math'])



        Returns:

            List of paper metadata and abstracts

        """
        papers = []

        params = {
            "query": query,
            "searchtype": "all",
            "abstracts": "show",
            "size": min(max_results, 200),  # arXiv max per page
            "order": "-announced_date_first",
        }

        if categories:
            params["filter"] = "categories:" + "+OR+".join(categories)

        url = self.SOURCES["arxiv"]["search_url"]

        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows scraping {url}")
            return papers

        try:
            self._rate_limit("arxiv")
            response = self.session.get(url, params=params)
            response.raise_for_status()

            # Parse HTML (simplified - would use BeautifulSoup in practice)
            # For now, return placeholder
            print(f"Scraped arXiv query '{query}' - got response status {response.status_code}")

            # Placeholder: would extract paper titles, abstracts, PDF links
            for i in range(min(10, max_results)):
                papers.append({
                    "source": "arxiv",
                    "title": f"Paper {i}",
                    "abstract": "Abstract placeholder...",
                    "pdf_url": f"https://arxiv.org/pdf/{i}.pdf",
                })

        except Exception as e:
            print(f"Error scraping arXiv: {e}")

        return papers

    def scrape_pubmed(

        self,

        query: str,

        max_results: int = 100,

    ) -> List[Dict]:
        """Scrape PubMed Central articles."""
        articles = []

        # PubMed API endpoint
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        params = {
            "db": "pmc",
            "term": query,
            "retmax": max_results,
            "retmode": "json",
        }

        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return articles

        try:
            self._rate_limit("pubmed")
            response = self.session.get(url, params=params)
            response.raise_for_status()

            data = response.json()
            pmc_ids = data.get("esearchresult", {}).get("idlist", [])

            for pmc_id in pmc_ids[:10]:  # Limit for demo
                articles.append({
                    "source": "pubmed",
                    "pmc_id": pmc_id,
                    "url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/",
                })

            print(f"Found {len(pmc_ids)} PubMed articles")

        except Exception as e:
            print(f"Error scraping PubMed: {e}")

        return articles

    def scrape_wikipedia(

        self,

        topic: str,

        max_pages: int = 10,

    ) -> List[Dict]:
        """Scrape Wikipedia science articles."""
        pages = []

        # Wikipedia API
        url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "format": "json",
            "prop": "extracts",
            "exintro": True,
            "titles": topic,
            "redirects": True,
        }

        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return pages

        try:
            self._rate_limit("wikipedia")
            response = self.session.get(url, params=params)
            response.raise_for_status()

            data = response.json()
            pages_data = data.get("query", {}).get("pages", {})

            for page_id, page in pages_data.items():
                if "extract" in page:
                    pages.append({
                        "source": "wikipedia",
                        "title": page.get("title", ""),
                        "text": page.get("extract", ""),
                    })

        except Exception as e:
            print(f"Error scraping Wikipedia: {e}")

        return pages

    def scrape_nist(

        self,

        element: str,

    ) -> List[Dict]:
        """Scrape NIST chemistry webbook for element data."""
        data = []

        url = "https://webbook.nist.gov/cgi/cbook.cgi"
        params = {
            "Formula": element,
            "Units": "SI",
            "Submit": "Submit",
        }

        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return data

        try:
            self._rate_limit("nist")
            response = self.session.get(url, params=params)
            response.raise_for_status()

            # Placeholder - would parse HTML tables
            data.append({
                "source": "nist",
                "element": element,
                "html": response.text[:1000],
            })

        except Exception as e:
            print(f"Error scraping NIST: {e}")

        return data

    def scrape_nasa(

        self,

        query: str,

        max_results: int = 50,

    ) -> List[Dict]:
        """Scrape NASA technical reports."""
        reports = []

        url = "https://ntrs.nasa.gov/api/citations/search"
        params = {
            "q": query,
            "page[size]": max_results,
        }

        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return reports

        try:
            self._rate_limit("nasa")
            response = self.session.get(url, params=params)
            response.raise_for_status()

            data = response.json()
            for item in data.get("data", [])[:10]:
                reports.append({
                    "source": "nasa",
                    "title": item.get("attributes", {}).get("title", ""),
                    "abstract": item.get("attributes", {}).get("abstract", ""),
                    "download_url": item.get("attributes", {}).get("downloads", {}).get("pdf", ""),
                })

        except Exception as e:
            print(f"Error scraping NASA: {e}")

        return reports

    def save_results(

        self,

        results: List[Dict],

        filename: str,

    ):
        """Save scraped results to JSON."""
        output_path = self.output_dir / filename
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(results)} results to {output_path}")

    def scrape_all_sources(

        self,

        queries: Dict[str, str],

        max_per_source: int = 50,

    ) -> Dict[str, List[Dict]]:
        """

        Scrape all sources with given queries.



        Args:

            queries: Dict mapping source name to query string

            max_per_source: Max results per source



        Returns:

            Dict mapping source to list of results

        """
        all_results = {}

        for source, query in queries.items():
            if source not in self.SOURCES:
                print(f"Unknown source: {source}")
                continue

            print(f"Scraping {source} with query: {query}")

            if source == "arxiv":
                results = self.scrape_arxiv(query, max_results=max_per_source)
            elif source == "pubmed":
                results = self.scrape_pubmed(query, max_results=max_per_source)
            elif source == "wikipedia":
                results = self.scrape_wikipedia(query, max_pages=max_per_source)
            elif source == "nist":
                results = self.scrape_nist(query)
            elif source == "nasa":
                results = self.scrape_nasa(query, max_results=max_per_source)
            else:
                results = []

            all_results[source] = results

            # Save intermediate results
            self.save_results(results, f"{source}_results.json")

        return all_results


def test_scraper():
    """Test the scraper (limited)."""
    scraper = VortexScienceScraper()

    # Test Wikipedia (lightweight)
    print("Testing Wikipedia scrape...")
    results = scraper.scrape_wikipedia("quantum mechanics", max_pages=2)
    print(f"Got {len(results)} Wikipedia pages")

    # Test arXiv (rate limited)
    print("Testing arXiv scrape...")
    results = scraper.scrape_arxiv("quantum", max_results=5)
    print(f"Got {len(results)} arXiv papers")

    print("Scraper test passed!")


if __name__ == "__main__":
    test_scraper()