# src/scraper/eyewiki_crawler.py
"""EyeWiki crawler for medical article scraping using crawl4ai."""
import asyncio
import json
import re
from collections import deque
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional, Set
from urllib.parse import urljoin, urlparse, parse_qs
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
from config.settings import settings
class EyeWikiCrawler:
"""
Asynchronous crawler for EyeWiki medical articles.
Features:
- Asynchronous crawling with crawl4ai
- Respects robots.txt
- Polite crawling with configurable delays
- Markdown content extraction
- Checkpointing for resume capability
- Progress tracking with rich console
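    Example (a minimal sketch; the repo's actual entry points may differ):
        crawler = EyeWikiCrawler(delay=2.0)
        asyncio.run(crawler.crawl(max_pages=50, depth=2))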
"""
def __init__(
self,
base_url: str = "https://eyewiki.org",
output_dir: Optional[Path] = None,
checkpoint_file: Optional[Path] = None,
delay: float = 1.5,
timeout: int = 30,
):
"""
Initialize the EyeWiki crawler.
Args:
base_url: Base URL for EyeWiki
output_dir: Directory to save scraped articles
checkpoint_file: Path to checkpoint file
delay: Delay between requests in seconds
timeout: Request timeout in seconds
"""
self.base_url = base_url
self.domain = urlparse(base_url).netloc
self.output_dir = output_dir or Path(settings.data_raw_path)
self.checkpoint_file = checkpoint_file or (self.output_dir / "crawler_checkpoint.json")
self.delay = delay
self.timeout = timeout
# Ensure output directory exists
self.output_dir.mkdir(parents=True, exist_ok=True)
# Crawl state
self.visited_urls: Set[str] = set()
self.to_crawl: deque = deque()
self.failed_urls: Dict[str, str] = {}
self.articles_saved: int = 0
# Rich console for logging
self.console = Console()
# Robot parser
self.robot_parser = RobotFileParser()
self.robot_parser.set_url(urljoin(base_url, "/robots.txt"))
# Patterns to skip
self.skip_patterns = [
r"/index\.php\?title=.*&action=", # Edit, history, etc.
r"/index\.php\?title=.*&diff=", # Page diffs
r"/index\.php\?title=.*&oldid=", # Page history/revisions
r"/index\.php\?title=.*&direction=", # Page navigation
r"/index\.php\?title=Special:", # Special pages (login, create account, etc.)
r"/Special:", # Special pages
r"/User:", # User pages
r"/User_talk:", # User talk pages
r"/Talk:", # Talk pages
r"/File:", # File pages
r"/Template:", # Template pages
r"/Help:", # Help pages
r"/MediaWiki:", # MediaWiki pages
r"#", # Anchor links
]
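        # Illustrative examples: "https://eyewiki.org/Cataract" passes
        # _is_valid_article_url, while "https://eyewiki.org/Special:RecentChanges" and
        # "https://eyewiki.org/index.php?title=Cataract&action=history" match the
        # patterns above and are rejected.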
def _is_valid_article_url(self, url: str) -> bool:
"""
Check if URL is a valid medical article.
Args:
url: URL to check
Returns:
True if valid article URL
"""
# Must be from eyewiki.org domain
if self.domain not in url:
return False
# Skip patterns (these take precedence)
for pattern in self.skip_patterns:
if re.search(pattern, url):
return False
# Parse URL to check path
parsed = urlparse(url)
path = parsed.path.strip("/")
# Must be article-like URL
# EyeWiki articles can be:
# 1. Direct: /Article_Name (e.g., /Cataract)
# 2. Wiki-style: /wiki/Article_Name
# 3. Query-based: /w/index.php?title=Article_Name
# For query-based URLs, check if title parameter exists and is not a special page
if parsed.query and "title=" in parsed.query:
return True
# For direct URLs, check if path is non-empty and looks like an article
# (starts with capital letter, no file extension)
        if path and not path.startswith("w/") and "." not in path:
# Path should look like an article name (capitalized, underscores/spaces)
if path[0].isupper() or path.startswith("wiki/"):
return True
return False
def _normalize_url(self, url: str) -> str:
"""
Normalize URL for consistent comparison.
Args:
url: URL to normalize
Returns:
Normalized URL
"""
# Remove fragment
url = url.split("#")[0]
# Remove trailing slash
url = url.rstrip("/")
return url
def _can_fetch(self, url: str) -> bool:
"""
Check if URL can be fetched according to robots.txt.
Args:
url: URL to check
Returns:
True if allowed to fetch
"""
try:
return self.robot_parser.can_fetch("*", url)
except Exception as e:
self.console.print(f"[yellow]Warning: Could not check robots.txt: {e}[/yellow]")
return True # Be permissive if robots.txt check fails
def _extract_links(self, html: str, current_url: str) -> Set[str]:
"""
Extract valid article links from HTML.
Args:
html: HTML content
current_url: Current page URL for resolving relative links
Returns:
Set of valid article URLs
"""
soup = BeautifulSoup(html, "html.parser")
links = set()
for a_tag in soup.find_all("a", href=True):
href = a_tag["href"]
# Resolve relative URLs
absolute_url = urljoin(current_url, href)
normalized_url = self._normalize_url(absolute_url)
if self._is_valid_article_url(normalized_url):
links.add(normalized_url)
return links
def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
"""
Extract metadata from article page.
Args:
soup: BeautifulSoup object
url: Article URL
Returns:
Dictionary of metadata
"""
metadata = {
"url": url,
"title": "",
"last_updated": None,
"categories": [],
"scraped_at": datetime.utcnow().isoformat(),
}
# Extract title
title_tag = soup.find("h1", {"id": "firstHeading"}) or soup.find("h1")
if title_tag:
metadata["title"] = title_tag.get_text(strip=True)
# Extract categories
category_links = soup.find_all("a", href=re.compile(r"/Category:"))
metadata["categories"] = [link.get_text(strip=True) for link in category_links]
# Extract last modified date (if available)
last_modified = soup.find("li", {"id": "footer-info-lastmod"})
if last_modified:
metadata["last_updated"] = last_modified.get_text(strip=True)
return metadata
def save_article(self, content: Dict, filepath: Path) -> None:
"""
Save article content and metadata to files.
Args:
content: Dictionary with 'markdown' and 'metadata' keys
filepath: Base filepath (without extension)
"""
# Save markdown content
md_file = filepath.with_suffix(".md")
with open(md_file, "w", encoding="utf-8") as f:
f.write(content["markdown"])
# Save metadata as JSON sidecar
json_file = filepath.with_suffix(".json")
with open(json_file, "w", encoding="utf-8") as f:
json.dump(content["metadata"], f, indent=2, ensure_ascii=False)
self.articles_saved += 1
        self.console.print(f"[green]✓[/green] Saved: {content['metadata'].get('title', 'Untitled')}")
def load_checkpoint(self) -> bool:
"""
Load checkpoint data to resume crawling.
Returns:
True if checkpoint was loaded successfully
"""
if not self.checkpoint_file.exists():
return False
try:
with open(self.checkpoint_file, "r") as f:
data = json.load(f)
self.visited_urls = set(data.get("visited_urls", []))
self.to_crawl = deque(data.get("to_crawl", []))
self.failed_urls = data.get("failed_urls", {})
self.articles_saved = data.get("articles_saved", 0)
self.console.print(f"[blue]Loaded checkpoint:[/blue] {len(self.visited_urls)} visited, "
f"{len(self.to_crawl)} queued, {self.articles_saved} saved")
return True
except Exception as e:
self.console.print(f"[red]Error loading checkpoint: {e}[/red]")
return False
def save_checkpoint(self) -> None:
"""Save current crawl state to checkpoint file."""
data = {
"visited_urls": list(self.visited_urls),
"to_crawl": list(self.to_crawl),
"failed_urls": self.failed_urls,
"articles_saved": self.articles_saved,
"last_checkpoint": datetime.utcnow().isoformat(),
}
try:
with open(self.checkpoint_file, "w") as f:
json.dump(data, f, indent=2)
except Exception as e:
self.console.print(f"[red]Error saving checkpoint: {e}[/red]")
async def crawl_single_page(self, url: str) -> Optional[Dict]:
"""
Crawl a single page and extract content.
Args:
url: URL to crawl
Returns:
Dictionary with markdown content and metadata, or None if failed
"""
if not self._can_fetch(url):
self.console.print(f"[yellow]Blocked by robots.txt:[/yellow] {url}")
return None
try:
# Configure browser settings
browser_config = BrowserConfig(
headless=True,
verbose=False,
)
# Configure crawler settings
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
page_timeout=self.timeout * 1000, # Convert to milliseconds
wait_for="body",
)
# Create crawler and run
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url=url,
config=crawler_config,
)
if not result.success:
self.console.print(f"[red]Failed to crawl:[/red] {url}")
return None
# Parse HTML for metadata
soup = BeautifulSoup(result.html, "html.parser")
metadata = self._extract_metadata(soup, url)
# Get markdown content
markdown = result.markdown
return {
"markdown": markdown,
"metadata": metadata,
"html": result.html,
"links": self._extract_links(result.html, url),
}
except Exception as e:
self.console.print(f"[red]Error crawling {url}:[/red] {e}")
self.failed_urls[url] = str(e)
return None
async def crawl(
self,
max_pages: Optional[int] = None,
depth: int = 2,
start_urls: Optional[list] = None,
) -> None:
"""
Crawl EyeWiki starting from the main page.
Args:
max_pages: Maximum number of pages to crawl (None for unlimited)
depth: Maximum depth to crawl
start_urls: Optional list of starting URLs (defaults to base_url)
"""
# Try to load checkpoint
checkpoint_loaded = self.load_checkpoint()
# Initialize robot parser
try:
self.robot_parser.read()
            self.console.print("[green]✓[/green] Loaded robots.txt")
except Exception as e:
self.console.print(f"[yellow]Warning: Could not load robots.txt: {e}[/yellow]")
# Initialize queue if not loaded from checkpoint
if not checkpoint_loaded:
if start_urls:
self.to_crawl.extend([(url, 0) for url in start_urls])
else:
self.to_crawl.append((self.base_url, 0))
self.console.print(f"\n[bold cyan]Starting EyeWiki Crawl[/bold cyan]")
self.console.print(f"Max pages: {max_pages or 'unlimited'}")
self.console.print(f"Max depth: {depth}")
self.console.print(f"Delay: {self.delay}s\n")
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
console=self.console,
) as progress:
task = progress.add_task(
"[cyan]Crawling...",
total=max_pages if max_pages else 100,
)
try:
while self.to_crawl:
# Check max_pages limit
if max_pages and self.articles_saved >= max_pages:
self.console.print(f"\n[yellow]Reached max_pages limit: {max_pages}[/yellow]")
break
# Get next URL
current_url, current_depth = self.to_crawl.popleft()
# Skip if already visited
if current_url in self.visited_urls:
continue
# Check depth limit
if current_depth > depth:
continue
# Mark as visited
self.visited_urls.add(current_url)
# Update progress
progress.update(
task,
completed=self.articles_saved,
description=f"[cyan]Crawling ({self.articles_saved} saved, {len(self.to_crawl)} queued): {current_url[:60]}...",
)
# Crawl the page
result = await self.crawl_single_page(current_url)
if result:
# Create filename from URL
parsed = urlparse(current_url)
# For URLs with query parameters (like index.php?title=Article_Name),
# extract the title parameter
if parsed.query:
query_params = parse_qs(parsed.query)
if 'title' in query_params:
# Use the title parameter as filename
filename = query_params['title'][0]
else:
# Fallback: use the entire query string
filename = parsed.query
else:
# Use path-based filename for clean URLs like /wiki/Article_Name
path_parts = parsed.path.strip("/").split("/")
filename = "_".join(path_parts[-2:]) if len(path_parts) > 1 else path_parts[-1]
                        # Clean filename (fall back to "index" for the bare root URL)
                        filename = re.sub(r"[^\w\s-]", "_", filename)
                        filename = re.sub(r"[-\s]+", "_", filename)
                        filename = filename[:200] or "index"  # Limit length
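                        # Illustrative mapping: "/index.php?title=Cataract" -> "Cataract",
                        # "/wiki/Cataract" -> "wiki_Cataract"; each article is written as
                        # <name>.md plus a <name>.json metadata sidecar by save_article().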
# Save article
filepath = self.output_dir / filename
self.save_article(result, filepath)
# Add discovered links to queue
for link in result["links"]:
if link not in self.visited_urls:
self.to_crawl.append((link, current_depth + 1))
# Polite delay
await asyncio.sleep(self.delay)
                    # Periodic checkpoint save (roughly every 10 articles)
                    if self.articles_saved and self.articles_saved % 10 == 0:
                        self.save_checkpoint()
except KeyboardInterrupt:
self.console.print("\n[yellow]Crawl interrupted by user[/yellow]")
except Exception as e:
self.console.print(f"\n[red]Error during crawl: {e}[/red]")
finally:
# Final checkpoint save
self.save_checkpoint()
# Print summary
self.console.print("\n[bold cyan]Crawl Summary[/bold cyan]")
self.console.print(f"Articles saved: {self.articles_saved}")
self.console.print(f"URLs visited: {len(self.visited_urls)}")
self.console.print(f"URLs failed: {len(self.failed_urls)}")
self.console.print(f"URLs remaining: {len(self.to_crawl)}")
if self.failed_urls:
self.console.print("\n[yellow]Failed URLs:[/yellow]")
for url, error in list(self.failed_urls.items())[:10]:
self.console.print(f" - {url}: {error}")
if len(self.failed_urls) > 10:
self.console.print(f" ... and {len(self.failed_urls) - 10} more")
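# Illustrative entry point (a minimal sketch; the repo's actual scripts/CLI may differ).
# Runs a small, polite crawl; progress is checkpointed, so an interrupted run can resume.
if __name__ == "__main__":
    crawler = EyeWikiCrawler(delay=2.0)
    asyncio.run(crawler.crawl(max_pages=25, depth=1))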