"""
Web scraper for collecting Iain Morris articles from Light Reading.
"""
import json
import logging
import re
import time
from pathlib import Path
from typing import List, Dict, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Set up logging: configure the root logger at import time so INFO-level
# progress messages are emitted, then create the module-level logger that the
# LightReadingScraper methods write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class LightReadingScraper:
    """
    Discover and scrape articles from Light Reading.

    URLs can come from an author page, from the site search, or from a
    plain-text file. Each article page is parsed into a dictionary with the
    keys ``url``, ``title``, ``author``, ``date``, ``content`` and ``summary``.
    """

    # Path fragments that mark a link as pointing to an article.
    ARTICLE_PATH_MARKERS = ('/news/', '/blog/', '/opinion/')

    # Articles whose cleaned body is shorter than this are discarded.
    MIN_CONTENT_LENGTH = 200

    # Maximum length of a summary derived from the article body.
    SUMMARY_MAX_LENGTH = 300

    # CSS selectors tried in order. Specific selectors come before the bare
    # ``h1`` so that they are actually reachable.
    TITLE_SELECTORS = (
        'h1.article-title',
        'h1.entry-title',
        'h1.post-title',
        '.article-header h1',
        '.post-header h1',
        'h1',
    )
    AUTHOR_SELECTORS = (
        '.author-name',
        '.byline',
        '.article-author',
        '.post-author',
        '[rel="author"]',
    )
    DATE_SELECTORS = (
        '.article-date',
        '.post-date',
        '.published',
        'time',
        '.date',
    )
    CONTENT_SELECTORS = (
        '.article-content',
        '.post-content',
        '.entry-content',
        '.article-body',
        '.content',
    )

    def __init__(self, delay: float = 2.0, timeout: float = 30.0):
        """
        Initialize the scraper with respectful rate limiting

        Args:
            delay: Delay between requests in seconds
            timeout: Per-request timeout in seconds, so a stalled connection
                cannot block the run indefinitely
        """
        self.base_url = "https://www.lightreading.com"
        self.delay = delay
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def _extract_article_urls(self, soup) -> List[str]:
        """Return the absolute, de-duplicated article URLs linked from *soup*."""
        found = {}
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            if href and any(marker in href for marker in self.ARTICLE_PATH_MARKERS):
                # Drop the fragment so "#comments" variants of the same page
                # are not treated as distinct articles.
                absolute = urljoin(self.base_url, href).split('#', 1)[0]
                found[absolute] = None
        return list(found)

    def search_author_articles(self, author_name: str, max_pages: int = 10) -> List[str]:
        """
        Search for articles by a specific author

        Several query spellings are tried. For each one, result pages are
        fetched until a page yields no URL that has not been seen before, or
        ``max_pages`` is reached.

        Args:
            author_name: Name of the author to search for
            max_pages: Maximum number of search result pages to process

        Returns:
            List of article URLs
        """
        # A dict is used as an ordered set: O(1) membership, insertion order kept.
        article_urls: Dict[str, None] = {}

        # Try different search approaches. The raw strings are handed to
        # requests via ``params`` so quotes, colons and spaces are URL-encoded.
        search_queries = [
            f'author:"{author_name}"',
            f'"{author_name}"',
            author_name,
        ]

        for query in search_queries:
            logger.info(f"Searching with query: {query}")
            for page in range(1, max_pages + 1):
                try:
                    response = self.session.get(
                        f"{self.base_url}/search",
                        params={'q': query, 'page': page},
                        timeout=self.timeout,
                    )
                    response.raise_for_status()
                except requests.RequestException as e:
                    logger.error(f"Error searching page {page}: {e}")
                    # Keep the rate limit even when the request failed.
                    time.sleep(self.delay)
                    continue

                soup = BeautifulSoup(response.content, 'html.parser')
                page_urls = [
                    url for url in self._extract_article_urls(soup)
                    if url not in article_urls
                ]

                # Sleep after every request, including the last page of a
                # query, so the next query does not fire immediately.
                time.sleep(self.delay)

                if not page_urls:
                    logger.info(f"No more articles found on page {page}")
                    break

                article_urls.update(dict.fromkeys(page_urls))
                logger.info(f"Found {len(page_urls)} articles on page {page}")

        unique_urls = list(article_urls)
        logger.info(f"Total unique articles found: {len(unique_urls)}")
        return unique_urls

    def get_author_page_articles(self, author_name: str) -> List[str]:
        """
        Try to find articles from author's dedicated page

        Common author-page URL patterns are probed in turn. The first one
        that answers with HTTP 200 is used and the rest are skipped.

        Args:
            author_name: Name of the author

        Returns:
            List of article URLs (empty when no author page responded)
        """
        # ``split()`` collapses repeated or surrounding whitespace in the name.
        author_slug = '-'.join(author_name.lower().split())
        author_pages = [
            f"{self.base_url}/{prefix}/{author_slug}"
            for prefix in ('author', 'authors', 'contributor')
        ]

        article_urls: List[str] = []
        for author_url in author_pages:
            response = None
            try:
                response = self.session.get(author_url, timeout=self.timeout)
            except requests.RequestException as e:
                logger.debug(f"Author page {author_url} not accessible: {e}")

            if response is not None and response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                article_urls = self._extract_article_urls(soup)
                logger.info(f"Found {len(article_urls)} articles from author page")
                break

            # Rate-limit between probes, whether the probe failed or not.
            time.sleep(self.delay)

        return article_urls

    @staticmethod
    def _first_match_text(soup, selectors) -> str:
        """Return the stripped text of the first selector match that has text."""
        for selector in selectors:
            element = soup.select_one(selector)
            if element is not None:
                text = element.get_text().strip()
                if text:
                    return text
        return ''

    def _summarize(self, scope, content: str) -> str:
        """Build a summary from the first non-empty paragraph under *scope*."""
        first_para = content
        for paragraph in scope.find_all('p'):
            text = self.clean_text(paragraph.get_text())
            if text:
                first_para = text
                break
        if len(first_para) > self.SUMMARY_MAX_LENGTH:
            return first_para[:self.SUMMARY_MAX_LENGTH] + '...'
        return first_para

    def _parse_article(self, url: str, soup) -> Dict:
        """Extract title, author, date, content and summary from a parsed page."""
        # Content: first matching container, with script/style stripped.
        content_elem = None
        for selector in self.CONTENT_SELECTORS:
            content_elem = soup.select_one(selector)
            if content_elem is not None:
                break

        raw_content = ''
        if content_elem is not None:
            for tag in content_elem(["script", "style"]):
                tag.decompose()
            raw_content = content_elem.get_text()

        paragraph_scope = content_elem
        if not raw_content.strip():
            # Fallback: no container, or only whitespace in it; use every <p>.
            paragraph_scope = soup
            paragraphs = [p.get_text().strip() for p in soup.find_all('p')]
            raw_content = '\n'.join(text for text in paragraphs if text)

        content = self.clean_text(raw_content)

        # Summary: meta description when it has text, else the first paragraph.
        # The paragraph is taken from the markup because clean_text() has
        # already removed every newline from ``content``.
        summary = ''
        meta = soup.select_one('meta[name="description"]')
        if meta is not None:
            summary = (meta.get('content') or '').strip()
        if not summary and content:
            summary = self._summarize(paragraph_scope, content)

        return {
            'url': url,
            'title': self._first_match_text(soup, self.TITLE_SELECTORS),
            'author': self._first_match_text(soup, self.AUTHOR_SELECTORS),
            'date': self._first_match_text(soup, self.DATE_SELECTORS),
            'content': content,
            'summary': summary,
        }

    def scrape_article(self, url: str) -> Optional[Dict]:
        """
        Scrape a single article

        Args:
            url: URL of the article to scrape

        Returns:
            Dictionary containing article data or None if failed
        """
        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            article_data = self._parse_article(url, soup)
        except requests.RequestException as e:
            logger.error(f"Error scraping {url}: {e}")
            return None
        except Exception as e:
            # Deliberate best-effort boundary: one malformed page must not
            # abort a long scraping run.
            logger.error(f"Unexpected error scraping {url}: {e}")
            return None

        # Validate article has minimum required content
        if len(article_data['content']) < self.MIN_CONTENT_LENGTH:
            logger.warning(f"Article too short, skipping: {url}")
            return None

        # Note: there is no author matching check, since explicitly listed
        # URLs may include articles by various authors.
        return article_data

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize text content

        Whitespace runs are collapsed to single spaces, ``[bracketed]``
        fragments are removed, and everything from "Share this article" or
        "Related articles" (case-insensitive) to the end is dropped.

        Args:
            text: Raw text to clean

        Returns:
            Cleaned text
        """
        if not text:
            return ""

        # Collapse first so the artifact patterns see a single line.
        text = re.sub(r'\s+', ' ', text)

        # Remove common artifacts
        text = re.sub(r'\[.*?\]', '', text)  # Remove [brackets]
        text = re.sub(r'Share this article.*$', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Related articles.*$', '', text, flags=re.IGNORECASE)

        # Removing a bracketed fragment leaves the spaces on both sides of it
        # adjacent; collapse them again.
        text = re.sub(r' {2,}', ' ', text)
        return text.strip()

    def _scrape_urls(self, urls: List[str]) -> List[Dict]:
        """Scrape each URL in order, rate-limited, and return the successes."""
        articles = []
        failed_count = 0
        last_index = len(urls) - 1

        for index, url in enumerate(tqdm(urls, desc="Scraping articles")):
            article_data = self.scrape_article(url)
            if article_data:
                articles.append(article_data)
                logger.debug(f"Successfully scraped: {article_data['title']}")
            else:
                failed_count += 1
                logger.warning(f"Failed to scrape: {url}")
            # No request follows the last URL, so no delay is needed there.
            if index < last_index:
                time.sleep(self.delay)

        logger.info(f"Successfully scraped {len(articles)} articles")
        logger.info(f"Failed to scrape {failed_count} articles")
        return articles

    def scrape_author_articles(self, author_name: str, max_articles: int = 200) -> List[Dict]:
        """
        Scrape all articles by a specific author

        Args:
            author_name: Name of the author
            max_articles: Maximum number of articles to scrape

        Returns:
            List of article dictionaries
        """
        logger.info(f"Starting to scrape articles by {author_name}")

        # Author page first, then search; dict.fromkeys removes duplicates
        # while keeping that order.
        all_urls = self.get_author_page_articles(author_name)
        all_urls.extend(self.search_author_articles(author_name))
        unique_urls = list(dict.fromkeys(all_urls))[:max_articles]
        logger.info(f"Found {len(unique_urls)} unique article URLs to scrape")

        return self._scrape_urls(unique_urls)

    def load_urls_from_file(self, filename: str) -> List[str]:
        """
        Load URLs from a text file

        Blank lines and lines starting with ``#`` are skipped. Repeated URLs
        are returned once, in first-seen order.

        Args:
            filename: Path to the file containing URLs (one per line)

        Returns:
            List of URLs (empty when the file is missing or unreadable)
        """
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                stripped = (line.strip() for line in f)
                urls = list(dict.fromkeys(
                    url for url in stripped if url and not url.startswith('#')
                ))
        except FileNotFoundError:
            logger.error(f"URL file not found: {filename}")
            return []
        except (OSError, UnicodeDecodeError) as e:
            logger.error(f"Error reading URL file {filename}: {e}")
            return []

        logger.info(f"Loaded {len(urls)} URLs from {filename}")
        return urls

    def scrape_urls_from_file(self, filename: str) -> List[Dict]:
        """
        Scrape articles from URLs listed in a file

        Args:
            filename: Path to the file containing URLs

        Returns:
            List of article dictionaries
        """
        urls = self.load_urls_from_file(filename)
        if not urls:
            logger.error("No URLs to scrape")
            return []

        logger.info(f"Starting to scrape {len(urls)} articles from URL file")
        return self._scrape_urls(urls)

    def save_articles(self, articles: List[Dict], filename: str):
        """
        Save articles to JSON file

        Missing parent directories of ``filename`` are created first.

        Args:
            articles: List of article dictionaries
            filename: Output filename
        """
        output_path = Path(filename)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with output_path.open('w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved {len(articles)} articles to {filename}")
def main():
    """
    Main function to run the scraper

    Scrapes every URL listed in ``urls.txt``, writes the results to
    ``data/raw_articles.json`` and prints a short summary.
    """
    scraper = LightReadingScraper(delay=2.0)

    # Scrape articles from URLs in urls.txt
    articles = scraper.scrape_urls_from_file("urls.txt")
    if not articles:
        print("No articles were successfully scraped.")
        return

    # Create the output directory up front so a long scraping run is not
    # lost to a missing "data/" folder when the file is opened.
    output_path = Path("data/raw_articles.json")
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Save raw articles
    scraper.save_articles(articles, str(output_path))

    # Print summary
    total_length = sum(len(article['content']) for article in articles)
    print("\nScraping Summary:")
    print(f"Total articles collected: {len(articles)}")
    print(f"Average article length: {total_length // len(articles)} characters")

    # Show sample titles
    print("\nSample article titles:")
    for position, article in enumerate(articles[:5], start=1):
        print(f"{position}. {article['title']}")


if __name__ == "__main__":
    main()