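"""WION News article scraper.

Searches wionews.com for a topic, walks the paginated search results, scrapes
each article in parallel with Selenium and BeautifulSoup, and periodically
saves progress to CSV. Depends on a local utils.webdriver_utils module (not
shown in this file) for Chrome driver creation.
"""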
import time
import logging
import csv
import signal
import sys
import argparse
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from bs4 import BeautifulSoup
from urllib3.exceptions import ProtocolError

from utils.webdriver_utils import create_chrome_driver


class WIONArticleScraper:
    """Scrape WION News search results for a topic and save the articles to CSV."""

    def __init__(self, max_workers=4, articles_per_page=10, max_retries=3):
        self.max_workers = max_workers
        self.articles_per_page = articles_per_page
        self.max_retries = max_retries
        self.base_url = "https://www.wionews.com/search"
        self.fetched_articles = set()  # URLs already seen, for de-duplication
        self.articles = []
        self.is_interrupted = False
        self.last_save_time = time.time()
        self.save_interval = 300  # Save every 5 minutes

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Save progress gracefully on Ctrl+C or a termination signal
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, signum, frame):
        """Save any scraped articles and exit cleanly on SIGINT/SIGTERM."""
        print("\nReceived interrupt signal. Saving progress and shutting down...")
        self.is_interrupted = True
        if self.articles:
            self.save_progress("interrupted", force=True)
        sys.exit(0)

    # Legacy driver factory, kept for reference; superseded by
    # utils.webdriver_utils.create_chrome_driver below.
    # def create_driver(self):
    #     chrome_options = webdriver.ChromeOptions()
    #     chrome_options.add_argument('--headless')
    #     chrome_options.add_argument('--no-sandbox')
    #     chrome_options.add_argument('--disable-dev-shm-usage')
    #     chrome_options.add_argument('--disable-extensions')
    #     chrome_options.add_argument('--disable-gpu')
    #     chrome_options.add_argument('--disable-infobars')
    #     chrome_options.add_argument('--disable-notifications')
    #     chrome_options.add_argument('--blink-settings=imagesEnabled=false')  # Disable images
    #     chrome_options.page_load_strategy = 'eager'
    #     # Add performance preferences
    #     chrome_options.add_experimental_option('prefs', {
    #         'profile.default_content_setting_values.notifications': 2,
    #         'profile.managed_default_content_settings.images': 2,
    #         'disk-cache-size': 4096
    #     })
    #     return webdriver.Chrome(options=chrome_options)
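    # NOTE: create_chrome_driver (imported from utils.webdriver_utils, not shown
    # in this file) is assumed to encapsulate options equivalent to the
    # commented block above.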

    def create_driver(self):
        """Create and return a new headless Chrome driver instance."""
        return create_chrome_driver(headless=True, load_images=False)

    def wait_for_page_load(self, driver, url, timeout=10, retries=3):
        """Load a URL and wait for the DOM to be ready, retrying on failure."""
        for attempt in range(retries):
            try:
                driver.set_page_load_timeout(timeout)
                driver.get(url)
                # Wait for DOM to be ready
                WebDriverWait(driver, timeout).until(
                    lambda d: d.execute_script('return document.readyState') == 'complete'
                )
                return True
            except (TimeoutException, WebDriverException, ProtocolError) as e:
                if attempt == retries - 1:
                    self.logger.warning(f"Failed to load {url} after {retries} attempts: {str(e)}")
                    return False
                self.logger.info(f"Retrying page load for {url} (attempt {attempt + 2}/{retries})")
                time.sleep(2 * (attempt + 1))  # linear backoff between retries
            except Exception as e:
                self.logger.error(f"Unexpected error loading {url}: {str(e)}")
                return False
        return False

    def get_total_pages(self, driver, search_term):
        """Read the pagination widget to find the page count; fall back to 20."""
        try:
            if not self.wait_for_page_load(driver, f"{self.base_url}?page=1&title={search_term}"):
                return 20
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            pagination = soup.find('ul', class_='pagination')
            if pagination:
                pages = pagination.find_all('li')
                if pages:
                    # pages[-2]: the final <li> is presumably a "next" control,
                    # so the entry before it holds the highest page number
                    last_page = pages[-2].text.strip()
                    return int(last_page)
            return 20
        except Exception as e:
            self.logger.error(f"Error getting total pages: {str(e)}")
            return 20

    def extract_visible_articles(self, driver):
        """Extract currently visible articles from the page"""
        try:
            # Wait for article containers to be present
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'gh-archive-page-post-content-main'))
            )
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            article_containers = soup.find_all('div', class_='gh-archive-page-post-content-main')
            new_articles = []
            for container in article_containers:
                try:
                    link_element = container.find('a', href=True)
                    date = self.extract_date(container)
                    if link_element and link_element['href']:
                        href = link_element['href']
                        full_link = href if href.startswith('http') else 'https://www.wionews.com' + href
                        if full_link not in self.fetched_articles:
                            self.fetched_articles.add(full_link)
                            new_articles.append({'link': full_link, 'date': date})
                except Exception as e:
                    self.logger.error(f"Error processing article container: {str(e)}")
                    continue
            return new_articles
        except Exception as e:
            self.logger.error(f"Error extracting visible articles: {str(e)}")
            return []

    def extract_date(self, container):
        """Return the article's publication date, or None if it cannot be found."""
        try:
            time_element = container.find('time', class_='gh-post-info__date')
            if time_element:
                return time_element.get('datetime', time_element.get_text().strip())
            # Fallback: look for a <time> tag anywhere inside the post-info block
            post_info = container.find('div', class_='gh-post-info')
            if post_info:
                time_element = post_info.find('time')
                if time_element:
                    return time_element.get('datetime', time_element.get_text().strip())
            return None
        except Exception as e:
            self.logger.error(f"Error extracting date: {str(e)}")
            return None

    def scrape_topic(self, search_term, topic):
        """Scrape every result page for a search term; returns the final CSV path."""
        driver = self.create_driver()
        try:
            total_pages = self.get_total_pages(driver, search_term)
            total_expected_articles = total_pages * self.articles_per_page
            self.logger.info(f"Found {total_pages} pages to scrape (approximately {total_expected_articles} articles)")
            for page in range(1, total_pages + 1):
                if self.is_interrupted:
                    break
                page_url = f"{self.base_url}?page={page}&title={search_term}"
                articles_scraped = len(self.articles)
                progress_percentage = (articles_scraped / total_expected_articles) * 100
                self.logger.info(f"Scraping page {page}/{total_pages} - Progress: {articles_scraped}/{total_expected_articles} articles ({progress_percentage:.1f}%)")
                # Try loading the page with retries
                if not self.wait_for_page_load(driver, page_url):
                    self.logger.warning(f"Skipping page {page} due to load failure")
                    continue
                new_articles = self.extract_visible_articles(driver)
                if new_articles:
                    self.process_articles_batch(new_articles)
                    self.logger.info(f"Scraped {len(new_articles)}/{self.articles_per_page} articles from page {page}")
                    self.save_progress(topic)
                else:
                    self.logger.warning(f"No articles found on page {page}")
            if self.articles:
                return self.save_to_csv(self.articles, topic, final=True)
            return None
        except Exception as e:
            self.logger.error(f"Error in scrape_topic: {str(e)}")
            if self.articles:
                return self.save_to_csv(self.articles, topic, final=True)
            return None
        finally:
            driver.quit()

    def scrape_article_parallel(self, article_data):
        """Scrape one article's title, body, and date; returns None on failure."""
        driver = self.create_driver()
        url = article_data['link']
        try:
            for attempt in range(self.max_retries):
                try:
                    if not self.wait_for_page_load(driver, url):
                        if attempt == self.max_retries - 1:
                            raise TimeoutException(f"Failed to load article after {self.max_retries} attempts")
                        continue
                    # Wait for main content to be present
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.TAG_NAME, "article"))
                    )
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    # Extract title with multiple fallbacks
                    title = None
                    for selector in ['h1.article-main-title', 'h1.headline', 'h1']:
                        title_element = soup.select_one(selector)
                        if title_element:
                            title = title_element.get_text().strip()
                            break
                    # Extract description with multiple fallbacks
                    desc = None
                    for class_name in ['post-content', 'article-main-content', 'post-page-main-content']:
                        content_div = soup.find('div', class_=class_name)
                        if content_div:
                            p_tags = content_div.find_all('p')
                            if p_tags:
                                desc = ' '.join(p.get_text().strip() for p in p_tags)
                                break
                    return {
                        'title': title or 'Title not found',
                        'desc': desc or 'Description not found',
                        'date': article_data['date'] or 'Date not found',
                        'link': url
                    }
                except Exception as e:
                    if attempt == self.max_retries - 1:
                        self.logger.error(f"Error scraping article {url}: {str(e)}")
                        return None
                    time.sleep(2 * (attempt + 1))  # linear backoff between retries
            return None
        finally:
            # Always release the browser, even when returning early
            driver.quit()

    def process_articles_batch(self, article_batch):
        """Scrape a batch of article links concurrently and collect the results."""
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.scrape_article_parallel, article_data)
                       for article_data in article_batch]
            successful_articles = 0
            for future in as_completed(futures):
                try:
                    article = future.result()
                    if article:
                        self.articles.append(article)
                        successful_articles += 1
                        self.logger.info(f"Successfully scraped: {article['title'][:50]}...")
                except Exception as e:
                    self.logger.error(f"Error processing article: {str(e)}")
            if successful_articles < len(article_batch):
                self.logger.warning(f"Only processed {successful_articles}/{len(article_batch)} articles in this batch")

    def save_progress(self, topic, force=False):
        """Write a partial CSV if forced or once the save interval has elapsed."""
        current_time = time.time()
        if force or (current_time - self.last_save_time >= self.save_interval and self.articles):
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"wion_{topic}_articles_{timestamp}_partial.csv"
            try:
                with open(filename, 'w', newline='', encoding='utf-8') as file:
                    writer = csv.DictWriter(file, fieldnames=['title', 'desc', 'date', 'link'])
                    writer.writeheader()
                    writer.writerows(self.articles)
                self.last_save_time = current_time
                print(f"\nProgress saved to {filename}: {len(self.articles)} articles")
                self.logger.info(f"Progress saved to {filename}: {len(self.articles)} articles")
            except Exception as e:
                self.logger.error(f"Error saving progress: {str(e)}")

    def save_to_csv(self, articles, topic, final=False):
        """Write articles to a timestamped CSV; returns the filename, or None on error."""
        if not articles:
            self.logger.error("No articles to save")
            return None
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"wion_{topic}_articles_{timestamp}_{'final' if final else 'partial'}.csv"
        try:
            with open(filename, 'w', newline='', encoding='utf-8') as file:
                writer = csv.DictWriter(file, fieldnames=['title', 'desc', 'date', 'link'])
                writer.writeheader()
                writer.writerows(articles)
            self.logger.info(f"Successfully saved {len(articles)} articles to: {filename}")
            return filename
        except Exception as e:
            self.logger.error(f"Error saving to CSV: {str(e)}")
            return None


def parse_arguments():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description='Scrape articles from WION News by topic')
    parser.add_argument('topic', type=str,
                        help='Topic to scrape (e.g., "RSS", "Covid", "India")')
    parser.add_argument('-w', '--workers', type=int, default=4,
                        help='Number of worker threads (default: 4)')
    parser.add_argument('-i', '--interval', type=int, default=300,
                        help='Auto-save interval in seconds (default: 300)')
    parser.add_argument('-a', '--articles-per-page', type=int, default=10,
                        help='Expected number of articles per page (default: 10)')
    parser.add_argument('-r', '--retries', type=int, default=3,
                        help='Maximum retries for failed requests (default: 3)')
    return parser.parse_args()


def main():
    # Predeclare so the exception handlers below never hit a NameError
    scraper = None
    topic = None
    try:
        # Parse command line arguments
        args = parse_arguments()
        # Initialize the scraper with command line arguments
        scraper = WIONArticleScraper(
            max_workers=args.workers,
            articles_per_page=args.articles_per_page,
            max_retries=args.retries
        )
        scraper.save_interval = args.interval
        # Get topic from command line argument
        topic = args.topic
        print(f"\nScraping {topic}-related articles from WION News...")
        print("Press Ctrl+C at any time to save progress and exit.")
        final_csv = scraper.scrape_topic(topic.lower(), topic)
        if final_csv:
            print(f"\nArticles have been saved to: {final_csv}")
            print(f"Total articles scraped: {len(scraper.articles)}")
        else:
            print("\nError saving to final CSV file")
    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving progress...")
        if scraper and scraper.articles:
            scraper.save_progress(topic or "interrupted", force=True)
            print("Saved progress and exiting.")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        if scraper and scraper.articles:
            scraper.save_progress(topic or "error", force=True)
            print("Saved progress despite error.")


if __name__ == "__main__":
    main()
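
# Example invocation (the script filename is illustrative):
#   python wion_scraper.py "Covid" --workers 4 --interval 300 --retries 3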