# News-Scraper/src/scrapers/wion_scraper.py
import os
import time
import logging
import csv
import signal
import sys
import argparse
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
TimeoutException, NoSuchElementException, ElementClickInterceptedException,
StaleElementReferenceException, WebDriverException
)
from bs4 import BeautifulSoup
from urllib3.exceptions import ProtocolError
from utils.webdriver_utils import create_chrome_driver, scroll_to_element
class WIONArticleScraper:
    """Scrape search-result articles from WION News (wionews.com).

    Search-result pages are walked sequentially; the article links found on
    each page are then fetched in parallel by a thread pool, with one
    headless Chrome driver per worker task.  Progress is auto-saved to
    partial CSV files and flushed on SIGINT/SIGTERM so long runs can be
    interrupted without losing work.
    """

    def __init__(self, max_workers=4, articles_per_page=10, max_retries=3):
        """
        Args:
            max_workers: Size of the thread pool used to fetch article pages.
            articles_per_page: Expected results per search page (used only
                for progress reporting, not for pagination).
            max_retries: Attempts per page/article before giving up.
        """
        self.max_workers = max_workers
        self.articles_per_page = articles_per_page
        self.max_retries = max_retries
        self.base_url = "https://www.wionews.com/search"
        self.fetched_articles = set()  # article URLs already queued (de-dup)
        self.articles = []             # successfully scraped article dicts
        self.is_interrupted = False
        self.last_save_time = time.time()
        self.save_interval = 300  # Save every 5 minutes
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)
        # Flush partial progress instead of losing it on Ctrl+C / kill.
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, signum, frame):
        """Handle SIGINT/SIGTERM: save whatever was scraped, then exit."""
        print("\nReceived interrupt signal. Saving progress and shutting down...")
        self.is_interrupted = True
        if self.articles:
            self.save_progress("interrupted", force=True)
        sys.exit(0)

    def create_driver(self):
        """Create and return a new Chrome driver instance"""
        return create_chrome_driver(headless=True, load_images=False)

    def wait_for_page_load(self, driver, url, timeout=10, retries=3):
        """Navigate to *url* and wait until document.readyState == 'complete'.

        Retries with linear backoff (2s, 4s, ...) on timeouts and driver
        errors.  Returns True on success, False once every attempt failed.
        """
        for attempt in range(retries):
            try:
                driver.set_page_load_timeout(timeout)
                driver.get(url)
                # Wait for DOM to be ready
                WebDriverWait(driver, timeout).until(
                    lambda d: d.execute_script('return document.readyState') == 'complete'
                )
                return True
            except (TimeoutException, WebDriverException, ProtocolError) as e:
                if attempt == retries - 1:
                    self.logger.warning(f"Failed to load {url} after {retries} attempts: {str(e)}")
                    return False
                else:
                    self.logger.info(f"Retrying page load for {url} (attempt {attempt + 2}/{retries})")
                    time.sleep(2 * (attempt + 1))  # Linear backoff: 2s, 4s, ...
                    continue
            except Exception as e:
                self.logger.error(f"Unexpected error loading {url}: {str(e)}")
                return False
        return False

    def get_total_pages(self, driver, search_term):
        """Read the pagination widget on page 1 to learn the page count.

        Falls back to 20 pages whenever the first page cannot be loaded or
        the widget is missing/malformed.
        """
        try:
            if not self.wait_for_page_load(driver, f"{self.base_url}?page=1&title={search_term}"):
                return 20
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            pagination = soup.find('ul', class_='pagination')
            if pagination:
                pages = pagination.find_all('li')
                # The last <li> is presumably the "next" control; the one
                # before it holds the highest page number.
                if len(pages) >= 2:
                    last_page = pages[-2].text.strip()
                    return int(last_page)
            return 20
        except Exception as e:
            self.logger.error(f"Error getting total pages: {str(e)}")
            return 20

    def extract_visible_articles(self, driver):
        """Extract currently visible articles from the page.

        Returns a list of {'link', 'date'} dicts for articles not seen
        before; already-fetched URLs are skipped via self.fetched_articles.
        """
        try:
            # Wait for article containers to be present
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'gh-archive-page-post-content-main'))
            )
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            article_containers = soup.find_all('div', class_='gh-archive-page-post-content-main')
            new_articles = []
            for container in article_containers:
                try:
                    link_element = container.find('a', href=True)
                    date = self.extract_date(container)
                    if link_element and link_element['href']:
                        # Relative hrefs are resolved against the site root.
                        full_link = link_element['href'] if link_element['href'].startswith('http') else 'https://www.wionews.com' + link_element['href']
                        if full_link not in self.fetched_articles:
                            self.fetched_articles.add(full_link)
                            new_articles.append({'link': full_link, 'date': date})
                except Exception as e:
                    self.logger.error(f"Error processing article container: {str(e)}")
                    continue
            return new_articles
        except Exception as e:
            self.logger.error(f"Error extracting visible articles: {str(e)}")
            return []

    def extract_date(self, container):
        """Best-effort date extraction from a result container.

        Prefers the <time datetime=...> attribute, falls back to the
        element text, and returns None if no <time> element is found.
        """
        try:
            time_element = container.find('time', class_='gh-post-info__date')
            if time_element:
                date = time_element.get('datetime', time_element.get_text().strip())
                return date
            # Fallback: some layouts nest the <time> in a gh-post-info div.
            post_info = container.find('div', class_='gh-post-info')
            if post_info:
                time_element = post_info.find('time')
                if time_element:
                    date = time_element.get('datetime', time_element.get_text().strip())
                    return date
            return None
        except Exception as e:
            self.logger.error(f"Error extracting date: {str(e)}")
            return None

    def scrape_topic(self, search_term, topic):
        """Scrape every search-result page for *search_term*.

        *topic* is only used to name the output files.  Returns the path of
        the final CSV, or None if nothing was saved.
        """
        driver = self.create_driver()
        try:
            total_pages = self.get_total_pages(driver, search_term)
            total_expected_articles = total_pages * self.articles_per_page
            self.logger.info(f"Found {total_pages} pages to scrape (approximately {total_expected_articles} articles)")
            for page in range(1, total_pages + 1):
                if self.is_interrupted:
                    break
                page_url = f"{self.base_url}?page={page}&title={search_term}"
                articles_scraped = len(self.articles)
                progress_percentage = (articles_scraped / total_expected_articles) * 100
                self.logger.info(f"Scraping page {page}/{total_pages} - Progress: {articles_scraped}/{total_expected_articles} articles ({progress_percentage:.1f}%)")
                # Try loading the page with retries
                if not self.wait_for_page_load(driver, page_url):
                    self.logger.warning(f"Skipping page {page} due to load failure")
                    continue
                new_articles = self.extract_visible_articles(driver)
                if new_articles:
                    self.process_articles_batch(new_articles)
                    self.logger.info(f"Scraped {len(new_articles)}/{self.articles_per_page} articles from page {page}")
                    self.save_progress(topic)
                else:
                    self.logger.warning(f"No articles found on page {page}")
            if self.articles:
                return self.save_to_csv(self.articles, topic, final=True)
            return None
        except Exception as e:
            self.logger.error(f"Error in scrape_topic: {str(e)}")
            # Salvage whatever was scraped before the failure.
            if self.articles:
                return self.save_to_csv(self.articles, topic, final=True)
            return None
        finally:
            driver.quit()

    def scrape_article_parallel(self, article_data):
        """Fetch one article page and extract its title and description.

        Runs inside the thread pool: creates (and always quits) its own
        driver.  Returns a {'title','desc','date','link'} dict, or None if
        the article could not be scraped after max_retries attempts.
        """
        driver = self.create_driver()
        url = article_data['link']
        try:
            for attempt in range(self.max_retries):
                try:
                    if not self.wait_for_page_load(driver, url):
                        if attempt == self.max_retries - 1:
                            raise TimeoutException(f"Failed to load article after {self.max_retries} attempts")
                        continue
                    # Wait for main content to be present
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.TAG_NAME, "article"))
                    )
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    # Extract title with multiple fallbacks
                    title = None
                    for selector in ['h1.article-main-title', 'h1.headline', 'h1']:
                        title_element = soup.select_one(selector)
                        if title_element:
                            title = title_element.get_text().strip()
                            break
                    # Extract description with multiple fallbacks
                    desc = None
                    for class_name in ['post-content', 'article-main-content', 'post-page-main-content']:
                        content_div = soup.find('div', class_=class_name)
                        if content_div:
                            p_tags = content_div.find_all('p')
                            if p_tags:
                                desc = ' '.join([p.get_text().strip() for p in p_tags])
                                break
                    return {
                        'title': title or 'Title not found',
                        'desc': desc or 'Description not found',
                        'date': article_data['date'] or 'Date not found',
                        'link': url
                    }
                except Exception as e:
                    if attempt == self.max_retries - 1:
                        self.logger.error(f"Error scraping article {url}: {str(e)}")
                        return None
                    time.sleep(2 * (attempt + 1))  # Linear backoff between attempts
            return None
        finally:
            # BUGFIX: the original only quit the driver when the final retry
            # was reached, leaking a Chrome process every time an earlier
            # attempt succeeded.  Always release it.
            driver.quit()

    def process_articles_batch(self, article_batch):
        """Scrape a batch of article links in parallel and collect results."""
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.scrape_article_parallel, article_data)
                       for article_data in article_batch]
            successful_articles = 0
            for future in as_completed(futures):
                try:
                    article = future.result()
                    if article:
                        self.articles.append(article)
                        successful_articles += 1
                        self.logger.info(f"Successfully scraped: {article['title'][:50]}...")
                except Exception as e:
                    self.logger.error(f"Error processing article: {str(e)}")
            if successful_articles < len(article_batch):
                self.logger.warning(f"Only processed {successful_articles}/{len(article_batch)} articles in this batch")

    def save_progress(self, topic, force=False):
        """Flush scraped articles to a timestamped partial CSV.

        Writes at most once per save_interval seconds unless force=True.
        """
        current_time = time.time()
        if force or (current_time - self.last_save_time >= self.save_interval and self.articles):
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"wion_{topic}articles_{timestamp}_partial.csv"
            try:
                with open(filename, 'w', newline='', encoding='utf-8') as file:
                    writer = csv.DictWriter(file, fieldnames=['title', 'desc', 'date', 'link'])
                    writer.writeheader()
                    writer.writerows(self.articles)
                self.last_save_time = current_time
                # BUGFIX: these messages previously printed the literal text
                # "(unknown)" instead of the output filename.
                print(f"\nProgress saved to {filename}: {len(self.articles)} articles")
                self.logger.info(f"Progress saved to {filename}: {len(self.articles)} articles")
            except Exception as e:
                self.logger.error(f"Error saving progress: {str(e)}")

    def save_to_csv(self, articles, topic, final=False):
        """Write *articles* to a timestamped CSV; return its filename or None."""
        if not articles:
            self.logger.error("No articles to save")
            return None
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"wion_{topic}articles_{timestamp}_{'final' if final else 'partial'}.csv"
        try:
            with open(filename, 'w', newline='', encoding='utf-8') as file:
                writer = csv.DictWriter(file, fieldnames=['title', 'desc', 'date', 'link'])
                writer.writeheader()
                writer.writerows(articles)
            # BUGFIX: log previously printed "(unknown)" instead of the filename.
            self.logger.info(f"Successfully saved {len(articles)} articles to: {filename}")
            return filename
        except Exception as e:
            self.logger.error(f"Error saving to CSV: {str(e)}")
            return None
def parse_arguments():
    """Define and parse the scraper's command-line interface."""
    cli = argparse.ArgumentParser(
        description='Scrape articles from WION News by topic')
    # Positional argument: the topic to search for.
    cli.add_argument('topic', type=str,
                     help='Topic to scrape (e.g., "RSS", "Covid", "India")')
    # Optional integer tuning knobs, all with sensible defaults.
    tuning_flags = [
        ('-w', '--workers', 4,
         'Number of worker threads (default: 4)'),
        ('-i', '--interval', 300,
         'Auto-save interval in seconds (default: 300)'),
        ('-a', '--articles-per-page', 10,
         'Expected number of articles per page (default: 10)'),
        ('-r', '--retries', 3,
         'Maximum retries for failed requests (default: 3)'),
    ]
    for short_flag, long_flag, default, help_text in tuning_flags:
        cli.add_argument(short_flag, long_flag, type=int,
                         default=default, help=help_text)
    return cli.parse_args()
def main():
    """CLI entry point: parse arguments, run the scraper, save on exit.

    BUGFIX: the original referenced `scraper` and `topic` in the exception
    handlers even when the failure happened before those names were bound
    (e.g. bad arguments, interrupt during startup), raising NameError from
    inside the handler.  Both are now pre-initialized and guarded.
    """
    scraper = None
    topic = None
    try:
        # Parse command line arguments
        args = parse_arguments()
        # Initialize the scraper with command line arguments
        scraper = WIONArticleScraper(
            max_workers=args.workers,
            articles_per_page=args.articles_per_page,
            max_retries=args.retries
        )
        scraper.save_interval = args.interval
        # Get topic from command line argument
        topic = args.topic
        print(f"\nScraping {topic}-related articles from WION News...")
        print("Press Ctrl+C at any time to save progress and exit.")
        final_csv = scraper.scrape_topic(topic.lower(), topic)
        if final_csv:
            print(f"\nArticles have been saved to: {final_csv}")
            print(f"Total articles scraped: {len(scraper.articles)}")
        else:
            print("\nError saving to final CSV file")
    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving progress...")
        if scraper is not None and scraper.articles:
            scraper.save_progress(topic or "interrupted", force=True)
        print("Saved progress and exiting.")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        if scraper is not None and scraper.articles:
            scraper.save_progress(topic or "error", force=True)
            print("Saved progress despite error.")