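"""WION News article scraper.

Searches wionews.com for a topic, walks the paginated search results, scrapes
each article in parallel with Selenium and BeautifulSoup, and periodically
saves progress to CSV. Depends on a local utils.webdriver_utils module (not
shown in this file) for Chrome driver creation.
"""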
import time
import logging
import csv
import signal
import sys
import argparse
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from bs4 import BeautifulSoup
from urllib3.exceptions import ProtocolError

from utils.webdriver_utils import create_chrome_driver


class WIONArticleScraper:
    """Scrape WION News search results for a topic and save the articles to CSV."""

    def __init__(self, max_workers=4, articles_per_page=10, max_retries=3):
        self.max_workers = max_workers
        self.articles_per_page = articles_per_page
        self.max_retries = max_retries
        self.base_url = "https://www.wionews.com/search"
        self.fetched_articles = set()  # URLs already seen, for de-duplication
        self.articles = []
        self.is_interrupted = False
        self.last_save_time = time.time()
        self.save_interval = 300  # Save every 5 minutes

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Save progress gracefully on Ctrl+C or a termination signal
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, signum, frame):
        """Save any scraped articles and exit cleanly on SIGINT/SIGTERM."""
        print("\nReceived interrupt signal. Saving progress and shutting down...")
        self.is_interrupted = True
        if self.articles:
            self.save_progress("interrupted", force=True)
        sys.exit(0)

    # Legacy driver factory, kept for reference; superseded by
    # utils.webdriver_utils.create_chrome_driver below.
    # def create_driver(self):
    #     chrome_options = webdriver.ChromeOptions()
    #     chrome_options.add_argument('--headless')
    #     chrome_options.add_argument('--no-sandbox')
    #     chrome_options.add_argument('--disable-dev-shm-usage')
    #     chrome_options.add_argument('--disable-extensions')
    #     chrome_options.add_argument('--disable-gpu')
    #     chrome_options.add_argument('--disable-infobars')
    #     chrome_options.add_argument('--disable-notifications')
    #     chrome_options.add_argument('--blink-settings=imagesEnabled=false')  # Disable images
    #     chrome_options.page_load_strategy = 'eager'
    #     # Add performance preferences
    #     chrome_options.add_experimental_option('prefs', {
    #         'profile.default_content_setting_values.notifications': 2,
    #         'profile.managed_default_content_settings.images': 2,
    #         'disk-cache-size': 4096
    #     })
    #     return webdriver.Chrome(options=chrome_options)
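    # NOTE: create_chrome_driver (imported from utils.webdriver_utils, not shown
    # in this file) is assumed to encapsulate options equivalent to the
    # commented block above.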

    def create_driver(self):
        """Create and return a new headless Chrome driver instance."""
        return create_chrome_driver(headless=True, load_images=False)

    def wait_for_page_load(self, driver, url, timeout=10, retries=3):
        """Load a URL and wait for the DOM to be ready, retrying on failure."""
        for attempt in range(retries):
            try:
                driver.set_page_load_timeout(timeout)
                driver.get(url)
                # Wait for DOM to be ready
                WebDriverWait(driver, timeout).until(
                    lambda d: d.execute_script('return document.readyState') == 'complete'
                )
                return True
            except (TimeoutException, WebDriverException, ProtocolError) as e:
                if attempt == retries - 1:
                    self.logger.warning(f"Failed to load {url} after {retries} attempts: {str(e)}")
                    return False
                self.logger.info(f"Retrying page load for {url} (attempt {attempt + 2}/{retries})")
                time.sleep(2 * (attempt + 1))  # linear backoff between retries
            except Exception as e:
                self.logger.error(f"Unexpected error loading {url}: {str(e)}")
                return False
        return False

    def get_total_pages(self, driver, search_term):
        """Read the pagination widget to find the page count; fall back to 20."""
        try:
            if not self.wait_for_page_load(driver, f"{self.base_url}?page=1&title={search_term}"):
                return 20
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            pagination = soup.find('ul', class_='pagination')
            if pagination:
                pages = pagination.find_all('li')
                if pages:
                    # pages[-2]: the final <li> is presumably a "next" control,
                    # so the entry before it holds the highest page number
                    last_page = pages[-2].text.strip()
                    return int(last_page)
            return 20
        except Exception as e:
            self.logger.error(f"Error getting total pages: {str(e)}")
            return 20

    def extract_visible_articles(self, driver):
        """Extract currently visible articles from the page"""
        try:
            # Wait for article containers to be present
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'gh-archive-page-post-content-main'))
            )
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            article_containers = soup.find_all('div', class_='gh-archive-page-post-content-main')
            new_articles = []
            for container in article_containers:
                try:
                    link_element = container.find('a', href=True)
                    date = self.extract_date(container)
                    if link_element and link_element['href']:
                        href = link_element['href']
                        full_link = href if href.startswith('http') else 'https://www.wionews.com' + href
                        if full_link not in self.fetched_articles:
                            self.fetched_articles.add(full_link)
                            new_articles.append({'link': full_link, 'date': date})
                except Exception as e:
                    self.logger.error(f"Error processing article container: {str(e)}")
                    continue
            return new_articles
        except Exception as e:
            self.logger.error(f"Error extracting visible articles: {str(e)}")
            return []

    def extract_date(self, container):
        """Return the article's publication date, or None if it cannot be found."""
        try:
            time_element = container.find('time', class_='gh-post-info__date')
            if time_element:
                return time_element.get('datetime', time_element.get_text().strip())
            # Fallback: look for a <time> tag anywhere inside the post-info block
            post_info = container.find('div', class_='gh-post-info')
            if post_info:
                time_element = post_info.find('time')
                if time_element:
                    return time_element.get('datetime', time_element.get_text().strip())
            return None
        except Exception as e:
            self.logger.error(f"Error extracting date: {str(e)}")
            return None

    def scrape_topic(self, search_term, topic):
        """Scrape every result page for a search term; returns the final CSV path."""
        driver = self.create_driver()
        try:
            total_pages = self.get_total_pages(driver, search_term)
            total_expected_articles = total_pages * self.articles_per_page
            self.logger.info(f"Found {total_pages} pages to scrape (approximately {total_expected_articles} articles)")
            for page in range(1, total_pages + 1):
                if self.is_interrupted:
                    break
                page_url = f"{self.base_url}?page={page}&title={search_term}"
                articles_scraped = len(self.articles)
                progress_percentage = (articles_scraped / total_expected_articles) * 100
                self.logger.info(f"Scraping page {page}/{total_pages} - Progress: {articles_scraped}/{total_expected_articles} articles ({progress_percentage:.1f}%)")
                # Try loading the page with retries
                if not self.wait_for_page_load(driver, page_url):
                    self.logger.warning(f"Skipping page {page} due to load failure")
                    continue
                new_articles = self.extract_visible_articles(driver)
                if new_articles:
                    self.process_articles_batch(new_articles)
                    self.logger.info(f"Scraped {len(new_articles)}/{self.articles_per_page} articles from page {page}")
                    self.save_progress(topic)
                else:
                    self.logger.warning(f"No articles found on page {page}")
            if self.articles:
                return self.save_to_csv(self.articles, topic, final=True)
            return None
        except Exception as e:
            self.logger.error(f"Error in scrape_topic: {str(e)}")
            if self.articles:
                return self.save_to_csv(self.articles, topic, final=True)
            return None
        finally:
            driver.quit()

    def scrape_article_parallel(self, article_data):
        """Scrape one article's title, body, and date; returns None on failure."""
        driver = self.create_driver()
        url = article_data['link']
        try:
            for attempt in range(self.max_retries):
                try:
                    if not self.wait_for_page_load(driver, url):
                        if attempt == self.max_retries - 1:
                            raise TimeoutException(f"Failed to load article after {self.max_retries} attempts")
                        continue
                    # Wait for main content to be present
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.TAG_NAME, "article"))
                    )
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    # Extract title with multiple fallbacks
                    title = None
                    for selector in ['h1.article-main-title', 'h1.headline', 'h1']:
                        title_element = soup.select_one(selector)
                        if title_element:
                            title = title_element.get_text().strip()
                            break
                    # Extract description with multiple fallbacks
                    desc = None
                    for class_name in ['post-content', 'article-main-content', 'post-page-main-content']:
                        content_div = soup.find('div', class_=class_name)
                        if content_div:
                            p_tags = content_div.find_all('p')
                            if p_tags:
                                desc = ' '.join(p.get_text().strip() for p in p_tags)
                                break
                    return {
                        'title': title or 'Title not found',
                        'desc': desc or 'Description not found',
                        'date': article_data['date'] or 'Date not found',
                        'link': url
                    }
                except Exception as e:
                    if attempt == self.max_retries - 1:
                        self.logger.error(f"Error scraping article {url}: {str(e)}")
                        return None
                    time.sleep(2 * (attempt + 1))  # linear backoff between retries
            return None
        finally:
            # Always release the browser, even when returning early
            driver.quit()

    def process_articles_batch(self, article_batch):
        """Scrape a batch of article links concurrently and collect the results."""
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.scrape_article_parallel, article_data)
                       for article_data in article_batch]
            successful_articles = 0
            for future in as_completed(futures):
                try:
                    article = future.result()
                    if article:
                        self.articles.append(article)
                        successful_articles += 1
                        self.logger.info(f"Successfully scraped: {article['title'][:50]}...")
                except Exception as e:
                    self.logger.error(f"Error processing article: {str(e)}")
            if successful_articles < len(article_batch):
                self.logger.warning(f"Only processed {successful_articles}/{len(article_batch)} articles in this batch")

    def save_progress(self, topic, force=False):
        """Write a partial CSV if forced or once the save interval has elapsed."""
        current_time = time.time()
        if force or (current_time - self.last_save_time >= self.save_interval and self.articles):
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"wion_{topic}_articles_{timestamp}_partial.csv"
            try:
                with open(filename, 'w', newline='', encoding='utf-8') as file:
                    writer = csv.DictWriter(file, fieldnames=['title', 'desc', 'date', 'link'])
                    writer.writeheader()
                    writer.writerows(self.articles)
                self.last_save_time = current_time
                print(f"\nProgress saved to {filename}: {len(self.articles)} articles")
                self.logger.info(f"Progress saved to {filename}: {len(self.articles)} articles")
            except Exception as e:
                self.logger.error(f"Error saving progress: {str(e)}")

    def save_to_csv(self, articles, topic, final=False):
        """Write articles to a timestamped CSV; returns the filename, or None on error."""
        if not articles:
            self.logger.error("No articles to save")
            return None
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"wion_{topic}_articles_{timestamp}_{'final' if final else 'partial'}.csv"
        try:
            with open(filename, 'w', newline='', encoding='utf-8') as file:
                writer = csv.DictWriter(file, fieldnames=['title', 'desc', 'date', 'link'])
                writer.writeheader()
                writer.writerows(articles)
            self.logger.info(f"Successfully saved {len(articles)} articles to: {filename}")
            return filename
        except Exception as e:
            self.logger.error(f"Error saving to CSV: {str(e)}")
            return None


def parse_arguments():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description='Scrape articles from WION News by topic')
    parser.add_argument('topic', type=str,
                        help='Topic to scrape (e.g., "RSS", "Covid", "India")')
    parser.add_argument('-w', '--workers', type=int, default=4,
                        help='Number of worker threads (default: 4)')
    parser.add_argument('-i', '--interval', type=int, default=300,
                        help='Auto-save interval in seconds (default: 300)')
    parser.add_argument('-a', '--articles-per-page', type=int, default=10,
                        help='Expected number of articles per page (default: 10)')
    parser.add_argument('-r', '--retries', type=int, default=3,
                        help='Maximum retries for failed requests (default: 3)')
    return parser.parse_args()


def main():
    # Predeclare so the exception handlers below never hit a NameError
    scraper = None
    topic = None
    try:
        # Parse command line arguments
        args = parse_arguments()
        # Initialize the scraper with command line arguments
        scraper = WIONArticleScraper(
            max_workers=args.workers,
            articles_per_page=args.articles_per_page,
            max_retries=args.retries
        )
        scraper.save_interval = args.interval
        # Get topic from command line argument
        topic = args.topic
        print(f"\nScraping {topic}-related articles from WION News...")
        print("Press Ctrl+C at any time to save progress and exit.")
        final_csv = scraper.scrape_topic(topic.lower(), topic)
        if final_csv:
            print(f"\nArticles have been saved to: {final_csv}")
            print(f"Total articles scraped: {len(scraper.articles)}")
        else:
            print("\nError saving to final CSV file")
    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving progress...")
        if scraper and scraper.articles:
            scraper.save_progress(topic or "interrupted", force=True)
            print("Saved progress and exiting.")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        if scraper and scraper.articles:
            scraper.save_progress(topic or "error", force=True)
            print("Saved progress despite error.")


if __name__ == "__main__":
    main()
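
# Example invocation (the script filename is illustrative):
#   python wion_scraper.py "Covid" --workers 4 --interval 300 --retries 3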