# NOTE(review): stray extraction artifacts ("Spaces:", "Sleeping") commented out
# so the module parses; they are not part of the program.
| import os | |
| import time | |
| import logging | |
| import csv | |
| import signal | |
| import sys | |
| import argparse | |
| from datetime import datetime | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| from selenium import webdriver | |
| from selenium.webdriver.chrome.service import Service | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.support.ui import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
| from selenium.common.exceptions import ( | |
| TimeoutException, NoSuchElementException, ElementClickInterceptedException | |
| ) | |
| from bs4 import BeautifulSoup | |
| from utils.webdriver_utils import create_chrome_driver, scroll_to_element | |
class ScrollArticleScraper:
    """Scrape Scroll.in search results for a topic and save them to CSV.

    Workflow: ``scrape_topic`` walks the paginated search results,
    ``extract_visible_articles`` pulls article links/dates from each results
    page, ``process_articles_batch`` fetches article bodies in parallel via a
    thread pool, and progress is checkpointed to a partial CSV at most every
    ``save_interval`` seconds (and on SIGINT/SIGTERM).
    """

    # Listing-page date formats, tried in order (e.g. "Jan 05, 2021" then "January 05, 2021").
    _DATE_FORMATS = ('%b %d, %Y', '%B %d, %Y')

    def __init__(self, max_workers=4, articles_per_page=10):
        """
        Args:
            max_workers: thread-pool size for parallel article fetches.
            articles_per_page: expected results per search page — used only
                for progress estimates, not for scraping logic.
        """
        self.max_workers = max_workers
        self.articles_per_page = articles_per_page
        self.base_url = "https://scroll.in/search"
        self.fetched_articles = set()  # URLs already queued, for de-duplication
        self.articles = []             # successfully scraped article dicts
        self.is_interrupted = False
        self.last_save_time = time.time()
        self.save_interval = 300  # Save every 5 minutes
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)
        # Checkpoint progress on Ctrl+C / kill so long runs are not lost.
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, signum, frame):
        """Handle interrupt signals: checkpoint scraped articles and exit."""
        print("\nReceived interrupt signal. Saving progress and shutting down...")
        self.is_interrupted = True
        if self.articles:
            self.save_progress("interrupted", force=True)
        sys.exit(0)

    def create_driver(self):
        """Create and return a new headless Chrome driver instance."""
        return create_chrome_driver(headless=True, load_images=False)

    def get_total_pages(self, driver, search_term):
        """Return the number of result pages for *search_term*.

        Falls back to 144 pages when the pagination widget is missing or any
        error occurs, so a site layout change degrades instead of crashing.
        """
        try:
            driver.get(f"{self.base_url}?q={search_term}&page=1")
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # Scroll.in might have a different pagination structure
            pagination = soup.find('div', class_='page-numbers')
            if pagination:
                pages = pagination.find_all('a', class_='page-number')
                if pages:
                    # The highest numbered link is the last page.
                    return max(int(page.text.strip()) for page in pages)
            # Fallback to 144 pages if pagination not found
            return 144
        except Exception as e:
            self.logger.error(f"Error getting total pages: {str(e)}")
            return 144  # Default to 144 pages

    @classmethod
    def _parse_date(cls, date_text):
        """Normalize a cleaned listing-page date string to YYYY-MM-DD.

        Returns *date_text* unchanged when no known format matches, so the
        raw text is preserved in the CSV rather than lost.
        """
        for fmt in cls._DATE_FORMATS:
            try:
                # Narrow except: a bare except here would also swallow
                # KeyboardInterrupt/SystemExit (original bug).
                return datetime.strptime(date_text, fmt).strftime('%Y-%m-%d')
            except ValueError:
                continue
        return date_text

    def extract_visible_articles(self, driver):
        """Collect new {'link', 'date'} dicts from the currently loaded page.

        Articles are located via their schema.org NewsArticle markup; URLs
        already present in ``self.fetched_articles`` are skipped.
        """
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        article_containers = soup.find_all('li', itemscope=True, itemtype="https://schema.org/NewsArticle")
        new_articles = []
        for container in article_containers:
            link_element = container.find('a', href=True)
            date_element = container.find('time')
            if not (link_element and link_element['href']):
                continue
            href = link_element['href']
            full_link = href if href.startswith('http') else 'https://scroll.in' + href
            date = None
            if date_element:
                # Strip mojibake ('Â') and the separator dot around the date.
                date_text = date_element.get_text(strip=True)
                date_text = date_text.replace('Â', '').replace('·', '').strip()
                date = self._parse_date(date_text)
            if full_link not in self.fetched_articles:
                self.fetched_articles.add(full_link)
                new_articles.append({'link': full_link, 'date': date})
        return new_articles

    def scrape_topic(self, search_term, topic):
        """Scrape articles from all result pages for *search_term*.

        Returns the final CSV filename, or None when nothing was scraped.
        Partial results are saved even when an unexpected error aborts the run.
        """
        driver = self.create_driver()
        try:
            total_pages = self.get_total_pages(driver, search_term)
            total_expected_articles = total_pages * self.articles_per_page
            self.logger.info(f"Found {total_pages} pages to scrape (approximately {total_expected_articles} articles)")
            for page in range(1, total_pages + 1):
                if self.is_interrupted:
                    break
                page_url = f"{self.base_url}?q={search_term}&page={page}"
                articles_scraped = len(self.articles)
                progress_percentage = (articles_scraped / total_expected_articles) * 100
                self.logger.info(f"Scraping page {page}/{total_pages} - Progress: {articles_scraped}/{total_expected_articles} articles ({progress_percentage:.1f}%)")
                try:
                    driver.get(page_url)
                    time.sleep(2)  # Allow page to load
                    new_articles = self.extract_visible_articles(driver)
                    if new_articles:
                        self.process_articles_batch(new_articles)
                        self.logger.info(f"Scraped {len(new_articles)}/{self.articles_per_page} articles from page {page}")
                        self.save_progress(topic)
                    else:
                        self.logger.warning(f"No articles found on page {page}")
                except Exception as e:
                    # One bad page should not abort the whole run.
                    self.logger.error(f"Error scraping page {page}: {str(e)}")
                    continue
            if self.articles:
                return self.save_to_csv(self.articles, topic, final=True)
            return None
        except Exception as e:
            self.logger.error(f"Error in scrape_topic: {str(e)}")
            if self.articles:
                # Best effort: persist whatever was collected before the error.
                return self.save_to_csv(self.articles, topic, final=True)
            return None
        finally:
            driver.quit()

    def scrape_article_parallel(self, article_data):
        """Fetch one article page and extract title/description.

        Returns an article dict, or None on failure. Each call uses its own
        driver because drivers are not shared across pool threads.
        """
        # Read the URL before the try block: the except logger references it,
        # so leaving it inside would risk an UnboundLocalError (original bug).
        url = article_data['link']
        driver = self.create_driver()
        try:
            driver.get(url)
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # Extract title
            title = None
            title_element = soup.find('h1')  # Simple h1 search
            if title_element:
                title = title_element.get_text().strip()
            # Extract description
            desc = None
            article_body = soup.find('section', class_='article-content')
            if article_body:
                paragraphs = article_body.find_all('p')
                desc = ' '.join(p.get_text().strip() for p in paragraphs)
            return {
                'title': title or 'Title not found',
                'desc': desc or 'Description not found',
                'date': article_data['date'] or 'Date not found',
                'link': url
            }
        except Exception as e:
            self.logger.error(f"Error scraping article {url}: {str(e)}")
            return None
        finally:
            driver.quit()

    def process_articles_batch(self, article_batch):
        """Process a batch of articles in parallel, appending successes to self.articles."""
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.scrape_article_parallel, article_data)
                       for article_data in article_batch]
            successful_articles = 0
            for future in as_completed(futures):
                try:
                    article = future.result()
                    if article:
                        self.articles.append(article)
                        successful_articles += 1
                        self.logger.info(f"Successfully scraped: {article['title'][:50]}...")
                except Exception as e:
                    self.logger.error(f"Error processing article: {str(e)}")
            if successful_articles < len(article_batch):
                self.logger.warning(f"Only processed {successful_articles}/{len(article_batch)} articles in this batch")

    def _write_csv(self, filename, articles):
        """Write *articles* to *filename* as CSV; raises OSError/csv.Error on failure."""
        with open(filename, 'w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=['title', 'desc', 'date', 'link'])
            writer.writeheader()
            writer.writerows(articles)

    def save_progress(self, topic, force=False):
        """Save current progress to a *_partial.csv, rate-limited to save_interval.

        With ``force=True`` (interrupt/error paths) the rate limit is bypassed.
        """
        current_time = time.time()
        if not (force or (current_time - self.last_save_time >= self.save_interval and self.articles)):
            return
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"scroll_{topic}articles_{timestamp}_partial.csv"
        try:
            self._write_csv(filename, self.articles)
            self.last_save_time = current_time
            # Report the actual checkpoint file (was logged as "(unknown)").
            print(f"\nProgress saved to {filename}: {len(self.articles)} articles")
            self.logger.info(f"Progress saved to {filename}: {len(self.articles)} articles")
        except Exception as e:
            self.logger.error(f"Error saving progress: {str(e)}")

    def save_to_csv(self, articles, topic, final=False):
        """Save *articles* to a CSV file.

        Returns the filename on success, or None when *articles* is empty or
        the write fails.
        """
        if not articles:
            self.logger.error("No articles to save")
            return None
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"scroll_{topic}articles_{timestamp}_{'final' if final else 'partial'}.csv"
        try:
            self._write_csv(filename, articles)
            # Report the actual output file (was logged as "(unknown)").
            self.logger.info(f"Successfully saved {len(articles)} articles to: {filename}")
            return filename
        except Exception as e:
            self.logger.error(f"Error saving to CSV: {str(e)}")
            return None
def parse_arguments(argv=None):
    """Parse command line arguments.

    This function was commented out while ``main()`` still calls it, which
    made the script crash with NameError on startup; it is restored here.

    Args:
        argv: optional explicit argument list. Defaults to ``sys.argv[1:]``
            (argparse's own default), and exists so the parser can be
            exercised without touching process arguments.

    Returns:
        argparse.Namespace with ``topic``, ``workers``, ``interval`` and
        ``articles_per_page`` attributes.
    """
    parser = argparse.ArgumentParser(description='Scrape articles from Scroll.in by topic')
    parser.add_argument('topic', type=str,
                        help='Topic to scrape (e.g., "RSS", "Covid", "India")')
    parser.add_argument('-w', '--workers', type=int, default=4,
                        help='Number of worker threads (default: 4)')
    parser.add_argument('-i', '--interval', type=int, default=300,
                        help='Auto-save interval in seconds (default: 300)')
    parser.add_argument('-a', '--articles-per-page', type=int, default=10,
                        help='Expected number of articles per page (default: 10)')
    return parser.parse_args(argv)
def main():
    """CLI entry point: parse arguments, run the scraper, checkpoint on interrupt/error."""
    # Pre-bind so the exception handlers below can safely reference these
    # even when the failure happens before they are assigned (the original
    # KeyboardInterrupt handler raised NameError in that case).
    scraper = None
    topic = None
    try:
        # Parse command line arguments
        args = parse_arguments()
        # Initialize the scraper with command line arguments
        scraper = ScrollArticleScraper(
            max_workers=args.workers,
            articles_per_page=args.articles_per_page
        )
        scraper.save_interval = args.interval
        # Get topic from command line argument
        topic = args.topic
        print(f"\nScraping {topic}-related articles from Scroll.in...")
        print("Press Ctrl+C at any time to save progress and exit.")
        final_csv = scraper.scrape_topic(topic.lower(), topic)
        if final_csv:
            print(f"\nArticles have been saved to: {final_csv}")
            print(f"Total articles scraped: {len(scraper.articles)}")
        else:
            print("\nError saving to final CSV file")
    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving progress...")
        if scraper is not None and scraper.articles:
            scraper.save_progress(topic, force=True)
        print("Saved progress and exiting.")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        if scraper is not None and scraper.articles:
            scraper.save_progress(topic, force=True)
            print("Saved progress despite error.")


if __name__ == "__main__":
    main()