Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Main entry point for running the news scrapers. | |
| This script acts as a unified interface to run any of the available scrapers. | |
| """ | |
| import os | |
| import sys | |
| import argparse | |
| import importlib | |
| from datetime import datetime | |
| # Make sure the output directory exists | |
| os.makedirs('output', exist_ok=True) | |
def parse_arguments():
    """Define and evaluate the command-line interface for the scraper runner.

    Returns the parsed argparse namespace with attributes:
    source, topic (required) and workers, interval (optional with defaults).
    """
    parser = argparse.ArgumentParser(description='Scrape news articles from Indian news websites')

    # Declarative option table: (flags, add_argument keyword options).
    option_specs = [
        (('--source', '-s'),
         dict(type=str, required=True,
              choices=['toi', 'ndtv', 'wion', 'scroll'],
              help='News source to scrape from')),
        (('--topic', '-t'),
         dict(type=str, required=True,
              help='Topic to search for (e.g., "Climate Change", "Politics")')),
        (('--workers', '-w'),
         dict(type=int, default=4,
              help='Number of worker threads (default: 4)')),
        (('--interval', '-i'),
         dict(type=int, default=300,
              help='Auto-save interval in seconds (default: 300)')),
    ]
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    return parser.parse_args()
def get_scraper_class(source):
    """Resolve a source key ('toi', 'ndtv', 'wion', 'scroll') to its scraper class.

    The scraper module is imported lazily so only the selected site's
    dependencies are loaded. Raises ValueError for an unknown source key.
    """
    registry = {
        'toi': ('scrapers.toi_scraper', 'TOIArticleScraper'),
        'ndtv': ('scrapers.ndtv_scraper', 'NDTVArticleScraper'),
        'wion': ('scrapers.wion_scraper', 'WIONArticleScraper'),
        'scroll': ('scrapers.scroll_scraper', 'ScrollArticleScraper'),
    }
    if source not in registry:
        raise ValueError(f"Unsupported news source: {source}")
    module_path, class_name = registry[source]
    return getattr(importlib.import_module(module_path), class_name)
def _collected_articles(scraper):
    """Return whichever article list the scraper exposes (`articles` or
    `scraped_articles`), falling back to an empty list; a falsy `articles`
    also falls through, matching the original `or` semantics."""
    return getattr(scraper, 'articles', None) or getattr(scraper, 'scraped_articles', [])


def main():
    """Run the scraper selected on the command line.

    Returns:
        0 on success (including a user interrupt), 1 when an unexpected
        error occurred (partial progress is saved first if possible).
    """
    args = parse_arguments()
    # BUGFIX: initialize before the try so the exception handlers can safely
    # test whether a scraper was ever constructed. Previously, an interrupt or
    # error raised before the constructor finished made the handlers evaluate
    # getattr(scraper, ...) on an unbound name, raising NameError — in the
    # Exception handler this happened before the 'scraper' in locals() guard.
    scraper = None
    try:
        print(f"\n--- Indian News Scraper ---")
        print(f"Source: {args.source}")
        print(f"Topic: {args.topic}")
        print(f"Workers: {args.workers}")
        print(f"Auto-save interval: {args.interval} seconds")
        print("---------------------------\n")

        # Get the appropriate scraper class and configure it.
        ScrapeClass = get_scraper_class(args.source)
        scraper = ScrapeClass(max_workers=args.workers)
        scraper.save_interval = args.interval

        # Write all output files into the 'output' directory (created at
        # module import time, relative to the original working directory).
        os.chdir('output')

        print(f"Starting to scrape {args.topic}-related articles from {args.source.upper()}...")
        print("Press Ctrl+C at any time to save progress and exit.")

        # Each scraper's scrape_topic has a slightly different signature.
        if args.source == 'toi':
            topic_url = f"{scraper.base_url}/topic/{args.topic}/news"
            final_csv = scraper.scrape_topic(topic_url, args.topic)
        elif args.source == 'ndtv':
            final_csv = scraper.scrape_topic(args.topic)
        elif args.source == 'wion' or args.source == 'scroll':
            final_csv = scraper.scrape_topic(args.topic.lower(), args.topic)

        # Report the outcome.
        if final_csv:
            article_count = len(scraper.articles) if hasattr(scraper, 'articles') else len(scraper.scraped_articles)
            print(f"\nArticles have been saved to: {final_csv}")
            print(f"Total articles scraped: {article_count}")
        else:
            print("\nError saving to final CSV file")
    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving progress...")
        if scraper is not None and _collected_articles(scraper):
            scraper.save_progress(args.topic, force=True)
        print("Saved progress and exiting.")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        if scraper is not None and _collected_articles(scraper):
            scraper.save_progress(args.topic, force=True)
            print("Saved progress despite error.")
        return 1
    return 0
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())