News-Scraper / src /run_scraper.py
Nishitha03's picture
Upload 15 files
dd99def verified
#!/usr/bin/env python3
"""
Main entry point for running the news scrapers.
This script acts as a unified interface to run any of the available scrapers.
"""
import os
import sys
import argparse
import importlib
from datetime import datetime
# Create the 'output' directory (relative to the current working directory) as
# soon as this module is loaded, so it already exists when main() changes into it.
os.makedirs('output', exist_ok=True)
def _positive_int(value):
    """Convert a command-line string to an integer that is at least 1.

    Intended for use as an argparse ``type=`` callable: any failure is raised
    as ``argparse.ArgumentTypeError`` so argparse reports it as a usage error
    instead of letting an invalid value reach the scraper.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(
            f"must be a positive integer, got {number}")
    return number


def parse_arguments():
    """Parse command line arguments.

    Reads the arguments from ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``source`` (one of 'toi',
        'ndtv', 'wion', 'scroll'), ``topic`` (str), ``workers`` (int >= 1,
        default 4) and ``interval`` (int, default 300).

    Raises:
        SystemExit: raised by argparse when a required argument is missing
            or a value is invalid (including a non-positive ``--workers``).
    """
    parser = argparse.ArgumentParser(
        description='Scrape news articles from Indian news websites')

    # Required arguments
    parser.add_argument('--source', '-s', type=str, required=True,
                        choices=['toi', 'ndtv', 'wion', 'scroll'],
                        help='News source to scrape from')
    parser.add_argument('--topic', '-t', type=str, required=True,
                        help='Topic to search for (e.g., "Climate Change", "Politics")')

    # Optional arguments
    # A worker count below 1 is meaningless, so reject it here with a clear
    # usage message rather than failing later inside the scraper.
    parser.add_argument('--workers', '-w', type=_positive_int, default=4,
                        help='Number of worker threads (default: 4)')
    parser.add_argument('--interval', '-i', type=int, default=300,
                        help='Auto-save interval in seconds (default: 300)')

    return parser.parse_args()
def get_scraper_class(source):
    """Look up and import the scraper class registered for a news source.

    Args:
        source: Short source key; one of 'toi', 'ndtv', 'wion', 'scroll'.

    Returns:
        The scraper class object (not an instance) for that source.

    Raises:
        ValueError: if ``source`` is not one of the registered keys.

    Errors from importing the module or from a missing class attribute are
    not handled here and propagate to the caller.
    """
    # source key -> (module to import, class name inside that module)
    registry = {
        'toi': ('scrapers.toi_scraper', 'TOIArticleScraper'),
        'ndtv': ('scrapers.ndtv_scraper', 'NDTVArticleScraper'),
        'wion': ('scrapers.wion_scraper', 'WIONArticleScraper'),
        'scroll': ('scrapers.scroll_scraper', 'ScrollArticleScraper'),
    }

    target = registry.get(source)
    if target is None:
        raise ValueError(f"Unsupported news source: {source}")

    module_path, attr_name = target
    # The module is imported only now, so only the requested scraper is loaded.
    return getattr(importlib.import_module(module_path), attr_name)
def _collected_articles(scraper):
    """Return the articles a scraper has gathered so far (empty if none).

    ``scraper`` may be None when the run failed before a scraper was created.
    Scrapers keep their results in either ``articles`` or ``scraped_articles``.
    """
    if scraper is None:
        return []
    return getattr(scraper, 'articles', None) or getattr(scraper, 'scraped_articles', [])


def main():
    """Main function to run the scraper based on command line arguments.

    Parses the command line, builds the scraper for the requested source,
    changes into the 'output' directory and runs the scrape. If the run is
    interrupted or fails after articles were collected, the scraper's
    ``save_progress`` is called before returning.

    Returns:
        int: 0 when the run finishes or is interrupted with Ctrl+C,
        1 when any other exception is raised.
    """
    args = parse_arguments()

    # Bound before the try block so the handlers below never touch an
    # unassigned name when the failure happens before the scraper is built
    # (for example when the scraper module cannot be imported).
    scraper = None

    try:
        print("\n--- Indian News Scraper ---")
        print(f"Source: {args.source}")
        print(f"Topic: {args.topic}")
        print(f"Workers: {args.workers}")
        print(f"Auto-save interval: {args.interval} seconds")
        print("---------------------------\n")

        # Get the appropriate scraper class and initialize it
        scraper_class = get_scraper_class(args.source)
        scraper = scraper_class(max_workers=args.workers)
        scraper.save_interval = args.interval

        # Work inside the output directory for the rest of the run
        os.chdir('output')

        print(f"Starting to scrape {args.topic}-related articles from {args.source.upper()}...")
        print("Press Ctrl+C at any time to save progress and exit.")

        # Run the scraper (accounting for different parameter names in different scrapers)
        if args.source == 'toi':
            topic_url = f"{scraper.base_url}/topic/{args.topic}/news"
            final_csv = scraper.scrape_topic(topic_url, args.topic)
        elif args.source == 'ndtv':
            final_csv = scraper.scrape_topic(args.topic)
        elif args.source in ('wion', 'scroll'):
            final_csv = scraper.scrape_topic(args.topic.lower(), args.topic)

        # Print results
        if final_csv:
            article_count = len(scraper.articles) if hasattr(scraper, 'articles') else len(scraper.scraped_articles)
            print(f"\nArticles have been saved to: {final_csv}")
            print(f"Total articles scraped: {article_count}")
        else:
            print("\nError saving to final CSV file")

    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving progress...")
        if _collected_articles(scraper):
            scraper.save_progress(args.topic, force=True)
            print("Saved progress and exiting.")

    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        if _collected_articles(scraper):
            scraper.save_progress(args.topic, force=True)
            print("Saved progress despite error.")
        return 1

    return 0
if __name__ == "__main__":
    # Hand main()'s return value to the shell as the process exit status.
    exit_status = main()
    sys.exit(exit_status)