#!/usr/bin/env python3
"""
Main entry point for running the news scrapers.

This script acts as a unified interface to run any of the available scrapers.
"""

import os
import sys
import argparse
import importlib
from datetime import datetime

# Make sure the output directory exists
os.makedirs('output', exist_ok=True)

# Registry of supported sources: CLI name -> (module path, scraper class name).
# Shared by the argument parser (for ``choices``) and by get_scraper_class()
# so the two can never drift apart.
_SCRAPER_SOURCES = {
    'toi': ('scrapers.toi_scraper', 'TOIArticleScraper'),
    'ndtv': ('scrapers.ndtv_scraper', 'NDTVArticleScraper'),
    'wion': ('scrapers.wion_scraper', 'WIONArticleScraper'),
    'scroll': ('scrapers.scroll_scraper', 'ScrollArticleScraper'),
}


def parse_arguments(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        The ``argparse.Namespace`` with ``source``, ``topic``, ``workers``
        and ``interval`` attributes.
    """
    parser = argparse.ArgumentParser(description='Scrape news articles from Indian news websites')

    # Required arguments
    parser.add_argument('--source', '-s', type=str, required=True,
                        choices=list(_SCRAPER_SOURCES),
                        help='News source to scrape from')
    parser.add_argument('--topic', '-t', type=str, required=True,
                        help='Topic to search for (e.g., "Climate Change", "Politics")')

    # Optional arguments
    parser.add_argument('--workers', '-w', type=int, default=4,
                        help='Number of worker threads (default: 4)')
    parser.add_argument('--interval', '-i', type=int, default=300,
                        help='Auto-save interval in seconds (default: 300)')

    return parser.parse_args(argv)


def get_scraper_class(source):
    """Dynamically import the appropriate scraper class based on the source.

    Args:
        source: Short source name; one of the keys of the source registry
            ('toi', 'ndtv', 'wion', 'scroll').

    Returns:
        The scraper class object looked up on the imported module.

    Raises:
        ValueError: If ``source`` is not a supported news source.
        ImportError: If the scraper module cannot be imported.
        AttributeError: If the module does not define the expected class.
    """
    if source not in _SCRAPER_SOURCES:
        raise ValueError(f"Unsupported news source: {source}")

    module_name, class_name = _SCRAPER_SOURCES[source]
    module = importlib.import_module(module_name)
    return getattr(module, class_name)


def _collected_articles(scraper):
    """Return the articles gathered so far, whichever attribute holds them.

    Scrapers expose their results either as ``articles`` or as
    ``scraped_articles``; an empty list is returned when neither is set.
    """
    return (getattr(scraper, 'articles', None)
            or getattr(scraper, 'scraped_articles', None)
            or [])


def _save_partial_progress(scraper, topic):
    """Force-save progress if a scraper exists and has collected anything.

    Returns:
        True when ``save_progress`` was called, False when there was nothing
        to save (no scraper was constructed, or it holds no articles).
    """
    if scraper is None or not _collected_articles(scraper):
        return False
    scraper.save_progress(topic, force=True)
    return True


def _run_scrape(scraper, source, topic):
    """Call ``scrape_topic`` with the argument shape the given source expects.

    Returns:
        Whatever the scraper's ``scrape_topic`` returns (used by the caller
        as the path of the final CSV; falsy means saving failed).

    Raises:
        ValueError: If ``source`` has no known calling convention.
    """
    if source == 'toi':
        # TOI takes a full topic URL plus the topic name.
        topic_url = f"{scraper.base_url}/topic/{topic}/news"
        return scraper.scrape_topic(topic_url, topic)
    if source == 'ndtv':
        return scraper.scrape_topic(topic)
    if source in ('wion', 'scroll'):
        return scraper.scrape_topic(topic.lower(), topic)
    raise ValueError(f"Unsupported news source: {source}")


def main(argv=None):
    """Main function to run the scraper based on command line arguments.

    Args:
        argv: Optional list of argument strings; ``None`` means use
            ``sys.argv[1:]``.

    Returns:
        Process exit code: 0 on normal completion or user interrupt,
        1 when an unexpected exception occurred.
    """
    args = parse_arguments(argv)

    # Bound before the try block so the exception handlers can safely tell
    # whether a scraper was ever constructed (import or constructor failures
    # and an early Ctrl+C all happen before assignment).
    scraper = None

    try:
        print("\n--- Indian News Scraper ---")
        print(f"Source: {args.source}")
        print(f"Topic: {args.topic}")
        print(f"Workers: {args.workers}")
        print(f"Auto-save interval: {args.interval} seconds")
        print("---------------------------\n")

        # Get the appropriate scraper class
        scraper_class = get_scraper_class(args.source)

        # Initialize the scraper
        scraper = scraper_class(max_workers=args.workers)
        scraper.save_interval = args.interval

        # Configure output directory
        os.chdir('output')

        print(f"Starting to scrape {args.topic}-related articles from {args.source.upper()}...")
        print("Press Ctrl+C at any time to save progress and exit.")

        # Run the scraper (accounting for different parameter names in different scrapers)
        final_csv = _run_scrape(scraper, args.source, args.topic)

        # Print results
        if final_csv:
            article_count = len(_collected_articles(scraper))
            print(f"\nArticles have been saved to: {final_csv}")
            print(f"Total articles scraped: {article_count}")
        else:
            print("\nError saving to final CSV file")

    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving progress...")
        if _save_partial_progress(scraper, args.topic):
            print("Saved progress and exiting.")

    except Exception as e:
        # Top-level boundary: report the failure, salvage what was scraped,
        # and signal the error through the exit code.
        print(f"\nAn error occurred: {e}")
        if _save_partial_progress(scraper, args.topic):
            print("Saved progress despite error.")
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())