Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Main entry point for running the news scrapers. | |
| This script acts as a unified interface to run any of the available scrapers. | |
| """ | |
| import os | |
| import sys | |
| import argparse | |
| import importlib | |
| from datetime import datetime | |
| # Make sure the output directory exists | |
| os.makedirs('output', exist_ok=True) | |
def parse_arguments():
    """Define and evaluate the command-line interface for the scraper runner.

    Returns the parsed argparse namespace with attributes:
    source, topic (required) and workers, interval (optional with defaults).
    """
    parser = argparse.ArgumentParser(description='Scrape news articles from Indian news websites')

    # Declarative option table: (flags, add_argument keyword options).
    option_specs = [
        (('--source', '-s'),
         dict(type=str, required=True,
              choices=['toi', 'ndtv', 'wion', 'scroll'],
              help='News source to scrape from')),
        (('--topic', '-t'),
         dict(type=str, required=True,
              help='Topic to search for (e.g., "Climate Change", "Politics")')),
        (('--workers', '-w'),
         dict(type=int, default=4,
              help='Number of worker threads (default: 4)')),
        (('--interval', '-i'),
         dict(type=int, default=300,
              help='Auto-save interval in seconds (default: 300)')),
    ]
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    return parser.parse_args()
def get_scraper_class(source):
    """Resolve a source key ('toi', 'ndtv', 'wion', 'scroll') to its scraper class.

    The scraper module is imported lazily so only the selected site's
    dependencies are loaded. Raises ValueError for an unknown source key.
    """
    registry = {
        'toi': ('scrapers.toi_scraper', 'TOIArticleScraper'),
        'ndtv': ('scrapers.ndtv_scraper', 'NDTVArticleScraper'),
        'wion': ('scrapers.wion_scraper', 'WIONArticleScraper'),
        'scroll': ('scrapers.scroll_scraper', 'ScrollArticleScraper'),
    }
    if source not in registry:
        raise ValueError(f"Unsupported news source: {source}")
    module_path, class_name = registry[source]
    return getattr(importlib.import_module(module_path), class_name)
def _collected_articles(scraper):
    """Return whichever article list the scraper exposes (`articles` or
    `scraped_articles`), falling back to an empty list; a falsy `articles`
    also falls through, matching the original `or` semantics."""
    return getattr(scraper, 'articles', None) or getattr(scraper, 'scraped_articles', [])


def main():
    """Run the scraper selected on the command line.

    Returns:
        0 on success (including a user interrupt), 1 when an unexpected
        error occurred (partial progress is saved first if possible).
    """
    args = parse_arguments()
    # BUGFIX: initialize before the try so the exception handlers can safely
    # test whether a scraper was ever constructed. Previously, an interrupt or
    # error raised before the constructor finished made the handlers evaluate
    # getattr(scraper, ...) on an unbound name, raising NameError — in the
    # Exception handler this happened before the 'scraper' in locals() guard.
    scraper = None
    try:
        print(f"\n--- Indian News Scraper ---")
        print(f"Source: {args.source}")
        print(f"Topic: {args.topic}")
        print(f"Workers: {args.workers}")
        print(f"Auto-save interval: {args.interval} seconds")
        print("---------------------------\n")

        # Get the appropriate scraper class and configure it.
        ScrapeClass = get_scraper_class(args.source)
        scraper = ScrapeClass(max_workers=args.workers)
        scraper.save_interval = args.interval

        # Write all output files into the 'output' directory (created at
        # module import time, relative to the original working directory).
        os.chdir('output')

        print(f"Starting to scrape {args.topic}-related articles from {args.source.upper()}...")
        print("Press Ctrl+C at any time to save progress and exit.")

        # Each scraper's scrape_topic has a slightly different signature.
        if args.source == 'toi':
            topic_url = f"{scraper.base_url}/topic/{args.topic}/news"
            final_csv = scraper.scrape_topic(topic_url, args.topic)
        elif args.source == 'ndtv':
            final_csv = scraper.scrape_topic(args.topic)
        elif args.source == 'wion' or args.source == 'scroll':
            final_csv = scraper.scrape_topic(args.topic.lower(), args.topic)

        # Report the outcome.
        if final_csv:
            article_count = len(scraper.articles) if hasattr(scraper, 'articles') else len(scraper.scraped_articles)
            print(f"\nArticles have been saved to: {final_csv}")
            print(f"Total articles scraped: {article_count}")
        else:
            print("\nError saving to final CSV file")
    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving progress...")
        if scraper is not None and _collected_articles(scraper):
            scraper.save_progress(args.topic, force=True)
        print("Saved progress and exiting.")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        if scraper is not None and _collected_articles(scraper):
            scraper.save_progress(args.topic, force=True)
            print("Saved progress despite error.")
        return 1
    return 0
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())