Spaces:

Nishitha03
/

News-Scraper

Sleeping

File size: 4,363 Bytes

dd99def

#!/usr/bin/env python3
"""
Main entry point for running the news scrapers.
This script acts as a unified interface to run any of the available scrapers.
"""

import os
import sys
import argparse
import importlib
from datetime import datetime

# Make sure the output directory exists. This runs at import time and
# creates 'output' relative to the current working directory; main()
# later changes into that directory before scraping.
os.makedirs('output', exist_ok=True)

def parse_arguments(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so
            callers that pass nothing behave exactly as before. Passing an
            explicit list allows the parser to be used programmatically.

    Returns:
        argparse.Namespace with the attributes ``source``, ``topic``,
        ``workers`` and ``interval``.

    Raises:
        SystemExit: raised by argparse when a required argument is missing
            or a value is invalid (e.g. an unknown ``--source``).
    """
    parser = argparse.ArgumentParser(description='Scrape news articles from Indian news websites')

    # Required arguments
    parser.add_argument('--source', '-s', type=str, required=True,
                        choices=['toi', 'ndtv', 'wion', 'scroll'],
                        help='News source to scrape from')
    parser.add_argument('--topic', '-t', type=str, required=True,
                        help='Topic to search for (e.g., "Climate Change", "Politics")')

    # Optional arguments
    parser.add_argument('--workers', '-w', type=int, default=4,
                        help='Number of worker threads (default: 4)')
    parser.add_argument('--interval', '-i', type=int, default=300,
                        help='Auto-save interval in seconds (default: 300)')

    # argv=None makes argparse read sys.argv[1:], preserving the old behaviour
    return parser.parse_args(argv)

def get_scraper_class(source):
    """Look up and return the scraper class registered for ``source``.

    The module that holds the class is imported lazily, so only the
    scraper that is actually requested gets loaded.

    Args:
        source: Short key of the news source ('toi', 'ndtv', 'wion' or
            'scroll').

    Returns:
        The scraper class object (not an instance).

    Raises:
        ValueError: if ``source`` is not one of the known keys.
    """
    # key -> (module path, class name inside that module)
    registry = {
        'toi': ('scrapers.toi_scraper', 'TOIArticleScraper'),
        'ndtv': ('scrapers.ndtv_scraper', 'NDTVArticleScraper'),
        'wion': ('scrapers.wion_scraper', 'WIONArticleScraper'),
        'scroll': ('scrapers.scroll_scraper', 'ScrollArticleScraper'),
    }

    target = registry.get(source)
    if target is None:
        raise ValueError(f"Unsupported news source: {source}")

    module_path, attr_name = target
    return getattr(importlib.import_module(module_path), attr_name)

def _collected_articles(scraper):
    """Return the articles gathered so far by ``scraper``, or an empty list.

    Both the ``articles`` and ``scraped_articles`` attribute names are
    tried, in that order. ``scraper`` may be ``None`` when a failure
    happened before a scraper instance was created.
    """
    if scraper is None:
        return []
    return (getattr(scraper, 'articles', None)
            or getattr(scraper, 'scraped_articles', None)
            or [])


def main():
    """Run the scraper selected on the command line.

    Parses the arguments, instantiates the matching scraper, changes into
    the 'output' directory and starts scraping the requested topic. On
    Ctrl+C or on any other error, progress is saved via the scraper's
    ``save_progress`` if any articles were collected.

    Returns:
        int: 0 on normal completion or user interruption, 1 when an
        unexpected exception occurred.
    """
    args = parse_arguments()

    # Bound before the try block so the exception handlers can safely refer
    # to it even when the failure happens before the scraper is created
    # (e.g. the scraper module fails to import).
    scraper = None

    try:
        print("\n--- Indian News Scraper ---")
        print(f"Source: {args.source}")
        print(f"Topic: {args.topic}")
        print(f"Workers: {args.workers}")
        print(f"Auto-save interval: {args.interval} seconds")
        print("---------------------------\n")

        # Get the appropriate scraper class
        ScrapeClass = get_scraper_class(args.source)

        # Initialize the scraper
        scraper = ScrapeClass(max_workers=args.workers)
        scraper.save_interval = args.interval

        # Configure output directory: files written with relative paths
        # from here on land inside 'output'
        os.chdir('output')

        print(f"Starting to scrape {args.topic}-related articles from {args.source.upper()}...")
        print("Press Ctrl+C at any time to save progress and exit.")

        # Run the scraper (accounting for different parameter names in different scrapers)
        if args.source == 'toi':
            topic_url = f"{scraper.base_url}/topic/{args.topic}/news"
            final_csv = scraper.scrape_topic(topic_url, args.topic)
        elif args.source == 'ndtv':
            final_csv = scraper.scrape_topic(args.topic)
        elif args.source in ('wion', 'scroll'):
            final_csv = scraper.scrape_topic(args.topic.lower(), args.topic)

        # Print results
        if final_csv:
            article_count = len(_collected_articles(scraper))
            print(f"\nArticles have been saved to: {final_csv}")
            print(f"Total articles scraped: {article_count}")
        else:
            print("\nError saving to final CSV file")

    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving progress...")
        if _collected_articles(scraper):
            scraper.save_progress(args.topic, force=True)
            print("Saved progress and exiting.")
        else:
            # Do not claim a save that never happened
            print("No articles collected yet; nothing to save.")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        if _collected_articles(scraper):
            scraper.save_progress(args.topic, force=True)
            print("Saved progress despite error.")
        else:
            print("No articles collected; nothing to save.")
        return 1

    return 0

# Run main() only when this file is executed as a script, and hand its
# integer return value to the interpreter as the process exit status.
if __name__ == "__main__":
    sys.exit(main())