Spaces:
Sleeping
Sleeping
File size: 4,363 Bytes
dd99def |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
#!/usr/bin/env python3
"""
Main entry point for running the news scrapers.
This script acts as a unified interface to run any of the available scrapers.
"""
import os
import sys
import argparse
import importlib
from datetime import datetime
# Make sure the output directory exists. This runs at import time, before
# main() changes the working directory into it; the path is relative to the
# directory the script is launched from.
os.makedirs('output', exist_ok=True)
def _int_at_least(minimum):
    """Build an argparse ``type`` callable accepting integers >= ``minimum``."""
    def _parse(text):
        try:
            value = int(text)
        except ValueError:
            raise argparse.ArgumentTypeError(
                f"must be an integer >= {minimum}, got {text!r}") from None
        if value < minimum:
            raise argparse.ArgumentTypeError(
                f"must be an integer >= {minimum}, got {text!r}")
        return value
    return _parse


def parse_arguments(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, so existing callers are unaffected.

    Returns:
        argparse.Namespace with ``source``, ``topic``, ``workers`` and
        ``interval`` attributes.

    Raises:
        SystemExit: Raised by argparse (status 2) when a required argument is
            missing or a value is invalid, e.g. ``--workers 0``.
    """
    parser = argparse.ArgumentParser(
        description='Scrape news articles from Indian news websites')

    # Required arguments
    parser.add_argument('--source', '-s', type=str, required=True,
                        choices=['toi', 'ndtv', 'wion', 'scroll'],
                        help='News source to scrape from')
    parser.add_argument('--topic', '-t', type=str, required=True,
                        help='Topic to search for (e.g., "Climate Change", "Politics")')

    # Optional arguments. Reject nonsensical values here so the user gets a
    # clear usage error instead of a failure deep inside the scraper.
    parser.add_argument('--workers', '-w', type=_int_at_least(1), default=4,
                        help='Number of worker threads (default: 4)')
    parser.add_argument('--interval', '-i', type=_int_at_least(0), default=300,
                        help='Auto-save interval in seconds (default: 300)')

    return parser.parse_args(argv)
def get_scraper_class(source):
    """Look up and import the scraper class registered for ``source``.

    Args:
        source: Short source key ('toi', 'ndtv', 'wion' or 'scroll').

    Returns:
        The scraper class object, fetched from its module by name.

    Raises:
        ValueError: If ``source`` is not one of the known keys.
    """
    # Maps each source key to (module path, class name); the module is only
    # imported once the key has been validated.
    registry = {
        'toi': ('scrapers.toi_scraper', 'TOIArticleScraper'),
        'ndtv': ('scrapers.ndtv_scraper', 'NDTVArticleScraper'),
        'wion': ('scrapers.wion_scraper', 'WIONArticleScraper'),
        'scroll': ('scrapers.scroll_scraper', 'ScrollArticleScraper'),
    }

    entry = registry.get(source)
    if entry is None:
        raise ValueError(f"Unsupported news source: {source}")

    module_path, cls_name = entry
    return getattr(importlib.import_module(module_path), cls_name)
def _collected_articles(scraper):
    """Return the articles gathered so far by ``scraper``.

    Looks at ``scraper.articles`` first and falls back to
    ``scraper.scraped_articles``; returns an empty list when neither holds
    anything.
    """
    return getattr(scraper, 'articles', None) or getattr(scraper, 'scraped_articles', [])


def main():
    """Run the scraper selected on the command line.

    Returns:
        int: Process exit status. 0 when scraping completes or the user
        interrupts with Ctrl+C; 1 when an error occurs or no final CSV is
        produced.
    """
    args = parse_arguments()

    # Bind the name before the try block: if the import or constructor below
    # fails (or Ctrl+C arrives first), the handlers must be able to tell that
    # no scraper exists instead of hitting an UnboundLocalError.
    scraper = None

    try:
        print("\n--- Indian News Scraper ---")
        print(f"Source: {args.source}")
        print(f"Topic: {args.topic}")
        print(f"Workers: {args.workers}")
        print(f"Auto-save interval: {args.interval} seconds")
        print("---------------------------\n")

        # Get the appropriate scraper class and initialize it
        scraper_class = get_scraper_class(args.source)
        scraper = scraper_class(max_workers=args.workers)
        scraper.save_interval = args.interval

        # Work from inside the output directory (presumably so files the
        # scraper writes with relative paths land there -- confirm in scrapers).
        os.chdir('output')

        print(f"Starting to scrape {args.topic}-related articles from {args.source.upper()}...")
        print("Press Ctrl+C at any time to save progress and exit.")

        # Run the scraper (accounting for different parameter names in different scrapers)
        if args.source == 'toi':
            topic_url = f"{scraper.base_url}/topic/{args.topic}/news"
            final_csv = scraper.scrape_topic(topic_url, args.topic)
        elif args.source == 'ndtv':
            final_csv = scraper.scrape_topic(args.topic)
        elif args.source in ('wion', 'scroll'):
            final_csv = scraper.scrape_topic(args.topic.lower(), args.topic)
        else:
            # Keeps final_csv from being silently unbound if a source is ever
            # registered without a matching branch here.
            raise ValueError(f"Unsupported news source: {args.source}")

        # A falsy result means the scraper could not write its final CSV;
        # report that as a failure rather than exiting with status 0.
        if not final_csv:
            print("\nError saving to final CSV file")
            return 1

        article_count = (len(scraper.articles) if hasattr(scraper, 'articles')
                         else len(scraper.scraped_articles))
        print(f"\nArticles have been saved to: {final_csv}")
        print(f"Total articles scraped: {article_count}")

    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving progress...")
        if scraper is not None and _collected_articles(scraper):
            scraper.save_progress(args.topic, force=True)
            print("Saved progress and exiting.")
    except Exception as e:
        # Top-level boundary: report the error, salvage what was scraped.
        print(f"\nAn error occurred: {e}")
        if scraper is not None and _collected_articles(scraper):
            scraper.save_progress(args.topic, force=True)
            print("Saved progress despite error.")
        return 1

    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())