#!/usr/bin/env python3
"""
Course Scraper CLI

Scrapes courses from a single platform and stores them as JSON.

Usage:
    pipenv run python scripts/course_scraper.py --topic "data science" --platform coursera --count 10
    pipenv run python scripts/course_scraper.py --topic "machine learning" --platform udemy --count 5
    pipenv run python scripts/course_scraper.py --topic "web development" --platform coursera --count 3
"""

import argparse
import json
import os
import re
import sys
from datetime import datetime
from typing import Dict, List, Optional, Sequence, Tuple

from scrapers.coursera_scraper import CourseraScraper
from scrapers.udemy_scraper import UdemyScraper
from scrapers.coursesity_scraper import CoursesityScraper
from scrapers.mit_scraper import MITScraper
from scrapers.harvard_scraper import HarvardScraper
from llm_processor import LLMProcessor

# Platform name -> scraper class. This is the single source of truth for the
# values accepted by --platform.
SCRAPER_MAP = {
    'coursera': CourseraScraper,
    'udemy': UdemyScraper,
    'coursesity': CoursesityScraper,
    'mit': MITScraper,
    'harvard': HarvardScraper,
}

# Directory (relative to the current working directory) that receives the
# JSON output files.
OUTPUT_DIR = "data/scraped_courses/raw_data"

# Characters replaced by "_" when the topic is embedded in a file name:
# path separators, characters that are invalid in Windows file names, and
# any whitespace.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\s]')


def _positive_int(value: str) -> int:
    """Parse *value* as an integer >= 1 for use as an argparse ``type``."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the scraper CLI."""
    platforms = list(SCRAPER_MAP)
    parser = argparse.ArgumentParser(description='Scrape courses from online platforms')
    parser.add_argument('--topic', required=True, help='Course topic to search for')
    parser.add_argument('--count', type=_positive_int, default=10,
                        help='Number of courses to scrape')
    parser.add_argument('--platform', required=True, choices=platforms,
                        help=f"Platform to scrape from ({', '.join(platforms)})")
    parser.add_argument('--process-llm', action='store_true',
                        help='Process scraped data with LLM for standardization')
    return parser


def _deduplicate_by_url(courses: List[Dict]) -> Tuple[List[Dict], int, int]:
    """Keep the first course seen for each distinct, non-empty URL.

    Returns:
        A tuple ``(unique_courses, duplicates_removed, missing_url)``.
        Courses whose ``url`` is missing, ``None`` or blank cannot be
        identified; they are left out of ``unique_courses`` and counted in
        ``missing_url`` rather than being reported as duplicates.
    """
    seen_urls = set()
    unique_courses = []
    duplicates_removed = 0
    missing_url = 0
    for course in courses:
        # ``or ''`` covers an explicit ``'url': None`` entry, which would
        # otherwise fail on ``.strip()``.
        url = str(course.get('url') or '').strip()
        if not url:
            missing_url += 1
        elif url in seen_urls:
            duplicates_removed += 1
        else:
            seen_urls.add(url)
            unique_courses.append(course)
    return unique_courses, duplicates_removed, missing_url


def _safe_filename_part(text: str) -> str:
    """Return *text* with every file-name-unsafe character replaced by ``_``."""
    return _UNSAFE_FILENAME_CHARS.sub('_', text)


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Scrape one platform, de-duplicate the results and save them as JSON.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 1 when scraping fails, yields no
        courses, or the output file cannot be written. Invalid command-line
        arguments make argparse exit with status 2.
    """
    args = _build_parser().parse_args(argv)
    platform = args.platform

    print(f"🔍 Scraping {args.count} courses about '{args.topic}' from {platform}")
    print(f"\n📚 Scraping {platform}...")

    # One timestamp for the whole run so every course, the metadata and the
    # file name agree.
    scraped_at = datetime.now()
    try:
        scraper = SCRAPER_MAP[platform]()
        courses = scraper.search_courses(args.topic, args.count)
        # Add source info to each course
        for course in courses:
            course['source_platform'] = platform
            course['scraped_at'] = scraped_at.isoformat()
        print(f"✅ Found {len(courses)} courses from {platform}")
    except Exception as e:
        # Top-level boundary: any scraper failure is reported and turned
        # into a non-zero exit status.
        print(f"❌ Error scraping {platform}: {e}")
        return 1

    if not courses:
        print("❌ No courses found!")
        return 1

    # Remove duplicates based on URL
    print(f"\n🔍 Removing duplicates from {len(courses)} courses...")
    courses, duplicates_removed, missing_url = _deduplicate_by_url(courses)
    if duplicates_removed:
        print(f"✅ Removed {duplicates_removed} duplicate courses")
    else:
        print("✅ No duplicates found")
    if missing_url:
        print(f"⚠️ Skipped {missing_url} courses without a URL")
    if duplicates_removed or missing_url:
        print(f"📚 {len(courses)} unique courses remaining")

    # Process with LLM if requested
    if args.process_llm:
        print(f"\n🤖 Processing {len(courses)} courses with LLM...")
        try:
            processor = LLMProcessor()
            courses = processor.standardize_courses(courses)
            print("✅ LLM processing complete")
        except Exception as e:
            # Best effort: fall back to the raw scraped data.
            print(f"⚠️ LLM processing failed: {e}")
            print("Continuing with raw scraped data...")

    # Save to JSON
    timestamp = scraped_at.strftime("%Y%m%d_%H%M%S")
    safe_topic = _safe_filename_part(args.topic)
    output_file = f"{OUTPUT_DIR}/{platform}_{safe_topic}_{timestamp}.json"
    output_data = {
        'metadata': {
            'topic': args.topic,
            'platform': platform,
            'total_courses': len(courses),
            'scraped_at': scraped_at.isoformat(),
            'processed_with_llm': args.process_llm,
        },
        'courses': courses,
    }
    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)
    except OSError as e:
        print(f"❌ Error saving {output_file}: {e}")
        return 1

    print(f"\n✅ Saved {len(courses)} courses to {output_file}")

    # Summary stats
    print("\n📊 Summary:")
    print(f" {platform}: {len(courses)} courses")
    print(f"\n🎯 Total: {len(courses)} courses saved")
    return 0


if __name__ == '__main__':
    sys.exit(main())