coachable-course-agent / scripts /course_scraper.py
rdisipio's picture
min dedupl
d9e098b
#!/usr/bin/env python3
"""
Course Scraper CLI
Scrapes courses from a single platform and stores them as JSON.
Usage:
pipenv run python scripts/course_scraper.py --topic "data science" --platform coursera --count 10
pipenv run python scripts/course_scraper.py --topic "machine learning" --platform udemy --count 5
pipenv run python scripts/course_scraper.py --topic "web development" --platform coursera --count 3
"""
import argparse
import json
import os
from datetime import datetime
from typing import List, Dict
from scrapers.coursera_scraper import CourseraScraper
from scrapers.udemy_scraper import UdemyScraper
from scrapers.coursesity_scraper import CoursesityScraper
from scrapers.mit_scraper import MITScraper
from scrapers.harvard_scraper import HarvardScraper
from llm_processor import LLMProcessor
# Registry of supported platforms: maps the value accepted by --platform to
# the scraper class that main() instantiates (with no arguments) for it.
SCRAPER_MAP: Dict[str, type] = {
    "coursera": CourseraScraper,
    "udemy": UdemyScraper,
    "coursesity": CoursesityScraper,
    "mit": MITScraper,
    "harvard": HarvardScraper,
}
# Characters replaced with '_' when the topic is embedded in the output file
# name: whitespace/path separators plus characters Windows rejects in names.
_UNSAFE_FILENAME_CHARS = ' /\\:*?"<>|'
_FILENAME_TABLE = str.maketrans(_UNSAFE_FILENAME_CHARS, '_' * len(_UNSAFE_FILENAME_CHARS))


def _dedupe_by_url(courses: List[Dict]) -> List[Dict]:
    """Return ``courses`` in order, dropping repeats of an already-seen URL.

    Courses whose ``url`` is missing, ``None`` or blank cannot be compared,
    so they are kept rather than being discarded as "duplicates".
    """
    seen_urls = set()
    unique_courses = []
    for course in courses:
        # ``or ''`` covers both a missing key and an explicit None value.
        course_url = (course.get('url') or '').strip()
        if course_url:
            if course_url in seen_urls:
                continue
            seen_urls.add(course_url)
        unique_courses.append(course)
    return unique_courses


def _safe_filename_part(topic: str) -> str:
    """Return ``topic`` with file-name-unsafe characters replaced by '_'."""
    return topic.translate(_FILENAME_TABLE)


def main() -> int:
    """Scrape courses for one topic from one platform and save them as JSON.

    Command-line arguments:
        --topic        search topic (required)
        --count        number of courses to request (default 10, must be >= 1)
        --platform     one of the keys of SCRAPER_MAP (required)
        --process-llm  additionally pass the courses through LLMProcessor

    The result is written to
    ``data/scraped_courses/raw_data/<platform>_<topic>_<timestamp>.json``.

    Returns:
        0 on success, 1 if scraping raised or produced no courses.
        Invalid command-line arguments terminate via argparse (exit code 2).
    """
    parser = argparse.ArgumentParser(description='Scrape courses from online platforms')
    parser.add_argument('--topic', required=True, help='Course topic to search for')
    parser.add_argument('--count', type=int, default=10, help='Number of courses to scrape')
    # Derive the choices from SCRAPER_MAP so the CLI and the registry cannot
    # drift apart; argparse then rejects unknown platforms for us.
    parser.add_argument('--platform', required=True, choices=list(SCRAPER_MAP),
                        help=f"Platform to scrape from ({', '.join(SCRAPER_MAP)})")
    parser.add_argument('--process-llm', action='store_true',
                        help='Process scraped data with LLM for standardization')
    args = parser.parse_args()

    if args.count < 1:
        parser.error('--count must be a positive integer')

    platform = args.platform

    print(f"🔍 Scraping {args.count} courses about '{args.topic}' from {platform}")

    # Scrape from the platform
    print(f"\n📚 Scraping {platform}...")
    try:
        scraper = SCRAPER_MAP[platform]()
        courses = scraper.search_courses(args.topic, args.count)

        # Add source info to each course; one timestamp for the whole run.
        scraped_at = datetime.now().isoformat()
        for course in courses:
            course['source_platform'] = platform
            course['scraped_at'] = scraped_at

        print(f"✅ Found {len(courses)} courses from {platform}")
    except Exception as e:
        # Top-level boundary: report any scraper failure and exit non-zero.
        print(f"❌ Error scraping {platform}: {e}")
        return 1

    if not courses:
        print("❌ No courses found!")
        return 1

    # Remove duplicates based on URL
    print(f"\n🔍 Removing duplicates from {len(courses)} courses...")
    unique_courses = _dedupe_by_url(courses)
    duplicates_removed = len(courses) - len(unique_courses)
    if duplicates_removed > 0:
        print(f"✅ Removed {duplicates_removed} duplicate courses")
        print(f"📚 {len(unique_courses)} unique courses remaining")
    else:
        print("✅ No duplicates found")
    courses = unique_courses

    # Process with LLM if requested. Track whether it actually succeeded so
    # the saved metadata does not claim processing that never happened.
    processed_with_llm = False
    if args.process_llm:
        print(f"\n🤖 Processing {len(courses)} courses with LLM...")
        try:
            processor = LLMProcessor()
            courses = processor.standardize_courses(courses)
            processed_with_llm = True
            print("✅ LLM processing complete")
        except Exception as e:
            # Best effort: fall back to the raw scraped data.
            print(f"⚠️ LLM processing failed: {e}")
            print("Continuing with raw scraped data...")

    # Save to JSON
    os.makedirs("data/scraped_courses/raw_data", exist_ok=True)
    saved_at = datetime.now()
    timestamp = saved_at.strftime("%Y%m%d_%H%M%S")
    safe_topic = _safe_filename_part(args.topic)
    output_file = f"data/scraped_courses/raw_data/{platform}_{safe_topic}_{timestamp}.json"

    output_data = {
        'metadata': {
            'topic': args.topic,
            'platform': platform,
            'total_courses': len(courses),
            'scraped_at': saved_at.isoformat(),
            'processed_with_llm': processed_with_llm,
        },
        'courses': courses,
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Saved {len(courses)} courses to {output_file}")

    # Summary stats
    print("\n📊 Summary:")
    print(f" {platform}: {len(courses)} courses")
    print(f"\n🎯 Total: {len(courses)} courses saved")
    return 0
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status. SystemExit
    # is used instead of the site-provided exit() helper, which is intended
    # for the interactive shell and is not defined under `python -S`.
    raise SystemExit(main())