#!/usr/bin/env python3
"""
Course Scraper CLI
Scrapes courses from a single platform and stores them as JSON.
Usage:
pipenv run python scripts/course_scraper.py --topic "data science" --platform coursera --count 10
pipenv run python scripts/course_scraper.py --topic "machine learning" --platform udemy --count 5
pipenv run python scripts/course_scraper.py --topic "web development" --platform coursera --count 3
"""
import argparse
import json
import os
from datetime import datetime
from typing import List, Dict
from scrapers.coursera_scraper import CourseraScraper
from scrapers.udemy_scraper import UdemyScraper
from scrapers.coursesity_scraper import CoursesityScraper
from scrapers.mit_scraper import MITScraper
from scrapers.harvard_scraper import HarvardScraper
from llm_processor import LLMProcessor
# Registry of supported platforms: the value accepted by --platform mapped to
# the scraper class that main() instantiates (with no arguments) for it.
SCRAPER_MAP = dict(
    coursera=CourseraScraper,
    udemy=UdemyScraper,
    coursesity=CoursesityScraper,
    mit=MITScraper,
    harvard=HarvardScraper,
)
def _parse_args(argv=None):
    """Parse the command-line options for one scraping run.

    The accepted --platform values are taken from SCRAPER_MAP so the CLI can
    never drift out of sync with the registered scrapers.
    """
    platforms = list(SCRAPER_MAP)
    parser = argparse.ArgumentParser(description='Scrape courses from online platforms')
    parser.add_argument('--topic', required=True, help='Course topic to search for')
    parser.add_argument('--count', type=int, default=10, help='Number of courses to scrape')
    parser.add_argument('--platform', required=True, choices=platforms,
                        help=f"Platform to scrape from ({', '.join(platforms)})")
    parser.add_argument('--process-llm', action='store_true',
                        help='Process scraped data with LLM for standardization')
    return parser.parse_args(argv)


def _dedupe_by_url(courses: List[Dict]) -> List[Dict]:
    """Return the courses in input order, keeping only the first one per URL.

    URLs are compared after stripping surrounding whitespace. Entries whose
    'url' is missing, None or blank are dropped too, as before.
    """
    seen_urls = set()
    unique_courses = []
    for course in courses:
        # `or ''` also covers an explicit None value, which the default
        # argument of dict.get() does not (it only applies to missing keys).
        course_url = str(course.get('url') or '').strip()
        if course_url and course_url not in seen_urls:
            seen_urls.add(course_url)
            unique_courses.append(course)
    return unique_courses


def _safe_filename_part(text: str) -> str:
    """Replace whitespace, path separators and Windows-reserved characters with '_'."""
    unsafe = ' \t\r\n/\\:*?"<>|'
    return text.translate(str.maketrans(unsafe, '_' * len(unsafe)))


def main() -> int:
    """Scrape courses for one topic from one platform and save them as JSON.

    Steps: parse the CLI options, run the selected scraper, tag every course
    with its source platform and scrape time, drop duplicate URLs, optionally
    standardize the result with LLMProcessor, then write a timestamped JSON
    file under data/scraped_courses/raw_data/.

    Returns:
        0 on success; 1 when scraping raised or produced no courses.
    """
    # NOTE(review): the status glyphs in this file were mojibake in the
    # reviewed copy. The check mark, cross, warning sign, robot and target were
    # recovered from their byte patterns; the magnifier, books, arrows and
    # chart glyphs are best-effort restorations -- confirm against the original.
    args = _parse_args()
    platform = args.platform  # argparse `choices` guarantees a SCRAPER_MAP key

    print(f"🔍 Scraping {args.count} courses about '{args.topic}' from {platform}")

    # Scrape from the platform
    print(f"\n📚 Scraping {platform}...")
    try:
        scraper = SCRAPER_MAP[platform]()
        courses = scraper.search_courses(args.topic, args.count)
        # Add source info to each course
        for course in courses:
            course['source_platform'] = platform
            course['scraped_at'] = datetime.now().isoformat()
        print(f"✅ Found {len(courses)} courses from {platform}")
    except Exception as e:
        # Top-level boundary: any scraper failure ends the run with status 1.
        print(f"❌ Error scraping {platform}: {e}")
        return 1

    if not courses:
        print("❌ No courses found!")
        return 1

    # Remove duplicates based on URL
    print(f"\n🔄 Removing duplicates from {len(courses)} courses...")
    unique_courses = _dedupe_by_url(courses)
    duplicates_removed = len(courses) - len(unique_courses)
    if duplicates_removed > 0:
        print(f"✅ Removed {duplicates_removed} duplicate courses")
        print(f"📊 {len(unique_courses)} unique courses remaining")
    else:
        print("✅ No duplicates found")
    courses = unique_courses

    # Process with LLM if requested. Track whether it actually succeeded so
    # the saved metadata does not claim processing that never happened.
    processed_with_llm = False
    if args.process_llm:
        print(f"\n🤖 Processing {len(courses)} courses with LLM...")
        try:
            processor = LLMProcessor()
            courses = processor.standardize_courses(courses)
            processed_with_llm = True
            print("✅ LLM processing complete")
        except Exception as e:
            # Deliberate best-effort: fall back to the raw scraped data.
            print(f"⚠️ LLM processing failed: {e}")
            print("Continuing with raw scraped data...")

    # Save to JSON
    output_dir = "data/scraped_courses/raw_data"
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_topic = _safe_filename_part(args.topic)
    output_file = f"{output_dir}/{platform}_{safe_topic}_{timestamp}.json"

    output_data = {
        'metadata': {
            'topic': args.topic,
            'platform': platform,
            'total_courses': len(courses),
            'scraped_at': datetime.now().isoformat(),
            'processed_with_llm': processed_with_llm,
        },
        'courses': courses,
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Saved {len(courses)} courses to {output_file}")

    # Summary stats
    print("\n📊 Summary:")
    print(f" {platform}: {len(courses)} courses")
    print(f"\n🎯 Total: {len(courses)} courses saved")
    return 0
if __name__ == '__main__':
    # Hand main()'s status code to the shell. SystemExit is raised directly
    # because the bare `exit` helper is installed by the `site` module for
    # interactive use and is missing under `python -S` or in frozen builds.
    raise SystemExit(main())
|