File size: 4,873 Bytes
9686304
 
 
91252e5
9686304
 
8b437a3
 
9878f1b
9686304
 
 
 
 
 
 
 
 
 
2ed0a09
d9e098b
 
9686304
 
 
 
 
2ed0a09
d9e098b
 
 
9686304
 
 
 
 
 
91252e5
d9e098b
 
9686304
 
 
 
 
91252e5
 
9686304
91252e5
 
 
 
9686304
 
91252e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9686304
91252e5
9686304
 
 
ccdc533
91252e5
ccdc533
 
 
 
91252e5
ccdc533
 
 
 
 
 
 
 
 
 
 
 
 
91252e5
ccdc533
9686304
 
91252e5
9686304
 
91252e5
9686304
 
 
 
 
 
91252e5
 
 
 
 
9686304
 
 
 
91252e5
 
9686304
 
 
91252e5
9686304
 
91252e5
9686304
 
91252e5
9686304
 
91252e5
 
 
9686304
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python3
"""
Course Scraper CLI
Scrapes courses from a single platform and stores them as JSON.

Usage:
    pipenv run python scripts/course_scraper.py --topic "data science" --platform coursera --count 10
    pipenv run python scripts/course_scraper.py --topic "machine learning" --platform udemy --count 5
    pipenv run python scripts/course_scraper.py --topic "web development" --platform coursera --count 3
"""

import argparse
import json
import os
from datetime import datetime
from typing import List, Dict

from scrapers.coursera_scraper import CourseraScraper
from scrapers.udemy_scraper import UdemyScraper
from scrapers.coursesity_scraper import CoursesityScraper
from scrapers.mit_scraper import MITScraper
from scrapers.harvard_scraper import HarvardScraper
from llm_processor import LLMProcessor


# Registry of supported platforms: the name accepted by --platform mapped to
# the scraper class that main() instantiates (with no arguments) for it.
SCRAPER_MAP = {
    "coursera": CourseraScraper,
    "udemy": UdemyScraper,
    "coursesity": CoursesityScraper,
    "mit": MITScraper,
    "harvard": HarvardScraper,
}


def _dedupe_by_url(courses: List[Dict]) -> tuple:
    """Drop courses whose URL was already seen, keeping the first occurrence.

    Courses with a missing, empty, or non-string ``url`` cannot be compared,
    so they are kept rather than being discarded as if they were duplicates.

    Args:
        courses: Course dicts; the ``url`` key is optional.

    Returns:
        A ``(unique_courses, duplicates_removed, missing_url)`` tuple, where
        ``missing_url`` counts the kept courses that had no usable URL.
    """
    seen_urls = set()
    unique_courses = []
    duplicates_removed = 0
    missing_url = 0

    for course in courses:
        raw_url = course.get('url')
        url = raw_url.strip() if isinstance(raw_url, str) else ''
        if not url:
            missing_url += 1
            unique_courses.append(course)
        elif url in seen_urls:
            duplicates_removed += 1
        else:
            seen_urls.add(url)
            unique_courses.append(course)

    return unique_courses, duplicates_removed, missing_url


def _safe_filename_part(text: str) -> str:
    """Return *text* with whitespace and filename-hostile characters as ``_``."""
    unsafe = set('<>:"/\\|?*')
    return ''.join(
        '_' if (ch in unsafe or ch.isspace() or not ch.isprintable()) else ch
        for ch in text
    )


def main(argv=None) -> int:
    """Scrape courses for one topic from one platform and save them as JSON.

    The result is written to ``data/scraped_courses/raw_data`` (relative to
    the current working directory) as
    ``<platform>_<topic>_<YYYYmmdd_HHMMSS>.json``.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        0 on success; 1 if scraping fails, no courses are found, or the
        output file cannot be written. Invalid arguments make argparse exit
        with status 2.
    """
    parser = argparse.ArgumentParser(description='Scrape courses from online platforms')
    parser.add_argument('--topic', required=True, help='Course topic to search for')
    parser.add_argument('--count', type=int, default=10, help='Number of courses to scrape')
    # Derive the choices from SCRAPER_MAP so the CLI cannot drift from the
    # registry; argparse then rejects unknown platforms for us.
    parser.add_argument('--platform', required=True, choices=list(SCRAPER_MAP),
                        help=f"Platform to scrape from ({', '.join(SCRAPER_MAP)})")
    parser.add_argument('--process-llm', action='store_true',
                        help='Process scraped data with LLM for standardization')

    args = parser.parse_args(argv)
    if args.count < 1:
        parser.error('--count must be a positive integer')

    platform = args.platform

    print(f"🔍 Scraping {args.count} courses about '{args.topic}' from {platform}")

    # Scrape from the platform
    print(f"\n📚 Scraping {platform}...")
    try:
        scraper = SCRAPER_MAP[platform]()
        courses = scraper.search_courses(args.topic, args.count)

        # Add source info to each course
        for course in courses:
            course['source_platform'] = platform
            course['scraped_at'] = datetime.now().isoformat()

        print(f"✅ Found {len(courses)} courses from {platform}")
    except Exception as e:  # CLI boundary: report any scraper failure and exit
        print(f"❌ Error scraping {platform}: {e}")
        return 1

    if not courses:
        print("❌ No courses found!")
        return 1

    # Remove duplicates based on URL
    print(f"\n🔍 Removing duplicates from {len(courses)} courses...")
    courses, duplicates_removed, missing_url = _dedupe_by_url(courses)

    if duplicates_removed > 0:
        print(f"✅ Removed {duplicates_removed} duplicate courses")
        print(f"📚 {len(courses)} unique courses remaining")
    else:
        print("✅ No duplicates found")
    if missing_url > 0:
        print(f"⚠️  {missing_url} courses have no URL and were kept without duplicate checking")

    # Process with LLM if requested; this is best-effort, so a failure falls
    # back to the raw scraped data instead of aborting the run.
    if args.process_llm:
        print(f"\n🤖 Processing {len(courses)} courses with LLM...")
        try:
            processor = LLMProcessor()
            courses = processor.standardize_courses(courses)
            print("✅ LLM processing complete")
        except Exception as e:
            print(f"⚠️  LLM processing failed: {e}")
            print("Continuing with raw scraped data...")

    # Save to JSON
    output_dir = "data/scraped_courses/raw_data"
    saved_at = datetime.now()
    timestamp = saved_at.strftime("%Y%m%d_%H%M%S")
    safe_topic = _safe_filename_part(args.topic)
    output_file = f"{output_dir}/{platform}_{safe_topic}_{timestamp}.json"

    output_data = {
        'metadata': {
            'topic': args.topic,
            'platform': platform,
            'total_courses': len(courses),
            'scraped_at': saved_at.isoformat(),
            'processed_with_llm': args.process_llm
        },
        'courses': courses
    }

    try:
        # Serialize before opening the file so a non-serializable course
        # cannot leave a truncated JSON file behind.
        payload = json.dumps(output_data, indent=2, ensure_ascii=False)
        os.makedirs(output_dir, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(payload)
    except (OSError, TypeError, ValueError) as e:
        print(f"❌ Error saving {output_file}: {e}")
        return 1

    print(f"\n✅ Saved {len(courses)} courses to {output_file}")

    # Summary stats
    print("\n📊 Summary:")
    print(f"  {platform}: {len(courses)} courses")
    print(f"\n🎯 Total: {len(courses)} courses saved")
    return 0

if __name__ == '__main__':
    # Raise SystemExit directly: the bare ``exit`` helper is installed by the
    # ``site`` module for interactive use and is missing under ``python -S``.
    raise SystemExit(main())