File size: 4,482 Bytes
9686304
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
LLM processor for standardizing and enhancing scraped course data
Uses Groq/Llama for data processing
"""

import os
import json
from typing import List, Dict
from groq import Groq


class LLMProcessor:
    """Standardize and enrich scraped course records with a Groq-hosted LLM.

    Courses are sent to the chat-completions endpoint in small batches. Any
    batch whose request fails, or whose reply cannot be turned into exactly
    one course dictionary per input course, is passed through unchanged so
    that no course is ever lost.
    """

    # NOTE(review): confirm this model id is still served by Groq; hosted
    # model ids get retired over time. Override via the constructor if needed.
    DEFAULT_MODEL = "llama-3.1-70b-versatile"
    # Small batches keep the prompt plus the reply within the token limits.
    DEFAULT_BATCH_SIZE = 5

    def __init__(self, model: str = DEFAULT_MODEL, batch_size: int = DEFAULT_BATCH_SIZE):
        """
        Create the Groq client and store processing settings.

        Args:
            model: Groq model id used for every completion request.
            batch_size: Number of courses sent per LLM request (>= 1).

        Raises:
            ValueError: If batch_size is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.client = Groq(api_key=os.getenv('GROQ_API_KEY'))
        self.model = model
        self.batch_size = batch_size

    def standardize_courses(self, courses: List[Dict]) -> List[Dict]:
        """
        Process courses with the LLM, batch by batch.

        Args:
            courses: List of raw course dictionaries

        Returns:
            List of processed/standardized course dictionaries, in input
            order. Batches that could not be processed are returned as-is.
        """
        processed_courses: List[Dict] = []

        starts = range(0, len(courses), self.batch_size)
        for batch_number, start in enumerate(starts, start=1):
            batch = courses[start:start + self.batch_size]
            try:
                processed_courses.extend(self._process_batch(batch))
            except Exception as e:
                # Deliberate best-effort boundary: a network/API failure for
                # one batch must not abort the run, so keep the raw records.
                print(f"      Warning: LLM processing failed for batch {batch_number}: {e}")
                processed_courses.extend(batch)

        return processed_courses

    def _process_batch(self, courses: List[Dict]) -> List[Dict]:
        """
        Send one batch to the LLM and return the standardized courses.

        Args:
            courses: The raw course dictionaries of a single batch.

        Returns:
            The standardized courses, or ``courses`` itself when the reply
            is unusable (see ``_parse_response``).
        """
        prompt = self._create_standardization_prompt(courses)

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a data processing expert specializing in online course metadata standardization."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=0.1,  # Low temperature for consistent output
            max_tokens=4000
        )

        return self._parse_response(response.choices[0].message.content, courses)

    @staticmethod
    def _parse_response(content, fallback: List[Dict]) -> List[Dict]:
        """
        Turn the raw LLM reply into a list of course dictionaries.

        Accepts a JSON object with a "courses" list or a bare JSON list. The
        reply may be wrapped in a Markdown code fence or surrounded by
        explanatory text.

        Args:
            content: Text of the LLM reply (may be None or empty).
            fallback: The raw input batch, returned when the reply is unusable.

        Returns:
            The parsed course list, or ``fallback`` when the reply is not
            valid JSON or does not hold exactly one dict per input course.
        """
        candidates = []
        if isinstance(content, str):
            text = content.strip()
            if text.startswith("```"):
                # Drop the opening fence line (it may carry a language tag
                # such as "json") and the closing fence.
                _, _, text = text.partition("\n")
                text = text.rstrip()
                if text.endswith("```"):
                    text = text[:-3]
            candidates.append(text)

            # Second attempt: the outermost {...} span, for replies that put
            # prose before or after the JSON object.
            start, end = content.find("{"), content.rfind("}")
            if 0 <= start < end:
                candidates.append(content[start:end + 1])

        for candidate in candidates:
            try:
                result = json.loads(candidate)
                break
            except json.JSONDecodeError:
                continue
        else:
            print("      Warning: Failed to parse LLM response as JSON")
            return fallback

        parsed = result.get('courses', fallback) if isinstance(result, dict) else result

        # Guard against dropped, duplicated or malformed entries: returning
        # them would silently lose or corrupt courses downstream.
        is_valid = (
            isinstance(parsed, list)
            and len(parsed) == len(fallback)
            and all(isinstance(course, dict) for course in parsed)
        )
        if not is_valid:
            print("      Warning: LLM response did not contain one course object per input course")
            return fallback
        return parsed

    def _create_standardization_prompt(self, courses: List[Dict]) -> str:
        """
        Build the user prompt asking the LLM to standardize ``courses``.

        Args:
            courses: The raw course dictionaries; embedded as indented JSON.

        Returns:
            The prompt text, ending with the required JSON output structure.
        """
        courses_json = json.dumps(courses, indent=2)

        prompt = f"""
Please standardize and enhance the following course data. For each course:

1. **Clean and standardize fields:**
   - duration_hours: Convert any duration text to hours (integer)
   - level: Standardize to "beginner", "intermediate", "advanced", or "unknown"
   - format: Standardize to "self-paced", "instructor-led", or "online"
   - price: Convert to numeric value (0.0 for free)

2. **Enhance descriptions:**
   - If description is empty, create a brief, professional description based on title
   - Keep descriptions concise (2-3 sentences max)

3. **Extract learning objectives:**
   - Add a "learning_objectives" field with 3-5 key skills/topics covered
   - Base this on the title and any available description

4. **Validate data:**
   - Ensure URLs are properly formatted
   - Clean up any malformed text
   - Set reasonable defaults for missing data

Input courses:
{courses_json}

Return the processed data as a JSON object with this structure:
{{
  "courses": [
    {{
      "title": "cleaned title",
      "provider": "provider name", 
      "url": "full URL",
      "description": "enhanced description",
      "duration_hours": 25,
      "level": "beginner|intermediate|advanced|unknown",
      "format": "self-paced|instructor-led|online",
      "price": 0.0,
      "rating": 4.5,
      "enrollment_count": 10000,
      "language": "en",
      "certificate": true,
      "instructor": "instructor name",
      "learning_objectives": ["objective 1", "objective 2", "objective 3"],
      "skills": [],
      "source_platform": "original platform",
      "scraped_at": "original timestamp"
    }}
  ]
}}

Only return valid JSON. Do not include any explanatory text.
"""
        return prompt