# Residue from the hosting site's file viewer: Spaces status "Runtime error"; file size 4,482 Bytes; revision 9686304.
"""
LLM processor for standardizing and enhancing scraped course data
Uses Groq/Llama for data processing
"""
import os
import json
from typing import List, Dict
from groq import Groq
class LLMProcessor:
    """Standardize and enrich scraped course dictionaries with a Groq-hosted LLM.

    Courses are sent to the chat-completions endpoint in small batches. The
    processing is best-effort: whenever a request fails or the reply cannot be
    used, the affected batch is passed through unchanged so that no course is
    ever lost.
    """

    # NOTE(review): Groq may have retired this model id - confirm it is still
    # served, and pass ``model=`` to the constructor to override it if not.
    DEFAULT_MODEL = "llama-3.1-70b-versatile"
    # Small batches keep the prompt and the reply within the token limits.
    DEFAULT_BATCH_SIZE = 5

    def __init__(self, model: str = DEFAULT_MODEL, batch_size: int = DEFAULT_BATCH_SIZE):
        """Create the Groq client.

        Args:
            model: Model identifier sent with every completion request.
            batch_size: Number of courses sent to the LLM per request.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        self.client = Groq(api_key=os.getenv('GROQ_API_KEY'))
        self.model = model
        self.batch_size = batch_size

    def standardize_courses(self, courses: List[Dict]) -> List[Dict]:
        """Process a list of courses with the LLM for standardization.

        Args:
            courses: List of raw course dictionaries.

        Returns:
            List of processed/standardized course dictionaries, in input
            order. Batches whose LLM processing failed are returned as-is.
        """
        processed_courses: List[Dict] = []
        for start in range(0, len(courses), self.batch_size):
            batch = courses[start:start + self.batch_size]
            try:
                processed_batch = self._process_batch(batch)
            except Exception as e:
                # Deliberately broad: any failure (network, auth, malformed
                # reply object) must degrade to the original data, not abort
                # the whole run.
                print(f" Warning: LLM processing failed for batch {start // self.batch_size + 1}: {e}")
                processed_batch = batch
            processed_courses.extend(processed_batch)
        return processed_courses

    def _process_batch(self, courses: List[Dict]) -> List[Dict]:
        """Send one batch to the LLM and return the standardized courses.

        Falls back to ``courses`` when the reply cannot be parsed or does not
        have the expected shape. Errors raised by the API call itself
        propagate to the caller.
        """
        prompt = self._create_standardization_prompt(courses)
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a data processing expert specializing in online course metadata standardization.",
                },
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
            temperature=0.1,  # Low temperature for consistent output
            max_tokens=4000,
        )
        return self._parse_response(response.choices[0].message.content, courses)

    @staticmethod
    def _extract_json(text: str):
        """Decode ``text`` as JSON, tolerating markdown fences or extra prose.

        Raises:
            json.JSONDecodeError: If no JSON document can be decoded.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            # Models frequently wrap the payload in a fenced code block or add
            # a sentence around it; retry on the outermost {...} span.
            first, last = text.find("{"), text.rfind("}")
            if first == -1 or last <= first:
                raise
            return json.loads(text[first:last + 1])

    @staticmethod
    def _parse_response(content, fallback: List[Dict]) -> List[Dict]:
        """Turn the raw LLM reply into a list of course dictionaries.

        Args:
            content: Text of the reply message (may be ``None`` or empty).
            fallback: The original batch, returned whenever the reply is
                unusable.

        Returns:
            The parsed courses when the reply holds exactly one dictionary
            per input course; otherwise ``fallback``.
        """
        try:
            result = LLMProcessor._extract_json((content or "").strip())
        except json.JSONDecodeError:
            print(" Warning: Failed to parse LLM response as JSON")
            return fallback

        # Expected shape is {"courses": [...]}; a bare list is accepted too.
        parsed = result.get('courses') if isinstance(result, dict) else result
        if not isinstance(parsed, list) or not all(isinstance(item, dict) for item in parsed):
            # Extending the output with a dict or string would silently
            # inject keys/characters instead of courses.
            print(" Warning: LLM response did not contain a list of course objects")
            return fallback
        if len(parsed) != len(fallback):
            # A count mismatch means courses were dropped or invented.
            print(f" Warning: LLM returned {len(parsed)} courses for {len(fallback)} inputs; keeping original data")
            return fallback
        return parsed

    def _create_standardization_prompt(self, courses: List[Dict]) -> str:
        """Build the user prompt that asks the LLM to standardize ``courses``.

        The courses are embedded as pretty-printed JSON, followed by the
        structure the reply must have.
        """
        courses_json = json.dumps(courses, indent=2)
        # The template is kept flush-left so that no source indentation leaks
        # into the prompt text.
        prompt = f"""
Please standardize and enhance the following course data. For each course:
1. **Clean and standardize fields:**
- duration_hours: Convert any duration text to hours (integer)
- level: Standardize to "beginner", "intermediate", "advanced", or "unknown"
- format: Standardize to "self-paced", "instructor-led", or "online"
- price: Convert to numeric value (0.0 for free)
2. **Enhance descriptions:**
- If description is empty, create a brief, professional description based on title
- Keep descriptions concise (2-3 sentences max)
3. **Extract learning objectives:**
- Add a "learning_objectives" field with 3-5 key skills/topics covered
- Base this on the title and any available description
4. **Validate data:**
- Ensure URLs are properly formatted
- Clean up any malformed text
- Set reasonable defaults for missing data
Input courses:
{courses_json}
Return the processed data as a JSON object with this structure:
{{
"courses": [
{{
"title": "cleaned title",
"provider": "provider name",
"url": "full URL",
"description": "enhanced description",
"duration_hours": 25,
"level": "beginner|intermediate|advanced|unknown",
"format": "self-paced|instructor-led|online",
"price": 0.0,
"rating": 4.5,
"enrollment_count": 10000,
"language": "en",
"certificate": true,
"instructor": "instructor name",
"learning_objectives": ["objective 1", "objective 2", "objective 3"],
"skills": [],
"source_platform": "original platform",
"scraped_at": "original timestamp"
}}
]
}}
Only return valid JSON. Do not include any explanatory text.
"""
        return prompt
|