# coachable-course-agent/scripts/llm_processor.py
# Hosting-page residue kept for provenance (not code):
#   "rdisipio's picture" / "data scrapers" / "9686304"
"""
LLM processor for standardizing and enhancing scraped course data
Uses Groq/Llama for data processing
"""
import os
import json
from typing import List, Dict
from groq import Groq
class LLMProcessor:
    """Standardize and enrich scraped course records with a Groq-hosted LLM.

    Courses are sent to the chat-completions endpoint in small batches.
    Whenever a request fails or the reply cannot be used, the raw input
    records for that batch are returned unchanged, so no course is lost.
    """

    # Default chat model; pass ``model=...`` to the constructor to override.
    # NOTE(review): confirm this model id is still served by Groq — hosted
    # model ids get retired over time.
    DEFAULT_MODEL = "llama-3.1-70b-versatile"

    def __init__(self, model: str = DEFAULT_MODEL):
        """Create the Groq client.

        Args:
            model: Identifier of the chat model used for every request.
        """
        # The API key is read from the GROQ_API_KEY environment variable.
        self.client = Groq(api_key=os.getenv('GROQ_API_KEY'))
        self.model = model

    def standardize_courses(self, courses: List[Dict], batch_size: int = 5) -> List[Dict]:
        """Process courses with the LLM, batch by batch.

        Args:
            courses: List of raw course dictionaries.
            batch_size: Number of courses sent per request; kept small to
                avoid token limits.

        Returns:
            List of processed/standardized course dictionaries, in input
            order. Batches that could not be processed appear as their
            original, unmodified records.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            # A zero step makes range() raise and a negative one yields no
            # batches at all, which would silently drop every course.
            raise ValueError("batch_size must be a positive integer")

        processed_courses: List[Dict] = []
        starts = range(0, len(courses), batch_size)
        for batch_number, start in enumerate(starts, start=1):
            batch = courses[start:start + batch_size]
            try:
                processed_batch = self._process_batch(batch)
            except Exception as e:
                # Deliberate best-effort boundary: any client/network error
                # degrades to the original data for this batch only.
                print(f" Warning: LLM processing failed for batch {batch_number}: {e}")
                processed_batch = batch
            processed_courses.extend(processed_batch)
        return processed_courses

    def _process_batch(self, courses: List[Dict]) -> List[Dict]:
        """Send one batch to the LLM and return the standardized records.

        Returns ``courses`` unchanged when the reply is empty, is not valid
        JSON, or does not have the expected structure. Errors raised by the
        client call itself propagate to the caller.
        """
        if not courses:
            # Nothing to standardize; skip the API call.
            return []

        prompt = self._create_standardization_prompt(courses)
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a data processing expert specializing in online course metadata standardization."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=0.1,  # Low temperature for consistent output
            max_tokens=4000
        )

        choices = response.choices
        content = choices[0].message.content if choices else None
        return self._parse_response(content, courses)

    def _parse_response(self, content, courses: List[Dict]) -> List[Dict]:
        """Turn the raw reply text into course dictionaries.

        Args:
            content: Text of the model reply (may be ``None`` or empty).
            courses: The original batch, returned as the fallback.

        Returns:
            The list stored under the reply's ``"courses"`` key when it is
            a list of dictionaries with one entry per input course;
            otherwise ``courses``.
        """
        if not content:
            print(" Warning: LLM returned an empty response")
            return courses

        try:
            result = json.loads(self._strip_code_fence(content))
        except json.JSONDecodeError:
            print(" Warning: Failed to parse LLM response as JSON")
            return courses

        if not isinstance(result, dict) or 'courses' not in result:
            # Valid JSON, but not the requested {"courses": [...]} object.
            return courses

        processed = result['courses']
        well_formed = (
            isinstance(processed, list)
            and len(processed) == len(courses)
            and all(isinstance(item, dict) for item in processed)
        )
        if not well_formed:
            # A dropped, duplicated or malformed entry would corrupt the
            # dataset, so prefer the raw batch over a partial result.
            print(" Warning: LLM response did not match the expected structure")
            return courses
        return processed

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove a surrounding Markdown code fence (``` or ```json) if present."""
        stripped = text.strip()
        if not stripped.startswith("```"):
            return stripped
        # Discard the opening fence line, which may carry a language tag.
        _, _, body = stripped.partition("\n")
        body = body.rstrip()
        if body.endswith("```"):
            body = body[:-3]
        return body.strip()

    def _create_standardization_prompt(self, courses: List[Dict]) -> str:
        """Create prompt for LLM course standardization.

        The batch is embedded as pretty-printed JSON, followed by the exact
        JSON structure the model is asked to return.
        """
        courses_json = json.dumps(courses, indent=2)
        # The template text starts at column 0 on purpose: indentation inside
        # the literal would become part of the prompt sent to the model.
        prompt = f"""
Please standardize and enhance the following course data. For each course:
1. **Clean and standardize fields:**
- duration_hours: Convert any duration text to hours (integer)
- level: Standardize to "beginner", "intermediate", "advanced", or "unknown"
- format: Standardize to "self-paced", "instructor-led", or "online"
- price: Convert to numeric value (0.0 for free)
2. **Enhance descriptions:**
- If description is empty, create a brief, professional description based on title
- Keep descriptions concise (2-3 sentences max)
3. **Extract learning objectives:**
- Add a "learning_objectives" field with 3-5 key skills/topics covered
- Base this on the title and any available description
4. **Validate data:**
- Ensure URLs are properly formatted
- Clean up any malformed text
- Set reasonable defaults for missing data
Input courses:
{courses_json}
Return the processed data as a JSON object with this structure:
{{
"courses": [
{{
"title": "cleaned title",
"provider": "provider name",
"url": "full URL",
"description": "enhanced description",
"duration_hours": 25,
"level": "beginner|intermediate|advanced|unknown",
"format": "self-paced|instructor-led|online",
"price": 0.0,
"rating": 4.5,
"enrollment_count": 10000,
"language": "en",
"certificate": true,
"instructor": "instructor name",
"learning_objectives": ["objective 1", "objective 2", "objective 3"],
"skills": [],
"source_platform": "original platform",
"scraped_at": "original timestamp"
}}
]
}}
Only return valid JSON. Do not include any explanatory text.
"""
        return prompt