Spaces:

rdisipio
/

coachable-course-agent

Runtime error

File size: 4,482 Bytes
"""
LLM processor for standardizing and enhancing scraped course data
Uses Groq/Llama for data processing
"""

import os
import json
from typing import List, Dict
from groq import Groq


class LLMProcessor:
    """Standardize and enrich scraped course records with a Groq-hosted LLM.

    Courses are sent to the chat-completions endpoint in small batches. Any
    batch whose request fails, or whose reply cannot be validated, is passed
    through unchanged, so the number and order of records returned always
    matches the input.
    """

    # NOTE(review): hosted model IDs get retired over time; confirm this one
    # is still served by Groq, or pass a different ``model`` to the constructor.
    DEFAULT_MODEL = "llama-3.1-70b-versatile"
    DEFAULT_BATCH_SIZE = 5

    def __init__(self, model: str = DEFAULT_MODEL, batch_size: int = DEFAULT_BATCH_SIZE):
        """
        Create the Groq client and store processing settings.

        Args:
            model: Model identifier sent with every chat-completion request.
            batch_size: Number of courses sent to the LLM per request.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.client = Groq(api_key=os.getenv('GROQ_API_KEY'))
        self.model = model
        self.batch_size = batch_size

    def standardize_courses(self, courses: List[Dict]) -> List[Dict]:
        """
        Process a list of courses with the LLM for standardization

        Args:
            courses: List of raw course dictionaries

        Returns:
            List of processed/standardized course dictionaries, one per input
            course and in the same order. Batches that could not be processed
            are returned as the original, unmodified dictionaries.
        """
        processed_courses: List[Dict] = []

        # Process in batches to avoid token limits
        batch_starts = range(0, len(courses), self.batch_size)
        for batch_number, start in enumerate(batch_starts, start=1):
            batch = courses[start:start + self.batch_size]
            try:
                processed_batch = self._process_batch(batch)
            except Exception as e:
                # Deliberate best-effort boundary: a failing request must not
                # abort the whole run, so keep the raw records for this batch.
                print(f"      Warning: LLM processing failed for batch {batch_number}: {e}")
                processed_batch = batch
            processed_courses.extend(processed_batch)

        return processed_courses

    def _process_batch(self, courses: List[Dict]) -> List[Dict]:
        """
        Send one batch of courses to the LLM and return the validated result

        Args:
            courses: The raw course dictionaries of a single batch

        Returns:
            The standardized courses, or ``courses`` itself when the reply is
            empty, is not valid JSON, or does not have the expected shape.
        """
        prompt = self._create_standardization_prompt(courses)

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a data processing expert specializing in online course metadata standardization."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=0.1,  # Low temperature for consistent output
            max_tokens=4000
        )

        return self._parse_response(response, courses)

    def _parse_response(self, response, courses: List[Dict]) -> List[Dict]:
        """
        Extract the processed course list from a chat-completion response

        Args:
            response: Object exposing ``choices[0].message.content``
            courses: The original batch, used as the fallback value and to
                check that the reply covers every input course

        Returns:
            The list stored under the reply's "courses" key when it is a list
            of dictionaries with the same length as ``courses``; otherwise
            ``courses`` unchanged.
        """
        choices = getattr(response, "choices", None)
        content = choices[0].message.content if choices else None
        if not content:
            print("      Warning: LLM returned an empty response")
            return courses

        # Models frequently wrap the JSON in a Markdown code fence or add a
        # sentence around it; keep only the outermost {...} span.
        text = content.strip()
        start, end = text.find("{"), text.rfind("}")
        if start != -1 and end > start:
            text = text[start:end + 1]

        try:
            result = json.loads(text)
        except json.JSONDecodeError:
            print("      Warning: Failed to parse LLM response as JSON")
            return courses

        processed = result.get('courses') if isinstance(result, dict) else None
        is_valid = (
            isinstance(processed, list)
            and len(processed) == len(courses)
            and all(isinstance(course, dict) for course in processed)
        )
        if not is_valid:
            # A missing, truncated or padded list would silently drop or
            # duplicate records downstream, so prefer the raw batch.
            print("      Warning: LLM response did not contain one course object per input course")
            return courses

        return processed

    def _create_standardization_prompt(self, courses: List[Dict]) -> str:
        """
        Create prompt for LLM course standardization

        Args:
            courses: The raw course dictionaries to embed in the prompt as JSON

        Returns:
            The full instruction text, including the input courses and the
            JSON structure the model is asked to return.
        """
        # ensure_ascii=False keeps non-ASCII titles/descriptions readable
        # instead of expanding them into \uXXXX escapes.
        courses_json = json.dumps(courses, indent=2, ensure_ascii=False)

        prompt = f"""
Please standardize and enhance the following course data. For each course:

1. **Clean and standardize fields:**
   - duration_hours: Convert any duration text to hours (integer)
   - level: Standardize to "beginner", "intermediate", "advanced", or "unknown"
   - format: Standardize to "self-paced", "instructor-led", or "online"
   - price: Convert to numeric value (0.0 for free)

2. **Enhance descriptions:**
   - If description is empty, create a brief, professional description based on title
   - Keep descriptions concise (2-3 sentences max)

3. **Extract learning objectives:**
   - Add a "learning_objectives" field with 3-5 key skills/topics covered
   - Base this on the title and any available description

4. **Validate data:**
   - Ensure URLs are properly formatted
   - Clean up any malformed text
   - Set reasonable defaults for missing data

Input courses:
{courses_json}

Return the processed data as a JSON object with this structure:
{{
  "courses": [
    {{
      "title": "cleaned title",
      "provider": "provider name", 
      "url": "full URL",
      "description": "enhanced description",
      "duration_hours": 25,
      "level": "beginner|intermediate|advanced|unknown",
      "format": "self-paced|instructor-led|online",
      "price": 0.0,
      "rating": 4.5,
      "enrollment_count": 10000,
      "language": "en",
      "certificate": true,
      "instructor": "instructor name",
      "learning_objectives": ["objective 1", "objective 2", "objective 3"],
      "skills": [],
      "source_platform": "original platform",
      "scraped_at": "original timestamp"
    }}
  ]
}}

Return exactly one output course per input course, in the same order.
Only return valid JSON. Do not include any explanatory text.
"""
        return prompt