# Spaces:
# Runtime error
# Runtime error
"""
LLM processor for standardizing and enhancing scraped course data.
Uses Groq/Llama for data processing.
"""
import json
import os
from typing import Dict, List, Optional

from groq import Groq
class LLMProcessor:
    """Process scraped course data with an LLM for standardization.

    Courses are sent to a Groq-hosted chat model in small batches. Any batch
    whose request fails, or whose reply cannot be turned into exactly one
    course object per input course, is passed through unchanged, so the
    output of ``standardize_courses`` always has one entry per input course.
    """

    # Model used when the caller does not supply one.
    DEFAULT_MODEL = "llama-3.1-70b-versatile"  # Adjust model as needed

    def __init__(self, api_key: Optional[str] = None, model: Optional[str] = None):
        """Create the Groq client.

        Args:
            api_key: Groq API key. Defaults to the ``GROQ_API_KEY``
                environment variable.
            model: Chat model name. Defaults to ``DEFAULT_MODEL``.
        """
        self.client = Groq(api_key=api_key or os.getenv('GROQ_API_KEY'))
        self.model = model or self.DEFAULT_MODEL

    def standardize_courses(self, courses: List[Dict], batch_size: int = 5) -> List[Dict]:
        """
        Process a batch of courses with LLM for standardization

        Args:
            courses: List of raw course dictionaries
            batch_size: Number of courses sent per LLM request (kept small to
                avoid token limits)

        Returns:
            List of processed/standardized course dictionaries, in input
            order. Batches that could not be processed are returned as-is.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if not courses:
            return []

        processed_courses: List[Dict] = []
        batch_starts = range(0, len(courses), batch_size)
        for batch_number, start in enumerate(batch_starts, start=1):
            batch = courses[start:start + batch_size]
            try:
                processed_courses.extend(self._process_batch(batch))
            except Exception as e:
                # Deliberately broad: this is a best-effort enrichment step,
                # so any API/network failure falls back to the original data.
                print(f" Warning: LLM processing failed for batch {batch_number}: {e}")
                processed_courses.extend(batch)
        return processed_courses

    def _process_batch(self, courses: List[Dict]) -> List[Dict]:
        """Process a small batch of courses with LLM.

        Args:
            courses: Raw course dictionaries for a single request.

        Returns:
            The standardized courses, or ``courses`` itself when the reply
            cannot be used.

        Raises:
            Exception: Whatever the Groq client raises for a failed request,
                and ``IndexError`` if the response carries no choices. The
                caller (``standardize_courses``) handles these.
        """
        if not courses:
            return []  # Nothing to standardize; skip the API call.

        prompt = self._create_standardization_prompt(courses)
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a data processing expert specializing in online course metadata standardization."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=0.1,  # Low temperature for consistent output
            max_tokens=4000
        )
        return self._parse_response(response.choices[0].message.content, courses)

    @staticmethod
    def _parse_response(content: Optional[str], fallback: List[Dict]) -> List[Dict]:
        """Turn the raw LLM reply into a list of course dictionaries.

        Args:
            content: Text of the assistant message; may be ``None``.
            fallback: The original courses, returned whenever the reply is
                unusable.

        Returns:
            The parsed course list when the reply is JSON holding exactly one
            dict per input course (either ``{"courses": [...]}`` or a bare
            list); otherwise ``fallback``.
        """
        text = (content or "").strip()

        # Try the reply as-is first, then the outermost {...} span, which
        # recovers JSON wrapped in Markdown code fences or explanatory text.
        candidates = [text]
        first_brace, last_brace = text.find("{"), text.rfind("}")
        if 0 <= first_brace < last_brace:
            candidates.append(text[first_brace:last_brace + 1])

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            break
        else:
            print(" Warning: Failed to parse LLM response as JSON")
            return fallback

        result = parsed.get('courses') if isinstance(parsed, dict) else parsed

        # A wrong type or a different item count would corrupt or drop
        # records downstream, so only accept a one-to-one list of dicts.
        usable = (
            isinstance(result, list)
            and len(result) == len(fallback)
            and all(isinstance(item, dict) for item in result)
        )
        if not usable:
            print(" Warning: LLM response did not contain one course object per input course")
            return fallback
        return result

    def _create_standardization_prompt(self, courses: List[Dict]) -> str:
        """Create prompt for LLM course standardization.

        Args:
            courses: Course dictionaries to embed in the prompt as JSON.

        Returns:
            The full instruction text, including the serialized courses and
            the expected output structure.

        Raises:
            TypeError: If a course contains a value ``json.dumps`` cannot
                serialize.
        """
        courses_json = json.dumps(courses, indent=2)
        # Doubled braces below are f-string escapes for literal { and }.
        prompt = f"""
Please standardize and enhance the following course data. For each course:
1. **Clean and standardize fields:**
- duration_hours: Convert any duration text to hours (integer)
- level: Standardize to "beginner", "intermediate", "advanced", or "unknown"
- format: Standardize to "self-paced", "instructor-led", or "online"
- price: Convert to numeric value (0.0 for free)
2. **Enhance descriptions:**
- If description is empty, create a brief, professional description based on title
- Keep descriptions concise (2-3 sentences max)
3. **Extract learning objectives:**
- Add a "learning_objectives" field with 3-5 key skills/topics covered
- Base this on the title and any available description
4. **Validate data:**
- Ensure URLs are properly formatted
- Clean up any malformed text
- Set reasonable defaults for missing data
Input courses:
{courses_json}
Return the processed data as a JSON object with this structure:
{{
"courses": [
{{
"title": "cleaned title",
"provider": "provider name",
"url": "full URL",
"description": "enhanced description",
"duration_hours": 25,
"level": "beginner|intermediate|advanced|unknown",
"format": "self-paced|instructor-led|online",
"price": 0.0,
"rating": 4.5,
"enrollment_count": 10000,
"language": "en",
"certificate": true,
"instructor": "instructor name",
"learning_objectives": ["objective 1", "objective 2", "objective 3"],
"skills": [],
"source_platform": "original platform",
"scraped_at": "original timestamp"
}}
]
}}
Only return valid JSON. Do not include any explanatory text.
"""
        return prompt