Spaces:

rdisipio
/

coachable-course-agent

Runtime error

App Files Files Community

coachable-course-agent / scripts /llm_processor.py

rdisipio

data scrapers

9686304 9 months ago

raw

history blame contribute delete

4.48 kB

	"""
	LLM processor for standardizing and enhancing scraped course data
	Uses Groq/Llama for data processing
	"""

	import os
	import json
	from typing import List, Dict
	from groq import Groq


	class LLMProcessor:
	    """Standardize and enrich scraped course records with a Groq-hosted LLM.

	    Courses are sent to the chat-completions endpoint in small batches. A
	    batch whose call fails, or whose reply cannot be used, is passed through
	    unchanged, so the output always has one record per input record.
	    """

	    # NOTE(review): confirm this model ID is still served by Groq.
	    DEFAULT_MODEL = "llama-3.1-70b-versatile"
	    # Small batches keep the prompt and the reply within token limits.
	    DEFAULT_BATCH_SIZE = 5

	    def __init__(self, model: str = DEFAULT_MODEL, batch_size: int = DEFAULT_BATCH_SIZE):
	        """
	        Create the Groq client and store the processing settings.

	        Args:
	            model: Model identifier passed to the chat-completions call.
	            batch_size: Number of courses sent to the LLM per request.

	        Raises:
	            ValueError: If batch_size is smaller than 1.
	        """
	        if batch_size < 1:
	            raise ValueError("batch_size must be at least 1")
	        # The API key is read from the GROQ_API_KEY environment variable.
	        self.client = Groq(api_key=os.getenv('GROQ_API_KEY'))
	        self.model = model
	        self.batch_size = batch_size

	    def standardize_courses(self, courses: List[Dict]) -> List[Dict]:
	        """
	        Process a list of courses with the LLM for standardization

	        Args:
	            courses: List of raw course dictionaries

	        Returns:
	            List of processed/standardized course dictionaries, in input
	            order. Batches that could not be processed are returned as-is.
	        """
	        processed_courses: List[Dict] = []

	        # Process in batches to avoid token limits
	        for start in range(0, len(courses), self.batch_size):
	            batch = courses[start:start + self.batch_size]
	            try:
	                processed_batch = self._process_batch(batch)
	            except Exception as e:
	                # Best-effort boundary: any API or processing error falls
	                # back to the original data for this batch only.
	                batch_number = start // self.batch_size + 1
	                print(f" Warning: LLM processing failed for batch {batch_number}: {e}")
	                processed_batch = batch
	            processed_courses.extend(processed_batch)

	        return processed_courses

	    def _process_batch(self, courses: List[Dict]) -> List[Dict]:
	        """
	        Process a small batch of courses with the LLM

	        Args:
	            courses: The raw course dictionaries of one batch

	        Returns:
	            The list found under the "courses" key of the LLM reply, or the
	            unmodified input when the reply is not valid JSON or does not
	            contain exactly one dictionary per input course.
	        """
	        # Prepare prompt for LLM
	        prompt = self._create_standardization_prompt(courses)

	        # Call LLM
	        response = self.client.chat.completions.create(
	            model=self.model,
	            messages=[
	                {
	                    "role": "system",
	                    "content": "You are a data processing expert specializing in online course metadata standardization."
	                },
	                {
	                    "role": "user",
	                    "content": prompt
	                }
	            ],
	            temperature=0.1,  # Low temperature for consistent output
	            max_tokens=4000
	        )

	        # Parse LLM response; an empty/None content is treated as unparsable
	        content = response.choices[0].message.content or ""
	        try:
	            result = json.loads(self._extract_json_object(content))
	        except json.JSONDecodeError:
	            print(" Warning: Failed to parse LLM response as JSON")
	            return courses

	        processed = result.get('courses') if isinstance(result, dict) else None
	        usable = (
	            isinstance(processed, list)
	            and len(processed) == len(courses)
	            and all(isinstance(course, dict) for course in processed)
	        )
	        if not usable:
	            # A reply with missing, extra or malformed entries would silently
	            # drop or corrupt records, so keep the original batch instead.
	            print(" Warning: LLM response did not contain the expected course list")
	            return courses
	        return processed

	    @staticmethod
	    def _extract_json_object(text: str) -> str:
	        """
	        Return the outermost {...} span of text, or text itself if none exists

	        This tolerates replies wrapped in a Markdown code fence or surrounded
	        by explanatory text, which would otherwise fail JSON parsing.
	        """
	        start = text.find("{")
	        end = text.rfind("}")
	        if start == -1 or end < start:
	            return text
	        return text[start:end + 1]

	    def _create_standardization_prompt(self, courses: List[Dict]) -> str:
	        """
	        Create prompt for LLM course standardization

	        Args:
	            courses: The raw course dictionaries to embed in the prompt

	        Returns:
	            The instruction text with the courses serialized as indented JSON
	        """
	        courses_json = json.dumps(courses, indent=2)

	        # Literal braces of the example JSON are doubled because this is an
	        # f-string; only {courses_json} is substituted.
	        return f"""
	Please standardize and enhance the following course data. For each course:

	1. Clean and standardize fields:
	- duration_hours: Convert any duration text to hours (integer)
	- level: Standardize to "beginner", "intermediate", "advanced", or "unknown"
	- format: Standardize to "self-paced", "instructor-led", or "online"
	- price: Convert to numeric value (0.0 for free)

	2. Enhance descriptions:
	- If description is empty, create a brief, professional description based on title
	- Keep descriptions concise (2-3 sentences max)

	3. Extract learning objectives:
	- Add a "learning_objectives" field with 3-5 key skills/topics covered
	- Base this on the title and any available description

	4. Validate data:
	- Ensure URLs are properly formatted
	- Clean up any malformed text
	- Set reasonable defaults for missing data

	Input courses:
	{courses_json}

	Return the processed data as a JSON object with this structure:
	{{
	"courses": [
	{{
	"title": "cleaned title",
	"provider": "provider name",
	"url": "full URL",
	"description": "enhanced description",
	"duration_hours": 25,
	"level": "beginner|intermediate|advanced|unknown",
	"format": "self-paced|instructor-led|online",
	"price": 0.0,
	"rating": 4.5,
	"enrollment_count": 10000,
	"language": "en",
	"certificate": true,
	"instructor": "instructor name",
	"learning_objectives": ["objective 1", "objective 2", "objective 3"],
	"skills": [],
	"source_platform": "original platform",
	"scraped_at": "original timestamp"
	}}
	]
	}}

	Only return valid JSON. Do not include any explanatory text.
	"""