Spaces:
Sleeping
Sleeping
| """ | |
| Helper functions for the AI Learning Path Generator. | |
| """ | |
| import re | |
| import json | |
| import datetime | |
| from typing import List, Dict, Any, Optional | |
def sanitize_input(text: str) -> str:
    """
    Sanitize user input to prevent any security issues.

    Strips anything that looks like an HTML/script tag, trims surrounding
    whitespace, and caps the result at 1000 characters.

    Args:
        text: The input text to sanitize

    Returns:
        Sanitized text string
    """
    TAG_PATTERN = r'<[^>]*>'
    MAX_LENGTH = 1000
    # Drop tag-like spans first, then normalize whitespace and length
    without_tags = re.sub(TAG_PATTERN, '', text)
    return without_tags.strip()[:MAX_LENGTH]
def format_duration(minutes: int) -> str:
    """
    Format a duration in minutes to a human-readable string.

    Args:
        minutes: Number of minutes (expected non-negative)

    Returns:
        Formatted string (e.g., "2 hours 30 minutes")
    """
    hours, mins = divmod(minutes, 60)
    # Pluralize with "!= 1" so 0 renders as "0 minutes", not "0 minute"
    # (the previous "> 1" check got the zero case wrong).
    hour_part = f"{hours} hour{'' if hours == 1 else 's'}"
    min_part = f"{mins} minute{'' if mins == 1 else 's'}"
    if hours and mins:
        return f"{hour_part} {min_part}"
    if hours:
        return hour_part
    return min_part
def calculate_study_schedule(
    weeks: int,
    hours_per_week: int,
    topic_weights: Dict[str, float]
) -> Dict[str, Any]:
    """
    Calculate a recommended study schedule based on topic weights.

    Args:
        weeks: Total duration in weeks
        hours_per_week: Hours available per week
        topic_weights: Dictionary of topics with their importance weights

    Returns:
        Dictionary with schedule information: total hours, weekly hours,
        overall start/end dates, and per-topic hours/dates/percentages.
        Note: per-topic hours are floored at 1, so their sum can exceed
        ``total_hours`` when many low-weight topics are present.
    """
    total_hours = weeks * hours_per_week
    total_weight = sum(topic_weights.values())

    # Normalize weights to sum to 1. Guard: if every weight is zero (or
    # negative sums cancel out), fall back to equal weighting instead of
    # raising ZeroDivisionError.
    if total_weight > 0:
        normalized_weights = {
            topic: weight / total_weight for topic, weight in topic_weights.items()
        }
    elif topic_weights:
        equal_share = 1 / len(topic_weights)
        normalized_weights = {topic: equal_share for topic in topic_weights}
    else:
        normalized_weights = {}

    # Each topic gets its weighted share of the total, but never less
    # than one hour.
    min_hours = 1
    hours_per_topic = {
        topic: max(round(weight * total_hours), min_hours)
        for topic, weight in normalized_weights.items()
    }

    # Create schedule with start/end dates
    start_date = datetime.datetime.now()
    current_date = start_date
    schedule: Dict[str, Any] = {
        "total_hours": total_hours,
        "hours_per_week": hours_per_week,
        "start_date": start_date.strftime("%Y-%m-%d"),
        "end_date": (start_date + datetime.timedelta(weeks=weeks)).strftime("%Y-%m-%d"),
        "topics": {}
    }

    # Calendar days needed per study hour at the given weekly pace.
    # Guard divide-by-zero when hours_per_week is 0.
    days_per_hour = 7 / hours_per_week if hours_per_week else 0.0
    for topic, hours in hours_per_topic.items():
        topic_end = current_date + datetime.timedelta(days=hours * days_per_hour)
        schedule["topics"][topic] = {
            "hours": hours,
            "start_date": current_date.strftime("%Y-%m-%d"),
            "end_date": topic_end.strftime("%Y-%m-%d"),
            # Guard: total_hours may be 0 when weeks or hours_per_week is 0
            "percentage": round(hours / total_hours * 100, 1) if total_hours else 0.0,
        }
        current_date = topic_end
    return schedule
def difficulty_to_score(difficulty: str) -> float:
    """
    Convert difficulty description to numeric score (0-1).

    Args:
        difficulty: String description of difficulty

    Returns:
        Numeric score between 0 and 1 (unrecognized labels map to 0.5)
    """
    label = difficulty.lower()
    # Ordered keyword table: the first group that matches wins, mirroring
    # a beginner -> expert cascade.
    keyword_scores = (
        (("beginner", "easy"), 0.25),
        (("intermediate",), 0.5),
        (("advanced",), 0.75),
        (("expert",), 1.0),
    )
    for keywords, score in keyword_scores:
        if any(keyword in label for keyword in keywords):
            return score
    return 0.5  # Default to intermediate
def match_resources_to_learning_style(
    resources: List[Any],
    learning_style: str,
    resource_type_weights: Optional[Dict[str, Dict[str, int]]] = None
) -> List[Any]:
    """
    Sort resources based on learning style preference.

    Args:
        resources: List of resources (either dictionaries or Pydantic models)
        learning_style: User's learning style
        resource_type_weights: Optional custom weights for resource types,
            mapping resource type -> {learning_style: weight}

    Returns:
        New list with the same resources, highest style weight first.
        sorted() is stable, so equal scores keep their original order.
    """
    if resource_type_weights is None:
        # Import lazily: callers supplying explicit weights no longer need
        # the project config to be importable.
        from src.utils.config import RESOURCE_TYPES
        resource_type_weights = RESOURCE_TYPES
    weights = resource_type_weights

    def style_score(resource: Any) -> int:
        # Pydantic models expose attributes; plain dicts use .get().
        # (The old code also built an unused resource_dict via the
        # deprecated pydantic .dict() call — removed.)
        if hasattr(resource, 'type'):
            resource_type = resource.type
        elif isinstance(resource, dict):
            resource_type = resource.get("type", "article")
        else:
            resource_type = 'article'
        # Unknown type or style defaults to a neutral score of 1
        return weights.get(resource_type, {}).get(learning_style, 1)

    # Sort by style score (higher is better)
    return sorted(resources, key=style_score, reverse=True)
| # ============================================ | |
| # TOKEN OPTIMIZATION UTILITIES | |
| # Cost-saving functions to reduce API expenses | |
| # ============================================ | |
def count_tokens(text: str, model: str = "gpt-4o-mini") -> int:
    """
    Count tokens in text for a specific model.

    This helps us avoid expensive API calls with huge prompts: OpenAI
    charges per token (not per character), so knowing the count up front
    keeps spend predictable.

    Args:
        text: The text to count tokens for
        model: The model name to use for encoding

    Returns:
        Number of tokens — exact when tiktoken is installed, otherwise a
        rough ~4-characters-per-token estimate.
    """
    try:
        import tiktoken
    except ImportError:
        # No tokenizer available: approximate 1 token per 4 characters
        return len(text) // 4

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: fall back to cl100k_base (GPT-4 /
        # GPT-3.5-turbo / text-embedding-ada-002 encoding)
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))
| def truncate_text(text: str, max_tokens: int = 3000, model: str = "gpt-4o-mini") -> str: | |
| """ | |
| Truncate text to fit within token limit while keeping the most important parts. | |
| Why: OpenAI charges per token. We want to send ONLY what's necessary. | |
| Strategy: | |
| - Keep first 70% (context and setup) | |
| - Keep last 30% (recent/relevant info) | |
| - This preserves both context and recency | |
| Args: | |
| text: Text to truncate | |
| max_tokens: Maximum tokens to allow | |
| model: Model to use for token counting | |
| Returns: | |
| Truncated text | |
| Example: | |
| >>> long_text = "..." * 10000 | |
| >>> short_text = truncate_text(long_text, max_tokens=100) | |
| >>> count_tokens(short_text) <= 100 | |
| True | |
| """ | |
| try: | |
| import tiktoken | |
| try: | |
| encoding = tiktoken.encoding_for_model(model) | |
| except KeyError: | |
| encoding = tiktoken.get_encoding("cl100k_base") | |
| tokens = encoding.encode(text) | |
| if len(tokens) <= max_tokens: | |
| return text | |
| # Keep first 70% and last 30% to preserve context | |
| first_part = int(max_tokens * 0.7) | |
| last_part = int(max_tokens * 0.3) | |
| truncated_tokens = tokens[:first_part] + tokens[-last_part:] | |
| return encoding.decode(truncated_tokens) | |
| except ImportError: | |
| # Fallback: character-based truncation | |
| max_chars = max_tokens * 4 | |
| if len(text) <= max_chars: | |
| return text | |
| first_part = int(max_chars * 0.7) | |
| last_part = int(max_chars * 0.3) | |
| return text[:first_part] + "\n...[truncated]...\n" + text[-last_part:] | |
def optimize_prompt(prompt: str, context: Optional[List[str]] = None, max_tokens: int = 4000) -> str:
    """
    Optimize prompt by truncating context intelligently.

    How it works:
    1. Count tokens in the main prompt (always kept intact)
    2. Calculate remaining tokens for context (minus a safety buffer)
    3. Truncate the context to fit — or drop it entirely when there is
       no remaining budget
    4. Combine prompt + optimized context

    This ensures the main prompt is never truncated, context is added only
    if space allows, and the total stays within budget.

    Args:
        prompt: Main prompt (always kept)
        context: Additional context (can be truncated or dropped)
        max_tokens: Total token budget

    Returns:
        Optimized prompt, optionally followed by a "Context:" section
    """
    if not context:
        return prompt
    # Reserve a 100-token safety buffer on top of the prompt itself
    available_tokens = max_tokens - count_tokens(prompt) - 100
    if available_tokens <= 0:
        # Bug fix: the old code appended the full, untruncated context in
        # this case, blowing straight past the budget. With no budget
        # left, return the prompt alone.
        return prompt
    context_text = truncate_text("\n\n".join(context), available_tokens)
    return f"{prompt}\n\nContext:\n{context_text}"
def estimate_api_cost(token_count: int, model: str = "gpt-4o-mini") -> float:
    """
    Estimate the cost of an API call based on token count.

    Pricing (as of 2024):
    - gpt-4o-mini: $0.15 per 1M input tokens, $0.60 per 1M output tokens
    - gpt-3.5-turbo: $0.50 per 1M input tokens, $1.50 per 1M output tokens
    - gpt-4: $30 per 1M input tokens, $60 per 1M output tokens

    Args:
        token_count: Number of tokens
        model: Model name (unknown models use gpt-4o-mini pricing)

    Returns:
        Estimated cost in USD
    """
    # USD per 1M input tokens
    PRICE_PER_MILLION = {
        "gpt-4o-mini": 0.15,
        "gpt-4o": 2.50,
        "gpt-4": 30.00,
        "gpt-3.5-turbo": 0.50,
        "text-embedding-3-small": 0.02,
        "text-embedding-3-large": 0.13,
        "text-embedding-ada-002": 0.10,
    }
    rate = PRICE_PER_MILLION.get(model, 0.15)  # default: gpt-4o-mini
    # Same arithmetic order as before: millions of tokens times the rate
    return (token_count / 1_000_000) * rate