Spaces:
Sleeping
Sleeping
File size: 10,866 Bytes
7644eac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 |
"""
Helper functions for the AI Learning Path Generator.
"""
import re
import json
import datetime
from typing import List, Dict, Any, Optional
def sanitize_input(text: str) -> str:
    """
    Sanitize user input to prevent any security issues.

    Strips anything that looks like an HTML/script tag, trims surrounding
    whitespace, and caps the result at 1000 characters.

    Args:
        text: The input text to sanitize

    Returns:
        Sanitized text string
    """
    # Drop HTML-like tags (tag content between angle brackets only).
    without_tags = re.sub(r'<[^>]*>', '', text)
    # Trim and enforce the length limit.
    return without_tags.strip()[:1000]
def format_duration(minutes: int) -> str:
    """
    Format a duration in minutes to a human-readable string.

    Args:
        minutes: Number of minutes

    Returns:
        Formatted string (e.g., "2 hours 30 minutes")
    """
    hours, mins = divmod(minutes, 60)

    def _unit(count: int, name: str) -> str:
        # Pluralize only for counts greater than one (matches existing output,
        # including "0 minute" for a zero duration).
        suffix = 's' if count > 1 else ''
        return f"{count} {name}{suffix}"

    parts = []
    if hours:
        parts.append(_unit(hours, "hour"))
    if mins or not hours:
        parts.append(_unit(mins, "minute"))
    return " ".join(parts)
def calculate_study_schedule(
    weeks: int,
    hours_per_week: int,
    topic_weights: Dict[str, float]
) -> Dict[str, Any]:
    """
    Calculate a recommended study schedule based on topic weights.

    Args:
        weeks: Total duration in weeks
        hours_per_week: Hours available per week
        topic_weights: Dictionary of topics with their importance weights

    Returns:
        Dictionary with schedule information: total hours, weekly hours,
        overall start/end dates, and per-topic hours, dates, and percentage.

    Raises:
        ValueError: If topic_weights is empty.
    """
    if not topic_weights:
        # Previously this crashed with ZeroDivisionError; fail explicitly.
        raise ValueError("topic_weights must contain at least one topic")

    total_hours = weeks * hours_per_week
    total_weight = sum(topic_weights.values())

    # Normalize weights to sum to 1.  If all weights are zero, fall back to
    # an even split instead of dividing by zero.
    if total_weight:
        normalized_weights = {
            topic: weight / total_weight for topic, weight in topic_weights.items()
        }
    else:
        even_share = 1 / len(topic_weights)
        normalized_weights = {topic: even_share for topic in topic_weights}

    # Allocate hours proportionally with a 1-hour floor per topic.
    # NOTE: the floor can push the allocated sum slightly above total_hours;
    # per-topic hours are a recommendation, not an exact partition.
    min_hours = 1
    hours_per_topic = {
        topic: max(min_hours, round(weight * total_hours))
        for topic, weight in normalized_weights.items()
    }

    # Create schedule with start/end dates (dates are relative to "now").
    start_date = datetime.datetime.now()
    current_date = start_date
    schedule = {
        "total_hours": total_hours,
        "hours_per_week": hours_per_week,
        "start_date": start_date.strftime("%Y-%m-%d"),
        "end_date": (start_date + datetime.timedelta(weeks=weeks)).strftime("%Y-%m-%d"),
        "topics": {}
    }

    # Convert study hours to calendar days; guard against a zero-hour week.
    days_per_hour = 7 / hours_per_week if hours_per_week else 0

    for topic, hours in hours_per_topic.items():
        topic_days = hours * days_per_hour  # distribute across available days
        topic_end = current_date + datetime.timedelta(days=topic_days)
        schedule["topics"][topic] = {
            "hours": hours,
            "start_date": current_date.strftime("%Y-%m-%d"),
            "end_date": topic_end.strftime("%Y-%m-%d"),
            # Share of total study time; 0 when there is no time budget.
            "percentage": round(hours / total_hours * 100, 1) if total_hours else 0.0,
        }
        current_date = topic_end

    return schedule
def difficulty_to_score(difficulty: str) -> float:
    """
    Convert difficulty description to numeric score (0-1).

    Args:
        difficulty: String description of difficulty

    Returns:
        Numeric score between 0 and 1
    """
    label = difficulty.lower()
    # Ordered lookup: the first group containing a matching keyword wins,
    # mirroring the original if/elif precedence.
    keyword_scores = (
        (("beginner", "easy"), 0.25),
        (("intermediate",), 0.5),
        (("advanced",), 0.75),
        (("expert",), 1.0),
    )
    for keywords, score in keyword_scores:
        if any(word in label for word in keywords):
            return score
    return 0.5  # Default to intermediate
def match_resources_to_learning_style(
    resources: List[Any],
    learning_style: str,
    resource_type_weights: Optional[Dict[str, Dict[str, int]]] = None
) -> List[Any]:
    """
    Sort resources based on learning style preference.

    Args:
        resources: List of resources (either dictionaries or Pydantic models)
        learning_style: User's learning style
        resource_type_weights: Optional custom weights for resource types

    Returns:
        Resources sorted by descending style score. The original objects are
        returned unmodified; ties keep their original relative order
        (stable sort).
    """
    if resource_type_weights is not None:
        weights = resource_type_weights
    else:
        # Import the project defaults only when no custom weights are given,
        # so callers supplying their own weights don't depend on src.utils.
        from src.utils.config import RESOURCE_TYPES
        weights = RESOURCE_TYPES

    def _style_score(resource: Any) -> int:
        """Score one resource for the requested learning style."""
        # Handle both Pydantic models (attribute access) and plain dicts.
        # (The previous version also built resource.dict() here but never
        # used it — removed.)
        if hasattr(resource, 'dict'):
            resource_type = resource.type if hasattr(resource, 'type') else 'article'
        else:
            resource_type = resource.get("type", "article")
        # Default score of 1 when the type/style combination is unknown.
        return weights.get(resource_type, {}).get(learning_style, 1)

    # Sort by style score, higher first; stable for equal scores.
    return sorted(resources, key=_style_score, reverse=True)
# ============================================
# TOKEN OPTIMIZATION UTILITIES
# Cost-saving functions to reduce API expenses
# ============================================
def count_tokens(text: str, model: str = "gpt-4o-mini") -> int:
    """
    Count tokens in text for a specific model.

    Knowing the token count before calling the API keeps prompts within
    budget: OpenAI bills per token, not per character, so this guards
    against unexpectedly expensive calls.

    Args:
        text: The text to count tokens for
        model: The model name to use for encoding

    Returns:
        Number of tokens

    Example:
        >>> count_tokens("Hello, world!")
        4
    """
    try:
        import tiktoken
    except ImportError:
        # tiktoken unavailable: rough estimate of ~4 characters per token
        # for English text.
        return len(text) // 4

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: fall back to cl100k_base (used by GPT-4,
        # GPT-3.5-turbo, text-embedding-ada-002).
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))
def truncate_text(text: str, max_tokens: int = 3000, model: str = "gpt-4o-mini") -> str:
    """
    Truncate text to fit within token limit while keeping the most important parts.

    OpenAI charges per token, so only what's necessary is sent. The cut keeps
    the first 70% of the budget (context and setup) and the last 30%
    (recent/relevant info), dropping the middle, so both context and recency
    are preserved.

    Args:
        text: Text to truncate
        max_tokens: Maximum tokens to allow
        model: Model to use for token counting

    Returns:
        Truncated text
    """
    head_share = 0.7
    tail_share = 0.3

    try:
        import tiktoken
    except ImportError:
        # Fallback: character-based truncation (~4 chars per token).
        max_chars = max_tokens * 4
        if len(text) <= max_chars:
            return text
        head = int(max_chars * head_share)
        tail = int(max_chars * tail_share)
        return text[:head] + "\n...[truncated]...\n" + text[-tail:]

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")

    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        return text
    head = int(max_tokens * head_share)
    tail = int(max_tokens * tail_share)
    return encoding.decode(tokens[:head] + tokens[-tail:])
def optimize_prompt(prompt: str, context: Optional[List[str]] = None, max_tokens: int = 4000) -> str:
    """
    Optimize prompt by truncating context intelligently.

    How it works:
    1. Count tokens in the main prompt (always kept intact)
    2. Calculate remaining tokens for context (minus a safety buffer)
    3. Truncate context if needed
    4. Combine prompt + optimized context

    This ensures the main prompt is never truncated, context is added only
    if space allows, and the total stays within budget.

    Args:
        prompt: Main prompt (always kept)
        context: Additional context (can be truncated)
        max_tokens: Total token budget

    Returns:
        Optimized prompt with context
    """
    if not context:
        return prompt

    # Budget left for context after the prompt and a 100-token safety buffer.
    available_tokens = max_tokens - count_tokens(prompt) - 100
    if available_tokens <= 0:
        # No room for context: send the bare prompt.
        return prompt

    combined = truncate_text("\n\n".join(context), available_tokens)
    return f"{prompt}\n\nContext:\n{combined}"
def estimate_api_cost(token_count: int, model: str = "gpt-4o-mini") -> float:
    """
    Estimate the cost of an API call based on token count.

    Pricing (as of 2024, input tokens):
    - gpt-4o-mini: $0.15 per 1M input tokens, $0.60 per 1M output tokens
    - gpt-3.5-turbo: $0.50 per 1M input tokens, $1.50 per 1M output tokens
    - gpt-4: $30 per 1M input tokens, $60 per 1M output tokens

    Args:
        token_count: Number of tokens
        model: Model name

    Returns:
        Estimated cost in USD

    Example:
        >>> cost = estimate_api_cost(1000, "gpt-4o-mini")
        >>> print(f"${cost:.4f}")
        $0.0002
    """
    # Input-token pricing, USD per one million tokens.
    input_price_per_million = {
        "gpt-4o-mini": 0.15,
        "gpt-4o": 2.50,
        "gpt-4": 30.00,
        "gpt-3.5-turbo": 0.50,
        "text-embedding-3-small": 0.02,
        "text-embedding-3-large": 0.13,
        "text-embedding-ada-002": 0.10,
    }
    # Unknown models fall back to gpt-4o-mini pricing.
    rate = input_price_per_million.get(model, 0.15)
    return (token_count / 1_000_000) * rate
|