# NOTE(review): the three lines below were GitHub UI residue accidentally
# pasted into the file (author, commit message, commit hash) and were a
# syntax error; preserved here as a comment.
# shubhamdhamal — "Deploy Flask app with Docker" — 7644eac
"""
Helper functions for the AI Learning Path Generator.
"""
import re
import json
import datetime
from typing import List, Dict, Any, Optional
def sanitize_input(text: str) -> str:
    """
    Sanitize user input to prevent any security issues.

    Args:
        text: The input text to sanitize

    Returns:
        Sanitized text string (tags stripped, whitespace trimmed,
        capped at 1000 characters)
    """
    # Strip anything that looks like an HTML or script tag.
    without_tags = re.sub(r'<[^>]*>', '', text)
    # Trim surrounding whitespace, then cap the length.
    trimmed = without_tags.strip()
    return trimmed[:1000]
def format_duration(minutes: int) -> str:
    """
    Format a duration in minutes to a human-readable string.

    Args:
        minutes: Number of minutes (assumed non-negative — TODO confirm
            callers never pass negatives; divmod gives odd results for them)

    Returns:
        Formatted string (e.g., "2 hours 30 minutes")
    """
    hours, mins = divmod(minutes, 60)
    # Pluralize with "!= 1" so 0 reads "0 minutes" (the old "> 1" check
    # produced the ungrammatical "0 minute").
    hour_label = "hour" if hours == 1 else "hours"
    min_label = "minute" if mins == 1 else "minutes"
    if hours and mins:
        return f"{hours} {hour_label} {mins} {min_label}"
    elif hours:
        return f"{hours} {hour_label}"
    else:
        return f"{mins} {min_label}"
def calculate_study_schedule(
    weeks: int,
    hours_per_week: int,
    topic_weights: Dict[str, float]
) -> Dict[str, Any]:
    """
    Calculate a recommended study schedule based on topic weights.

    Args:
        weeks: Total duration in weeks
        hours_per_week: Hours available per week (must be positive)
        topic_weights: Dictionary of topics with their importance weights

    Returns:
        Dictionary with schedule information: total hours, weekly hours,
        overall start/end dates, and a per-topic breakdown (hours, date
        range, percentage of the total).

    Raises:
        ValueError: If topic_weights is empty, weights sum to zero or less,
            or hours_per_week is not positive (these previously crashed
            with ZeroDivisionError).
    """
    if not topic_weights:
        raise ValueError("topic_weights must contain at least one topic")
    total_weight = sum(topic_weights.values())
    if total_weight <= 0:
        raise ValueError("topic weights must sum to a positive value")
    if hours_per_week <= 0:
        raise ValueError("hours_per_week must be positive")
    total_hours = weeks * hours_per_week
    # Normalize weights to sum to 1
    normalized_weights = {
        topic: weight / total_weight for topic, weight in topic_weights.items()
    }
    # Allocate hours proportionally to each topic's weight
    hours_per_topic = {
        topic: round(weight * total_hours) for topic, weight in normalized_weights.items()
    }
    # Ensure every topic gets at least the minimum number of hours...
    min_hours = 1
    for topic in hours_per_topic:
        if hours_per_topic[topic] < min_hours:
            hours_per_topic[topic] = min_hours
    # ...then trim any excess introduced by rounding/minimum bumps from the
    # largest allocations so the plan actually matches the available total
    # (the old code promised this adjustment but never performed it).
    excess = sum(hours_per_topic.values()) - total_hours
    while excess > 0:
        largest = max(hours_per_topic, key=hours_per_topic.get)
        if hours_per_topic[largest] <= min_hours:
            break  # everything is at the minimum; cannot trim further
        hours_per_topic[largest] -= 1
        excess -= 1
    # Create schedule with start/end dates; topics are scheduled back-to-back
    start_date = datetime.datetime.now()
    current_date = start_date
    schedule = {
        "total_hours": total_hours,
        "hours_per_week": hours_per_week,
        "start_date": start_date.strftime("%Y-%m-%d"),
        "end_date": (start_date + datetime.timedelta(weeks=weeks)).strftime("%Y-%m-%d"),
        "topics": {}
    }
    for topic, hours in hours_per_topic.items():
        # hours_per_week / 7 == study hours per day, so this is days needed
        topic_days = hours / (hours_per_week / 7)
        topic_end = current_date + datetime.timedelta(days=topic_days)
        schedule["topics"][topic] = {
            "hours": hours,
            "start_date": current_date.strftime("%Y-%m-%d"),
            "end_date": topic_end.strftime("%Y-%m-%d"),
            "percentage": round(hours / total_hours * 100, 1)
        }
        current_date = topic_end
    return schedule
def difficulty_to_score(difficulty: str) -> float:
    """
    Convert difficulty description to numeric score (0-1).

    Args:
        difficulty: String description of difficulty

    Returns:
        Numeric score between 0 and 1
    """
    # Ordered keyword -> score table; the first matching entry wins,
    # mirroring the precedence of an if/elif chain.
    keyword_scores = (
        (("beginner", "easy"), 0.25),
        (("intermediate",), 0.5),
        (("advanced",), 0.75),
        (("expert",), 1.0),
    )
    lowered = difficulty.lower()
    for keywords, score in keyword_scores:
        if any(word in lowered for word in keywords):
            return score
    return 0.5  # Unrecognized descriptions default to intermediate
def match_resources_to_learning_style(
    resources: List[Any],
    learning_style: str,
    resource_type_weights: Optional[Dict[str, Dict[str, int]]] = None
) -> List[Any]:
    """
    Sort resources based on learning style preference.

    Args:
        resources: List of resources (either dictionaries or Pydantic models)
        learning_style: User's learning style
        resource_type_weights: Optional custom weights for resource types;
            defaults to the project's RESOURCE_TYPES configuration.

    Returns:
        New list containing the same resource objects, best style match
        first. Ties keep their original relative order (stable sort).
    """
    if resource_type_weights is None:
        # Imported lazily so callers supplying custom weights don't need
        # the project config at all (the old code imported it every call).
        from src.utils.config import RESOURCE_TYPES
        resource_type_weights = RESOURCE_TYPES
    weights = resource_type_weights

    def style_score(resource: Any) -> int:
        # Handle both Pydantic models (attribute access) and plain dicts.
        # (The old code also built a dict copy of each resource that was
        # never used; that dead work is removed.)
        if hasattr(resource, 'dict'):
            resource_type = getattr(resource, 'type', 'article')
        else:
            resource_type = resource.get("type", "article")
        # Default score of 1 when the type/style has no configured weight.
        return weights.get(resource_type, {}).get(learning_style, 1)

    # Higher score first; sorted() is stable, so equal scores keep order.
    return sorted(resources, key=style_score, reverse=True)
# ============================================
# TOKEN OPTIMIZATION UTILITIES
# Cost-saving functions to reduce API expenses
# ============================================
def count_tokens(text: str, model: str = "gpt-4o-mini") -> int:
    """
    Count tokens in text for a specific model.
    This helps us avoid expensive API calls with huge prompts.

    Why this matters:
    - OpenAI charges per token (not per character)
    - Knowing token count helps us stay within budget
    - Prevents unexpected API costs

    Args:
        text: The text to count tokens for
        model: The model name to use for encoding

    Returns:
        Number of tokens (0 only for empty text)

    Example:
        >>> count_tokens("Hello, world!")
        4
    """
    try:
        import tiktoken
        try:
            encoding = tiktoken.encoding_for_model(model)
        except KeyError:
            # Fallback to cl100k_base (used by GPT-4, GPT-3.5-turbo, text-embedding-ada-002)
            encoding = tiktoken.get_encoding("cl100k_base")
        return len(encoding.encode(text))
    except ImportError:
        # Fallback: rough estimate if tiktoken not available
        # (~1 token per 4 characters for English text).
        if not text:
            return 0
        # Clamp to 1 so short non-empty strings don't report 0 tokens
        # (the old estimate returned 0 for anything under 4 characters).
        return max(1, len(text) // 4)
def truncate_text(text: str, max_tokens: int = 3000, model: str = "gpt-4o-mini") -> str:
    """
    Truncate text to fit within token limit while keeping the most important parts.

    Why: OpenAI charges per token. We want to send ONLY what's necessary.

    Strategy:
    - Keep first 70% (context and setup)
    - Keep last 30% (recent/relevant info)
    - This preserves both context and recency

    Args:
        text: Text to truncate
        max_tokens: Maximum tokens to allow
        model: Model to use for token counting

    Returns:
        Truncated text (returned unchanged when already within the limit)
    """
    try:
        import tiktoken
        try:
            encoding = tiktoken.encoding_for_model(model)
        except KeyError:
            encoding = tiktoken.get_encoding("cl100k_base")
        tokens = encoding.encode(text)
        if len(tokens) <= max_tokens:
            return text
        # Keep first 70% and last 30% to preserve context.
        first_part = int(max_tokens * 0.7)
        last_part = int(max_tokens * 0.3)
        # Guard the tail slice: tokens[-0:] would be the WHOLE list.
        tail = tokens[-last_part:] if last_part else []
        return encoding.decode(tokens[:first_part] + tail)
    except ImportError:
        # Fallback: character-based truncation (~4 chars per token).
        max_chars = max_tokens * 4
        if len(text) <= max_chars:
            return text
        # Reserve room for the marker so the result stays within budget
        # (the old code appended the marker on top of the limit).
        marker = "\n...[truncated]...\n"
        budget = max(0, max_chars - len(marker))
        first_part = int(budget * 0.7)
        last_part = budget - first_part
        tail = text[-last_part:] if last_part else ""
        return text[:first_part] + marker + tail
def optimize_prompt(prompt: str, context: Optional[List[str]] = None, max_tokens: int = 4000) -> str:
    """
    Optimize prompt by truncating context intelligently.

    How it works:
    1. Count tokens in main prompt (always kept intact)
    2. Calculate remaining tokens for context
    3. Truncate context if needed
    4. Combine prompt + optimized context

    This ensures:
    - Main prompt is never truncated (it's critical)
    - Context is added only if space allows
    - Total stays within budget

    Args:
        prompt: Main prompt (always kept)
        context: Additional context (can be truncated, or dropped entirely
            when the prompt alone already fills the budget)
        max_tokens: Total token budget

    Returns:
        Optimized prompt, with context appended only when it fits
    """
    if not context:
        return prompt
    prompt_tokens = count_tokens(prompt)
    # 100-token buffer covers the joining text and counting inaccuracy.
    available_tokens = max_tokens - prompt_tokens - 100
    if available_tokens <= 0:
        # No room left: keep the critical main prompt, drop the context
        # (never append untruncated context when the budget is exhausted).
        return prompt
    context_text = truncate_text("\n\n".join(context), available_tokens)
    return f"{prompt}\n\nContext:\n{context_text}"
def estimate_api_cost(token_count: int, model: str = "gpt-4o-mini") -> float:
    """
    Estimate the cost of an API call based on token count.

    Pricing (as of 2024):
    - gpt-4o-mini: $0.15 per 1M input tokens, $0.60 per 1M output tokens
    - gpt-3.5-turbo: $0.50 per 1M input tokens, $1.50 per 1M output tokens
    - gpt-4: $30 per 1M input tokens, $60 per 1M output tokens

    Args:
        token_count: Number of tokens
        model: Model name

    Returns:
        Estimated cost in USD

    Example:
        >>> cost = estimate_api_cost(1000, "gpt-4o-mini")
        >>> print(f"${cost:.4f}")
        $0.0002
    """
    # Input-token price per 1M tokens, keyed by model name; unknown
    # models fall back to gpt-4o-mini pricing.
    price_per_million = {
        "gpt-4o-mini": 0.15,
        "gpt-4o": 2.50,
        "gpt-4": 30.00,
        "gpt-3.5-turbo": 0.50,
        "text-embedding-3-small": 0.02,
        "text-embedding-3-large": 0.13,
        "text-embedding-ada-002": 0.10,
    }.get(model, 0.15)
    return token_count / 1_000_000 * price_per_million