File size: 10,866 Bytes
7644eac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
"""
Helper functions for the AI Learning Path Generator.
"""
import re
import json
import datetime
from typing import List, Dict, Any, Optional

def sanitize_input(text: str) -> str:
    """
    Strip markup from user-supplied text and cap its length.

    Args:
        text: Raw user input.

    Returns:
        The input with all ``<...>`` tag sequences removed, surrounding
        whitespace stripped, and truncated to at most 1000 characters.
    """
    without_tags = re.sub(r'<[^>]*>', '', text)
    # Strip first, then cap, so the 1000-char budget is spent on content
    # rather than leading/trailing whitespace.
    return without_tags.strip()[:1000]

def format_duration(minutes: int) -> str:
    """
    Format a duration in minutes to a human-readable string.

    Args:
        minutes: Number of minutes (expected non-negative).

    Returns:
        Formatted string (e.g., "2 hours 30 minutes"). A zero duration
        yields "0 minutes".
    """
    hours, mins = divmod(minutes, 60)
    # Pluralize with != 1 (not > 1) so 0 reads "0 minutes", not "0 minute".
    hour_part = f"{hours} hour{'s' if hours != 1 else ''}"
    min_part = f"{mins} minute{'s' if mins != 1 else ''}"
    if hours and mins:
        return f"{hour_part} {min_part}"
    if hours:
        return hour_part
    return min_part

def calculate_study_schedule(
    weeks: int, 
    hours_per_week: int, 
    topic_weights: Dict[str, float]
) -> Dict[str, Any]:
    """
    Calculate a recommended study schedule based on topic weights.

    Args:
        weeks: Total duration in weeks (must be positive).
        hours_per_week: Hours available per week (must be positive).
        topic_weights: Dictionary of topics with their importance weights;
            the weights must sum to a positive value.

    Returns:
        Dictionary with total hours, overall start/end dates, and per-topic
        hours, date ranges, and percentage of total time.

    Raises:
        ValueError: If weeks or hours_per_week is not positive, or if
            topic_weights is empty or its weights do not sum to a positive
            value (previously these crashed with ZeroDivisionError).
    """
    if weeks <= 0 or hours_per_week <= 0:
        raise ValueError("weeks and hours_per_week must be positive")
    total_weight = sum(topic_weights.values())
    if total_weight <= 0:
        raise ValueError(
            "topic_weights must be non-empty with a positive total weight"
        )

    total_hours = weeks * hours_per_week

    # Normalize each weight to a fraction of the total, then convert to
    # whole hours.
    hours_per_topic = {
        topic: round(weight / total_weight * total_hours)
        for topic, weight in topic_weights.items()
    }

    # Guarantee every topic gets at least one hour (rounding can yield 0).
    # NOTE: this can push the per-topic sum slightly above total_hours.
    for topic, hours in hours_per_topic.items():
        if hours < 1:
            hours_per_topic[topic] = 1

    start_date = datetime.datetime.now()
    schedule = {
        "total_hours": total_hours,
        "hours_per_week": hours_per_week,
        "start_date": start_date.strftime("%Y-%m-%d"),
        "end_date": (start_date + datetime.timedelta(weeks=weeks)).strftime("%Y-%m-%d"),
        "topics": {}
    }

    # Lay topics out back-to-back, spreading each topic's hours across the
    # number of days implied by the weekly study pace.
    current_date = start_date
    for topic, hours in hours_per_topic.items():
        span_days = hours / (hours_per_week / 7)
        topic_end = current_date + datetime.timedelta(days=span_days)

        schedule["topics"][topic] = {
            "hours": hours,
            "start_date": current_date.strftime("%Y-%m-%d"),
            "end_date": topic_end.strftime("%Y-%m-%d"),
            "percentage": round(hours / total_hours * 100, 1)
        }

        current_date = topic_end

    return schedule

def difficulty_to_score(difficulty: str) -> float:
    """
    Convert difficulty description to numeric score (0-1).

    Args:
        difficulty: String description of difficulty

    Returns:
        Numeric score between 0 and 1; unknown descriptions map to 0.5
        (intermediate).
    """
    label = difficulty.lower()
    # Ordered scale: first matching keyword wins, mirroring the original
    # beginner -> expert precedence.
    scale = (
        (("beginner", "easy"), 0.25),
        (("intermediate",), 0.5),
        (("advanced",), 0.75),
        (("expert",), 1.0),
    )
    for keywords, score in scale:
        if any(keyword in label for keyword in keywords):
            return score
    return 0.5  # Default to intermediate

def match_resources_to_learning_style(
    resources: List[Any], 
    learning_style: str,
    resource_type_weights: Optional[Dict[str, Dict[str, int]]] = None
) -> List[Any]:
    """
    Sort resources based on learning style preference.

    Args:
        resources: List of resources (either dictionaries or Pydantic models)
        learning_style: User's learning style
        resource_type_weights: Optional custom weights for resource types

    Returns:
        New list of the same resource objects, sorted by style score
        (higher first); originals are not modified.
    """
    weights = resource_type_weights
    if not weights:
        # Deferred import: the project config is only needed when no custom
        # weights are supplied, so callers with explicit weights don't pay
        # for (or depend on) the config module.
        from src.utils.config import RESOURCE_TYPES
        weights = RESOURCE_TYPES

    def _style_score(resource: Any) -> int:
        # Resources may be Pydantic models (attribute access) or plain dicts.
        # (The original also built a full dict copy here, but never used it.)
        if hasattr(resource, 'dict'):
            resource_type = resource.type if hasattr(resource, 'type') else 'article'
        else:
            resource_type = resource.get("type", "article")
        # Default score of 1 when no weight is configured for this
        # type/style combination.
        return weights.get(resource_type, {}).get(learning_style, 1)

    # Stable sort: equal-score resources keep their original relative order.
    return sorted(resources, key=_style_score, reverse=True)


# ============================================
# TOKEN OPTIMIZATION UTILITIES
# Cost-saving functions to reduce API expenses
# ============================================

def count_tokens(text: str, model: str = "gpt-4o-mini") -> int:
    """
    Count tokens in text for a specific model.
    This helps us avoid expensive API calls with huge prompts.

    Why this matters:
    - OpenAI charges per token (not per character)
    - Knowing token count helps us stay within budget
    - Prevents unexpected API costs

    Args:
        text: The text to count tokens for
        model: The model name to use for encoding

    Returns:
        Number of tokens (a rough chars/4 estimate if tiktoken is missing)
    """
    try:
        import tiktoken
    except ImportError:
        # tiktoken unavailable: approximate with ~4 characters per token,
        # a common rule of thumb for English text.
        return len(text) // 4

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: cl100k_base covers GPT-4, GPT-3.5-turbo and
        # text-embedding-ada-002.
        encoding = tiktoken.get_encoding("cl100k_base")

    return len(encoding.encode(text))


def truncate_text(text: str, max_tokens: int = 3000, model: str = "gpt-4o-mini") -> str:
    """
    Truncate text to fit within token limit while keeping the most important parts.
    
    Why: OpenAI charges per token. We want to send ONLY what's necessary.
    
    Strategy:
    - Keep first 70% (context and setup)
    - Keep last 30% (recent/relevant info)
    - This preserves both context and recency
    
    Args:
        text: Text to truncate
        max_tokens: Maximum tokens to allow
        model: Model to use for token counting
    
    Returns:
        Truncated text
    
    Example:
        >>> long_text = "..." * 10000
        >>> short_text = truncate_text(long_text, max_tokens=100)
        >>> count_tokens(short_text) <= 100
        True
    """
    try:
        import tiktoken
        try:
            encoding = tiktoken.encoding_for_model(model)
        except KeyError:
            encoding = tiktoken.get_encoding("cl100k_base")
        
        tokens = encoding.encode(text)
        
        if len(tokens) <= max_tokens:
            return text
        
        # Keep first 70% and last 30% to preserve context
        first_part = int(max_tokens * 0.7)
        last_part = int(max_tokens * 0.3)
        
        truncated_tokens = tokens[:first_part] + tokens[-last_part:]
        return encoding.decode(truncated_tokens)
    except ImportError:
        # Fallback: character-based truncation
        max_chars = max_tokens * 4
        if len(text) <= max_chars:
            return text
        first_part = int(max_chars * 0.7)
        last_part = int(max_chars * 0.3)
        return text[:first_part] + "\n...[truncated]...\n" + text[-last_part:]


def optimize_prompt(prompt: str, context: Optional[List[str]] = None, max_tokens: int = 4000) -> str:
    """
    Optimize prompt by truncating context intelligently.

    How it works:
    1. Count tokens in main prompt (always kept intact)
    2. Calculate remaining tokens for context
    3. Truncate context if needed
    4. Combine prompt + optimized context

    This ensures:
    - Main prompt is never truncated (it's critical)
    - Context is added only if space allows
    - Total stays within budget

    Args:
        prompt: Main prompt (always kept)
        context: Additional context (can be truncated)
        max_tokens: Total token budget

    Returns:
        Optimized prompt with context (or the bare prompt if no room/context)
    """
    # No context supplied: nothing to optimize.
    if not context:
        return prompt

    # Reserve a 100-token safety buffer on top of the prompt itself.
    available = max_tokens - count_tokens(prompt) - 100
    if available <= 0:
        # Budget exhausted by the main prompt: drop the context entirely
        # rather than truncating the prompt.
        return prompt

    trimmed_context = truncate_text("\n\n".join(context), available)
    return f"{prompt}\n\nContext:\n{trimmed_context}"


def estimate_api_cost(token_count: int, model: str = "gpt-4o-mini") -> float:
    """
    Estimate the cost of an API call based on token count.

    Pricing (as of 2024):
    - gpt-4o-mini: $0.15 per 1M input tokens, $0.60 per 1M output tokens
    - gpt-3.5-turbo: $0.50 per 1M input tokens, $1.50 per 1M output tokens
    - gpt-4: $30 per 1M input tokens, $60 per 1M output tokens

    Args:
        token_count: Number of tokens
        model: Model name

    Returns:
        Estimated cost in USD (input-token pricing only)
    """
    # Input-token price per 1M tokens, USD. Unknown models fall back to
    # gpt-4o-mini pricing.
    price_table = {
        "gpt-4o-mini": 0.15,
        "gpt-4o": 2.50,
        "gpt-4": 30.00,
        "gpt-3.5-turbo": 0.50,
        "text-embedding-3-small": 0.02,
        "text-embedding-3-large": 0.13,
        "text-embedding-ada-002": 0.10,
    }
    rate_per_million = price_table.get(model, 0.15)
    return (token_count / 1_000_000) * rate_per_million