jdesiree commited on
Commit
648647d
·
verified ·
1 Parent(s): b79a4e1

Upload agent.py

Browse files
Files changed (1) hide show
  1. agent.py +818 -0
agent.py ADDED
@@ -0,0 +1,818 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # agents.py
2
+ """
3
+ Unified agent architecture for Mimir Educational AI Assistant.
4
+
5
+ LAZY-LOADING LLAMA-3.2-3B-INSTRUCT
6
+
7
+ Components:
8
+ - LazyLlamaModel: Singleton lazy-loading model (loads on first use, cached thereafter)
9
+ - ToolDecisionAgent: Uses lazy-loaded Llama for visualization decisions
10
+ - PromptRoutingAgents: Uses lazy-loaded Llama for all 4 routing agents
11
+ - ThinkingAgents: Uses lazy-loaded Llama for all reasoning (including math)
12
+ - ResponseAgent: Uses lazy-loaded Llama for final responses
13
+
14
+ Key optimization: Model loads on first generate() call and is cached for all
15
+ subsequent requests. Single model architecture with ~1GB memory footprint.
16
+ No compile or warmup scripts needed - fully automatic.
17
+ """
18
+
19
+ import os
20
+ import re
21
+ import torch
22
+ import logging
23
+ import time
24
+ import subprocess
25
+ import threading
26
+ from datetime import datetime
27
+ from typing import Dict, List, Optional, Tuple, Type
28
+ import warnings
29
+
30
+ # Setup main logger first
31
+ logging.basicConfig(level=logging.INFO)
32
+ logger = logging.getLogger(__name__)
33
+
34
+ # ============================================================================
35
+ # MEMORY PROFILING UTILITIES
36
+ # ============================================================================
37
+
38
def log_memory(tag=""):
    """Log current GPU memory usage (allocated/reserved/peak) tagged with *tag*."""
    try:
        if not torch.cuda.is_available():
            logger.info(f"[{tag}] No CUDA available")
            return
        mb = 1024**2
        allocated = torch.cuda.memory_allocated() / mb
        reserved = torch.cuda.memory_reserved() / mb
        max_allocated = torch.cuda.max_memory_allocated() / mb
        logger.info(f"[{tag}] GPU Memory - Allocated: {allocated:.2f} MB, Reserved: {reserved:.2f} MB, Peak: {max_allocated:.2f} MB")
    except Exception as e:
        # Never let diagnostics take the pipeline down.
        logger.warning(f"[{tag}] Error logging GPU memory: {e}")
50
+
51
+
52
def log_nvidia_smi(tag=""):
    """Log full nvidia-smi output for system-wide GPU view"""
    query = ['nvidia-smi', '--query-gpu=memory.used,memory.total', '--format=csv,noheader,nounits']
    try:
        output = subprocess.check_output(query, encoding='utf-8')
        logger.info(f"[{tag}] NVIDIA-SMI: {output.strip()}")
    except Exception as e:
        # nvidia-smi may be missing entirely (CPU hosts); log and move on.
        logger.warning(f"[{tag}] Error running nvidia-smi: {e}")
59
+
60
+
61
def log_step(step_name, start_time=None):
    """Log a pipeline step with timestamp and duration.

    Call once without *start_time* to mark the start; call again passing the
    returned timestamp to log completion with elapsed seconds.
    """
    now = time.time()
    timestamp = datetime.now().strftime("%H:%M:%S.%f")[:-3]

    if not start_time:
        logger.info(f"[{timestamp}] → {step_name} starting...")
    else:
        duration = now - start_time
        logger.info(f"[{timestamp}] ✓ {step_name} completed in {duration:.2f}s")

    return now
73
+
74
+
75
def profile_generation(model, tokenizer, inputs, **gen_kwargs):
    """Profile memory and wall-clock time for a model.generate() call.

    Args:
        model: Object exposing a HF-style ``generate`` method.
        tokenizer: Unused here; kept in the signature for caller compatibility.
        inputs: Dict of tensors unpacked into ``model.generate``.
        **gen_kwargs: Extra generation kwargs forwarded to ``generate``.

    Returns:
        Tuple ``(outputs, duration)`` — the generate result and elapsed seconds.
    """
    cuda = torch.cuda.is_available()
    # Fix: guard CUDA memory-stat calls so CPU-only environments don't crash.
    # (log_memory already guards internally; these calls previously did not.)
    if cuda:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

    log_memory("Before generate()")
    start_time = time.time()

    # Inference only — no grad tracking needed.
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    end_time = time.time()
    duration = end_time - start_time
    # Peak is meaningful only when CUDA exists; report 0.0 otherwise.
    peak_memory = (torch.cuda.max_memory_allocated() / 1024**2) if cuda else 0.0

    log_memory("After generate()")
    logger.info(f"Generation completed in {duration:.2f}s. Peak GPU: {peak_memory:.2f} MB")

    return outputs, duration
94
+
95
+
96
+ # ============================================================================
97
+ # IMPORTS
98
+ # ============================================================================
99
+
100
+ # Transformers for standard models
101
+ from transformers import (
102
+ AutoTokenizer,
103
+ AutoModelForCausalLM,
104
+ BitsAndBytesConfig,
105
+ )
106
+
107
+ # ZeroGPU support
108
# ZeroGPU support: use the real `spaces` package when available, otherwise
# install a no-op stand-in so @spaces.GPU decorations still work locally.
try:
    import spaces
    HF_SPACES_AVAILABLE = True
except ImportError:
    HF_SPACES_AVAILABLE = False

    class DummySpaces:
        """No-op replacement for HF `spaces` outside ZeroGPU environments."""

        @staticmethod
        def GPU(func=None, duration=90):
            # Generalization: support BOTH decorator forms, mirroring the
            # real spaces.GPU API —
            #   @spaces.GPU                    (bare: func is the callable)
            #   @spaces.GPU(duration=90)       (called: returns a decorator)
            def decorator(f):
                return f
            if callable(func):
                return decorator(func)
            return decorator

    spaces = DummySpaces()
120
+
121
+ # Accelerate
122
+ from accelerate import Accelerator
123
+ from accelerate.utils import set_seed
124
+
125
+ # LangChain Core for proper message handling
126
+ from langchain_core.runnables import Runnable
127
+ from langchain_core.runnables.utils import Input, Output
128
+ from langchain_core.messages import SystemMessage, HumanMessage
129
+
130
+ # Import ALL prompts from prompt library
131
+ from prompt_library import (
132
+ # System prompts
133
+ CORE_IDENTITY,
134
+ TOOL_DECISION,
135
+ agent_1_system,
136
+ agent_2_system,
137
+ agent_3_system,
138
+ agent_4_system,
139
+
140
+ # Thinking agent system prompts
141
+ MATH_THINKING,
142
+ QUESTION_ANSWER_DESIGN,
143
+ REASONING_THINKING,
144
+
145
+ # Response agent prompts (dynamically applied)
146
+ VAUGE_INPUT,
147
+ USER_UNDERSTANDING,
148
+ GENERAL_FORMATTING,
149
+ LATEX_FORMATTING,
150
+ GUIDING_TEACHING,
151
+ STRUCTURE_PRACTICE_QUESTIONS,
152
+ PRACTICE_QUESTION_FOLLOWUP,
153
+ TOOL_USE_ENHANCEMENT,
154
+ )
155
+
156
+ # ============================================================================
157
+ # MODEL MANAGER - LAZY LOADING
158
+ # ============================================================================
159
+ # Import the lazy-loading Llama-3.2-3B model manager
160
+ from model_manager import get_model as get_shared_llama, LazyLlamaModel as LlamaSharedAgent
161
+
162
+ # Backwards compatibility aliases
163
+ get_shared_mistral = get_shared_llama
164
+ MistralSharedAgent = LlamaSharedAgent
165
+
166
+ # ============================================================================
167
+ # CONFIGURATION
168
+ # ============================================================================
169
+
170
# Directory for compiled-model artifacts — presumably legacy from the earlier
# compile/warmup flow; the lazy loader may not use it. TODO confirm.
CACHE_DIR = "/tmp/compiled_models"
# Hugging Face auth token; falls back across both common env var names.
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Suppress noisy library warnings (transformers/torch emit many of both kinds).
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Model info (for logging/diagnostics only — loading happens in model_manager)
LLAMA_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
179
+
180
+
181
def check_model_cache() -> Dict[str, bool]:
    """Check model status (legacy function for compatibility).

    With lazy loading there is nothing to verify up front, so every flag is
    hard-coded True; callers keep working unchanged.
    """
    logger.info("✓ Llama-3.2-3B uses lazy loading (loads on first generate() call)")

    return {
        "llama": True,  # Lazy-loaded on first use
        "all_compiled": True,
    }
191
+
192
+
193
# Call at module load: record the (always-true) cache flags and log the
# baseline GPU memory footprint before any model weights are loaded.
_cache_status = check_model_cache()
log_memory("Module load complete")
196
+
197
+
198
+ # ============================================================================
199
+ # TOOL DECISION AGENT
200
+ # ============================================================================
201
+
202
class ToolDecisionAgent:
    """
    Analyzes if visualization/graphing tools should be used.

    Uses lazy-loaded Llama-3.2-3B for decision-making.
    Model loads automatically on first use.

    Returns: Boolean (True = use tools, False = skip tools)
    """

    def __init__(self):
        """Initialize with lazy-loaded Llama model"""
        # Singleton accessor: weights are NOT loaded here, only on the
        # first generate() call.
        self.model = get_shared_llama()
        logger.info("ToolDecisionAgent initialized (using lazy-loaded Llama)")

    def decide(self, user_query: str, conversation_history: List[Dict]) -> bool:
        """
        Decide if graphing tools should be used.

        Args:
            user_query: Current user message
            conversation_history: Full conversation context
                (list of {'role': ..., 'content': ...} dicts — assumed; confirm against caller)

        Returns:
            bool: True if tools should be used; False on any model error
        """
        logger.info("→ ToolDecisionAgent: Analyzing query for tool usage")

        # Format conversation context
        context = "\n".join([
            f"{msg['role']}: {msg['content']}"
            for msg in conversation_history[-3:]  # Last 3 turns
        ])

        # Decision prompt
        analysis_prompt = f"""Previous conversation:
{context}

Current query: {user_query}

Should visualization tools (graphs, charts) be used?"""

        try:
            decision_start = time.time()

            # Use shared Llama for decision; max_tokens=10 because only a
            # short YES/NO-style verdict is expected, temperature=0.1 keeps
            # the classification near-deterministic.
            response = self.model.generate(
                system_prompt=TOOL_DECISION,
                user_message=analysis_prompt,
                max_tokens=10,
                temperature=0.1
            )

            decision_time = time.time() - decision_start

            # Parse decision.
            # NOTE(review): substring match — any response containing "YES"
            # counts as affirmative; acceptable given the constrained prompt,
            # but worth confirming against real outputs.
            decision = "YES" in response.upper()

            logger.info(f"✓ ToolDecision: {'USE TOOLS' if decision else 'NO TOOLS'} ({decision_time:.2f}s)")

            return decision

        except Exception as e:
            logger.error(f"ToolDecisionAgent error: {e}")
            return False  # Default: no tools
267
+
268
+
269
+ # ============================================================================
270
+ # PROMPT ROUTING AGENTS (4 Specialized Agents)
271
+ # ============================================================================
272
+
273
class PromptRoutingAgents:
    """
    Four specialized agents for prompt segment selection.
    All share the same Llama-3.2-3B instance for efficiency.

    Agents:
    1. Practice Question Detector
    2. Discovery Mode Classifier
    3. Follow-up Assessment
    4. Teaching Mode Assessor

    Each agent makes one short, low-temperature model call and parses the
    verdict by keyword substring; every agent fails closed (returns False
    flags) if the model call raises.
    """

    def __init__(self):
        """Initialize with shared Llama model"""
        # Singleton accessor; weights load lazily on first generate() call.
        self.model = get_shared_llama()
        logger.info("PromptRoutingAgents initialized (4 agents, shared Llama)")

    def agent_1_practice_question(
        self,
        user_query: str,
        conversation_history: List[Dict]
    ) -> bool:
        """Agent 1: Detect if practice questions should be generated.

        Returns True when practice questions should be created; False on error.
        """
        logger.info("→ Agent 1: Analyzing for practice question opportunity")

        # Last 4 turns of context — slightly more than the other agents use.
        context = "\n".join([
            f"{msg['role']}: {msg['content']}"
            for msg in conversation_history[-4:]
        ])

        analysis_prompt = f"""Conversation:
{context}

New query: {user_query}

Should I create practice questions?"""

        try:
            # Short low-temperature call: only a YES/NO verdict is expected.
            response = self.model.generate(
                system_prompt=agent_1_system,
                user_message=analysis_prompt,
                max_tokens=10,
                temperature=0.1
            )

            decision = "YES" in response.upper()
            logger.info(f"✓ Agent 1: {'PRACTICE QUESTIONS' if decision else 'NO PRACTICE'}")

            return decision

        except Exception as e:
            # Fail closed: no practice questions if the model call errors.
            logger.error(f"Agent 1 error: {e}")
            return False

    def agent_2_discovery_mode(
        self,
        user_query: str,
        conversation_history: List[Dict]
    ) -> Tuple[bool, bool]:
        """Agent 2: Classify vague input and understanding level.

        Returns:
            (vague, low_understanding) flags; (False, False) on error.
        """
        logger.info("→ Agent 2: Classifying discovery mode")

        context = "\n".join([
            f"{msg['role']}: {msg['content']}"
            for msg in conversation_history[-3:]
        ])

        analysis_prompt = f"""Conversation:
{context}

Query: {user_query}

Classification:
1. Is input vague? (VAGUE/CLEAR)
2. Understanding level? (LOW/MEDIUM/HIGH)"""

        try:
            # 20 tokens: enough for the two-line classification rubric.
            response = self.model.generate(
                system_prompt=agent_2_system,
                user_message=analysis_prompt,
                max_tokens=20,
                temperature=0.1
            )

            vague = "VAGUE" in response.upper()
            # NOTE(review): substring check — "LOW" also matches inside words
            # like "FOLLOW"; confirm the rubric output is tightly constrained.
            low_understanding = "LOW" in response.upper()

            logger.info(f"✓ Agent 2: Vague={vague}, LowUnderstanding={low_understanding}")

            return vague, low_understanding

        except Exception as e:
            logger.error(f"Agent 2 error: {e}")
            return False, False

    def agent_3_followup_assessment(
        self,
        user_query: str,
        conversation_history: List[Dict]
    ) -> bool:
        """Agent 3: Detect if user is responding to practice questions.

        Cheap heuristics short-circuit the model call: requires at least one
        prior exchange and a practice-question marker in the last assistant
        message before asking the model at all.
        """
        logger.info("→ Agent 3: Checking for practice question follow-up")

        # Check last bot message for practice question indicators
        if len(conversation_history) < 2:
            return False

        # Walk backwards to the most recent assistant turn.
        last_bot_msg = None
        for msg in reversed(conversation_history):
            if msg['role'] == 'assistant':
                last_bot_msg = msg['content']
                break

        if not last_bot_msg:
            return False

        # Look for practice question markers
        has_practice = any(marker in last_bot_msg.lower() for marker in [
            "practice", "try this", "solve", "calculate", "what is", "question"
        ])

        if not has_practice:
            return False

        # Analyze if current query is an answer attempt.
        # Only the first 500 chars of the bot message are included to keep
        # the prompt small.
        analysis_prompt = f"""Previous message (from me):
{last_bot_msg[:500]}

User response:
{user_query}

Is user answering a practice question?"""

        try:
            response = self.model.generate(
                system_prompt=agent_3_system,
                user_message=analysis_prompt,
                max_tokens=10,
                temperature=0.1
            )

            is_followup = "YES" in response.upper()
            logger.info(f"✓ Agent 3: {'GRADING MODE' if is_followup else 'NOT FOLLOWUP'}")

            return is_followup

        except Exception as e:
            logger.error(f"Agent 3 error: {e}")
            return False

    def agent_4_teaching_mode(
        self,
        user_query: str,
        conversation_history: List[Dict]
    ) -> Tuple[bool, bool]:
        """Agent 4: Assess teaching vs practice mode.

        Returns:
            (teaching, practice) flags; (False, False) on error.
        """
        logger.info("→ Agent 4: Assessing teaching mode")

        context = "\n".join([
            f"{msg['role']}: {msg['content']}"
            for msg in conversation_history[-3:]
        ])

        analysis_prompt = f"""Conversation:
{context}

Query: {user_query}

Assessment:
1. Need direct teaching? (TEACH/PRACTICE)
2. Create practice questions? (YES/NO)"""

        try:
            response = self.model.generate(
                system_prompt=agent_4_system,
                user_message=analysis_prompt,
                max_tokens=15,
                temperature=0.1
            )

            teaching = "TEACH" in response.upper()
            # NOTE(review): "PRACTICE" can also match an echo of the
            # "TEACH/PRACTICE" rubric line; verify against real model outputs.
            practice = "YES" in response.upper() or "PRACTICE" in response.upper()

            logger.info(f"✓ Agent 4: Teaching={teaching}, Practice={practice}")

            return teaching, practice

        except Exception as e:
            logger.error(f"Agent 4 error: {e}")
            return False, False
463
+
464
+
465
+ # ============================================================================
466
+ # THINKING AGENTS (Preprocessing Layer)
467
+ # ============================================================================
468
+
469
class ThinkingAgents:
    """
    Generates reasoning context before final response.
    Uses shared Llama-3.2-3B for all thinking (including math).

    Agents:
    1. Math Thinking (Tree-of-Thought)
    2. Q&A Design (Chain-of-Thought)
    3. General Reasoning (Chain-of-Thought)

    All three methods return a reasoning string (empty string on error);
    the output is injected into the final prompt but never shown to users.
    """

    def __init__(self):
        """Initialize with shared Llama model"""
        # Singleton accessor; weights load lazily on first generate() call.
        self.model = get_shared_llama()
        logger.info("ThinkingAgents initialized (using shared Llama for all thinking)")

    def math_thinking(
        self,
        user_query: str,
        conversation_history: List[Dict],
        tool_context: str = ""
    ) -> str:
        """
        Generate mathematical reasoning using Tree-of-Thought.
        Now uses Llama-3.2-3B instead of GGUF.

        Returns the reasoning text, or "" if generation fails.
        """
        logger.info("→ Math Thinking Agent: Generating reasoning")

        context = "\n".join([
            f"{msg['role']}: {msg['content']}"
            for msg in conversation_history[-3:]
        ])

        # The nested f-string inserts the tool output line only when present.
        thinking_prompt = f"""Conversation context:
{context}

Current query: {user_query}

{f"Tool output: {tool_context}" if tool_context else ""}

Generate mathematical reasoning:"""

        try:
            thinking_start = time.time()

            # Higher temperature (0.7) than the routing agents: reasoning
            # benefits from some diversity, unlike YES/NO classification.
            reasoning = self.model.generate(
                system_prompt=MATH_THINKING,
                user_message=thinking_prompt,
                max_tokens=300,
                temperature=0.7
            )

            thinking_time = time.time() - thinking_start
            logger.info(f"✓ Math Thinking: Generated {len(reasoning)} chars ({thinking_time:.2f}s)")

            return reasoning

        except Exception as e:
            # Empty string signals "no thinking context" downstream.
            logger.error(f"Math Thinking error: {e}")
            return ""

    def qa_design_thinking(
        self,
        user_query: str,
        conversation_history: List[Dict],
        tool_context: str = ""
    ) -> str:
        """Generate practice question design reasoning ("" on error)."""
        logger.info("→ Q&A Design Agent: Generating question strategy")

        context = "\n".join([
            f"{msg['role']}: {msg['content']}"
            for msg in conversation_history[-3:]
        ])

        thinking_prompt = f"""Context:
{context}

Query: {user_query}

{f"Tool data: {tool_context}" if tool_context else ""}

Design practice questions:"""

        try:
            reasoning = self.model.generate(
                system_prompt=QUESTION_ANSWER_DESIGN,
                user_message=thinking_prompt,
                max_tokens=250,
                temperature=0.7
            )

            logger.info(f"✓ Q&A Design: Generated {len(reasoning)} chars")

            return reasoning

        except Exception as e:
            logger.error(f"Q&A Design error: {e}")
            return ""

    def general_reasoning(
        self,
        user_query: str,
        conversation_history: List[Dict],
        tool_context: str = ""
    ) -> str:
        """Generate general reasoning context ("" on error)."""
        logger.info("→ General Reasoning Agent: Generating context")

        # Uses 4 turns of history, one more than the other thinking agents.
        context = "\n".join([
            f"{msg['role']}: {msg['content']}"
            for msg in conversation_history[-4:]
        ])

        thinking_prompt = f"""Conversation:
{context}

Query: {user_query}

{f"Context: {tool_context}" if tool_context else ""}

Analyze and provide reasoning:"""

        try:
            reasoning = self.model.generate(
                system_prompt=REASONING_THINKING,
                user_message=thinking_prompt,
                max_tokens=200,
                temperature=0.7
            )

            logger.info(f"✓ General Reasoning: Generated {len(reasoning)} chars")

            return reasoning

        except Exception as e:
            logger.error(f"General Reasoning error: {e}")
            return ""
607
+
608
+
609
+ # ============================================================================
610
+ # RESPONSE AGENT (Final Response Generation)
611
+ # ============================================================================
612
+
613
class ResponseAgent(Runnable):
    """
    Generates final educational responses using lazy-loaded Llama-3.2-3B.
    Model loads automatically on first use.

    Features:
    - Dynamic prompt assembly based on agent decisions
    - Streaming word-by-word output
    - Educational tone enforcement
    - LaTeX support for math
    - Context integration (thinking outputs, tool outputs)
    """

    def __init__(self):
        """Initialize with lazy-loaded Llama model"""
        super().__init__()
        # Singleton accessor; weights load on the first generate() call.
        self.model = get_shared_llama()
        logger.info("ResponseAgent initialized (using lazy-loaded Llama)")

    def invoke(self, input_data: Dict) -> Dict:
        """
        Generate final response with streaming.

        Args:
            input_data: {
                'user_query': str,
                'conversation_history': List[Dict],
                'active_prompts': List[str],
                'thinking_context': str,
                'tool_context': str,
            }

        Returns:
            {'response': str, 'metadata': Dict} — on failure the response is
            an apology string and metadata carries the error text.
        """
        logger.info("→ ResponseAgent: Generating final response")

        # Extract inputs (all keys optional; sensible empties as defaults)
        user_query = input_data.get('user_query', '')
        conversation_history = input_data.get('conversation_history', [])
        active_prompts = input_data.get('active_prompts', [])
        thinking_context = input_data.get('thinking_context', '')
        tool_context = input_data.get('tool_context', '')

        # Build system prompt from active segments
        system_prompt = self._build_system_prompt(active_prompts)

        # Build user message with context
        user_message = self._build_user_message(
            user_query,
            conversation_history,
            thinking_context,
            tool_context
        )

        try:
            response_start = time.time()

            # Generate response (streaming handled at app.py level)
            response = self.model.generate(
                system_prompt=system_prompt,
                user_message=user_message,
                max_tokens=600,
                temperature=0.7
            )

            response_time = time.time() - response_start

            # Clean up response artifacts / trailing fragments
            response = self._clean_response(response)

            logger.info(f"✓ ResponseAgent: Generated {len(response)} chars ({response_time:.2f}s)")

            return {
                'response': response,
                'metadata': {
                    'generation_time': response_time,
                    'model': LLAMA_MODEL_ID,
                    'active_prompts': active_prompts
                }
            }

        except Exception as e:
            logger.error(f"ResponseAgent error: {e}")
            return {
                'response': "I apologize, but I encountered an error generating a response. Please try again.",
                'metadata': {'error': str(e)}
            }

    def _build_system_prompt(self, active_prompts: List[str]) -> str:
        """Assemble system prompt from active segments.

        CORE_IDENTITY and GENERAL_FORMATTING are always included; other
        segments are appended in the order given, skipping duplicates and
        unknown names.
        """
        prompt_map = {
            'CORE_IDENTITY': CORE_IDENTITY,
            'GENERAL_FORMATTING': GENERAL_FORMATTING,
            'LATEX_FORMATTING': LATEX_FORMATTING,
            'VAUGE_INPUT': VAUGE_INPUT,
            'USER_UNDERSTANDING': USER_UNDERSTANDING,
            'GUIDING_TEACHING': GUIDING_TEACHING,
            'STRUCTURE_PRACTICE_QUESTIONS': STRUCTURE_PRACTICE_QUESTIONS,
            'PRACTICE_QUESTION_FOLLOWUP': PRACTICE_QUESTION_FOLLOWUP,
            'TOOL_USE_ENHANCEMENT': TOOL_USE_ENHANCEMENT,
        }

        # Always include core identity
        segments = [CORE_IDENTITY, GENERAL_FORMATTING]

        # Add active prompts (deduplicated against what's already present)
        for prompt_name in active_prompts:
            if prompt_name in prompt_map and prompt_map[prompt_name] not in segments:
                segments.append(prompt_map[prompt_name])

        return "\n\n".join(segments)

    def _build_user_message(
        self,
        user_query: str,
        conversation_history: List[Dict],
        thinking_context: str,
        tool_context: str
    ) -> str:
        """Build user message with all context, joined by blank lines."""
        parts = []

        # Conversation history (last 3 turns, each truncated to 200 chars)
        if conversation_history:
            history_text = "\n".join([
                f"{msg['role']}: {msg['content'][:200]}"
                for msg in conversation_history[-3:]
            ])
            parts.append(f"Recent conversation:\n{history_text}")

        # Thinking context (invisible to user, guides response)
        if thinking_context:
            parts.append(f"[Internal reasoning context]: {thinking_context}")

        # Tool context
        if tool_context:
            parts.append(f"[Tool output]: {tool_context}")

        # Current query
        parts.append(f"Student query: {user_query}")

        return "\n\n".join(parts)

    def _clean_response(self, response: str) -> str:
        """Strip generation artifacts and trim a trailing incomplete sentence."""
        # Remove common end-of-text artifacts
        artifacts = ['<|im_end|>', '<|endoftext|>', '###', '<|end|>']
        for artifact in artifacts:
            response = response.replace(artifact, '')

        # Remove trailing incomplete sentences
        if response and response[-1] not in '.!?':
            # Fix: cut at the LAST sentence boundary of ANY type. The old code
            # broke on the first delimiter type found ('. ' before '! '/'? '),
            # which could discard complete sentences ending with another mark.
            best_idx = -1
            best_delim = ''
            for delimiter in ['. ', '! ', '? ']:
                idx = response.rfind(delimiter)
                if idx > best_idx:
                    best_idx = idx
                    best_delim = delimiter
            if best_idx != -1:
                response = response[:best_idx] + best_delim[0]

        return response.strip()

    def stream(self, input_data: Dict):
        """
        Stream response word-by-word.

        Yields:
            str: Response chunks; a single apology string if streaming fails.
        """
        logger.info("→ ResponseAgent: Streaming response")

        # Build prompts from the same inputs invoke() uses
        system_prompt = self._build_system_prompt(input_data.get('active_prompts', []))
        user_message = self._build_user_message(
            input_data.get('user_query', ''),
            input_data.get('conversation_history', []),
            input_data.get('thinking_context', ''),
            input_data.get('tool_context', '')
        )

        try:
            # Use streaming generation from shared model
            for chunk in self.model.generate_streaming(
                system_prompt=system_prompt,
                user_message=user_message,
                max_tokens=600,
                temperature=0.7
            ):
                yield chunk

        except Exception as e:
            logger.error(f"Streaming error: {e}")
            yield "I apologize, but I encountered an error. Please try again."
805
+
806
+
807
+ # ============================================================================
808
+ # MODULE INITIALIZATION
809
+ # ============================================================================
810
+
811
+ logger.info("="*60)
812
+ logger.info("MIMIR AGENTS MODULE INITIALIZED")
813
+ logger.info("="*60)
814
+ logger.info(f" Model: Llama-3.2-3B-Instruct (lazy-loaded)")
815
+ logger.info(f" Agents: Tool, Routing (4x), Thinking (3x), Response")
816
+ logger.info(f" Memory: ~1GB (loads on first use)")
817
+ logger.info(f" Architecture: Single unified model with caching")
818
+ logger.info("="*60)