File size: 12,726 Bytes
71b3314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433cefc
 
 
71b3314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
#!/usr/bin/env python
"""
Language Model Inference Evaluation Script

This script runs the LLM through the Energy & Memory RAM Optimization environment
and evaluates its performance including:
- Action quality and validity
- Reward progression
- Task completion
- Model decision-making efficiency
- Benchmark comparison across tasks
"""

import os
import sys
import json
from typing import Dict, List, Tuple
from datetime import datetime

# Fallback configuration for the inference script. ``setdefault`` leaves any
# value that is already present in the environment untouched.
for _env_key, _env_fallback in (
    ("API_BASE_URL", "https://router.huggingface.co/v1"),
    ("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct"),
    ("LOCAL_SERVER_URL", "http://localhost:8000"),
):
    os.environ.setdefault(_env_key, _env_fallback)
del _env_key, _env_fallback  # keep the module namespace clean

# Import after setting environment variables
from client import EnergyOptimizationEnv
from models import EnergyOptimizationAction, EnergyOptimizationObservation
from task_graders import get_grader, get_grader_metadata, TASK_GRADERS

# Startup banner: title, current timestamp and the task names registered in
# TASK_GRADERS, followed by one blank line.
print("\n".join([
    "=" * 80,
    "LLM INFERENCE EVALUATION SCRIPT",
    "=" * 80,
    f"Timestamp: {datetime.now().isoformat()}",
    f"Available tasks: {list(TASK_GRADERS.keys())}",
    "",
]))

# ============================================================================
# EVALUATION METRICS
# ============================================================================

class EvaluationMetrics:
    """Track and calculate evaluation metrics for one episode of a task.

    Per-step data is accumulated with ``add_step`` and
    ``mark_action_validity``; ``finalize`` derives the aggregate figures,
    after which ``print_summary`` and ``to_dict`` report them.
    """

    def __init__(self, task_name: str):
        """Create an empty metrics record for ``task_name``.

        Args:
            task_name: Task identifier; it is passed to
                ``get_grader_metadata`` to obtain the task metadata that
                ``print_summary`` and ``to_dict`` read.
        """
        self.task_name = task_name
        self.task_meta = get_grader_metadata(task_name)

        # Per-step history; every list receives one entry per add_step() call.
        self.steps: List[int] = []
        self.actions: List[str] = []
        self.rewards: List[float] = []
        self.ram_usage: List[float] = []
        self.energy_consumption: List[float] = []
        self.task_progress: List[float] = []

        # Aggregates. min_reward / max_reward stay at 0.0 until the first
        # step is recorded.
        self.total_steps = 0
        self.total_reward = 0.0
        self.avg_reward = 0.0
        self.max_reward = 0.0
        self.min_reward = 0.0
        self.grader_score = 0.0
        self.task_completed = False
        self.action_validity_rate = 0.0
        self.valid_actions = 0
        self.invalid_actions = 0

    def add_step(self, step: int, action: str, reward: float,
                 obs: "EnergyOptimizationObservation"):
        """Record one step of the episode.

        Args:
            step: Step number; also stored as ``total_steps``.
            action: Text description of the action that was taken.
            reward: Reward received for this step.
            obs: Observation whose ``ram_usage``, ``energy_consumption`` and
                ``task_progress`` are appended to the history.
        """
        self.steps.append(step)
        self.actions.append(action)
        self.rewards.append(reward)
        self.ram_usage.append(obs.ram_usage)
        self.energy_consumption.append(obs.energy_consumption)
        self.task_progress.append(obs.task_progress)

        self.total_steps = step
        self.total_reward += reward

        # Seed both extremes from the first reward instead of comparing
        # against the 0.0 placeholders: otherwise an all-negative episode
        # would report max 0.0, and a genuine 0.0 minimum would be replaced
        # by the next (larger) reward.
        if len(self.rewards) == 1:
            self.min_reward = reward
            self.max_reward = reward
        else:
            self.min_reward = min(self.min_reward, reward)
            self.max_reward = max(self.max_reward, reward)

    def mark_action_validity(self, valid: bool):
        """Count one action as valid (``True``) or invalid (``False``)."""
        if valid:
            self.valid_actions += 1
        else:
            self.invalid_actions += 1

    def finalize(self, final_obs: "EnergyOptimizationObservation", grader_score: float):
        """Compute the aggregate metrics once the episode is over.

        Args:
            final_obs: Last observation; ``current_task.completed`` is read
                when ``current_task`` is truthy.
            grader_score: Score produced by the task grader.
        """
        self.grader_score = grader_score
        self.task_completed = final_obs.current_task.completed if final_obs.current_task else False

        # Average over the rewards that were actually recorded, so the
        # result does not depend on how the caller numbers its steps.
        if self.rewards:
            self.avg_reward = self.total_reward / len(self.rewards)

        # Computed independently of the step count: actions may have been
        # judged even when no step was recorded.
        judged_actions = self.valid_actions + self.invalid_actions
        if judged_actions > 0:
            self.action_validity_rate = self.valid_actions / judged_actions

    def print_summary(self):
        """Print a detailed evaluation summary to stdout.

        Sections that depend on the per-step history are replaced by a
        short notice when no step was recorded, instead of raising
        ``IndexError``.
        """
        print("\n" + "=" * 80)
        print(f"EVALUATION SUMMARY - Task: {self.task_name.upper()}")
        print("=" * 80)
        print("\nTask Metadata:")
        print(f"  Difficulty: {self.task_meta['difficulty']}")
        print(f"  Description: {self.task_meta['description']}")
        print(f"  RAM Target: {self.task_meta['target_ram']}% | Energy Target: {self.task_meta['target_energy']} kWh")
        print(f"  Max Steps Allowed: {self.task_meta['max_steps']}")

        print("\nPerformance Metrics:")
        print(f"  ✓ Total Steps Taken: {self.total_steps}")
        print(f"  ✓ Total Reward Accumulated: {self.total_reward:.3f}")
        print(f"  ✓ Average Reward per Step: {self.avg_reward:.3f}")
        print(f"  ✓ Reward Range: [{self.min_reward:.3f}, {self.max_reward:.3f}]")

        print("\nAction Quality:")
        print(f"  ✓ Valid Actions: {self.valid_actions}")
        print(f"  ✓ Invalid Actions: {self.invalid_actions}")
        print(f"  ✓ Action Validity Rate: {self.action_validity_rate*100:.1f}%")

        print("\nResource Optimization:")
        if self.ram_usage and self.energy_consumption:
            # "Initial" is the first observation passed to add_step().
            print(f"  ✓ Initial RAM: {self.ram_usage[0]:.1f}% → Final RAM: {self.ram_usage[-1]:.1f}%")
            print(f"    RAM Reduction: {self.ram_usage[0] - self.ram_usage[-1]:.1f}%")
            print(f"  ✓ Initial Energy: {self.energy_consumption[0]:.1f} kWh → Final Energy: {self.energy_consumption[-1]:.1f} kWh")
            print(f"    Energy Reduction: {self.energy_consumption[0] - self.energy_consumption[-1]:.1f} kWh")
        else:
            print("  (no steps recorded)")

        print("\nTask Completion:")
        print(f"  ✓ Task Completed: {'YES ✓' if self.task_completed else 'NO ✗'}")
        if self.task_progress:
            print(f"  ✓ Final Task Progress: {self.task_progress[-1]*100:.1f}%")
        else:
            print("  ✓ Final Task Progress: n/a")

        print("\nGrader Evaluation:")
        print(f"  ✓ Grader Score: {self.grader_score:.3f} (Scale: 0.001-0.999)")
        print("  ✓ Score Quality: ", end="")
        if self.grader_score > 0.8:
            print("EXCELLENT ★★★★★")
        elif self.grader_score > 0.6:
            print("GOOD ★★★★")
        elif self.grader_score > 0.4:
            print("FAIR ★★★")
        elif self.grader_score > 0.2:
            print("POOR ★★")
        else:
            print("VERY POOR ★")

        print("\n" + "=" * 80)

    def to_dict(self) -> Dict:
        """Return the metrics as a plain dict suitable for JSON serialization.

        History-derived entries fall back to ``0`` when no step was recorded.
        """
        return {
            "task_name": self.task_name,
            "difficulty": self.task_meta['difficulty'],
            "total_steps": self.total_steps,
            "total_reward": round(self.total_reward, 3),
            "avg_reward": round(self.avg_reward, 3),
            "reward_range": [round(self.min_reward, 3), round(self.max_reward, 3)],
            "valid_actions": self.valid_actions,
            "invalid_actions": self.invalid_actions,
            "action_validity_rate": round(self.action_validity_rate, 3),
            "initial_ram": round(self.ram_usage[0], 1) if self.ram_usage else 0,
            "final_ram": round(self.ram_usage[-1], 1) if self.ram_usage else 0,
            "initial_energy": round(self.energy_consumption[0], 1) if self.energy_consumption else 0,
            "final_energy": round(self.energy_consumption[-1], 1) if self.energy_consumption else 0,
            "task_completed": self.task_completed,
            "final_task_progress": round(self.task_progress[-1], 3) if self.task_progress else 0,
            "grader_score": round(self.grader_score, 3)
        }


# ============================================================================
# DIRECT ENVIRONMENT TEST
# ============================================================================

async def run_random_actions_baseline():
    """Run a short random-action episode as a baseline for comparison.

    Creates an ``EnergyOptimizationEnv`` client for the URL in the
    ``LOCAL_SERVER_URL`` environment variable (falling back to
    ``http://localhost:8000``), resets it, and applies up to five randomly
    chosen actions. State and reward are printed after every step, followed
    by the total and average reward. The episode stops early when the
    environment reports ``done``.

    Exceptions raised while talking to the environment are reported on
    stdout and not propagated.
    """
    import random  # local import: not among the file's top-level imports

    print("\n" + "=" * 80)
    print("BASELINE TEST: Random Actions")
    print("=" * 80)

    max_steps = 5
    action_choices = ["reduce_ram", "optimize_energy", "balance_resources"]
    # Honour the same variable the module seeds a default for, rather than
    # a second hard-coded copy of the URL.
    env = EnergyOptimizationEnv(
        base_url=os.environ.get("LOCAL_SERVER_URL", "http://localhost:8000")
    )

    try:
        result = await env.reset()
        obs = result.observation

        print("Initial State:")
        print(f"  RAM: {obs.ram_usage:.1f}%")
        print(f"  Energy: {obs.energy_consumption:.1f} kWh")

        total_reward = 0.0
        steps_taken = 0
        for step in range(1, max_steps + 1):
            action_type = random.choice(action_choices)
            intensity = random.uniform(0.3, 0.9)

            action = EnergyOptimizationAction(action_type=action_type, intensity=intensity)
            result = await env.step(action)
            obs = result.observation
            reward = result.reward or 0.0  # a missing reward counts as 0.0
            total_reward += reward
            steps_taken = step

            print(f"\nStep {step}:")
            print(f"  Action: {action_type}, Intensity: {intensity:.2f}")
            print(f"  Reward: {reward:.3f}")
            print(f"  RAM: {obs.ram_usage:.1f}% | Energy: {obs.energy_consumption:.1f} kWh")

            # Do not keep stepping an episode the environment has ended.
            if result.done:
                break

        print(f"\nBaseline Total Reward: {total_reward:.3f}")
        # Average over the steps actually executed (at least one here).
        print(f"Baseline Avg Reward: {total_reward/steps_taken:.3f}")

    except Exception as e:
        print(f"Error running baseline: {e}")


# ============================================================================
# SIMPLE HEURISTIC AGENT TEST
# ============================================================================

async def run_heuristic_agent():
    """Run evaluation with a simple rule-based agent (not an LLM).

    Creates an ``EnergyOptimizationEnv`` client for the URL in the
    ``LOCAL_SERVER_URL`` environment variable (falling back to
    ``http://localhost:8000``) and plays up to ten steps: while the observed
    RAM usage is above 70.0 the agent issues ``reduce_ram`` at intensity
    0.8, otherwise ``optimize_energy`` at intensity 0.6. Each step is
    recorded in an ``EvaluationMetrics`` instance; afterwards the task's
    grader is applied to the last observation and the summary is printed.

    Exceptions raised while running the episode are printed together with a
    traceback and not propagated.
    """
    print("\n" + "=" * 80)
    print("HEURISTIC AGENT TEST: Rule-Based Decision Making")
    print("=" * 80)

    task_name = "basic_ram_reduction"
    max_steps = 10
    ram_target = 70.0  # RAM usage above this triggers RAM reduction
    # Honour the same variable the module seeds a default for, rather than
    # a second hard-coded copy of the URL.
    env = EnergyOptimizationEnv(
        base_url=os.environ.get("LOCAL_SERVER_URL", "http://localhost:8000")
    )
    metrics = EvaluationMetrics(task_name)

    try:
        result = await env.reset()
        obs = result.observation

        print(f"Task: {task_name}")
        print(f"Initial RAM: {obs.ram_usage:.1f}%, Energy: {obs.energy_consumption:.1f} kWh\n")

        for step in range(1, max_steps + 1):
            # Heuristic: if RAM > target, reduce RAM; otherwise optimize energy.
            if obs.ram_usage > ram_target:
                action_type, intensity = "reduce_ram", 0.8
            else:
                action_type, intensity = "optimize_energy", 0.6
            # Both rule outcomes are well-formed actions by construction.
            metrics.mark_action_validity(True)

            action = EnergyOptimizationAction(action_type=action_type, intensity=intensity)
            action_str = f"{action_type},{intensity:.1f}"

            result = await env.step(action)
            obs = result.observation
            reward = result.reward or 0.0  # a missing reward counts as 0.0

            metrics.add_step(step, action_str, reward, obs)

            print(f"Step {step}: {action_str:30} | Reward: {reward:+.3f} | RAM: {obs.ram_usage:5.1f}% | Energy: {obs.energy_consumption:5.1f} kWh")

            if result.done:
                break

        # Apply the task grader to the last observation.
        grader_func = get_grader(task_name)
        grader_score = grader_func(obs)
        metrics.finalize(obs, grader_score)

        metrics.print_summary()

        print("\nHeuristic Agent Performance:")
        print("  - Complexity: Simple rule-based")
        print("  - Decision Speed: Instant")
        print("  - Generalization: Limited (task-specific)")
        print(f"  - Final Score: {grader_score:.3f}")

    except Exception as e:
        print(f"Error running heuristic agent: {e}")
        import traceback
        traceback.print_exc()


# ============================================================================
# MAIN EXECUTION
# ============================================================================

async def main():
    """Run every evaluation stage in order, then print the closing notes.

    A failure in one stage is reported on stdout and does not prevent the
    following stage or the closing notes from running.
    """
    import traceback

    rule = "=" * 80
    print("\nStarting evaluation tests...\n")

    # Stage 1: random-action baseline.
    try:
        await run_random_actions_baseline()
    except Exception as baseline_err:
        print(f"Could not run baseline test: {baseline_err}")

    # Stage 2: rule-based heuristic agent (failures also dump a traceback).
    try:
        await run_heuristic_agent()
    except Exception as heuristic_err:
        print(f"Could not run heuristic agent: {heuristic_err}")
        traceback.print_exc()

    # Emit the closing notes in one write; each entry is one output line.
    closing_lines = [
        "\n" + rule,
        "EVALUATION COMPLETE",
        rule,
        "\nKey Insights:",
        "- Baseline (Random): Shows what untrained agent achieves",
        "- Heuristic Agent: Shows what simple rules can achieve",
        "- LLM Inference: Should exceed both baselines with intelligent reasoning",
        "\nNext Step: Run `python inference.py` to evaluate the actual LLM",
        rule + "\n",
    ]
    print("\n".join(closing_lines))


if __name__ == "__main__":
    # Script entry point: drive the async main() to completion on a fresh
    # event loop. asyncio is only needed here, hence the local import.
    from asyncio import run

    run(main())