# energy-optimization-space / evaluate_inference.py
# Author: Sushruth21 — commit 433cefc
# "refactor: revert to root-level package structure with proper imports and hacky pyproject.toml setup"
#!/usr/bin/env python
"""
Language Model Inference Evaluation Script
This script runs the LLM through the Energy & Memory RAM Optimization environment
and evaluates its performance including:
- Action quality and validity
- Reward progression
- Task completion
- Model decision-making efficiency
- Benchmark comparison across tasks
"""
import os
import sys
import json
from typing import Dict, List, Tuple
from datetime import datetime
# Set environment variables for the inference script.
# setdefault keeps any values already exported by the caller, so these
# only act as fallbacks for local runs.
os.environ.setdefault("API_BASE_URL", "https://router.huggingface.co/v1")
os.environ.setdefault("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
os.environ.setdefault("LOCAL_SERVER_URL", "http://localhost:8000")
# Import after setting environment variables (the client/model modules may
# read them at import time).
from client import EnergyOptimizationEnv
from models import EnergyOptimizationAction, EnergyOptimizationObservation
from task_graders import get_grader, get_grader_metadata, TASK_GRADERS
# Startup banner: timestamp plus the task names registered with the graders.
print("=" * 80)
print("LLM INFERENCE EVALUATION SCRIPT")
print("=" * 80)
print(f"Timestamp: {datetime.now().isoformat()}")
print(f"Available tasks: {list(TASK_GRADERS.keys())}")
print()
# ============================================================================
# EVALUATION METRICS
# ============================================================================
class EvaluationMetrics:
"""Track and calculate evaluation metrics for LLM performance."""
def __init__(self, task_name: str):
self.task_name = task_name
self.task_meta = get_grader_metadata(task_name)
# Tracking variables
self.steps: List[int] = []
self.actions: List[str] = []
self.rewards: List[float] = []
self.ram_usage: List[float] = []
self.energy_consumption: List[float] = []
self.task_progress: List[float] = []
# Final metrics
self.total_steps = 0
self.total_reward = 0.0
self.avg_reward = 0.0
self.max_reward = 0.0
self.min_reward = 0.0
self.grader_score = 0.0
self.task_completed = False
self.action_validity_rate = 0.0
self.valid_actions = 0
self.invalid_actions = 0
def add_step(self, step: int, action: str, reward: float, obs: EnergyOptimizationObservation):
"""Record a step in the episode."""
self.steps.append(step)
self.actions.append(action)
self.rewards.append(reward)
self.ram_usage.append(obs.ram_usage)
self.energy_consumption.append(obs.energy_consumption)
self.task_progress.append(obs.task_progress)
self.total_steps = step
self.total_reward += reward
if reward > self.max_reward:
self.max_reward = reward
if self.min_reward == 0.0 or reward < self.min_reward:
self.min_reward = reward
def mark_action_validity(self, valid: bool):
"""Mark whether an action was valid."""
if valid:
self.valid_actions += 1
else:
self.invalid_actions += 1
def finalize(self, final_obs: EnergyOptimizationObservation, grader_score: float):
"""Finalize metrics after episode completes."""
self.grader_score = grader_score
self.task_completed = final_obs.current_task.completed if final_obs.current_task else False
if self.total_steps > 0:
self.avg_reward = self.total_reward / self.total_steps
self.action_validity_rate = self.valid_actions / (self.valid_actions + self.invalid_actions) if (self.valid_actions + self.invalid_actions) > 0 else 0.0
def print_summary(self):
"""Print detailed evaluation summary."""
print("\n" + "=" * 80)
print(f"EVALUATION SUMMARY - Task: {self.task_name.upper()}")
print("=" * 80)
print(f"\nTask Metadata:")
print(f" Difficulty: {self.task_meta['difficulty']}")
print(f" Description: {self.task_meta['description']}")
print(f" RAM Target: {self.task_meta['target_ram']}% | Energy Target: {self.task_meta['target_energy']} kWh")
print(f" Max Steps Allowed: {self.task_meta['max_steps']}")
print(f"\nPerformance Metrics:")
print(f" ✓ Total Steps Taken: {self.total_steps}")
print(f" ✓ Total Reward Accumulated: {self.total_reward:.3f}")
print(f" ✓ Average Reward per Step: {self.avg_reward:.3f}")
print(f" ✓ Reward Range: [{self.min_reward:.3f}, {self.max_reward:.3f}]")
print(f"\nAction Quality:")
print(f" ✓ Valid Actions: {self.valid_actions}")
print(f" ✓ Invalid Actions: {self.invalid_actions}")
print(f" ✓ Action Validity Rate: {self.action_validity_rate*100:.1f}%")
print(f"\nResource Optimization:")
print(f" ✓ Initial RAM: {self.ram_usage[0]:.1f}% → Final RAM: {self.ram_usage[-1]:.1f}%")
print(f" RAM Reduction: {self.ram_usage[0] - self.ram_usage[-1]:.1f}%")
print(f" ✓ Initial Energy: {self.energy_consumption[0]:.1f} kWh → Final Energy: {self.energy_consumption[-1]:.1f} kWh")
print(f" Energy Reduction: {self.energy_consumption[0] - self.energy_consumption[-1]:.1f} kWh")
print(f"\nTask Completion:")
print(f" ✓ Task Completed: {'YES ✓' if self.task_completed else 'NO ✗'}")
print(f" ✓ Final Task Progress: {self.task_progress[-1]*100:.1f}%")
print(f"\nGrader Evaluation:")
print(f" ✓ Grader Score: {self.grader_score:.3f} (Scale: 0.001-0.999)")
print(f" ✓ Score Quality: ", end="")
if self.grader_score > 0.8:
print("EXCELLENT ★★★★★")
elif self.grader_score > 0.6:
print("GOOD ★★★★")
elif self.grader_score > 0.4:
print("FAIR ★★★")
elif self.grader_score > 0.2:
print("POOR ★★")
else:
print("VERY POOR ★")
print("\n" + "=" * 80)
def to_dict(self) -> Dict:
"""Convert metrics to dictionary for JSON serialization."""
return {
"task_name": self.task_name,
"difficulty": self.task_meta['difficulty'],
"total_steps": self.total_steps,
"total_reward": round(self.total_reward, 3),
"avg_reward": round(self.avg_reward, 3),
"reward_range": [round(self.min_reward, 3), round(self.max_reward, 3)],
"valid_actions": self.valid_actions,
"invalid_actions": self.invalid_actions,
"action_validity_rate": round(self.action_validity_rate, 3),
"initial_ram": round(self.ram_usage[0], 1) if self.ram_usage else 0,
"final_ram": round(self.ram_usage[-1], 1) if self.ram_usage else 0,
"initial_energy": round(self.energy_consumption[0], 1) if self.energy_consumption else 0,
"final_energy": round(self.energy_consumption[-1], 1) if self.energy_consumption else 0,
"task_completed": self.task_completed,
"final_task_progress": round(self.task_progress[-1], 3) if self.task_progress else 0,
"grader_score": round(self.grader_score, 3)
}
# ============================================================================
# DIRECT ENVIRONMENT TEST
# ============================================================================
async def run_random_actions_baseline():
    """Run baseline test with random actions for comparison.

    Resets the environment, takes a handful of uniformly random actions,
    and prints per-step and cumulative rewards so smarter agents have a
    floor to beat. Errors are reported rather than raised (best-effort).
    """
    # Hoisted out of the step loop: re-importing per iteration was wasteful.
    import random

    print("\n" + "=" * 80)
    print("BASELINE TEST: Random Actions")
    print("=" * 80)
    # Use the LOCAL_SERVER_URL set at module import time instead of a
    # second hard-coded copy of the address.
    env = EnergyOptimizationEnv(base_url=os.environ.get("LOCAL_SERVER_URL", "http://localhost:8000"))
    num_steps = 5
    try:
        result = await env.reset()
        obs = result.observation
        print(f"Initial State:")
        print(f"  RAM: {obs.ram_usage:.1f}%")
        print(f"  Energy: {obs.energy_consumption:.1f} kWh")
        total_reward = 0.0
        for step in range(1, num_steps + 1):
            # Uniformly random action type and intensity.
            action_type = random.choice(["reduce_ram", "optimize_energy", "balance_resources"])
            intensity = random.uniform(0.3, 0.9)
            action = EnergyOptimizationAction(action_type=action_type, intensity=intensity)
            result = await env.step(action)
            obs = result.observation
            reward = result.reward or 0.0
            total_reward += reward
            print(f"\nStep {step}:")
            print(f"  Action: {action_type}, Intensity: {intensity:.2f}")
            print(f"  Reward: {reward:.3f}")
            print(f"  RAM: {obs.ram_usage:.1f}% | Energy: {obs.energy_consumption:.1f} kWh")
        print(f"\nBaseline Total Reward: {total_reward:.3f}")
        print(f"Baseline Avg Reward: {total_reward/num_steps:.3f}")
    except Exception as e:
        print(f"Error running baseline: {e}")
# ============================================================================
# SIMPLE HEURISTIC AGENT TEST
# ============================================================================
async def run_heuristic_agent():
    """Run evaluation with a simple heuristic agent (not LLM).

    Plays up to 10 steps of the basic_ram_reduction task with a
    one-rule policy (reduce RAM while above target, then optimize
    energy), records metrics, grades the final observation, and prints
    a full summary. Errors are reported with a traceback, not raised.
    """
    print("\n" + "=" * 80)
    print("HEURISTIC AGENT TEST: Rule-Based Decision Making")
    print("=" * 80)
    task_name = "basic_ram_reduction"
    # Use the LOCAL_SERVER_URL configured at module import time.
    env = EnergyOptimizationEnv(base_url=os.environ.get("LOCAL_SERVER_URL", "http://localhost:8000"))
    metrics = EvaluationMetrics(task_name)
    try:
        result = await env.reset()
        obs = result.observation
        print(f"Task: {task_name}")
        print(f"Initial RAM: {obs.ram_usage:.1f}%, Energy: {obs.energy_consumption:.1f} kWh\n")
        # Loop-invariant policy threshold (hoisted out of the loop).
        ram_target = 70.0
        for step in range(1, 11):
            # Heuristic: if RAM is above target, reduce RAM hard;
            # otherwise spend the step optimizing energy.
            if obs.ram_usage > ram_target:
                action_type, intensity = "reduce_ram", 0.8
            else:
                action_type, intensity = "optimize_energy", 0.6
            # The rule-based policy always emits a well-formed action,
            # so every step counts as valid (was duplicated per branch).
            metrics.mark_action_validity(True)
            action = EnergyOptimizationAction(action_type=action_type, intensity=intensity)
            action_str = f"{action_type},{intensity:.1f}"
            result = await env.step(action)
            obs = result.observation
            reward = result.reward or 0.0
            metrics.add_step(step, action_str, reward, obs)
            print(f"Step {step}: {action_str:30} | Reward: {reward:+.3f} | RAM: {obs.ram_usage:5.1f}% | Energy: {obs.energy_consumption:5.1f} kWh")
            if result.done:
                break
        # Apply grader to the final observation and report.
        grader_func = get_grader(task_name)
        grader_score = grader_func(obs)
        metrics.finalize(obs, grader_score)
        metrics.print_summary()
        print(f"\nHeuristic Agent Performance:")
        print(f"  - Complexity: Simple rule-based")
        print(f"  - Decision Speed: Instant")
        print(f"  - Generalization: Limited (task-specific)")
        print(f"  - Final Score: {grader_score:.3f}")
    except Exception as e:
        print(f"Error running heuristic agent: {e}")
        import traceback
        traceback.print_exc()
# ============================================================================
# MAIN EXECUTION
# ============================================================================
async def main():
    """Run all evaluation tests.

    Runs the random-action baseline and the heuristic agent in turn,
    catching failures per test so one broken test does not stop the
    other, then prints a closing summary with next steps.
    """
    print("\nStarting evaluation tests...\n")
    # Test 1: random-action baseline.
    try:
        await run_random_actions_baseline()
    except Exception as exc:
        print(f"Could not run baseline test: {exc}")
    # Test 2: rule-based heuristic agent (with traceback on failure).
    try:
        await run_heuristic_agent()
    except Exception as exc:
        print(f"Could not run heuristic agent: {exc}")
        import traceback
        traceback.print_exc()
    banner = "=" * 80
    print("\n" + banner)
    print("EVALUATION COMPLETE")
    print(banner)
    for line in (
        "\nKey Insights:",
        "- Baseline (Random): Shows what untrained agent achieves",
        "- Heuristic Agent: Shows what simple rules can achieve",
        "- LLM Inference: Should exceed both baselines with intelligent reasoning",
        "\nNext Step: Run `python inference.py` to evaluate the actual LLM",
    ):
        print(line)
    print(banner + "\n")
# Script entry point: drive the async main() on a fresh event loop.
if __name__ == "__main__":
    import asyncio
    asyncio.run(main())