#!/usr/bin/env python3
"""Verbose Evaluation Runner — See Real-Time Agent Thinking
Shows exactly what agents are thinking as they reason through each question.
Usage:
python evaluation/run_evaluation_verbose.py --questions 1
"""
import sys
import os
from pathlib import Path
# Enable verbose mode globally. NOTE(review): presumably read by downstream
# Codette modules to switch on detailed agent logging — it must be set before
# any project module is imported below, which is why it sits above the
# sys.path manipulation and the evaluation import.
os.environ['CODETTE_VERBOSE'] = '1'
# Setup logging for real-time visibility: DEBUG level, streamed straight to
# stdout so agent log lines interleave with the print() banners emitted by
# run_verbose_evaluation().
import logging
logging.basicConfig(
level=logging.DEBUG,
format='%(name)-20s | %(levelname)-8s | %(message)s',
handlers=[
logging.StreamHandler(sys.stdout),
]
)
# Make the sibling 'reasoning_forge' and 'inference' trees importable when
# this script is executed directly (it lives one level below the repo root).
sys.path.insert(0, str(Path(__file__).parent.parent / 'reasoning_forge'))
sys.path.insert(0, str(Path(__file__).parent.parent / 'inference'))
from evaluation.test_suite_evaluation import (
EvaluationHarness,
EVALUATION_TEST_SUITE,
)
def run_verbose_evaluation(num_questions: int = 1) -> bool:
    """Run the evaluation suite with full real-time agent visibility.

    Loads the ForgeEngine, wraps it in an EvaluationHarness, then runs the
    first ``num_questions`` entries of EVALUATION_TEST_SUITE, printing each
    debate's final synthesis and metadata as it happens.

    Args:
        num_questions: How many questions from the suite to run (default 1).

    Returns:
        True if every stage completed without an exception, False otherwise.
    """
    banner = "=" * 100  # hoisted: reused for every separator line below
    print("\n" + banner)
    print("CODETTE VERBOSE EVALUATION — REAL-TIME AGENT THINKING")
    print(banner)
    print(f"Questions: {num_questions}")
    # fixed F541: this line had an f-prefix with no placeholders
    print("Verbose mode: ON (see all agent reasoning)\n")
    # [1/3] Load ForgeEngine. The import is inside the try so a broken or
    # missing reasoning_forge install degrades to a clean False return.
    print("[1/3] Loading ForgeEngine with real LLM agents...")
    try:
        from reasoning_forge.forge_engine import ForgeEngine
        forge = ForgeEngine(living_memory=None, enable_memory_weighting=False)
        print(" ✓ ForgeEngine loaded")
        if forge.newton.orchestrator:
            print(f" ✓ Orchestrator ready: {forge.newton.orchestrator.available_adapters}")
            print(f" ✓ GPU acceleration: {forge.newton.orchestrator.n_gpu_layers} layers")
    except Exception as e:
        print(f" ✗ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return False
    # [2/3] Create the harness. Constructed for its validation side effects;
    # the questions themselves are run directly through the engine below.
    print("\n[2/3] Creating evaluation harness...")
    try:
        harness = EvaluationHarness(forge)
        print(" ✓ Harness ready\n")
    except Exception as e:
        print(f" ✗ ERROR: {e}")
        return False
    # [3/3] Run the selected questions with full reasoning output.
    print("[3/3] Running question with full real-time reasoning output...\n")
    print(banner)
    try:
        test_questions = EVALUATION_TEST_SUITE[:num_questions]
        for i, question in enumerate(test_questions, start=1):
            print(f"\n{banner}")
            print(f"QUESTION {i}: {question.query}")
            print(f"Category: {question.category} | Difficulty: {question.difficulty}")
            print(f"Expected perspectives: {', '.join(question.expected_perspectives)}")
            print(f"{banner}\n")
            # This call triggers the verbose logging for agent analysis.
            print("[RUNNING DEBATE]\n")
            result = forge.forge_with_debate(question.query)
            # Extract synthesis. NOTE(review): assumes messages[2] holds the
            # synthesizer's final message — confirm against forge_with_debate.
            synthesis = ""
            if "messages" in result and len(result["messages"]) >= 3:
                synthesis = result["messages"][2].get("content", "")
            print(f"\n{banner}")
            print(f"[FINAL SYNTHESIS] ({len(synthesis)} characters)\n")
            print(synthesis)
            print(f"{banner}\n")
            # Show per-question debate metadata (defaults guard missing keys).
            metadata = result.get("metadata", {})
            # fixed F541: this line had an f-prefix with no placeholders
            print("[METADATA]")
            print(f" Conflicts detected: {len(metadata.get('conflicts', []))}")
            print(f" Gamma (coherence): {metadata.get('gamma', 0.5):.3f}")
            print(f" Debate rounds: {metadata.get('debate_round', 0)}")
    except Exception as e:
        print(f"\n✗ ERROR during evaluation: {e}")
        import traceback
        traceback.print_exc()
        return False
    return True
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Verbose evaluation with real-time agent thinking")
parser.add_argument("--questions", type=int, default=1, help="Number of questions to run (default: 1)")
args = parser.parse_args()
success = run_verbose_evaluation(args.questions)
sys.exit(0 if success else 1)