# Codette-Reasoning / evaluation / run_evaluation_verbose.py
# (Hugging Face upload metadata — uploader: Raiff1982, commit ed1b365, "Upload 120 files".)
# NOTE(review): these lines were web-viewer residue, not Python; commented out so the
# file parses. The shebang below is no longer on line 1, so run via `python <file>`.
#!/usr/bin/env python3
"""Verbose Evaluation Runner — See Real-Time Agent Thinking

Shows exactly what agents are thinking as they reason through each question.

Usage:
    python evaluation/run_evaluation_verbose.py --questions 1
"""
import sys
import os
from pathlib import Path

# Enable verbose mode globally — downstream agent code checks this env var
# to decide whether to emit its reasoning as it runs.
os.environ['CODETTE_VERBOSE'] = '1'

# Setup logging for real-time visibility: DEBUG records from every logger
# are streamed straight to stdout so agent output interleaves with prints.
import logging
logging.basicConfig(
    level=logging.DEBUG,
    format='%(name)-20s | %(levelname)-8s | %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
    ]
)

# Make the sibling 'reasoning_forge' and 'inference' packages importable
# when this script is launched directly from the repository checkout.
sys.path.insert(0, str(Path(__file__).parent.parent / 'reasoning_forge'))
sys.path.insert(0, str(Path(__file__).parent.parent / 'inference'))

from evaluation.test_suite_evaluation import (
    EvaluationHarness,
    EVALUATION_TEST_SUITE,
)
def _load_forge():
    """Load the ForgeEngine with real LLM agents; return it, or None on failure."""
    print("[1/3] Loading ForgeEngine with real LLM agents...")
    try:
        from reasoning_forge.forge_engine import ForgeEngine
        forge = ForgeEngine(living_memory=None, enable_memory_weighting=False)
        print("  ✓ ForgeEngine loaded")
        # Orchestrator is optional; only report adapter/GPU details when present.
        if forge.newton.orchestrator:
            print(f"  ✓ Orchestrator ready: {forge.newton.orchestrator.available_adapters}")
            print(f"  ✓ GPU acceleration: {forge.newton.orchestrator.n_gpu_layers} layers")
        return forge
    except Exception as e:
        print(f"  ✗ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return None


def _run_question(forge, number: int, question) -> None:
    """Run one test-suite question through a full debate and print the results.

    Args:
        forge: A loaded ForgeEngine instance.
        number: 1-based question number, for display only.
        question: A test-suite entry with .query, .category, .difficulty and
            .expected_perspectives attributes.
    """
    rule = "=" * 100
    print(f"\n{rule}")
    print(f"QUESTION {number}: {question.query}")
    print(f"Category: {question.category} | Difficulty: {question.difficulty}")
    print(f"Expected perspectives: {', '.join(question.expected_perspectives)}")
    print(f"{rule}\n")

    # This will trigger verbose logging for agent analysis.
    print("[RUNNING DEBATE]\n")
    result = forge.forge_with_debate(question.query)

    # Extract synthesis: by convention the third message holds the final
    # synthesized answer — TODO confirm against forge_with_debate's contract.
    synthesis = ""
    if "messages" in result and len(result["messages"]) >= 3:
        synthesis = result["messages"][2].get("content", "")
    print(f"\n{rule}")
    print(f"[FINAL SYNTHESIS] ({len(synthesis)} characters)\n")
    print(synthesis)
    print(f"{rule}\n")

    # Show debate metadata (missing keys fall back to neutral defaults).
    metadata = result.get("metadata", {})
    print("[METADATA]")
    print(f"  Conflicts detected: {len(metadata.get('conflicts', []))}")
    print(f"  Gamma (coherence): {metadata.get('gamma', 0.5):.3f}")
    print(f"  Debate rounds: {metadata.get('debate_round', 0)}")


def run_verbose_evaluation(num_questions: int = 1) -> bool:
    """Run evaluation with full real-time agent visibility.

    Loads the ForgeEngine, builds an EvaluationHarness, then runs the first
    ``num_questions`` entries of EVALUATION_TEST_SUITE through a full debate,
    printing each synthesis and its metadata as it completes.

    Args:
        num_questions: Number of test-suite questions to run (default 1).

    Returns:
        True if every stage completed without raising, False otherwise.
    """
    banner = "=" * 100
    print("\n" + banner)
    print("CODETTE VERBOSE EVALUATION — REAL-TIME AGENT THINKING")
    print(banner)
    print(f"Questions: {num_questions}")
    print("Verbose mode: ON (see all agent reasoning)\n")

    forge = _load_forge()
    if forge is None:
        return False

    # Create harness
    print("\n[2/3] Creating evaluation harness...")
    try:
        harness = EvaluationHarness(forge)
        print("  ✓ Harness ready\n")
    except Exception as e:
        print(f"  ✗ ERROR: {e}")
        return False

    # Run the selected questions in detail.
    print("[3/3] Running question with full real-time reasoning output...\n")
    print(banner)
    try:
        for number, question in enumerate(EVALUATION_TEST_SUITE[:num_questions], 1):
            _run_question(forge, number, question)
    except Exception as e:
        print(f"\n✗ ERROR during evaluation: {e}")
        import traceback
        traceback.print_exc()
        return False
    return True
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Verbose evaluation with real-time agent thinking")
parser.add_argument("--questions", type=int, default=1, help="Number of questions to run (default: 1)")
args = parser.parse_args()
success = run_verbose_evaluation(args.questions)
sys.exit(0 if success else 1)