ledgershield / validate_agent_grading.py
king673134's picture
Upload folder using huggingface_hub
1ed9b86 verified
Raw
History Blame Contribute Delete
13.3 kB
"""
Agent Score Validation and Comparison Tool
This script validates that the LedgerShield grading system can separate
strong agents from weak agents in a believable way - the "bootcamp framing".
It demonstrates:
1. Score distribution across different agent capabilities
2. Ranking validity (stronger agents get higher scores)
3. Score separation (meaningful gaps between capability levels)
"""
from __future__ import annotations
import json
from typing import Any
# Minimum per-case score at which a simulated case is marked as a success;
# also used as the upper bound of the "failing" bucket in the distribution report.
PASS_THRESHOLD = 0.85
def load_inference_results(filepath: str) -> dict[str, Any]:
    """Load inference results from a JSON file.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; do not rely on the locale's default encoding, which
    # is not UTF-8 on every platform.
    with open(filepath, encoding="utf-8") as f:
        return json.load(f)
def simulate_weaker_agent_results(strong_results: dict[str, Any], degradation: float = 0.3) -> dict[str, Any]:
    """Simulate a weaker agent by degrading a stronger agent's results.

    Each case loses ``degradation`` points of score (hard cases lose a
    further 0.1), floored at 0.0; every non-easy case takes two extra steps;
    and ``success`` is re-evaluated against ``PASS_THRESHOLD``.

    The summary is derived from the degraded cases so that it always agrees
    with ``results_by_case``. API-call, token and cost totals are copied from
    the strong agent unchanged.

    Args:
        strong_results: Results dict with a ``summary`` mapping and a
            ``results_by_case`` list whose items carry ``score``, ``steps``
            and ``difficulty``.
        degradation: Score penalty applied to every case.

    Returns:
        A new results dict for the simulated agent. ``strong_results`` and
        its case dicts are not modified.
    """
    strong_summary = strong_results["summary"]

    weak_cases = []
    extra_steps = 0
    for case in strong_results["results_by_case"]:
        difficulty = case["difficulty"]
        hard_penalty = 0.1 if difficulty == "hard" else 0
        step_penalty = 0 if difficulty == "easy" else 2

        weak_case = case.copy()
        weak_case["score"] = max(0.0, case["score"] - degradation - hard_penalty)
        weak_case["steps"] = case["steps"] + step_penalty
        weak_case["success"] = weak_case["score"] >= PASS_THRESHOLD
        weak_cases.append(weak_case)
        extra_steps += step_penalty

    if weak_cases:
        # Aggregate from the degraded cases so the summary cannot contradict
        # the per-case data (success count, average and step total).
        successful_cases = sum(1 for c in weak_cases if c["success"])
        average_score = sum(c["score"] for c in weak_cases) / len(weak_cases)
        total_steps = strong_summary["total_steps"] + extra_steps
    else:
        # No per-case data to aggregate: degrade the summary figures directly.
        successful_cases = int(strong_summary["successful_cases"] * 0.7)
        average_score = max(0.0, strong_summary["average_score"] - degradation)
        total_steps = strong_summary["total_steps"] + 15

    return {
        "model": f"simulated-weak-agent-{degradation}",
        "summary": {
            "total_cases": strong_summary["total_cases"],
            "successful_cases": successful_cases,
            "average_score": average_score,
            "total_steps": total_steps,
            "total_api_calls": strong_summary["total_api_calls"],
            "total_tokens": strong_summary["total_tokens"],
            "estimated_cost_usd": strong_summary["estimated_cost_usd"],
        },
        "results_by_case": weak_cases,
    }
def simulate_random_agent_results(strong_results: dict[str, Any]) -> dict[str, Any]:
    """Simulate a random/baseline agent that makes uninformed decisions.

    Each case gets a difficulty-dependent base score (easy 0.4, medium 0.3,
    anything else 0.2) plus uniform noise in [-0.1, 0.2], and a step count
    drawn from 2..5. The summary is aggregated from the generated cases so
    that it agrees with ``results_by_case``; API calls, tokens and cost are
    reported as zero.

    Args:
        strong_results: Results dict supplying ``summary["total_cases"]`` and
            the ``results_by_case`` list used as a template (each item must
            carry ``difficulty``).

    Returns:
        A new results dict for the simulated baseline agent.
    """
    import random

    # A private, fixed-seed generator keeps the simulation reproducible
    # without reseeding the process-wide ``random`` state as a side effect.
    rng = random.Random(42)
    base_scores = {"easy": 0.4, "medium": 0.3}

    random_cases = []
    for case in strong_results["results_by_case"]:
        random_case = case.copy()
        base_score = base_scores.get(case["difficulty"], 0.2)
        random_case["score"] = base_score + rng.uniform(-0.1, 0.2)
        random_case["steps"] = rng.randint(2, 5)
        random_case["success"] = random_case["score"] >= PASS_THRESHOLD
        random_cases.append(random_case)

    case_count = len(random_cases)
    return {
        "model": "random-baseline-agent",
        "summary": {
            "total_cases": strong_results["summary"]["total_cases"],
            "successful_cases": sum(1 for c in random_cases if c["success"]),
            "average_score": (
                sum(c["score"] for c in random_cases) / case_count if case_count else 0.0
            ),
            "total_steps": sum(c["steps"] for c in random_cases),
            "total_api_calls": 0,
            "total_tokens": 0,
            "estimated_cost_usd": 0.0,
        },
        "results_by_case": random_cases,
    }
def calculate_grader_metrics(agent_results: list[dict[str, Any]]) -> dict[str, Any]:
    """Calculate metrics used to judge grader quality.

    Args:
        agent_results: Agent result dicts, ordered from the agent expected to
            be strongest to the one expected to be weakest. Each must carry
            ``summary["average_score"]``.

    Returns:
        A dict with:
            ``score_range``: highest minus lowest average score.
            ``score_variance``: population variance of the average scores.
            ``ranking_valid``: True when scores never increase along the list.
            ``score_separation``: smallest gap between consecutive agents
                (negative if the order is violated; 0 with fewer than two
                agents).
        An empty input yields zeros and ``ranking_valid=True``.
    """
    scores = [r["summary"]["average_score"] for r in agent_results]
    if not scores:
        # Nothing to compare: report neutral metrics instead of crashing.
        return {
            "score_range": 0.0,
            "score_variance": 0.0,
            "ranking_valid": True,
            "score_separation": 0,
        }

    mean = sum(scores) / len(scores)  # computed once, not per element
    consecutive = list(zip(scores, scores[1:]))
    return {
        "score_range": max(scores) - min(scores),
        "score_variance": sum((s - mean) ** 2 for s in scores) / len(scores),
        "ranking_valid": all(higher >= lower for higher, lower in consecutive),
        "score_separation": min(higher - lower for higher, lower in consecutive) if consecutive else 0,
    }
def compare_agents(agent_results: dict[str, dict[str, Any]]) -> dict[str, Any]:
    """Rank agents by average score and report the gaps between neighbours.

    Args:
        agent_results: Mapping of agent id to a results dict carrying
            ``model`` and a ``summary`` with ``average_score``,
            ``successful_cases``, ``total_cases`` and ``total_steps``.

    Returns:
        A dict with:
            ``ranking``: one entry per agent, best first, with ``rank``,
                ``agent_id``, ``model``, ``average_score``, ``success_rate``
                (successful / total cases) and ``efficiency`` (cases per
                step). A ratio whose denominator is zero is reported as 0.0.
            ``score_gaps``: for each pair of neighbours in the ranking, the
                difference between their average scores.
    """
    sorted_agents = sorted(
        agent_results.items(),
        key=lambda item: item[1]["summary"]["average_score"],
        reverse=True,
    )

    ranking = []
    for rank, (agent_id, results) in enumerate(sorted_agents, start=1):
        summary = results["summary"]
        total_cases = summary["total_cases"]
        total_steps = summary["total_steps"]
        ranking.append({
            "rank": rank,
            "agent_id": agent_id,
            "model": results["model"],
            "average_score": summary["average_score"],
            # Guard the ratios: an agent with no cases or no recorded steps
            # must not abort the whole comparison.
            "success_rate": summary["successful_cases"] / total_cases if total_cases else 0.0,
            "efficiency": total_cases / total_steps if total_steps else 0.0,
        })

    score_gaps = [
        {
            "from_agent": higher_id,
            "to_agent": lower_id,
            "gap": higher["summary"]["average_score"] - lower["summary"]["average_score"],
        }
        for (higher_id, higher), (lower_id, lower) in zip(sorted_agents, sorted_agents[1:])
    ]

    return {"ranking": ranking, "score_gaps": score_gaps}
def print_agent_comparison_table(agent_results: dict[str, dict[str, Any]]) -> None:
    """Print a formatted comparison table, best average score first.

    Args:
        agent_results: Mapping of agent id to a results dict carrying
            ``model`` and a ``summary`` with ``average_score``,
            ``successful_cases``, ``total_cases`` and ``total_steps``.

    The success rate is printed as a percentage and efficiency as cases per
    step; a ratio whose denominator is zero is printed as 0.
    """
    sorted_agents = sorted(
        agent_results.items(),
        key=lambda item: item[1]["summary"]["average_score"],
        reverse=True,
    )

    print("\n" + "="*100)
    print("AGENT COMPARISON - LEDGERSHIELD BENCHMARK RESULTS")
    print("="*100)
    print(f"{'Rank':<6} {'Agent':<25} {'Model':<25} {'Avg Score':<12} {'Success Rate':<14} {'Efficiency':<12}")
    print("-"*100)

    for rank, (agent_id, results) in enumerate(sorted_agents, start=1):
        summary = results["summary"]
        total_cases = summary["total_cases"]
        total_steps = summary["total_steps"]
        # Guard the ratios so one agent with empty totals cannot abort the table.
        success_rate = summary["successful_cases"] / total_cases * 100 if total_cases else 0.0
        efficiency = total_cases / total_steps if total_steps else 0.0
        print(f"{rank:<6} {agent_id:<25} {results['model']:<25} "
              f"{summary['average_score']:<12.4f} {success_rate:<14.1f} {efficiency:<12.2f}")

    print("="*100)
def print_score_distribution(agent_results: dict[str, dict[str, Any]]) -> None:
    """Print per-agent score statistics and a bucketed histogram.

    Agents are listed best average score first. For each agent the mean,
    min, max and population standard deviation of its per-case scores are
    printed, followed by a bar per score bucket. An agent without per-case
    results gets a short notice instead of statistics.

    Args:
        agent_results: Mapping of agent id to a results dict carrying
            ``model``, ``summary["average_score"]`` and a
            ``results_by_case`` list whose items carry ``score``.
    """
    print("\n" + "="*80)
    print("SCORE DISTRIBUTION ANALYSIS")
    print("="*80)

    ranked = sorted(
        agent_results.items(),
        key=lambda item: item[1]["summary"]["average_score"],
        reverse=True,
    )
    for agent_id, results in ranked:
        scores = [c["score"] for c in results["results_by_case"]]
        print(f"\n{agent_id} ({results['model']}):")
        if not scores:
            # mean/min/max are undefined without data; skip rather than crash.
            print(" No per-case results available")
            continue

        mean = sum(scores) / len(scores)  # computed once, reused for the variance
        variance = sum((s - mean) ** 2 for s in scores) / len(scores)
        print(f" Average: {mean:.4f}")
        print(f" Min: {min(scores):.4f}")
        print(f" Max: {max(scores):.4f}")
        print(f" Std Dev: {variance ** 0.5:.4f}")

        # NOTE: the buckets overlap ("borderline" and "failing" both cover
        # scores counted elsewhere), so the counts do not sum to len(scores).
        score_ranges = {
            "excellent (0.9-1.0)": sum(1 for s in scores if 0.9 <= s <= 1.0),
            "good (0.8-0.9)": sum(1 for s in scores if 0.8 <= s < 0.9),
            "acceptable (0.7-0.8)": sum(1 for s in scores if 0.7 <= s < 0.8),
            # Label follows the threshold actually used by the filter.
            f"borderline (0.7-{PASS_THRESHOLD:.2f})": sum(1 for s in scores if 0.7 <= s < PASS_THRESHOLD),
            f"failing (<{PASS_THRESHOLD:.2f})": sum(1 for s in scores if s < PASS_THRESHOLD),
        }
        for range_name, count in score_ranges.items():
            bar = "█" * count
            print(f" {range_name:<25} {count:>2} {bar}")
def validate_grader_signal(agent_results: dict[str, dict[str, Any]]) -> dict[str, Any]:
    """Validate that the grader produces a meaningful signal of agent quality.

    Prints a report and returns the outcome of four checks:

    1. Score separation: the smallest gap between neighbouring agents (ranked
       by average score) is at least 0.1. With fewer than two agents there is
       no gap to measure, so the check fails.
    2. Ranking validity: the ranked agent ids, with ``_agent`` removed, read
       ``strong, medium, weak, random``.
    3. Task difficulty correlation: for the ``strong_agent`` entry (or the
       top-ranked agent if that key is absent), the mean score on ``task_d``
       does not exceed the mean score on ``task_a``.
    4. Discriminative power: the score range divided by 0.5, capped at 1.0.

    Only checks 1 and 2 decide the printed overall verdict.

    Args:
        agent_results: Mapping of agent id to a results dict carrying
            ``summary["average_score"]``; the agent used for check 3 must
            also carry ``results_by_case`` items with ``task_type`` and
            ``score``.

    Returns:
        Dict with ``score_separation_check``, ``ranking_validity_check``,
        ``task_difficulty_correlation`` (bools) and ``discriminative_power``
        (float in [0, 1]).

    Raises:
        ValueError: If ``agent_results`` is empty.
    """
    if not agent_results:
        raise ValueError("agent_results must contain at least one agent")

    print("\n" + "="*80)
    print("GRADER VALIDATION - BOOTCAMP FRAMING CHECK")
    print("="*80)

    sorted_agents = sorted(
        agent_results.items(),
        key=lambda item: item[1]["summary"]["average_score"],
        reverse=True,
    )
    validations = {
        "score_separation_check": True,
        "ranking_validity_check": True,
        "task_difficulty_correlation": True,
        "discriminative_power": 0.0,
    }

    avg_scores = [results["summary"]["average_score"] for _, results in sorted_agents]

    # 1. Score separation between neighbouring agents.
    gap_threshold = 0.1
    gaps = [higher - lower for higher, lower in zip(avg_scores, avg_scores[1:])]
    # A single agent has no neighbour: treat the gap as 0 so the check fails
    # cleanly instead of min() raising on an empty sequence.
    min_gap = min(gaps) if gaps else 0.0
    print("\n1. Score Separation Check:")
    print(f" Minimum gap between agents: {min_gap:.4f}")
    print(f" Threshold: {gap_threshold:.4f}")
    print(f" Status: {'PASS' if min_gap >= gap_threshold else 'WARNING - gaps may be too small'}")
    validations["score_separation_check"] = min_gap >= gap_threshold

    # 2. Ranking validity against the expected capability order.
    print("\n2. Ranking Validity Check:")
    expected_order = ["strong", "medium", "weak", "random"]
    actual_order = [aid.replace("_agent", "").split("_")[-1] for aid, _ in sorted_agents]
    print(f" Expected order: {expected_order}")
    print(f" Actual order: {actual_order}")
    print(f" Status: {'PASS' if actual_order == expected_order else 'REVIEW NEEDED'}")
    validations["ranking_validity_check"] = actual_order == expected_order

    # 3. Per-task averages of the reference (strong) agent.
    print("\n3. Task Difficulty Correlation:")
    if "strong_agent" in agent_results:
        strong_agent = agent_results["strong_agent"]
    else:
        strong_agent = sorted_agents[0][1]
    task_scores: dict[str, list[float]] = {}
    for case in strong_agent["results_by_case"]:
        task_scores.setdefault(case["task_type"], []).append(case["score"])
    task_avgs = {task: sum(values) / len(values) for task, values in task_scores.items()}
    print(f" Task A (easy): {task_avgs.get('task_a', 0):.4f}")
    print(f" Task B (medium): {task_avgs.get('task_b', 0):.4f}")
    print(f" Task C (medium/hard): {task_avgs.get('task_c', 0):.4f}")
    print(f" Task D (hard): {task_avgs.get('task_d', 0):.4f}")
    # Missing tasks default so that the check passes (0 <= 1).
    validations["task_difficulty_correlation"] = task_avgs.get("task_d", 0) <= task_avgs.get("task_a", 1)

    # 4. Discriminative power: a score range of 0.5 or more counts as full power.
    score_range = max(avg_scores) - min(avg_scores)
    power = min(1.0, score_range / 0.5)
    validations["discriminative_power"] = power
    print("\n4. Discriminative Power:")
    print(f" Score range: {score_range:.4f}")
    print(f" Power score: {power:.2f}")
    print(f" Status: {'STRONG' if power > 0.8 else 'MODERATE' if power > 0.5 else 'WEAK'}")

    overall_ok = validations["score_separation_check"] and validations["ranking_validity_check"]
    print("\n" + "="*80)
    print(f"OVERALL VALIDATION: {'PASS' if overall_ok else 'NEEDS IMPROVEMENT'}")
    print("="*80)
    return validations
def main():
    """Run the validation workflow end to end.

    Loads the strong agent's results from
    ``inference_results_gpt4o_mini.json`` (printing a hint and returning if
    the file is missing), simulates medium, weak and random agents from it,
    prints the comparison table, score distribution and grader validation,
    and writes everything to ``agent_comparison_results.json``.
    """
    banner = "="*100

    print("\n" + banner)
    print("LEDGERSHIELD AGENT SCORING VALIDATION")
    print("Validating that graders separate stronger agents from weaker agents")
    print(banner)

    try:
        strong = load_inference_results("inference_results_gpt4o_mini.json")
    except FileNotFoundError:
        print("\nError: inference_results_gpt4o_mini.json not found!")
        print("Please run inference first with: python inference.py")
        return
    strong["model"] = "gpt-4o-mini (strong)"

    # Dict values are evaluated in order: medium, weak, then random.
    agents = {
        "strong_agent": strong,
        "medium_agent": simulate_weaker_agent_results(strong, degradation=0.15),
        "weak_agent": simulate_weaker_agent_results(strong, degradation=0.35),
        "random_agent": simulate_random_agent_results(strong),
    }

    print_agent_comparison_table(agents)
    print_score_distribution(agents)
    checks = validate_grader_signal(agents)
    report = compare_agents(agents)

    print("\n" + banner)
    print("KEY FINDINGS")
    print(banner)

    top = report["ranking"][0]
    bottom = report["ranking"][-1]
    top_score = top["average_score"]
    bottom_score = bottom["average_score"]
    print(f"\n1. Score Range: {top_score:.4f} (best) to {bottom_score:.4f} (worst)")
    print(f" Delta: {top_score - bottom_score:.4f}")

    print("\n2. Stronger agents show:")
    for claim in (
        " - Higher success rates",
        " - Better efficiency (fewer steps)",
        " - More consistent performance",
    ):
        print(claim)

    print("\n3. Grader Signal Quality:")
    print(f" - Valid ranking: {checks['ranking_validity_check']}")
    print(f" - Meaningful separation: {checks['score_separation_check']}")
    print(f" - Discriminative power: {checks['discriminative_power']:.2f}")

    payload = {
        "agents": agents,
        "comparison": report,
        "validations": checks,
    }
    with open("agent_comparison_results.json", "w") as out_file:
        json.dump(payload, out_file, indent=2)

    print("\n4. Detailed results saved to: agent_comparison_results.json")
    print(banner)
# Run the validation workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()