File size: 5,052 Bytes
3f6526a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/bin/bash
# Print a summary report for one results directory.
#
# Usage: <script> [results_directory]
#   - With an argument, that path is analyzed exactly as given.
#   - Without one, examples/circle_packing/results (relative to the current
#     working directory) is searched for directories whose name contains
#     "eval_service".

set -e

printf '%s\n' \
    "==================================" \
    "πŸ“Š Results Analysis" \
    "=================================="

# Decide which directory to analyze.
if [ $# -gt 0 ]; then
    RESULTS_DIR="$1"
else
    # Candidates are sorted in reverse lexical order and the first one wins.
    # NOTE(review): this is "most recent" only if directory names sort
    # chronologically (e.g. timestamped names) - confirm against the producer.
    RESULTS_DIR=$(find examples/circle_packing/results -type d -name "*eval_service*" | sort -r | head -n 1)
    if [ -z "$RESULTS_DIR" ]; then
        printf '%s\n' \
            "❌ No results found" \
            "" \
            "Usage: $0 [results_directory]"
        exit 1
    fi
    echo "πŸ“ Using most recent results: $RESULTS_DIR"
fi

echo ""

# Refuse to continue unless the chosen path is an existing directory.
if ! [ -d "$RESULTS_DIR" ]; then
    echo "❌ Directory not found: $RESULTS_DIR"
    exit 1
fi

# Banner naming the directory under analysis, framed by 80-column rules.
# The rule is built with printf padding: writing `echo "=" * 80` (Python
# syntax) would let the shell expand the bare `*` to the file names in the
# current directory and print "= <files...> 80" instead of a rule.
printf -v banner_rule '%80s' ''
banner_rule=${banner_rule// /=}
echo "$banner_rule"
echo "πŸ“‚ Results Directory: $RESULTS_DIR"
echo "$banner_rule"
echo ""

# 1. Best program: print the headline fields of best/results/metrics.json,
#    or a notice when that file does not exist.
BEST_METRICS_FILE="$RESULTS_DIR/best/results/metrics.json"
if [ ! -f "$BEST_METRICS_FILE" ]; then
    echo "❌ No best program found"
    echo ""
else
    echo "βœ… Best Program Found"
    echo "---"
    # Each field is extracted with its own jq call; only .generation has a
    # fallback ("N/A") for a missing value.
    BEST_SCORE=$(jq -r '.combined_score' "$BEST_METRICS_FILE")
    BEST_CORRECT=$(jq -r '.correct' "$BEST_METRICS_FILE")
    BEST_GEN=$(jq -r '.generation // "N/A"' "$BEST_METRICS_FILE")
    printf '%s\n' \
        "  Score: $BEST_SCORE" \
        "  Correct: $BEST_CORRECT" \
        "  Generation: $BEST_GEN" \
        ""
fi

# 2. Eval agent memory: list which of the expected files exist under
#    $RESULTS_DIR/eval_agent_memory, with a line count for EVAL_AGENTS.md and
#    a count of top-level `def evaluate_...` functions in auxiliary_metrics.py.
if [ -d "$RESULTS_DIR/eval_agent_memory" ]; then
    echo "βœ… Eval Agent Memory Found"
    echo "---"

    if [ -f "$RESULTS_DIR/eval_agent_memory/EVAL_AGENTS.md" ]; then
        echo "  πŸ“„ EVAL_AGENTS.md: $(wc -l < "$RESULTS_DIR/eval_agent_memory/EVAL_AGENTS.md") lines"
    fi

    if [ -f "$RESULTS_DIR/eval_agent_memory/auxiliary_metrics.py" ]; then
        echo "  🐍 auxiliary_metrics.py: Found"
        # Count evaluate_ functions.
        # `grep -c` already prints "0" when nothing matches but exits with
        # status 1; `|| true` only absorbs that status (so `set -e` does not
        # abort) without appending a second "0" to the captured value.
        NUM_METRICS=$(grep -c "^def evaluate_" "$RESULTS_DIR/eval_agent_memory/auxiliary_metrics.py" || true)
        # grep prints nothing at all if it could not read the file.
        NUM_METRICS=${NUM_METRICS:-0}
        echo "  πŸ“Š Auxiliary metrics: $NUM_METRICS"
    fi

    if [ -f "$RESULTS_DIR/eval_agent_memory/service_state.json" ]; then
        echo "  πŸ’Ύ service_state.json: Found"
    fi
    echo ""
else
    echo "⚠️  No eval agent memory found"
    echo ""
fi

# 3. Check generations: count the gen_* directories directly under
#    $RESULTS_DIR. Whitespace is stripped because some `wc` implementations
#    pad the count.
NUM_GENS=$(find "$RESULTS_DIR" -maxdepth 1 -type d -name "gen_*" | wc -l | tr -d '[:space:]')
echo "πŸ“ˆ Generations"
echo "---"
echo "  Total generations: $NUM_GENS"
echo ""

# 4. Sample a few metrics files: one line for each of the last three
#    generations (version-sorted) that has a results/metrics.json.
echo "πŸ“Š Sample Metrics (last 3 generations)"
echo "---"
# Paths are read line by line rather than via `for x in $(find ...)`, which
# would word-split and glob-expand them and so break on a RESULTS_DIR that
# contains spaces.
while IFS= read -r gen_dir; do
    metrics_file="$gen_dir/results/metrics.json"
    [ -f "$metrics_file" ] || continue

    GEN_NUM=$(basename "$gen_dir")
    GEN_NUM=${GEN_NUM#gen_}
    SCORE=$(jq -r '.combined_score' "$metrics_file")
    # An explicit null test is used instead of `.correct // "N/A"`: jq's `//`
    # also replaces `false`, which would report every failing program as N/A.
    CORRECT=$(jq -r '.correct | if . == null then "N/A" else . end' "$metrics_file")
    # Best effort: any jq failure on these two counts is reported as 0.
    HAS_AUX=$(jq -r '.auxiliary | length' "$metrics_file" 2>/dev/null || echo "0")
    HAS_DESC=$(jq -r '.auxiliary_descriptions | length' "$metrics_file" 2>/dev/null || echo "0")

    echo "  Gen $GEN_NUM: score=$SCORE, correct=$CORRECT, aux_metrics=$HAS_AUX, descriptions=$HAS_DESC"
done < <(find "$RESULTS_DIR" -maxdepth 1 -type d -name "gen_*" | sort -V | tail -n 3)
echo ""

# 5. Database statistics: when evolution_db.sqlite exists, report row counts
#    from its `programs` table.

# Run one query against a database file and print the result; prints "N/A"
# if sqlite3 fails (its error output is discarded).
#   $1 - database path, $2 - SQL statement
_db_count() {
    sqlite3 "$1" "$2" 2>/dev/null || echo "N/A"
}

EVOLUTION_DB="$RESULTS_DIR/evolution_db.sqlite"
if [ -f "$EVOLUTION_DB" ]; then
    echo "πŸ’Ύ Database Statistics"
    echo "---"

    TOTAL_PROGRAMS=$(_db_count "$EVOLUTION_DB" "SELECT COUNT(*) FROM programs;")
    CORRECT_PROGRAMS=$(_db_count "$EVOLUTION_DB" "SELECT COUNT(*) FROM programs WHERE correct=1;")

    printf '%s\n' \
        "  Total programs: $TOTAL_PROGRAMS" \
        "  Correct programs: $CORRECT_PROGRAMS" \
        ""
fi

# 6. Check if auxiliary metrics were used: inspect the metrics.json of the
#    last generation (version-sorted) for aux_-prefixed keys under
#    .primary.public and for a non-empty .primary.text_feedback.
echo "πŸ” Auxiliary Metrics Integration Check"
echo "---"

RECENT_GEN=$(find "$RESULTS_DIR" -maxdepth 1 -type d -name "gen_*" | sort -V | tail -n 1)
# The non-empty test matters: with no gen_* directory RECENT_GEN is empty and
# the path below would otherwise resolve to /results/metrics.json at the
# filesystem root.
if [ -n "$RECENT_GEN" ] && [ -f "$RECENT_GEN/results/metrics.json" ]; then
    # Best effort: any jq failure is treated as a count of 0.
    HAS_AUX_IN_PRIMARY=$(jq -r '.primary.public | keys | map(select(startswith("aux_"))) | length' "$RECENT_GEN/results/metrics.json" 2>/dev/null || echo "0")
    # Anything that is not a plain non-negative integer is also treated as 0,
    # so the numeric comparison below cannot fail with a shell error.
    case "$HAS_AUX_IN_PRIMARY" in
        '' | *[!0-9]*) HAS_AUX_IN_PRIMARY=0 ;;
    esac

    if [ "$HAS_AUX_IN_PRIMARY" -gt 0 ]; then
        echo "  βœ… Auxiliary metrics merged into primary.public"
        echo "  πŸ“Š Found $HAS_AUX_IN_PRIMARY aux_ metrics"
    else
        echo "  ⚠️  No aux_ metrics found in primary.public"
    fi

    HAS_TEXT_FEEDBACK=$(jq -r '.primary.text_feedback // "" | length' "$RECENT_GEN/results/metrics.json" 2>/dev/null || echo "0")
    case "$HAS_TEXT_FEEDBACK" in
        '' | *[!0-9]*) HAS_TEXT_FEEDBACK=0 ;;
    esac
    if [ "$HAS_TEXT_FEEDBACK" -gt 0 ]; then
        echo "  βœ… text_feedback present (likely contains metric descriptions)"
    else
        echo "  ⚠️  No text_feedback found"
    fi
fi
echo ""

# Closing banner and pointers to the detailed artifacts.
# The 80-column rule is built with printf padding: `echo "=" * 80` (Python
# syntax) would have the shell expand the bare `*` to the file names in the
# current directory instead of repeating the character.
printf -v footer_rule '%80s' ''
footer_rule=${footer_rule// /=}
echo "$footer_rule"
echo "Analysis complete!"
echo "$footer_rule"
echo ""
echo "To view detailed results:"
echo "  - Best program: $RESULTS_DIR/best/main.py"
echo "  - Agent documentation: $RESULTS_DIR/eval_agent_memory/EVAL_AGENTS.md"
# %q shell-quotes the path so the suggested command can be pasted as-is even
# when the directory name contains spaces; plain paths print unchanged.
printf "  - All metrics: find %q -name 'metrics.json'\n" "$RESULTS_DIR"
echo ""