File size: 8,316 Bytes
2c4ca2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#!/usr/bin/env python3
"""
Generate academic report from raw_results.json
"""

import json
import sys
from pathlib import Path
from collections import defaultdict
import statistics

def load_results(json_file):
    """Deserialize the raw experiment records stored as JSON in *json_file*.

    Args:
        json_file: path (str or Path) of the raw_results.json file.

    Returns:
        The parsed JSON payload (a list of experiment dicts in this pipeline).
    """
    with open(json_file, 'r') as handle:
        parsed = json.load(handle)
    return parsed

def analyze_results(data):
    """Bucket experiment records three ways: by model, by benchmark, by algorithm.

    Each bucket maps a key to a list of small dicts carrying the other two
    grouping fields plus the experiment's best R² score (``-1.0`` when the
    record has no ``best_r2`` field).

    Args:
        data: list of raw experiment dicts, each with at least the keys
            ``model``, ``benchmark`` and ``algorithm``.

    Returns:
        Tuple ``(by_model, by_benchmark, by_algorithm)`` of ``defaultdict(list)``.
    """
    by_model = defaultdict(list)
    by_benchmark = defaultdict(list)
    by_algorithm = defaultdict(list)

    for record in data:
        m, b, a = record['model'], record['benchmark'], record['algorithm']
        r2 = record.get('best_r2', -1.0)

        by_model[m].append({'benchmark': b, 'algorithm': a, 'r2': r2})
        by_benchmark[b].append({'model': m, 'algorithm': a, 'r2': r2})
        by_algorithm[a].append({'model': m, 'benchmark': b, 'r2': r2})

    return by_model, by_benchmark, by_algorithm

def generate_latex_table(by_model, by_benchmark):
    """Render a booktabs LaTeX table of the best R² per (benchmark, model).

    Fix: model names in the header row are now LaTeX-escaped (``_`` -> ``\\_``)
    exactly like the benchmark labels already were — an unescaped underscore
    in a model name previously produced invalid LaTeX. The ``-999`` magic
    sentinel is replaced by ``max(..., default=None)``.

    Args:
        by_model: model -> list of {'benchmark', 'algorithm', 'r2'} dicts.
        by_benchmark: benchmark -> list of {'model', 'algorithm', 'r2'} dicts.

    Returns:
        The complete LaTeX ``table`` environment as a single string.
    """
    def esc(text):
        # LaTeX treats a bare '_' as subscript; escape it in all labels.
        return text.replace('_', '\\_')

    models = sorted(by_model)

    latex = [
        "\\begin{table}[htbp]",
        "\\centering",
        "\\caption{Best R² scores by model and benchmark}",
        "\\label{tab:results}",
        "\\begin{tabular}{l" + "r" * len(models) + "}",
        "\\toprule",
        "Benchmark & " + " & ".join(esc(m) for m in models) + " \\\\",
        "\\midrule",
    ]

    for benchmark in sorted(by_benchmark):
        cells = [esc(benchmark)]
        for model in models:
            # Best R² achieved by this model on this benchmark, if any run exists.
            best = max(
                (e['r2'] for e in by_model[model] if e['benchmark'] == benchmark),
                default=None,
            )
            cells.append("---" if best is None else f"{best:.4f}")
        latex.append(" & ".join(cells) + " \\\\")

    latex.extend(["\\bottomrule", "\\end{tabular}", "\\end{table}"])

    return "\n".join(latex)

def generate_markdown_report(data, by_model, by_benchmark, by_algorithm):
    """Build the full markdown analysis report and return it as one string.

    Fix: the "Benchmarks Won" column previously hard-coded a denominator of
    ``12``; it now uses the actual number of benchmarks present in the data.
    The per-benchmark winner is also computed once and reused, instead of
    being recomputed inside the per-model loop.

    Args:
        data: raw experiment dicts as loaded from raw_results.json.
        by_model / by_benchmark / by_algorithm: groupings produced by
            analyze_results().

    Returns:
        The markdown report as a single newline-joined string.
    """
    md = []
    md.append("# Comprehensive Evaluation Report - Academic Analysis")
    md.append("")
    md.append(f"**Total Experiments**: {len(data)}")
    md.append("")

    # Summary statistics
    md.append("## Summary Statistics")
    md.append("")

    for model, experiments in sorted(by_model.items()):
        # r2 == -1.0 marks a failed/missing run; exclude from statistics.
        r2_values = [exp['r2'] for exp in experiments if exp['r2'] > -1.0]
        if r2_values:
            md.append(f"### {model}")
            md.append(f"- Experiments: {len(experiments)}")
            md.append(f"- Valid experiments: {len(r2_values)}")
            md.append(f"- Average R²: {statistics.mean(r2_values):.4f}")
            md.append(f"- Best R²: {max(r2_values):.4f}")
            md.append(f"- Worst R²: {min(r2_values):.4f}")
            md.append("")

    # Best results per benchmark
    md.append("## Best Results per Benchmark")
    md.append("")
    md.append("| Benchmark | Model | Algorithm | R² | Expression |")
    md.append("|-----------|-------|-----------|----|-----------  |")

    # Winner of each benchmark, computed once and reused in the model matrix.
    winners = {b: max(exps, key=lambda x: x['r2']) for b, exps in by_benchmark.items()}

    for benchmark in sorted(by_benchmark.keys()):
        best_exp = winners[benchmark]

        # Recover the full record to show the discovered expression (truncated).
        full_exp = next(
            (exp for exp in data
             if exp['model'] == best_exp['model']
             and exp['benchmark'] == benchmark
             and exp['algorithm'] == best_exp['algorithm']),
            None,
        )

        expr = full_exp.get('best_expression', 'N/A')[:50] if full_exp else 'N/A'
        md.append(f"| {benchmark} | {best_exp['model']} | {best_exp['algorithm']} | {best_exp['r2']:.4f} | {expr} |")

    md.append("")

    # PPO vs GRPO comparison
    md.append("## Algorithm Comparison (PPO vs GRPO)")
    md.append("")

    ppo_r2 = [exp['r2'] for exp in by_algorithm.get('ppo', []) if exp['r2'] > -1.0]
    grpo_r2 = [exp['r2'] for exp in by_algorithm.get('grpo', []) if exp['r2'] > -1.0]

    md.append("**PPO**:")
    md.append(f"- Experiments: {len(by_algorithm.get('ppo', []))}")
    if ppo_r2:
        md.append(f"- Average R²: {statistics.mean(ppo_r2):.4f}")
        md.append(f"- Best R²: {max(ppo_r2):.4f}")
    md.append("")

    md.append("**GRPO**:")
    md.append(f"- Experiments: {len(by_algorithm.get('grpo', []))}")
    if grpo_r2:
        md.append(f"- Average R²: {statistics.mean(grpo_r2):.4f}")
        md.append(f"- Best R²: {max(grpo_r2):.4f}")
    md.append("")

    # Model comparison table
    md.append("## Model Comparison Matrix")
    md.append("")
    md.append("| Model | Benchmarks Won | Avg R² | Best R² | Success Rate |")
    md.append("|-------|----------------|--------|---------|--------------|")

    total_benchmarks = len(by_benchmark)
    for model in sorted(by_model.keys()):
        exps = by_model[model]
        r2_values = [exp['r2'] for exp in exps if exp['r2'] > -1.0]
        benchmarks_won = sum(1 for w in winners.values() if w['model'] == model)

        if r2_values:
            avg_r2 = statistics.mean(r2_values)
            max_r2 = max(r2_values)
            success_rate = len(r2_values) / len(exps) * 100
            md.append(f"| {model} | {benchmarks_won}/{total_benchmarks} | {avg_r2:.4f} | {max_r2:.4f} | {success_rate:.1f}% |")
        else:
            md.append(f"| {model} | 0/{total_benchmarks} | N/A | N/A | 0.0% |")

    md.append("")

    return "\n".join(md)

def main():
    """CLI entry point: load raw results, then write the markdown report,
    the LaTeX table and a summary JSON into ``<input dir>/analysis_results``.

    Exits with status 1 when the JSON path argument is missing or does not
    exist on disk.
    """
    if len(sys.argv) < 2:
        print("Usage: python generate_academic_report.py <raw_results.json>")
        sys.exit(1)

    json_file = Path(sys.argv[1])
    if not json_file.exists():
        print(f"Error: {json_file} not found")
        sys.exit(1)

    print(f"Loading results from {json_file}...")
    data = load_results(json_file)
    print(f"Loaded {len(data)} experiments")

    print("Analyzing results...")
    by_model, by_benchmark, by_algorithm = analyze_results(data)

    # All artifacts land next to the input file.
    output_dir = json_file.parent / "analysis_results"
    output_dir.mkdir(exist_ok=True)

    # Markdown report
    print("Generating markdown report...")
    md_file = output_dir / "academic_report.md"
    md_file.write_text(
        generate_markdown_report(data, by_model, by_benchmark, by_algorithm),
        encoding='utf-8',
    )
    print(f"[OK] Markdown report: {md_file}")

    # LaTeX table
    print("Generating LaTeX table...")
    latex_file = output_dir / "results_table.tex"
    latex_file.write_text(generate_latex_table(by_model, by_benchmark), encoding='utf-8')
    print(f"[OK] LaTeX table: {latex_file}")

    # Summary JSON
    print("Generating summary JSON...")

    def _valid(exps):
        # r2 of -1.0 flags a failed run; keep only real scores.
        return [e['r2'] for e in exps if e['r2'] > -1.0]

    summary = {"total_experiments": len(data), "models": {}, "algorithms": {}}
    for model, exps in by_model.items():
        scores = _valid(exps)
        summary["models"][model] = {
            "experiments": len(exps),
            "valid_experiments": len(scores),
            "avg_r2": statistics.mean(scores) if scores else None,
            "best_r2": max(scores) if scores else None,
        }
    for algo, exps in by_algorithm.items():
        scores = _valid(exps)
        summary["algorithms"][algo] = {
            "experiments": len(exps),
            "avg_r2": statistics.mean(scores) if scores else None,
        }

    summary_file = output_dir / "summary.json"
    summary_file.write_text(json.dumps(summary, indent=2), encoding='utf-8')
    print(f"[OK] Summary JSON: {summary_file}")

    print(f"\n[DONE] Analysis complete! Reports saved to: {output_dir}")

if __name__ == "__main__":
    main()