File size: 4,323 Bytes
5d570d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
make_chart.py
Generates the before/after reward chart using known scores.
Run: python make_chart.py
"""

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np

# Scores of the rule-based agent (no LLM, no training), measured locally.
# Keys: one entry per task plus the aggregate "overall" score.
baseline_scores = {
    "task1": 0.100,
    "task2": 0.113,
    "task3": 0.218,
    "overall": 0.144,
}

# Scores of Qwen2.5-72B served through the HF Inference API,
# taken from the clean run logs.
llm_scores = {
    "task1": 0.100,
    "task2": 0.113,
    "task3": 0.262,
    "overall": 0.158,
}

# Scores after GRPO training.
# Placeholder: these values currently equal llm_scores; replace them with
# the real numbers once the Colab training run has finished.
grpo_scores = {
    "task1": 0.100,
    "task2": 0.113,
    "task3": 0.262,
    "overall": 0.158,
}

def _style_axis(ax, positions, tick_labels):
    """Apply the tick, spine and grid styling shared by both panels."""
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels, color="white", fontsize=10)
    ax.tick_params(colors="white")
    ax.spines[:].set_color("#2d3436")
    ax.yaxis.grid(True, alpha=0.2, color="white")
    ax.set_axisbelow(True)


def _print_score_table(baseline, llm, grpo):
    """Print per-task scores and the GRPO-minus-baseline delta to stdout."""
    rows = [
        ("task1", "Task 1"),
        ("task2", "Task 2"),
        ("task3", "Task 3"),
        ("overall", "Overall"),
    ]
    print("\n" + "=" * 52)
    print(f"{'Task':<14} {'Rule-Based':>10} {'Qwen-72B':>10} {'GRPO':>8} {'Delta':>8}")
    print("-" * 52)
    for key, label in rows:
        base = baseline.get(key, 0)
        llm_score = llm.get(key, 0)
        grpo_score = grpo.get(key, 0)
        delta = grpo_score - base
        print(f"{label:<14} {base:>10.3f} {llm_score:>10.3f} {grpo_score:>8.3f} {delta:>+8.3f}")
    print("=" * 52)


def make_chart(baseline, llm, grpo, output="reward_chart.png"):
    """Draw the score-comparison figure, save it, and print a summary table.

    The figure has two panels: grouped bars comparing the three score sets
    per task (left), and the per-task delta ``grpo - baseline`` (right).

    Args:
        baseline: Mapping with keys "task1", "task2", "task3", "overall"
            holding the rule-based scores. Missing keys are treated as 0.
        llm: Same layout, scores of the untrained LLM.
        grpo: Same layout, scores after GRPO training.
        output: Path the figure is written to.

    Returns:
        None. The chart is written to ``output`` and a table is printed.
    """
    tasks = ["Task 1\n(Classify)", "Task 2\n(Action)", "Task 3\n(Full Resolve)", "Overall"]
    keys = ["task1", "task2", "task3", "overall"]

    b_vals = [baseline.get(k, 0) for k in keys]
    llm_vals = [llm.get(k, 0) for k in keys]
    grpo_vals = [grpo.get(k, 0) for k in keys]

    x = np.arange(len(tasks))
    width = 0.25

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    try:
        fig.patch.set_facecolor("#1a1a2e")
        for ax in axes:
            ax.set_facecolor("#16213e")

        # Left panel: grouped bars, one group per task.
        ax1 = axes[0]
        bar_groups = [
            ax1.bar(x - width, b_vals, width, label="Rule-Based", color="#636e72", edgecolor="#2d3436"),
            ax1.bar(x, llm_vals, width, label="Qwen2.5-72B", color="#0984e3", edgecolor="#2d3436"),
            ax1.bar(x + width, grpo_vals, width, label="After GRPO", color="#00b894", edgecolor="#2d3436"),
        ]
        for bars in bar_groups:
            for bar in bars:
                h = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width() / 2., h + 0.008,
                         f"{h:.2f}", ha="center", va="bottom", fontsize=8.5, color="white")

        _style_axis(ax1, x, tasks)
        ax1.set_ylabel("Score (0 - 1)", color="white", fontsize=11)
        ax1.set_title("Score Comparison Across Training Stages", color="white",
                      fontsize=12, fontweight="bold", pad=10)
        ax1.set_ylim(0, 1.2)
        ax1.legend(facecolor="#0f3460", edgecolor="#2d3436", labelcolor="white", fontsize=9)

        # Right panel: improvement of GRPO over the rule-based baseline.
        ax2 = axes[1]
        deltas = [round(g - b, 3) for g, b in zip(grpo_vals, b_vals)]
        colors = ["#00b894" if d >= 0 else "#d63031" for d in deltas]
        delta_bars = ax2.bar(x, deltas, width=0.4, color=colors, edgecolor="#2d3436")

        for bar, d in zip(delta_bars, deltas):
            if d >= 0:
                ypos, valign = bar.get_height() + 0.004, "bottom"
            else:
                # Anchor the label by its top edge so it hangs below a
                # negative bar instead of overlapping the bar's end.
                ypos, valign = bar.get_height() - 0.004, "top"
            ax2.text(bar.get_x() + bar.get_width() / 2., ypos,
                     f"{d:+.3f}", ha="center", va=valign, fontsize=11,
                     fontweight="bold", color="white")

        ax2.axhline(0, color="white", linewidth=0.8, alpha=0.4)
        _style_axis(ax2, x, tasks)
        ax2.set_ylabel("Score Delta (GRPO vs Rule-Based)", color="white", fontsize=10)
        ax2.set_title("Improvement: Rule-Based → After GRPO", color="white",
                      fontsize=12, fontweight="bold", pad=10)

        fig.suptitle(
            "Support Ticket Env — Training Results\nModel: Qwen2.5-0.5B-Instruct + GRPO | OpenEnv x Scalar Hackathon 2026",
            color="white", fontsize=11, y=1.02
        )

        # Operate on this figure explicitly rather than on pyplot's
        # implicit "current figure".
        fig.tight_layout()
        fig.savefig(output, dpi=180, bbox_inches="tight", facecolor=fig.get_facecolor())
    finally:
        # pyplot keeps every figure it creates registered until it is closed;
        # close it here so repeated calls do not accumulate figures.
        plt.close(fig)
    print(f"Chart saved: {output}")

    _print_score_table(baseline, llm, grpo)

# Script entry point: render the chart from the module-level score tables.
if __name__ == "__main__":
    make_chart(
        baseline=baseline_scores,
        llm=llm_scores,
        grpo=grpo_scores,
    )