"""
Advanced Analysis of LLM Finetuning Performance
Analyzes reward curves, complexity metrics, and fixer method effectiveness
"""
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from collections import Counter
# Every figure below is saved under results/, so make sure the directory
# exists before any plotting starts (no error if it is already there).
os.makedirs('results', exist_ok=True)

# Load data: the reward log and the complexity/method log, both read from
# CSV files in the current working directory.
rewards_df = pd.read_csv('rewards_log.csv')
complexity_df = pd.read_csv('complexity_rewards.csv')

# Report banner: a blank line, a 70-character rule, the title, and the rule again.
banner_rule = "=" * 70
print(f"\n{banner_rule}", "FINETUNING ANALYSIS REPORT", banner_rule, sep="\n")
# --- Summary statistics -------------------------------------------------------
# Headline numbers for the whole reward log. The date range is read from the
# first and last rows as stored; no sorting by timestamp happens here.
print("\nπ TRAINING OVERVIEW")
print(f"Total Episodes: {len(rewards_df)}")

distinct_tasks = rewards_df['task_id'].nunique()
print(f"Unique Tasks: {distinct_tasks}")

logged_times = rewards_df['timestamp']
print(f"Date Range: {logged_times.iloc[0]} to {logged_times.iloc[-1]}")

reward_values = rewards_df['reward']
print(f"Avg Reward: {reward_values.mean():.4f}")
print(f"Max Reward: {reward_values.max():.4f}")
print(f"Min Reward: {reward_values.min():.4f}")
print(f"Reward Std: {reward_values.std():.4f}")
# --- Task breakdown -----------------------------------------------------------
# Per-task reward statistics. Each (column label, aggregation) pair below
# becomes one column of the resulting table, rounded to four decimals.
print("\nπ PERFORMANCE BY TASK")
task_aggregations = [
    ('Count', 'count'),
    ('Mean', 'mean'),
    ('Max', 'max'),
    ('Min', 'min'),
    ('Std', 'std'),
]
rewards_per_task = rewards_df.groupby('task_id')['reward']
task_stats = rewards_per_task.agg(task_aggregations).round(4)
print(task_stats)
# --- Complexity analysis ------------------------------------------------------
# Reward statistics grouped by the 'complexity' column. Labels and aggregation
# names are kept in parallel tuples and zipped into (label, function) pairs.
print("\nβ‘ COMPLEXITY VS REWARD ANALYSIS")
complexity_labels = ('Count', 'Mean Reward', 'Max Reward', 'Min Reward')
complexity_funcs = ('count', 'mean', 'max', 'min')
complexity_stats = (
    complexity_df.groupby('complexity')['reward']
    .agg(list(zip(complexity_labels, complexity_funcs)))
    .round(4)
)
print(complexity_stats)
# --- Method performance -------------------------------------------------------
# Reward statistics grouped by the 'method' column. The mapping is turned into
# a list of (label, function) pairs because that is the form agg() is given.
print("\nπ§ FIXER METHOD EFFECTIVENESS")
method_summary_spec = {
    'Count': 'count',
    'Mean Reward': 'mean',
    'Max Reward': 'max',
    'Min Reward': 'min',
}
rewards_per_method = complexity_df.groupby('method')['reward']
method_stats = rewards_per_method.agg(list(method_summary_spec.items())).round(4)
print(method_stats)
# --- Complexity breakdown -----------------------------------------------------
# Number of rows per complexity class, ordered from most to least frequent.
# The result is reused later for the pie and bar charts.
print("\nπ COMPLEXITY DISTRIBUTION")
class_frequencies = complexity_df['complexity'].value_counts()
complexity_counts = class_frequencies.sort_values(ascending=False)
print(complexity_counts)
# --- Graph 1: reward scatter per fixer method ---------------------------------
# One scatter series per distinct value of the 'method' column. The x axis is
# the row's position within its own method subset, not a global index.
fig, ax = plt.subplots(figsize=(12, 6))
method_colors = {'ollama': 'blue', 'builtin': 'red', 'tgi': 'green'}

method_column = complexity_df['method']
for method_name in method_column.unique():
    method_rows = complexity_df[method_column == method_name]
    sample_count = len(method_rows)
    series_label = f"{method_name.capitalize()} (n={sample_count})"
    ax.scatter(
        range(sample_count),
        method_rows['reward'],
        label=series_label,
        alpha=0.6,
        s=60,
        # Methods without a dedicated colour fall back to gray.
        color=method_colors.get(method_name, 'gray'),
    )

ax.set_xlabel('Sample Index', fontsize=11)
ax.set_ylabel('Reward Score (0-1)', fontsize=11)
ax.set_title('LLM Fixer Method Performance Comparison', fontsize=13, fontweight='bold')
ax.legend(loc='best')
ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('results/method_performance.png', dpi=150)
plt.close(fig)
print("\nβ Saved: method_performance.png")
# --- Graph 2: complexity distribution (pie + bar) -----------------------------
# Two views of the same per-class counts, side by side in a single figure.
fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Pie chart: one Set3 colour per class, sampled evenly across the colormap.
class_total = len(complexity_counts)
colors_pie = plt.cm.Set3(np.linspace(0, 1, class_total))
pie_ax.pie(
    complexity_counts.values,
    labels=complexity_counts.index,
    autopct='%1.1f%%',
    colors=colors_pie,
    startangle=90,
)
pie_ax.set_title('Complexity Distribution in Dataset', fontsize=12, fontweight='bold')

# Bar chart: the same counts as bars, drawn through the pandas plotting API.
complexity_counts.plot(
    kind='bar',
    ax=bar_ax,
    color='skyblue',
    edgecolor='navy',
    alpha=0.7,
)
bar_ax.set_xlabel('Time Complexity Class', fontsize=11)
bar_ax.set_ylabel('Number of Samples', fontsize=11)
bar_ax.set_title('Complexity Class Frequency', fontsize=12, fontweight='bold')
# Re-apply the existing tick labels with a 45 degree rotation.
bar_ax.set_xticklabels(bar_ax.get_xticklabels(), rotation=45)
bar_ax.grid(axis='y', alpha=0.3)

fig.tight_layout()
fig.savefig('results/complexity_distribution.png', dpi=150)
plt.close(fig)
print("β Saved: complexity_distribution.png")
# --- Graph 3: reward distribution per fixer method (box plot) -----------------
fig, ax = plt.subplots(figsize=(10, 6))

# Resolve the method list once so the data and the tick labels stay aligned.
# Rows without a method are skipped: they can never match an equality filter
# and would only add an empty, unlabeled box.
methods = complexity_df['method'].dropna().unique()

# One array of rewards per method. Missing rewards are dropped because the box
# statistics become NaN (and the box is not drawn) if any NaN is present.
method_data = [
    complexity_df.loc[complexity_df['method'] == m, 'reward'].dropna().values
    for m in methods
]

# The `labels=` keyword of boxplot is deprecated (renamed `tick_labels` in
# Matplotlib 3.9). Setting the tick labels on the axes instead works on both
# older and newer releases. Boxes are drawn at positions 1..n.
bp = ax.boxplot(method_data, patch_artist=True)
ax.set_xticks(range(1, len(methods) + 1))
ax.set_xticklabels([str(m) for m in methods])

# Cycle through the palette so a fourth (or later) method still gets a fill
# colour instead of silently keeping the default style.
box_palette = ['lightblue', 'lightcoral', 'lightgreen']
for box_index, patch in enumerate(bp['boxes']):
    patch.set_facecolor(box_palette[box_index % len(box_palette)])

ax.set_xlabel('Fixer Method', fontsize=11)
ax.set_ylabel('Reward Score (0-1)', fontsize=11)
ax.set_title('Reward Distribution by Fixer Method', fontsize=13, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('results/method_boxplot.png', dpi=150)
plt.close()
print("β Saved: method_boxplot.png")
# --- Graph 4: task performance heatmap ----------------------------------------
# Rows are tasks, columns are the mean / max / std of the reward for that task.
# A grouped aggregation yields flat columns named 'mean', 'max' and 'std'.
# The earlier pivot_table + droplevel(0) approach dropped the statistic names
# and left three columns all called 'reward', so the x tick labels were wrong.
task_reward_matrix = rewards_df.groupby('task_id')['reward'].agg(['mean', 'max', 'std'])

fig, ax = plt.subplots(figsize=(10, 6))
im = ax.imshow(task_reward_matrix.values, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
ax.set_xticks(range(len(task_reward_matrix.columns)))
ax.set_yticks(range(len(task_reward_matrix.index)))
ax.set_xticklabels(task_reward_matrix.columns, rotation=45)
ax.set_yticklabels(task_reward_matrix.index)
ax.set_title('Task Difficulty Performance Matrix (Mean, Max, Std)', fontsize=13, fontweight='bold')

# Add text annotations: one value per cell. The std of a task with a single
# episode is NaN, so show an explicit placeholder rather than the text "nan".
for (row, col), cell_value in np.ndenumerate(task_reward_matrix.values):
    cell_text = 'n/a' if pd.isna(cell_value) else f'{cell_value:.2f}'
    ax.text(col, row, cell_text,
            ha="center", va="center", color="black", fontsize=9)

plt.colorbar(im, ax=ax, label='Reward Score')
plt.tight_layout()
plt.savefig('results/task_performance_matrix.png', dpi=150)
plt.close()
print("β Saved: task_performance_matrix.png")
# --- Graph 5: cumulative reward over time -------------------------------------
# Running total of the reward after ordering the complexity log by its
# 'timestamp' column. The x axis is the position in that ordering.
fig, ax = plt.subplots(figsize=(12, 6))

chronological_rows = complexity_df.sort_values('timestamp')
cumulative_reward = chronological_rows['reward'].cumsum()
sample_positions = range(len(cumulative_reward))

ax.plot(
    sample_positions,
    cumulative_reward,
    marker='o',
    markersize=4,
    linewidth=2,
    color='darkblue',
    alpha=0.7,
    label='Cumulative Reward',
)
# Light shading between the curve and zero.
ax.fill_between(sample_positions, cumulative_reward, alpha=0.2, color='blue')

ax.set_xlabel('Sample Index (Chronological)', fontsize=11)
ax.set_ylabel('Cumulative Reward', fontsize=11)
ax.set_title('Cumulative Reward Trajectory', fontsize=13, fontweight='bold')
ax.grid(True, alpha=0.3)
ax.legend()

fig.tight_layout()
fig.savefig('results/cumulative_reward.png', dpi=150)
plt.close(fig)
print("β Saved: cumulative_reward.png")
# --- Final summary ------------------------------------------------------------
# Closing banner listing the figures expected under results/.
#
# NOTE(review): the heading literal was split across two lines by a mis-decoded
# leading glyph, which made the statement a syntax error. It is restored here
# as a single-line string with a check mark, and the bullet glyphs are restored
# to "•" -- confirm both against the original file.
# NOTE(review): reward_curve.png and reward_by_task.png are not written by any
# code visible in this script; presumably another step produces them -- verify.
summary_rule = "=" * 70
print("\n" + summary_rule)
print("✅ ALL GRAPHS GENERATED IN results/ DIRECTORY:")
print(" • reward_curve.png (rolling avg of rewards)")
print(" • reward_by_task.png (task-wise comparison)")
print(" • method_performance.png (fixer methods)")
print(" • complexity_distribution.png (algorithm classes)")
print(" • method_boxplot.png (reward distribution)")
print(" • task_performance_matrix.png (heatmap)")
print(" • cumulative_reward.png (training trajectory)")
print(summary_rule + "\n")