"""
์‹คํ—˜ ๊ฒฐ๊ณผ ๋ถ„์„ ๋ฐ ์‹œ๊ฐํ™”
JSON ๊ฒฐ๊ณผ ํŒŒ์ผ์„ ์ฝ์–ด์„œ ๋‹ค์–‘ํ•œ ๊ทธ๋ž˜ํ”„๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
"""
import json
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Dict, Any
import numpy as np

# Matplotlib configuration. The plot labels in this module are English-only,
# so the default DejaVu Sans (which has no Hangul glyphs) is sufficient;
# disabling unicode_minus keeps minus signs rendering correctly.
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['axes.unicode_minus'] = False
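# If Korean labels are ever added to the figures, DejaVu Sans will render
# them as empty boxes; a Hangul-capable font fixes this. A minimal sketch,
# assuming NanumGothic is installed (the font name is an assumption; any
# installed Hangul font works):
#   plt.rcParams['font.family'] = 'NanumGothic'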

class ResultAnalyzer:
    """Analyzes experiment results loaded from a JSON file."""

    def __init__(self, result_file_path: str):
        """
        Initialize the analyzer.

        Args:
            result_file_path: Path to the JSON result file.
        """
        self.result_file = Path(result_file_path)
        self.results = self._load_results()
        self.df = self._results_to_dataframe()

        # Directory where analysis outputs (plots, report) are written
        self.analysis_dir = self.result_file.parent / "analysis"
        self.analysis_dir.mkdir(parents=True, exist_ok=True)

        print("✅ ResultAnalyzer initialized")
        print(f"   Result file: {self.result_file}")
        print(f"   Analysis output dir: {self.analysis_dir}")
    def _load_results(self) -> Dict[str, Any]:
        """Load the JSON result file."""
        with open(self.result_file, 'r', encoding='utf-8') as f:
            return json.load(f)
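    # For reference, a minimal sketch of the JSON layout that the loading and
    # flattening code below assumes (key names inferred from the accesses in
    # this file; real result files may carry extra fields):
    #
    #   {
    #     "metadata": {"timestamp": ..., "distribution": ...,
    #                  "models": [...], "total_queries": ...},
    #     "results": {
    #       "in_distribution":  [{"model": ..., "query": ..., "category": ...,
    #                             "success": true, "elapsed_time": ...,
    #                             "used_retrieval": false, "query_type": ...,
    #                             "search_mode": ..., "usage": {...}}, ...],
    #       "out_distribution": [...]
    #     }
    #   }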
    def _results_to_dataframe(self) -> pd.DataFrame:
        """Flatten the nested results into one DataFrame row per response."""
        all_rows = []
        for dist_type, dist_results in self.results['results'].items():
            for result in dist_results:
                usage = result.get('usage', {})
                row = {
                    'distribution': dist_type,
                    'model': result['model'],
                    'query': result['query'],
                    'category': result.get('category', 'unknown'),
                    'success': result['success'],
                    'elapsed_time': result['elapsed_time'],
                    'used_retrieval': result.get('used_retrieval', False),
                    'query_type': result.get('query_type', 'unknown'),
                    'search_mode': result.get('search_mode', 'none'),
                    'total_tokens': usage.get('total_tokens', 0),
                    'prompt_tokens': usage.get('prompt_tokens', 0),
                    'completion_tokens': usage.get('completion_tokens', 0),
                }
                all_rows.append(row)
        return pd.DataFrame(all_rows)
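    # Quick sanity check of the flattened table (hypothetical REPL usage;
    # the path is a placeholder):
    #   analyzer = ResultAnalyzer("results/experiment_results.json")
    #   print(analyzer.df.head())
    #   print(analyzer.df.groupby('model')['success'].mean())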
    def plot_time_comparison(self, figsize=(12, 6)):
        """Plot response-time comparisons (per model and per distribution)."""
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)

        # Use only successful responses
        df_success = self.df[self.df['success']].copy()

        # 1. Average response time per model (error bars show std. dev.)
        model_time = df_success.groupby('model')['elapsed_time'].agg(['mean', 'std'])
        ax1.bar(model_time.index, model_time['mean'], yerr=model_time['std'],
                capsize=5, alpha=0.7, color=['#2ecc71', '#3498db', '#e74c3c'])
        ax1.set_title('Average Response Time by Model', fontsize=14, fontweight='bold')
        ax1.set_ylabel('Time (seconds)', fontsize=12)
        ax1.set_xlabel('Model', fontsize=12)
        ax1.grid(axis='y', alpha=0.3)

        # 2. Response time per distribution (grouped bars)
        pivot_data = df_success.pivot_table(
            values='elapsed_time',
            index='model',
            columns='distribution',
            aggfunc='mean'
        )
        x = np.arange(len(pivot_data.index))
        width = 0.35
        if 'in_distribution' in pivot_data.columns:
            ax2.bar(x - width/2, pivot_data['in_distribution'], width,
                    label='In-Distribution', alpha=0.8, color='#2ecc71')
        if 'out_distribution' in pivot_data.columns:
            ax2.bar(x + width/2, pivot_data['out_distribution'], width,
                    label='Out-Distribution', alpha=0.8, color='#e74c3c')
        ax2.set_title('Response Time: In vs Out Distribution', fontsize=14, fontweight='bold')
        ax2.set_ylabel('Time (seconds)', fontsize=12)
        ax2.set_xlabel('Model', fontsize=12)
        ax2.set_xticks(x)
        ax2.set_xticklabels(pivot_data.index, rotation=15, ha='right')
        ax2.legend()
        ax2.grid(axis='y', alpha=0.3)

        plt.tight_layout()

        # Save the figure
        output_file = self.analysis_dir / "time_comparison.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"✅ Response time plot saved: {output_file}")
    def plot_token_comparison(self, figsize=(12, 6)):
        """Plot token-usage comparisons (per model and per distribution)."""
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)

        # Use only successful responses
        df_success = self.df[self.df['success']].copy()

        # 1. Average token usage per model (error bars show std. dev.)
        model_tokens = df_success.groupby('model')['total_tokens'].agg(['mean', 'std'])
        ax1.bar(model_tokens.index, model_tokens['mean'], yerr=model_tokens['std'],
                capsize=5, alpha=0.7, color=['#2ecc71', '#3498db', '#e74c3c'])
        ax1.set_title('Average Token Usage by Model', fontsize=14, fontweight='bold')
        ax1.set_ylabel('Tokens', fontsize=12)
        ax1.set_xlabel('Model', fontsize=12)
        ax1.grid(axis='y', alpha=0.3)

        # 2. Token usage per distribution (grouped bars)
        pivot_data = df_success.pivot_table(
            values='total_tokens',
            index='model',
            columns='distribution',
            aggfunc='mean'
        )
        x = np.arange(len(pivot_data.index))
        width = 0.35
        if 'in_distribution' in pivot_data.columns:
            ax2.bar(x - width/2, pivot_data['in_distribution'], width,
                    label='In-Distribution', alpha=0.8, color='#2ecc71')
        if 'out_distribution' in pivot_data.columns:
            ax2.bar(x + width/2, pivot_data['out_distribution'], width,
                    label='Out-Distribution', alpha=0.8, color='#e74c3c')
        ax2.set_title('Token Usage: In vs Out Distribution', fontsize=14, fontweight='bold')
        ax2.set_ylabel('Tokens', fontsize=12)
        ax2.set_xlabel('Model', fontsize=12)
        ax2.set_xticks(x)
        ax2.set_xticklabels(pivot_data.index, rotation=15, ha='right')
        ax2.legend()
        ax2.grid(axis='y', alpha=0.3)

        plt.tight_layout()

        # Save the figure
        output_file = self.analysis_dir / "token_comparison.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"✅ Token usage plot saved: {output_file}")
    def plot_rag_usage(self, figsize=(10, 6)):
        """Plot how often each model triggered retrieval (RAG usage rate)."""
        fig, ax = plt.subplots(figsize=figsize)

        # RAG usage rate per model, as a percentage of all queries
        rag_usage = self.df.groupby('model').agg({
            'used_retrieval': lambda x: (x.sum() / len(x) * 100)
        }).round(2)

        colors = ['#2ecc71', '#3498db', '#e74c3c']
        bars = ax.bar(rag_usage.index, rag_usage['used_retrieval'],
                      alpha=0.7, color=colors)

        # Annotate each bar with its percentage
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:.1f}%',
                    ha='center', va='bottom', fontsize=11, fontweight='bold')

        ax.set_title('RAG Usage Rate by Model', fontsize=14, fontweight='bold')
        ax.set_ylabel('Usage Rate (%)', fontsize=12)
        ax.set_xlabel('Model', fontsize=12)
        ax.set_ylim(0, 105)
        ax.grid(axis='y', alpha=0.3)

        plt.tight_layout()

        # Save the figure
        output_file = self.analysis_dir / "rag_usage.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"✅ RAG usage plot saved: {output_file}")
    def plot_overfitting_analysis(self, figsize=(12, 8)):
        """Overfitting analysis: In-Distribution vs Out-Distribution gaps."""
        # Use only successful responses
        df_success = self.df[self.df['success']].copy()

        fig, axes = plt.subplots(2, 2, figsize=figsize)

        # 1. Response time gap (Out minus In; green means faster out-of-distribution)
        time_pivot = df_success.pivot_table(
            values='elapsed_time',
            index='model',
            columns='distribution',
            aggfunc='mean'
        )
        if 'in_distribution' in time_pivot.columns and 'out_distribution' in time_pivot.columns:
            time_diff = time_pivot['out_distribution'] - time_pivot['in_distribution']
            axes[0, 0].bar(time_diff.index, time_diff.values,
                           color=['green' if x < 0 else 'red' for x in time_diff.values],
                           alpha=0.7)
            axes[0, 0].axhline(y=0, color='black', linestyle='-', linewidth=0.8)
            axes[0, 0].set_title('Response Time Gap (Out - In)', fontsize=12, fontweight='bold')
            axes[0, 0].set_ylabel('Time Difference (seconds)', fontsize=10)
            axes[0, 0].grid(axis='y', alpha=0.3)

        # 2. Token usage gap (Out minus In)
        token_pivot = df_success.pivot_table(
            values='total_tokens',
            index='model',
            columns='distribution',
            aggfunc='mean'
        )
        if 'in_distribution' in token_pivot.columns and 'out_distribution' in token_pivot.columns:
            token_diff = token_pivot['out_distribution'] - token_pivot['in_distribution']
            axes[0, 1].bar(token_diff.index, token_diff.values,
                           color=['green' if x < 0 else 'red' for x in token_diff.values],
                           alpha=0.7)
            axes[0, 1].axhline(y=0, color='black', linestyle='-', linewidth=0.8)
            axes[0, 1].set_title('Token Usage Gap (Out - In)', fontsize=12, fontweight='bold')
            axes[0, 1].set_ylabel('Token Difference', fontsize=10)
            axes[0, 1].grid(axis='y', alpha=0.3)

        # 3. Success rate comparison (computed over all results, not just successes)
        success_pivot = self.df.pivot_table(
            values='success',
            index='model',
            columns='distribution',
            aggfunc=lambda x: (x.sum() / len(x) * 100)
        )
        x = np.arange(len(success_pivot.index))
        width = 0.35
        if 'in_distribution' in success_pivot.columns:
            axes[1, 0].bar(x - width/2, success_pivot['in_distribution'], width,
                           label='In-Distribution', alpha=0.8, color='#2ecc71')
        if 'out_distribution' in success_pivot.columns:
            axes[1, 0].bar(x + width/2, success_pivot['out_distribution'], width,
                           label='Out-Distribution', alpha=0.8, color='#e74c3c')
        axes[1, 0].set_title('Success Rate Comparison', fontsize=12, fontweight='bold')
        axes[1, 0].set_ylabel('Success Rate (%)', fontsize=10)
        axes[1, 0].set_xticks(x)
        axes[1, 0].set_xticklabels(success_pivot.index, rotation=15, ha='right')
        axes[1, 0].legend()
        axes[1, 0].grid(axis='y', alpha=0.3)
        axes[1, 0].set_ylim(0, 105)

        # 4. Overfitting index (relative performance gap)
        if 'in_distribution' in time_pivot.columns and 'out_distribution' in time_pivot.columns:
            # Simple time-based index: (Out time - In time) / In time * 100.
            # Bars are colored by band: < 10% green, 10-30% orange, >= 30% red,
            # with dashed guide lines at the 10% and 30% thresholds.
            overfitting_index = ((time_pivot['out_distribution'] - time_pivot['in_distribution']) /
                                 time_pivot['in_distribution'] * 100)
            colors_custom = ['green' if x < 10 else 'orange' if x < 30 else 'red'
                             for x in overfitting_index.values]
            axes[1, 1].bar(overfitting_index.index, overfitting_index.values,
                           color=colors_custom, alpha=0.7)
            axes[1, 1].axhline(y=0, color='black', linestyle='-', linewidth=0.8)
            axes[1, 1].axhline(y=10, color='orange', linestyle='--', linewidth=0.8, alpha=0.5)
            axes[1, 1].axhline(y=30, color='red', linestyle='--', linewidth=0.8, alpha=0.5)
            axes[1, 1].set_title('Overfitting Index (Time-based)', fontsize=12, fontweight='bold')
            axes[1, 1].set_ylabel('Performance Gap (%)', fontsize=10)
            axes[1, 1].grid(axis='y', alpha=0.3)

        plt.tight_layout()

        # Save the figure
        output_file = self.analysis_dir / "overfitting_analysis.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"✅ Overfitting analysis plot saved: {output_file}")
    def generate_summary_report(self):
        """Generate a plain-text summary report."""
        report_file = self.analysis_dir / "summary_report.txt"

        with open(report_file, 'w', encoding='utf-8') as f:
            f.write("="*70 + "\n")
            f.write("RFPilot Model Comparison Experiment - Summary Analysis Report\n")
            f.write("="*70 + "\n\n")

            # Metadata
            metadata = self.results['metadata']
            f.write(f"Experiment timestamp: {metadata['timestamp']}\n")
            f.write(f"Distribution: {metadata['distribution']}\n")
            f.write(f"Models compared: {', '.join(metadata['models'])}\n")
            f.write(f"Total queries: {metadata['total_queries']}\n\n")

            # Use only successful responses for the performance sections
            df_success = self.df[self.df['success']]

            # 1. Average performance per model
            f.write("\n" + "="*70 + "\n")
            f.write("1. Average Performance by Model\n")
            f.write("="*70 + "\n\n")
            for model in df_success['model'].unique():
                model_df = df_success[df_success['model'] == model]
                f.write(f"[{model}]\n")
                f.write(f"  - Success rate: {len(model_df)/len(self.df[self.df['model']==model])*100:.1f}%\n")
                f.write(f"  - Avg response time: {model_df['elapsed_time'].mean():.3f}s\n")
                f.write(f"  - Avg tokens: {model_df['total_tokens'].mean():.1f}\n")
                f.write(f"  - RAG usage rate: {model_df['used_retrieval'].sum()/len(model_df)*100:.1f}%\n\n")

            # 2. Performance per distribution
            f.write("\n" + "="*70 + "\n")
            f.write("2. Performance by Distribution\n")
            f.write("="*70 + "\n\n")
            for dist in df_success['distribution'].unique():
                dist_df = df_success[df_success['distribution'] == dist]
                f.write(f"[{dist}]\n")
                f.write(f"  - Avg response time: {dist_df['elapsed_time'].mean():.3f}s\n")
                f.write(f"  - Avg tokens: {dist_df['total_tokens'].mean():.1f}\n\n")

            # 3. Recommendations
            f.write("\n" + "="*70 + "\n")
            f.write("3. Analysis and Recommendations\n")
            f.write("="*70 + "\n\n")

            # Fastest model (lowest mean response time)
            fastest_model = df_success.groupby('model')['elapsed_time'].mean().idxmin()
            f.write(f"⚡ Fastest model: {fastest_model}\n")

            # Most token-efficient model (lowest mean token usage)
            efficient_model = df_success.groupby('model')['total_tokens'].mean().idxmin()
            f.write(f"💡 Most token-efficient model: {efficient_model}\n")

            # Model that triggered retrieval most often
            rag_model = df_success.groupby('model')['used_retrieval'].sum().idxmax()
            f.write(f"🔍 Model using RAG the most: {rag_model}\n")

            f.write("\n" + "="*70 + "\n")

        print(f"✅ Summary report saved: {report_file}")
    def run_all_analysis(self):
        """Run every analysis step."""
        print("\n" + "="*70)
        print("Starting full analysis")
        print("="*70 + "\n")

        self.plot_time_comparison()
        self.plot_token_comparison()
        self.plot_rag_usage()
        self.plot_overfitting_analysis()
        self.generate_summary_report()

        print("\n" + "="*70)
        print("✅ All analyses complete!")
        print(f"   Results saved to: {self.analysis_dir}")
        print("="*70 + "\n")

def main():
    """Command-line entry point (for quick testing)."""
    import sys

    if len(sys.argv) < 2:
        print("Usage: python analyze_results.py <result_json_path>")
        return

    result_file = sys.argv[1]
    analyzer = ResultAnalyzer(result_file)
    analyzer.run_all_analysis()


if __name__ == "__main__":
    main()
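# Example invocations (the result path is hypothetical; point it at a JSON
# file produced by the experiment runner):
#
#   python analyze_results.py results/experiment_results.json
#
# or, from another module:
#
#   from analyze_results import ResultAnalyzer
#   ResultAnalyzer("results/experiment_results.json").run_all_analysis()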