Spaces:
Sleeping
Sleeping
"""
Experiment result analysis and visualization.

Reads a JSON results file and produces a set of comparison plots
(response time, token usage, RAG usage, overfitting analysis) plus a
text summary report.
"""
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import Dict, List, Any
import numpy as np

# Matplotlib font setup. NOTE(review): 'DejaVu Sans' has no Hangul glyphs,
# so Korean text on plots would render as boxes — confirm intended font.
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['axes.unicode_minus'] = False
class ResultAnalyzer:
    """Experiment result analyzer.

    Loads a JSON results file, flattens it into a pandas DataFrame, and
    writes comparison plots plus a text summary report into an
    ``analysis`` directory created next to the input file.

    Attributes:
        result_file: Path to the input JSON file.
        results: Raw parsed JSON (expects 'results' and 'metadata' keys).
        df: Flat DataFrame with one row per query result.
        analysis_dir: Output directory for PNGs and the report.
    """

    def __init__(self, result_file_path: str):
        """
        Initialize the analyzer and create the output directory.

        Args:
            result_file_path: Path to the JSON results file.
        """
        self.result_file = Path(result_file_path)
        self.results = self._load_results()
        self.df = self._results_to_dataframe()
        # Directory where all analysis artifacts (plots, report) are saved.
        self.analysis_dir = self.result_file.parent / "analysis"
        self.analysis_dir.mkdir(parents=True, exist_ok=True)
        print(f"โ ResultAnalyzer ์ด๊ธฐํ ์๋ฃ")
        print(f" ๊ฒฐ๊ณผ ํ์ผ: {self.result_file}")
        print(f" ๋ถ์ ์ ์ฅ ๊ฒฝ๋ก: {self.analysis_dir}")

    def _load_results(self) -> Dict[str, Any]:
        """Load and parse the JSON results file."""
        with open(self.result_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _results_to_dataframe(self) -> pd.DataFrame:
        """Flatten the nested results into a DataFrame.

        One row per (distribution, result) pair. Optional fields fall
        back to defaults via ``dict.get``.

        NOTE(review): assumes ``self.results['results']`` maps a
        distribution name to a list of per-query result dicts — confirm
        against the producer of the JSON file.
        """
        all_rows = []
        for dist_type, dist_results in self.results['results'].items():
            for result in dist_results:
                row = {
                    'distribution': dist_type,
                    'model': result['model'],
                    'query': result['query'],
                    'category': result.get('category', 'unknown'),
                    'success': result['success'],
                    'elapsed_time': result['elapsed_time'],
                    'used_retrieval': result.get('used_retrieval', False),
                    'query_type': result.get('query_type', 'unknown'),
                    'search_mode': result.get('search_mode', 'none'),
                    'total_tokens': result.get('usage', {}).get('total_tokens', 0),
                    'prompt_tokens': result.get('usage', {}).get('prompt_tokens', 0),
                    'completion_tokens': result.get('usage', {}).get('completion_tokens', 0),
                }
                all_rows.append(row)
        return pd.DataFrame(all_rows)

    def plot_time_comparison(self, figsize=(12, 6)):
        """Plot response-time comparisons and save as time_comparison.png.

        Left panel: mean response time per model (error bars = std).
        Right panel: mean time per model, split by distribution.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
        # Only successful results are included in timing statistics.
        df_success = self.df[self.df['success'] == True].copy()
        # 1. Mean response time per model.
        model_time = df_success.groupby('model')['elapsed_time'].agg(['mean', 'std'])
        # NOTE(review): 3-element color list assumes exactly 3 models — confirm.
        ax1.bar(model_time.index, model_time['mean'], yerr=model_time['std'],
                capsize=5, alpha=0.7, color=['#2ecc71', '#3498db', '#e74c3c'])
        ax1.set_title('Average Response Time by Model', fontsize=14, fontweight='bold')
        ax1.set_ylabel('Time (seconds)', fontsize=12)
        ax1.set_xlabel('Model', fontsize=12)
        ax1.grid(axis='y', alpha=0.3)
        # 2. Response time per distribution, as grouped bars.
        pivot_data = df_success.pivot_table(
            values='elapsed_time',
            index='model',
            columns='distribution',
            aggfunc='mean'
        )
        x = np.arange(len(pivot_data.index))
        width = 0.35
        if 'in_distribution' in pivot_data.columns:
            ax2.bar(x - width/2, pivot_data['in_distribution'], width,
                    label='In-Distribution', alpha=0.8, color='#2ecc71')
        if 'out_distribution' in pivot_data.columns:
            ax2.bar(x + width/2, pivot_data['out_distribution'], width,
                    label='Out-Distribution', alpha=0.8, color='#e74c3c')
        ax2.set_title('Response Time: In vs Out Distribution', fontsize=14, fontweight='bold')
        ax2.set_ylabel('Time (seconds)', fontsize=12)
        ax2.set_xlabel('Model', fontsize=12)
        ax2.set_xticks(x)
        ax2.set_xticklabels(pivot_data.index, rotation=15, ha='right')
        ax2.legend()
        ax2.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        # Save figure and release it.
        output_file = self.analysis_dir / "time_comparison.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"โ ์๋ต ์๊ฐ ๊ทธ๋ํ ์ ์ฅ: {output_file}")

    def plot_token_comparison(self, figsize=(12, 6)):
        """Plot token-usage comparisons and save as token_comparison.png.

        Left panel: mean total tokens per model (error bars = std).
        Right panel: mean tokens per model, split by distribution.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
        # Only successful results are included in token statistics.
        df_success = self.df[self.df['success'] == True].copy()
        # 1. Mean token usage per model.
        model_tokens = df_success.groupby('model')['total_tokens'].agg(['mean', 'std'])
        # NOTE(review): 3-element color list assumes exactly 3 models — confirm.
        ax1.bar(model_tokens.index, model_tokens['mean'], yerr=model_tokens['std'],
                capsize=5, alpha=0.7, color=['#2ecc71', '#3498db', '#e74c3c'])
        ax1.set_title('Average Token Usage by Model', fontsize=14, fontweight='bold')
        ax1.set_ylabel('Tokens', fontsize=12)
        ax1.set_xlabel('Model', fontsize=12)
        ax1.grid(axis='y', alpha=0.3)
        # 2. Token usage per distribution, as grouped bars.
        pivot_data = df_success.pivot_table(
            values='total_tokens',
            index='model',
            columns='distribution',
            aggfunc='mean'
        )
        x = np.arange(len(pivot_data.index))
        width = 0.35
        if 'in_distribution' in pivot_data.columns:
            ax2.bar(x - width/2, pivot_data['in_distribution'], width,
                    label='In-Distribution', alpha=0.8, color='#2ecc71')
        if 'out_distribution' in pivot_data.columns:
            ax2.bar(x + width/2, pivot_data['out_distribution'], width,
                    label='Out-Distribution', alpha=0.8, color='#e74c3c')
        ax2.set_title('Token Usage: In vs Out Distribution', fontsize=14, fontweight='bold')
        ax2.set_ylabel('Tokens', fontsize=12)
        ax2.set_xlabel('Model', fontsize=12)
        ax2.set_xticks(x)
        ax2.set_xticklabels(pivot_data.index, rotation=15, ha='right')
        ax2.legend()
        ax2.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        # Save figure and release it.
        output_file = self.analysis_dir / "token_comparison.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"โ ํ ํฐ ์ฌ์ฉ๋ ๊ทธ๋ํ ์ ์ฅ: {output_file}")

    def plot_rag_usage(self, figsize=(10, 6)):
        """Plot per-model RAG usage rate (%) and save as rag_usage.png.

        Usage rate = share of rows (all rows, not just successes) where
        'used_retrieval' is truthy.
        """
        fig, ax = plt.subplots(figsize=figsize)
        # RAG usage rate per model, in percent.
        rag_usage = self.df.groupby('model').agg({
            'used_retrieval': lambda x: (x.sum() / len(x) * 100)
        }).round(2)
        # NOTE(review): 3-element color list assumes exactly 3 models — confirm.
        colors = ['#2ecc71', '#3498db', '#e74c3c']
        bars = ax.bar(rag_usage.index, rag_usage['used_retrieval'],
                      alpha=0.7, color=colors)
        # Annotate each bar with its percentage.
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:.1f}%',
                    ha='center', va='bottom', fontsize=11, fontweight='bold')
        ax.set_title('RAG Usage Rate by Model', fontsize=14, fontweight='bold')
        ax.set_ylabel('Usage Rate (%)', fontsize=12)
        ax.set_xlabel('Model', fontsize=12)
        # Headroom above 100% so the annotation on a full bar stays visible.
        ax.set_ylim(0, 105)
        ax.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        # Save figure and release it.
        output_file = self.analysis_dir / "rag_usage.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"โ RAG ์ฌ์ฉ ํจํด ๊ทธ๋ํ ์ ์ฅ: {output_file}")

    def plot_overfitting_analysis(self, figsize=(12, 8)):
        """Plot a 2x2 overfitting analysis and save as overfitting_analysis.png.

        Panels:
          [0,0] response-time gap (out - in) per model,
          [0,1] token-usage gap (out - in) per model,
          [1,0] success rate per model, split by distribution,
          [1,1] time-based overfitting index: (out - in) / in * 100,
                color-coded green (<10%), orange (<30%), red (>=30%).
        """
        # Timing/token panels use successful results only; success-rate
        # panel uses all rows.
        df_success = self.df[self.df['success'] == True].copy()
        fig, axes = plt.subplots(2, 2, figsize=figsize)
        # 1. Response-time gap per model.
        time_pivot = df_success.pivot_table(
            values='elapsed_time',
            index='model',
            columns='distribution',
            aggfunc='mean'
        )
        if 'in_distribution' in time_pivot.columns and 'out_distribution' in time_pivot.columns:
            time_diff = time_pivot['out_distribution'] - time_pivot['in_distribution']
            # Green = faster out-of-distribution, red = slower.
            axes[0, 0].bar(time_diff.index, time_diff.values,
                           color=['green' if x < 0 else 'red' for x in time_diff.values],
                           alpha=0.7)
            axes[0, 0].axhline(y=0, color='black', linestyle='-', linewidth=0.8)
            axes[0, 0].set_title('Response Time Gap (Out - In)', fontsize=12, fontweight='bold')
            axes[0, 0].set_ylabel('Time Difference (seconds)', fontsize=10)
            axes[0, 0].grid(axis='y', alpha=0.3)
        # 2. Token-usage gap per model.
        token_pivot = df_success.pivot_table(
            values='total_tokens',
            index='model',
            columns='distribution',
            aggfunc='mean'
        )
        if 'in_distribution' in token_pivot.columns and 'out_distribution' in token_pivot.columns:
            token_diff = token_pivot['out_distribution'] - token_pivot['in_distribution']
            axes[0, 1].bar(token_diff.index, token_diff.values,
                           color=['green' if x < 0 else 'red' for x in token_diff.values],
                           alpha=0.7)
            axes[0, 1].axhline(y=0, color='black', linestyle='-', linewidth=0.8)
            axes[0, 1].set_title('Token Usage Gap (Out - In)', fontsize=12, fontweight='bold')
            axes[0, 1].set_ylabel('Token Difference', fontsize=10)
            axes[0, 1].grid(axis='y', alpha=0.3)
        # 3. Success-rate comparison (computed over all rows).
        success_pivot = self.df.pivot_table(
            values='success',
            index='model',
            columns='distribution',
            aggfunc=lambda x: (x.sum() / len(x) * 100)
        )
        x = np.arange(len(success_pivot.index))
        width = 0.35
        if 'in_distribution' in success_pivot.columns:
            axes[1, 0].bar(x - width/2, success_pivot['in_distribution'], width,
                           label='In-Distribution', alpha=0.8, color='#2ecc71')
        if 'out_distribution' in success_pivot.columns:
            axes[1, 0].bar(x + width/2, success_pivot['out_distribution'], width,
                           label='Out-Distribution', alpha=0.8, color='#e74c3c')
        axes[1, 0].set_title('Success Rate Comparison', fontsize=12, fontweight='bold')
        axes[1, 0].set_ylabel('Success Rate (%)', fontsize=10)
        axes[1, 0].set_xticks(x)
        axes[1, 0].set_xticklabels(success_pivot.index, rotation=15, ha='right')
        axes[1, 0].legend()
        axes[1, 0].grid(axis='y', alpha=0.3)
        axes[1, 0].set_ylim(0, 105)
        # 4. Overfitting index (performance gap, time-based).
        if 'in_distribution' in time_pivot.columns and 'out_distribution' in time_pivot.columns:
            # Simple index: (out time - in time) / in time * 100.
            overfitting_index = ((time_pivot['out_distribution'] - time_pivot['in_distribution']) /
                                 time_pivot['in_distribution'] * 100)
            colors_custom = ['green' if x < 10 else 'orange' if x < 30 else 'red'
                             for x in overfitting_index.values]
            axes[1, 1].bar(overfitting_index.index, overfitting_index.values,
                           color=colors_custom, alpha=0.7)
            axes[1, 1].axhline(y=0, color='black', linestyle='-', linewidth=0.8)
            # Dashed guide lines at the 10% and 30% thresholds.
            axes[1, 1].axhline(y=10, color='orange', linestyle='--', linewidth=0.8, alpha=0.5)
            axes[1, 1].axhline(y=30, color='red', linestyle='--', linewidth=0.8, alpha=0.5)
            axes[1, 1].set_title('Overfitting Index (Time-based)', fontsize=12, fontweight='bold')
            axes[1, 1].set_ylabel('Performance Gap (%)', fontsize=10)
            axes[1, 1].grid(axis='y', alpha=0.3)
        plt.tight_layout()
        # Save figure and release it.
        output_file = self.analysis_dir / "overfitting_analysis.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"โ ๊ณผ์ ํฉ ๋ถ์ ๊ทธ๋ํ ์ ์ฅ: {output_file}")

    def generate_summary_report(self):
        """Write a plain-text summary report to analysis/summary_report.txt.

        Sections: experiment metadata, per-model averages, per-distribution
        averages, and simple recommendations (fastest / most token-efficient
        / heaviest RAG user).

        NOTE(review): assumes self.results['metadata'] contains 'timestamp',
        'distribution', 'models', and 'total_queries' — confirm against the
        producer of the JSON file.
        """
        report_file = self.analysis_dir / "summary_report.txt"
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write("="*70 + "\n")
            f.write("RFPilot ๋ชจ๋ธ ๋น๊ต ์คํ - ์ข ํฉ ๋ถ์ ๋ณด๊ณ ์\n")
            f.write("="*70 + "\n\n")
            # Experiment metadata.
            metadata = self.results['metadata']
            f.write(f"์คํ ์ผ์: {metadata['timestamp']}\n")
            f.write(f"๋ถํฌ: {metadata['distribution']}\n")
            f.write(f"๋น๊ต ๋ชจ๋ธ: {', '.join(metadata['models'])}\n")
            f.write(f"์ด ์ง๋ฌธ ์: {metadata['total_queries']}\n\n")
            # Statistics below use successful results only.
            df_success = self.df[self.df['success'] == True]
            # 1. Average performance per model.
            f.write("\n" + "="*70 + "\n")
            f.write("1. ๋ชจ๋ธ๋ณ ํ๊ท ์ฑ๋ฅ\n")
            f.write("="*70 + "\n\n")
            for model in df_success['model'].unique():
                model_df = df_success[df_success['model'] == model]
                f.write(f"[{model}]\n")
                # Success rate = successful rows / all rows for this model.
                f.write(f" - ์ฑ๊ณต๋ฅ : {len(model_df)/len(self.df[self.df['model']==model])*100:.1f}%\n")
                f.write(f" - ํ๊ท ์๋ต ์๊ฐ: {model_df['elapsed_time'].mean():.3f}์ด\n")
                f.write(f" - ํ๊ท ํ ํฐ: {model_df['total_tokens'].mean():.1f}\n")
                f.write(f" - RAG ์ฌ์ฉ๋ฅ : {model_df['used_retrieval'].sum()/len(model_df)*100:.1f}%\n\n")
            # 2. Performance per distribution.
            f.write("\n" + "="*70 + "\n")
            f.write("2. Distribution๋ณ ์ฑ๋ฅ ๋น๊ต\n")
            f.write("="*70 + "\n\n")
            for dist in df_success['distribution'].unique():
                dist_df = df_success[df_success['distribution'] == dist]
                f.write(f"[{dist}]\n")
                f.write(f" - ํ๊ท ์๋ต ์๊ฐ: {dist_df['elapsed_time'].mean():.3f}์ด\n")
                f.write(f" - ํ๊ท ํ ํฐ: {dist_df['total_tokens'].mean():.1f}\n\n")
            # 3. Recommendations.
            f.write("\n" + "="*70 + "\n")
            f.write("3. ๋ถ์ ๋ฐ ๊ถ์ฅ์ฌํญ\n")
            f.write("="*70 + "\n\n")
            # Model with the lowest mean response time.
            fastest_model = df_success.groupby('model')['elapsed_time'].mean().idxmin()
            f.write(f"โก ๊ฐ์ฅ ๋น ๋ฅธ ๋ชจ๋ธ: {fastest_model}\n")
            # Model with the lowest mean token usage.
            efficient_model = df_success.groupby('model')['total_tokens'].mean().idxmin()
            f.write(f"๐ก ๊ฐ์ฅ ํจ์จ์ ์ธ ๋ชจ๋ธ (ํ ํฐ): {efficient_model}\n")
            # Model with the highest count of retrieval uses.
            rag_model = df_success.groupby('model')['used_retrieval'].sum().idxmax()
            f.write(f"๐ RAG๋ฅผ ๊ฐ์ฅ ๋ง์ด ํ์ฉํ๋ ๋ชจ๋ธ: {rag_model}\n")
            f.write("\n" + "="*70 + "\n")
        print(f"โ ์ข ํฉ ๋ณด๊ณ ์ ์ ์ฅ: {report_file}")

    def run_all_analysis(self):
        """Run every plot method plus the summary report, with progress output."""
        print("\n" + "="*70)
        print("์ ์ฒด ๋ถ์ ์์")
        print("="*70 + "\n")
        self.plot_time_comparison()
        self.plot_token_comparison()
        self.plot_rag_usage()
        self.plot_overfitting_analysis()
        self.generate_summary_report()
        print("\n" + "="*70)
        print("โ ๋ชจ๋ ๋ถ์ ์๋ฃ!")
        print(f" ๊ฒฐ๊ณผ ์ ์ฅ ์์น: {self.analysis_dir}")
        print("="*70 + "\n")
def main():
    """Command-line entry point.

    Expects the path to a JSON results file as the sole positional
    argument; prints a usage message and exits quietly when it is
    missing.
    """
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("์ฌ์ฉ๋ฒ: python analyze_results.py <result_json_path>")
        return
    ResultAnalyzer(cli_args[0]).run_all_analysis()


if __name__ == "__main__":
    main()