Dongjin1203 commited on
Commit
92dedf4
ยท
1 Parent(s): 338103b

Add ResultAnalyzer and fix import paths

Browse files
Files changed (2) hide show
  1. src/analyze_results.py +410 -0
  2. src/compare_models.py +1 -1
src/analyze_results.py CHANGED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Experiment result analysis and visualization.
3
+
4
+ Reads a JSON result file and generates a set of charts from it.
5
+ """
6
+
7
+ import json
8
+ import pandas as pd
9
+ import matplotlib.pyplot as plt
10
+ import seaborn as sns
11
+ from pathlib import Path
12
+ from typing import Dict, List, Any
13
+ import numpy as np
14
+
15
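+ # Font setup (matplotlib). NOTE(review): DejaVu Sans ships no Hangul glyphs, so chart text must stay in English unless a Korean-capable font is configured here.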
16
+ plt.rcParams['font.family'] = 'DejaVu Sans'
17
+ plt.rcParams['axes.unicode_minus'] = False
18
+
19
+
20
+ class ResultAnalyzer:
21
+ """์‹คํ—˜ ๊ฒฐ๊ณผ ๋ถ„์„ ํด๋ž˜์Šค"""
22
+
23
+ def __init__(self, result_file_path: str):
24
+ """
25
+ ์ดˆ๊ธฐํ™”
26
+
27
+ Args:
28
+ result_file_path: JSON ๊ฒฐ๊ณผ ํŒŒ์ผ ๊ฒฝ๋กœ
29
+ """
30
+ self.result_file = Path(result_file_path)
31
+ self.results = self._load_results()
32
+ self.df = self._results_to_dataframe()
33
+
34
+ # ๋ถ„์„ ๊ฒฐ๊ณผ ์ €์žฅ ๋””๋ ‰ํ† ๋ฆฌ
35
+ self.analysis_dir = self.result_file.parent / "analysis"
36
+ self.analysis_dir.mkdir(parents=True, exist_ok=True)
37
+
38
+ print(f"โœ… ResultAnalyzer ์ดˆ๊ธฐํ™” ์™„๋ฃŒ")
39
+ print(f" ๊ฒฐ๊ณผ ํŒŒ์ผ: {self.result_file}")
40
+ print(f" ๋ถ„์„ ์ €์žฅ ๊ฒฝ๋กœ: {self.analysis_dir}")
41
+
42
+ def _load_results(self) -> Dict[str, Any]:
43
+ """JSON ๊ฒฐ๊ณผ ํŒŒ์ผ ๋กœ๋“œ"""
44
+ with open(self.result_file, 'r', encoding='utf-8') as f:
45
+ return json.load(f)
46
+
47
+ def _results_to_dataframe(self) -> pd.DataFrame:
48
+ """๊ฒฐ๊ณผ๋ฅผ DataFrame์œผ๋กœ ๋ณ€ํ™˜"""
49
+ all_rows = []
50
+
51
+ for dist_type, dist_results in self.results['results'].items():
52
+ for result in dist_results:
53
+ row = {
54
+ 'distribution': dist_type,
55
+ 'model': result['model'],
56
+ 'query': result['query'],
57
+ 'category': result.get('category', 'unknown'),
58
+ 'success': result['success'],
59
+ 'elapsed_time': result['elapsed_time'],
60
+ 'used_retrieval': result.get('used_retrieval', False),
61
+ 'query_type': result.get('query_type', 'unknown'),
62
+ 'search_mode': result.get('search_mode', 'none'),
63
+ 'total_tokens': result.get('usage', {}).get('total_tokens', 0),
64
+ 'prompt_tokens': result.get('usage', {}).get('prompt_tokens', 0),
65
+ 'completion_tokens': result.get('usage', {}).get('completion_tokens', 0),
66
+ }
67
+ all_rows.append(row)
68
+
69
+ return pd.DataFrame(all_rows)
70
+
71
+ def plot_time_comparison(self, figsize=(12, 6)):
72
+ """์‘๋‹ต ์‹œ๊ฐ„ ๋น„๊ต ๊ทธ๋ž˜ํ”„"""
73
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
74
+
75
+ # ์„ฑ๊ณตํ•œ ๊ฒฐ๊ณผ๋งŒ ์‚ฌ์šฉ
76
+ df_success = self.df[self.df['success'] == True].copy()
77
+
78
+ # 1. ๋ชจ๋ธ๋ณ„ ํ‰๊ท  ์‘๋‹ต ์‹œ๊ฐ„
79
+ model_time = df_success.groupby('model')['elapsed_time'].agg(['mean', 'std'])
80
+
81
+ ax1.bar(model_time.index, model_time['mean'], yerr=model_time['std'],
82
+ capsize=5, alpha=0.7, color=['#2ecc71', '#3498db', '#e74c3c'])
83
+ ax1.set_title('Average Response Time by Model', fontsize=14, fontweight='bold')
84
+ ax1.set_ylabel('Time (seconds)', fontsize=12)
85
+ ax1.set_xlabel('Model', fontsize=12)
86
+ ax1.grid(axis='y', alpha=0.3)
87
+
88
+ # 2. Distribution๋ณ„ ์‘๋‹ต ์‹œ๊ฐ„
89
+ pivot_data = df_success.pivot_table(
90
+ values='elapsed_time',
91
+ index='model',
92
+ columns='distribution',
93
+ aggfunc='mean'
94
+ )
95
+
96
+ x = np.arange(len(pivot_data.index))
97
+ width = 0.35
98
+
99
+ if 'in_distribution' in pivot_data.columns:
100
+ ax2.bar(x - width/2, pivot_data['in_distribution'], width,
101
+ label='In-Distribution', alpha=0.8, color='#2ecc71')
102
+
103
+ if 'out_distribution' in pivot_data.columns:
104
+ ax2.bar(x + width/2, pivot_data['out_distribution'], width,
105
+ label='Out-Distribution', alpha=0.8, color='#e74c3c')
106
+
107
+ ax2.set_title('Response Time: In vs Out Distribution', fontsize=14, fontweight='bold')
108
+ ax2.set_ylabel('Time (seconds)', fontsize=12)
109
+ ax2.set_xlabel('Model', fontsize=12)
110
+ ax2.set_xticks(x)
111
+ ax2.set_xticklabels(pivot_data.index, rotation=15, ha='right')
112
+ ax2.legend()
113
+ ax2.grid(axis='y', alpha=0.3)
114
+
115
+ plt.tight_layout()
116
+
117
+ # ์ €์žฅ
118
+ output_file = self.analysis_dir / "time_comparison.png"
119
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
120
+ plt.close()
121
+
122
+ print(f"โœ… ์‘๋‹ต ์‹œ๊ฐ„ ๊ทธ๋ž˜ํ”„ ์ €์žฅ: {output_file}")
123
+
124
+ def plot_token_comparison(self, figsize=(12, 6)):
125
+ """ํ† ํฐ ์‚ฌ์šฉ๋Ÿ‰ ๋น„๊ต ๊ทธ๋ž˜ํ”„"""
126
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
127
+
128
+ # ์„ฑ๊ณตํ•œ ๊ฒฐ๊ณผ๋งŒ ์‚ฌ์šฉ
129
+ df_success = self.df[self.df['success'] == True].copy()
130
+
131
+ # 1. ๋ชจ๋ธ๋ณ„ ํ‰๊ท  ํ† ํฐ ์‚ฌ์šฉ๋Ÿ‰
132
+ model_tokens = df_success.groupby('model')['total_tokens'].agg(['mean', 'std'])
133
+
134
+ ax1.bar(model_tokens.index, model_tokens['mean'], yerr=model_tokens['std'],
135
+ capsize=5, alpha=0.7, color=['#2ecc71', '#3498db', '#e74c3c'])
136
+ ax1.set_title('Average Token Usage by Model', fontsize=14, fontweight='bold')
137
+ ax1.set_ylabel('Tokens', fontsize=12)
138
+ ax1.set_xlabel('Model', fontsize=12)
139
+ ax1.grid(axis='y', alpha=0.3)
140
+
141
+ # 2. Distribution๋ณ„ ํ† ํฐ ์‚ฌ์šฉ๋Ÿ‰
142
+ pivot_data = df_success.pivot_table(
143
+ values='total_tokens',
144
+ index='model',
145
+ columns='distribution',
146
+ aggfunc='mean'
147
+ )
148
+
149
+ x = np.arange(len(pivot_data.index))
150
+ width = 0.35
151
+
152
+ if 'in_distribution' in pivot_data.columns:
153
+ ax2.bar(x - width/2, pivot_data['in_distribution'], width,
154
+ label='In-Distribution', alpha=0.8, color='#2ecc71')
155
+
156
+ if 'out_distribution' in pivot_data.columns:
157
+ ax2.bar(x + width/2, pivot_data['out_distribution'], width,
158
+ label='Out-Distribution', alpha=0.8, color='#e74c3c')
159
+
160
+ ax2.set_title('Token Usage: In vs Out Distribution', fontsize=14, fontweight='bold')
161
+ ax2.set_ylabel('Tokens', fontsize=12)
162
+ ax2.set_xlabel('Model', fontsize=12)
163
+ ax2.set_xticks(x)
164
+ ax2.set_xticklabels(pivot_data.index, rotation=15, ha='right')
165
+ ax2.legend()
166
+ ax2.grid(axis='y', alpha=0.3)
167
+
168
+ plt.tight_layout()
169
+
170
+ # ์ €์žฅ
171
+ output_file = self.analysis_dir / "token_comparison.png"
172
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
173
+ plt.close()
174
+
175
+ print(f"โœ… ํ† ํฐ ์‚ฌ์šฉ๋Ÿ‰ ๊ทธ๋ž˜ํ”„ ์ €์žฅ: {output_file}")
176
+
177
+ def plot_rag_usage(self, figsize=(10, 6)):
178
+ """RAG ์‚ฌ์šฉ ํŒจํ„ด ๋ถ„์„"""
179
+ fig, ax = plt.subplots(figsize=figsize)
180
+
181
+ # ๋ชจ๋ธ๋ณ„ RAG ์‚ฌ์šฉ๋ฅ  ๊ณ„์‚ฐ
182
+ rag_usage = self.df.groupby('model').agg({
183
+ 'used_retrieval': lambda x: (x.sum() / len(x) * 100)
184
+ }).round(2)
185
+
186
+ colors = ['#2ecc71', '#3498db', '#e74c3c']
187
+ bars = ax.bar(rag_usage.index, rag_usage['used_retrieval'],
188
+ alpha=0.7, color=colors)
189
+
190
+ # ๋ง‰๋Œ€ ์œ„์— ํผ์„ผํŠธ ํ‘œ์‹œ
191
+ for bar in bars:
192
+ height = bar.get_height()
193
+ ax.text(bar.get_x() + bar.get_width()/2., height,
194
+ f'{height:.1f}%',
195
+ ha='center', va='bottom', fontsize=11, fontweight='bold')
196
+
197
+ ax.set_title('RAG Usage Rate by Model', fontsize=14, fontweight='bold')
198
+ ax.set_ylabel('Usage Rate (%)', fontsize=12)
199
+ ax.set_xlabel('Model', fontsize=12)
200
+ ax.set_ylim(0, 105)
201
+ ax.grid(axis='y', alpha=0.3)
202
+
203
+ plt.tight_layout()
204
+
205
+ # ์ €์žฅ
206
+ output_file = self.analysis_dir / "rag_usage.png"
207
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
208
+ plt.close()
209
+
210
+ print(f"โœ… RAG ์‚ฌ์šฉ ํŒจํ„ด ๊ทธ๋ž˜ํ”„ ์ €์žฅ: {output_file}")
211
+
212
+ def plot_overfitting_analysis(self, figsize=(12, 8)):
213
+ """๊ณผ์ ํ•ฉ ๋ถ„์„: In-Distribution vs Out-Distribution ์„ฑ๋Šฅ ์ฐจ์ด"""
214
+ # ์„ฑ๊ณตํ•œ ๊ฒฐ๊ณผ๋งŒ ์‚ฌ์šฉ
215
+ df_success = self.df[self.df['success'] == True].copy()
216
+
217
+ fig, axes = plt.subplots(2, 2, figsize=figsize)
218
+
219
+ # 1. ์‘๋‹ต ์‹œ๊ฐ„ ์ฐจ์ด
220
+ time_pivot = df_success.pivot_table(
221
+ values='elapsed_time',
222
+ index='model',
223
+ columns='distribution',
224
+ aggfunc='mean'
225
+ )
226
+
227
+ if 'in_distribution' in time_pivot.columns and 'out_distribution' in time_pivot.columns:
228
+ time_diff = time_pivot['out_distribution'] - time_pivot['in_distribution']
229
+
230
+ axes[0, 0].bar(time_diff.index, time_diff.values,
231
+ color=['green' if x < 0 else 'red' for x in time_diff.values],
232
+ alpha=0.7)
233
+ axes[0, 0].axhline(y=0, color='black', linestyle='-', linewidth=0.8)
234
+ axes[0, 0].set_title('Response Time Gap (Out - In)', fontsize=12, fontweight='bold')
235
+ axes[0, 0].set_ylabel('Time Difference (seconds)', fontsize=10)
236
+ axes[0, 0].grid(axis='y', alpha=0.3)
237
+
238
+ # 2. ํ† ํฐ ์‚ฌ์šฉ๋Ÿ‰ ์ฐจ์ด
239
+ token_pivot = df_success.pivot_table(
240
+ values='total_tokens',
241
+ index='model',
242
+ columns='distribution',
243
+ aggfunc='mean'
244
+ )
245
+
246
+ if 'in_distribution' in token_pivot.columns and 'out_distribution' in token_pivot.columns:
247
+ token_diff = token_pivot['out_distribution'] - token_pivot['in_distribution']
248
+
249
+ axes[0, 1].bar(token_diff.index, token_diff.values,
250
+ color=['green' if x < 0 else 'red' for x in token_diff.values],
251
+ alpha=0.7)
252
+ axes[0, 1].axhline(y=0, color='black', linestyle='-', linewidth=0.8)
253
+ axes[0, 1].set_title('Token Usage Gap (Out - In)', fontsize=12, fontweight='bold')
254
+ axes[0, 1].set_ylabel('Token Difference', fontsize=10)
255
+ axes[0, 1].grid(axis='y', alpha=0.3)
256
+
257
+ # 3. ์„ฑ๊ณต๋ฅ  ๋น„๊ต
258
+ success_pivot = self.df.pivot_table(
259
+ values='success',
260
+ index='model',
261
+ columns='distribution',
262
+ aggfunc=lambda x: (x.sum() / len(x) * 100)
263
+ )
264
+
265
+ x = np.arange(len(success_pivot.index))
266
+ width = 0.35
267
+
268
+ if 'in_distribution' in success_pivot.columns:
269
+ axes[1, 0].bar(x - width/2, success_pivot['in_distribution'], width,
270
+ label='In-Distribution', alpha=0.8, color='#2ecc71')
271
+
272
+ if 'out_distribution' in success_pivot.columns:
273
+ axes[1, 0].bar(x + width/2, success_pivot['out_distribution'], width,
274
+ label='Out-Distribution', alpha=0.8, color='#e74c3c')
275
+
276
+ axes[1, 0].set_title('Success Rate Comparison', fontsize=12, fontweight='bold')
277
+ axes[1, 0].set_ylabel('Success Rate (%)', fontsize=10)
278
+ axes[1, 0].set_xticks(x)
279
+ axes[1, 0].set_xticklabels(success_pivot.index, rotation=15, ha='right')
280
+ axes[1, 0].legend()
281
+ axes[1, 0].grid(axis='y', alpha=0.3)
282
+ axes[1, 0].set_ylim(0, 105)
283
+
284
+ # 4. ๊ณผ์ ํ•ฉ ์ง€์ˆ˜ (Performance Gap)
285
+ if 'in_distribution' in time_pivot.columns and 'out_distribution' in time_pivot.columns:
286
+ # ๊ฐ„๋‹จํ•œ ๊ณผ์ ํ•ฉ ์ง€์ˆ˜: (Out ์‹œ๊ฐ„ - In ์‹œ๊ฐ„) / In ์‹œ๊ฐ„ * 100
287
+ overfitting_index = ((time_pivot['out_distribution'] - time_pivot['in_distribution']) /
288
+ time_pivot['in_distribution'] * 100)
289
+
290
+ colors_custom = ['green' if x < 10 else 'orange' if x < 30 else 'red'
291
+ for x in overfitting_index.values]
292
+
293
+ axes[1, 1].bar(overfitting_index.index, overfitting_index.values,
294
+ color=colors_custom, alpha=0.7)
295
+ axes[1, 1].axhline(y=0, color='black', linestyle='-', linewidth=0.8)
296
+ axes[1, 1].axhline(y=10, color='orange', linestyle='--', linewidth=0.8, alpha=0.5)
297
+ axes[1, 1].axhline(y=30, color='red', linestyle='--', linewidth=0.8, alpha=0.5)
298
+ axes[1, 1].set_title('Overfitting Index (Time-based)', fontsize=12, fontweight='bold')
299
+ axes[1, 1].set_ylabel('Performance Gap (%)', fontsize=10)
300
+ axes[1, 1].grid(axis='y', alpha=0.3)
301
+
302
+ plt.tight_layout()
303
+
304
+ # ์ €์žฅ
305
+ output_file = self.analysis_dir / "overfitting_analysis.png"
306
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
307
+ plt.close()
308
+
309
+ print(f"โœ… ๊ณผ์ ํ•ฉ ๋ถ„์„ ๊ทธ๋ž˜ํ”„ ์ €์žฅ: {output_file}")
310
+
311
+ def generate_summary_report(self):
312
+ """์ข…ํ•ฉ ์š”์•ฝ ๋ณด๊ณ ์„œ ์ƒ์„ฑ"""
313
+ report_file = self.analysis_dir / "summary_report.txt"
314
+
315
+ with open(report_file, 'w', encoding='utf-8') as f:
316
+ f.write("="*70 + "\n")
317
+ f.write("RFPilot ๋ชจ๋ธ ๋น„๊ต ์‹คํ—˜ - ์ข…ํ•ฉ ๋ถ„์„ ๋ณด๊ณ ์„œ\n")
318
+ f.write("="*70 + "\n\n")
319
+
320
+ # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ
321
+ metadata = self.results['metadata']
322
+ f.write(f"์‹คํ—˜ ์ผ์‹œ: {metadata['timestamp']}\n")
323
+ f.write(f"๋ถ„ํฌ: {metadata['distribution']}\n")
324
+ f.write(f"๋น„๊ต ๋ชจ๋ธ: {', '.join(metadata['models'])}\n")
325
+ f.write(f"์ด ์งˆ๋ฌธ ์ˆ˜: {metadata['total_queries']}\n\n")
326
+
327
+ # ์„ฑ๊ณตํ•œ ๊ฒฐ๊ณผ๋งŒ
328
+ df_success = self.df[self.df['success'] == True]
329
+
330
+ # 1. ๋ชจ๋ธ๋ณ„ ํ‰๊ท  ์„ฑ๋Šฅ
331
+ f.write("\n" + "="*70 + "\n")
332
+ f.write("1. ๋ชจ๋ธ๋ณ„ ํ‰๊ท  ์„ฑ๋Šฅ\n")
333
+ f.write("="*70 + "\n\n")
334
+
335
+ for model in df_success['model'].unique():
336
+ model_df = df_success[df_success['model'] == model]
337
+
338
+ f.write(f"[{model}]\n")
339
+ f.write(f" - ์„ฑ๊ณต๋ฅ : {len(model_df)/len(self.df[self.df['model']==model])*100:.1f}%\n")
340
+ f.write(f" - ํ‰๊ท  ์‘๋‹ต ์‹œ๊ฐ„: {model_df['elapsed_time'].mean():.3f}์ดˆ\n")
341
+ f.write(f" - ํ‰๊ท  ํ† ํฐ: {model_df['total_tokens'].mean():.1f}\n")
342
+ f.write(f" - RAG ์‚ฌ์šฉ๋ฅ : {model_df['used_retrieval'].sum()/len(model_df)*100:.1f}%\n\n")
343
+
344
+ # 2. Distribution๋ณ„ ์„ฑ๋Šฅ
345
+ f.write("\n" + "="*70 + "\n")
346
+ f.write("2. Distribution๋ณ„ ์„ฑ๋Šฅ ๋น„๊ต\n")
347
+ f.write("="*70 + "\n\n")
348
+
349
+ for dist in df_success['distribution'].unique():
350
+ dist_df = df_success[df_success['distribution'] == dist]
351
+
352
+ f.write(f"[{dist}]\n")
353
+ f.write(f" - ํ‰๊ท  ์‘๋‹ต ์‹œ๊ฐ„: {dist_df['elapsed_time'].mean():.3f}์ดˆ\n")
354
+ f.write(f" - ํ‰๊ท  ํ† ํฐ: {dist_df['total_tokens'].mean():.1f}\n\n")
355
+
356
+ # 3. ๊ถŒ์žฅ์‚ฌํ•ญ
357
+ f.write("\n" + "="*70 + "\n")
358
+ f.write("3. ๋ถ„์„ ๋ฐ ๊ถŒ์žฅ์‚ฌํ•ญ\n")
359
+ f.write("="*70 + "\n\n")
360
+
361
+ # ๊ฐ€์žฅ ๋น ๋ฅธ ๋ชจ๋ธ
362
+ fastest_model = df_success.groupby('model')['elapsed_time'].mean().idxmin()
363
+ f.write(f"โšก ๊ฐ€์žฅ ๋น ๋ฅธ ๋ชจ๋ธ: {fastest_model}\n")
364
+
365
+ # ๊ฐ€์žฅ ํ† ํฐ์„ ์ ๊ฒŒ ์‚ฌ์šฉํ•˜๋Š” ๋ชจ๋ธ
366
+ efficient_model = df_success.groupby('model')['total_tokens'].mean().idxmin()
367
+ f.write(f"๐Ÿ’ก ๊ฐ€์žฅ ํšจ์œจ์ ์ธ ๋ชจ๋ธ (ํ† ํฐ): {efficient_model}\n")
368
+
369
+ # RAG๋ฅผ ๊ฐ€์žฅ ๋งŽ์ด ์‚ฌ์šฉํ•˜๋Š” ๋ชจ๋ธ
370
+ rag_model = df_success.groupby('model')['used_retrieval'].sum().idxmax()
371
+ f.write(f"๐Ÿ” RAG๋ฅผ ๊ฐ€์žฅ ๋งŽ์ด ํ™œ์šฉํ•˜๋Š” ๋ชจ๋ธ: {rag_model}\n")
372
+
373
+ f.write("\n" + "="*70 + "\n")
374
+
375
+ print(f"โœ… ์ข…ํ•ฉ ๋ณด๊ณ ์„œ ์ €์žฅ: {report_file}")
376
+
377
+ def run_all_analysis(self):
378
+ """๋ชจ๋“  ๋ถ„์„ ์‹คํ–‰"""
379
+ print("\n" + "="*70)
380
+ print("์ „์ฒด ๋ถ„์„ ์‹œ์ž‘")
381
+ print("="*70 + "\n")
382
+
383
+ self.plot_time_comparison()
384
+ self.plot_token_comparison()
385
+ self.plot_rag_usage()
386
+ self.plot_overfitting_analysis()
387
+ self.generate_summary_report()
388
+
389
+ print("\n" + "="*70)
390
+ print("โœ… ๋ชจ๋“  ๋ถ„์„ ์™„๋ฃŒ!")
391
+ print(f" ๊ฒฐ๊ณผ ์ €์žฅ ์œ„์น˜: {self.analysis_dir}")
392
+ print("="*70 + "\n")
393
+
394
+
395
def main():
    """Command-line entry point for manual testing.

    Expects the path of a result JSON file as the first command-line
    argument. Without it, a usage hint is printed and nothing else happens.
    """
    import sys

    if len(sys.argv) < 2:
        print("사용법: python analyze_results.py <result_json_path>")
        return

    result_file = sys.argv[1]

    analyzer = ResultAnalyzer(result_file)
    analyzer.run_all_analysis()
407
+
408
+
409
+ if __name__ == "__main__":
410
+ main()
src/compare_models.py CHANGED
@@ -26,7 +26,7 @@ project_root = Path(__file__).parent.parent
26
  sys.path.insert(0, str(project_root))
27
 
28
  from src.utils.config import RAGConfig
29
- from eval_dataset import EvalDataset
30
 
31
  # Logging setup
32
  logging.basicConfig(
 
26
  sys.path.insert(0, str(project_root))
27
 
28
  from src.utils.config import RAGConfig
29
+ from src.eval_dataset import EvalDataset
30
 
31
  # Logging setup
32
  logging.basicConfig(