Trouter-Library committed
Commit 361f597 · verified · 1 Parent(s): 98f2559

Create evaluate_dataset.py

Files changed (1):
  1. evaluate_dataset.py +405 -0
evaluate_dataset.py ADDED
"""
Helion 1.5 Dataset Evaluation Tool
===================================
Comprehensive evaluation and quality assessment for Helion 1.5 datasets.
"""

import json
import numpy as np
from typing import Dict, List, Any, Tuple
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from pathlib import Path
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
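
# NOTE: the record layouts below are inferred from the fields this script reads;
# they are an assumption about the Helion 1.5 JSONL format, not a documented schema.
# Each input file is expected to hold one JSON object per line.
#
#   conversations file : {"conversations": [{"role": "...", "content": "..."}, ...],
#                         "metadata": {"quality_score": 0.92, "domain": "...",
#                                      "difficulty": "...", "languages": ["en"],
#                                      "has_code": false}}
#   instructions file  : {"instruction": "...", "output": "...",
#                         "metadata": {"quality_score": 0.92, "task_type": "...",
#                                      "complexity": "...", "verified": true}}
#   code file          : {"language": "python", "solution": "...",
#                         "test_cases": ["..."], "explanation": "...",
#                         "metadata": {"quality_score": 0.92, "difficulty": "...",
#                                      "tags": ["..."]}}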


class HelionDatasetEvaluator:
    """Evaluate and analyze Helion 1.5 dataset quality"""

    def __init__(self):
        self.stats = defaultdict(list)
        self.issues = []

    def evaluate_conversations(self, filepath: str) -> Dict[str, Any]:
        """Evaluate conversation dataset"""
        logger.info(f"Evaluating conversations from {filepath}")

        metrics = {
            'total_examples': 0,
            'total_turns': [],
            'word_counts': [],
            'quality_scores': [],
            'domain_distribution': Counter(),
            'difficulty_distribution': Counter(),
            'language_distribution': Counter(),
            'has_code_count': 0,
            'avg_turns_per_conversation': 0,
            'avg_words_per_turn': 0,
            'quality_issues': []
        }

        with open(filepath, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                try:
                    data = json.loads(line.strip())
                    metrics['total_examples'] += 1

                    # Analyze conversations
                    if 'conversations' in data:
                        turns = data['conversations']
                        metrics['total_turns'].append(len(turns))

                        for turn in turns:
                            words = len(turn['content'].split())
                            metrics['word_counts'].append(words)

                    # Analyze metadata
                    if 'metadata' in data:
                        meta = data['metadata']

                        if 'quality_score' in meta:
                            metrics['quality_scores'].append(meta['quality_score'])

                        if 'domain' in meta:
                            metrics['domain_distribution'][meta['domain']] += 1

                        if 'difficulty' in meta:
                            metrics['difficulty_distribution'][meta['difficulty']] += 1

                        if 'languages' in meta:
                            for lang in meta['languages']:
                                metrics['language_distribution'][lang] += 1

                        if meta.get('has_code', False):
                            metrics['has_code_count'] += 1

                    # Check for quality issues
                    issues = self._check_conversation_quality(data)
                    if issues:
                        metrics['quality_issues'].append(
                            {'line': line_num, 'issues': issues}
                        )

                except json.JSONDecodeError:
                    logger.error(f"JSON decode error at line {line_num}")
                except Exception as e:
                    logger.error(f"Error at line {line_num}: {str(e)}")

        # Calculate averages
        if metrics['total_turns']:
            metrics['avg_turns_per_conversation'] = np.mean(metrics['total_turns'])
        if metrics['word_counts']:
            metrics['avg_words_per_turn'] = np.mean(metrics['word_counts'])
        if metrics['quality_scores']:
            metrics['avg_quality_score'] = np.mean(metrics['quality_scores'])
            metrics['min_quality_score'] = np.min(metrics['quality_scores'])
            metrics['max_quality_score'] = np.max(metrics['quality_scores'])

        return metrics

    def evaluate_instructions(self, filepath: str) -> Dict[str, Any]:
        """Evaluate instruction dataset"""
        logger.info(f"Evaluating instructions from {filepath}")

        metrics = {
            'total_examples': 0,
            'instruction_lengths': [],
            'output_lengths': [],
            'quality_scores': [],
            'task_type_distribution': Counter(),
            'complexity_distribution': Counter(),
            'verified_count': 0,
            'quality_issues': []
        }

        with open(filepath, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                try:
                    data = json.loads(line.strip())
                    metrics['total_examples'] += 1

                    if 'instruction' in data:
                        metrics['instruction_lengths'].append(len(data['instruction'].split()))

                    if 'output' in data:
                        metrics['output_lengths'].append(len(data['output'].split()))

                    if 'metadata' in data:
                        meta = data['metadata']

                        if 'quality_score' in meta:
                            metrics['quality_scores'].append(meta['quality_score'])

                        if 'task_type' in meta:
                            metrics['task_type_distribution'][meta['task_type']] += 1

                        if 'complexity' in meta:
                            metrics['complexity_distribution'][meta['complexity']] += 1

                        if meta.get('verified', False):
                            metrics['verified_count'] += 1

                    issues = self._check_instruction_quality(data)
                    if issues:
                        metrics['quality_issues'].append({
                            'line': line_num, 'issues': issues
                        })

                except Exception as e:
                    logger.error(f"Error at line {line_num}: {str(e)}")

        # Calculate statistics
        if metrics['instruction_lengths']:
            metrics['avg_instruction_length'] = np.mean(metrics['instruction_lengths'])
        if metrics['output_lengths']:
            metrics['avg_output_length'] = np.mean(metrics['output_lengths'])
        if metrics['quality_scores']:
            metrics['avg_quality_score'] = np.mean(metrics['quality_scores'])

        metrics['verification_rate'] = metrics['verified_count'] / max(metrics['total_examples'], 1)

        return metrics

    def evaluate_code(self, filepath: str) -> Dict[str, Any]:
        """Evaluate code dataset"""
        logger.info(f"Evaluating code from {filepath}")

        metrics = {
            'total_examples': 0,
            'language_distribution': Counter(),
            'difficulty_distribution': Counter(),
            'lines_of_code': [],
            'quality_scores': [],
            'has_test_cases': 0,
            'has_explanation': 0,
            'tag_distribution': Counter(),
            'quality_issues': []
        }

        with open(filepath, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                try:
                    data = json.loads(line.strip())
                    metrics['total_examples'] += 1

                    if 'language' in data:
                        metrics['language_distribution'][data['language']] += 1

                    if 'solution' in data:
                        loc = len(data['solution'].strip().split('\n'))
                        metrics['lines_of_code'].append(loc)

                    if 'test_cases' in data and data['test_cases']:
                        metrics['has_test_cases'] += 1

                    if 'explanation' in data and data['explanation']:
                        metrics['has_explanation'] += 1

                    if 'metadata' in data:
                        meta = data['metadata']

                        if 'quality_score' in meta:
                            metrics['quality_scores'].append(meta['quality_score'])

                        if 'difficulty' in meta:
                            metrics['difficulty_distribution'][meta['difficulty']] += 1

                        if 'tags' in meta:
                            for tag in meta['tags']:
                                metrics['tag_distribution'][tag] += 1

                    issues = self._check_code_quality(data)
                    if issues:
                        metrics['quality_issues'].append({
                            'line': line_num, 'issues': issues
                        })

                except Exception as e:
                    logger.error(f"Error at line {line_num}: {str(e)}")

        # Calculate statistics
        if metrics['lines_of_code']:
            metrics['avg_lines_of_code'] = np.mean(metrics['lines_of_code'])
            metrics['median_lines_of_code'] = np.median(metrics['lines_of_code'])

        if metrics['quality_scores']:
            metrics['avg_quality_score'] = np.mean(metrics['quality_scores'])

        metrics['test_case_coverage'] = metrics['has_test_cases'] / max(metrics['total_examples'], 1)
        metrics['explanation_coverage'] = metrics['has_explanation'] / max(metrics['total_examples'], 1)

        return metrics

    def _check_conversation_quality(self, data: Dict) -> List[str]:
        """Check conversation for quality issues"""
        issues = []

        if 'conversations' not in data:
            issues.append('Missing conversations field')
            return issues

        conversations = data['conversations']

        # Check for minimum turns
        if len(conversations) < 2:
            issues.append('Too few conversation turns')

        # Check turn structure
        for i, turn in enumerate(conversations):
            if 'role' not in turn:
                issues.append(f'Turn {i}: Missing role')
            if 'content' not in turn:
                issues.append(f'Turn {i}: Missing content')
            elif len(turn['content'].strip()) < 10:
                issues.append(f'Turn {i}: Content too short')

        # Check metadata
        if 'metadata' in data:
            meta = data['metadata']
            if 'quality_score' in meta and meta['quality_score'] < 0.75:
                issues.append(f"Low quality score: {meta['quality_score']}")
        else:
            issues.append('Missing metadata')

        return issues

    def _check_instruction_quality(self, data: Dict) -> List[str]:
        """Check instruction for quality issues"""
        issues = []

        if 'instruction' not in data:
            issues.append('Missing instruction')
        elif len(data['instruction'].strip()) < 10:
            issues.append('Instruction too short')

        if 'output' not in data:
            issues.append('Missing output')
        elif len(data['output'].strip()) < 10:
            issues.append('Output too short')

        if 'metadata' in data:
            meta = data['metadata']
            if 'quality_score' in meta and meta['quality_score'] < 0.75:
                issues.append(f"Low quality score: {meta['quality_score']}")

        return issues

    def _check_code_quality(self, data: Dict) -> List[str]:
        """Check code for quality issues"""
        issues = []

        if 'language' not in data:
            issues.append('Missing language field')

        if 'solution' not in data:
            issues.append('Missing solution')
        elif len(data['solution'].strip()) < 20:
            issues.append('Solution too short')

        if 'test_cases' not in data or not data['test_cases']:
            issues.append('Missing test cases')

        if 'explanation' not in data or not data['explanation']:
            issues.append('Missing explanation')

        return issues

    def generate_report(self, output_dir: str = '.'):
        """Generate comprehensive evaluation report"""
        report = {
            'evaluation_date': str(np.datetime64('now')),
            'summary': {},
            'recommendations': []
        }

        # Add recommendations based on findings
        report['recommendations'] = [
            'Maintain quality score above 0.85 for all datasets',
            'Ensure balanced domain distribution',
            'Include test cases for all code examples',
            'Add explanations to improve educational value',
            'Verify multilingual translations for accuracy',
            'Regular quality audits every quarter'
        ]

        # Save report (create the output directory if it does not exist yet)
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        report_path = output_path / 'evaluation_report.json'
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=2)

        logger.info(f"Evaluation report saved to {report_path}")
        return report

    def visualize_metrics(self, metrics: Dict, output_dir: str = '.'):
        """Create visualization charts"""
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True)

        # Quality score distribution
        if 'quality_scores' in metrics and metrics['quality_scores']:
            plt.figure(figsize=(10, 6))
            plt.hist(metrics['quality_scores'], bins=20, edgecolor='black')
            plt.xlabel('Quality Score')
            plt.ylabel('Frequency')
            plt.title('Quality Score Distribution')
            plt.savefig(output_dir / 'quality_scores.png')
            plt.close()

        # Domain distribution
        if 'domain_distribution' in metrics:
            plt.figure(figsize=(12, 6))
            domains = list(metrics['domain_distribution'].keys())
            counts = list(metrics['domain_distribution'].values())
            plt.bar(domains, counts)
            plt.xlabel('Domain')
            plt.ylabel('Count')
            plt.title('Domain Distribution')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            plt.savefig(output_dir / 'domain_distribution.png')
            plt.close()

        logger.info(f"Visualizations saved to {output_dir}")

    def compare_with_helion1(self, helion15_metrics: Dict, helion1_metrics: Dict) -> Dict:
        """Compare Helion 1.5 with Helion 1"""
        comparison = {
            'improvements': [],
            'metrics_comparison': {}
        }

        # Compare key metrics
        if 'avg_quality_score' in helion15_metrics and 'avg_quality_score' in helion1_metrics:
            improvement = ((helion15_metrics['avg_quality_score'] - helion1_metrics['avg_quality_score'])
                           / helion1_metrics['avg_quality_score'] * 100)
            comparison['improvements'].append(
                f"Quality score improved by {improvement:.2f}%"
            )

        return comparison


def main():
    """Main evaluation pipeline"""
    evaluator = HelionDatasetEvaluator()

    # Evaluate different datasets
    # conv_metrics = evaluator.evaluate_conversations('helion-1.5-conversations.jsonl')
    # inst_metrics = evaluator.evaluate_instructions('helion-1.5-instructions.jsonl')
    # code_metrics = evaluator.evaluate_code('helion-1.5-code.jsonl')

    # Generate visualizations
    # evaluator.visualize_metrics(conv_metrics, 'evaluation_results')

    # Generate report
    report = evaluator.generate_report('evaluation_results')

    logger.info("Evaluation complete!")
    logger.info("Check 'evaluation_results' directory for detailed reports")


if __name__ == '__main__':
    main()
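
# Example (manual run): evaluating a single split by hand instead of via main().
# The filename below is the same placeholder used in main() and is assumed, not a
# file shipped with this repository.
#
#     evaluator = HelionDatasetEvaluator()
#     metrics = evaluator.evaluate_code('helion-1.5-code.jsonl')
#     print(metrics['total_examples'],
#           metrics.get('avg_lines_of_code'),
#           metrics['test_case_coverage'])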