Trouter-Library committed on
Commit 055cc2e · verified · 1 Parent(s): ee0eed2

Delete evaluate_dataset.py

Files changed (1)
  1. evaluate_dataset.py +0 -405
evaluate_dataset.py DELETED
@@ -1,405 +0,0 @@
- """
- Helion 1.5 Dataset Evaluation Tool
- ==================================
- Comprehensive evaluation and quality assessment for Helion 1.5 datasets.
- """
-
- import json
- import numpy as np
- from typing import Dict, List, Any
- from collections import Counter, defaultdict
- import matplotlib.pyplot as plt
- from pathlib import Path
- import logging
-
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
-
- class HelionDatasetEvaluator:
-     """Evaluate and analyze Helion 1.5 dataset quality"""
-
-     def __init__(self):
-         self.stats = defaultdict(list)
-         self.issues = []
-
-     def evaluate_conversations(self, filepath: str) -> Dict[str, Any]:
-         """Evaluate conversation dataset"""
-         logger.info(f"Evaluating conversations from {filepath}")
-
-         metrics = {
-             'total_examples': 0,
-             'total_turns': [],
-             'word_counts': [],
-             'quality_scores': [],
-             'domain_distribution': Counter(),
-             'difficulty_distribution': Counter(),
-             'language_distribution': Counter(),
-             'has_code_count': 0,
-             'avg_turns_per_conversation': 0,
-             'avg_words_per_turn': 0,
-             'quality_issues': []
-         }
-
-         with open(filepath, 'r', encoding='utf-8') as f:
-             for line_num, line in enumerate(f, 1):
-                 try:
-                     data = json.loads(line.strip())
-                     metrics['total_examples'] += 1
-
-                     # Analyze conversations
-                     if 'conversations' in data:
-                         turns = data['conversations']
-                         metrics['total_turns'].append(len(turns))
-
-                         for turn in turns:
-                             words = len(turn['content'].split())
-                             metrics['word_counts'].append(words)
-
-                     # Analyze metadata
-                     if 'metadata' in data:
-                         meta = data['metadata']
-
-                         if 'quality_score' in meta:
-                             metrics['quality_scores'].append(meta['quality_score'])
-
-                         if 'domain' in meta:
-                             metrics['domain_distribution'][meta['domain']] += 1
-
-                         if 'difficulty' in meta:
-                             metrics['difficulty_distribution'][meta['difficulty']] += 1
-
-                         if 'languages' in meta:
-                             for lang in meta['languages']:
-                                 metrics['language_distribution'][lang] += 1
-
-                         if meta.get('has_code', False):
-                             metrics['has_code_count'] += 1
-
-                     # Check for quality issues
-                     issues = self._check_conversation_quality(data)
-                     if issues:
-                         metrics['quality_issues'].append(
-                             {'line': line_num, 'issues': issues}
-                         )
-
-                 except json.JSONDecodeError:
-                     logger.error(f"JSON decode error at line {line_num}")
-                 except Exception as e:
-                     logger.error(f"Error at line {line_num}: {str(e)}")
-
-         # Calculate averages
-         if metrics['total_turns']:
-             metrics['avg_turns_per_conversation'] = np.mean(metrics['total_turns'])
-         if metrics['word_counts']:
-             metrics['avg_words_per_turn'] = np.mean(metrics['word_counts'])
-         if metrics['quality_scores']:
-             metrics['avg_quality_score'] = np.mean(metrics['quality_scores'])
-             metrics['min_quality_score'] = np.min(metrics['quality_scores'])
-             metrics['max_quality_score'] = np.max(metrics['quality_scores'])
-
-         return metrics
-
-     def evaluate_instructions(self, filepath: str) -> Dict[str, Any]:
-         """Evaluate instruction dataset"""
-         logger.info(f"Evaluating instructions from {filepath}")
-
-         metrics = {
-             'total_examples': 0,
-             'instruction_lengths': [],
-             'output_lengths': [],
-             'quality_scores': [],
-             'task_type_distribution': Counter(),
-             'complexity_distribution': Counter(),
-             'verified_count': 0,
-             'quality_issues': []
-         }
-
-         with open(filepath, 'r', encoding='utf-8') as f:
-             for line_num, line in enumerate(f, 1):
-                 try:
-                     data = json.loads(line.strip())
-                     metrics['total_examples'] += 1
-
-                     if 'instruction' in data:
-                         metrics['instruction_lengths'].append(len(data['instruction'].split()))
-
-                     if 'output' in data:
-                         metrics['output_lengths'].append(len(data['output'].split()))
-
-                     if 'metadata' in data:
-                         meta = data['metadata']
-
-                         if 'quality_score' in meta:
-                             metrics['quality_scores'].append(meta['quality_score'])
-
-                         if 'task_type' in meta:
-                             metrics['task_type_distribution'][meta['task_type']] += 1
-
-                         if 'complexity' in meta:
-                             metrics['complexity_distribution'][meta['complexity']] += 1
-
-                         if meta.get('verified', False):
-                             metrics['verified_count'] += 1
-
-                     issues = self._check_instruction_quality(data)
-                     if issues:
-                         metrics['quality_issues'].append({
-                             'line': line_num, 'issues': issues
-                         })
-
-                 except Exception as e:
-                     logger.error(f"Error at line {line_num}: {str(e)}")
-
-         # Calculate statistics
-         if metrics['instruction_lengths']:
-             metrics['avg_instruction_length'] = np.mean(metrics['instruction_lengths'])
-         if metrics['output_lengths']:
-             metrics['avg_output_length'] = np.mean(metrics['output_lengths'])
-         if metrics['quality_scores']:
-             metrics['avg_quality_score'] = np.mean(metrics['quality_scores'])
-
-         metrics['verification_rate'] = metrics['verified_count'] / max(metrics['total_examples'], 1)
-
-         return metrics
-
-     def evaluate_code(self, filepath: str) -> Dict[str, Any]:
-         """Evaluate code dataset"""
-         logger.info(f"Evaluating code from {filepath}")
-
-         metrics = {
-             'total_examples': 0,
-             'language_distribution': Counter(),
-             'difficulty_distribution': Counter(),
-             'lines_of_code': [],
-             'quality_scores': [],
-             'has_test_cases': 0,
-             'has_explanation': 0,
-             'tag_distribution': Counter(),
-             'quality_issues': []
-         }
-
-         with open(filepath, 'r', encoding='utf-8') as f:
-             for line_num, line in enumerate(f, 1):
-                 try:
-                     data = json.loads(line.strip())
-                     metrics['total_examples'] += 1
-
-                     if 'language' in data:
-                         metrics['language_distribution'][data['language']] += 1
-
-                     if 'solution' in data:
-                         loc = len(data['solution'].strip().split('\n'))
-                         metrics['lines_of_code'].append(loc)
-
-                     if 'test_cases' in data and data['test_cases']:
-                         metrics['has_test_cases'] += 1
-
-                     if 'explanation' in data and data['explanation']:
-                         metrics['has_explanation'] += 1
-
-                     if 'metadata' in data:
-                         meta = data['metadata']
-
-                         if 'quality_score' in meta:
-                             metrics['quality_scores'].append(meta['quality_score'])
-
-                         if 'difficulty' in meta:
-                             metrics['difficulty_distribution'][meta['difficulty']] += 1
-
-                         if 'tags' in meta:
-                             for tag in meta['tags']:
-                                 metrics['tag_distribution'][tag] += 1
-
-                     issues = self._check_code_quality(data)
-                     if issues:
-                         metrics['quality_issues'].append({
-                             'line': line_num, 'issues': issues
-                         })
-
-                 except Exception as e:
-                     logger.error(f"Error at line {line_num}: {str(e)}")
-
-         # Calculate statistics
-         if metrics['lines_of_code']:
-             metrics['avg_lines_of_code'] = np.mean(metrics['lines_of_code'])
-             metrics['median_lines_of_code'] = np.median(metrics['lines_of_code'])
-
-         if metrics['quality_scores']:
-             metrics['avg_quality_score'] = np.mean(metrics['quality_scores'])
-
-         metrics['test_case_coverage'] = metrics['has_test_cases'] / max(metrics['total_examples'], 1)
-         metrics['explanation_coverage'] = metrics['has_explanation'] / max(metrics['total_examples'], 1)
-
-         return metrics
-
-     def _check_conversation_quality(self, data: Dict) -> List[str]:
-         """Check conversation for quality issues"""
-         issues = []
-
-         if 'conversations' not in data:
-             issues.append('Missing conversations field')
-             return issues
-
-         conversations = data['conversations']
-
-         # Check for minimum turns
-         if len(conversations) < 2:
-             issues.append('Too few conversation turns')
-
-         # Check turn structure
-         for i, turn in enumerate(conversations):
-             if 'role' not in turn:
-                 issues.append(f'Turn {i}: Missing role')
-             if 'content' not in turn:
-                 issues.append(f'Turn {i}: Missing content')
-             elif len(turn['content'].strip()) < 10:
-                 issues.append(f'Turn {i}: Content too short')
-
-         # Check metadata
-         if 'metadata' in data:
-             meta = data['metadata']
-             if 'quality_score' in meta and meta['quality_score'] < 0.75:
-                 issues.append(f"Low quality score: {meta['quality_score']}")
-         else:
-             issues.append('Missing metadata')
-
-         return issues
-
-     def _check_instruction_quality(self, data: Dict) -> List[str]:
-         """Check instruction for quality issues"""
-         issues = []
-
-         if 'instruction' not in data:
-             issues.append('Missing instruction')
-         elif len(data['instruction'].strip()) < 10:
-             issues.append('Instruction too short')
-
-         if 'output' not in data:
-             issues.append('Missing output')
-         elif len(data['output'].strip()) < 10:
-             issues.append('Output too short')
-
-         if 'metadata' in data:
-             meta = data['metadata']
-             if 'quality_score' in meta and meta['quality_score'] < 0.75:
-                 issues.append(f"Low quality score: {meta['quality_score']}")
-
-         return issues
-
-     def _check_code_quality(self, data: Dict) -> List[str]:
-         """Check code for quality issues"""
-         issues = []
-
-         if 'language' not in data:
-             issues.append('Missing language field')
-
-         if 'solution' not in data:
-             issues.append('Missing solution')
-         elif len(data['solution'].strip()) < 20:
-             issues.append('Solution too short')
-
-         if 'test_cases' not in data or not data['test_cases']:
-             issues.append('Missing test cases')
-
-         if 'explanation' not in data or not data['explanation']:
-             issues.append('Missing explanation')
-
-         return issues
-
-     def generate_report(self, output_dir: str = '.'):
-         """Generate comprehensive evaluation report"""
-         report = {
-             'evaluation_date': str(np.datetime64('now')),
-             'summary': {},
-             'recommendations': []
-         }
-
-         # Add recommendations based on findings
-         report['recommendations'] = [
-             'Maintain quality score above 0.85 for all datasets',
-             'Ensure balanced domain distribution',
-             'Include test cases for all code examples',
-             'Add explanations to improve educational value',
-             'Verify multilingual translations for accuracy',
-             'Run quality audits every quarter'
-         ]
-
-         # Save report (create the output directory first so the write cannot fail)
-         output_path = Path(output_dir) / 'evaluation_report.json'
-         output_path.parent.mkdir(parents=True, exist_ok=True)
-         with open(output_path, 'w') as f:
-             json.dump(report, f, indent=2)
-
-         logger.info(f"Evaluation report saved to {output_path}")
-         return report
-
-     def visualize_metrics(self, metrics: Dict, output_dir: str = '.'):
-         """Create visualization charts"""
-         output_dir = Path(output_dir)
-         output_dir.mkdir(parents=True, exist_ok=True)
-
-         # Quality score distribution
-         if 'quality_scores' in metrics and metrics['quality_scores']:
-             plt.figure(figsize=(10, 6))
-             plt.hist(metrics['quality_scores'], bins=20, edgecolor='black')
-             plt.xlabel('Quality Score')
-             plt.ylabel('Frequency')
-             plt.title('Quality Score Distribution')
-             plt.savefig(output_dir / 'quality_scores.png')
-             plt.close()
-
-         # Domain distribution (skip when the counter is empty, matching the guard above)
-         if 'domain_distribution' in metrics and metrics['domain_distribution']:
-             plt.figure(figsize=(12, 6))
-             domains = list(metrics['domain_distribution'].keys())
-             counts = list(metrics['domain_distribution'].values())
-             plt.bar(domains, counts)
-             plt.xlabel('Domain')
-             plt.ylabel('Count')
-             plt.title('Domain Distribution')
-             plt.xticks(rotation=45, ha='right')
-             plt.tight_layout()
-             plt.savefig(output_dir / 'domain_distribution.png')
-             plt.close()
-
-         logger.info(f"Visualizations saved to {output_dir}")
-
-     def compare_with_helion1(self, helion15_metrics: Dict, helion1_metrics: Dict) -> Dict:
-         """Compare Helion 1.5 with Helion 1"""
-         comparison = {
-             'improvements': [],
-             'metrics_comparison': {}
-         }
-
-         # Compare key metrics
-         if 'avg_quality_score' in helion15_metrics and 'avg_quality_score' in helion1_metrics:
-             improvement = ((helion15_metrics['avg_quality_score'] - helion1_metrics['avg_quality_score'])
-                            / helion1_metrics['avg_quality_score'] * 100)
-             comparison['improvements'].append(
-                 f"Quality score improved by {improvement:.2f}%"
-             )
-
-         return comparison
-
-
- def main():
-     """Main evaluation pipeline"""
-     evaluator = HelionDatasetEvaluator()
-
-     # Evaluate different datasets
-     # conv_metrics = evaluator.evaluate_conversations('helion-1.5-conversations.jsonl')
-     # inst_metrics = evaluator.evaluate_instructions('helion-1.5-instructions.jsonl')
-     # code_metrics = evaluator.evaluate_code('helion-1.5-code.jsonl')
-
-     # Generate visualizations
-     # evaluator.visualize_metrics(conv_metrics, 'evaluation_results')
-
-     # Generate report
-     evaluator.generate_report('evaluation_results')
-
-     logger.info("Evaluation complete!")
-     logger.info("Check 'evaluation_results' directory for detailed reports")
-
-
- if __name__ == '__main__':
-     main()
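
For reference, a minimal sketch of how the deleted evaluator could be driven before this commit, assuming the module was still importable as evaluate_dataset. The file name sample-conversations.jsonl and the record contents are illustrative, not taken from the repository; the record follows the JSONL layout evaluate_conversations parses above (one JSON object per line, with a conversations list of role/content turns and a metadata block).

import json

from evaluate_dataset import HelionDatasetEvaluator  # the module this commit removes

# One illustrative record in the layout evaluate_conversations() expects.
record = {
    "conversations": [
        {"role": "user", "content": "Explain how JSONL differs from a JSON array."},
        {"role": "assistant", "content": "JSONL stores one JSON object per line, so records can be streamed without parsing the whole file."}
    ],
    "metadata": {
        "quality_score": 0.9,      # >= 0.75, so no low-quality issue is flagged
        "domain": "data-formats",
        "difficulty": "easy",
        "languages": ["en"],
        "has_code": False
    }
}

# Write a one-record JSONL file (hypothetical name) and evaluate it.
with open("sample-conversations.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps(record) + "\n")

evaluator = HelionDatasetEvaluator()
metrics = evaluator.evaluate_conversations("sample-conversations.jsonl")
print(metrics["total_examples"])              # 1
print(metrics["avg_turns_per_conversation"])  # 2.0
print(metrics["quality_issues"])              # []

The instruction and code evaluators follow the same read-parse-tally pattern, so the equivalent calls differ only in the file name and the record fields they inspect.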