Prithvik-1 commited on
Commit
eada9ff
·
verified ·
1 Parent(s): bb9fa45

Upload scripts/validate_dataset.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/validate_dataset.py +201 -0
scripts/validate_dataset.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Dataset validation script for CodeLlama fine-tuning
4
+ Validates format, content, and quality of JSONL datasets
5
+ """
6
+
7
+ import json
8
+ import sys
9
+ from pathlib import Path
10
+ from typing import Dict, List, Tuple
11
+ from collections import Counter
12
+
13
def validate_dataset(input_file: str, min_length: int = 3) -> Dict:
    """Validate a JSONL instruction/response dataset for fine-tuning.

    Each line of *input_file* must be a JSON object with non-empty string
    fields ``"instruction"`` and ``"response"``, each at least *min_length*
    characters after stripping whitespace. Prints a human-readable report
    to stdout and returns a results dict.

    Args:
        input_file: Path to the JSONL file to validate.
        min_length: Minimum accepted length for each field (default: 3).

    Returns:
        Dict with keys "valid_samples" (line numbers), "invalid_samples",
        "errors", "warnings", and "statistics" (counts plus length stats).
    """
    print(f"🔍 Validating dataset: {input_file}")
    print("=" * 70)

    results = {
        "valid_samples": [],
        "invalid_samples": [],
        "errors": [],
        "warnings": [],
        "statistics": {}
    }

    total_lines = 0
    valid_count = 0
    invalid_count = 0

    # Length statistics are accumulated only from field values that pass
    # their own checks (even if the sibling field is invalid).
    instruction_lengths = []
    response_lengths = []
    has_code_markers = 0
    duplicates = []
    # Canonical JSON text of every sample seen so far. Storing the full
    # string (rather than its hash) rules out false duplicate reports
    # caused by hash collisions.
    seen_samples = set()

    print("\n📋 Checking each sample...")

    with open(input_file, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            total_lines += 1
            line = line.strip()

            # Blank lines are counted in total_lines but not validated.
            if not line:
                continue

            try:
                sample = json.loads(line)
            except json.JSONDecodeError as e:
                invalid_count += 1
                error_msg = f"Line {line_num}: Invalid JSON - {str(e)}"
                results["errors"].append(error_msg)
                results["invalid_samples"].append({"line": line_num, "error": error_msg})
                continue

            validation_errors = []

            # Required fields
            if "instruction" not in sample:
                validation_errors.append("Missing 'instruction' field")
            if "response" not in sample:
                validation_errors.append("Missing 'response' field")

            # Data types
            if "instruction" in sample and not isinstance(sample["instruction"], str):
                validation_errors.append("'instruction' must be a string")
            if "response" in sample and not isinstance(sample["response"], str):
                validation_errors.append("'response' must be a string")

            # Content checks — guarded with isinstance so a non-string
            # value is reported as a type error instead of crashing on
            # .strip() (the original raised AttributeError here).
            if isinstance(sample.get("instruction"), str):
                instruction = sample["instruction"].strip()
                if not instruction:
                    validation_errors.append("Empty 'instruction' field")
                elif len(instruction) < min_length:
                    validation_errors.append(f"'instruction' too short (< {min_length} chars)")
                else:
                    instruction_lengths.append(len(instruction))

            if isinstance(sample.get("response"), str):
                response = sample["response"].strip()
                if not response:
                    validation_errors.append("Empty 'response' field")
                elif len(response) < min_length:
                    validation_errors.append(f"'response' too short (< {min_length} chars)")
                else:
                    response_lengths.append(len(response))
                    # Any fenced block counts ('```verilog' is subsumed by '```').
                    if '```' in response:
                        has_code_markers += 1

            # Duplicate detection on the canonical (key-sorted) JSON form.
            canonical = json.dumps(sample, sort_keys=True)
            if canonical in seen_samples:
                duplicates.append(line_num)
                results["warnings"].append(f"Line {line_num}: Duplicate sample")
            else:
                seen_samples.add(canonical)

            # Record result
            if validation_errors:
                invalid_count += 1
                error_msg = f"Line {line_num}: {'; '.join(validation_errors)}"
                results["errors"].append(error_msg)
                results["invalid_samples"].append({"line": line_num, "errors": validation_errors})
            else:
                valid_count += 1
                results["valid_samples"].append(line_num)

    # Aggregate statistics (all averages guarded against empty lists).
    results["statistics"] = {
        "total_lines": total_lines,
        "valid_samples": valid_count,
        "invalid_samples": invalid_count,
        "duplicates": len(duplicates),
        "avg_instruction_length": sum(instruction_lengths) / len(instruction_lengths) if instruction_lengths else 0,
        "avg_response_length": sum(response_lengths) / len(response_lengths) if response_lengths else 0,
        "min_instruction_length": min(instruction_lengths) if instruction_lengths else 0,
        "max_instruction_length": max(instruction_lengths) if instruction_lengths else 0,
        "min_response_length": min(response_lengths) if response_lengths else 0,
        "max_response_length": max(response_lengths) if response_lengths else 0,
        "samples_with_code_markers": has_code_markers,
        "code_marker_percentage": (has_code_markers / valid_count * 100) if valid_count > 0 else 0
    }

    # Print results
    print(f"\n📊 Validation Results:")
    print("=" * 70)
    print(f"   Total lines: {total_lines}")
    print(f"   ✅ Valid samples: {valid_count}")
    print(f"   ❌ Invalid samples: {invalid_count}")
    print(f"   ⚠️ Duplicates: {len(duplicates)}")

    if instruction_lengths:
        print(f"\n📏 Instruction Statistics:")
        print(f"   Average length: {results['statistics']['avg_instruction_length']:.1f} chars")
        print(f"   Min/Max: {results['statistics']['min_instruction_length']} / {results['statistics']['max_instruction_length']} chars")

    if response_lengths:
        print(f"\n📏 Response Statistics:")
        print(f"   Average length: {results['statistics']['avg_response_length']:.1f} chars")
        print(f"   Min/Max: {results['statistics']['min_response_length']} / {results['statistics']['max_response_length']} chars")
        print(f"   Samples with code markers: {has_code_markers} ({results['statistics']['code_marker_percentage']:.1f}%)")

    if results["errors"]:
        print(f"\n❌ Errors ({len(results['errors'])}):")
        for error in results["errors"][:10]:  # Show first 10
            print(f"   {error}")
        if len(results["errors"]) > 10:
            print(f"   ... and {len(results['errors']) - 10} more errors")

    if results["warnings"]:
        print(f"\n⚠️ Warnings ({len(results['warnings'])}):")
        for warning in results["warnings"][:5]:  # Show first 5
            print(f"   {warning}")
        if len(results["warnings"]) > 5:
            print(f"   ... and {len(results['warnings']) - 5} more warnings")

    # Validation summary
    print(f"\n" + "=" * 70)
    if invalid_count == 0 and len(duplicates) == 0:
        print("✅ DATASET VALIDATION PASSED - Ready for training!")
    elif invalid_count == 0:
        print("⚠️ DATASET VALIDATION PASSED (with warnings about duplicates)")
    else:
        print("❌ DATASET VALIDATION FAILED - Fix errors before training")
    print("=" * 70)

    return results
172
+
173
+ if __name__ == "__main__":
174
+ import argparse
175
+
176
+ parser = argparse.ArgumentParser(description="Validate dataset for training")
177
+ parser.add_argument("--input", required=True, help="Input JSONL file to validate")
178
+ parser.add_argument("--report", help="Optional: Save validation report to JSON file")
179
+ parser.add_argument("--min-length", type=int, default=3, help="Minimum field length (default: 3)")
180
+
181
+ args = parser.parse_args()
182
+
183
+ if not Path(args.input).exists():
184
+ print(f"โŒ Error: File not found: {args.input}")
185
+ sys.exit(1)
186
+
187
+ results = validate_dataset(args.input, args.min_length)
188
+
189
+ # Save report if requested
190
+ if args.report:
191
+ with open(args.report, 'w') as f:
192
+ json.dump(results, f, indent=2)
193
+ print(f"\n๐Ÿ“„ Validation report saved to: {args.report}")
194
+
195
+ # Exit with appropriate code
196
+ if results["statistics"]["invalid_samples"] > 0:
197
+ sys.exit(1)
198
+ else:
199
+ sys.exit(0)
200
+
201
+