Elinnos
/

codellama-fine-tuning

Model card Files Files and versions

xet

Community

Prithvik-1 committed on Nov 25, 2025

Commit

eada9ff

verified ·

1 Parent(s): bb9fa45

Upload scripts/validate_dataset.py with huggingface_hub

Browse files

Files changed (1) hide show

scripts/validate_dataset.py +201 -0

scripts/validate_dataset.py ADDED Viewed

	@@ -0,0 +1,201 @@

+#!/usr/bin/env python3
+"""
+Dataset validation script for CodeLlama fine-tuning
+Validates format, content, and quality of JSONL datasets
+"""
+import json
+import sys
+from pathlib import Path
+from typing import Dict, List, Tuple
+from collections import Counter
def _validate_fields(sample: Dict, min_length: int) -> Tuple[List[str], Dict[str, str]]:
    """Check the required text fields of one parsed JSON object.

    Args:
        sample: A decoded JSONL record (must already be known to be a dict).
        min_length: Minimum number of characters a stripped field must have.

    Returns:
        A ``(errors, texts)`` pair. ``errors`` lists the problems found, in
        the order: missing fields, wrongly typed fields, then content
        problems. ``texts`` maps each field that passed every check to its
        whitespace-stripped value.
    """
    required = ("instruction", "response")
    errors: List[str] = []

    errors.extend(f"Missing '{name}' field" for name in required if name not in sample)
    errors.extend(
        f"'{name}' must be a string"
        for name in required
        if name in sample and not isinstance(sample[name], str)
    )

    texts: Dict[str, str] = {}
    for name in required:
        value = sample.get(name)
        if not isinstance(value, str):
            # Missing or mistyped values were reported above; calling
            # .strip() on them would raise instead of flagging the sample.
            continue
        stripped = value.strip()
        if not stripped:
            errors.append(f"Empty '{name}' field")
        elif len(stripped) < min_length:
            errors.append(f"'{name}' too short (< {min_length} chars)")
        else:
            texts[name] = stripped
    return errors, texts


def validate_dataset(input_file: str, min_length: int = 3) -> Dict:
    """Validate a JSONL dataset of instruction/response samples.

    Each non-blank line must be a JSON object with string ``instruction``
    and ``response`` fields whose stripped length is at least ``min_length``.
    Malformed lines, non-object JSON values and bad fields are recorded as
    invalid samples rather than aborting the run. Exact duplicates (same
    content after key-sorted serialization) are reported as warnings. A
    human-readable summary is printed to stdout.

    Args:
        input_file: Path of the JSONL file to read (decoded as UTF-8).
        min_length: Minimum stripped length for each text field.

    Returns:
        A JSON-serializable dict with the keys ``valid_samples`` (line
        numbers), ``invalid_samples`` (dicts with ``line`` and ``error`` or
        ``errors``), ``errors``, ``warnings`` and ``statistics``.
        ``code_marker_percentage`` is relative to the number of responses
        that passed the content checks, so it never exceeds 100.

    Raises:
        OSError: If ``input_file`` cannot be opened.
    """
    import hashlib  # local import: only needed for duplicate fingerprints

    print(f"🔍 Validating dataset: {input_file}")
    print("=" * 70)
    results = {
        "valid_samples": [],
        "invalid_samples": [],
        "errors": [],
        "warnings": [],
        "statistics": {}
    }
    total_lines = 0
    valid_count = 0
    invalid_count = 0
    # Statistics
    instruction_lengths = []
    response_lengths = []
    has_code_markers = 0
    duplicates = []
    seen_samples = set()
    print("\n📋 Checking each sample...")
    with open(input_file, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            total_lines += 1  # blank lines are counted here but otherwise skipped
            line = line.strip()
            if not line:
                continue
            try:
                sample = json.loads(line)
            except json.JSONDecodeError as e:
                invalid_count += 1
                error_msg = f"Line {line_num}: Invalid JSON - {str(e)}"
                results["errors"].append(error_msg)
                results["invalid_samples"].append({"line": line_num, "error": error_msg})
                continue

            if isinstance(sample, dict):
                validation_errors, texts = _validate_fields(sample, min_length)
            else:
                # Valid JSON that is not an object (list, number, string...)
                # cannot carry the required fields.
                validation_errors = [
                    f"Sample must be a JSON object, got {type(sample).__name__}"
                ]
                texts = {}

            if "instruction" in texts:
                instruction_lengths.append(len(texts["instruction"]))
            if "response" in texts:
                response = texts["response"]
                response_lengths.append(len(response))
                # Any fenced block counts; '```verilog' is a special case of '```'.
                if '```' in response:
                    has_code_markers += 1

            # Check for duplicates using a stable content digest; the built-in
            # hash() is only 64 bits wide and could collide on distinct samples.
            canonical = json.dumps(sample, sort_keys=True)
            fingerprint = hashlib.sha256(canonical.encode("utf-8")).digest()
            if fingerprint in seen_samples:
                duplicates.append(line_num)
                results["warnings"].append(f"Line {line_num}: Duplicate sample")
            else:
                seen_samples.add(fingerprint)

            # Record result
            if validation_errors:
                invalid_count += 1
                error_msg = f"Line {line_num}: {'; '.join(validation_errors)}"
                results["errors"].append(error_msg)
                results["invalid_samples"].append({"line": line_num, "errors": validation_errors})
            else:
                valid_count += 1
                results["valid_samples"].append(line_num)

    # Calculate statistics
    results["statistics"] = {
        "total_lines": total_lines,
        "valid_samples": valid_count,
        "invalid_samples": invalid_count,
        "duplicates": len(duplicates),
        "avg_instruction_length": sum(instruction_lengths) / len(instruction_lengths) if instruction_lengths else 0,
        "avg_response_length": sum(response_lengths) / len(response_lengths) if response_lengths else 0,
        "min_instruction_length": min(instruction_lengths) if instruction_lengths else 0,
        "max_instruction_length": max(instruction_lengths) if instruction_lengths else 0,
        "min_response_length": min(response_lengths) if response_lengths else 0,
        "max_response_length": max(response_lengths) if response_lengths else 0,
        "samples_with_code_markers": has_code_markers,
        # Same population as the numerator (responses that passed the content
        # checks), so the ratio stays within 0-100.
        "code_marker_percentage": (has_code_markers / len(response_lengths) * 100) if response_lengths else 0
    }
    stats = results["statistics"]

    # Print results
    print(f"\n📊 Validation Results:")
    print("=" * 70)
    print(f"   Total lines: {total_lines}")
    print(f"   ✅ Valid samples: {valid_count}")
    print(f"   ❌ Invalid samples: {invalid_count}")
    print(f"   ⚠️  Duplicates: {len(duplicates)}")
    if instruction_lengths:
        print(f"\n📏 Instruction Statistics:")
        print(f"   Average length: {stats['avg_instruction_length']:.1f} chars")
        print(f"   Min/Max: {stats['min_instruction_length']} / {stats['max_instruction_length']} chars")
    if response_lengths:
        print(f"\n📏 Response Statistics:")
        print(f"   Average length: {stats['avg_response_length']:.1f} chars")
        print(f"   Min/Max: {stats['min_response_length']} / {stats['max_response_length']} chars")
        print(f"   Samples with code markers: {has_code_markers} ({stats['code_marker_percentage']:.1f}%)")
    if results["errors"]:
        print(f"\n❌ Errors ({len(results['errors'])}):")
        for error in results["errors"][:10]:  # Show first 10
            print(f"   {error}")
        if len(results["errors"]) > 10:
            print(f"   ... and {len(results['errors']) - 10} more errors")
    if results["warnings"]:
        print(f"\n⚠️  Warnings ({len(results['warnings'])}):")
        for warning in results["warnings"][:5]:  # Show first 5
            print(f"   {warning}")
        if len(results["warnings"]) > 5:
            print(f"   ... and {len(results['warnings']) - 5} more warnings")

    # Validation summary
    print(f"\n" + "=" * 70)
    if invalid_count == 0 and len(duplicates) == 0:
        print("✅ DATASET VALIDATION PASSED - Ready for training!")
    elif invalid_count == 0:
        print("⚠️  DATASET VALIDATION PASSED (with warnings about duplicates)")
    else:
        print("❌ DATASET VALIDATION FAILED - Fix errors before training")
    print("=" * 70)
    return results
if __name__ == "__main__":
    # Command-line entry point: validate one JSONL file, optionally write the
    # full results dict as a JSON report, and exit non-zero when any sample
    # is invalid so the script can gate a training pipeline.
    import argparse

    parser = argparse.ArgumentParser(description="Validate dataset for training")
    parser.add_argument("--input", required=True, help="Input JSONL file to validate")
    parser.add_argument("--report", help="Optional: Save validation report to JSON file")
    parser.add_argument("--min-length", type=int, default=3, help="Minimum field length (default: 3)")
    args = parser.parse_args()

    input_path = Path(args.input)
    if not input_path.exists():
        print(f"❌ Error: File not found: {args.input}")
        sys.exit(1)
    if not input_path.is_file():
        # exists() is also true for directories, which would otherwise
        # surface as an uncaught error when the path is opened for reading.
        print(f"❌ Error: Not a regular file: {args.input}")
        sys.exit(1)

    results = validate_dataset(args.input, args.min_length)

    # Save report if requested
    if args.report:
        # Explicit encoding keeps the report independent of the platform locale.
        with open(args.report, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
        print(f"\n📄 Validation report saved to: {args.report}")

    # Exit with appropriate code: 1 if any sample failed validation, else 0.
    sys.exit(1 if results["statistics"]["invalid_samples"] > 0 else 0)