#!/usr/bin/env python3
"""
Dataset validation script for CodeLlama fine-tuning
Validates format, content, and quality of JSONL datasets
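
Expected input: one JSON object per line with string "instruction" and
"response" fields (responses may wrap code in ``` fences), for example:

    {"instruction": "Write a 2:1 mux in Verilog", "response": "module mux2 ..."}

Usage (the script name is illustrative):
    python validate_dataset.py --input dataset.jsonl
    python validate_dataset.py --input dataset.jsonl --report report.json --min-length 10

Exits with status 0 if every sample is valid, 1 otherwise.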
"""

import json
import sys
from pathlib import Path
from typing import Dict

def validate_dataset(input_file: str, min_length: int = 3) -> Dict:
    """Comprehensive dataset validation"""
    
    print(f"๐Ÿ” Validating dataset: {input_file}")
    print("=" * 70)
    
    results = {
        "valid_samples": [],
        "invalid_samples": [],
        "errors": [],
        "warnings": [],
        "statistics": {}
    }
    
    total_lines = 0
    valid_count = 0
    invalid_count = 0
    
    # Statistics
    instruction_lengths = []
    response_lengths = []
    has_code_markers = 0
    duplicates = []
    seen_samples = set()
    
    print("\n๐Ÿ“‹ Checking each sample...")
    
    with open(input_file, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            total_lines += 1
            line = line.strip()
            
            if not line:
                continue
            
            sample = None
            try:
                sample = json.loads(line)
            except json.JSONDecodeError as e:
                invalid_count += 1
                error_msg = f"Line {line_num}: Invalid JSON - {str(e)}"
                results["errors"].append(error_msg)
                results["invalid_samples"].append({"line": line_num, "error": error_msg})
                continue
            
            # Validate fields
            validation_errors = []
            
            # Check required fields
            if "instruction" not in sample:
                validation_errors.append("Missing 'instruction' field")
            if "response" not in sample:
                validation_errors.append("Missing 'response' field")
            
            # Check data types
            if "instruction" in sample and not isinstance(sample["instruction"], str):
                validation_errors.append("'instruction' must be a string")
            if "response" in sample and not isinstance(sample["response"], str):
                validation_errors.append("'response' must be a string")
            
            # Check content
            if "instruction" in sample:
                instruction = sample["instruction"].strip()
                if not instruction:
                    validation_errors.append("Empty 'instruction' field")
                elif len(instruction) < min_length:
                    validation_errors.append(f"'instruction' too short (< {min_length} chars)")
                else:
                    instruction_lengths.append(len(instruction))
            
            if "response" in sample:
                response = sample["response"].strip()
                if not response:
                    validation_errors.append("Empty 'response' field")
                elif len(response) < min_length:
                    validation_errors.append(f"'response' too short (< {min_length} chars)")
                else:
                    response_lengths.append(len(response))
                    # Any ``` fence (including ```verilog) counts as a code marker
                    if '```' in response:
                        has_code_markers += 1
            
            # Check for duplicates using the canonical JSON string as the key
            # (rather than its hash, so collisions cannot mislabel samples)
            sample_key = json.dumps(sample, sort_keys=True)
            if sample_key in seen_samples:
                duplicates.append(line_num)
                results["warnings"].append(f"Line {line_num}: Duplicate sample")
            else:
                seen_samples.add(sample_key)
            
            # Record result
            if validation_errors:
                invalid_count += 1
                error_msg = f"Line {line_num}: {'; '.join(validation_errors)}"
                results["errors"].append(error_msg)
                results["invalid_samples"].append({"line": line_num, "errors": validation_errors})
            else:
                valid_count += 1
                results["valid_samples"].append(line_num)
    
    # Calculate statistics
    results["statistics"] = {
        "total_lines": total_lines,
        "valid_samples": valid_count,
        "invalid_samples": invalid_count,
        "duplicates": len(duplicates),
        "avg_instruction_length": sum(instruction_lengths) / len(instruction_lengths) if instruction_lengths else 0,
        "avg_response_length": sum(response_lengths) / len(response_lengths) if response_lengths else 0,
        "min_instruction_length": min(instruction_lengths) if instruction_lengths else 0,
        "max_instruction_length": max(instruction_lengths) if instruction_lengths else 0,
        "min_response_length": min(response_lengths) if response_lengths else 0,
        "max_response_length": max(response_lengths) if response_lengths else 0,
        "samples_with_code_markers": has_code_markers,
        "code_marker_percentage": (has_code_markers / valid_count * 100) if valid_count > 0 else 0
    }
    
    # Print results
    print(f"\n๐Ÿ“Š Validation Results:")
    print("=" * 70)
    print(f"   Total lines: {total_lines}")
    print(f"   โœ… Valid samples: {valid_count}")
    print(f"   โŒ Invalid samples: {invalid_count}")
    print(f"   โš ๏ธ  Duplicates: {len(duplicates)}")
    
    if instruction_lengths:
        print(f"\n๐Ÿ“ Instruction Statistics:")
        print(f"   Average length: {results['statistics']['avg_instruction_length']:.1f} chars")
        print(f"   Min/Max: {results['statistics']['min_instruction_length']} / {results['statistics']['max_instruction_length']} chars")
    
    if response_lengths:
        print(f"\n๐Ÿ“ Response Statistics:")
        print(f"   Average length: {results['statistics']['avg_response_length']:.1f} chars")
        print(f"   Min/Max: {results['statistics']['min_response_length']} / {results['statistics']['max_response_length']} chars")
        print(f"   Samples with code markers: {has_code_markers} ({results['statistics']['code_marker_percentage']:.1f}%)")
    
    if results["errors"]:
        print(f"\nโŒ Errors ({len(results['errors'])}):")
        for error in results["errors"][:10]:  # Show first 10
            print(f"   {error}")
        if len(results["errors"]) > 10:
            print(f"   ... and {len(results['errors']) - 10} more errors")
    
    if results["warnings"]:
        print(f"\nโš ๏ธ  Warnings ({len(results['warnings'])}):")
        for warning in results["warnings"][:5]:  # Show first 5
            print(f"   {warning}")
        if len(results["warnings"]) > 5:
            print(f"   ... and {len(results['warnings']) - 5} more warnings")
    
    # Validation summary
    print(f"\n" + "=" * 70)
    if invalid_count == 0 and len(duplicates) == 0:
        print("โœ… DATASET VALIDATION PASSED - Ready for training!")
    elif invalid_count == 0:
        print("โš ๏ธ  DATASET VALIDATION PASSED (with warnings about duplicates)")
    else:
        print("โŒ DATASET VALIDATION FAILED - Fix errors before training")
    print("=" * 70)
    
    return results

if __name__ == "__main__":
    import argparse
    
    parser = argparse.ArgumentParser(description="Validate dataset for training")
    parser.add_argument("--input", required=True, help="Input JSONL file to validate")
    parser.add_argument("--report", help="Optional: Save validation report to JSON file")
    parser.add_argument("--min-length", type=int, default=3, help="Minimum field length (default: 3)")
    
    args = parser.parse_args()
    
    if not Path(args.input).exists():
        print(f"โŒ Error: File not found: {args.input}")
        sys.exit(1)
    
    results = validate_dataset(args.input, args.min_length)
    
    # Save report if requested
    if args.report:
        with open(args.report, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
        print(f"\n📄 Validation report saved to: {args.report}")
    
    # Exit with appropriate code
    if results["statistics"]["invalid_samples"] > 0:
        sys.exit(1)
    else:
        sys.exit(0)