# File size: 9,650 Bytes

# 7275aef

#!/usr/bin/env python3
"""
Unit tests for dataset loaders
"""

import json
import tempfile
import os
from pathlib import Path
import pytest
from datasets import Dataset
from training.data_loader import (
    load_wikitext,
    load_jsonl,
    load_hf_dataset,
    auto_load_dataset,
    _detect_jsonl_schema,
    _process_jsonl_item
)


class TestJSONLProcessing:
    """Checks for the JSONL schema-detection and item-processing helpers."""

    def test_detect_sft_schema(self):
        """An instruction/input/output record must be reported as "sft"."""
        record = dict(
            instruction="What is the capital of France?",
            input="",
            output="The capital of France is Paris.",
        )
        detected = _detect_jsonl_schema(record)
        assert detected == "sft"

    def test_detect_dialogue_schema(self):
        """A record holding a "messages" list must be reported as "dialogue"."""
        turns = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
        ]
        detected = _detect_jsonl_schema({"messages": turns})
        assert detected == "dialogue"

    def test_detect_plain_schema(self):
        """A record with only a "text" key must be reported as "plain"."""
        detected = _detect_jsonl_schema({"text": "This is plain text"})
        assert detected == "plain"

    def test_process_sft_item(self):
        """Processing an SFT record yields text, prompt and response fields."""
        record = dict(
            instruction="What is the capital of France?",
            input="",
            output="The capital of France is Paris.",
        )
        processed = _process_jsonl_item(record, "sft")

        assert processed is not None
        for field in ("text", "prompt", "response"):
            assert field in processed
        # The answer from "output" has to be carried into the rendered text.
        assert "Paris" in processed["text"]

    def test_process_sft_item_with_input(self):
        """A non-empty "input" must show up in the rendered SFT text."""
        record = dict(
            instruction="Translate to French",
            input="Hello world",
            output="Bonjour le monde",
        )
        processed = _process_jsonl_item(record, "sft")

        assert processed is not None
        rendered = processed["text"]
        for fragment in ("Input:", "Hello world", "Bonjour le monde"):
            assert fragment in rendered

    def test_process_dialogue_item(self):
        """Dialogue records keep their messages and render "role: content" lines."""
        turns = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
        ]
        processed = _process_jsonl_item({"messages": turns}, "dialogue")

        assert processed is not None
        for field in ("text", "messages"):
            assert field in processed
        rendered = processed["text"]
        assert "user: Hello" in rendered
        assert "assistant: Hi there!" in rendered

    def test_process_plain_item(self):
        """Plain records pass their text through unchanged."""
        processed = _process_jsonl_item({"text": "This is plain text"}, "plain")

        assert processed is not None
        assert processed["text"] == "This is plain text"

    def test_process_invalid_item(self):
        """A record lacking the SFT fields is rejected with ``None``."""
        processed = _process_jsonl_item({"invalid": "data"}, "sft")
        assert processed is None


class TestJSONLLoader:
    """Test JSONL dataset loading.

    Input files are created under pytest's per-test ``tmp_path`` directory, so
    every test works on an isolated location whose lifetime is managed by
    pytest; no manual ``os.unlink`` bookkeeping is needed.
    """

    @staticmethod
    def _write_jsonl(path, records):
        """Write ``records`` to ``path`` as one JSON document per line.

        Args:
            path: ``pathlib.Path`` of the file to create.
            records: Iterable of JSON-serializable objects.

        Returns:
            The file location as a ``str``, which is the form the tests hand
            to ``load_jsonl``.
        """
        path.write_text("".join(json.dumps(record) + "\n" for record in records))
        return str(path)

    def test_load_jsonl_sft(self, tmp_path):
        """Test loading SFT JSONL dataset"""
        jsonl_path = self._write_jsonl(
            tmp_path / "sft.jsonl",
            [
                {"instruction": "What is 2+2?", "input": "", "output": "4"},
                {"instruction": "What is the capital of France?", "input": "", "output": "Paris"},
                {"instruction": "Translate hello", "input": "hello", "output": "hola"},
            ],
        )

        train_dataset, eval_dataset = load_jsonl(jsonl_path, "sft")

        # Both splits must be populated and together account for every record.
        assert len(train_dataset) > 0
        assert len(eval_dataset) > 0
        assert len(train_dataset) + len(eval_dataset) == 3

        # Check that data is processed correctly
        sample = train_dataset[0]
        assert "text" in sample
        assert "instruction" in sample["text"].lower()

    def test_load_jsonl_dialogue(self, tmp_path):
        """Test loading dialogue JSONL dataset"""
        jsonl_path = self._write_jsonl(
            tmp_path / "dialogue.jsonl",
            [
                {
                    "messages": [
                        {"role": "user", "content": "Hello"},
                        {"role": "assistant", "content": "Hi there!"},
                    ]
                },
                {
                    "messages": [
                        {"role": "user", "content": "How are you?"},
                        {"role": "assistant", "content": "I'm doing well, thanks!"},
                    ]
                },
            ],
        )

        train_dataset, eval_dataset = load_jsonl(jsonl_path, "dialogue")

        assert len(train_dataset) > 0
        assert len(eval_dataset) > 0
        assert len(train_dataset) + len(eval_dataset) == 2

        # Check that data is processed correctly
        sample = train_dataset[0]
        assert "text" in sample
        assert "user:" in sample["text"]
        assert "assistant:" in sample["text"]

    def test_load_jsonl_auto_detect(self, tmp_path):
        """Test loading JSONL with auto-detection"""
        jsonl_path = self._write_jsonl(
            tmp_path / "auto.jsonl",
            [
                {"instruction": "What is 2+2?", "input": "", "output": "4"},
                {"instruction": "What is the capital of France?", "input": "", "output": "Paris"},
            ],
        )

        train_dataset, eval_dataset = load_jsonl(jsonl_path, "auto")

        assert len(train_dataset) > 0
        assert len(eval_dataset) > 0

    def test_load_jsonl_invalid_file(self, tmp_path):
        """Test loading non-existent JSONL file"""
        # A path inside the fresh tmp_path cannot exist yet, unlike a
        # cwd-relative name that might collide with a real file.
        missing_path = str(tmp_path / "nonexistent.jsonl")
        with pytest.raises(FileNotFoundError):
            load_jsonl(missing_path)

    def test_load_jsonl_invalid_json(self, tmp_path):
        """Test loading JSONL with invalid JSON"""
        jsonl_file = tmp_path / "broken.jsonl"
        jsonl_file.write_text('invalid json content\n{"valid": "json"}\n')

        # Should not raise exception, but skip invalid lines
        train_dataset, eval_dataset = load_jsonl(str(jsonl_file))
        assert len(train_dataset) + len(eval_dataset) == 1


class TestAutoLoadDataset:
    """Test automatic dataset loading through ``auto_load_dataset``.

    Each test passes a dataset specifier string and checks the returned
    ``(train_dataset, eval_dataset, metadata)`` triple.
    """

    def test_auto_load_wikitext(self):
        """Test auto-loading Wikitext dataset"""
        train_dataset, eval_dataset, metadata = auto_load_dataset("wikitext")

        assert len(train_dataset) > 0
        assert len(eval_dataset) > 0
        assert metadata["dataset_type"] == "wikitext"
        assert metadata["text_field"] == "text"
        assert metadata["schema"] == "plain"

    def test_auto_load_jsonl(self, tmp_path):
        """Test auto-loading JSONL dataset"""
        # Build the input under pytest's per-test temporary directory so no
        # manual cleanup is required.
        records = [
            {"instruction": "What is 2+2?", "input": "", "output": "4"},
            {"instruction": "What is the capital of France?", "input": "", "output": "Paris"},
        ]
        jsonl_file = tmp_path / "sft.jsonl"
        jsonl_file.write_text("".join(json.dumps(record) + "\n" for record in records))
        file_path = str(jsonl_file)

        train_dataset, eval_dataset, metadata = auto_load_dataset(f"jsonl:{file_path}")

        assert len(train_dataset) > 0
        assert len(eval_dataset) > 0
        assert metadata["dataset_type"] == "jsonl"
        assert metadata["file_path"] == file_path

    def test_auto_load_hf_dataset(self):
        """Test auto-loading Hugging Face dataset"""
        # Only the load itself is allowed to trigger a skip: the dataset may be
        # unavailable in the test environment. The assertions stay outside the
        # try block because AssertionError is an Exception subclass, so guarding
        # them too would turn every real failure into a skip.
        try:
            train_dataset, eval_dataset, metadata = auto_load_dataset("hf:imdb")
        except Exception as e:
            # If the dataset is not available, that's okay for testing
            pytest.skip(f"HF dataset not available: {e}")

        assert len(train_dataset) > 0
        assert len(eval_dataset) > 0
        assert metadata["dataset_type"] == "hf"
        assert metadata["dataset_name"] == "imdb"
        assert metadata["text_field"] == "text"


if __name__ == "__main__":
    # pytest.main returns the exit code of the run; propagate it so that
    # executing this file directly reports failures via the process status
    # instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__]))