File size: 3,144 Bytes
d0c3c53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Simple test to verify dataset integration setup.
This test doesn't require external libraries to be installed.
"""

import json
import os

def test_dataset_files():
    """Check that the local dataset files and model integration code are in place.

    Prints a human-readable report covering:
      1. presence of the expected dataset files,
      2. basic structural validation of each JSONL split,
      3. dataset-integration hooks inside ``model.py``,
      4. the relevant Hugging Face URLs.

    Returns:
        None. All results are reported via ``print``; missing or unreadable
        files are reported rather than raised.
    """
    dataset_path = os.path.expanduser("~/huggingface.co/datasets/chiedo/hello-world")

    print("Testing Dataset Integration Setup")
    print("=" * 50)

    _check_dataset_files(dataset_path)
    _validate_splits(dataset_path)
    _check_model_integration()

    print("\n4. Dataset URLs:")
    print("   Model: https://huggingface.co/chiedo/hello-world")
    print("   Dataset: https://huggingface.co/datasets/chiedo/hello-world")

    print("\n" + "=" * 50)
    print("Dataset integration setup complete!")
    print("\nTo use the dataset with the model, install dependencies:")
    print("  pip install torch transformers datasets")
    print("\nThen run:")
    print("  python example_with_dataset.py")


def _check_dataset_files(dataset_path):
    """Report which of the required dataset files exist under *dataset_path*."""
    required_files = ["train.jsonl", "validation.jsonl", "test.jsonl", "README.md", "hello_world.py"]

    print("\n1. Checking dataset files:")
    for file in required_files:
        file_path = os.path.join(dataset_path, file)
        if os.path.exists(file_path):
            print(f"   βœ“ {file} exists")
        else:
            print(f"   βœ— {file} missing")


def _validate_splits(dataset_path):
    """Load each JSONL split and sanity-check the structure of its first example."""
    print("\n2. Validating dataset content:")
    for split in ["train", "validation", "test"]:
        file_path = os.path.join(dataset_path, f"{split}.jsonl")
        try:
            with open(file_path, 'r') as f:
                lines = f.readlines()
            print(f"\n   {split} split:")
            print(f"   - Examples: {len(lines)}")

            if not lines:
                # An empty split would otherwise raise IndexError below and be
                # reported as a misleading generic read error.
                print("   - Structure: βœ— Empty file")
                continue

            # Parse and display the first example only; full validation is
            # out of scope for this setup check.
            first_example = json.loads(lines[0])
            print(f"   - First example: {first_example}")

            # Every example is expected to carry "text" and "label" fields.
            if "text" in first_example and "label" in first_example:
                print("   - Structure: βœ“ Valid")
            else:
                print("   - Structure: βœ— Invalid")
        except Exception as e:
            # Best-effort diagnostic: report the problem and move on to the
            # next split instead of aborting the whole check.
            print(f"   Error reading {split}: {e}")


def _check_model_integration():
    """Scan model.py (in the current directory) for dataset-integration hooks."""
    print("\n3. Checking model integration:")
    model_file = "model.py"

    if not os.path.exists(model_file):
        # The original version silently skipped this section when model.py
        # was absent; report it so the gap is visible.
        print(f"   βœ— {model_file} not found")
        return

    with open(model_file, 'r') as f:
        content = f.read()

    # (substring to search for, label used in the report)
    checks = [
        ("load_dataset", "load_dataset method"),
        ("prepare_dataset_batch", "prepare_dataset_batch method"),
        ("from datasets import load_dataset", "datasets import"),
    ]
    for needle, label in checks:
        if needle in content:
            print(f"   βœ“ {label} found in model.py")
        else:
            print(f"   βœ— {label} not found")

# Run the full setup check when executed as a script (no-op on import).
if __name__ == "__main__":
    test_dataset_files()