File size: 3,144 Bytes
d0c3c53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Simple test to verify dataset integration setup.
This test doesn't require external libraries to be installed.
"""

import json
import os

def test_dataset_files():
    """Check that the local dataset files and model integration code are in place.

    Prints a human-readable report covering:
      1. presence of the expected dataset files,
      2. basic structural validation of each JSONL split,
      3. dataset-integration hooks inside ``model.py``,
      4. the relevant Hugging Face URLs.

    Returns:
        None. All results are reported via ``print``; missing or unreadable
        files are reported rather than raised.
    """
    dataset_path = os.path.expanduser("~/huggingface.co/datasets/chiedo/hello-world")

    print("Testing Dataset Integration Setup")
    print("=" * 50)

    _check_dataset_files(dataset_path)
    _validate_splits(dataset_path)
    _check_model_integration()

    print("\n4. Dataset URLs:")
    print("   Model: https://huggingface.co/chiedo/hello-world")
    print("   Dataset: https://huggingface.co/datasets/chiedo/hello-world")

    print("\n" + "=" * 50)
    print("Dataset integration setup complete!")
    print("\nTo use the dataset with the model, install dependencies:")
    print("  pip install torch transformers datasets")
    print("\nThen run:")
    print("  python example_with_dataset.py")


def _check_dataset_files(dataset_path):
    """Report which of the required dataset files exist under *dataset_path*."""
    required_files = ["train.jsonl", "validation.jsonl", "test.jsonl", "README.md", "hello_world.py"]

    print("\n1. Checking dataset files:")
    for file in required_files:
        file_path = os.path.join(dataset_path, file)
        if os.path.exists(file_path):
            print(f"   βœ“ {file} exists")
        else:
            print(f"   βœ— {file} missing")


def _validate_splits(dataset_path):
    """Load each JSONL split and sanity-check the structure of its first example."""
    print("\n2. Validating dataset content:")
    for split in ["train", "validation", "test"]:
        file_path = os.path.join(dataset_path, f"{split}.jsonl")
        try:
            with open(file_path, 'r') as f:
                lines = f.readlines()
            print(f"\n   {split} split:")
            print(f"   - Examples: {len(lines)}")

            if not lines:
                # An empty split would otherwise raise IndexError below and be
                # reported as a misleading generic read error.
                print("   - Structure: βœ— Empty file")
                continue

            # Parse and display the first example only; full validation is
            # out of scope for this setup check.
            first_example = json.loads(lines[0])
            print(f"   - First example: {first_example}")

            # Every example is expected to carry "text" and "label" fields.
            if "text" in first_example and "label" in first_example:
                print("   - Structure: βœ“ Valid")
            else:
                print("   - Structure: βœ— Invalid")
        except Exception as e:
            # Best-effort diagnostic: report the problem and move on to the
            # next split instead of aborting the whole check.
            print(f"   Error reading {split}: {e}")


def _check_model_integration():
    """Scan model.py (in the current directory) for dataset-integration hooks."""
    print("\n3. Checking model integration:")
    model_file = "model.py"

    if not os.path.exists(model_file):
        # The original version silently skipped this section when model.py
        # was absent; report it so the gap is visible.
        print(f"   βœ— {model_file} not found")
        return

    with open(model_file, 'r') as f:
        content = f.read()

    # (substring to search for, label used in the report)
    checks = [
        ("load_dataset", "load_dataset method"),
        ("prepare_dataset_batch", "prepare_dataset_batch method"),
        ("from datasets import load_dataset", "datasets import"),
    ]
    for needle, label in checks:
        if needle in content:
            print(f"   βœ“ {label} found in model.py")
        else:
            print(f"   βœ— {label} not found")

# Run the full setup check when executed as a script (no-op on import).
if __name__ == "__main__":
    test_dataset_files()