"""
Smoke test that verifies the dataset integration setup.

It uses only the Python standard library, so no external libraries need to
be installed to run it.
"""

import json
import os


# Status glyphs are spelled as escapes so the source stays pure ASCII and a
# re-encoding of the file cannot turn both marks into the same character.
_PASS = "\u2713"  # check mark
_FAIL = "\u2717"  # ballot X

# (substring searched for in the model source, label used in the report line)
_MODEL_CHECKS = (
    ("load_dataset", "load_dataset method"),
    ("prepare_dataset_batch", "prepare_dataset_batch method"),
    ("from datasets import load_dataset", "datasets import"),
)


def _check_required_files(dataset_path, required_files):
    """Print an exists/missing line per file and return how many are missing."""
    missing = 0
    for name in required_files:
        if os.path.exists(os.path.join(dataset_path, name)):
            print(f" {_PASS} {name} exists")
        else:
            print(f" {_FAIL} {name} missing")
            missing += 1
    return missing


def _validate_split(dataset_path, split):
    """Print a summary of ``<split>.jsonl`` and return True if it looks valid."""
    file_path = os.path.join(dataset_path, f"{split}.jsonl")
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            # Skip blank lines so a stray empty line is not counted as an
            # example (and is never handed to json.loads).
            lines = [line for line in f if line.strip()]
    except (OSError, UnicodeDecodeError) as e:
        print(f" Error reading {split}: {e}")
        return False

    print(f"\n {split} split:")
    print(f" - Examples: {len(lines)}")

    # An empty split is reported explicitly instead of failing on lines[0].
    if not lines:
        print(f" - Structure: {_FAIL} Invalid (no examples)")
        return False

    try:
        first_example = json.loads(lines[0])
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(f" Error reading {split}: {e}")
        return False

    print(f" - First example: {first_example}")

    # Only the first record is inspected; it must be an object that carries
    # both a "text" and a "label" key.
    is_valid = (
        isinstance(first_example, dict)
        and "text" in first_example
        and "label" in first_example
    )
    if is_valid:
        print(f" - Structure: {_PASS} Valid")
    else:
        print(f" - Structure: {_FAIL} Invalid")
    return is_valid


def _check_model_integration(model_file):
    """Report which expected snippets occur in *model_file*; return the problem count."""
    try:
        with open(model_file, "r", encoding="utf-8") as f:
            content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        # A missing or unreadable model file is reported rather than skipped.
        print(f" {_FAIL} Could not read {model_file}: {e}")
        return 1

    problems = 0
    for needle, label in _MODEL_CHECKS:
        # Plain substring search: it shows the text is present somewhere in
        # the file, not that a method of that name is actually defined.
        if needle in content:
            print(f" {_PASS} {label} found in {model_file}")
        else:
            print(f" {_FAIL} {label} not found")
            problems += 1
    return problems


def test_dataset_files(
    dataset_path: str = "~/huggingface.co/datasets/chiedo/hello-world",
    model_file: str = "model.py",
) -> None:
    """Check that the dataset files exist and are properly formatted.

    Prints a four-part report to stdout: presence of the required dataset
    files, a summary of each JSONL split, whether the model source mentions
    the expected dataset hooks, and the model/dataset URLs.

    Args:
        dataset_path: Directory holding the dataset files; a leading ``~`` is
            expanded to the user's home directory.
        model_file: Path of the model source file to inspect, resolved
            against the current working directory when relative.

    Returns:
        None. Problems are reported in the printed output only; the function
        never raises for a missing or malformed file, and it returns nothing
        so it can still be collected by pytest as a test function.
    """
    dataset_path = os.path.expanduser(dataset_path)

    print("Testing Dataset Integration Setup")
    print("=" * 50)

    print("\n1. Checking dataset files:")
    required_files = [
        "train.jsonl",
        "validation.jsonl",
        "test.jsonl",
        "README.md",
        "hello_world.py",
    ]
    issues = _check_required_files(dataset_path, required_files)

    print("\n2. Validating dataset content:")
    for split in ("train", "validation", "test"):
        if not _validate_split(dataset_path, split):
            issues += 1

    print("\n3. Checking model integration:")
    issues += _check_model_integration(model_file)

    print("\n4. Dataset URLs:")
    print(" Model: https://huggingface.co/chiedo/hello-world")
    print(" Dataset: https://huggingface.co/datasets/chiedo/hello-world")

    print("\n" + "=" * 50)
    # Only claim success when every check above passed.
    if issues:
        print(f"Dataset integration setup incomplete: {issues} issue(s) found.")
    else:
        print("Dataset integration setup complete!")
    print("\nTo use the dataset with the model, install dependencies:")
    print(" pip install torch transformers datasets")
    print("\nThen run:")
    print(" python example_with_dataset.py")


# Allow running the check directly as a script, outside of a test runner.
if __name__ == "__main__":
    test_dataset_files()