"""
Test script to verify the merged metadata from Dataset Service.
This script tests that the DatasetService properly merges data from both
the Hub API and Dataset Viewer API.
"""
import os
import logging
from pprint import pprint
from dotenv import load_dotenv
from hf_eda_mcp.services.dataset_service import DatasetService
load_dotenv()
# Configure logging: each run truncates scripts.log (filemode="w") so the
# file always holds exactly one invocation's output.
logging.basicConfig(
    filename="scripts.log",
    filemode="w",
    encoding="utf-8",
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

logger = logging.getLogger(__name__)
def test_merged_metadata(dataset_name = "wikitext", config_name = "wikitext-2-raw-v1"):
"""Test merged metadata retrieval."""
print("=" * 80)
print("Testing Merged Metadata from DatasetService")
print("=" * 80)
# Initialize service
service = DatasetService(
cache_dir="./cache",
token=os.environ.get("HF_TOKEN")
)
# Clear cache to force fresh fetch
service.clear_cache(dataset_name)
# Test with squad dataset
print(f"\n### Testing: {dataset_name} ###\n")
try:
metadata = service.load_dataset_info(dataset_name, config_name)
print("Key Information:")
print(f" Dataset ID: {metadata.get('id')}")
print(f" Author: {metadata.get('author')}")
print(f" Size (bytes): {metadata.get('size_bytes', 'N/A')}")
print(f" Size (human): {metadata.get('size_human', 'N/A')}")
print(f" Download Size: {metadata.get('download_size_human', 'N/A')}")
print(f" Total Examples: {metadata.get('total_examples', 'N/A')}")
print(f" Downloads: {metadata.get('downloads', 0):,}")
print(f" Likes: {metadata.get('likes', 0)}")
print("\nSplits:")
for split_name, split_info in metadata.get('splits', {}).items():
if isinstance(split_info, dict):
num_examples = split_info.get('num_examples', 'N/A')
num_bytes = split_info.get('num_bytes', 'N/A')
print(f" {split_name}: {num_examples:,} examples, {num_bytes:,} bytes")
else:
print(f" {split_name}: {split_info}")
print("\nFeatures Schema:")
features = metadata.get('features', {})
if features:
for feature_name, feature_info in features.items():
print(f" {feature_name}: {feature_info}")
else:
print(" No features available")
print("\nSummary:")
print(f" {metadata.get('summary', 'N/A')}")
print("\n" + "=" * 80)
print("Full Metadata:")
print("=" * 80)
pprint(metadata, indent=2)
except Exception as e:
print(f"\n✗ Error: {e}")
logger.exception("Failed to retrieve merged metadata")
def test_multi_config_dataset(dataset_name="stanfordnlp/imdb"):
    """Fetch merged metadata for a multi-config dataset and print key fields.

    Args:
        dataset_name: Hub dataset id to query (defaults to stanfordnlp/imdb).

    Failures are printed and logged with traceback rather than raised.
    """
    print("\n\n" + "=" * 80)
    # Include the dataset id in the banner (the original printed a bare colon).
    print(f"Testing Multi-Config Dataset: {dataset_name}")
    print("=" * 80)

    service = DatasetService(
        cache_dir="./cache",
        token=os.environ.get("HF_TOKEN"),
    )

    # Clear cache to force a fresh fetch.
    service.clear_cache(dataset_name)

    print(f"\n### Testing: {dataset_name} ###\n")
    try:
        metadata = service.load_dataset_info(dataset_name)

        print("Key Information:")
        print(f"  Dataset ID: {metadata.get('id')}")
        print(f"  Total Examples: {metadata.get('total_examples', 'N/A')}")
        print(f"  Size (human): {metadata.get('size_human', 'N/A')}")

        print("\nSplits:")
        for split_name, split_info in metadata.get('splits', {}).items():
            if isinstance(split_info, dict):
                num_examples = split_info.get('num_examples')
                # Only ints take the thousands-separator format; a missing
                # count would otherwise raise ValueError inside the f-string.
                shown = f"{num_examples:,}" if isinstance(num_examples, int) else "N/A"
                print(f"  {split_name}: {shown} examples")

        print("\nSummary:")
        print(f"  {metadata.get('summary', 'N/A')}")
    except Exception as e:
        print(f"\n✗ Error: {e}")
        # Use the parameter instead of hard-coding "imdb".
        logger.exception("Failed to retrieve metadata for %s", dataset_name)
if __name__ == "__main__":
    test_merged_metadata()
    # test_multi_config_dataset()  # uncomment to also exercise a multi-config dataset