Spaces:
Running
Running
| """ | |
| Test script to verify the merged metadata from Dataset Service. | |
| This script tests that the DatasetService properly merges data from both | |
| the Hub API and Dataset Viewer API. | |
| """ | |
| import os | |
| import logging | |
| from pprint import pprint | |
| from dotenv import load_dotenv | |
| from hf_eda_mcp.services.dataset_service import DatasetService | |
| load_dotenv() | |
| # Setup logging | |
| logging.basicConfig( | |
| filename="scripts.log", | |
| encoding='utf-8', | |
| level=logging.DEBUG, | |
| filemode="w", | |
| format='%(asctime)s - %(levelname)s - %(message)s', | |
| ) | |
| logger = logging.getLogger(__name__) | |
def test_merged_metadata(dataset_name="wikitext", config_name="wikitext-2-raw-v1"):
    """Test merged metadata retrieval for a single dataset config.

    Fetches merged Hub + Dataset Viewer metadata via DatasetService,
    then prints the key fields, per-split statistics, feature schema,
    and finally the full metadata dict. Errors are printed and logged
    with a traceback rather than propagated.

    Args:
        dataset_name: Hub dataset ID to inspect (default: "wikitext").
        config_name: Config/subset name within the dataset.
    """
    print("=" * 80)
    print("Testing Merged Metadata from DatasetService")
    print("=" * 80)

    # Initialize service; the token may be None for anonymous access.
    service = DatasetService(
        cache_dir="./cache",
        token=os.environ.get("HF_TOKEN")
    )

    # Clear cache to force a fresh fetch instead of a cached response.
    service.clear_cache(dataset_name)

    print(f"\n### Testing: {dataset_name} ###\n")
    try:
        metadata = service.load_dataset_info(dataset_name, config_name)
        print("Key Information:")
        print(f" Dataset ID: {metadata.get('id')}")
        print(f" Author: {metadata.get('author')}")
        print(f" Size (bytes): {metadata.get('size_bytes', 'N/A')}")
        print(f" Size (human): {metadata.get('size_human', 'N/A')}")
        print(f" Download Size: {metadata.get('download_size_human', 'N/A')}")
        print(f" Total Examples: {metadata.get('total_examples', 'N/A')}")
        # FIX: use `or 0` so a stored None (key present, value None) does not
        # reach the ',' format spec and raise TypeError.
        print(f" Downloads: {metadata.get('downloads') or 0:,}")
        print(f" Likes: {metadata.get('likes', 0)}")
        print("\nSplits:")
        for split_name, split_info in metadata.get('splits', {}).items():
            if isinstance(split_info, dict):
                num_examples = split_info.get('num_examples', 'N/A')
                num_bytes = split_info.get('num_bytes', 'N/A')
                # FIX: the ',' format spec is invalid for str, so formatting
                # the 'N/A' fallback raised ValueError; only format real ints.
                examples_str = f"{num_examples:,}" if isinstance(num_examples, int) else str(num_examples)
                bytes_str = f"{num_bytes:,}" if isinstance(num_bytes, int) else str(num_bytes)
                print(f" {split_name}: {examples_str} examples, {bytes_str} bytes")
            else:
                # Non-dict split info (unexpected shape) is printed verbatim.
                print(f" {split_name}: {split_info}")
        print("\nFeatures Schema:")
        features = metadata.get('features', {})
        if features:
            for feature_name, feature_info in features.items():
                print(f" {feature_name}: {feature_info}")
        else:
            print(" No features available")
        print("\nSummary:")
        print(f" {metadata.get('summary', 'N/A')}")
        print("\n" + "=" * 80)
        print("Full Metadata:")
        print("=" * 80)
        pprint(metadata, indent=2)
    except Exception as e:
        # Broad catch is deliberate at this script's top-level boundary;
        # the full traceback is written to scripts.log via the module logger.
        print(f"\n✗ Error: {e}")
        logger.exception("Failed to retrieve merged metadata")
def test_multi_config_dataset(dataset_name="stanfordnlp/imdb"):
    """Test metadata retrieval for a multi-config dataset.

    Calls load_dataset_info without an explicit config name, so the
    service must resolve metadata across configs, then prints the key
    fields and per-split example counts. Errors are printed and logged
    rather than propagated.

    Args:
        dataset_name: Hub dataset ID to inspect (default: "stanfordnlp/imdb").
    """
    print("\n\n" + "=" * 80)
    # FIX: the header ended with ": " but never included the dataset name.
    print(f"Testing Multi-Config Dataset: {dataset_name}")
    print("=" * 80)

    service = DatasetService(
        cache_dir="./cache",
        token=os.environ.get("HF_TOKEN")
    )

    # Clear cache so the result is not served from a previous run.
    service.clear_cache(dataset_name)

    print(f"\n### Testing: {dataset_name} ###\n")
    try:
        metadata = service.load_dataset_info(dataset_name)
        print("Key Information:")
        print(f" Dataset ID: {metadata.get('id')}")
        print(f" Total Examples: {metadata.get('total_examples', 'N/A')}")
        print(f" Size (human): {metadata.get('size_human', 'N/A')}")
        print("\nSplits:")
        for split_name, split_info in metadata.get('splits', {}).items():
            if isinstance(split_info, dict):
                num_examples = split_info.get('num_examples', 'N/A')
                # FIX: the ',' format spec is invalid for str, so formatting
                # the 'N/A' fallback raised ValueError; only format real ints.
                examples_str = f"{num_examples:,}" if isinstance(num_examples, int) else str(num_examples)
                print(f" {split_name}: {examples_str} examples")
        print("\nSummary:")
        print(f" {metadata.get('summary', 'N/A')}")
    except Exception as e:
        # Broad catch is deliberate at this script's top-level boundary;
        # the traceback goes to scripts.log via the module logger.
        print(f"\n✗ Error: {e}")
        logger.exception("Failed to retrieve imdb metadata")
| if __name__ == "__main__": | |
| test_merged_metadata() | |
| #test_multi_config_dataset() | |