Spaces:
Running
Running
Commit
·
e93499e
1
Parent(s):
59f0b85
fix checking statistics capabilities
Browse files
scripts/playground/metadata_tool_playground.py
CHANGED
|
@@ -25,7 +25,7 @@ logging.basicConfig(
|
|
| 25 |
logger = logging.getLogger(__name__)
|
| 26 |
|
| 27 |
|
| 28 |
-
def test_merged_metadata(dataset_name = "
|
| 29 |
"""Test merged metadata retrieval."""
|
| 30 |
print("=" * 80)
|
| 31 |
print("Testing Merged Metadata from DatasetService")
|
|
@@ -43,7 +43,7 @@ def test_merged_metadata(dataset_name = "rajpurkar/squad"):
|
|
| 43 |
# Test with squad dataset
|
| 44 |
print(f"\n### Testing: {dataset_name} ###\n")
|
| 45 |
try:
|
| 46 |
-
metadata = service.load_dataset_info(dataset_name)
|
| 47 |
|
| 48 |
print("Key Information:")
|
| 49 |
print(f" Dataset ID: {metadata.get('id')}")
|
|
@@ -124,4 +124,4 @@ def test_multi_config_dataset(dataset_name = "stanfordnlp/imdb"):
|
|
| 124 |
|
| 125 |
if __name__ == "__main__":
|
| 126 |
test_merged_metadata()
|
| 127 |
-
test_multi_config_dataset()
|
|
|
|
| 25 |
logger = logging.getLogger(__name__)
|
| 26 |
|
| 27 |
|
| 28 |
+
def test_merged_metadata(dataset_name = "wikitext", config_name = "wikitext-2-raw-v1"):
|
| 29 |
"""Test merged metadata retrieval."""
|
| 30 |
print("=" * 80)
|
| 31 |
print("Testing Merged Metadata from DatasetService")
|
|
|
|
| 43 |
# Test with squad dataset
|
| 44 |
print(f"\n### Testing: {dataset_name} ###\n")
|
| 45 |
try:
|
| 46 |
+
metadata = service.load_dataset_info(dataset_name, config_name)
|
| 47 |
|
| 48 |
print("Key Information:")
|
| 49 |
print(f" Dataset ID: {metadata.get('id')}")
|
|
|
|
| 124 |
|
| 125 |
if __name__ == "__main__":
|
| 126 |
test_merged_metadata()
|
| 127 |
+
#test_multi_config_dataset()
|
src/hf_eda_mcp/services/dataset_service.py
CHANGED
|
@@ -808,7 +808,7 @@ class DatasetService:
|
|
| 808 |
def _check_statistics_availability(
|
| 809 |
self,
|
| 810 |
dataset_name: str,
|
| 811 |
-
|
| 812 |
) -> dict:
|
| 813 |
"""
|
| 814 |
Check if statistics are available for a dataset.
|
|
@@ -818,7 +818,7 @@ class DatasetService:
|
|
| 818 |
|
| 819 |
Args:
|
| 820 |
dataset_name: HuggingFace dataset identifier
|
| 821 |
-
|
| 822 |
|
| 823 |
Returns:
|
| 824 |
Dictionary with availability information:
|
|
@@ -830,34 +830,40 @@ class DatasetService:
|
|
| 830 |
DatasetViewerError: If the API request fails
|
| 831 |
"""
|
| 832 |
try:
|
| 833 |
-
dataset_info = self.load_dataset_info(dataset_name,
|
| 834 |
full_dataset_id = dataset_info.get('id', dataset_name)
|
| 835 |
|
| 836 |
if len(dataset_info["configs"]) == 1:
|
| 837 |
# Single config format
|
| 838 |
builder_name = dataset_info.get('builder_name', '')
|
| 839 |
is_parquet = builder_name == 'parquet'
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
"full_dataset_id": full_dataset_id,
|
| 844 |
-
"configs": [dataset_info.get('config_name')] if is_parquet else [],
|
| 845 |
-
"reason": 'Statistics available' if is_parquet else f'Statistics only available for parquet datasets (found: {builder_name})'
|
| 846 |
-
}
|
| 847 |
else:
|
| 848 |
# Multiple configs format
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 861 |
except Exception as e:
|
| 862 |
error_msg = f"Unexpected error checking statistics availability: {str(e)}"
|
| 863 |
logger.error(error_msg)
|
|
@@ -867,8 +873,8 @@ class DatasetService:
|
|
| 867 |
def get_dataset_service(hf_api_token: str) -> DatasetService:
|
| 868 |
"""Get or create the global dataset service instance using current config."""
|
| 869 |
config = get_config()
|
| 870 |
-
|
| 871 |
-
|
| 872 |
dataset_service = DatasetService(
|
| 873 |
cache_dir=config.cache_dir,
|
| 874 |
token=hf_api_token
|
|
|
|
| 808 |
def _check_statistics_availability(
|
| 809 |
self,
|
| 810 |
dataset_name: str,
|
| 811 |
+
config_name: Optional[str] = None
|
| 812 |
) -> dict:
|
| 813 |
"""
|
| 814 |
Check if statistics are available for a dataset.
|
|
|
|
| 818 |
|
| 819 |
Args:
|
| 820 |
dataset_name: HuggingFace dataset identifier
|
| 821 |
+
config_name: Optional configuration name
|
| 822 |
|
| 823 |
Returns:
|
| 824 |
Dictionary with availability information:
|
|
|
|
| 830 |
DatasetViewerError: If the API request fails
|
| 831 |
"""
|
| 832 |
try:
|
| 833 |
+
dataset_info = self.load_dataset_info(dataset_name, config_name)
|
| 834 |
full_dataset_id = dataset_info.get('id', dataset_name)
|
| 835 |
|
| 836 |
if len(dataset_info["configs"]) == 1:
|
| 837 |
# Single config format
|
| 838 |
builder_name = dataset_info.get('builder_name', '')
|
| 839 |
is_parquet = builder_name == 'parquet'
|
| 840 |
+
configs = [dataset_info["configs"][0]] if is_parquet else [],
|
| 841 |
+
reason = 'Statistics available' if is_parquet else f'Statistics only available for parquet datasets (found: {builder_name})'
|
| 842 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 843 |
else:
|
| 844 |
# Multiple configs format
|
| 845 |
+
if config_name is None:
|
| 846 |
+
# No config requested: collect every config whose builder is parquet
|
| 847 |
+
configs = []
|
| 848 |
+
for cfg_data in dataset_info["config_details"]:
|
| 849 |
+
if cfg_data.get('builder_name') == 'parquet':
|
| 850 |
+
configs.append(cfg_data.get("config_name"))
|
| 851 |
+
is_parquet = len(configs) > 0
|
| 852 |
+
reason = f'Statistics available for {len(configs)} config(s)' if configs else 'No parquet configs found'
|
| 853 |
+
|
| 854 |
+
else:
|
| 855 |
+
configs = [config_name]
|
| 856 |
+
builder_name = dataset_info.get('builder_name', '')
|
| 857 |
+
is_parquet = builder_name == 'parquet'
|
| 858 |
+
reason = f'Statistics available for provided config {config_name}' if is_parquet else f'No parquet found for config {config_name}'
|
| 859 |
+
|
| 860 |
+
return {
|
| 861 |
+
"available": is_parquet,
|
| 862 |
+
"full_dataset_id": full_dataset_id,
|
| 863 |
+
"configs": configs,
|
| 864 |
+
"reason": reason
|
| 865 |
+
}
|
| 866 |
+
|
| 867 |
except Exception as e:
|
| 868 |
error_msg = f"Unexpected error checking statistics availability: {str(e)}"
|
| 869 |
logger.error(error_msg)
|
|
|
|
| 873 |
def get_dataset_service(hf_api_token: str) -> DatasetService:
|
| 874 |
"""Get or create the global dataset service instance using current config."""
|
| 875 |
config = get_config()
|
| 876 |
+
if hf_api_token is None or len(hf_api_token) == 0:
|
| 877 |
+
hf_api_token = config.hf_token
|
| 878 |
dataset_service = DatasetService(
|
| 879 |
cache_dir=config.cache_dir,
|
| 880 |
token=hf_api_token
|