Spaces:
Running
Running
File size: 2,074 Bytes
43642a4 64e67e1 43642a4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
import os
import logging
from pprint import pprint
from dotenv import load_dotenv
from hf_eda_mcp.integrations.dataset_viewer_adapter import DatasetViewerAdapter
from hf_eda_mcp.tools.analysis import analyze_dataset_features
# Populate the process environment before anything reads it
# (presumably from a local .env file that supplies HF_TOKEN; confirm against
# the project's setup instructions).
load_dotenv()

# Setup logging
# All records at DEBUG and above go to "scripts.log". filemode="w" truncates
# the file on every run, so the log only ever holds the most recent execution.
# The `encoding` keyword of basicConfig requires Python 3.9+.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename="scripts.log",
    filemode="w",
    encoding="utf-8",
)
logger = logging.getLogger(__name__)
def test_dataset_viewer_analysis(dataset_name="stanfordnlp/imdb"):
    """Request Dataset Viewer statistics for a dataset and pretty-print them.

    The request is always made for the ``"plain_text"`` config and the
    ``"train"`` split; only the dataset itself is configurable.

    Args:
        dataset_name: Dataset identifier passed to
            ``DatasetViewerAdapter.get_dataset_statistics``.

    Raises:
        KeyError: If the ``HF_TOKEN`` environment variable is not set.
    """
    # Read the token first so a missing HF_TOKEN fails before any adapter work.
    hf_token = os.environ["HF_TOKEN"]
    adapter = DatasetViewerAdapter(token=hf_token)
    statistics = adapter.get_dataset_statistics(
        dataset_name=dataset_name,
        config="plain_text",
        split_name="train",
    )
    pprint(statistics, indent=2)
def test_dataset_service_analysis(dataset_name="stanfordnlp/imdb"):
    """Run ``analyze_dataset_features`` on the train split and pretty-print it.

    Args:
        dataset_name: Dataset identifier, forwarded as ``dataset_id``.
    """
    pprint(
        analyze_dataset_features(dataset_id=dataset_name, split="train"),
        indent=2,
    )
def test_statistics_availability(dataset_name="stanfordnlp/imdb"):
    """Query and print whether Dataset Viewer statistics exist for a dataset.

    Args:
        dataset_name: Dataset identifier passed to
            ``DatasetViewerAdapter.check_statistics_availability``.

    Raises:
        KeyError: If the ``HF_TOKEN`` environment variable is not set.
    """
    hf_token = os.environ["HF_TOKEN"]
    availability = DatasetViewerAdapter(token=hf_token).check_statistics_availability(
        dataset_name=dataset_name
    )
    # Header is printed only after the lookup succeeds, as in the original flow.
    print(f"\nStatistics availability for {dataset_name}:")
    pprint(availability, indent=2)
if __name__ == "__main__":
    # Manual smoke run: each section prints a banner and then the raw result
    # of one call, so the output can be inspected by eye.
    print("###### Dataset Viewer Statistics Endpoint #######")
    test_dataset_viewer_analysis()

    print("\n###### Integrated Analysis (uses Dataset Viewer when available) #######")
    test_dataset_service_analysis()

    print("\n###### Check Statistics Availability #######")
    test_statistics_availability("stanfordnlp/imdb")

    # Test with a dataset that might not have statistics
    print("\n###### Testing fallback for dataset without parquet format #######")
    try:
        result = analyze_dataset_features(
            dataset_id="glue",
            config_name="cola",
            split="train",
            sample_size=100,
        )
        print(f"Analysis method: {result['sample_info']['sampling_method']}")
        print(f"Sample size: {result['dataset_info']['sample_size_used']}")
    except Exception as e:
        # Best-effort step at the script's top level: keep the run alive and
        # show the short message, but also record the full traceback through
        # the module logger so the failure can be diagnosed afterwards.
        logger.exception("Fallback analysis failed for dataset 'glue' (config 'cola')")
        print(f"Error: {e}")