# hf-eda-mcp/scripts/playground/analysis_tool_playground.py
# KhalilGuetari's picture
# Document technical details
# 64e67e1
import os
import logging
from pprint import pprint
from dotenv import load_dotenv
from hf_eda_mcp.integrations.dataset_viewer_adapter import DatasetViewerAdapter
from hf_eda_mcp.tools.analysis import analyze_dataset_features
# Load variables from a local .env file (if any) before the helpers below
# read HF_TOKEN from os.environ.
load_dotenv()

# Setup logging: everything from DEBUG upwards goes to a file rather than
# the console, so the pprint output of the playground stays readable.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename="scripts.log",
    filemode="w",  # start from an empty log file on every run
    encoding='utf-8',
)
logger = logging.getLogger(__name__)
def test_dataset_viewer_analysis(
    dataset_name="stanfordnlp/imdb",
    config="plain_text",
    split_name="train",
):
    """Fetch and pretty-print Dataset Viewer statistics for one dataset split.

    Creates a ``DatasetViewerAdapter`` with the token taken from the
    ``HF_TOKEN`` environment variable, calls ``get_dataset_statistics`` on
    it and prints the returned value with ``pprint``.

    Args:
        dataset_name: Dataset id passed to the adapter.
        config: Dataset configuration to query. Defaults to ``"plain_text"``,
            the value that used to be hard-coded for the default dataset;
            pass a different one when ``dataset_name`` is changed.
        split_name: Split to query. Defaults to ``"train"``.

    Raises:
        KeyError: If ``HF_TOKEN`` is not present in the environment.
    """
    adapter = DatasetViewerAdapter(token=os.environ["HF_TOKEN"])
    statistics = adapter.get_dataset_statistics(
        dataset_name=dataset_name,
        config=config,
        split_name=split_name,
    )
    pprint(statistics, indent=2)
def test_dataset_service_analysis(dataset_name="stanfordnlp/imdb"):
    """Run the integrated feature analysis and pretty-print its result.

    Calls ``analyze_dataset_features`` for the ``"train"`` split of
    ``dataset_name`` and prints whatever it returns with ``pprint``.

    Args:
        dataset_name: Dataset id forwarded as ``dataset_id``.
    """
    pprint(
        analyze_dataset_features(dataset_id=dataset_name, split="train"),
        indent=2,
    )
def test_statistics_availability(dataset_name="stanfordnlp/imdb"):
    """Query and pretty-print statistics availability for a dataset.

    Creates a ``DatasetViewerAdapter`` with the token taken from the
    ``HF_TOKEN`` environment variable and prints the value returned by its
    ``check_statistics_availability`` method under a short header line.

    Args:
        dataset_name: Dataset id to check.

    Raises:
        KeyError: If ``HF_TOKEN`` is not present in the environment.
    """
    availability = DatasetViewerAdapter(
        token=os.environ["HF_TOKEN"]
    ).check_statistics_availability(dataset_name=dataset_name)

    # The header is printed only once the lookup has returned.
    print(f"\nStatistics availability for {dataset_name}:")
    pprint(availability, indent=2)
if __name__ == "__main__":
    # Manual playground run: each section prints a banner and then the
    # pretty-printed result of one helper.
    print("###### Dataset Viewer Statistics Endpoint #######")
    test_dataset_viewer_analysis()

    print("\n###### Integrated Analysis (uses Dataset Viewer when available) #######")
    test_dataset_service_analysis()

    print("\n###### Check Statistics Availability #######")
    test_statistics_availability("stanfordnlp/imdb")

    # Test with a dataset that might not have statistics
    print("\n###### Testing fallback for dataset without parquet format #######")
    try:
        result = analyze_dataset_features(
            dataset_id="glue", config_name="cola", split="train", sample_size=100
        )
        print(f"Analysis method: {result['sample_info']['sampling_method']}")
        print(f"Sample size: {result['dataset_info']['sample_size_used']}")
    except Exception as e:
        # Best-effort demo: keep the short console message, but also record
        # the exception type and traceback through the module logger, since
        # str(e) alone (e.g. "'sample_info'" for a KeyError) is hard to act on.
        logger.exception("Fallback analysis failed for dataset %r (config %r)", "glue", "cola")
        print(f"Error: {e}")