"""Manual smoke-test script for dataset statistics retrieval.

Exercises three paths against a known dataset:
  1. the Dataset Viewer statistics endpoint directly,
  2. the integrated feature-analysis tool,
  3. the statistics-availability check,
then a fallback run on a dataset that may lack precomputed statistics.
"""

import logging
import os
from pprint import pprint

from dotenv import load_dotenv

from hf_eda_mcp.integrations.dataset_viewer_adapter import DatasetViewerAdapter
from hf_eda_mcp.tools.analysis import analyze_dataset_features

load_dotenv()

# Log everything (DEBUG) to a file so the interactive console output stays readable.
logging.basicConfig(
    filename="scripts.log",
    encoding="utf-8",
    level=logging.DEBUG,
    filemode="w",
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


def _hf_token() -> str:
    """Return the Hugging Face API token from the environment.

    Raises:
        KeyError: if ``HF_TOKEN`` is not set (same behavior as the
            previous inline ``os.environ["HF_TOKEN"]`` lookups).
    """
    return os.environ["HF_TOKEN"]


def test_dataset_viewer_analysis(dataset_name: str = "stanfordnlp/imdb") -> None:
    """Fetch and pretty-print statistics straight from the Dataset Viewer endpoint."""
    service = DatasetViewerAdapter(token=_hf_token())
    result = service.get_dataset_statistics(
        dataset_name=dataset_name, config="plain_text", split_name="train"
    )
    pprint(result, indent=2)


def test_dataset_service_analysis(dataset_name: str = "stanfordnlp/imdb") -> None:
    """Run the integrated feature analysis and pretty-print the result."""
    result = analyze_dataset_features(dataset_id=dataset_name, split="train")
    pprint(result, indent=2)


def test_statistics_availability(dataset_name: str = "stanfordnlp/imdb") -> None:
    """Report whether precomputed statistics are available for *dataset_name*."""
    service = DatasetViewerAdapter(token=_hf_token())
    result = service.check_statistics_availability(dataset_name=dataset_name)
    print(f"\nStatistics availability for {dataset_name}:")
    pprint(result, indent=2)


if __name__ == "__main__":
    print("###### Dataset Viewer Statistics Endpoint #######")
    test_dataset_viewer_analysis()

    print("\n###### Integrated Analysis (uses Dataset Viewer when available) #######")
    test_dataset_service_analysis()

    print("\n###### Check Statistics Availability #######")
    test_statistics_availability("stanfordnlp/imdb")

    # Test with a dataset that might not have statistics
    print("\n###### Testing fallback for dataset without parquet format #######")
    try:
        result = analyze_dataset_features(
            dataset_id="glue", config_name="cola", split="train", sample_size=100
        )
        print(f"Analysis method: {result['sample_info']['sampling_method']}")
        print(f"Sample size: {result['dataset_info']['sample_size_used']}")
    except Exception as e:
        # Broad catch is deliberate: this is a best-effort demo of the fallback path,
        # and any failure should be printed rather than abort the script.
        print(f"Error: {e}")