Spaces:
Running
Running
| import os | |
| import logging | |
| from pprint import pprint | |
| from dotenv import load_dotenv | |
| from hf_eda_mcp.integrations.dataset_viewer_adapter import DatasetViewerAdapter | |
| from hf_eda_mcp.tools.analysis import analyze_dataset_features | |
# Pull variables from a local .env file into the process environment
# (the functions below read HF_TOKEN from os.environ).
load_dotenv()

# Send all log records (DEBUG and above) to scripts.log; filemode "w"
# truncates the file at the start of every run.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename="scripts.log",
    filemode="w",
    encoding='utf-8',
)

logger = logging.getLogger(__name__)
def test_dataset_viewer_analysis(dataset_name="stanfordnlp/imdb"):
    """Fetch statistics through DatasetViewerAdapter and pretty-print them.

    Queries the "plain_text" config and the "train" split of the dataset.

    Args:
        dataset_name: Identifier of the dataset to query.

    Raises:
        KeyError: If the HF_TOKEN environment variable is not set.
    """
    hf_token = os.environ["HF_TOKEN"]
    adapter = DatasetViewerAdapter(token=hf_token)
    statistics = adapter.get_dataset_statistics(
        dataset_name=dataset_name,
        config="plain_text",
        split_name="train",
    )
    pprint(statistics, indent=2)
def test_dataset_service_analysis(dataset_name="stanfordnlp/imdb"):
    """Run analyze_dataset_features on the "train" split and pretty-print the result.

    Args:
        dataset_name: Identifier of the dataset, passed on as ``dataset_id``.
    """
    pprint(
        analyze_dataset_features(dataset_id=dataset_name, split="train"),
        indent=2,
    )
def test_statistics_availability(dataset_name="stanfordnlp/imdb"):
    """Ask DatasetViewerAdapter whether statistics are available and print the answer.

    Args:
        dataset_name: Identifier of the dataset to check.

    Raises:
        KeyError: If the HF_TOKEN environment variable is not set.
    """
    hf_token = os.environ["HF_TOKEN"]
    adapter = DatasetViewerAdapter(token=hf_token)
    availability = adapter.check_statistics_availability(dataset_name=dataset_name)

    # The header is printed only after the lookup has returned.
    print(f"\nStatistics availability for {dataset_name}:")
    pprint(availability, indent=2)
if __name__ == "__main__":
    # Manual smoke run: exercise each helper in turn against the default
    # dataset and print the results to stdout.
    print("###### Dataset Viewer Statistics Endpoint #######")
    test_dataset_viewer_analysis()

    print("\n###### Integrated Analysis (uses Dataset Viewer when available) #######")
    test_dataset_service_analysis()

    print("\n###### Check Statistics Availability #######")
    test_statistics_availability("stanfordnlp/imdb")

    # Test with a dataset that might not have statistics
    print("\n###### Testing fallback for dataset without parquet format #######")
    try:
        result = analyze_dataset_features(
            dataset_id="glue",
            config_name="cola",
            split="train",
            sample_size=100,
        )
        print(f"Analysis method: {result['sample_info']['sampling_method']}")
        print(f"Sample size: {result['dataset_info']['sample_size_used']}")
    except Exception as e:
        # Best-effort check: report the failure instead of crashing, but keep
        # the full traceback in the log file so the cause can be diagnosed.
        logger.exception("Fallback analysis failed for dataset 'glue' (config 'cola')")
        print(f"Error: {e}")