File size: 2,074 Bytes
43642a4
 
 
 
64e67e1
43642a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import os
import logging
from pprint import pprint
from dotenv import load_dotenv
from hf_eda_mcp.integrations.dataset_viewer_adapter import DatasetViewerAdapter
from hf_eda_mcp.tools.analysis import analyze_dataset_features

# Load environment variables (e.g. HF_TOKEN) from a local .env file, if present.
load_dotenv()

# Setup logging
# Writes DEBUG-level logs to scripts.log, truncating it on every run
# (filemode="w") so each script invocation starts with a fresh log.
logging.basicConfig(
    filename="scripts.log",
    encoding='utf-8',
    level=logging.DEBUG,
    filemode="w",
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)


def test_dataset_viewer_analysis(dataset_name="stanfordnlp/imdb"):
    """Fetch and pretty-print split statistics for *dataset_name* via the Dataset Viewer API.

    Args:
        dataset_name: Hugging Face dataset repo id (default: "stanfordnlp/imdb").

    Raises:
        RuntimeError: if the HF_TOKEN environment variable is unset or empty.
    """
    # Fail fast with a clear message instead of an opaque KeyError when the
    # token is missing from the environment / .env file.
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise RuntimeError(
            "HF_TOKEN environment variable is not set; "
            "export a Hugging Face API token before running this script."
        )
    service = DatasetViewerAdapter(token=token)
    # NOTE(review): config/split are hard-coded to the imdb layout — confirm
    # before reusing with other dataset_name values.
    result = service.get_dataset_statistics(dataset_name=dataset_name, config="plain_text", split_name="train")
    pprint(result, indent=2)


def test_dataset_service_analysis(dataset_name="stanfordnlp/imdb"):
    """Run the integrated feature analysis on the train split and pretty-print it.

    Args:
        dataset_name: Hugging Face dataset repo id (default: "stanfordnlp/imdb").
    """
    analysis = analyze_dataset_features(dataset_id=dataset_name, split="train")
    pprint(analysis, indent=2)


def test_statistics_availability(dataset_name="stanfordnlp/imdb"):
    """Check and pretty-print which statistics the Dataset Viewer exposes for *dataset_name*.

    Args:
        dataset_name: Hugging Face dataset repo id (default: "stanfordnlp/imdb").

    Raises:
        RuntimeError: if the HF_TOKEN environment variable is unset or empty.
    """
    # Fail fast with a clear message instead of an opaque KeyError when the
    # token is missing from the environment / .env file.
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise RuntimeError(
            "HF_TOKEN environment variable is not set; "
            "export a Hugging Face API token before running this script."
        )
    service = DatasetViewerAdapter(token=token)
    result = service.check_statistics_availability(dataset_name=dataset_name)
    print(f"\nStatistics availability for {dataset_name}:")
    pprint(result, indent=2)


if __name__ == "__main__":
    # Exercise each entry point in turn, separated by banner headings.
    print("###### Dataset Viewer Statistics Endpoint #######")
    test_dataset_viewer_analysis()

    print("\n###### Integrated Analysis (uses Dataset Viewer when available) #######")
    test_dataset_service_analysis()

    print("\n###### Check Statistics Availability #######")
    test_statistics_availability("stanfordnlp/imdb")

    # Finally, exercise the sampling fallback on a dataset that may lack
    # precomputed statistics; report rather than crash if it fails.
    print("\n###### Testing fallback for dataset without parquet format #######")
    try:
        fallback = analyze_dataset_features(dataset_id="glue", config_name="cola", split="train", sample_size=100)
        method = fallback['sample_info']['sampling_method']
        size = fallback['dataset_info']['sample_size_used']
        print(f"Analysis method: {method}")
        print(f"Sample size: {size}")
    except Exception as e:
        print(f"Error: {e}")