KhalilGuetari committed
Commit 43642a4 · Parent: b3aa246

Improve analysis tool with dataset viewer

README.md CHANGED
@@ -17,9 +17,13 @@ An MCP (Model Context Protocol) server that provides tools for Exploratory Data
17
 
18
  ## Features
19
 
20
- - **Dataset Metadata**: Retrieve comprehensive information about HuggingFace datasets
21
  - **Dataset Sampling**: Get samples from any dataset split for quick exploration
22
- - **Feature Analysis**: Perform basic EDA including statistics, missing values, and distributions
 
 
23
 
24
  ## Usage
25
 
@@ -43,9 +47,12 @@ Replace `YOUR-USERNAME` with your HuggingFace username.
43
 
44
  ### Available Tools
45
 
46
- 1. **get_dataset_metadata**: Get detailed information about a dataset
47
- 2. **get_dataset_sample**: Retrieve sample rows from a dataset
48
- 3. **analyze_dataset_features**: Perform exploratory analysis on dataset features
 
 
49
 
50
  ## Authentication
51
 
 
17
 
18
  ## Features
19
 
20
+ - **Dataset Metadata**: Retrieve comprehensive information about HuggingFace datasets including size, features, splits, and configurations
21
  - **Dataset Sampling**: Get samples from any dataset split for quick exploration
22
+ - **Feature Analysis**: Perform basic EDA, automatically selecting the most efficient analysis method (see the sketch after this list)
23
+ - Uses the HuggingFace Dataset Viewer API for full-dataset statistics (when available)
24
+ - Automatic fallback to sample-based analysis
25
+ - Supports multiple data types: numerical, categorical, text, image, audio
26
+ - Includes histograms, distributions, and missing value analysis
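A minimal sketch of using the analysis tool from Python, mirroring the docstring example in `src/hf_eda_mcp/tools/analysis.py` (the import path is the one the playground script in this commit uses):

```python
from hf_eda_mcp.tools.analysis import analyze_dataset_features

analysis = analyze_dataset_features(dataset_id="stanfordnlp/imdb", split="train")
for name, feature in analysis["features"].items():
    # feature_type is e.g. numerical, categorical, text, or boolean
    print(f"{name}: {feature['feature_type']}, "
          f"{feature['missing_percentage']:.1f}% missing")
```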
27
 
28
  ## Usage
29
 
 
47
 
48
  ### Available Tools
49
 
50
+ 1. **get_dataset_metadata**: Get detailed information about a dataset including size, features, splits, and download statistics
51
+ 2. **get_dataset_sample**: Retrieve sample rows from a dataset for quick exploration
52
+ 3. **analyze_dataset_features**: Perform comprehensive exploratory analysis, automatically choosing the most efficient analysis path (see the example below)
53
+ - Automatically uses Dataset Viewer API statistics for parquet datasets (full dataset analysis)
54
+ - Falls back to sample-based analysis for other formats
55
+ - Returns feature types, statistics, histograms, and missing value analysis
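A short sketch of telling the two analysis paths apart. The `sample_info` and `dataset_info` keys below are the ones the playground script in this commit inspects for the sample-based fallback; the exact fields may differ when Dataset Viewer statistics are used:

```python
from hf_eda_mcp.tools.analysis import analyze_dataset_features

result = analyze_dataset_features(
    dataset_id="glue", config_name="cola", split="train", sample_size=100
)
# Keys taken from scripts/playground/analysis_tool_playground.py:
print(result["sample_info"]["sampling_method"])
print(result["dataset_info"]["sample_size_used"])
```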
56
 
57
  ## Authentication
58
 
pdm.lock CHANGED
@@ -2,10 +2,10 @@
2
  # It is not intended for manual editing.
3
 
4
  [metadata]
5
- groups = ["default", "hf-cli", "plots"]
6
  strategy = ["inherit_metadata"]
7
  lock_version = "4.5.0"
8
- content_hash = "sha256:4742028e3a2ecbfce41b8229882183b577923af0d34e1e32cf3e8b37314ed204"
9
 
10
  [[metadata.targets]]
11
  requires_python = ">=3.13"
@@ -324,8 +324,8 @@ name = "colorama"
324
  version = "0.4.6"
325
  requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
326
  summary = "Cross-platform colored terminal text."
327
- groups = ["default", "hf-cli"]
328
- marker = "platform_system == \"Windows\""
329
  files = [
330
  {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
331
  {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
@@ -388,6 +388,137 @@ files = [
388
  {file = "contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880"},
389
  ]
390
 
 
391
  [[package]]
392
  name = "cycler"
393
  version = "0.12.1"
@@ -807,6 +938,17 @@ files = [
807
  {file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"},
808
  ]
809
 
 
810
  [[package]]
811
  name = "jinja2"
812
  version = "3.1.6"
@@ -1208,7 +1350,7 @@ name = "packaging"
1208
  version = "25.0"
1209
  requires_python = ">=3.8"
1210
  summary = "Core utilities for Python packages"
1211
- groups = ["default", "hf-cli", "plots"]
1212
  files = [
1213
  {file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"},
1214
  {file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"},
@@ -1315,6 +1457,17 @@ files = [
1315
  {file = "pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523"},
1316
  ]
1317
 
 
1318
  [[package]]
1319
  name = "propcache"
1320
  version = "0.4.1"
@@ -1486,7 +1639,7 @@ name = "pygments"
1486
  version = "2.19.2"
1487
  requires_python = ">=3.8"
1488
  summary = "Pygments is a syntax highlighting package written in Python."
1489
- groups = ["default"]
1490
  files = [
1491
  {file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"},
1492
  {file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"},
@@ -1503,6 +1656,58 @@ files = [
1503
  {file = "pyparsing-3.2.5.tar.gz", hash = "sha256:2df8d5b7b2802ef88e8d016a2eb9c7aeaa923529cd251ed0fe4608275d4105b6"},
1504
  ]
1505
 
 
 
1506
  [[package]]
1507
  name = "python-dateutil"
1508
  version = "2.9.0.post0"
 
2
  # It is not intended for manual editing.
3
 
4
  [metadata]
5
+ groups = ["default", "hf-cli", "plots", "test"]
6
  strategy = ["inherit_metadata"]
7
  lock_version = "4.5.0"
8
+ content_hash = "sha256:7db937b9435dfaf07c2e27ae0b16da07ce0764665446873e8f40e81af6d5b5b4"
9
 
10
  [[metadata.targets]]
11
  requires_python = ">=3.13"
 
324
  version = "0.4.6"
325
  requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
326
  summary = "Cross-platform colored terminal text."
327
+ groups = ["default", "hf-cli", "test"]
328
+ marker = "sys_platform == \"win32\" or platform_system == \"Windows\""
329
  files = [
330
  {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
331
  {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
 
388
  {file = "contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880"},
389
  ]
390
 
391
+ [[package]]
392
+ name = "coverage"
393
+ version = "7.12.0"
394
+ requires_python = ">=3.10"
395
+ summary = "Code coverage measurement for Python"
396
+ groups = ["test"]
397
+ files = [
398
+ {file = "coverage-7.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:47324fffca8d8eae7e185b5bb20c14645f23350f870c1649003618ea91a78941"},
399
+ {file = "coverage-7.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ccf3b2ede91decd2fb53ec73c1f949c3e034129d1e0b07798ff1d02ea0c8fa4a"},
400
+ {file = "coverage-7.12.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:b365adc70a6936c6b0582dc38746b33b2454148c02349345412c6e743efb646d"},
401
+ {file = "coverage-7.12.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bc13baf85cd8a4cfcf4a35c7bc9d795837ad809775f782f697bf630b7e200211"},
402
+ {file = "coverage-7.12.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:099d11698385d572ceafb3288a5b80fe1fc58bf665b3f9d362389de488361d3d"},
403
+ {file = "coverage-7.12.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:473dc45d69694069adb7680c405fb1e81f60b2aff42c81e2f2c3feaf544d878c"},
404
+ {file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:583f9adbefd278e9de33c33d6846aa8f5d164fa49b47144180a0e037f0688bb9"},
405
+ {file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b2089cc445f2dc0af6f801f0d1355c025b76c24481935303cf1af28f636688f0"},
406
+ {file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:950411f1eb5d579999c5f66c62a40961f126fc71e5e14419f004471957b51508"},
407
+ {file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b1aab7302a87bafebfe76b12af681b56ff446dc6f32ed178ff9c092ca776e6bc"},
408
+ {file = "coverage-7.12.0-cp313-cp313-win32.whl", hash = "sha256:d7e0d0303c13b54db495eb636bc2465b2fb8475d4c8bcec8fe4b5ca454dfbae8"},
409
+ {file = "coverage-7.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:ce61969812d6a98a981d147d9ac583a36ac7db7766f2e64a9d4d059c2fe29d07"},
410
+ {file = "coverage-7.12.0-cp313-cp313-win_arm64.whl", hash = "sha256:bcec6f47e4cb8a4c2dc91ce507f6eefc6a1b10f58df32cdc61dff65455031dfc"},
411
+ {file = "coverage-7.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:459443346509476170d553035e4a3eed7b860f4fe5242f02de1010501956ce87"},
412
+ {file = "coverage-7.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:04a79245ab2b7a61688958f7a855275997134bc84f4a03bc240cf64ff132abf6"},
413
+ {file = "coverage-7.12.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:09a86acaaa8455f13d6a99221d9654df249b33937b4e212b4e5a822065f12aa7"},
414
+ {file = "coverage-7.12.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:907e0df1b71ba77463687a74149c6122c3f6aac56c2510a5d906b2f368208560"},
415
+ {file = "coverage-7.12.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b57e2d0ddd5f0582bae5437c04ee71c46cd908e7bc5d4d0391f9a41e812dd12"},
416
+ {file = "coverage-7.12.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:58c1c6aa677f3a1411fe6fb28ec3a942e4f665df036a3608816e0847fad23296"},
417
+ {file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4c589361263ab2953e3c4cd2a94db94c4ad4a8e572776ecfbad2389c626e4507"},
418
+ {file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:91b810a163ccad2e43b1faa11d70d3cf4b6f3d83f9fd5f2df82a32d47b648e0d"},
419
+ {file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:40c867af715f22592e0d0fb533a33a71ec9e0f73a6945f722a0c85c8c1cbe3a2"},
420
+ {file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:68b0d0a2d84f333de875666259dadf28cc67858bc8fd8b3f1eae84d3c2bec455"},
421
+ {file = "coverage-7.12.0-cp313-cp313t-win32.whl", hash = "sha256:73f9e7fbd51a221818fd11b7090eaa835a353ddd59c236c57b2199486b116c6d"},
422
+ {file = "coverage-7.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:24cff9d1f5743f67db7ba46ff284018a6e9aeb649b67aa1e70c396aa1b7cb23c"},
423
+ {file = "coverage-7.12.0-cp313-cp313t-win_arm64.whl", hash = "sha256:c87395744f5c77c866d0f5a43d97cc39e17c7f1cb0115e54a2fe67ca75c5d14d"},
424
+ {file = "coverage-7.12.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:a1c59b7dc169809a88b21a936eccf71c3895a78f5592051b1af8f4d59c2b4f92"},
425
+ {file = "coverage-7.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8787b0f982e020adb732b9f051f3e49dd5054cebbc3f3432061278512a2b1360"},
426
+ {file = "coverage-7.12.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5ea5a9f7dc8877455b13dd1effd3202e0bca72f6f3ab09f9036b1bcf728f69ac"},
427
+ {file = "coverage-7.12.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fdba9f15849534594f60b47c9a30bc70409b54947319a7c4fd0e8e3d8d2f355d"},
428
+ {file = "coverage-7.12.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a00594770eb715854fb1c57e0dea08cce6720cfbc531accdb9850d7c7770396c"},
429
+ {file = "coverage-7.12.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5560c7e0d82b42eb1951e4f68f071f8017c824ebfd5a6ebe42c60ac16c6c2434"},
430
+ {file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d6c2e26b481c9159c2773a37947a9718cfdc58893029cdfb177531793e375cfc"},
431
+ {file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:6e1a8c066dabcde56d5d9fed6a66bc19a2883a3fe051f0c397a41fc42aedd4cc"},
432
+ {file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:f7ba9da4726e446d8dd8aae5a6cd872511184a5d861de80a86ef970b5dacce3e"},
433
+ {file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e0f483ab4f749039894abaf80c2f9e7ed77bbf3c737517fb88c8e8e305896a17"},
434
+ {file = "coverage-7.12.0-cp314-cp314-win32.whl", hash = "sha256:76336c19a9ef4a94b2f8dc79f8ac2da3f193f625bb5d6f51a328cd19bfc19933"},
435
+ {file = "coverage-7.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:7c1059b600aec6ef090721f8f633f60ed70afaffe8ecab85b59df748f24b31fe"},
436
+ {file = "coverage-7.12.0-cp314-cp314-win_arm64.whl", hash = "sha256:172cf3a34bfef42611963e2b661302a8931f44df31629e5b1050567d6b90287d"},
437
+ {file = "coverage-7.12.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:aa7d48520a32cb21c7a9b31f81799e8eaec7239db36c3b670be0fa2403828d1d"},
438
+ {file = "coverage-7.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:90d58ac63bc85e0fb919f14d09d6caa63f35a5512a2205284b7816cafd21bb03"},
439
+ {file = "coverage-7.12.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ca8ecfa283764fdda3eae1bdb6afe58bf78c2c3ec2b2edcb05a671f0bba7b3f9"},
440
+ {file = "coverage-7.12.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:874fe69a0785d96bd066059cd4368022cebbec1a8958f224f0016979183916e6"},
441
+ {file = "coverage-7.12.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5b3c889c0b8b283a24d721a9eabc8ccafcfc3aebf167e4cd0d0e23bf8ec4e339"},
442
+ {file = "coverage-7.12.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8bb5b894b3ec09dcd6d3743229dc7f2c42ef7787dc40596ae04c0edda487371e"},
443
+ {file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:79a44421cd5fba96aa57b5e3b5a4d3274c449d4c622e8f76882d76635501fd13"},
444
+ {file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:33baadc0efd5c7294f436a632566ccc1f72c867f82833eb59820ee37dc811c6f"},
445
+ {file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:c406a71f544800ef7e9e0000af706b88465f3573ae8b8de37e5f96c59f689ad1"},
446
+ {file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e71bba6a40883b00c6d571599b4627f50c360b3d0d02bfc658168936be74027b"},
447
+ {file = "coverage-7.12.0-cp314-cp314t-win32.whl", hash = "sha256:9157a5e233c40ce6613dead4c131a006adfda70e557b6856b97aceed01b0e27a"},
448
+ {file = "coverage-7.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:e84da3a0fd233aeec797b981c51af1cabac74f9bd67be42458365b30d11b5291"},
449
+ {file = "coverage-7.12.0-cp314-cp314t-win_arm64.whl", hash = "sha256:01d24af36fedda51c2b1aca56e4330a3710f83b02a5ff3743a6b015ffa7c9384"},
450
+ {file = "coverage-7.12.0-py3-none-any.whl", hash = "sha256:159d50c0b12e060b15ed3d39f87ed43d4f7f7ad40b8a534f4dd331adbb51104a"},
451
+ {file = "coverage-7.12.0.tar.gz", hash = "sha256:fc11e0a4e372cb5f282f16ef90d4a585034050ccda536451901abfb19a57f40c"},
452
+ ]
453
+
454
+ [[package]]
455
+ name = "coverage"
456
+ version = "7.12.0"
457
+ extras = ["toml"]
458
+ requires_python = ">=3.10"
459
+ summary = "Code coverage measurement for Python"
460
+ groups = ["test"]
461
+ dependencies = [
462
+ "coverage==7.12.0",
463
+ "tomli; python_full_version <= \"3.11.0a6\"",
464
+ ]
465
+ files = [
466
+ {file = "coverage-7.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:47324fffca8d8eae7e185b5bb20c14645f23350f870c1649003618ea91a78941"},
467
+ {file = "coverage-7.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ccf3b2ede91decd2fb53ec73c1f949c3e034129d1e0b07798ff1d02ea0c8fa4a"},
468
+ {file = "coverage-7.12.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:b365adc70a6936c6b0582dc38746b33b2454148c02349345412c6e743efb646d"},
469
+ {file = "coverage-7.12.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bc13baf85cd8a4cfcf4a35c7bc9d795837ad809775f782f697bf630b7e200211"},
470
+ {file = "coverage-7.12.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:099d11698385d572ceafb3288a5b80fe1fc58bf665b3f9d362389de488361d3d"},
471
+ {file = "coverage-7.12.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:473dc45d69694069adb7680c405fb1e81f60b2aff42c81e2f2c3feaf544d878c"},
472
+ {file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:583f9adbefd278e9de33c33d6846aa8f5d164fa49b47144180a0e037f0688bb9"},
473
+ {file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b2089cc445f2dc0af6f801f0d1355c025b76c24481935303cf1af28f636688f0"},
474
+ {file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:950411f1eb5d579999c5f66c62a40961f126fc71e5e14419f004471957b51508"},
475
+ {file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b1aab7302a87bafebfe76b12af681b56ff446dc6f32ed178ff9c092ca776e6bc"},
476
+ {file = "coverage-7.12.0-cp313-cp313-win32.whl", hash = "sha256:d7e0d0303c13b54db495eb636bc2465b2fb8475d4c8bcec8fe4b5ca454dfbae8"},
477
+ {file = "coverage-7.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:ce61969812d6a98a981d147d9ac583a36ac7db7766f2e64a9d4d059c2fe29d07"},
478
+ {file = "coverage-7.12.0-cp313-cp313-win_arm64.whl", hash = "sha256:bcec6f47e4cb8a4c2dc91ce507f6eefc6a1b10f58df32cdc61dff65455031dfc"},
479
+ {file = "coverage-7.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:459443346509476170d553035e4a3eed7b860f4fe5242f02de1010501956ce87"},
480
+ {file = "coverage-7.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:04a79245ab2b7a61688958f7a855275997134bc84f4a03bc240cf64ff132abf6"},
481
+ {file = "coverage-7.12.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:09a86acaaa8455f13d6a99221d9654df249b33937b4e212b4e5a822065f12aa7"},
482
+ {file = "coverage-7.12.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:907e0df1b71ba77463687a74149c6122c3f6aac56c2510a5d906b2f368208560"},
483
+ {file = "coverage-7.12.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b57e2d0ddd5f0582bae5437c04ee71c46cd908e7bc5d4d0391f9a41e812dd12"},
484
+ {file = "coverage-7.12.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:58c1c6aa677f3a1411fe6fb28ec3a942e4f665df036a3608816e0847fad23296"},
485
+ {file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4c589361263ab2953e3c4cd2a94db94c4ad4a8e572776ecfbad2389c626e4507"},
486
+ {file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:91b810a163ccad2e43b1faa11d70d3cf4b6f3d83f9fd5f2df82a32d47b648e0d"},
487
+ {file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:40c867af715f22592e0d0fb533a33a71ec9e0f73a6945f722a0c85c8c1cbe3a2"},
488
+ {file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:68b0d0a2d84f333de875666259dadf28cc67858bc8fd8b3f1eae84d3c2bec455"},
489
+ {file = "coverage-7.12.0-cp313-cp313t-win32.whl", hash = "sha256:73f9e7fbd51a221818fd11b7090eaa835a353ddd59c236c57b2199486b116c6d"},
490
+ {file = "coverage-7.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:24cff9d1f5743f67db7ba46ff284018a6e9aeb649b67aa1e70c396aa1b7cb23c"},
491
+ {file = "coverage-7.12.0-cp313-cp313t-win_arm64.whl", hash = "sha256:c87395744f5c77c866d0f5a43d97cc39e17c7f1cb0115e54a2fe67ca75c5d14d"},
492
+ {file = "coverage-7.12.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:a1c59b7dc169809a88b21a936eccf71c3895a78f5592051b1af8f4d59c2b4f92"},
493
+ {file = "coverage-7.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8787b0f982e020adb732b9f051f3e49dd5054cebbc3f3432061278512a2b1360"},
494
+ {file = "coverage-7.12.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5ea5a9f7dc8877455b13dd1effd3202e0bca72f6f3ab09f9036b1bcf728f69ac"},
495
+ {file = "coverage-7.12.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fdba9f15849534594f60b47c9a30bc70409b54947319a7c4fd0e8e3d8d2f355d"},
496
+ {file = "coverage-7.12.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a00594770eb715854fb1c57e0dea08cce6720cfbc531accdb9850d7c7770396c"},
497
+ {file = "coverage-7.12.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5560c7e0d82b42eb1951e4f68f071f8017c824ebfd5a6ebe42c60ac16c6c2434"},
498
+ {file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d6c2e26b481c9159c2773a37947a9718cfdc58893029cdfb177531793e375cfc"},
499
+ {file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:6e1a8c066dabcde56d5d9fed6a66bc19a2883a3fe051f0c397a41fc42aedd4cc"},
500
+ {file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:f7ba9da4726e446d8dd8aae5a6cd872511184a5d861de80a86ef970b5dacce3e"},
501
+ {file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e0f483ab4f749039894abaf80c2f9e7ed77bbf3c737517fb88c8e8e305896a17"},
502
+ {file = "coverage-7.12.0-cp314-cp314-win32.whl", hash = "sha256:76336c19a9ef4a94b2f8dc79f8ac2da3f193f625bb5d6f51a328cd19bfc19933"},
503
+ {file = "coverage-7.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:7c1059b600aec6ef090721f8f633f60ed70afaffe8ecab85b59df748f24b31fe"},
504
+ {file = "coverage-7.12.0-cp314-cp314-win_arm64.whl", hash = "sha256:172cf3a34bfef42611963e2b661302a8931f44df31629e5b1050567d6b90287d"},
505
+ {file = "coverage-7.12.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:aa7d48520a32cb21c7a9b31f81799e8eaec7239db36c3b670be0fa2403828d1d"},
506
+ {file = "coverage-7.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:90d58ac63bc85e0fb919f14d09d6caa63f35a5512a2205284b7816cafd21bb03"},
507
+ {file = "coverage-7.12.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ca8ecfa283764fdda3eae1bdb6afe58bf78c2c3ec2b2edcb05a671f0bba7b3f9"},
508
+ {file = "coverage-7.12.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:874fe69a0785d96bd066059cd4368022cebbec1a8958f224f0016979183916e6"},
509
+ {file = "coverage-7.12.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5b3c889c0b8b283a24d721a9eabc8ccafcfc3aebf167e4cd0d0e23bf8ec4e339"},
510
+ {file = "coverage-7.12.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8bb5b894b3ec09dcd6d3743229dc7f2c42ef7787dc40596ae04c0edda487371e"},
511
+ {file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:79a44421cd5fba96aa57b5e3b5a4d3274c449d4c622e8f76882d76635501fd13"},
512
+ {file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:33baadc0efd5c7294f436a632566ccc1f72c867f82833eb59820ee37dc811c6f"},
513
+ {file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:c406a71f544800ef7e9e0000af706b88465f3573ae8b8de37e5f96c59f689ad1"},
514
+ {file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e71bba6a40883b00c6d571599b4627f50c360b3d0d02bfc658168936be74027b"},
515
+ {file = "coverage-7.12.0-cp314-cp314t-win32.whl", hash = "sha256:9157a5e233c40ce6613dead4c131a006adfda70e557b6856b97aceed01b0e27a"},
516
+ {file = "coverage-7.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:e84da3a0fd233aeec797b981c51af1cabac74f9bd67be42458365b30d11b5291"},
517
+ {file = "coverage-7.12.0-cp314-cp314t-win_arm64.whl", hash = "sha256:01d24af36fedda51c2b1aca56e4330a3710f83b02a5ff3743a6b015ffa7c9384"},
518
+ {file = "coverage-7.12.0-py3-none-any.whl", hash = "sha256:159d50c0b12e060b15ed3d39f87ed43d4f7f7ad40b8a534f4dd331adbb51104a"},
519
+ {file = "coverage-7.12.0.tar.gz", hash = "sha256:fc11e0a4e372cb5f282f16ef90d4a585034050ccda536451901abfb19a57f40c"},
520
+ ]
521
+
522
  [[package]]
523
  name = "cycler"
524
  version = "0.12.1"
 
938
  {file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"},
939
  ]
940
 
941
+ [[package]]
942
+ name = "iniconfig"
943
+ version = "2.3.0"
944
+ requires_python = ">=3.10"
945
+ summary = "brain-dead simple config-ini parsing"
946
+ groups = ["test"]
947
+ files = [
948
+ {file = "iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12"},
949
+ {file = "iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730"},
950
+ ]
951
+
952
  [[package]]
953
  name = "jinja2"
954
  version = "3.1.6"
 
1350
  version = "25.0"
1351
  requires_python = ">=3.8"
1352
  summary = "Core utilities for Python packages"
1353
+ groups = ["default", "hf-cli", "plots", "test"]
1354
  files = [
1355
  {file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"},
1356
  {file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"},
 
1457
  {file = "pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523"},
1458
  ]
1459
 
1460
+ [[package]]
1461
+ name = "pluggy"
1462
+ version = "1.6.0"
1463
+ requires_python = ">=3.9"
1464
+ summary = "plugin and hook calling mechanisms for python"
1465
+ groups = ["test"]
1466
+ files = [
1467
+ {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"},
1468
+ {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"},
1469
+ ]
1470
+
1471
  [[package]]
1472
  name = "propcache"
1473
  version = "0.4.1"
 
1639
  version = "2.19.2"
1640
  requires_python = ">=3.8"
1641
  summary = "Pygments is a syntax highlighting package written in Python."
1642
+ groups = ["default", "test"]
1643
  files = [
1644
  {file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"},
1645
  {file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"},
 
1656
  {file = "pyparsing-3.2.5.tar.gz", hash = "sha256:2df8d5b7b2802ef88e8d016a2eb9c7aeaa923529cd251ed0fe4608275d4105b6"},
1657
  ]
1658
 
1659
+ [[package]]
1660
+ name = "pytest"
1661
+ version = "9.0.1"
1662
+ requires_python = ">=3.10"
1663
+ summary = "pytest: simple powerful testing with Python"
1664
+ groups = ["test"]
1665
+ dependencies = [
1666
+ "colorama>=0.4; sys_platform == \"win32\"",
1667
+ "exceptiongroup>=1; python_version < \"3.11\"",
1668
+ "iniconfig>=1.0.1",
1669
+ "packaging>=22",
1670
+ "pluggy<2,>=1.5",
1671
+ "pygments>=2.7.2",
1672
+ "tomli>=1; python_version < \"3.11\"",
1673
+ ]
1674
+ files = [
1675
+ {file = "pytest-9.0.1-py3-none-any.whl", hash = "sha256:67be0030d194df2dfa7b556f2e56fb3c3315bd5c8822c6951162b92b32ce7dad"},
1676
+ {file = "pytest-9.0.1.tar.gz", hash = "sha256:3e9c069ea73583e255c3b21cf46b8d3c56f6e3a1a8f6da94ccb0fcf57b9d73c8"},
1677
+ ]
1678
+
1679
+ [[package]]
1680
+ name = "pytest-asyncio"
1681
+ version = "1.3.0"
1682
+ requires_python = ">=3.10"
1683
+ summary = "Pytest support for asyncio"
1684
+ groups = ["test"]
1685
+ dependencies = [
1686
+ "backports-asyncio-runner<2,>=1.1; python_version < \"3.11\"",
1687
+ "pytest<10,>=8.2",
1688
+ "typing-extensions>=4.12; python_version < \"3.13\"",
1689
+ ]
1690
+ files = [
1691
+ {file = "pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5"},
1692
+ {file = "pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5"},
1693
+ ]
1694
+
1695
+ [[package]]
1696
+ name = "pytest-cov"
1697
+ version = "7.0.0"
1698
+ requires_python = ">=3.9"
1699
+ summary = "Pytest plugin for measuring coverage."
1700
+ groups = ["test"]
1701
+ dependencies = [
1702
+ "coverage[toml]>=7.10.6",
1703
+ "pluggy>=1.2",
1704
+ "pytest>=7",
1705
+ ]
1706
+ files = [
1707
+ {file = "pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861"},
1708
+ {file = "pytest_cov-7.0.0.tar.gz", hash = "sha256:33c97eda2e049a0c5298e91f519302a1334c26ac65c1a483d6206fd458361af1"},
1709
+ ]
1710
+
1711
  [[package]]
1712
  name = "python-dateutil"
1713
  version = "2.9.0.post0"
pyproject.toml CHANGED
@@ -41,18 +41,21 @@ where = ["src"]
41
  hf-eda-mcp = {cmd="python -m hf_eda_mcp", env_file= ".env"}
42
  hf_client_playground = "python -m scripts.playground.hf_client_playground"
43
  metadata_playground = "python -m scripts.playground.metadata_tool_playground"
 
44
 
45
  [tool.pdm]
46
  distribution = true
47
 
48
  [tool.pdm.dev-dependencies]
49
- test = [
50
- "pytest>=7.0.0",
51
- "pytest-asyncio>=0.21.0",
52
- "pytest-cov>=4.0.0"
53
- ]
54
  lint = [
55
  "ruff>=0.1.0",
56
  "black>=23.0.0",
57
  "mypy>=1.0.0"
58
  ]
 
 
 
41
  hf-eda-mcp = {cmd="python -m hf_eda_mcp", env_file= ".env"}
42
  hf_client_playground = "python -m scripts.playground.hf_client_playground"
43
  metadata_playground = "python -m scripts.playground.metadata_tool_playground"
44
+ analysis_playground = "python -m scripts.playground.analysis_tool_playground"
45
 
46
  [tool.pdm]
47
  distribution = true
48
 
49
  [tool.pdm.dev-dependencies]
 
 
50
  lint = [
51
  "ruff>=0.1.0",
52
  "black>=23.0.0",
53
  "mypy>=1.0.0"
54
  ]
55
+
56
+ [dependency-groups]
57
+ test = [
58
+ "pytest>=9.0.1",
59
+ "pytest-asyncio>=0.21.0",
60
+ "pytest-cov>=4.0.0",
61
+ ]
scripts/playground/analysis_tool_playground.py ADDED
@@ -0,0 +1,57 @@
 
 
1
+ import os
2
+ import logging
3
+ from pprint import pprint
4
+ from dotenv import load_dotenv
5
+ from hf_eda_mcp.services.dataset_viewer_adapter import DatasetViewerAdapter
6
+ from hf_eda_mcp.tools.analysis import analyze_dataset_features
7
+
8
+ load_dotenv()
9
+
10
+ # Setup logging
11
+ logging.basicConfig(
12
+ filename="scripts.log",
13
+ encoding='utf-8',
14
+ level=logging.DEBUG,
15
+ filemode="w",
16
+ format='%(asctime)s - %(levelname)s - %(message)s',
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ def test_dataset_viewer_analysis(dataset_name="stanfordnlp/imdb"):
23
+ service = DatasetViewerAdapter(token=os.environ["HF_TOKEN"])
24
+ result = service.get_dataset_statistics(dataset_name=dataset_name, config="plain_text", split_name="train")
25
+ pprint(result, indent=2)
26
+
27
+
28
+ def test_dataset_service_analysis(dataset_name="stanfordnlp/imdb"):
29
+ result = analyze_dataset_features(dataset_id=dataset_name, split="train")
30
+ pprint(result, indent=2)
31
+
32
+
33
+ def test_statistics_availability(dataset_name="stanfordnlp/imdb"):
34
+ service = DatasetViewerAdapter(token=os.environ["HF_TOKEN"])
35
+ result = service.check_statistics_availability(dataset_name=dataset_name)
36
+ print(f"\nStatistics availability for {dataset_name}:")
37
+ pprint(result, indent=2)
38
+
39
+
40
+ if __name__ == "__main__":
41
+ print("###### Dataset Viewer Statistics Endpoint #######")
42
+ test_dataset_viewer_analysis()
43
+
44
+ print("\n###### Integrated Analysis (uses Dataset Viewer when available) #######")
45
+ test_dataset_service_analysis()
46
+
47
+ print("\n###### Check Statistics Availability #######")
48
+ test_statistics_availability("stanfordnlp/imdb")
49
+
50
+ # Test with a dataset that might not have statistics
51
+ print("\n###### Testing fallback for dataset without parquet format #######")
52
+ try:
53
+ result = analyze_dataset_features(dataset_id="glue", config_name="cola", split="train", sample_size=100)
54
+ print(f"Analysis method: {result['sample_info']['sampling_method']}")
55
+ print(f"Sample size: {result['dataset_info']['sample_size_used']}")
56
+ except Exception as e:
57
+ print(f"Error: {e}")
src/hf_eda_mcp/services/dataset_service.py CHANGED
@@ -79,9 +79,11 @@ class DatasetService:
79
  # Cache subdirectories
80
  self.metadata_cache_dir = self.cache_dir / "metadata"
81
  self.sample_cache_dir = self.cache_dir / "samples"
 
82
 
83
  self.metadata_cache_dir.mkdir(exist_ok=True)
84
  self.sample_cache_dir.mkdir(exist_ok=True)
 
85
 
86
  logger.info(f"DatasetService initialized with cache dir: {self.cache_dir}")
87
 
@@ -101,6 +103,16 @@ class DatasetService:
101
  """Generate cache key for dataset samples."""
102
  base_key = self._get_cache_key(dataset_id, config_name)
103
  return f"{base_key}_{split}_{num_samples}"
 
 
104
 
105
  def _is_cache_valid(self, cache_file: Path) -> bool:
106
  """Check if cache file exists and is within TTL."""
@@ -572,6 +584,123 @@ class DatasetService:
572
  f"Failed to load dataset sample: {str(e)}"
573
  ) from e
574
 
 
 
575
  def get_cached_metadata(self, dataset_id: str, config_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
576
  """
577
  Retrieve cached metadata without making API calls.
@@ -602,6 +731,8 @@ class DatasetService:
602
  cache_file.unlink()
603
  for cache_file in self.sample_cache_dir.glob("*.json"):
604
  cache_file.unlink()
 
 
605
  logger.info("Cleared all cache")
606
  else:
607
  # Clear cache for specific dataset
@@ -615,6 +746,10 @@ class DatasetService:
615
  for cache_file in self.sample_cache_dir.glob(f"{cache_key}*.json"):
616
  cache_file.unlink()
617
 
 
 
618
  logger.info(f"Cleared cache for dataset: {dataset_id}")
619
 
620
  except Exception as e:
@@ -631,19 +766,23 @@ class DatasetService:
631
  try:
632
  metadata_files = list(self.metadata_cache_dir.glob("*.json"))
633
  sample_files = list(self.sample_cache_dir.glob("*.json"))
 
634
 
635
  # Calculate cache sizes
636
  metadata_size = sum(f.stat().st_size for f in metadata_files)
637
  sample_size = sum(f.stat().st_size for f in sample_files)
 
638
 
639
  return {
640
  'cache_dir': str(self.cache_dir),
641
  'metadata_files': len(metadata_files),
642
  'sample_files': len(sample_files),
643
- 'total_files': len(metadata_files) + len(sample_files),
 
644
  'metadata_size_bytes': metadata_size,
645
  'sample_size_bytes': sample_size,
646
- 'total_size_bytes': metadata_size + sample_size,
 
647
  'cache_ttl_seconds': self.cache_ttl
648
  }
649
  except Exception as e:
 
79
  # Cache subdirectories
80
  self.metadata_cache_dir = self.cache_dir / "metadata"
81
  self.sample_cache_dir = self.cache_dir / "samples"
82
+ self.statistics_cache_dir = self.cache_dir / "statistics"
83
 
84
  self.metadata_cache_dir.mkdir(exist_ok=True)
85
  self.sample_cache_dir.mkdir(exist_ok=True)
86
+ self.statistics_cache_dir.mkdir(exist_ok=True)
87
 
88
  logger.info(f"DatasetService initialized with cache dir: {self.cache_dir}")
89
 
 
103
  """Generate cache key for dataset samples."""
104
  base_key = self._get_cache_key(dataset_id, config_name)
105
  return f"{base_key}_{split}_{num_samples}"
106
+
107
+ def _get_statistics_cache_key(
108
+ self,
109
+ dataset_id: str,
110
+ split: str,
111
+ config_name: Optional[str] = None
112
+ ) -> str:
113
+ """Generate cache key for dataset statistics."""
114
+ base_key = self._get_cache_key(dataset_id, config_name)
115
+ return f"{base_key}_{split}_stats"
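For comparison with the sample cache key above, a hypothetical `base_key` (the real output of `_get_cache_key` is not shown in this diff) gives:

```python
base_key = "stanfordnlp_imdb_plain_text"  # hypothetical; actual format assumed
sample_key = f"{base_key}_train_100"      # _get_sample_cache_key(..., num_samples=100)
stats_key = f"{base_key}_train_stats"     # _get_statistics_cache_key(..., split="train")
```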
116
 
117
  def _is_cache_valid(self, cache_file: Path) -> bool:
118
  """Check if cache file exists and is within TTL."""
 
584
  f"Failed to load dataset sample: {str(e)}"
585
  ) from e
586
 
587
+ def get_dataset_statistics(
588
+ self,
589
+ dataset_id: str,
590
+ split: str = "train",
591
+ config_name: Optional[str] = None,
592
+ use_cache: bool = True
593
+ ) -> Optional[Dict[str, Any]]:
594
+ """
595
+ Get detailed statistics from Dataset Viewer API with caching.
596
+
597
+ This method provides comprehensive statistics directly from HuggingFace's
598
+ Dataset Viewer API, which is more efficient and complete than sampling.
599
+
600
+ Statistics are only available for datasets with builder_name="parquet".
601
+ If statistics are not available, returns None and the caller should fall
602
+ back to sample-based analysis.
603
+
604
+ Args:
605
+ dataset_id: HuggingFace dataset identifier
606
+ split: Dataset split to get statistics for
607
+ config_name: Optional configuration name
608
+ use_cache: Whether to use cached statistics (default: True)
609
+
610
+ Returns:
611
+ Dictionary containing statistics or None if unavailable:
612
+ - num_examples: Total number of examples
613
+ - statistics: List of column statistics
614
+ - partial: Whether response is partial
615
+ - _cached_at: Cache timestamp
616
+
617
+ Raises:
618
+ DatasetServiceError: If the API request fails unexpectedly
619
+ """
620
+ context = {
621
+ "dataset_id": dataset_id,
622
+ "split": split,
623
+ "config_name": config_name,
624
+ "operation": "get_dataset_statistics"
625
+ }
626
+
627
+ # Check cache first if enabled
628
+ if use_cache:
629
+ cache_key = self._get_statistics_cache_key(dataset_id, split, config_name)
630
+ cache_file = self.statistics_cache_dir / f"{cache_key}.json"
631
+
632
+ cached_data = self._load_from_cache(cache_file)
633
+ if cached_data is not None:
634
+ logger.debug(f"Using cached statistics for {dataset_id}/{split}")
635
+ return cached_data
636
+
637
+ try:
638
+ # First, check if statistics are available for this dataset
639
+ logger.info(f"Checking statistics availability for {dataset_id}")
640
+ availability = self.dataset_viewer.check_statistics_availability(
641
+ dataset_id, config_name
642
+ )
643
+
644
+ if not availability['available']:
645
+ logger.info(
646
+ f"Statistics not available for {dataset_id}: {availability['reason']}"
647
+ )
648
+ return None
649
+
650
+ # Determine which config to use
651
+ if config_name is None:
652
+ # Use first available config
653
+ available_configs = availability['configs']
654
+ if not available_configs:
655
+ logger.warning(f"No configs with statistics found for {dataset_id}")
656
+ return None
657
+ config_name = available_configs[0]
658
+ logger.info(f"Using config '{config_name}' for statistics")
659
+ elif config_name not in availability['configs']:
660
+ logger.warning(
661
+ f"Config '{config_name}' does not support statistics. "
662
+ f"Available configs: {availability['configs']}"
663
+ )
664
+ return None
665
+
666
+ # Fetch statistics from API
667
+ logger.info(f"Fetching statistics for {dataset_id}/{config_name}/{split}")
668
+ statistics = self.dataset_viewer.get_dataset_statistics(
669
+ dataset_name=dataset_id,
670
+ config=config_name,
671
+ split_name=split
672
+ )
673
+
674
+ # Add metadata
675
+ statistics['_cached_at'] = time.time()
676
+ statistics['_config_used'] = config_name
677
+ statistics['_dataset_id'] = dataset_id
678
+ statistics['_split'] = split
679
+
680
+ # Cache the results
681
+ if use_cache:
682
+ try:
683
+ self._save_to_cache(cache_file, statistics)
684
+ except CacheError as e:
685
+ logger.warning(f"Failed to cache statistics: {e}")
686
+
687
+ logger.info(
688
+ f"Successfully fetched statistics for {dataset_id}: "
689
+ f"{statistics.get('num_examples', 0)} examples, "
690
+ f"{len(statistics.get('statistics', []))} columns"
691
+ )
692
+
693
+ return statistics
694
+
695
+ except Exception as e:
696
+ # Log but don't fail - caller can fall back to sampling
697
+ log_error_with_context(e, context, level=logging.WARNING)
698
+ logger.info(
699
+ f"Could not fetch statistics for {dataset_id}, "
700
+ "caller should use sample-based analysis"
701
+ )
702
+ return None
703
+
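A minimal sketch of the calling contract this method establishes, with `service` assumed to be an already-constructed `DatasetService` (the `analyze_dataset_features` tool in this commit follows exactly this pattern):

```python
# None means "statistics unavailable" (e.g. non-parquet builder); callers
# fall back to sample-based analysis instead of treating it as an error.
stats = service.get_dataset_statistics("stanfordnlp/imdb", split="train")
if stats is None:
    sample = service.load_dataset_sample(
        dataset_id="stanfordnlp/imdb", split="train", streaming=True
    )
```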
704
  def get_cached_metadata(self, dataset_id: str, config_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
705
  """
706
  Retrieve cached metadata without making API calls.
 
731
  cache_file.unlink()
732
  for cache_file in self.sample_cache_dir.glob("*.json"):
733
  cache_file.unlink()
734
+ for cache_file in self.statistics_cache_dir.glob("*.json"):
735
+ cache_file.unlink()
736
  logger.info("Cleared all cache")
737
  else:
738
  # Clear cache for specific dataset
 
746
  for cache_file in self.sample_cache_dir.glob(f"{cache_key}*.json"):
747
  cache_file.unlink()
748
 
749
+ # Clear statistics cache
750
+ for cache_file in self.statistics_cache_dir.glob(f"{cache_key}*.json"):
751
+ cache_file.unlink()
752
+
753
  logger.info(f"Cleared cache for dataset: {dataset_id}")
754
 
755
  except Exception as e:
 
766
  try:
767
  metadata_files = list(self.metadata_cache_dir.glob("*.json"))
768
  sample_files = list(self.sample_cache_dir.glob("*.json"))
769
+ statistics_files = list(self.statistics_cache_dir.glob("*.json"))
770
 
771
  # Calculate cache sizes
772
  metadata_size = sum(f.stat().st_size for f in metadata_files)
773
  sample_size = sum(f.stat().st_size for f in sample_files)
774
+ statistics_size = sum(f.stat().st_size for f in statistics_files)
775
 
776
  return {
777
  'cache_dir': str(self.cache_dir),
778
  'metadata_files': len(metadata_files),
779
  'sample_files': len(sample_files),
780
+ 'statistics_files': len(statistics_files),
781
+ 'total_files': len(metadata_files) + len(sample_files) + len(statistics_files),
782
  'metadata_size_bytes': metadata_size,
783
  'sample_size_bytes': sample_size,
784
+ 'statistics_size_bytes': statistics_size,
785
+ 'total_size_bytes': metadata_size + sample_size + statistics_size,
786
  'cache_ttl_seconds': self.cache_ttl
787
  }
788
  except Exception as e:
src/hf_eda_mcp/services/dataset_viewer_adapter.py CHANGED
@@ -153,4 +153,128 @@ class DatasetViewerAdapter():
153
  except Exception as e:
154
  error_msg = f"Unexpected error fetching dataset information: {str(e)}"
155
  logger.error(error_msg)
 
 
156
  raise DatasetViewerError(error_msg) from e
 
153
  except Exception as e:
154
  error_msg = f"Unexpected error fetching dataset information: {str(e)}"
155
  logger.error(error_msg)
156
+ raise DatasetViewerError(error_msg) from e
157
+
158
+
159
+ def get_dataset_statistics(
160
+ self,
161
+ dataset_name: str,
162
+ config: str,
163
+ split_name: str
164
+ ) -> dict:
165
+ """
166
+ Get detailed statistics for a dataset split from the Dataset Viewer API.
167
+
168
+ This endpoint provides comprehensive statistics including:
169
+ - Numerical features: histograms, mean, median, min, max, std
170
+ - Categorical features: value frequencies, unique counts
171
+ - Text features: length distributions
172
+ - Image features: width/height distributions
173
+ - Audio features: duration distributions
174
+
175
+ Note: This endpoint only works for datasets with builder_name="parquet".
176
+ Use get_dataset_information() first to check if statistics are available.
177
+
178
+ Args:
179
+ dataset_name: HuggingFace dataset identifier
180
+ config: Configuration name (required)
181
+ split_name: Split name (required)
182
+
183
+ Returns:
184
+ Dictionary containing detailed statistics including:
185
+ - num_examples: Total number of examples in the split
186
+ - statistics: List of column statistics with type-specific metrics
187
+ - partial: Whether the response is partial
188
+
189
+ Raises:
190
+ DatasetViewerError: If the API request fails or statistics are unavailable
191
+ """
192
+ params = {
193
+ "dataset": dataset_name,
194
+ "config": config,
195
+ "split": split_name,
196
+ }
197
+
198
+ logger.info(f"Fetching dataset statistics from Viewer API: {dataset_name}/{config}/{split_name}")
199
+
200
+ try:
201
+ result = self._api_get(
202
+ route="statistics",
203
+ params=params
204
+ )
205
+
206
+ # Check for errors in response
207
+ if result.get('failed'):
208
+ logger.warning(f"Dataset Viewer API returned failures: {result['failed']}")
209
+
210
+ if result.get('partial'):
211
+ logger.warning("Dataset Viewer API returned partial data")
212
+
213
+ return result
214
+
215
+ except DatasetViewerError:
216
+ # Re-raise with context
217
+ raise
218
+ except Exception as e:
219
+ error_msg = f"Unexpected error fetching dataset statistics: {str(e)}"
220
+ logger.error(error_msg)
221
+ raise DatasetViewerError(error_msg) from e
222
+
223
+ def check_statistics_availability(
224
+ self,
225
+ dataset_name: str,
226
+ config: Optional[str] = None
227
+ ) -> dict:
228
+ """
229
+ Check if statistics are available for a dataset.
230
+
231
+ Statistics are only available for datasets with builder_name="parquet".
232
+ This method checks the dataset information to determine availability.
233
+
234
+ Args:
235
+ dataset_name: HuggingFace dataset identifier
236
+ config: Optional configuration name
237
+
238
+ Returns:
239
+ Dictionary with availability information:
240
+ - available: Boolean indicating if statistics are available
241
+ - configs: List of configs with statistics support
242
+ - reason: Explanation if statistics are not available
243
+
244
+ Raises:
245
+ DatasetViewerError: If the API request fails
246
+ """
247
+ try:
248
+ info = self.get_dataset_information(dataset_name, config)
249
+ dataset_info = info.get('dataset_info', {})
250
+
251
+ # Handle both response formats
252
+ if isinstance(dataset_info, dict) and 'config_name' in dataset_info:
253
+ # Single config format
254
+ builder_name = dataset_info.get('builder_name', '')
255
+ is_parquet = builder_name == 'parquet'
256
+
257
+ return {
258
+ 'available': is_parquet,
259
+ 'configs': [dataset_info.get('config_name')] if is_parquet else [],
260
+ 'reason': 'Statistics available' if is_parquet else f'Statistics only available for parquet datasets (found: {builder_name})'
261
+ }
262
+ else:
263
+ # Multiple configs format
264
+ parquet_configs = []
265
+ for cfg_name, cfg_data in dataset_info.items():
266
+ if cfg_data.get('builder_name') == 'parquet':
267
+ parquet_configs.append(cfg_name)
268
+
269
+ return {
270
+ 'available': len(parquet_configs) > 0,
271
+ 'configs': parquet_configs,
272
+ 'reason': f'Statistics available for {len(parquet_configs)} config(s)' if parquet_configs else 'No parquet configs found'
273
+ }
274
+
275
+ except DatasetViewerError:
276
+ raise
277
+ except Exception as e:
278
+ error_msg = f"Unexpected error checking statistics availability: {str(e)}"
279
+ logger.error(error_msg)
280
  raise DatasetViewerError(error_msg) from e
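To make the conversion in `analysis.py` below easier to follow, here is an illustrative, hand-written sketch of a `/statistics` response, limited to the fields the adapter and converter actually read (all values are made up):

```python
example_statistics_response = {
    "num_examples": 1000,
    "partial": False,
    "statistics": [
        {
            "column_name": "label",
            "column_type": "class_label",
            "column_statistics": {
                "nan_count": 0,
                "nan_proportion": 0.0,
                "n_unique": 2,
                "frequencies": {"neg": 500, "pos": 500},
            },
        },
        {
            "column_name": "text",
            "column_type": "string_text",
            "column_statistics": {
                # Character-length statistics for text columns
                "nan_count": 10,
                "nan_proportion": 0.01,
                "min": 12,
                "max": 480,
                "mean": 97.3,
                "median": 88,
                "std": 40.2,
                "histogram": {
                    "hist": [120, 380, 300, 150, 50],
                    "bin_edges": [12, 106, 200, 294, 388, 480],
                },
            },
        },
    ],
}
```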
src/hf_eda_mcp/tools/analysis.py CHANGED
@@ -2,7 +2,7 @@
2
  Basic analysis tools for exploratory data analysis of HuggingFace datasets.
3
 
4
  This module provides tools for performing exploratory data analysis including
5
- feature statistics, missing value analysis, and data quality insights.
6
  """
7
 
8
  import logging
@@ -75,7 +75,6 @@ def analyze_dataset_features(
75
  - unique_count: Number of unique values
76
  - statistics: Type-specific statistics (mean, std for numerical; top values for categorical)
77
  - summary: Overall analysis summary
78
- - data_quality: Data quality assessment
79
 
80
  Raises:
81
  ValueError: If inputs are invalid
@@ -88,10 +87,6 @@ def analyze_dataset_features(
88
  >>> for feature_name, feature_analysis in analysis['features'].items():
89
  ... print(f"{feature_name}: {feature_analysis['feature_type']}")
90
  ... print(f" Missing: {feature_analysis['missing_percentage']:.1f}%")
91
-
92
- >>> # Check data quality
93
- >>> quality = analysis['data_quality']
94
- >>> print(f"Overall quality score: {quality['quality_score']:.2f}")
95
  """
96
  # Handle empty strings from Gradio (convert to None)
97
  if config_name == "":
@@ -122,8 +117,25 @@ def analyze_dataset_features(
122
  )
123
 
124
  try:
125
- # Get dataset service and load sample for analysis
126
  service = get_dataset_service()
 
 
127
  sample_data = service.load_dataset_sample(
128
  dataset_id=dataset_id,
129
  split=split,
@@ -132,8 +144,6 @@ def analyze_dataset_features(
132
  streaming=True,
133
  )
134
 
135
- # Note: We could get dataset metadata here for additional context if needed
136
-
137
  # Perform feature analysis
138
  features_analysis = {}
139
  data_samples = sample_data["data"]
@@ -172,7 +182,6 @@ def analyze_dataset_features(
172
  "analysis_timestamp": sample_data.get("_sampled_at"),
173
  },
174
  "features": features_analysis,
175
- "data_quality": _assess_data_quality(features_analysis),
176
  "summary": _generate_analysis_summary(features_analysis, len(data_samples)),
177
  }
178
 
@@ -205,6 +214,215 @@ def analyze_dataset_features(
205
  raise DatasetServiceError(f"Failed to analyze dataset features: {str(e)}") from e
206
 
207
 
 
 
208
  def _analyze_single_feature(
209
  feature_name: str, data_samples: List[Dict[str, Any]]
210
  ) -> Dict[str, Any]:
@@ -396,113 +614,6 @@ def _compute_text_statistics(values: List[str]) -> Dict[str, Any]:
396
  return {"count": len(values), "error": str(e)}
397
 
398
 
399
- def _assess_data_quality(
400
- features_analysis: Dict[str, Dict[str, Any]],
401
- ) -> Dict[str, Any]:
402
- """
403
- Assess overall data quality based on feature analysis.
404
-
405
- Args:
406
- features_analysis: Dictionary of feature analyses
407
-
408
- Returns:
409
- Dictionary containing data quality assessment
410
- """
411
- if not features_analysis:
412
- return {"quality_score": 0.0, "issues": ["No features to analyze"]}
413
-
414
- total_features = len(features_analysis)
415
- issues = []
416
- quality_factors = []
417
-
418
- # Check missing value rates
419
- high_missing_features = 0
420
- total_missing_rate = 0
421
-
422
- for feature_name, analysis in features_analysis.items():
423
- missing_pct = analysis.get("missing_percentage", 0)
424
- total_missing_rate += missing_pct
425
-
426
- if missing_pct > 50:
427
- high_missing_features += 1
428
- issues.append(
429
- f"Feature '{feature_name}' has {missing_pct:.1f}% missing values"
430
- )
431
- elif missing_pct > 20:
432
- issues.append(
433
- f"Feature '{feature_name}' has {missing_pct:.1f}% missing values"
434
- )
435
-
436
- avg_missing_rate = total_missing_rate / total_features
437
-
438
- # Quality score calculation (0-1 scale)
439
- missing_score = max(0, 1 - (avg_missing_rate / 100))
440
- quality_factors.append(("missing_values", missing_score))
441
-
442
- # Check for features with very low diversity
443
- low_diversity_features = 0
444
- for feature_name, analysis in features_analysis.items():
445
- unique_count = analysis.get("unique_count", 0)
446
- total_count = analysis.get("total_count", 1)
447
- diversity_ratio = unique_count / total_count if total_count > 0 else 0
448
-
449
- if diversity_ratio < 0.01 and analysis.get("feature_type") != "boolean":
450
- low_diversity_features += 1
451
- issues.append(
452
- f"Feature '{feature_name}' has very low diversity ({unique_count} unique values)"
453
- )
454
-
455
- diversity_score = max(0, 1 - (low_diversity_features / total_features))
456
- quality_factors.append(("diversity", diversity_score))
457
-
458
- # Overall quality score (weighted average)
459
- weights = {"missing_values": 0.6, "diversity": 0.4}
460
- quality_score = sum(weights[factor] * score for factor, score in quality_factors)
461
-
462
- # Quality assessment
463
- if quality_score >= 0.8:
464
- quality_level = "high"
465
- elif quality_score >= 0.6:
466
- quality_level = "medium"
467
- else:
468
- quality_level = "low"
469
-
470
- return {
471
- "quality_score": quality_score,
472
- "quality_level": quality_level,
473
- "avg_missing_rate": avg_missing_rate,
474
- "high_missing_features": high_missing_features,
475
- "low_diversity_features": low_diversity_features,
476
- "issues": issues,
477
- "recommendations": _generate_quality_recommendations(issues, quality_score),
478
- }
479
-
480
-
481
- def _generate_quality_recommendations(
482
- issues: List[str], quality_score: float
483
- ) -> List[str]:
484
- """Generate recommendations based on data quality issues."""
485
- recommendations = []
486
-
487
- if quality_score < 0.6:
488
- recommendations.append(
489
- "Consider data cleaning and preprocessing before analysis"
490
- )
491
-
492
- if any("missing values" in issue for issue in issues):
493
- recommendations.append("Handle missing values through imputation or removal")
494
-
495
- if any("low diversity" in issue for issue in issues):
496
- recommendations.append(
497
- "Review features with low diversity - they may not be informative"
498
- )
499
-
500
- if not recommendations:
501
- recommendations.append("Data quality looks good for analysis")
502
-
503
- return recommendations
504
-
505
-
506
  def _generate_analysis_summary(
507
  features_analysis: Dict[str, Dict[str, Any]], sample_size: int
508
  ) -> str:
 
2
  Basic analysis tools for exploratory data analysis of HuggingFace datasets.
3
 
4
  This module provides tools for performing exploratory data analysis including
5
+ feature statistics and missing value analysis.
6
  """
7
 
8
  import logging
 
75
  - unique_count: Number of unique values
76
  - statistics: Type-specific statistics (mean, std for numerical; top values for categorical)
77
  - summary: Overall analysis summary
 
78
 
79
  Raises:
80
  ValueError: If inputs are invalid
 
87
  >>> for feature_name, feature_analysis in analysis['features'].items():
88
  ... print(f"{feature_name}: {feature_analysis['feature_type']}")
89
  ... print(f" Missing: {feature_analysis['missing_percentage']:.1f}%")
 
 
90
  """
91
  # Handle empty strings from Gradio (convert to None)
92
  if config_name == "":
 
117
  )
118
 
119
  try:
120
+ # Get dataset service
121
  service = get_dataset_service()
122
+
123
+ # Try to get statistics from Dataset Viewer API first (more efficient and complete)
124
+ viewer_stats = service.get_dataset_statistics(
125
+ dataset_id=dataset_id,
126
+ split=split,
127
+ config_name=config_name
128
+ )
129
+
130
+ if viewer_stats is not None:
131
+ # Use Dataset Viewer statistics (full dataset, no sampling needed)
132
+ logger.info(f"Using Dataset Viewer statistics for {dataset_id}")
133
+ return _convert_viewer_statistics_to_analysis(
134
+ viewer_stats, dataset_id, config_name, split
135
+ )
136
+
137
+ # Fall back to sample-based analysis
138
+ logger.info("Dataset Viewer statistics not available, using sample-based analysis")
139
  sample_data = service.load_dataset_sample(
140
  dataset_id=dataset_id,
141
  split=split,
 
144
  streaming=True,
145
  )
146
 
 
 
147
  # Perform feature analysis
148
  features_analysis = {}
149
  data_samples = sample_data["data"]
 
182
  "analysis_timestamp": sample_data.get("_sampled_at"),
183
  },
184
  "features": features_analysis,
 
185
  "summary": _generate_analysis_summary(features_analysis, len(data_samples)),
186
  }
187
 
 
214
  raise DatasetServiceError(f"Failed to analyze dataset features: {str(e)}") from e
215
 
216
 
217
+ def _convert_viewer_statistics_to_analysis(
218
+ viewer_stats: Dict[str, Any],
219
+ dataset_id: str,
220
+ config_name: Optional[str],
221
+ split: str
222
+ ) -> Dict[str, Any]:
223
+ """
224
+ Convert Dataset Viewer API statistics to our analysis format.
225
+
226
+ Supports all Dataset Viewer column types:
227
+ - Numerical: int, float
228
+ - Categorical: class_label, string_label, bool
229
+ - Text: string_text
230
+ - Media: image, audio
231
+ - Structured: list
232
+
233
+ Args:
234
+ viewer_stats: Statistics from Dataset Viewer API
235
+ dataset_id: Dataset identifier
236
+ config_name: Configuration name
237
+ split: Split name
238
+
239
+ Returns:
240
+ Dictionary in our standard analysis format
241
+ """
+    num_examples = viewer_stats.get('num_examples', 0)
+    statistics_list = viewer_stats.get('statistics', [])
+
+    features_analysis = {}
+
+    for col_stat in statistics_list:
+        column_name = col_stat.get('column_name', 'unknown')
+        column_type = col_stat.get('column_type', 'unknown')
+        column_statistics = col_stat.get('column_statistics', {})
+
+        # Convert to our format based on the column type
+        if column_type == 'string_text':
+            # Text features: character length statistics
+            features_analysis[column_name] = {
+                'feature_type': 'text',
+                'missing_count': column_statistics.get('nan_count', 0),
+                'missing_percentage': column_statistics.get('nan_proportion', 0.0) * 100,
+                'unique_count': 0,  # Not provided by the viewer for text
+                'total_count': num_examples,
+                'non_missing_count': num_examples - column_statistics.get('nan_count', 0),
+                'statistics': {
+                    'count': num_examples - column_statistics.get('nan_count', 0),
+                    'min_length': column_statistics.get('min', 0),
+                    'max_length': column_statistics.get('max', 0),
+                    'mean_length': column_statistics.get('mean', 0),
+                    'median_length': column_statistics.get('median', 0),
+                    'std_length': column_statistics.get('std', 0),
+                    'histogram': column_statistics.get('histogram', {}),
+                },
+                'sample_values': [],
+            }
+        elif column_type in ['class_label', 'string_label']:
+            # Categorical features: frequency distributions
+            frequencies = column_statistics.get('frequencies', {})
+            features_analysis[column_name] = {
+                'feature_type': 'categorical',
+                'missing_count': column_statistics.get('nan_count', 0),
+                'missing_percentage': column_statistics.get('nan_proportion', 0.0) * 100,
+                'unique_count': column_statistics.get('n_unique', len(frequencies)),
+                'total_count': num_examples,
+                'non_missing_count': num_examples - column_statistics.get('nan_count', 0),
+                'statistics': {
+                    'count': num_examples - column_statistics.get('nan_count', 0),
+                    'unique_count': column_statistics.get('n_unique', len(frequencies)),
+                    'frequencies': frequencies,
+                    'most_common': sorted(frequencies.items(), key=lambda x: x[1], reverse=True),
+                    'top_value': max(frequencies.items(), key=lambda x: x[1]) if frequencies else None,
+                    'no_label_count': column_statistics.get('no_label_count', 0),
+                    'no_label_proportion': column_statistics.get('no_label_proportion', 0.0),
+                },
+                'sample_values': list(frequencies.keys())[:5],
+            }
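+            # For example (hypothetical counts): frequencies = {"pos": 600, "neg": 400}
+            # yields most_common = [("pos", 600), ("neg", 400)] and top_value = ("pos", 600).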
+        elif column_type == 'bool':
+            # Boolean features: True/False frequencies
+            frequencies = column_statistics.get('frequencies', {})
+            features_analysis[column_name] = {
+                'feature_type': 'boolean',
+                'missing_count': column_statistics.get('nan_count', 0),
+                'missing_percentage': column_statistics.get('nan_proportion', 0.0) * 100,
+                'unique_count': len(frequencies),
+                'total_count': num_examples,
+                'non_missing_count': num_examples - column_statistics.get('nan_count', 0),
+                'statistics': {
+                    'count': num_examples - column_statistics.get('nan_count', 0),
+                    'frequencies': frequencies,
+                },
+                'sample_values': list(frequencies.keys()),
+            }
+        elif column_type in ['int', 'float']:
+            # Numerical features: statistical measures
+            features_analysis[column_name] = {
+                'feature_type': 'numerical',
+                'missing_count': column_statistics.get('nan_count', 0),
+                'missing_percentage': column_statistics.get('nan_proportion', 0.0) * 100,
+                'unique_count': 0,  # Not always provided
+                'total_count': num_examples,
+                'non_missing_count': num_examples - column_statistics.get('nan_count', 0),
+                'statistics': {
+                    'count': num_examples - column_statistics.get('nan_count', 0),
+                    'mean': column_statistics.get('mean', 0),
+                    'median': column_statistics.get('median', 0),
+                    'min': column_statistics.get('min', 0),
+                    'max': column_statistics.get('max', 0),
+                    'std': column_statistics.get('std', 0),
+                    'histogram': column_statistics.get('histogram', {}),
+                },
+                'sample_values': [],
+            }
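+            # The histogram, when present, is passed through as-is; viewer
+            # responses typically shape it like (hypothetical values):
+            # {"hist": [120, 340, 280, 260], "bin_edges": [0, 25, 50, 75, 100]}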
+        elif column_type == 'image':
+            # Image features: dimension statistics
+            features_analysis[column_name] = {
+                'feature_type': 'image',
+                'missing_count': column_statistics.get('nan_count', 0),
+                'missing_percentage': column_statistics.get('nan_proportion', 0.0) * 100,
+                'unique_count': 0,
+                'total_count': num_examples,
+                'non_missing_count': num_examples - column_statistics.get('nan_count', 0),
+                'statistics': {
+                    'count': num_examples - column_statistics.get('nan_count', 0),
+                    'min_dimension': column_statistics.get('min', 0),
+                    'max_dimension': column_statistics.get('max', 0),
+                    'mean_dimension': column_statistics.get('mean', 0),
+                    'median_dimension': column_statistics.get('median', 0),
+                    'std_dimension': column_statistics.get('std', 0),
+                    'histogram': column_statistics.get('histogram', {}),
+                },
+                'sample_values': [],
+            }
+        elif column_type == 'audio':
+            # Audio features: duration statistics (in seconds)
+            features_analysis[column_name] = {
+                'feature_type': 'audio',
+                'missing_count': column_statistics.get('nan_count', 0),
+                'missing_percentage': column_statistics.get('nan_proportion', 0.0) * 100,
+                'unique_count': 0,
+                'total_count': num_examples,
+                'non_missing_count': num_examples - column_statistics.get('nan_count', 0),
+                'statistics': {
+                    'count': num_examples - column_statistics.get('nan_count', 0),
+                    'min_duration': column_statistics.get('min', 0),
+                    'max_duration': column_statistics.get('max', 0),
+                    'mean_duration': column_statistics.get('mean', 0),
+                    'median_duration': column_statistics.get('median', 0),
+                    'std_duration': column_statistics.get('std', 0),
+                    'histogram': column_statistics.get('histogram', {}),
+                },
+                'sample_values': [],
+            }
+        elif column_type == 'list':
+            # List features: length statistics
+            features_analysis[column_name] = {
+                'feature_type': 'list',
+                'missing_count': column_statistics.get('nan_count', 0),
+                'missing_percentage': column_statistics.get('nan_proportion', 0.0) * 100,
+                'unique_count': 0,
+                'total_count': num_examples,
+                'non_missing_count': num_examples - column_statistics.get('nan_count', 0),
+                'statistics': {
+                    'count': num_examples - column_statistics.get('nan_count', 0),
+                    'min_length': column_statistics.get('min', 0),
+                    'max_length': column_statistics.get('max', 0),
+                    'mean_length': column_statistics.get('mean', 0),
+                    'median_length': column_statistics.get('median', 0),
+                    'std_length': column_statistics.get('std', 0),
+                    'histogram': column_statistics.get('histogram', {}),
+                },
+                'sample_values': [],
+            }
+        else:
+            # Unknown type - provide basic info with all available statistics
+            features_analysis[column_name] = {
+                'feature_type': column_type,
+                'missing_count': column_statistics.get('nan_count', 0),
+                'missing_percentage': column_statistics.get('nan_proportion', 0.0) * 100,
+                'unique_count': column_statistics.get('n_unique', 0),
+                'total_count': num_examples,
+                'non_missing_count': num_examples - column_statistics.get('nan_count', 0),
+                'statistics': column_statistics,
+                'sample_values': [],
+            }
+
+    # Generate the overall analysis
+    analysis_result = {
+        "dataset_info": {
+            "dataset_id": dataset_id,
+            "config_name": viewer_stats.get('_config_used', config_name),
+            "split": split,
+            "total_features": len(features_analysis),
+            "sample_size_used": num_examples,
+            "sample_size_requested": num_examples,
+        },
+        "sample_info": {
+            "sampling_method": "dataset_viewer_api",
+            "represents_full_dataset": True,
+            "analysis_timestamp": viewer_stats.get('_cached_at'),
+            "partial": viewer_stats.get('partial', False),
+        },
+        "features": features_analysis,
+        "summary": _generate_analysis_summary(features_analysis, num_examples),
+    }
+
+    return analysis_result
+
+
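+# Minimal usage sketch for the converter above (hypothetical payload values):
+#
+#     stats = {"num_examples": 3, "statistics": [
+#         {"column_name": "text", "column_type": "string_text",
+#          "column_statistics": {"nan_count": 0, "nan_proportion": 0.0,
+#                                "min": 5, "max": 42, "mean": 20.1}}]}
+#     analysis = _convert_viewer_statistics_to_analysis(stats, "imdb", None, "train")
+#     assert analysis["features"]["text"]["feature_type"] == "text"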
 def _analyze_single_feature(
     feature_name: str, data_samples: List[Dict[str, Any]]
 ) -> Dict[str, Any]:

         return {"count": len(values), "error": str(e)}


 def _generate_analysis_summary(
     features_analysis: Dict[str, Dict[str, Any]], sample_size: int
 ) -> str: