Spaces:
Running
Running
Commit
·
43642a4
1
Parent(s):
b3aa246
Improve analysis tool with dataset viewer
Browse files- README.md +12 -5
- pdm.lock +211 -6
- pyproject.toml +8 -5
- scripts/playground/analysis_tool_playground.py +57 -0
- src/hf_eda_mcp/services/dataset_service.py +141 -2
- src/hf_eda_mcp/services/dataset_viewer_adapter.py +124 -0
- src/hf_eda_mcp/tools/analysis.py +228 -117
README.md
CHANGED
|
@@ -17,9 +17,13 @@ An MCP (Model Context Protocol) server that provides tools for Exploratory Data
|
|
| 17 |
|
| 18 |
## Features
|
| 19 |
|
| 20 |
-
- **Dataset Metadata**: Retrieve comprehensive information about HuggingFace datasets
|
| 21 |
- **Dataset Sampling**: Get samples from any dataset split for quick exploration
|
| 22 |
-
- **Feature Analysis**: Perform basic EDA
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
## Usage
|
| 25 |
|
|
@@ -43,9 +47,12 @@ Replace `YOUR-USERNAME` with your HuggingFace username.
|
|
| 43 |
|
| 44 |
### Available Tools
|
| 45 |
|
| 46 |
-
1. **get_dataset_metadata**: Get detailed information about a dataset
|
| 47 |
-
2. **get_dataset_sample**: Retrieve sample rows from a dataset
|
| 48 |
-
3. **analyze_dataset_features**: Perform exploratory analysis
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
## Authentication
|
| 51 |
|
|
|
|
| 17 |
|
| 18 |
## Features
|
| 19 |
|
| 20 |
+
- **Dataset Metadata**: Retrieve comprehensive information about HuggingFace datasets including size, features, splits, and configurations
|
| 21 |
- **Dataset Sampling**: Get samples from any dataset split for quick exploration
|
| 22 |
+
- **Feature Analysis**: Perform basic EDA with automatic optimization
|
| 23 |
+
- Uses HuggingFace Dataset Viewer API for full dataset statistics (when available)
|
| 24 |
+
- Automatic fallback to sample-based analysis
|
| 25 |
+
- Supports multiple data types: numerical, categorical, text, image, audio
|
| 26 |
+
- Includes histograms, distributions, and missing value analysis
|
| 27 |
|
| 28 |
## Usage
|
| 29 |
|
|
|
|
| 47 |
|
| 48 |
### Available Tools
|
| 49 |
|
| 50 |
+
1. **get_dataset_metadata**: Get detailed information about a dataset including size, features, splits, and download statistics
|
| 51 |
+
2. **get_dataset_sample**: Retrieve sample rows from a dataset for quick exploration
|
| 52 |
+
3. **analyze_dataset_features**: Perform comprehensive exploratory analysis with automatic optimization
|
| 53 |
+
- Automatically uses Dataset Viewer API statistics for parquet datasets (full dataset analysis)
|
| 54 |
+
- Falls back to sample-based analysis for other formats
|
| 55 |
+
- Returns feature types, statistics, histograms, and missing value analysis
|
| 56 |
|
| 57 |
## Authentication
|
| 58 |
|
pdm.lock
CHANGED
|
@@ -2,10 +2,10 @@
|
|
| 2 |
# It is not intended for manual editing.
|
| 3 |
|
| 4 |
[metadata]
|
| 5 |
-
groups = ["default", "hf-cli", "plots"]
|
| 6 |
strategy = ["inherit_metadata"]
|
| 7 |
lock_version = "4.5.0"
|
| 8 |
-
content_hash = "sha256:
|
| 9 |
|
| 10 |
[[metadata.targets]]
|
| 11 |
requires_python = ">=3.13"
|
|
@@ -324,8 +324,8 @@ name = "colorama"
|
|
| 324 |
version = "0.4.6"
|
| 325 |
requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
| 326 |
summary = "Cross-platform colored terminal text."
|
| 327 |
-
groups = ["default", "hf-cli"]
|
| 328 |
-
marker = "platform_system == \"Windows\""
|
| 329 |
files = [
|
| 330 |
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
|
| 331 |
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
|
@@ -388,6 +388,137 @@ files = [
|
|
| 388 |
{file = "contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880"},
|
| 389 |
]
|
| 390 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
[[package]]
|
| 392 |
name = "cycler"
|
| 393 |
version = "0.12.1"
|
|
@@ -807,6 +938,17 @@ files = [
|
|
| 807 |
{file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"},
|
| 808 |
]
|
| 809 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 810 |
[[package]]
|
| 811 |
name = "jinja2"
|
| 812 |
version = "3.1.6"
|
|
@@ -1208,7 +1350,7 @@ name = "packaging"
|
|
| 1208 |
version = "25.0"
|
| 1209 |
requires_python = ">=3.8"
|
| 1210 |
summary = "Core utilities for Python packages"
|
| 1211 |
-
groups = ["default", "hf-cli", "plots"]
|
| 1212 |
files = [
|
| 1213 |
{file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"},
|
| 1214 |
{file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"},
|
|
@@ -1315,6 +1457,17 @@ files = [
|
|
| 1315 |
{file = "pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523"},
|
| 1316 |
]
|
| 1317 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1318 |
[[package]]
|
| 1319 |
name = "propcache"
|
| 1320 |
version = "0.4.1"
|
|
@@ -1486,7 +1639,7 @@ name = "pygments"
|
|
| 1486 |
version = "2.19.2"
|
| 1487 |
requires_python = ">=3.8"
|
| 1488 |
summary = "Pygments is a syntax highlighting package written in Python."
|
| 1489 |
-
groups = ["default"]
|
| 1490 |
files = [
|
| 1491 |
{file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"},
|
| 1492 |
{file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"},
|
|
@@ -1503,6 +1656,58 @@ files = [
|
|
| 1503 |
{file = "pyparsing-3.2.5.tar.gz", hash = "sha256:2df8d5b7b2802ef88e8d016a2eb9c7aeaa923529cd251ed0fe4608275d4105b6"},
|
| 1504 |
]
|
| 1505 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1506 |
[[package]]
|
| 1507 |
name = "python-dateutil"
|
| 1508 |
version = "2.9.0.post0"
|
|
|
|
| 2 |
# It is not intended for manual editing.
|
| 3 |
|
| 4 |
[metadata]
|
| 5 |
+
groups = ["default", "hf-cli", "plots", "test"]
|
| 6 |
strategy = ["inherit_metadata"]
|
| 7 |
lock_version = "4.5.0"
|
| 8 |
+
content_hash = "sha256:7db937b9435dfaf07c2e27ae0b16da07ce0764665446873e8f40e81af6d5b5b4"
|
| 9 |
|
| 10 |
[[metadata.targets]]
|
| 11 |
requires_python = ">=3.13"
|
|
|
|
| 324 |
version = "0.4.6"
|
| 325 |
requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
| 326 |
summary = "Cross-platform colored terminal text."
|
| 327 |
+
groups = ["default", "hf-cli", "test"]
|
| 328 |
+
marker = "sys_platform == \"win32\" or platform_system == \"Windows\""
|
| 329 |
files = [
|
| 330 |
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
|
| 331 |
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
|
|
|
| 388 |
{file = "contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880"},
|
| 389 |
]
|
| 390 |
|
| 391 |
+
[[package]]
|
| 392 |
+
name = "coverage"
|
| 393 |
+
version = "7.12.0"
|
| 394 |
+
requires_python = ">=3.10"
|
| 395 |
+
summary = "Code coverage measurement for Python"
|
| 396 |
+
groups = ["test"]
|
| 397 |
+
files = [
|
| 398 |
+
{file = "coverage-7.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:47324fffca8d8eae7e185b5bb20c14645f23350f870c1649003618ea91a78941"},
|
| 399 |
+
{file = "coverage-7.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ccf3b2ede91decd2fb53ec73c1f949c3e034129d1e0b07798ff1d02ea0c8fa4a"},
|
| 400 |
+
{file = "coverage-7.12.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:b365adc70a6936c6b0582dc38746b33b2454148c02349345412c6e743efb646d"},
|
| 401 |
+
{file = "coverage-7.12.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bc13baf85cd8a4cfcf4a35c7bc9d795837ad809775f782f697bf630b7e200211"},
|
| 402 |
+
{file = "coverage-7.12.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:099d11698385d572ceafb3288a5b80fe1fc58bf665b3f9d362389de488361d3d"},
|
| 403 |
+
{file = "coverage-7.12.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:473dc45d69694069adb7680c405fb1e81f60b2aff42c81e2f2c3feaf544d878c"},
|
| 404 |
+
{file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:583f9adbefd278e9de33c33d6846aa8f5d164fa49b47144180a0e037f0688bb9"},
|
| 405 |
+
{file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b2089cc445f2dc0af6f801f0d1355c025b76c24481935303cf1af28f636688f0"},
|
| 406 |
+
{file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:950411f1eb5d579999c5f66c62a40961f126fc71e5e14419f004471957b51508"},
|
| 407 |
+
{file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b1aab7302a87bafebfe76b12af681b56ff446dc6f32ed178ff9c092ca776e6bc"},
|
| 408 |
+
{file = "coverage-7.12.0-cp313-cp313-win32.whl", hash = "sha256:d7e0d0303c13b54db495eb636bc2465b2fb8475d4c8bcec8fe4b5ca454dfbae8"},
|
| 409 |
+
{file = "coverage-7.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:ce61969812d6a98a981d147d9ac583a36ac7db7766f2e64a9d4d059c2fe29d07"},
|
| 410 |
+
{file = "coverage-7.12.0-cp313-cp313-win_arm64.whl", hash = "sha256:bcec6f47e4cb8a4c2dc91ce507f6eefc6a1b10f58df32cdc61dff65455031dfc"},
|
| 411 |
+
{file = "coverage-7.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:459443346509476170d553035e4a3eed7b860f4fe5242f02de1010501956ce87"},
|
| 412 |
+
{file = "coverage-7.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:04a79245ab2b7a61688958f7a855275997134bc84f4a03bc240cf64ff132abf6"},
|
| 413 |
+
{file = "coverage-7.12.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:09a86acaaa8455f13d6a99221d9654df249b33937b4e212b4e5a822065f12aa7"},
|
| 414 |
+
{file = "coverage-7.12.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:907e0df1b71ba77463687a74149c6122c3f6aac56c2510a5d906b2f368208560"},
|
| 415 |
+
{file = "coverage-7.12.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b57e2d0ddd5f0582bae5437c04ee71c46cd908e7bc5d4d0391f9a41e812dd12"},
|
| 416 |
+
{file = "coverage-7.12.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:58c1c6aa677f3a1411fe6fb28ec3a942e4f665df036a3608816e0847fad23296"},
|
| 417 |
+
{file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4c589361263ab2953e3c4cd2a94db94c4ad4a8e572776ecfbad2389c626e4507"},
|
| 418 |
+
{file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:91b810a163ccad2e43b1faa11d70d3cf4b6f3d83f9fd5f2df82a32d47b648e0d"},
|
| 419 |
+
{file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:40c867af715f22592e0d0fb533a33a71ec9e0f73a6945f722a0c85c8c1cbe3a2"},
|
| 420 |
+
{file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:68b0d0a2d84f333de875666259dadf28cc67858bc8fd8b3f1eae84d3c2bec455"},
|
| 421 |
+
{file = "coverage-7.12.0-cp313-cp313t-win32.whl", hash = "sha256:73f9e7fbd51a221818fd11b7090eaa835a353ddd59c236c57b2199486b116c6d"},
|
| 422 |
+
{file = "coverage-7.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:24cff9d1f5743f67db7ba46ff284018a6e9aeb649b67aa1e70c396aa1b7cb23c"},
|
| 423 |
+
{file = "coverage-7.12.0-cp313-cp313t-win_arm64.whl", hash = "sha256:c87395744f5c77c866d0f5a43d97cc39e17c7f1cb0115e54a2fe67ca75c5d14d"},
|
| 424 |
+
{file = "coverage-7.12.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:a1c59b7dc169809a88b21a936eccf71c3895a78f5592051b1af8f4d59c2b4f92"},
|
| 425 |
+
{file = "coverage-7.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8787b0f982e020adb732b9f051f3e49dd5054cebbc3f3432061278512a2b1360"},
|
| 426 |
+
{file = "coverage-7.12.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5ea5a9f7dc8877455b13dd1effd3202e0bca72f6f3ab09f9036b1bcf728f69ac"},
|
| 427 |
+
{file = "coverage-7.12.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fdba9f15849534594f60b47c9a30bc70409b54947319a7c4fd0e8e3d8d2f355d"},
|
| 428 |
+
{file = "coverage-7.12.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a00594770eb715854fb1c57e0dea08cce6720cfbc531accdb9850d7c7770396c"},
|
| 429 |
+
{file = "coverage-7.12.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5560c7e0d82b42eb1951e4f68f071f8017c824ebfd5a6ebe42c60ac16c6c2434"},
|
| 430 |
+
{file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d6c2e26b481c9159c2773a37947a9718cfdc58893029cdfb177531793e375cfc"},
|
| 431 |
+
{file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:6e1a8c066dabcde56d5d9fed6a66bc19a2883a3fe051f0c397a41fc42aedd4cc"},
|
| 432 |
+
{file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:f7ba9da4726e446d8dd8aae5a6cd872511184a5d861de80a86ef970b5dacce3e"},
|
| 433 |
+
{file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e0f483ab4f749039894abaf80c2f9e7ed77bbf3c737517fb88c8e8e305896a17"},
|
| 434 |
+
{file = "coverage-7.12.0-cp314-cp314-win32.whl", hash = "sha256:76336c19a9ef4a94b2f8dc79f8ac2da3f193f625bb5d6f51a328cd19bfc19933"},
|
| 435 |
+
{file = "coverage-7.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:7c1059b600aec6ef090721f8f633f60ed70afaffe8ecab85b59df748f24b31fe"},
|
| 436 |
+
{file = "coverage-7.12.0-cp314-cp314-win_arm64.whl", hash = "sha256:172cf3a34bfef42611963e2b661302a8931f44df31629e5b1050567d6b90287d"},
|
| 437 |
+
{file = "coverage-7.12.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:aa7d48520a32cb21c7a9b31f81799e8eaec7239db36c3b670be0fa2403828d1d"},
|
| 438 |
+
{file = "coverage-7.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:90d58ac63bc85e0fb919f14d09d6caa63f35a5512a2205284b7816cafd21bb03"},
|
| 439 |
+
{file = "coverage-7.12.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ca8ecfa283764fdda3eae1bdb6afe58bf78c2c3ec2b2edcb05a671f0bba7b3f9"},
|
| 440 |
+
{file = "coverage-7.12.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:874fe69a0785d96bd066059cd4368022cebbec1a8958f224f0016979183916e6"},
|
| 441 |
+
{file = "coverage-7.12.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5b3c889c0b8b283a24d721a9eabc8ccafcfc3aebf167e4cd0d0e23bf8ec4e339"},
|
| 442 |
+
{file = "coverage-7.12.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8bb5b894b3ec09dcd6d3743229dc7f2c42ef7787dc40596ae04c0edda487371e"},
|
| 443 |
+
{file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:79a44421cd5fba96aa57b5e3b5a4d3274c449d4c622e8f76882d76635501fd13"},
|
| 444 |
+
{file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:33baadc0efd5c7294f436a632566ccc1f72c867f82833eb59820ee37dc811c6f"},
|
| 445 |
+
{file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:c406a71f544800ef7e9e0000af706b88465f3573ae8b8de37e5f96c59f689ad1"},
|
| 446 |
+
{file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e71bba6a40883b00c6d571599b4627f50c360b3d0d02bfc658168936be74027b"},
|
| 447 |
+
{file = "coverage-7.12.0-cp314-cp314t-win32.whl", hash = "sha256:9157a5e233c40ce6613dead4c131a006adfda70e557b6856b97aceed01b0e27a"},
|
| 448 |
+
{file = "coverage-7.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:e84da3a0fd233aeec797b981c51af1cabac74f9bd67be42458365b30d11b5291"},
|
| 449 |
+
{file = "coverage-7.12.0-cp314-cp314t-win_arm64.whl", hash = "sha256:01d24af36fedda51c2b1aca56e4330a3710f83b02a5ff3743a6b015ffa7c9384"},
|
| 450 |
+
{file = "coverage-7.12.0-py3-none-any.whl", hash = "sha256:159d50c0b12e060b15ed3d39f87ed43d4f7f7ad40b8a534f4dd331adbb51104a"},
|
| 451 |
+
{file = "coverage-7.12.0.tar.gz", hash = "sha256:fc11e0a4e372cb5f282f16ef90d4a585034050ccda536451901abfb19a57f40c"},
|
| 452 |
+
]
|
| 453 |
+
|
| 454 |
+
[[package]]
|
| 455 |
+
name = "coverage"
|
| 456 |
+
version = "7.12.0"
|
| 457 |
+
extras = ["toml"]
|
| 458 |
+
requires_python = ">=3.10"
|
| 459 |
+
summary = "Code coverage measurement for Python"
|
| 460 |
+
groups = ["test"]
|
| 461 |
+
dependencies = [
|
| 462 |
+
"coverage==7.12.0",
|
| 463 |
+
"tomli; python_full_version <= \"3.11.0a6\"",
|
| 464 |
+
]
|
| 465 |
+
files = [
|
| 466 |
+
{file = "coverage-7.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:47324fffca8d8eae7e185b5bb20c14645f23350f870c1649003618ea91a78941"},
|
| 467 |
+
{file = "coverage-7.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ccf3b2ede91decd2fb53ec73c1f949c3e034129d1e0b07798ff1d02ea0c8fa4a"},
|
| 468 |
+
{file = "coverage-7.12.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:b365adc70a6936c6b0582dc38746b33b2454148c02349345412c6e743efb646d"},
|
| 469 |
+
{file = "coverage-7.12.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bc13baf85cd8a4cfcf4a35c7bc9d795837ad809775f782f697bf630b7e200211"},
|
| 470 |
+
{file = "coverage-7.12.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:099d11698385d572ceafb3288a5b80fe1fc58bf665b3f9d362389de488361d3d"},
|
| 471 |
+
{file = "coverage-7.12.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:473dc45d69694069adb7680c405fb1e81f60b2aff42c81e2f2c3feaf544d878c"},
|
| 472 |
+
{file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:583f9adbefd278e9de33c33d6846aa8f5d164fa49b47144180a0e037f0688bb9"},
|
| 473 |
+
{file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b2089cc445f2dc0af6f801f0d1355c025b76c24481935303cf1af28f636688f0"},
|
| 474 |
+
{file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:950411f1eb5d579999c5f66c62a40961f126fc71e5e14419f004471957b51508"},
|
| 475 |
+
{file = "coverage-7.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b1aab7302a87bafebfe76b12af681b56ff446dc6f32ed178ff9c092ca776e6bc"},
|
| 476 |
+
{file = "coverage-7.12.0-cp313-cp313-win32.whl", hash = "sha256:d7e0d0303c13b54db495eb636bc2465b2fb8475d4c8bcec8fe4b5ca454dfbae8"},
|
| 477 |
+
{file = "coverage-7.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:ce61969812d6a98a981d147d9ac583a36ac7db7766f2e64a9d4d059c2fe29d07"},
|
| 478 |
+
{file = "coverage-7.12.0-cp313-cp313-win_arm64.whl", hash = "sha256:bcec6f47e4cb8a4c2dc91ce507f6eefc6a1b10f58df32cdc61dff65455031dfc"},
|
| 479 |
+
{file = "coverage-7.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:459443346509476170d553035e4a3eed7b860f4fe5242f02de1010501956ce87"},
|
| 480 |
+
{file = "coverage-7.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:04a79245ab2b7a61688958f7a855275997134bc84f4a03bc240cf64ff132abf6"},
|
| 481 |
+
{file = "coverage-7.12.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:09a86acaaa8455f13d6a99221d9654df249b33937b4e212b4e5a822065f12aa7"},
|
| 482 |
+
{file = "coverage-7.12.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:907e0df1b71ba77463687a74149c6122c3f6aac56c2510a5d906b2f368208560"},
|
| 483 |
+
{file = "coverage-7.12.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b57e2d0ddd5f0582bae5437c04ee71c46cd908e7bc5d4d0391f9a41e812dd12"},
|
| 484 |
+
{file = "coverage-7.12.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:58c1c6aa677f3a1411fe6fb28ec3a942e4f665df036a3608816e0847fad23296"},
|
| 485 |
+
{file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4c589361263ab2953e3c4cd2a94db94c4ad4a8e572776ecfbad2389c626e4507"},
|
| 486 |
+
{file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:91b810a163ccad2e43b1faa11d70d3cf4b6f3d83f9fd5f2df82a32d47b648e0d"},
|
| 487 |
+
{file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:40c867af715f22592e0d0fb533a33a71ec9e0f73a6945f722a0c85c8c1cbe3a2"},
|
| 488 |
+
{file = "coverage-7.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:68b0d0a2d84f333de875666259dadf28cc67858bc8fd8b3f1eae84d3c2bec455"},
|
| 489 |
+
{file = "coverage-7.12.0-cp313-cp313t-win32.whl", hash = "sha256:73f9e7fbd51a221818fd11b7090eaa835a353ddd59c236c57b2199486b116c6d"},
|
| 490 |
+
{file = "coverage-7.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:24cff9d1f5743f67db7ba46ff284018a6e9aeb649b67aa1e70c396aa1b7cb23c"},
|
| 491 |
+
{file = "coverage-7.12.0-cp313-cp313t-win_arm64.whl", hash = "sha256:c87395744f5c77c866d0f5a43d97cc39e17c7f1cb0115e54a2fe67ca75c5d14d"},
|
| 492 |
+
{file = "coverage-7.12.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:a1c59b7dc169809a88b21a936eccf71c3895a78f5592051b1af8f4d59c2b4f92"},
|
| 493 |
+
{file = "coverage-7.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8787b0f982e020adb732b9f051f3e49dd5054cebbc3f3432061278512a2b1360"},
|
| 494 |
+
{file = "coverage-7.12.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5ea5a9f7dc8877455b13dd1effd3202e0bca72f6f3ab09f9036b1bcf728f69ac"},
|
| 495 |
+
{file = "coverage-7.12.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fdba9f15849534594f60b47c9a30bc70409b54947319a7c4fd0e8e3d8d2f355d"},
|
| 496 |
+
{file = "coverage-7.12.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a00594770eb715854fb1c57e0dea08cce6720cfbc531accdb9850d7c7770396c"},
|
| 497 |
+
{file = "coverage-7.12.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5560c7e0d82b42eb1951e4f68f071f8017c824ebfd5a6ebe42c60ac16c6c2434"},
|
| 498 |
+
{file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d6c2e26b481c9159c2773a37947a9718cfdc58893029cdfb177531793e375cfc"},
|
| 499 |
+
{file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:6e1a8c066dabcde56d5d9fed6a66bc19a2883a3fe051f0c397a41fc42aedd4cc"},
|
| 500 |
+
{file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:f7ba9da4726e446d8dd8aae5a6cd872511184a5d861de80a86ef970b5dacce3e"},
|
| 501 |
+
{file = "coverage-7.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e0f483ab4f749039894abaf80c2f9e7ed77bbf3c737517fb88c8e8e305896a17"},
|
| 502 |
+
{file = "coverage-7.12.0-cp314-cp314-win32.whl", hash = "sha256:76336c19a9ef4a94b2f8dc79f8ac2da3f193f625bb5d6f51a328cd19bfc19933"},
|
| 503 |
+
{file = "coverage-7.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:7c1059b600aec6ef090721f8f633f60ed70afaffe8ecab85b59df748f24b31fe"},
|
| 504 |
+
{file = "coverage-7.12.0-cp314-cp314-win_arm64.whl", hash = "sha256:172cf3a34bfef42611963e2b661302a8931f44df31629e5b1050567d6b90287d"},
|
| 505 |
+
{file = "coverage-7.12.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:aa7d48520a32cb21c7a9b31f81799e8eaec7239db36c3b670be0fa2403828d1d"},
|
| 506 |
+
{file = "coverage-7.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:90d58ac63bc85e0fb919f14d09d6caa63f35a5512a2205284b7816cafd21bb03"},
|
| 507 |
+
{file = "coverage-7.12.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ca8ecfa283764fdda3eae1bdb6afe58bf78c2c3ec2b2edcb05a671f0bba7b3f9"},
|
| 508 |
+
{file = "coverage-7.12.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:874fe69a0785d96bd066059cd4368022cebbec1a8958f224f0016979183916e6"},
|
| 509 |
+
{file = "coverage-7.12.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5b3c889c0b8b283a24d721a9eabc8ccafcfc3aebf167e4cd0d0e23bf8ec4e339"},
|
| 510 |
+
{file = "coverage-7.12.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8bb5b894b3ec09dcd6d3743229dc7f2c42ef7787dc40596ae04c0edda487371e"},
|
| 511 |
+
{file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:79a44421cd5fba96aa57b5e3b5a4d3274c449d4c622e8f76882d76635501fd13"},
|
| 512 |
+
{file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:33baadc0efd5c7294f436a632566ccc1f72c867f82833eb59820ee37dc811c6f"},
|
| 513 |
+
{file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:c406a71f544800ef7e9e0000af706b88465f3573ae8b8de37e5f96c59f689ad1"},
|
| 514 |
+
{file = "coverage-7.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e71bba6a40883b00c6d571599b4627f50c360b3d0d02bfc658168936be74027b"},
|
| 515 |
+
{file = "coverage-7.12.0-cp314-cp314t-win32.whl", hash = "sha256:9157a5e233c40ce6613dead4c131a006adfda70e557b6856b97aceed01b0e27a"},
|
| 516 |
+
{file = "coverage-7.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:e84da3a0fd233aeec797b981c51af1cabac74f9bd67be42458365b30d11b5291"},
|
| 517 |
+
{file = "coverage-7.12.0-cp314-cp314t-win_arm64.whl", hash = "sha256:01d24af36fedda51c2b1aca56e4330a3710f83b02a5ff3743a6b015ffa7c9384"},
|
| 518 |
+
{file = "coverage-7.12.0-py3-none-any.whl", hash = "sha256:159d50c0b12e060b15ed3d39f87ed43d4f7f7ad40b8a534f4dd331adbb51104a"},
|
| 519 |
+
{file = "coverage-7.12.0.tar.gz", hash = "sha256:fc11e0a4e372cb5f282f16ef90d4a585034050ccda536451901abfb19a57f40c"},
|
| 520 |
+
]
|
| 521 |
+
|
| 522 |
[[package]]
|
| 523 |
name = "cycler"
|
| 524 |
version = "0.12.1"
|
|
|
|
| 938 |
{file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"},
|
| 939 |
]
|
| 940 |
|
| 941 |
+
[[package]]
|
| 942 |
+
name = "iniconfig"
|
| 943 |
+
version = "2.3.0"
|
| 944 |
+
requires_python = ">=3.10"
|
| 945 |
+
summary = "brain-dead simple config-ini parsing"
|
| 946 |
+
groups = ["test"]
|
| 947 |
+
files = [
|
| 948 |
+
{file = "iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12"},
|
| 949 |
+
{file = "iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730"},
|
| 950 |
+
]
|
| 951 |
+
|
| 952 |
[[package]]
|
| 953 |
name = "jinja2"
|
| 954 |
version = "3.1.6"
|
|
|
|
| 1350 |
version = "25.0"
|
| 1351 |
requires_python = ">=3.8"
|
| 1352 |
summary = "Core utilities for Python packages"
|
| 1353 |
+
groups = ["default", "hf-cli", "plots", "test"]
|
| 1354 |
files = [
|
| 1355 |
{file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"},
|
| 1356 |
{file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"},
|
|
|
|
| 1457 |
{file = "pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523"},
|
| 1458 |
]
|
| 1459 |
|
| 1460 |
+
[[package]]
|
| 1461 |
+
name = "pluggy"
|
| 1462 |
+
version = "1.6.0"
|
| 1463 |
+
requires_python = ">=3.9"
|
| 1464 |
+
summary = "plugin and hook calling mechanisms for python"
|
| 1465 |
+
groups = ["test"]
|
| 1466 |
+
files = [
|
| 1467 |
+
{file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"},
|
| 1468 |
+
{file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"},
|
| 1469 |
+
]
|
| 1470 |
+
|
| 1471 |
[[package]]
|
| 1472 |
name = "propcache"
|
| 1473 |
version = "0.4.1"
|
|
|
|
| 1639 |
version = "2.19.2"
|
| 1640 |
requires_python = ">=3.8"
|
| 1641 |
summary = "Pygments is a syntax highlighting package written in Python."
|
| 1642 |
+
groups = ["default", "test"]
|
| 1643 |
files = [
|
| 1644 |
{file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"},
|
| 1645 |
{file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"},
|
|
|
|
| 1656 |
{file = "pyparsing-3.2.5.tar.gz", hash = "sha256:2df8d5b7b2802ef88e8d016a2eb9c7aeaa923529cd251ed0fe4608275d4105b6"},
|
| 1657 |
]
|
| 1658 |
|
| 1659 |
+
[[package]]
|
| 1660 |
+
name = "pytest"
|
| 1661 |
+
version = "9.0.1"
|
| 1662 |
+
requires_python = ">=3.10"
|
| 1663 |
+
summary = "pytest: simple powerful testing with Python"
|
| 1664 |
+
groups = ["test"]
|
| 1665 |
+
dependencies = [
|
| 1666 |
+
"colorama>=0.4; sys_platform == \"win32\"",
|
| 1667 |
+
"exceptiongroup>=1; python_version < \"3.11\"",
|
| 1668 |
+
"iniconfig>=1.0.1",
|
| 1669 |
+
"packaging>=22",
|
| 1670 |
+
"pluggy<2,>=1.5",
|
| 1671 |
+
"pygments>=2.7.2",
|
| 1672 |
+
"tomli>=1; python_version < \"3.11\"",
|
| 1673 |
+
]
|
| 1674 |
+
files = [
|
| 1675 |
+
{file = "pytest-9.0.1-py3-none-any.whl", hash = "sha256:67be0030d194df2dfa7b556f2e56fb3c3315bd5c8822c6951162b92b32ce7dad"},
|
| 1676 |
+
{file = "pytest-9.0.1.tar.gz", hash = "sha256:3e9c069ea73583e255c3b21cf46b8d3c56f6e3a1a8f6da94ccb0fcf57b9d73c8"},
|
| 1677 |
+
]
|
| 1678 |
+
|
| 1679 |
+
[[package]]
|
| 1680 |
+
name = "pytest-asyncio"
|
| 1681 |
+
version = "1.3.0"
|
| 1682 |
+
requires_python = ">=3.10"
|
| 1683 |
+
summary = "Pytest support for asyncio"
|
| 1684 |
+
groups = ["test"]
|
| 1685 |
+
dependencies = [
|
| 1686 |
+
"backports-asyncio-runner<2,>=1.1; python_version < \"3.11\"",
|
| 1687 |
+
"pytest<10,>=8.2",
|
| 1688 |
+
"typing-extensions>=4.12; python_version < \"3.13\"",
|
| 1689 |
+
]
|
| 1690 |
+
files = [
|
| 1691 |
+
{file = "pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5"},
|
| 1692 |
+
{file = "pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5"},
|
| 1693 |
+
]
|
| 1694 |
+
|
| 1695 |
+
[[package]]
|
| 1696 |
+
name = "pytest-cov"
|
| 1697 |
+
version = "7.0.0"
|
| 1698 |
+
requires_python = ">=3.9"
|
| 1699 |
+
summary = "Pytest plugin for measuring coverage."
|
| 1700 |
+
groups = ["test"]
|
| 1701 |
+
dependencies = [
|
| 1702 |
+
"coverage[toml]>=7.10.6",
|
| 1703 |
+
"pluggy>=1.2",
|
| 1704 |
+
"pytest>=7",
|
| 1705 |
+
]
|
| 1706 |
+
files = [
|
| 1707 |
+
{file = "pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861"},
|
| 1708 |
+
{file = "pytest_cov-7.0.0.tar.gz", hash = "sha256:33c97eda2e049a0c5298e91f519302a1334c26ac65c1a483d6206fd458361af1"},
|
| 1709 |
+
]
|
| 1710 |
+
|
| 1711 |
[[package]]
|
| 1712 |
name = "python-dateutil"
|
| 1713 |
version = "2.9.0.post0"
|
pyproject.toml
CHANGED
|
@@ -41,18 +41,21 @@ where = ["src"]
|
|
| 41 |
hf-eda-mcp = {cmd="python -m hf_eda_mcp", env_file= ".env"}
|
| 42 |
hf_client_playground = "python -m scripts.playground.hf_client_playground"
|
| 43 |
metadata_playground = "python -m scripts.playground.metadata_tool_playground"
|
|
|
|
| 44 |
|
| 45 |
[tool.pdm]
|
| 46 |
distribution = true
|
| 47 |
|
| 48 |
[tool.pdm.dev-dependencies]
|
| 49 |
-
test = [
|
| 50 |
-
"pytest>=7.0.0",
|
| 51 |
-
"pytest-asyncio>=0.21.0",
|
| 52 |
-
"pytest-cov>=4.0.0"
|
| 53 |
-
]
|
| 54 |
lint = [
|
| 55 |
"ruff>=0.1.0",
|
| 56 |
"black>=23.0.0",
|
| 57 |
"mypy>=1.0.0"
|
| 58 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
hf-eda-mcp = {cmd="python -m hf_eda_mcp", env_file= ".env"}
|
| 42 |
hf_client_playground = "python -m scripts.playground.hf_client_playground"
|
| 43 |
metadata_playground = "python -m scripts.playground.metadata_tool_playground"
|
| 44 |
+
analysis_playground = "python -m scripts.playground.analysis_tool_playground"
|
| 45 |
|
| 46 |
[tool.pdm]
|
| 47 |
distribution = true
|
| 48 |
|
| 49 |
[tool.pdm.dev-dependencies]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
lint = [
|
| 51 |
"ruff>=0.1.0",
|
| 52 |
"black>=23.0.0",
|
| 53 |
"mypy>=1.0.0"
|
| 54 |
]
|
| 55 |
+
|
| 56 |
+
[dependency-groups]
|
| 57 |
+
test = [
|
| 58 |
+
"pytest>=9.0.1",
|
| 59 |
+
"pytest-asyncio>=0.21.0",
|
| 60 |
+
"pytest-cov>=4.0.0",
|
| 61 |
+
]
|
scripts/playground/analysis_tool_playground.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import logging
from pprint import pprint
from dotenv import load_dotenv
from hf_eda_mcp.services.dataset_viewer_adapter import DatasetViewerAdapter
from hf_eda_mcp.tools.analysis import analyze_dataset_features

# Load HF_TOKEN (and any other variables) from the local .env file into
# the process environment before the playground functions read it.
load_dotenv()

# Setup logging: DEBUG-level output goes to scripts.log; filemode="w"
# truncates the file so each playground run starts with a clean log.
logging.basicConfig(
    filename="scripts.log",
    encoding='utf-8',
    level=logging.DEBUG,
    filemode="w",
    format='%(asctime)s - %(levelname)s - %(message)s',
)

logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def test_dataset_viewer_analysis(dataset_name="stanfordnlp/imdb", config="plain_text", split_name="train"):
    """Fetch raw statistics from the Dataset Viewer API and pretty-print them.

    Args:
        dataset_name: HuggingFace dataset identifier.
        config: Dataset configuration to query (was hard-coded; now a parameter).
        split_name: Split to fetch statistics for (was hard-coded; now a parameter).

    Raises:
        RuntimeError: If the HF_TOKEN environment variable is not set.
    """
    # Fail with a clear, actionable message instead of a bare KeyError
    # when the token is missing from the environment / .env file.
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise RuntimeError("HF_TOKEN environment variable is not set; add it to your .env file.")
    service = DatasetViewerAdapter(token=token)
    result = service.get_dataset_statistics(dataset_name=dataset_name, config=config, split_name=split_name)
    pprint(result, indent=2)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def test_dataset_service_analysis(dataset_name="stanfordnlp/imdb"):
    """Run the integrated analysis tool on the train split and pretty-print it."""
    analysis = analyze_dataset_features(dataset_id=dataset_name, split="train")
    pprint(analysis, indent=2)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_statistics_availability(dataset_name="stanfordnlp/imdb"):
    """Report whether Dataset Viewer statistics exist for a dataset.

    Args:
        dataset_name: HuggingFace dataset identifier.

    Raises:
        RuntimeError: If the HF_TOKEN environment variable is not set.
    """
    # Fail with a clear, actionable message instead of a bare KeyError
    # when the token is missing from the environment / .env file.
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise RuntimeError("HF_TOKEN environment variable is not set; add it to your .env file.")
    service = DatasetViewerAdapter(token=token)
    result = service.check_statistics_availability(dataset_name=dataset_name)
    print(f"\nStatistics availability for {dataset_name}:")
    pprint(result, indent=2)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
if __name__ == "__main__":
    # Exercise the raw Dataset Viewer /statistics endpoint directly.
    print("###### Dataset Viewer Statistics Endpoint #######")
    test_dataset_viewer_analysis()

    # Exercise the integrated tool, which prefers viewer statistics and
    # falls back to sampling when they are unavailable.
    print("\n###### Integrated Analysis (uses Dataset Viewer when available) #######")
    test_dataset_service_analysis()

    print("\n###### Check Statistics Availability #######")
    test_statistics_availability("stanfordnlp/imdb")

    # Test with a dataset that might not have statistics
    print("\n###### Testing fallback for dataset without parquet format #######")
    try:
        result = analyze_dataset_features(dataset_id="glue", config_name="cola", split="train", sample_size=100)
        print(f"Analysis method: {result['sample_info']['sampling_method']}")
        print(f"Sample size: {result['dataset_info']['sample_size_used']}")
    except Exception as e:
        # Playground script: report the failure and exit normally
        # rather than crashing with a traceback.
        print(f"Error: {e}")
|
src/hf_eda_mcp/services/dataset_service.py
CHANGED
|
@@ -79,9 +79,11 @@ class DatasetService:
|
|
| 79 |
# Cache subdirectories
|
| 80 |
self.metadata_cache_dir = self.cache_dir / "metadata"
|
| 81 |
self.sample_cache_dir = self.cache_dir / "samples"
|
|
|
|
| 82 |
|
| 83 |
self.metadata_cache_dir.mkdir(exist_ok=True)
|
| 84 |
self.sample_cache_dir.mkdir(exist_ok=True)
|
|
|
|
| 85 |
|
| 86 |
logger.info(f"DatasetService initialized with cache dir: {self.cache_dir}")
|
| 87 |
|
|
@@ -101,6 +103,16 @@ class DatasetService:
|
|
| 101 |
"""Generate cache key for dataset samples."""
|
| 102 |
base_key = self._get_cache_key(dataset_id, config_name)
|
| 103 |
return f"{base_key}_{split}_{num_samples}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
def _is_cache_valid(self, cache_file: Path) -> bool:
|
| 106 |
"""Check if cache file exists and is within TTL."""
|
|
@@ -572,6 +584,123 @@ class DatasetService:
|
|
| 572 |
f"Failed to load dataset sample: {str(e)}"
|
| 573 |
) from e
|
| 574 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
def get_cached_metadata(self, dataset_id: str, config_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
|
| 576 |
"""
|
| 577 |
Retrieve cached metadata without making API calls.
|
|
@@ -602,6 +731,8 @@ class DatasetService:
|
|
| 602 |
cache_file.unlink()
|
| 603 |
for cache_file in self.sample_cache_dir.glob("*.json"):
|
| 604 |
cache_file.unlink()
|
|
|
|
|
|
|
| 605 |
logger.info("Cleared all cache")
|
| 606 |
else:
|
| 607 |
# Clear cache for specific dataset
|
|
@@ -615,6 +746,10 @@ class DatasetService:
|
|
| 615 |
for cache_file in self.sample_cache_dir.glob(f"{cache_key}*.json"):
|
| 616 |
cache_file.unlink()
|
| 617 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
logger.info(f"Cleared cache for dataset: {dataset_id}")
|
| 619 |
|
| 620 |
except Exception as e:
|
|
@@ -631,19 +766,23 @@ class DatasetService:
|
|
| 631 |
try:
|
| 632 |
metadata_files = list(self.metadata_cache_dir.glob("*.json"))
|
| 633 |
sample_files = list(self.sample_cache_dir.glob("*.json"))
|
|
|
|
| 634 |
|
| 635 |
# Calculate cache sizes
|
| 636 |
metadata_size = sum(f.stat().st_size for f in metadata_files)
|
| 637 |
sample_size = sum(f.stat().st_size for f in sample_files)
|
|
|
|
| 638 |
|
| 639 |
return {
|
| 640 |
'cache_dir': str(self.cache_dir),
|
| 641 |
'metadata_files': len(metadata_files),
|
| 642 |
'sample_files': len(sample_files),
|
| 643 |
-
'
|
|
|
|
| 644 |
'metadata_size_bytes': metadata_size,
|
| 645 |
'sample_size_bytes': sample_size,
|
| 646 |
-
'
|
|
|
|
| 647 |
'cache_ttl_seconds': self.cache_ttl
|
| 648 |
}
|
| 649 |
except Exception as e:
|
|
|
|
| 79 |
# Cache subdirectories
|
| 80 |
self.metadata_cache_dir = self.cache_dir / "metadata"
|
| 81 |
self.sample_cache_dir = self.cache_dir / "samples"
|
| 82 |
+
self.statistics_cache_dir = self.cache_dir / "statistics"
|
| 83 |
|
| 84 |
self.metadata_cache_dir.mkdir(exist_ok=True)
|
| 85 |
self.sample_cache_dir.mkdir(exist_ok=True)
|
| 86 |
+
self.statistics_cache_dir.mkdir(exist_ok=True)
|
| 87 |
|
| 88 |
logger.info(f"DatasetService initialized with cache dir: {self.cache_dir}")
|
| 89 |
|
|
|
|
| 103 |
"""Generate cache key for dataset samples."""
|
| 104 |
base_key = self._get_cache_key(dataset_id, config_name)
|
| 105 |
return f"{base_key}_{split}_{num_samples}"
|
| 106 |
+
|
| 107 |
+
def _get_statistics_cache_key(
|
| 108 |
+
self,
|
| 109 |
+
dataset_id: str,
|
| 110 |
+
split: str,
|
| 111 |
+
config_name: Optional[str] = None
|
| 112 |
+
) -> str:
|
| 113 |
+
"""Generate cache key for dataset statistics."""
|
| 114 |
+
base_key = self._get_cache_key(dataset_id, config_name)
|
| 115 |
+
return f"{base_key}_{split}_stats"
|
| 116 |
|
| 117 |
def _is_cache_valid(self, cache_file: Path) -> bool:
|
| 118 |
"""Check if cache file exists and is within TTL."""
|
|
|
|
| 584 |
f"Failed to load dataset sample: {str(e)}"
|
| 585 |
) from e
|
| 586 |
|
| 587 |
+
def get_dataset_statistics(
    self,
    dataset_id: str,
    split: str = "train",
    config_name: Optional[str] = None,
    use_cache: bool = True
) -> Optional[Dict[str, Any]]:
    """
    Get detailed statistics from Dataset Viewer API with caching.

    This method provides comprehensive statistics directly from HuggingFace's
    Dataset Viewer API, which is more efficient and complete than sampling.

    Statistics are only available for datasets with builder_name="parquet".
    If statistics are not available, returns None and the caller should fall
    back to sample-based analysis.

    Args:
        dataset_id: HuggingFace dataset identifier
        split: Dataset split to get statistics for
        config_name: Optional configuration name
        use_cache: Whether to use cached statistics (default: True)

    Returns:
        Dictionary containing statistics or None if unavailable:
        - num_examples: Total number of examples
        - statistics: List of column statistics
        - partial: Whether response is partial
        - _cached_at: Cache timestamp

    Note:
        This method never raises on API failure: every exception is logged
        and converted to a None return so the caller can fall back to
        sample-based analysis.
    """
    # Context carried into error logs so failures are attributable.
    context = {
        "dataset_id": dataset_id,
        "split": split,
        "config_name": config_name,
        "operation": "get_dataset_statistics"
    }

    # Check cache first if enabled
    if use_cache:
        cache_key = self._get_statistics_cache_key(dataset_id, split, config_name)
        cache_file = self.statistics_cache_dir / f"{cache_key}.json"

        cached_data = self._load_from_cache(cache_file)
        if cached_data is not None:
            logger.debug(f"Using cached statistics for {dataset_id}/{split}")
            return cached_data

    try:
        # First, check if statistics are available for this dataset
        logger.info(f"Checking statistics availability for {dataset_id}")
        availability = self.dataset_viewer.check_statistics_availability(
            dataset_id, config_name
        )

        if not availability['available']:
            logger.info(
                f"Statistics not available for {dataset_id}: {availability['reason']}"
            )
            return None

        # Determine which config to use
        if config_name is None:
            # Use first available config
            # NOTE(review): when the config is auto-selected here, the cache
            # key computed above was built with config_name=None, so the
            # entry is stored/looked up under the None key regardless of the
            # config actually used — confirm this aliasing is intended.
            available_configs = availability['configs']
            if not available_configs:
                logger.warning(f"No configs with statistics found for {dataset_id}")
                return None
            config_name = available_configs[0]
            logger.info(f"Using config '{config_name}' for statistics")
        elif config_name not in availability['configs']:
            logger.warning(
                f"Config '{config_name}' does not support statistics. "
                f"Available configs: {availability['configs']}"
            )
            return None

        # Fetch statistics from API
        logger.info(f"Fetching statistics for {dataset_id}/{config_name}/{split}")
        statistics = self.dataset_viewer.get_dataset_statistics(
            dataset_name=dataset_id,
            config=config_name,
            split_name=split
        )

        # Add bookkeeping metadata (underscore-prefixed keys).
        statistics['_cached_at'] = time.time()
        statistics['_config_used'] = config_name
        statistics['_dataset_id'] = dataset_id
        statistics['_split'] = split

        # Cache the results; a cache write failure is non-fatal.
        if use_cache:
            try:
                self._save_to_cache(cache_file, statistics)
            except CacheError as e:
                logger.warning(f"Failed to cache statistics: {e}")

        logger.info(
            f"Successfully fetched statistics for {dataset_id}: "
            f"{statistics.get('num_examples', 0)} examples, "
            f"{len(statistics.get('statistics', []))} columns"
        )

        return statistics

    except Exception as e:
        # Log but don't fail - caller can fall back to sampling
        log_error_with_context(e, context, level=logging.WARNING)
        logger.info(
            f"Could not fetch statistics for {dataset_id}, "
            "caller should use sample-based analysis"
        )
        return None
|
| 703 |
+
|
| 704 |
def get_cached_metadata(self, dataset_id: str, config_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
|
| 705 |
"""
|
| 706 |
Retrieve cached metadata without making API calls.
|
|
|
|
| 731 |
cache_file.unlink()
|
| 732 |
for cache_file in self.sample_cache_dir.glob("*.json"):
|
| 733 |
cache_file.unlink()
|
| 734 |
+
for cache_file in self.statistics_cache_dir.glob("*.json"):
|
| 735 |
+
cache_file.unlink()
|
| 736 |
logger.info("Cleared all cache")
|
| 737 |
else:
|
| 738 |
# Clear cache for specific dataset
|
|
|
|
| 746 |
for cache_file in self.sample_cache_dir.glob(f"{cache_key}*.json"):
|
| 747 |
cache_file.unlink()
|
| 748 |
|
| 749 |
+
# Clear statistics cache
|
| 750 |
+
for cache_file in self.statistics_cache_dir.glob(f"{cache_key}*.json"):
|
| 751 |
+
cache_file.unlink()
|
| 752 |
+
|
| 753 |
logger.info(f"Cleared cache for dataset: {dataset_id}")
|
| 754 |
|
| 755 |
except Exception as e:
|
|
|
|
| 766 |
try:
|
| 767 |
metadata_files = list(self.metadata_cache_dir.glob("*.json"))
|
| 768 |
sample_files = list(self.sample_cache_dir.glob("*.json"))
|
| 769 |
+
statistics_files = list(self.statistics_cache_dir.glob("*.json"))
|
| 770 |
|
| 771 |
# Calculate cache sizes
|
| 772 |
metadata_size = sum(f.stat().st_size for f in metadata_files)
|
| 773 |
sample_size = sum(f.stat().st_size for f in sample_files)
|
| 774 |
+
statistics_size = sum(f.stat().st_size for f in statistics_files)
|
| 775 |
|
| 776 |
return {
|
| 777 |
'cache_dir': str(self.cache_dir),
|
| 778 |
'metadata_files': len(metadata_files),
|
| 779 |
'sample_files': len(sample_files),
|
| 780 |
+
'statistics_files': len(statistics_files),
|
| 781 |
+
'total_files': len(metadata_files) + len(sample_files) + len(statistics_files),
|
| 782 |
'metadata_size_bytes': metadata_size,
|
| 783 |
'sample_size_bytes': sample_size,
|
| 784 |
+
'statistics_size_bytes': statistics_size,
|
| 785 |
+
'total_size_bytes': metadata_size + sample_size + statistics_size,
|
| 786 |
'cache_ttl_seconds': self.cache_ttl
|
| 787 |
}
|
| 788 |
except Exception as e:
|
src/hf_eda_mcp/services/dataset_viewer_adapter.py
CHANGED
|
@@ -153,4 +153,128 @@ class DatasetViewerAdapter():
|
|
| 153 |
except Exception as e:
|
| 154 |
error_msg = f"Unexpected error fetching dataset information: {str(e)}"
|
| 155 |
logger.error(error_msg)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
raise DatasetViewerError(error_msg) from e
|
|
|
|
| 153 |
except Exception as e:
|
| 154 |
error_msg = f"Unexpected error fetching dataset information: {str(e)}"
|
| 155 |
logger.error(error_msg)
|
| 156 |
+
raise DatasetViewerError(error_msg) from e
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def get_dataset_statistics(
    self,
    dataset_name: str,
    config: str,
    split_name: str
) -> dict:
    """
    Fetch per-column statistics for one split via the Dataset Viewer API.

    The /statistics endpoint reports, depending on column type:
    - int/float: histograms, mean, median, min, max, std
    - class_label/string_label: value frequencies, unique counts
    - string_text: length distributions
    - image: width/height distributions
    - audio: duration distributions

    Note: only datasets with builder_name="parquet" are served by this
    endpoint; call get_dataset_information() first to check support.

    Args:
        dataset_name: HuggingFace dataset identifier
        config: Configuration name (required)
        split_name: Split name (required)

    Returns:
        Dictionary with:
        - num_examples: Total number of examples in the split
        - statistics: List of column statistics with type-specific metrics
        - partial: Whether the response is partial

    Raises:
        DatasetViewerError: If the API request fails or statistics are unavailable
    """
    query = {
        "dataset": dataset_name,
        "config": config,
        "split": split_name,
    }

    logger.info(f"Fetching dataset statistics from Viewer API: {dataset_name}/{config}/{split_name}")

    try:
        response = self._api_get(route="statistics", params=query)

        # Surface degraded responses in the logs without failing the call.
        if response.get('failed'):
            logger.warning(f"Dataset Viewer API returned failures: {response['failed']}")
        if response.get('partial'):
            logger.warning("Dataset Viewer API returned partial data")

        return response

    except DatasetViewerError:
        # Already carries context from the transport layer.
        raise
    except Exception as e:
        error_msg = f"Unexpected error fetching dataset statistics: {str(e)}"
        logger.error(error_msg)
        raise DatasetViewerError(error_msg) from e
|
| 222 |
+
|
| 223 |
+
def check_statistics_availability(
|
| 224 |
+
self,
|
| 225 |
+
dataset_name: str,
|
| 226 |
+
config: Optional[str] = None
|
| 227 |
+
) -> dict:
|
| 228 |
+
"""
|
| 229 |
+
Check if statistics are available for a dataset.
|
| 230 |
+
|
| 231 |
+
Statistics are only available for datasets with builder_name="parquet".
|
| 232 |
+
This method checks the dataset information to determine availability.
|
| 233 |
+
|
| 234 |
+
Args:
|
| 235 |
+
dataset_name: HuggingFace dataset identifier
|
| 236 |
+
config: Optional configuration name
|
| 237 |
+
|
| 238 |
+
Returns:
|
| 239 |
+
Dictionary with availability information:
|
| 240 |
+
- available: Boolean indicating if statistics are available
|
| 241 |
+
- configs: List of configs with statistics support
|
| 242 |
+
- reason: Explanation if statistics are not available
|
| 243 |
+
|
| 244 |
+
Raises:
|
| 245 |
+
DatasetViewerError: If the API request fails
|
| 246 |
+
"""
|
| 247 |
+
try:
|
| 248 |
+
info = self.get_dataset_information(dataset_name, config)
|
| 249 |
+
dataset_info = info.get('dataset_info', {})
|
| 250 |
+
|
| 251 |
+
# Handle both response formats
|
| 252 |
+
if isinstance(dataset_info, dict) and 'config_name' in dataset_info:
|
| 253 |
+
# Single config format
|
| 254 |
+
builder_name = dataset_info.get('builder_name', '')
|
| 255 |
+
is_parquet = builder_name == 'parquet'
|
| 256 |
+
|
| 257 |
+
return {
|
| 258 |
+
'available': is_parquet,
|
| 259 |
+
'configs': [dataset_info.get('config_name')] if is_parquet else [],
|
| 260 |
+
'reason': 'Statistics available' if is_parquet else f'Statistics only available for parquet datasets (found: {builder_name})'
|
| 261 |
+
}
|
| 262 |
+
else:
|
| 263 |
+
# Multiple configs format
|
| 264 |
+
parquet_configs = []
|
| 265 |
+
for cfg_name, cfg_data in dataset_info.items():
|
| 266 |
+
if cfg_data.get('builder_name') == 'parquet':
|
| 267 |
+
parquet_configs.append(cfg_name)
|
| 268 |
+
|
| 269 |
+
return {
|
| 270 |
+
'available': len(parquet_configs) > 0,
|
| 271 |
+
'configs': parquet_configs,
|
| 272 |
+
'reason': f'Statistics available for {len(parquet_configs)} config(s)' if parquet_configs else 'No parquet configs found'
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
except DatasetViewerError:
|
| 276 |
+
raise
|
| 277 |
+
except Exception as e:
|
| 278 |
+
error_msg = f"Unexpected error checking statistics availability: {str(e)}"
|
| 279 |
+
logger.error(error_msg)
|
| 280 |
raise DatasetViewerError(error_msg) from e
|
src/hf_eda_mcp/tools/analysis.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
Basic analysis tools for exploratory data analysis of HuggingFace datasets.
|
| 3 |
|
| 4 |
This module provides tools for performing exploratory data analysis including
|
| 5 |
-
feature statistics
|
| 6 |
"""
|
| 7 |
|
| 8 |
import logging
|
|
@@ -75,7 +75,6 @@ def analyze_dataset_features(
|
|
| 75 |
- unique_count: Number of unique values
|
| 76 |
- statistics: Type-specific statistics (mean, std for numerical; top values for categorical)
|
| 77 |
- summary: Overall analysis summary
|
| 78 |
-
- data_quality: Data quality assessment
|
| 79 |
|
| 80 |
Raises:
|
| 81 |
ValueError: If inputs are invalid
|
|
@@ -88,10 +87,6 @@ def analyze_dataset_features(
|
|
| 88 |
>>> for feature_name, feature_analysis in analysis['features'].items():
|
| 89 |
... print(f"{feature_name}: {feature_analysis['feature_type']}")
|
| 90 |
... print(f" Missing: {feature_analysis['missing_percentage']:.1f}%")
|
| 91 |
-
|
| 92 |
-
>>> # Check data quality
|
| 93 |
-
>>> quality = analysis['data_quality']
|
| 94 |
-
>>> print(f"Overall quality score: {quality['quality_score']:.2f}")
|
| 95 |
"""
|
| 96 |
# Handle empty strings from Gradio (convert to None)
|
| 97 |
if config_name == "":
|
|
@@ -122,8 +117,25 @@ def analyze_dataset_features(
|
|
| 122 |
)
|
| 123 |
|
| 124 |
try:
|
| 125 |
-
# Get dataset service
|
| 126 |
service = get_dataset_service()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
sample_data = service.load_dataset_sample(
|
| 128 |
dataset_id=dataset_id,
|
| 129 |
split=split,
|
|
@@ -132,8 +144,6 @@ def analyze_dataset_features(
|
|
| 132 |
streaming=True,
|
| 133 |
)
|
| 134 |
|
| 135 |
-
# Note: We could get dataset metadata here for additional context if needed
|
| 136 |
-
|
| 137 |
# Perform feature analysis
|
| 138 |
features_analysis = {}
|
| 139 |
data_samples = sample_data["data"]
|
|
@@ -172,7 +182,6 @@ def analyze_dataset_features(
|
|
| 172 |
"analysis_timestamp": sample_data.get("_sampled_at"),
|
| 173 |
},
|
| 174 |
"features": features_analysis,
|
| 175 |
-
"data_quality": _assess_data_quality(features_analysis),
|
| 176 |
"summary": _generate_analysis_summary(features_analysis, len(data_samples)),
|
| 177 |
}
|
| 178 |
|
|
@@ -205,6 +214,215 @@ def analyze_dataset_features(
|
|
| 205 |
raise DatasetServiceError(f"Failed to analyze dataset features: {str(e)}") from e
|
| 206 |
|
| 207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
def _analyze_single_feature(
|
| 209 |
feature_name: str, data_samples: List[Dict[str, Any]]
|
| 210 |
) -> Dict[str, Any]:
|
|
@@ -396,113 +614,6 @@ def _compute_text_statistics(values: List[str]) -> Dict[str, Any]:
|
|
| 396 |
return {"count": len(values), "error": str(e)}
|
| 397 |
|
| 398 |
|
| 399 |
-
def _assess_data_quality(
|
| 400 |
-
features_analysis: Dict[str, Dict[str, Any]],
|
| 401 |
-
) -> Dict[str, Any]:
|
| 402 |
-
"""
|
| 403 |
-
Assess overall data quality based on feature analysis.
|
| 404 |
-
|
| 405 |
-
Args:
|
| 406 |
-
features_analysis: Dictionary of feature analyses
|
| 407 |
-
|
| 408 |
-
Returns:
|
| 409 |
-
Dictionary containing data quality assessment
|
| 410 |
-
"""
|
| 411 |
-
if not features_analysis:
|
| 412 |
-
return {"quality_score": 0.0, "issues": ["No features to analyze"]}
|
| 413 |
-
|
| 414 |
-
total_features = len(features_analysis)
|
| 415 |
-
issues = []
|
| 416 |
-
quality_factors = []
|
| 417 |
-
|
| 418 |
-
# Check missing value rates
|
| 419 |
-
high_missing_features = 0
|
| 420 |
-
total_missing_rate = 0
|
| 421 |
-
|
| 422 |
-
for feature_name, analysis in features_analysis.items():
|
| 423 |
-
missing_pct = analysis.get("missing_percentage", 0)
|
| 424 |
-
total_missing_rate += missing_pct
|
| 425 |
-
|
| 426 |
-
if missing_pct > 50:
|
| 427 |
-
high_missing_features += 1
|
| 428 |
-
issues.append(
|
| 429 |
-
f"Feature '{feature_name}' has {missing_pct:.1f}% missing values"
|
| 430 |
-
)
|
| 431 |
-
elif missing_pct > 20:
|
| 432 |
-
issues.append(
|
| 433 |
-
f"Feature '{feature_name}' has {missing_pct:.1f}% missing values"
|
| 434 |
-
)
|
| 435 |
-
|
| 436 |
-
avg_missing_rate = total_missing_rate / total_features
|
| 437 |
-
|
| 438 |
-
# Quality score calculation (0-1 scale)
|
| 439 |
-
missing_score = max(0, 1 - (avg_missing_rate / 100))
|
| 440 |
-
quality_factors.append(("missing_values", missing_score))
|
| 441 |
-
|
| 442 |
-
# Check for features with very low diversity
|
| 443 |
-
low_diversity_features = 0
|
| 444 |
-
for feature_name, analysis in features_analysis.items():
|
| 445 |
-
unique_count = analysis.get("unique_count", 0)
|
| 446 |
-
total_count = analysis.get("total_count", 1)
|
| 447 |
-
diversity_ratio = unique_count / total_count if total_count > 0 else 0
|
| 448 |
-
|
| 449 |
-
if diversity_ratio < 0.01 and analysis.get("feature_type") != "boolean":
|
| 450 |
-
low_diversity_features += 1
|
| 451 |
-
issues.append(
|
| 452 |
-
f"Feature '{feature_name}' has very low diversity ({unique_count} unique values)"
|
| 453 |
-
)
|
| 454 |
-
|
| 455 |
-
diversity_score = max(0, 1 - (low_diversity_features / total_features))
|
| 456 |
-
quality_factors.append(("diversity", diversity_score))
|
| 457 |
-
|
| 458 |
-
# Overall quality score (weighted average)
|
| 459 |
-
weights = {"missing_values": 0.6, "diversity": 0.4}
|
| 460 |
-
quality_score = sum(weights[factor] * score for factor, score in quality_factors)
|
| 461 |
-
|
| 462 |
-
# Quality assessment
|
| 463 |
-
if quality_score >= 0.8:
|
| 464 |
-
quality_level = "high"
|
| 465 |
-
elif quality_score >= 0.6:
|
| 466 |
-
quality_level = "medium"
|
| 467 |
-
else:
|
| 468 |
-
quality_level = "low"
|
| 469 |
-
|
| 470 |
-
return {
|
| 471 |
-
"quality_score": quality_score,
|
| 472 |
-
"quality_level": quality_level,
|
| 473 |
-
"avg_missing_rate": avg_missing_rate,
|
| 474 |
-
"high_missing_features": high_missing_features,
|
| 475 |
-
"low_diversity_features": low_diversity_features,
|
| 476 |
-
"issues": issues,
|
| 477 |
-
"recommendations": _generate_quality_recommendations(issues, quality_score),
|
| 478 |
-
}
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
def _generate_quality_recommendations(
|
| 482 |
-
issues: List[str], quality_score: float
|
| 483 |
-
) -> List[str]:
|
| 484 |
-
"""Generate recommendations based on data quality issues."""
|
| 485 |
-
recommendations = []
|
| 486 |
-
|
| 487 |
-
if quality_score < 0.6:
|
| 488 |
-
recommendations.append(
|
| 489 |
-
"Consider data cleaning and preprocessing before analysis"
|
| 490 |
-
)
|
| 491 |
-
|
| 492 |
-
if any("missing values" in issue for issue in issues):
|
| 493 |
-
recommendations.append("Handle missing values through imputation or removal")
|
| 494 |
-
|
| 495 |
-
if any("low diversity" in issue for issue in issues):
|
| 496 |
-
recommendations.append(
|
| 497 |
-
"Review features with low diversity - they may not be informative"
|
| 498 |
-
)
|
| 499 |
-
|
| 500 |
-
if not recommendations:
|
| 501 |
-
recommendations.append("Data quality looks good for analysis")
|
| 502 |
-
|
| 503 |
-
return recommendations
|
| 504 |
-
|
| 505 |
-
|
| 506 |
def _generate_analysis_summary(
|
| 507 |
features_analysis: Dict[str, Dict[str, Any]], sample_size: int
|
| 508 |
) -> str:
|
|
|
|
| 2 |
Basic analysis tools for exploratory data analysis of HuggingFace datasets.
|
| 3 |
|
| 4 |
This module provides tools for performing exploratory data analysis including
|
| 5 |
+
feature statistics and missing value analysis.
|
| 6 |
"""
|
| 7 |
|
| 8 |
import logging
|
|
|
|
| 75 |
- unique_count: Number of unique values
|
| 76 |
- statistics: Type-specific statistics (mean, std for numerical; top values for categorical)
|
| 77 |
- summary: Overall analysis summary
|
|
|
|
| 78 |
|
| 79 |
Raises:
|
| 80 |
ValueError: If inputs are invalid
|
|
|
|
| 87 |
>>> for feature_name, feature_analysis in analysis['features'].items():
|
| 88 |
... print(f"{feature_name}: {feature_analysis['feature_type']}")
|
| 89 |
... print(f" Missing: {feature_analysis['missing_percentage']:.1f}%")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
"""
|
| 91 |
# Handle empty strings from Gradio (convert to None)
|
| 92 |
if config_name == "":
|
|
|
|
| 117 |
)
|
| 118 |
|
| 119 |
try:
|
| 120 |
+
# Get dataset service
|
| 121 |
service = get_dataset_service()
|
| 122 |
+
|
| 123 |
+
# Try to get statistics from Dataset Viewer API first (more efficient and complete)
|
| 124 |
+
viewer_stats = service.get_dataset_statistics(
|
| 125 |
+
dataset_id=dataset_id,
|
| 126 |
+
split=split,
|
| 127 |
+
config_name=config_name
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
if viewer_stats is not None:
|
| 131 |
+
# Use Dataset Viewer statistics (full dataset, no sampling needed)
|
| 132 |
+
logger.info(f"Using Dataset Viewer statistics for {dataset_id}")
|
| 133 |
+
return _convert_viewer_statistics_to_analysis(
|
| 134 |
+
viewer_stats, dataset_id, config_name, split
|
| 135 |
+
)
|
| 136 |
+
|
| 137 |
+
# Fall back to sample-based analysis
|
| 138 |
+
logger.info("Dataset Viewer statistics not available, using sample-based analysis")
|
| 139 |
sample_data = service.load_dataset_sample(
|
| 140 |
dataset_id=dataset_id,
|
| 141 |
split=split,
|
|
|
|
| 144 |
streaming=True,
|
| 145 |
)
|
| 146 |
|
|
|
|
|
|
|
| 147 |
# Perform feature analysis
|
| 148 |
features_analysis = {}
|
| 149 |
data_samples = sample_data["data"]
|
|
|
|
| 182 |
"analysis_timestamp": sample_data.get("_sampled_at"),
|
| 183 |
},
|
| 184 |
"features": features_analysis,
|
|
|
|
| 185 |
"summary": _generate_analysis_summary(features_analysis, len(data_samples)),
|
| 186 |
}
|
| 187 |
|
|
|
|
| 214 |
raise DatasetServiceError(f"Failed to analyze dataset features: {str(e)}") from e
|
| 215 |
|
| 216 |
|
| 217 |
+
# Dispatch table for Dataset Viewer column types whose statistics are the
# standard numeric summary (min/max/mean/median/std + histogram). Maps the
# viewer's column_type to (our feature_type label, key suffix). The suffix
# renames the raw stat keys to domain-specific names, e.g. image dimensions
# become 'min_dimension' and audio durations become 'min_duration'.
_VIEWER_NUMERIC_TYPES: Dict[str, tuple] = {
    'string_text': ('text', '_length'),        # character-length statistics
    'int': ('numerical', ''),
    'float': ('numerical', ''),
    'image': ('image', '_dimension'),
    'audio': ('audio', '_duration'),           # durations in seconds
    'list': ('list', '_length'),
}


def _base_feature_entry(
    feature_type: str,
    column_statistics: Dict[str, Any],
    num_examples: int,
    unique_count: int = 0,
) -> Dict[str, Any]:
    """Build the envelope fields shared by every per-feature analysis entry.

    Args:
        feature_type: Our feature-type label (e.g. 'text', 'categorical').
        column_statistics: Raw 'column_statistics' dict from the viewer API.
        num_examples: Total row count for the split.
        unique_count: Distinct-value count when known, else 0.

    Returns:
        Dict with the common missing-value / count fields; caller adds
        'statistics' and 'sample_values'.
    """
    nan_count = column_statistics.get('nan_count', 0)
    return {
        'feature_type': feature_type,
        'missing_count': nan_count,
        'missing_percentage': column_statistics.get('nan_proportion', 0.0) * 100,
        'unique_count': unique_count,
        'total_count': num_examples,
        'non_missing_count': num_examples - nan_count,
    }


def _numeric_summary(
    column_statistics: Dict[str, Any],
    num_examples: int,
    suffix: str,
) -> Dict[str, Any]:
    """Extract the min/max/mean/median/std/histogram summary from viewer stats.

    Args:
        column_statistics: Raw 'column_statistics' dict from the viewer API.
        num_examples: Total row count for the split.
        suffix: Appended to each stat key (e.g. '_length' -> 'min_length').

    Returns:
        Dict with 'count', the five suffixed summary stats, and 'histogram'.
    """
    summary: Dict[str, Any] = {
        'count': num_examples - column_statistics.get('nan_count', 0),
    }
    for stat in ('min', 'max', 'mean', 'median', 'std'):
        summary[f'{stat}{suffix}'] = column_statistics.get(stat, 0)
    summary['histogram'] = column_statistics.get('histogram', {})
    return summary


def _convert_viewer_statistics_to_analysis(
    viewer_stats: Dict[str, Any],
    dataset_id: str,
    config_name: Optional[str],
    split: str
) -> Dict[str, Any]:
    """
    Convert Dataset Viewer API statistics to our analysis format.

    Supports all Dataset Viewer column types:
    - Numerical: int, float
    - Categorical: class_label, string_label, bool
    - Text: string_text
    - Media: image, audio
    - Structured: list
    Unknown column types are passed through with their raw statistics.

    Args:
        viewer_stats: Statistics from Dataset Viewer API
        dataset_id: Dataset identifier
        config_name: Configuration name
        split: Split name

    Returns:
        Dictionary in our standard analysis format
    """
    num_examples = viewer_stats.get('num_examples', 0)
    statistics_list = viewer_stats.get('statistics', [])

    features_analysis: Dict[str, Any] = {}

    for col_stat in statistics_list:
        column_name = col_stat.get('column_name', 'unknown')
        column_type = col_stat.get('column_type', 'unknown')
        column_statistics = col_stat.get('column_statistics', {})
        non_missing = num_examples - column_statistics.get('nan_count', 0)

        if column_type in _VIEWER_NUMERIC_TYPES:
            # Numeric-summary types: text lengths, numbers, image dimensions,
            # audio durations, list lengths. Same shape, different key suffix.
            feature_type, suffix = _VIEWER_NUMERIC_TYPES[column_type]
            entry = _base_feature_entry(feature_type, column_statistics, num_examples)
            entry['statistics'] = _numeric_summary(column_statistics, num_examples, suffix)
            entry['sample_values'] = []
        elif column_type in ('class_label', 'string_label'):
            # Categorical features: frequency distributions
            frequencies = column_statistics.get('frequencies', {})
            n_unique = column_statistics.get('n_unique', len(frequencies))
            entry = _base_feature_entry(
                'categorical', column_statistics, num_examples, unique_count=n_unique
            )
            entry['statistics'] = {
                'count': non_missing,
                'unique_count': n_unique,
                'frequencies': frequencies,
                'most_common': sorted(
                    frequencies.items(), key=lambda x: x[1], reverse=True
                ),
                'top_value': max(frequencies.items(), key=lambda x: x[1]) if frequencies else None,
                'no_label_count': column_statistics.get('no_label_count', 0),
                'no_label_proportion': column_statistics.get('no_label_proportion', 0.0),
            }
            entry['sample_values'] = list(frequencies.keys())[:5]
        elif column_type == 'bool':
            # Boolean features: True/False frequencies
            frequencies = column_statistics.get('frequencies', {})
            entry = _base_feature_entry(
                'boolean', column_statistics, num_examples, unique_count=len(frequencies)
            )
            entry['statistics'] = {
                'count': non_missing,
                'frequencies': frequencies,
            }
            entry['sample_values'] = list(frequencies.keys())
        else:
            # Unknown type - provide basic info with all available statistics
            entry = _base_feature_entry(
                column_type,
                column_statistics,
                num_examples,
                unique_count=column_statistics.get('n_unique', 0),
            )
            entry['statistics'] = column_statistics
            entry['sample_values'] = []

        features_analysis[column_name] = entry

    # Generate overall analysis in the same shape as sample-based analysis,
    # but flagged as covering the full dataset via the viewer API.
    analysis_result = {
        "dataset_info": {
            "dataset_id": dataset_id,
            "config_name": viewer_stats.get('_config_used', config_name),
            "split": split,
            "total_features": len(features_analysis),
            "sample_size_used": num_examples,
            "sample_size_requested": num_examples,
        },
        "sample_info": {
            "sampling_method": "dataset_viewer_api",
            "represents_full_dataset": True,
            "analysis_timestamp": viewer_stats.get('_cached_at'),
            "partial": viewer_stats.get('partial', False),
        },
        "features": features_analysis,
        "summary": _generate_analysis_summary(features_analysis, num_examples),
    }

    return analysis_result
|
| 424 |
+
|
| 425 |
+
|
| 426 |
def _analyze_single_feature(
|
| 427 |
feature_name: str, data_samples: List[Dict[str, Any]]
|
| 428 |
) -> Dict[str, Any]:
|
|
|
|
| 614 |
return {"count": len(values), "error": str(e)}
|
| 615 |
|
| 616 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 617 |
def _generate_analysis_summary(
|
| 618 |
features_analysis: Dict[str, Dict[str, Any]], sample_size: int
|
| 619 |
) -> str:
|