meg-huggingface
committed on
Commit
·
cb46b6e
1
Parent(s):
34eeb68
Trying to fix loading error. Note that I'm adding trust_remote_code=True; when new datasets are added, it should be verified that this is OK.
Browse files
data_measurements/dataset_utils.py
CHANGED
|
@@ -17,7 +17,7 @@ from dataclasses import asdict
|
|
| 17 |
from os.path import exists
|
| 18 |
|
| 19 |
import pandas as pd
|
| 20 |
-
from datasets import Dataset, get_dataset_infos, load_dataset, load_from_disk
|
| 21 |
|
| 22 |
# treating inf values as NaN as well
|
| 23 |
pd.set_option("use_inf_as_na", True)
|
|
@@ -110,7 +110,7 @@ def load_truncated_dataset(
|
|
| 110 |
"""
|
| 111 |
if cache_name is None:
|
| 112 |
cache_name = f"{dataset_name}_{config_name}_{split_name}_{num_rows}"
|
| 113 |
-
if exists(cache_name):
|
| 114 |
dataset = load_from_disk(cache_name)
|
| 115 |
else:
|
| 116 |
if use_streaming and dataset_name in _STREAMABLE_DATASET_LIST:
|
|
@@ -130,11 +130,15 @@ def load_truncated_dataset(
|
|
| 130 |
"temp.jsonl", features=iterable_dataset.features, split=split_name
|
| 131 |
)
|
| 132 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
full_dataset = load_dataset(
|
| 134 |
dataset_name,
|
| 135 |
name=config_name,
|
| 136 |
split=split_name,
|
| 137 |
-
trust_remote_code=
|
| 138 |
)
|
| 139 |
dataset = full_dataset.select(range(num_rows))
|
| 140 |
dataset.save_to_disk(cache_name)
|
|
@@ -289,21 +293,62 @@ def dictionarize_info(dset_info):
|
|
| 289 |
def get_dataset_info_dicts(dataset_id=None):
|
| 290 |
"""
|
| 291 |
Creates a dict from dataset configs.
|
| 292 |
-
Uses the datasets lib's get_dataset_infos
|
|
|
|
| 293 |
:return: Dictionary mapping dataset names to their configurations
|
| 294 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
if dataset_id != None:
|
| 296 |
ds_name_to_conf_dict = {
|
| 297 |
dataset_id: {
|
| 298 |
config_name: dictionarize_info(config_info)
|
| 299 |
-
for config_name, config_info in
|
| 300 |
}
|
| 301 |
}
|
| 302 |
else:
|
| 303 |
ds_name_to_conf_dict = {
|
| 304 |
ds_id: {
|
| 305 |
config_name: dictionarize_info(config_info)
|
| 306 |
-
for config_name, config_info in
|
| 307 |
}
|
| 308 |
for ds_id in _DATASET_LIST
|
| 309 |
}
|
|
|
|
| 17 |
from os.path import exists
|
| 18 |
|
| 19 |
import pandas as pd
|
| 20 |
+
from datasets import Dataset, get_dataset_infos, load_dataset, load_from_disk, load_dataset_builder, get_dataset_config_names
|
| 21 |
|
| 22 |
# treating inf values as NaN as well
|
| 23 |
pd.set_option("use_inf_as_na", True)
|
|
|
|
| 110 |
"""
|
| 111 |
if cache_name is None:
|
| 112 |
cache_name = f"{dataset_name}_{config_name}_{split_name}_{num_rows}"
|
| 113 |
+
if exists(cache_name) and use_cache:
|
| 114 |
dataset = load_from_disk(cache_name)
|
| 115 |
else:
|
| 116 |
if use_streaming and dataset_name in _STREAMABLE_DATASET_LIST:
|
|
|
|
| 130 |
"temp.jsonl", features=iterable_dataset.features, split=split_name
|
| 131 |
)
|
| 132 |
else:
|
| 133 |
+
if dataset_name in _DATASET_LIST:
|
| 134 |
+
trust_remote_code=True,
|
| 135 |
+
else:
|
| 136 |
+
trust_remote_code=False,
|
| 137 |
full_dataset = load_dataset(
|
| 138 |
dataset_name,
|
| 139 |
name=config_name,
|
| 140 |
split=split_name,
|
| 141 |
+
trust_remote_code=trust_remote_code,
|
| 142 |
)
|
| 143 |
dataset = full_dataset.select(range(num_rows))
|
| 144 |
dataset.save_to_disk(cache_name)
|
|
|
|
| 293 |
def get_dataset_info_dicts(dataset_id=None):
|
| 294 |
"""
|
| 295 |
Creates a dict from dataset configs.
|
| 296 |
+
Uses the datasets lib's get_dataset_infos or load_dataset_builder
|
| 297 |
+
with trust_remote_code=True for datasets that require it.
|
| 298 |
:return: Dictionary mapping dataset names to their configurations
|
| 299 |
"""
|
| 300 |
+
def get_infos_with_trust(ds_id):
    """Return the per-config infos for ``ds_id``, trusting remote code only if required.

    First calls ``get_dataset_infos(ds_id)`` without ``trust_remote_code``.
    If that raises a ``ValueError``/``RuntimeError`` whose message mentions
    remote-code execution AND ``ds_id`` is listed in ``_DATASET_LIST``, the
    infos are rebuilt config by config via ``load_dataset_builder`` with
    ``trust_remote_code=True``.

    :param ds_id: Dataset identifier passed through to the datasets library.
    :return: Either the result of ``get_dataset_infos`` or a dict mapping
        each config name to the ``info`` of its builder.
    :raises ValueError, RuntimeError: The original ``get_dataset_infos``
        error, when the dataset is not in ``_DATASET_LIST``, the error is
        unrelated to remote code, or every fallback fails.
    """
    try:
        # First try without trust_remote_code.
        return get_dataset_infos(ds_id)
    except (ValueError, RuntimeError) as e:
        message = str(e)
        needs_trust = (
            "trust_remote_code" in message
            or "requires you to execute" in message
        )
        # Only datasets explicitly listed in _DATASET_LIST may run remote code.
        if ds_id not in _DATASET_LIST or not needs_trust:
            raise

        try:
            config_names = get_dataset_config_names(ds_id, trust_remote_code=True)
            if not config_names:
                config_names = ["default"]

            # Load the builder for each config and collect its info.
            infos = {}
            for config_name in config_names:
                try:
                    # The config is selected with `name`; passing it as
                    # `config_name` is forwarded as a builder kwarg and fails.
                    builder = load_dataset_builder(
                        ds_id, name=config_name, trust_remote_code=True
                    )
                    if builder.info is not None:
                        infos[config_name] = builder.info
                except Exception:
                    # Best effort: a config that cannot be loaded is left out.
                    # For the placeholder "default" config, retry without
                    # naming a config at all.
                    if config_name == "default":
                        builder = load_dataset_builder(ds_id, trust_remote_code=True)
                        if builder.info is not None:
                            infos["default"] = builder.info

            if not infos:
                raise ValueError(
                    f"Could not load dataset info for {ds_id} with trust_remote_code=True"
                )
            return infos
        except Exception as inner_e:
            # Last resort kept from the original implementation.
            # NOTE(review): unclear whether get_dataset_infos honours a
            # `config_kwargs` keyword -- confirm against the installed
            # datasets version.
            try:
                return get_dataset_infos(ds_id, config_kwargs={'trust_remote_code': True})
            except Exception:
                # Surface the original error, chained to the builder failure
                # so the traceback shows why the trusted path failed.
                raise e from inner_e
|
| 339 |
+
|
| 340 |
if dataset_id != None:
|
| 341 |
ds_name_to_conf_dict = {
|
| 342 |
dataset_id: {
|
| 343 |
config_name: dictionarize_info(config_info)
|
| 344 |
+
for config_name, config_info in get_infos_with_trust(dataset_id).items()
|
| 345 |
}
|
| 346 |
}
|
| 347 |
else:
|
| 348 |
ds_name_to_conf_dict = {
|
| 349 |
ds_id: {
|
| 350 |
config_name: dictionarize_info(config_info)
|
| 351 |
+
for config_name, config_info in get_infos_with_trust(ds_id).items()
|
| 352 |
}
|
| 353 |
for ds_id in _DATASET_LIST
|
| 354 |
}
|