meg-huggingface commited on
Commit
cb46b6e
·
1 Parent(s): 34eeb68

Trying to fix loading error. Note that I'm adding trust_remote_code=True; when new datasets are added, it should be verified that this is OK.

Browse files
Files changed (1) hide show
  1. data_measurements/dataset_utils.py +51 -6
data_measurements/dataset_utils.py CHANGED
@@ -17,7 +17,7 @@ from dataclasses import asdict
17
  from os.path import exists
18
 
19
  import pandas as pd
20
- from datasets import Dataset, get_dataset_infos, load_dataset, load_from_disk
21
 
22
  # treating inf values as NaN as well
23
  pd.set_option("use_inf_as_na", True)
@@ -110,7 +110,7 @@ def load_truncated_dataset(
110
  """
111
  if cache_name is None:
112
  cache_name = f"{dataset_name}_{config_name}_{split_name}_{num_rows}"
113
- if exists(cache_name):
114
  dataset = load_from_disk(cache_name)
115
  else:
116
  if use_streaming and dataset_name in _STREAMABLE_DATASET_LIST:
@@ -130,11 +130,15 @@ def load_truncated_dataset(
130
  "temp.jsonl", features=iterable_dataset.features, split=split_name
131
  )
132
  else:
 
 
 
 
133
  full_dataset = load_dataset(
134
  dataset_name,
135
  name=config_name,
136
  split=split_name,
137
- trust_remote_code=True,
138
  )
139
  dataset = full_dataset.select(range(num_rows))
140
  dataset.save_to_disk(cache_name)
@@ -289,21 +293,62 @@ def dictionarize_info(dset_info):
289
  def get_dataset_info_dicts(dataset_id=None):
290
  """
291
  Creates a dict from dataset configs.
292
- Uses the datasets lib's get_dataset_infos
 
293
  :return: Dictionary mapping dataset names to their configurations
294
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  if dataset_id != None:
296
  ds_name_to_conf_dict = {
297
  dataset_id: {
298
  config_name: dictionarize_info(config_info)
299
- for config_name, config_info in get_dataset_infos(dataset_id).items()
300
  }
301
  }
302
  else:
303
  ds_name_to_conf_dict = {
304
  ds_id: {
305
  config_name: dictionarize_info(config_info)
306
- for config_name, config_info in get_dataset_infos(ds_id).items()
307
  }
308
  for ds_id in _DATASET_LIST
309
  }
 
17
  from os.path import exists
18
 
19
  import pandas as pd
20
+ from datasets import Dataset, get_dataset_infos, load_dataset, load_from_disk, load_dataset_builder, get_dataset_config_names
21
 
22
  # treating inf values as NaN as well
23
  pd.set_option("use_inf_as_na", True)
 
110
  """
111
  if cache_name is None:
112
  cache_name = f"{dataset_name}_{config_name}_{split_name}_{num_rows}"
113
+ if exists(cache_name) and use_cache:
114
  dataset = load_from_disk(cache_name)
115
  else:
116
  if use_streaming and dataset_name in _STREAMABLE_DATASET_LIST:
 
130
  "temp.jsonl", features=iterable_dataset.features, split=split_name
131
  )
132
  else:
133
+ if dataset_name in _DATASET_LIST:
134
+ trust_remote_code = True
135
+ else:
136
+ trust_remote_code = False
137
  full_dataset = load_dataset(
138
  dataset_name,
139
  name=config_name,
140
  split=split_name,
141
+ trust_remote_code=trust_remote_code,
142
  )
143
  dataset = full_dataset.select(range(num_rows))
144
  dataset.save_to_disk(cache_name)
 
293
  def get_dataset_info_dicts(dataset_id=None):
294
  """
295
  Creates a dict from dataset configs.
296
+ Uses the datasets lib's get_dataset_infos or load_dataset_builder
297
+ with trust_remote_code=True for datasets that require it.
298
  :return: Dictionary mapping dataset names to their configurations
299
  """
300
+ def get_infos_with_trust(ds_id):
301
+ """Get dataset infos with trust_remote_code=True if needed."""
302
+ try:
303
+ # First try without trust_remote_code
304
+ return get_dataset_infos(ds_id)
305
+ except (ValueError, RuntimeError) as e:
306
+ # If it requires trust_remote_code, use load_dataset_builder for each config
307
+ if ds_id in _DATASET_LIST and ("trust_remote_code" in str(e) or "requires you to execute" in str(e)):
308
+ try:
309
+ # Get all config names
310
+ config_names = get_dataset_config_names(ds_id, trust_remote_code=True)
311
+ if not config_names:
312
+ config_names = ["default"]
313
+
314
+ # Load builder for each config and get its info
315
+ infos = {}
316
+ for config_name in config_names:
317
+ try:
318
+ builder = load_dataset_builder(ds_id, name=config_name, trust_remote_code=True)
319
+ if builder.info is not None:
320
+ infos[config_name] = builder.info
321
+ except Exception:
322
+ # If loading with config_name fails, try without it
323
+ if config_name == "default":
324
+ builder = load_dataset_builder(ds_id, trust_remote_code=True)
325
+ if builder.info is not None:
326
+ infos["default"] = builder.info
327
+
328
+ if not infos:
329
+ raise ValueError(f"Could not load dataset info for {ds_id} with trust_remote_code=True")
330
+ return infos
331
+ except Exception as inner_e:
332
+ # Fallback: try get_dataset_infos with config_kwargs if supported
333
+ try:
334
+ return get_dataset_infos(ds_id, config_kwargs={'trust_remote_code': True})
335
+ except Exception:
336
+ raise e
337
+ else:
338
+ raise e
339
+
340
  if dataset_id != None:
341
  ds_name_to_conf_dict = {
342
  dataset_id: {
343
  config_name: dictionarize_info(config_info)
344
+ for config_name, config_info in get_infos_with_trust(dataset_id).items()
345
  }
346
  }
347
  else:
348
  ds_name_to_conf_dict = {
349
  ds_id: {
350
  config_name: dictionarize_info(config_info)
351
+ for config_name, config_info in get_infos_with_trust(ds_id).items()
352
  }
353
  for ds_id in _DATASET_LIST
354
  }