jacklangerman committed on
Commit
403b0be
·
1 Parent(s): 82b72d0
Files changed (1) hide show
  1. script.py +22 -10
script.py CHANGED
@@ -56,18 +56,30 @@ if __name__ == "__main__":
56
  data_path = data_path_local
57
 
58
  print(data_path)
59
- print([str(p) for p in data_path.rglob('*validation*.arrow')])
60
 
61
  # dataset = load_dataset(params['dataset'], trust_remote_code=True, use_auth_token=params['token'])
62
- dataset = load_dataset(
63
- "arrow",
64
- data_files={
65
- "validation": [str(p) for p in data_path.rglob('*validation*.arrow')],
66
- "test": [str(p) for p in data_path.rglob('*test*.arrow')],
67
- },
68
- trust_remote_code=True,
69
- # streaming=True
70
- )
 
 
 
 
 
 
 
 
 
 
 
 
71
  print(dataset, flush=True)
72
  # dataset = load_dataset('webdataset', data_files={)
73
 
 
56
  data_path = data_path_local
57
 
58
  print(data_path)
59
+ print([str(p) for p in data_path.rglob('*validation*.(arrow|tar)')])
60
 
61
  # dataset = load_dataset(params['dataset'], trust_remote_code=True, use_auth_token=params['token'])
62
+ data_files = {
63
+ "validation": [str(p) for p in [*data_path.rglob('*validation*.arrow')]+[*data_path.rglob('*validation*.tar')]],
64
+ "test": [str(p) for p in [*data_path.rglob('*test*.arrow')]+[*data_path.rglob('*test*.tar')]],
65
+ }
66
+ try:
67
+ dataset = load_dataset(
68
+ "arrow",
69
+ data_files=data_files,
70
+ trust_remote_code=True,
71
+ # streaming=True
72
+ )
73
+ print('load with arrow')
74
+ except:
75
+ dataset = load_dataset(
76
+ "webdataset",
77
+ data_files=data_files,
78
+ trust_remote_code=True,
79
+ # streaming=True
80
+ )
81
+ print('load with webdataset')
82
+
83
  print(dataset, flush=True)
84
  # dataset = load_dataset('webdataset', data_files={)
85