allow remote data paths (#1278)
Browse files
* allow remote data paths
* add docs about public url
* only allow https
* better docs
* better docs
- README.md +8 -0
- src/axolotl/utils/data.py +10 -0
README.md
CHANGED
|
@@ -468,6 +468,14 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
|
|
| 468 |
dataset:
|
| 469 |
- path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
|
| 470 |
...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
```
|
| 472 |
|
| 473 |
- loading
|
|
|
|
| 468 |
dataset:
|
| 469 |
- path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
|
| 470 |
...
|
| 471 |
+
|
| 472 |
+
# Loading Data From a Public URL
|
| 473 |
+
# - URLs must use the HTTPS protocol, not HTTP, for security reasons.
|
| 474 |
+
# - The URL should be a direct link to the file you wish to load.
|
| 475 |
+
# - The file format is `json` (which includes `jsonl`) by default. For different formats, adjust the `ds_type` option accordingly.
|
| 476 |
+
dataset:
|
| 477 |
+
- path: https://some.url.com/yourdata.jsonl # Must be a direct HTTPS link to a single data file, not a folder.
|
| 478 |
+
ds_type: json # this is the default, see other options below.
|
| 479 |
```
|
| 480 |
|
| 481 |
- loading
|
src/axolotl/utils/data.py
CHANGED
|
@@ -336,6 +336,16 @@ def load_tokenized_prepared_datasets(
|
|
| 336 |
split=None,
|
| 337 |
storage_options=storage_options,
|
| 338 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
else:
|
| 340 |
if isinstance(config_dataset.data_files, str):
|
| 341 |
fp = hf_hub_download(
|
|
|
|
| 336 |
split=None,
|
| 337 |
storage_options=storage_options,
|
| 338 |
)
|
| 339 |
+
elif config_dataset.path.startswith("https://"):
|
| 340 |
+
ds_type = get_ds_type(config_dataset)
|
| 341 |
+
ds = load_dataset(
|
| 342 |
+
ds_type,
|
| 343 |
+
name=config_dataset.name,
|
| 344 |
+
data_files=config_dataset.path,
|
| 345 |
+
streaming=False,
|
| 346 |
+
split=None,
|
| 347 |
+
storage_options=storage_options,
|
| 348 |
+
)
|
| 349 |
else:
|
| 350 |
if isinstance(config_dataset.data_files, str):
|
| 351 |
fp = hf_hub_download(
|