Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +4 -0
- llm_tutorial/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document.idx +0 -0
- llm_tutorial/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document/cache/GPTDataset_indices/80d7dc88bf2d24bda8780c0ade9232eb-GPTDataset-document_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document.bin +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document.idx +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/02eca412ecb6292af266f09e6facab79-GPTDataset-description.txt +10 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/02eca412ecb6292af266f09e6facab79-GPTDataset-document_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/02eca412ecb6292af266f09e6facab79-GPTDataset-sample_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/02eca412ecb6292af266f09e6facab79-GPTDataset-shuffle_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/08ff1eb63b2207c65cce43be54c00e54-GPTDataset-description.txt +10 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/08ff1eb63b2207c65cce43be54c00e54-GPTDataset-document_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/08ff1eb63b2207c65cce43be54c00e54-GPTDataset-sample_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/08ff1eb63b2207c65cce43be54c00e54-GPTDataset-shuffle_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/33ded67e093b2391ae3c697433b4284f-GPTDataset-description.txt +10 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/33ded67e093b2391ae3c697433b4284f-GPTDataset-document_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/33ded67e093b2391ae3c697433b4284f-GPTDataset-sample_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/33ded67e093b2391ae3c697433b4284f-GPTDataset-shuffle_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/350fe6b65ba7fd88a7114cedb7cbd446-GPTDataset-description.txt +10 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/350fe6b65ba7fd88a7114cedb7cbd446-GPTDataset-document_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/350fe6b65ba7fd88a7114cedb7cbd446-GPTDataset-sample_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/350fe6b65ba7fd88a7114cedb7cbd446-GPTDataset-shuffle_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/46d01ceb7d2cf1ad9e20542382e986d1-GPTDataset-description.txt +10 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/46d01ceb7d2cf1ad9e20542382e986d1-GPTDataset-document_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/46d01ceb7d2cf1ad9e20542382e986d1-GPTDataset-sample_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/46d01ceb7d2cf1ad9e20542382e986d1-GPTDataset-shuffle_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/48d213b6ea09663ec4b8fe09aadea1d0-GPTDataset-description.txt +10 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/48d213b6ea09663ec4b8fe09aadea1d0-GPTDataset-document_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/48d213b6ea09663ec4b8fe09aadea1d0-GPTDataset-sample_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/48d213b6ea09663ec4b8fe09aadea1d0-GPTDataset-shuffle_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/592590386f9491eebb2140b0b7a5a7aa-GPTDataset-description.txt +10 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/5fde86a30edbf3f3b847378216442630-GPTDataset-description.txt +10 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/5fde86a30edbf3f3b847378216442630-GPTDataset-document_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/5fde86a30edbf3f3b847378216442630-GPTDataset-sample_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/5fde86a30edbf3f3b847378216442630-GPTDataset-shuffle_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/676905a6d4ca63142a18409e64a50828-GPTDataset-description.txt +10 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/676905a6d4ca63142a18409e64a50828-GPTDataset-document_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/676905a6d4ca63142a18409e64a50828-GPTDataset-sample_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/676905a6d4ca63142a18409e64a50828-GPTDataset-shuffle_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6f81b6b1e641d082a358ab714ec3ef80-GPTDataset-description.txt +10 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6f81b6b1e641d082a358ab714ec3ef80-GPTDataset-document_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6f81b6b1e641d082a358ab714ec3ef80-GPTDataset-sample_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6f81b6b1e641d082a358ab714ec3ef80-GPTDataset-shuffle_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6fea953a1d3b9960c6ae25d0b4b50cff-GPTDataset-description.txt +10 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6fea953a1d3b9960c6ae25d0b4b50cff-GPTDataset-document_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6fea953a1d3b9960c6ae25d0b4b50cff-GPTDataset-sample_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6fea953a1d3b9960c6ae25d0b4b50cff-GPTDataset-shuffle_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/7dad590c16d66ac471a2be17aa829a7d-GPTDataset-description.txt +10 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/7dad590c16d66ac471a2be17aa829a7d-GPTDataset-document_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/7dad590c16d66ac471a2be17aa829a7d-GPTDataset-sample_index.npy +3 -0
- llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/7dad590c16d66ac471a2be17aa829a7d-GPTDataset-shuffle_index.npy +3 -0
.gitattributes
CHANGED
|
@@ -287,3 +287,7 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib
|
|
| 287 |
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so filter=lfs diff=lfs merge=lfs -text
|
| 288 |
tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/_C/libtriton.so filter=lfs diff=lfs merge=lfs -text
|
| 289 |
llm_tutorial/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document.idx filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so filter=lfs diff=lfs merge=lfs -text
|
| 288 |
tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/_C/libtriton.so filter=lfs diff=lfs merge=lfs -text
|
| 289 |
llm_tutorial/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document.idx filter=lfs diff=lfs merge=lfs -text
|
| 290 |
+
llm_tutorial/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document.idx filter=lfs diff=lfs merge=lfs -text
|
| 291 |
+
llm_tutorial/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train/data_text_document.idx filter=lfs diff=lfs merge=lfs -text
|
| 292 |
+
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document.idx filter=lfs diff=lfs merge=lfs -text
|
| 293 |
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/lib/libtorch_cuda.so filter=lfs diff=lfs merge=lfs -text
|
llm_tutorial/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document.idx
ADDED
|
Binary file (22.7 kB). View file
|
|
|
llm_tutorial/llm_recipes/datasets/bin/baseline/llm_jp_corpus_v2_ja_wiki_validation_0/data_text_document/cache/GPTDataset_indices/80d7dc88bf2d24bda8780c0ade9232eb-GPTDataset-document_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3984439db692dbc4f2333e39e8ac90fec1b0ccd9290e92c23d230827b4856ed4
|
| 3 |
+
size 5747240
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f6b7e6187060ef902222fca36be65f2185555aa2ad5e1d207d37675d4610010
|
| 3 |
+
size 8594368
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document.idx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90dbef9fe4b0c6019720513510a42844aa61217422985834df4c8057b8774db8
|
| 3 |
+
size 200042
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/02eca412ecb6292af266f09e6facab79-GPTDataset-description.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "GPTDataset",
|
| 3 |
+
"path_prefix": "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 4 |
+
"num_samples": 1624080,
|
| 5 |
+
"index_split": "valid",
|
| 6 |
+
"random_seed": 1234,
|
| 7 |
+
"sequence_length": 512,
|
| 8 |
+
"split": null,
|
| 9 |
+
"split_matrix": null
|
| 10 |
+
}
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/02eca412ecb6292af266f09e6facab79-GPTDataset-document_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:012599946530e516a7037d31471e5df17effc857633ae66d5d717e7b5e685724
|
| 3 |
+
size 8280128
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/02eca412ecb6292af266f09e6facab79-GPTDataset-sample_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7652a8e1ba313956df814b7241d4a30fcf870ed1c21b2211d3a6745d59c6a6b6
|
| 3 |
+
size 13013776
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/02eca412ecb6292af266f09e6facab79-GPTDataset-shuffle_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0dd70dd1e3a94cd995adbfe62a08a6e186c91acbb870eb214187c674d0c54a85
|
| 3 |
+
size 6506948
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/08ff1eb63b2207c65cce43be54c00e54-GPTDataset-description.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "GPTDataset",
|
| 3 |
+
"path_prefix": "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 4 |
+
"num_samples": 35376,
|
| 5 |
+
"index_split": "valid",
|
| 6 |
+
"random_seed": 1234,
|
| 7 |
+
"sequence_length": 256,
|
| 8 |
+
"split": null,
|
| 9 |
+
"split_matrix": null
|
| 10 |
+
}
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/08ff1eb63b2207c65cce43be54c00e54-GPTDataset-document_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9ae24d87e59589370fafb89f6d1375718dee305d8245efb6606f621e51b66bf6
|
| 3 |
+
size 120128
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/08ff1eb63b2207c65cce43be54c00e54-GPTDataset-sample_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2344c62fcaeb89a2655bb78c74df2063328636d16326cfdec87bbf69943e1399
|
| 3 |
+
size 377336
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/08ff1eb63b2207c65cce43be54c00e54-GPTDataset-shuffle_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:54fe5d057a74f2128db38752566ba125e84b3930f7b7af54825279305f90ad04
|
| 3 |
+
size 188728
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/33ded67e093b2391ae3c697433b4284f-GPTDataset-description.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "GPTDataset",
|
| 3 |
+
"path_prefix": "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 4 |
+
"num_samples": 16080,
|
| 5 |
+
"index_split": "test",
|
| 6 |
+
"random_seed": 1234,
|
| 7 |
+
"sequence_length": 512,
|
| 8 |
+
"split": null,
|
| 9 |
+
"split_matrix": null
|
| 10 |
+
}
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/33ded67e093b2391ae3c697433b4284f-GPTDataset-document_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9ae24d87e59589370fafb89f6d1375718dee305d8245efb6606f621e51b66bf6
|
| 3 |
+
size 120128
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/33ded67e093b2391ae3c697433b4284f-GPTDataset-sample_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:deda1d81893431f8c565b3b3c1827c6173655253304b825a8fca32d65a35e161
|
| 3 |
+
size 188736
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/33ded67e093b2391ae3c697433b4284f-GPTDataset-shuffle_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:96712567ccc2b29d6f0f88bd0554a49fe6c8012b511ae5e7b85e158f87710235
|
| 3 |
+
size 94428
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/350fe6b65ba7fd88a7114cedb7cbd446-GPTDataset-description.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "GPTDataset",
|
| 3 |
+
"path_prefix": "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 4 |
+
"num_samples": 324816,
|
| 5 |
+
"index_split": "valid",
|
| 6 |
+
"random_seed": 1234,
|
| 7 |
+
"sequence_length": 512,
|
| 8 |
+
"split": null,
|
| 9 |
+
"split_matrix": null
|
| 10 |
+
}
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/350fe6b65ba7fd88a7114cedb7cbd446-GPTDataset-document_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:efbef7b827bba33d87aaa0ff3b4e2f730f7a8306df5b555e98ee0371fef932ae
|
| 3 |
+
size 1680128
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/350fe6b65ba7fd88a7114cedb7cbd446-GPTDataset-sample_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:459258f66f7eca3b35144e1e79eb59de4af4d24f37900b2c6ffb81700d08f00b
|
| 3 |
+
size 2640584
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/350fe6b65ba7fd88a7114cedb7cbd446-GPTDataset-shuffle_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:00d334cba2f4a54e8b7e52d256efaa416a2ef753c03c0950579f080158fd189d
|
| 3 |
+
size 1320352
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/46d01ceb7d2cf1ad9e20542382e986d1-GPTDataset-description.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "GPTDataset",
|
| 3 |
+
"path_prefix": "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 4 |
+
"num_samples": 35376,
|
| 5 |
+
"index_split": "valid",
|
| 6 |
+
"random_seed": 1234,
|
| 7 |
+
"sequence_length": 512,
|
| 8 |
+
"split": null,
|
| 9 |
+
"split_matrix": null
|
| 10 |
+
}
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/46d01ceb7d2cf1ad9e20542382e986d1-GPTDataset-document_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:05f2718b1e39405c4204e595ffc50244888e2db41a3bc28b8294d85af9170e0e
|
| 3 |
+
size 200128
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/46d01ceb7d2cf1ad9e20542382e986d1-GPTDataset-sample_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa533a35db6b87804daeaf99bcfcb19a055f2a53ea42d1bb1c3dca126066720b
|
| 3 |
+
size 314472
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/46d01ceb7d2cf1ad9e20542382e986d1-GPTDataset-shuffle_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a9288446077edc3a2cf85ee4f0cef902811cf283bb22e91eaa289b0427ca7db
|
| 3 |
+
size 157296
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/48d213b6ea09663ec4b8fe09aadea1d0-GPTDataset-description.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "GPTDataset",
|
| 3 |
+
"path_prefix": "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 4 |
+
"num_samples": 324816,
|
| 5 |
+
"index_split": "valid",
|
| 6 |
+
"random_seed": 1234,
|
| 7 |
+
"sequence_length": 1024,
|
| 8 |
+
"split": null,
|
| 9 |
+
"split_matrix": null
|
| 10 |
+
}
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/48d213b6ea09663ec4b8fe09aadea1d0-GPTDataset-document_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb77b72fa1e026bfbc648e6df847c9919a1100ae35fd50ca62293a0769053d51
|
| 3 |
+
size 3320128
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/48d213b6ea09663ec4b8fe09aadea1d0-GPTDataset-sample_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2d80b241c69ca9e0ff16b845b1b1f6265591ff1164be0c7f2a6741ba2511b1dd
|
| 3 |
+
size 2609144
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/48d213b6ea09663ec4b8fe09aadea1d0-GPTDataset-shuffle_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3bfcbccee470988632a4121c64f60cb34c67d710e629d1170516e037159f2de7
|
| 3 |
+
size 1304632
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/592590386f9491eebb2140b0b7a5a7aa-GPTDataset-description.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "GPTDataset",
|
| 3 |
+
"path_prefix": "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 4 |
+
"num_samples": 32160000,
|
| 5 |
+
"index_split": "train",
|
| 6 |
+
"random_seed": 1234,
|
| 7 |
+
"sequence_length": 512,
|
| 8 |
+
"split": null,
|
| 9 |
+
"split_matrix": null
|
| 10 |
+
}
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/5fde86a30edbf3f3b847378216442630-GPTDataset-description.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "GPTDataset",
|
| 3 |
+
"path_prefix": "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 4 |
+
"num_samples": 643200,
|
| 5 |
+
"index_split": "train",
|
| 6 |
+
"random_seed": 1234,
|
| 7 |
+
"sequence_length": 512,
|
| 8 |
+
"split": null,
|
| 9 |
+
"split_matrix": null
|
| 10 |
+
}
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/5fde86a30edbf3f3b847378216442630-GPTDataset-document_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0348455e23ca090e51bf303077cf89f1c59054eb39bfd847009b2955c6da7a20
|
| 3 |
+
size 3280128
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/5fde86a30edbf3f3b847378216442630-GPTDataset-sample_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3f31c6f7f94616a03e11b54dd04cbf5ed1b0da88dabe4d2f53130d8c96dd5e60
|
| 3 |
+
size 5155296
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/5fde86a30edbf3f3b847378216442630-GPTDataset-shuffle_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:634556f26bc65185bb848178f9d6444661384c0b2c9321b58dfcdb59460329fe
|
| 3 |
+
size 2577708
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/676905a6d4ca63142a18409e64a50828-GPTDataset-description.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "GPTDataset",
|
| 3 |
+
"path_prefix": "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 4 |
+
"num_samples": 3216,
|
| 5 |
+
"index_split": "test",
|
| 6 |
+
"random_seed": 1234,
|
| 7 |
+
"sequence_length": 1024,
|
| 8 |
+
"split": null,
|
| 9 |
+
"split_matrix": null
|
| 10 |
+
}
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/676905a6d4ca63142a18409e64a50828-GPTDataset-document_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ae60cf8fc9161a5ccff34c147058dbfd2fd1c3b44ddd68fcf42619465a8570e
|
| 3 |
+
size 40128
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/676905a6d4ca63142a18409e64a50828-GPTDataset-sample_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f68751056019b52d36647f552bf0f10341b0299a5737d7e06f31599494e403dc
|
| 3 |
+
size 31568
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/676905a6d4ca63142a18409e64a50828-GPTDataset-shuffle_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a9b2dbdf4a1a9e6fbc7fce63ad51b5684cef1fafb381bfa21d33ade05456e1c4
|
| 3 |
+
size 15844
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6f81b6b1e641d082a358ab714ec3ef80-GPTDataset-description.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "GPTDataset",
|
| 3 |
+
"path_prefix": "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 4 |
+
"num_samples": 3216,
|
| 5 |
+
"index_split": "test",
|
| 6 |
+
"random_seed": 1234,
|
| 7 |
+
"sequence_length": 256,
|
| 8 |
+
"split": null,
|
| 9 |
+
"split_matrix": null
|
| 10 |
+
}
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6f81b6b1e641d082a358ab714ec3ef80-GPTDataset-document_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ae60cf8fc9161a5ccff34c147058dbfd2fd1c3b44ddd68fcf42619465a8570e
|
| 3 |
+
size 40128
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6f81b6b1e641d082a358ab714ec3ef80-GPTDataset-sample_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f6d07bdb6e4934dfaae5110ca54e8bf1230288c6d0531524a1e0b9787d493522
|
| 3 |
+
size 125864
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6f81b6b1e641d082a358ab714ec3ef80-GPTDataset-shuffle_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9253864331017f64dbb28cf31ece07df6945b178c2fd17aa8193e18a80476797
|
| 3 |
+
size 62992
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6fea953a1d3b9960c6ae25d0b4b50cff-GPTDataset-description.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "GPTDataset",
|
| 3 |
+
"path_prefix": "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 4 |
+
"num_samples": 3216,
|
| 5 |
+
"index_split": "test",
|
| 6 |
+
"random_seed": 1234,
|
| 7 |
+
"sequence_length": 512,
|
| 8 |
+
"split": null,
|
| 9 |
+
"split_matrix": null
|
| 10 |
+
}
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6fea953a1d3b9960c6ae25d0b4b50cff-GPTDataset-document_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ae60cf8fc9161a5ccff34c147058dbfd2fd1c3b44ddd68fcf42619465a8570e
|
| 3 |
+
size 40128
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6fea953a1d3b9960c6ae25d0b4b50cff-GPTDataset-sample_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0bdbba1d7493cd4c8e9d879258b79824068b00f60667f12647328483747a15bb
|
| 3 |
+
size 63000
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/6fea953a1d3b9960c6ae25d0b4b50cff-GPTDataset-shuffle_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a63d260926e2777d393dc245cf26ae95dc0f69e8bd34d8c3adb61703d5d10234
|
| 3 |
+
size 31560
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/7dad590c16d66ac471a2be17aa829a7d-GPTDataset-description.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "GPTDataset",
|
| 3 |
+
"path_prefix": "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
|
| 4 |
+
"num_samples": 51456,
|
| 5 |
+
"index_split": "valid",
|
| 6 |
+
"random_seed": 1234,
|
| 7 |
+
"sequence_length": 512,
|
| 8 |
+
"split": null,
|
| 9 |
+
"split_matrix": null
|
| 10 |
+
}
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/7dad590c16d66ac471a2be17aa829a7d-GPTDataset-document_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:531827519c1334cec7467a1e027b2215d93cb2ff1dbbb171398c6a770b503ee9
|
| 3 |
+
size 280128
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/7dad590c16d66ac471a2be17aa829a7d-GPTDataset-sample_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0e5c1213e055d47372cce2197b396527f98f7e73419cc0621a75a1f2e4c9f892
|
| 3 |
+
size 440208
|
llm_tutorial/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document/cache/GPTDataset_indices/7dad590c16d66ac471a2be17aa829a7d-GPTDataset-shuffle_index.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f040b32caeb937b0b42461a0237b0c0cef30a6e9c01db4efca3f6769da429712
|
| 3 |
+
size 220164
|