diff --git a/config.json b/config.json index 03c8e0311ceb764d7ab4ae67d9f539dd5ae7680f..63521d4dd7dc030cdd51f9fe669e56245189d591 100644 --- a/config.json +++ b/config.json @@ -12,10 +12,6 @@ "global", "local", "global", - "local", - "global", - "local", - "global", "local" ], "attention_types": [ @@ -24,7 +20,7 @@ "global", "local" ], - 6 + 4 ] ], "bos_token_id": 50256, @@ -38,7 +34,7 @@ "max_position_embeddings": 1024, "model_type": "gpt_neo", "num_heads": 16, - "num_layers": 12, + "num_layers": 8, "resid_dropout": 0.0, "torch_dtype": "float32", "transformers_version": "4.48.3", diff --git a/model.safetensors b/model.safetensors index d695de6c02f2e823d6c53547e560cc9132c87db0..77d402c205bd36560c7bd1ea38a13110b7c83af4 100644 --- a/model.safetensors +++ b/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8747dcbdb0d43a8a6c99dcc0cb3de2025cfb21800401f3db0c9ed419101b2e93 -size 813458248 +oid sha256:40c26b1ae3b666a04072aae648c2eece6ce13e61086dd7349b3f20ab56113241 +size 611962176 diff --git a/tokenized_tamil_CulturaX_dataset/dataset_dict.json b/tokenized_tamil_CulturaX_dataset/dataset_dict.json new file mode 100644 index 0000000000000000000000000000000000000000..ce084f11866187cc2a80681a7feed2433220a9e0 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/dataset_dict.json @@ -0,0 +1 @@ +{"splits": ["train", "validation"]} \ No newline at end of file diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00000-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00000-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..469ed4b807140c49f600a075906e8d0b4ad22659 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00000-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c09f29d19bd7192254bc3038be0b21cabc45e5cffb1ec3b9ffff761eb9d4f584 +size 497381632 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00001-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00001-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..423b8251125d84a2f0fab0b13f91e949c42cb916 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00001-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a52500425760f186a1966d28a1f9cdd59027db8f500ed9cd97fc6ef7a747c802 +size 493607912 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00002-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00002-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..aa7dfcf00321ae904a4f04ff7b8ae40be262bfdf --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00002-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed1d777f4eab98a10eea8d72f611f7f7c6c9a6d56fdd0572665d07dcc0378fd6 +size 494228680 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00003-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00003-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..d4aaa06e21bff7b08905a2d58794c50d38f896b2 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00003-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54e3bd2c78cc9a7dd3133fce457500b6f58b9542a845c06e9b0cd2aba51c647f +size 502511280 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00004-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00004-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..4c3be4ba7e4856b35de850e55738fbac917db62b --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00004-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3022cda03909699540c1dfb74771a6a0683697eee4b5be287ff0b4503a1c9713 +size 499485808 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00005-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00005-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..8bddbb68b1db1487ffc8c2762fe07ac55ad4b910 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00005-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f85be0815461bdcd0f2120ea915e3315ae481dcc82817ec5ed26a0766abb3505 +size 499387248 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00006-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00006-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c3f237d62d7b8c4b2e7a7e22b9c27ac92a65e13e --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00006-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72d80d9b39d48a6773c82dc210185fb11df584d694b1f606521481a020cec1c0 +size 495876704 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00007-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00007-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..dab8c6d7cc49403d2d2f2b366a5300f3b3487b52 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00007-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2efd9ac9aad8212d8f6deb22892f6d1c9543a6cff7d972878871f1505a818020 +size 501272680 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00008-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00008-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..73e11acddefad3712e69b15640f103d59ccccb69 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00008-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:797b07fe8e1640f6e571f83e4fd535b0669dda219c7be53716c012c132570981 +size 493478768 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00009-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00009-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..7913bb6b5890aee688fb3023d3529266ad3317b7 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00009-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8924d14f4b66d34d42f3de68f34c243ceee3d4ecb543da9cc808d5f77c832d9d +size 501205456 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00010-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00010-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..ce7644e696faab033eb30474303ea70df05f4506 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00010-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4892390827a065744dcdc3a112b3e150f10078401074274952efcb459345016 +size 496196224 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00011-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00011-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..b4943af4112e2a9313fd4a3d507cd25683cf3435 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00011-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28d6bca7a1c716f13010f9937424c10de8447fff48bbd22a536b353a64821d7c +size 498188272 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00012-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00012-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..0c7d8758520000494557fc2f7a6d88c0176e84a6 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00012-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e80da07ac70a4397c5277bd31a0d6e47c4191b5e0bfbea16149058b1bc2c43b +size 497566808 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00013-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00013-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..db1df2068a5b15b6e6e1c1c028610622c31af002 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00013-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12facbfb3fb51d3d1ed2353436b9f34a8fbf61c917ae7517f5b595978c54a4ed +size 498150920 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00014-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00014-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..587975dc80939473da33e91dbc060accfcf40bfd --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00014-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4fc6c6d570c251c270182fb46354891b220e4e7fc16583fa4d4e7ceb6241444 +size 496817208 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00015-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00015-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..301aa7e214f618c8de843f7719cede0c353e72e8 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00015-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6d0e46090e0d74100dc5beeee5376bcf28c3c54b258683ae8b52e9812890f4f +size 495138056 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00016-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00016-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..0098d3d8ad2f5ef4a151a7ea683e3e635038f2b3 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00016-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7a3220adfdb414e620d371b69fc4ed9f04902da468ae5b0e8e7a89be96a4576 +size 502839768 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00017-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00017-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..8f38a4b02f3055938a8bb18a49a1c6797629e888 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00017-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7ddf9997c925fa809e1358c9b6641bdfb6261bbf4ce52e129bba7972c433a9b +size 500549936 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00018-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00018-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..ad127db72dc2a994f5ecd3c71cab8f849a4f2f4a --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00018-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d14ea433dbe2879fd658bbea588cdadf643c2ef6331b8026f8817dd922294afc +size 497750272 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00019-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00019-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..5596ece791e49129bd3b57a6837485e13dc96a26 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00019-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:226dd4d794de05f22649e0c4fd40e6dd8b1eff3247f5e9ba301139ed9f57ba01 +size 499247264 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00020-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00020-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..96ee2d981f8d464779fcfcad84390bab9006f578 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00020-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:581794a85ab4eab8e62ee556a71cf494105045ae2806ddb23833ae2f4fd67b92 +size 493278376 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00021-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00021-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..eda96c9c6b8ab08636e593cf9bf9fa9d3e29c20a --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00021-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0883fcb3fbbf1182044e4785f1ecf23af209222b646ce22cbbae1d45966bb7ef +size 495244888 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00022-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00022-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..43c935813ebdda692b0b80cd12a0182d0a5c7252 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00022-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28707191b7b6ac68b30cc402f8fce23a854eaed3695d5207bb4384c2514fe767 +size 503937304 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00023-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00023-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..e6bca1879b9612d1879f7c55d4d47c2f671a182b --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00023-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c75fc37600344427261f1189d0bd6157b0bec8ccfbf63810ba7e4b7d6f23f42 +size 492867936 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00024-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00024-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..9adc401a8812335b65d3e1308bdc0d0a39b89772 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00024-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a5497e4a1340921761fb77ab3b33a9be942b89cf4ade85aa1a70b9f9b0fe8ce +size 494700552 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00025-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00025-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..1f1eac77d5093c7d6ed9ca29e6dc5f256ffeb7f2 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00025-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71476f07fe5fd58859dfd91866104f26815853f04e97df52900dbdec322d7bba +size 496562080 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00026-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00026-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..7adae300933cc9284b19a44107143f3294dc9470 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00026-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4f151e7829eed4e30d184efb69b54ce76953d16eb57dda444a9ad04caf1f7b4 +size 495927992 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00027-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00027-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..ccf751c9585e239eee06053de6e6d558c92fcb0c --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00027-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b543ed8aad4a30611f2272c427b9b1d6e042241721651ba349f182f2d462ddf +size 494564728 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00028-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00028-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..3304dfad1bc00943610b09b363ddda043d679215 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00028-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f72822f16a3587fbe77bf92de8dabaf5f14677344ede86120e39e8735f111903 +size 496818240 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00029-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00029-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..511f281390437197ea4aceb523870a92e72071a6 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00029-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e421ab1cad2bc79cb971214a9ff25eb9332fc4ba7f8283621ae7a91207317339 +size 502068472 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00030-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00030-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..75d6da34004779de26ba28e2215cb011d8494077 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00030-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:458f48c79301d4f3f6c44647d55e3e9624134d361ec867ec0239167f51c8f948 +size 496780120 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00031-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00031-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2bec0e7cb76226046aade69e0dda4ee1d81b3791 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00031-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd7307df47aed789d120c4655042aa7706356c996626b101915481b71dfb2d0a +size 500810568 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00032-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00032-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2730d04fd592977ac610baa4de20d13cacf4eb13 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00032-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:600279e4031af57d1e6f746ec800ce0886541224218c6aa4b178906996f5ae50 +size 501512040 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00033-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00033-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..7b9f33ff01f38f7063e0605ee7a47f3f088efc38 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00033-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe45fd26b605977541ac08a7ebdb2f8adc03eccf4a7e1eb702b65b384f322de +size 494778744 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00034-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00034-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..14ebafb9289de10e7e658a870ec6d858564b81ea --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00034-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c500c2b69b3aa49a069d34f20a532cfea6bb781424044345999d7d4449fd77db +size 496628840 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00035-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00035-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..cb0199658f5f40f6638c0c4b616f7d4a2673100e --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00035-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab86b09cb74d87fa45109947e8c9e9e28efba33abbac1b3d1d7fa4e59e4d8669 +size 498848680 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00036-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00036-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..920c5f40289cf3d45890eb4898e25cbecb9a3fd5 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00036-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fec74e5e342d5466ccbd1c1de98c555c580950b96659a59cfdda8003f95d006 +size 494516592 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00037-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00037-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..77935a26adbc51fc4f70d59aec49d143fc9ed3cb --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00037-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50ddb23a9ed78d24b0559e3f96a3835406885f74f571ec4876bac4ebf47f1d21 +size 494706152 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00038-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00038-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..a8da4e97bc7def0b43e383fedab3b2d413bc845c --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00038-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4c264d66c22c260132e95289e1bd82eea7cac80a5a84638cab3aa4a2f75349d +size 499886680 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00039-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00039-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..3758caa1bc4223ea1370d54d2128e4a3f5bb5bb2 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00039-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9d413252e8eb76f52c1ca67f26175ece1bdeb2d7832c11547b5f9932c0f18c2 +size 496438128 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00040-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00040-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..be72d75a8dd76c6c2bc15df806a82682712cb0ef --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00040-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3421104cc5e1b88f23831863714d7b066974dd3a2f36083a8a56467b28cbc245 +size 497710752 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00041-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00041-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2ee610e144428933c9a5e0ed2949a28a3f9489ce --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00041-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b114b5aaec7dbb1a2f00c528cd3805cdc546e616d6c0f556fe04a738a45a19e +size 498914256 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00042-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00042-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..379f12ad28f78ab136eb1c06c9a2924850a37f78 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00042-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:047f4d85cf130a2d4babea02922822f3cf0adace60c3d7ac6105bc97929f22e9 +size 494177904 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00043-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00043-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..60f87b819a4271a2e146dd43b6bbf727ca5dcc34 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00043-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57c2c608c8478d1d4a185925e8e65605365b7f185daea43dafd0895470aaf3fc +size 493330896 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00044-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00044-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..8d2b60fb960e139f1beab44bfce631148cebd532 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00044-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06d057c9ed60f422fde3c758f1e3d0c03ede94b0c8c5ee7840494f9e57c43881 +size 495189992 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00045-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00045-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..5f8d64e8dd5a563f52cf976f01fbc2cd82ab10d9 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00045-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdf6da80c19b227c60ebe56904ef585908705d2d26dbfd1327aca66d2fb095c5 +size 497097816 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00046-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00046-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..26b5b3e4ea1efbed0d2bb13c8f63dae6b46eadd2 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00046-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:076a029e84d4f8a1f4fb4670366c51edddf41dd1819553c538c129319c98e118 +size 494662432 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00047-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00047-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..be93eaafcc869febd9b906da08d306c9a6474c7b --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00047-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c748133ebd763d26711bae40b137ea5f58f70e51c53a3f2f8849401fb539df6f +size 503115352 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00048-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00048-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..60456dc21e5d875c37f92585d53eca9fd8bf5ddd --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00048-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c01044f247586707217551e0fba9721252659b8bd1bde3930ac6ab68da3ebff +size 495651704 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00049-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00049-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..b365a70090f7163d29261ce961c0e1ed785ecb5c --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00049-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6546101b1ebae3aeca71de5426425815531f85e2eaafd2d415dfffb419f638a1 +size 498972808 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00050-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00050-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..3f68b5ec006a2fd2de8edaf998b0637ff2dd35b0 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00050-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f92d59b0e2156983e8ee61eeae627bd6276b5fb2806eb6fedcf40512c36a1be7 +size 495121576 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00051-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00051-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..0206bc7cf9681803059d45fa93bbf0752791217f --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00051-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5aa13fc839cbb527dc3b77e53c008dab8f906fa05bee680a121fb20f1dd52bf +size 502536608 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00052-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00052-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..af2d2acddf87c0ae2482fa36008ecb1d37e6dbd5 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00052-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575963da65c9852dcf00b5dca24278be0024bbce009e47742dff4227b457ce34 +size 499919128 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00053-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00053-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..feabd407be3bb5e29366ac68890107ebf416d9aa --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00053-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47bba7011306582fd55dabe4c8e9ff42665b85c661769092b38ae120aa0e335e +size 504237848 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00054-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00054-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c32eafc9aeb49ea0eee081e21b42888fbd4e910a --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00054-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cc12b29778a99f912072804246e05a7e4c7b2da9487269d338d468f4b322640 +size 501585304 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00055-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00055-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..77123a318415f8e90565cfcaec741780d715672b --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00055-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a92d79d40a71c8c95b2bb7b66141cd874b91b6c80f11a516d50095be7ee6c2ec +size 496955344 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00056-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00056-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..a4f8c71f16042360770369ad739f9ad7e21d7f9f --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00056-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e911c876a0f6078e6fab0c9adc631768f9dbfe7b7ef5ccaafc1e57cae29320d +size 497171544 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00057-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00057-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..47694210f61827836ecd26e13c3c54394f5136af --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00057-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d3cf63a2880a9d850f2049b167a4849eb6cd2d0ef4ae63a0e522f40af99681d +size 498928448 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00058-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00058-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..84fabae0beb4408f3fb95d71757f491070733c71 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00058-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57cbff2da791d47d2b502139aa80b5de22d6cdc8fcdb8105d14eb2ebfaa2c922 +size 490408952 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00059-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00059-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..19e3693bbe39b816f844a8ffae85db5052b663e8 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00059-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24f5409cf10888fff8f28b240468aab000c3d0c95b362371c06219d58d6b1549 +size 500502632 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00060-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00060-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..530bd9491682743d44e2c5919b6f2a5be5fb7b7b --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00060-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14404b199e1f35b41b8b55d0de2bb943370aa4e967d7fbbb1fed39f6296e3213 +size 501525312 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00061-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00061-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..fcd95cf289f65720a3e7c2987bacab69ae4de028 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00061-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:776dae37b118aa06f2317346deb65211f98ffae80ce2799dc800d15c3b88ec6d +size 496310392 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00062-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00062-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..5ad124e63000b0a11a25f4d11fb421ca243c8bf3 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00062-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8f7ee423ee6585d9df437d547b57de2fc837d3a436c66f8552bdf6729778cb8 +size 499685584 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00063-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00063-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..b15a188070217897fdd09562190c7d9697f13c0a --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00063-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11d1ce326105d0f99bd137fc86ba0c38ab47c0eab8aee2877d83747aa6ffe0d3 +size 498057528 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00064-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00064-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..ff37315e59a7e52bbe5dcca9be25c1c076c5809a --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00064-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42aad1b9836f4e568c6bdb58a7e95c323be63fa598fd5fc9bd2b3ecfdd55ebeb +size 501968304 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00065-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00065-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..ee19763a9450bde2e99b28228bdabec0d3cc2eb8 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00065-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5b30db8495b1c55c4401d41b0d544d1fbbeb1e7e5020d992d6b74c58e3b3132 +size 497203336 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00066-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00066-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..d8179becfdca14583729fb812cb2e7d67ea3a1fb --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00066-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8d978ce44571d31f7d5503902f4583f0c1e3bca4d64d47f567bb3a2f7f1d487 +size 498844696 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00067-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00067-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c96abf72203a9b8c66649948f07bb7302c5c01dd --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00067-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e66de9ddf290dc7b5c62456149e0eeb071de2e6956f7bb221017dafa6ad0cdba +size 493499872 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00068-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00068-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..585d9a23998613b69bb3b32ba66323b10a7b624a --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00068-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:795af985c05ab2cfd3301cb908cb6e9c5dd78974dddd086a275e5e52b7b2eea3 +size 498633680 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00069-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00069-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..bdf5e258d65ede8936621a135b6d125f7479f922 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00069-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a7727af5fe83d9d10795f5478ced12dceb5d8a7d16db8b17f4c9870f3c15412 +size 499003944 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00070-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00070-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..d4c7a76217e42c1878cceff9d7db1ea3f7f33591 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00070-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5554366f05e7986a886e4be2e7eacc90255df7fa12cecfaafa38de215729842 +size 492468088 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00071-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00071-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..7e6d9bc27dfa0aa5fd78d45de42d1baddd4943d7 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00071-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:978902abb353d00700aed318b9c1d87573c075c42936a876a65dd2ac46119aa1 +size 504187272 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00072-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00072-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..0a21d2e5689a287e723d0d4175a939123602fc96 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00072-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2b551d1965c8006f1180d492bf0e74c6086b5c6ca1b7d29ac2f1f9d3c64f21c +size 495095520 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00073-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00073-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..696055c638826d3750409007bcde1197adfd067b --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00073-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5d22ab0959d24a18943365989c5920858b54c5b65823bbab607300880c06526 +size 500018256 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00074-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00074-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..0d68dd39a305c4dbd18315905cba6b18983f2725 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00074-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd0d0c3383b76f1a2bf090aba2b46b63b28344949cfc54835fe34d944aec9a73 +size 500425008 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00075-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00075-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..7d899c6172add6f5923401a530eacfd96987e58c --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00075-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b8673e9be51b18e0e401688f69b549824cb27044b2af57a711611146021a18e +size 497086440 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00076-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00076-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..b8ba66db11f0b278a9b09b9fd5bedf01482f5230 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00076-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ffa45571e55f2bba4b65290fffc0131258197a03b7f994b38be78a8245eacaf +size 497916712 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00077-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00077-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..e16b25df55a2f2d4d075d4e0ccdfad3692e819d1 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00077-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b57f916ae7d63e970f1237b65a62c7d6e465538e660f7d77b866768356ee0531 +size 501655504 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00078-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00078-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c42b27d6da892872eaa9b298bd8315531a5a58dd --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00078-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d263a4fa8eca4aaed08cf8e48c183f2c70799465548dffbcd4033ffc722d27b +size 493785992 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00079-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00079-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2a06cd2c1866b4e771221c7e5b587e5d76862566 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00079-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a8b0a03c9c1026cb4f977fcd26d8ca4c3e4b79439866e3b083ba117ae6371f9 +size 495808712 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00080-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00080-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..30c47dac6a2d5acbe54a5848c5bb417f0740bf00 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00080-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c34d1575275bbed31438704da7854654a6fe54386f0dae20ef8ee6b3433f93e9 +size 493682512 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00081-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00081-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..19d1c3b784ab1ceb32a6433b069dc64fd9767def --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00081-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f56f24c36fe00d211b9fe84e2d88c4aae019fcf9c2fe2e6260d339a9d8d595e8 +size 493545448 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00082-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00082-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..f1dc36b73341d94408715be31bafc8f977553cec --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00082-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8736a1ab9ac2b74a75a6821f009abc9f0a9fe72f4d774fb9294b9d54f06c8ee +size 500687680 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00083-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00083-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c6a762cce5cd54cd20fbb38022e70fbebbde14a6 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00083-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2714133df009f94867738168e4715da89d906e6ccb06fa2c912471f4706809e2 +size 493189584 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00084-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00084-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..316eeeeebedd56343c4882c8da7a0b77cced902c --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00084-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:613090ab04f1d4fd95173db27da86649b172680e0fa12155cdf84810fb23749b +size 504387944 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00085-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00085-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..9e29929f733e9a92430916abcfb972ba3e767e06 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00085-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e0415d21cf7977b50eff5d1ca7cfffa3ba5d9062aa58098f357a6e72f15ca7a +size 501131368 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00086-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00086-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..91554c3530a649ecd109f4e43b7fd6696e277915 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00086-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1478b01877089ce3ce7ca4a1342a64fed8c52fee0fdddb2081e435bbb863795b +size 499096680 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00087-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00087-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..fa534f4384e337c1ec32b37a5b706513238b0691 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00087-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a66c513c580ae2799b44a0905e58349ba156786a8ae2046bb53891aa4b57696 +size 503480632 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00088-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00088-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..e9d79142b8dd0a637e004c27678c29873bbab5b4 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00088-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a35d57a753109c224eeacb31e6747d7273f7a2507a0194a144184789a0ddfe3 +size 498982384 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00089-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00089-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..71a31acccd63e5acace3941d5dae5ecf36d31a6c --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00089-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63ca02124e2beb70f4157eafc2bbb116b830476504369b087b3276855aac0686 +size 494625544 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00090-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00090-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..1ca7f2c0d0bd464c6807e6cb4cf99bf537e98539 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00090-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ea92015853caef17aaa0eaf91551e97dfa6dc1a655332dda363f1a2bc205ccc +size 488349072 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00091-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00091-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..9b17283ce252ba88e7a202e2d6bb384cfe814b51 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00091-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1f4b9582184e0b18bbd3fe82a3d9e495a52c78029cd98dc246bcddfdd97e218 +size 495150832 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00092-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00092-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..023dd43a593fdc0d7141c7c1bbfdb667f2311181 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00092-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e9b51650d7e140c9198bed2227e681ef278f3ad81cbc4d31b3d84c2228f74f3 +size 496642920 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00093-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00093-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..cc61cbc0d699e6bd360739c5068e58e5e00ff8ab --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00093-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37901d34899efb5d3f72ff7f849e073a43d1fc5510871dc2fb8600d42294ba84 +size 499934192 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00094-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00094-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..7e73008859437f8c76fba9e9e84017c5dbeb5894 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00094-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c39a28cf7d2445a6dcc5bdebd60df3de14710c4db6866df3672465778e8078a3 +size 499619760 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00095-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00095-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..a9f610004d7a9e198f99d734878390e9dc2807cd --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00095-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69e63a7bd9a3e82c996fae3f296f7e79f4946a1cc1b40cf4de4cb0a7e59f157b +size 492313808 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00096-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00096-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c803b4efdf81044e71e1bfadb504575810e0160b --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00096-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bc380c38ee66634a311da6e8dae53c049f7aa504915910b4e684485262bcaa2 +size 496606112 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00097-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00097-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..1f58970fb1d3fa0504e4cf5154eb7de9c121d27c --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00097-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:686398f5e32b828ac43e8ab365412318a43af7409316f5c3051d4e89c8a4e814 +size 500504576 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00098-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00098-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..23c54fd2060c7a77b04e984788a294e20e8438c7 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00098-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b6d986c602eed04d67df4643b6f7455b3e90456ea744d08bb0dc48b6491ae4c +size 499965584 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00099-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00099-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c0cc27cbb69b851081c1ffc37cd567f4f56492d2 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00099-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecfe83520766b2c26ba7f1b0a2d6fd4fa76a70e9adf95b6a5b31bbd6c5083f40 +size 497158200 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00100-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00100-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..cb9e85661d81f550d318366ac81cd680268aeb50 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00100-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a574ede17ca184ef0d6efecde428ce7edc74952bda696bdbfa7eea07225c001 +size 496678760 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00101-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00101-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..b5e73b3a1bb514a8571da94899e877ea1a99f549 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00101-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4463ec0ce64a8c2821254103f8bd205a8678e450941b33782ea1aad594498ff +size 495197232 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00102-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00102-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..d9ddc2f43f1fb6e428afc3deaed0b74e8685763c --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00102-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d112c0173cd7f5f3d5e684cfc7c267097d5b93706e993c1591226938c9202d91 +size 498079360 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00103-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00103-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c08f99db09d417237387059178a4439216d1e5f9 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00103-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:787242e57b2a692e01655a938805bc200a53676b73b193370fe58dc1e5a2cd12 +size 497222192 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00104-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00104-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..30a738ef113665ac20dccbf6eab6458525c7e2c0 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00104-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47fa568dcb92ad58e18dabe49f496e140d1dea3f7e7db661b59ef10bcd2c5a00 +size 494067752 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00105-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00105-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c8f540999351def8fc8e0d2864900c36857b5df4 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00105-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01ccf42ee869e80c091b8925bea1032162119bc7e61a9ef17e0f2dea5952ba05 +size 500549176 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00106-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00106-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..b85272512dd2bd07fc179a1d4ecb755e9b233e50 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00106-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:927e050dabc8519f5b3b831fdb4790f4bc7e890c3a900067ae7a3e2b71b1b0b0 +size 497252312 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00107-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00107-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2617fb7bda48b3f243187924fcc9afc34c4bdc02 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00107-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1adb111bb789296941589c343c8f34437cae559856cab3df022e1a17dd3930e +size 498521928 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00108-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00108-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..286f7c04c141768a097a093b2d18c19ef1d04637 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00108-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:932d4c63deb9cbee82eff0a1c89d5831ff4e7bb859a29a6997b7a19e30a9b434 +size 496952904 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00109-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00109-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..85ce385f40c0db6fa5e2dc787ff83b49cfee1fd9 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00109-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9868cf6d2c9d1b6548a1b2f0fac45d3a924525401a1d62c82d71ebbb85b71e44 +size 499597040 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00110-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00110-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..0693509c66704e2dae26f4bfcea921579a5c23b3 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00110-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bff700146ab65e8e2dc75615b45f82f29b33cffa67b86210c1d6e1b116423952 +size 498894152 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00111-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00111-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..f43e68feb50c40229951fd53699cbfbc4f95ebfe --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00111-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8236b23b71577514d75591c4fa671251de21e4937c154be69fddea36b8c40752 +size 495494136 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00112-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00112-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..239e92f5612268c91d5976877431565765d313e4 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00112-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43fb7ce240d7e5354e422e955618376c7fdeb837c9f161149896f4ede3ff8deb +size 503071640 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00113-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00113-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..d016ed1b43c159372ca8eb06533a020bfb16d6e2 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00113-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c40b0a8d68a70c359e0eea47c904cc3e7b87aa5bfd537249f3feb6a0e13b4ea +size 497711536 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00114-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00114-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..195a410f20741abcf21bd39b500d8f1b50732117 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00114-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ed2388c7ab42b8aab704353b089d84011a3359cf1d774b25db79cc63e36fa49 +size 494901144 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00115-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00115-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..eda5458ebcf7df9fab35184910196122d37bec22 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00115-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f4132993906cb6f451cb509ceef2051ab065ec2a17626fb09bb691570f3d41f +size 496132792 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00116-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00116-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..115a23ab2405080fb492912f8ca67522bb3aaeba --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00116-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a3f9fbfc7cc01afb53466561a244a8d946fde38d8530083ba6f28e2c3f957e4 +size 498099944 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00117-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00117-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..9368e0d3fddf2f79ba3bcae6071225e4626b7d6a --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00117-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d71e618fe4b184413a11b20de8de16407fc2b12668f73d6a94334ba9c4c5c1e7 +size 498699064 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00118-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00118-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..28827d7b258052a28c8eb14ba70c1178bde79b31 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00118-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb374897392b3db16ffff03aa447fef94867b522b1bfaef38ef2c00b0e20c782 +size 491884312 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00119-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00119-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..bd77018ae92e88d69750f8cd91d7bf4cabc5fb84 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00119-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c68c71f1ac4e49720008af1c493625286cae2993b217b1ec9374e565e729cc60 +size 495809872 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00120-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00120-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..dc0a13a575ddf010e563f02d434a7ce331b177f3 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00120-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa2eee7ab554ee08ecae11ade033c370971c56940643439dbe648448e5410e77 +size 498790232 diff --git a/tokenized_tamil_CulturaX_dataset/train/data-00121-of-00122.arrow b/tokenized_tamil_CulturaX_dataset/train/data-00121-of-00122.arrow new file mode 100644 index 0000000000000000000000000000000000000000..6a67ac4025fa1fc833e67c0a9198ad0b043bab03 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/data-00121-of-00122.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feb6c08c5d9306d1f15cbe5e24fc32d9b074942e36f613b0be90e1258d403603 +size 504146792 diff --git a/tokenized_tamil_CulturaX_dataset/train/dataset_info.json b/tokenized_tamil_CulturaX_dataset/train/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..a914b93a3d51757cee72714061e03c779890195b --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/dataset_info.json @@ -0,0 +1,188 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "ta", + "dataset_name": "cultura_x", + "dataset_size": 43202234827, + "description": "", + "download_checksums": { + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00000.parquet": { + "num_bytes": 1541822289, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00001.parquet": { + "num_bytes": 1544216758, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00002.parquet": { + "num_bytes": 1547106018, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00003.parquet": { + "num_bytes": 1542979052, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00004.parquet": { + "num_bytes": 1536500363, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00005.parquet": { + "num_bytes": 1536547401, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00006.parquet": { + "num_bytes": 1533679358, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00007.parquet": { + "num_bytes": 1823097879, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00008.parquet": { + "num_bytes": 1653835169, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00009.parquet": { + "num_bytes": 1134462079, + "checksum": null + } + }, + "download_size": 15394246366, + "features": { + "text": { + "dtype": "string", + "_type": "Value" + }, + "timestamp": { + "dtype": "string", + "_type": "Value" + }, + "url": { + "dtype": "string", + "_type": "Value" + }, + "source": { + "dtype": "string", + "_type": "Value" + }, + "input_ids": { + "feature": { + "dtype": "int32", + "_type": "Value" + }, + "_type": "Sequence" + }, + "attention_mask": { + "feature": { + "dtype": "int8", + "_type": "Value" + }, + "_type": "Sequence" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 58596481193, + "splits": { + "train": { + "name": "train", + "num_bytes": 43202234827, + "num_examples": 4728460, + "shard_lengths": [ + 55000, + 55000, + 55000, + 56000, + 55000, + 55000, + 55000, + 55000, + 54846, + 55000, + 55000, + 55000, + 55000, + 55000, + 54000, + 54000, + 55000, + 54846, + 55000, + 54000, + 55000, + 55000, + 55000, + 54000, + 55000, + 54846, + 55000, + 55000, + 55000, + 54000, + 55000, + 55000, + 55000, + 55000, + 54846, + 55000, + 54000, + 55000, + 55000, + 55000, + 56000, + 55000, + 55000, + 54846, + 55000, + 56000, + 55000, + 55000, + 55000, + 56000, + 55000, + 54846, + 55000, + 55000, + 55000, + 55000, + 55000, + 56000, + 55000, + 56000, + 54846, + 55000, + 55000, + 55000, + 55000, + 56000, + 36000, + 35000, + 36000, + 37000, + 36846, + 36000, + 37000, + 37000, + 43000, + 43000, + 59000, + 87000, + 78000, + 82846, + 68000, + 70000, + 73000, + 77000, + 78000, + 48846 + ], + "dataset_name": "cultura_x" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/tokenized_tamil_CulturaX_dataset/train/state.json b/tokenized_tamil_CulturaX_dataset/train/state.json new file mode 100644 index 0000000000000000000000000000000000000000..78e87e997599f821429d3b88491db67dec64b9b0 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/train/state.json @@ -0,0 +1,376 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00122.arrow" + }, + { + "filename": "data-00001-of-00122.arrow" + }, + { + "filename": "data-00002-of-00122.arrow" + }, + { + "filename": "data-00003-of-00122.arrow" + }, + { + "filename": "data-00004-of-00122.arrow" + }, + { + "filename": "data-00005-of-00122.arrow" + }, + { + "filename": "data-00006-of-00122.arrow" + }, + { + "filename": "data-00007-of-00122.arrow" + }, + { + "filename": "data-00008-of-00122.arrow" + }, + { + "filename": "data-00009-of-00122.arrow" + }, + { + "filename": "data-00010-of-00122.arrow" + }, + { + "filename": "data-00011-of-00122.arrow" + }, + { + "filename": "data-00012-of-00122.arrow" + }, + { + "filename": "data-00013-of-00122.arrow" + }, + { + "filename": "data-00014-of-00122.arrow" + }, + { + "filename": "data-00015-of-00122.arrow" + }, + { + "filename": "data-00016-of-00122.arrow" + }, + { + "filename": "data-00017-of-00122.arrow" + }, + { + "filename": "data-00018-of-00122.arrow" + }, + { + "filename": "data-00019-of-00122.arrow" + }, + { + "filename": "data-00020-of-00122.arrow" + }, + { + "filename": "data-00021-of-00122.arrow" + }, + { + "filename": "data-00022-of-00122.arrow" + }, + { + "filename": "data-00023-of-00122.arrow" + }, + { + "filename": "data-00024-of-00122.arrow" + }, + { + "filename": "data-00025-of-00122.arrow" + }, + { + "filename": "data-00026-of-00122.arrow" + }, + { + "filename": "data-00027-of-00122.arrow" + }, + { + "filename": "data-00028-of-00122.arrow" + }, + { + "filename": "data-00029-of-00122.arrow" + }, + { + "filename": "data-00030-of-00122.arrow" + }, + { + "filename": "data-00031-of-00122.arrow" + }, + { + "filename": "data-00032-of-00122.arrow" + }, + { + "filename": "data-00033-of-00122.arrow" + }, + { + "filename": "data-00034-of-00122.arrow" + }, + { + "filename": "data-00035-of-00122.arrow" + }, + { + "filename": "data-00036-of-00122.arrow" + }, + { + "filename": "data-00037-of-00122.arrow" + }, + { + "filename": "data-00038-of-00122.arrow" + }, + { + "filename": "data-00039-of-00122.arrow" + }, + { + "filename": "data-00040-of-00122.arrow" + }, + { + "filename": "data-00041-of-00122.arrow" + }, + { + "filename": "data-00042-of-00122.arrow" + }, + { + "filename": "data-00043-of-00122.arrow" + }, + { + "filename": "data-00044-of-00122.arrow" + }, + { + "filename": "data-00045-of-00122.arrow" + }, + { + "filename": "data-00046-of-00122.arrow" + }, + { + "filename": "data-00047-of-00122.arrow" + }, + { + "filename": "data-00048-of-00122.arrow" + }, + { + "filename": "data-00049-of-00122.arrow" + }, + { + "filename": "data-00050-of-00122.arrow" + }, + { + "filename": "data-00051-of-00122.arrow" + }, + { + "filename": "data-00052-of-00122.arrow" + }, + { + "filename": "data-00053-of-00122.arrow" + }, + { + "filename": "data-00054-of-00122.arrow" + }, + { + "filename": "data-00055-of-00122.arrow" + }, + { + "filename": "data-00056-of-00122.arrow" + }, + { + "filename": "data-00057-of-00122.arrow" + }, + { + "filename": "data-00058-of-00122.arrow" + }, + { + "filename": "data-00059-of-00122.arrow" + }, + { + "filename": "data-00060-of-00122.arrow" + }, + { + "filename": "data-00061-of-00122.arrow" + }, + { + "filename": "data-00062-of-00122.arrow" + }, + { + "filename": "data-00063-of-00122.arrow" + }, + { + "filename": "data-00064-of-00122.arrow" + }, + { + "filename": "data-00065-of-00122.arrow" + }, + { + "filename": "data-00066-of-00122.arrow" + }, + { + "filename": "data-00067-of-00122.arrow" + }, + { + "filename": "data-00068-of-00122.arrow" + }, + { + "filename": "data-00069-of-00122.arrow" + }, + { + "filename": "data-00070-of-00122.arrow" + }, + { + "filename": "data-00071-of-00122.arrow" + }, + { + "filename": "data-00072-of-00122.arrow" + }, + { + "filename": "data-00073-of-00122.arrow" + }, + { + "filename": "data-00074-of-00122.arrow" + }, + { + "filename": "data-00075-of-00122.arrow" + }, + { + "filename": "data-00076-of-00122.arrow" + }, + { + "filename": "data-00077-of-00122.arrow" + }, + { + "filename": "data-00078-of-00122.arrow" + }, + { + "filename": "data-00079-of-00122.arrow" + }, + { + "filename": "data-00080-of-00122.arrow" + }, + { + "filename": "data-00081-of-00122.arrow" + }, + { + "filename": "data-00082-of-00122.arrow" + }, + { + "filename": "data-00083-of-00122.arrow" + }, + { + "filename": "data-00084-of-00122.arrow" + }, + { + "filename": "data-00085-of-00122.arrow" + }, + { + "filename": "data-00086-of-00122.arrow" + }, + { + "filename": "data-00087-of-00122.arrow" + }, + { + "filename": "data-00088-of-00122.arrow" + }, + { + "filename": "data-00089-of-00122.arrow" + }, + { + "filename": "data-00090-of-00122.arrow" + }, + { + "filename": "data-00091-of-00122.arrow" + }, + { + "filename": "data-00092-of-00122.arrow" + }, + { + "filename": "data-00093-of-00122.arrow" + }, + { + "filename": "data-00094-of-00122.arrow" + }, + { + "filename": "data-00095-of-00122.arrow" + }, + { + "filename": "data-00096-of-00122.arrow" + }, + { + "filename": "data-00097-of-00122.arrow" + }, + { + "filename": "data-00098-of-00122.arrow" + }, + { + "filename": "data-00099-of-00122.arrow" + }, + { + "filename": "data-00100-of-00122.arrow" + }, + { + "filename": "data-00101-of-00122.arrow" + }, + { + "filename": "data-00102-of-00122.arrow" + }, + { + "filename": "data-00103-of-00122.arrow" + }, + { + "filename": "data-00104-of-00122.arrow" + }, + { + "filename": "data-00105-of-00122.arrow" + }, + { + "filename": "data-00106-of-00122.arrow" + }, + { + "filename": "data-00107-of-00122.arrow" + }, + { + "filename": "data-00108-of-00122.arrow" + }, + { + "filename": "data-00109-of-00122.arrow" + }, + { + "filename": "data-00110-of-00122.arrow" + }, + { + "filename": "data-00111-of-00122.arrow" + }, + { + "filename": "data-00112-of-00122.arrow" + }, + { + "filename": "data-00113-of-00122.arrow" + }, + { + "filename": "data-00114-of-00122.arrow" + }, + { + "filename": "data-00115-of-00122.arrow" + }, + { + "filename": "data-00116-of-00122.arrow" + }, + { + "filename": "data-00117-of-00122.arrow" + }, + { + "filename": "data-00118-of-00122.arrow" + }, + { + "filename": "data-00119-of-00122.arrow" + }, + { + "filename": "data-00120-of-00122.arrow" + }, + { + "filename": "data-00121-of-00122.arrow" + } + ], + "_fingerprint": "df8221b9cc176787", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "train" +} \ No newline at end of file diff --git a/tokenized_tamil_CulturaX_dataset/validation/data-00000-of-00014.arrow b/tokenized_tamil_CulturaX_dataset/validation/data-00000-of-00014.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c58432cd7af8e18cfe24049cf30ee504c70ff434 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/data-00000-of-00014.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a945a86f5508b360f94f4fa6db4edb20723d17c46c9faf5290b4fb6bac4d9cd5 +size 482162104 diff --git a/tokenized_tamil_CulturaX_dataset/validation/data-00001-of-00014.arrow b/tokenized_tamil_CulturaX_dataset/validation/data-00001-of-00014.arrow new file mode 100644 index 0000000000000000000000000000000000000000..787da0772f69b4b926fa8125bc6aa613cdc88fc1 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/data-00001-of-00014.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:824e154d890276ea81f9ae61d470093d6701dc18c52ecf21956edb299ec457a7 +size 485822688 diff --git a/tokenized_tamil_CulturaX_dataset/validation/data-00002-of-00014.arrow b/tokenized_tamil_CulturaX_dataset/validation/data-00002-of-00014.arrow new file mode 100644 index 0000000000000000000000000000000000000000..f342d2b8cea112260fc741ac38480f39768e39a1 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/data-00002-of-00014.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81a45967a12f79c2fe1a00868f21414d545e4023349c8ea518ad05655c5f4309 +size 479235520 diff --git a/tokenized_tamil_CulturaX_dataset/validation/data-00003-of-00014.arrow b/tokenized_tamil_CulturaX_dataset/validation/data-00003-of-00014.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c61ef51277e498ac548d41077cdac37976bf74d3 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/data-00003-of-00014.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8130c609006ae70eeb041f679db3c1876f130cff8494eaa30d499642b4504a77 +size 478314608 diff --git a/tokenized_tamil_CulturaX_dataset/validation/data-00004-of-00014.arrow b/tokenized_tamil_CulturaX_dataset/validation/data-00004-of-00014.arrow new file mode 100644 index 0000000000000000000000000000000000000000..ca7215da68d4b8827b2accd8fed59718ce5956ac --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/data-00004-of-00014.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d831b8bb171031a88120bd5f22577ec0cae8b45f5f66b0313c26ab8b36674189 +size 479384280 diff --git a/tokenized_tamil_CulturaX_dataset/validation/data-00005-of-00014.arrow b/tokenized_tamil_CulturaX_dataset/validation/data-00005-of-00014.arrow new file mode 100644 index 0000000000000000000000000000000000000000..3392438e892ab3d1464c8d21e71a3d235a657ecb --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/data-00005-of-00014.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:735f46da0d2e475d27e5cb198d644c79b68c1810d46c98fadc0feaf3a7c9ca5a +size 483755272 diff --git a/tokenized_tamil_CulturaX_dataset/validation/data-00006-of-00014.arrow b/tokenized_tamil_CulturaX_dataset/validation/data-00006-of-00014.arrow new file mode 100644 index 0000000000000000000000000000000000000000..8f71ec7de93cee8ab5f30953985660900564c736 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/data-00006-of-00014.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41727ca6104ed0ef4f02c04c46a66a7ed62d8554826a5b9ee6b30842d7f56c51 +size 478751864 diff --git a/tokenized_tamil_CulturaX_dataset/validation/data-00007-of-00014.arrow b/tokenized_tamil_CulturaX_dataset/validation/data-00007-of-00014.arrow new file mode 100644 index 0000000000000000000000000000000000000000..9ca65709d00f5df4a7b6980e6e67312bd651d9e5 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/data-00007-of-00014.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0f2b9d5491bbe8a8243e69edafd25a4ab528991a55ca8c1234615f6d7d90f6d +size 483486472 diff --git a/tokenized_tamil_CulturaX_dataset/validation/data-00008-of-00014.arrow b/tokenized_tamil_CulturaX_dataset/validation/data-00008-of-00014.arrow new file mode 100644 index 0000000000000000000000000000000000000000..ab5412f40335ff03489de7fa21fc93096acbe4c8 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/data-00008-of-00014.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c85dc5817c8b737dbea0c76a6ad8a98561f6aefe78599d15b279c34049a35e2f +size 483576024 diff --git a/tokenized_tamil_CulturaX_dataset/validation/data-00009-of-00014.arrow b/tokenized_tamil_CulturaX_dataset/validation/data-00009-of-00014.arrow new file mode 100644 index 0000000000000000000000000000000000000000..451211c16e5c03ff0a9b2aac19ba8dec0bb7584a --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/data-00009-of-00014.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f57fbb796e332ce0498c2448a6d43920637c8d74fd7714970af5192581847b66 +size 481272056 diff --git a/tokenized_tamil_CulturaX_dataset/validation/data-00010-of-00014.arrow b/tokenized_tamil_CulturaX_dataset/validation/data-00010-of-00014.arrow new file mode 100644 index 0000000000000000000000000000000000000000..b3407e0bed59ad4f3e3c640587bb2f5a1976b1d6 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/data-00010-of-00014.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7786f489f1618d95806c694b9d0ef8ae663584ab991d5577fc96a33b6fcc7bba +size 481184736 diff --git a/tokenized_tamil_CulturaX_dataset/validation/data-00011-of-00014.arrow b/tokenized_tamil_CulturaX_dataset/validation/data-00011-of-00014.arrow new file mode 100644 index 0000000000000000000000000000000000000000..1bd306b6e1fb0c39c4625e339abbeade686a14ed --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/data-00011-of-00014.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e3cad3ab3e2100c05e9edc84638648180a570979df6868f3c78e98c88709c7d +size 479852848 diff --git a/tokenized_tamil_CulturaX_dataset/validation/data-00012-of-00014.arrow b/tokenized_tamil_CulturaX_dataset/validation/data-00012-of-00014.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2cbcfd7233b93f9885139274491d851a651af139 --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/data-00012-of-00014.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f1036c00b9bd8d5f2decd877ff8ec2ded2a3a461818b3b2cd94fc0b381bc8ce +size 478081248 diff --git a/tokenized_tamil_CulturaX_dataset/validation/data-00013-of-00014.arrow b/tokenized_tamil_CulturaX_dataset/validation/data-00013-of-00014.arrow new file mode 100644 index 0000000000000000000000000000000000000000..d32b67bfb2f129bb04cf314308d74a2ba310a82c --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/data-00013-of-00014.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:376e2cd2e247383a7b11fff2515a87c7935187bd2700eb109f109427895dca06 +size 479963648 diff --git a/tokenized_tamil_CulturaX_dataset/validation/dataset_info.json b/tokenized_tamil_CulturaX_dataset/validation/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..a914b93a3d51757cee72714061e03c779890195b --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/dataset_info.json @@ -0,0 +1,188 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "ta", + "dataset_name": "cultura_x", + "dataset_size": 43202234827, + "description": "", + "download_checksums": { + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00000.parquet": { + "num_bytes": 1541822289, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00001.parquet": { + "num_bytes": 1544216758, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00002.parquet": { + "num_bytes": 1547106018, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00003.parquet": { + "num_bytes": 1542979052, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00004.parquet": { + "num_bytes": 1536500363, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00005.parquet": { + "num_bytes": 1536547401, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00006.parquet": { + "num_bytes": 1533679358, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00007.parquet": { + "num_bytes": 1823097879, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00008.parquet": { + "num_bytes": 1653835169, + "checksum": null + }, + "hf://datasets/uonlp/CulturaX@6a8734bc69fefcbb7735f4f9250f43e4cd7a442e/ta/ta_part_00009.parquet": { + "num_bytes": 1134462079, + "checksum": null + } + }, + "download_size": 15394246366, + "features": { + "text": { + "dtype": "string", + "_type": "Value" + }, + "timestamp": { + "dtype": "string", + "_type": "Value" + }, + "url": { + "dtype": "string", + "_type": "Value" + }, + "source": { + "dtype": "string", + "_type": "Value" + }, + "input_ids": { + "feature": { + "dtype": "int32", + "_type": "Value" + }, + "_type": "Sequence" + }, + "attention_mask": { + "feature": { + "dtype": "int8", + "_type": "Value" + }, + "_type": "Sequence" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 58596481193, + "splits": { + "train": { + "name": "train", + "num_bytes": 43202234827, + "num_examples": 4728460, + "shard_lengths": [ + 55000, + 55000, + 55000, + 56000, + 55000, + 55000, + 55000, + 55000, + 54846, + 55000, + 55000, + 55000, + 55000, + 55000, + 54000, + 54000, + 55000, + 54846, + 55000, + 54000, + 55000, + 55000, + 55000, + 54000, + 55000, + 54846, + 55000, + 55000, + 55000, + 54000, + 55000, + 55000, + 55000, + 55000, + 54846, + 55000, + 54000, + 55000, + 55000, + 55000, + 56000, + 55000, + 55000, + 54846, + 55000, + 56000, + 55000, + 55000, + 55000, + 56000, + 55000, + 54846, + 55000, + 55000, + 55000, + 55000, + 55000, + 56000, + 55000, + 56000, + 54846, + 55000, + 55000, + 55000, + 55000, + 56000, + 36000, + 35000, + 36000, + 37000, + 36846, + 36000, + 37000, + 37000, + 43000, + 43000, + 59000, + 87000, + 78000, + 82846, + 68000, + 70000, + 73000, + 77000, + 78000, + 48846 + ], + "dataset_name": "cultura_x" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/tokenized_tamil_CulturaX_dataset/validation/state.json b/tokenized_tamil_CulturaX_dataset/validation/state.json new file mode 100644 index 0000000000000000000000000000000000000000..a479463cd50830638a500a0e7b3af479d748e31c --- /dev/null +++ b/tokenized_tamil_CulturaX_dataset/validation/state.json @@ -0,0 +1,52 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00014.arrow" + }, + { + "filename": "data-00001-of-00014.arrow" + }, + { + "filename": "data-00002-of-00014.arrow" + }, + { + "filename": "data-00003-of-00014.arrow" + }, + { + "filename": "data-00004-of-00014.arrow" + }, + { + "filename": "data-00005-of-00014.arrow" + }, + { + "filename": "data-00006-of-00014.arrow" + }, + { + "filename": "data-00007-of-00014.arrow" + }, + { + "filename": "data-00008-of-00014.arrow" + }, + { + "filename": "data-00009-of-00014.arrow" + }, + { + "filename": "data-00010-of-00014.arrow" + }, + { + "filename": "data-00011-of-00014.arrow" + }, + { + "filename": "data-00012-of-00014.arrow" + }, + { + "filename": "data-00013-of-00014.arrow" + } + ], + "_fingerprint": "289a6242a72d1a2d", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "train" +} \ No newline at end of file diff --git a/tokenizer.json b/tokenizer.json index 3cd6b568fe595b34055626ccd8d2907ef37af20c..61af369c11a2f593a294d865adef16fc90f2aa7c 100644 --- a/tokenizer.json +++ b/tokenizer.json @@ -1,21 +1,7 @@ { "version": "1.0", - "truncation": { - "direction": "Right", - "max_length": 1024, - "strategy": "LongestFirst", - "stride": 0 - }, - "padding": { - "strategy": { - "Fixed": 1024 - }, - "direction": "Left", - "pad_to_multiple_of": null, - "pad_id": 2, - "pad_type_id": 0, - "pad_token": "" - }, + "truncation": null, + "padding": null, "added_tokens": [ { "id": 0, diff --git a/training_args.bin b/training_args.bin index dd2d04ec0bfaadd001afe1e3a611c0df4ea33022..f54b86c5983a7b8d4e82b931b8ad3800080cd6e0 100644 --- a/training_args.bin +++ b/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:540685575635de0db49be2a5e1618584b25819090aa28f21eea86d1cfe5d258a -size 5304 +oid sha256:67f50a6d254ba1fe291951907309fcfedf5983524086ae254225fc51c2d20208 +size 5240