add data and model
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- Data/README.md +6 -0
- Data/captioning/clipL_train.pt +3 -0
- Data/captioning/clipL_val.pt +3 -0
- Data/captioning/clip_train.pt +3 -0
- Data/captioning/clip_val.pt +3 -0
- Data/captioning/en_captions_train.jsonl +3 -0
- Data/captioning/en_captions_train_plus.jsonl +3 -0
- Data/captioning/en_captions_val.jsonl +3 -0
- Data/captioning/en_captions_val_plus.jsonl +3 -0
- Data/captioning/images_train.pt +3 -0
- Data/captioning/images_val.pt +3 -0
- Data/captioning/train_image_names.txt +3 -0
- Data/captioning/train_indices.txt +3 -0
- Data/captioning/val_image_names.txt +3 -0
- Data/captioning/val_indices.txt +3 -0
- Data/cc/de_500k.txt +3 -0
- Data/cc/de_cc_token2count_dict.facebook-mbart-large-cc25.json +0 -0
- Data/cc/de_cc_tokenID2count_dict.facebook-mbart-large-cc25.json +0 -0
- Data/cc/en_500k.txt +3 -0
- Data/cc/en_5k.txt +3 -0
- Data/cc/en_cc_token2count_dict.json +0 -0
- Data/cc/en_cc_tokenID2count_dict.50.json +0 -0
- Data/cc/en_cc_tokenID2count_dict.cc25.json +0 -0
- Data/cc/en_toy.txt +3 -0
- Data/cc/fs_manifest.csv +49 -0
- Data/cc/ne_500k.txt +3 -0
- Data/cc/ne_cc_token2count_dict.facebook-mbart-large-cc25.json +0 -0
- Data/cc/ne_cc_tokenID2count_dict.facebook-mbart-large-cc25.json +0 -0
- Data/cc/si_500k.txt +3 -0
- Data/cc/si_cc_token2count_dict.facebook-mbart-large-cc25.json +0 -0
- Data/cc/si_cc_tokenID2count_dict.facebook-mbart-large-cc25.json +0 -0
- Data/cc/zh-Hans_cc_token2count_dict.json +0 -0
- Data/cc/zh-Hans_cc_tokenID2count_dict.50.json +0 -0
- Data/cc/zh_500k.txt +3 -0
- Data/cc/zh_5k.txt +3 -0
- Data/cc/zh_cc_tokenID2count_dict.cc25.json +0 -0
- Data/cc/zh_toy.txt +3 -0
- Data/ec/clipL_train.pt +3 -0
- Data/ec/clipL_val.pt +3 -0
- Data/ec/clip_train.pt +3 -0
- Data/ec/clip_val.pt +3 -0
- Data/ec/en_captions_train.jsonl +3 -0
- Data/ec/en_captions_train_debug.jsonl +3 -0
- Data/ec/en_captions_train_plus.jsonl +3 -0
- Data/ec/en_captions_val.jsonl +3 -0
- Data/ec/en_captions_val_plus.jsonl +3 -0
- Data/ec/images_train.pt +3 -0
- Data/ec/images_train_debug.pt +3 -0
- Data/ec/images_val.pt +3 -0
- Data/ec/train_image_names.txt +3 -0
Data/README.md
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Prepare data for backtranslation
|
| 3 |
+
|
| 4 |
+
1. Download data from the [CC-100](https://data.statmt.org/cc-100/) website.
|
| 5 |
+
2. Run `head -500000 <language>.txt > <language>_500k.txt` to keep the first 500,000 lines.
|
| 6 |
+
3. (Optional) To sample randomly from `<language>.txt` instead, use `shuf <language>.txt | head -500000 > <language>_500k.txt`. If the file is too large to fit in memory, consider using [terashuf](https://github.com/alexandres/terashuf).
|
Data/captioning/clipL_train.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c51fdfef9c8707a635f0b0adee2823bcbb29842e9db569028a92faae6b1872b
|
| 3 |
+
size 27648760
|
Data/captioning/clipL_val.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c9195c85b4b3d5177c088e63c54b67d51840795fc7b34cd7d9971b61088557a6
|
| 3 |
+
size 3072760
|
Data/captioning/clip_train.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8864cf0dcf2d5d85c65c20fea7b5c4893b31da803f3a561d8916a04b8171d710
|
| 3 |
+
size 18432760
|
Data/captioning/clip_val.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0ab750635eb78f7b5774ac2e2b4c14e2b79568c46803c3ecf866f62ae0b410fa
|
| 3 |
+
size 2048760
|
Data/captioning/en_captions_train.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f2a8a09a4726f55bb0c2bfdc1b3239eadc605a71c625c9ad7bb188173e4fc4cf
|
| 3 |
+
size 2561241
|
Data/captioning/en_captions_train_plus.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dfce75d0c0fc215bf5ef6c05f6385d78bba1276de87af4140438ae1d026df338
|
| 3 |
+
size 5481364
|
Data/captioning/en_captions_val.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60984e82fc8bb63d49642ff4053f2e0f3a6fa91114ec0ad1bbc2b78e9b5684c2
|
| 3 |
+
size 285636
|
Data/captioning/en_captions_val_plus.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c6aee70a9da8c5fcd4e572abf0b92077e72cd756bc21e04a41c67ef3021a52d8
|
| 3 |
+
size 610072
|
Data/captioning/images_train.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e2e012e0606c596547c38dd611e8c5757f82db05ca996bbc135f90b29dcf840f
|
| 3 |
+
size 73728760
|
Data/captioning/images_val.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8a6548fdf959a9816b42ca922d54d2e853d40b3989d0a86fd5f67677bd1a041f
|
| 3 |
+
size 8192760
|
Data/captioning/train_image_names.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2e1f97d988c7e65d4bde206ec2620e0a5ee4bab616cf516a3ba4206a291eae63
|
| 3 |
+
size 288000
|
Data/captioning/train_indices.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:254b1f404c62a6e5c0f36ad7d099d4119e66749dcac35fb4d6eb134316b9cd0f
|
| 3 |
+
size 52811
|
Data/captioning/val_image_names.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:56e3064b5ca496c2d5160a956af472f2221a6c2f0f77f8ecd065a9b1bf01e917
|
| 3 |
+
size 32000
|
Data/captioning/val_indices.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0cbb162c41318b7579759c9ad32c02c95ae42f218eda659ed51d23a5a83c4ca8
|
| 3 |
+
size 5859
|
Data/cc/de_500k.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:197adb7b7e0e5ac55da119e29cdc1ba8520f85d8453a54e689e288a81e422bd2
|
| 3 |
+
size 92517130
|
Data/cc/de_cc_token2count_dict.facebook-mbart-large-cc25.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/cc/de_cc_tokenID2count_dict.facebook-mbart-large-cc25.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/cc/en_500k.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5192dae35f288ac2dfe0649b4806575daa1543bd24407452aa97003dea584cb5
|
| 3 |
+
size 75285870
|
Data/cc/en_5k.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66ed21f8dfa914c269ea57f627f10401b75f3aed66377c9e278061e531d4e75b
|
| 3 |
+
size 734833
|
Data/cc/en_cc_token2count_dict.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/cc/en_cc_tokenID2count_dict.50.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/cc/en_cc_tokenID2count_dict.cc25.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/cc/en_toy.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1e8475f6a998db3321a9e67a712dfeeaf94360ebf1df31039da1526fc2a8b21e
|
| 3 |
+
size 1134813
|
Data/cc/fs_manifest.csv
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
filename,filesize,encoding,header
|
| 2 |
+
ne_1.txt,85000000,,
|
| 3 |
+
ne_2.txt,85000000,,
|
| 4 |
+
ne_3.txt,85000000,,
|
| 5 |
+
ne_4.txt,85000000,,
|
| 6 |
+
ne_5.txt,85000000,,
|
| 7 |
+
ne_6.txt,85000000,,
|
| 8 |
+
ne_7.txt,85000000,,
|
| 9 |
+
ne_8.txt,85000000,,
|
| 10 |
+
ne_9.txt,85000000,,
|
| 11 |
+
ne_10.txt,85000000,,
|
| 12 |
+
ne_11.txt,85000000,,
|
| 13 |
+
ne_12.txt,85000000,,
|
| 14 |
+
ne_13.txt,85000000,,
|
| 15 |
+
ne_14.txt,85000000,,
|
| 16 |
+
ne_15.txt,85000000,,
|
| 17 |
+
ne_16.txt,85000000,,
|
| 18 |
+
ne_17.txt,85000000,,
|
| 19 |
+
ne_18.txt,85000000,,
|
| 20 |
+
ne_19.txt,85000000,,
|
| 21 |
+
ne_20.txt,85000000,,
|
| 22 |
+
ne_21.txt,85000000,,
|
| 23 |
+
ne_22.txt,85000000,,
|
| 24 |
+
ne_23.txt,85000000,,
|
| 25 |
+
ne_24.txt,85000000,,
|
| 26 |
+
ne_25.txt,85000000,,
|
| 27 |
+
ne_26.txt,85000000,,
|
| 28 |
+
ne_27.txt,85000000,,
|
| 29 |
+
ne_28.txt,85000000,,
|
| 30 |
+
ne_29.txt,85000000,,
|
| 31 |
+
ne_30.txt,85000000,,
|
| 32 |
+
ne_31.txt,85000000,,
|
| 33 |
+
ne_32.txt,85000000,,
|
| 34 |
+
ne_33.txt,85000000,,
|
| 35 |
+
ne_34.txt,85000000,,
|
| 36 |
+
ne_35.txt,85000000,,
|
| 37 |
+
ne_36.txt,85000000,,
|
| 38 |
+
ne_37.txt,85000000,,
|
| 39 |
+
ne_38.txt,85000000,,
|
| 40 |
+
ne_39.txt,85000000,,
|
| 41 |
+
ne_40.txt,85000000,,
|
| 42 |
+
ne_41.txt,85000000,,
|
| 43 |
+
ne_42.txt,85000000,,
|
| 44 |
+
ne_43.txt,85000000,,
|
| 45 |
+
ne_44.txt,85000000,,
|
| 46 |
+
ne_45.txt,85000000,,
|
| 47 |
+
ne_46.txt,85000000,,
|
| 48 |
+
ne_47.txt,85000000,,
|
| 49 |
+
ne_48.txt,43781415,,
|
Data/cc/ne_500k.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:18d63689a078545a3a48840e5fddebbc718047e0bee4bf920f2a4912993d36b6
|
| 3 |
+
size 163644701
|
Data/cc/ne_cc_token2count_dict.facebook-mbart-large-cc25.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/cc/ne_cc_tokenID2count_dict.facebook-mbart-large-cc25.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/cc/si_500k.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38d2e6f4bf37cb8f05ea14d0bca97dcc36f495252c70940f77cf2f6eb8f7bc44
|
| 3 |
+
size 148552519
|
Data/cc/si_cc_token2count_dict.facebook-mbart-large-cc25.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/cc/si_cc_tokenID2count_dict.facebook-mbart-large-cc25.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/cc/zh-Hans_cc_token2count_dict.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/cc/zh-Hans_cc_tokenID2count_dict.50.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/cc/zh_500k.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51b0ccb19e45ae8fea4bd94a19ec9ceffa6c5f9cae7000a36499f271f90944cd
|
| 3 |
+
size 128844000
|
Data/cc/zh_5k.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:110cfa67b74f0ef1d3e1ddaa63993226231a21398ead4a5a941176c7ce528f8e
|
| 3 |
+
size 1332090
|
Data/cc/zh_cc_tokenID2count_dict.cc25.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/cc/zh_toy.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:afa7a9aba5c5ea9e87c9069f4a271b887b7a63306117ce83e101f16acc8d9e9e
|
| 3 |
+
size 1266888
|
Data/ec/clipL_train.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e2fa80ef0b32f63dfd9a0f9d3e9a9362a86bc153fce34f903c72e1f6ada263e2
|
| 3 |
+
size 223590136
|
Data/ec/clipL_val.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38a2ff5fe42407fd70051ae94b30205e0e686c32ccf675cd01a6c64f51ebe308
|
| 3 |
+
size 124429048
|
Data/ec/clip_train.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f9f339eeece7cd9435d2fe14db599943ccffd6876ad60b0d4d3a04246db53d96
|
| 3 |
+
size 149060344
|
Data/ec/clip_val.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a5784eee99ffb1cf439f6bd187ff2ffb7d1f8feecd22d3d800217d049721a11
|
| 3 |
+
size 82952952
|
Data/ec/en_captions_train.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64b5d410e64e8a17e7d389db53b2db42224d2e45e186f7747a455334a6eb0ae5
|
| 3 |
+
size 20641664
|
Data/ec/en_captions_train_debug.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6f0f07f28c10a090ae52a94d934309512d7a6845cb6524e6c9c3bc106e407e9f
|
| 3 |
+
size 2952
|
Data/ec/en_captions_train_plus.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fe7e0d963f555479a917c007535cd7c29b6855dd2b03facfbd15539becda726b
|
| 3 |
+
size 44257325
|
Data/ec/en_captions_val.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a74d3213a0c9ba3f1023d3880e9c7a745276e096af188e3c905b9237e2c5df49
|
| 3 |
+
size 11479290
|
Data/ec/en_captions_val_plus.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f9b3fe9e7412c3468eb8ff9ccae7aef1d1e91773d0e4906a83c156149ca204fb
|
| 3 |
+
size 24379137
|
Data/ec/images_train.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88302dfc9d6894fe2a15fe94d279eb5eb6f016e19873ab219c7b0d1fd1e24edd
|
| 3 |
+
size 596239096
|
Data/ec/images_train_debug.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98dae8fbd8933fe4d2139dc86206ba76c2c2bcd6e94b51fbbdebfdc210722bd3
|
| 3 |
+
size 596239096
|
Data/ec/images_val.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c81c43d7c8dc9c7156ab980cf18f7df6c51a9656c81195314b385982db450019
|
| 3 |
+
size 331809528
|
Data/ec/train_image_names.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:86c7dda21862597fffc8fb3a2961600382fd90485164c24bfcdb02a55acf1ecc
|
| 3 |
+
size 2329056
|