zeyuliu2 committed on
Commit
e02c7f5
·
1 Parent(s): 6ef9197

add data and model

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Data/README.md +6 -0
  2. Data/captioning/clipL_train.pt +3 -0
  3. Data/captioning/clipL_val.pt +3 -0
  4. Data/captioning/clip_train.pt +3 -0
  5. Data/captioning/clip_val.pt +3 -0
  6. Data/captioning/en_captions_train.jsonl +3 -0
  7. Data/captioning/en_captions_train_plus.jsonl +3 -0
  8. Data/captioning/en_captions_val.jsonl +3 -0
  9. Data/captioning/en_captions_val_plus.jsonl +3 -0
  10. Data/captioning/images_train.pt +3 -0
  11. Data/captioning/images_val.pt +3 -0
  12. Data/captioning/train_image_names.txt +3 -0
  13. Data/captioning/train_indices.txt +3 -0
  14. Data/captioning/val_image_names.txt +3 -0
  15. Data/captioning/val_indices.txt +3 -0
  16. Data/cc/de_500k.txt +3 -0
  17. Data/cc/de_cc_token2count_dict.facebook-mbart-large-cc25.json +0 -0
  18. Data/cc/de_cc_tokenID2count_dict.facebook-mbart-large-cc25.json +0 -0
  19. Data/cc/en_500k.txt +3 -0
  20. Data/cc/en_5k.txt +3 -0
  21. Data/cc/en_cc_token2count_dict.json +0 -0
  22. Data/cc/en_cc_tokenID2count_dict.50.json +0 -0
  23. Data/cc/en_cc_tokenID2count_dict.cc25.json +0 -0
  24. Data/cc/en_toy.txt +3 -0
  25. Data/cc/fs_manifest.csv +49 -0
  26. Data/cc/ne_500k.txt +3 -0
  27. Data/cc/ne_cc_token2count_dict.facebook-mbart-large-cc25.json +0 -0
  28. Data/cc/ne_cc_tokenID2count_dict.facebook-mbart-large-cc25.json +0 -0
  29. Data/cc/si_500k.txt +3 -0
  30. Data/cc/si_cc_token2count_dict.facebook-mbart-large-cc25.json +0 -0
  31. Data/cc/si_cc_tokenID2count_dict.facebook-mbart-large-cc25.json +0 -0
  32. Data/cc/zh-Hans_cc_token2count_dict.json +0 -0
  33. Data/cc/zh-Hans_cc_tokenID2count_dict.50.json +0 -0
  34. Data/cc/zh_500k.txt +3 -0
  35. Data/cc/zh_5k.txt +3 -0
  36. Data/cc/zh_cc_tokenID2count_dict.cc25.json +0 -0
  37. Data/cc/zh_toy.txt +3 -0
  38. Data/ec/clipL_train.pt +3 -0
  39. Data/ec/clipL_val.pt +3 -0
  40. Data/ec/clip_train.pt +3 -0
  41. Data/ec/clip_val.pt +3 -0
  42. Data/ec/en_captions_train.jsonl +3 -0
  43. Data/ec/en_captions_train_debug.jsonl +3 -0
  44. Data/ec/en_captions_train_plus.jsonl +3 -0
  45. Data/ec/en_captions_val.jsonl +3 -0
  46. Data/ec/en_captions_val_plus.jsonl +3 -0
  47. Data/ec/images_train.pt +3 -0
  48. Data/ec/images_train_debug.pt +3 -0
  49. Data/ec/images_val.pt +3 -0
  50. Data/ec/train_image_names.txt +3 -0
Data/README.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
+ # Prepare data for backtranslation
3
+
4
+ 1. Download data from the [CC-100](https://data.statmt.org/cc-100/) website.
5
+ 2. Run `head -n 500000 <language>.txt > <language>_500k.txt` to keep the first 500,000 lines.
6
+ 3. (Optional) To sample randomly from `<language>.txt` instead, consider using `shuf <language>.txt | head -n 500000 > <language>_500k.txt`. If the file is too large to fit in memory, consider using [terashuf](https://github.com/alexandres/terashuf).
Data/captioning/clipL_train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c51fdfef9c8707a635f0b0adee2823bcbb29842e9db569028a92faae6b1872b
3
+ size 27648760
Data/captioning/clipL_val.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9195c85b4b3d5177c088e63c54b67d51840795fc7b34cd7d9971b61088557a6
3
+ size 3072760
Data/captioning/clip_train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8864cf0dcf2d5d85c65c20fea7b5c4893b31da803f3a561d8916a04b8171d710
3
+ size 18432760
Data/captioning/clip_val.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ab750635eb78f7b5774ac2e2b4c14e2b79568c46803c3ecf866f62ae0b410fa
3
+ size 2048760
Data/captioning/en_captions_train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2a8a09a4726f55bb0c2bfdc1b3239eadc605a71c625c9ad7bb188173e4fc4cf
3
+ size 2561241
Data/captioning/en_captions_train_plus.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfce75d0c0fc215bf5ef6c05f6385d78bba1276de87af4140438ae1d026df338
3
+ size 5481364
Data/captioning/en_captions_val.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60984e82fc8bb63d49642ff4053f2e0f3a6fa91114ec0ad1bbc2b78e9b5684c2
3
+ size 285636
Data/captioning/en_captions_val_plus.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6aee70a9da8c5fcd4e572abf0b92077e72cd756bc21e04a41c67ef3021a52d8
3
+ size 610072
Data/captioning/images_train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2e012e0606c596547c38dd611e8c5757f82db05ca996bbc135f90b29dcf840f
3
+ size 73728760
Data/captioning/images_val.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a6548fdf959a9816b42ca922d54d2e853d40b3989d0a86fd5f67677bd1a041f
3
+ size 8192760
Data/captioning/train_image_names.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e1f97d988c7e65d4bde206ec2620e0a5ee4bab616cf516a3ba4206a291eae63
3
+ size 288000
Data/captioning/train_indices.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:254b1f404c62a6e5c0f36ad7d099d4119e66749dcac35fb4d6eb134316b9cd0f
3
+ size 52811
Data/captioning/val_image_names.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56e3064b5ca496c2d5160a956af472f2221a6c2f0f77f8ecd065a9b1bf01e917
3
+ size 32000
Data/captioning/val_indices.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cbb162c41318b7579759c9ad32c02c95ae42f218eda659ed51d23a5a83c4ca8
3
+ size 5859
Data/cc/de_500k.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:197adb7b7e0e5ac55da119e29cdc1ba8520f85d8453a54e689e288a81e422bd2
3
+ size 92517130
Data/cc/de_cc_token2count_dict.facebook-mbart-large-cc25.json ADDED
The diff for this file is too large to render. See raw diff
 
Data/cc/de_cc_tokenID2count_dict.facebook-mbart-large-cc25.json ADDED
The diff for this file is too large to render. See raw diff
 
Data/cc/en_500k.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5192dae35f288ac2dfe0649b4806575daa1543bd24407452aa97003dea584cb5
3
+ size 75285870
Data/cc/en_5k.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66ed21f8dfa914c269ea57f627f10401b75f3aed66377c9e278061e531d4e75b
3
+ size 734833
Data/cc/en_cc_token2count_dict.json ADDED
The diff for this file is too large to render. See raw diff
 
Data/cc/en_cc_tokenID2count_dict.50.json ADDED
The diff for this file is too large to render. See raw diff
 
Data/cc/en_cc_tokenID2count_dict.cc25.json ADDED
The diff for this file is too large to render. See raw diff
 
Data/cc/en_toy.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e8475f6a998db3321a9e67a712dfeeaf94360ebf1df31039da1526fc2a8b21e
3
+ size 1134813
Data/cc/fs_manifest.csv ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ filename,filesize,encoding,header
2
+ ne_1.txt,85000000,,
3
+ ne_2.txt,85000000,,
4
+ ne_3.txt,85000000,,
5
+ ne_4.txt,85000000,,
6
+ ne_5.txt,85000000,,
7
+ ne_6.txt,85000000,,
8
+ ne_7.txt,85000000,,
9
+ ne_8.txt,85000000,,
10
+ ne_9.txt,85000000,,
11
+ ne_10.txt,85000000,,
12
+ ne_11.txt,85000000,,
13
+ ne_12.txt,85000000,,
14
+ ne_13.txt,85000000,,
15
+ ne_14.txt,85000000,,
16
+ ne_15.txt,85000000,,
17
+ ne_16.txt,85000000,,
18
+ ne_17.txt,85000000,,
19
+ ne_18.txt,85000000,,
20
+ ne_19.txt,85000000,,
21
+ ne_20.txt,85000000,,
22
+ ne_21.txt,85000000,,
23
+ ne_22.txt,85000000,,
24
+ ne_23.txt,85000000,,
25
+ ne_24.txt,85000000,,
26
+ ne_25.txt,85000000,,
27
+ ne_26.txt,85000000,,
28
+ ne_27.txt,85000000,,
29
+ ne_28.txt,85000000,,
30
+ ne_29.txt,85000000,,
31
+ ne_30.txt,85000000,,
32
+ ne_31.txt,85000000,,
33
+ ne_32.txt,85000000,,
34
+ ne_33.txt,85000000,,
35
+ ne_34.txt,85000000,,
36
+ ne_35.txt,85000000,,
37
+ ne_36.txt,85000000,,
38
+ ne_37.txt,85000000,,
39
+ ne_38.txt,85000000,,
40
+ ne_39.txt,85000000,,
41
+ ne_40.txt,85000000,,
42
+ ne_41.txt,85000000,,
43
+ ne_42.txt,85000000,,
44
+ ne_43.txt,85000000,,
45
+ ne_44.txt,85000000,,
46
+ ne_45.txt,85000000,,
47
+ ne_46.txt,85000000,,
48
+ ne_47.txt,85000000,,
49
+ ne_48.txt,43781415,,
Data/cc/ne_500k.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18d63689a078545a3a48840e5fddebbc718047e0bee4bf920f2a4912993d36b6
3
+ size 163644701
Data/cc/ne_cc_token2count_dict.facebook-mbart-large-cc25.json ADDED
The diff for this file is too large to render. See raw diff
 
Data/cc/ne_cc_tokenID2count_dict.facebook-mbart-large-cc25.json ADDED
The diff for this file is too large to render. See raw diff
 
Data/cc/si_500k.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38d2e6f4bf37cb8f05ea14d0bca97dcc36f495252c70940f77cf2f6eb8f7bc44
3
+ size 148552519
Data/cc/si_cc_token2count_dict.facebook-mbart-large-cc25.json ADDED
The diff for this file is too large to render. See raw diff
 
Data/cc/si_cc_tokenID2count_dict.facebook-mbart-large-cc25.json ADDED
The diff for this file is too large to render. See raw diff
 
Data/cc/zh-Hans_cc_token2count_dict.json ADDED
The diff for this file is too large to render. See raw diff
 
Data/cc/zh-Hans_cc_tokenID2count_dict.50.json ADDED
The diff for this file is too large to render. See raw diff
 
Data/cc/zh_500k.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51b0ccb19e45ae8fea4bd94a19ec9ceffa6c5f9cae7000a36499f271f90944cd
3
+ size 128844000
Data/cc/zh_5k.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:110cfa67b74f0ef1d3e1ddaa63993226231a21398ead4a5a941176c7ce528f8e
3
+ size 1332090
Data/cc/zh_cc_tokenID2count_dict.cc25.json ADDED
The diff for this file is too large to render. See raw diff
 
Data/cc/zh_toy.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afa7a9aba5c5ea9e87c9069f4a271b887b7a63306117ce83e101f16acc8d9e9e
3
+ size 1266888
Data/ec/clipL_train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2fa80ef0b32f63dfd9a0f9d3e9a9362a86bc153fce34f903c72e1f6ada263e2
3
+ size 223590136
Data/ec/clipL_val.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38a2ff5fe42407fd70051ae94b30205e0e686c32ccf675cd01a6c64f51ebe308
3
+ size 124429048
Data/ec/clip_train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9f339eeece7cd9435d2fe14db599943ccffd6876ad60b0d4d3a04246db53d96
3
+ size 149060344
Data/ec/clip_val.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a5784eee99ffb1cf439f6bd187ff2ffb7d1f8feecd22d3d800217d049721a11
3
+ size 82952952
Data/ec/en_captions_train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64b5d410e64e8a17e7d389db53b2db42224d2e45e186f7747a455334a6eb0ae5
3
+ size 20641664
Data/ec/en_captions_train_debug.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f0f07f28c10a090ae52a94d934309512d7a6845cb6524e6c9c3bc106e407e9f
3
+ size 2952
Data/ec/en_captions_train_plus.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe7e0d963f555479a917c007535cd7c29b6855dd2b03facfbd15539becda726b
3
+ size 44257325
Data/ec/en_captions_val.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a74d3213a0c9ba3f1023d3880e9c7a745276e096af188e3c905b9237e2c5df49
3
+ size 11479290
Data/ec/en_captions_val_plus.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9b3fe9e7412c3468eb8ff9ccae7aef1d1e91773d0e4906a83c156149ca204fb
3
+ size 24379137
Data/ec/images_train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88302dfc9d6894fe2a15fe94d279eb5eb6f016e19873ab219c7b0d1fd1e24edd
3
+ size 596239096
Data/ec/images_train_debug.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98dae8fbd8933fe4d2139dc86206ba76c2c2bcd6e94b51fbbdebfdc210722bd3
3
+ size 596239096
Data/ec/images_val.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c81c43d7c8dc9c7156ab980cf18f7df6c51a9656c81195314b385982db450019
3
+ size 331809528
Data/ec/train_image_names.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86c7dda21862597fffc8fb3a2961600382fd90485164c24bfcdb02a55acf1ecc
3
+ size 2329056