yuccaaa committed on
Commit
ed94f0e
·
verified ·
1 Parent(s): 4245984

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +53 -23
  2. README.md +3 -0
  3. clean_OntoProtein.jsonl +3 -0
  4. clean_bio.jsonl +3 -0
  5. clean_pmc_full_text.jsonl +3 -0
  6. clean_pmc_full_text_small.jsonl +3 -0
  7. clean_pubmed_abstract_part1.jsonl +3 -0
  8. clean_pubmed_abstract_part1_small.jsonl +3 -0
  9. clean_pubmed_abstract_part1_small1.jsonl +3 -0
  10. clean_pubmed_abstract_part1_small1_new.jsonl +3 -0
  11. clean_pubmed_abstract_part1_small_new.jsonl +3 -0
  12. clean_seq_in_text.jsonl +3 -0
  13. clean_seq_in_text_new.jsonl +3 -0
  14. clean_swissProt2Text.jsonl +3 -0
  15. clean_swissProt2Text_new.jsonl +3 -0
  16. cot/.gitattributes +59 -0
  17. cot/.hfd/aria2c_urls.txt +0 -0
  18. cot/.hfd/last_download_command +1 -0
  19. cot/.hfd/repo_metadata.json +1 -0
  20. cot/README.md +223 -0
  21. cot/all/train-00000-of-00015.parquet +3 -0
  22. cot/all/train-00001-of-00015.parquet +3 -0
  23. cot/all/train-00002-of-00015.parquet +3 -0
  24. cot/all/train-00003-of-00015.parquet +3 -0
  25. cot/all/train-00004-of-00015.parquet +3 -0
  26. cot/all/train-00005-of-00015.parquet +3 -0
  27. cot/all/train-00006-of-00015.parquet +3 -0
  28. cot/all/train-00007-of-00015.parquet +3 -0
  29. cot/all/train-00008-of-00015.parquet +3 -0
  30. cot/all/train-00009-of-00015.parquet +3 -0
  31. cot/all/train-00010-of-00015.parquet +3 -0
  32. cot/all/train-00011-of-00015.parquet +3 -0
  33. cot/all/train-00012-of-00015.parquet +3 -0
  34. cot/all/train-00013-of-00015.parquet +3 -0
  35. cot/all/train-00014-of-00015.parquet +3 -0
  36. cot/clean/merge_cot.jsonl +0 -0
  37. cot/clean/merge_cot_combine.jsonl +3 -0
  38. cot/code/train-00000-of-00008.parquet +3 -0
  39. cot/code/train-00001-of-00008.parquet +3 -0
  40. cot/code/train-00002-of-00008.parquet +3 -0
  41. cot/code/train-00003-of-00008.parquet +3 -0
  42. cot/code/train-00004-of-00008.parquet +3 -0
  43. cot/code/train-00005-of-00008.parquet +3 -0
  44. cot/code/train-00006-of-00008.parquet +3 -0
  45. cot/code/train-00007-of-00008.parquet +3 -0
  46. cot/code_mix.png +3 -0
  47. cot/data_mix.png +3 -0
  48. cot/math/train-00000-of-00004.parquet +3 -0
  49. cot/math/train-00001-of-00004.parquet +3 -0
  50. cot/math/train-00002-of-00004.parquet +3 -0
.gitattributes CHANGED
@@ -8,6 +8,7 @@
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
 
11
  *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
@@ -33,26 +34,55 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- 总和521041_最终版本.jsonl filter=lfs diff=lfs merge=lfs -text
37
- matched_bio.jsonl filter=lfs diff=lfs merge=lfs -text
38
- category_10/category_trackid.jsonl filter=lfs diff=lfs merge=lfs -text
39
- matched_records.jsonl filter=lfs diff=lfs merge=lfs -text
40
- category_10/output_new/biology/match_bio.jsonl filter=lfs diff=lfs merge=lfs -text
41
- category_10/output_new/biology/biology.jsonl filter=lfs diff=lfs merge=lfs -text
42
- category_10/outputs/Chemistry/Chemistry_001.jsonl filter=lfs diff=lfs merge=lfs -text
43
- category_10/outputs/Environmental[[:space:]]&[[:space:]]Geographical[[:space:]]Sciences/Environmental[[:space:]]&[[:space:]]Geographical[[:space:]]Sciences_001.jsonl filter=lfs diff=lfs merge=lfs -text
44
- category_10/outputs/Economics[[:space:]]&[[:space:]]Management/Economics[[:space:]]&[[:space:]]Management_001.jsonl filter=lfs diff=lfs merge=lfs -text
45
- category_10/outputs/Humanities[[:space:]]&[[:space:]]Social[[:space:]]Sciences/Humanities[[:space:]]&[[:space:]]Social[[:space:]]Sciences_001.jsonl filter=lfs diff=lfs merge=lfs -text
46
- category_10/outputs/Life[[:space:]]Sciences/Life[[:space:]]Sciences_001.jsonl filter=lfs diff=lfs merge=lfs -text
47
- category_10/outputs/Information[[:space:]]Sciences/Information[[:space:]]Sciences_001.jsonl filter=lfs diff=lfs merge=lfs -text
48
- category_10/outputs/Engineering/Engineering_001.jsonl filter=lfs diff=lfs merge=lfs -text
49
- category_10/outputs/Mathematics[[:space:]]&[[:space:]]Statistics/Mathematics[[:space:]]&[[:space:]]Statistics_002.jsonl filter=lfs diff=lfs merge=lfs -text
50
- category_10/outputs/Others/Others_001.jsonl filter=lfs diff=lfs merge=lfs -text
51
- category_10/outputs/Medical[[:space:]]Sciences/Medical[[:space:]]Sciences_001.jsonl filter=lfs diff=lfs merge=lfs -text
52
- trackid_major/samples_textbook-meta-20250318_part-67d966ae3a94-000001.jsonl filter=lfs diff=lfs merge=lfs -text
53
- trackid_major/samples_textbook-meta-20250318_part-67d966ae3a94-000003.jsonl filter=lfs diff=lfs merge=lfs -text
54
- trackid_major/samples_textbook-meta-20250318_part-67d966ae3a94-000005.jsonl filter=lfs diff=lfs merge=lfs -text
55
- category_10/outputs/Physics[[:space:]]&[[:space:]]Astronomy/Physics[[:space:]]&[[:space:]]Astronomy_001.jsonl filter=lfs diff=lfs merge=lfs -text
56
- trackid_major/samples_textbook-meta-20250318_part-67d966ae3a94-000008.jsonl filter=lfs diff=lfs merge=lfs -text
57
- trackid_major/samples_textbook-meta-20250318_part-67d966ae3a94-000007.jsonl filter=lfs diff=lfs merge=lfs -text
58
- category_10/outputs/Mathematics[[:space:]]&[[:space:]]Statistics/Mathematics[[:space:]]&[[:space:]]Statistics_001.jsonl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
  *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
  *.model filter=lfs diff=lfs merge=lfs -text
14
  *.msgpack filter=lfs diff=lfs merge=lfs -text
 
34
  *.zip filter=lfs diff=lfs merge=lfs -text
35
  *.zst filter=lfs diff=lfs merge=lfs -text
36
  *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ # Audio files - uncompressed
38
+ *.pcm filter=lfs diff=lfs merge=lfs -text
39
+ *.sam filter=lfs diff=lfs merge=lfs -text
40
+ *.raw filter=lfs diff=lfs merge=lfs -text
41
+ # Audio files - compressed
42
+ *.aac filter=lfs diff=lfs merge=lfs -text
43
+ *.flac filter=lfs diff=lfs merge=lfs -text
44
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
45
+ *.ogg filter=lfs diff=lfs merge=lfs -text
46
+ *.wav filter=lfs diff=lfs merge=lfs -text
47
+ # Image files - uncompressed
48
+ *.bmp filter=lfs diff=lfs merge=lfs -text
49
+ *.gif filter=lfs diff=lfs merge=lfs -text
50
+ *.png filter=lfs diff=lfs merge=lfs -text
51
+ *.tiff filter=lfs diff=lfs merge=lfs -text
52
+ # Image files - compressed
53
+ *.jpg filter=lfs diff=lfs merge=lfs -text
54
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
55
+ *.webp filter=lfs diff=lfs merge=lfs -text
56
+ swissProt2Text.json filter=lfs diff=lfs merge=lfs -text
57
+ seq_in_text.json filter=lfs diff=lfs merge=lfs -text
58
+ pubmed_abstract_part1.json filter=lfs diff=lfs merge=lfs -text
59
+ pubmed_abstract_part2.json filter=lfs diff=lfs merge=lfs -text
60
+ pmc_full_text.json filter=lfs diff=lfs merge=lfs -text
61
+ clean_bio.jsonl filter=lfs diff=lfs merge=lfs -text
62
+ clean_OntoProtein.jsonl filter=lfs diff=lfs merge=lfs -text
63
+ clean_pmc_full_text_small.jsonl filter=lfs diff=lfs merge=lfs -text
64
+ clean_pubmed_abstract_part1_small.jsonl filter=lfs diff=lfs merge=lfs -text
65
+ clean_pubmed_abstract_part1_small1.jsonl filter=lfs diff=lfs merge=lfs -text
66
+ clean_pubmed_abstract_part1_small1_new.jsonl filter=lfs diff=lfs merge=lfs -text
67
+ clean_pubmed_abstract_part1_small_new.jsonl filter=lfs diff=lfs merge=lfs -text
68
+ clean_seq_in_text.jsonl filter=lfs diff=lfs merge=lfs -text
69
+ clean_seq_in_text_new.jsonl filter=lfs diff=lfs merge=lfs -text
70
+ clean_swissProt2Text.jsonl filter=lfs diff=lfs merge=lfs -text
71
+ clean_swissProt2Text_new.jsonl filter=lfs diff=lfs merge=lfs -text
72
+ clean_pmc_full_text.jsonl filter=lfs diff=lfs merge=lfs -text
73
+ clean_pubmed_abstract_part1.jsonl filter=lfs diff=lfs merge=lfs -text
74
+ pmc_full_text.jsonl filter=lfs diff=lfs merge=lfs -text
75
+ cot/clean/merge_cot_combine.jsonl filter=lfs diff=lfs merge=lfs -text
76
+ instruct/alpaca-gpt4-train.jsonl filter=lfs diff=lfs merge=lfs -text
77
+ instruct/alpaca-gpt4.jsonl filter=lfs diff=lfs merge=lfs -text
78
+ instruct/cot.jsonl filter=lfs diff=lfs merge=lfs -text
79
+ nan/code/clean-train-00000-of-00011.jsonl filter=lfs diff=lfs merge=lfs -text
80
+ nan/code/train-00000-of-00011.jsonl filter=lfs diff=lfs merge=lfs -text
81
+ nan/math_thinking/all/default-00000-of-00010.jsonl filter=lfs diff=lfs merge=lfs -text
82
+ nan/math_thinking/all/default-00001-of-00010.jsonl filter=lfs diff=lfs merge=lfs -text
83
+ nan/math_thinking/all/default-00002-of-00010.jsonl filter=lfs diff=lfs merge=lfs -text
84
+ nan/math_thinking/all/default-00003-of-00010.jsonl filter=lfs diff=lfs merge=lfs -text
85
+ nan/math_thinking/all/new-default-00000-of-00010.jsonl filter=lfs diff=lfs merge=lfs -text
86
+ nan/math_thinking/all/new-default-00001-of-00010.jsonl filter=lfs diff=lfs merge=lfs -text
87
+ nan/math_thinking/all/new-default-00002-of-00010.jsonl filter=lfs diff=lfs merge=lfs -text
88
+ nan/math_thinking/all/new-default-00003-of-00010.jsonl filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
clean_OntoProtein.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2a774e9779de31aed3de39f5b8daa8227180ac9a83e093c7cc5af0b43a46d88
3
+ size 509349450
clean_bio.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6e8972be9428d080d7dee67fdfb54a173d46be06ef39e8f3cb8c30b80624c3f
3
+ size 321426895
clean_pmc_full_text.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67fa2ba6fea3471445e5e96abe1f27651ef200a15a061cae1dc8180791158898
3
+ size 11531546153
clean_pmc_full_text_small.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00f79469d0af7ea7ddc348049cc5d4c501218411b0ab7f2da9f84b3db4b87819
3
+ size 1152340587
clean_pubmed_abstract_part1.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdb1f1c8e104abd3dae2e8228251b88d42dcde8143828007a6caecb4d4da761d
3
+ size 12203651957
clean_pubmed_abstract_part1_small.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f04e9be363ccf8257919ca94fbc1af1062afe07a70bb61a00fbc6c80b61be4b
3
+ size 1220153817
clean_pubmed_abstract_part1_small1.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f0952b3cb5d0e625c158a47dab22e54f749164ce17b261ae3f3dbf313d863c8
3
+ size 1158716536
clean_pubmed_abstract_part1_small1_new.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7c894971c8fd33ea9224d90c47c743ddca3820a0d9048d42685c056b20956fc
3
+ size 1145336156
clean_pubmed_abstract_part1_small_new.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b34f9e6f0527222c5f233cbcae714f2d887b27a4dc7604ca2ed812bde5d9ab2d
3
+ size 1206069461
clean_seq_in_text.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9edd45eb2c80bc5f5789c75e95d285619dffd5c74e44c5ef6f1a45bb95d8a280
3
+ size 182927858
clean_seq_in_text_new.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6bef7cf0f553073060716f7cbc0c14e690d0d6ee9b0a5fe9d7b23bb79ae401a
3
+ size 149175905
clean_swissProt2Text.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6734c083fb5f42aade27402b7e873425b434c77d4109b5596da8f1ce44216bb
3
+ size 912854831
clean_swissProt2Text_new.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1eb6aff31c7dbc6602156ad71ac07b8847c538dd076d0353a81158d534e0b261
3
+ size 906607824
cot/.gitattributes ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mds filter=lfs diff=lfs merge=lfs -text
13
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
14
+ *.model filter=lfs diff=lfs merge=lfs -text
15
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
16
+ *.npy filter=lfs diff=lfs merge=lfs -text
17
+ *.npz filter=lfs diff=lfs merge=lfs -text
18
+ *.onnx filter=lfs diff=lfs merge=lfs -text
19
+ *.ot filter=lfs diff=lfs merge=lfs -text
20
+ *.parquet filter=lfs diff=lfs merge=lfs -text
21
+ *.pb filter=lfs diff=lfs merge=lfs -text
22
+ *.pickle filter=lfs diff=lfs merge=lfs -text
23
+ *.pkl filter=lfs diff=lfs merge=lfs -text
24
+ *.pt filter=lfs diff=lfs merge=lfs -text
25
+ *.pth filter=lfs diff=lfs merge=lfs -text
26
+ *.rar filter=lfs diff=lfs merge=lfs -text
27
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
28
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
30
+ *.tar filter=lfs diff=lfs merge=lfs -text
31
+ *.tflite filter=lfs diff=lfs merge=lfs -text
32
+ *.tgz filter=lfs diff=lfs merge=lfs -text
33
+ *.wasm filter=lfs diff=lfs merge=lfs -text
34
+ *.xz filter=lfs diff=lfs merge=lfs -text
35
+ *.zip filter=lfs diff=lfs merge=lfs -text
36
+ *.zst filter=lfs diff=lfs merge=lfs -text
37
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
38
+ # Audio files - uncompressed
39
+ *.pcm filter=lfs diff=lfs merge=lfs -text
40
+ *.sam filter=lfs diff=lfs merge=lfs -text
41
+ *.raw filter=lfs diff=lfs merge=lfs -text
42
+ # Audio files - compressed
43
+ *.aac filter=lfs diff=lfs merge=lfs -text
44
+ *.flac filter=lfs diff=lfs merge=lfs -text
45
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
46
+ *.ogg filter=lfs diff=lfs merge=lfs -text
47
+ *.wav filter=lfs diff=lfs merge=lfs -text
48
+ # Image files - uncompressed
49
+ *.bmp filter=lfs diff=lfs merge=lfs -text
50
+ *.gif filter=lfs diff=lfs merge=lfs -text
51
+ *.png filter=lfs diff=lfs merge=lfs -text
52
+ *.tiff filter=lfs diff=lfs merge=lfs -text
53
+ # Image files - compressed
54
+ *.jpg filter=lfs diff=lfs merge=lfs -text
55
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
56
+ *.webp filter=lfs diff=lfs merge=lfs -text
57
+ # Video files - compressed
58
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
59
+ *.webm filter=lfs diff=lfs merge=lfs -text
cot/.hfd/aria2c_urls.txt ADDED
File without changes
cot/.hfd/last_download_command ADDED
@@ -0,0 +1 @@
 
 
1
+ REPO_ID=open-r1/Mixture-of-Thoughts TOOL=aria2c INCLUDE_PATTERNS= EXCLUDE_PATTERNS= DATASET=1 HF_USERNAME= HF_TOKEN= HF_ENDPOINT=https://hf-mirror.com REVISION=main
cot/.hfd/repo_metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_id":"6820fb77b82e61bb50999662","id":"open-r1/Mixture-of-Thoughts","author":"open-r1","sha":"e55fa28006c0d0ec60fb3547520f775dd42d02cd","lastModified":"2025-05-26T15:25:56.000Z","private":false,"gated":false,"disabled":false,"tags":["task_categories:text-generation","language:en","size_categories:100K<n<1M","format:parquet","modality:text","library:datasets","library:dask","library:mlcroissant","library:polars","arxiv:2504.21318","arxiv:2505.00949","region:us"],"citation":null,"description":"\n\n\n\t\n\t\t\n\t\tDataset summary\n\t\n\nMixture-of-Thoughts is a curated dataset of 350k verified reasoning traces distilled from DeepSeek-R1. The dataset spans tasks in mathematics, coding, and science, and is designed to teach language models to reason step-by-step. It was used in the Open R1 project to train OpenR1-Distill-7B, an SFT model that replicates the reasoning capabilities of deepseek-ai/DeepSeek-R1-Distill-Qwen-7B from the same base model.\nTo load the dataset, run:\nfrom datasets import… See the full description on the dataset page: 
https://huggingface.co/datasets/open-r1/Mixture-of-Thoughts.","downloads":29078,"likes":205,"cardData":{"dataset_info":[{"config_name":"all","features":[{"name":"messages","list":[{"name":"content","dtype":"string"},{"name":"role","dtype":"string"}]},{"name":"num_tokens","dtype":"int64"},{"name":"source","dtype":"string"}],"splits":[{"name":"train","num_bytes":7062819826.825458,"num_examples":349317}],"download_size":3077653717,"dataset_size":7062819826.825458},{"config_name":"code","features":[{"name":"messages","list":[{"name":"content","dtype":"string"},{"name":"role","dtype":"string"}]},{"name":"num_tokens","dtype":"int64"},{"name":"source","dtype":"string"}],"splits":[{"name":"train","num_bytes":3872656251.3167396,"num_examples":83070}],"download_size":1613338604,"dataset_size":3872656251.3167396},{"config_name":"math","features":[{"name":"messages","list":[{"name":"content","dtype":"string"},{"name":"role","dtype":"string"}]},{"name":"num_tokens","dtype":"int64"},{"name":"source","dtype":"string"}],"splits":[{"name":"train","num_bytes":1599028646,"num_examples":93733}],"download_size":704448153,"dataset_size":1599028646},{"config_name":"science","features":[{"name":"messages","list":[{"name":"content","dtype":"string"},{"name":"role","dtype":"string"}]},{"name":"num_tokens","dtype":"int64"},{"name":"source","dtype":"string"}],"splits":[{"name":"train","num_bytes":1590765326,"num_examples":172514}],"download_size":674333812,"dataset_size":1590765326}],"configs":[{"config_name":"all","data_files":[{"split":"train","path":"all/train-*"}]},{"config_name":"code","data_files":[{"split":"train","path":"code/train-*"}]},{"config_name":"math","data_files":[{"split":"train","path":"math/train-*"}]},{"config_name":"science","data_files":[{"split":"train","path":"science/train-*"}]}],"task_categories":["text-generation"],"language":["en"],"pretty_name":"Mixture of 
Thoughts","size_categories":["100K<n<1M"]},"siblings":[{"rfilename":".gitattributes"},{"rfilename":"README.md"},{"rfilename":"all/train-00000-of-00015.parquet"},{"rfilename":"all/train-00001-of-00015.parquet"},{"rfilename":"all/train-00002-of-00015.parquet"},{"rfilename":"all/train-00003-of-00015.parquet"},{"rfilename":"all/train-00004-of-00015.parquet"},{"rfilename":"all/train-00005-of-00015.parquet"},{"rfilename":"all/train-00006-of-00015.parquet"},{"rfilename":"all/train-00007-of-00015.parquet"},{"rfilename":"all/train-00008-of-00015.parquet"},{"rfilename":"all/train-00009-of-00015.parquet"},{"rfilename":"all/train-00010-of-00015.parquet"},{"rfilename":"all/train-00011-of-00015.parquet"},{"rfilename":"all/train-00012-of-00015.parquet"},{"rfilename":"all/train-00013-of-00015.parquet"},{"rfilename":"all/train-00014-of-00015.parquet"},{"rfilename":"code/train-00000-of-00008.parquet"},{"rfilename":"code/train-00001-of-00008.parquet"},{"rfilename":"code/train-00002-of-00008.parquet"},{"rfilename":"code/train-00003-of-00008.parquet"},{"rfilename":"code/train-00004-of-00008.parquet"},{"rfilename":"code/train-00005-of-00008.parquet"},{"rfilename":"code/train-00006-of-00008.parquet"},{"rfilename":"code/train-00007-of-00008.parquet"},{"rfilename":"code_mix.png"},{"rfilename":"data_mix.png"},{"rfilename":"math/train-00000-of-00004.parquet"},{"rfilename":"math/train-00001-of-00004.parquet"},{"rfilename":"math/train-00002-of-00004.parquet"},{"rfilename":"math/train-00003-of-00004.parquet"},{"rfilename":"math_mix.png"},{"rfilename":"mot-thumbnail.png"},{"rfilename":"science/train-00000-of-00004.parquet"},{"rfilename":"science/train-00001-of-00004.parquet"},{"rfilename":"science/train-00002-of-00004.parquet"},{"rfilename":"science/train-00003-of-00004.parquet"}],"createdAt":"2025-05-11T19:33:11.000Z","usedStorage":32745570895}
cot/README.md ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ dataset_info:
3
+ - config_name: all
4
+ features:
5
+ - name: messages
6
+ list:
7
+ - name: content
8
+ dtype: string
9
+ - name: role
10
+ dtype: string
11
+ - name: num_tokens
12
+ dtype: int64
13
+ - name: source
14
+ dtype: string
15
+ splits:
16
+ - name: train
17
+ num_bytes: 7062819826.825458
18
+ num_examples: 349317
19
+ download_size: 3077653717
20
+ dataset_size: 7062819826.825458
21
+ - config_name: code
22
+ features:
23
+ - name: messages
24
+ list:
25
+ - name: content
26
+ dtype: string
27
+ - name: role
28
+ dtype: string
29
+ - name: num_tokens
30
+ dtype: int64
31
+ - name: source
32
+ dtype: string
33
+ splits:
34
+ - name: train
35
+ num_bytes: 3872656251.3167396
36
+ num_examples: 83070
37
+ download_size: 1613338604
38
+ dataset_size: 3872656251.3167396
39
+ - config_name: math
40
+ features:
41
+ - name: messages
42
+ list:
43
+ - name: content
44
+ dtype: string
45
+ - name: role
46
+ dtype: string
47
+ - name: num_tokens
48
+ dtype: int64
49
+ - name: source
50
+ dtype: string
51
+ splits:
52
+ - name: train
53
+ num_bytes: 1599028646
54
+ num_examples: 93733
55
+ download_size: 704448153
56
+ dataset_size: 1599028646
57
+ - config_name: science
58
+ features:
59
+ - name: messages
60
+ list:
61
+ - name: content
62
+ dtype: string
63
+ - name: role
64
+ dtype: string
65
+ - name: num_tokens
66
+ dtype: int64
67
+ - name: source
68
+ dtype: string
69
+ splits:
70
+ - name: train
71
+ num_bytes: 1590765326
72
+ num_examples: 172514
73
+ download_size: 674333812
74
+ dataset_size: 1590765326
75
+ configs:
76
+ - config_name: all
77
+ data_files:
78
+ - split: train
79
+ path: all/train-*
80
+ - config_name: code
81
+ data_files:
82
+ - split: train
83
+ path: code/train-*
84
+ - config_name: math
85
+ data_files:
86
+ - split: train
87
+ path: math/train-*
88
+ - config_name: science
89
+ data_files:
90
+ - split: train
91
+ path: science/train-*
92
+ task_categories:
93
+ - text-generation
94
+ language:
95
+ - en
96
+ pretty_name: Mixture of Thoughts
97
+ size_categories:
98
+ - 100K<n<1M
99
+ ---
100
+
101
+ <img src="mot-thumbnail.png" alt="Centered Image" style="display: block; margin: 0 auto;" width="500">
102
+
103
+ # Dataset summary
104
+
105
+ Mixture-of-Thoughts is a curated dataset of 350k verified reasoning traces distilled from [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1). The dataset spans tasks in mathematics, coding, and science, and is designed to teach language models to reason step-by-step. It was used in the Open R1 project to train [OpenR1-Distill-7B](https://huggingface.co/open-r1/OpenR1-Distill-7B), an SFT model that replicates the reasoning capabilities of [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) from the same base model.
106
+
107
+ To load the dataset, run:
108
+
109
+ ```python
110
+ from datasets import load_dataset
111
+
112
+ dataset = load_dataset("open-r1/Mixture-of-Thoughts", "all", split="train")
113
+
114
+ # Load a specific domain
115
+ dataset_math = load_dataset("open-r1/Mixture-of-Thoughts", "math", split="train")
116
+ ```
117
+
118
+ ## Dataset composition
119
+
120
+ Mixture-of-Thoughts is composed of three domains: math, code, and science. Each domain contains reasoning traces that are designed to teach language models to reason step-by-step. The dataset is structured as follows:
121
+
122
+ - **math**: 93.7k reasoning traces for mathematical problems, sourced from the `default` subset of [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k)
123
+ - **code**: 83.1k reasoning traces for competitive programming problems in Python and C++, sourced from the `solutions` and `solutions_w_editorials` subsets of [open-r1/codeforces-cots](https://huggingface.co/datasets/open-r1/codeforces-cots)
124
+ - **science**: 173k reasoning traces for scientific problems, sourced from the `science` subset of [nvidia/Llama-Nemotron-Post-Training-Dataset](https://huggingface.co/datasets/nvidia/Llama-Nemotron-Post-Training-Dataset)
125
+ - **all**: Contains all reasoning traces from the three domains, for a total of 350k traces.
126
+
127
+ ## Curation methodology
128
+
129
+ To optimise the data mixture, we followed the same methodology described in the [Phi-4-reasoning tech report](https://huggingface.co/papers/2504.21318), namely that mixtures can be optimised independently per domain, and then combined into a single dataset. For each ablation, we evaluate on AIME 2024, GPQA Diamond, and LiveCodeBench v4 every epoch and take the best performing model checkpoint. The figure below shows the results from post-training [open-r1/Qwen2.5-Math-7B-RoPE-300k](https://huggingface.co/open-r1/Qwen2.5-Math-7B-RoPE-300k) on each individual domain compared to the final mixture:
130
+
131
+ <img src="data_mix.png" alt="Centered Image" style="display: block; margin: 0 auto;">
132
+
133
+ Overall, we find that training on all domains simultaneously yields the best results. See the subsections below for more details on optimising the data mixture per domain.
134
+
135
+ > [!NOTE]
136
+ > We use LiveCodeBench v4 to accelerate evaluation during our ablations as it contains around half the problems of v5, yet is still representative of the full benchmark.
137
+
138
+ ### Code
139
+
140
+ During the development of [open-r1/OlympicCoder-7B](https://huggingface.co/open-r1/OlympicCoder-7B), we observed that generating R1 reasoning traces in C++ produced better results on the challenging [IOI 2024 benchmark](https://github.com/huggingface/ioi), while Python traces produced better results on LiveCodeBench (a Python-only benchmark). To optimise the data mixture, we therefore used a mix of C++ and Python traces sourced from the following subsets of [open-r1/codeforces-cots](https://huggingface.co/datasets/open-r1/codeforces-cots):
141
+
142
+ - `solutions`: we prompt R1 to solve the problem and produce code in C++.
143
+ - `solutions_py`: same as `solutions`, but with R1 prompted to produce code in Python.
144
+ - `solutions_w_editorials`: we prompt R1 to solve the problem and produce code, but also provide it with a human-written solution.
145
+ - `solutions_w_editorials_py`: same as `solutions_w_editorials`, but with R1 prompted to produce code in Python.
146
+
147
+ The figure below shows the evolution of our ablations on these subsets, using [Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) as the base model:
148
+
149
+ <img src="code_mix.png" alt="Centered Image" style="display: block; margin: 0 auto;">
150
+
151
+ The individual experiments correspond to the following:
152
+
153
+ * **exp1 - exp3:** scaling the learning rate on the `solutions` subset from 1e-5 to 2e-5 and 4e-5, respectively.
154
+ * **exp4 - exp5:** measuring the impact of training on the `solutions_w_editorials` subset vs the combined `solutions` and `solutions_w_editorials` subsets.
155
+ * **exp6 - exp9:** measuring the impact of blending in Python traces from the `solutions_py` and `solutions_w_editorials_py` subsets. exp6 combines the `solutions_w_editorials` and `solutions_w_editorials_py` subsets, while exp7 combines the `solutions` and `solutions_py` subsets. Finally, exp8 combines all four subsets.
156
+
157
+ We found that combining all subsets of C++ and Python traces yielded the best results on LiveCodeBench. We also found that using this data mixture to fine-tune [open-r1/Qwen2.5-Coder-7B-RoPE-300k](https://huggingface.co/open-r1/Qwen2.5-Coder-7B-RoPE-300k) led to comparable performance improvements, which shows the effectiveness of our curation strategy.
158
+
159
+ ### Math
160
+
161
+ For the math domain, we mostly focused on comparing the `default` and `extended` subsets of [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k). The `default` subset contains 93.7k reasoning traces, while the `extended` subset contains an additional 131k traces, containing simpler problems than the `default` subset. The figure below shows performance on each subset, using [open-r1/Qwen2.5-Math-7B-RoPE-300k](https://huggingface.co/open-r1/Qwen2.5-Math-7B-RoPE-300k) as the base model:
162
+
163
+ <img src="math_mix.png" alt="Centered Image" style="display: block; margin: 0 auto;">
164
+
165
+ Overall, we found that training on the `default` subset yielded better results than training on the `extended` subset, and that training on both subsets together yielded the best results. Nevertheless, we opted to use the `default` subset only for the final mixture, as including both would have led to a significant increase in the size of the dataset, for a modest improvement in performance.
166
+
167
+ ### Science
168
+
169
+ For the science domain, we used the `science` subset of [nvidia/Llama-Nemotron-Post-Training-Dataset](https://huggingface.co/datasets/nvidia/Llama-Nemotron-Post-Training-Dataset/viewer/SFT/science), which contains 483k reasoning traces. However, we found that the subset was too large to be used in its entirety, as it would have led to a significant increase in the size of the dataset. Instead, we selected the subset of traces where no Qwen models were used for prompt pre-processing--see this [discussion](https://huggingface.co/datasets/nvidia/Llama-Nemotron-Post-Training-Dataset/discussions/6) for more details. The result was 173k reasoning traces, which we used in the final mixture after ablating on the learning rate.
170
+
171
+ ## Citation
172
+
173
+ If you find this dataset useful in your own work, please consider citing it as follows, together with the source of the specific domain you are using:
174
+
175
+ ```bibtex
176
+ @misc{openr1,
177
+ title = {Open R1: A fully open reproduction of DeepSeek-R1},
178
+ url = {https://github.com/huggingface/open-r1},
179
+ author = {Hugging Face},
180
+ month = {January},
181
+ year = {2025}
182
+ }
183
+ ```
184
+
185
+ **open-r1/codeforces-cots**
186
+
187
+ ```bibtex
188
+ @misc{penedo2025codeforces,
189
+ title={CodeForces CoTs},
190
+ author={Guilherme Penedo and Anton Lozhkov and Hynek Kydlíček and Loubna Ben Allal and Edward Beeching and Agustín Piqueres Lajarín and Quentin Gallouédec and Nathan Habib and Lewis Tunstall and Leandro von Werra},
191
+ year={2025},
192
+ publisher = {Hugging Face},
193
+ journal = {Hugging Face repository},
194
+ howpublished = {\url{https://huggingface.co/datasets/open-r1/codeforces-cots}}
195
+ }
196
+ ```
197
+
198
+ **open-r1/OpenR1-Math-220k**
199
+
200
+ ```bibtex
201
+ @misc{lozhkov2025openr1math220k,
202
+ title={OpenR1-Math-220k},
203
+ author={Anton Lozhkov and Hynek Kydlíček and Loubna Ben Allal and Guilherme Penedo and Edward Beeching and Quentin Gallouédec and Nathan Habib and Lewis Tunstall and Leandro von Werra},
204
+ year={2025},
205
+ publisher = {Hugging Face},
206
+ journal = {Hugging Face repository},
207
+ howpublished = {\url{https://huggingface.co/datasets/open-r1/OpenR1-Math-220k}}
208
+ }
209
+ ```
210
+
211
+ **nvidia/Llama-Nemotron-Post-Training-Dataset**
212
+
213
+ ```bibtex
214
+ @misc{bercovich2025llamanemotronefficientreasoningmodels,
215
+ title={Llama-Nemotron: Efficient Reasoning Models},
216
+ author={Akhiad Bercovich and Itay Levy and Izik Golan and Mohammad Dabbah and Ran El-Yaniv and Omri Puny and Ido Galil and Zach Moshe and Tomer Ronen and Najeeb Nabwani and Ido Shahaf and Oren Tropp and Ehud Karpas and Ran Zilberstein and Jiaqi Zeng and Soumye Singhal and Alexander Bukharin and Yian Zhang and Tugrul Konuk and Gerald Shen and Ameya Sunil Mahabaleshwarkar and Bilal Kartal and Yoshi Suhara and Olivier Delalleau and Zijia Chen and Zhilin Wang and David Mosallanezhad and Adi Renduchintala and Haifeng Qian and Dima Rekesh and Fei Jia and Somshubra Majumdar and Vahid Noroozi and Wasi Uddin Ahmad and Sean Narenthiran and Aleksander Ficek and Mehrzad Samadi and Jocelyn Huang and Siddhartha Jain and Igor Gitman and Ivan Moshkov and Wei Du and Shubham Toshniwal and George Armstrong and Branislav Kisacanin and Matvei Novikov and Daria Gitman and Evelina Bakhturina and Jane Polak Scowcroft and John Kamalu and Dan Su and Kezhi Kong and Markus Kliegl and Rabeeh Karimi and Ying Lin and Sanjeev Satheesh and Jupinder Parmar and Pritam Gundecha and Brandon Norick and Joseph Jennings and Shrimai Prabhumoye and Syeda Nahida Akter and Mostofa Patwary and Abhinav Khattar and Deepak Narayanan and Roger Waleffe and Jimmy Zhang and Bor-Yiing Su and Guyue Huang and Terry Kong and Parth Chadha and Sahil Jain and Christine Harvey and Elad Segal and Jining Huang and Sergey Kashirsky and Robert McQueen and Izzy Putterman and George Lam and Arun Venkatesan and Sherry Wu and Vinh Nguyen and Manoj Kilaru and Andrew Wang and Anna Warno and Abhilash Somasamudramath and Sandip Bhaskar and Maka Dong and Nave Assaf and Shahar Mor and Omer Ullman Argov and Scot Junkin and Oleksandr Romanenko and Pedro Larroy and Monika Katariya and Marco Rovinelli and Viji Balas and Nicholas Edelman and Anahita Bhiwandiwalla and Muthu Subramaniam and Smita Ithape and Karthik Ramamoorthy and Yuting Wu and Suguna Varshini Velury and Omri Almog and Joyjit Daw and Denys Fridman and Erick Galinkin and 
Michael Evans and Katherine Luna and Leon Derczynski and Nikki Pope and Eileen Long and Seth Schneider and Guillermo Siman and Tomasz Grzegorzek and Pablo Ribalta and Monika Katariya and Joey Conway and Trisha Saar and Ann Guan and Krzysztof Pawelec and Shyamala Prayaga and Oleksii Kuchaiev and Boris Ginsburg and Oluwatobi Olabiyi and Kari Briski and Jonathan Cohen and Bryan Catanzaro and Jonah Alben and Yonatan Geifman and Eric Chung and Chris Alexiuk},
217
+ year={2025},
218
+ eprint={2505.00949},
219
+ archivePrefix={arXiv},
220
+ primaryClass={cs.CL},
221
+ url={https://arxiv.org/abs/2505.00949},
222
+ }
223
+ ```
cot/all/train-00000-of-00015.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e6a5c2127efe2fef214b755d907ea3d881dbc09990848bd5ae7d4b7afba06d0
3
+ size 204656813
cot/all/train-00001-of-00015.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:702e4175a4b4220d314d1b635513364beb5cf8d3ee4ec8faece8786b5c3394f9
3
+ size 205444134
cot/all/train-00002-of-00015.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6d7046bc06bc18788dec08ec044623d898f7b1f4d2428e06615bb9a6bfea3f2
3
+ size 202621347
cot/all/train-00003-of-00015.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:687b3f24f2034e7cf9298eed7111a2593b63eb63e76ec506d79e272c2c75c14a
3
+ size 205185437
cot/all/train-00004-of-00015.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5191335e4b0fc1ad995c8d30096e26f01c77e269e8c15fc805246ba8b52cde65
3
+ size 203735753
cot/all/train-00005-of-00015.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e393d82a25f339e69cb01e6760f88b89e61dcc9a41298a8c7eb351091a77ad5
3
+ size 204780450
cot/all/train-00006-of-00015.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20f167d206c77168ed7a3ec7fb7d4c70cc0bcbee250dad96627fae451585c92a
3
+ size 205663625
cot/all/train-00007-of-00015.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4b4a73ef6f2e2f4e7a570f56b3577d206848cb0d9f24ef72a52eceebdc4b90d
3
+ size 203420571
cot/all/train-00008-of-00015.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c502620a48c1acf50f0416aaa5d56e498571a40ec64319ac43fbfb7a5ab74027
3
+ size 206595941
cot/all/train-00009-of-00015.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7415abc91fd63f74f9ac0b3f0c963c19e3b1faa6f1ffbb313967f25a63a2448a
3
+ size 206093749
cot/all/train-00010-of-00015.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2c5253f58a080fdffdf76a58e12e311aa834c1ee48593a397da164c2cfbba6d
3
+ size 208337232
cot/all/train-00011-of-00015.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33c543ff87d9bdc776ab61acb2c87ea477008fd33b0767f7a795f9b92a7f6275
3
+ size 206364022
cot/all/train-00012-of-00015.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55fc000740e1c96424944baaf4289ebc2ebc763121524d59183a6ad4a4020e8a
3
+ size 203769641
cot/all/train-00013-of-00015.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:610d056bf55c92d92b211462fe13922dc42ca5b24a34fa5693d5a40752632585
3
+ size 206192356
cot/all/train-00014-of-00015.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f425b05c385b7ab75f242544bf0d7397f91c520f4aeea6c63f10d58d9fafa267
3
+ size 204792646
cot/clean/merge_cot.jsonl ADDED
File without changes
cot/clean/merge_cot_combine.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3da5ed8c8804bcabf88b28fb8300d74a0a90cd98746d4cdb08f0d78e47b3f586
3
+ size 2509318757
cot/code/train-00000-of-00008.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad5574004721c7ee4e0aebcdce2c5c9c779915b1254a503c13515ea1c6d571bd
3
+ size 202072608
cot/code/train-00001-of-00008.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08be1dfa8b1b117861b869f2fce46310d92cdb6cc29c9e9f69f27d029a5cf3e7
3
+ size 200136663
cot/code/train-00002-of-00008.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f45bd763555430ae4affe79012462c2b4ee64b5711bd1026856fd768910f0453
3
+ size 201219969
cot/code/train-00003-of-00008.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9d0a15ff30767ba5232ae842e31cd3f10d75da99dd6e140a11b4c8a0ddc3a7d
3
+ size 202216604
cot/code/train-00004-of-00008.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17ebd068f395a96a9b76b4edb67d3e9d87e5fedbbe2af87a6b45b9a0a9fdd12e
3
+ size 201298095
cot/code/train-00005-of-00008.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d268ae42d016183cadf38e305b187fa675b97c94ad6f875a5c2e681b66f632a8
3
+ size 201348265
cot/code/train-00006-of-00008.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3dfc7b7a47b50a2370438d0b4a2aaf19aa1ec72a35c6a3de13c664a52e3d256
3
+ size 203350744
cot/code/train-00007-of-00008.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63c16aa24dac26ad15faa400fad26bdf2f0efcb07887d56dfdf28bf0deea04c0
3
+ size 201695656
cot/code_mix.png ADDED

Git LFS Details

  • SHA256: 9e03f88380ec55421e44946d75cdb0a522784e2e34040bff630469a88203862b
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
cot/data_mix.png ADDED

Git LFS Details

  • SHA256: 95283eb012591b6c3fc50d61913a0a7de4dbf3080008d50eab20d3c4abc75635
  • Pointer size: 131 Bytes
  • Size of remote file: 246 kB
cot/math/train-00000-of-00004.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3875a97e04d614a4e45929ba9321ec492d1aebecdc35c31faf8a2293955c480
3
+ size 175553215
cot/math/train-00001-of-00004.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d152451dfeef388b6a41c99a92c900f5b85eabc90180fadaf85e9fcc56d64cb
3
+ size 176620120
cot/math/train-00002-of-00004.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9343a9833f6fc910891c0984bbe2804214f6284e7fbee9a91b9d490d6a724381
3
+ size 176512132