lll2343 committed on
Commit
622e120
·
verified ·
1 Parent(s): 7470dba

Upload folder using huggingface_hub

Browse files
Files changed (43) hide show
  1. .gitattributes +26 -0
  2. hf_data/LaMini_instruction_train.jsonl +3 -0
  3. hf_data/ScaleQuest_Code_train_157k.jsonl +3 -0
  4. hf_data/ScaleQuest_Math_train_1m.jsonl +3 -0
  5. hf_data/Table-GPT_train_13k.jsonl +3 -0
  6. hf_data/infinity_instruct_3M_train.jsonl +3 -0
  7. hf_data/infinity_instruct_7M_core_train_1_5m.jsonl +3 -0
  8. hf_data/infinity_instruct_gen_train_1400k.jsonl +3 -0
  9. hf_data/infinity_instruct_train_7m.jsonl +3 -0
  10. hf_data/opc-sft-stage1-filtered_infinity_instruct-1030k.jsonl +3 -0
  11. hf_data/opc-sft-stage1-largescale_diverse_instruct-2513k.jsonl +3 -0
  12. hf_data/opc-sft-stage1-realuser_instruct-675k.jsonl +3 -0
  13. hf_data/opc-sft-stage2-436k.jsonl +3 -0
  14. hf_data/opencodeinstruct_train_2m.jsonl +3 -0
  15. hf_data/opencodeinstruct_train_5m.jsonl +3 -0
  16. hf_data/opencodeinstruct_train_filter_score_eq_1.jsonl +3 -0
  17. hf_data/opencodeinstruct_train_filter_score_ge_08.jsonl +3 -0
  18. hf_data/opencodeinstruct_train_filter_score_ge_08_llm_judgement_avg_score_ge_3.jsonl +3 -0
  19. hf_data/openmathinstruct2_train.jsonl +3 -0
  20. hf_data/openmathinstruct2_train_1m.jsonl +3 -0
  21. hf_data/openmathinstruct2_train_2m.jsonl +3 -0
  22. hf_data/openmathinstruct2_train_5m.jsonl +3 -0
  23. hf_data/sciRIFF_4096_70k.jsonl +3 -0
  24. hf_data/script/processor_code_opc_sft.py +29 -0
  25. hf_data/script/processor_code_opc_sft_stage_1.py +30 -0
  26. hf_data/script/processor_magpie.py +23 -0
  27. hf_data/script/processor_open_code_instruct_filter.py +36 -0
  28. hf_data/script/processor_scalequest_code.py +25 -0
  29. hf_data/script/processor_scalequest_math.py +25 -0
  30. hf_data/script/processor_sciriff.py +25 -0
  31. hf_data/script/processor_smoltalk.py +38 -0
  32. hf_data/script/processor_table_gpt.py +28 -0
  33. hf_data/script/processor_tulu.py +25 -0
  34. hf_data/script/processor_tulu_mixture.py +28 -0
  35. hf_data/script/processpor_code.py +23 -0
  36. hf_data/script/processpor_infinity.py +22 -0
  37. hf_data/script/processpor_infinity_instruct.py +25 -0
  38. hf_data/script/processpor_lamini.py +22 -0
  39. hf_data/script/processpor_math.py +22 -0
  40. hf_data/smoltalk_1100k.jsonl +3 -0
  41. hf_data/tulu-3-sft-mixture_train_939k.jsonl +3 -0
  42. hf_data/tulu-3-sft-olmo-2-mixture_train_939k.jsonl +3 -0
  43. hf_data/tulu-3-sft-personas-instruction-following_30k.jsonl +3 -0
.gitattributes CHANGED
@@ -34,3 +34,29 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  qwen_sft_v1114/training_log.txt filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  qwen_sft_v1114/training_log.txt filter=lfs diff=lfs merge=lfs -text
37
+ hf_data/LaMini_instruction_train.jsonl filter=lfs diff=lfs merge=lfs -text
38
+ hf_data/ScaleQuest_Code_train_157k.jsonl filter=lfs diff=lfs merge=lfs -text
39
+ hf_data/ScaleQuest_Math_train_1m.jsonl filter=lfs diff=lfs merge=lfs -text
40
+ hf_data/Table-GPT_train_13k.jsonl filter=lfs diff=lfs merge=lfs -text
41
+ hf_data/infinity_instruct_3M_train.jsonl filter=lfs diff=lfs merge=lfs -text
42
+ hf_data/infinity_instruct_7M_core_train_1_5m.jsonl filter=lfs diff=lfs merge=lfs -text
43
+ hf_data/infinity_instruct_gen_train_1400k.jsonl filter=lfs diff=lfs merge=lfs -text
44
+ hf_data/infinity_instruct_train_7m.jsonl filter=lfs diff=lfs merge=lfs -text
45
+ hf_data/opc-sft-stage1-filtered_infinity_instruct-1030k.jsonl filter=lfs diff=lfs merge=lfs -text
46
+ hf_data/opc-sft-stage1-largescale_diverse_instruct-2513k.jsonl filter=lfs diff=lfs merge=lfs -text
47
+ hf_data/opc-sft-stage1-realuser_instruct-675k.jsonl filter=lfs diff=lfs merge=lfs -text
48
+ hf_data/opc-sft-stage2-436k.jsonl filter=lfs diff=lfs merge=lfs -text
49
+ hf_data/opencodeinstruct_train_2m.jsonl filter=lfs diff=lfs merge=lfs -text
50
+ hf_data/opencodeinstruct_train_5m.jsonl filter=lfs diff=lfs merge=lfs -text
51
+ hf_data/opencodeinstruct_train_filter_score_eq_1.jsonl filter=lfs diff=lfs merge=lfs -text
52
+ hf_data/opencodeinstruct_train_filter_score_ge_08.jsonl filter=lfs diff=lfs merge=lfs -text
53
+ hf_data/opencodeinstruct_train_filter_score_ge_08_llm_judgement_avg_score_ge_3.jsonl filter=lfs diff=lfs merge=lfs -text
54
+ hf_data/openmathinstruct2_train.jsonl filter=lfs diff=lfs merge=lfs -text
55
+ hf_data/openmathinstruct2_train_1m.jsonl filter=lfs diff=lfs merge=lfs -text
56
+ hf_data/openmathinstruct2_train_2m.jsonl filter=lfs diff=lfs merge=lfs -text
57
+ hf_data/openmathinstruct2_train_5m.jsonl filter=lfs diff=lfs merge=lfs -text
58
+ hf_data/sciRIFF_4096_70k.jsonl filter=lfs diff=lfs merge=lfs -text
59
+ hf_data/smoltalk_1100k.jsonl filter=lfs diff=lfs merge=lfs -text
60
+ hf_data/tulu-3-sft-mixture_train_939k.jsonl filter=lfs diff=lfs merge=lfs -text
61
+ hf_data/tulu-3-sft-olmo-2-mixture_train_939k.jsonl filter=lfs diff=lfs merge=lfs -text
62
+ hf_data/tulu-3-sft-personas-instruction-following_30k.jsonl filter=lfs diff=lfs merge=lfs -text
hf_data/LaMini_instruction_train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a170d06468f1e0b75a93138084f28b821cdefe19da547af0bc7093be0057d2e5
3
+ size 1316278839
hf_data/ScaleQuest_Code_train_157k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:332960c2672ae70b278a3ee7dc41fe1854a113b94ff8e4d7ceab302d23304f11
3
+ size 393762917
hf_data/ScaleQuest_Math_train_1m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8504b7b492431641b23a9aa4e695572dd16836bf3156a94c21645a5361368817
3
+ size 1581059706
hf_data/Table-GPT_train_13k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36cb71a030f3a9c98fa114ee7206bcc5f28697fbcf916d0c01e36ef0242ea443
3
+ size 33778304
hf_data/infinity_instruct_3M_train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e870f4c34157f846443cb6adf423b6b183aba52268fb3b7adea9e5f899250601
3
+ size 7970107783
hf_data/infinity_instruct_7M_core_train_1_5m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2ede1f8e4a3906c394f000595416dbade9d9a28c34f80337c272fbc46ba6822
3
+ size 3174471022
hf_data/infinity_instruct_gen_train_1400k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a003195934389a860dca44da374eb580adcbdbc8763a49a3f940e4991b0fa3e
3
+ size 5782067170
hf_data/infinity_instruct_train_7m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:522be9282906a1e539e2c66e2d51f7a044ceaa102346ae402e0e6573d4a83a54
3
+ size 14312887611
hf_data/opc-sft-stage1-filtered_infinity_instruct-1030k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8a16827ffcb868bd79b97f6b12757e0ce8d505c3af65fb28c60df743527f0d1
3
+ size 2128437340
hf_data/opc-sft-stage1-largescale_diverse_instruct-2513k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:514b8adc9f28e5faf00c7ae5fca1722045526b59e7b8d13c716beb0c93c9e26e
3
+ size 6822344959
hf_data/opc-sft-stage1-realuser_instruct-675k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6b11a4f995acb07b86d6679a5aa838d7c79931b2fd4543e2cf42576161a3253
3
+ size 2325200313
hf_data/opc-sft-stage2-436k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1b8bdafd4500d8785803156056de94989af8fa0146109561af3124fab6ed04c
3
+ size 1109723658
hf_data/opencodeinstruct_train_2m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:155fd30f5e0f7d329f0e1eddddab41e0bf7fd565c86613bb3215d01a1aa2ffe5
3
+ size 3950774175
hf_data/opencodeinstruct_train_5m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff6393709448e0eb0d016ec1a2226f4a1b9e79db89f5a26b1c8f851d7adf6c52
3
+ size 10284863439
hf_data/opencodeinstruct_train_filter_score_eq_1.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6a3bf492ae82328be8ae575f84b65198a196918d513216b4b544f4ab6c07a99
3
+ size 2951101476
hf_data/opencodeinstruct_train_filter_score_ge_08.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bf3163adc2bca7d0103dcb17208a2c24d345dbcad8a85ad6e3fc975dff28927
3
+ size 4706279026
hf_data/opencodeinstruct_train_filter_score_ge_08_llm_judgement_avg_score_ge_3.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efeb82226ab27a81517d945ad7870ce77c74b06d31daed26429405ba73f3bb00
3
+ size 4671287609
hf_data/openmathinstruct2_train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e070b0f4220feb93e6646caca1b881e2b5d33afcb64da9b30d59af6784b830f
3
+ size 16798025787
hf_data/openmathinstruct2_train_1m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc79dd150daf1d376f2ce2772337d3ffa7ff9ff61d1d825e12bb5d7c21b109fc
3
+ size 1445582798
hf_data/openmathinstruct2_train_2m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36fadca11edb9e5dc3fb9f8a60194d97a4448ac302af0c53fee69605c64785be
3
+ size 2952796652
hf_data/openmathinstruct2_train_5m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba0e8cbc50a72e232a5e20731edd849c69b87ddef2f331c0ab366c955f2ac5e4
3
+ size 7012994994
hf_data/sciRIFF_4096_70k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c771bd27670ff0d9fe0c5c4efd42c1fc945c8f29338c68e955244c13a2e68b7
3
+ size 390602961
hf_data/script/processor_code_opc_sft.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://huggingface.co/datasets/BAAI/Infinity-Instruct
2
+
3
+ import json
4
+
5
+ from datasets import load_dataset
6
+ from tqdm import tqdm
7
+
8
+ subset = ['educational_instruct', 'evol_instruct', 'mceval_instruct', 'package_instruct']
9
+ output_file = "../opc-sft-stage2-436k.jsonl"
10
+ ix = 0
11
+ with open(output_file, 'w', encoding='utf-8') as f:
12
+ for sub in subset:
13
+ dataset = load_dataset('../opc-sft-stage2', sub, split='train')
14
+ print(f"Converting dataset to jsonl format {sub}")
15
+
16
+ for item in tqdm(dataset):
17
+ # print(item, item)
18
+ # break
19
+ conv = {
20
+ 'id': item['seq_id'] if 'seq_id' in item.keys() else f'{sub}-{ix}',
21
+ 'conversations': [
22
+ {'from': 'human', 'value': item['instruction']},
23
+ {'from': 'gpt', 'value': item['output']}
24
+ ]
25
+ }
26
+ ix += 1
27
+ f.write(json.dumps(conv, ensure_ascii=False) + '\n')
28
+
29
+ print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_code_opc_sft_stage_1.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert the opc-sft-stage1 subsets (local copy at ../opc-sft-stage1) to
# ShareGPT-style jsonl, one output file per subset.
# NOTE(review): the original header comment pointed at
# https://huggingface.co/datasets/BAAI/Infinity-Instruct (copy-paste
# leftover) and has been corrected.

import json

from datasets import load_dataset
from tqdm import tqdm

subset = ['filtered_infinity_instruct', 'realuser_instruct', 'largescale_diverse_instruct']
# Row counts baked into the output filenames, aligned index-for-index with
# `subset`. Fixed from ['1030k', '2510k', '676k']: the actually uploaded
# files are opc-sft-stage1-realuser_instruct-675k.jsonl and
# opc-sft-stage1-largescale_diverse_instruct-2513k.jsonl, so the old list
# had the wrong numbers paired with the wrong subsets.
cnts = ['1030k', '675k', '2513k']

for cnt, sub in zip(cnts, subset):
    dataset = load_dataset('../opc-sft-stage1', sub, split='train')
    output_file = f"../opc-sft-stage1-{sub}-{cnt}.jsonl"
    print(f"Converting dataset to jsonl format {sub}")
    ix = 0  # per-subset index, used as a fallback id
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in tqdm(dataset):
            conv = {
                # Prefer the dataset's own sequence id when present.
                'id': item['seq_id'] if 'seq_id' in item else f'{sub}-{ix}',
                'conversations': [
                    {'from': 'human', 'value': item['instruction']},
                    {'from': 'gpt', 'value': item['output']}
                ]
            }
            ix += 1
            f.write(json.dumps(conv, ensure_ascii=False) + '\n')

    print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_magpie.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

from datasets import load_dataset
from tqdm import tqdm

# Flatten the local Magpie dump into the ShareGPT-style
# {"id", "conversations"} jsonl layout used by the other hf_data files.
dataset = load_dataset('../Magpie-Qwen2.5-Pro-300K-Filtered', split='train')

print("Converting dataset to jsonl format")
output_file = "../Magpie-Qwen2_5-Pro-300K-Filtered.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for sample in tqdm(dataset):
        record = {
            'id': sample['uuid'],
            'conversations': [
                {'from': 'human', 'value': sample['instruction']},
                {'from': 'gpt', 'value': sample['response']},
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_open_code_instruct_filter.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Filter the local OpenCodeInstruct dump by unit-test pass rate and write
# the surviving rows as ShareGPT-style jsonl.

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('../OpenCodeInstruct', split='train')

print("Converting dataset to jsonl format")
output_file = "../opencodeinstruct_train_filter_score_ge_08.jsonl"
with open(output_file, 'w', encoding='utf-8') as f:
    # NOTE(review): the original also json.loads-ed item['llm_judgement']
    # for every row and averaged three sub-scores, but the result was never
    # used — and the sum counted 'requirement_conformance' twice while
    # omitting a third distinct score. That dead (and buggy) per-row work
    # is removed here; the *_llm_judgement_avg_score_ge_3 output file was
    # presumably produced by a separate variant of this script — confirm.
    for item in tqdm(dataset):
        # Keep only rows whose unit-test pass rate is at least 0.8.
        if float(item['average_test_score']) >= 0.8:
            conv = {
                'id': item['id'],
                'conversations': [
                    {'from': 'human', 'value': item['input']},
                    {'from': 'gpt', 'value': item['output']}
                ]
            }
            f.write(json.dumps(conv, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_scalequest_code.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

from datasets import load_dataset
from tqdm import tqdm

# Flatten the local ScaleQuest-Code dump into ShareGPT-style jsonl,
# assigning sequential integer ids.
dataset = load_dataset('../ScaleQuest-Code', split='train')

print("Converting dataset to jsonl format")
output_file = "../ScaleQuest_Code_train_157k.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for idx, sample in enumerate(tqdm(dataset)):
        record = {
            'id': idx,
            'conversations': [
                {'from': 'human', 'value': sample['query']},
                {'from': 'gpt', 'value': sample['response']},
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_scalequest_math.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

from datasets import load_dataset
from tqdm import tqdm

# Flatten ScaleQuest-Math (pulled from the Hub, not a local copy) into
# ShareGPT-style jsonl with sequential integer ids.
dataset = load_dataset('dyyyyyyyy/ScaleQuest-Math', split='train')

print("Converting dataset to jsonl format")
output_file = "../ScaleQuest_Math_train_1m.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for idx, sample in enumerate(tqdm(dataset)):
        record = {
            'id': idx,
            'conversations': [
                {'from': 'human', 'value': sample['query']},
                {'from': 'gpt', 'value': sample['response']},
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_sciriff.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert the local SciRIFF dump ('4096' config) to ShareGPT-style jsonl,
# keeping the dataset's own instance ids.

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('../SciRIFF', '4096', split='train')

print("Converting dataset to jsonl format")
output_file = "../sciRIFF_4096_70k.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for sample in tqdm(dataset):
        sink.write(json.dumps({
            'id': sample['_instance_id'],
            'conversations': [
                {'from': 'human', 'value': sample['input']},
                {'from': 'gpt', 'value': sample['output']},
            ],
        }, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_smoltalk.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert the local smoltalk dump ('all' config) to ShareGPT-style jsonl,
# preserving full multi-turn conversations.

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('../smoltalk/', 'all', split='train')

print("Converting dataset to jsonl format")
output_file = "../smoltalk_1100k.jsonl"

# Map chat-style role names onto the ShareGPT 'from' vocabulary.
roles = {
    'assistant': 'gpt',
    'user': 'human',
    'system': 'system'
}

with open(output_file, 'w', encoding='utf-8') as sink:
    for idx, sample in enumerate(tqdm(dataset)):
        record = {
            'id': idx,
            'conversations': [
                {'from': roles[msg['role']], 'value': msg['content']}
                for msg in sample['messages']
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_table_gpt.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert the local Table-GPT dump ('All' config) to ShareGPT-style jsonl,
# prepending a fixed table-specialist system message to every conversation.

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('../Table-GPT', 'All', split='train')

print("Converting dataset to jsonl format")
output_file = "../Table-GPT_train_13k.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for idx, sample in enumerate(tqdm(dataset)):
        record = {
            'id': idx,
            'conversations': [
                {'from': 'system', 'value': 'You are a helpful assistant that specializes in tables'},
                {'from': 'human', 'value': sample['prompt']},
                {'from': 'gpt', 'value': sample['completion']},
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_tulu.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert the local tulu-3-sft-personas-instruction-following dump to
# ShareGPT-style single-turn jsonl.

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('../tulu-3-sft-personas-instruction-following', split='train')

print("Converting dataset to jsonl format")
output_file = "../tulu-3-sft-personas-instruction-following_30k.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for sample in tqdm(dataset):
        record = {
            'id': sample['id'],
            'conversations': [
                # The prompt field is used for the human turn; the reply is
                # taken from the second chat message. NOTE(review): presumably
                # messages[0] duplicates the prompt — confirm against the
                # dataset schema.
                {'from': 'human', 'value': sample['prompt']},
                {'from': 'gpt', 'value': sample['messages'][1]['content']},
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_tulu_mixture.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert the local tulu-3-sft-mixture dump to ShareGPT-style jsonl,
# keeping only the first user/assistant exchange of each record.
# NOTE(review): the original header comment pointed at BAAI/Infinity-Instruct
# (copy-paste leftover) and has been corrected.

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('../tulu-3-sft-mixture', split='train')

print("Converting dataset to jsonl format")
output_file = "../tulu-3-sft-mixture_train_939k.jsonl"
skipped = 0
with open(output_file, 'w', encoding='utf-8') as f:
    for item in tqdm(dataset):
        messages = item['messages']
        # A record needs at least a prompt and a response. The original
        # caught IndexError and printed a bare 'e', which hid how many
        # rows were dropped; count and report them instead.
        if len(messages) < 2:
            skipped += 1
            continue
        conv = {
            'id': item['id'],
            'conversations': [
                {'from': 'human', 'value': messages[0]['content']},
                {'from': 'gpt', 'value': messages[1]['content']}
            ]
        }
        f.write(json.dumps(conv, ensure_ascii=False) + '\n')

if skipped:
    print(f"Skipped {skipped} records with fewer than 2 messages")
print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processpor_code.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

from datasets import load_dataset
from tqdm import tqdm

# Flatten nvidia/OpenCodeInstruct (pulled from the Hub) into
# ShareGPT-style jsonl, unfiltered.
dataset = load_dataset('nvidia/OpenCodeInstruct', split='train')

print("Converting dataset to jsonl format")
output_file = "opencodeinstruct_train.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for sample in tqdm(dataset):
        record = {
            'id': sample['id'],
            'conversations': [
                {'from': 'human', 'value': sample['input']},
                {'from': 'gpt', 'value': sample['output']},
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processpor_infinity.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert the local OpenMathInstruct-2 dump to ShareGPT-style jsonl.
# NOTE(review): despite the 'infinity' filename, this script reads
# OpenMathInstruct-2 and is behaviorally identical to processpor_math.py
# (same input, same output file) — one of the two is redundant.

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('../OpenMathInstruct-2', split='train')

print("Converting dataset to jsonl format")
output_file = "../openmathinstruct2_train.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for sample in tqdm(dataset):
        sink.write(json.dumps({
            'conversations': [
                {'from': 'human', 'value': sample['problem']},
                {'from': 'gpt', 'value': sample['generated_solution']},
            ],
        }, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processpor_infinity_instruct.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Dump the BAAI/Infinity-Instruct '7M_core' train split to jsonl.
# https://huggingface.co/datasets/BAAI/Infinity-Instruct

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('BAAI/Infinity-Instruct', '7M_core', split='train')

print("Converting dataset to jsonl format")
output_file = "../infinity_instruct_7M_core_train_1_5m.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    # Rows are written verbatim — no ShareGPT remapping is applied (the
    # remapping in the original existed only as commented-out code and
    # was never enabled).
    for row in tqdm(dataset):
        sink.write(json.dumps(row, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processpor_lamini.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

from datasets import load_dataset
from tqdm import tqdm

# Flatten the local LaMini-instruction dump into ShareGPT-style jsonl
# (no id field is emitted, matching the original output layout).
dataset = load_dataset('./LaMini-instruction', split='train')

print("Converting dataset to jsonl format")
output_file = "LaMini_instruction_train.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for sample in tqdm(dataset):
        sink.write(json.dumps({
            'conversations': [
                {'from': 'human', 'value': sample['instruction']},
                {'from': 'gpt', 'value': sample['response']},
            ],
        }, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processpor_math.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

from datasets import load_dataset
from tqdm import tqdm

# Convert the local OpenMathInstruct-2 dump to ShareGPT-style jsonl.
# NOTE(review): processpor_infinity.py in this folder performs the exact
# same conversion to the exact same output file — one of the two scripts
# is redundant.
dataset = load_dataset('../OpenMathInstruct-2', split='train')

print("Converting dataset to jsonl format")
output_file = "../openmathinstruct2_train.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for sample in tqdm(dataset):
        record = {
            'conversations': [
                {'from': 'human', 'value': sample['problem']},
                {'from': 'gpt', 'value': sample['generated_solution']},
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/smoltalk_1100k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:729a6f12594034fe50fd44568aac5960fd9ba9b675d5142b4619c478bdce838e
3
+ size 4169152401
hf_data/tulu-3-sft-mixture_train_939k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aef698a418fcd5aa6218bfc7f0ddd68dbfd87fbcbae3ea08dca6407503275da2
3
+ size 2463675206
hf_data/tulu-3-sft-olmo-2-mixture_train_939k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09623e387ab69c78756c356ddfb261fe736c3790123a16ee8f8635ed98f1bc26
3
+ size 2463660893
hf_data/tulu-3-sft-personas-instruction-following_30k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3131ecbd5057eb98c122507c481899c4009eaca87d97da02acce85514b6c7833
3
+ size 61667836