lll2343 committed on
Commit
f458890
·
verified ·
1 Parent(s): 29bb2fc

Delete folder hf_data with huggingface_hub

Browse files
Files changed (42) hide show
  1. hf_data/LaMini_instruction_train.jsonl +0 -3
  2. hf_data/ScaleQuest_Code_train_157k.jsonl +0 -3
  3. hf_data/ScaleQuest_Math_train_1m.jsonl +0 -3
  4. hf_data/Table-GPT_train_13k.jsonl +0 -3
  5. hf_data/infinity_instruct_3M_train.jsonl +0 -3
  6. hf_data/infinity_instruct_7M_core_train_1_5m.jsonl +0 -3
  7. hf_data/infinity_instruct_gen_train_1400k.jsonl +0 -3
  8. hf_data/infinity_instruct_train_7m.jsonl +0 -3
  9. hf_data/opc-sft-stage1-filtered_infinity_instruct-1030k.jsonl +0 -3
  10. hf_data/opc-sft-stage1-largescale_diverse_instruct-2513k.jsonl +0 -3
  11. hf_data/opc-sft-stage1-realuser_instruct-675k.jsonl +0 -3
  12. hf_data/opc-sft-stage2-436k.jsonl +0 -3
  13. hf_data/opencodeinstruct_train_2m.jsonl +0 -3
  14. hf_data/opencodeinstruct_train_5m.jsonl +0 -3
  15. hf_data/opencodeinstruct_train_filter_score_eq_1.jsonl +0 -3
  16. hf_data/opencodeinstruct_train_filter_score_ge_08.jsonl +0 -3
  17. hf_data/opencodeinstruct_train_filter_score_ge_08_llm_judgement_avg_score_ge_3.jsonl +0 -3
  18. hf_data/openmathinstruct2_train.jsonl +0 -3
  19. hf_data/openmathinstruct2_train_1m.jsonl +0 -3
  20. hf_data/openmathinstruct2_train_2m.jsonl +0 -3
  21. hf_data/openmathinstruct2_train_5m.jsonl +0 -3
  22. hf_data/sciRIFF_4096_70k.jsonl +0 -3
  23. hf_data/script/processor_code_opc_sft.py +0 -29
  24. hf_data/script/processor_code_opc_sft_stage_1.py +0 -30
  25. hf_data/script/processor_magpie.py +0 -23
  26. hf_data/script/processor_open_code_instruct_filter.py +0 -36
  27. hf_data/script/processor_scalequest_code.py +0 -25
  28. hf_data/script/processor_scalequest_math.py +0 -25
  29. hf_data/script/processor_sciriff.py +0 -25
  30. hf_data/script/processor_smoltalk.py +0 -38
  31. hf_data/script/processor_table_gpt.py +0 -28
  32. hf_data/script/processor_tulu.py +0 -25
  33. hf_data/script/processor_tulu_mixture.py +0 -28
  34. hf_data/script/processpor_code.py +0 -23
  35. hf_data/script/processpor_infinity.py +0 -22
  36. hf_data/script/processpor_infinity_instruct.py +0 -25
  37. hf_data/script/processpor_lamini.py +0 -22
  38. hf_data/script/processpor_math.py +0 -22
  39. hf_data/smoltalk_1100k.jsonl +0 -3
  40. hf_data/tulu-3-sft-mixture_train_939k.jsonl +0 -3
  41. hf_data/tulu-3-sft-olmo-2-mixture_train_939k.jsonl +0 -3
  42. hf_data/tulu-3-sft-personas-instruction-following_30k.jsonl +0 -3
hf_data/LaMini_instruction_train.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a170d06468f1e0b75a93138084f28b821cdefe19da547af0bc7093be0057d2e5
3
- size 1316278839
 
 
 
 
hf_data/ScaleQuest_Code_train_157k.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:332960c2672ae70b278a3ee7dc41fe1854a113b94ff8e4d7ceab302d23304f11
3
- size 393762917
 
 
 
 
hf_data/ScaleQuest_Math_train_1m.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8504b7b492431641b23a9aa4e695572dd16836bf3156a94c21645a5361368817
3
- size 1581059706
 
 
 
 
hf_data/Table-GPT_train_13k.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:36cb71a030f3a9c98fa114ee7206bcc5f28697fbcf916d0c01e36ef0242ea443
3
- size 33778304
 
 
 
 
hf_data/infinity_instruct_3M_train.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e870f4c34157f846443cb6adf423b6b183aba52268fb3b7adea9e5f899250601
3
- size 7970107783
 
 
 
 
hf_data/infinity_instruct_7M_core_train_1_5m.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2ede1f8e4a3906c394f000595416dbade9d9a28c34f80337c272fbc46ba6822
3
- size 3174471022
 
 
 
 
hf_data/infinity_instruct_gen_train_1400k.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a003195934389a860dca44da374eb580adcbdbc8763a49a3f940e4991b0fa3e
3
- size 5782067170
 
 
 
 
hf_data/infinity_instruct_train_7m.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:522be9282906a1e539e2c66e2d51f7a044ceaa102346ae402e0e6573d4a83a54
3
- size 14312887611
 
 
 
 
hf_data/opc-sft-stage1-filtered_infinity_instruct-1030k.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8a16827ffcb868bd79b97f6b12757e0ce8d505c3af65fb28c60df743527f0d1
3
- size 2128437340
 
 
 
 
hf_data/opc-sft-stage1-largescale_diverse_instruct-2513k.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:514b8adc9f28e5faf00c7ae5fca1722045526b59e7b8d13c716beb0c93c9e26e
3
- size 6822344959
 
 
 
 
hf_data/opc-sft-stage1-realuser_instruct-675k.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6b11a4f995acb07b86d6679a5aa838d7c79931b2fd4543e2cf42576161a3253
3
- size 2325200313
 
 
 
 
hf_data/opc-sft-stage2-436k.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1b8bdafd4500d8785803156056de94989af8fa0146109561af3124fab6ed04c
3
- size 1109723658
 
 
 
 
hf_data/opencodeinstruct_train_2m.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:155fd30f5e0f7d329f0e1eddddab41e0bf7fd565c86613bb3215d01a1aa2ffe5
3
- size 3950774175
 
 
 
 
hf_data/opencodeinstruct_train_5m.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff6393709448e0eb0d016ec1a2226f4a1b9e79db89f5a26b1c8f851d7adf6c52
3
- size 10284863439
 
 
 
 
hf_data/opencodeinstruct_train_filter_score_eq_1.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6a3bf492ae82328be8ae575f84b65198a196918d513216b4b544f4ab6c07a99
3
- size 2951101476
 
 
 
 
hf_data/opencodeinstruct_train_filter_score_ge_08.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2bf3163adc2bca7d0103dcb17208a2c24d345dbcad8a85ad6e3fc975dff28927
3
- size 4706279026
 
 
 
 
hf_data/opencodeinstruct_train_filter_score_ge_08_llm_judgement_avg_score_ge_3.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:efeb82226ab27a81517d945ad7870ce77c74b06d31daed26429405ba73f3bb00
3
- size 4671287609
 
 
 
 
hf_data/openmathinstruct2_train.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e070b0f4220feb93e6646caca1b881e2b5d33afcb64da9b30d59af6784b830f
3
- size 16798025787
 
 
 
 
hf_data/openmathinstruct2_train_1m.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc79dd150daf1d376f2ce2772337d3ffa7ff9ff61d1d825e12bb5d7c21b109fc
3
- size 1445582798
 
 
 
 
hf_data/openmathinstruct2_train_2m.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:36fadca11edb9e5dc3fb9f8a60194d97a4448ac302af0c53fee69605c64785be
3
- size 2952796652
 
 
 
 
hf_data/openmathinstruct2_train_5m.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba0e8cbc50a72e232a5e20731edd849c69b87ddef2f331c0ab366c955f2ac5e4
3
- size 7012994994
 
 
 
 
hf_data/sciRIFF_4096_70k.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c771bd27670ff0d9fe0c5c4efd42c1fc945c8f29338c68e955244c13a2e68b7
3
- size 390602961
 
 
 
 
hf_data/script/processor_code_opc_sft.py DELETED
@@ -1,29 +0,0 @@
1
- # https://huggingface.co/datasets/BAAI/Infinity-Instruct
2
-
3
- import json
4
-
5
- from datasets import load_dataset
6
- from tqdm import tqdm
7
-
8
- subset = ['educational_instruct', 'evol_instruct', 'mceval_instruct', 'package_instruct']
9
- output_file = "../opc-sft-stage2-436k.jsonl"
10
- ix = 0
11
- with open(output_file, 'w', encoding='utf-8') as f:
12
- for sub in subset:
13
- dataset = load_dataset('../opc-sft-stage2', sub, split='train')
14
- print(f"Converting dataset to jsonl format {sub}")
15
-
16
- for item in tqdm(dataset):
17
- # print(item, item)
18
- # break
19
- conv = {
20
- 'id': item['seq_id'] if 'seq_id' in item.keys() else f'{sub}-{ix}',
21
- 'conversations': [
22
- {'from': 'human', 'value': item['instruction']},
23
- {'from': 'gpt', 'value': item['output']}
24
- ]
25
- }
26
- ix += 1
27
- f.write(json.dumps(conv, ensure_ascii=False) + '\n')
28
-
29
- print(f"Conversion complete. Output saved as {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/script/processor_code_opc_sft_stage_1.py DELETED
@@ -1,30 +0,0 @@
1
- # https://huggingface.co/datasets/BAAI/Infinity-Instruct
2
-
3
- import json
4
-
5
- from datasets import load_dataset
6
- from tqdm import tqdm
7
-
8
- subset = ['filtered_infinity_instruct', 'realuser_instruct', 'largescale_diverse_instruct']
9
- cnts = ['1030k', '2510k','676k']
10
-
11
- for cnt, sub in zip(cnts, subset):
12
- dataset = load_dataset('../opc-sft-stage1', sub, split='train')
13
- output_file = f"../opc-sft-stage1-{sub}-{cnt}.jsonl"
14
- print(f"Converting dataset to jsonl format {sub}")
15
- ix = 0
16
- with open(output_file, 'w', encoding='utf-8') as f:
17
- for item in tqdm(dataset):
18
- # print(item, item)
19
- # break
20
- conv = {
21
- 'id': item['seq_id'] if 'seq_id' in item.keys() else f'{sub}-{ix}',
22
- 'conversations': [
23
- {'from': 'human', 'value': item['instruction']},
24
- {'from': 'gpt', 'value': item['output']}
25
- ]
26
- }
27
- ix += 1
28
- f.write(json.dumps(conv, ensure_ascii=False) + '\n')
29
-
30
- print(f"Conversion complete. Output saved as {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/script/processor_magpie.py DELETED
@@ -1,23 +0,0 @@
1
- import json
2
-
3
- from datasets import load_dataset
4
- from tqdm import tqdm
5
-
6
- dataset = load_dataset('../Magpie-Qwen2.5-Pro-300K-Filtered', split='train')
7
-
8
- print("Converting dataset to jsonl format")
9
- output_file = "../Magpie-Qwen2_5-Pro-300K-Filtered.jsonl"
10
- with open(output_file, 'w', encoding='utf-8') as f:
11
- for item in tqdm(dataset):
12
- # print(item)
13
- # break
14
- conv = {
15
- 'id': item['uuid'],
16
- 'conversations': [
17
- {'from': 'human', 'value': item['instruction']},
18
- {'from': 'gpt', 'value': item['response']}
19
- ]
20
- }
21
- f.write(json.dumps(conv, ensure_ascii=False) + '\n')
22
-
23
- print(f"Conversion complete. Output saved as {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/script/processor_open_code_instruct_filter.py DELETED
@@ -1,36 +0,0 @@
1
- import json
2
-
3
- from datasets import load_dataset
4
- from tqdm import tqdm
5
-
6
- dataset = load_dataset('../OpenCodeInstruct', split='train')
7
-
8
- print("Converting dataset to jsonl format")
9
- output_file = "../opencodeinstruct_train_filter_score_ge_08.jsonl"
10
- with open(output_file, 'w', encoding='utf-8') as f:
11
- for item in tqdm(dataset):
12
-
13
- llm_judgement = json.loads(item['llm_judgement'])
14
- llm_judgement_avg_score = (
15
- int(llm_judgement['requirement_conformance']['score']) +
16
- int(llm_judgement['logical_correctness']['score']) +
17
- int(llm_judgement['requirement_conformance']['score'])
18
- ) / 3
19
-
20
- # print(item)
21
- # print(float(item['average_test_score']))
22
- # print(llm_judgement_avg_score)
23
- # break
24
-
25
- if float(item['average_test_score']) >= 0.8:
26
- conv = {
27
- 'id': item['id'],
28
- 'conversations': [
29
- {'from': 'human', 'value': item['input']},
30
- {'from': 'gpt', 'value': item['output']}
31
- ]
32
- }
33
- f.write(json.dumps(conv, ensure_ascii=False) + '\n')
34
-
35
- print(f"Conversion complete. Output saved as {output_file}")
36
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/script/processor_scalequest_code.py DELETED
@@ -1,25 +0,0 @@
1
- import json
2
-
3
- from datasets import load_dataset
4
- from tqdm import tqdm
5
-
6
- dataset = load_dataset('../ScaleQuest-Code', split='train')
7
-
8
- print("Converting dataset to jsonl format")
9
- output_file = "../ScaleQuest_Code_train_157k.jsonl"
10
- ix = 0
11
- with open(output_file, 'w', encoding='utf-8') as f:
12
- for item in tqdm(dataset):
13
- # print(item, item)
14
- # break
15
- conv = {
16
- 'id': ix,
17
- 'conversations': [
18
- {'from': 'human', 'value': item['query']},
19
- {'from': 'gpt', 'value': item['response']}
20
- ]
21
- }
22
- ix += 1
23
- f.write(json.dumps(conv, ensure_ascii=False) + '\n')
24
-
25
- print(f"Conversion complete. Output saved as {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/script/processor_scalequest_math.py DELETED
@@ -1,25 +0,0 @@
1
- import json
2
-
3
- from datasets import load_dataset
4
- from tqdm import tqdm
5
-
6
- dataset = load_dataset('dyyyyyyyy/ScaleQuest-Math', split='train')
7
-
8
- print("Converting dataset to jsonl format")
9
- output_file = "../ScaleQuest_Math_train_1m.jsonl"
10
- ix = 0
11
- with open(output_file, 'w', encoding='utf-8') as f:
12
- for item in tqdm(dataset):
13
- # print(item, item)
14
- # break
15
- conv = {
16
- 'id': ix,
17
- 'conversations': [
18
- {'from': 'human', 'value': item['query']},
19
- {'from': 'gpt', 'value': item['response']}
20
- ]
21
- }
22
- ix += 1
23
- f.write(json.dumps(conv, ensure_ascii=False) + '\n')
24
-
25
- print(f"Conversion complete. Output saved as {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/script/processor_sciriff.py DELETED
@@ -1,25 +0,0 @@
1
- # https://huggingface.co/datasets/BAAI/Infinity-Instruct
2
-
3
- import json
4
-
5
- from datasets import load_dataset
6
- from tqdm import tqdm
7
-
8
- dataset = load_dataset('../SciRIFF', '4096', split='train')
9
-
10
- print("Converting dataset to jsonl format")
11
- output_file = "../sciRIFF_4096_70k.jsonl"
12
- with open(output_file, 'w', encoding='utf-8') as f:
13
- for item in tqdm(dataset):
14
- # print(item, item)
15
- # break
16
- conv = {
17
- 'id': item['_instance_id'],
18
- 'conversations': [
19
- {'from': 'human', 'value': item['input']},
20
- {'from': 'gpt', 'value': item['output']}
21
- ]
22
- }
23
- f.write(json.dumps(conv, ensure_ascii=False) + '\n')
24
-
25
- print(f"Conversion complete. Output saved as {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/script/processor_smoltalk.py DELETED
@@ -1,38 +0,0 @@
1
- # https://huggingface.co/datasets/BAAI/Infinity-Instruct
2
-
3
- import json
4
-
5
- from datasets import load_dataset
6
- from tqdm import tqdm
7
-
8
- dataset = load_dataset('../smoltalk/', 'all', split='train')
9
-
10
- print("Converting dataset to jsonl format")
11
- output_file = "../smoltalk_1100k.jsonl"
12
- ix = 0
13
- roles = {
14
- 'assistant': 'gpt',
15
- 'user': 'human',
16
- 'system': 'system'
17
- }
18
-
19
- with open(output_file, 'w', encoding='utf-8') as f:
20
- for item in tqdm(dataset):
21
- conversations = []
22
- for msg in item['messages']:
23
- from_str = roles[msg['role']]
24
- value = msg['content']
25
- conversations.append(
26
- {
27
- 'from': from_str,
28
- 'value': value
29
- }
30
- )
31
- conv = {
32
- 'id': ix,
33
- 'conversations': conversations
34
- }
35
- ix += 1
36
- f.write(json.dumps(conv, ensure_ascii=False) + '\n')
37
-
38
- print(f"Conversion complete. Output saved as {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/script/processor_table_gpt.py DELETED
@@ -1,28 +0,0 @@
1
- # https://huggingface.co/datasets/BAAI/Infinity-Instruct
2
-
3
- import json
4
-
5
- from datasets import load_dataset
6
- from tqdm import tqdm
7
-
8
- dataset = load_dataset('../Table-GPT', 'All', split='train')
9
-
10
- print("Converting dataset to jsonl format")
11
- output_file = "../Table-GPT_train_13k.jsonl"
12
- ix = 0
13
- with open(output_file, 'w', encoding='utf-8') as f:
14
- for item in tqdm(dataset):
15
- # print(item, item)
16
- # break
17
- conv = {
18
- 'id': ix,
19
- 'conversations': [
20
- {'from': 'system', 'value': 'You are a helpful assistant that specializes in tables'},
21
- {'from': 'human', 'value': item['prompt']},
22
- {'from': 'gpt', 'value': item['completion']}
23
- ]
24
- }
25
- ix += 1
26
- f.write(json.dumps(conv, ensure_ascii=False) + '\n')
27
-
28
- print(f"Conversion complete. Output saved as {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/script/processor_tulu.py DELETED
@@ -1,25 +0,0 @@
1
- # https://huggingface.co/datasets/BAAI/Infinity-Instruct
2
-
3
- import json
4
-
5
- from datasets import load_dataset
6
- from tqdm import tqdm
7
-
8
- dataset = load_dataset('../tulu-3-sft-personas-instruction-following', split='train')
9
-
10
- print("Converting dataset to jsonl format")
11
- output_file = "../tulu-3-sft-personas-instruction-following_30k.jsonl"
12
- with open(output_file, 'w', encoding='utf-8') as f:
13
- for item in tqdm(dataset):
14
- # print(item, item)
15
- # break
16
- conv = {
17
- 'id': item['id'],
18
- 'conversations': [
19
- {'from': 'human', 'value': item['prompt']},
20
- {'from': 'gpt', 'value': item['messages'][1]['content']}
21
- ]
22
- }
23
- f.write(json.dumps(conv, ensure_ascii=False) + '\n')
24
-
25
- print(f"Conversion complete. Output saved as {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/script/processor_tulu_mixture.py DELETED
@@ -1,28 +0,0 @@
1
- # https://huggingface.co/datasets/BAAI/Infinity-Instruct
2
-
3
- import json
4
-
5
- from datasets import load_dataset
6
- from tqdm import tqdm
7
-
8
- dataset = load_dataset('../tulu-3-sft-mixture', split='train')
9
-
10
- print("Converting dataset to jsonl format")
11
- output_file = "../tulu-3-sft-mixture_train_939k.jsonl"
12
- with open(output_file, 'w', encoding='utf-8') as f:
13
- for item in tqdm(dataset):
14
- # print(item, item)
15
- # break
16
- try:
17
- conv = {
18
- 'id': item['id'],
19
- 'conversations': [
20
- {'from': 'human', 'value': item['messages'][0]['content']},
21
- {'from': 'gpt', 'value': item['messages'][1]['content']}
22
- ]
23
- }
24
- f.write(json.dumps(conv, ensure_ascii=False) + '\n')
25
- except IndexError:
26
- print('e')
27
-
28
- print(f"Conversion complete. Output saved as {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/script/processpor_code.py DELETED
@@ -1,23 +0,0 @@
1
- import json
2
-
3
- from datasets import load_dataset
4
- from tqdm import tqdm
5
-
6
- dataset = load_dataset('nvidia/OpenCodeInstruct', split='train')
7
-
8
- print("Converting dataset to jsonl format")
9
- output_file = "opencodeinstruct_train.jsonl"
10
- with open(output_file, 'w', encoding='utf-8') as f:
11
- for item in tqdm(dataset):
12
- # print(item)
13
- # break
14
- conv = {
15
- 'id': item['id'],
16
- 'conversations': [
17
- {'from': 'human', 'value': item['input']},
18
- {'from': 'gpt', 'value': item['output']}
19
- ]
20
- }
21
- f.write(json.dumps(conv, ensure_ascii=False) + '\n')
22
-
23
- print(f"Conversion complete. Output saved as {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/script/processpor_infinity.py DELETED
@@ -1,22 +0,0 @@
1
- import json
2
-
3
- from datasets import load_dataset
4
- from tqdm import tqdm
5
-
6
- dataset = load_dataset('../OpenMathInstruct-2', split='train')
7
-
8
- print("Converting dataset to jsonl format")
9
- output_file = "../openmathinstruct2_train.jsonl"
10
- with open(output_file, 'w', encoding='utf-8') as f:
11
- for item in tqdm(dataset):
12
- # print(item)
13
- # break
14
- conv = {
15
- 'conversations':[
16
- {'from':'human', 'value':item['problem']},
17
- {'from':'gpt', 'value':item['generated_solution']}
18
- ]
19
- }
20
- f.write(json.dumps(conv, ensure_ascii=False) + '\n')
21
-
22
- print(f"Conversion complete. Output saved as {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/script/processpor_infinity_instruct.py DELETED
@@ -1,25 +0,0 @@
1
- # https://huggingface.co/datasets/BAAI/Infinity-Instruct
2
-
3
- import json
4
-
5
- from datasets import load_dataset
6
- from tqdm import tqdm
7
-
8
- dataset = load_dataset('BAAI/Infinity-Instruct', '7M_core', split='train')
9
-
10
- print("Converting dataset to jsonl format")
11
- output_file = "../infinity_instruct_7M_core_train_1_5m.jsonl"
12
- with open(output_file, 'w', encoding='utf-8') as f:
13
- for item in tqdm(dataset):
14
- # print(item)
15
- # break
16
- # conv = {
17
- # 'id': item['id'],
18
- # 'conversations': [
19
- # {'from': 'human', 'value': item['input']},
20
- # {'from': 'gpt', 'value': item['output']}
21
- # ]
22
- # }
23
- f.write(json.dumps(item, ensure_ascii=False) + '\n')
24
-
25
- print(f"Conversion complete. Output saved as {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/script/processpor_lamini.py DELETED
@@ -1,22 +0,0 @@
1
- import json
2
-
3
- from datasets import load_dataset
4
- from tqdm import tqdm
5
-
6
- dataset = load_dataset('./LaMini-instruction', split='train')
7
-
8
- print("Converting dataset to jsonl format")
9
- output_file = "LaMini_instruction_train.jsonl"
10
- with open(output_file, 'w', encoding='utf-8') as f:
11
- for item in tqdm(dataset):
12
- # print(item)
13
- # break
14
- conv = {
15
- 'conversations':[
16
- {'from':'human', 'value':item['instruction']},
17
- {'from':'gpt', 'value':item['response']}
18
- ]
19
- }
20
- f.write(json.dumps(conv, ensure_ascii=False) + '\n')
21
-
22
- print(f"Conversion complete. Output saved as {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/script/processpor_math.py DELETED
@@ -1,22 +0,0 @@
1
- import json
2
-
3
- from datasets import load_dataset
4
- from tqdm import tqdm
5
-
6
- dataset = load_dataset('../OpenMathInstruct-2', split='train')
7
-
8
- print("Converting dataset to jsonl format")
9
- output_file = "../openmathinstruct2_train.jsonl"
10
- with open(output_file, 'w', encoding='utf-8') as f:
11
- for item in tqdm(dataset):
12
- # print(item)
13
- # break
14
- conv = {
15
- 'conversations':[
16
- {'from':'human', 'value':item['problem']},
17
- {'from':'gpt', 'value':item['generated_solution']}
18
- ]
19
- }
20
- f.write(json.dumps(conv, ensure_ascii=False) + '\n')
21
-
22
- print(f"Conversion complete. Output saved as {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_data/smoltalk_1100k.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:729a6f12594034fe50fd44568aac5960fd9ba9b675d5142b4619c478bdce838e
3
- size 4169152401
 
 
 
 
hf_data/tulu-3-sft-mixture_train_939k.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:aef698a418fcd5aa6218bfc7f0ddd68dbfd87fbcbae3ea08dca6407503275da2
3
- size 2463675206
 
 
 
 
hf_data/tulu-3-sft-olmo-2-mixture_train_939k.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:09623e387ab69c78756c356ddfb261fe736c3790123a16ee8f8635ed98f1bc26
3
- size 2463660893
 
 
 
 
hf_data/tulu-3-sft-personas-instruction-following_30k.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3131ecbd5057eb98c122507c481899c4009eaca87d97da02acce85514b6c7833
3
- size 61667836