lll2343 committed on
Commit
622e120
·
verified ·
1 Parent(s): 7470dba

Upload folder using huggingface_hub

Browse files
Files changed (43) hide show
  1. .gitattributes +26 -0
  2. hf_data/LaMini_instruction_train.jsonl +3 -0
  3. hf_data/ScaleQuest_Code_train_157k.jsonl +3 -0
  4. hf_data/ScaleQuest_Math_train_1m.jsonl +3 -0
  5. hf_data/Table-GPT_train_13k.jsonl +3 -0
  6. hf_data/infinity_instruct_3M_train.jsonl +3 -0
  7. hf_data/infinity_instruct_7M_core_train_1_5m.jsonl +3 -0
  8. hf_data/infinity_instruct_gen_train_1400k.jsonl +3 -0
  9. hf_data/infinity_instruct_train_7m.jsonl +3 -0
  10. hf_data/opc-sft-stage1-filtered_infinity_instruct-1030k.jsonl +3 -0
  11. hf_data/opc-sft-stage1-largescale_diverse_instruct-2513k.jsonl +3 -0
  12. hf_data/opc-sft-stage1-realuser_instruct-675k.jsonl +3 -0
  13. hf_data/opc-sft-stage2-436k.jsonl +3 -0
  14. hf_data/opencodeinstruct_train_2m.jsonl +3 -0
  15. hf_data/opencodeinstruct_train_5m.jsonl +3 -0
  16. hf_data/opencodeinstruct_train_filter_score_eq_1.jsonl +3 -0
  17. hf_data/opencodeinstruct_train_filter_score_ge_08.jsonl +3 -0
  18. hf_data/opencodeinstruct_train_filter_score_ge_08_llm_judgement_avg_score_ge_3.jsonl +3 -0
  19. hf_data/openmathinstruct2_train.jsonl +3 -0
  20. hf_data/openmathinstruct2_train_1m.jsonl +3 -0
  21. hf_data/openmathinstruct2_train_2m.jsonl +3 -0
  22. hf_data/openmathinstruct2_train_5m.jsonl +3 -0
  23. hf_data/sciRIFF_4096_70k.jsonl +3 -0
  24. hf_data/script/processor_code_opc_sft.py +29 -0
  25. hf_data/script/processor_code_opc_sft_stage_1.py +30 -0
  26. hf_data/script/processor_magpie.py +23 -0
  27. hf_data/script/processor_open_code_instruct_filter.py +36 -0
  28. hf_data/script/processor_scalequest_code.py +25 -0
  29. hf_data/script/processor_scalequest_math.py +25 -0
  30. hf_data/script/processor_sciriff.py +25 -0
  31. hf_data/script/processor_smoltalk.py +38 -0
  32. hf_data/script/processor_table_gpt.py +28 -0
  33. hf_data/script/processor_tulu.py +25 -0
  34. hf_data/script/processor_tulu_mixture.py +28 -0
  35. hf_data/script/processpor_code.py +23 -0
  36. hf_data/script/processpor_infinity.py +22 -0
  37. hf_data/script/processpor_infinity_instruct.py +25 -0
  38. hf_data/script/processpor_lamini.py +22 -0
  39. hf_data/script/processpor_math.py +22 -0
  40. hf_data/smoltalk_1100k.jsonl +3 -0
  41. hf_data/tulu-3-sft-mixture_train_939k.jsonl +3 -0
  42. hf_data/tulu-3-sft-olmo-2-mixture_train_939k.jsonl +3 -0
  43. hf_data/tulu-3-sft-personas-instruction-following_30k.jsonl +3 -0
.gitattributes CHANGED
@@ -34,3 +34,29 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  qwen_sft_v1114/training_log.txt filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  qwen_sft_v1114/training_log.txt filter=lfs diff=lfs merge=lfs -text
37
+ hf_data/LaMini_instruction_train.jsonl filter=lfs diff=lfs merge=lfs -text
38
+ hf_data/ScaleQuest_Code_train_157k.jsonl filter=lfs diff=lfs merge=lfs -text
39
+ hf_data/ScaleQuest_Math_train_1m.jsonl filter=lfs diff=lfs merge=lfs -text
40
+ hf_data/Table-GPT_train_13k.jsonl filter=lfs diff=lfs merge=lfs -text
41
+ hf_data/infinity_instruct_3M_train.jsonl filter=lfs diff=lfs merge=lfs -text
42
+ hf_data/infinity_instruct_7M_core_train_1_5m.jsonl filter=lfs diff=lfs merge=lfs -text
43
+ hf_data/infinity_instruct_gen_train_1400k.jsonl filter=lfs diff=lfs merge=lfs -text
44
+ hf_data/infinity_instruct_train_7m.jsonl filter=lfs diff=lfs merge=lfs -text
45
+ hf_data/opc-sft-stage1-filtered_infinity_instruct-1030k.jsonl filter=lfs diff=lfs merge=lfs -text
46
+ hf_data/opc-sft-stage1-largescale_diverse_instruct-2513k.jsonl filter=lfs diff=lfs merge=lfs -text
47
+ hf_data/opc-sft-stage1-realuser_instruct-675k.jsonl filter=lfs diff=lfs merge=lfs -text
48
+ hf_data/opc-sft-stage2-436k.jsonl filter=lfs diff=lfs merge=lfs -text
49
+ hf_data/opencodeinstruct_train_2m.jsonl filter=lfs diff=lfs merge=lfs -text
50
+ hf_data/opencodeinstruct_train_5m.jsonl filter=lfs diff=lfs merge=lfs -text
51
+ hf_data/opencodeinstruct_train_filter_score_eq_1.jsonl filter=lfs diff=lfs merge=lfs -text
52
+ hf_data/opencodeinstruct_train_filter_score_ge_08.jsonl filter=lfs diff=lfs merge=lfs -text
53
+ hf_data/opencodeinstruct_train_filter_score_ge_08_llm_judgement_avg_score_ge_3.jsonl filter=lfs diff=lfs merge=lfs -text
54
+ hf_data/openmathinstruct2_train.jsonl filter=lfs diff=lfs merge=lfs -text
55
+ hf_data/openmathinstruct2_train_1m.jsonl filter=lfs diff=lfs merge=lfs -text
56
+ hf_data/openmathinstruct2_train_2m.jsonl filter=lfs diff=lfs merge=lfs -text
57
+ hf_data/openmathinstruct2_train_5m.jsonl filter=lfs diff=lfs merge=lfs -text
58
+ hf_data/sciRIFF_4096_70k.jsonl filter=lfs diff=lfs merge=lfs -text
59
+ hf_data/smoltalk_1100k.jsonl filter=lfs diff=lfs merge=lfs -text
60
+ hf_data/tulu-3-sft-mixture_train_939k.jsonl filter=lfs diff=lfs merge=lfs -text
61
+ hf_data/tulu-3-sft-olmo-2-mixture_train_939k.jsonl filter=lfs diff=lfs merge=lfs -text
62
+ hf_data/tulu-3-sft-personas-instruction-following_30k.jsonl filter=lfs diff=lfs merge=lfs -text
hf_data/LaMini_instruction_train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a170d06468f1e0b75a93138084f28b821cdefe19da547af0bc7093be0057d2e5
3
+ size 1316278839
hf_data/ScaleQuest_Code_train_157k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:332960c2672ae70b278a3ee7dc41fe1854a113b94ff8e4d7ceab302d23304f11
3
+ size 393762917
hf_data/ScaleQuest_Math_train_1m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8504b7b492431641b23a9aa4e695572dd16836bf3156a94c21645a5361368817
3
+ size 1581059706
hf_data/Table-GPT_train_13k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36cb71a030f3a9c98fa114ee7206bcc5f28697fbcf916d0c01e36ef0242ea443
3
+ size 33778304
hf_data/infinity_instruct_3M_train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e870f4c34157f846443cb6adf423b6b183aba52268fb3b7adea9e5f899250601
3
+ size 7970107783
hf_data/infinity_instruct_7M_core_train_1_5m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2ede1f8e4a3906c394f000595416dbade9d9a28c34f80337c272fbc46ba6822
3
+ size 3174471022
hf_data/infinity_instruct_gen_train_1400k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a003195934389a860dca44da374eb580adcbdbc8763a49a3f940e4991b0fa3e
3
+ size 5782067170
hf_data/infinity_instruct_train_7m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:522be9282906a1e539e2c66e2d51f7a044ceaa102346ae402e0e6573d4a83a54
3
+ size 14312887611
hf_data/opc-sft-stage1-filtered_infinity_instruct-1030k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8a16827ffcb868bd79b97f6b12757e0ce8d505c3af65fb28c60df743527f0d1
3
+ size 2128437340
hf_data/opc-sft-stage1-largescale_diverse_instruct-2513k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:514b8adc9f28e5faf00c7ae5fca1722045526b59e7b8d13c716beb0c93c9e26e
3
+ size 6822344959
hf_data/opc-sft-stage1-realuser_instruct-675k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6b11a4f995acb07b86d6679a5aa838d7c79931b2fd4543e2cf42576161a3253
3
+ size 2325200313
hf_data/opc-sft-stage2-436k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1b8bdafd4500d8785803156056de94989af8fa0146109561af3124fab6ed04c
3
+ size 1109723658
hf_data/opencodeinstruct_train_2m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:155fd30f5e0f7d329f0e1eddddab41e0bf7fd565c86613bb3215d01a1aa2ffe5
3
+ size 3950774175
hf_data/opencodeinstruct_train_5m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff6393709448e0eb0d016ec1a2226f4a1b9e79db89f5a26b1c8f851d7adf6c52
3
+ size 10284863439
hf_data/opencodeinstruct_train_filter_score_eq_1.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6a3bf492ae82328be8ae575f84b65198a196918d513216b4b544f4ab6c07a99
3
+ size 2951101476
hf_data/opencodeinstruct_train_filter_score_ge_08.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bf3163adc2bca7d0103dcb17208a2c24d345dbcad8a85ad6e3fc975dff28927
3
+ size 4706279026
hf_data/opencodeinstruct_train_filter_score_ge_08_llm_judgement_avg_score_ge_3.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efeb82226ab27a81517d945ad7870ce77c74b06d31daed26429405ba73f3bb00
3
+ size 4671287609
hf_data/openmathinstruct2_train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e070b0f4220feb93e6646caca1b881e2b5d33afcb64da9b30d59af6784b830f
3
+ size 16798025787
hf_data/openmathinstruct2_train_1m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc79dd150daf1d376f2ce2772337d3ffa7ff9ff61d1d825e12bb5d7c21b109fc
3
+ size 1445582798
hf_data/openmathinstruct2_train_2m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36fadca11edb9e5dc3fb9f8a60194d97a4448ac302af0c53fee69605c64785be
3
+ size 2952796652
hf_data/openmathinstruct2_train_5m.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba0e8cbc50a72e232a5e20731edd849c69b87ddef2f331c0ab366c955f2ac5e4
3
+ size 7012994994
hf_data/sciRIFF_4096_70k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c771bd27670ff0d9fe0c5c4efd42c1fc945c8f29338c68e955244c13a2e68b7
3
+ size 390602961
hf_data/script/processor_code_opc_sft.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://huggingface.co/datasets/BAAI/Infinity-Instruct
2
+
3
+ import json
4
+
5
+ from datasets import load_dataset
6
+ from tqdm import tqdm
7
+
8
+ subset = ['educational_instruct', 'evol_instruct', 'mceval_instruct', 'package_instruct']
9
+ output_file = "../opc-sft-stage2-436k.jsonl"
10
+ ix = 0
11
+ with open(output_file, 'w', encoding='utf-8') as f:
12
+ for sub in subset:
13
+ dataset = load_dataset('../opc-sft-stage2', sub, split='train')
14
+ print(f"Converting dataset to jsonl format {sub}")
15
+
16
+ for item in tqdm(dataset):
17
+ # print(item, item)
18
+ # break
19
+ conv = {
20
+ 'id': item['seq_id'] if 'seq_id' in item.keys() else f'{sub}-{ix}',
21
+ 'conversations': [
22
+ {'from': 'human', 'value': item['instruction']},
23
+ {'from': 'gpt', 'value': item['output']}
24
+ ]
25
+ }
26
+ ix += 1
27
+ f.write(json.dumps(conv, ensure_ascii=False) + '\n')
28
+
29
+ print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_code_opc_sft_stage_1.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert the opc-sft-stage1 subsets (local copy at ../opc-sft-stage1) to
# ShareGPT-style jsonl, one output file per subset.
# NOTE(review): the original header comment pointed at
# https://huggingface.co/datasets/BAAI/Infinity-Instruct (copy-paste
# leftover) and has been corrected.

import json

from datasets import load_dataset
from tqdm import tqdm

subset = ['filtered_infinity_instruct', 'realuser_instruct', 'largescale_diverse_instruct']
# Row counts baked into the output filenames, aligned index-for-index with
# `subset`. Fixed from ['1030k', '2510k', '676k']: the actually uploaded
# files are opc-sft-stage1-realuser_instruct-675k.jsonl and
# opc-sft-stage1-largescale_diverse_instruct-2513k.jsonl, so the old list
# had the wrong numbers paired with the wrong subsets.
cnts = ['1030k', '675k', '2513k']

for cnt, sub in zip(cnts, subset):
    dataset = load_dataset('../opc-sft-stage1', sub, split='train')
    output_file = f"../opc-sft-stage1-{sub}-{cnt}.jsonl"
    print(f"Converting dataset to jsonl format {sub}")
    ix = 0  # per-subset index, used as a fallback id
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in tqdm(dataset):
            conv = {
                # Prefer the dataset's own sequence id when present.
                'id': item['seq_id'] if 'seq_id' in item else f'{sub}-{ix}',
                'conversations': [
                    {'from': 'human', 'value': item['instruction']},
                    {'from': 'gpt', 'value': item['output']}
                ]
            }
            ix += 1
            f.write(json.dumps(conv, ensure_ascii=False) + '\n')

    print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_magpie.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

from datasets import load_dataset
from tqdm import tqdm

# Flatten the local Magpie dump into the ShareGPT-style
# {"id", "conversations"} jsonl layout used by the other hf_data files.
dataset = load_dataset('../Magpie-Qwen2.5-Pro-300K-Filtered', split='train')

print("Converting dataset to jsonl format")
output_file = "../Magpie-Qwen2_5-Pro-300K-Filtered.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for sample in tqdm(dataset):
        record = {
            'id': sample['uuid'],
            'conversations': [
                {'from': 'human', 'value': sample['instruction']},
                {'from': 'gpt', 'value': sample['response']},
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_open_code_instruct_filter.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Filter the local OpenCodeInstruct dump by unit-test pass rate and write
# the surviving rows as ShareGPT-style jsonl.

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('../OpenCodeInstruct', split='train')

print("Converting dataset to jsonl format")
output_file = "../opencodeinstruct_train_filter_score_ge_08.jsonl"
with open(output_file, 'w', encoding='utf-8') as f:
    # NOTE(review): the original also json.loads-ed item['llm_judgement']
    # for every row and averaged three sub-scores, but the result was never
    # used — and the sum counted 'requirement_conformance' twice while
    # omitting a third distinct score. That dead (and buggy) per-row work
    # is removed here; the *_llm_judgement_avg_score_ge_3 output file was
    # presumably produced by a separate variant of this script — confirm.
    for item in tqdm(dataset):
        # Keep only rows whose unit-test pass rate is at least 0.8.
        if float(item['average_test_score']) >= 0.8:
            conv = {
                'id': item['id'],
                'conversations': [
                    {'from': 'human', 'value': item['input']},
                    {'from': 'gpt', 'value': item['output']}
                ]
            }
            f.write(json.dumps(conv, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_scalequest_code.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

from datasets import load_dataset
from tqdm import tqdm

# Flatten the local ScaleQuest-Code dump into ShareGPT-style jsonl,
# assigning sequential integer ids.
dataset = load_dataset('../ScaleQuest-Code', split='train')

print("Converting dataset to jsonl format")
output_file = "../ScaleQuest_Code_train_157k.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for idx, sample in enumerate(tqdm(dataset)):
        record = {
            'id': idx,
            'conversations': [
                {'from': 'human', 'value': sample['query']},
                {'from': 'gpt', 'value': sample['response']},
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_scalequest_math.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

from datasets import load_dataset
from tqdm import tqdm

# Flatten ScaleQuest-Math (pulled from the Hub, not a local copy) into
# ShareGPT-style jsonl with sequential integer ids.
dataset = load_dataset('dyyyyyyyy/ScaleQuest-Math', split='train')

print("Converting dataset to jsonl format")
output_file = "../ScaleQuest_Math_train_1m.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for idx, sample in enumerate(tqdm(dataset)):
        record = {
            'id': idx,
            'conversations': [
                {'from': 'human', 'value': sample['query']},
                {'from': 'gpt', 'value': sample['response']},
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_sciriff.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert the local SciRIFF dump ('4096' config) to ShareGPT-style jsonl,
# keeping the dataset's own instance ids.

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('../SciRIFF', '4096', split='train')

print("Converting dataset to jsonl format")
output_file = "../sciRIFF_4096_70k.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for sample in tqdm(dataset):
        sink.write(json.dumps({
            'id': sample['_instance_id'],
            'conversations': [
                {'from': 'human', 'value': sample['input']},
                {'from': 'gpt', 'value': sample['output']},
            ],
        }, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_smoltalk.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert the local smoltalk dump ('all' config) to ShareGPT-style jsonl,
# preserving full multi-turn conversations.

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('../smoltalk/', 'all', split='train')

print("Converting dataset to jsonl format")
output_file = "../smoltalk_1100k.jsonl"

# Map chat-style role names onto the ShareGPT 'from' vocabulary.
roles = {
    'assistant': 'gpt',
    'user': 'human',
    'system': 'system'
}

with open(output_file, 'w', encoding='utf-8') as sink:
    for idx, sample in enumerate(tqdm(dataset)):
        record = {
            'id': idx,
            'conversations': [
                {'from': roles[msg['role']], 'value': msg['content']}
                for msg in sample['messages']
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_table_gpt.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert the local Table-GPT dump ('All' config) to ShareGPT-style jsonl,
# prepending a fixed table-specialist system message to every conversation.

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('../Table-GPT', 'All', split='train')

print("Converting dataset to jsonl format")
output_file = "../Table-GPT_train_13k.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for idx, sample in enumerate(tqdm(dataset)):
        record = {
            'id': idx,
            'conversations': [
                {'from': 'system', 'value': 'You are a helpful assistant that specializes in tables'},
                {'from': 'human', 'value': sample['prompt']},
                {'from': 'gpt', 'value': sample['completion']},
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_tulu.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert the local tulu-3-sft-personas-instruction-following dump to
# ShareGPT-style single-turn jsonl.

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('../tulu-3-sft-personas-instruction-following', split='train')

print("Converting dataset to jsonl format")
output_file = "../tulu-3-sft-personas-instruction-following_30k.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for sample in tqdm(dataset):
        record = {
            'id': sample['id'],
            'conversations': [
                # The prompt field is used for the human turn; the reply is
                # taken from the second chat message. NOTE(review): presumably
                # messages[0] duplicates the prompt — confirm against the
                # dataset schema.
                {'from': 'human', 'value': sample['prompt']},
                {'from': 'gpt', 'value': sample['messages'][1]['content']},
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processor_tulu_mixture.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert the local tulu-3-sft-mixture dump to ShareGPT-style jsonl,
# keeping only the first user/assistant exchange of each record.
# NOTE(review): the original header comment pointed at BAAI/Infinity-Instruct
# (copy-paste leftover) and has been corrected.

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('../tulu-3-sft-mixture', split='train')

print("Converting dataset to jsonl format")
output_file = "../tulu-3-sft-mixture_train_939k.jsonl"
skipped = 0
with open(output_file, 'w', encoding='utf-8') as f:
    for item in tqdm(dataset):
        messages = item['messages']
        # A record needs at least a prompt and a response. The original
        # caught IndexError and printed a bare 'e', which hid how many
        # rows were dropped; count and report them instead.
        if len(messages) < 2:
            skipped += 1
            continue
        conv = {
            'id': item['id'],
            'conversations': [
                {'from': 'human', 'value': messages[0]['content']},
                {'from': 'gpt', 'value': messages[1]['content']}
            ]
        }
        f.write(json.dumps(conv, ensure_ascii=False) + '\n')

if skipped:
    print(f"Skipped {skipped} records with fewer than 2 messages")
print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processpor_code.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

from datasets import load_dataset
from tqdm import tqdm

# Flatten nvidia/OpenCodeInstruct (pulled from the Hub) into
# ShareGPT-style jsonl, unfiltered.
dataset = load_dataset('nvidia/OpenCodeInstruct', split='train')

print("Converting dataset to jsonl format")
output_file = "opencodeinstruct_train.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for sample in tqdm(dataset):
        record = {
            'id': sample['id'],
            'conversations': [
                {'from': 'human', 'value': sample['input']},
                {'from': 'gpt', 'value': sample['output']},
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processpor_infinity.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert the local OpenMathInstruct-2 dump to ShareGPT-style jsonl.
# NOTE(review): despite the 'infinity' filename, this script reads
# OpenMathInstruct-2 and is behaviorally identical to processpor_math.py
# (same input, same output file) — one of the two is redundant.

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('../OpenMathInstruct-2', split='train')

print("Converting dataset to jsonl format")
output_file = "../openmathinstruct2_train.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for sample in tqdm(dataset):
        sink.write(json.dumps({
            'conversations': [
                {'from': 'human', 'value': sample['problem']},
                {'from': 'gpt', 'value': sample['generated_solution']},
            ],
        }, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processpor_infinity_instruct.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Dump the BAAI/Infinity-Instruct '7M_core' train split to jsonl.
# https://huggingface.co/datasets/BAAI/Infinity-Instruct

import json

from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset('BAAI/Infinity-Instruct', '7M_core', split='train')

print("Converting dataset to jsonl format")
output_file = "../infinity_instruct_7M_core_train_1_5m.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    # Rows are written verbatim — no ShareGPT remapping is applied (the
    # remapping in the original existed only as commented-out code and
    # was never enabled).
    for row in tqdm(dataset):
        sink.write(json.dumps(row, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processpor_lamini.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

from datasets import load_dataset
from tqdm import tqdm

# Flatten the local LaMini-instruction dump into ShareGPT-style jsonl
# (no id field is emitted, matching the original output layout).
dataset = load_dataset('./LaMini-instruction', split='train')

print("Converting dataset to jsonl format")
output_file = "LaMini_instruction_train.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for sample in tqdm(dataset):
        sink.write(json.dumps({
            'conversations': [
                {'from': 'human', 'value': sample['instruction']},
                {'from': 'gpt', 'value': sample['response']},
            ],
        }, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/script/processpor_math.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

from datasets import load_dataset
from tqdm import tqdm

# Convert the local OpenMathInstruct-2 dump to ShareGPT-style jsonl.
# NOTE(review): processpor_infinity.py in this folder performs the exact
# same conversion to the exact same output file — one of the two scripts
# is redundant.
dataset = load_dataset('../OpenMathInstruct-2', split='train')

print("Converting dataset to jsonl format")
output_file = "../openmathinstruct2_train.jsonl"
with open(output_file, 'w', encoding='utf-8') as sink:
    for sample in tqdm(dataset):
        record = {
            'conversations': [
                {'from': 'human', 'value': sample['problem']},
                {'from': 'gpt', 'value': sample['generated_solution']},
            ],
        }
        sink.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
hf_data/smoltalk_1100k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:729a6f12594034fe50fd44568aac5960fd9ba9b675d5142b4619c478bdce838e
3
+ size 4169152401
hf_data/tulu-3-sft-mixture_train_939k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aef698a418fcd5aa6218bfc7f0ddd68dbfd87fbcbae3ea08dca6407503275da2
3
+ size 2463675206
hf_data/tulu-3-sft-olmo-2-mixture_train_939k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09623e387ab69c78756c356ddfb261fe736c3790123a16ee8f8635ed98f1bc26
3
+ size 2463660893
hf_data/tulu-3-sft-personas-instruction-following_30k.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3131ecbd5057eb98c122507c481899c4009eaca87d97da02acce85514b6c7833
3
+ size 61667836