Upload data_prep.py with huggingface_hub
Browse files- data_prep.py +28 -0
data_prep.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Build a JSONL subset of the "reasoning-deepseek" configuration of
# cognitivecomputations/dolphin-r1.
#
# Each kept row becomes one JSON line of the form
#   {"input": [{"role": "user", "content": <query>}], "output": <output>}
# where <query> is the first system message (if any) followed by the first
# user message, and <output> is the row's reasoning wrapped in <think> tags
# followed by its answer. Rows that lack a user message, reasoning, or an
# answer are skipped, as are rows whose query or output contains CJK text.
import json
import re

from datasets import load_dataset

# Matches any character in CJK Unified Ideographs (U+4E00-U+9FFF),
# Hiragana (U+3040-U+309F) or Katakana (U+30A0-U+30FF).
# The ranges are spelled with \u escapes on purpose: U+3040 is an unassigned
# code point that is easily lost when the file is copied or re-encoded, and
# without it the class degrades to "ideographs, a literal '-', U+309F,
# katakana" -- which would reject every sample containing a plain hyphen.
CJK = re.compile(r"[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]")

# Maximum number of records to write.
N = 50000
out_path = "/workspace/dolphin_subset.jsonl"

# streaming=True iterates the split lazily instead of downloading it first.
ds = load_dataset(
    "cognitivecomputations/dolphin-r1",
    "reasoning-deepseek",
    split="train",
    streaming=True,
)

# Number of records written so far.
n = 0
# json.dumps escapes non-ASCII by default, so the bytes written are the same
# under any ASCII-compatible encoding; utf-8 is pinned so the result does not
# depend on the platform's locale.
with open(out_path, "w", encoding="utf-8") as f:
    for row in ds:
        msgs = row.get("messages") or []

        # Keep only the first non-empty system message and the first
        # non-empty user message; later ones are ignored.
        sys_c = ""
        usr_c = ""
        for m in msgs:
            r = (m.get("role") or "").lower()
            if r == "system" and not sys_c:
                sys_c = str(m.get("content") or "")
            elif r == "user" and not usr_c:
                usr_c = str(m.get("content") or "")

        reasoning = str(row.get("reasoning") or "")
        answer = str(row.get("answer") or "")

        # A usable sample needs a prompt, a reasoning trace and an answer.
        if not usr_c or not reasoning or not answer:
            continue

        # The system prompt, when present, is folded into the user turn.
        query = (sys_c + "\n\n" + usr_c) if sys_c else usr_c
        output = "<think>\n" + reasoning + "\n</think>\n\n" + answer

        # Drop samples containing CJK characters on either side.
        if CJK.search(output) or CJK.search(query):
            continue

        record = {"input": [{"role": "user", "content": query}], "output": output}
        f.write(json.dumps(record) + "\n")
        n += 1

        if n % 5000 == 0:
            print("wrote", n, flush=True)
        # Checked right after the write so no extra row is pulled from the
        # stream once the quota is reached.
        if n >= N:
            break

print("DONE wrote", n, "to", out_path, flush=True)
|