import json
import re
from itertools import islice

from datasets import load_dataset
# Matches one CJK unified ideograph (U+4E00-U+9FFF), hiragana (U+3040-U+309F)
# or katakana (U+30A0-U+30FF) character.  The ranges are spelled with \u
# escapes so every bound is explicit: with raw literal characters a missing
# or invisible range bound silently turns "-" into a literal hyphen, which
# would reject every sample that merely contains "-".
CJK = re.compile(r"[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]")
N = 50000
out_path = "/workspace/dolphin_subset.jsonl"


def build_record(row):
    """Convert one dataset row into an output record, or None to skip it.

    Reads the "messages", "reasoning" and "answer" keys of *row*.  The first
    non-empty system message and the first non-empty user message are used;
    role names are compared case-insensitively.

    Returns:
        ``{"input": [{"role": "user", "content": query}], "output": output}``
        where *query* is the system prompt and user prompt joined by a blank
        line (or just the user prompt when there is no system prompt) and
        *output* wraps the reasoning in ``<think>`` tags ahead of the answer.
        Returns None when the user prompt, reasoning or answer is empty, or
        when the query or output contains a character matched by ``CJK``.
    """
    sys_c = ""
    usr_c = ""
    for m in row.get("messages") or []:
        role = (m.get("role") or "").lower()
        if role == "system" and not sys_c:
            sys_c = str(m.get("content") or "")
        elif role == "user" and not usr_c:
            usr_c = str(m.get("content") or "")

    reasoning = str(row.get("reasoning") or "")
    answer = str(row.get("answer") or "")
    if not usr_c or not reasoning or not answer:
        return None

    query = (sys_c + "\n\n" + usr_c) if sys_c else usr_c
    output = "<think>\n" + reasoning + "\n</think>\n\n" + answer
    if CJK.search(output) or CJK.search(query):
        return None
    return {"input": [{"role": "user", "content": query}], "output": output}


def write_subset(rows, f, limit=N):
    """Write up to *limit* usable records from *rows* to *f* as JSON lines.

    Args:
        rows: iterable of dataset rows (mappings accepted by build_record).
        f: writable text file object.
        limit: maximum number of records to write.

    Returns:
        The number of records written.
    """
    records = (rec for rec in map(build_record, rows) if rec is not None)
    n = 0
    # islice stops pulling from the (possibly streaming) source as soon as
    # `limit` records have been produced, so no extra rows are fetched.
    for n, rec in enumerate(islice(records, limit), 1):
        f.write(json.dumps(rec) + "\n")
        if n % 5000 == 0:
            print("wrote", n, flush=True)
    return n


def main():
    """Stream the dataset and write the first N usable rows to out_path."""
    ds = load_dataset(
        "cognitivecomputations/dolphin-r1",
        "reasoning-deepseek",
        split="train",
        streaming=True,
    )
    # json.dumps escapes non-ASCII by default; the explicit encoding just
    # keeps the file independent of the platform's locale.
    with open(out_path, "w", encoding="utf-8") as f:
        n = write_subset(ds, f, N)
    print("DONE wrote", n, "to", out_path, flush=True)


if __name__ == "__main__":
    main()
|