"""Write a filtered JSONL subset of a streamed reasoning dataset.

Rows are streamed from the ``cognitivecomputations/dolphin-r1`` dataset
(``reasoning-deepseek`` config, ``train`` split).  Each usable row becomes one
JSON line of the form ``{"input": [{"role": "user", "content": ...}],
"output": ...}``.  Rows that lack a user message, reasoning, or answer, and
rows whose text matches ``CJK``, are skipped.  Streaming stops after ``N``
records have been written.
"""
import json
import re

# Matches any character in the ranges U+4E00-U+9FFF, U+3040-U+309F and
# U+30A0-U+30FF (CJK ideographs, hiragana, katakana).
CJK = re.compile(r"[一-鿿぀-ゟ゠-ヿ]")
# Maximum number of records to write.
N = 50000
out_path = "/workspace/dolphin_subset.jsonl"


def build_record(row):
    """Convert one dataset row into an output record, or ``None`` to skip it.

    Args:
        row: Mapping read via ``.get`` for the keys ``"messages"`` (an
            iterable of mappings with ``"role"`` and ``"content"``),
            ``"reasoning"`` and ``"answer"``.

    Returns:
        ``{"input": [{"role": "user", "content": query}], "output": output}``
        when the row is usable, otherwise ``None``.  A row is skipped when it
        has no non-empty user message, reasoning, or answer, or when the
        query or output contains a character matched by ``CJK``.
    """
    system_text = ""
    user_text = ""
    for message in row.get("messages") or []:
        role = (message.get("role") or "").lower()
        content = str(message.get("content") or "")
        # Only the first non-empty system and user messages are kept.
        if role == "system" and not system_text:
            system_text = content
        elif role == "user" and not user_text:
            user_text = content

    reasoning = str(row.get("reasoning") or "")
    answer = str(row.get("answer") or "")
    if not (user_text and reasoning and answer):
        return None

    # The system prompt, when present, is folded into the single user turn.
    query = (system_text + "\n\n" + user_text) if system_text else user_text
    # NOTE(review): reasoning and answer are separated by newlines only;
    # confirm that no explicit reasoning delimiters were intended here.
    output = "\n" + reasoning + "\n\n\n" + answer

    if CJK.search(output) or CJK.search(query):
        return None
    return {"input": [{"role": "user", "content": query}], "output": output}


def main():
    """Stream the dataset and write up to ``N`` records to ``out_path``."""
    # Imported here so that ``build_record`` can be imported and tested
    # without the third-party ``datasets`` package being installed.
    from datasets import load_dataset

    ds = load_dataset(
        "cognitivecomputations/dolphin-r1",
        "reasoning-deepseek",
        split="train",
        streaming=True,
    )
    n = 0
    # Explicit encoding so the file does not depend on the locale default.
    with open(out_path, "w", encoding="utf-8") as f:
        for row in ds:
            record = build_record(row)
            if record is None:
                continue
            f.write(json.dumps(record) + "\n")
            n += 1
            if n % 5000 == 0:
                print("wrote", n, flush=True)
            if n >= N:
                break
    print("DONE wrote", n, "to", out_path, flush=True)


if __name__ == "__main__":
    main()