File size: 1,234 Bytes
a04bca5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import json, re
from datasets import load_dataset
# Matches any character in the CJK Unified Ideographs (U+4E00-U+9FFF),
# Hiragana (U+3040-U+309F) or Katakana (U+30A0-U+30FF) ranges.
# NOTE(review): Hangul is not covered despite the "CJK" name -- confirm
# that Korean text is meant to pass this filter.
CJK = re.compile(r"[一-鿿぀-ゟ゠-ヿ]")
N = 50000  # maximum number of records to write
out_path = "/workspace/dolphin_subset.jsonl"


def first_content(messages, role):
    """Return the first non-empty content among messages with the given role.

    The role comparison is case-insensitive (``role`` must be lowercase).
    Content is coerced with ``str``; messages whose content is missing or
    falsy are skipped. Returns ``""`` when no message qualifies.
    """
    for message in messages:
        if (message.get("role") or "").lower() != role:
            continue
        text = str(message.get("content") or "")
        if text:
            return text
    return ""


def build_record(row):
    """Convert one dataset row into an output record, or ``None`` to skip it.

    Reads the ``messages``, ``reasoning`` and ``answer`` keys of ``row``.
    A row is skipped when it has no user message, no reasoning or no answer,
    or when the assembled query or output contains a character matched by
    ``CJK``.

    Returns:
        ``{"input": [{"role": "user", "content": query}], "output": output}``
        where ``query`` is the system prompt (if any) and the user prompt
        joined by a blank line, and ``output`` is the reasoning wrapped in
        ``<think>`` tags followed by the answer.
    """
    messages = row.get("messages") or []
    system_text = first_content(messages, "system")
    user_text = first_content(messages, "user")
    reasoning = str(row.get("reasoning") or "")
    answer = str(row.get("answer") or "")
    if not (user_text and reasoning and answer):
        return None

    # The system prompt is folded into the single user turn.
    query = f"{system_text}\n\n{user_text}" if system_text else user_text
    output = f"<think>\n{reasoning}\n</think>\n\n{answer}"
    if CJK.search(output) or CJK.search(query):
        return None
    return {"input": [{"role": "user", "content": query}], "output": output}


def write_subset(rows, fh, limit=N, progress_every=5000):
    """Write up to ``limit`` accepted rows to ``fh`` as JSON Lines.

    Args:
        rows: iterable of row mappings (see ``build_record``).
        fh: writable text file object.
        limit: maximum number of records to write; ``<= 0`` writes nothing.
        progress_every: print a progress line each time this many records
            have been written.

    Returns:
        The number of records written.
    """
    written = 0
    if limit <= 0:
        return written
    for row in rows:
        record = build_record(row)
        if record is None:
            continue
        # json.dumps escapes non-ASCII by default, so each line is pure ASCII.
        fh.write(json.dumps(record) + "\n")
        written += 1
        if written % progress_every == 0:
            print("wrote", written, flush=True)
        # Stop right after the last write so no extra row is pulled from
        # the (possibly streaming) source.
        if written >= limit:
            break
    return written


def main():
    """Stream the dolphin-r1 reasoning split and write the filtered subset."""
    ds = load_dataset(
        "cognitivecomputations/dolphin-r1",
        "reasoning-deepseek",
        split="train",
        streaming=True,
    )
    with open(out_path, "w", encoding="utf-8") as f:
        n = write_subset(ds, f, N)
    print("DONE wrote", n, "to", out_path, flush=True)


if __name__ == "__main__":
    main()