baya1116 committed on
Commit
a04bca5
·
verified ·
1 Parent(s): dd6b58f

Upload data_prep.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data_prep.py +28 -0
data_prep.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Write a JSONL subset of the dolphin-r1 "reasoning-deepseek" split.

Rows are streamed from the Hugging Face Hub, converted to
``{"input": [...], "output": ...}`` records, and written one JSON object per
line to ``out_path`` until ``N`` records have been kept.  Rows that lack a
user message, reasoning, or answer, or that contain characters matched by
``CJK``, are skipped.
"""
import json
import re

# Matches any character in U+4E00-U+9FFF (CJK Unified Ideographs),
# U+3040-U+309F (Hiragana) or U+30A0-U+30FF (Katakana).
CJK = re.compile(r"[一-鿿぀-ゟ゠-ヿ]")
# Maximum number of records to write.
N = 50000
# Destination JSONL file.
out_path = "/workspace/dolphin_subset.jsonl"


def build_record(row):
    """Convert one dataset row into an output record.

    Args:
        row: Mapping read with ``.get`` for the keys ``"messages"`` (a list
            of mappings with ``"role"`` and ``"content"``), ``"reasoning"``
            and ``"answer"``.  Missing or ``None`` values are treated as
            empty.

    Returns:
        ``{"input": [{"role": "user", "content": query}], "output": output}``
        or ``None`` when the row must be skipped (no user content, no
        reasoning, no answer, or a ``CJK`` match in the query or output).
    """
    sys_c = ""
    usr_c = ""
    for m in row.get("messages") or []:
        role = (m.get("role") or "").lower()
        # Only the first non-empty system and user messages are kept.
        if role == "system" and not sys_c:
            sys_c = str(m.get("content") or "")
        elif role == "user" and not usr_c:
            usr_c = str(m.get("content") or "")

    reasoning = str(row.get("reasoning") or "")
    answer = str(row.get("answer") or "")
    if not (usr_c and reasoning and answer):
        return None

    # The system prompt, when present, is folded into the single user turn.
    query = (sys_c + "\n\n" + usr_c) if sys_c else usr_c
    output = "<think>\n" + reasoning + "\n</think>\n\n" + answer
    if CJK.search(output) or CJK.search(query):
        return None

    return {"input": [{"role": "user", "content": query}], "output": output}


def main():
    """Stream the dataset and write up to ``N`` records to ``out_path``."""
    # Imported here so that build_record can be imported and tested without
    # the third-party `datasets` package being installed.
    from datasets import load_dataset

    ds = load_dataset(
        "cognitivecomputations/dolphin-r1",
        "reasoning-deepseek",
        split="train",
        streaming=True,
    )
    n = 0
    # json.dumps escapes non-ASCII by default, so the bytes written are the
    # same as before; the explicit encoding only removes the dependence on
    # the platform's locale default.
    with open(out_path, "w", encoding="utf-8") as f:
        for row in ds:
            record = build_record(row)
            if record is None:
                continue
            f.write(json.dumps(record) + "\n")
            n += 1
            if n % 5000 == 0:
                print("wrote", n, flush=True)
            if n >= N:
                break
    print("DONE wrote", n, "to", out_path, flush=True)


if __name__ == "__main__":
    main()