baya1116
/

hypernet-sp-distill

Model card Files Files and versions

hypernet-sp-distill / data_prep.py

baya1116's picture

Upload data_prep.py with huggingface_hub

a04bca5 verified 10 days ago

history blame contribute delete

1.23 kB

	import json, re
	from datasets import load_dataset
	# Matches any single character in the CJK-ideograph, hiragana, or katakana
	# ranges spelled out in the character class below.
	# NOTE(review): Hangul is not covered despite the name - confirm intended.
	CJK = re.compile(r"[一-鿿぀-ゟ゠-ヿ]")
	# Maximum number of records written to the subset file.
	N = 50000
	# Destination JSONL file: one JSON object per line.
	out_path = "/workspace/dolphin_subset.jsonl"


	def _first_content(messages, role: str) -> str:
	    """Return the first non-empty content among messages with the given role.

	    Role matching is case-insensitive on the message side; *role* itself must
	    be lowercase. Missing or falsy ``role``/``content`` values are treated as
	    empty strings. Returns ``""`` when no matching message has content.
	    """
	    for message in messages:
	        if (message.get("role") or "").lower() != role:
	            continue
	        content = str(message.get("content") or "")
	        if content:
	            return content
	    return ""


	def build_record(row):
	    """Convert one dataset row into an output record, or ``None`` to skip it.

	    Reads ``row["messages"]``, ``row["reasoning"]`` and ``row["answer"]``
	    (all optional; missing or falsy values count as empty).

	    The row is skipped (``None``) when it has no user message content, no
	    reasoning, or no answer, or when the assembled query or output contains
	    a character matched by ``CJK``.

	    Otherwise returns
	    ``{"input": [{"role": "user", "content": query}], "output": output}``
	    where *query* is the user text, prefixed by the system text and a blank
	    line when a system message exists, and *output* wraps the reasoning in
	    ``<think>`` tags followed by the answer.
	    """
	    messages = row.get("messages") or []
	    system_text = _first_content(messages, "system")
	    user_text = _first_content(messages, "user")
	    reasoning = str(row.get("reasoning") or "")
	    answer = str(row.get("answer") or "")
	    if not (user_text and reasoning and answer):
	        return None

	    # The system prompt is folded into the single user turn.
	    query = f"{system_text}\n\n{user_text}" if system_text else user_text
	    output = f"<think>\n{reasoning}\n</think>\n\n{answer}"
	    if CJK.search(output) or CJK.search(query):
	        return None
	    return {"input": [{"role": "user", "content": query}], "output": output}


	def write_subset(rows, path: str, limit: int) -> int:
	    """Write up to *limit* converted rows from *rows* to *path* as JSONL.

	    Rows for which ``build_record`` returns ``None`` are skipped and do not
	    count toward the limit. Iteration stops as soon as the limit is reached,
	    so no further rows are pulled from *rows*. A non-positive *limit*
	    truncates/creates the file and writes nothing. Progress is printed every
	    5000 written records.

	    Returns the number of records written.
	    """
	    written = 0
	    # Explicit encoding keeps the file independent of the locale default.
	    with open(path, "w", encoding="utf-8") as f:
	        if limit <= 0:
	            return written
	        for row in rows:
	            record = build_record(row)
	            if record is None:
	                continue
	            f.write(json.dumps(record) + "\n")
	            written += 1
	            if written % 5000 == 0:
	                print("wrote", written, flush=True)
	            if written >= limit:
	                break
	    return written


	def main() -> None:
	    """Stream the dolphin-r1 ``reasoning-deepseek`` train split into ``out_path``.

	    The dataset is opened in streaming mode, so rows are consumed lazily and
	    reading stops once ``N`` records have been written.
	    """
	    ds = load_dataset(
	        "cognitivecomputations/dolphin-r1",
	        "reasoning-deepseek",
	        split="train",
	        streaming=True,
	    )
	    n = write_subset(ds, out_path, N)
	    print("DONE wrote", n, "to", out_path, flush=True)


	if __name__ == "__main__":
	    main()