# frankenstallm / source /scripts /prepare_sft_combined.sh
# pathcosmos's picture
# Upload folder using huggingface_hub (#17)
# 48ecd01
#!/usr/bin/env bash
# prepare_sft_combined.sh — merge all SFT data into a single train/val pair for 3B SFT.
# All SFT sources are combined into one train.jsonl / val.jsonl.
#
# Update (2026-03-02): new sft_extra sources added:
# - nayohan_Evol-Instruct-Code-80k-v1-ko (code instructions)
# - FreedomIntelligence_alpaca-gpt4-korean (GPT-4 alpaca, Korean)
# - FreedomIntelligence_evol-instruct-korean (evol-instruct, Korean)
# - coastral_korean-writing-style-instruct (Korean writing style)
# - maywell_ko_wikidata_QA (Wikidata QA)
# - OpenAssistant_oasst1_ko (OASST1 Korean, conversation-tree reconstruction)
# - Bllossom_evol-instruct-ko (loaded only if present)
set -euo pipefail

# Project root = parent of this script's directory.
BASE="$(cd "$(dirname "$0")/.." && pwd)"
OUT_DIR="$BASE/data/sft_combined"
mkdir -p "$OUT_DIR"

# Pass the data root to the embedded Python: the heredoc delimiter is quoted
# ('PYEOF'), so shell variables do NOT expand inside it — the value must
# travel through the environment instead of string interpolation.
export DATA_BASE="$BASE/data"
python3 << 'PYEOF'
import json, random, os, glob, hashlib
from collections import defaultdict

# Data root: overridable via the DATA_BASE environment variable (set by the
# wrapper script when available); defaults to the original hard-coded cluster
# path so behavior is unchanged when run standalone.
BASE = os.environ.get("DATA_BASE", "/PROJECT/0325120031_A/ghong/taketimes/llm-bang/data")
OUT_TRAIN = f"{BASE}/sft_combined/train.jsonl"
OUT_VAL = f"{BASE}/sft_combined/val.jsonl"
VAL_RATIO = 0.02   # fraction of samples held out for validation
SEED = 42          # RNG seed for shuffling/splitting (reproducible)
# SFT source files (each convertible to the unified chat format).
SOURCES = [
# (path, fmt) fmt: "messages" | "auto" | "oasst"
(f"{BASE}/sft/train.jsonl", "messages"),
(f"{BASE}/sft_extra/ultrachat_200k/train_sft.jsonl", "messages"),
(f"{BASE}/sft_extra/open_korean_instructions/train.jsonl", "messages"),
(f"{BASE}/sft_extra/korean_instruction_mix/train.jsonl", "messages"),
(f"{BASE}/sft_extra/openhermes_2.5/train.jsonl", "messages"),
(f"{BASE}/sft_extra/magpie_reasoning_v2/train.jsonl", "messages"),
(f"{BASE}/sft_extra/magpie_reasoning_ko/train.jsonl", "messages"),
(f"{BASE}/sft_extra/reasoning_r1_1.4m/train.jsonl", "messages"),
(f"{BASE}/sft_extra/lemon-mint_smol-koreantalk.jsonl", "auto"),
(f"{BASE}/sft_extra/dbdu_ShareGPT-74k-ko.jsonl", "auto"),
(f"{BASE}/sft_extra/ko_lima/data.jsonl", "auto"),
(f"{BASE}/sft_extra/koalpaca_v1_1a/data.jsonl", "auto"),
(f"{BASE}/sft_extra/kullm_v2/data.jsonl", "auto"),
(f"{BASE}/sft_extra/kuotient_orca-math-word-problems-193k-korean.jsonl", "auto"),
(f"{BASE}/sft_extra/kyujinpy_KOR-OpenOrca-Platypus-v3/data.jsonl", "auto"),
(f"{BASE}/sft_extra/nlp-with-deeplearning_Ko.WizardLM_evol_instruct_V2_196k.jsonl", "auto"),
(f"{BASE}/sft_extra/AI-MO_NuminaMath-CoT/data.jsonl", "auto"),
(f"{BASE}/sft_extra/zwhe99_DeepMath-103K/data.jsonl", "auto"),
# ---- New sources (2026-03-02) ----
(f"{BASE}/sft_extra/nayohan_Evol-Instruct-Code-80k-v1-ko/data.jsonl", "auto"),
(f"{BASE}/sft_extra/FreedomIntelligence_alpaca-gpt4-korean.jsonl", "auto"),
(f"{BASE}/sft_extra/FreedomIntelligence_evol-instruct-korean.jsonl", "auto"),
(f"{BASE}/sft_extra/coastral_korean-writing-style-instruct.jsonl", "auto"),
(f"{BASE}/sft_extra/maywell_ko_wikidata_QA.jsonl", "auto"),
(f"{BASE}/sft_extra/OpenAssistant_oasst1_ko.jsonl", "oasst"),
(f"{BASE}/sft_extra/Bllossom_evol-instruct-ko/data.jsonl", "auto"),
]
def to_messages(obj):
    """Normalize a sample from any known SFT format into chat 'messages'.

    Supported inputs: native 'messages', ShareGPT-style 'conversations'
    (from/value or role/content keys), instruction(/input)/output,
    question/answer, prompt/response|completion, and problem/solution.

    Returns a list of {'role', 'content'} dicts, or None when the object
    cannot be converted (unknown format or empty assistant output).
    """
    # Already in chat format — pass through untouched.
    if 'messages' in obj and isinstance(obj['messages'], list):
        return obj['messages']
    # ShareGPT-style conversations format.
    if 'conversations' in obj:
        msgs = []
        for turn in obj['conversations']:
            role = turn.get('from', turn.get('role', ''))
            content = turn.get('value', turn.get('content', ''))
            if role in ('human', 'user', 'prompter'):
                msgs.append({'role': 'user', 'content': content})
            elif role in ('gpt', 'assistant', 'bot'):
                msgs.append({'role': 'assistant', 'content': content})
            elif role == 'system':
                # Keep system turns instead of silently dropping them;
                # downstream validation accepts the 'system' role.
                msgs.append({'role': 'system', 'content': content})
        return msgs if len(msgs) >= 2 else None
    # instruction/output format (alpaca-style).
    if 'instruction' in obj:
        instruction = obj['instruction']
        inp = obj.get('input', '')
        output = obj.get('output', obj.get('response', ''))
        if not output: return None
        user_content = instruction + ('\n\n' + inp if inp else '')
        return [{'role': 'user', 'content': user_content}, {'role': 'assistant', 'content': output}]
    # question/answer format.
    if 'question' in obj and 'answer' in obj:
        return [{'role': 'user', 'content': obj['question']}, {'role': 'assistant', 'content': obj['answer']}]
    # prompt/response (or prompt/completion).
    if 'prompt' in obj and ('response' in obj or 'completion' in obj):
        resp = obj.get('response', obj.get('completion', ''))
        # Reject empty replies — an empty-string assistant turn would
        # otherwise pass validation and leak into the training data.
        if not resp: return None
        return [{'role': 'user', 'content': obj['prompt']}, {'role': 'assistant', 'content': resp}]
    # problem/solution (math datasets).
    if 'problem' in obj and 'solution' in obj:
        return [{'role': 'user', 'content': obj['problem']}, {'role': 'assistant', 'content': obj['solution']}]
    return None
def load_oasst(path):
    """
    Rebuild conversation threads from a flat OASST1 message dump.

    Messages are linked via parent_id into trees. From every root prompter
    message we walk down the single best-ranked branch (rank ascending,
    unranked children last) and emit one sample per assistant turn reached.
    Messages flagged deleted=True or review_result=False are excluded.
    """
    nodes = {}                    # message_id -> message dict
    children = defaultdict(list)  # parent_id -> list of child message dicts
    with open(path, 'r', errors='replace') as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            try:
                msg = json.loads(raw)
            except Exception:
                continue
            if msg.get('deleted', False) or msg.get('review_result') is False:
                continue
            msg_id = msg.get('message_id')
            if msg_id:
                nodes[msg_id] = msg
            parent = msg.get('parent_id')
            if parent:
                children[parent].append(msg)

    # Order each sibling list best-rank first; rank=None sorts last.
    def rank_order(msg):
        rank = msg.get('rank')
        tiebreak = msg.get('message_id', '')
        return (1, 0, tiebreak) if rank is None else (0, rank, tiebreak)

    for parent in children:
        children[parent].sort(key=rank_order)

    role_map = {'prompter': 'user', 'assistant': 'assistant'}
    samples = []
    # Roots are prompter messages without a parent.
    for root in (n for n in nodes.values()
                 if n.get('parent_id') is None and n.get('role') == 'prompter'):
        # Follow only the top-ranked child at each step (highest-quality path);
        # the chain is linear, so iterate instead of recursing.
        thread = []
        node = root
        while node is not None:
            mapped = role_map.get(node.get('role', ''))
            if mapped is None:
                break
            # Fresh list each turn so earlier samples are not mutated later.
            thread = thread + [{'role': mapped, 'content': node.get('text', '')}]
            # Emit a sample only once a valid user->assistant pair exists.
            if mapped == 'assistant' and len(thread) >= 2:
                samples.append({'messages': thread})
            kids = children.get(node.get('message_id'), [])
            node = kids[0] if kids else None
    return samples
random.seed(SEED)
all_samples = []

# ---- Load every source into the unified chat format ----
for path, fmt in SOURCES:
    if not os.path.exists(path):
        print(f"[SKIP] {path}")
        continue
    if fmt == "oasst":
        samples = load_oasst(path)
        all_samples.extend(samples)
        print(f"[LOADED] {os.path.basename(path)}: {len(samples):,} samples (oasst tree)")
        continue
    count = 0
    with open(path, 'r', errors='replace') as f:
        for line in f:
            line = line.strip()
            if not line: continue
            try:
                obj = json.loads(line)
            except Exception:
                continue
            if fmt == "messages":
                # Use the native messages list when present; otherwise run the
                # generic converter so ShareGPT-style 'conversations' rows
                # (from/value keys) are normalized to role/content instead of
                # being appended raw and silently dropped by validation below.
                msgs = obj.get('messages') or to_messages(obj)
            else:  # auto detect
                msgs = to_messages(obj)
            if msgs and len(msgs) >= 2:
                all_samples.append({'messages': msgs})
                count += 1
    print(f"[LOADED] {os.path.basename(path)}: {count:,} samples")
    if count == 0:
        print(f"[WARN] {os.path.basename(path)}: 0 samples extracted (format detection may have failed)")
print(f"\n์ด ์ƒ˜ํ”Œ: {len(all_samples):,}")

# ---- Deduplication (MD5 of first user message) ----
seen_hashes = set()
unique_samples = []
dup_count = 0
for s in all_samples:
    msgs = s.get('messages', [])
    # The first user turn keys the hash. Tolerate missing or non-string
    # content: dedup runs BEFORE validation, so a malformed turn must not
    # crash here (the sample is still removed by validation afterwards).
    first_user = next(
        (m.get('content') for m in msgs
         if isinstance(m, dict) and m.get('role') == 'user'),
        '',
    )
    if not isinstance(first_user, str):
        first_user = repr(first_user)
    h = hashlib.md5(first_user.encode('utf-8', errors='replace')).hexdigest()
    if h in seen_hashes:
        dup_count += 1
        continue
    seen_hashes.add(h)
    unique_samples.append(s)
print(f"[DEDUP] ์ œ๊ฑฐ: {dup_count:,}, ๋‚จ์€ ์ƒ˜ํ”Œ: {len(unique_samples):,}")
all_samples = unique_samples

# ---- Format validation ----
def validate_messages(msgs):
    """Check messages have valid role/content structure (>=2 turns, known roles, str content)."""
    if not isinstance(msgs, list) or len(msgs) < 2:
        return False
    for m in msgs:
        if not isinstance(m, dict):
            return False
        if m.get('role') not in ('user', 'assistant', 'system'):
            return False
        if not isinstance(m.get('content'), str):
            return False
    return True

valid_samples = []
invalid_count = 0
for s in all_samples:
    if validate_messages(s.get('messages', [])):
        valid_samples.append(s)
    else:
        invalid_count += 1
print(f"[VALIDATE] ์œ ํšจํ•˜์ง€ ์•Š์€ ํฌ๋งท ์ œ๊ฑฐ: {invalid_count:,}, ๋‚จ์€ ์ƒ˜ํ”Œ: {len(valid_samples):,}")
all_samples = valid_samples

# ---- Shuffle and split into train/val ----
random.shuffle(all_samples)
n_val = int(len(all_samples) * VAL_RATIO)
val_samples = all_samples[:n_val]
train_samples = all_samples[n_val:]

os.makedirs(os.path.dirname(OUT_TRAIN), exist_ok=True)
with open(OUT_TRAIN, 'w') as f:
    for s in train_samples:
        f.write(json.dumps(s, ensure_ascii=False) + '\n')
with open(OUT_VAL, 'w') as f:
    for s in val_samples:
        f.write(json.dumps(s, ensure_ascii=False) + '\n')
print(f"[DONE] train: {len(train_samples):,} โ†’ {OUT_TRAIN}")
print(f"[DONE] val: {len(val_samples):,} โ†’ {OUT_VAL}")
PYEOF
echo "SFT ๋ฐ์ดํ„ฐ ๋ณ‘ํ•ฉ ์™„๋ฃŒ"