# frankenstallm / source /scripts /prepare_sft_combined.sh
# pathcosmos's picture
# Upload folder using huggingface_hub (#17)
# 48ecd01
#!/usr/bin/env bash
# prepare_sft_combined.sh — merge all SFT data into a single train/val pair for 3B SFT.
# All SFT sources are combined into one train.jsonl / val.jsonl.
#
# Update (2026-03-02): new sft_extra sources added:
# - nayohan_Evol-Instruct-Code-80k-v1-ko (code instructions)
# - FreedomIntelligence_alpaca-gpt4-korean (GPT-4 alpaca, Korean)
# - FreedomIntelligence_evol-instruct-korean (evol-instruct, Korean)
# - coastral_korean-writing-style-instruct (Korean writing style)
# - maywell_ko_wikidata_QA (Wikidata QA)
# - OpenAssistant_oasst1_ko (OASST1 Korean, conversation-tree reconstruction)
# - Bllossom_evol-instruct-ko (loaded only if present)
set -euo pipefail

# Project root = parent of this script's directory.
BASE="$(cd "$(dirname "$0")/.." && pwd)"
OUT_DIR="$BASE/data/sft_combined"
mkdir -p "$OUT_DIR"

# Pass the data root to the embedded Python: the heredoc delimiter is quoted
# ('PYEOF'), so shell variables do NOT expand inside it — the value must
# travel through the environment instead of string interpolation.
export DATA_BASE="$BASE/data"
python3 << 'PYEOF'
import json, random, os, glob, hashlib
from collections import defaultdict

# Data root: overridable via the DATA_BASE environment variable (set by the
# wrapper script when available); defaults to the original hard-coded cluster
# path so behavior is unchanged when run standalone.
BASE = os.environ.get("DATA_BASE", "/PROJECT/0325120031_A/ghong/taketimes/llm-bang/data")
OUT_TRAIN = f"{BASE}/sft_combined/train.jsonl"
OUT_VAL = f"{BASE}/sft_combined/val.jsonl"
VAL_RATIO = 0.02   # fraction of samples held out for validation
SEED = 42          # RNG seed for shuffling/splitting (reproducible)
# SFT source files (each convertible to the unified chat format).
SOURCES = [
# (path, fmt) fmt: "messages" | "auto" | "oasst"
(f"{BASE}/sft/train.jsonl", "messages"),
(f"{BASE}/sft_extra/ultrachat_200k/train_sft.jsonl", "messages"),
(f"{BASE}/sft_extra/open_korean_instructions/train.jsonl", "messages"),
(f"{BASE}/sft_extra/korean_instruction_mix/train.jsonl", "messages"),
(f"{BASE}/sft_extra/openhermes_2.5/train.jsonl", "messages"),
(f"{BASE}/sft_extra/magpie_reasoning_v2/train.jsonl", "messages"),
(f"{BASE}/sft_extra/magpie_reasoning_ko/train.jsonl", "messages"),
(f"{BASE}/sft_extra/reasoning_r1_1.4m/train.jsonl", "messages"),
(f"{BASE}/sft_extra/lemon-mint_smol-koreantalk.jsonl", "auto"),
(f"{BASE}/sft_extra/dbdu_ShareGPT-74k-ko.jsonl", "auto"),
(f"{BASE}/sft_extra/ko_lima/data.jsonl", "auto"),
(f"{BASE}/sft_extra/koalpaca_v1_1a/data.jsonl", "auto"),
(f"{BASE}/sft_extra/kullm_v2/data.jsonl", "auto"),
(f"{BASE}/sft_extra/kuotient_orca-math-word-problems-193k-korean.jsonl", "auto"),
(f"{BASE}/sft_extra/kyujinpy_KOR-OpenOrca-Platypus-v3/data.jsonl", "auto"),
(f"{BASE}/sft_extra/nlp-with-deeplearning_Ko.WizardLM_evol_instruct_V2_196k.jsonl", "auto"),
(f"{BASE}/sft_extra/AI-MO_NuminaMath-CoT/data.jsonl", "auto"),
(f"{BASE}/sft_extra/zwhe99_DeepMath-103K/data.jsonl", "auto"),
# ---- New sources (2026-03-02) ----
(f"{BASE}/sft_extra/nayohan_Evol-Instruct-Code-80k-v1-ko/data.jsonl", "auto"),
(f"{BASE}/sft_extra/FreedomIntelligence_alpaca-gpt4-korean.jsonl", "auto"),
(f"{BASE}/sft_extra/FreedomIntelligence_evol-instruct-korean.jsonl", "auto"),
(f"{BASE}/sft_extra/coastral_korean-writing-style-instruct.jsonl", "auto"),
(f"{BASE}/sft_extra/maywell_ko_wikidata_QA.jsonl", "auto"),
(f"{BASE}/sft_extra/OpenAssistant_oasst1_ko.jsonl", "oasst"),
(f"{BASE}/sft_extra/Bllossom_evol-instruct-ko/data.jsonl", "auto"),
]
def to_messages(obj):
    """Normalize a sample from any known SFT format into chat 'messages'.

    Supported inputs: native 'messages', ShareGPT-style 'conversations'
    (from/value or role/content keys), instruction(/input)/output,
    question/answer, prompt/response|completion, and problem/solution.

    Returns a list of {'role', 'content'} dicts, or None when the object
    cannot be converted (unknown format or empty assistant output).
    """
    # Already in chat format — pass through untouched.
    if 'messages' in obj and isinstance(obj['messages'], list):
        return obj['messages']
    # ShareGPT-style conversations format.
    if 'conversations' in obj:
        msgs = []
        for turn in obj['conversations']:
            role = turn.get('from', turn.get('role', ''))
            content = turn.get('value', turn.get('content', ''))
            if role in ('human', 'user', 'prompter'):
                msgs.append({'role': 'user', 'content': content})
            elif role in ('gpt', 'assistant', 'bot'):
                msgs.append({'role': 'assistant', 'content': content})
            elif role == 'system':
                # Keep system turns instead of silently dropping them;
                # downstream validation accepts the 'system' role.
                msgs.append({'role': 'system', 'content': content})
        return msgs if len(msgs) >= 2 else None
    # instruction/output format (alpaca-style).
    if 'instruction' in obj:
        instruction = obj['instruction']
        inp = obj.get('input', '')
        output = obj.get('output', obj.get('response', ''))
        if not output: return None
        user_content = instruction + ('\n\n' + inp if inp else '')
        return [{'role': 'user', 'content': user_content}, {'role': 'assistant', 'content': output}]
    # question/answer format.
    if 'question' in obj and 'answer' in obj:
        return [{'role': 'user', 'content': obj['question']}, {'role': 'assistant', 'content': obj['answer']}]
    # prompt/response (or prompt/completion).
    if 'prompt' in obj and ('response' in obj or 'completion' in obj):
        resp = obj.get('response', obj.get('completion', ''))
        # Reject empty replies — an empty-string assistant turn would
        # otherwise pass validation and leak into the training data.
        if not resp: return None
        return [{'role': 'user', 'content': obj['prompt']}, {'role': 'assistant', 'content': resp}]
    # problem/solution (math datasets).
    if 'problem' in obj and 'solution' in obj:
        return [{'role': 'user', 'content': obj['problem']}, {'role': 'assistant', 'content': obj['solution']}]
    return None
def load_oasst(path):
    """
    Rebuild conversation threads from a flat OASST1 message dump.

    Messages are linked via parent_id into trees. From every root prompter
    message we walk down the single best-ranked branch (rank ascending,
    unranked children last) and emit one sample per assistant turn reached.
    Messages flagged deleted=True or review_result=False are excluded.
    """
    nodes = {}                    # message_id -> message dict
    children = defaultdict(list)  # parent_id -> list of child message dicts
    with open(path, 'r', errors='replace') as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            try:
                msg = json.loads(raw)
            except Exception:
                continue
            if msg.get('deleted', False) or msg.get('review_result') is False:
                continue
            msg_id = msg.get('message_id')
            if msg_id:
                nodes[msg_id] = msg
            parent = msg.get('parent_id')
            if parent:
                children[parent].append(msg)

    # Order each sibling list best-rank first; rank=None sorts last.
    def rank_order(msg):
        rank = msg.get('rank')
        tiebreak = msg.get('message_id', '')
        return (1, 0, tiebreak) if rank is None else (0, rank, tiebreak)

    for parent in children:
        children[parent].sort(key=rank_order)

    role_map = {'prompter': 'user', 'assistant': 'assistant'}
    samples = []
    # Roots are prompter messages without a parent.
    for root in (n for n in nodes.values()
                 if n.get('parent_id') is None and n.get('role') == 'prompter'):
        # Follow only the top-ranked child at each step (highest-quality path);
        # the chain is linear, so iterate instead of recursing.
        thread = []
        node = root
        while node is not None:
            mapped = role_map.get(node.get('role', ''))
            if mapped is None:
                break
            # Fresh list each turn so earlier samples are not mutated later.
            thread = thread + [{'role': mapped, 'content': node.get('text', '')}]
            # Emit a sample only once a valid user->assistant pair exists.
            if mapped == 'assistant' and len(thread) >= 2:
                samples.append({'messages': thread})
            kids = children.get(node.get('message_id'), [])
            node = kids[0] if kids else None
    return samples
random.seed(SEED)
all_samples = []

# ---- Load every source into the unified chat format ----
for path, fmt in SOURCES:
    if not os.path.exists(path):
        print(f"[SKIP] {path}")
        continue
    if fmt == "oasst":
        samples = load_oasst(path)
        all_samples.extend(samples)
        print(f"[LOADED] {os.path.basename(path)}: {len(samples):,} samples (oasst tree)")
        continue
    count = 0
    with open(path, 'r', errors='replace') as f:
        for line in f:
            line = line.strip()
            if not line: continue
            try:
                obj = json.loads(line)
            except Exception:
                continue
            if fmt == "messages":
                # Use the native messages list when present; otherwise run the
                # generic converter so ShareGPT-style 'conversations' rows
                # (from/value keys) are normalized to role/content instead of
                # being appended raw and silently dropped by validation below.
                msgs = obj.get('messages') or to_messages(obj)
            else:  # auto detect
                msgs = to_messages(obj)
            if msgs and len(msgs) >= 2:
                all_samples.append({'messages': msgs})
                count += 1
    print(f"[LOADED] {os.path.basename(path)}: {count:,} samples")
    if count == 0:
        print(f"[WARN] {os.path.basename(path)}: 0 samples extracted (format detection may have failed)")
print(f"\n์ด ์ƒ˜ํ”Œ: {len(all_samples):,}")

# ---- Deduplication (MD5 of first user message) ----
seen_hashes = set()
unique_samples = []
dup_count = 0
for s in all_samples:
    msgs = s.get('messages', [])
    # The first user turn keys the hash. Tolerate missing or non-string
    # content: dedup runs BEFORE validation, so a malformed turn must not
    # crash here (the sample is still removed by validation afterwards).
    first_user = next(
        (m.get('content') for m in msgs
         if isinstance(m, dict) and m.get('role') == 'user'),
        '',
    )
    if not isinstance(first_user, str):
        first_user = repr(first_user)
    h = hashlib.md5(first_user.encode('utf-8', errors='replace')).hexdigest()
    if h in seen_hashes:
        dup_count += 1
        continue
    seen_hashes.add(h)
    unique_samples.append(s)
print(f"[DEDUP] ์ œ๊ฑฐ: {dup_count:,}, ๋‚จ์€ ์ƒ˜ํ”Œ: {len(unique_samples):,}")
all_samples = unique_samples

# ---- Format validation ----
def validate_messages(msgs):
    """Check messages have valid role/content structure (>=2 turns, known roles, str content)."""
    if not isinstance(msgs, list) or len(msgs) < 2:
        return False
    for m in msgs:
        if not isinstance(m, dict):
            return False
        if m.get('role') not in ('user', 'assistant', 'system'):
            return False
        if not isinstance(m.get('content'), str):
            return False
    return True

valid_samples = []
invalid_count = 0
for s in all_samples:
    if validate_messages(s.get('messages', [])):
        valid_samples.append(s)
    else:
        invalid_count += 1
print(f"[VALIDATE] ์œ ํšจํ•˜์ง€ ์•Š์€ ํฌ๋งท ์ œ๊ฑฐ: {invalid_count:,}, ๋‚จ์€ ์ƒ˜ํ”Œ: {len(valid_samples):,}")
all_samples = valid_samples

# ---- Shuffle and split into train/val ----
random.shuffle(all_samples)
n_val = int(len(all_samples) * VAL_RATIO)
val_samples = all_samples[:n_val]
train_samples = all_samples[n_val:]

os.makedirs(os.path.dirname(OUT_TRAIN), exist_ok=True)
with open(OUT_TRAIN, 'w') as f:
    for s in train_samples:
        f.write(json.dumps(s, ensure_ascii=False) + '\n')
with open(OUT_VAL, 'w') as f:
    for s in val_samples:
        f.write(json.dumps(s, ensure_ascii=False) + '\n')
print(f"[DONE] train: {len(train_samples):,} โ†’ {OUT_TRAIN}")
print(f"[DONE] val: {len(val_samples):,} โ†’ {OUT_VAL}")
PYEOF
echo "SFT ๋ฐ์ดํ„ฐ ๋ณ‘ํ•ฉ ์™„๋ฃŒ"