source/scripts/prepare_sft_combined.sh · pathcosmos/frankenstallm at main

File size: 10,421 Bytes

48ecd01

#!/usr/bin/env bash
# prepare_sft_combined.sh — 3B SFT용 전체 데이터 통합
# 모든 SFT 데이터를 하나의 train/val 파일로 합침
#
# 업데이트 (2026-03-02): sft_extra 신규 소스 추가
#   - nayohan_Evol-Instruct-Code-80k-v1-ko  (코드 instruction)
#   - FreedomIntelligence_alpaca-gpt4-korean (GPT-4 alpaca 한국어)
#   - FreedomIntelligence_evol-instruct-korean (evol-instruct 한국어)
#   - coastral_korean-writing-style-instruct  (한국어 글쓰기 스타일)
#   - maywell_ko_wikidata_QA                  (위키데이터 QA)
#   - OpenAssistant_oasst1_ko                 (OASST1 한국어, 트리 재구성)
#   - Bllossom_evol-instruct-ko               (존재 확인 후 로드)
set -euo pipefail
BASE="$(cd "$(dirname "$0")/.." && pwd)"
OUT_DIR="$BASE/data/sft_combined"
mkdir -p "$OUT_DIR"

python3 << 'PYEOF'
import json, random, os, glob, hashlib
from collections import defaultdict

BASE = "/PROJECT/0325120031_A/ghong/taketimes/llm-bang/data"
OUT_TRAIN = f"{BASE}/sft_combined/train.jsonl"
OUT_VAL = f"{BASE}/sft_combined/val.jsonl"
VAL_RATIO = 0.02
SEED = 42

# SFT 소스 파일 목록 (chat 포맷으로 변환 가능한 것들)
SOURCES = [
    # (path, fmt)  fmt: "messages" | "auto" | "oasst"
    (f"{BASE}/sft/train.jsonl", "messages"),
    (f"{BASE}/sft_extra/ultrachat_200k/train_sft.jsonl", "messages"),
    (f"{BASE}/sft_extra/open_korean_instructions/train.jsonl", "messages"),
    (f"{BASE}/sft_extra/korean_instruction_mix/train.jsonl", "messages"),
    (f"{BASE}/sft_extra/openhermes_2.5/train.jsonl", "messages"),
    (f"{BASE}/sft_extra/magpie_reasoning_v2/train.jsonl", "messages"),
    (f"{BASE}/sft_extra/magpie_reasoning_ko/train.jsonl", "messages"),
    (f"{BASE}/sft_extra/reasoning_r1_1.4m/train.jsonl", "messages"),
    (f"{BASE}/sft_extra/lemon-mint_smol-koreantalk.jsonl", "auto"),
    (f"{BASE}/sft_extra/dbdu_ShareGPT-74k-ko.jsonl", "auto"),
    (f"{BASE}/sft_extra/ko_lima/data.jsonl", "auto"),
    (f"{BASE}/sft_extra/koalpaca_v1_1a/data.jsonl", "auto"),
    (f"{BASE}/sft_extra/kullm_v2/data.jsonl", "auto"),
    (f"{BASE}/sft_extra/kuotient_orca-math-word-problems-193k-korean.jsonl", "auto"),
    (f"{BASE}/sft_extra/kyujinpy_KOR-OpenOrca-Platypus-v3/data.jsonl", "auto"),
    (f"{BASE}/sft_extra/nlp-with-deeplearning_Ko.WizardLM_evol_instruct_V2_196k.jsonl", "auto"),
    (f"{BASE}/sft_extra/AI-MO_NuminaMath-CoT/data.jsonl", "auto"),
    (f"{BASE}/sft_extra/zwhe99_DeepMath-103K/data.jsonl", "auto"),
    # ---- 신규 소스 (2026-03-02) ----
    (f"{BASE}/sft_extra/nayohan_Evol-Instruct-Code-80k-v1-ko/data.jsonl", "auto"),
    (f"{BASE}/sft_extra/FreedomIntelligence_alpaca-gpt4-korean.jsonl", "auto"),
    (f"{BASE}/sft_extra/FreedomIntelligence_evol-instruct-korean.jsonl", "auto"),
    (f"{BASE}/sft_extra/coastral_korean-writing-style-instruct.jsonl", "auto"),
    (f"{BASE}/sft_extra/maywell_ko_wikidata_QA.jsonl", "auto"),
    (f"{BASE}/sft_extra/OpenAssistant_oasst1_ko.jsonl", "oasst"),
    (f"{BASE}/sft_extra/Bllossom_evol-instruct-ko/data.jsonl", "auto"),
]

def to_messages(obj):
    """다양한 포맷을 통일된 messages 포맷으로 변환"""
    # 이미 messages 포맷
    if 'messages' in obj and isinstance(obj['messages'], list):
        return obj['messages']
    # conversations 포맷
    if 'conversations' in obj:
        msgs = []
        for turn in obj['conversations']:
            role = turn.get('from', turn.get('role', ''))
            content = turn.get('value', turn.get('content', ''))
            if role in ('human', 'user', 'prompter'):
                msgs.append({'role': 'user', 'content': content})
            elif role in ('gpt', 'assistant', 'bot'):
                msgs.append({'role': 'assistant', 'content': content})
        return msgs if len(msgs) >= 2 else None
    # instruction/output 포맷
    if 'instruction' in obj:
        instruction = obj['instruction']
        inp = obj.get('input', '')
        output = obj.get('output', obj.get('response', ''))
        if not output: return None
        user_content = instruction + ('\n\n' + inp if inp else '')
        return [{'role': 'user', 'content': user_content}, {'role': 'assistant', 'content': output}]
    # question/answer 포맷
    if 'question' in obj and 'answer' in obj:
        return [{'role': 'user', 'content': obj['question']}, {'role': 'assistant', 'content': obj['answer']}]
    # prompt/response
    if 'prompt' in obj and ('response' in obj or 'completion' in obj):
        resp = obj.get('response', obj.get('completion', ''))
        return [{'role': 'user', 'content': obj['prompt']}, {'role': 'assistant', 'content': resp}]
    # problem/solution
    if 'problem' in obj and 'solution' in obj:
        return [{'role': 'user', 'content': obj['problem']}, {'role': 'assistant', 'content': obj['solution']}]
    return None


def load_oasst(path):
    """
    OpenAssistant OASST1 flat message 포맷을 대화 트리로 재구성.
    각 루트(prompter) 메시지에서 best-ranked assistant 응답(rank=0.0)을
    따라 단일 대화 스레드를 추출한다.
    deleted=True 메시지와 review_result=False 메시지는 제외.
    """
    nodes = {}      # message_id → obj
    children = defaultdict(list)  # parent_id → [child_obj, ...]

    with open(path, 'r', errors='replace') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except Exception:
                continue
            if obj.get('deleted', False):
                continue
            if obj.get('review_result') is False:
                continue
            mid = obj.get('message_id')
            if mid:
                nodes[mid] = obj
            pid = obj.get('parent_id')
            if pid:
                children[pid].append(obj)

    # 자식 목록을 rank 오름차순 정렬 (rank=null은 뒤로)
    def sort_key(c):
        r = c.get('rank')
        mid = c.get('message_id', '')
        return (1, 0, mid) if r is None else (0, r, mid)
    for pid in children:
        children[pid].sort(key=sort_key)

    samples = []

    def build_thread(node, current_msgs):
        """재귀적으로 대화 스레드를 따라 samples에 추가."""
        role = node.get('role', '')
        text = node.get('text', '')
        if role == 'prompter':
            mapped_role = 'user'
        elif role == 'assistant':
            mapped_role = 'assistant'
        else:
            return

        msgs = current_msgs + [{'role': mapped_role, 'content': text}]

        # 유효한 user→assistant 쌍이 있을 때만 샘플 추가
        if mapped_role == 'assistant' and len(msgs) >= 2:
            samples.append({'messages': msgs})

        # 자식 중 best (rank=0.0) 하나만 따라간다 (가장 품질 높은 경로)
        kids = children.get(node.get('message_id'), [])
        if kids:
            build_thread(kids[0], msgs)

    # 루트 노드: parent_id가 없는 prompter 메시지
    roots = [n for n in nodes.values() if n.get('parent_id') is None and n.get('role') == 'prompter']
    for root in roots:
        build_thread(root, [])

    return samples


random.seed(SEED)
all_samples = []

for path, fmt in SOURCES:
    if not os.path.exists(path):
        print(f"[SKIP] {path}")
        continue

    if fmt == "oasst":
        samples = load_oasst(path)
        all_samples.extend(samples)
        print(f"[LOADED] {os.path.basename(path)}: {len(samples):,} samples (oasst tree)")
        continue

    count = 0
    with open(path, 'r', errors='replace') as f:
        for line in f:
            line = line.strip()
            if not line: continue
            try:
                obj = json.loads(line)
            except Exception:
                continue
            if fmt == "messages":
                msgs = obj.get('messages') or obj.get('conversations')
                if msgs:
                    all_samples.append({'messages': msgs})
                    count += 1
            else:  # auto detect
                msgs = to_messages(obj)
                if msgs and len(msgs) >= 2:
                    all_samples.append({'messages': msgs})
                    count += 1
    print(f"[LOADED] {os.path.basename(path)}: {count:,} samples")
    if count == 0:
        print(f"[WARN] {os.path.basename(path)}: 0 samples extracted (format detection may have failed)")

print(f"\n총 샘플: {len(all_samples):,}")

# ---- Deduplication (MD5 of first user message) ----
seen_hashes = set()
unique_samples = []
dup_count = 0
for s in all_samples:
    msgs = s.get('messages', [])
    first_user = next((m['content'] for m in msgs if m.get('role') == 'user'), '')
    h = hashlib.md5(first_user.encode('utf-8', errors='replace')).hexdigest()
    if h in seen_hashes:
        dup_count += 1
        continue
    seen_hashes.add(h)
    unique_samples.append(s)

print(f"[DEDUP] 제거: {dup_count:,}, 남은 샘플: {len(unique_samples):,}")
all_samples = unique_samples

# ---- Format validation ----
def validate_messages(msgs):
    """Check messages have valid role/content structure."""
    if not isinstance(msgs, list) or len(msgs) < 2:
        return False
    for m in msgs:
        if not isinstance(m, dict):
            return False
        if m.get('role') not in ('user', 'assistant', 'system'):
            return False
        if not isinstance(m.get('content'), str):
            return False
    return True

valid_samples = []
invalid_count = 0
for s in all_samples:
    if validate_messages(s.get('messages', [])):
        valid_samples.append(s)
    else:
        invalid_count += 1

print(f"[VALIDATE] 유효하지 않은 포맷 제거: {invalid_count:,}, 남은 샘플: {len(valid_samples):,}")
all_samples = valid_samples

random.shuffle(all_samples)

n_val = int(len(all_samples) * VAL_RATIO)
val_samples = all_samples[:n_val]
train_samples = all_samples[n_val:]

os.makedirs(os.path.dirname(OUT_TRAIN), exist_ok=True)
with open(OUT_TRAIN, 'w') as f:
    for s in train_samples:
        f.write(json.dumps(s, ensure_ascii=False) + '\n')
with open(OUT_VAL, 'w') as f:
    for s in val_samples:
        f.write(json.dumps(s, ensure_ascii=False) + '\n')

print(f"[DONE] train: {len(train_samples):,} → {OUT_TRAIN}")
print(f"[DONE] val:   {len(val_samples):,} → {OUT_VAL}")
PYEOF
echo "SFT 데이터 병합 완료"