#!/usr/bin/env bash
# Merge heterogeneous SFT datasets into a single train/val JSONL pair.
# The actual merging logic lives in the embedded Python heredoc below.

set -euo pipefail

# Repository root, resolved relative to this script's own location.
BASE="$(cd "$(dirname "$0")/.." && pwd)"
OUT_DIR="$BASE/data/sft_combined"
mkdir -p "$OUT_DIR"

# Let the embedded Python resolve paths from this checkout instead of its
# hard-coded fallback.  The heredoc delimiter is quoted ('PYEOF'), so shell
# expansion does not reach the Python source — exported env vars are the
# only channel into it.
export DATA_DIR="$BASE/data"

python3 << 'PYEOF'
| import json, random, os, glob, hashlib |
| from collections import defaultdict |
|
|
# Data root: prefer DATA_DIR exported by the wrapper script so paths follow
# the checkout location; fall back to the original hard-coded absolute path
# so the script still works when run without the wrapper.
BASE = os.environ.get("DATA_DIR", "/PROJECT/0325120031_A/ghong/taketimes/llm-bang/data")
OUT_TRAIN = f"{BASE}/sft_combined/train.jsonl"
OUT_VAL = f"{BASE}/sft_combined/val.jsonl"
VAL_RATIO = 0.02  # fraction of samples held out as the validation split
SEED = 42         # RNG seed for the shuffle before splitting
|
|
| |
# (path, format) registry consumed by the loading loop below.  Format tags:
#   "messages" — each JSONL line already carries a 'messages'/'conversations' list
#   "auto"     — per-record key detection via to_messages()
#   "oasst"    — flat OASST1 message dump, rebuilt into threads by load_oasst()
# Missing files are skipped with a [SKIP] notice, so this list may safely
# contain sources that are not present on every machine.
SOURCES = [

    (f"{BASE}/sft/train.jsonl", "messages"),
    (f"{BASE}/sft_extra/ultrachat_200k/train_sft.jsonl", "messages"),
    (f"{BASE}/sft_extra/open_korean_instructions/train.jsonl", "messages"),
    (f"{BASE}/sft_extra/korean_instruction_mix/train.jsonl", "messages"),
    (f"{BASE}/sft_extra/openhermes_2.5/train.jsonl", "messages"),
    (f"{BASE}/sft_extra/magpie_reasoning_v2/train.jsonl", "messages"),
    (f"{BASE}/sft_extra/magpie_reasoning_ko/train.jsonl", "messages"),
    (f"{BASE}/sft_extra/reasoning_r1_1.4m/train.jsonl", "messages"),
    (f"{BASE}/sft_extra/lemon-mint_smol-koreantalk.jsonl", "auto"),
    (f"{BASE}/sft_extra/dbdu_ShareGPT-74k-ko.jsonl", "auto"),
    (f"{BASE}/sft_extra/ko_lima/data.jsonl", "auto"),
    (f"{BASE}/sft_extra/koalpaca_v1_1a/data.jsonl", "auto"),
    (f"{BASE}/sft_extra/kullm_v2/data.jsonl", "auto"),
    (f"{BASE}/sft_extra/kuotient_orca-math-word-problems-193k-korean.jsonl", "auto"),
    (f"{BASE}/sft_extra/kyujinpy_KOR-OpenOrca-Platypus-v3/data.jsonl", "auto"),
    (f"{BASE}/sft_extra/nlp-with-deeplearning_Ko.WizardLM_evol_instruct_V2_196k.jsonl", "auto"),
    (f"{BASE}/sft_extra/AI-MO_NuminaMath-CoT/data.jsonl", "auto"),
    (f"{BASE}/sft_extra/zwhe99_DeepMath-103K/data.jsonl", "auto"),

    (f"{BASE}/sft_extra/nayohan_Evol-Instruct-Code-80k-v1-ko/data.jsonl", "auto"),
    (f"{BASE}/sft_extra/FreedomIntelligence_alpaca-gpt4-korean.jsonl", "auto"),
    (f"{BASE}/sft_extra/FreedomIntelligence_evol-instruct-korean.jsonl", "auto"),
    (f"{BASE}/sft_extra/coastral_korean-writing-style-instruct.jsonl", "auto"),
    (f"{BASE}/sft_extra/maywell_ko_wikidata_QA.jsonl", "auto"),
    (f"{BASE}/sft_extra/OpenAssistant_oasst1_ko.jsonl", "oasst"),
    (f"{BASE}/sft_extra/Bllossom_evol-instruct-ko/data.jsonl", "auto"),
]
|
|
def to_messages(obj):
    """Normalize a record in any of several known formats to the unified
    chat ``messages`` format: a list of ``{'role', 'content'}`` dicts.

    Recognized input shapes, tried in order:
      * ``messages``            — returned as-is
      * ``conversations``       — ShareGPT style (from/value or role/content keys)
      * ``instruction``(+input) — Alpaca style, paired with output/response
      * ``question``/``answer``
      * ``prompt``/``response`` or ``prompt``/``completion``
      * ``problem``/``solution`` — math-style records

    Returns None when no format matches or the extracted pair is unusable
    (fewer than two turns, or an empty user/assistant side).  The emptiness
    check applies to every pair-style branch — previously only the
    instruction branch rejected empty outputs, letting empty assistant
    responses leak into the training data.
    """
    if 'messages' in obj and isinstance(obj['messages'], list):
        return obj['messages']

    if 'conversations' in obj:
        msgs = []
        for turn in obj['conversations']:
            # Accept both ShareGPT ('from'/'value') and chat ('role'/'content') keys.
            role = turn.get('from', turn.get('role', ''))
            content = turn.get('value', turn.get('content', ''))
            if role in ('human', 'user', 'prompter'):
                msgs.append({'role': 'user', 'content': content})
            elif role in ('gpt', 'assistant', 'bot'):
                msgs.append({'role': 'assistant', 'content': content})
        return msgs if len(msgs) >= 2 else None

    def _pair(user_content, assistant_content):
        # Build a single-turn sample; reject if either side is empty/missing.
        if not user_content or not assistant_content:
            return None
        return [{'role': 'user', 'content': user_content},
                {'role': 'assistant', 'content': assistant_content}]

    if 'instruction' in obj:
        inp = obj.get('input', '')
        output = obj.get('output', obj.get('response', ''))
        user_content = obj['instruction'] + ('\n\n' + inp if inp else '')
        return _pair(user_content, output)

    if 'question' in obj and 'answer' in obj:
        return _pair(obj['question'], obj['answer'])

    if 'prompt' in obj and ('response' in obj or 'completion' in obj):
        return _pair(obj['prompt'], obj.get('response', obj.get('completion', '')))

    if 'problem' in obj and 'solution' in obj:
        return _pair(obj['problem'], obj['solution'])

    return None
|
|
|
|
def load_oasst(path):
    """Reconstruct conversation threads from a flat OASST1 message dump.

    From each root (prompter) message a single thread is extracted by
    repeatedly following the best-ranked child (rank ascending, unranked
    last, ties broken by message_id).  Every assistant turn along the way
    yields one sample containing the conversation prefix up to that turn.
    Messages flagged deleted=True or review_result=False are skipped.

    Returns a list of {'messages': [...]} dicts.
    """
    nodes = {}
    children = defaultdict(list)

    with open(path, 'r', errors='replace') as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            try:
                rec = json.loads(raw)
            except Exception:
                continue
            if rec.get('deleted', False) or rec.get('review_result') is False:
                continue
            mid = rec.get('message_id')
            if mid:
                nodes[mid] = rec
            pid = rec.get('parent_id')
            if pid:
                children[pid].append(rec)

    def _order(msg):
        # Ranked children first (ascending rank), unranked last; message_id
        # makes the ordering deterministic.
        rank = msg.get('rank')
        if rank is None:
            return (1, 0, msg.get('message_id', ''))
        return (0, rank, msg.get('message_id', ''))

    for pid in children:
        children[pid].sort(key=_order)

    samples = []
    roots = [n for n in nodes.values()
             if n.get('parent_id') is None and n.get('role') == 'prompter']

    # Only the top-ranked child is ever followed, so each root yields a
    # simple chain — walk it iteratively (no recursion-depth concerns).
    for root in roots:
        msgs = []
        node = root
        while node is not None:
            role = node.get('role', '')
            if role == 'prompter':
                msgs = msgs + [{'role': 'user', 'content': node.get('text', '')}]
            elif role == 'assistant':
                msgs = msgs + [{'role': 'assistant', 'content': node.get('text', '')}]
                # Emit a sample at every assistant turn with at least one
                # preceding user turn.
                if len(msgs) >= 2:
                    samples.append({'messages': msgs})
            else:
                break
            kids = children.get(node.get('message_id'), [])
            node = kids[0] if kids else None

    return samples
|
|
|
|
# ---------------------------------------------------------------------------
# Load every source file and normalize each record to {'messages': [...]}.
# ---------------------------------------------------------------------------
random.seed(SEED)
all_samples = []

for path, fmt in SOURCES:
    if not os.path.exists(path):
        print(f"[SKIP] {path}")
        continue

    if fmt == "oasst":
        # Flat OASST dump: rebuild conversation threads first.
        samples = load_oasst(path)
        all_samples.extend(samples)
        print(f"[LOADED] {os.path.basename(path)}: {len(samples):,} samples (oasst tree)")
        continue

    count = 0
    with open(path, 'r', errors='replace') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except Exception:
                continue  # skip malformed JSON lines
            msgs = obj.get('messages') if fmt == "messages" else None
            if isinstance(msgs, list) and msgs:
                all_samples.append({'messages': msgs})
                count += 1
            else:
                # Fall back to per-record format detection.  This also
                # normalizes "messages"-tagged sources whose records are
                # actually 'conversations' (from/value keys): previously
                # those were appended unconverted and silently discarded by
                # validation later, inflating the per-source counts.
                msgs = to_messages(obj)
                if msgs and len(msgs) >= 2:
                    all_samples.append({'messages': msgs})
                    count += 1
    print(f"[LOADED] {os.path.basename(path)}: {count:,} samples")
    if count == 0:
        print(f"[WARN] {os.path.basename(path)}: 0 samples extracted (format detection may have failed)")

print(f"\n์ด ์ํ: {len(all_samples):,}")
|
|
| |
# Deduplicate on the first user turn only: two samples sharing the same
# first user message are treated as duplicates regardless of the response.
seen_hashes = set()
unique_samples = []
dup_count = 0
for sample in all_samples:
    turns = sample.get('messages', [])
    first_user = next((t['content'] for t in turns if t.get('role') == 'user'), '')
    digest = hashlib.md5(first_user.encode('utf-8', errors='replace')).hexdigest()
    if digest not in seen_hashes:
        seen_hashes.add(digest)
        unique_samples.append(sample)
    else:
        dup_count += 1

print(f"[DEDUP] ์ ๊ฑฐ: {dup_count:,}, ๋จ์ ์ํ: {len(unique_samples):,}")
all_samples = unique_samples
|
|
| |
def validate_messages(msgs):
    """Return True iff *msgs* is a structurally valid chat transcript:
    a list of at least two dicts, each carrying a known role and a string
    content."""
    if not isinstance(msgs, list) or len(msgs) < 2:
        return False
    return all(
        isinstance(m, dict)
        and m.get('role') in ('user', 'assistant', 'system')
        and isinstance(m.get('content'), str)
        for m in msgs
    )
|
|
# Drop every sample whose message list fails structural validation.
valid_samples = [s for s in all_samples if validate_messages(s.get('messages', []))]
invalid_count = len(all_samples) - len(valid_samples)

print(f"[VALIDATE] ์ ํจํ์ง ์์ ํฌ๋งท ์ ๊ฑฐ: {invalid_count:,}, ๋จ์ ์ํ: {len(valid_samples):,}")
all_samples = valid_samples
|
|
# Shuffle (seeded above), carve off the validation split, and write both
# JSONL files.  ensure_ascii=False keeps Korean text human-readable.
random.shuffle(all_samples)

n_val = int(len(all_samples) * VAL_RATIO)
val_samples, train_samples = all_samples[:n_val], all_samples[n_val:]

os.makedirs(os.path.dirname(OUT_TRAIN), exist_ok=True)
for out_path, subset in ((OUT_TRAIN, train_samples), (OUT_VAL, val_samples)):
    with open(out_path, 'w') as f:
        f.writelines(json.dumps(s, ensure_ascii=False) + '\n' for s in subset)

print(f"[DONE] train: {len(train_samples):,} โ {OUT_TRAIN}")
print(f"[DONE] val: {len(val_samples):,} โ {OUT_VAL}")
PYEOF
# Reaching this line means the embedded Python exited 0 (set -e aborts otherwise).
echo "SFT ๋ฐ์ดํฐ ๋ณํฉ ์๋ฃ"
|
|