File size: 3,315 Bytes
d304eb5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 | """C17d: λͺ¨λ νμ΄ + κΈΈμ΄ νν° (1500μ μ΄νλ§) + NaN λ°©μ§"""
import json, re, random, torch, numpy as np, os
from collections import defaultdict
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from transformers import EarlyStoppingCallback
from datasets import Dataset
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(SEED)
if torch.cuda.get_device_capability()[0] >= 8: torch.set_float32_matmul_precision('high')
SP = "μ£Όμ΄μ§ μν λ¬Έμ λ₯Ό λ¨κ³λ³λ‘ νκ³ λ΅λ³μ μμ±νμΈμ.\nλ°λμ μ΅μ’
λ΅λ³μ \\boxed{μ μ} νμμΌλ‘ λ§μ§λ§ μ€μ μΆλ ₯νμΈμ.\nμμ: \\boxed{42}"
print("=== C17d: All solutions, length-filtered (β€1500 chars) ===")
with open("data/GSM8K_full_qwen3_30b.json") as f:
data = json.load(f)
# κΈΈμ΄ νν°: 1500μ μ΄νλ§
filtered = [d for d in data if len(d['answer']) <= 1500]
print(f"μλ³Έ: {len(data)}κ° β νν° ν: {len(filtered)}κ° (μ κ±°: {len(data)-len(filtered)})")
random.shuffle(filtered)
uq = len(set(d["question"] for d in filtered))
print(f"Unique: {uq}, avg {len(filtered)/uq:.1f}/q")
split = int(len(filtered) * 0.95)
train, test = filtered[:split], filtered[split:]
def to_sft(ex):
return {"prompt": [{"role":"user","content":SP+"\n\n"+ex["question"]}],
"completion": [{"role":"assistant","content":ex["answer"]}]}
cols = [c for c in Dataset.from_list(train[:1]).column_names if c not in ["prompt","completion"]]
train_ds = Dataset.from_list(train).map(to_sft, remove_columns=cols)
test_ds = Dataset.from_list(test).map(to_sft, remove_columns=cols)
print(f"νμ΅: {len(train_ds)} / κ²μ¦: {len(test_ds)}")
tokenizer = AutoTokenizer.from_pretrained("outputs/models/gemma-3-1b-it")
model = AutoModelForCausalLM.from_pretrained("outputs/models/gemma-3-1b-it", dtype=torch.bfloat16, device_map="auto", attn_implementation='flash_attention_2')
tokenizer.pad_token = tokenizer.eos_token
model.gradient_checkpointing_enable(); model.config.use_cache = False
cfg = SFTConfig(
report_to='none', seed=SEED, eval_strategy="steps", eval_steps=200,
save_total_limit=2, load_best_model_at_end=True, metric_for_best_model="eval_loss",
save_steps=200, num_train_epochs=3, warmup_ratio=0.05, weight_decay=0.01, max_grad_norm=1.0,
neftune_noise_alpha=5, per_device_train_batch_size=8, gradient_accumulation_steps=4,
per_device_eval_batch_size=2, max_length=2048, lr_scheduler_type='cosine',
learning_rate=2e-5, bf16=True, optim="paged_adamw_8bit",
output_dir="outputs/c17d_checkpoints", logging_steps=50, save_strategy="steps",
)
trainer = SFTTrainer(model=model, processing_class=tokenizer, train_dataset=train_ds, eval_dataset=test_ds, args=cfg,
callbacks=[EarlyStoppingCallback(early_stopping_patience=3)])
print("νμ΅ μμ (3 epochs, λͺ¨λ νμ΄, β€1500μ)")
r = trainer.train()
print(f"μλ£! Loss: {r.training_loss:.4f}")
SAVE = "outputs/models/c17d-gemma-3-1b-it-Math"
os.makedirs(SAVE, exist_ok=True)
model.eval(); model.save_pretrained(SAVE, safe_serialization=False); tokenizer.save_pretrained(SAVE)
print(f"μ μ₯: {SAVE}")
del model, trainer; torch.cuda.empty_cache()
print("GPU ν΄μ ")
|