| """ |
| Faz 3 — SmartCore V1 tam pretraining (fork/Triton hibrit, Colab A100). |
| |
| Model: Mamba-3 SISO (mamba-og Triton kernel) + her 6. katman GQA (torch SDPA, flash_attn'sız). |
| Veri: kdirgul/smartcore-v1-data parquet shard'ları (önce yerele indirilir → resumable, hızlı). |
| Eğitim: WSD (warmup→stable→decay), AdamW (2D-only wd), bf16 autocast, grad-accum ~0.5M token, |
| grad-clip 1.0, z-loss; checkpoint + async HF push + cross-session --resume. |
| |
| ÖNKOŞUL (Colab): mamba-og fork kurulu (Faz 3a). HF_TOKEN env (private repo). |
| |
| Kullanım (ilk): |
| HF_TOKEN=hf_xxx python faz3_train.py |
| Devam (yeni Colab session): |
| HF_TOKEN=hf_xxx python faz3_train.py --resume latest_hf |
| """ |
| import os, sys, time, math, glob, argparse, random, signal, threading |
| import torch, torch.nn as nn, torch.nn.functional as F |
| from functools import partial |
| from concurrent.futures import ThreadPoolExecutor |
|
|
| |
| |
| try: |
| from mamba_ssm.modules.block import Block |
| from mamba_ssm.modules.mamba3 import Mamba3 |
| from mamba_ssm.modules.mlp import GatedMLP |
| from mamba_ssm.ops.triton.layer_norm import RMSNorm |
| except ImportError: |
| Block = Mamba3 = GatedMLP = RMSNorm = None |
|
|
| |
| PRESETS = { |
| "177m": dict(d_model=768, n_layers=20, d_intermediate=1500, head_dim=64, n_heads=12, n_kv_heads=3), |
| "350m": dict(d_model=1024, n_layers=24, d_intermediate=2048, head_dim=64, n_heads=16, n_kv_heads=4), |
| } |
|
|
|
|
| def _rms(x, w, eps=1e-5): |
| return (x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)) * w |
|
|
|
|
| def _rot_half(x): |
| a, b = x.chunk(2, -1) |
| return torch.cat((-b, a), -1) |
|
|
|
|
| class GQAMixer(nn.Module): |
| """GQA attention (QK-norm + RoPE, causal) — torch SDPA, flash_attn YOK. Block'a uyumlu (x->tensor). |
| Çıkış projeksiyonu 'out_proj' adıyla → _init_weights residual-rescale'i yakalar.""" |
| def __init__(self, dim, n_heads=12, n_kv=3, base=10000.0, layer_idx=None, device=None, dtype=None): |
| super().__init__() |
| self.nh, self.nkv, self.hd = n_heads, n_kv, dim // n_heads |
| self.rep = n_heads // n_kv |
| fk = {"device": device, "dtype": dtype} |
| self.q_proj = nn.Linear(dim, n_heads * self.hd, bias=False, **fk) |
| self.k_proj = nn.Linear(dim, n_kv * self.hd, bias=False, **fk) |
| self.v_proj = nn.Linear(dim, n_kv * self.hd, bias=False, **fk) |
| self.out_proj = nn.Linear(n_heads * self.hd, dim, bias=False, **fk) |
| self.qn = nn.Parameter(torch.ones(self.hd, **fk)) |
| self.kn = nn.Parameter(torch.ones(self.hd, **fk)) |
| for lin in (self.q_proj, self.k_proj, self.v_proj): |
| nn.init.normal_(lin.weight, std=0.02) |
| self.register_buffer( |
| "inv", 1.0 / (base ** (torch.arange(0, self.hd, 2, device=device).float() / self.hd)), |
| persistent=False) |
|
|
| def _rope(self, x, T): |
| f = torch.outer(torch.arange(T, device=x.device, dtype=torch.float32), self.inv) |
| e = torch.cat((f, f), -1) |
| return (x * e.cos()[None, None] + _rot_half(x) * e.sin()[None, None]).to(x.dtype) |
|
|
| def forward(self, x, **kw): |
| B, T, _ = x.shape |
| q = self.q_proj(x).view(B, T, self.nh, self.hd).transpose(1, 2) |
| k = self.k_proj(x).view(B, T, self.nkv, self.hd).transpose(1, 2) |
| v = self.v_proj(x).view(B, T, self.nkv, self.hd).transpose(1, 2) |
| q = _rms(q.float(), self.qn.float()).to(x.dtype) |
| k = _rms(k.float(), self.kn.float()).to(x.dtype) |
| q, k = self._rope(q, T), self._rope(k, T) |
| k = k.repeat_interleave(self.rep, 1) |
| v = v.repeat_interleave(self.rep, 1) |
| y = F.scaled_dot_product_attention(q, k, v, is_causal=True) |
| return self.out_proj(y.transpose(1, 2).contiguous().view(B, T, -1)) |
|
|
|
|
| def _init_weights(m, n_layer): |
| if isinstance(m, nn.Linear) and m.bias is not None: |
| nn.init.zeros_(m.bias) |
| elif isinstance(m, nn.Embedding): |
| nn.init.normal_(m.weight, std=0.02) |
| for name, p in m.named_parameters(): |
| if name in ("out_proj.weight", "fc2.weight"): |
| nn.init.kaiming_uniform_(p, a=math.sqrt(5)) |
| with torch.no_grad(): |
| p /= math.sqrt(2 * n_layer) |
|
|
|
|
| class HybridLM(nn.Module): |
| def __init__(self, cfg, device=None, dtype=None): |
| super().__init__() |
| self.cfg = cfg |
| self.vocab = cfg["vocab_size"] |
| self.scaled_embed = cfg.get("scaled_embed", False) |
| self.z_loss = cfg.get("z_loss", 1e-4) |
| d = cfg["d_model"] |
| self.embedding = nn.Embedding(self.vocab, d, device=device, dtype=dtype) |
| self.layers = nn.ModuleList() |
| self.attn_idx = [] |
| for i in range(cfg["n_layers"]): |
| is_attn = ((i + 1) % cfg["attn_every"] == 0) and i != 0 and i != cfg["n_layers"] - 1 |
| fk = {"device": device, "dtype": dtype} |
| if is_attn: |
| mixer_cls = partial(GQAMixer, n_heads=cfg["n_heads"], n_kv=cfg["n_kv_heads"], |
| layer_idx=i, **fk) |
| self.attn_idx.append(i) |
| else: |
| ssm = dict(d_state=cfg["d_state"], expand=cfg["expand"], headdim=cfg["head_dim"], |
| ngroups=cfg["ngroups"], rope_fraction=cfg["rope_fraction"], |
| is_outproj_norm=False, is_mimo=cfg["is_mimo"], mimo_rank=cfg["mimo_rank"], |
| chunk_size=cfg["chunk_size"]) |
| mixer_cls = partial(Mamba3, layer_idx=i, **ssm, **fk) |
| blk = Block(d, mixer_cls, |
| partial(GatedMLP, hidden_features=cfg["d_intermediate"], out_features=d, **fk), |
| norm_cls=partial(RMSNorm, eps=1e-5, **fk), |
| fused_add_norm=True, residual_in_fp32=True) |
| blk.layer_idx = i |
| self.layers.append(blk) |
| self.norm_f = RMSNorm(d, eps=1e-5, device=device, dtype=dtype) |
| self.lm_head = nn.Linear(d, self.vocab, bias=False, device=device, dtype=dtype) |
| self.apply(partial(_init_weights, n_layer=cfg["n_layers"])) |
| self.lm_head.weight = self.embedding.weight |
|
|
| def forward(self, ids, labels=None): |
| h = self.embedding(ids) |
| if self.scaled_embed: |
| h = h * (self.cfg["d_model"] ** 0.5) |
| res = None |
| for l in self.layers: |
| h, res = l(h, res) |
| h = self.norm_f((h + res) if res is not None else h) |
| logits = self.lm_head(h.to(self.lm_head.weight.dtype)) |
| loss = None |
| if labels is not None: |
| sl = logits[:, :-1].reshape(-1, self.vocab).float() |
| tl = labels[:, 1:].reshape(-1) |
| loss = F.cross_entropy(sl, tl, ignore_index=-100) |
| if self.z_loss > 0: |
| z = torch.logsumexp(sl, dim=-1) |
| loss = loss + self.z_loss * (z ** 2).mean() |
| return logits, loss |
|
|
|
|
| def n_params(m): |
| seen, t = set(), 0 |
| for p in m.parameters(): |
| if id(p) in seen: |
| continue |
| seen.add(id(p)); t += p.numel() |
| return t |
|
|
|
|
| |
| import pyarrow.parquet as pq |
|
|
| MIXES = { |
| "177m": {"en_fineweb_edu": 0.55, "tr_fineweb2_hq": 0.22, "code_codeparrot": 0.13, "math_openwebmath": 0.10}, |
| "350m": {"en_fineweb_edu": 0.47, "tr_tc100b": 0.30, "code_codeparrot": 0.13, "math_openwebmath": 0.10}, |
| } |
|
|
|
|
| def ensure_local_data(data, token): |
| """HF repo ise yerele indir (resumable + hızlı), zaten yerel dizinse aynen döndür.""" |
| if os.path.isdir(data): |
| return data |
| from huggingface_hub import snapshot_download |
| print(f"[veri] {data} indiriliyor (snapshot)...", flush=True) |
| p = snapshot_download(data, repo_type="dataset", token=token, allow_patterns=["*/shard_*.parquet"]) |
| print(f"[veri] indirildi: {p}", flush=True) |
| return p |
|
|
|
|
| class ShardStream: |
| """Yerel parquet'ten oranlı, DETERMİNİSTİK ve RESUMABLE okuma (cursor + RNG kaydedilir).""" |
| def __init__(self, root, seq_len, mix, seed=42): |
| self.names = list(mix); self.w = [mix[n] for n in self.names] |
| self.seq_len = seq_len |
| self.files = {n: sorted(glob.glob(os.path.join(root, n, "shard_*.parquet"))) for n in self.names} |
| for n in self.names: |
| assert self.files[n], f"shard yok: {root}/{n}" |
| self.cursor = {n: [0, 0] for n in self.names} |
| self.cache = {} |
| self.rng = random.Random(seed) |
|
|
| def _rows(self, n): |
| si = self.cursor[n][0] % len(self.files[n]) |
| if self.cache.get(n, (None,))[0] != si: |
| |
| col = pq.read_table(self.files[n][si], columns=["input_ids"]).column("input_ids") |
| self.cache[n] = (si, col) |
| return self.cache[n][1] |
|
|
| def _next(self, n): |
| rows = self._rows(n) |
| if self.cursor[n][1] >= len(rows): |
| self.cursor[n][0] += 1; self.cursor[n][1] = 0; rows = self._rows(n) |
| ri = self.cursor[n][1]; self.cursor[n][1] = ri + 1 |
| return rows[ri].as_py()[:self.seq_len] |
|
|
| def batch(self, bsz, device): |
| rows = [self._next(self.rng.choices(self.names, weights=self.w, k=1)[0]) for _ in range(bsz)] |
| return torch.tensor(rows, dtype=torch.long, device=device) |
|
|
| def state(self): |
| |
| return {"cursor": {k: list(v) for k, v in self.cursor.items()}, "rng": self.rng.getstate()} |
|
|
| def load_state(self, s): |
| self.cursor = {k: list(v) for k, v in s["cursor"].items()} |
| self.rng.setstate(s["rng"]); self.cache = {} |
|
|
|
|
| |
| def wsd_lr(step, total, peak, floor, warmup, decay_frac=0.25): |
| if step < warmup: |
| return peak * (step + 1) / warmup |
| dec = int(total * (1 - decay_frac)) |
| if step < dec: |
| return peak |
| return peak - (peak - floor) * (step - dec) / max(1, total - dec) |
|
|
|
|
| |
| class Ckpt: |
| def __init__(self, local_dir, repo_id, token, keep=3): |
| self.dir = local_dir; self.repo = repo_id; self.keep = keep |
| os.makedirs(local_dir, exist_ok=True) |
| self.api = None |
| if repo_id and token: |
| from huggingface_hub import HfApi |
| self.api = HfApi(token=token) |
| self.ex = ThreadPoolExecutor(max_workers=1); self.lock = threading.Lock() |
|
|
| def save(self, step, model, opts, stream, extra): |
| d = os.path.join(self.dir, f"step_{step:06d}"); os.makedirs(d, exist_ok=True) |
| torch.save({"model": model.state_dict(), "opt": [o.state_dict() for o in opts], "step": step, |
| "stream": stream.state(), "torch_rng": torch.get_rng_state(), |
| "cuda_rng": torch.cuda.get_rng_state_all(), **extra}, |
| os.path.join(d, "ckpt.pt")) |
| self._rotate() |
| if self.api: |
| self.ex.submit(self._push, d, step) |
| print(f"[ckpt] kaydedildi step {step} -> {d}", flush=True) |
|
|
| def _push(self, d, step): |
| try: |
| with self.lock: |
| self.api.upload_folder(folder_path=d, repo_id=self.repo, repo_type="model", |
| path_in_repo=f"checkpoints/step_{step:06d}", |
| commit_message=f"ckpt step {step}") |
| print(f"[ckpt] HF push OK step {step}", flush=True) |
| except Exception as e: |
| print(f"[ckpt] HF push HATA step {step}: {repr(e)[:160]}", flush=True) |
|
|
| def _rotate(self): |
| ds = sorted(glob.glob(os.path.join(self.dir, "step_*"))) |
| for old in ds[:-self.keep]: |
| for f in glob.glob(os.path.join(old, "*")): |
| os.remove(f) |
| os.rmdir(old) |
|
|
| def latest_local(self): |
| ds = sorted(glob.glob(os.path.join(self.dir, "step_*", "ckpt.pt"))) |
| return ds[-1] if ds else None |
|
|
| def latest_hf(self): |
| if not self.api: |
| return None |
| from huggingface_hub import hf_hub_download |
| files = [f for f in self.api.list_repo_files(self.repo, repo_type="model") |
| if f.startswith("checkpoints/step_") and f.endswith("ckpt.pt")] |
| if not files: |
| return None |
| latest = max(files) |
| return hf_hub_download(self.repo, latest, repo_type="model") |
|
|
|
|
| |
| |
| def _ns5(G, steps=5): |
| """Newton-Schulz orthogonalizasyon (Muon çekirdeği) — G'yi yarı-ortogonale yaklaştır.""" |
| a, b, c = 3.4445, -4.7750, 2.0315 |
| X = G.bfloat16() |
| t = G.size(-2) > G.size(-1) |
| if t: |
| X = X.mT |
| X = X / (X.norm() + 1e-7) |
| for _ in range(steps): |
| A = X @ X.mT |
| B = b * A + c * (A @ A) |
| X = a * X + B @ X |
| if t: |
| X = X.mT |
| return X.to(G.dtype) |
|
|
|
|
| class Muon(torch.optim.Optimizer): |
| """Momentum + Newton-Schulz ortogonalize güncelleme (Keller Jordan). 2D matris ağırlıkları için.""" |
| def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5, weight_decay=0.0): |
| super().__init__(params, dict(lr=lr, momentum=momentum, nesterov=nesterov, |
| ns_steps=ns_steps, weight_decay=weight_decay)) |
|
|
| @torch.no_grad() |
| def step(self): |
| for grp in self.param_groups: |
| lr, mom, nest = grp["lr"], grp["momentum"], grp["nesterov"] |
| ns, wd = grp["ns_steps"], grp["weight_decay"] |
| for p in grp["params"]: |
| if p.grad is None: |
| continue |
| st = self.state[p] |
| if "buf" not in st: |
| st["buf"] = torch.zeros_like(p.grad) |
| buf = st["buf"]; buf.lerp_(p.grad, 1 - mom) |
| u = p.grad.lerp_(buf, mom) if nest else buf |
| u = _ns5(u, ns) * (max(1.0, p.size(-2) / p.size(-1)) ** 0.5) |
| if wd: |
| p.mul_(1 - lr * wd) |
| p.add_(u, alpha=-lr) |
|
|
|
|
| def build_optimizers(model, args): |
| """args.muon ise [Muon(2D-Linear), AdamW(embed+3D / 1D)]; değilse [AdamW] (orijinal). |
| Dönüş (opts, base_lrs) — base_lr WSD çarpanıyla (0→1→0.1) ölçeklenir.""" |
| if args.muon: |
| emb = {id(m.weight) for m in model.modules() if isinstance(m, nn.Embedding)} |
| muon_p, adam_wd, adam_nod = [], [], [] |
| for p in model.parameters(): |
| if p.ndim == 2 and id(p) not in emb: |
| muon_p.append(p) |
| elif p.ndim >= 2: |
| adam_wd.append(p) |
| else: |
| adam_nod.append(p) |
| o_m = Muon(muon_p, lr=args.muon_lr, momentum=0.95, ns_steps=5, weight_decay=0.0) |
| o_a = torch.optim.AdamW([{"params": adam_wd, "weight_decay": 0.1}, |
| {"params": adam_nod, "weight_decay": 0.0}], |
| lr=args.peak_lr, betas=(0.9, 0.95), eps=1e-8, fused=True) |
| mp = sum(p.numel() for p in muon_p); ap_ = sum(p.numel() for p in adam_wd + adam_nod) |
| print(f"[opt] MUON {mp/1e6:.1f}M (2D-Linear) + AdamW {ap_/1e6:.1f}M (embed+norm) | " |
| f"muon_lr {args.muon_lr} | peak_lr {args.peak_lr}", flush=True) |
| return [o_m, o_a], [args.muon_lr, args.peak_lr] |
| decay = [p for p in model.parameters() if p.ndim >= 2] |
| nod = [p for p in model.parameters() if p.ndim < 2] |
| o_a = torch.optim.AdamW([{"params": decay, "weight_decay": 0.1}, |
| {"params": nod, "weight_decay": 0.0}], |
| lr=args.peak_lr, betas=(0.9, 0.95), eps=1e-8, fused=True) |
| print(f"[opt] AdamW (tek) | peak_lr {args.peak_lr}", flush=True) |
| return [o_a], [args.peak_lr] |
|
|
|
|
| def main(): |
| ap = argparse.ArgumentParser() |
| ap.add_argument("--data", default="kdirgul/smartcore-v1-data") |
| ap.add_argument("--ckpt_repo", default="kdirgul/smartcore-v1") |
| ap.add_argument("--ckpt_dir", default="/content/ckpt") |
| ap.add_argument("--resume", default=None, help="latest_local | latest_hf | <path>") |
| ap.add_argument("--preset", default="177m", choices=["177m", "350m"]) |
| ap.add_argument("--n_layers", type=int, default=None) |
| ap.add_argument("--d_model", type=int, default=None) |
| ap.add_argument("--d_intermediate", type=int, default=None) |
| ap.add_argument("--n_heads", type=int, default=None) |
| ap.add_argument("--n_kv_heads", type=int, default=None) |
| ap.add_argument("--attn_every", type=int, default=6) |
| ap.add_argument("--seq_len", type=int, default=2048) |
| ap.add_argument("--micro_batch", type=int, default=4) |
| ap.add_argument("--grad_accum", type=int, default=64) |
| ap.add_argument("--total_tokens", type=float, default=12e9) |
| ap.add_argument("--peak_lr", type=float, default=5e-4) |
| ap.add_argument("--muon", action="store_true", help="v1.5: 2D-Linear ağırlıkları Muon (embed/norm AdamW)") |
| ap.add_argument("--muon_lr", type=float, default=0.02) |
| ap.add_argument("--warmup", type=int, default=600) |
| ap.add_argument("--save_every", type=int, default=500) |
| ap.add_argument("--log_every", type=int, default=10) |
| ap.add_argument("--seed", type=int, default=42) |
| args = ap.parse_args() |
|
|
| dev = torch.device("cuda") |
| torch.manual_seed(args.seed); random.seed(args.seed) |
| torch.set_float32_matmul_precision("high") |
| token = os.environ.get("HF_TOKEN") |
|
|
| P = dict(PRESETS[args.preset]) |
| for k in ("n_layers", "d_model", "d_intermediate", "n_heads", "n_kv_heads"): |
| if getattr(args, k) is not None: |
| P[k] = getattr(args, k) |
| cfg = dict(vocab_size=48000, d_model=P["d_model"], n_layers=P["n_layers"], d_state=128, expand=2, |
| head_dim=P["head_dim"], ngroups=1, d_intermediate=P["d_intermediate"], attn_every=args.attn_every, |
| n_heads=P["n_heads"], n_kv_heads=P["n_kv_heads"], rope_fraction=0.5, is_mimo=False, mimo_rank=1, |
| chunk_size=128, scaled_embed=False, z_loss=1e-4) |
| print(f"[cfg] preset={args.preset} | d_model={cfg['d_model']} n_layers={cfg['n_layers']} " |
| f"d_int={cfg['d_intermediate']} {cfg['n_heads']}/{cfg['n_kv_heads']} GQA attn_every={cfg['attn_every']}", flush=True) |
|
|
| model = HybridLM(cfg, device=dev, dtype=torch.bfloat16) |
| print(f"[model] {n_params(model)/1e6:.1f}M | {cfg['n_layers']-len(model.attn_idx)} Mamba + " |
| f"{len(model.attn_idx)} GQA (attn@{model.attn_idx})", flush=True) |
|
|
| opts, base_lrs = build_optimizers(model, args) |
|
|
| batch_tok = args.micro_batch * args.grad_accum * args.seq_len |
| total_steps = int(args.total_tokens / batch_tok) |
| print(f"[plan] {args.total_tokens/1e9:.0f}B token | {batch_tok} tok/step | {total_steps} step | " |
| f"warmup {args.warmup} | peak {args.peak_lr}", flush=True) |
|
|
| root = ensure_local_data(args.data, token) |
| mix = MIXES.get(args.preset, MIXES["177m"]) |
| print(f"[mix] {args.preset}: " + " ".join(f"{k}={v:.0%}" for k, v in mix.items()), flush=True) |
| stream = ShardStream(root, args.seq_len, mix, seed=args.seed) |
| ckpt = Ckpt(args.ckpt_dir, args.ckpt_repo, token) |
|
|
| start_step = 0 |
| if args.resume: |
| path = (ckpt.latest_local() if args.resume == "latest_local" else |
| ckpt.latest_hf() if args.resume == "latest_hf" else args.resume) |
| if path and os.path.exists(path): |
| st = torch.load(path, map_location="cpu") |
| model.load_state_dict(st["model"]) |
| osd = st["opt"] if isinstance(st["opt"], list) else [st["opt"]] |
| for o, s in zip(opts, osd): |
| o.load_state_dict(s) |
| stream.load_state(st["stream"]); start_step = st["step"] + 1 |
| torch.set_rng_state(st["torch_rng"]); torch.cuda.set_rng_state_all(st["cuda_rng"]) |
| print(f"[resume] {path} -> step {start_step}", flush=True) |
| else: |
| print(f"[resume] checkpoint bulunamadı ({args.resume}) — sıfırdan başlıyor", flush=True) |
|
|
| |
| cur = {"step": start_step} |
|
|
| def emergency(signum, frame): |
| ckpt.save(cur["step"], model, opts, stream, {"cfg": cfg}) |
| sys.exit(0) |
| signal.signal(signal.SIGTERM, emergency); signal.signal(signal.SIGINT, emergency) |
|
|
| model.train() |
| t0 = time.perf_counter(); seen = 0 |
| for step in range(start_step, total_steps): |
| cur["step"] = step |
| frac = wsd_lr(step, total_steps, 1.0, 0.1, args.warmup) |
| for o, b in zip(opts, base_lrs): |
| for g in o.param_groups: |
| g["lr"] = b * frac |
| for o in opts: |
| o.zero_grad(set_to_none=True) |
| loss_acc = 0.0 |
| for _ in range(args.grad_accum): |
| batch = stream.batch(args.micro_batch, dev) |
| with torch.autocast(device_type="cuda", dtype=torch.bfloat16): |
| _, loss = model(batch, labels=batch) |
| (loss / args.grad_accum).backward() |
| loss_acc += loss.item() / args.grad_accum |
| seen += batch.numel() |
| gn = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) |
| for o in opts: |
| o.step() |
| if step % args.log_every == 0: |
| tok_s = seen / (time.perf_counter() - t0) |
| print(f"step {step:6d}/{total_steps} | loss {loss_acc:.4f} | gnorm {gn:5.2f} | " |
| f"lr {base_lrs[-1]*frac:.2e} | {tok_s/1e3:.1f}k tok/s | {seen/1e9:.3f}B tok", flush=True) |
| if step > start_step and step % args.save_every == 0: |
| ckpt.save(step, model, opts, stream, {"cfg": cfg}) |
|
|
| ckpt.save(total_steps - 1, model, opts, stream, {"cfg": cfg, "final": True}) |
| print("[bitti] pretraining tamamlandı.", flush=True) |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|