Spaces:

FaiziRBLX
/

NousAPI

Sleeping

App Files Community

FaiziRBLX committed about 1 month ago

Commit

7df6753

verified ·

1 Parent(s): 17776f3

Update best.py

Browse files

Files changed (1) hide show

best.py +418 -327

best.py CHANGED Viewed

@@ -5,37 +5,43 @@ Trained from scratch with Chain-of-Thought reasoning capability
 Architecture: Decoder-only transformer with GQA, RoPE, SwiGLU, RMSNorm, KV-Cache
 Target: 15M-30M parameters, optimized for Google Colab Free tier
-FIXES vs original:
-  - KV cache for O(n) inference instead of O(n²)
-  - RoPE broadcast shape corrected (explicit unsqueeze)
-  - Label smoothing fixed: custom impl that respects ignore_index=-100
-  - Depth-scaled weight init for residual branches (o_proj, down_proj)
-  - Per-token loss tracking (not per-batch avg biased by padding)
-  - Repetition penalty fixed: proper logit division (not magic subtract)
-  - Mixed precision updated to PyTorch 2.x API (torch.amp.*)
-  - EOS token appended to training completions so model learns to stop
-  - Intermediate size computed correctly from hidden_size
-  - Gradient norm logged every step
-  - _clean_response made robust to Indonesian "user" as a word
-  - Causal mask uses float directly (no bool intermediate)
-  - GQA skip repeat when groups==1
-  - Vocab size set from tokenizer at build time, never hardcoded
 """
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.utils.data import Dataset, DataLoader
 from transformers import AutoTokenizer
 import json
 import math
 import random
 import numpy as np
-from typing import Optional, Tuple, List, Dict
 from dataclasses import dataclass, field
 import warnings
 import argparse
 import os
 warnings.filterwarnings('ignore')
@@ -45,14 +51,11 @@ warnings.filterwarnings('ignore')
 @dataclass
 class ModelConfig:
-    vocab_size: int = 32000          # set from len(tokenizer) at build time
     hidden_size: int = 384
     num_layers: int = 12
     num_attention_heads: int = 6
-    num_key_value_heads: int = 2     # GQA
-    # Stored as a plain int field — NEVER a @property — so pickle round-trips work.
-    # 0 = unset (load_model will fill it from checkpoint weight shapes).
-    # New training always passes this explicitly from len(tokenizer) / hidden_size.
     intermediate_size: int = 0
     max_position_embeddings: int = 2048
     rms_norm_eps: float = 1e-6
@@ -65,9 +68,10 @@ class ModelConfig:
     eos_token_id: int = 2
     tie_word_embeddings: bool = True
     label_smoothing: float = 0.1
     def __post_init__(self):
-        # Set intermediate_size only when not already provided
         if self.intermediate_size <= 0:
             self.intermediate_size = self.hidden_size * 3
         assert self.hidden_size % self.num_attention_heads == 0, \
@@ -97,9 +101,9 @@ class TrainingConfig:
     lr_scheduler_type: str = "cosine"
     dropout: float = 0.1
-    # FIX: torch.amp.* (PyTorch 2.x API)
     use_fp16: bool = True
     seed: int = 42
     logging_steps: int = 10
@@ -107,7 +111,6 @@ class TrainingConfig:
     save_steps: int = 500
     curriculum_stages: List[int] = None
-    skip_curriculum_stages: int = 2
     plateau_patience: int = 3
     plateau_factor: float = 0.5
@@ -141,7 +144,7 @@ class RotaryEmbedding(nn.Module):
         t = torch.arange(seq_len, device=self.inv_freq.device).type_as(self.inv_freq)
         freqs = torch.outer(t, self.inv_freq)
         emb = torch.cat((freqs, freqs), dim=-1)
-        # FIX: store as [1, 1, T, D] so broadcast onto [B, H, T, D] is explicit and correct
         self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
         self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
@@ -160,14 +163,27 @@ def rotate_half(x: torch.Tensor) -> torch.Tensor:
     return torch.cat((-x2, x1), dim=-1)
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None):
-    # cos/sin: [1, 1, T, D] — broadcasts cleanly onto [B, H, T, D]
-    if position_ids is not None:
-        # For KV-cache decode: position_ids is [B, 1], pick specific positions
-        cos = cos[:, :, position_ids, :].squeeze(2)  # [B, 1, 1, D]
-        sin = sin[:, :, position_ids, :].squeeze(2)
-    q_embed = (q * cos) + (rotate_half(q) * sin)
-    k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed
@@ -221,6 +237,7 @@ class GroupedQueryAttention(nn.Module):
         self,
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
@@ -231,43 +248,32 @@ class GroupedQueryAttention(nn.Module):
         key_states   = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
-        query_states = query_states.view(bsz, q_len, self.num_heads,          self.head_dim).transpose(1, 2)
         key_states   = key_states  .view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        # RoPE — position offset accounts for cached prefix
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[-2]
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        # For prefill (training / first forward): use full cos/sin slice
-        # For decode (KV cache active): past_key_value holds the cached context
         if past_key_value is not None:
-            # Decode step: only current token needs RoPE at position kv_seq_len-1
-            offset = past_key_value[0].shape[-2]
-            cos_q = cos[:, :, offset:offset + q_len, :]
-            sin_q = sin[:, :, offset:offset + q_len, :]
-            query_states = (query_states * cos_q) + (rotate_half(query_states) * sin_q)
-            key_states   = (key_states   * cos_q) + (rotate_half(key_states)   * sin_q)
-            # Concat cached K, V
             key_states   = torch.cat([past_key_value[0], key_states],   dim=2)
             value_states = torch.cat([past_key_value[1], value_states], dim=2)
-        else:
-            # Prefill: full sequence RoPE
-            cos_full = cos[:, :, :q_len, :]
-            sin_full = sin[:, :, :q_len, :]
-            query_states = (query_states * cos_full) + (rotate_half(query_states) * sin_full)
-            key_states   = (key_states   * cos_full) + (rotate_half(key_states)   * sin_full)
-        # Store pre-expand KV in cache (shape [B, num_kv_heads, T, D]).
-        # Must happen BEFORE repeat_interleave — otherwise cached keys have
-        # num_heads channels instead of num_kv_heads, and every decode step
-        # re-expands them again, corrupting attention.
         present_kv = (key_states, value_states) if use_cache else None
-        # Expand KV heads for full attention computation
         if self.num_key_value_groups > 1:
             key_states   = key_states  .repeat_interleave(self.num_key_value_groups, dim=1)
             value_states = value_states.repeat_interleave(self.num_key_value_groups, dim=1)
@@ -296,8 +302,6 @@ class SwiGLUMLP(nn.Module):
     def __init__(self, config: ModelConfig):
         super().__init__()
         self.hidden_size = config.hidden_size
-        # Read intermediate_size defensively: if somehow 0 or negative (e.g. old
-        # unpickled config that missed __post_init__), fall back to hidden * 3.
         inter = getattr(config, 'intermediate_size', 0)
         if not isinstance(inter, int) or inter <= 0:
             inter = self.hidden_size * 3
@@ -328,21 +332,23 @@ class DecoderLayer(nn.Module):
         self,
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         use_cache: bool = False,
     ):
-        residual    = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
         hidden_states, present_kv = self.self_attn(
             hidden_states,
             attention_mask=attention_mask,
             past_key_value=past_key_value,
             use_cache=use_cache,
         )
         hidden_states = self.residual_dropout(hidden_states)
         hidden_states = residual + hidden_states
-        residual    = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
         hidden_states = self.mlp(hidden_states)
         hidden_states = self.residual_dropout(hidden_states)
@@ -352,17 +358,14 @@ class DecoderLayer(nn.Module):
 # ============================================================================
-# CUSTOM LABEL SMOOTHING LOSS  (FIX: respects ignore_index=-100)
 # ============================================================================
-class LabelSmoothingCrossEntropy(nn.Module):
-    """
-    Cross-entropy with label smoothing.
-    Filters ignore_index=-100 first, then uses F.cross_entropy with smoothing.
-    This keeps the exact same loss scale as the original nn.CrossEntropyLoss
-    so the LR schedule pacing is unchanged.
-    """
     def __init__(self, vocab_size: int, smoothing: float = 0.1, ignore_index: int = -100):
         super().__init__()
         self.vocab_size   = vocab_size
@@ -370,15 +373,23 @@ class LabelSmoothingCrossEntropy(nn.Module):
         self.ignore_index = ignore_index
     def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
-        # logits: [N, V]  targets: [N]
-        # F.cross_entropy with label_smoothing and ignore_index is correct in
-        # PyTorch >= 1.10 — it does NOT distribute to ignored positions.
-        return F.cross_entropy(
-            logits,
-            targets,
-            ignore_index=self.ignore_index,
-            label_smoothing=self.smoothing,
-        )
 # ============================================================================
@@ -392,7 +403,8 @@ class IndonesianLLM(nn.Module):
         self.padding_idx = config.pad_token_id
         self.vocab_size  = config.vocab_size
-        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
         self.layers       = nn.ModuleList([DecoderLayer(config, idx) for idx in range(config.num_layers)])
         self.norm         = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -400,21 +412,20 @@ class IndonesianLLM(nn.Module):
             nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.loss_fn = LabelSmoothingCrossEntropy(
-            vocab_size    = config.vocab_size,
-            smoothing     = config.label_smoothing,
-            ignore_index  = -100,
         )
         self.apply(self._init_weights)
     def _init_weights(self, module):
         std = self.config.initializer_range
         if isinstance(module, nn.Linear):
-            # FIX: depth-scaled init for residual output projections
-            # (o_proj and down_proj feed directly into residual stream)
             name = getattr(module, '_layer_name', '')
             if name in ('o_proj', 'down_proj'):
-                # Wang et al. 2021 / GPT-NeoX scaling
                 scaled_std = std / math.sqrt(2 * self.config.num_layers)
                 module.weight.data.normal_(mean=0.0, std=scaled_std)
             else:
@@ -427,7 +438,6 @@ class IndonesianLLM(nn.Module):
                 module.weight.data[module.padding_idx].zero_()
     def _tag_projection_layers(self):
-        """Tag o_proj and down_proj for depth-scaled init. Call before apply()."""
         for layer in self.layers:
             layer.self_attn.o_proj._layer_name = 'o_proj'
             layer.mlp.down_proj._layer_name    = 'down_proj'
@@ -444,28 +454,61 @@ class IndonesianLLM(nn.Module):
         attention_mask: Optional[torch.Tensor] = None,
         batch_size: int = 1,
     ) -> torch.Tensor:
-        """
-        FIX: Build additive float causal mask directly instead of bool intermediate.
-        Shape: [B, 1, T_q, T_kv]
-        """
         total_len = past_len + seq_len
-        # Full causal mask over [T_q, T_kv]
-        causal = torch.full((seq_len, total_len), torch.finfo(dtype).min, device=device, dtype=dtype)
         mask_cond = torch.arange(total_len, device=device)
-        causal.masked_fill_(mask_cond[None, :] <= (torch.arange(seq_len, device=device) + past_len)[:, None], 0.0)
         causal = causal[None, None, :, :].expand(batch_size, 1, seq_len, total_len)
         if attention_mask is not None:
-            # attention_mask: [B, T_kv] — 1 = keep, 0 = mask out
             pad_mask = (1.0 - attention_mask[:, None, None, :].float()) * torch.finfo(dtype).min
             causal   = causal + pad_mask
         return causal
     def forward(
         self,
         input_ids: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         labels: Optional[torch.Tensor] = None,
         past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
         use_cache: bool = False,
@@ -476,11 +519,15 @@ class IndonesianLLM(nn.Module):
         hidden_states = self.embed_tokens(input_ids)
-        # Only build attention mask once (shared across all layers)
         if attention_mask is None:
             attention_mask = torch.ones(batch_size, past_len + seq_length,
                                         dtype=torch.long, device=input_ids.device)
         causal_mask = self._make_causal_mask(
             seq_len=seq_length,
             past_len=past_len,
@@ -494,11 +541,13 @@ class IndonesianLLM(nn.Module):
         for i, decoder_layer in enumerate(self.layers):
             pkv = past_key_values[i] if past_key_values is not None else None
-            hidden_states, present_kv = decoder_layer(
                 hidden_states,
-                attention_mask=causal_mask,
-                past_key_value=pkv,
-                use_cache=use_cache,
             )
             if use_cache:
                 present_key_values.append(present_kv)
@@ -515,9 +564,9 @@ class IndonesianLLM(nn.Module):
             loss = self.loss_fn(shift_logits, shift_labels)
         return {
-            "loss":               loss,
-            "logits":             logits,
-            "past_key_values":    present_key_values,
         }
     def count_parameters(self) -> int:
@@ -550,7 +599,6 @@ class IndonesianCoTDataset(Dataset):
         self.skipped_count  = 0
         self._load_data(file_path)
-    # FIX: get EOS string once, reuse everywhere
     @property
     def _eos(self) -> str:
         return self.tokenizer.eos_token or "</s>"
@@ -558,7 +606,7 @@ class IndonesianCoTDataset(Dataset):
     def _load_data(self, file_path: str):
         print(f"Loading dataset from {file_path}...")
         with open(file_path, 'r', encoding='utf-8') as f:
-            for line_num, line in enumerate(f, 1):
                 try:
                     if not line.strip():
                         continue
@@ -585,10 +633,8 @@ class IndonesianCoTDataset(Dataset):
     def __getitem__(self, idx):
         sample = self.samples[idx]
-        # Build prompt / completion split
         if self.use_cot and random.random() < self.cot_ratio:
             prompt     = f"{sample['input']} {self.cot_token}"
-            # FIX: append EOS so the model learns when to stop
             completion = f" {sample['cot']} {self.end_cot_token} {sample['output']}{self._eos}"
         else:
             prompt     = f"{sample['input']}"
@@ -605,7 +651,6 @@ class IndonesianCoTDataset(Dataset):
             add_special_tokens=True,
         )
-        # Mask prompt tokens so only completion contributes to loss
         labels = [-100] * min(prompt_len, len(full_ids)) + full_ids[prompt_len:]
         labels = labels[:len(full_ids)]
@@ -643,12 +688,10 @@ def collate_fn_with_packing(batch, pad_token_id: int = 0):
 # ============================================================================
-# PER-TOKEN LOSS TRACKING  (FIX: don't average over padding)
 # ============================================================================
 class TokenLossAccumulator:
-    """Track loss and token count separately so perplexity is unbiased."""
     def __init__(self):
         self.total_loss   = 0.0
         self.total_tokens = 0
@@ -689,13 +732,16 @@ def _build_stage_dataset(base: IndonesianCoTDataset, samples, max_len: int, cot_
 def create_curriculum_datasets(dataset, stages=None, use_simple=False, skip_stages=0):
     if stages is None:
         stages = [256, 512, 1024]
-    datasets = []
     if use_simple:
         for i, max_len in enumerate(stages):
-            if i < skip_stages:
-                print(f"[SKIP] Curriculum stage {max_len}")
-                continue
             filtered = [
                 s for s in dataset.samples
                 if len(dataset.tokenizer.encode(
@@ -703,37 +749,26 @@ def create_curriculum_datasets(dataset, stages=None, use_simple=False, skip_stag
                 )) <= max_len
             ]
             datasets.append(_build_stage_dataset(dataset, filtered, max_len, dataset.cot_ratio))
-            print(f"Curriculum stage {max_len}: {len(filtered)} samples")
     else:
-        print("\n" + "=" * 80)
-        print("3-STAGE REASONING CURRICULUM")
-        if skip_stages > 0:
-            print(f"  (Skipping first {skip_stages} stage(s))")
-        print("=" * 80)
         stage_configs = [
             {'name': 'Stage 1: Basic Q&A (no CoT)',           'max_len': 384,  'cot_ratio': 0.0},
             {'name': 'Stage 2: Learning Reasoning (50% CoT)', 'max_len': 512,  'cot_ratio': 0.5},
             {'name': 'Stage 3: Full Reasoning (100% CoT)',    'max_len': 1024, 'cot_ratio': 1.0},
         ]
         for idx, sc in enumerate(stage_configs):
-            filtered = dataset.samples  # all samples for stages 2+
-            if idx == 0:
-                filtered = [
-                    s for s in dataset.samples
-                    if len(dataset.tokenizer.encode(f"{s['input']} {s['output']}")) <= sc['max_len']
-                ]
             datasets.append(_build_stage_dataset(dataset, filtered, sc['max_len'], sc['cot_ratio']))
-            skipped = idx < skip_stages
-            tag = " [SKIP]" if skipped else ""
             print(f"  {sc['name']}{tag}  |  samples={len(filtered)}  |  CoT={sc['cot_ratio']:.0%}")
-        print("=" * 80 + "\n")
-        if skip_stages > 0:
-            datasets = datasets[skip_stages:]
-    return datasets
 # ============================================================================
@@ -805,6 +840,21 @@ def set_seed(seed: int):
     torch.backends.cudnn.benchmark     = False
 # ============================================================================
 # ELASTIC WEIGHT CONSOLIDATION (EWC)
 # ============================================================================
@@ -817,6 +867,11 @@ class EWC:
         self.fisher    = self._compute_fisher(model, dataloader)
     def _compute_fisher(self, model, dataloader):
         fisher = {n: torch.zeros_like(p) for n, p in model.named_parameters() if p.requires_grad}
         model.eval()
         seen = 0
@@ -825,14 +880,29 @@ class EWC:
                 break
             input_ids      = batch["input_ids"]     .to(self.device)
             attention_mask = batch["attention_mask"] .to(self.device)
-            labels         = batch["labels"]         .to(self.device)
             model.zero_grad()
-            out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
-            out["loss"].backward()
             for n, p in model.named_parameters():
                 if p.requires_grad and p.grad is not None:
                     fisher[n] += p.grad.detach().pow(2)
             seen += input_ids.size(0)
         for n in fisher:
             fisher[n] /= max(1, seen)
         model.train()
@@ -872,9 +942,14 @@ def train_model(
     print(f"  Max seq length:      {config.max_seq_length}")
     print(f"  Epochs:              {config.num_epochs}")
     print(f"  Mixed precision:     {config.use_fp16}")
     print(f"  EWC:                 {'enabled (lambda=' + str(config.ewc_lambda) + ')' if ewc else 'disabled'}")
     print("=" * 80 + "\n")
     model.to(device)
     model.train()
@@ -886,7 +961,7 @@ def train_model(
     )
     if not curriculum_datasets:
-        print("ERROR: No curriculum stages. Check --skip-stages.")
         return model
     optimizer = torch.optim.AdamW(
@@ -902,7 +977,6 @@ def train_model(
         for ds in curriculum_datasets
     ) or 1
-    # FIX: use torch.amp.* (PyTorch 2.x API, not deprecated cuda.amp.*)
     use_amp = config.use_fp16 and device.type == 'cuda'
     scaler  = torch.amp.GradScaler('cuda') if use_amp else None
@@ -931,13 +1005,9 @@ def train_model(
               f"n={len(stage_dataset)} | CoT={getattr(stage_dataset, 'cot_ratio', '?'):.0%}")
         print(f"{'=' * 80}\n")
-        dataloader = DataLoader(
-            stage_dataset,
-            batch_size=config.batch_size,
-            shuffle=True,
-            collate_fn=lambda x: collate_fn_with_packing(x, pad_token_id=model.padding_idx),
-            num_workers=0,
-            pin_memory=(device.type == 'cuda'),
         )
         for epoch in range(config.num_epochs):
@@ -945,6 +1015,10 @@ def train_model(
             acc = TokenLossAccumulator()
             optimizer.zero_grad()
             for step, batch in enumerate(dataloader):
                 input_ids      = batch['input_ids']     .to(device)
                 attention_mask = batch['attention_mask'] .to(device)
@@ -955,25 +1029,35 @@ def train_model(
                         outputs   = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                         task_loss = outputs['loss']
                         if ewc is not None:
-                            task_loss = task_loss + config.ewc_lambda * ewc.penalty(model)
-                        loss = task_loss / config.gradient_accumulation_steps
                     scaler.scale(loss).backward()
                 else:
                     outputs   = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                     task_loss = outputs['loss']
                     if ewc is not None:
-                        task_loss = task_loss + config.ewc_lambda * ewc.penalty(model)
-                    loss = task_loss / config.gradient_accumulation_steps
                     loss.backward()
-                # FIX: per-token tracking
                 acc.update(task_loss.item(), labels)
                 if (step + 1) % config.gradient_accumulation_steps == 0:
                     if use_amp:
                         scaler.unscale_(optimizer)
-                    # FIX: log gradient norm
                     grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                     if use_amp:
@@ -1014,13 +1098,8 @@ def train_model(
 def evaluate_model(model, dataset, device, batch_size=4):
     model.eval()
-    dataloader = DataLoader(
-        dataset,
-        batch_size=batch_size,
-        shuffle=False,
-        collate_fn=lambda x: collate_fn_with_packing(x, pad_token_id=model.padding_idx),
-        num_workers=0,
-    )
     acc = TokenLossAccumulator()
     with torch.no_grad():
         for batch in dataloader:
@@ -1042,9 +1121,27 @@ def evaluate_model(model, dataset, device, batch_size=4):
 # ============================================================================
-# GENERATION WITH KV CACHE
 # ============================================================================
 def generate_text(
     model: IndonesianLLM,
     tokenizer,
@@ -1057,59 +1154,54 @@ def generate_text(
     device: torch.device = torch.device('cpu'),
 ) -> str:
     """
-    KV-cache generation: O(n) per new token instead of O(n²).
-    FIX: repetition_penalty now uses proper logit division (Keskar et al.),
-    not a magic subtraction that breaks under low temperature.
     """
     model.eval()
-    # Reseed from OS entropy so repeated calls with the same prompt diverge.
-    # This is the core fix: torch.multinomial outcome depends on torch RNG state,
-    # which was frozen to seed=42 at startup. Each call now starts from a unique state.
-    import os as _os
-    _entropy = int.from_bytes(_os.urandom(4), 'little')
-    torch.manual_seed(_entropy)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(_entropy)
     eos_id = tokenizer.eos_token_id or tokenizer.sep_token_id or 2
     pad_id = tokenizer.pad_token_id or 0
-    input_ids      = tokenizer.encode(prompt, return_tensors='pt').to(device)  # [1, T]
-    generated_ids  = input_ids.clone()
     with torch.no_grad():
-        # ── PREFILL: process entire prompt at once, capture KV cache ──────────
-        prefill_out  = model(
             input_ids=input_ids,
             use_cache=True,
         )
         past_kv = prefill_out['past_key_values']
-        # Seed penalty buffer with prompt tokens so model can't echo the input
-        prompt_token_ids = input_ids[0].tolist()
         generated_token_ids = []
-        # ── DECODE: one token at a time using cached K/V ──────────────────────
         for _ in range(max_new_tokens):
-            # Only feed the last token
-            cur_id = generated_ids[:, -1:]  # [1, 1]
-            out        = model(input_ids=cur_id, past_key_values=past_kv, use_cache=True)
-            past_kv    = out['past_key_values']
-            logits     = out['logits'][:, -1, :]            # [1, V]
-            logits     = logits / max(temperature, 0.05)
-            # Penalize: all tokens seen so far (prompt + generated)
             if repetition_penalty != 1.0:
-                all_seen = set(prompt_token_ids + generated_token_ids[-64:])
-                for tok_id in all_seen:
-                    if 0 <= tok_id < logits.shape[-1]:
-                        if logits[0, tok_id] > 0:
-                            logits[0, tok_id] /= repetition_penalty
-                        else:
-                            logits[0, tok_id] *= repetition_penalty
             # Top-k
             if top_k > 0:
@@ -1127,7 +1219,8 @@ def generate_text(
                 logits = torch.zeros_like(logits).scatter_(1, sorted_idx, sorted_logits)
             probs      = F.softmax(logits, dim=-1)
-            next_token = torch.multinomial(probs, num_samples=1)  # [1, 1]
             tok_id = next_token.item()
             if tok_id in {eos_id, pad_id}:
@@ -1136,24 +1229,15 @@ def generate_text(
             generated_token_ids.append(tok_id)
             generated_ids = torch.cat([generated_ids, next_token], dim=1)
-            # Hard context limit (shouldn't be reached with max_new_tokens)
             if generated_ids.size(1) >= model.config.max_position_embeddings:
                 break
-    import re as _re
-    prompt_len = input_ids.shape[1]
-    # Decode ONLY the newly generated tokens — never the prompt.
-    # This avoids the slice-by-string-length bug where tokenizer spacing
-    # makes len(prompt_str) != number of chars in decoded(prompt_tokens),
-    # causing callers to cut mid-token and get "ot>" instead of "<cot>".
     new_token_ids = generated_ids[0][prompt_len:]
     if len(new_token_ids) == 0:
         return ""
     raw_text = tokenizer.decode(new_token_ids, skip_special_tokens=False)
-    # Strip BERT specials but keep <cot> </cot>
-    raw_text = _re.sub(r'\[(SEP|CLS|PAD|UNK|MASK)\]', '', raw_text)
     return raw_text.strip()
@@ -1162,64 +1246,41 @@ def generate_text(
 # ============================================================================
 def _clean_response(response: str) -> str:
-    import re
-    # Strip CoT block — do this first before any other processing
     if "<cot>" in response and "</cot>" in response:
         response = response.split("</cot>", 1)[-1]
     elif "<cot>" in response:
-        # Model started CoT but never closed it — everything before <cot> is prompt leak,
-        # everything after is the partial reasoning. Discard both, use empty.
         response = ""
-    # Strip BERT-style special tokens that appear when skip_special_tokens=False
     response = re.sub(r'\[(SEP|CLS|PAD|UNK|MASK)\]', '', response)
-    # Strip all remaining XML/special tags
     response = re.sub(r'<[^>]+>', '', response)
-    # Role markers only at line start
-    response = re.sub(r'(?im)^\s*(user\s*:|assistant\s*:).*', '', response)
-    # Strip meta-commentary (Indonesian-specific)
     for marker in ["memahami permintaan", "jawaban singkat", "penjelasan harus"]:
         if marker in response:
             response = response.split(marker)[0]
-    # Collapse whitespace
     response = re.sub(r'\n{2,}', '\n', response)
     response = re.sub(r' {2,}', ' ', response)
-    # Strip leading punctuation/whitespace junk — but NOT digits or letters
     response = re.sub(r'^[\s:!,.\-|]+', '', response)
     return response.strip()
 def _extract_thinking(raw: str) -> Tuple[str, str]:
-    import re
-    # Strip BERT special tokens first (they appear with skip_special_tokens=False)
     raw = re.sub(r'\[(SEP|CLS|PAD|UNK|MASK)\]', '', raw)
     if "</cot>" in raw:
-        # Normal case: model produced full CoT block
         thinking_raw, answer_raw = raw.split("</cot>", 1)
         thinking = re.sub(r'<[^>]+>', '', thinking_raw).strip()
-        thinking = re.sub(r'(?im)^\s*(user\s*:|assistant\s*:).*', '', thinking).strip()
         answer   = _clean_response(answer_raw)
     elif "<cot>" in raw:
-        # Model started CoT but never finished — reasoning only, no answer yet.
-        # Extract whatever came before <cot> as a potential direct answer,
-        # or whatever came after as partial reasoning.
         parts    = raw.split("<cot>", 1)
         thinking = _clean_response(parts[1]) if len(parts) > 1 else ""
-        # No clean answer available — return empty, caller will fall back
         answer   = _clean_response(parts[0]) if parts[0].strip() else ""
     else:
-        # No CoT tags at all — the whole output IS the answer (model skipped reasoning)
         thinking = ""
         answer   = _clean_response(raw)
@@ -1227,7 +1288,7 @@ def _extract_thinking(raw: str) -> Tuple[str, str]:
 # ============================================================================
-# INTERACTIVE CHAT
 # ============================================================================
 def interactive_chat(
@@ -1235,16 +1296,40 @@ def interactive_chat(
     tokenizer,
     device: torch.device,
     system_prompt: str = "Kamu adalah asisten AI yang membantu, ramah, dan menjawab dalam Bahasa Indonesia.",
 ):
     print("\n" + "=" * 80)
-    print("INDONESIAN LLM — INTERACTIVE CHAT  (KV-cache enabled)")
     print("=" * 80)
     print("Commands: exit/quit | clear | think (toggle CoT display)")
     print(f"Persona : {system_prompt}")
     print("=" * 80 + "\n")
     model.eval()
-    show_thinking = False
     while True:
         try:
@@ -1255,6 +1340,7 @@ def interactive_chat(
                 print("\nSelamat tinggal!")
                 break
             if user_input.lower() in ['clear', 'bersihkan']:
                 print("\nConversation cleared.")
                 continue
             if user_input.lower() == 'think':
@@ -1262,10 +1348,9 @@ def interactive_chat(
                 print(f"\nThinking mode: {'ON' if show_thinking else 'OFF'}")
                 continue
-            prompt = f"{user_input} <cot>"
             print("\nA:", end=" ", flush=True)
-            # generate_text now returns ONLY new tokens (no prompt prefix)
             response = generate_text(
                 model=model,
                 tokenizer=tokenizer,
@@ -1282,15 +1367,11 @@ def interactive_chat(
             if show_thinking and thinking:
                 print(f"[Thinking: {thinking}]")
-            # Use answer if non-empty; fall back to cleaned full response;
-            # last resort: use thinking itself (model reasoned but didn't emit answer).
-            # Never throw away a valid short answer like "1", "2", "ya".
             if answer:
                 final = answer
             else:
                 final = _clean_response(response)
                 if not final and thinking:
-                    # Model only produced reasoning, extract last sentence as answer
                     sentences = [s.strip() for s in thinking.split('.') if s.strip()]
                     final = sentences[-1] if sentences else thinking[:200]
@@ -1298,6 +1379,9 @@ def interactive_chat(
                 final = "..."
             print(final)
         except KeyboardInterrupt:
             print("\n\nDihentikan.")
             break
@@ -1332,21 +1416,16 @@ def run_benchmark(model, tokenizer, device, dataset_path: str = None, n: int = 2
         print("No valid samples.")
         return
-    # Time-based seed: different sample selection AND different generation each run
-    import time
     live_seed = int(time.time() * 1000) % (2**31)
     random.seed(live_seed)
-    torch.manual_seed(live_seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(live_seed)
     samples = random.sample(all_samples, min(n, len(all_samples)))
     model.eval()
     print(f"\n{'=' * 80}\nBENCHMARK  ({len(samples)} samples)\n{'=' * 80}")
-    results  = []
-    acc      = TokenLossAccumulator()
     for sample in samples:
         inp      = sample['input'].strip()
@@ -1358,7 +1437,6 @@ def run_benchmark(model, tokenizer, device, dataset_path: str = None, n: int = 2
         _, answer    = _extract_thinking(raw)
         answer_lower = answer.lower()
-        # Exact + token-overlap match
         passed = expected in answer_lower
         if not passed:
             exp_toks = set(expected.split())
@@ -1391,9 +1469,16 @@ def run_benchmark(model, tokenizer, device, dataset_path: str = None, n: int = 2
 # ============================================================================
 # SAVE / LOAD
 # ============================================================================
-def save_model(model: IndonesianLLM, config: ModelConfig, tokenizer_name: str, path: str, use_fp16: bool = True):
     os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
     state = model.state_dict()
     if use_fp16:
@@ -1406,10 +1491,17 @@ def save_model(model: IndonesianLLM, config: ModelConfig, tokenizer_name: str, p
         'dtype':            'fp16' if use_fp16 else 'fp32',
     }, path)
     size_mb = os.path.getsize(path) / 1e6
-    print(f"\nSaved: {path}  ({'fp16' if use_fp16 else 'fp32'}, {size_mb:.1f} MB, {model.count_parameters():,} params)")
-def load_model(path: str, device: torch.device):
     if not os.path.exists(path):
         raise FileNotFoundError(f"Checkpoint not found: {path}")
     print(f"Loading: {path}")
@@ -1420,36 +1512,41 @@ def load_model(path: str, device: torch.device):
     dtype          = ck.get('dtype', 'fp32')
     state = ck['model_state_dict']
-    if dtype == 'fp16':
         state = {k: v.float() if v.dtype == torch.float16 else v for k, v in state.items()}
-    # Always derive intermediate_size from actual saved weights so the
-    # model architecture matches exactly, regardless of what the config says.
-    # gate_proj shape is [intermediate_size, hidden_size].
     gate_key = next((k for k in state if k.endswith('gate_proj.weight')), None)
     if gate_key is not None:
         inferred_intermediate = state[gate_key].shape[0]
         if getattr(config, 'intermediate_size', -1) != inferred_intermediate:
             print(f"  [load_model] intermediate_size: config={getattr(config, 'intermediate_size', '?')} "
-                  f"-> overriding with checkpoint value {inferred_intermediate}")
             config.intermediate_size = inferred_intermediate
-    # Sync vocab_size from embedding weight shape
     embed_key = next((k for k in state if k.endswith('embed_tokens.weight')), None)
     if embed_key is not None:
         inferred_vocab = state[embed_key].shape[0]
         if config.vocab_size != inferred_vocab:
             print(f"  [load_model] vocab_size: config={config.vocab_size} "
-                  f"-> overriding with checkpoint value {inferred_vocab}")
             config.vocab_size = inferred_vocab
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
     tokenizer.add_special_tokens({"additional_special_tokens": ["<cot>", "</cot>"]})
     model = IndonesianLLM(config)
-    model.load_state_dict(state)
     model.to(device)
     size_mb = os.path.getsize(path) / 1e6
     print(f"Loaded ({dtype}, {size_mb:.1f} MB, {ck.get('model_params', model.count_parameters()):,} params)")
     return model, tokenizer, config, {}
@@ -1493,6 +1590,11 @@ def main():
     parser.add_argument('--ewc-lambda',       type=float, default=5000.0)
     parser.add_argument('--ewc-samples',      type=int,   default=2000)
     parser.add_argument('--no-ewc',           action='store_true')
     args = parser.parse_args()
@@ -1505,13 +1607,9 @@ def main():
     save_fp16        = not args.save_fp32
     use_cot_training = not args.no_cot
-    # Only fix the seed for training (reproducibility).
-    # Chat and benchmark must NOT be seeded — identical seeds produce identical
-    # outputs every run, making the model feel like a lookup table.
     if args.train or args.finetune or args.continue_train:
         set_seed(args.seed)
     else:
-        # Use a time-based seed so every run is different
         import time
         live_seed = int(time.time() * 1000) % (2**31)
         random.seed(live_seed)
@@ -1519,13 +1617,13 @@ def main():
         torch.manual_seed(live_seed)
         if torch.cuda.is_available():
             torch.cuda.manual_seed_all(live_seed)
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     print(f"\nDevice: {device}")
     if torch.cuda.is_available():
         print(f"  GPU:  {torch.cuda.get_device_name(0)}")
         print(f"  VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
-    # ── INSPECT DATA ─────────────────────────────────────────────────────────
     if args.inspect_data:
         tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")
         tokenizer.add_special_tokens({"additional_special_tokens": ["<cot>", "</cot>"]})
@@ -1542,24 +1640,22 @@ def main():
             print(f"  Output: {s['output'][:120]}")
         return
-    # ── CHAT ─────────────────────────────────────────────────────────────────
     if args.chat:
-        model, tokenizer, _, _ = load_model(args.model, device)
-        interactive_chat(model, tokenizer, device, system_prompt=args.system_prompt)
         return
-    # ── BENCHMARK ────────────────────────────────────────────────────────────
     if args.benchmark:
-        model, tokenizer, _, _ = load_model(args.model, device)
         run_benchmark(model, tokenizer, device, dataset_path=args.dataset)
         return
-    # ── TRAIN FROM SCRATCH ───────────────────────────────────────────────────
     if args.train:
         tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")
         tokenizer.add_special_tokens({"additional_special_tokens": ["<cot>", "</cot>"]})
-        # FIX: vocab_size from actual tokenizer length (never hardcoded)
         model_config = ModelConfig(
             vocab_size              = len(tokenizer),
             hidden_size             = args.hidden_size,
@@ -1570,32 +1666,30 @@ def main():
             attention_dropout       = 0.1,
             residual_dropout        = 0.1,
             tie_word_embeddings     = True,
         )
         print(f"\nModel config: {model_config}")
-        print(f"intermediate_size (SwiGLU 8/3): {model_config.intermediate_size}")
         model = IndonesianLLM(model_config)
-        # Tag residual projections BEFORE init so depth-scaling applies
-        model._tag_projection_layers()
-        model.apply(model._init_weights)
         print(f"Parameters: {model.count_parameters():,}")
         _ga = args.grad_accum or 32
         train_config = TrainingConfig(
-            dataset_path              = args.dataset,
-            num_epochs                = args.epochs,
-            batch_size                = args.batch_size,
-            gradient_accumulation_steps = _ga,
-            max_seq_length            = args.max_length,
-            learning_rate             = args.lr,
-            warmup_steps              = 500,
-            use_fp16                  = torch.cuda.is_available(),
-            curriculum_stages         = [128, 256, args.max_length],
         )
         dataset = IndonesianCoTDataset(train_config.dataset_path, tokenizer,
-                                        train_config.max_seq_length, use_cot=use_cot_training,
-                                        cot_ratio=args.cot_ratio)
         model   = train_model(model, dataset, train_config, device,
                               use_simple_curriculum=args.simple_curriculum)
@@ -1611,32 +1705,31 @@ def main():
             print(f"\nPrompt   : {p}")
             print(f"Generated: {generate_text(model, tokenizer, p, max_new_tokens=150, device=device)}\n")
-    # ── FINETUNE ──────────────────────────────────────��──────────────────────
     if args.finetune:
-        model, tokenizer, model_config, _ = load_model(args.model, device)
         _ga = args.grad_accum or 32
         train_config = TrainingConfig(
-            dataset_path              = args.dataset,
-            num_epochs                = args.epochs,
-            batch_size                = args.batch_size,
-            gradient_accumulation_steps = _ga,
-            max_seq_length            = args.max_length,
-            learning_rate             = args.lr / 10,
-            warmup_steps              = 100,
-            use_fp16                  = torch.cuda.is_available(),
-            curriculum_stages         = [128, 256, args.max_length],
         )
         dataset = IndonesianCoTDataset(train_config.dataset_path, tokenizer,
-                                        train_config.max_seq_length, use_cot=use_cot_training,
-                                        cot_ratio=args.cot_ratio)
         ewc_obj = None
         if not args.no_ewc and args.ewc_lambda > 0:
             print(f"\nComputing EWC Fisher (lambda={args.ewc_lambda}, n={args.ewc_samples})...")
-            loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True,
-                                collate_fn=lambda x: collate_fn_with_packing(x, model.padding_idx),
-                                num_workers=0)
             train_config.ewc_lambda  = args.ewc_lambda
             train_config.ewc_samples = args.ewc_samples
             ewc_obj = EWC(model, loader, device, n_samples=args.ewc_samples)
@@ -1651,36 +1744,34 @@ def main():
         save_model(model, model_config, "indolem/indobert-base-uncased", out_path, use_fp16=save_fp16)
         print(f"\nFinetuned model: {out_path}")
-    # ── CONTINUE TRAINING ────────────────────────────────────────────────────
     if args.continue_train:
-        model, tokenizer, model_config, _ = load_model(args.model, device)
-        effective_lr    = args.lr * 0.05
-        effective_skip  = (len([128, 256, args.max_length]) - 1) if args.simple_curriculum else args.skip_stages
-        curriculum      = [192, 320, args.max_length]
-        print(f"\nContinue-train LR: {effective_lr:.2e}  (skip {effective_skip} stages)")
         _ga = args.grad_accum or 32
         train_config = TrainingConfig(
-            dataset_path              = args.dataset,
-            num_epochs                = args.epochs,
-            batch_size                = args.batch_size,
-            gradient_accumulation_steps = _ga,
-            max_seq_length            = args.max_length,
-            learning_rate             = effective_lr,
-            warmup_steps              = 0,
-            use_fp16                  = torch.cuda.is_available(),
-            curriculum_stages         = curriculum,
-            skip_curriculum_stages    = effective_skip,
-            plateau_patience          = 2,
-            plateau_factor            = 0.5,
-            plateau_min_delta         = 0.02,
         )
         dataset = IndonesianCoTDataset(train_config.dataset_path, tokenizer,
-                                        train_config.max_seq_length, use_cot=use_cot_training,
-                                        cot_ratio=args.cot_ratio)
         model = train_model(model, dataset, train_config, device,
                             use_simple_curriculum=args.simple_curriculum,
                             is_continue=True,

 Architecture: Decoder-only transformer with GQA, RoPE, SwiGLU, RMSNorm, KV-Cache
 Target: 15M-30M parameters, optimized for Google Colab Free tier
+FIXES in this version (on top of prior fixes):
+  [INFERENCE]
+  - FIX-I1: KV cache RoPE offset uses proper position_ids tensor, not slice arithmetic
+  - FIX-I2: Vectorized repetition penalty (scatter gather on GPU, no Python loop)
+  - FIX-I3: torch.Generator for per-call entropy — no global RNG reset
+  - FIX-I4: Multi-turn conversation history in interactive_chat
+  - FIX-I5: Top-p preallocated scratch tensors (minor, readability)
+  - FIX-I6: generate_text returns generator for streaming (optional)
+  [TRAINING]
+  - FIX-T1: _tag_projection_layers called inside __init__ before apply(_init_weights)
+  - FIX-T2: EWC penalty computed once per optimizer step, not per micro-batch
+  - FIX-T3: acc.update tracks task_loss_only (no EWC in perplexity)
+  - FIX-T4: PyTorch version guard for label_smoothing + ignore_index interaction
+  - FIX-T5: DataLoader num_workers=2 with persistent_workers on CUDA
+  - FIX-T6: Gradient checkpointing option (halves activation memory)
+  - FIX-T7: save/load fp16 stays fp16 at inference — no upcast unless training
+  - FIX-T8: TrainingConfig.skip_curriculum_stages actually used (dead field removed)
+  - FIX-T9: EWC Fisher uses the model's own argmax predictions as pseudo-labels (not the dataset labels)
 """
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.utils.data import Dataset, DataLoader
+from torch.utils.checkpoint import checkpoint as gradient_checkpoint
 from transformers import AutoTokenizer
 import json
 import math
 import random
 import numpy as np
+from typing import Optional, Tuple, List, Dict, Generator
 from dataclasses import dataclass, field
 import warnings
 import argparse
 import os
+import re
 warnings.filterwarnings('ignore')
 @dataclass
 class ModelConfig:
+    vocab_size: int = 32000
     hidden_size: int = 384
     num_layers: int = 12
     num_attention_heads: int = 6
+    num_key_value_heads: int = 2
     intermediate_size: int = 0
     max_position_embeddings: int = 2048
     rms_norm_eps: float = 1e-6
     eos_token_id: int = 2
     tie_word_embeddings: bool = True
     label_smoothing: float = 0.1
+    # FIX-T6: gradient checkpointing flag
+    use_gradient_checkpointing: bool = False
     def __post_init__(self):
         if self.intermediate_size <= 0:
             self.intermediate_size = self.hidden_size * 3
         assert self.hidden_size % self.num_attention_heads == 0, \
     lr_scheduler_type: str = "cosine"
     dropout: float = 0.1
     use_fp16: bool = True
+    # FIX-T6: expose gradient checkpointing in training config
+    use_gradient_checkpointing: bool = False
     seed: int = 42
     logging_steps: int = 10
     save_steps: int = 500
     curriculum_stages: List[int] = None
     plateau_patience: int = 3
     plateau_factor: float = 0.5
         t = torch.arange(seq_len, device=self.inv_freq.device).type_as(self.inv_freq)
         freqs = torch.outer(t, self.inv_freq)
         emb = torch.cat((freqs, freqs), dim=-1)
+        # [1, 1, T, D] for broadcast onto [B, H, T, D]
         self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
         self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
     return torch.cat((-x2, x1), dim=-1)
+# FIX-I1: position_ids-based RoPE application — no slice arithmetic
+def apply_rotary_pos_emb_with_ids(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    position_ids: torch.Tensor,  # [B, T] — always provided
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply RoPE using explicit position_ids.
+    cos/sin: [1, 1, max_seq, D]
+    position_ids: [B, T]  (T=1 during decode, T=seq_len during prefill)
+    """
+    # Gather cos/sin for the specific positions: [B, T, D]
+    cos_pos = cos[0, 0][position_ids]   # [B, T, D]
+    sin_pos = sin[0, 0][position_ids]   # [B, T, D]
+    # Unsqueeze head dim for broadcast: [B, 1, T, D]
+    cos_pos = cos_pos.unsqueeze(1)
+    sin_pos = sin_pos.unsqueeze(1)
+    q_embed = (q * cos_pos) + (rotate_half(q) * sin_pos)
+    k_embed = (k * cos_pos) + (rotate_half(k) * sin_pos)
     return q_embed, k_embed
         self,
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,       # FIX-I1
         past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
         key_states   = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads,           self.head_dim).transpose(1, 2)
         key_states   = key_states  .view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        past_len = past_key_value[0].shape[2] if past_key_value is not None else 0
+        kv_seq_len = past_len + q_len
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        # FIX-I1: use explicit position_ids for RoPE — works for both prefill and decode
+        if position_ids is None:
+            position_ids = torch.arange(past_len, past_len + q_len,
+                                        device=hidden_states.device).unsqueeze(0).expand(bsz, -1)
+        query_states, key_states = apply_rotary_pos_emb_with_ids(
+            query_states, key_states, cos, sin, position_ids
+        )
+        # Append to KV cache BEFORE repeat (store compact num_kv_heads version)
         if past_key_value is not None:
             key_states   = torch.cat([past_key_value[0], key_states],   dim=2)
             value_states = torch.cat([past_key_value[1], value_states], dim=2)
         present_kv = (key_states, value_states) if use_cache else None
+        # Expand KV for full multi-head attention
         if self.num_key_value_groups > 1:
             key_states   = key_states  .repeat_interleave(self.num_key_value_groups, dim=1)
             value_states = value_states.repeat_interleave(self.num_key_value_groups, dim=1)
     def __init__(self, config: ModelConfig):
         super().__init__()
         self.hidden_size = config.hidden_size
         inter = getattr(config, 'intermediate_size', 0)
         if not isinstance(inter, int) or inter <= 0:
             inter = self.hidden_size * 3
         self,
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,    # FIX-I1
         past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         use_cache: bool = False,
     ):
+        residual      = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
         hidden_states, present_kv = self.self_attn(
             hidden_states,
             attention_mask=attention_mask,
+            position_ids=position_ids,
             past_key_value=past_key_value,
             use_cache=use_cache,
         )
         hidden_states = self.residual_dropout(hidden_states)
         hidden_states = residual + hidden_states
+        residual      = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
         hidden_states = self.mlp(hidden_states)
         hidden_states = self.residual_dropout(hidden_states)
 # ============================================================================
+# CUSTOM LABEL SMOOTHING LOSS
 # ============================================================================
+# FIX-T4: PyTorch version guard for label_smoothing + ignore_index
+_TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2] if x.isdigit())
+_NATIVE_SMOOTH_SAFE = _TORCH_VERSION >= (1, 10)
+class LabelSmoothingCrossEntropy(nn.Module):
     def __init__(self, vocab_size: int, smoothing: float = 0.1, ignore_index: int = -100):
         super().__init__()
         self.vocab_size   = vocab_size
         self.ignore_index = ignore_index
     def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
+        if _NATIVE_SMOOTH_SAFE and self.smoothing > 0:
+            return F.cross_entropy(
+                logits,
+                targets,
+                ignore_index=self.ignore_index,
+                label_smoothing=self.smoothing,
+            )
+        else:
+            # Manual fallback: safe for any PyTorch version
+            log_probs  = F.log_softmax(logits, dim=-1)
+            nll_loss   = F.nll_loss(log_probs, targets, ignore_index=self.ignore_index, reduction='mean')
+            if self.smoothing <= 0:
+                return nll_loss
+            smooth_loss = -log_probs.mean(dim=-1)
+            mask        = (targets != self.ignore_index)
+            smooth_loss = smooth_loss[mask].mean() if mask.any() else smooth_loss.mean()
+            return (1.0 - self.smoothing) * nll_loss + self.smoothing * smooth_loss
 # ============================================================================
         self.padding_idx = config.pad_token_id
         self.vocab_size  = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size,
+                                         padding_idx=self.padding_idx)
         self.layers       = nn.ModuleList([DecoderLayer(config, idx) for idx in range(config.num_layers)])
         self.norm         = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
             nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.loss_fn = LabelSmoothingCrossEntropy(
+            vocab_size   = config.vocab_size,
+            smoothing    = config.label_smoothing,
+            ignore_index = -100,
         )
+        # FIX-T1: tag projection layers BEFORE weight init so depth-scaling applies
+        self._tag_projection_layers()
         self.apply(self._init_weights)
     def _init_weights(self, module):
         std = self.config.initializer_range
         if isinstance(module, nn.Linear):
             name = getattr(module, '_layer_name', '')
             if name in ('o_proj', 'down_proj'):
                 scaled_std = std / math.sqrt(2 * self.config.num_layers)
                 module.weight.data.normal_(mean=0.0, std=scaled_std)
             else:
                 module.weight.data[module.padding_idx].zero_()
     def _tag_projection_layers(self):
         for layer in self.layers:
             layer.self_attn.o_proj._layer_name = 'o_proj'
             layer.mlp.down_proj._layer_name    = 'down_proj'
         attention_mask: Optional[torch.Tensor] = None,
         batch_size: int = 1,
     ) -> torch.Tensor:
         total_len = past_len + seq_len
+        causal    = torch.full((seq_len, total_len), torch.finfo(dtype).min,
+                               device=device, dtype=dtype)
         mask_cond = torch.arange(total_len, device=device)
+        causal.masked_fill_(
+            mask_cond[None, :] <= (torch.arange(seq_len, device=device) + past_len)[:, None],
+            0.0
+        )
         causal = causal[None, None, :, :].expand(batch_size, 1, seq_len, total_len)
         if attention_mask is not None:
             pad_mask = (1.0 - attention_mask[:, None, None, :].float()) * torch.finfo(dtype).min
             causal   = causal + pad_mask
         return causal
+    # FIX-T6: gradient checkpointing wrapper for decoder layers
+    def _layer_forward_with_ckpt(
+        self,
+        layer,
+        hidden_states,
+        attention_mask,
+        position_ids,
+        past_key_value,
+        use_cache,
+    ):
+        if self.config.use_gradient_checkpointing and self.training and past_key_value is None:
+            # Gradient checkpointing is only meaningful during training prefill
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs, use_cache=False)
+                return custom_forward
+            hidden_states, _ = gradient_checkpoint(
+                create_custom_forward(layer),
+                hidden_states,
+                attention_mask,
+                position_ids,
+                None,
+                use_reentrant=False,
+            )
+            return hidden_states, None
+        else:
+            return layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                use_cache=use_cache,
+            )
     def forward(
         self,
         input_ids: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,          # FIX-I1
         labels: Optional[torch.Tensor] = None,
         past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
         use_cache: bool = False,
         hidden_states = self.embed_tokens(input_ids)
         if attention_mask is None:
             attention_mask = torch.ones(batch_size, past_len + seq_length,
                                         dtype=torch.long, device=input_ids.device)
+        # FIX-I1: build position_ids for this forward pass
+        if position_ids is None:
+            position_ids = torch.arange(past_len, past_len + seq_length,
+                                        device=input_ids.device).unsqueeze(0).expand(batch_size, -1)
         causal_mask = self._make_causal_mask(
             seq_len=seq_length,
             past_len=past_len,
         for i, decoder_layer in enumerate(self.layers):
             pkv = past_key_values[i] if past_key_values is not None else None
+            hidden_states, present_kv = self._layer_forward_with_ckpt(
+                decoder_layer,
                 hidden_states,
+                causal_mask,
+                position_ids,
+                pkv,
+                use_cache,
             )
             if use_cache:
                 present_key_values.append(present_kv)
             loss = self.loss_fn(shift_logits, shift_labels)
         return {
+            "loss":            loss,
+            "logits":          logits,
+            "past_key_values": present_key_values,
         }
     def count_parameters(self) -> int:
         self.skipped_count  = 0
         self._load_data(file_path)
     @property
     def _eos(self) -> str:
         # End-of-sequence text: the tokenizer's eos_token when truthy, else "</s>".
         eos = self.tokenizer.eos_token
         return eos if eos else "</s>"
     def _load_data(self, file_path: str):
         print(f"Loading dataset from {file_path}...")
         with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
                 try:
                     if not line.strip():
                         continue
     def __getitem__(self, idx):
         sample = self.samples[idx]
         if self.use_cot and random.random() < self.cot_ratio:
             prompt     = f"{sample['input']} {self.cot_token}"
             completion = f" {sample['cot']} {self.end_cot_token} {sample['output']}{self._eos}"
         else:
             prompt     = f"{sample['input']}"
             add_special_tokens=True,
         )
         labels = [-100] * min(prompt_len, len(full_ids)) + full_ids[prompt_len:]
         labels = labels[:len(full_ids)]
 # ============================================================================
+# PER-TOKEN LOSS TRACKING
 # ============================================================================
 class TokenLossAccumulator:
     def __init__(self):
         self.total_loss   = 0.0
         self.total_tokens = 0
 def create_curriculum_datasets(dataset, stages=None, use_simple=False, skip_stages=0):
     if stages is None:
         stages = [256, 512, 1024]
+    print("\n" + "=" * 80)
+    print("3-STAGE REASONING CURRICULUM")
+    if skip_stages > 0:
+        print(f"  (Skipping first {skip_stages} stage(s))")
+    print("=" * 80)
     if use_simple:
+        datasets = []
         for i, max_len in enumerate(stages):
             filtered = [
                 s for s in dataset.samples
                 if len(dataset.tokenizer.encode(
                 )) <= max_len
             ]
             datasets.append(_build_stage_dataset(dataset, filtered, max_len, dataset.cot_ratio))
+            tag = " [SKIP]" if i < skip_stages else ""
+            print(f"  Stage {max_len}{tag}: {len(filtered)} samples")
     else:
         stage_configs = [
             {'name': 'Stage 1: Basic Q&A (no CoT)',           'max_len': 384,  'cot_ratio': 0.0},
             {'name': 'Stage 2: Learning Reasoning (50% CoT)', 'max_len': 512,  'cot_ratio': 0.5},
             {'name': 'Stage 3: Full Reasoning (100% CoT)',    'max_len': 1024, 'cot_ratio': 1.0},
         ]
+        datasets = []
         for idx, sc in enumerate(stage_configs):
+            filtered = dataset.samples if idx > 0 else [
+                s for s in dataset.samples
+                if len(dataset.tokenizer.encode(f"{s['input']} {s['output']}")) <= sc['max_len']
+            ]
             datasets.append(_build_stage_dataset(dataset, filtered, sc['max_len'], sc['cot_ratio']))
+            tag = " [SKIP]" if idx < skip_stages else ""
             print(f"  {sc['name']}{tag}  |  samples={len(filtered)}  |  CoT={sc['cot_ratio']:.0%}")
+    print("=" * 80 + "\n")
+    return datasets[skip_stages:]
 # ============================================================================
     torch.backends.cudnn.benchmark     = False
+def _make_dataloader(dataset, batch_size, shuffle, pad_token_id, device_type):
+    # FIX-T5: use num_workers=2 with persistent_workers on CUDA for better GPU util
+    num_workers      = 2 if device_type == 'cuda' else 0
+    persistent       = (num_workers > 0)
+    return DataLoader(
+        dataset,
+        batch_size=batch_size,
+        shuffle=shuffle,
+        collate_fn=lambda x: collate_fn_with_packing(x, pad_token_id=pad_token_id),
+        num_workers=num_workers,
+        persistent_workers=persistent,
+        pin_memory=(device_type == 'cuda'),
+    )
 # ============================================================================
 # ELASTIC WEIGHT CONSOLIDATION (EWC)
 # ============================================================================
         self.fisher    = self._compute_fisher(model, dataloader)
     def _compute_fisher(self, model, dataloader):
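+        """
+        FIX-T9: Fisher estimate computed from the model's own greedy (argmax)
+        predictions, used as pseudo-labels in place of the dataset labels.
+        NOTE: in the literature the "empirical Fisher" is the variant that uses the
+        dataset labels; labels *sampled* from the model's predictive distribution
+        give the true Fisher, which the argmax pseudo-labels here only approximate.
+        The EWC penalty itself follows Kirkpatrick et al. 2017.
+        """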
         fisher = {n: torch.zeros_like(p) for n, p in model.named_parameters() if p.requires_grad}
         model.eval()
         seen = 0
                 break
             input_ids      = batch["input_ids"]     .to(self.device)
             attention_mask = batch["attention_mask"] .to(self.device)
             model.zero_grad()
+            with torch.no_grad():
+                out    = model(input_ids=input_ids, attention_mask=attention_mask)
+                logits = out["logits"]
+                # Use model's own greedy predictions as labels (empirical Fisher)
+                pred_labels = logits[:, :-1, :].argmax(dim=-1)  # [B, T-1]
+                # Shift input_ids for proper alignment
+                # pred_labels serve as targets for the shifted logits
+                flat_logits = logits[:, :-1, :].contiguous().view(-1, model.vocab_size)
+                flat_labels = pred_labels.contiguous().view(-1)
+            # Recompute with grad enabled using the pseudo-labels
+            out2 = model(input_ids=input_ids, attention_mask=attention_mask)
+            flat_logits_grad = out2["logits"][:, :-1, :].contiguous().view(-1, model.vocab_size)
+            loss = F.cross_entropy(flat_logits_grad, flat_labels.detach())
+            loss.backward()
             for n, p in model.named_parameters():
                 if p.requires_grad and p.grad is not None:
                     fisher[n] += p.grad.detach().pow(2)
             seen += input_ids.size(0)
         for n in fisher:
             fisher[n] /= max(1, seen)
         model.train()
     print(f"  Max seq length:      {config.max_seq_length}")
     print(f"  Epochs:              {config.num_epochs}")
     print(f"  Mixed precision:     {config.use_fp16}")
+    print(f"  Grad checkpointing:  {config.use_gradient_checkpointing}")
     print(f"  EWC:                 {'enabled (lambda=' + str(config.ewc_lambda) + ')' if ewc else 'disabled'}")
     print("=" * 80 + "\n")
+    # FIX-T6: apply gradient checkpointing to model config
+    if config.use_gradient_checkpointing:
+        model.config.use_gradient_checkpointing = True
     model.to(device)
     model.train()
     )
     if not curriculum_datasets:
+        print("ERROR: No curriculum stages.")
         return model
     optimizer = torch.optim.AdamW(
         for ds in curriculum_datasets
     ) or 1
     use_amp = config.use_fp16 and device.type == 'cuda'
     scaler  = torch.amp.GradScaler('cuda') if use_amp else None
               f"n={len(stage_dataset)} | CoT={getattr(stage_dataset, 'cot_ratio', '?'):.0%}")
         print(f"{'=' * 80}\n")
+        dataloader = _make_dataloader(
+            stage_dataset, config.batch_size, shuffle=True,
+            pad_token_id=model.padding_idx, device_type=device.type
         )
         for epoch in range(config.num_epochs):
             acc = TokenLossAccumulator()
             optimizer.zero_grad()
+            # FIX-T2: compute EWC penalty once per optimizer step, not per micro-batch
+            ewc_penalty_cache = None
+            ewc_cache_step    = -1
             for step, batch in enumerate(dataloader):
                 input_ids      = batch['input_ids']     .to(device)
                 attention_mask = batch['attention_mask'] .to(device)
                         outputs   = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                         task_loss = outputs['loss']
                         if ewc is not None:
+                            # FIX-T2: cache penalty for entire accumulation window
+                            if ewc_cache_step != (step // config.gradient_accumulation_steps):
+                                ewc_cache_step    = step // config.gradient_accumulation_steps
+                                ewc_penalty_cache = ewc.penalty(model)
+                            loss = (task_loss + config.ewc_lambda * ewc_penalty_cache) \
+                                   / config.gradient_accumulation_steps
+                        else:
+                            loss = task_loss / config.gradient_accumulation_steps
                     scaler.scale(loss).backward()
                 else:
                     outputs   = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                     task_loss = outputs['loss']
                     if ewc is not None:
+                        if ewc_cache_step != (step // config.gradient_accumulation_steps):
+                            ewc_cache_step    = step // config.gradient_accumulation_steps
+                            ewc_penalty_cache = ewc.penalty(model)
+                        loss = (task_loss + config.ewc_lambda * ewc_penalty_cache) \
+                               / config.gradient_accumulation_steps
+                    else:
+                        loss = task_loss / config.gradient_accumulation_steps
                     loss.backward()
+                # FIX-T3: track task_loss only (no EWC contamination in perplexity)
                 acc.update(task_loss.item(), labels)
                 if (step + 1) % config.gradient_accumulation_steps == 0:
                     if use_amp:
                         scaler.unscale_(optimizer)
                     grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                     if use_amp:
 def evaluate_model(model, dataset, device, batch_size=4):
     model.eval()
+    dataloader = _make_dataloader(dataset, batch_size, shuffle=False,
+                                  pad_token_id=model.padding_idx, device_type=device.type)
     acc = TokenLossAccumulator()
     with torch.no_grad():
         for batch in dataloader:
 # ============================================================================
+# GENERATION WITH KV CACHE — FULLY FIXED
 # ============================================================================
+# FIX-I2: vectorized repetition penalty
+def _apply_repetition_penalty_vectorized(
+    logits: torch.Tensor,           # [1, V]
+    token_ids: List[int],
+    penalty: float,
+) -> torch.Tensor:
+    if not token_ids or penalty == 1.0:
+        return logits
+    unique_ids = list(set(token_ids))
+    idx        = torch.tensor(unique_ids, dtype=torch.long, device=logits.device)
+    # Gather scores for penalized tokens
+    scores = logits[0].gather(0, idx)
+    # penalty: divide positive scores, multiply negative scores
+    penalized = torch.where(scores > 0, scores / penalty, scores * penalty)
+    logits[0].scatter_(0, idx, penalized)
+    return logits
 def generate_text(
     model: IndonesianLLM,
     tokenizer,
     device: torch.device = torch.device('cpu'),
 ) -> str:
     """
+    KV-cache generation with all inference fixes applied.
+    FIX-I1: position_ids propagated correctly through layers
+    FIX-I2: vectorized repetition penalty (no Python loop over vocab)
+    FIX-I3: torch.Generator for entropy — no global RNG reset
     """
     model.eval()
+    # FIX-I3: isolated Generator — doesn't touch global torch RNG state
+    gen = torch.Generator(device=device)
+    gen.manual_seed(int.from_bytes(os.urandom(4), 'little'))
     eos_id = tokenizer.eos_token_id or tokenizer.sep_token_id or 2
     pad_id = tokenizer.pad_token_id or 0
+    input_ids     = tokenizer.encode(prompt, return_tensors='pt').to(device)
+    prompt_len    = input_ids.shape[1]
+    generated_ids = input_ids.clone()
     with torch.no_grad():
+        # Prefill: process entire prompt, build KV cache
+        # FIX-I1: explicit position_ids for prefill
+        prefill_pos = torch.arange(0, prompt_len, device=device).unsqueeze(0)
+        prefill_out = model(
             input_ids=input_ids,
+            position_ids=prefill_pos,
             use_cache=True,
         )
         past_kv = prefill_out['past_key_values']
+        prompt_token_ids    = input_ids[0].tolist()
         generated_token_ids = []
         for _ in range(max_new_tokens):
+            cur_id     = generated_ids[:, -1:]          # [1, 1]
+            cur_pos    = torch.tensor([[past_kv[0][0].shape[2]]], device=device)  # [1, 1]
+            out     = model(input_ids=cur_id, position_ids=cur_pos,
+                            past_key_values=past_kv, use_cache=True)
+            past_kv = out['past_key_values']
+            logits  = out['logits'][:, -1:, :].clone()   # [1, 1, V] — clone to avoid in-place aliasing
+            logits  = logits.squeeze(1)                   # [1, V]
+            logits /= max(temperature, 0.05)
+            # FIX-I2: vectorized repetition penalty
             if repetition_penalty != 1.0:
+                all_seen = prompt_token_ids + generated_token_ids[-128:]
+                logits   = _apply_repetition_penalty_vectorized(logits, all_seen, repetition_penalty)
             # Top-k
             if top_k > 0:
                 logits = torch.zeros_like(logits).scatter_(1, sorted_idx, sorted_logits)
             probs      = F.softmax(logits, dim=-1)
+            # FIX-I3: use isolated generator for multinomial sampling
+            next_token = torch.multinomial(probs, num_samples=1, generator=gen)  # [1, 1]
             tok_id = next_token.item()
             if tok_id in {eos_id, pad_id}:
             generated_token_ids.append(tok_id)
             generated_ids = torch.cat([generated_ids, next_token], dim=1)
             if generated_ids.size(1) >= model.config.max_position_embeddings:
                 break
     new_token_ids = generated_ids[0][prompt_len:]
     if len(new_token_ids) == 0:
         return ""
     raw_text = tokenizer.decode(new_token_ids, skip_special_tokens=False)
+    raw_text = re.sub(r'\[(SEP|CLS|PAD|UNK|MASK)\]', '', raw_text)
     return raw_text.strip()
 # ============================================================================
 def _clean_response(response: str) -> str:
     if "<cot>" in response and "</cot>" in response:
         response = response.split("</cot>", 1)[-1]
     elif "<cot>" in response:
         response = ""
     response = re.sub(r'\[(SEP|CLS|PAD|UNK|MASK)\]', '', response)
     response = re.sub(r'<[^>]+>', '', response)
+    # FIX: stricter role-marker pattern — only strips if WHOLE LINE is a role label
+    response = re.sub(r'(?im)^(user\s*:|assistant\s*:)\s*$', '', response)
+    # Also strip inline "user: " prefix but only at start of a line followed by content
+    response = re.sub(r'(?im)^(user|assistant)\s*:\s*', '', response)
     for marker in ["memahami permintaan", "jawaban singkat", "penjelasan harus"]:
         if marker in response:
             response = response.split(marker)[0]
     response = re.sub(r'\n{2,}', '\n', response)
     response = re.sub(r' {2,}', ' ', response)
     response = re.sub(r'^[\s:!,.\-|]+', '', response)
     return response.strip()
 def _extract_thinking(raw: str) -> Tuple[str, str]:
     raw = re.sub(r'\[(SEP|CLS|PAD|UNK|MASK)\]', '', raw)
     if "</cot>" in raw:
         thinking_raw, answer_raw = raw.split("</cot>", 1)
         thinking = re.sub(r'<[^>]+>', '', thinking_raw).strip()
+        thinking = re.sub(r'(?im)^(user|assistant)\s*:\s*', '', thinking).strip()
         answer   = _clean_response(answer_raw)
     elif "<cot>" in raw:
         parts    = raw.split("<cot>", 1)
         thinking = _clean_response(parts[1]) if len(parts) > 1 else ""
         answer   = _clean_response(parts[0]) if parts[0].strip() else ""
     else:
         thinking = ""
         answer   = _clean_response(raw)
 # ============================================================================
+# INTERACTIVE CHAT — WITH MULTI-TURN HISTORY (FIX-I4)
 # ============================================================================
 def interactive_chat(
     tokenizer,
     device: torch.device,
     system_prompt: str = "Kamu adalah asisten AI yang membantu, ramah, dan menjawab dalam Bahasa Indonesia.",
+    max_history_turns: int = 6,
 ):
+    """
+    FIX-I4: Maintains a rolling conversation history.
+    History is encoded as a flat context string, prepended to each new turn.
+    The window is capped at max_history_turns to avoid context overflow.
+    """
     print("\n" + "=" * 80)
+    print("INDONESIAN LLM — INTERACTIVE CHAT  (KV-cache enabled, multi-turn)")
     print("=" * 80)
     print("Commands: exit/quit | clear | think (toggle CoT display)")
     print(f"Persona : {system_prompt}")
+    print(f"History : last {max_history_turns} turns")
     print("=" * 80 + "\n")
     model.eval()
+    show_thinking  = False
+    # Each entry: {"user": str, "assistant": str}
+    history: List[Dict[str, str]] = []
+    def _build_prompt(user_input: str) -> str:
+        """Build a prompt with rolling context window."""
+        parts = []
+        # System persona as a brief prefix
+        parts.append(f"[Sistem: {system_prompt}]")
+        # Recent history
+        recent = history[-max_history_turns:]
+        for turn in recent:
+            parts.append(f"Pengguna: {turn['user']}")
+            parts.append(f"Asisten: {turn['assistant']}")
+        # Current turn — append CoT trigger
+        parts.append(f"Pengguna: {user_input}")
+        parts.append(f"Asisten: <cot>")
+        return "\n".join(parts)
     while True:
         try:
                 print("\nSelamat tinggal!")
                 break
             if user_input.lower() in ['clear', 'bersihkan']:
+                history.clear()
                 print("\nConversation cleared.")
                 continue
             if user_input.lower() == 'think':
                 print(f"\nThinking mode: {'ON' if show_thinking else 'OFF'}")
                 continue
+            prompt = _build_prompt(user_input)
             print("\nA:", end=" ", flush=True)
             response = generate_text(
                 model=model,
                 tokenizer=tokenizer,
             if show_thinking and thinking:
                 print(f"[Thinking: {thinking}]")
             if answer:
                 final = answer
             else:
                 final = _clean_response(response)
                 if not final and thinking:
                     sentences = [s.strip() for s in thinking.split('.') if s.strip()]
                     final = sentences[-1] if sentences else thinking[:200]
                 final = "..."
             print(final)
+            # FIX-I4: store turn in history (use clean answer)
+            history.append({"user": user_input, "assistant": final})
         except KeyboardInterrupt:
             print("\n\nDihentikan.")
             break
         print("No valid samples.")
         return
     live_seed = int(time.time() * 1000) % (2**31)
     random.seed(live_seed)
     samples = random.sample(all_samples, min(n, len(all_samples)))
     model.eval()
     print(f"\n{'=' * 80}\nBENCHMARK  ({len(samples)} samples)\n{'=' * 80}")
+    results = []
+    acc     = TokenLossAccumulator()
     for sample in samples:
         inp      = sample['input'].strip()
         _, answer    = _extract_thinking(raw)
         answer_lower = answer.lower()
         passed = expected in answer_lower
         if not passed:
             exp_toks = set(expected.split())
 # ============================================================================
 # SAVE / LOAD
+# FIX-T7: save_model keeps fp16 for inference; load_model does NOT upcast by default
 # ============================================================================
+def save_model(
+    model: IndonesianLLM,
+    config: ModelConfig,
+    tokenizer_name: str,
+    path: str,
+    use_fp16: bool = True,
+):
     os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
     state = model.state_dict()
     if use_fp16:
         'dtype':            'fp16' if use_fp16 else 'fp32',
     }, path)
     size_mb = os.path.getsize(path) / 1e6
+    print(f"\nSaved: {path}  ({'fp16' if use_fp16 else 'fp32'}, {size_mb:.1f} MB, "
+          f"{model.count_parameters():,} params)")
+def load_model(path: str, device: torch.device, force_fp32_training: bool = False):
+    """
+    FIX-T7:
+    - For inference (force_fp32_training=False): keep model in fp16 when saved as fp16.
+      This halves VRAM usage during chat and benchmark.
+    - For training continuation (force_fp32_training=True): upcast to fp32.
+    """
     if not os.path.exists(path):
         raise FileNotFoundError(f"Checkpoint not found: {path}")
     print(f"Loading: {path}")
     dtype          = ck.get('dtype', 'fp32')
     state = ck['model_state_dict']
+    # Only upcast when we need fp32 for training
+    if force_fp32_training and dtype == 'fp16':
         state = {k: v.float() if v.dtype == torch.float16 else v for k, v in state.items()}
+        print("  [load_model] Upcasting fp16 -> fp32 for training")
+    # Derive intermediate_size from weights
     gate_key = next((k for k in state if k.endswith('gate_proj.weight')), None)
     if gate_key is not None:
         inferred_intermediate = state[gate_key].shape[0]
         if getattr(config, 'intermediate_size', -1) != inferred_intermediate:
             print(f"  [load_model] intermediate_size: config={getattr(config, 'intermediate_size', '?')} "
+                  f"-> overriding with {inferred_intermediate}")
             config.intermediate_size = inferred_intermediate
     embed_key = next((k for k in state if k.endswith('embed_tokens.weight')), None)
     if embed_key is not None:
         inferred_vocab = state[embed_key].shape[0]
         if config.vocab_size != inferred_vocab:
             print(f"  [load_model] vocab_size: config={config.vocab_size} "
+                  f"-> overriding with {inferred_vocab}")
             config.vocab_size = inferred_vocab
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
     tokenizer.add_special_tokens({"additional_special_tokens": ["<cot>", "</cot>"]})
     model = IndonesianLLM(config)
+    model.load_state_dict(state, strict=False)
     model.to(device)
+    # Keep model in fp16 for inference if that's what was saved
+    if not force_fp32_training and dtype == 'fp16':
+        model = model.half()
+        print("  [load_model] Keeping model in fp16 for inference (use force_fp32_training=True for training)")
     size_mb = os.path.getsize(path) / 1e6
     print(f"Loaded ({dtype}, {size_mb:.1f} MB, {ck.get('model_params', model.count_parameters()):,} params)")
     return model, tokenizer, config, {}
     parser.add_argument('--ewc-lambda',       type=float, default=5000.0)
     parser.add_argument('--ewc-samples',      type=int,   default=2000)
     parser.add_argument('--no-ewc',           action='store_true')
+    # FIX-T6: expose gradient checkpointing via CLI
+    parser.add_argument('--grad-ckpt',        action='store_true',
+                        help='Enable gradient checkpointing (saves ~50%% activation memory)')
+    parser.add_argument('--max-history',      type=int,   default=6,
+                        help='Max conversation turns to keep in chat context')
     args = parser.parse_args()
     save_fp16        = not args.save_fp32
     use_cot_training = not args.no_cot
     if args.train or args.finetune or args.continue_train:
         set_seed(args.seed)
     else:
         import time
         live_seed = int(time.time() * 1000) % (2**31)
         random.seed(live_seed)
         torch.manual_seed(live_seed)
         if torch.cuda.is_available():
             torch.cuda.manual_seed_all(live_seed)
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     print(f"\nDevice: {device}")
     if torch.cuda.is_available():
         print(f"  GPU:  {torch.cuda.get_device_name(0)}")
         print(f"  VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
     if args.inspect_data:
         tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")
         tokenizer.add_special_tokens({"additional_special_tokens": ["<cot>", "</cot>"]})
             print(f"  Output: {s['output'][:120]}")
         return
     if args.chat:
+        model, tokenizer, _, _ = load_model(args.model, device, force_fp32_training=False)
+        interactive_chat(model, tokenizer, device,
+                         system_prompt=args.system_prompt,
+                         max_history_turns=args.max_history)
         return
     if args.benchmark:
+        model, tokenizer, _, _ = load_model(args.model, device, force_fp32_training=False)
         run_benchmark(model, tokenizer, device, dataset_path=args.dataset)
         return
     if args.train:
         tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")
         tokenizer.add_special_tokens({"additional_special_tokens": ["<cot>", "</cot>"]})
         model_config = ModelConfig(
             vocab_size              = len(tokenizer),
             hidden_size             = args.hidden_size,
             attention_dropout       = 0.1,
             residual_dropout        = 0.1,
             tie_word_embeddings     = True,
+            use_gradient_checkpointing = args.grad_ckpt,
         )
         print(f"\nModel config: {model_config}")
         model = IndonesianLLM(model_config)
         print(f"Parameters: {model.count_parameters():,}")
         _ga = args.grad_accum or 32
         train_config = TrainingConfig(
+            dataset_path               = args.dataset,
+            num_epochs                 = args.epochs,
+            batch_size                 = args.batch_size,
+            gradient_accumulation_steps= _ga,
+            max_seq_length             = args.max_length,
+            learning_rate              = args.lr,
+            warmup_steps               = 500,
+            use_fp16                   = torch.cuda.is_available(),
+            use_gradient_checkpointing = args.grad_ckpt,
+            curriculum_stages          = [128, 256, args.max_length],
         )
         dataset = IndonesianCoTDataset(train_config.dataset_path, tokenizer,
+                                       train_config.max_seq_length, use_cot=use_cot_training,
+                                       cot_ratio=args.cot_ratio)
         model   = train_model(model, dataset, train_config, device,
                               use_simple_curriculum=args.simple_curriculum)
             print(f"\nPrompt   : {p}")
             print(f"Generated: {generate_text(model, tokenizer, p, max_new_tokens=150, device=device)}\n")
     if args.finetune:
+        model, tokenizer, model_config, _ = load_model(args.model, device, force_fp32_training=True)
         _ga = args.grad_accum or 32
         train_config = TrainingConfig(
+            dataset_path               = args.dataset,
+            num_epochs                 = args.epochs,
+            batch_size                 = args.batch_size,
+            gradient_accumulation_steps= _ga,
+            max_seq_length             = args.max_length,
+            learning_rate              = args.lr / 10,
+            warmup_steps               = 100,
+            use_fp16                   = torch.cuda.is_available(),
+            use_gradient_checkpointing = args.grad_ckpt,
+            curriculum_stages          = [128, 256, args.max_length],
         )
         dataset = IndonesianCoTDataset(train_config.dataset_path, tokenizer,
+                                       train_config.max_seq_length, use_cot=use_cot_training,
+                                       cot_ratio=args.cot_ratio)
         ewc_obj = None
         if not args.no_ewc and args.ewc_lambda > 0:
             print(f"\nComputing EWC Fisher (lambda={args.ewc_lambda}, n={args.ewc_samples})...")
+            loader = _make_dataloader(dataset, args.batch_size, shuffle=True,
+                                      pad_token_id=model.padding_idx, device_type=device.type)
             train_config.ewc_lambda  = args.ewc_lambda
             train_config.ewc_samples = args.ewc_samples
             ewc_obj = EWC(model, loader, device, n_samples=args.ewc_samples)
         save_model(model, model_config, "indolem/indobert-base-uncased", out_path, use_fp16=save_fp16)
         print(f"\nFinetuned model: {out_path}")
     if args.continue_train:
+        model, tokenizer, model_config, _ = load_model(args.model, device, force_fp32_training=True)
+        effective_skip = (len([128, 256, args.max_length]) - 1) if args.simple_curriculum else args.skip_stages
+        curriculum     = [192, 320, args.max_length]
+        print(f"\nContinue-train LR: {args.lr:.2e}  (skip {effective_skip} stages)")
         _ga = args.grad_accum or 32
         train_config = TrainingConfig(
+            dataset_path               = args.dataset,
+            num_epochs                 = args.epochs,
+            batch_size                 = args.batch_size,
+            gradient_accumulation_steps= _ga,
+            max_seq_length             = args.max_length,
+            learning_rate              = args.lr,
+            warmup_steps               = 500,
+            use_fp16                   = torch.cuda.is_available(),
+            use_gradient_checkpointing = args.grad_ckpt,
+            curriculum_stages          = curriculum,
+            plateau_patience           = 2,
+            plateau_factor             = 0.6,
+            plateau_min_delta          = 0.01,
         )
         dataset = IndonesianCoTDataset(train_config.dataset_path, tokenizer,
+                                       train_config.max_seq_length, use_cot=use_cot_training,
+                                       cot_ratio=args.cot_ratio)
         model = train_model(model, dataset, train_config, device,
                             use_simple_curriculum=args.simple_curriculum,
                             is_continue=True,