aagamjtdev committed
Commit f1d3547 · 1 Parent(s): 9b410c3

training script

Files changed (1): train_model.py (+777, -0)
train_model.py ADDED
@@ -0,0 +1,777 @@
+
+ import os
+ import re
+ import json
+ import pickle
+ import argparse
+ from collections import Counter
+ from typing import List, Tuple, Dict, Any
+ from tqdm import tqdm
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.utils.data import Dataset, DataLoader
+
+ try:
+     from TorchCRF import CRF
+ except ImportError:
+     print("Error: The 'TorchCRF' library is required. Please install it using 'pip install TorchCRF'.")
+     exit()
+
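+ # NOTE: the CRF API used below (CRF(n_labels), a forward pass that returns a
+ # per-sequence log-likelihood, and viterbi_decode) matches the 'TorchCRF'
+ # package. The similarly named 'pytorch-crf' package exposes a different
+ # interface (crf.decode, a batch_first flag) and would need small changes here.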
+ # ========== CONFIG ==========
+ # Using the user's saved path information for DATA_DIR and model/vocab file names
+ DATA_DIR = "output_data"
+ MODEL_FILE = "model_enhanced.pt"  # Using user's saved model filename
+ VOCAB_FILE = "vocabs_enhanced.pkl"  # Using user's saved vocab filename
+ CHECKPOINT_FILE = "checkpoint_enhanced.pt"  # New file for full checkpoint (incl. optimizer, epoch, etc.)
+
+ os.makedirs(DATA_DIR, exist_ok=True)
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ MAX_CHAR_LEN = 16
+ EMBED_DIM = 128  # Increased from 100
+ CHAR_EMBED_DIM = 50  # Increased from 30
+ CHAR_CNN_OUT = 50  # Increased from 30
+ BBOX_DIM = 128  # Increased from 100
+ HIDDEN_SIZE = 768  # Increased from 512 to match LayoutLM dimension
+ BATCH_SIZE = 8
+ EPOCHS = 10  # Reduced from 30
+ LR = 5e-4  # Decreased from 1e-3 for more stable training
+ BBOX_NORM_CONSTANT = 1000.0
+ CHUNK_SIZE = 450
+
+ # Enhanced feature dimensions
+ SPATIAL_FEATURE_DIM = 64  # Increased from 32
+ POSITIONAL_DIM = 128  # New: for learnable positional embeddings
+
+ # ========== LABELS ==========
+ LABELS = [
+     "O",
+     "B-QUESTION", "I-QUESTION",
+     "B-OPTION", "I-OPTION",
+     "B-ANSWER", "I-ANSWER",
+     "B-IMAGE", "I-IMAGE",
+     "B-SECTION HEADING", "I-SECTION HEADING",
+     "B-PASSAGE", "I-PASSAGE"
+ ]
+ LABEL2IDX = {l: i for i, l in enumerate(LABELS)}
+ IDX2LABEL = {i: l for l, i in LABEL2IDX.items()}
+
+
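+ # BIO tagging example over a token stream (illustrative):
+ #   "Q1." -> B-QUESTION, "What" -> I-QUESTION, ..., "a)" -> B-OPTION,
+ #   "Paris" -> I-OPTION, stray tokens -> O.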
+ # ========== ENHANCED FEATURE EXTRACTION (PATTERN FUNCTION REMOVED) ==========
+
+ def extract_spatial_features(tokens: List[Dict], idx: int) -> List[float]:
+     """Enhanced spatial features with relative positioning."""
+     current = tokens[idx]
+     features = []
+
+     # Vertical spacing with next token (look-ahead)
+     if idx < len(tokens) - 1:
+         next_tok = tokens[idx + 1]
+         forward_gap = next_tok['y0'] - current['y1']
+         features.append(min(forward_gap / 100.0, 1.0))
+     else:
+         features.append(0.0)
+
+     # Vertical spacing with previous token
+     if idx > 0:
+         prev = tokens[idx - 1]
+         vertical_gap = current['y0'] - prev['y1']
+         features.append(min(vertical_gap / 100.0, 1.0))
+     else:
+         features.append(0.0)
+
+     # Horizontal offset (indentation)
+     features.append(current['x0'] / BBOX_NORM_CONSTANT)
+
+     # Token dimensions
+     width = current['x1'] - current['x0']
+     height = current['y1'] - current['y0']
+     features.append(width / BBOX_NORM_CONSTANT)
+     features.append(height / BBOX_NORM_CONSTANT)
+
+     # Position in line
+     x_center = (current['x0'] + current['x1']) / 2
+     y_center = (current['y0'] + current['y1']) / 2
+     features.append(x_center / BBOX_NORM_CONSTANT)
+     features.append(y_center / BBOX_NORM_CONSTANT)
+
+     # Distance from left margin (duplicates the indentation feature above;
+     # kept so the spatial vector stays at 11 dimensions)
+     features.append(current['x0'] / BBOX_NORM_CONSTANT)
+
+     # Aspect ratio
+     aspect = width / max(height, 1.0)
+     features.append(min(aspect / 10.0, 1.0))
+
+     # Alignment feature (detect if left-aligned with the previous token)
+     if idx > 0:
+         prev = tokens[idx - 1]
+         x_alignment = abs(current['x0'] - prev['x0']) < 5  # Within 5 units
+         features.append(float(x_alignment))
+     else:
+         features.append(0.0)
+
+     # Area (normalized)
+     area = width * height
+     features.append(min(area / (BBOX_NORM_CONSTANT ** 2), 1.0))
+
+     return features
+
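+ # The 11 spatial features, in order: forward gap, backward gap, indentation,
+ # width, height, x-center, y-center, left margin, aspect ratio, alignment flag,
+ # area. This count must match the Linear(11, ...) projection in the model and
+ # the hard-coded padding width in collate_batch.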
+
+ def extract_context_features(tokens: List[Dict], idx: int, window: int = 3) -> List[float]:
+     """Enhanced context with larger window and more patterns."""
+     context_features = []
+
+     # Previous context
+     prev_has_q = 0.0
+     prev_has_opt = 0.0
+     prev_has_caps = 0.0
+     for i in range(max(0, idx - window), idx):
+         text = tokens[i]['text'].lower().strip()
+         if re.match(r'^q?\.?\d+[.:]', text):
+             prev_has_q = 1.0
+         if re.match(r'^[a-dA-D][.)]', text):
+             prev_has_opt = 1.0
+         if tokens[i]['text'].strip().isupper() and len(tokens[i]['text'].strip()) > 2:
+             prev_has_caps = 1.0
+
+     context_features.extend([prev_has_q, prev_has_opt, prev_has_caps])
+
+     # Next context
+     next_has_q = 0.0
+     next_has_opt = 0.0
+     next_has_caps = 0.0
+     for i in range(idx + 1, min(len(tokens), idx + window + 1)):
+         text = tokens[i]['text'].lower().strip()
+         if re.match(r'^q?\.?\d+[.:]', text):
+             next_has_q = 1.0
+         if re.match(r'^[a-dA-D][.)]', text):
+             next_has_opt = 1.0
+         if tokens[i]['text'].strip().isupper() and len(tokens[i]['text'].strip()) > 2:
+             next_has_caps = 1.0
+
+     context_features.extend([next_has_q, next_has_opt, next_has_caps])
+
+     # Distance features: how far to the next question/option marker
+     dist_to_next_q = window + 1
+     dist_to_next_opt = window + 1
+     for i in range(idx + 1, min(len(tokens), idx + window + 1)):
+         text = tokens[i]['text'].lower().strip()
+         if re.match(r'^q?\.?\d+[.:]', text) and dist_to_next_q > (i - idx):
+             dist_to_next_q = i - idx
+         if re.match(r'^[a-dA-D][.)]', text) and dist_to_next_opt > (i - idx):
+             dist_to_next_opt = i - idx
+
+     context_features.append(dist_to_next_q / window)
+     context_features.append(dist_to_next_opt / window)
+
+     return context_features
+
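+ # The 8 context features: question/option/uppercase markers in the previous
+ # window (3), the same markers in the following window (3), and normalized
+ # distances to the next question and option markers (2). This count must match
+ # the Linear(8, 32) projection in the model.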
+
+ # ========== Vocab Class ==========
+ class Vocab:
+     def __init__(self, min_freq=1, unk_token="<UNK>", pad_token="<PAD>"):
+         self.min_freq = min_freq
+         self.unk_token = unk_token
+         self.pad_token = pad_token
+         self.freq = Counter()
+         self.itos = []
+         self.stoi = {}
+
+     def add_sentence(self, toks):
+         self.freq.update(toks)
+
+     def build(self):
+         items = [tok for tok, c in self.freq.items() if c >= self.min_freq]
+         items = [self.pad_token, self.unk_token] + sorted(items)
+         self.itos = items
+         self.stoi = {s: i for i, s in enumerate(self.itos)}
+
+     def __len__(self):
+         return len(self.itos)
+
+     def __getitem__(self, token: str) -> int:
+         return self.stoi.get(token, self.stoi[self.unk_token])
+
+     def __getstate__(self):
+         return {
+             'min_freq': self.min_freq,
+             'unk_token': self.unk_token,
+             'pad_token': self.pad_token,
+             'itos': self.itos,
+             'stoi': self.stoi,
+         }
+
+     def __setstate__(self, state):
+         self.min_freq = state['min_freq']
+         self.unk_token = state['unk_token']
+         self.pad_token = state['pad_token']
+         self.itos = state['itos']
+         self.stoi = state['stoi']
+         self.freq = Counter()
+
+
+ # ========== Data Loading ==========
+ def load_unified_data(unified_json_path: str) -> Tuple[List[Dict[str, Any]], List[List[str]]]:
+     """Loads data and extracts enhanced features."""
+     if not os.path.exists(unified_json_path):
+         raise FileNotFoundError(f"Unified JSON data not found at: {unified_json_path}")
+
+     with open(unified_json_path, 'r', encoding='utf-8') as f:
+         flat_tokens = json.load(f)
+
+     pages_tokens = []
+     labels_per_token = []
+
+     print(":mag: Extracting spatial and context features (patterns removed)...")
+
+     for i in tqdm(range(0, len(flat_tokens), CHUNK_SIZE), desc="Processing chunks"):
+         chunk = flat_tokens[i:i + CHUNK_SIZE]
+         if not chunk:
+             continue
+
+         tokens_list = []
+         for j, t in enumerate(chunk):
+             token_dict = {
+                 "text": t["token"],
+                 "x0": t["bbox"][0], "y0": t["bbox"][1],
+                 "x1": t["bbox"][2], "y1": t["bbox"][3],
+                 "page_no": 0, "block_idx": 0
+             }
+             # Pattern feature extraction removed
+             tokens_list.append(token_dict)
+
+         for j, token_dict in enumerate(tokens_list):
+             token_dict["spatial_features"] = extract_spatial_features(tokens_list, j)
+             token_dict["context_features"] = extract_context_features(tokens_list, j, window=3)
+
+         pages_tokens.append({
+             "tokens": tokens_list,
+             "width": BBOX_NORM_CONSTANT,
+             "height": BBOX_NORM_CONSTANT
+         })
+         labels_per_token.append([t["label"] for t in chunk])
+
+     return pages_tokens, labels_per_token
+
+
+ # ========== Dataset ==========
+ class MCQTokenDataset(Dataset):
+     def __init__(self, pages_tokens, word_vocab, char_vocab, labels_per_token=None):
+         self.samples = []
+         self.bbox_norm_factor = BBOX_NORM_CONSTANT
+
+         # NOTE: assumes labels_per_token aligns 1:1 with pages_tokens; empty
+         # pages are filtered from samples but not from the labels list
+         for page_data in pages_tokens:
+             if len(page_data["tokens"]) == 0:
+                 continue
+             self.samples.append(page_data)
+
+         self.labels = labels_per_token
+         self.word_vocab = word_vocab
+         self.char_vocab = char_vocab
+
+     def __len__(self):
+         return len(self.samples)
+
+     def __getitem__(self, idx):
+         page_data = self.samples[idx]
+         toks = page_data["tokens"]
+
+         words = [t["text"] for t in toks]
+         word_ids = [self.word_vocab.stoi.get(w, self.word_vocab.stoi[self.word_vocab.unk_token]) for w in words]
+
+         char_ids = []
+         for w in words:
+             chs = [self.char_vocab.stoi.get(ch, self.char_vocab.stoi[self.char_vocab.unk_token])
+                    for ch in w[:MAX_CHAR_LEN]]
+             if len(chs) < MAX_CHAR_LEN:
+                 chs += [self.char_vocab.stoi[self.char_vocab.pad_token]] * (MAX_CHAR_LEN - len(chs))
+             char_ids.append(chs)
+
+         bboxes = []
+         for t in toks:
+             normalized_bbox = [
+                 t["x0"] / self.bbox_norm_factor,
+                 t["y0"] / self.bbox_norm_factor,
+                 t["x1"] / self.bbox_norm_factor,
+                 t["y1"] / self.bbox_norm_factor,
+             ]
+             bboxes.append(normalized_bbox)
+
+         # Pattern features removed
+         spatial_features = [t["spatial_features"] for t in toks]
+         context_features = [t["context_features"] for t in toks]
+
+         labels = None
+         if self.labels:
+             lbls = self.labels[idx]
+             labels = [LABEL2IDX[l] for l in lbls]
+
+         return {
+             "word_ids": torch.LongTensor(word_ids),
+             "char_ids": torch.LongTensor(char_ids),
+             "bboxes": torch.FloatTensor(bboxes),
+             # "pattern_features" removed
+             "spatial_features": torch.FloatTensor(spatial_features),
+             "context_features": torch.FloatTensor(context_features),
+             "labels": torch.LongTensor(labels) if labels is not None else None,
+             "tokens": toks
+         }
+
+
+ def collate_batch(batch):
+     max_len = max(item["word_ids"].size(0) for item in batch)
+     batch_size = len(batch)
+
+     word_pad = torch.zeros((batch_size, max_len), dtype=torch.long)
+     char_pad = torch.zeros((batch_size, max_len, MAX_CHAR_LEN), dtype=torch.long)
+     bbox_pad = torch.zeros((batch_size, max_len, 4), dtype=torch.float)
+     # pattern_pad removed
+     spatial_pad = torch.zeros((batch_size, max_len, 11), dtype=torch.float)  # Note: 11 spatial features
+     context_pad = torch.zeros((batch_size, max_len, 8), dtype=torch.float)  # Note: 8 context features
+     mask = torch.zeros((batch_size, max_len), dtype=torch.bool)
+     label_pad = torch.full((batch_size, max_len), -1, dtype=torch.long)
+     tokens_list = []
+
+     for i, item in enumerate(batch):
+         L = item["word_ids"].size(0)
+         word_pad[i, :L] = item["word_ids"]
+         char_pad[i, :L, :] = item["char_ids"]
+         bbox_pad[i, :L, :] = item["bboxes"]
+         # pattern_pad removed
+         spatial_pad[i, :L, :] = item["spatial_features"]
+         context_pad[i, :L, :] = item["context_features"]
+         mask[i, :L] = 1
+         if item["labels"] is not None:
+             label_pad[i, :L] = item["labels"]
+         tokens_list.append(item["tokens"])
+
+     return {
+         "words": word_pad,
+         "chars": char_pad,
+         "bboxes": bbox_pad,
+         # "pattern_features" removed
+         "spatial_features": spatial_pad,
+         "context_features": context_pad,
+         "mask": mask,
+         "labels": label_pad,
+         "tokens": tokens_list
+     }
+
+
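+ # Shape sketch (illustrative): for a batch of two chunks with 450 and 320
+ # tokens, everything is padded to max_len=450: words (2, 450), chars
+ # (2, 450, 16), bboxes (2, 450, 4), spatial (2, 450, 11), context (2, 450, 8).
+ # mask flags the real tokens; padded label positions hold -1, which the
+ # cross-entropy loss ignores.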
+ # ========== ENHANCED MODEL ==========
+ class CharCNNEncoder(nn.Module):
+     def __init__(self, char_vocab_size, char_emb_dim, out_dim, kernel_sizes=(2, 3, 4, 5)):
+         super().__init__()
+         self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0)
+         convs = [nn.Conv1d(char_emb_dim, out_dim, kernel_size=k) for k in kernel_sizes]
+         self.convs = nn.ModuleList(convs)
+         self.out_dim = out_dim * len(convs)
+
+     def forward(self, char_ids):
+         # char_ids: (B, L, C); fold tokens into the batch dim for the convs
+         B, L, C = char_ids.size()
+         emb = self.char_emb(char_ids.view(B * L, C)).transpose(1, 2)
+         # Each conv + max-pool over character positions yields one out_dim vector
+         outs = [torch.max(torch.relu(conv(emb)), dim=2)[0] for conv in self.convs]
+         res = torch.cat(outs, dim=1)
+         return res.view(B, L, -1)
+
+
+ class SpatialAttention(nn.Module):
+     """Attention mechanism for spatial relationships."""
+
+     def __init__(self, hidden_dim):
+         super().__init__()
+         self.query = nn.Linear(hidden_dim, hidden_dim)
+         self.key = nn.Linear(hidden_dim, hidden_dim)
+         self.value = nn.Linear(hidden_dim, hidden_dim)
+         self.scale = hidden_dim ** 0.5
+
+     def forward(self, x, mask):
+         Q = self.query(x)
+         K = self.key(x)
+         V = self.value(x)
+
+         # Scaled dot-product attention: softmax(QK^T / sqrt(d)) V
+         scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
+
+         # Mask out padded key positions
+         mask_expanded = mask.unsqueeze(1).expand_as(scores)
+         scores = scores.masked_fill(~mask_expanded, float('-inf'))
+
+         attn_weights = F.softmax(scores, dim=-1)
+         # Handle NaN from softmax over all -inf scores (shouldn't happen with a proper mask, but for safety)
+         attn_weights = attn_weights.masked_fill(torch.isnan(attn_weights), 0.0)
+
+         output = torch.matmul(attn_weights, V)
+         return output
+
+
+ class MCQTagger(nn.Module):
+     def __init__(self, vocab_size, char_vocab_size, n_labels, bbox_dim=BBOX_DIM):
+         super().__init__()
+         self.word_emb = nn.Embedding(vocab_size, EMBED_DIM, padding_idx=0)
+         self.char_enc = CharCNNEncoder(char_vocab_size, CHAR_EMBED_DIM, CHAR_CNN_OUT)
+
+         # Enhanced bbox encoding with MLP
+         self.bbox_proj = nn.Sequential(
+             nn.Linear(4, bbox_dim),
+             nn.ReLU(),
+             nn.Dropout(0.1),
+             nn.Linear(bbox_dim, bbox_dim)
+         )
+
+         # Feature projections (pattern projection removed)
+         self.spatial_proj = nn.Sequential(
+             nn.Linear(11, SPATIAL_FEATURE_DIM),  # 11 spatial features
+             nn.ReLU(),
+             nn.Dropout(0.1)
+         )
+         self.context_proj = nn.Sequential(
+             nn.Linear(8, 32),  # 8 context features
+             nn.ReLU(),
+             nn.Dropout(0.1)
+         )
+
+         # Positional encoding for sequence position awareness
+         self.positional_encoding = nn.Embedding(512, POSITIONAL_DIM)
+
+         # Input dimension updated (PATTERN_FEATURE_DIM removed)
+         in_dim = (EMBED_DIM + self.char_enc.out_dim + bbox_dim +
+                   SPATIAL_FEATURE_DIM + 32 + POSITIONAL_DIM)
+
+         # Deeper BiLSTM
+         self.bilstm = nn.LSTM(in_dim, HIDDEN_SIZE // 2, num_layers=3,
+                               batch_first=True, bidirectional=True, dropout=0.3)
+
+         # Spatial attention layer
+         self.spatial_attention = SpatialAttention(HIDDEN_SIZE)
+
+         # Layer normalization (currently unused in the forward pass; kept so
+         # existing checkpoints with layer_norm.* keys still load)
+         self.layer_norm = nn.LayerNorm(HIDDEN_SIZE)
+
+         # Final projection
+         self.ff = nn.Sequential(
+             nn.Linear(HIDDEN_SIZE * 2, HIDDEN_SIZE),  # *2 for attention concat
+             nn.ReLU(),
+             nn.Dropout(0.3),
+             nn.Linear(HIDDEN_SIZE, n_labels)
+         )
+
+         self.crf = CRF(n_labels)
+         self.dropout = nn.Dropout(p=0.5)
+
+     def forward_emissions(self, words, chars, bboxes, spatial_feats, context_feats, mask):
+         B, L = words.size()
+
+         # Embeddings
+         wemb = self.word_emb(words)
+         cenc = self.char_enc(chars)
+         benc = self.bbox_proj(bboxes)
+         # penc removed
+         senc = self.spatial_proj(spatial_feats)
+         cxt_enc = self.context_proj(context_feats)
+
+         # Positional encoding
+         positions = torch.arange(L, device=words.device).unsqueeze(0).expand(B, -1)
+         pos_enc = self.positional_encoding(positions.clamp(max=511))
+
+         # Concatenate all features (penc removed)
+         enc_in = torch.cat([wemb, cenc, benc, senc, cxt_enc, pos_enc], dim=-1)
+         enc_in = self.dropout(enc_in)
+
+         # BiLSTM
+         lengths = mask.sum(dim=1).cpu()
+         packed_in = nn.utils.rnn.pack_padded_sequence(enc_in, lengths, batch_first=True, enforce_sorted=False)
+         packed_out, _ = self.bilstm(packed_in)
+         lstm_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
+
+         # Spatial attention
+         attn_out = self.spatial_attention(lstm_out, mask)
+
+         # Concatenate LSTM and attention outputs for the final projection
+         combined = torch.cat([lstm_out, attn_out], dim=-1)
+
+         # Final projection
+         emissions = self.ff(combined)
+         return emissions
+
+     def forward(self, words, chars, bboxes, spatial_feats, context_feats, mask, labels=None,
+                 class_weights=None, alpha=0.8):
+         # pattern_feats removed from arguments
+         emissions = self.forward_emissions(words, chars, bboxes, spatial_feats, context_feats, mask)
+
+         if labels is not None:
+             crf_loss = -self.crf(emissions, labels, mask=mask).sum()
+             if class_weights is not None:
+                 # Combined loss: CRF negative log-likelihood plus class-weighted
+                 # cross-entropy on the emissions, to counter label imbalance
+                 ce_loss_fn = nn.CrossEntropyLoss(weight=class_weights.to(emissions.device), ignore_index=-1)
+                 ce_loss = ce_loss_fn(emissions.view(-1, emissions.size(-1)), labels.view(-1))
+                 return alpha * crf_loss + (1 - alpha) * ce_loss
+             return crf_loss
+
+         return self.crf.viterbi_decode(emissions, mask=mask)
+
+
+ # ========== Training/Eval ==========
+ def compute_class_weights(labels_list, num_labels):
+     all_labels_flat = [lbl for page in labels_list for lbl in page]
+     counts = Counter(all_labels_flat)
+     total = sum(counts.values())
+     weights = []
+
+     for i in range(num_labels):
+         count = counts.get(i, 0)
+         w = total / (num_labels * count) if count > 0 else 1.0
+         weights.append(w)
+
+     return torch.tensor(weights, dtype=torch.float)
+
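+ # Worked example (illustrative counts): with 1,000 labeled tokens across the
+ # 13 classes, a label seen 700 times gets weight 1000 / (13 * 700) ≈ 0.11,
+ # while a label seen 10 times gets 1000 / (13 * 10) ≈ 7.7, so rare labels are
+ # boosted in the cross-entropy term.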
+
+ def eval_model(model, data_loader):
+     model.eval()
+     all_true, all_pred = [], []
+     with torch.no_grad():
+         for batch in data_loader:
+             words = batch["words"].to(DEVICE)
+             chars = batch["chars"].to(DEVICE)
+             bboxes = batch["bboxes"].to(DEVICE)
+             # pattern_feats removed
+             spatial_feats = batch["spatial_features"].to(DEVICE)
+             context_feats = batch["context_features"].to(DEVICE)
+             mask = batch["mask"].to(DEVICE)
+             labels = batch["labels"].to(DEVICE)
+
+             # pattern_feats removed from model call
+             preds_batch = model(words, chars, bboxes, spatial_feats, context_feats, mask, labels=None)
+
+             for i in range(len(preds_batch)):
+                 L = len(preds_batch[i])
+                 all_pred.extend(preds_batch[i])
+                 all_true.extend(labels[i][:L].cpu().numpy().tolist())
+
+     # Local import so sklearn is only required for evaluation
+     from sklearn.metrics import precision_recall_fscore_support
+     # NOTE: the labels list excludes 'O' (0) for task-specific F1
+     p, r, f1, _ = precision_recall_fscore_support(all_true, all_pred, average='micro', zero_division=0,
+                                                   labels=list(range(1, len(LABELS))))
+     return p, r, f1
+
+
+ # MODIFIED: saves/loads scheduler._step_count so OneCycleLR resumes correctly
+ def train_model(model, train_loader, val_loader, epochs=EPOCHS, class_weights=None,
+                 initial_best_f1=0.0, start_epoch=1, model_path=None, checkpoint_path=None):
+     model.to(DEVICE)
+
+     # Use AdamW with weight decay for better generalization
+     optim = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
+
+     # Learning rate scheduler with warmup (must be initialized BEFORE loading state_dict)
+     scheduler = torch.optim.lr_scheduler.OneCycleLR(
+         optim, max_lr=LR, epochs=epochs, steps_per_epoch=len(train_loader),
+         pct_start=0.1, anneal_strategy='cos'
+     )
+
+     # --- CHECKPOINT LOADING ---
+     best_val_f1 = initial_best_f1
+     if checkpoint_path and os.path.exists(checkpoint_path):
+         checkpoint = torch.load(checkpoint_path, map_location=DEVICE)
+
+         # NOTE: model weights were loaded in train_from_json, but we load again for safety
+         model.load_state_dict(checkpoint['model_state_dict'])
+
+         optim.load_state_dict(checkpoint['optimizer_state_dict'])
+         scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
+
+         # Explicitly restore the scheduler's internal step count
+         # (the scheduler's state_dict already includes it, so this is a safety net)
+         if '_step_count' in checkpoint:
+             scheduler._step_count = checkpoint['_step_count']
+
+         best_val_f1 = checkpoint['best_val_f1']
+         start_epoch = checkpoint['epoch'] + 1
+
+         print(f":floppy_disk: Resuming training from Epoch {start_epoch} with F1: {best_val_f1:.4f}")
+     # --- END CHECKPOINT LOADING ---
+
+     patience = 10  # NOTE: with EPOCHS = 10, early stopping can only fire if epochs is raised
+     patience_counter = 0
+
+     for ep in range(start_epoch, epochs + 1):
+         model.train()
+         running_loss = 0.0
+         for batch in tqdm(train_loader, desc=f"Train E{ep}"):
+             optim.zero_grad()
+
+             words = batch["words"].to(DEVICE)
+             chars = batch["chars"].to(DEVICE)
+             bboxes = batch["bboxes"].to(DEVICE)
+             # pattern_feats removed
+             spatial_feats = batch["spatial_features"].to(DEVICE)
+             context_feats = batch["context_features"].to(DEVICE)
+             mask = batch["mask"].to(DEVICE)
+             labels = batch["labels"].to(DEVICE)
+
+             # pattern_feats removed from model call
+             loss = model(words, chars, bboxes, spatial_feats, context_feats, mask, labels,
+                          class_weights=class_weights)
+             loss.backward()
+             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+             optim.step()
+             scheduler.step()  # This step is now correctly tracked across resumes
+             running_loss += loss.item()
+
+         avg_loss = running_loss / max(1, len(train_loader))
+         print(f"Epoch {ep} train loss {avg_loss:.4f} | LR: {scheduler.get_last_lr()[0]:.6f}")
+
+         # Evaluate on validation set
+         p, r, f1 = eval_model(model, val_loader)
+         print(f"VAL p={p:.4f} r={r:.4f} f1={f1:.4f}")
+
+         if f1 > best_val_f1:
+             best_val_f1 = f1
+             patience_counter = 0
+
+             # Save the BEST MODEL (just the state_dict, for deployment)
+             torch.save(model.state_dict(), model_path)
+
+             # Save the FULL CHECKPOINT for resuming training (includes _step_count)
+             torch.save({
+                 'epoch': ep,
+                 'model_state_dict': model.state_dict(),
+                 'optimizer_state_dict': optim.state_dict(),
+                 'scheduler_state_dict': scheduler.state_dict(),
+                 'best_val_f1': best_val_f1,
+                 '_step_count': scheduler._step_count  # Saved explicitly for the resume path above
+             }, checkpoint_path)
+
+             print(f":white_check_mark: New best model and checkpoint saved! F1: {best_val_f1:.4f}")
+         else:
+             patience_counter += 1
+             if patience_counter >= patience:
+                 print(f"Early stopping triggered after {ep} epochs")
+                 break
+
+     print("Training complete. Best val F1:", best_val_f1)
+
+
+ # ========== Helpers ==========
+ def build_vocabs(train_pages_tokens):
+     word_vocab = Vocab(min_freq=1)
+     char_vocab = Vocab(min_freq=1, unk_token="<CUNK>", pad_token="<CPAD>")
+
+     for p in train_pages_tokens:
+         for tok in p["tokens"]:
+             text_value = tok["text"]
+             word_vocab.add_sentence([text_value])
+             char_vocab.add_sentence(list(text_value[:MAX_CHAR_LEN]))
+
+     word_vocab.build()
+     char_vocab.build()
+
+     if len(word_vocab) <= 2:
+         raise ValueError(f"FATAL: Word vocabulary size is only {len(word_vocab)}.")
+
+     return word_vocab, char_vocab
+
+
+ def save_vocabs(path, word_vocab, char_vocab):
+     with open(path, "wb") as f:
+         pickle.dump((word_vocab, char_vocab), f)
+
+
+ def convert_labels_to_indices(all_labels):
+     return [[LABEL2IDX[l] for l in page] for page in all_labels]
+
+
+ # MODIFIED: train_from_json handles checkpoint loading setup
+ def train_from_json(unified_json_path: str):
+     print(":fire: Loading unified layout-aware labeled data...")
+     all_pages_tokens, all_labels = load_unified_data(unified_json_path)
+
+     if not all_labels:
+         raise RuntimeError(":x: No labeled data found. Please check your unified JSON file.")
+
+     print(f":bar_chart: Total dataset size: {len(all_labels)} samples (chunks)")
+
+     # Data splitting (sequential 80/20 split, no shuffling)
+     split_idx = int(len(all_pages_tokens) * 0.8)
+     train_pages_tokens = all_pages_tokens[:split_idx]
+     train_labels = all_labels[:split_idx]
+     val_pages_tokens = all_pages_tokens[split_idx:]
+     val_labels = all_labels[split_idx:]
+
+     print(f":white_check_mark: Training on {len(train_labels)} samples, validating on {len(val_labels)} samples")
+
+     # Class weights calculation (computed over the full dataset, train + val)
+     all_labels_indices = convert_labels_to_indices(all_labels)
+     class_weights = compute_class_weights(all_labels_indices, len(LABELS)).to(DEVICE)
+     print(":1234: Class weights:", class_weights)
+
+     # Vocab building
+     vocab_path = os.path.join(DATA_DIR, VOCAB_FILE)
+     word_vocab, char_vocab = build_vocabs(train_pages_tokens)
+     print(f"DEBUG: Final word vocab size: {len(word_vocab)}")
+     save_vocabs(vocab_path, word_vocab, char_vocab)
+
+     # Dataloaders
+     dataset_train = MCQTokenDataset(train_pages_tokens, word_vocab, char_vocab, labels_per_token=train_labels)
+     dataset_val = MCQTokenDataset(val_pages_tokens, word_vocab, char_vocab, labels_per_token=val_labels)
+     train_loader = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
+     val_loader = DataLoader(dataset_val, batch_size=BATCH_SIZE, collate_fn=collate_batch)
+
+     # Initialize model
+     model = MCQTagger(len(word_vocab), len(char_vocab), len(LABELS))
+
+     # --- CHECKPOINT SETUP ---
+     model_path = os.path.join(DATA_DIR, MODEL_FILE)
+     checkpoint_path = os.path.join(DATA_DIR, CHECKPOINT_FILE)
+     initial_best_f1 = 0.0
+     start_epoch = 1
+
+     # Load only model weights if a checkpoint exists (to initialize the model before passing to train)
+     if os.path.exists(checkpoint_path):
+         try:
+             checkpoint = torch.load(checkpoint_path, map_location=DEVICE)
+             model.load_state_dict(checkpoint['model_state_dict'])
+             initial_best_f1 = checkpoint['best_val_f1']
+             start_epoch = checkpoint['epoch'] + 1
+             print(f":floppy_disk: Full checkpoint found. Model weights loaded. Resuming setup from Epoch {start_epoch}.")
+         except Exception as e:
+             print(f":warning: Could not load full checkpoint: {e}. Starting from scratch.")
+
+     elif os.path.exists(model_path):
+         # Fallback: load only model weights from the best-F1 model file if there is no full checkpoint
+         print(f":floppy_disk: Found best model weights at {model_path}. Loading weights...")
+         try:
+             model.load_state_dict(torch.load(model_path, map_location=DEVICE))
+         except RuntimeError as e:
+             print(f":warning: Could not load model state: {e}. Starting fresh.")
+
+     else:
+         print(":rocket: Starting training from scratch (no model or checkpoint found)...")
+     # --- END CHECKPOINT SETUP ---
+
+     print(f":triangular_ruler: Model parameters: {sum(p.numel() for p in model.parameters()):,}")
+
+     # Pass paths, initial F1, and start epoch to train_model
+     train_model(model, train_loader, val_loader, epochs=EPOCHS, class_weights=class_weights,
+                 initial_best_f1=initial_best_f1, start_epoch=start_epoch,
+                 model_path=model_path, checkpoint_path=checkpoint_path)
+
+     print("\n:white_check_mark: Training complete.")
+     print(f":package: Best model weights saved to: {model_path}")
+     print(f":package: Vocabularies saved to: {vocab_path}")
+
+
+ # ========== MAIN EXECUTION BLOCK ==========
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(
+         description="Train an enhanced BiLSTM-CRF model with deep layout understanding for MCQ structure extraction.")
+     parser.add_argument(
+         "unified_json_path",
+         type=str,
+         help="Path to the unified JSON file containing token, bbox, and label data."
+     )
+     args = parser.parse_args()
+
+     train_from_json(args.unified_json_path)
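+ # Example invocation (hypothetical path):
+ #   python train_model.py output_data/unified_labeled.json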