Upload project files
Browse files- MASC_finetune.py +19 -2
- model.py +50 -5
MASC_finetune.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import torch
|
| 2 |
import argparse
|
| 3 |
import os
|
|
|
|
| 4 |
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
|
| 5 |
import random
|
| 6 |
from transformers import BertTokenizer
|
|
@@ -14,10 +15,22 @@ import logging
|
|
| 14 |
from accelerate import Accelerator
|
| 15 |
from tqdm import tqdm
|
| 16 |
from torch.optim import AdamW
|
| 17 |
-
from torch.optim.lr_scheduler import
|
| 18 |
from eval_tools import *
|
| 19 |
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
def set_seed(seed):
|
| 22 |
torch.manual_seed(seed)
|
| 23 |
torch.cuda.manual_seed_all(seed)
|
|
@@ -249,7 +262,11 @@ if __name__ == "__main__":
|
|
| 249 |
|
| 250 |
optimizer = AdamW(params=filter(lambda p: p.requires_grad, model.parameters()),
|
| 251 |
lr=args.lr, betas=(0.9, 0.98), weight_decay=0.05)
|
| 252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
print("start training")
|
| 254 |
finetune(
|
| 255 |
model=model,
|
|
|
|
| 1 |
import torch
|
| 2 |
import argparse
|
| 3 |
import os
|
| 4 |
+
import math
|
| 5 |
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
|
| 6 |
import random
|
| 7 |
from transformers import BertTokenizer
|
|
|
|
| 15 |
from accelerate import Accelerator
|
| 16 |
from tqdm import tqdm
|
| 17 |
from torch.optim import AdamW
|
| 18 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR
|
| 19 |
from eval_tools import *
|
| 20 |
|
| 21 |
|
| 22 |
+
def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, min_lr_ratio=0.1):
    """Build a LambdaLR that warms up linearly, then anneals on a cosine curve.

    Args:
        optimizer: wrapped optimizer whose base LR is scaled by the multiplier.
        num_warmup_steps: steps over which the multiplier ramps from 0 to 1.
        num_training_steps: total steps; the cosine phase spans the remainder.
        min_lr_ratio: floor for the multiplier after warmup (the cosine value
            is clamped from below, it is not rescaled to land on this floor).

    Returns:
        A ``torch.optim.lr_scheduler.LambdaLR`` applying the per-step multiplier.
    """
    def lr_lambda(step):
        # Warmup phase: multiplier climbs linearly from 0 toward 1.
        if step < num_warmup_steps:
            return float(step) / float(max(1, num_warmup_steps))
        # Cosine phase: decay from 1 toward 0, clamped at min_lr_ratio.
        denom = max(1, num_training_steps - num_warmup_steps)
        frac = float(step - num_warmup_steps) / float(denom)
        cosine = 0.5 * (1.0 + math.cos(math.pi * frac))
        return max(min_lr_ratio, cosine)

    return LambdaLR(optimizer, lr_lambda)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
def set_seed(seed):
|
| 35 |
torch.manual_seed(seed)
|
| 36 |
torch.cuda.manual_seed_all(seed)
|
|
|
|
| 262 |
|
| 263 |
optimizer = AdamW(params=filter(lambda p: p.requires_grad, model.parameters()),
|
| 264 |
lr=args.lr, betas=(0.9, 0.98), weight_decay=0.05)
|
| 265 |
+
# Estimate total steps: ~500 optimizer steps per epoch, scaled by args.epoch and the 6 dataset chunks
|
| 266 |
+
num_training_steps = 500 * args.epoch * 6 # 6 dataset chunks
|
| 267 |
+
num_warmup_steps = min(500, num_training_steps // 10) # 10% warmup
|
| 268 |
+
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, min_lr_ratio=0.1)
|
| 269 |
+
print(f"Using cosine LR with {num_warmup_steps} warmup steps, {num_training_steps} total steps")
|
| 270 |
print("start training")
|
| 271 |
finetune(
|
| 272 |
model=model,
|
model.py
CHANGED
|
@@ -44,6 +44,46 @@ class LayerNorm(nn.Module):
|
|
| 44 |
std = x.std(-1, keepdim=True)
|
| 45 |
return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
def attention(query, key, mask=None, dropout=None):
|
| 48 |
d_k = query.size(-1)
|
| 49 |
scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
|
|
@@ -126,11 +166,16 @@ class DASCO(nn.Module):
|
|
| 126 |
self.classifier = nn.Linear(self.hidden_size*2, 2)
|
| 127 |
self.criterion = nn.CrossEntropyLoss()
|
| 128 |
elif self.task == 'MASC':
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
def projection(self, z: torch.Tensor) -> torch.Tensor:
|
| 136 |
z = F.elu(self.fc1(z))
|
|
|
|
| 44 |
std = x.std(-1, keepdim=True)
|
| 45 |
return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
|
| 46 |
|
| 47 |
+
|
| 48 |
+
class FocalLoss(nn.Module):
    """Focal Loss for handling extreme class imbalance.

    FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t)

    - Reduces loss for well-classified samples (high p_t)
    - Focuses training on hard misclassified samples
    - alpha: class weights (inversely proportional to frequency)
    - gamma: focusing parameter (higher = more focus on hard samples;
      gamma=0 reduces to plain cross-entropy)
    """
    def __init__(self, alpha=None, gamma=2.0, reduction='mean', label_smoothing=0.0):
        super(FocalLoss, self).__init__()
        # FIX: register alpha as a buffer instead of a plain attribute.
        # A plain attribute is NOT moved by parent_module.to(device) /
        # .cuda() (Module._apply only rebinds its own registered tensors),
        # so self.alpha[targets] would fail on GPU inputs. A buffer travels
        # with the module.
        if alpha is not None and not torch.is_tensor(alpha):
            alpha = torch.as_tensor(alpha, dtype=torch.float32)
        self.register_buffer('alpha', alpha)  # class weights tensor or None
        self.gamma = gamma  # focusing parameter
        self.reduction = reduction
        self.label_smoothing = label_smoothing

    def forward(self, inputs, targets):
        """Compute the focal loss.

        Args:
            inputs: raw logits, shape [N, C].
            targets: integer class indices, shape [N].

        Returns:
            Scalar tensor for 'mean'/'sum' reduction, otherwise the
            per-sample loss tensor of shape [N].
        """
        ce_loss = F.cross_entropy(inputs, targets, reduction='none',
                                  label_smoothing=self.label_smoothing)
        # Probability of the correct class. NOTE: with label_smoothing > 0
        # this is only an approximation, since ce_loss then includes the
        # smoothing term.
        pt = torch.exp(-ce_loss)

        # Apply focal weight: (1 - pt)^gamma — down-weights easy samples.
        focal_weight = (1 - pt) ** self.gamma

        # Apply class weights if provided
        if self.alpha is not None:
            # Defensive .to(): keeps working even if the loss module was
            # never moved alongside the inputs.
            alpha_t = self.alpha.to(inputs.device)[targets]
            focal_loss = alpha_t * focal_weight * ce_loss
        else:
            focal_loss = focal_weight * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss
|
| 86 |
+
|
| 87 |
def attention(query, key, mask=None, dropout=None):
|
| 88 |
d_k = query.size(-1)
|
| 89 |
scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
|
|
|
|
| 166 |
self.classifier = nn.Linear(self.hidden_size*2, 2)
|
| 167 |
self.criterion = nn.CrossEntropyLoss()
|
| 168 |
elif self.task == 'MASC':
|
| 169 |
+
# Enhanced classifier with hidden layer and dropout for better generalization
|
| 170 |
+
self.classifier = nn.Sequential(
|
| 171 |
+
nn.Linear(self.hidden_size*2, 256),
|
| 172 |
+
nn.GELU(),
|
| 173 |
+
nn.Dropout(0.3),
|
| 174 |
+
nn.Linear(256, 3)
|
| 175 |
+
)
|
| 176 |
+
# Focal Loss for extreme class imbalance: POS=80.8%, NEU=5.8%, NEG=13.3%
|
| 177 |
+
self.register_buffer('class_weights', torch.tensor([1.0, 12.0, 5.0]))
|
| 178 |
+
self.criterion = FocalLoss(alpha=self.class_weights, gamma=2.0, label_smoothing=0.05)
|
| 179 |
|
| 180 |
def projection(self, z: torch.Tensor) -> torch.Tensor:
|
| 181 |
z = F.elu(self.fc1(z))
|