Commit ae47555
Parent(s): 9e5f013

Add LSTM fine tuning

Files changed:
- .gitignore +4 -1
- dataset.py +16 -4
- dataset_lstm.py +163 -0
- distill_bert_to_lstm.py +193 -0
- example_uses.txt +1 -0
- inference_example.py +104 -0
- knowledge_distillation.py +232 -0
- models/__init__.py +0 -0
- models/lstm_model.py +163 -0
- train.py +1 -1
.gitignore
CHANGED
@@ -4,4 +4,7 @@ __pycache__/
 *.pyc
 *.pyo
 *.pyd
-*.db
+*.db
+metrics.txt
+predictions.txt
+*.pth
dataset.py
CHANGED
@@ -29,7 +29,7 @@ class DocumentDataset(Dataset):
                            f"but found range [{min_label}, {max_label}]")
             logger.warning(f"Unique label values: {sorted(unique_labels)}")
 
-            # Fix labels by remapping them to start from 0
+            # Fix labels by remapping them to start from 0 (some datasets might have labels starting from 1)
             if min_label != 0:
                 logger.warning(f"Auto-correcting labels to be zero-indexed...")
                 label_map = {original: idx for idx, original in enumerate(sorted(unique_labels))}

@@ -132,8 +132,20 @@ def create_data_loaders(train_data, val_data, test_data, tokenizer_name='bert-ba
     test_dataset = DocumentDataset(test_texts, test_labels, tokenizer_name, max_length, num_classes)
 
     # Create data loaders
-
-
-
+    if len(train_dataset.texts) == 0:
+        logger.warning("Training dataset is empty. Check your data loading and splitting.")
+        train_loader = None
+    else:
+        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+    if len(val_dataset.texts) == 0:
+        logger.warning("Validation dataset is empty. Check your data loading and splitting.")
+        val_loader = None
+    else:
+        val_loader = DataLoader(val_dataset, batch_size=batch_size)
+    if len(test_dataset.texts) == 0:
+        logger.warning("Test dataset is empty. Check your data loading and splitting.")
+        test_loader = None
+    else:
+        test_loader = DataLoader(test_dataset, batch_size=batch_size)
 
     return train_loader, val_loader, test_loader
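Note that create_data_loaders can now return None for any split that ends up empty. A minimal sketch of how a caller might guard against that (variable names here are illustrative, not part of the commit):

    train_loader, val_loader, test_loader = create_data_loaders(train_data, val_data, test_data)
    if train_loader is None:
        raise ValueError("No training data available - check the split ratios.")
    for batch in train_loader:
        pass  # training step goes here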
dataset_lstm.py
ADDED
@@ -0,0 +1,163 @@
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from collections import Counter
import re
import logging

logger = logging.getLogger(__name__)

class LSTMTokenizer:
    """
    Simple tokenizer for LSTM models
    """
    def __init__(self, max_vocab_size=30000, max_seq_length=512):
        self.word2idx = {}
        self.idx2word = {}
        self.word2idx['<pad>'] = 0
        self.word2idx['<unk>'] = 1
        self.idx2word[0] = '<pad>'
        self.idx2word[1] = '<unk>'
        self.vocab_size = 2  # Start with pad and unk tokens
        self.max_vocab_size = max_vocab_size
        self.max_seq_length = max_seq_length

    def fit(self, texts):
        """Build vocabulary from texts"""
        word_counts = Counter()

        # Clean and tokenize texts
        for text in texts:
            words = self._tokenize(text)
            word_counts.update(words)

        # Sort by frequency and take most common words
        vocab_words = [word for word, count in word_counts.most_common(self.max_vocab_size - 2)]

        # Add words to vocabulary
        for word in vocab_words:
            if word not in self.word2idx:
                self.word2idx[word] = self.vocab_size
                self.idx2word[self.vocab_size] = word
                self.vocab_size += 1

        logger.info(f"Vocabulary size: {self.vocab_size}")
        return self

    def _tokenize(self, text):
        """Simple tokenization by splitting on whitespace and removing punctuation"""
        text = text.lower()
        # Remove punctuation and split on whitespace
        text = re.sub(r'[^\w\s]', '', text)
        return text.split()

    def encode(self, text, padding=True, truncation=True):
        """Convert text to token ids"""
        words = self._tokenize(text)

        # Truncate if needed
        if truncation and len(words) > self.max_seq_length:
            words = words[:self.max_seq_length]

        # Convert to indices
        ids = [self.word2idx.get(word, self.word2idx['<unk>']) for word in words]

        # Create attention mask (1 for tokens, 0 for padding)
        attention_mask = [1] * len(ids)

        # Pad if needed
        if padding and len(ids) < self.max_seq_length:
            padding_length = self.max_seq_length - len(ids)
            ids = ids + [self.word2idx['<pad>']] * padding_length
            attention_mask = attention_mask + [0] * padding_length

        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long)
        }

class LSTMDataset(Dataset):
    """Dataset for LSTM model"""
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize
        encoding = self.tokenizer.encode(text)

        return {
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
            'label': torch.tensor(label, dtype=torch.long)
        }

def prepare_lstm_data(data_path, text_col='text', label_col='label',
                      max_vocab_size=30000, max_seq_length=512,
                      val_split=0.1, test_split=0.1, batch_size=32, seed=42):
    """
    Load data and prepare for LSTM model
    """
    # Load data
    if data_path.endswith('.csv'):
        df = pd.read_csv(data_path)
    elif data_path.endswith('.tsv'):
        df = pd.read_csv(data_path, sep='\t')
    else:
        raise ValueError("Unsupported file format. Please provide CSV or TSV file.")

    # Convert labels to numeric if they aren't already
    if not np.issubdtype(df[label_col].dtype, np.number):
        label_map = {label: idx for idx, label in enumerate(sorted(df[label_col].unique()))}
        df['label_numeric'] = df[label_col].map(label_map)
        labels = df['label_numeric'].values
        logger.info(f"Label mapping: {label_map}")
    else:
        labels = df[label_col].values
        # Make sure labels start from 0
        min_label = labels.min()
        if min_label != 0:
            label_map = {label: idx for idx, label in enumerate(sorted(set(labels)))}
            labels = np.array([label_map[label] for label in labels])

    texts = df[text_col].values

    # Split data
    np.random.seed(seed)
    indices = np.random.permutation(len(texts))

    test_size = int(test_split * len(texts))
    val_size = int(val_split * len(texts))
    train_size = len(texts) - test_size - val_size

    train_indices = indices[:train_size]
    val_indices = indices[train_size:train_size + val_size]
    test_indices = indices[train_size + val_size:]

    train_texts, train_labels = texts[train_indices], labels[train_indices]
    val_texts, val_labels = texts[val_indices], labels[val_indices]
    test_texts, test_labels = texts[test_indices], labels[test_indices]

    # Create tokenizer and fit on training data
    tokenizer = LSTMTokenizer(max_vocab_size=max_vocab_size, max_seq_length=max_seq_length)
    tokenizer.fit(train_texts)

    # Create datasets
    train_dataset = LSTMDataset(train_texts, train_labels, tokenizer)
    val_dataset = LSTMDataset(val_texts, val_labels, tokenizer)
    test_dataset = LSTMDataset(test_texts, test_labels, tokenizer)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, val_loader, test_loader, tokenizer.vocab_size
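For reference, a minimal usage sketch of the new LSTMTokenizer; the sample sentences are placeholders, not data from the repository:

    from dataset_lstm import LSTMTokenizer

    tokenizer = LSTMTokenizer(max_vocab_size=1000, max_seq_length=16)
    tokenizer.fit(["the cat sat on the mat", "dogs chase cats"])
    encoded = tokenizer.encode("the dog sat")
    print(encoded['input_ids'].shape)       # torch.Size([16]) after padding
    print(encoded['attention_mask'].sum())  # tensor(3): three real tokens, the rest is padding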
distill_bert_to_lstm.py
ADDED
@@ -0,0 +1,193 @@
import argparse
import os
import logging
import torch
import random
import numpy as np
from model import DocBERT
from models.lstm_model import DocumentBiLSTM
from dataset import load_data, create_data_loaders
from dataset_lstm import prepare_lstm_data
from knowledge_distillation import DistillationTrainer
from transformers import BertTokenizer

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

def set_seed(seed):
    """Set all seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

def tokenize_for_lstm(texts, bert_tokenizer, max_seq_length=512):
    """
    Convert BERT tokenization format to format suitable for LSTM
    This is a simple approach that just takes whole words from BERT tokenization
    """
    from collections import Counter

    # Create vocabulary from all texts
    word_counts = Counter()
    all_words = []

    for text in texts:
        # Simple tokenization by splitting on whitespace
        words = text.lower().split()
        word_counts.update(words)
        all_words.extend(words)

    # Create word->index mapping
    word2idx = {'<pad>': 0, '<unk>': 1}
    for idx, (word, _) in enumerate(word_counts.most_common(30000 - 2), 2):
        word2idx[word] = idx

    vocab_size = len(word2idx)
    logger.info(f"Created vocabulary with {vocab_size} tokens")

    return word2idx, vocab_size

def main():
    parser = argparse.ArgumentParser(description="Distill knowledge from BERT to LSTM for document classification")

    # Data arguments
    parser.add_argument("--data_path", type=str, required=True, help="Path to the dataset file (CSV or TSV)")
    parser.add_argument("--text_column", type=str, default="text", help="Name of the text column")
    parser.add_argument("--label_column", type=str, default="label", help="Name of the label column")
    parser.add_argument("--val_split", type=float, default=0.1, help="Validation set split ratio")
    parser.add_argument("--test_split", type=float, default=0.1, help="Test set split ratio")

    # BERT model arguments
    parser.add_argument("--bert_model", type=str, default="bert-base-uncased", help="BERT model to use")
    parser.add_argument("--bert_model_path", type=str, required=True, help="Path to saved BERT model weights")
    parser.add_argument("--max_seq_length", type=int, default=512, help="Maximum sequence length")

    # LSTM model arguments
    parser.add_argument("--embedding_dim", type=int, default=300, help="Dimension of word embeddings in LSTM")
    parser.add_argument("--hidden_dim", type=int, default=256, help="Hidden dimension of LSTM")
    parser.add_argument("--num_layers", type=int, default=2, help="Number of LSTM layers")
    parser.add_argument("--dropout", type=float, default=0.5, help="Dropout probability")

    # Distillation arguments
    parser.add_argument("--temperature", type=float, default=2.0, help="Temperature for softening probability distributions")
    parser.add_argument("--alpha", type=float, default=0.5, help="Weight for distillation loss vs. regular loss")
    parser.add_argument("--num_classes", type=int, required=True, help="Number of classes to predict")

    # Training arguments
    parser.add_argument("--batch_size", type=int, default=16, help="Training batch size")
    parser.add_argument("--learning_rate", type=float, default=0.001, help="Learning rate for LSTM")
    parser.add_argument("--epochs", type=int, default=20, help="Number of training epochs")

    # Other arguments
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    parser.add_argument("--output_dir", type=str, default="./output", help="Directory to save models")

    args = parser.parse_args()

    # Set seed for reproducibility
    set_seed(args.seed)

    # Create output directory if it doesn't exist
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load and prepare data for both BERT and LSTM
    logger.info("Loading and preparing data...")

    # Load data first
    train_data, val_data, test_data = load_data(
        args.data_path,
        text_col=args.text_column,
        label_col=args.label_column,
        validation_split=args.val_split,
        test_split=args.test_split,
        seed=args.seed
    )

    # Create BERT data loaders
    logger.info("Creating BERT data loaders...")
    bert_train_loader, bert_val_loader, bert_test_loader = create_data_loaders(
        train_data,
        val_data,
        test_data,
        tokenizer_name=args.bert_model,
        max_length=args.max_seq_length,
        batch_size=args.batch_size,
        num_classes=args.num_classes
    )

    # Create LSTM data loaders
    logger.info("Creating LSTM data loaders...")
    lstm_train_loader, lstm_val_loader, lstm_test_loader, vocab_size = prepare_lstm_data(
        args.data_path,
        text_col=args.text_column,
        label_col=args.label_column,
        max_vocab_size=30000,
        max_seq_length=args.max_seq_length,
        batch_size=args.batch_size,
        seed=args.seed
    )

    logger.info(f"LSTM Vocabulary size: {vocab_size}")

    # Load pre-trained BERT model (teacher)
    logger.info("Loading pre-trained BERT model (teacher)...")
    bert_model = DocBERT(
        num_classes=args.num_classes,
        bert_model_name=args.bert_model,
        dropout_prob=0.1
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Load saved BERT weights
    bert_model.load_state_dict(torch.load(args.bert_model_path, map_location=device))
    logger.info(f"Loaded teacher model from {args.bert_model_path}")

    # Initialize LSTM model (student)
    logger.info("Initializing LSTM model (student)...")
    lstm_model = DocumentBiLSTM(
        vocab_size=vocab_size,
        embedding_dim=args.embedding_dim,
        hidden_dim=args.hidden_dim,
        output_dim=args.num_classes,
        n_layers=args.num_layers,
        dropout=args.dropout
    )

    # Print model sizes for comparison
    bert_params = sum(p.numel() for p in bert_model.parameters())
    lstm_params = sum(p.numel() for p in lstm_model.parameters())
    logger.info(f"BERT model size: {bert_params:,} parameters")
    logger.info(f"LSTM model size: {lstm_params:,} parameters")
    logger.info(f"Size reduction: {bert_params / lstm_params:.1f}x")

    # Initialize distillation trainer
    trainer = DistillationTrainer(
        teacher_model=bert_model,
        student_model=lstm_model,
        train_loader=bert_train_loader,  # Using BERT loader to match tokenization
        val_loader=bert_val_loader,
        test_loader=bert_test_loader,
        temperature=args.temperature,
        alpha=args.alpha,
        lr=args.learning_rate,
        weight_decay=1e-5
    )

    # Train with knowledge distillation
    logger.info("Starting knowledge distillation...")
    save_path = os.path.join(args.output_dir, "distilled_lstm_model.pth")
    trainer.train(epochs=args.epochs, save_path=save_path)

    logger.info("Knowledge distillation completed!")

if __name__ == "__main__":
    main()
example_uses.txt
ADDED
@@ -0,0 +1 @@
+python .\inference_example.py --model_path "./bert_base_uncased/best_model.pth" --num_classes 4 --class_names "World" "Sports" "Business" "Science" --text_column "Description" --label_column "Class Index" --data_path "./train.csv" --inference_batch_limit 10
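A corresponding invocation of the new distillation script might look like the line below; the paths are placeholders, and every flag comes from the argparse definitions in distill_bert_to_lstm.py:

python distill_bert_to_lstm.py --data_path "./train.csv" --bert_model_path "./bert_base_uncased/best_model.pth" --num_classes 4 --text_column "Description" --label_column "Class Index" --epochs 20 --output_dir "./output"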
inference_example.py
ADDED
@@ -0,0 +1,104 @@
from model import DocBERT
from dataset import load_data, create_data_loaders
from trainer import Trainer
import argparse
import os
import sklearn.metrics
import numpy as np
import torch

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Document Classification with Distillation")
    parser.add_argument("--data_path", type=str, required=True, help="Path to the dataset")
    parser.add_argument("--bert_model", type=str, default="bert-base-uncased", help="Pre-trained BERT model name")
    parser.add_argument("--model_path", type=str, required=True, help="Path to the trained model")
    parser.add_argument("--max_seq_length", type=int, default=512, help="Maximum sequence length for BERT")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size for training and evaluation")
    parser.add_argument("--num_classes", type=int, required=True, help="Number of classes for classification")
    parser.add_argument("--text_column", type=str, default="text", help="Column name for text data")
    parser.add_argument("--label_column", type=str, default="label", help="Column name for labels")
    parser.add_argument("--class_names", type=str, nargs='+', required=True, help="List of class names for classification")
    parser.add_argument("--inference_batch_limit", type=int, default=-1, help="Limit for inference batch counts")
    parser.add_argument("--print_predictions", type=bool, default=False, help="Print predictions to console")
    args = parser.parse_args()

    class_names = args.class_names

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_data, val_data, test_data = load_data(args.data_path,
                                                text_col=args.text_column,
                                                label_col=args.label_column,
                                                validation_split=0.0,
                                                test_split=1.0)
    train_loader, val_loader, test_loader = create_data_loaders(train_data=train_data,
                                                                 val_data=val_data,
                                                                 test_data=test_data,
                                                                 tokenizer_name=args.bert_model,
                                                                 batch_size=args.batch_size,
                                                                 max_length=args.max_seq_length)

    model = DocBERT(bert_model_name=args.bert_model, num_classes=args.num_classes)
    model.load_state_dict(torch.load(args.model_path, map_location=device))
    model = model.to(device)
    model.eval()  # disable dropout for inference

    all_labels = np.array([], dtype=int)
    all_predictions = np.array([], dtype=int)
    batch_window_index = 0
    batch_size = args.batch_size

    # Inference
    for batch in test_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        token_type_ids = batch['token_type_ids']
        labels = batch['label']

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        labels = labels.to(device)
        all_labels = np.append(all_labels, labels.cpu().numpy())

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs
            predictions = torch.argmax(logits, dim=-1)
            all_predictions = np.append(all_predictions, predictions.cpu().numpy())

        if args.print_predictions:
            for i in range(len(predictions)):
                idx = int(i)
                print(f"Text: {test_data[0][batch_window_index*batch_size + idx]}")
                print(f"True Label: {labels[idx].item()}, Predicted Label: {predictions[idx].item()}")
                print(f"Predicted Class: {class_names[predictions[idx].item()] if len(class_names) > predictions[idx].item() else 'Unknown'}")
                print(f"True Class: {class_names[labels[idx].item()] if len(class_names) > labels[idx].item() else 'Unknown'}")
                print("-" * 50)

        batch_window_index += 1
        if args.inference_batch_limit > 0 and batch_window_index >= args.inference_batch_limit:
            break

    # Calculate accuracy, F1 score, recall, and precision
    accuracy = sklearn.metrics.accuracy_score(all_labels, all_predictions)
    f1 = sklearn.metrics.f1_score(all_labels, all_predictions, average='weighted')
    precision = sklearn.metrics.precision_score(all_labels, all_predictions, average='weighted')
    recall = sklearn.metrics.recall_score(all_labels, all_predictions, average='weighted')

    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")

    with open("predictions.txt", "w") as f:
        for i in range(len(all_labels)):
            idx = int(i)
            f.write(f"Text: {test_data[0][idx]}\n")
            f.write(f"True Label: {all_labels[idx]}, Predicted Label: {all_predictions[idx]}\n")
            f.write(f"Predicted Class: {class_names[all_predictions[idx]] if len(class_names) > all_predictions[idx] else 'Unknown'}, True Class: {class_names[all_labels[idx]] if len(class_names) > all_labels[idx] else 'Unknown'}\n")
            f.write("-" * 50 + "\n")

    with open("metrics.txt", "w") as f:
        f.write(f"Accuracy: {accuracy}\n")
        f.write(f"F1 Score: {f1}\n")
        f.write(f"Precision: {precision}\n")
        f.write(f"Recall: {recall}\n")
knowledge_distillation.py
ADDED
@@ -0,0 +1,232 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
import logging
import os

logger = logging.getLogger(__name__)

class DistillationTrainer:
    """
    Trainer for knowledge distillation from teacher model (BERT) to student model (LSTM)
    """
    def __init__(
        self,
        teacher_model,
        student_model,
        train_loader,
        val_loader,
        test_loader=None,
        temperature=2.0,
        alpha=0.5,  # Weight for distillation loss vs. regular loss
        lr=0.001,
        weight_decay=1e-5,
        device=None
    ):
        self.teacher_model = teacher_model
        self.student_model = student_model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.temperature = temperature
        self.alpha = alpha

        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info(f"Using device: {self.device}")

        # Move models to device
        self.teacher_model.to(self.device)
        self.student_model.to(self.device)

        # Set teacher model to evaluation mode
        self.teacher_model.eval()

        # Optimizer for student model
        self.optimizer = torch.optim.Adam(
            self.student_model.parameters(),
            lr=lr,
            weight_decay=weight_decay
        )

        # Learning rate scheduler
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode='max', factor=0.5, patience=2, verbose=True
        )

        # Loss functions
        self.ce_loss = nn.CrossEntropyLoss()  # For hard targets

        # Tracking metrics
        self.best_val_f1 = 0.0
        self.best_model_state = None

    def distillation_loss(self, student_logits, teacher_logits, labels, temperature, alpha):
        """
        Compute the knowledge distillation loss

        Args:
            student_logits: Output from student model
            teacher_logits: Output from teacher model
            labels: Ground truth labels
            temperature: Temperature for softening probability distributions
            alpha: Weight for distillation loss vs. cross-entropy loss

        Returns:
            Combined loss
        """
        # Softmax with temperature for soft targets
        soft_targets = F.softmax(teacher_logits / temperature, dim=1)
        soft_prob = F.log_softmax(student_logits / temperature, dim=1)

        # Distillation loss (KL divergence)
        distill_loss = F.kl_div(soft_prob, soft_targets, reduction='batchmean') * (temperature ** 2)

        # Standard cross entropy with hard targets
        ce_loss = self.ce_loss(student_logits, labels)

        # Weighted combination of the two losses
        loss = alpha * distill_loss + (1 - alpha) * ce_loss

        return loss

    def train(self, epochs, save_path='best_distilled_model.pth'):
        """
        Train student model with knowledge distillation
        """
        logger.info(f"Starting distillation training for {epochs} epochs")
        logger.info(f"Temperature: {self.temperature}, Alpha: {self.alpha}")

        for epoch in range(epochs):
            self.student_model.train()
            train_loss = 0.0
            all_preds = []
            all_labels = []

            # Training loop
            train_iterator = tqdm(self.train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]")
            for batch in train_iterator:
                # Move batch to device
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                # Get teacher predictions (no grad needed for teacher)
                with torch.no_grad():
                    teacher_logits = self.teacher_model(
                        input_ids=input_ids,
                        attention_mask=attention_mask
                    )

                # Forward pass through student model
                student_logits = self.student_model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                # Calculate distillation loss
                loss = self.distillation_loss(
                    student_logits,
                    teacher_logits,
                    labels,
                    self.temperature,
                    self.alpha
                )

                # Backward and optimize
                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.student_model.parameters(), 1.0)
                self.optimizer.step()

                train_loss += loss.item()

                # Calculate accuracy for progress tracking
                _, preds = torch.max(student_logits, 1)
                all_preds.extend(preds.cpu().tolist())
                all_labels.extend(labels.cpu().tolist())

                # Update progress bar
                train_iterator.set_postfix({'loss': f"{loss.item():.4f}"})

            # Calculate training metrics
            train_loss = train_loss / len(self.train_loader)
            train_acc = sum(1 for p, l in zip(all_preds, all_labels) if p == l) / len(all_preds)

            # Evaluate on validation set
            val_loss, val_acc, val_f1 = self.evaluate()

            # Update learning rate based on validation performance
            self.scheduler.step(val_f1)

            # Save best model
            if val_f1 > self.best_val_f1:
                self.best_val_f1 = val_f1
                self.best_model_state = self.student_model.state_dict().copy()
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': self.student_model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'val_f1': val_f1,
                }, save_path)
                logger.info(f"New best model saved with validation F1: {val_f1:.4f}")

            logger.info(f"Epoch {epoch+1}/{epochs}: "
                        f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
                        f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}")

        # Load best model for final evaluation
        if self.best_model_state is not None:
            self.student_model.load_state_dict(self.best_model_state)
            logger.info(f"Loaded best model with validation F1: {self.best_val_f1:.4f}")

        # Final evaluation on test set if provided
        if self.test_loader:
            test_loss, test_acc, test_f1 = self.evaluate(self.test_loader, "Test")
            logger.info(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}, Test F1: {test_f1:.4f}")

    def evaluate(self, data_loader=None, phase="Validation"):
        """
        Evaluate the student model
        """
        if data_loader is None:
            data_loader = self.val_loader

        self.student_model.eval()
        eval_loss = 0.0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(data_loader, desc=f"[{phase}]"):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                # Forward pass through student
                student_logits = self.student_model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                # Calculate regular CE loss (no distillation during evaluation)
                loss = self.ce_loss(student_logits, labels)
                eval_loss += loss.item()

                # Get predictions
                _, preds = torch.max(student_logits, 1)
                all_preds.extend(preds.cpu().tolist())
                all_labels.extend(labels.cpu().tolist())

        # Calculate metrics
        eval_loss = eval_loss / len(data_loader)

        # Accuracy
        accuracy = sum(1 for p, l in zip(all_preds, all_labels) if p == l) / len(all_preds)

        # F1 score (macro-averaged)
        from sklearn.metrics import f1_score
        f1 = f1_score(all_labels, all_preds, average='macro')

        return eval_loss, accuracy, f1
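In distillation_loss above, the combined objective is effectively

    loss = alpha * T^2 * KL(log_softmax(student_logits / T), softmax(teacher_logits / T)) + (1 - alpha) * CE(student_logits, labels)

so with the defaults temperature=2.0 and alpha=0.5 the soft-target KL term and the hard-label cross-entropy are weighted equally, and the T^2 factor keeps the gradient scale of the softened targets comparable to the hard-label term.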
models/__init__.py
ADDED
File without changes
models/lstm_model.py
ADDED
@@ -0,0 +1,163 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.vocab import GloVe  # For loading pre-trained word embeddings

class DocumentLSTM(nn.Module):
    """
    LSTM model for document classification using GloVe embeddings
    """
    def __init__(self, num_classes, vocab_size=30000, embedding_dim=300,
                 hidden_dim=256, num_layers=2, bidirectional=True,
                 dropout_rate=0.3, use_pretrained=True, padding_idx=0):
        super(DocumentLSTM, self).__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.num_directions = 2 if bidirectional else 1

        # Embedding layer (with option to use pre-trained GloVe)
        if use_pretrained:
            # Initialize with GloVe embeddings
            try:
                glove = GloVe(name='6B', dim=embedding_dim)
                # You'd need to map your vocabulary to GloVe indices
                # This is a simplified placeholder
                self.embedding = nn.Embedding.from_pretrained(
                    glove.vectors[:vocab_size],
                    padding_idx=padding_idx,
                    freeze=False
                )
            except Exception as e:
                print(f"Could not load pretrained embeddings: {e}")
                # Fall back to random initialization
                self.embedding = nn.Embedding(
                    vocab_size, embedding_dim, padding_idx=padding_idx
                )
        else:
            # Random initialization
            self.embedding = nn.Embedding(
                vocab_size, embedding_dim, padding_idx=padding_idx
            )

        # LSTM layer
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=dropout_rate if num_layers > 1 else 0
        )

        # Attention mechanism
        self.attention = nn.Linear(hidden_dim * self.num_directions, 1)

        # Layer normalization
        self.layer_norm = nn.LayerNorm(hidden_dim * self.num_directions)

        # Dropout layer
        self.dropout = nn.Dropout(dropout_rate)

        # Classification layer
        self.classifier = nn.Linear(hidden_dim * self.num_directions, num_classes)

    def forward(self, input_ids, attention_mask=None, **kwargs):
        """
        Forward pass through LSTM model

        Args:
            input_ids: Tensor of token ids [batch_size, seq_len]
            attention_mask: Tensor indicating which tokens to attend to [batch_size, seq_len]
        """
        # Word embeddings
        embedded = self.embedding(input_ids)  # [batch_size, seq_len, embedding_dim]

        # Pass through LSTM
        lstm_out, (hidden, cell) = self.lstm(embedded)
        # lstm_out: [batch_size, seq_len, hidden_dim * num_directions]

        # Apply attention
        if attention_mask is not None:
            # Apply attention mask (1 for tokens to attend to, 0 for padding)
            attention_mask = attention_mask.unsqueeze(-1)  # [batch_size, seq_len, 1]
            attention_scores = self.attention(lstm_out)  # [batch_size, seq_len, 1]
            attention_scores = attention_scores.masked_fill(attention_mask == 0, -1e10)
            attention_weights = F.softmax(attention_scores, dim=1)  # [batch_size, seq_len, 1]

            # Weighted sum
            context_vector = torch.sum(attention_weights * lstm_out, dim=1)  # [batch_size, hidden_dim * num_directions]
        else:
            # If no attention mask, use the last hidden state
            if self.bidirectional:
                # For bidirectional LSTM, concatenate last hidden states from both directions
                last_hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)  # [batch_size, hidden_dim * 2]
            else:
                last_hidden = hidden[-1]  # [batch_size, hidden_dim]

            context_vector = last_hidden

        # Layer normalization
        normalized = self.layer_norm(context_vector)

        # Dropout
        dropped = self.dropout(normalized)

        # Classification
        logits = self.classifier(dropped)

        return logits

class DocumentBiLSTM(nn.Module):
    """
    A simpler BiLSTM implementation that doesn't require pre-loaded embeddings
    Good for getting started quickly
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers=2, dropout=0.5, pad_idx=0):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=True,
                            dropout=dropout if n_layers > 1 else 0,
                            batch_first=True)

        self.fc = nn.Linear(hidden_dim * 2, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask=None, **kwargs):
        # input_ids = [batch size, seq len]

        # embedded = [batch size, seq len, emb dim]
        embedded = self.embedding(input_ids)

        # Apply dropout to embeddings
        embedded = self.dropout(embedded)

        if attention_mask is not None:
            # Create packed sequence for variable length sequences
            # This is a simplified version - in practice you'd use pack_padded_sequence
            # but that requires knowing the actual sequence lengths
            pass

        # output = [batch size, seq len, hid dim * num directions]
        # hidden = [n layers * num directions, batch size, hid dim]
        # cell = [n layers * num directions, batch size, hid dim]
        output, (hidden, cell) = self.lstm(embedded)

        # Concatenate the final forward and backward hidden states
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)

        # Apply dropout to hidden state
        hidden = self.dropout(hidden)

        # prediction = [batch size, output dim]
        prediction = self.fc(hidden)

        return prediction
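A minimal smoke-test sketch for DocumentBiLSTM as defined above; the batch shape and class count are illustrative:

    import torch
    from models.lstm_model import DocumentBiLSTM

    model = DocumentBiLSTM(vocab_size=30000, embedding_dim=300, hidden_dim=256, output_dim=4)
    input_ids = torch.randint(0, 30000, (8, 128))           # batch of 8 sequences, 128 token ids each
    attention_mask = torch.ones(8, 128, dtype=torch.long)   # accepted but currently unused by this forward pass
    logits = model(input_ids, attention_mask=attention_mask)
    print(logits.shape)  # torch.Size([8, 4])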
train.py
CHANGED
@@ -121,7 +121,7 @@ def main():
 
     # Train the model
     logger.info("Starting training...")
-    save_path = os.path.join(args.output_dir, "
+    save_path = os.path.join(args.output_dir, "bert-base-uncased")
     trainer.train(epochs=args.epochs, save_path=save_path)
 
     logger.info("Training completed!")