Commit da89f1c
Parent(s): First commit

Files changed:
- README.md +32 -0
- config.py +57 -0
- dataset.py +104 -0
- model.py +48 -0
- requirements.txt +6 -0
- run.py +86 -0
- train.py +129 -0
- trainer.py +211 -0
README.md
ADDED
@@ -0,0 +1,32 @@
````markdown
# DocBERT - Improved Document Classification with BERT

This repository contains an improved implementation of BERT for document classification, combining techniques from [jesse-tong/docbert](https://github.com/jesse-tong/docbert) and [castorini/hedwig](https://github.com/castorini/hedwig).

## Key Improvements

1. **Advanced Regularization Techniques**:
   - Dropout in multiple layers
   - Layer normalization
   - Gradient clipping
   - Weight decay optimization

2. **Training Stability Enhancements**:
   - Learning rate scheduling with ReduceLROnPlateau
   - Gradient accumulation for effective larger batch sizes
   - Label smoothing to improve generalization
   - Early stopping based on validation F1 score

3. **Architectural Changes**:
   - Better BERT pooling strategies
   - More robust tokenization with attention masks
   - Configurable hyperparameters for different document types

## Installation

```bash
# Clone the repository
git clone https://github.com/yourusername/docbert-improved.git
cd docbert-improved

# Install dependencies
pip install -r requirements.txt
```
````
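For orientation before the code files below: once the commit's scripts are installed, training is launched through the bundled `run.py` and one of the config presets defined in `config.py`. The invocation here is only an illustrative sketch; the dataset path and the number of classes are placeholders, not values from the repository.

```bash
# Illustrative only: data/my_docs.csv and the class count are placeholders.
python run.py \
    --data_path data/my_docs.csv \
    --num_classes 2 \
    --config short_text \
    --output_dir ./output
```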
config.py
ADDED
@@ -0,0 +1,57 @@
```python
"""
Configuration module for DocBERT
Contains hyperparameter presets for different dataset types
"""

class BaseConfig:
    # Model params
    bert_model = "bert-base-uncased"
    max_seq_length = 512
    dropout = 0.1

    # Training params
    batch_size = 16
    learning_rate = 2e-5
    weight_decay = 0.01
    epochs = 10
    grad_accum_steps = 1

    # Data params
    val_split = 0.1
    test_split = 0.1
    seed = 42

class ShortTextConfig(BaseConfig):
    """Config for short text classification (tweets, comments, etc.)"""
    max_seq_length = 128
    batch_size = 32
    learning_rate = 3e-5

class LongDocumentConfig(BaseConfig):
    """Config for long document classification"""
    bert_model = "bert-large-uncased"
    max_seq_length = 512
    batch_size = 8
    grad_accum_steps = 2
    weight_decay = 0.02

class FinetuningConfig(BaseConfig):
    """Config for fine-tuning on a small dataset"""
    learning_rate = 1e-5
    batch_size = 8
    epochs = 15
    weight_decay = 0.03
    dropout = 0.2

CONFIG_PRESETS = {
    "default": BaseConfig,
    "short_text": ShortTextConfig,
    "long_document": LongDocumentConfig,
    "fine_tuning": FinetuningConfig
}

def get_config(preset_name="default"):
    """Get a configuration preset by name"""
    if preset_name not in CONFIG_PRESETS:
        raise ValueError(f"Config preset '{preset_name}' not found. Available presets: {list(CONFIG_PRESETS.keys())}")
    return CONFIG_PRESETS[preset_name]
```
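A small sketch of how these presets are meant to be consumed (this mirrors what `run.py` does later in the commit): `get_config` returns the preset class, which is then instantiated and read as plain attributes. The printed values are taken from the `LongDocumentConfig` definition above.

```python
from config import get_config

# get_config returns the preset class, not an instance.
LongCfg = get_config("long_document")
cfg = LongCfg()

print(cfg.bert_model)        # "bert-large-uncased"
print(cfg.max_seq_length)    # 512
print(cfg.grad_accum_steps)  # 2 -> effective batch of 16 with batch_size=8
```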
dataset.py
ADDED
@@ -0,0 +1,104 @@
```python
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import pandas as pd
import numpy as np

class DocumentDataset(Dataset):
    """
    Dataset class for document classification
    with improved preprocessing and batching
    """
    def __init__(self, texts, labels, tokenizer_name='bert-base-uncased', max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize the text with attention mask and truncation
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

def load_data(data_path, text_col='text', label_col='label', validation_split=0.1, test_split=0.1, seed=42):
    """
    Load data from CSV/TSV and split into train, validation and test sets
    """
    # Determine file format based on extension
    if data_path.endswith('.csv'):
        df = pd.read_csv(data_path)
    elif data_path.endswith('.tsv'):
        df = pd.read_csv(data_path, sep='\t')
    else:
        raise ValueError("Unsupported file format. Please provide a CSV or TSV file.")

    # Convert labels to numeric if they aren't already
    if not np.issubdtype(df[label_col].dtype, np.number):
        label_map = {label: idx for idx, label in enumerate(df[label_col].unique())}
        df['label_numeric'] = df[label_col].map(label_map)
        labels = df['label_numeric'].values
    else:
        labels = df[label_col].values

    # Extract the text column
    texts = df[text_col].values

    # Shuffle and split the data
    np.random.seed(seed)
    indices = np.random.permutation(len(texts))

    test_size = int(test_split * len(texts))
    val_size = int(validation_split * len(texts))
    train_size = len(texts) - test_size - val_size

    train_indices = indices[:train_size]
    val_indices = indices[train_size:train_size + val_size]
    test_indices = indices[train_size + val_size:]

    train_texts, train_labels = texts[train_indices], labels[train_indices]
    val_texts, val_labels = texts[val_indices], labels[val_indices]
    test_texts, test_labels = texts[test_indices], labels[test_indices]

    return (train_texts, train_labels), (val_texts, val_labels), (test_texts, test_labels)

def create_data_loaders(train_data, val_data, test_data, tokenizer_name='bert-base-uncased',
                        max_length=512, batch_size=16):
    """
    Create DataLoader objects for training, validation and testing
    """
    train_texts, train_labels = train_data
    val_texts, val_labels = val_data
    test_texts, test_labels = test_data

    # Create datasets
    train_dataset = DocumentDataset(train_texts, train_labels, tokenizer_name, max_length)
    val_dataset = DocumentDataset(val_texts, val_labels, tokenizer_name, max_length)
    test_dataset = DocumentDataset(test_texts, test_labels, tokenizer_name, max_length)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, val_loader, test_loader
```
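As a rough illustration of the data pipeline above, the sketch below loads a CSV, builds the three loaders, and inspects one batch. The file name `docs.csv` and the `max_length`/`batch_size` values are placeholders chosen for the example, not values mandated by the commit.

```python
from dataset import load_data, create_data_loaders

# "docs.csv" is a placeholder CSV with "text" and "label" columns.
train_data, val_data, test_data = load_data(
    "docs.csv", text_col="text", label_col="label",
    validation_split=0.1, test_split=0.1, seed=42
)

train_loader, val_loader, test_loader = create_data_loaders(
    train_data, val_data, test_data,
    tokenizer_name="bert-base-uncased", max_length=128, batch_size=16
)

batch = next(iter(train_loader))
print(batch["input_ids"].shape)       # torch.Size([16, 128])
print(batch["attention_mask"].shape)  # torch.Size([16, 128])
```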
model.py
ADDED
@@ -0,0 +1,48 @@
```python
import torch
import torch.nn as nn
from transformers import BertModel, BertConfig

class DocBERT(nn.Module):
    """
    Document classification using BERT with improved architecture
    based on Hedwig implementation patterns.
    """
    def __init__(self, num_classes, bert_model_name='bert-base-uncased', dropout_prob=0.1):
        super(DocBERT, self).__init__()

        # Load pre-trained BERT model and its config
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.config = self.bert.config

        # Dropout layer for regularization (helps prevent overfitting)
        self.dropout = nn.Dropout(dropout_prob)

        # Classification head (inspired by Hedwig)
        self.hidden_size = self.config.hidden_size
        self.classifier = nn.Linear(self.hidden_size, num_classes)

        # Layer normalization before classification (helps stabilize training)
        self.layer_norm = nn.LayerNorm(self.hidden_size)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """
        Forward pass through the model
        """
        # Get BERT outputs
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)

        # Pooled [CLS] representation from BERT's pooler
        pooled_output = outputs.pooler_output

        # Apply layer normalization
        normalized_output = self.layer_norm(pooled_output)

        # Apply dropout for regularization
        dropped_output = self.dropout(normalized_output)

        # Pass through the classifier
        logits = self.classifier(dropped_output)

        return logits
```
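A minimal smoke test of the model's forward pass, assuming `bert-base-uncased` (vocab size 30522) and an arbitrary four-class setup; the random token ids are only there to check shapes.

```python
import torch
from model import DocBERT

# Instantiate with a placeholder class count and run a dummy batch.
model = DocBERT(num_classes=4, bert_model_name="bert-base-uncased", dropout_prob=0.1)
model.eval()

input_ids = torch.randint(0, 30522, (2, 32))       # batch of 2, sequence length 32
attention_mask = torch.ones_like(input_ids)
token_type_ids = torch.zeros_like(input_ids)

with torch.no_grad():
    logits = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

print(logits.shape)  # torch.Size([2, 4])
```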
requirements.txt
ADDED
@@ -0,0 +1,6 @@
```text
scikit-learn
numpy
pandas
torch
transformers
datasets
```
run.py
ADDED
@@ -0,0 +1,86 @@
```python
"""
Simple script to run the DocBERT model with predefined config presets
"""
import argparse
import logging
import os
from config import get_config
from model import DocBERT
from dataset import load_data, create_data_loaders
from trainer import Trainer

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def main():
    parser = argparse.ArgumentParser(description="Run DocBERT with a predefined config")

    parser.add_argument("--data_path", type=str, required=True, help="Path to the dataset file (CSV or TSV)")
    parser.add_argument("--text_column", type=str, default="text", help="Name of the text column")
    parser.add_argument("--label_column", type=str, default="label", help="Name of the label column")
    parser.add_argument("--num_classes", type=int, required=True, help="Number of classes to predict")
    parser.add_argument("--config", type=str, default="default",
                        choices=["default", "short_text", "long_document", "fine_tuning"],
                        help="Configuration preset to use")
    parser.add_argument("--output_dir", type=str, default="./output", help="Directory to save outputs")

    args = parser.parse_args()

    # Get config
    config_class = get_config(args.config)
    config = config_class()

    logger.info(f"Using '{args.config}' config preset")

    # Create output directory
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load and prepare data
    logger.info("Loading data...")
    train_data, val_data, test_data = load_data(
        args.data_path,
        text_col=args.text_column,
        label_col=args.label_column,
        validation_split=config.val_split,
        test_split=config.test_split,
        seed=config.seed
    )

    train_loader, val_loader, test_loader = create_data_loaders(
        train_data,
        val_data,
        test_data,
        tokenizer_name=config.bert_model,
        max_length=config.max_seq_length,
        batch_size=config.batch_size
    )

    # Initialize model
    logger.info(f"Initializing model with {config.bert_model}...")
    model = DocBERT(
        num_classes=args.num_classes,
        bert_model_name=config.bert_model,
        dropout_prob=config.dropout
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        test_loader=test_loader,
        lr=config.learning_rate,
        weight_decay=config.weight_decay,
        gradient_accumulation_steps=config.grad_accum_steps
    )

    # Train model
    logger.info("Starting training...")
    save_path = os.path.join(args.output_dir, "best_model.pth")
    trainer.train(epochs=config.epochs, save_path=save_path)

    logger.info("Training completed!")

if __name__ == "__main__":
    main()
```
train.py
ADDED
@@ -0,0 +1,129 @@
```python
import argparse
import os
import logging
import torch
import random
import numpy as np
from model import DocBERT
from dataset import load_data, create_data_loaders
from trainer import Trainer

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

def set_seed(seed):
    """Set all seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

def main():
    parser = argparse.ArgumentParser(description="Train a document classification model with BERT")

    # Data arguments
    parser.add_argument("--data_path", type=str, required=True, help="Path to the dataset file (CSV or TSV)")
    parser.add_argument("--text_column", type=str, default="text", help="Name of the text column")
    parser.add_argument("--label_column", type=str, default="label", help="Name of the label column")
    parser.add_argument("--val_split", type=float, default=0.1, help="Validation set split ratio")
    parser.add_argument("--test_split", type=float, default=0.1, help="Test set split ratio")

    # Model arguments
    parser.add_argument("--bert_model", type=str, default="bert-base-uncased",
                        help="BERT model to use (e.g., bert-base-uncased, bert-large-uncased)")
    parser.add_argument("--num_classes", type=int, required=True, help="Number of classes to predict")
    parser.add_argument("--max_length", type=int, default=512, help="Maximum sequence length")
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout probability")

    # Training arguments
    parser.add_argument("--batch_size", type=int, default=16, help="Training batch size")
    parser.add_argument("--learning_rate", type=float, default=2e-5, help="Learning rate")
    parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay for regularization")
    parser.add_argument("--epochs", type=int, default=10, help="Number of training epochs")
    parser.add_argument("--grad_accum_steps", type=int, default=1, help="Gradient accumulation steps")
    parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Proportion of training for LR warmup")

    # Other arguments
    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
    parser.add_argument("--output_dir", type=str, default="./output", help="Directory to save the model and logs")

    args = parser.parse_args()

    # Set seed for reproducibility
    set_seed(args.seed)

    # Create output directory if it doesn't exist
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Log args for debugging
    logger.info(f"Running with arguments: {args}")

    # Load and prepare data
    logger.info("Loading and preparing data...")
    train_data, val_data, test_data = load_data(
        args.data_path,
        text_col=args.text_column,
        label_col=args.label_column,
        validation_split=args.val_split,
        test_split=args.test_split,
        seed=args.seed
    )

    # Create data loaders
    train_loader, val_loader, test_loader = create_data_loaders(
        train_data,
        val_data,
        test_data,
        tokenizer_name=args.bert_model,
        max_length=args.max_length,
        batch_size=args.batch_size
    )

    logger.info(f"Train samples: {len(train_data[0])}, "
                f"Validation samples: {len(val_data[0])}, "
                f"Test samples: {len(test_data[0])}")

    # Initialize model
    logger.info(f"Initializing DocBERT model with {args.bert_model}...")
    model = DocBERT(
        num_classes=args.num_classes,
        bert_model_name=args.bert_model,
        dropout_prob=args.dropout
    )

    # Count and log model parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info(f"Total parameters: {total_params:,}")
    logger.info(f"Trainable parameters: {trainable_params:,}")

    # Initialize trainer
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        test_loader=test_loader,
        lr=args.learning_rate,
        weight_decay=args.weight_decay,
        warmup_proportion=args.warmup_proportion,
        gradient_accumulation_steps=args.grad_accum_steps
    )

    # Train the model
    logger.info("Starting training...")
    save_path = os.path.join(args.output_dir, "best_model.pth")
    trainer.train(epochs=args.epochs, save_path=save_path)

    logger.info("Training completed!")

if __name__ == "__main__":
    main()
```
trainer.py
ADDED
@@ -0,0 +1,211 @@
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
import time
from tqdm import tqdm
import logging
import os

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class Trainer:
    """
    Improved trainer class with techniques from the Hedwig implementation
    to get better performance on document classification tasks
    """
    def __init__(
        self,
        model,
        train_loader,
        val_loader,
        test_loader=None,
        lr=2e-5,
        weight_decay=0.01,
        warmup_proportion=0.1,  # accepted for compatibility; not used below
        gradient_accumulation_steps=1,
        max_grad_norm=1.0,
        device=None
    ):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader

        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info(f"Using device: {self.device}")

        self.model.to(self.device)

        # Total number of training steps
        self.num_training_steps = len(train_loader) * gradient_accumulation_steps

        # Optimizer with weight decay (L2 regularization);
        # bias and LayerNorm weights are excluded from decay
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]

        self.optimizer = optim.AdamW(optimizer_grouped_parameters, lr=lr)

        # Learning rate scheduler
        self.scheduler = ReduceLROnPlateau(self.optimizer, mode='max', factor=0.5, patience=2, verbose=True)

        # Loss function with label smoothing for better generalization
        # (a smoothing factor of 0.1 is assumed here)
        self.criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

        # Training parameters
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.max_grad_norm = max_grad_norm

        # For tracking metrics
        self.best_val_f1 = 0.0
        self.best_model_state = None

    def train(self, epochs, save_path='best_model.pth'):
        """
        Training loop with improved techniques
        """
        logger.info(f"Starting training for {epochs} epochs")

        for epoch in range(epochs):
            start_time = time.time()

            # Training phase
            self.model.train()
            train_loss = 0
            all_predictions = []
            all_labels = []

            # Progress bar for training
            train_iterator = tqdm(self.train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]")
            for i, batch in enumerate(train_iterator):
                # Move batch to device
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                token_type_ids = batch['token_type_ids'].to(self.device)
                labels = batch['label'].to(self.device)

                # Forward pass
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids
                )

                # Calculate loss
                loss = self.criterion(outputs, labels)

                # Scale loss if using gradient accumulation
                if self.gradient_accumulation_steps > 1:
                    loss = loss / self.gradient_accumulation_steps

                # Backward pass
                loss.backward()

                # Update weights if we've accumulated enough gradients
                if (i + 1) % self.gradient_accumulation_steps == 0:
                    # Gradient clipping to prevent exploding gradients
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

                    self.optimizer.step()
                    self.optimizer.zero_grad()

                train_loss += loss.item() * self.gradient_accumulation_steps

                # Get predictions for metrics
                _, preds = torch.max(outputs, dim=1)
                all_predictions.extend(preds.cpu().tolist())
                all_labels.extend(labels.cpu().tolist())

                # Update progress bar with current loss
                train_iterator.set_postfix({'loss': f"{loss.item():.4f}"})

            # Calculate training metrics
            train_loss /= len(self.train_loader)
            train_acc = accuracy_score(all_labels, all_predictions)
            train_f1 = f1_score(all_labels, all_predictions, average='macro')

            # Validation phase
            val_loss, val_acc, val_f1, val_precision, val_recall = self.evaluate(self.val_loader, "Validation")

            # Adjust learning rate based on validation performance
            self.scheduler.step(val_f1)

            # Save best model
            if val_f1 > self.best_val_f1:
                self.best_val_f1 = val_f1
                self.best_model_state = self.model.state_dict().copy()
                torch.save(self.model.state_dict(), save_path)
                logger.info(f"New best model saved with validation F1: {val_f1:.4f}")

            # Print epoch summary
            epoch_time = time.time() - start_time
            logger.info(f"Epoch {epoch+1}/{epochs} - "
                        f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Train F1: {train_f1:.4f}, "
                        f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}, "
                        f"Time: {epoch_time:.2f}s")

        # Load best model for final evaluation
        if self.best_model_state is not None:
            self.model.load_state_dict(self.best_model_state)
            logger.info(f"Loaded best model with validation F1: {self.best_val_f1:.4f}")

        # Test evaluation if test loader provided
        if self.test_loader:
            test_loss, test_acc, test_f1, test_precision, test_recall = self.evaluate(self.test_loader, "Test")
            logger.info(f"Final test results - "
                        f"Loss: {test_loss:.4f}, Acc: {test_acc:.4f}, F1: {test_f1:.4f}, "
                        f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}")

    def evaluate(self, data_loader, phase="Validation"):
        """
        Evaluation function for both validation and test sets
        """
        self.model.eval()
        eval_loss = 0
        all_predictions = []
        all_labels = []

        # No gradient computation during evaluation
        with torch.no_grad():
            # Progress bar for evaluation
            iterator = tqdm(data_loader, desc=f"[{phase}]")
            for batch in iterator:
                # Move batch to device
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                token_type_ids = batch['token_type_ids'].to(self.device)
                labels = batch['label'].to(self.device)

                # Forward pass
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids
                )

                # Calculate loss
                loss = self.criterion(outputs, labels)
                eval_loss += loss.item()

                # Get predictions
                _, preds = torch.max(outputs, dim=1)
                all_predictions.extend(preds.cpu().tolist())
                all_labels.extend(labels.cpu().tolist())

        # Calculate metrics
        eval_loss /= len(data_loader)
        accuracy = accuracy_score(all_labels, all_predictions)
        f1 = f1_score(all_labels, all_predictions, average='macro')
        precision = precision_score(all_labels, all_predictions, average='macro', zero_division=0)
        recall = recall_score(all_labels, all_predictions, average='macro', zero_division=0)

        return eval_loss, accuracy, f1, precision, recall
```
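The trainer above checkpoints the best model to `best_model.pth`, but the commit does not include an inference script. The sketch below shows one way the saved checkpoint could be used for prediction, assuming the same `DocBERT` class and tokenizer settings used during training; the checkpoint path, class count, and input text are placeholders.

```python
import torch
from transformers import BertTokenizer
from model import DocBERT

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# num_classes and the checkpoint path must match the training run (placeholders here).
model = DocBERT(num_classes=2, bert_model_name="bert-base-uncased")
model.load_state_dict(torch.load("output/best_model.pth", map_location=device))
model.to(device)
model.eval()

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
encoding = tokenizer.encode_plus(
    "An example document to classify.",
    add_special_tokens=True, max_length=512, padding="max_length",
    truncation=True, return_token_type_ids=True,
    return_attention_mask=True, return_tensors="pt"
)

with torch.no_grad():
    logits = model(
        input_ids=encoding["input_ids"].to(device),
        attention_mask=encoding["attention_mask"].to(device),
        token_type_ids=encoding["token_type_ids"].to(device),
    )

predicted_class = logits.argmax(dim=1).item()
print(predicted_class)
```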