Update train script to filter calamity messages

Browse files

Files changed (3) hide show

README.md +4 -1
train/rd_dataset_loader.py +14 -6
train/train_mmbert_dual_soft_f1_simplified.py +2 -2

README.md CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 language:
 - nl
-license: other
 library_name: transformers
 pipeline_tag: text-classification
 base_model: bert-base-multilingual-cased
@@ -190,3 +190,6 @@ print("Beleving:",  topk(be_probs, labels["beleving"]))
 ## Acknowledgements
 - UWV WIM synthetic RD dataset
 - Hugging Face Transformers/Datasets

 ---
 language:
 - nl
+license: apache-2.0
 library_name: transformers
 pipeline_tag: text-classification
 base_model: bert-base-multilingual-cased
 ## Acknowledgements
 - UWV WIM synthetic RD dataset
 - Hugging Face Transformers/Datasets
+## License
+This model is licensed under the Apache License 2.0. See `LICENSE` for details.

train/rd_dataset_loader.py CHANGED Viewed

@@ -8,7 +8,7 @@ import numpy as np
 from datasets import load_dataset
-def load_rd_wim_dataset(max_samples=None, split='train'):
     """
     Load UWV/wim-synthetic-data-rd dataset and encode multi-labels.
@@ -17,21 +17,29 @@ def load_rd_wim_dataset(max_samples=None, split='train'):
     - beleving: How the citizen experienced the interaction (26 unique labels)
     Args:
-        max_samples: Limit number of samples (None = all 9,351 samples)
         split: Dataset split to load (default: 'train')
     Returns:
         texts: List of conversation strings
-        onderwerp_encoded: numpy array [n_samples, 96] - multi-hot encoded topics
-        beleving_encoded: numpy array [n_samples, 26] - multi-hot encoded experiences
-        onderwerp_labels: List of 96 onderwerp label names (sorted alphabetically)
-        beleving_labels: List of 26 beleving label names (sorted alphabetically)
     """
     # Load dataset from HuggingFace
     print(f"Loading UWV/wim-synthetic-data-rd dataset (split={split})...")
     ds = load_dataset('UWV/wim-synthetic-data-rd', split=split)
     # Limit samples if requested
     if max_samples is not None:
         ds = ds.select(range(min(max_samples, len(ds))))

 from datasets import load_dataset
+def load_rd_wim_dataset(max_samples=None, split='train', filter_calamity=True):
     """
     Load UWV/wim-synthetic-data-rd dataset and encode multi-labels.
     - beleving: How the citizen experienced the interaction (26 unique labels)
     Args:
+        max_samples: Limit number of samples, applied after calamity filtering (None = all remaining samples)
         split: Dataset split to load (default: 'train')
+        filter_calamity: If True, exclude samples with is_calamity=True (default: True)
     Returns:
         texts: List of conversation strings
+        onderwerp_encoded: numpy array [n_samples, n_onderwerp] - multi-hot encoded topics
+        beleving_encoded: numpy array [n_samples, n_beleving] - multi-hot encoded experiences
+        onderwerp_labels: List of onderwerp label names (sorted alphabetically)
+        beleving_labels: List of beleving label names (sorted alphabetically)
     """
     # Load dataset from HuggingFace
     print(f"Loading UWV/wim-synthetic-data-rd dataset (split={split})...")
     ds = load_dataset('UWV/wim-synthetic-data-rd', split=split)
+    # Filter out calamity samples if requested
+    if filter_calamity:
+        original_len = len(ds)
+        ds = ds.filter(lambda x: not x['is_calamity'])
+        filtered_len = len(ds)
+        print(f"Filtered out {original_len - filtered_len} calamity samples ({filtered_len} remaining)")
     # Limit samples if requested
     if max_samples is not None:
         ds = ds.select(range(min(max_samples, len(ds))))

train/train_mmbert_dual_soft_f1_simplified.py CHANGED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
 Dual-head multi-label PyTorch training script for mmBERT-base.
-Two classification heads: onderwerp (96 labels) and beleving (26 labels).
 Uses combined F1+BCE loss with weight α (configurable balance).
 Features: learnable thresholds, warmup + cosine LR, gradient clipping.
 mmBERT: Modern multilingual encoder (1800+ languages, 2x faster than XLM-R).
@@ -702,7 +702,7 @@ def main():
     set_seed(cfg.seed)
     # Load RD dataset
-    print("\nLoading FULL RD dataset (9,351 samples)...")
     texts, onderwerp, beleving, onderwerp_names, beleving_names = load_rd_wim_dataset(
         max_samples=None  # Using full dataset for better training
     )

 #!/usr/bin/env python3
 """
 Dual-head multi-label PyTorch training script for mmBERT-base.
+Two classification heads: onderwerp (topic) and beleving (experience) with dynamic label counts.
 Uses combined F1+BCE loss with weight α (configurable balance).
 Features: learnable thresholds, warmup + cosine LR, gradient clipping.
 mmBERT: Modern multilingual encoder (1800+ languages, 2x faster than XLM-R).
     set_seed(cfg.seed)
     # Load RD dataset
+    print("\nLoading RD dataset...")
     texts, onderwerp, beleving, onderwerp_names, beleving_names = load_rd_wim_dataset(
         max_samples=None  # No sample cap; calamity samples are still filtered out by default
     )