Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

.gitattributes +1 -0
config.json +21 -0
config.py +177 -0
merged_model.py +352 -0
model.safetensors +3 -0
special_tokens_map.json +15 -0
tokenizer.json +3 -0
tokenizer_config.json +55 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "architectures": [
+    "QualityClassifierModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "merged_model.QualityClassifierConfig",
+    "AutoModel": "merged_model.QualityClassifierModel"
+  },
+  "base_model_name": "FacebookAI/xlm-roberta-base",
+  "dropout": 0.2,
+  "dtype": "float32",
+  "hidden_dim": 256,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "model_type": "quality_classifier",
+  "transformers_version": "4.57.3"
+}

config.py ADDED Viewed

	@@ -0,0 +1,177 @@

+"""
+Configuration for HQ document quality classifiers.
+Defines language-specific settings, dataset sources, and training hyperparameters
+for the FineWeb2-HQ methodology.
+"""
+from pathlib import Path
+# =============================================================================
+# Paths
+# =============================================================================
+HQ_DIR = Path(__file__).parent
+SRC_DIR = HQ_DIR.parent
+BASE_DIR = SRC_DIR.parent
+# =============================================================================
+# Available Encoder Models
+# =============================================================================
+ENCODER_MODELS = {
+    "mmbert-small": {
+        "model_name": "jhu-clsp/mmBERT-small",
+        "max_length": 512,
+        "embedding_dim": 384,
+        "description": "mmBERT-small: Modern multilingual encoder (1800+ languages)",
+    },
+    "mmbert-base": {
+        "model_name": "jhu-clsp/mmBERT-base",
+        "max_length": 512,
+        "embedding_dim": 768,
+        "description": "mmBERT-base: Larger multilingual encoder (1800+ languages)",
+    },
+    "xlm-roberta-base": {
+        "model_name": "FacebookAI/xlm-roberta-base",
+        "max_length": 512,
+        "embedding_dim": 768,
+        "description": "XLM-RoBERTa-base: Classic multilingual encoder (100 languages)",
+    },
+    "xlm-roberta-large": {
+        "model_name": "FacebookAI/xlm-roberta-large",
+        "max_length": 512,
+        "embedding_dim": 1024,
+        "description": "XLM-RoBERTa-large: Larger classic multilingual encoder",
+    },
+}
+# Default encoder
+DEFAULT_ENCODER = "mmbert-small"
+# =============================================================================
+# Embedding Model Configuration (default)
+# =============================================================================
+EMBEDDING_CONFIG = ENCODER_MODELS[DEFAULT_ENCODER].copy()
+# =============================================================================
+# Classifier Training Configuration
+# =============================================================================
+TRAINING_CONFIG = {
+    "epochs": 6,
+    "learning_rate": 0.0003,
+    "batch_size": 256,
+    "hidden_dim": 256,
+    "dropout": 0.2,
+    "embedding_batch_size": 32,
+}
+# =============================================================================
+# Language-Specific Configuration
+# =============================================================================
+LANGUAGE_CONFIG = {
+    "ara_Arab": {
+        "name": "Arabic",
+        "answer_label": "الإجابة:",
+        "positive_datasets": [
+            {
+                "dataset_id": "MBZUAI/ArabicMMLU",
+                "subset": "All",
+                "split": "test",
+                "format_type": "mcq",
+                "text_field": None,  # Use formatter
+            },
+            {
+                "dataset_id": "openai/MMMLU",
+                "subset": "AR_XY",
+                "split": "test",
+                "format_type": "mcq",
+                "text_field": None,
+            },
+            {
+                "dataset_id": "CohereForAI/aya_dataset",
+                "subset": None,
+                "split": "train",
+                "format_type": "instruction",
+                "text_field": None,
+                "language_filter": "Arabic",
+            },
+        ],
+        "negative_source": {
+            "dataset_id": "uonlp/CulturaX",
+            "subset": "ar",
+            "split": "train",
+            "text_field": "text",
+        },
+    },
+    "hin_Deva": {
+        "name": "Hindi",
+        "answer_label": "उत्तर:",
+        "positive_datasets": [
+            {
+                "dataset_id": "openai/MMMLU",
+                "subset": "HI_IN",
+                "split": "test",
+                "format_type": "mcq",
+                "text_field": None,
+            },
+            {
+                "dataset_id": "CohereForAI/aya_dataset",
+                "subset": None,
+                "split": "train",
+                "format_type": "instruction",
+                "text_field": None,
+                "language_filter": "Hindi",
+            },
+        ],
+        "negative_source": {
+            "dataset_id": "uonlp/CulturaX",
+            "subset": "hi",
+            "split": "train",
+            "text_field": "text",
+        },
+    },
+    "tur_Latn": {
+        "name": "Turkish",
+        "answer_label": "Cevap:",
+        "positive_datasets": [
+            {
+                "dataset_id": "AYueksel/TurkishMMLU",
+                "subset": "All",
+                "split": "test",
+                "format_type": "mcq",
+                "text_field": None,
+            },
+            # Note: openai/MMMLU does not have Turkish
+            {
+                "dataset_id": "CohereForAI/aya_dataset",
+                "subset": None,
+                "split": "train",
+                "format_type": "instruction",
+                "text_field": None,
+                "language_filter": "Turkish",
+            },
+        ],
+        "negative_source": {
+            "dataset_id": "uonlp/CulturaX",
+            "subset": "tr",
+            "split": "train",
+            "text_field": "text",
+        },
+    },
+}
+# =============================================================================
+# Supported Languages
+# =============================================================================
+SUPPORTED_LANGUAGES = list(LANGUAGE_CONFIG.keys())
+# =============================================================================
+# Default Sampling Configuration
+# =============================================================================
+SAMPLING_CONFIG = {
+    "max_positive_samples": 80000,
+    "max_negative_samples": 80000,
+    "min_text_length": 50,
+    "train_ratio": 0.8,
+    "valid_ratio": 0.1,
+    "test_ratio": 0.1,
+    "random_seed": 42,
+}

merged_model.py ADDED Viewed

	@@ -0,0 +1,352 @@

+"""
+Unified HuggingFace-compatible quality classifier model.
+Merges mmBERT encoder with trained MLP classifier head into a single
+PreTrainedModel that can be saved/loaded using standard HuggingFace methods
+and used with vLLM for efficient inference.
+Example:
+    # Merge trained classifier into unified model
+    from src.hq.merged_model import merge_and_save
+    merge_and_save(
+        base_model_name="jhu-clsp/mmBERT-small",
+        classifier_weights_path="./output/models/ara_Arab.pt",
+        output_dir="./release/arabic-quality-classifier"
+    )
+    # Load and use
+    model = QualityClassifierModel.from_pretrained("./release/arabic-quality-classifier")
+    tokenizer = AutoTokenizer.from_pretrained("./release/arabic-quality-classifier")
+"""
+import os
+from pathlib import Path
+from typing import Optional, Union
+import torch
+import torch.nn as nn
+from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PretrainedConfig
+from transformers.modeling_outputs import SequenceClassifierOutput
+from .config import EMBEDDING_CONFIG, TRAINING_CONFIG
+class QualityClassifierConfig(PretrainedConfig):
+    """Configuration for the unified quality classifier model."""
+    model_type = "quality_classifier"
+    def __init__(
+        self,
+        base_model_name: str = None,
+        hidden_dim: int = None,
+        dropout: float = None,
+        num_labels: int = 1,
+        **kwargs
+    ):
+        """
+        Initialize configuration.
+        Args:
+            base_model_name: HuggingFace model ID for the encoder
+            hidden_dim: Hidden dimension of the MLP classifier
+            dropout: Dropout probability
+            num_labels: Number of output labels (1 for binary)
+        """
+        super().__init__(**kwargs)
+        self.base_model_name = base_model_name or EMBEDDING_CONFIG["model_name"]
+        self.hidden_dim = hidden_dim or TRAINING_CONFIG["hidden_dim"]
+        self.dropout = dropout or TRAINING_CONFIG["dropout"]
+        self.num_labels = num_labels
+class QualityClassifierModel(PreTrainedModel):
+    """
+    Unified quality classifier combining mmBERT encoder with MLP head.
+    This model can be saved and loaded using standard HuggingFace methods:
+        model.save_pretrained("path/to/model")
+        model = QualityClassifierModel.from_pretrained("path/to/model")
+    It can also be used with vLLM for efficient inference since mmBERT
+    is supported.
+    Architecture:
+        - Encoder: mmBERT (small or base)
+        - Pooling: Mean pooling over sequence
+        - Classifier: Linear(768->256) -> ReLU -> Dropout(0.2) -> Linear(256->1) -> Sigmoid
+    """
+    config_class = QualityClassifierConfig
+    def __init__(self, config: QualityClassifierConfig):
+        """
+        Initialize the unified model.
+        Args:
+            config: QualityClassifierConfig instance
+        """
+        super().__init__(config)
+        # Load base encoder with eager attention to avoid flash_attn issues
+        self.encoder = AutoModel.from_pretrained(
+            config.base_model_name,
+            attn_implementation="eager",
+        )
+        hidden_size = self.encoder.config.hidden_size
+        # Classification head (matches standalone training architecture)
+        self.classifier = nn.Sequential(
+            nn.Linear(hidden_size, config.hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(config.dropout),
+            nn.Linear(config.hidden_dim, config.num_labels),
+            nn.Sigmoid()
+        )
+        self.post_init()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> SequenceClassifierOutput:
+        """
+        Forward pass with optional loss computation.
+        Args:
+            input_ids: Token IDs of shape (batch_size, seq_length)
+            attention_mask: Attention mask of shape (batch_size, seq_length)
+            token_type_ids: Token type IDs (unused for mmBERT)
+            labels: Ground truth labels for loss computation
+            return_dict: Whether to return a SequenceClassifierOutput
+        Returns:
+            SequenceClassifierOutput with loss, logits, and hidden states
+        """
+        # Encode
+        outputs = self.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+        )
+        # Mean pooling
+        token_embeddings = outputs.last_hidden_state
+        if attention_mask is not None:
+            mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+            sum_embeddings = torch.sum(token_embeddings * mask_expanded, dim=1)
+            sum_mask = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)
+            pooled = sum_embeddings / sum_mask
+        else:
+            pooled = token_embeddings.mean(dim=1)
+        # Classify
+        logits = self.classifier(pooled)
+        # Compute loss if labels provided
+        loss = None
+        if labels is not None:
+            loss_fn = nn.BCELoss()
+            loss = loss_fn(logits.squeeze(), labels.float())
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def predict(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Convenience method for inference.
+        Args:
+            input_ids: Token IDs
+            attention_mask: Attention mask
+        Returns:
+            Quality scores in range [0, 1]
+        """
+        self.eval()
+        with torch.no_grad():
+            outputs = self.forward(input_ids=input_ids, attention_mask=attention_mask)
+            return outputs.logits.squeeze()
+    def score_texts(
+        self,
+        texts: list,
+        tokenizer: AutoTokenizer,
+        batch_size: int = 32,
+        max_length: int = 512,
+        device: str = None,
+    ) -> list:
+        """
+        Score a list of texts.
+        Args:
+            texts: List of text strings to score
+            tokenizer: Tokenizer for the model
+            batch_size: Batch size for processing
+            max_length: Maximum sequence length
+            device: Device to use for inference
+        Returns:
+            List of quality scores in range [0, 1]
+        """
+        device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.to(device)
+        self.eval()
+        scores = []
+        for i in range(0, len(texts), batch_size):
+            batch = texts[i:i + batch_size]
+            inputs = tokenizer(
+                batch,
+                return_tensors="pt",
+                max_length=max_length,
+                truncation=True,
+                padding=True,
+            ).to(device)
+            with torch.no_grad():
+                outputs = self.forward(**inputs)
+                batch_scores = outputs.logits.squeeze().cpu().tolist()
+            # Handle single item case
+            if isinstance(batch_scores, float):
+                batch_scores = [batch_scores]
+            scores.extend(batch_scores)
+        return scores
+def merge_and_save(
+    base_model_name: str,
+    classifier_weights_path: Union[str, Path],
+    output_dir: Union[str, Path],
+    hidden_dim: int = None,
+    dropout: float = None,
+) -> QualityClassifierModel:
+    """
+    Merge encoder and trained classifier head, then save as unified model.
+    The resulting model can be loaded with:
+        model = QualityClassifierModel.from_pretrained(output_dir)
+    Args:
+        base_model_name: HuggingFace model ID for the encoder
+        classifier_weights_path: Path to trained MLP weights (.pt file)
+        output_dir: Directory to save the merged model
+        hidden_dim: Hidden dimension of the MLP (must match training)
+        dropout: Dropout rate (must match training)
+    Returns:
+        The merged QualityClassifierModel
+    """
+    hidden_dim = hidden_dim or TRAINING_CONFIG["hidden_dim"]
+    dropout = dropout or TRAINING_CONFIG["dropout"]
+    output_dir = Path(output_dir)
+    print(f"Merging model...")
+    print(f"  Encoder: {base_model_name}")
+    print(f"  Classifier: {classifier_weights_path}")
+    # Create config
+    config = QualityClassifierConfig(
+        base_model_name=base_model_name,
+        hidden_dim=hidden_dim,
+        dropout=dropout,
+        num_labels=1
+    )
+    # Initialize model (loads encoder from HuggingFace)
+    model = QualityClassifierModel(config)
+    # Load trained classifier weights
+    checkpoint = torch.load(classifier_weights_path, map_location="cpu")
+    # Handle both new format (dict with state_dict) and old format (just state_dict)
+    if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
+        trained_weights = checkpoint["state_dict"]
+    else:
+        trained_weights = checkpoint
+    # Map weights from standalone MLP to integrated classifier
+    # The standalone model saves with "classifier." prefix, strip it
+    stripped_weights = {}
+    for key, value in trained_weights.items():
+        new_key = key.replace("classifier.", "") if key.startswith("classifier.") else key
+        stripped_weights[new_key] = value
+    model.classifier.load_state_dict(stripped_weights)
+    # Save everything
+    output_dir.mkdir(parents=True, exist_ok=True)
+    model.save_pretrained(output_dir)
+    # Also save tokenizer for convenience
+    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
+    tokenizer.save_pretrained(output_dir)
+    print(f"Model saved to {output_dir}")
+    print(f"Contents: {list(output_dir.iterdir())}")
+    return model
+def merge_all_classifiers(
+    models_dir: Union[str, Path],
+    output_base_dir: Union[str, Path],
+    base_model_name: str = None,
+) -> dict:
+    """
+    Merge all trained classifiers into unified models.
+    Args:
+        models_dir: Directory containing trained .pt files
+        output_base_dir: Base directory for output models
+        base_model_name: HuggingFace model ID for the encoder
+    Returns:
+        Dictionary mapping language codes to output directories
+    """
+    base_model_name = base_model_name or EMBEDDING_CONFIG["model_name"]
+    models_dir = Path(models_dir)
+    output_base_dir = Path(output_base_dir)
+    results = {}
+    for pt_file in models_dir.glob("*.pt"):
+        lang_code = pt_file.stem  # e.g., "ara_Arab"
+        output_dir = output_base_dir / f"{lang_code}-quality-classifier"
+        print(f"\n{'=' * 50}")
+        print(f"Processing: {lang_code}")
+        print(f"{'=' * 50}")
+        merge_and_save(
+            base_model_name=base_model_name,
+            classifier_weights_path=pt_file,
+            output_dir=output_dir,
+        )
+        results[lang_code] = str(output_dir)
+    return results
+# Register both custom classes with the Auto* machinery at import time.
+# The config is registered with the method's default auto class and the model
+# under "AutoModel", so a saved checkpoint can be loaded with
+# AutoModel.from_pretrained("path").
+# NOTE(review): loading custom code this way presumably requires
+# trust_remote_code=True - confirm against the Transformers documentation.
+QualityClassifierConfig.register_for_auto_class()
+QualityClassifierModel.register_for_auto_class("AutoModel")

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e27f1fb9c9a19e2fdd4eb046a3f99e349bfa0ac7f43f22c1e6968be2898cd96c
+size 1112987508

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a56def25aa40facc030ea8b0b87f3688e4b3c39eb8b45d5702b3a1300fe2a20
+size 17082734

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}