File size: 5,801 Bytes

decba57

"""
Configuration for HQ document quality classifiers.

Defines language-specific settings, dataset sources, and training hyperparameters
for the FineWeb2-HQ methodology.
"""
from pathlib import Path

# =============================================================================
# Paths
# =============================================================================
# Three nested directories, all derived from this file's location:
#   HQ_DIR   - the directory that contains this file
#   SRC_DIR  - the parent of HQ_DIR
#   BASE_DIR - the parent of SRC_DIR
HQ_DIR = Path(__file__).parent
SRC_DIR = Path(__file__).parent.parent
BASE_DIR = Path(__file__).parent.parent.parent

# =============================================================================
# Available Encoder Models
# =============================================================================
def _encoder_entry(model_name, embedding_dim, description, max_length=512):
    """Assemble one ``ENCODER_MODELS`` value.

    Args:
        model_name: Model identifier stored under ``"model_name"``.
        embedding_dim: Integer stored under ``"embedding_dim"``.
        description: Human-readable summary stored under ``"description"``.
        max_length: Integer stored under ``"max_length"``; every registered
            encoder currently uses the default of 512.

    Returns:
        A new dict whose keys appear in the order ``model_name``,
        ``max_length``, ``embedding_dim``, ``description``.
    """
    return {
        "model_name": model_name,
        "max_length": max_length,
        "embedding_dim": embedding_dim,
        "description": description,
    }


# Registry of selectable encoders, keyed by a short alias.
ENCODER_MODELS = {
    "mmbert-small": _encoder_entry(
        "jhu-clsp/mmBERT-small",
        embedding_dim=384,
        description="mmBERT-small: Modern multilingual encoder (1800+ languages)",
    ),
    "mmbert-base": _encoder_entry(
        "jhu-clsp/mmBERT-base",
        embedding_dim=768,
        description="mmBERT-base: Larger multilingual encoder (1800+ languages)",
    ),
    "xlm-roberta-base": _encoder_entry(
        "FacebookAI/xlm-roberta-base",
        embedding_dim=768,
        description="XLM-RoBERTa-base: Classic multilingual encoder (100 languages)",
    ),
    "xlm-roberta-large": _encoder_entry(
        "FacebookAI/xlm-roberta-large",
        embedding_dim=1024,
        description="XLM-RoBERTa-large: Larger classic multilingual encoder",
    ),
}

# Alias (a key of ENCODER_MODELS) that is selected by default.
DEFAULT_ENCODER = "mmbert-small"

# =============================================================================
# Embedding Model Configuration (default)
# =============================================================================
# Shallow copy of the default encoder's entry, so assigning keys on
# EMBEDDING_CONFIG does not modify the dict stored in ENCODER_MODELS.
EMBEDDING_CONFIG = dict(ENCODER_MODELS[DEFAULT_ENCODER])

# =============================================================================
# Classifier Training Configuration
# =============================================================================
# Flat mapping of classifier training hyperparameters. Nothing in this module
# reads these values; they are defined here for other modules to use.
TRAINING_CONFIG = {
    "epochs": 6,
    "learning_rate": 3e-4,
    "batch_size": 256,
    "hidden_dim": 256,
    "dropout": 0.2,
    # NOTE(review): presumably the batch size for the embedding pass, kept
    # separate from "batch_size" above - confirm against the consumer.
    "embedding_batch_size": 32,
}

# =============================================================================
# Language-Specific Configuration
# =============================================================================
def _mcq_source(dataset_id, subset):
    """Describe a multiple-choice dataset read from its ``"test"`` split.

    ``text_field`` is ``None`` for these entries (original note: "Use
    formatter").
    """
    return {
        "dataset_id": dataset_id,
        "subset": subset,
        "split": "test",
        "format_type": "mcq",
        "text_field": None,
    }


def _aya_source(language):
    """Describe the aya instruction dataset restricted to *language*.

    *language* is stored under ``"language_filter"``; the entry has no subset
    and uses the ``"train"`` split.
    """
    return {
        "dataset_id": "CohereForAI/aya_dataset",
        "subset": None,
        "split": "train",
        "format_type": "instruction",
        "text_field": None,
        "language_filter": language,
    }


def _culturax_source(subset):
    """Describe the CulturaX ``"train"`` split for the given *subset* code."""
    return {
        "dataset_id": "uonlp/CulturaX",
        "subset": subset,
        "split": "train",
        "text_field": "text",
    }


# One entry per language. Each entry carries a display name, the localized
# answer label, a list of positive dataset descriptors and a single negative
# source descriptor. Every helper call returns a fresh dict, so no descriptor
# object is shared between languages.
LANGUAGE_CONFIG = {
    "ara_Arab": {
        "name": "Arabic",
        "answer_label": "الإجابة:",
        "positive_datasets": [
            _mcq_source("MBZUAI/ArabicMMLU", "All"),
            _mcq_source("openai/MMMLU", "AR_XY"),
            _aya_source("Arabic"),
        ],
        "negative_source": _culturax_source("ar"),
    },
    "hin_Deva": {
        "name": "Hindi",
        "answer_label": "उत्तर:",
        "positive_datasets": [
            _mcq_source("openai/MMMLU", "HI_IN"),
            _aya_source("Hindi"),
        ],
        "negative_source": _culturax_source("hi"),
    },
    "tur_Latn": {
        "name": "Turkish",
        "answer_label": "Cevap:",
        "positive_datasets": [
            _mcq_source("AYueksel/TurkishMMLU", "All"),
            # Note: openai/MMMLU does not have Turkish
            _aya_source("Turkish"),
        ],
        "negative_source": _culturax_source("tr"),
    },
}

# =============================================================================
# Supported Languages
# =============================================================================
# Language keys in the order they are declared in LANGUAGE_CONFIG.
SUPPORTED_LANGUAGES = list(LANGUAGE_CONFIG)

# =============================================================================
# Default Sampling Configuration
# =============================================================================
# Default caps, length threshold, split ratios and seed for sampling. Nothing
# in this module reads these values.
SAMPLING_CONFIG = {
    # Positive and negative sample caps are equal.
    "max_positive_samples": 80_000,
    "max_negative_samples": 80_000,
    "min_text_length": 50,
    # The three split ratios add up to 1 (0.8 + 0.1 + 0.1).
    "train_ratio": 0.8,
    "valid_ratio": 0.1,
    "test_ratio": 0.1,
    "random_seed": 42,
}