""" Configuration for HQ document quality classifiers. Defines language-specific settings, dataset sources, and training hyperparameters for the FineWeb2-HQ methodology. """ from pathlib import Path # ============================================================================= # Paths # ============================================================================= HQ_DIR = Path(__file__).parent SRC_DIR = HQ_DIR.parent BASE_DIR = SRC_DIR.parent # ============================================================================= # Available Encoder Models # ============================================================================= ENCODER_MODELS = { "mmbert-small": { "model_name": "jhu-clsp/mmBERT-small", "max_length": 512, "embedding_dim": 384, "description": "mmBERT-small: Modern multilingual encoder (1800+ languages)", }, "mmbert-base": { "model_name": "jhu-clsp/mmBERT-base", "max_length": 512, "embedding_dim": 768, "description": "mmBERT-base: Larger multilingual encoder (1800+ languages)", }, "xlm-roberta-base": { "model_name": "FacebookAI/xlm-roberta-base", "max_length": 512, "embedding_dim": 768, "description": "XLM-RoBERTa-base: Classic multilingual encoder (100 languages)", }, "xlm-roberta-large": { "model_name": "FacebookAI/xlm-roberta-large", "max_length": 512, "embedding_dim": 1024, "description": "XLM-RoBERTa-large: Larger classic multilingual encoder", }, } # Default encoder DEFAULT_ENCODER = "mmbert-small" # ============================================================================= # Embedding Model Configuration (default) # ============================================================================= EMBEDDING_CONFIG = ENCODER_MODELS[DEFAULT_ENCODER].copy() # ============================================================================= # Classifier Training Configuration # ============================================================================= TRAINING_CONFIG = { "epochs": 6, "learning_rate": 0.0003, "batch_size": 256, "hidden_dim": 256, "dropout": 0.2, "embedding_batch_size": 32, } # ============================================================================= # Language-Specific Configuration # ============================================================================= LANGUAGE_CONFIG = { "ara_Arab": { "name": "Arabic", "answer_label": "الإجابة:", "positive_datasets": [ { "dataset_id": "MBZUAI/ArabicMMLU", "subset": "All", "split": "test", "format_type": "mcq", "text_field": None, # Use formatter }, { "dataset_id": "openai/MMMLU", "subset": "AR_XY", "split": "test", "format_type": "mcq", "text_field": None, }, { "dataset_id": "CohereForAI/aya_dataset", "subset": None, "split": "train", "format_type": "instruction", "text_field": None, "language_filter": "Arabic", }, ], "negative_source": { "dataset_id": "uonlp/CulturaX", "subset": "ar", "split": "train", "text_field": "text", }, }, "hin_Deva": { "name": "Hindi", "answer_label": "उत्तर:", "positive_datasets": [ { "dataset_id": "openai/MMMLU", "subset": "HI_IN", "split": "test", "format_type": "mcq", "text_field": None, }, { "dataset_id": "CohereForAI/aya_dataset", "subset": None, "split": "train", "format_type": "instruction", "text_field": None, "language_filter": "Hindi", }, ], "negative_source": { "dataset_id": "uonlp/CulturaX", "subset": "hi", "split": "train", "text_field": "text", }, }, "tur_Latn": { "name": "Turkish", "answer_label": "Cevap:", "positive_datasets": [ { "dataset_id": "AYueksel/TurkishMMLU", "subset": "All", "split": "test", "format_type": "mcq", "text_field": None, }, # Note: openai/MMMLU does not have Turkish { "dataset_id": "CohereForAI/aya_dataset", "subset": None, "split": "train", "format_type": "instruction", "text_field": None, "language_filter": "Turkish", }, ], "negative_source": { "dataset_id": "uonlp/CulturaX", "subset": "tr", "split": "train", "text_field": "text", }, }, } # ============================================================================= # Supported Languages # ============================================================================= SUPPORTED_LANGUAGES = list(LANGUAGE_CONFIG.keys()) # ============================================================================= # Default Sampling Configuration # ============================================================================= SAMPLING_CONFIG = { "max_positive_samples": 80000, "max_negative_samples": 80000, "min_text_length": 50, "train_ratio": 0.8, "valid_ratio": 0.1, "test_ratio": 0.1, "random_seed": 42, }