"""
Configuration for HQ document quality classifiers.

Defines language-specific settings, dataset sources, and training hyperparameters
for the FineWeb2-HQ methodology.
"""
|
|
from pathlib import Path


# Directory anchors derived from this file's location. Each constant is the
# parent of the one before it, so relocating this module moves all three.
HQ_DIR = Path(__file__).parent  # directory that contains this module
SRC_DIR = HQ_DIR.parent  # one level above HQ_DIR
BASE_DIR = SRC_DIR.parent  # one level above SRC_DIR
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Registry of the encoder models this module knows about, keyed by a short alias.
#
# Every entry carries the same four fields:
#   model_name    - model identifier string (looks like a Hugging Face Hub
#                   repo id -- confirm against the loading code)
#   max_length    - presumably the maximum input sequence length in tokens
#   embedding_dim - presumably the size of the embedding vector the encoder emits
#   description   - human-readable summary of the model
ENCODER_MODELS = {
    "mmbert-small": {
        "model_name": "jhu-clsp/mmBERT-small",
        "max_length": 512,
        "embedding_dim": 384,
        "description": "mmBERT-small: Modern multilingual encoder (1800+ languages)",
    },
    "mmbert-base": {
        "model_name": "jhu-clsp/mmBERT-base",
        "max_length": 512,
        "embedding_dim": 768,
        "description": "mmBERT-base: Larger multilingual encoder (1800+ languages)",
    },
    "xlm-roberta-base": {
        "model_name": "FacebookAI/xlm-roberta-base",
        "max_length": 512,
        "embedding_dim": 768,
        "description": "XLM-RoBERTa-base: Classic multilingual encoder (100 languages)",
    },
    "xlm-roberta-large": {
        "model_name": "FacebookAI/xlm-roberta-large",
        "max_length": 512,
        "embedding_dim": 1024,
        "description": "XLM-RoBERTa-large: Larger classic multilingual encoder",
    },
}
|
|
|
|
|
|
|
|
# Alias of the encoder used by default; must be a key of ENCODER_MODELS.
DEFAULT_ENCODER = "mmbert-small"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Settings of the default encoder. A copy is taken so that adding or replacing
# keys on EMBEDDING_CONFIG does not modify the ENCODER_MODELS entry itself.
EMBEDDING_CONFIG = ENCODER_MODELS[DEFAULT_ENCODER].copy()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Training hyperparameters. How each value is consumed is not visible in this
# module; the per-key notes are inferred from the key names -- confirm against
# the training code.
TRAINING_CONFIG = {
    "epochs": 6,
    "learning_rate": 3e-4,
    "batch_size": 256,
    "hidden_dim": 256,  # presumably the hidden-layer width of the classifier
    "dropout": 0.2,
    "embedding_batch_size": 32,  # presumably the batch size used when computing embeddings
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _mcq_source(dataset_id, subset):
    """Return a positive-dataset entry in "mcq" format read from the "test" split."""
    return {
        "dataset_id": dataset_id,
        "subset": subset,
        "split": "test",
        "format_type": "mcq",
        "text_field": None,
    }


def _aya_source(language):
    """Return the CohereForAI/aya_dataset instruction entry filtered to *language*."""
    return {
        "dataset_id": "CohereForAI/aya_dataset",
        "subset": None,
        "split": "train",
        "format_type": "instruction",
        "text_field": None,
        "language_filter": language,
    }


def _culturax_source(subset):
    """Return the uonlp/CulturaX negative-source entry for the given subset code."""
    return {
        "dataset_id": "uonlp/CulturaX",
        "subset": subset,
        "split": "train",
        "text_field": "text",
    }


# Per-language settings, keyed by language code (the keys look like
# "<ISO 639-3>_<script>" -- confirm against the caller's convention).
#
# Every entry provides:
#   name              - English name of the language
#   answer_label      - the "Answer:" label written in that language
#   positive_datasets - list of dataset entries used as positive sources
#   negative_source   - single dataset entry used as the negative source
#
# The helpers above return a fresh dict on every call, so no entry shares a
# mutable object with another.
LANGUAGE_CONFIG = {
    "ara_Arab": {
        "name": "Arabic",
        "answer_label": "الإجابة:",
        "positive_datasets": [
            _mcq_source("MBZUAI/ArabicMMLU", "All"),
            _mcq_source("openai/MMMLU", "AR_XY"),
            _aya_source("Arabic"),
        ],
        "negative_source": _culturax_source("ar"),
    },
    "hin_Deva": {
        "name": "Hindi",
        "answer_label": "उत्तर:",
        "positive_datasets": [
            _mcq_source("openai/MMMLU", "HI_IN"),
            _aya_source("Hindi"),
        ],
        "negative_source": _culturax_source("hi"),
    },
    "tur_Latn": {
        "name": "Turkish",
        "answer_label": "Cevap:",
        "positive_datasets": [
            _mcq_source("AYueksel/TurkishMMLU", "All"),
            _aya_source("Turkish"),
        ],
        "negative_source": _culturax_source("tr"),
    },
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Language codes that have an entry in LANGUAGE_CONFIG, in definition order.
SUPPORTED_LANGUAGES = list(LANGUAGE_CONFIG)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Dataset sampling and splitting settings. How each value is consumed is not
# visible in this module; the per-key notes are inferred from the key names --
# confirm against the data-preparation code.
SAMPLING_CONFIG = {
    "max_positive_samples": 80_000,
    "max_negative_samples": 80_000,
    "min_text_length": 50,  # unit (characters vs. tokens) is not shown here
    # The three split ratios below add up to 1.0.
    "train_ratio": 0.8,
    "valid_ratio": 0.1,
    "test_ratio": 0.1,
    "random_seed": 42,
}
|
|
|