# NOTE(review): the three lines below are Hugging Face web-page scrape residue
# (uploader name, upload banner, commit hash) — not Python. Commented out so
# the module can actually be imported.
# SultanR's picture
# Upload folder using huggingface_hub
# decba57 verified
"""
Configuration for HQ document quality classifiers.
Defines language-specific settings, dataset sources, and training hyperparameters
for the FineWeb2-HQ methodology.
"""
from pathlib import Path
# =============================================================================
# Paths
# =============================================================================
# Directory layout (innermost to outermost): HQ_DIR -> SRC_DIR -> BASE_DIR.
_THIS_FILE = Path(__file__)
HQ_DIR = _THIS_FILE.parent       # directory containing this config module
SRC_DIR = _THIS_FILE.parents[1]  # source root, one level up
BASE_DIR = _THIS_FILE.parents[2] # project root, two levels up
# =============================================================================
# Available Encoder Models
# =============================================================================
def _encoder_spec(model_name, embedding_dim, description, max_length=512):
    """Build one encoder-registry entry; all current encoders use max_length=512."""
    return {
        "model_name": model_name,
        "max_length": max_length,
        "embedding_dim": embedding_dim,
        "description": description,
    }


# Registry of selectable multilingual encoders, keyed by short alias.
ENCODER_MODELS = {
    "mmbert-small": _encoder_spec(
        "jhu-clsp/mmBERT-small",
        384,
        "mmBERT-small: Modern multilingual encoder (1800+ languages)",
    ),
    "mmbert-base": _encoder_spec(
        "jhu-clsp/mmBERT-base",
        768,
        "mmBERT-base: Larger multilingual encoder (1800+ languages)",
    ),
    "xlm-roberta-base": _encoder_spec(
        "FacebookAI/xlm-roberta-base",
        768,
        "XLM-RoBERTa-base: Classic multilingual encoder (100 languages)",
    ),
    "xlm-roberta-large": _encoder_spec(
        "FacebookAI/xlm-roberta-large",
        1024,
        "XLM-RoBERTa-large: Larger classic multilingual encoder",
    ),
}

# Encoder used when the caller does not pick one explicitly.
DEFAULT_ENCODER = "mmbert-small"

# =============================================================================
# Embedding Model Configuration (default)
# =============================================================================
# Shallow copy so downstream tweaks do not mutate the registry entry.
EMBEDDING_CONFIG = dict(ENCODER_MODELS[DEFAULT_ENCODER])
# =============================================================================
# Classifier Training Configuration
# =============================================================================
# Hyperparameters for training the quality classifier on encoder embeddings.
TRAINING_CONFIG = dict(
    epochs=6,                  # full passes over the training split
    learning_rate=0.0003,      # optimizer step size
    batch_size=256,            # classifier training batch size
    hidden_dim=256,            # classifier hidden-layer width
    dropout=0.2,               # dropout probability during training
    embedding_batch_size=32,   # batch size when computing embeddings
)
# =============================================================================
# Language-Specific Configuration
# =============================================================================
def _mcq_positive(dataset_id, subset):
    """Multiple-choice QA source rendered through the MCQ formatter."""
    return {
        "dataset_id": dataset_id,
        "subset": subset,
        "split": "test",
        "format_type": "mcq",
        "text_field": None,  # None => use the formatter, not a raw text column
    }


def _aya_positive(language_filter):
    """Aya instruction-data source, filtered to a single language."""
    return {
        "dataset_id": "CohereForAI/aya_dataset",
        "subset": None,
        "split": "train",
        "format_type": "instruction",
        "text_field": None,
        "language_filter": language_filter,
    }


def _culturax_negative(subset):
    """CulturaX web-crawl source used as the negative-example pool."""
    return {
        "dataset_id": "uonlp/CulturaX",
        "subset": subset,
        "split": "train",
        "text_field": "text",
    }


# Per-language sources: `positive_datasets` are high-quality exemplars,
# `negative_source` is generic web text, and `answer_label` is the localized
# "Answer:" prefix used when formatting MCQ items.
LANGUAGE_CONFIG = {
    "ara_Arab": {
        "name": "Arabic",
        "answer_label": "الإجابة:",
        "positive_datasets": [
            _mcq_positive("MBZUAI/ArabicMMLU", "All"),
            _mcq_positive("openai/MMMLU", "AR_XY"),
            _aya_positive("Arabic"),
        ],
        "negative_source": _culturax_negative("ar"),
    },
    "hin_Deva": {
        "name": "Hindi",
        "answer_label": "उत्तर:",
        "positive_datasets": [
            _mcq_positive("openai/MMMLU", "HI_IN"),
            _aya_positive("Hindi"),
        ],
        "negative_source": _culturax_negative("hi"),
    },
    "tur_Latn": {
        "name": "Turkish",
        "answer_label": "Cevap:",
        "positive_datasets": [
            _mcq_positive("AYueksel/TurkishMMLU", "All"),
            # openai/MMMLU has no Turkish subset, so only TurkishMMLU + Aya.
            _aya_positive("Turkish"),
        ],
        "negative_source": _culturax_negative("tr"),
    },
}

# =============================================================================
# Supported Languages
# =============================================================================
# Derived from the config so the two can never drift apart.
SUPPORTED_LANGUAGES = list(LANGUAGE_CONFIG)

# =============================================================================
# Default Sampling Configuration
# =============================================================================
SAMPLING_CONFIG = {
    "max_positive_samples": 80000,  # cap on high-quality examples per language
    "max_negative_samples": 80000,  # cap on web-text examples per language
    "min_text_length": 50,          # minimum text length to keep a sample
    "train_ratio": 0.8,             # ratios sum to 1.0
    "valid_ratio": 0.1,
    "test_ratio": 0.1,
    "random_seed": 42,              # seed for reproducible splits
}