mmBERT-Arabic-Quality-Classifier / config.py

Upload folder using huggingface_hub

decba57 verified 22 days ago

5.8 kB

	"""
	Configuration for HQ document quality classifiers.

	Defines language-specific settings, dataset sources, and training hyperparameters
	for the FineWeb2-HQ methodology.
	"""
	from pathlib import Path

	# =============================================================================
	# Paths
	# =============================================================================
	# HQ_DIR is the directory that holds this file; SRC_DIR and BASE_DIR are the
	# first and second ancestor directories above it.
	HQ_DIR = Path(__file__).parent
	SRC_DIR = HQ_DIR.parent
	BASE_DIR = HQ_DIR.parent.parent

	# =============================================================================
	# Available Encoder Models
	# =============================================================================
	# Registry of encoder backbones keyed by a short alias. Each entry records the
	# model identifier, the maximum sequence length, the embedding width, and a
	# human-readable description.
	ENCODER_MODELS = {
	    "mmbert-small": dict(
	        model_name="jhu-clsp/mmBERT-small",
	        max_length=512,
	        embedding_dim=384,
	        description="mmBERT-small: Modern multilingual encoder (1800+ languages)",
	    ),
	    "mmbert-base": dict(
	        model_name="jhu-clsp/mmBERT-base",
	        max_length=512,
	        embedding_dim=768,
	        description="mmBERT-base: Larger multilingual encoder (1800+ languages)",
	    ),
	    "xlm-roberta-base": dict(
	        model_name="FacebookAI/xlm-roberta-base",
	        max_length=512,
	        embedding_dim=768,
	        description="XLM-RoBERTa-base: Classic multilingual encoder (100 languages)",
	    ),
	    "xlm-roberta-large": dict(
	        model_name="FacebookAI/xlm-roberta-large",
	        max_length=512,
	        embedding_dim=1024,
	        description="XLM-RoBERTa-large: Larger classic multilingual encoder",
	    ),
	}

	# Alias (a key of ENCODER_MODELS) of the encoder used when none is specified.
	DEFAULT_ENCODER = "mmbert-small"

	# =============================================================================
	# Embedding Model Configuration (default)
	# =============================================================================
	# Shallow copy of the default encoder's entry, so that adding or replacing keys
	# here leaves the ENCODER_MODELS registry untouched.
	EMBEDDING_CONFIG = dict(ENCODER_MODELS[DEFAULT_ENCODER])

	# =============================================================================
	# Classifier Training Configuration
	# =============================================================================
	# Hyperparameters for training the classifier, plus the batch size used on
	# the embedding side ("embedding_batch_size").
	TRAINING_CONFIG = dict(
	    epochs=6,
	    learning_rate=3e-4,
	    batch_size=256,
	    hidden_dim=256,
	    dropout=0.2,
	    embedding_batch_size=32,
	)

	# =============================================================================
	# Language-Specific Configuration
	# =============================================================================
	# One entry per language code. Each entry holds the display name, the
	# localized answer label, the list of positive dataset sources, and the single
	# negative source.
	LANGUAGE_CONFIG = dict(
	    ara_Arab=dict(
	        name="Arabic",
	        answer_label="الإجابة:",
	        positive_datasets=[
	            dict(
	                dataset_id="MBZUAI/ArabicMMLU",
	                subset="All",
	                split="test",
	                format_type="mcq",
	                text_field=None,  # Use formatter
	            ),
	            dict(
	                dataset_id="openai/MMMLU",
	                subset="AR_XY",
	                split="test",
	                format_type="mcq",
	                text_field=None,
	            ),
	            dict(
	                dataset_id="CohereForAI/aya_dataset",
	                subset=None,
	                split="train",
	                format_type="instruction",
	                text_field=None,
	                language_filter="Arabic",
	            ),
	        ],
	        negative_source=dict(
	            dataset_id="uonlp/CulturaX",
	            subset="ar",
	            split="train",
	            text_field="text",
	        ),
	    ),
	    hin_Deva=dict(
	        name="Hindi",
	        answer_label="उत्तर:",
	        positive_datasets=[
	            dict(
	                dataset_id="openai/MMMLU",
	                subset="HI_IN",
	                split="test",
	                format_type="mcq",
	                text_field=None,
	            ),
	            dict(
	                dataset_id="CohereForAI/aya_dataset",
	                subset=None,
	                split="train",
	                format_type="instruction",
	                text_field=None,
	                language_filter="Hindi",
	            ),
	        ],
	        negative_source=dict(
	            dataset_id="uonlp/CulturaX",
	            subset="hi",
	            split="train",
	            text_field="text",
	        ),
	    ),
	    tur_Latn=dict(
	        name="Turkish",
	        answer_label="Cevap:",
	        positive_datasets=[
	            dict(
	                dataset_id="AYueksel/TurkishMMLU",
	                subset="All",
	                split="test",
	                format_type="mcq",
	                text_field=None,
	            ),
	            # Note: openai/MMMLU does not have Turkish
	            dict(
	                dataset_id="CohereForAI/aya_dataset",
	                subset=None,
	                split="train",
	                format_type="instruction",
	                text_field=None,
	                language_filter="Turkish",
	            ),
	        ],
	        negative_source=dict(
	            dataset_id="uonlp/CulturaX",
	            subset="tr",
	            split="train",
	            text_field="text",
	        ),
	    ),
	)

	# =============================================================================
	# Supported Languages
	# =============================================================================
	# Language codes in the order they are declared in LANGUAGE_CONFIG; iterating
	# a dict yields its keys, so no explicit .keys() call is needed.
	SUPPORTED_LANGUAGES = list(LANGUAGE_CONFIG)

	# =============================================================================
	# Default Sampling Configuration
	# =============================================================================
	# Caps on the number of positive and negative samples, a minimum text length,
	# the train/valid/test split ratios, and the random seed.
	SAMPLING_CONFIG = dict(
	    max_positive_samples=80_000,
	    max_negative_samples=80_000,
	    min_text_length=50,
	    train_ratio=0.8,
	    valid_ratio=0.1,
	    test_ratio=0.1,
	    random_seed=42,
	)