|
|
"""
|
|
|
COGNITIVE-CORE: Training Utilities
|
|
|
====================================
|
|
|
|
|
|
Standardized training utilities for cognitive models, including:
|
|
|
- Training configurations
|
|
|
- Trainer wrappers
|
|
|
- Dataset preparation helpers
|
|
|
- Progress tracking
|
|
|
|
|
|
Copyright © 2026 Mike Amega (Logo) - Ame Web Studio
|
|
|
License: Proprietary - All Rights Reserved
|
|
|
"""
|
|
|
|
|
|
import os
|
|
|
import torch
|
|
|
import torch.nn as nn
|
|
|
from typing import Dict, List, Optional, Any, Callable
|
|
|
from dataclasses import dataclass, field
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class CognitiveTrainingConfig:
    """Standard settings for training a cognitive model.

    The fields are plain values consumed by the training helpers in this
    module. Creating an instance also creates ``output_dir`` on disk when it
    does not exist yet (see ``__post_init__``).
    """

    # --- Output location --------------------------------------------------
    output_dir: str = "./cognitive-output"

    # --- Optimisation hyper-parameters ------------------------------------
    num_epochs: int = 1
    batch_size: int = 1
    gradient_accumulation_steps: int = 8
    learning_rate: float = 1e-5
    warmup_steps: int = 100
    weight_decay: float = 0.01
    max_grad_norm: float = 1.0

    # --- Sequence handling ------------------------------------------------
    max_seq_len: int = 2048

    # --- Mixed precision --------------------------------------------------
    use_fp16: bool = True
    use_bf16: bool = False

    # --- Logging and checkpointing ----------------------------------------
    logging_steps: int = 10
    save_steps: int = 200
    save_total_limit: int = 2

    # --- Hugging Face Hub -------------------------------------------------
    push_to_hub: bool = False
    hub_model_id: Optional[str] = None
    hub_private: bool = True

    # --- Hardware ---------------------------------------------------------
    # None means "no explicit choice was made".
    device: Optional[str] = None

    def __post_init__(self) -> None:
        """Create the output directory (and missing parents) if needed."""
        target_dir = self.output_dir
        os.makedirs(target_dir, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def prepare_dataset(
    dataset,
    tokenizer,
    text_column: str = "text",
    max_length: int = 2048,
    num_proc: int = 4,
):
    """Tokenize a dataset so it is ready for cognitive-model training.

    Every text is truncated and padded to exactly ``max_length`` tokens, the
    dataset's original columns are dropped, and the result is switched to the
    ``"torch"`` format.

    Args:
        dataset: Object exposing ``column_names``, ``map`` and ``set_format``
            (for example a Hugging Face ``Dataset`` or ``DatasetDict``).
        tokenizer: Callable tokenizer. If it has no ``pad_token`` but does
            have an ``eos_token``, the EOS token is installed as the pad
            token; the tokenizer object is mutated in place.
        text_column: Name of the column holding the raw text.
        max_length: Length every sequence is truncated/padded to.
        num_proc: Number of processes forwarded to ``dataset.map``.

    Returns:
        The object returned by ``dataset.map`` after
        ``set_format(type="torch")`` has been applied to it.
    """
    # Padding to a fixed length requires a pad token. Fall back to EOS, the
    # same fallback CognitiveTrainer applies, so that tokenizing *before* a
    # trainer is built (as quick_train does) works for pad-token-less
    # tokenizers too.
    if getattr(tokenizer, "pad_token", None) is None:
        eos_token = getattr(tokenizer, "eos_token", None)
        if eos_token is not None:
            tokenizer.pad_token = eos_token

    def tokenize_function(examples):
        texts = examples[text_column]
        # A non-batched call hands over a single value; normalise to a list.
        if not isinstance(texts, list):
            texts = [texts]
        return tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors=None,
        )

    columns_to_remove = dataset.column_names
    if isinstance(columns_to_remove, dict):
        # Split mapping (split name -> column list): prefer "train", otherwise
        # use the first available split instead of silently removing nothing.
        per_split = columns_to_remove
        if "train" in per_split:
            columns_to_remove = per_split["train"]
        else:
            columns_to_remove = next(iter(per_split.values()), [])

    tokenized = dataset.map(
        tokenize_function,
        batched=True,
        num_proc=num_proc,
        remove_columns=columns_to_remove,
    )
    tokenized.set_format(type="torch")
    return tokenized
|
|
|
|
|
|
|
|
|
def create_instruction_dataset(
    examples: List[Dict[str, str]],
    tokenizer,
    max_length: int = 2048,
    instruction_template: str = "### Instruction:\n{instruction}\n\n### Response:\n{response}",
):
    """Build a tokenized instruction dataset from instruction/response pairs.

    Each example is rendered through ``instruction_template`` into a single
    ``"text"`` field; the rendered rows are wrapped in a ``datasets.Dataset``
    and handed to ``prepare_dataset``.

    Args:
        examples: Dicts carrying ``'instruction'`` and ``'response'`` keys.
            A missing key is rendered as an empty string.
        tokenizer: Tokenizer of the model.
        max_length: Maximum sequence length forwarded to ``prepare_dataset``.
        instruction_template: ``str.format`` template using the
            ``{instruction}`` and ``{response}`` placeholders.

    Returns:
        The tokenized dataset produced by ``prepare_dataset``.
    """
    from datasets import Dataset

    rows = [
        {
            "text": instruction_template.format(
                instruction=item.get("instruction", ""),
                response=item.get("response", ""),
            )
        }
        for item in examples
    ]
    return prepare_dataset(Dataset.from_list(rows), tokenizer, "text", max_length)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class CognitiveTrainer:
    """Simplified trainer for cognitive models.

    Thin wrapper around the Hugging Face ``Trainer``: it translates a
    ``CognitiveTrainingConfig`` into ``TrainingArguments``, wires a causal-LM
    data collator, and exposes ``train`` / ``save`` / ``push_to_hub`` helpers
    that report progress on stdout.
    """

    def __init__(
        self,
        model,
        tokenizer,
        train_dataset,
        config: CognitiveTrainingConfig,
        eval_dataset=None,
        callbacks: Optional[List] = None,
    ):
        """Store the training components and build the underlying Trainer.

        Args:
            model: Model to train.
            tokenizer: Tokenizer of the model. If it has no pad token, its EOS
                token is installed as pad token (mutated in place).
            train_dataset: Tokenized training dataset.
            config: Training settings.
            eval_dataset: Optional evaluation dataset.
            callbacks: Optional list of Trainer callbacks.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.config = config
        self.callbacks = callbacks or []

        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        self._setup_trainer()

    @staticmethod
    def _resolve_precision(device, use_fp16: bool, use_bf16: bool):
        """Return the ``(fp16, bf16)`` flags to hand to ``TrainingArguments``.

        Mixed precision is only enabled on CUDA devices; indexed names such as
        ``"cuda:0"`` count as CUDA. When both flags are requested, bf16 takes
        precedence so the two are never enabled together.
        """
        on_cuda = str(device).startswith("cuda")
        bf16 = bool(use_bf16) and on_cuda
        fp16 = bool(use_fp16) and on_cuda and not bf16
        return fp16, bf16

    def _setup_trainer(self):
        """Build ``self.trainer`` from the stored config and datasets."""
        from transformers import (
            Trainer,
            TrainingArguments,
            DataCollatorForLanguageModeling,
        )

        # NOTE: the resolved device is only used to gate mixed precision; it
        # is not forwarded to TrainingArguments and the model is not moved.
        if self.config.device:
            device = self.config.device
        elif torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"

        fp16, bf16 = self._resolve_precision(
            device, self.config.use_fp16, self.config.use_bf16
        )

        training_args = TrainingArguments(
            output_dir=self.config.output_dir,
            overwrite_output_dir=True,
            num_train_epochs=self.config.num_epochs,
            per_device_train_batch_size=self.config.batch_size,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            learning_rate=self.config.learning_rate,
            warmup_steps=self.config.warmup_steps,
            weight_decay=self.config.weight_decay,
            max_grad_norm=self.config.max_grad_norm,
            logging_steps=self.config.logging_steps,
            save_steps=self.config.save_steps,
            save_total_limit=self.config.save_total_limit,
            fp16=fp16,
            bf16=bf16,
            push_to_hub=self.config.push_to_hub,
            hub_model_id=self.config.hub_model_id,
            hub_private_repo=self.config.hub_private,
            report_to="none",
            remove_unused_columns=False,
            dataloader_num_workers=0,
        )

        # mlm=False selects causal (next-token) language modeling.
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm=False
        )

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
            callbacks=self.callbacks,
        )

    def train(self, resume_from_checkpoint: Optional[str] = None):
        """Run training.

        Args:
            resume_from_checkpoint: Checkpoint path to resume training from.

        Returns:
            Whatever ``self.trainer.train`` returns, or ``None`` when training
            raised. Failures are deliberately best-effort: the error and its
            traceback are printed instead of being propagated.
        """
        print("\n🚀 ENTRAÎNEMENT COGNITIF")
        print("=" * 60)

        try:
            result = self.trainer.train(resume_from_checkpoint=resume_from_checkpoint)
            print("=" * 60)
            print("✅ Entraînement terminé!")
            return result
        except Exception as e:
            print(f"❌ Erreur: {e}")
            import traceback

            traceback.print_exc()
            return None

    def save(self, output_dir: Optional[str] = None):
        """Save the model and tokenizer.

        Args:
            output_dir: Target directory; defaults to ``config.output_dir``.
        """
        save_dir = output_dir or self.config.output_dir
        self.trainer.save_model(save_dir)
        self.tokenizer.save_pretrained(save_dir)
        print(f"💾 Modèle sauvegardé: {save_dir}")

    def push_to_hub(self, repo_id: Optional[str] = None):
        """Push the model to the Hugging Face Hub.

        Args:
            repo_id: Optional repository id overriding ``config.hub_model_id``.

        Errors are printed rather than raised (best-effort push).
        """
        if repo_id:
            self.config.hub_model_id = repo_id

        try:
            if repo_id:
                # The Trainer pushes according to its own arguments, which
                # were built in _setup_trainer; updating only our config left
                # the override without effect.
                self.trainer.args.hub_model_id = repo_id
                # NOTE(review): recent transformers versions appear to cache
                # the resolved repo on `trainer.hub_model_id`; clearing it is
                # meant to force re-resolution from args -- confirm against
                # the installed transformers version.
                if getattr(self.trainer, "hub_model_id", None) is not None:
                    self.trainer.hub_model_id = None
            self.trainer.push_to_hub()
            print(f"📤 Modèle pushé: {self.config.hub_model_id}")
        except Exception as e:
            print(f"⚠️ Erreur push: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try:
    # The Trainer's callback handler invokes every on_* event on each
    # registered callback, so a callback must inherit the no-op defaults
    # provided by TrainerCallback rather than define on_step_end alone.
    from transformers import TrainerCallback as _TrainerCallbackBase
except ImportError:
    # Keep this module importable when transformers is not installed.
    _TrainerCallbackBase = object


class CognitiveStateCallback(_TrainerCallbackBase):
    """Callback that monitors the cognitive modules' state during training.

    Every ``log_every`` calls to ``on_step_end`` it prints, for each non-empty
    entry returned by ``model.get_cognitive_state()``, the entry's name and
    its length. Models without a ``get_cognitive_state`` method are ignored.
    """

    def __init__(self, log_every: int = 100):
        """Initialise the callback.

        Args:
            log_every: Number of ``on_step_end`` calls between two reports.
        """
        super().__init__()
        self.log_every = log_every
        # Own call counter; independent of the Trainer's state object.
        self.step = 0

    def on_step_end(self, args, state, control, model=None, **kwargs):
        """Count the step and print the cognitive state when a report is due."""
        self.step += 1

        report_due = self.step % self.log_every == 0
        if not report_due or model is None:
            return
        if not hasattr(model, "get_cognitive_state"):
            return

        cog_state = model.get_cognitive_state()
        print(f"\n📊 État cognitif (step {self.step}):")
        for name, state_dict in cog_state.items():
            if state_dict:
                print(f" {name}: {len(state_dict)} buffers")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def quick_train(
    model,
    tokenizer,
    texts: List[str],
    output_dir: str = "./quick-train-output",
    num_epochs: int = 1,
    max_seq_len: int = 2048,
    learning_rate: float = 1e-5,
    push_to_hub: bool = False,
    hub_model_id: Optional[str] = None,
):
    """Train a model on raw texts with a minimal configuration.

    The texts are wrapped in a ``datasets.Dataset``, tokenized with
    ``prepare_dataset``, and trained through a ``CognitiveTrainer`` built from
    a ``CognitiveTrainingConfig``. The model and tokenizer are saved only when
    training returns a truthy result.

    Args:
        model: Model to train.
        tokenizer: Tokenizer of the model.
        texts: Training texts.
        output_dir: Output directory.
        num_epochs: Number of epochs.
        max_seq_len: Maximum sequence length.
        learning_rate: Learning rate.
        push_to_hub: Whether to push to the Hugging Face Hub.
        hub_model_id: Hugging Face repository id.

    Returns:
        The value returned by ``CognitiveTrainer.train``.
    """
    from datasets import Dataset

    raw_dataset = Dataset.from_dict({"text": texts})
    train_data = prepare_dataset(raw_dataset, tokenizer, "text", max_seq_len)

    settings = {
        "output_dir": output_dir,
        "num_epochs": num_epochs,
        "max_seq_len": max_seq_len,
        "learning_rate": learning_rate,
        "push_to_hub": push_to_hub,
        "hub_model_id": hub_model_id,
    }
    training_config = CognitiveTrainingConfig(**settings)

    runner = CognitiveTrainer(model, tokenizer, train_data, training_config)
    outcome = runner.train()
    if not outcome:
        # Nothing is saved when training failed or returned a falsy result.
        return outcome

    runner.save()
    return outcome
|
|
|
|