import os
from collections import Counter

from datasets import load_dataset, Dataset
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

from src.tokenizer import ChessTokenizer
from src.model import ChessConfig, ChessForCausalLM

# Info for the README
ORG_NAME = "LLM-course"
MY_PSEUDO = "MDaytek"


def build_vocab(dataset, max_vocab=1200):
    """Build a move-level vocabulary from the most frequent moves in the dataset."""
    counter = Counter()
    for game in dataset:
        moves = game["text"].split()
        counter.update(moves)
    special = ["[PAD]", "[BOS]", "[EOS]", "[UNK]"]
    vocab_tokens = special + [t for t, _ in counter.most_common(max_vocab - len(special))]
    vocab = {tok: i for i, tok in enumerate(vocab_tokens)}
    return vocab


def encode_game(game, tokenizer, max_len=256):
    """Encode one game as a fixed-length sequence of token ids (truncated, then padded)."""
    moves = game["text"].split()
    tokens = ["[BOS]"] + moves + ["[EOS]"]
    tokens = tokens[:max_len]
    ids = [tokenizer._convert_token_to_id(t) for t in tokens]
    ids += [tokenizer.pad_token_id] * (max_len - len(ids))
    return ids


def main():
    print("Loading 5000 games dataset...")
    raw_ds = load_dataset("dlouapre/lichess_2025-01_1M", split="train[:5000]")

    print("Building tokenizer...")
    vocab = build_vocab(raw_ds)
    tokenizer = ChessTokenizer(vocab)

    # Model hyperparameters
    N_EMBD = 128
    N_LAYER = 4
    N_HEAD = 4

    config = ChessConfig(
        vocab_size=tokenizer.vocab_size,
        n_embd=N_EMBD,
        n_layer=N_LAYER,
        n_head=N_HEAD,
        n_ctx=256,
    )
    model = ChessForCausalLM(config)

    # Count model parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"📊 Total parameters: {total_params:,}")

    print("Tokenizing dataset...")
    input_ids = [encode_game(g, tokenizer) for g in raw_ds]
    ds = Dataset.from_dict({"input_ids": input_ids})

    train_output = "./my_model"
    args = TrainingArguments(
        output_dir=train_output,
        per_device_train_batch_size=32,
        num_train_epochs=5,  # 5 EPOCHS
        logging_steps=50,
        save_strategy="no",
        report_to="none",
        use_cpu=False,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds,
        # Causal LM collator: labels are the input ids (no masked-LM masking)
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    print("🚀 Starting FULL training...")
    trainer.train()

    print("💾 Saving model locally...")
    final_path = os.path.join(train_output, "final_model")
    model.save_pretrained(final_path, safe_serialization=False)
    tokenizer.save_pretrained(final_path)

    # --- README WITH THE CORRECT TAGS ---
    readme_content = f"""---
library_name: transformers
tags:
- chess
- llm-course
- chess-challenge
license: mit
---

# Chess model

Submitted to the LLM Course Chess Challenge.

## Submission Info
- **Submitted by:** {MY_PSEUDO}
- **Parameters:** {total_params:,}
- **Organization:** {ORG_NAME}

## Model Details
- **Architecture:** Chess Transformer (Custom)
- **Vocab size:** {tokenizer.vocab_size}
- **Embedding dim:** {N_EMBD}
- **Layers:** {N_LAYER}
- **Heads:** {N_HEAD}
"""
    with open(os.path.join(final_path, "README.md"), "w") as f:
        f.write(readme_content)

    print("✅ Training complete & README with tags generated.")


if __name__ == "__main__":
    main()
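
# A minimal sketch (commented out so the script stays a pure training run) of
# how the saved artifacts could be reloaded for inference. This assumes
# ChessForCausalLM and ChessTokenizer follow the standard transformers
# save_pretrained / from_pretrained API, which the save calls above suggest;
# the path matches `final_path` above.
#
#   from src.tokenizer import ChessTokenizer
#   from src.model import ChessForCausalLM
#
#   tokenizer = ChessTokenizer.from_pretrained("./my_model/final_model")
#   model = ChessForCausalLM.from_pretrained("./my_model/final_model")
#   model.eval()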