|
|
| import json |
| import os |
| from collections import Counter |
| from datasets import load_dataset, Dataset |
| from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling |
| from src.tokenizer import ChessTokenizer |
| from src.model import ChessConfig, ChessForCausalLM |
|
|
| |
# Hugging Face Hub organization the challenge submission targets.
ORG_NAME = "LLM-course"
# Author pseudonym written into the generated model card.
MY_PSEUDO = "MDaytek"
|
|
def build_vocab(dataset, max_vocab=1200):
    """Build a token -> id mapping from the move strings in *dataset*.

    The four special tokens always occupy ids 0-3; the remaining slots
    are filled with the most frequent moves, so the whole vocabulary is
    capped at *max_vocab* entries.
    """
    move_counts = Counter()
    for record in dataset:
        move_counts.update(record["text"].split())
    specials = ["[PAD]", "[BOS]", "[EOS]", "[UNK]"]
    n_regular = max_vocab - len(specials)
    ordered = specials + [tok for tok, _ in move_counts.most_common(n_regular)]
    return {token: idx for idx, token in enumerate(ordered)}
|
|
def encode_game(game, tokenizer, max_len=256):
    """Encode one game record into a fixed-length list of token ids.

    Wraps the move sequence in [BOS]/[EOS], truncates to *max_len*
    (note: [EOS] is dropped for over-long games), then right-pads with
    the tokenizer's pad id so the result always has length *max_len*.
    """
    sequence = ["[BOS]"] + game["text"].split() + ["[EOS]"]
    del sequence[max_len:]  # truncate over-long games
    ids = [tokenizer._convert_token_to_id(tok) for tok in sequence]
    padding = [tokenizer.pad_token_id] * (max_len - len(ids))
    return ids + padding
|
|
def main():
    """Train a small chess transformer end-to-end and save it locally.

    Pipeline: load games -> build vocab/tokenizer -> build model ->
    tokenize -> train with HF Trainer -> save model + tokenizer +
    a model-card README.
    """
    # FIX: the original loaded split="train[:5]" while logging
    # "Loading 5000 games dataset..." — a debug leftover. The count is
    # now a single variable so the log and the data always agree.
    n_games = 5000
    print(f"Loading {n_games} games dataset...")
    raw_ds = load_dataset("dlouapre/lichess_2025-01_1M", split=f"train[:{n_games}]")

    print("Building tokenizer...")
    vocab = build_vocab(raw_ds)
    tokenizer = ChessTokenizer(vocab)

    # Model hyper-parameters (kept small for the challenge budget).
    N_EMBD = 128
    N_LAYER = 4
    N_HEAD = 4

    config = ChessConfig(
        vocab_size=tokenizer.vocab_size,
        n_embd=N_EMBD,
        n_layer=N_LAYER,
        n_head=N_HEAD,
        n_ctx=256,  # must match encode_game's max_len of 256
    )
    model = ChessForCausalLM(config)

    total_params = sum(p.numel() for p in model.parameters())
    print(f"📊 Paramètres totaux : {total_params:,}")

    print("Tokenizing dataset...")
    input_ids = [encode_game(g, tokenizer) for g in raw_ds]
    ds = Dataset.from_dict({"input_ids": input_ids})

    train_output = "./my_model"

    args = TrainingArguments(
        output_dir=train_output,
        per_device_train_batch_size=32,
        num_train_epochs=5,
        logging_steps=50,
        save_strategy="no",   # we save manually below
        report_to="none",
        use_cpu=False,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds,
        # mlm=False -> causal-LM labels (shifted input_ids).
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    print("🚀 Starting FULL training...")
    trainer.train()

    print("💾 Saving model locally...")
    final_path = os.path.join(train_output, "final_model")
    model.save_pretrained(final_path, safe_serialization=False)
    tokenizer.save_pretrained(final_path)

    # Model card with the tags the challenge leaderboard filters on.
    readme_content = f"""---
library_name: transformers
tags:
- chess
- llm-course
- chess-challenge
license: mit
---
# Chess model submitted to the LLM Course Chess Challenge.

## Submission Info
- **Submitted by:** {MY_PSEUDO}
- **Parameters:** {total_params:,}
- **Organization:** {ORG_NAME}

## Model Details
- **Architecture:** Chess Transformer (Custom)
- **Vocab size:** {tokenizer.vocab_size}
- **Embedding dim:** {N_EMBD}
- **Layers:** {N_LAYER}
- **Heads:** {N_HEAD}
"""
    with open(os.path.join(final_path, "README.md"), "w") as f:
        f.write(readme_content)

    print("✅ Training complete & README with tags generated.")
|
|
# Run the full training pipeline only when executed as a script.
if __name__ == "__main__":
    main()
|
|