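"""Train a small chess transformer on Lichess games for the LLM Course
Chess Challenge, then save the model locally with a tagged model-card
README describing the submission."""
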
import os
from collections import Counter

from datasets import Dataset, load_dataset
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

from src.model import ChessConfig, ChessForCausalLM
from src.tokenizer import ChessTokenizer

ORG_NAME = "LLM-course"
MY_PSEUDO = "MDaytek"

def build_vocab(dataset, max_vocab=1200):
    """Build a token -> id vocabulary from the most frequent moves."""
    counter = Counter()
    for game in dataset:
        moves = game["text"].split()
        counter.update(moves)
    # Reserve the first ids for special tokens, then keep the most
    # frequent moves until the vocabulary reaches max_vocab entries.
    special = ["[PAD]", "[BOS]", "[EOS]", "[UNK]"]
    vocab_tokens = special + [t for t, _ in counter.most_common(max_vocab - len(special))]
    vocab = {tok: i for i, tok in enumerate(vocab_tokens)}
    return vocab

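# For illustration only (assuming the corpus stores moves as
# space-separated SAN; exact ids depend on move frequencies), the
# resulting mapping looks like:
# {"[PAD]": 0, "[BOS]": 1, "[EOS]": 2, "[UNK]": 3, "e4": 4, ...}
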
def encode_game(game, tokenizer, max_len=256):
    """Encode one game as a fixed-length list of token ids."""
    moves = game["text"].split()
    tokens = ["[BOS]"] + moves + ["[EOS]"]
    # Truncate long games (note: this can drop the [EOS] token) and
    # right-pad short ones so every example has exactly max_len ids.
    tokens = tokens[:max_len]
    ids = [tokenizer._convert_token_to_id(t) for t in tokens]
    ids += [tokenizer.pad_token_id] * (max_len - len(ids))
    return ids

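# For example, a game "e4 e5 Nf3" becomes the ids of
# [BOS] e4 e5 Nf3 [EOS] [PAD] ... [PAD] (max_len ids in total); moves
# absent from the vocabulary are assumed to map to [UNK] inside
# ChessTokenizer.
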
def main():
    print("Loading 5000 games dataset...")
    raw_ds = load_dataset("dlouapre/lichess_2025-01_1M", split="train[:5000]")

| | print("Building tokenizer...") |
| | vocab = build_vocab(raw_ds) |
| | tokenizer = ChessTokenizer(vocab) |
| |
|
| | |
| | N_EMBD = 128 |
| | N_LAYER = 4 |
| | N_HEAD = 4 |
| |
|
    config = ChessConfig(
        vocab_size=tokenizer.vocab_size,
        n_embd=N_EMBD,
        n_layer=N_LAYER,
        n_head=N_HEAD,
        n_ctx=256,  # context length; must cover the max_len used in encode_game
    )
    model = ChessForCausalLM(config)

    total_params = sum(p.numel() for p in model.parameters())
    print(f"📊 Total parameters: {total_params:,}")

| | print("Tokenizing dataset...") |
| | input_ids = [encode_game(g, tokenizer) for g in raw_ds] |
| | ds = Dataset.from_dict({"input_ids": input_ids}) |
| |
|
| | train_output = "./my_model" |
| |
|
    args = TrainingArguments(
        output_dir=train_output,
        per_device_train_batch_size=32,
        num_train_epochs=5,
        logging_steps=50,
        save_strategy="no",
        report_to="none",
        use_cpu=False,
    )

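    # With mlm=False the collator does causal-LM collation: labels are a
    # copy of input_ids, with pad-token positions set to -100 so they are
    # ignored by the loss.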
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

| | print("🚀 Starting FULL training...") |
| | trainer.train() |
| |
|
| | print("💾 Saving model locally...") |
| | final_path = os.path.join(train_output, "final_model") |
| | model.save_pretrained(final_path, safe_serialization=False) |
| | tokenizer.save_pretrained(final_path) |
| |
|
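    # A challenge submission would presumably also be pushed to the course
    # org on the Hugging Face Hub. Hedged sketch (the repo name is an
    # assumption, not part of this script):
    # model.push_to_hub(f"{ORG_NAME}/chess-{MY_PSEUDO}")
    # tokenizer.push_to_hub(f"{ORG_NAME}/chess-{MY_PSEUDO}")
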
    readme_content = f"""---
library_name: transformers
tags:
- chess
- llm-course
- chess-challenge
license: mit
---
# Chess model submitted to the LLM Course Chess Challenge

## Submission Info
- **Submitted by:** {MY_PSEUDO}
- **Parameters:** {total_params:,}
- **Organization:** {ORG_NAME}

## Model Details
- **Architecture:** Chess Transformer (Custom)
- **Vocab size:** {tokenizer.vocab_size}
- **Embedding dim:** {N_EMBD}
- **Layers:** {N_LAYER}
- **Heads:** {N_HEAD}
"""
    with open(os.path.join(final_path, "README.md"), "w") as f:
        f.write(readme_content)

    print("✅ Training complete & README with tags generated.")

if __name__ == "__main__":
    main()