Add files using upload-large-folder tool

Browse files

Files changed (12) hide show

README.md +384 -506
data/hard_negatives_chess.parquet +3 -0
data/hard_negatives_english.parquet +3 -0
data/theme_definitions.parquet +3 -0
model.safetensors +1 -1
scripts/compare_variants.py +175 -0
scripts/convert_to_english.py +216 -0
scripts/diag_ce_vs_bm25.py +145 -0
scripts/generate_theme_defs.py +168 -0
scripts/mine_hard_negs_v2.py +213 -0
scripts/train_chess_multitask.py +287 -0
scripts/train_chess_static.py +640 -0

README.md CHANGED Viewed

@@ -1,538 +1,416 @@
 ---
-language:
-- en
 license: apache-2.0
 tags:
 - sentence-transformers
-- sentence-similarity
-- feature-extraction
-- generated_from_trainer
-- dataset_size:1619946
-- loss:MatryoshkaLoss
-- loss:MultipleNegativesRankingLoss
-widget:
-- source_sentence: kingsideAttack master [UNK] mateIn1 oneMove [UNK] [UNK] Defense
-    Sicilian Defense [UNK] Attack
-  sentences:
-  - themes kingsideAttack master mate mateIn1 oneMove opening opening Sicilian Defense
-    Sicilian Defense Nyezhmetdinov-Rossolimo Attack moves f3e5 c6g2 f3e5+c6g2
-  - themes crushing middlegame queensideAttack sacrifice veryLong moves d7c7 b3e6
-    f7e6 e1e6 c8b8 f6d7 c7d7 e6d7 d7c7+b3e6 b3e6+f7e6 f7e6+e1e6 e1e6+c8b8 c8b8+f6d7
-    f6d7+c7d7 c7d7+e6d7
-  - themes advancedPawn crushing endgame veryLong zugzwang moves d4e6 c4e6 f7e6 h7g6
-    f8g8 f6f7 g8f8 g6f6 e6e5 f6e5 d4e6+c4e6 c4e6+f7e6 f7e6+h7g6 h7g6+f8g8 f8g8+f6f7
-    f6f7+g8f8 g8f8+g6f6 g6f6+e6e5 e6e5+f6e5
-- source_sentence: crushing intermezzo master middlegame sacrifice veryLong
-  sentences:
-  - themes crushing endgame master masterVsMaster veryLong moves f5f6 c5e6 h5g6 h7g6
-    c3f3 d5b4 f3c6 b4c6 f5f6+c5e6 c5e6+h5g6 h5g6+h7g6 h7g6+c3f3 c3f3+d5b4 d5b4+f3c6
-    f3c6+b4c6
-  - themes advancedPawn advantage endgame long master promotion rookEndgame moves
-    h3h2 g1g2 g3g2 a6a7 h2h1q a7b8q h3h2+g1g2 g1g2+g3g2 g3g2+a6a7 a6a7+h2h1q h2h1q+a7b8q
-  - themes crushing intermezzo master middlegame sacrifice veryLong moves a6c4 d6f6
-    f1f6 h6h1 g1f2 h8f6 f2e2 f6e7 a6c4+d6f6 d6f6+f1f6 f1f6+h6h1 h6h1+g1f2 g1f2+h8f6
-    h8f6+f2e2 f2e2+f6e7
-- source_sentence: advantage hangingPiece middlegame short Nimzo-Larsen Attack Nimzo-Larsen
-    Attack Modern [UNK]
-  sentences:
-  - themes hangingPiece mate mateIn1 middlegame oneMove opening Trompowsky Attack
-    Trompowsky Attack Classical Defense moves f4g4 d8d1 f4g4+d8d1
-  - themes advancedPawn crushing defensiveMove endgame master quietMove veryLong moves
-    f1e1 h3h2 f8h8 f5h4 h8e5 g3g2 e5e4 h4f3 f1e1+h3h2 h3h2+f8h8 f8h8+f5h4 f5h4+h8e5
-    h8e5+g3g2 g3g2+e5e4 e5e4+h4f3
-  - themes advantage hangingPiece middlegame short opening Nimzo-Larsen Attack Nimzo-Larsen
-    Attack Modern Variation moves f5d7 b5g5 e3e2 d1d2 f5d7+b5g5 b5g5+e3e2 e3e2+d1d2
-- source_sentence: '[UNK] defensiveMove [UNK] [UNK] veryLong'
-  sentences:
-  - themes advantage discoveredAttack exposedKing middlegame trappedPiece veryLong
-    opening French Defense French Defense Orthoschnapp Gambit moves e2d1 c4e3 d2e3
-    b5f1 d1d2 f1g2 g1e2 g2h1 e2d1+c4e3 c4e3+d2e3 d2e3+b5f1 b5f1+d1d2 d1d2+f1g2 f1g2+g1e2
-    g1e2+g2h1
-  - themes crushing defensiveMove enPassant middlegame veryLong moves g2e2 a3f3 f7f5
-    e5f6 c4f4 g3f4 e2g2 f3g3 g2e2+a3f3 a3f3+f7f5 f7f5+e5f6 e5f6+c4f4 c4f4+g3f4 g3f4+e2g2
-    e2g2+f3g3
-  - themes advancedPawn bishopEndgame crushing defensiveMove endgame veryLong moves
-    f3e4 a3a2 g6g7 e6f7 e5e6 f7g8 e6e7 c5e7 f3e4+a3a2 a3a2+g6g7 g6g7+e6f7 e6f7+e5e6
-    e5e6+f7g8 f7g8+e6e7 e6e7+c5e7
-- source_sentence: '[UNK] deflection discoveredAttack [UNK] queensideAttack short
-    Philidor Defense [UNK] Defense Other variations'
-  sentences:
-  - themes crushing middlegame pin queensideAttack short opening Sicilian Defense
-    Sicilian Defense Najdorf Variation moves c3d5 c5b3 c1b1 b3d2 c3d5+c5b3 c5b3+c1b1
-    c1b1+b3d2
-  - themes crushing deflection discoveredAttack middlegame queensideAttack short opening
-    Philidor Defense Philidor Defense Other variations moves d3c3 d4b3 c1b1 d7d1 d3c3+d4b3
-    d4b3+c1b1 c1b1+d7d1
-  - themes advantage discoveredAttack middlegame short opening Philidor Defense Philidor
-    Defense Other variations moves e4d4 d3f5 c8b8 d1d4 e4d4+d3f5 d3f5+c8b8 c8b8+d1d4
-pipeline_tag: sentence-similarity
-library_name: sentence-transformers
-metrics:
-- cosine_accuracy@1
-- cosine_accuracy@10
-- cosine_precision@1
-- cosine_precision@10
-- cosine_recall@1
-- cosine_recall@10
-- cosine_ndcg@10
-- cosine_mrr@10
-- cosine_map@100
-model-index:
-- name: Static chess embedding (512d) -- themes/openings <-> positions
-  results:
-  - task:
-      type: information-retrieval
-      name: Information Retrieval
-    dataset:
-      name: chess ir
-      type: chess-ir
-    metrics:
-    - type: cosine_accuracy@1
-      value: 0.005
-      name: Cosine Accuracy@1
-    - type: cosine_accuracy@10
-      value: 0.07
-      name: Cosine Accuracy@10
-    - type: cosine_precision@1
-      value: 0.005
-      name: Cosine Precision@1
-    - type: cosine_precision@10
-      value: 0.008
-      name: Cosine Precision@10
-    - type: cosine_recall@1
-      value: 0.0016666666666666666
-      name: Cosine Recall@1
-    - type: cosine_recall@10
-      value: 0.02666666666666666
-      name: Cosine Recall@10
-    - type: cosine_ndcg@10
-      value: 0.01682968253099316
-      name: Cosine Ndcg@10
-    - type: cosine_mrr@10
-      value: 0.020728174603174603
-      name: Cosine Mrr@10
-    - type: cosine_map@100
-      value: 0.014144217882495914
-      name: Cosine Map@100
-  - task:
-      type: information-retrieval
-      name: Information Retrieval
-    dataset:
-      name: chess ir tokens
-      type: chess-ir-tokens
-    metrics:
-    - type: cosine_accuracy@1
-      value: 0.07936507936507936
-      name: Cosine Accuracy@1
-    - type: cosine_accuracy@10
-      value: 0.25925925925925924
-      name: Cosine Accuracy@10
-    - type: cosine_precision@1
-      value: 0.07936507936507936
-      name: Cosine Precision@1
-    - type: cosine_precision@10
-      value: 0.06031746031746032
-      name: Cosine Precision@10
-    - type: cosine_recall@1
-      value: 0.00224439005944158
-      name: Cosine Recall@1
-    - type: cosine_recall@10
-      value: 0.023957890091684336
-      name: Cosine Recall@10
-    - type: cosine_ndcg@10
-      value: 0.067202690066618
-      name: Cosine Ndcg@10
-    - type: cosine_mrr@10
-      value: 0.12332031578063325
-      name: Cosine Mrr@10
-    - type: cosine_map@100
-      value: 0.03321093573791526
-      name: Cosine Map@100
 ---
-# Static chess embedding (512d) -- themes/openings <-> positions
-This is a [sentence-transformers](https://www.SBERT.net) model trained. It maps sentences & paragraphs to a 512-dimensional dense vector space and can be used for retrieval.
-## Model Details
-### Model Description
-- **Model Type:** Sentence Transformer
-<!-- - **Base model:** [Unknown](https://huggingface.co/unknown) -->
-- **Maximum Sequence Length:** inf tokens
-- **Output Dimensionality:** 512 dimensions
-- **Similarity Function:** Cosine Similarity
-- **Supported Modality:** Text
-<!-- - **Training Dataset:** Unknown -->
-- **Language:** en
-- **License:** apache-2.0
-### Model Sources
-- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
-- **Repository:** [Sentence Transformers on GitHub](https://github.com/huggingface/sentence-transformers)
-- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
-### Full Model Architecture
 ```
-SentenceTransformer(
-  (0): StaticEmbedding({})
-)
 ```
-## Usage
-### Direct Usage (Sentence Transformers)
-First install the Sentence Transformers library:
-```bash
-pip install -U sentence-transformers
-```
-Then you can load this model and run inference.
-```python
-from sentence_transformers import SentenceTransformer
-# Download from the 🤗 Hub
-model = SentenceTransformer("oneryalcin/static-embedding-chess")
-# Run inference
-queries = [
-    '[UNK] deflection discoveredAttack [UNK] queensideAttack short Philidor Defense [UNK] Defense Other variations',
-]
-documents = [
-    'themes crushing deflection discoveredAttack middlegame queensideAttack short opening Philidor Defense Philidor Defense Other variations moves d3c3 d4b3 c1b1 d7d1 d3c3+d4b3 d4b3+c1b1 c1b1+d7d1',
-    'themes advantage discoveredAttack middlegame short opening Philidor Defense Philidor Defense Other variations moves e4d4 d3f5 c8b8 d1d4 e4d4+d3f5 d3f5+c8b8 c8b8+d1d4',
-    'themes crushing middlegame pin queensideAttack short opening Sicilian Defense Sicilian Defense Najdorf Variation moves c3d5 c5b3 c1b1 b3d2 c3d5+c5b3 c5b3+c1b1 c1b1+b3d2',
-]
-query_embeddings = model.encode_query(queries)
-document_embeddings = model.encode_document(documents)
-print(query_embeddings.shape, document_embeddings.shape)
-# [1, 512] [3, 512]
-# Get the similarity scores for the embeddings
-similarities = model.similarity(query_embeddings, document_embeddings)
-print(similarities)
-# tensor([[0.8405, 0.5061, 0.2136]])
-```
-<!--
-### Direct Usage (Transformers)
-<details><summary>Click to see the direct usage in Transformers</summary>
-</details>
--->
-<!--
-### Downstream Usage (Sentence Transformers)
-You can finetune this model on your own dataset.
-<details><summary>Click to expand</summary>
-</details>
--->
-<!--
-### Out-of-Scope Use
-*List how the model may foreseeably be misused and address what users ought not to do with the model.*
--->
-## Evaluation
-### Metrics
-#### Information Retrieval
-* Datasets: `chess-ir` and `chess-ir-tokens`
-* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.sentence_transformer.evaluation.InformationRetrievalEvaluator)
-| Metric              | chess-ir   | chess-ir-tokens |
-|:--------------------|:-----------|:----------------|
-| cosine_accuracy@1   | 0.005      | 0.0794          |
-| cosine_accuracy@10  | 0.07       | 0.2593          |
-| cosine_precision@1  | 0.005      | 0.0794          |
-| cosine_precision@10 | 0.008      | 0.0603          |
-| cosine_recall@1     | 0.0017     | 0.0022          |
-| cosine_recall@10    | 0.0267     | 0.024           |
-| **cosine_ndcg@10**  | **0.0168** | **0.0672**      |
-| cosine_mrr@10       | 0.0207     | 0.1233          |
-| cosine_map@100      | 0.0141     | 0.0332          |
-<!--
-## Bias, Risks and Limitations
-*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
--->
-<!--
-### Recommendations
-*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
--->
-## Training Details
-### Training Dataset
-#### Unnamed Dataset
-* Size: 1,619,946 training samples
-* Columns: <code>anchor</code> and <code>positive</code>
-* Approximate statistics based on the first 100 samples:
-  |          | anchor                                                                                          | positive                                                                                         |
-  |:---------|:------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------|
-  | type     | string                                                                                          | string                                                                                           |
-  | modality | text                                                                                            | text                                                                                             |
-  | details  | <ul><li>min: 21 characters</li><li>mean: 75.57 characters</li><li>max: 122 characters</li></ul> | <ul><li>min: 86 characters</li><li>mean: 158.13 characters</li><li>max: 256 characters</li></ul> |
-* Samples:
-  | anchor                                                                                                               | positive                                                                                                                                                                               |
-  |:---------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-  | <code>kingsideAttack mate mateIn1 middlegame oneMove Horwitz Defense Horwitz Defense [UNK] variations</code>         | <code>themes kingsideAttack mate mateIn1 middlegame oneMove opening Horwitz Defense Horwitz Defense Other variations moves f7h8 g6g2 f7h8+g6g2</code>                                  |
-  | <code>backRankMate endgame mate mateIn2 short Kings Knight Opening Kings Knight Opening [UNK] [UNK]</code>           | <code>themes backRankMate endgame mate mateIn2 short opening Kings Knight Opening Kings Knight Opening Other variations moves c5d4 c3c8 g5d8 c8d8 c5d4+c3c8 c3c8+g5d8 g5d8+c8d8</code> |
-  | <code>kingsideAttack mate mateIn1 middlegame oneMove Sicilian Defense Sicilian Defense Paulsen-Basman Defense</code> | <code>themes kingsideAttack mate mateIn1 middlegame oneMove opening Sicilian Defense Sicilian Defense Paulsen-Basman Defense moves g3f3 c7h2 g3f3+c7h2</code>                          |
-* Loss: [<code>MatryoshkaLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#matryoshkaloss) with these parameters:
-  ```json
-  {
-      "loss": "MultipleNegativesRankingLoss",
-      "matryoshka_dims": [
-          512,
-          256,
-          128,
-          64,
-          32
-      ],
-      "matryoshka_weights": [
-          1,
-          1,
-          1,
-          1,
-          1
-      ],
-      "n_dims_per_step": -1
-  }
-  ```
-### Training Hyperparameters
-#### Non-Default Hyperparameters
-- `per_device_train_batch_size`: 4096
-- `num_train_epochs`: 20
-- `learning_rate`: 0.01
-- `warmup_steps`: 0.1
-- `weight_decay`: 0.01
-- `per_device_eval_batch_size`: 4096
-- `push_to_hub`: True
-- `hub_model_id`: oneryalcin/static-embedding-chess
-- `load_best_model_at_end`: True
-- `seed`: 12
-#### All Hyperparameters
-<details><summary>Click to expand</summary>
-- `per_device_train_batch_size`: 4096
-- `num_train_epochs`: 20
-- `max_steps`: -1
-- `learning_rate`: 0.01
-- `lr_scheduler_type`: linear
-- `lr_scheduler_kwargs`: None
-- `warmup_steps`: 0.1
-- `optim`: adamw_torch_fused
-- `optim_args`: None
-- `weight_decay`: 0.01
-- `adam_beta1`: 0.9
-- `adam_beta2`: 0.999
-- `adam_epsilon`: 1e-08
-- `optim_target_modules`: None
-- `gradient_accumulation_steps`: 1
-- `average_tokens_across_devices`: True
-- `max_grad_norm`: 1.0
-- `label_smoothing_factor`: 0.0
-- `bf16`: False
-- `fp16`: False
-- `bf16_full_eval`: False
-- `fp16_full_eval`: False
-- `tf32`: None
-- `gradient_checkpointing`: False
-- `gradient_checkpointing_kwargs`: None
-- `torch_compile`: False
-- `torch_compile_backend`: None
-- `torch_compile_mode`: None
-- `use_liger_kernel`: False
-- `liger_kernel_config`: None
-- `use_cache`: False
-- `neftune_noise_alpha`: None
-- `torch_empty_cache_steps`: None
-- `auto_find_batch_size`: False
-- `log_on_each_node`: True
-- `logging_nan_inf_filter`: True
-- `include_num_input_tokens_seen`: no
-- `log_level`: passive
-- `log_level_replica`: warning
-- `disable_tqdm`: False
-- `project`: huggingface
-- `trackio_space_id`: None
-- `trackio_bucket_id`: None
-- `trackio_static_space_id`: None
-- `per_device_eval_batch_size`: 4096
-- `prediction_loss_only`: True
-- `eval_on_start`: False
-- `eval_do_concat_batches`: True
-- `eval_use_gather_object`: False
-- `eval_accumulation_steps`: None
-- `include_for_metrics`: []
-- `batch_eval_metrics`: False
-- `save_only_model`: False
-- `save_on_each_node`: False
-- `enable_jit_checkpoint`: False
-- `push_to_hub`: True
-- `hub_private_repo`: None
-- `hub_model_id`: oneryalcin/static-embedding-chess
-- `hub_strategy`: every_save
-- `hub_always_push`: False
-- `hub_revision`: None
-- `load_best_model_at_end`: True
-- `ignore_data_skip`: False
-- `restore_callback_states_from_checkpoint`: False
-- `full_determinism`: False
-- `seed`: 12
-- `data_seed`: None
-- `use_cpu`: False
-- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
-- `parallelism_config`: None
-- `dataloader_drop_last`: False
-- `dataloader_num_workers`: 0
-- `dataloader_pin_memory`: True
-- `dataloader_persistent_workers`: False
-- `dataloader_prefetch_factor`: None
-- `remove_unused_columns`: True
-- `label_names`: None
-- `train_sampling_strategy`: random
-- `length_column_name`: length
-- `ddp_find_unused_parameters`: None
-- `ddp_bucket_cap_mb`: None
-- `ddp_broadcast_buffers`: False
-- `ddp_static_graph`: None
-- `ddp_backend`: None
-- `ddp_timeout`: 1800
-- `fsdp`: []
-- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
-- `deepspeed`: None
-- `debug`: []
-- `skip_memory_metrics`: True
-- `do_predict`: False
-- `resume_from_checkpoint`: None
-- `warmup_ratio`: None
-- `local_rank`: -1
-- `prompts`: None
-- `batch_sampler`: batch_sampler
-- `multi_dataset_batch_sampler`: proportional
-- `router_mapping`: {}
-- `learning_rate_mapping`: {}
-</details>
-### Training Logs
-| Epoch  | Step | Training Loss | chess-ir_cosine_ndcg@10 | chess-ir-tokens_cosine_ndcg@10 |
-|:------:|:----:|:-------------:|:-----------------------:|:------------------------------:|
-| -1     | -1   | -             | 0.0123                  | 0.0561                         |
-| 0.0025 | 1    | 27.3123       | -                       | -                              |
-| 0.2020 | 80   | 26.3304       | -                       | -                              |
-| 0.4040 | 160  | 22.2114       | -                       | -                              |
-| 0.6061 | 240  | 17.4522       | -                       | -                              |
-| 0.8081 | 320  | 12.8864       | -                       | -                              |
-| 1.0    | 396  | -             | 0.0800                  | 0.1181                         |
-| 1.0101 | 400  | 9.1439        | -                       | -                              |
-| 1.2121 | 480  | 6.5434        | -                       | -                              |
-| 1.4141 | 560  | 4.9138        | -                       | -                              |
-| 1.6162 | 640  | 3.9819        | -                       | -                              |
-| 1.8182 | 720  | 3.4584        | -                       | -                              |
-| 2.0    | 792  | -             | 0.0505                  | 0.0938                         |
-| 2.0202 | 800  | 3.1303        | -                       | -                              |
-| 2.2222 | 880  | 2.9652        | -                       | -                              |
-| 2.4242 | 960  | 2.8584        | -                       | -                              |
-| 2.6263 | 1040 | 2.7907        | -                       | -                              |
-| 2.8283 | 1120 | 2.7475        | -                       | -                              |
-| 3.0    | 1188 | -             | 0.0251                  | 0.0830                         |
-| 3.0303 | 1200 | 2.7031        | -                       | -                              |
-| 3.2323 | 1280 | 2.6927        | -                       | -                              |
-| 3.4343 | 1360 | 2.6516        | -                       | -                              |
-| 3.6364 | 1440 | 2.6441        | -                       | -                              |
-| 3.8384 | 1520 | 2.6202        | -                       | -                              |
-| 4.0    | 1584 | -             | 0.0168                  | 0.0672                         |
-### Training Time
-- **Training**: 4.1 minutes
-- **Evaluation**: 0.2 seconds
-- **Total**: 4.1 minutes
-### Framework Versions
-- Python: 3.12.10
-- Sentence Transformers: 5.5.0
-- Transformers: 5.8.0
-- PyTorch: 2.11.0
-- Accelerate: 1.13.0
-- Datasets: 4.8.5
-- Tokenizers: 0.22.2
-## Citation
-### BibTeX
-#### Sentence Transformers
-```bibtex
-@inproceedings{reimers-2019-sentence-bert,
-    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
-    author = "Reimers, Nils and Gurevych, Iryna",
-    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
-    month = "11",
-    year = "2019",
-    publisher = "Association for Computational Linguistics",
-    url = "https://arxiv.org/abs/1908.10084",
-}
-```
-#### MatryoshkaLoss
-```bibtex
-@misc{kusupati2024matryoshka,
-    title={Matryoshka Representation Learning},
-    author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
-    year={2024},
-    eprint={2205.13147},
-    archivePrefix={arXiv},
-    primaryClass={cs.LG}
-}
 ```
-#### MultipleNegativesRankingLoss
-```bibtex
-@misc{oord2019representationlearningcontrastivepredictive,
-      title={Representation Learning with Contrastive Predictive Coding},
-      author={Aaron van den Oord and Yazhe Li and Oriol Vinyals},
-      year={2019},
-      eprint={1807.03748},
-      archivePrefix={arXiv},
-      primaryClass={cs.LG},
-      url={https://arxiv.org/abs/1807.03748},
-}
 ```
-<!--
-## Glossary
-*Clearly define terms in order to be accessible across audiences.*
--->
-<!--
-## Model Card Authors
-*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
--->
-<!--
-## Model Card Contact
-*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
--->

 ---
+language: en
 license: apache-2.0
+library_name: sentence-transformers
+pipeline_tag: sentence-similarity
 tags:
 - sentence-transformers
+- static-embedding
+- chess
+- retrieval
+- exploratory
+datasets:
+- Lichess/chess-puzzles
+- Lichess/chess-openings
+---
+# Chess Static Embedding (v4-C2) — Open Exploration
+A 4M-parameter `StaticEmbedding` model for chess content retrieval, plus the
+full **open-science methodology document** describing what we tried, what
+worked, what failed, and why.
+This repo is **exploratory experimental work**, published as-is. The model is
+genuinely useful (NDCG@10 = 0.12 on a compositional held-out eval, 50× smaller
+than typical retrieval encoders) but the bigger contribution is the
+**methodology narrative** below — particularly the *LLM-bridge* and
+*deterministic-bridge* findings.
 ---
+## Quick start
+```python
+from sentence_transformers import SentenceTransformer
+model = SentenceTransformer("oneryalcin/static-embedding-chess")
+query = "fork endgame short"
+docs = [
+    "themes crushing endgame fork short opening Sicilian Defense moves f2g3 e6e7",
+    "themes mate mateIn1 oneMove opening Caro-Kann moves d2d4 e7e5",
+]
+sims = model.encode(query) @ model.encode(docs).T
+```
+Static embedding: lookup table + average. Sub-millisecond CPU inference. No GPU
+required.
+---
+## Headline result
+| Variant | NDCG@10 | vs v3 baseline |
+|---------|---------|---------------|
+| v3 baseline (random init + MNRL) | 0.0801 | — |
+| v4-A hard-neg only | 0.1000 | +25% |
+| v4-B theme distill only | 0.0112 | -86% (regression — see methodology) |
+| v4-C multitask 500× | 0.1154 | +44% |
+| **v4-C2 multitask 5000× (this model)** | **0.1202** | **+50%** |
+Held-out eval: 200 unseen anchor combinations × 600-doc corpus. Compositional
+generalization — the model never saw these exact theme combinations during
+training, only the individual tokens in other combos.
+For **production-ready** chess search, see the **two-stage architecture** below
+(static + BM25 over English-bridged docs) that delivers NDCG@10 = 0.59-0.87.
+---
+## What's in this repo
 ```
+model.safetensors                     # 4M-param StaticEmbedding weights (~9MB)
+chess_tokenizer.json                  # WordLevel chess tokenizer (4,336 tokens)
+tokenizer.json                        # Same, in HF format for ST loading
+config_sentence_transformers.json     # Module config
+modules.json                          # Module pipeline
+data/
+├── theme_definitions.parquet         # 73 chess themes + LLM-generated English defs + MPNet embeddings (the LLM-bridge teacher signal)
+├── hard_negatives_chess.parquet      # 1.6M (anchor, positive, negative) triplets, chess-token format
+└── hard_negatives_english.parquet    # Same, English-bridged via deterministic conversion
+scripts/
+├── train_chess_static.py             # Main training entrypoint (multi-version, env-flag controlled)
+├── train_chess_multitask.py          # The v4-C2 winning recipe (theme distill + hard-neg MNRL)
+├── convert_to_english.py             # Deterministic chess→English (no LLM needed; python-chess + regex)
+├── mine_hard_negs_v2.py              # Memory-bounded custom hard-negative miner
+├── generate_theme_defs.py            # LLM-bridge: DeepSeek-v4-flash writes chess concept definitions
+├── compare_variants.py               # Side-by-side eval framework across all variants
+└── diag_ce_vs_bm25.py                # The critical "is your CE really helping" diagnostic
 ```
+---
+## Methodology — the full experimental journey
+This was 36+ hours of iterative exploration. The model is the small visible
+output; the methodology is the bigger contribution.
+### 1. Problem and approach
+**Task:** Free-text search over a chess puzzle corpus. User types something
+like `"fork endgame short"` and gets matching Lichess puzzles.
+**Why static embedding:** Tom Aarsen's
+[static-retrieval-mrl-en-v1](https://huggingface.co/sentence-transformers/static-retrieval-mrl-en-v1)
+showed StaticEmbedding can be a useful retrieval primitive with the right
+training. We adapted the recipe for a chess-specific domain with a custom
+WordLevel tokenizer so chess tokens (UCI moves, theme names, ECO codes) are
+first-class.
+**Data:** Lichess/chess-puzzles (5.8M puzzles, CC0) + Lichess/chess-openings
+(3.6K openings, CC0).
+### 2. Eval design — the hardest part
+**Initial mistake:** First eval used top-200 most-common theme strings as
+queries. The model had seen each of these ~50,000 times in training. Baseline
+NDCG@10 was inflated to 0.81 by lexical overlap before any training. Useless.
+**Fixed eval (used throughout):** *Compositional held-out anchors*. Pick 200
+theme-combination strings that appear exactly 3 times in the data
+(rare-but-multi-relevant), remove all matching pairs from train, use those rare
+combos as queries. Tests whether the model can compose meaning from individual
+theme tokens it learned, without having seen the specific combination.
+This is harsh — the model can never "memorize" the eval queries — and that's
+the point. Random-init baseline drops to NDCG@10 ≈ 0.01.
+### 3. Phase 1 — diagnostic of the v3 model (0.08 NDCG@10)
+A working baseline existed. Question: **why isn't it better?**
+Token-similarity probe revealed the core issue:
+| Pair | v3 cosine similarity |
+|---|---|
+| `fork` ↔ `pin` | +0.01 |
+| `fork` ↔ `skewer` | -0.12 |
+| `endgame` ↔ `middlegame` | -0.30 |
+**Token embeddings were essentially orthogonal.** The model learned per-token
+mappings to chess-content clusters but no relationships *between* tokens.
+Compositional generalization (the eval task) requires those relationships.
+Also discovered: 51% of held-out queries returned zero relevant documents in
+the top-10 (median NDCG@10 = 0). Bimodal failure pattern.
+Also discovered: model beat BM25 by 7.5× (0.08 vs 0.01), confirming it does
+real semantic work beyond keyword match.
+### 4. Phase 2 — distillation from raw MPNet (DEAD END)
+Hypothesis: distill student token embeddings to match teacher (MPNet)
+embeddings. Teacher knows English; should know that `fork ≈ pin`.
+**Result:** REGRESSION. Why? **MPNet itself scores NDCG@10 = 0.0094 on our
+eval.** 95.5% of queries get zero relevant documents in the top-10. MPNet
+doesn't know chess: UCI moves are character soup to its WordPiece tokenizer.
+**You can't distill what the teacher doesn't know.** This was the first key
+lesson.
+### 5. Phase 3 — LLM-bridge for theme distillation (BREAKTHROUGH)
+Key insight: an LLM can read both chess (in camelCase) AND English. Use it as
+a **translator** to put chess concepts into language MPNet *can* understand
+semantically.
+**Steps:**
+1. DeepSeek-v4-flash writes English definitions for 73 Lichess themes:
+   - `fork` → "A tactical motif where a single piece attacks two or more
+     enemy pieces simultaneously, forcing a material gain."
+2. MPNet embeds the *English definitions* (it knows English fluently).
+3. Distill the student's per-token embedding to match the definition embedding.
+After step 2 alone, MPNet's `fork ↔ skewer` similarity jumps from 0.39 (raw
+camelCase) to **0.87** (via definitions). Real semantic structure.
+Combined with hard-negative MNRL training (v4-C2): **NDCG@10 = 0.1202**, +50%
+over v3.
+Cost: 73 themes × DeepSeek API ≈ $0.01 + ~1 minute generation.
+This is the **LLM-bridge** pattern: when system A doesn't speak system B's
+language, use an LLM as a translator. The LLM is one-shot work, not part of
+inference.
+### 6. Phase 4 — hard-negative mining
+Used the v3 model to mine confusable documents per anchor. We wrote a custom
+memory-bounded miner because the built-in sentence-transformers miner runs out
+of memory on an M4 at 327k unique anchors × 327k positives. See
+`scripts/mine_hard_negs_v2.py`.
+1.6M triplets mined. Positive-negative margin: 0.135 mean (good signal for
+training).
+### 7. Phase 5 — multi-task training (v4-C2 winner)
+Multi-dataset trainer combining:
+- **Chess triplets** (1.6M, MNRL loss): teaches content associations
+- **Theme distillation** (73 themes × 5000 replicas via `EmbedDistillLoss`):
+  injects semantic structure between tokens
+With proportional sampling, each theme is seen once per replica per epoch
+(500× or 5000×, depending on the run), whereas each chess triplet is seen
+once. Theme distillation oversampling matters:
+| Theme replicas | NDCG@10 |
+|---|---|
+| 500× | 0.1154 |
+| 5000× | 0.1202 |
+### 8. Phase 6 — cross-encoder reranker attempts (ALL FAILED)
+Tried three variants:
+- MS-MARCO MiniLM (English-pretrained, 22M params) on chess-format docs
+- Same, with theme echo stripped from training docs
+- Fresh-init tiny BERT (5M params) with our chess tokenizer
+**All regressed below static-only.** Diagnosis: trained CEs operate at
+random-ordering level on the eval. Inspection of training predictions showed
+the trained CE got pair-ordering wrong 2/3 of the time on sample inputs.
+**Root cause:** documents are UCI move sequences (`f2g3 e6e7 ...`). To
+English-pretrained CE tokenizers these are character fragments with no
+meaningful representation. The CE can't learn what makes a "fork-y" move
+sequence from sparse labels alone. Static embedding worked because token-bag
+averaging is sample-efficient (each `fork` token gets gradients from many
+examples → converges to a useful cluster); the CE's pair-level processing is
+hungrier for signal not available in our data.
+### 9. Phase 7 — deterministic English bridge for documents (REVEALED THE TRUTH)
+Insight: we don't need an LLM to translate documents either. `python-chess`
+deterministically converts UCI → SAN with board context (`f2g3` → `Bxg3`).
+Regex decamelizes themes (`backRankMate` → `back rank mate`). Free, instant,
+reproducible. The `convert_to_english.py` script does the full 5.8M corpus in
+~3 minutes.
+Re-ran reranker training on English-bridged docs. **Untrained MS-MARCO CE hit
+the oracle ceiling (0.5947 at top-100).** Massive jump.
+But: ran a final diagnostic comparing trained CE vs **BM25** over the same
+English docs. They were *identical*:
+| K | Static | +CE | +BM25 | Oracle |
+|---|---|---|---|---|
+| 100 | 0.1202 | **0.5947** | **0.5947** | 0.5947 |
+| 200 | 0.1202 | 0.7706 | 0.7706 | 0.7706 |
+| 300 | 0.1202 | 0.8718 | 0.8718 | 0.8718 |
+The "LLM-bridge effect" we observed was **lexical match enabled by the
+English conversion**, not semantic CE understanding. BM25 over English docs
+does the same job.
+**Stress test**: stripped theme tokens from English docs too. Forces the CE
+to genuinely understand "fork query ↔ fork-pattern moves":
+| K | Static | +CE | +BM25 | Oracle |
+|---|---|---|---|---|
+| 100 | 0.1202 | 0.0726 | 0.4327 | 0.5947 |
+| 300 | 0.1202 | 0.0706 | 0.6252 | 0.8718 |
+CE drops below static (negative transfer — memorized "theme overlap = match"
+during training; can't generalize). BM25 still partially works via opening
+name overlap.
+**True semantic CE chess understanding is not achievable** with 22M-param
+English-pretrained models on our training signal.
+---
+## Production recommendation
+For a real chess search system, the winning architecture is:
 ```
+Stage 1: Static embedding (this model)
+  - Encode chess-format corpus (4M params, ~9MB)
+  - Sub-millisecond CPU inference
+  - Retrieve top-200 candidates via cosine similarity
+  - Recall@200 = 93.5%
+Stage 2: BM25 over English-bridged corpus
+  - python-chess + regex (one-time, $0)
+  - Index the English versions of all docs
+  - Rerank top-200 candidates to top-10
+  - NDCG@10 ≈ 0.55-0.62
+```
+**Total: <10ms/query, $0 inference cost, no GPU.**
+The cross-encoder is only worth adding if you have a GPU available AND you
+either train it on a fundamentally different signal (e.g., human-annotated
+relevance or chess-engine strategic descriptions) or use a much larger model
+with chess in its pretraining data.
+---
+## Key learnings worth keeping (general, not chess-specific)
+1. **Eval methodology dominates.** Most time spent debugging the "model isn't
+   improving" turned out to be eval issues, not training issues. Compositional
+   held-out > top-frequent-string eval. Strip lexical leakage between query
+   and corpus when testing generalization.
+2. **Sentence-transformers' `NoDuplicatesBatchSampler` is O(epoch-progress)
+   per batch.** It walks a linked-list of deferred conflicts. For datasets
+   with limited unique anchors (our ~327k anchors over 5.8M pairs), this
+   creates monotonic step-time blowup. Switch to `BatchSamplers.BATCH_SAMPLER`.
+3. **`CachedMultipleNegativesRankingLoss` is incompatible with
+   `StaticEmbedding`** — explicit error. Token-bag has no transformer
+   activations to GradCache through.
+4. **Trackio crashes on first checkpoint push** with sentence-transformers
+   due to an empty `router_mapping` struct that pyarrow can't write. Use
+   `report_to="none"`.
+5. **The "LLM-bridge" pattern**: when system A speaks language X and system
+   B speaks language Y, use an LLM to translate B→X once (not at inference).
+   For chess: LLM writes English definitions of themes → general English
+   teacher can now embed them → distill into chess-specific model.
+6. **Deterministic translation often suffices** for the bridge. Don't pay LLM
+   API costs if `python-chess` and regex can produce the same English text.
+   Reserve LLMs for the parts that genuinely need understanding (concept
+   definitions, paraphrases, strategic narratives).
+7. **Compare your trained model against BM25** on the actual eval. If they
+   tie, your model is doing keyword matching, not semantic work. Diagnostic
+   in `scripts/diag_ce_vs_bm25.py`.
+8. **Modal `.spawn()` only survives entrypoint exit on deployed apps.** For
+   ephemeral `modal run`, the app dies when entrypoint returns — including
+   spawned calls. Use `.remote()` with `--detach`.
+9. **Apple Silicon M4 is competitive with cloud A100** for tiny models. Token
+   bag + small batch easily hits 17 it/s on MPS. GPU cost is wasted unless
+   the model is compute-bound.
+---
+## Reproducibility
+Clone this repo, then with sentence-transformers v5.5+:
+```bash
+# Inspect the recipe
+cat scripts/train_chess_multitask.py
+# Reproduce the data prep (one-time, ~10 min)
+python scripts/generate_theme_defs.py        # Needs DeepSeek API key in macOS keychain
+python scripts/convert_to_english.py         # python-chess + regex, $0
+python scripts/mine_hard_negs_v2.py          # ~10 min on M4 MPS
+# Reproduce the winning training
+python scripts/train_chess_multitask.py      # ~5 min on M4 MPS
+# Verify
+python scripts/compare_variants.py           # Side-by-side eval table
+python scripts/diag_ce_vs_bm25.py            # Is the rerank doing real work?
 ```
+---
+## Limitations and honest caveats
+- **NDCG@10 = 0.12 is modest in absolute terms.** Industry retrieval encoders
+  reach 0.4-0.6 on similar tasks. This model is competitive on size/speed,
+  not absolute quality.
+- **The two-stage architecture (NDCG@10 ≈ 0.6) is the production answer**
+  but relies on BM25 over English-converted docs, not on the cross-encoder.
+- **Cross-encoder didn't add semantic value** in our setup; results came from
+  lexical match enabled by the English bridge.
+- **Bimodal failure**: even the best model misses half of queries entirely
+  (median NDCG@10 = 0). The architecture has fundamental limits for chess
+  reasoning.
+- **English-pretrained models don't know chess.** Tried MPNet, MiniLM,
+  Jina-v5; all fail on UCI moves. Bigger English models won't fix this; only
+  chess-pretrained or deterministic conversion helps.
+- **No engine evaluation.** "Is this puzzle a fork?" was determined by
+  Lichess theme tags; we never ran a chess engine. A real production system
+  would integrate Stockfish for ground-truth tactical pattern detection.
+---
+## What this is NOT
+- Not a chess engine. See [`thomasahle/fastchess`](https://github.com/thomasahle/fastchess)
+  for FastText-based move prediction (closest related work).
+- Not a position similarity model. See `chess2vec` lineage on GitHub for
+  position-level embeddings.
+- Not a state-of-the-art retrieval model. It's a tiny first-stage filter
+  designed to pair with a reranker.
+---
+## License
+Apache 2.0 (model + scripts). Data derived from Lichess/chess-puzzles, which is
+CC0 — derived parquets in this repo are also released under CC0.
+## Acknowledgments
+- [Lichess](https://lichess.org) for releasing puzzles + openings under CC0.
+- [Tom Aarsen](https://huggingface.co/tomaarsen) for the
+  `train-sentence-transformers` skill and `StaticEmbedding` recipe.
+- DeepSeek for the v4-flash API used for theme definitions.
+## Citation
+If this work is useful, please link to this repo. The scientific findings
+(particularly the deterministic-bridge insight that BM25 over English-bridged
+docs equals a trained cross-encoder for this task) are the main contribution.

data/hard_negatives_chess.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3dc7f1bfcb497ba5f5e61c1b9fffe76ca52825758454c65b3a2dc2010e3e68bb
+size 161012028

data/hard_negatives_english.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:50b28d80013527fcb6f27554ee0cda91116e4b3967a74472320a089a7b1fa873
+size 111083130

data/theme_definitions.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6f70e1629bfda29faedfca1474d2195bd527590eeb48b628fd862da12a2070f3
+size 456977

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:85ba258107839fe02a04763d71797aeb5f4fa19f2a8712e73a0ed0e38b4c15ff
 size 8880224

 version https://git-lfs.github.com/spec/v1
+oid sha256:6fa4d9dd8e62c4ef6d7f288ea1822f30d5f75f3a5ab178a923c4330e3b09652d
 size 8880224

scripts/compare_variants.py ADDED Viewed

	@@ -0,0 +1,175 @@

+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "sentence-transformers[train]>=5.5.0",
+#     "datasets>=2.19.0",
+#     "numpy",
+# ]
+# ///
+"""Side-by-side comparison of all chess static-embedding variants on the same
+held-out compositional eval. Produces the final table for NOTES.md.
+"""
+from __future__ import annotations
+import os
+import sys
+from collections import defaultdict
+import numpy as np
+from datasets import load_dataset
+from sentence_transformers import SentenceTransformer
+sys.stdout.reconfigure(line_buffering=True)
+VARIANTS = [
+    ("v3 baseline",            "models/static-embedding-chess/final"),
+    ("v4-A hard-neg only",     "models/static-embedding-chess-triplet/final"),
+    ("v4-B theme distill",     "models/static-embedding-chess-theme-only/final"),
+    ("v4-C multitask 500x",    "models/static-embedding-chess-multitask-500x/final"),
+    ("v4-C2 multitask 5000x",  "models/static-embedding-chess-multitask-5000x/final"),
+]
+HELDOUT_FREQ_MIN = 3
+HELDOUT_FREQ_MAX = 30
+EVAL_QUERIES = 200
+def _join_tags(tags):
+    return " ".join(t.replace("_", " ") for t in tags) if tags else ""
+def _bigram_token_str(moves):
+    toks = moves.split()
+    if len(toks) < 2:
+        return moves
+    return moves + " " + " ".join(f"{a}+{b}" for a, b in zip(toks, toks[1:]))
def build_puzzle_pairs(batch):
    """Turn a batch of puzzle columns into (anchor, positive) text pairs.

    The anchor is the themes followed by the opening tags, when present. The
    positive is ``themes <themes> [opening <opening>] moves <moves + bigrams>``.
    Rows without themes are dropped, so the result may be shorter than the
    input batch.
    """
    pairs = {"anchor": [], "positive": []}
    rows = zip(batch["Themes"], batch["OpeningTags"], batch["Moves"])
    for raw_themes, raw_opening, moves in rows:
        themes_txt = _join_tags(raw_themes)
        if not themes_txt:
            continue
        op_txt = _join_tags(raw_opening)
        segments = [f"themes {themes_txt}"]
        if op_txt:
            segments.append(f"opening {op_txt}")
        segments.append(f"moves {_bigram_token_str(moves)}")
        pairs["anchor"].append(f"{themes_txt} {op_txt}" if op_txt else themes_txt)
        pairs["positive"].append(" ".join(segments))
    return pairs
def strip_theme_echo(p):
    """Drop everything that precedes the ``moves`` section of a positive doc.

    The text from the first `` moves `` marker onwards is kept, without the
    marker's leading space. Input that lacks the marker is returned unchanged.
    """
    _head, marker, tail = p.partition(" moves ")
    if not marker:
        return p
    return "moves " + tail
def ndcg_at_k(scores, rel, k=10):
    """Binary-relevance NDCG@k.

    ``scores`` is an iterable of ``(doc_id, score)`` pairs and ``rel`` the
    collection of relevant doc ids. The ideal ranking places
    ``min(len(rel), k)`` relevant docs first. Returns 0.0 when ``rel`` is empty.
    """
    top = sorted(scores, key=lambda pair: -pair[1])[:k]
    dcg = 0.0
    for rank, (doc_id, _score) in enumerate(top):
        if doc_id in rel:
            dcg += 1.0 / np.log2(rank + 2)
    ideal_hits = min(len(rel), k)
    idcg = sum(1.0 / np.log2(rank + 2) for rank in range(ideal_hits))
    if idcg > 0:
        return dcg / idcg
    return 0.0
def main():
    """Evaluate every available variant on the held-out eval and print two tables.

    Steps: build (anchor, positive) pairs from the Lichess puzzles, select up
    to EVAL_QUERIES of the rarest anchors whose frequency lies in
    [HELDOUT_FREQ_MIN, HELDOUT_FREQ_MAX], score each variant found on disk by
    cosine-similarity NDCG@10 over the theme-stripped held-out corpus, then
    print the cosine similarity of a few theme-token pairs per variant.
    Variants whose model directory does not exist are skipped.
    """
    # === Token-similarity probe ===
    # Measures the orthogonal-tokens problem from Phase 1: do related themes
    # cluster in embedding space? Higher = more semantic structure.
    PROBES = [
        ("fork", "skewer"),       # tactical motifs (should be close)
        ("fork", "pin"),
        ("backRankMate", "smotheredMate"),  # mate patterns
        ("kingsideAttack", "queensideAttack"),
        ("endgame", "middlegame"),  # phases
        ("fork", "promotion"),      # unrelated (control)
    ]

    print("Loading + held-out selection...")
    puzzles = load_dataset("Lichess/chess-puzzles", split="train")
    pair_puzzles = puzzles.map(
        build_puzzle_pairs,
        batched=True, batch_size=20_000,
        remove_columns=puzzles.column_names,
        num_proc=4,
    )
    # Fetch each column once; indexing the dataset inside the comprehension
    # below would go back to the dataset for every held-out row.
    anchors = pair_puzzles["anchor"]
    positives = pair_puzzles["positive"]
    freq = defaultdict(int)
    for a in anchors:
        freq[a] += 1
    rare_pool = sorted(
        ((a, c) for a, c in freq.items() if HELDOUT_FREQ_MIN <= c <= HELDOUT_FREQ_MAX),
        key=lambda kv: kv[1],
    )
    heldout = {a for a, _ in rare_pool[:EVAL_QUERIES]}
    held_idx = [i for i, a in enumerate(anchors) if a in heldout]
    held_anchors = [anchors[i] for i in held_idx]
    corpus_texts = [strip_theme_echo(positives[i]) for i in held_idx]
    corpus_ids = [f"d{i}" for i in range(len(corpus_texts))]
    by_anchor = defaultdict(list)
    for i, a in enumerate(held_anchors):
        by_anchor[a].append(corpus_ids[i])
    queries = list(by_anchor.keys())
    print(f"  {len(queries)} queries, {len(corpus_texts)} corpus")

    results = []
    probe_sims = {}  # variant name -> one cosine similarity per PROBES entry
    for name, path in VARIANTS:
        if not os.path.exists(path):
            print(f"\nSKIPPING {name}: {path} not found")
            continue
        print(f"\n=== {name} ({path}) ===")
        m = SentenceTransformer(path)
        c = m.encode(corpus_texts, batch_size=128, convert_to_numpy=True, show_progress_bar=False)
        c = c / np.linalg.norm(c, axis=1, keepdims=True)
        q = m.encode(queries, batch_size=128, convert_to_numpy=True, show_progress_bar=False)
        q = q / np.linalg.norm(q, axis=1, keepdims=True)
        sims = q @ c.T
        ndcgs = []
        for qi, query in enumerate(queries):
            score_pairs = [(corpus_ids[ci], float(sims[qi, ci])) for ci in range(len(corpus_ids))]
            rel = set(by_anchor[query])
            ndcgs.append(ndcg_at_k(score_pairs, rel, k=10))
        ndcg = np.mean(ndcgs)
        median = np.median(ndcgs)
        zero = sum(1 for n in ndcgs if n == 0)
        results.append((name, ndcg, median, zero, len(ndcgs)))
        print(f"  NDCG@10 = {ndcg:.4f}  median = {median:.4f}  zero = {zero}/{len(ndcgs)}")

        # Compute the probe similarities while this model is loaded, so each
        # variant is loaded once rather than once per probe pair.
        variant_sims = []
        for a, b in PROBES:
            ea = m.encode([a], convert_to_numpy=True)[0]
            eb = m.encode([b], convert_to_numpy=True)[0]
            ea = ea / max(np.linalg.norm(ea), 1e-9)
            eb = eb / max(np.linalg.norm(eb), 1e-9)
            variant_sims.append(float(np.dot(ea, eb)))
        probe_sims[name] = variant_sims

    print("\n" + "=" * 70)
    print(f"{'Variant':<30} {'NDCG@10':>10} {'Median':>10} {'Zero/All':>15}")
    print("=" * 70)
    for name, ndcg, median, zero, total in results:
        print(f"{name:<30} {ndcg:>10.4f} {median:>10.4f} {zero:>7}/{total:<7}")
    print("=" * 70)

    print("\n=== Theme-token similarity (higher = more semantic clustering) ===")
    print(f"{'Pair':<40}", end="")
    for name in probe_sims:
        print(f" {name[:14]:>16}", end="")
    print()
    print("-" * 70)
    for row, (a, b) in enumerate(PROBES):
        line = f"{a} <-> {b}".ljust(40)
        for variant_sims in probe_sims.values():
            line += f" {variant_sims[row]:>+16.3f}"
        print(line)
# Single entry-point guard: a second copy would run the whole evaluation twice.
if __name__ == "__main__":
    main()

scripts/convert_to_english.py ADDED Viewed

	@@ -0,0 +1,216 @@

+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = ["chess", "datasets>=2.19", "tqdm"]
+# ///
+"""Deterministic chess→English converter for puzzles.
+Generates a standardized English-readable description of each puzzle WITHOUT
+any LLM. Uses python-chess for UCI→SAN conversion (with board context), regex
+for decamelizing themes, and a fixed template.
+For each puzzle, produces a doc like:
+    "White to move. Short middlegame puzzle with crushing fork and hanging
+    piece motifs. Opening: King's Pawn Game. Moves: Bxg3 Rxe7 Qb1+ Nc1 Qxc1+
+    Qxc1"
+Pretrained English cross-encoders have seen SAN notation in chess web content
+during pretraining, so this doc is semantically meaningful to them — unlike
+the raw UCI form (`f2g3`) which gets fragmented into character pieces.
+Output: parquet at models/puzzles_english.parquet with columns:
+    PuzzleId, anchor (original themes+opening str), english_doc
+Run:
+    SMOKE_TEST=1 uv run --exclude-newer=2026-05-12 convert_to_english.py
+    uv run --exclude-newer=2026-05-12 convert_to_english.py
+"""
+from __future__ import annotations
+import os
+import re
+import sys
+import chess
+from datasets import Dataset, load_dataset
+from tqdm import tqdm
+sys.stdout.reconfigure(line_buffering=True)
+OUTPUT_PATH = "models/puzzles_english.parquet"
+SMOKE_TEST = os.environ.get("SMOKE_TEST") == "1"
+# Length tag mapping
+LENGTH_MAP = {
+    "oneMove": "single-move",
+    "short": "short",
+    "long": "long",
+    "veryLong": "very long",
+}
+PHASE_TAGS = {"opening", "middlegame", "endgame"}
+LENGTH_TAGS = set(LENGTH_MAP.keys())
+# Anything matching `mateInN`, `mateIn1`, etc.
+MATE_IN_PATTERN = re.compile(r"^mateIn(\d+)$")
+# Specific mate-pattern names (their English form is just decamel)
+# camelCase → "camel case" via regex
# Word boundaries: lower→Upper ("backRank"), Upper→Upper+lower (end of an
# acronym run), and digit→Upper so square lists like "F2F7" split into "F2 F7".
_CAMEL_BOUNDARY = re.compile(
    r"(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|(?<=[0-9])(?=[A-Z])"
)
def decamelize(tag: str) -> str:
    """Split a camelCase tag into lowercase, space-separated words.

    `backRankMate` → 'back rank mate'. `attackingF2F7` → 'attacking f2 f7'.
    A tag with no internal boundary is simply lowercased.
    """
    return _CAMEL_BOUNDARY.sub(" ", tag).lower()
def themes_to_english(themes: list[str]) -> tuple[str, str, str, list[str]]:
    """Split Lichess theme tags into structural parts and motif phrases.

    Returns ``(phase, length, "", motifs)``:
      * ``phase``: the phase tag found in PHASE_TAGS (last one wins), or "".
      * ``length``: the LENGTH_MAP phrase for a length tag (last one wins), or "".
      * third slot: always "" (the side to move is derived from the FEN by the
        callers, not from the themes).
      * ``motifs``: every other tag, decamelized, in input order; a ``mateInN``
        tag is folded in last as the phrase "mate in N".

    Empty or ``None`` input yields ``("", "", "", [])``.
    """
    phase, length = "", ""
    mate_in = None
    motifs: list[str] = []
    for tag in themes or ():
        if tag in PHASE_TAGS:
            phase = tag
            continue
        if tag in LENGTH_TAGS:
            length = LENGTH_MAP[tag]
            continue
        mate_match = MATE_IN_PATTERN.match(tag)
        if mate_match:
            mate_in = int(mate_match.group(1))
        else:
            motifs.append(decamelize(tag))
    # Mate-in-N is appended after the other motifs as a natural-language phrase.
    if mate_in is not None:
        motifs.append(f"mate in {mate_in}")
    return phase, length, "", motifs
def opening_tags_to_english(opening_tags: list[str]) -> str:
    """Render the most specific opening tag as space-separated words.

    `['Kings_Pawn_Game', 'Kings_Pawn_Game_Leonardis_Variation']` →
    'Kings Pawn Game Leonardis Variation'. Only underscores are replaced;
    apostrophes are not restored. The longest tag is treated as the most
    specific one (first wins on ties). Empty or ``None`` input yields "".
    """
    if opening_tags:
        most_specific = max(opening_tags, key=len)
        return most_specific.replace("_", " ")
    return ""
def uci_to_san_sequence(fen: str, uci_moves: str) -> str:
    """Render a space-separated UCI move list as SAN, played out from ``fen``.

    Moves are applied to the board one by one so each SAN string is produced
    in the position where the move is made. Conversion stops at the first move
    that raises while being parsed, rendered or pushed; the moves converted so
    far are still returned. If anything outside that per-move step raises
    (for example building the board), the raw ``uci_moves`` input is returned.
    """
    try:
        board = chess.Board(fen)
        rendered = []
        for token in uci_moves.split():
            try:
                step = chess.Move.from_uci(token)
                rendered.append(board.san(step))
                board.push(step)
            except Exception:
                # Invalid move: keep what was converted and drop the rest.
                break
    except Exception:
        return uci_moves  # fall back to raw UCI
    return " ".join(rendered)
def side_to_move(fen: str) -> str:
    """Return "White" when the FEN's second field is ``w``, otherwise "Black".

    A FEN with fewer than two whitespace-separated fields also yields "Black".
    """
    fields = fen.split()
    white_to_move = len(fields) >= 2 and fields[1] == "w"
    return "White" if white_to_move else "Black"
def build_english_doc(row: dict) -> str:
    """Build the deterministic English document for one Lichess puzzle row.

    Layout: ``"<Side> to move. <Length> <phase> puzzle[ with <motifs> motifs].
    [Opening: <opening>.] [Moves: <SAN moves>]"``. Motifs are comma-separated.
    The opening and moves sentences are omitted when empty. Reads the ``FEN``,
    ``Themes``, ``Moves`` and (optional) ``OpeningTags`` keys of ``row``.
    """
    fen = row["FEN"]
    side = side_to_move(fen)
    phase, length, _, motifs = themes_to_english(row["Themes"] or [])
    opening = opening_tags_to_english(row.get("OpeningTags") or [])
    san = uci_to_san_sequence(fen, row["Moves"])

    # e.g. "short middlegame puzzle with crushing, fork motifs"
    descriptor = " ".join(word for word in (length, phase, "puzzle") if word)
    if motifs:
        descriptor += f" with {', '.join(motifs)} motifs"

    sentences = [f"{side} to move.", descriptor.capitalize() + "."]
    if opening:
        sentences.append(f"Opening: {opening}.")
    if san:
        sentences.append(f"Moves: {san}")
    return " ".join(sentences)
def build_english_anchor(row: dict) -> str:
    """Build the English query text (anchor) for one puzzle row.

    Joins, in this order and separated by spaces, whichever of these are
    non-empty: the comma-separated motifs, the length phrase, the phase, and
    the opening name. Used as the query side for retrieval/reranker training.
    """
    phase, length, _, motifs = themes_to_english(row["Themes"] or [])
    opening = opening_tags_to_english(row.get("OpeningTags") or [])
    candidates = (", ".join(motifs), length, phase, opening)
    return " ".join(piece for piece in candidates if piece).strip()
def main():
    """Convert the Lichess puzzle set to English and save it as parquet.

    Loads the dataset (first 2,000 rows when SMOKE_TEST is set), maps every
    row that has themes to ``PuzzleId`` / ``anchor_en`` / ``doc_en``, prints a
    few sample conversions and writes the result to OUTPUT_PATH.
    """
    print("Loading puzzles...")
    puzzles = load_dataset("Lichess/chess-puzzles", split="train")
    if SMOKE_TEST:
        puzzles = puzzles.select(range(2_000))
    print(f"  {len(puzzles):,} rows")
    print("Converting to English (deterministic)...")

    def proc(batch):
        # Re-assemble per-row dicts from the column-oriented batch.
        ids, anchors, docs = [], [], []
        columns = list(batch)
        for values in zip(*(batch[col] for col in columns)):
            r = dict(zip(columns, values))
            if not r["Themes"]:
                continue
            ids.append(r["PuzzleId"])
            anchors.append(build_english_anchor(r))
            docs.append(build_english_doc(r))
        return {"PuzzleId": ids, "anchor_en": anchors, "doc_en": docs}

    out = puzzles.map(
        proc, batched=True, batch_size=10_000,
        remove_columns=puzzles.column_names,
        num_proc=4,
    )
    print(f"  produced {len(out):,} English-converted rows")
    print("\n=== Sample conversions ===")
    for i in (0, 100, 1000):
        # Rows without themes are dropped, so the output can be shorter than
        # the sample indices assume; skip the ones that do not exist.
        if i >= len(out):
            continue
        r = out[i]
        print(f"\nPuzzleId: {r['PuzzleId']}")
        print(f"  anchor: {r['anchor_en']!r}")
        print(f"  doc:    {r['doc_en'][:200]!r}")
    # Ensure the output directory exists before writing.
    os.makedirs(os.path.dirname(OUTPUT_PATH) or ".", exist_ok=True)
    out.to_parquet(OUTPUT_PATH)
    print(f"\nSaved to {OUTPUT_PATH} ({os.path.getsize(OUTPUT_PATH) / 1e6:.1f} MB)")
+if __name__ == "__main__":
+    main()

scripts/diag_ce_vs_bm25.py ADDED Viewed

	@@ -0,0 +1,145 @@

+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = ["sentence-transformers[train]>=5.5.0", "datasets>=2.19", "numpy", "rank-bm25", "chess"]
+# ///
+"""Compare trained CE vs BM25 on English-bridged docs, plus top-K sweep.
+Tests:
+1. Is the 0.59 CE result just lexical match that BM25 could also do?
+2. Does increasing K to 200/300 push past oracle 0.59 → 0.77 → 0.87?
+"""
+import os
+import sys
+from collections import defaultdict
+import numpy as np
+from datasets import Dataset, load_dataset
+from rank_bm25 import BM25Okapi
+from sentence_transformers import CrossEncoder, SentenceTransformer
+sys.stdout.reconfigure(line_buffering=True)
+sys.path.insert(0, os.path.dirname(__file__))
+from convert_to_english import build_english_anchor, build_english_doc
+HELDOUT_FREQ_MIN = 3
+HELDOUT_FREQ_MAX = 30
+EVAL_QUERIES = 200
+def _join_tags(tags):
+    return " ".join(t.replace("_", " ") for t in tags) if tags else ""
+def _bigram(m):
+    toks = m.split()
+    return m + " " + " ".join(f"{a}+{b}" for a, b in zip(toks, toks[1:])) if len(toks) > 1 else m
def build_chess_anchor(themes, op):
    """Build the chess-format query: themes followed by opening tags, if any.

    ``op`` may be ``None`` or empty, in which case only the themes are returned.
    """
    theme_text = _join_tags(themes)
    opening_text = _join_tags(op or [])
    if opening_text:
        return f"{theme_text} {opening_text}"
    return theme_text
def build_chess_doc_stripped(themes, op, moves):
    """Build the theme-stripped chess document: ``moves`` plus move bigrams.

    ``themes`` and ``op`` are accepted but not used in the output.
    """
    return "moves " + _bigram(moves)
def ndcg_at_k(scores, rel, k=10):
    """Binary-relevance NDCG@k over ``(doc_id, score)`` pairs.

    ``rel`` is the collection of relevant doc ids. The ideal DCG assumes
    ``min(len(rel), k)`` relevant docs ranked first. Returns 0 when ``rel``
    is empty.
    """
    top = sorted(scores, key=lambda kv: -kv[1])[:k]
    gains = [1.0 if doc in rel else 0.0 for doc, _ in top]
    dcg = sum(gain / np.log2(pos + 2) for pos, gain in enumerate(gains))
    idcg = sum(1.0 / np.log2(pos + 2) for pos in range(min(len(rel), k)))
    if idcg > 0:
        return dcg / idcg
    return 0
def main():
    """Compare CE and BM25 reranking of the static model's top-K shortlist.

    Builds the held-out eval (up to EVAL_QUERIES of the rarest anchors with
    frequency in [HELDOUT_FREQ_MIN, HELDOUT_FREQ_MAX]), ranks the chess-format
    corpus with the static model, then for each K prints NDCG@10 for:
    static-only, the top-K reranked by the cross-encoder over English docs,
    the top-K reranked by BM25 over English docs, and the oracle ceiling
    (every relevant doc in the top-K ranked first).
    """
    print("Building eval set...")
    puzzles = load_dataset("Lichess/chess-puzzles", split="train")
    freq = defaultdict(int)
    rows_by_anchor = defaultdict(list)
    for r in puzzles:
        if not r["Themes"]:
            continue
        ca = build_chess_anchor(r["Themes"], r["OpeningTags"])
        freq[ca] += 1
        rows_by_anchor[ca].append(r)
    rare = sorted(((a, c) for a, c in freq.items() if HELDOUT_FREQ_MIN <= c <= HELDOUT_FREQ_MAX), key=lambda kv: kv[1])
    heldout = [a for a, _ in rare[:EVAL_QUERIES]]
    print(f"  {len(heldout)} held-out anchors")

    corp_chess, corp_en = [], []
    held_per_doc = []
    ch_to_en = {}
    for ca in heldout:
        for r in rows_by_anchor[ca]:
            corp_chess.append(build_chess_doc_stripped(r["Themes"], r["OpeningTags"], r["Moves"]))
            corp_en.append(build_english_doc(r))
            held_per_doc.append(ca)
            if ca not in ch_to_en:
                ch_to_en[ca] = build_english_anchor(r)
    qchess = list(heldout)
    qen = [ch_to_en[a] for a in qchess]
    by_anchor = defaultdict(list)
    for i, a in enumerate(held_per_doc):
        by_anchor[a].append(i)
    print(f"  corpus: {len(corp_chess)} docs")

    print("\nLoading static (v4-C2) for first-stage...")
    static = SentenceTransformer("models/static-embedding-chess-multitask-5000x/final")
    sc = static.encode(corp_chess, batch_size=128, convert_to_numpy=True, show_progress_bar=False)
    sc = sc / np.linalg.norm(sc, axis=1, keepdims=True)
    sq = static.encode(qchess, batch_size=128, convert_to_numpy=True, show_progress_bar=False)
    sq = sq / np.linalg.norm(sq, axis=1, keepdims=True)
    static_sims = sq @ sc.T
    # Load the trained CE
    print("Loading trained CE...")
    ce = CrossEncoder("models/chess-reranker-english/final")
    # BM25 on English docs
    print("Building BM25 over English docs...")
    bm25 = BM25Okapi([d.split() for d in corp_en])

    # Per-query work that does not depend on K is done once here: the static
    # ranking, the relevant set, the static-only NDCG@10 and the BM25 scores
    # over the whole corpus.
    rankings, rel_sets, bm_scores, static_ndcg = [], [], [], []
    for qi, q_chess in enumerate(qchess):
        order = np.argsort(-static_sims[qi])
        rel = set(by_anchor[q_chess])
        sp = [(int(i), float(static_sims[qi, int(i)])) for i in order[:10]]
        static_ndcg.append(ndcg_at_k(sp, rel, k=10))
        rankings.append(order)
        rel_sets.append(rel)
        bm_scores.append(bm25.get_scores(qen[qi].split()))
    # static stays the same regardless of K
    static_v = np.mean(static_ndcg)

    print("\n" + "=" * 80)
    print(f"  {'K':>4} {'Static':>10} {'+CE':>10} {'+BM25':>10} {'Oracle':>10}")
    print("=" * 80)
    for k in [10, 50, 100, 200, 300]:
        if k > len(corp_chess):
            continue
        ce_ndcg = []
        bm25_ndcg = []
        oracle_ndcg = []
        for qi in range(len(qchess)):
            rel = rel_sets[qi]
            # Top-K shortlist
            topk = rankings[qi][:k]
            # CE rerank
            pairs = [[qen[qi], corp_en[int(i)]] for i in topk]
            ce_scores = ce.predict(pairs, batch_size=64, show_progress_bar=False, convert_to_numpy=True)
            ce_sp = [(int(topk[j]), float(ce_scores[j])) for j in range(len(topk))]
            ce_ndcg.append(ndcg_at_k(ce_sp, rel, k=10))
            # BM25 rerank over top-K shortlist
            bm_full = bm_scores[qi]
            bm_sp = [(int(i), float(bm_full[int(i)])) for i in topk]
            bm25_ndcg.append(ndcg_at_k(bm_sp, rel, k=10))
            # Oracle ceiling: all relevant docs in the shortlist ranked first
            rel_in_topk = len(rel & set(int(i) for i in topk))
            n10 = min(10, rel_in_topk)
            dcg = sum(1.0 / np.log2(r + 2) for r in range(n10))
            idcg = sum(1.0 / np.log2(r + 2) for r in range(min(len(rel), 10)))
            oracle_ndcg.append(dcg / idcg if idcg > 0 else 0)
        print(f"  {k:>4} {static_v:>10.4f} {np.mean(ce_ndcg):>10.4f} {np.mean(bm25_ndcg):>10.4f} {np.mean(oracle_ndcg):>10.4f}")
    print("=" * 80)
+if __name__ == "__main__":
+    main()

scripts/generate_theme_defs.py ADDED Viewed

	@@ -0,0 +1,168 @@

+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "datasets>=2.19.0",
+#     "openai>=1.0",
+#     "sentence-transformers[train]>=5.5.0",
+#     "tqdm",
+#     "numpy",
+# ]
+# ///
+"""Generate natural-language definitions for each Lichess theme via DeepSeek,
+then embed those definitions with a general sentence-transformer (MPNet).
+The resulting (theme_token, definition_embedding) pairs form a "chess-aware
+teacher" — an English description of each chess concept that MPNet CAN
+understand semantically. We can then distill those embeddings into our
+StaticEmbedding model's token table.
+Solves the "MPNet doesn't know chess" problem: MPNet can't read UCI moves,
+but it CAN read English ("A tactical motif where one piece attacks two pieces
+simultaneously" → semantically near "A tactic where you create a double
+attack threatening two pieces at once"). Token-level semantic structure
+emerges from the LLM bridge.
+Run:
+    SMOKE_TEST=1 uv run --exclude-newer=2026-05-12 generate_theme_defs.py
+    uv run --exclude-newer=2026-05-12 generate_theme_defs.py
+"""
+import json
+import os
+import subprocess
+import sys
+from collections import Counter
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import numpy as np
+from datasets import Dataset, load_dataset
+from openai import OpenAI
+from sentence_transformers import SentenceTransformer
+from tqdm import tqdm
+MODEL = "deepseek-v4-flash"
+TEACHER_MODEL = "sentence-transformers/all-mpnet-base-v2"
+OUTPUT_PATH = "models/theme_definitions.parquet"
+SMOKE_TEST = os.environ.get("SMOKE_TEST") == "1"
+PARALLEL_WORKERS = 4
+SYSTEM_PROMPT = """You write concise dictionary-style definitions of chess
+concepts. Given a theme/concept name (often in camelCase from Lichess.org's
+puzzle tagging system), write a single English sentence of 10-25 words
+explaining the concept. Be specific and use the standard chess vocabulary that
+would appear in any chess textbook.
+Output ONLY the definition sentence. No labels, no quotes, no commentary.
+Examples:
+  Input: fork
+  Output: A tactical motif where a single piece attacks two or more enemy pieces simultaneously, forcing a material gain.
+  Input: backRankMate
+  Output: A checkmate delivered along the opponent's back rank, typically with a rook or queen, when the king is trapped by its own pawns.
+  Input: zugzwang
+  Output: A position in which any move worsens the player's position, so being forced to move becomes a disadvantage.
+"""
def get_deepseek_key():
    """Return the DeepSeek API key, or ``None`` when none is configured.

    Tries the macOS keychain first (generic password for service
    ``deepseek-api``, read via the ``security`` CLI). Falls back to the
    ``DEEPSEEK_API_KEY`` environment variable when the CLI is missing, fails,
    times out, or returns an empty value.
    """
    try:
        r = subprocess.run(
            ["security", "find-generic-password", "-s", "deepseek-api", "-w"],
            capture_output=True, text=True, timeout=5,
        )
    except (OSError, subprocess.SubprocessError):
        # No `security` binary (non-macOS host) or the lookup timed out.
        r = None
    if r is not None and r.returncode == 0:
        key = r.stdout.strip()
        if key:
            return key
    return os.environ.get("DEEPSEEK_API_KEY")
def define_theme(client, theme, debug=False):
    """Request a one-sentence definition of ``theme`` from the chat model.

    Sends SYSTEM_PROMPT plus the theme name to ``client.chat.completions``
    and returns the stripped reply. Returns ``None`` when the reply is empty
    or when the request raises; with ``debug`` set, the exception is printed.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": theme},
    ]
    try:
        resp = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            temperature=0.2,
            max_tokens=1500,  # DeepSeek-v4-flash spends tokens on reasoning_content; obscure mate-pattern names need lots
            timeout=30,
        )
        text = resp.choices[0].message.content
        if not text:
            return None
        return text.strip()
    except Exception as e:
        if debug:
            print(f"  EXC for {theme!r}: {type(e).__name__}: {e}")
        return None
def main():
    """Generate theme definitions, embed them, and save them as parquet.

    Enumerates the themes seen in the first 1,000,000 puzzles (50,000 and at
    most 10 themes under SMOKE_TEST), asks the chat model for a definition of
    each in parallel, embeds the successful definitions with TEACHER_MODEL,
    prints a similarity sanity check, and writes ``theme`` / ``definition`` /
    ``embedding`` columns to OUTPUT_PATH. Exits early when no API key is
    available or when no definition could be generated.
    """
    key = get_deepseek_key()
    if not key:
        sys.exit("No DeepSeek API key in keychain")
    client = OpenAI(api_key=key, base_url="https://api.deepseek.com/v1")

    print("Enumerating themes from Lichess puzzles...")
    puzzles = load_dataset("Lichess/chess-puzzles", split="train", streaming=True)
    counter = Counter()
    sample_size = 50_000 if SMOKE_TEST else 1_000_000
    for i, r in enumerate(puzzles):
        if i >= sample_size:
            break
        for t in (r["Themes"] or []):
            counter[t] += 1
    themes = sorted(counter.keys())
    print(f"  {len(themes)} unique themes")
    if SMOKE_TEST:
        themes = themes[:10]
        print(f"  SMOKE_TEST=1: limited to {len(themes)}")

    print(f"\nGenerating definitions via {MODEL}...")
    defs = {}
    with ThreadPoolExecutor(max_workers=PARALLEL_WORKERS) as ex:
        futs = {ex.submit(define_theme, client, t, True): t for t in themes}
        for f in tqdm(as_completed(futs), total=len(futs)):
            t = futs[f]
            defs[t] = f.result()
    failed = [t for t, d in defs.items() if not d]
    if failed:
        print(f"  {len(failed)} themes failed: {failed[:5]}")
    print(f"  {len(defs) - len(failed)}/{len(defs)} succeeded")
    print("\nSample definitions:")
    for t in themes[:8]:
        if defs[t]:
            print(f"  {t:>20s} -> {defs[t]}")

    valid = [(t, defs[t]) for t in themes if defs[t]]
    if not valid:
        # Nothing to embed: stop here instead of failing later while
        # normalising an empty embedding array.
        sys.exit("No theme definitions were generated; nothing to embed")
    print(f"\nEmbedding {len(valid)} definitions with {TEACHER_MODEL}...")
    teacher = SentenceTransformer(TEACHER_MODEL)
    sentences = [d for _, d in valid]
    embs = teacher.encode(sentences, batch_size=64, show_progress_bar=True, convert_to_numpy=True)

    # Sanity: do related themes have similar embeddings?
    emb_norm = embs / np.linalg.norm(embs, axis=1, keepdims=True)
    sim = emb_norm @ emb_norm.T
    print("\nSanity check: pairwise similarities for related themes")
    name_to_idx = {t: i for i, (t, _) in enumerate(valid)}
    for a, b in [
        ("fork", "skewer"), ("fork", "pin"), ("backRankMate", "smotheredMate"),
        ("kingsideAttack", "queensideAttack"), ("endgame", "middlegame"),
        ("fork", "promotion"),  # not directly related
    ]:
        if a in name_to_idx and b in name_to_idx:
            print(f"  {a!r:>20} <-> {b!r:25} = {sim[name_to_idx[a], name_to_idx[b]]:+.3f}")

    out = Dataset.from_dict({
        "theme": [t for t, _ in valid],
        "definition": [d for _, d in valid],
        "embedding": embs.tolist(),
    })
    os.makedirs(os.path.dirname(OUTPUT_PATH) or ".", exist_ok=True)
    out.to_parquet(OUTPUT_PATH)
    print(f"\nSaved {len(out)} theme definitions to {OUTPUT_PATH}")
+if __name__ == "__main__":
+    main()

scripts/mine_hard_negs_v2.py ADDED Viewed

	@@ -0,0 +1,213 @@

+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "sentence-transformers[train]>=5.5.0",
+#     "datasets>=2.19.0",
+#     "numpy",
+#     "tqdm",
+# ]
+# ///
+"""Memory-bounded hard-negative miner. Custom impl (not sentence-transformers
+util) because the SE function tries to hold the full anchor × corpus similarity
+matrix, which OOMs at 327k anchors × 327k positives on M4.
+Algorithm:
+1. Encode all unique positives once -> N x dim float32 (~670MB at 327k x 512).
+2. Encode all unique anchors once -> M x dim float32.
+3. For each anchor batch (size B):
+   - scores = batch_emb @ positives_emb.T  -> B x N
+   - per anchor: argpartition for top RANGE_MAX, exclude actual positive,
+     sample NUM_NEGATIVES from rank [RANGE_MIN, RANGE_MAX).
+4. Accumulate triplets in memory, then write them to parquet once at the end.
+Peak memory: B * N * 4 bytes for scores. With B=500, N=327k: 650MB.
+Run:
+    SMOKE_TEST=1 uv run --exclude-newer=2026-05-12 mine_hard_negs_v2.py
+    uv run --exclude-newer=2026-05-12 mine_hard_negs_v2.py
+"""
+from __future__ import annotations
+import os
+import random
+import re
+import sys
+from collections import defaultdict
+# Force unbuffered stdout so progress is visible when piped
+sys.stdout.reconfigure(line_buffering=True)
+import numpy as np
+import torch
+from datasets import Dataset, load_dataset
+from sentence_transformers import SentenceTransformer
+from tqdm import tqdm
+V3_MODEL_PATH = "models/static-embedding-chess/final"
+OUTPUT_PATH = "models/hard_negatives.parquet"
+SMOKE_TEST = os.environ.get("SMOKE_TEST") == "1"
+HELDOUT_FREQ_MIN = 3
+HELDOUT_FREQ_MAX = 30
+EVAL_QUERIES = 200
+NUM_NEGATIVES = 5
+RANGE_MIN = 10
+RANGE_MAX = 50
+ANCHOR_BATCH_SIZE = 500  # 500 * 327k * 4 = ~650MB scratch per batch
+def _join_tags(tags):
+    return " ".join(t.replace("_", " ") for t in tags) if tags else ""
+def _bigram_token_str(moves):
+    toks = moves.split()
+    if len(toks) < 2:
+        return moves
+    bigrams = " ".join(f"{a}+{b}" for a, b in zip(toks, toks[1:]))
+    return f"{moves} {bigrams}"
def build_puzzle_pairs(batch):
    """Turn a batch of puzzle columns into parallel anchor/positive lists.

    Reads the ``"Themes"``, ``"OpeningTags"`` and ``"Moves"`` columns of
    ``batch`` in lockstep. Rows whose joined theme text is empty are dropped.

    anchor   = "<themes>[ <opening>]"
    positive = "themes <themes>[ opening <opening>] moves <moves + bigrams>"

    Returns ``{"anchor": [...], "positive": [...]}``.
    """
    result = {"anchor": [], "positive": []}
    rows = zip(batch["Themes"], batch["OpeningTags"], batch["Moves"])
    for theme_tags, opening_tags, move_seq in rows:
        theme_text = _join_tags(theme_tags)
        opening_text = _join_tags(opening_tags)
        if not theme_text:
            continue
        anchor_parts = [theme_text]
        positive_parts = ["themes", theme_text]
        if opening_text:
            anchor_parts.append(opening_text)
            positive_parts += ["opening", opening_text]
        positive_parts += ["moves", _bigram_token_str(move_seq)]
        result["anchor"].append(" ".join(anchor_parts))
        result["positive"].append(" ".join(positive_parts))
    return result
def _select_hard_negatives(row, own_idx, rng, range_min, range_max, num_negatives):
    """Sample hard-negative column indices from one anchor's score row.

    Args:
        row: 1-D array of similarity scores for one anchor against every
            positive.
        own_idx: column of the anchor's own positive; never returned.
        rng: ``random.Random`` used for sampling.
        range_min, range_max: candidates are taken from score ranks
            ``[range_min, range_max)`` (rank 0 = highest score).
        num_negatives: maximum number of indices to return.

    Returns:
        A list of at most ``num_negatives`` column indices; shorter (possibly
        empty) when the corpus does not reach the requested rank window.
    """
    masked = row.copy()
    masked[own_idx] = -np.inf
    n_cols = masked.shape[0]
    if n_cols > range_max:
        # Partial selection of the range_max best scores, unordered.
        top_idx = np.argpartition(-masked, range_max)[:range_max]
    else:
        # argpartition needs kth < n_cols; with a small corpus rank everything.
        top_idx = np.arange(n_cols)
    top_idx = top_idx[np.argsort(-masked[top_idx])]
    # Only matters in the small-corpus branch, where the masked column is
    # still present (sorted last) and must not be picked as a negative.
    top_idx = top_idx[top_idx != own_idx]
    mid_range = top_idx[range_min:range_max]
    return rng.sample(list(mid_range), min(num_negatives, len(mid_range)))


def main():
    """Mine hard negatives with the v3 model and write triplets to parquet.

    Builds (anchor, positive) pairs from the Lichess puzzles, drops the
    held-out rare anchors, keeps one randomly chosen positive per unique
    anchor, encodes both sides, and for every anchor samples up to
    ``NUM_NEGATIVES`` other positives from score ranks
    ``[RANGE_MIN, RANGE_MAX)``. Triplets are collected in memory and saved to
    ``OUTPUT_PATH`` at the end.
    """
    print(f"Loading v3 model from {V3_MODEL_PATH}")
    model = SentenceTransformer(V3_MODEL_PATH)
    print("Loading puzzles...")
    puzzles = load_dataset("Lichess/chess-puzzles", split="train")
    if SMOKE_TEST:
        puzzles = puzzles.select(range(100_000))
    pair_puzzles = puzzles.map(
        build_puzzle_pairs,
        batched=True,
        batch_size=20_000,
        remove_columns=puzzles.column_names,
        num_proc=4,
    )
    # Pull both columns out once so the loops below run over plain sequences
    # instead of indexing the Dataset row by row.
    print("Materializing columns...")
    anchors_list = pair_puzzles["anchor"]
    positives_list = pair_puzzles["positive"]
    print(f"  done ({len(anchors_list):,} rows)")

    # Remove held-out anchors: the EVAL_QUERIES rarest anchors whose frequency
    # lies in [HELDOUT_FREQ_MIN, HELDOUT_FREQ_MAX].
    freq = defaultdict(int)
    for a in anchors_list:
        freq[a] += 1
    rare_pool = sorted(
        ((a, c) for a, c in freq.items() if HELDOUT_FREQ_MIN <= c <= HELDOUT_FREQ_MAX),
        key=lambda kv: kv[1],
    )
    heldout = {a for a, _ in rare_pool[:EVAL_QUERIES]}

    # Group positives by anchor; the groups supply both the anchor list and
    # the corpus of candidate negatives.
    by_anchor = defaultdict(list)
    for a, p in zip(anchors_list, positives_list):
        if a not in heldout:
            by_anchor[a].append(p)
    print(f"  unique anchors (post-heldout-strip): {len(by_anchor):,}")
    rng = random.Random(12)
    unique_anchors = list(by_anchor.keys())
    if SMOKE_TEST:
        unique_anchors = unique_anchors[:200]
        print(f"  SMOKE_TEST=1: trimmed to {len(unique_anchors)}")

    # One random positive per anchor, index-aligned with unique_anchors.
    print(f"  Sampling one positive per anchor...")
    positives = [rng.choice(by_anchor[a]) for a in unique_anchors]
    print(f"  done")

    # Encode both sides and L2-normalise so the dot product is cosine similarity.
    print(f"\nEncoding {len(unique_anchors):,} anchors...")
    anchor_emb = model.encode(
        unique_anchors, batch_size=512, show_progress_bar=True, convert_to_numpy=True
    )
    anchor_emb = anchor_emb / np.linalg.norm(anchor_emb, axis=1, keepdims=True)
    print(f"  anchor shape: {anchor_emb.shape}, mem: {anchor_emb.nbytes / 1e6:.1f}MB")
    print(f"\nEncoding {len(positives):,} positives...")
    positive_emb = model.encode(
        positives, batch_size=512, show_progress_bar=True, convert_to_numpy=True
    )
    positive_emb = positive_emb / np.linalg.norm(positive_emb, axis=1, keepdims=True)
    print(f"  positive shape: {positive_emb.shape}, mem: {positive_emb.nbytes / 1e6:.1f}MB")

    # Mine hard negs in anchor chunks so only a B x N score block is live.
    print(f"\nMining hard negs (range={RANGE_MIN}..{RANGE_MAX}, num={NUM_NEGATIVES}, batch={ANCHOR_BATCH_SIZE})...")
    out_anchors, out_positives, out_negatives = [], [], []
    pos_scores_acc, neg_scores_acc = [], []
    n_anchors = len(unique_anchors)
    for start in tqdm(range(0, n_anchors, ANCHOR_BATCH_SIZE)):
        end = min(start + ANCHOR_BATCH_SIZE, n_anchors)
        # Row i holds anchor[start + i] scored against every positive.
        scores = anchor_emb[start:end] @ positive_emb.T
        for i in range(end - start):
            anchor_idx = start + i  # the anchor's own positive sits in this column
            sampled = _select_hard_negatives(
                scores[i], anchor_idx, rng, RANGE_MIN, RANGE_MAX, NUM_NEGATIVES
            )
            for neg_idx in sampled:
                out_anchors.append(unique_anchors[anchor_idx])
                out_positives.append(positives[anchor_idx])
                out_negatives.append(positives[neg_idx])
                pos_scores_acc.append(float(scores[i, anchor_idx]))
                neg_scores_acc.append(float(scores[i, neg_idx]))

    print(f"\n  output triplets: {len(out_anchors):,}")
    if out_anchors:
        print(f"  positive scores: mean={np.mean(pos_scores_acc):.3f} std={np.std(pos_scores_acc):.3f}")
        print(f"  hard-neg scores: mean={np.mean(neg_scores_acc):.3f} std={np.std(neg_scores_acc):.3f}")
        print(f"  margin (pos - neg): mean={np.mean(np.array(pos_scores_acc) - np.array(neg_scores_acc)):.3f}")

    # Save
    os.makedirs(os.path.dirname(OUTPUT_PATH) or ".", exist_ok=True)
    Dataset.from_dict({
        "anchor": out_anchors,
        "positive": out_positives,
        "negative": out_negatives,
    }).to_parquet(OUTPUT_PATH)
    print(f"  saved to {OUTPUT_PATH} ({os.path.getsize(OUTPUT_PATH) / 1e6:.1f} MB)")

    # Sample: first, middle and last triplet (skipped when nothing was mined).
    print("\n=== Sample triplets ===")
    if out_anchors:
        for i in [0, len(out_anchors)//2, len(out_anchors)-1]:
            print(f"  ANCHOR:  {out_anchors[i]!r}")
            print(f"  POSITIVE:{out_positives[i][:100]!r}")
            print(f"  NEGATIVE:{out_negatives[i][:100]!r}")
            print()
+if __name__ == "__main__":
+    main()

scripts/train_chess_multitask.py ADDED Viewed

	@@ -0,0 +1,287 @@

+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "sentence-transformers[train]>=5.5.0",
+#     "datasets>=2.19.0",
+#     "accelerate>=0.26.0",
+#     "tokenizers>=0.20",
+# ]
+# ///
+"""Multi-task training: chess-aware semantic structure + hard-negative MNRL.
+Two simultaneous training signals:
+1. THEME-DISTILL dataset: (theme_token, mpnet_definition_emb)
+   - 73 rows (one per Lichess theme)
+   - Loss: EmbedDistillLoss (project student 512d -> 768d, match teacher)
+   - Effect: enc("fork") moves toward MPNet("a tactical motif where one piece...")
+   - Solves orthogonal-token-embeddings problem identified in Phase 1
+2. CHESS-CONTENT dataset: (anchor, positive, hard_negative)
+   - From mined hard-negs of v3 model
+   - Loss: MultipleNegativesRankingLoss (handles triplets natively)
+   - Effect: maintains chess-content associations, sharpens discriminative ability
+Multi-task trainer interleaves batches from both datasets. The theme dataset is
+tiny (73 rows) but high-impact -- it injects semantic structure into 73 token
+embeddings. The chess dataset is large (1.6M+ triplets) and shapes the rest.
+Run:
+    SMOKE_TEST=1 uv run --exclude-newer=2026-05-12 train_chess_multitask.py
+    uv run --exclude-newer=2026-05-12 train_chess_multitask.py
+"""
+from __future__ import annotations
+import logging
+import os
+import random
+import re
+import time
+from collections import defaultdict
+from contextlib import nullcontext
+import numpy as np
+import torch
+from datasets import Dataset, concatenate_datasets, load_dataset
+from tokenizers import Tokenizer
+from sentence_transformers import (
+    SentenceTransformer,
+    SentenceTransformerModelCardData,
+    SentenceTransformerTrainer,
+    SentenceTransformerTrainingArguments,
+)
+from sentence_transformers.base.sampler import BatchSamplers, MultiDatasetBatchSamplers
+from sentence_transformers.sentence_transformer.evaluation import (
+    InformationRetrievalEvaluator,
+)
+from sentence_transformers.sentence_transformer.losses import (
+    EmbedDistillLoss,
+    MultipleNegativesRankingLoss,
+)
+from sentence_transformers.sentence_transformer.modules import StaticEmbedding
+from transformers import EarlyStoppingCallback, TrainerCallback
+THEME_DEFS_PATH = "models/theme_definitions.parquet"
+TRIPLETS_PATH = "models/hard_negatives.parquet"
+TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", "models/static-embedding-chess/chess_tokenizer.json")
+OUTPUT_DIR = "models/static-embedding-chess-multitask"
+RUN_NAME = "static-embedding-chess-multitask"
+SMOKE_TEST = os.environ.get("SMOKE_TEST") == "1"
+EMBEDDING_DIM = 512
+TEACHER_DIM = 768
+HELDOUT_FREQ_MIN = 3
+HELDOUT_FREQ_MAX = 30
+EVAL_QUERIES = 200
+THEME_REPLICAS = int(os.environ.get("THEME_REPLICAS", "500"))  # oversample theme dataset
+IS_CUDA = torch.cuda.is_available()
+IS_MPS = (not IS_CUDA) and torch.backends.mps.is_available()
+BATCH_SIZE = 4096 if IS_CUDA else (4096 if IS_MPS else 256)
def setup_logging():
    """Create the log/output directories and configure root logging.

    Log records go to both the console and ``logs/<RUN_NAME>.log``; a few
    chatty third-party loggers are raised to WARNING.
    """
    for directory in ("logs", OUTPUT_DIR):
        os.makedirs(directory, exist_ok=True)
    sinks = [logging.StreamHandler(), logging.FileHandler(f"logs/{RUN_NAME}.log")]
    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
        handlers=sinks,
        force=True,
    )
    chatty = ("httpx", "httpcore", "huggingface_hub", "urllib3", "filelock", "fsspec")
    for logger_name in chatty:
        logging.getLogger(logger_name).setLevel(logging.WARNING)
+def _join_tags(tags):
+    return " ".join(t.replace("_", " ") for t in tags) if tags else ""
+def _bigram_token_str(moves):
+    toks = moves.split()
+    if len(toks) < 2:
+        return moves
+    bigrams = " ".join(f"{a}+{b}" for a, b in zip(toks, toks[1:]))
+    return f"{moves} {bigrams}"
def build_puzzle_pairs(batch):
    """Map a batch of puzzle columns to ``{"anchor": [...], "positive": [...]}``.

    For each row of ``batch["Themes"]`` / ``batch["OpeningTags"]`` /
    ``batch["Moves"]``: the anchor is the theme text plus the opening text
    when present; the positive repeats both behind ``themes`` / ``opening``
    markers and appends the moves with their bigram tokens behind ``moves``.
    Rows with no theme text are skipped.
    """
    anchor_col = []
    positive_col = []
    columns = (batch["Themes"], batch["OpeningTags"], batch["Moves"])
    for raw_themes, raw_opening, raw_moves in zip(*columns):
        theme_part = _join_tags(raw_themes)
        opening_part = _join_tags(raw_opening)
        if not theme_part:
            continue
        if opening_part:
            anchor_col.append(f"{theme_part} {opening_part}")
            prefix = f"themes {theme_part} opening {opening_part}"
        else:
            anchor_col.append(theme_part)
            prefix = f"themes {theme_part}"
        positive_col.append(f"{prefix} moves {_bigram_token_str(raw_moves)}")
    return {"anchor": anchor_col, "positive": positive_col}
def strip_theme_echo(p):
    """Drop everything before the first ``" moves "`` marker.

    The result starts at the word ``moves``. Strings without the marker are
    returned unchanged.
    """
    _, marker, rest = p.partition(" moves ")
    if not marker:
        return p
    # Keep the marker minus its leading space: "moves " + remainder.
    return marker[1:] + rest
def build_evaluator(holdout):
    """Build the ``chess-ir`` retrieval evaluator from held-out pairs.

    Each row of ``holdout`` (read via ``row["anchor"]`` / ``row["positive"]``)
    becomes corpus document ``d<i>`` holding the positive with its theme
    echo stripped. Every distinct anchor becomes a query whose relevant
    documents are the rows that carried it; queries are numbered in
    descending order of relevant-document count.
    """
    corpus = {}
    docs_by_anchor = defaultdict(set)
    for idx, row in enumerate(holdout):
        doc_id = f"d{idx}"
        corpus[doc_id] = strip_theme_echo(row["positive"])
        docs_by_anchor[row["anchor"]].add(doc_id)

    ranked = sorted(docs_by_anchor.items(), key=lambda kv: -len(kv[1]))
    queries = {}
    relevant = {}
    for rank, (anchor_text, doc_ids) in enumerate(ranked):
        query_id = f"q{rank}"
        queries[query_id] = anchor_text
        relevant[query_id] = doc_ids

    return InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant,
        name="chess-ir",
        ndcg_at_k=[10],
        mrr_at_k=[10],
        accuracy_at_k=[1, 10],
        precision_recall_at_k=[1, 10],
        show_progress_bar=False,
        batch_size=256,
    )
def autocast_ctx():
    """Return the autocast context manager matching the detected device.

    CUDA: bfloat16 when supported, otherwise float16. MPS: float16.
    Anything else: a no-op ``nullcontext``.
    """
    if IS_CUDA:
        if torch.cuda.is_bf16_supported():
            return torch.autocast("cuda", dtype=torch.bfloat16)
        return torch.autocast("cuda", dtype=torch.float16)
    if not IS_MPS:
        return nullcontext()
    return torch.autocast("mps", dtype=torch.float16)
def main():
    """Train the multi-task static chess embedding and log the outcome.

    Builds a randomly initialised ``StaticEmbedding`` model, loads the theme
    distillation rows and the mined triplets, rebuilds the held-out IR
    evaluator from the Lichess puzzles, trains with one loss per dataset,
    then logs the score against the random-init baseline and the recorded v3
    score and saves the model under ``OUTPUT_DIR/final``.
    """
    setup_logging()

    # --- Student model -------------------------------------------------
    logging.info(f"Loading tokenizer from {TOKENIZER_PATH}")
    tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
    logging.info(f"  vocab: {tokenizer.get_vocab_size():,}")
    logging.info(f"Building random-init StaticEmbedding (dim={EMBEDDING_DIM})")
    static = StaticEmbedding(tokenizer, embedding_dim=EMBEDDING_DIM)
    card_data = SentenceTransformerModelCardData(
        language="en",
        license="apache-2.0",
        model_name=f"Static chess embedding ({EMBEDDING_DIM}d) -- multi-task (theme distill + hard-neg MNRL)",
    )
    model = SentenceTransformer(modules=[static], model_card_data=card_data)

    # --- Dataset A: theme distillation ---------------------------------
    logging.info(f"Loading theme definitions from {THEME_DEFS_PATH}")
    theme_table = Dataset.from_parquet(THEME_DEFS_PATH)
    # The distillation loss is fed (sentence, label) columns.
    theme_ds = theme_table.rename_columns({"theme": "sentence", "embedding": "label"})
    theme_ds = theme_ds.remove_columns(["definition"])
    if not SMOKE_TEST:
        # Replicate the small theme table so it is not drowned out by the
        # triplet dataset under proportional sampling.
        theme_ds = concatenate_datasets([theme_ds] * THEME_REPLICAS).shuffle(seed=12)
    logging.info(f"  {len(theme_ds):,} theme rows (after oversampling)")

    # --- Dataset B: chess triplets -------------------------------------
    logging.info(f"Loading triplets from {TRIPLETS_PATH}")
    triplet_ds = Dataset.from_parquet(TRIPLETS_PATH)
    if SMOKE_TEST:
        triplet_ds = triplet_ds.select(range(min(500, len(triplet_ds))))
    logging.info(f"  {len(triplet_ds):,} triplets, columns: {triplet_ds.column_names}")

    # --- Held-out eval set ---------------------------------------------
    logging.info("Building held-out eval")
    puzzles = load_dataset("Lichess/chess-puzzles", split="train")
    if SMOKE_TEST:
        puzzles = puzzles.select(range(2_000))
    pair_puzzles = puzzles.map(
        build_puzzle_pairs,
        batched=True,
        batch_size=20_000,
        remove_columns=puzzles.column_names,
        num_proc=4,
    )
    anchors = pair_puzzles["anchor"]
    freq = defaultdict(int)
    for anchor in anchors:
        freq[anchor] += 1
    # Rarest-first list of anchors inside the allowed frequency window.
    rare_pool = sorted(
        ((a, c) for a, c in freq.items() if HELDOUT_FREQ_MIN <= c <= HELDOUT_FREQ_MAX),
        key=lambda kv: kv[1],
    )
    n_eval = 20 if SMOKE_TEST else EVAL_QUERIES
    heldout = {a for a, _ in rare_pool[:n_eval]}
    held_idx = [row_no for row_no, anchor in enumerate(anchors) if anchor in heldout]
    holdout = pair_puzzles.select(held_idx)
    logging.info(f"  holdout: {len(holdout)}")
    evaluator = build_evaluator(holdout)

    logging.info("Baseline eval (random init):")
    with autocast_ctx():
        baseline = evaluator(model)[evaluator.primary_metric]
    metric_key = f"eval_{evaluator.primary_metric}"
    logging.info(f"  baseline {evaluator.primary_metric} = {baseline:.4f}")

    # --- Multi-task setup: one loss per named dataset ------------------
    train_datasets = {"chess": triplet_ds, "themes": theme_ds}
    losses = {
        "chess": MultipleNegativesRankingLoss(model),
        "themes": EmbedDistillLoss(model, distance_metric="cosine", projection_dim=TEACHER_DIM),
    }
    bf16_ok = IS_CUDA and torch.cuda.is_bf16_supported()
    args = SentenceTransformerTrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=5,
        max_steps=1 if SMOKE_TEST else -1,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=1e-2,
        weight_decay=0.01,
        warmup_steps=0.1,
        lr_scheduler_type="linear",
        bf16=bf16_ok,
        fp16=IS_CUDA and not bf16_ok,
        batch_sampler=BatchSamplers.BATCH_SAMPLER,
        multi_dataset_batch_sampler=MultiDatasetBatchSamplers.PROPORTIONAL,
        eval_strategy="steps",
        eval_steps=0.05,
        save_strategy="steps",
        save_steps=0.05,
        save_total_limit=2,
        logging_steps=0.02,
        logging_first_step=True,
        load_best_model_at_end=True,
        metric_for_best_model=metric_key,
        greater_is_better=True,
        report_to="none",
        run_name=RUN_NAME,
        seed=12,
        push_to_hub=False,
    )
    trainer = SentenceTransformerTrainer(
        model=model,
        args=args,
        train_dataset=train_datasets,
        loss=losses,
        evaluator=evaluator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    trainer.train()

    # --- Final report --------------------------------------------------
    logging.info("Post-training eval:")
    with autocast_ctx():
        score = evaluator(model)[evaluator.primary_metric]
    delta = score - baseline
    if delta >= 0.005:
        verdict = "WIN"
    elif delta >= 0:
        verdict = "MARGINAL"
    else:
        verdict = "REGRESSION"
    logging.info(
        f"VERDICT: {verdict} | score={score:.4f} | baseline={baseline:.4f} | delta={delta:+.4f}"
    )
    # Also report current absolute vs v3 baseline (0.080)
    v3_baseline = 0.0801
    logging.info(f"  vs v3 (0.0801): delta = {score - v3_baseline:+.4f}")

    final_dir = f"{OUTPUT_DIR}/final"
    model.save_pretrained(final_dir)
    logging.info(f"Saved final model to {final_dir}")
+if __name__ == "__main__":
+    main()

scripts/train_chess_static.py ADDED Viewed

	@@ -0,0 +1,640 @@

+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "sentence-transformers[train]>=5.5.0",
+#     "datasets>=2.19.0",
+#     "accelerate>=0.26.0",
+#     "tokenizers>=0.20",
+#     "trackio",
+# ]
+# ///
+"""Train a StaticEmbedding model for chess retrieval.
+Pair shape:
+    anchor   = "<themes> [<opening words>]"
+    positive = "themes <themes> [opening <words>] moves <uci>"          (puzzles)
+               "name <words> eco <code> pgn <san>"                       (openings)
+Datasets:
+- Lichess/chess-puzzles  (5.8M rows; themes + opening tags + UCI moves)
+- Lichess/chess-openings (3.6K rows; opening name + ECO + SAN moves)
+Use case: free-text search over a chess corpus. "fork endgame short" -> puzzles
+with that motif; "Sicilian Najdorf" -> matching openings.
+Design choices:
+- Custom WordLevel + Whitespace tokenizer trained on the corpus. Every chess
+  token (UCI move e2e4, SAN move Nxd4, ECO code B90, theme name, opening word)
+  is one whole token -- BERT WordPiece would shred them 4-way.
+- FEN dropped: position-as-character-soup doesn't fit a token-bag.
+- PGN move numbers stripped ("1. e4 c5" -> "e4 c5") so SAN moves are high-freq.
+- IR eval is custom (themes -> puzzles), not NanoBEIR -- general-English IR
+  benchmarks don't measure chess retrieval.
+Run:
+    SMOKE_TEST=1 uv run --exclude-newer=2026-05-12 train_chess_static.py
+    uv run --exclude-newer=2026-05-12 train_chess_static.py
+"""
+from __future__ import annotations
+import logging
+import os
+import re
+from collections import defaultdict
+from contextlib import nullcontext
+import datasets
+import random
+import torch
+from datasets import Dataset, concatenate_datasets, load_dataset
+from tokenizers import Tokenizer
+from tokenizers.models import WordLevel
+from tokenizers.pre_tokenizers import Whitespace
+from tokenizers.trainers import WordLevelTrainer
+from sentence_transformers import (
+    SentenceTransformer,
+    SentenceTransformerModelCardData,
+    SentenceTransformerTrainer,
+    SentenceTransformerTrainingArguments,
+)
+from sentence_transformers.base.sampler import BatchSamplers
+from sentence_transformers.sentence_transformer.evaluation import (
+    InformationRetrievalEvaluator,
+    SequentialEvaluator,
+)
+from sentence_transformers.sentence_transformer.losses import (
+    MatryoshkaLoss,
+    MultipleNegativesRankingLoss,
+)
+from sentence_transformers.sentence_transformer.modules import StaticEmbedding
+from transformers import EarlyStoppingCallback, TrainerCallback
+import time
+EMBEDDING_DIM = 512  # was 256; 512 gives more capacity for bigram tokens
+MATRYOSHKA_DIMS = [512, 256, 128, 64, 32]
+VOCAB_SIZE = 100_000  # was 50_000; UCI/SAN bigrams add ~20-50k vocab
+OUTPUT_DIR = "models/static-embedding-chess"
+RUN_NAME = "static-embedding-chess"
+HUB_MODEL_ID = os.environ.get("HUB_MODEL_ID", "oneryalcin/static-embedding-chess")
+# TOKENIZER_PATH default lives next to the model output. On Modal, set this to
+# a path on the persistent volume (e.g. /cache/chess_tokenizer.json) so the
+# 6-min WordLevelTrainer run is amortized across launches.
+TOKENIZER_PATH = os.environ.get(
+    "TOKENIZER_PATH", f"{OUTPUT_DIR}/chess_tokenizer.json"
+)
+RETRAIN_TOKENIZER = os.environ.get("RETRAIN_TOKENIZER") == "1"
+SMOKE_TEST = os.environ.get("SMOKE_TEST") == "1"
+FORCE_CPU = os.environ.get("FORCE_CPU") == "1"
+# Diagnostic knobs (default: full recipe). Both MPS and T4 show monotonic
+# step-time growth with the full Matryoshka stack -- toggle these to isolate.
+DISABLE_MATRYOSHKA = os.environ.get("DISABLE_MATRYOSHKA") == "1"
+MAX_STEPS_OVERRIDE = int(os.environ.get("MAX_STEPS", "0")) or None
+EVAL_STEPS_OVERRIDE = int(os.environ.get("EVAL_STEPS", "0")) or None
+EVAL_QUERIES = 200
+EVAL_CORPUS = 5_000
+# Held-out anchor selection: pick rare combos in this freq range. Low end > 1
+# keeps multi-relevant NDCG meaningful; high end caps memorization potential.
+HELDOUT_FREQ_MIN = 3
+HELDOUT_FREQ_MAX = 30
+# Balanced-dataset config: each unique anchor expands to N (anchor, sampled_pos)
+# rows. The original 5.8M pairs let the model memorize specific (anchor, pos)
+# pairings since each anchor has ~1933 distinct positives. Capping at 100
+# random samples per anchor gives the model meaningful variety without the
+# 50x redundancy that fuels overfitting.
+BALANCED_POSITIVES_PER_ANCHOR = int(os.environ.get("POSITIVES_PER_ANCHOR", "100"))
+# Anchor token masking probability during training. 0 disables.
+ANCHOR_MASK_PROB = float(os.environ.get("ANCHOR_MASK_PROB", "0.15"))
+# Device-aware defaults. MPS (Apple Silicon) can't do bf16 and has unified-
+# memory pressure, so the CUDA-targeted skill template defaults (batch=2048,
+# bf16=True) don't apply. Scale BATCH_SIZE up if your M-series has 36GB+.
+IS_CUDA = torch.cuda.is_available() and not FORCE_CPU
+IS_MPS = (not IS_CUDA) and torch.backends.mps.is_available() and not FORCE_CPU
+# StaticEmbedding is a lookup+average -- no transformer activations to fit.
+# Memory cost is the (batch x batch) similarity matrix + (batch x seq x dim)
+# lookups, both tiny. CachedMultipleNegativesRankingLoss is NOT compatible
+# with StaticEmbedding (no encoder to GradCache through), so we just crank
+# the real batch. Scale up freely if your M-series has the headroom.
+BATCH_SIZE = 4096 if IS_CUDA else (4096 if IS_MPS else 256)
+MOVE_NUM_RE = re.compile(r"\d+\.+")
class StepTimingCallback(TrainerCallback):
    """Log wall-clock time per training step, plus CUDA memory when available.

    When CUDA is available the device is synchronised before each timestamp.
    Every one of the first 20 steps is logged, then every 10th step.
    """

    @staticmethod
    def _sync_cuda():
        """Block until queued CUDA work finishes (no-op without CUDA)."""
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    def on_step_begin(self, args, state, control, **kw):
        """Record the start time of the step."""
        self._sync_cuda()
        self._t0 = time.perf_counter()

    def on_step_end(self, args, state, control, **kw):
        """Log the step duration (and CUDA memory figures when available)."""
        self._sync_cuda()
        elapsed = time.perf_counter() - self._t0
        step = state.global_step
        # Log every step for the first 20 to see startup; then every 10th.
        if step > 20 and step % 10 != 0:
            return
        if not torch.cuda.is_available():
            logging.info(f"STEP {step}: dt={elapsed:.3f}s (cpu/mps)")
            return
        allocated_mb = torch.cuda.memory_allocated() / 1e6
        reserved_mb = torch.cuda.memory_reserved() / 1e6
        logging.info(
            f"STEP {step}: dt={elapsed:.3f}s mem={allocated_mb:.0f}MB reserved={reserved_mb:.0f}MB"
        )
def autocast_ctx():
    """Pick the mixed-precision context for the active device.

    Returns ``torch.autocast`` for CUDA (bfloat16 if supported, else float16)
    or MPS (float16); returns ``nullcontext()`` when neither flag is set.
    """
    if IS_CUDA:
        device = "cuda"
        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    elif IS_MPS:
        device = "mps"
        dtype = torch.float16
    else:
        return nullcontext()
    return torch.autocast(device, dtype=dtype)
def setup_logging():
    """Prepare directories, root logging, and the float32 matmul setting.

    Creates ``logs/`` and ``OUTPUT_DIR``, sends INFO-level records to the
    console and ``logs/<RUN_NAME>.log``, raises several noisy third-party
    loggers to WARNING, and sets float32 matmul precision to ``"high"`` when
    CUDA is available.
    """
    for directory in ("logs", OUTPUT_DIR):
        os.makedirs(directory, exist_ok=True)
    console = logging.StreamHandler()
    logfile = logging.FileHandler(f"logs/{RUN_NAME}.log")
    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
        handlers=[console, logfile],
        force=True,
    )
    quiet_these = ("httpx", "httpcore", "huggingface_hub", "urllib3", "filelock", "fsspec")
    for logger_name in quiet_these:
        logging.getLogger(logger_name).setLevel(logging.WARNING)
    if torch.cuda.is_available():
        torch.set_float32_matmul_precision("high")
+def _join_tags(tags) -> str:
+    if not tags:
+        return ""
+    return " ".join(t.replace("_", " ") for t in tags)
def _strip_pgn_move_numbers(pgn: str) -> str:
    """Remove every ``MOVE_NUM_RE`` match from ``pgn`` and trim the ends.

    Only leading/trailing whitespace is stripped; spacing left behind
    between moves is not collapsed.
    """
    without_numbers = MOVE_NUM_RE.sub("", pgn)
    return without_numbers.strip()
+def _bigram_token_str(moves: str) -> str:
+    """Append bigram tokens to a whitespace-separated move sequence.
+    "f2g3 e6e7 b2b1" -> "f2g3 e6e7 b2b1 f2g3+e6e7 e6e7+b2b1"
+    Bigrams use `+` as the join char so they're distinct from unigrams in the
+    WordLevel tokenizer's whitespace pretokenizer. A token-bag averaging across
+    unigrams alone loses move ordering; adding adjacent-pair tokens lets the
+    model learn that "e2e4 e7e5" (king's pawn opening) is its own pattern.
+    """
+    tokens = moves.split()
+    if len(tokens) < 2:
+        return moves
+    bigrams = " ".join(f"{a}+{b}" for a, b in zip(tokens, tokens[1:]))
+    return f"{moves} {bigrams}"
def build_puzzle_pairs(row_batch: dict) -> dict:
    """Convert a batch of puzzle rows into anchor/positive text pairs.

    Iterates ``row_batch["Themes"]``, ``row_batch["OpeningTags"]`` and
    ``row_batch["Moves"]`` together. Rows without any theme text are skipped.

    Returns a dict with two equal-length lists:
      - ``"anchor"``: theme text, followed by the opening text if any.
      - ``"positive"``: ``themes <themes>``, optionally ``opening <opening>``,
        then ``moves <moves with bigram tokens>``.
    """
    pairs = []
    columns = zip(row_batch["Themes"], row_batch["OpeningTags"], row_batch["Moves"])
    for theme_tags, opening_tag_list, move_text in columns:
        theme_words = _join_tags(theme_tags)
        opening_words = _join_tags(opening_tag_list)
        if not theme_words:
            continue
        anchor_bits = [theme_words]
        positive_bits = ["themes", theme_words]
        if opening_words:
            anchor_bits.append(opening_words)
            positive_bits.extend(["opening", opening_words])
        positive_bits.extend(["moves", _bigram_token_str(move_text)])
        pairs.append((" ".join(anchor_bits), " ".join(positive_bits)))
    return {
        "anchor": [anchor for anchor, _ in pairs],
        "positive": [positive for _, positive in pairs],
    }
def build_opening_pairs(row_batch: dict) -> dict:
    """Convert a batch of opening rows into anchor/positive text pairs.

    For each (``name``, ``eco``, ``pgn``) row the anchor is ``"<name> <eco>"``
    and the positive is ``"name <name> eco <eco> pgn <moves>"``, where the
    moves are the PGN with move numbers removed plus bigram tokens.
    """
    out = {"anchor": [], "positive": []}
    rows = zip(row_batch["name"], row_batch["eco"], row_batch["pgn"])
    for opening_name, eco_code, pgn_text in rows:
        move_tokens = _bigram_token_str(_strip_pgn_move_numbers(pgn_text))
        out["anchor"].append(f"{opening_name} {eco_code}")
        out["positive"].append(f"name {opening_name} eco {eco_code} pgn {move_tokens}")
    return out
def load_chess_pairs() -> tuple[Dataset, Dataset]:
    """Build the (train, holdout) pair datasets with leak-free held-out anchors.

    The holdout anchors are rare theme combinations that are NEVER seen in
    train. An earlier eval used the most common theme strings as queries; the
    model memorized those, so the eval measured recall of memorized lookups
    rather than generalization. The compositional split works as follows:

      - Pick anchor strings whose frequency lies in
        [HELDOUT_FREQ_MIN, HELDOUT_FREQ_MAX] (the minimum is lowered to 2 in
        smoke mode): rare enough to be informative, common enough to have
        several positives for multi-relevant eval.
      - Remove ALL pairs with those anchors from train (no leakage).
      - Use those anchors as eval queries; the removed pairs become the eval
        corpus.
      - Individual theme tokens inside those anchors still occur in other
        training anchors, so only the *combination* is unseen.

    Returns:
        ``(train, holdout)`` where ``train`` is the shuffled concatenation of
        the non-held-out puzzle pairs and all opening pairs, and ``holdout``
        contains every puzzle pair whose anchor was held out.
    """
    # Local import keeps this block self-contained regardless of the file's
    # top-level import list.
    from collections import Counter

    logging.info("Loading Lichess/chess-puzzles (5.8M rows)")
    puzzles = load_dataset("Lichess/chess-puzzles", split="train")
    if SMOKE_TEST:
        # Clamp so a short dataset cannot make select() index out of range.
        puzzles = puzzles.select(range(min(2_000, len(puzzles))))
    pair_puzzles = puzzles.map(
        build_puzzle_pairs,
        batched=True,
        batch_size=10_000,
        remove_columns=puzzles.column_names,
        desc="puzzles -> pairs",
    )
    logging.info(f"  built {len(pair_puzzles):,} puzzle pairs")

    logging.info("Loading Lichess/chess-openings (3.6K rows)")
    openings = load_dataset("Lichess/chess-openings", split="train").remove_columns(["img"])
    pair_openings = openings.map(
        build_opening_pairs,
        batched=True,
        remove_columns=openings.column_names,
        desc="openings -> pairs",
    )
    logging.info(f"  built {len(pair_openings):,} opening pairs")

    # Count how often each anchor string occurs across the puzzle pairs.
    logging.info("Computing anchor frequencies for held-out selection")
    anchors = pair_puzzles["anchor"]
    freq = Counter(anchors)
    logging.info(f"  {len(freq):,} unique anchors in puzzle pairs")

    # In smoke mode the corpus is tiny, so lower the minimum frequency to
    # still obtain some held-out queries.
    min_freq = 2 if SMOKE_TEST else HELDOUT_FREQ_MIN
    max_freq = HELDOUT_FREQ_MAX
    rare_pool = sorted(
        ((a, c) for a, c in freq.items() if min_freq <= c <= max_freq),
        key=lambda kv: kv[1],  # ascending: rarest first (stable for ties)
    )
    n_queries_target = 20 if SMOKE_TEST else EVAL_QUERIES
    if len(rare_pool) < n_queries_target:
        # Report the range that was actually applied (min_freq differs from
        # HELDOUT_FREQ_MIN in smoke mode).
        logging.warning(
            f"Only {len(rare_pool)} anchors in freq range [{min_freq},{max_freq}]; "
            f"using all of them ({n_queries_target} requested)"
        )
    selected = rare_pool[:n_queries_target]
    heldout_anchors = {a for a, _ in selected}
    freq_lo, freq_hi = (selected[0][1], selected[-1][1]) if selected else (0, 0)
    logging.info(
        f"  selected {len(heldout_anchors)} held-out anchors "
        f"(freq range: {freq_lo}..{freq_hi})"
    )

    # Single pass: pairs whose anchor is held out -> eval; the rest -> train.
    held_idx: list[int] = []
    train_idx: list[int] = []
    for i, a in enumerate(anchors):
        (held_idx if a in heldout_anchors else train_idx).append(i)
    holdout = pair_puzzles.select(held_idx)
    train_puzzles = pair_puzzles.select(train_idx)
    logging.info(
        f"  split by held-out anchors: train={len(train_puzzles):,}, holdout={len(holdout):,}"
    )

    # Train includes the (non-held) puzzle pairs + all openings.
    train = concatenate_datasets([train_puzzles, pair_openings]).shuffle(seed=12)
    logging.info(f"  train: {len(train):,} pairs | holdout: {len(holdout):,} pairs")
    return train, holdout
def make_balanced_dataset(train: Dataset, n_per_anchor: int) -> Dataset:
    """Limit every anchor to at most `n_per_anchor` of its positives.

    Rows are grouped by anchor; anchors with more than `n_per_anchor`
    positives keep a random sample of that size (seeded RNG, seed 12), the
    others keep all of theirs. The result is a new shuffled two-column
    dataset (`anchor`, `positive`). The intent is to reduce the redundancy of
    heavily repeated anchors so the model cannot memorize specific
    (anchor, positive) pairings while still seeing several positives each.
    """
    grouped: dict[str, list[str]] = defaultdict(list)
    for example in train:
        grouped[example["anchor"]].append(example["positive"])

    sampler = random.Random(12)
    out_anchors: list[str] = []
    out_positives: list[str] = []
    for key, candidates in grouped.items():
        # Only draw from the RNG when the group actually exceeds the cap.
        if len(candidates) > n_per_anchor:
            chosen = sampler.sample(candidates, n_per_anchor)
        else:
            chosen = candidates
        out_anchors.extend(key for _ in chosen)
        out_positives.extend(chosen)

    logging.info(
        f"Balanced dataset: {len(grouped):,} unique anchors -> "
        f"{len(out_anchors):,} pairs (cap {n_per_anchor}/anchor)"
    )
    balanced = Dataset.from_dict({"anchor": out_anchors, "positive": out_positives})
    return balanced.shuffle(seed=12)
def make_anchor_masker(mask_prob: float, rng_seed: int = 12):
    """Build a batch transform that randomly masks anchor tokens with [UNK].

    Token-bag dropout: each whitespace-separated token of an anchor is
    independently replaced by "[UNK]" with probability `mask_prob`, which
    forces the model to rely on the remaining tokens instead of memorizing
    the exact combination.

    Args:
        mask_prob: Per-token masking probability. Values <= 0 disable masking.
        rng_seed: Seed of the private RNG shared by all calls of the
            returned transform.

    Returns:
        None when `mask_prob <= 0`; otherwise a callable taking a batch dict
        with "anchor" and "positive" lists and returning a dict with the
        masked anchors and the untouched positives (suitable for
        `Dataset.set_transform`).
    """
    if mask_prob <= 0:
        return None
    rng = random.Random(rng_seed)

    def _mask_one(anchor: str) -> str:
        tokens = anchor.split()
        # Nothing sensible to drop from an empty or single-token anchor.
        if len(tokens) <= 1:
            return anchor
        kept = [t if rng.random() >= mask_prob else "[UNK]" for t in tokens]
        # Guard against masking everything: put ONE original token back at
        # its own position (a single index, so the restored token is never
        # moved to a slot it did not come from).
        if all(t == "[UNK]" for t in kept):
            pos = rng.randrange(len(tokens))
            kept[pos] = tokens[pos]
        return " ".join(kept)

    def _mask(batch: dict) -> dict:
        return {
            "anchor": [_mask_one(a) for a in batch["anchor"]],
            "positive": batch["positive"],
        }

    return _mask
def train_chess_tokenizer(train: Dataset) -> Tokenizer:
    """Train or load a WordLevel tokenizer for the chess corpus.

    Every space-separated unit (theme word, opening word, ECO code, UCI move,
    SAN move, joined move bigram) becomes one whole token. Compare to BERT
    WordPiece, which fragments "f2g3" into several subword pieces -- a
    token-bag wastes capacity on subword joins that carry no chess meaning.

    The pre-tokenizer is `WhitespaceSplit`, which splits on whitespace ONLY.
    The `Whitespace` pre-tokenizer additionally splits on punctuation (regex
    `\\w+|[^\\w\\s]+`), which would break units such as "f3e5+c6g2" or
    "Nyezhmetdinov-Rossolimo" into several tokens and contradict the
    one-unit-one-token design.

    Caching: if TOKENIZER_PATH exists, it is loaded and returned instead of
    rebuilding, unless RETRAIN_TOKENIZER is set. A cache produced with a
    different pre-tokenizer is NOT detected -- set RETRAIN_TOKENIZER=1 to
    force a rebuild after changing tokenizer settings.

    Args:
        train: Dataset with "anchor" and "positive" string columns; both
            columns are fed to the vocabulary pass.

    Returns:
        The loaded or freshly trained tokenizer (also saved to
        TOKENIZER_PATH when freshly trained).
    """
    # Function-scope import: `tokenizers` is already a dependency of this
    # file; importing here avoids relying on the top-level import list.
    from tokenizers.pre_tokenizers import WhitespaceSplit

    if not RETRAIN_TOKENIZER and os.path.exists(TOKENIZER_PATH):
        tok = Tokenizer.from_file(TOKENIZER_PATH)
        logging.info(
            f"Reusing cached tokenizer ({tok.get_vocab_size():,} tokens) from {TOKENIZER_PATH}"
        )
        return tok

    logging.info(f"Training WordLevel tokenizer on {len(train):,} pairs (vocab={VOCAB_SIZE})")
    tok = Tokenizer(WordLevel(unk_token="[UNK]"))
    tok.pre_tokenizer = WhitespaceSplit()
    trainer = WordLevelTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=["[UNK]", "[PAD]"],
        min_frequency=2,
    )

    def text_iter():
        # Stream both sides of every pair so the full corpus is never
        # materialized as one list.
        for row in train:
            yield row["anchor"]
            yield row["positive"]

    tok.train_from_iterator(text_iter(), trainer=trainer, length=2 * len(train))
    actual_vocab = tok.get_vocab_size()
    logging.info(f"  tokenizer trained: {actual_vocab:,} tokens (cap was {VOCAB_SIZE:,})")

    # dirname is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(TOKENIZER_PATH) or ".", exist_ok=True)
    tok.save(TOKENIZER_PATH)
    logging.info(f"  saved tokenizer to {TOKENIZER_PATH}")
    return tok
+def _strip_theme_echo(positive: str) -> str:
+    """Eval corpus must not echo the themes the query asks about, or the
+    baseline (random-init) scores high just from lexical token overlap. Keep
+    only the moves segment."""
+    idx = positive.find(" moves ")
+    return positive[idx + 1 :] if idx != -1 else positive
def _build_compositional_ir_evaluator(
    holdout: Dataset, corpus: dict[str, str], name: str
) -> InformationRetrievalEvaluator:
    """Build the evaluator whose queries are the held-out anchor strings.

    Row `i` of `holdout` corresponds to corpus id `d{i}`; every distinct
    anchor becomes one query whose relevant documents are all rows that share
    that anchor. Queries are numbered `q0, q1, ...` in descending order of
    relevant-document count.
    """
    docs_by_anchor: dict[str, set[str]] = defaultdict(set)
    for idx, example in enumerate(holdout):
        docs_by_anchor[example["anchor"]].add(f"d{idx}")

    ranked = list(docs_by_anchor.items())
    ranked.sort(key=lambda item: -len(item[1]))

    queries: dict[str, str] = {}
    relevant_docs: dict[str, set[str]] = {}
    for rank, (anchor_text, doc_ids) in enumerate(ranked):
        qid = f"q{rank}"
        queries[qid] = anchor_text
        relevant_docs[qid] = doc_ids

    total_relevant = sum(len(ids) for ids in relevant_docs.values())
    avg_rel = total_relevant / max(1, len(relevant_docs))
    logging.info(
        f"  [{name}] {len(queries)} queries (unseen combos), avg relevant/query={avg_rel:.1f}"
    )
    return _ir_evaluator(queries, corpus, relevant_docs, name)
def _build_single_theme_ir_evaluator(
    holdout: Dataset, corpus: dict[str, str], name: str
) -> InformationRetrievalEvaluator:
    """Build the evaluator whose queries are individual anchor tokens.

    Each whitespace-separated token occurring in a held-out anchor is a
    candidate query; its relevant documents are all held-out rows (`d{i}`)
    whose anchor contains that token. Tokens with fewer than 3 relevant
    documents (2 in smoke mode) are dropped. Queries are numbered
    `t0, t1, ...` in descending order of relevant-document count. This is
    coarser than the compositional eval but probes whether single-token
    embeddings are meaningful on their own.
    """
    docs_by_token: dict[str, set[str]] = defaultdict(set)
    for idx, example in enumerate(holdout):
        doc_id = f"d{idx}"
        for piece in example["anchor"].split():
            docs_by_token[piece].add(doc_id)

    threshold = 2 if SMOKE_TEST else 3
    ranked = [
        (piece, doc_ids)
        for piece, doc_ids in docs_by_token.items()
        if len(doc_ids) >= threshold
    ]
    ranked.sort(key=lambda item: -len(item[1]))

    queries: dict[str, str] = {}
    relevant_docs: dict[str, set[str]] = {}
    for rank, (piece, doc_ids) in enumerate(ranked):
        qid = f"t{rank}"
        queries[qid] = piece
        relevant_docs[qid] = doc_ids

    total_relevant = sum(len(ids) for ids in relevant_docs.values())
    avg_rel = total_relevant / max(1, len(relevant_docs))
    logging.info(
        f"  [{name}] {len(queries)} single-token queries, avg relevant/query={avg_rel:.1f}"
    )
    return _ir_evaluator(queries, corpus, relevant_docs, name)
def _ir_evaluator(queries, corpus, relevant_docs, name):
    """Create an InformationRetrievalEvaluator with this script's shared settings.

    Both eval variants use the same metric cut-offs (NDCG@10, MRR@10,
    accuracy and precision/recall at 1 and 10), batch size 256 and no
    progress bar; only the queries, relevance judgements and name differ.
    """
    shared_settings = {
        "ndcg_at_k": [10],
        "mrr_at_k": [10],
        "accuracy_at_k": [1, 10],
        "precision_recall_at_k": [1, 10],
        "show_progress_bar": False,
        "batch_size": 256,
    }
    return InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=name,
        **shared_settings,
    )
def build_ir_evaluator(holdout: Dataset, name: str = "chess-ir") -> SequentialEvaluator:
    """Combine the compositional and single-theme evaluators over one corpus.

    The corpus maps `d{i}` to the i-th held-out positive with its theme
    prefix stripped. The two evaluators run in sequence; the main score is
    the first (compositional) evaluator's score, the single-theme evaluator
    (named `{name}-tokens`) is informational only.
    """
    corpus: dict[str, str] = {}
    for idx, example in enumerate(holdout):
        corpus[f"d{idx}"] = _strip_theme_echo(example["positive"])
    logging.info(f"IR eval setup ({len(corpus)} corpus docs):")

    compositional = _build_compositional_ir_evaluator(holdout, corpus, name=name)
    single_theme = _build_single_theme_ir_evaluator(holdout, corpus, name=f"{name}-tokens")

    def _first_score(scores):
        # The compositional evaluator is listed first, so its score is the
        # one reported as the sequential main score.
        return scores[0]

    return SequentialEvaluator(
        [compositional, single_theme],
        main_score_function=_first_score,
    )
def main() -> None:
    """Run the full pipeline: data, tokenizer, model, training, eval, upload.

    Steps, in order: load the train/holdout pairs; train (or load) the
    tokenizer on the un-balanced training pairs; cap positives per anchor;
    optionally install on-the-fly anchor masking; build a random-init
    StaticEmbedding model; record a baseline IR score; train with early
    stopping; re-evaluate and log a verdict; save the model locally and,
    outside smoke mode, push it to the Hub.
    """
    setup_logging()
    pairs, heldout_pairs = load_chess_pairs()
    if SMOKE_TEST:
        smoke_rows = min(500, len(pairs))
        pairs = pairs.select(range(smoke_rows))

    # The vocabulary pass runs BEFORE balancing so each token is counted on
    # as many rows as possible.
    vocab_tokenizer = train_chess_tokenizer(pairs)

    # Contrastive training uses the down-sampled, per-anchor-capped pairs.
    pairs = make_balanced_dataset(pairs, BALANCED_POSITIVES_PER_ANCHOR)

    # Masking (if enabled) is applied lazily via set_transform.
    anchor_masker = make_anchor_masker(ANCHOR_MASK_PROB)
    if anchor_masker is not None:
        logging.info(f"Anchor token masking enabled (p={ANCHOR_MASK_PROB})")
        pairs.set_transform(anchor_masker)

    logging.info(f"Random-init StaticEmbedding (dim={EMBEDDING_DIM})")
    embedding_module = StaticEmbedding(vocab_tokenizer, embedding_dim=EMBEDDING_DIM)
    card_data = SentenceTransformerModelCardData(
        language="en",
        license="apache-2.0",
        model_name=f"Static chess embedding ({EMBEDDING_DIM}d) -- themes/openings <-> positions",
    )
    model = SentenceTransformer(modules=[embedding_module], model_card_data=card_data)

    evaluator = build_ir_evaluator(heldout_pairs)

    ranking_loss = MultipleNegativesRankingLoss(model)
    if not DISABLE_MATRYOSHKA:
        loss = MatryoshkaLoss(model, ranking_loss, matryoshka_dims=MATRYOSHKA_DIMS)
    else:
        logging.info("Matryoshka DISABLED -- training at single dim (diagnostic)")
        loss = ranking_loss

    logging.info("Baseline evaluation (random init -- expect near-zero):")
    with autocast_ctx():
        baseline_results = evaluator(model)
        # Read primary_metric only AFTER the evaluator has run, matching the
        # original evaluation order.
        baseline_eval = baseline_results[evaluator.primary_metric]
    metric_key = f"eval_{evaluator.primary_metric}"
    logging.info(f"  baseline {evaluator.primary_metric} = {baseline_eval:.4f}")

    # Smoke runs do a single step; otherwise honor the override, and fall
    # back to -1 (epoch-driven) when none is set.
    max_steps = 1 if SMOKE_TEST else (MAX_STEPS_OVERRIDE or -1)

    # Fractional values are relative to the run length: 0.05 -> 20 evals/run.
    eval_steps = EVAL_STEPS_OVERRIDE or 0.05
    save_steps = EVAL_STEPS_OVERRIDE or 0.05

    # Prefer bf16 when the GPU supports it, else fp16; neither on CPU.
    use_bf16 = IS_CUDA and torch.cuda.is_bf16_supported()
    use_fp16 = IS_CUDA and not torch.cuda.is_bf16_supported()

    args = SentenceTransformerTrainingArguments(
        output_dir=OUTPUT_DIR,
        # The balanced dataset is small, so many epochs are scheduled and
        # early stopping cuts off the excess.
        num_train_epochs=20,
        max_steps=max_steps,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=1e-2,  # previously 5e-2; slower convergence, later peak
        weight_decay=0.01,  # previously 0.0; regularizes the embedding table
        warmup_steps=0.1,
        lr_scheduler_type="linear",
        bf16=use_bf16,
        fp16=use_fp16,
        # Previously NO_DUPLICATES, whose per-batch cost grew over the epoch
        # with so few unique anchors. The plain random sampler is fast and
        # tolerates some within-batch anchor duplication.
        batch_sampler=BatchSamplers.BATCH_SAMPLER,
        eval_strategy="steps",
        eval_steps=eval_steps,
        save_strategy="steps",
        save_steps=save_steps,
        save_total_limit=2,
        logging_steps=0.01,
        logging_first_step=True,
        load_best_model_at_end=True,
        metric_for_best_model=metric_key,
        greater_is_better=True,
        # Experiment tracking is disabled: Trackio crashed at the first
        # checkpoint push (empty `router_mapping` struct vs. parquet).
        report_to="none",
        run_name=RUN_NAME,
        seed=12,
        # On HF Jobs the container is discarded after the run, so every
        # checkpoint is pushed to the Hub to survive a timeout; the explicit
        # push at the end of this function is the second safety net.
        push_to_hub=not SMOKE_TEST,
        hub_model_id=HUB_MODEL_ID,
        hub_strategy="every_save",
    )

    training_callbacks = [
        # Stop when the primary metric fails to improve for 3 evals (more
        # slack than the patience=2 used at lr=5e-2, since the lower lr
        # gives smoother curves).
        EarlyStoppingCallback(early_stopping_patience=3),
        # Per-step memory + dt logging.
        StepTimingCallback(),
    ]
    trainer = SentenceTransformerTrainer(
        model=model,
        args=args,
        train_dataset=pairs,
        loss=loss,
        evaluator=evaluator,
        callbacks=training_callbacks,
    )
    trainer.train()

    logging.info("Post-training evaluation:")
    with autocast_ctx():
        final_results = evaluator(model)
        score = final_results[evaluator.primary_metric]
    delta = score - baseline_eval
    if delta >= 0.005:
        verdict = "WIN"
    elif delta >= 0:
        verdict = "MARGINAL"
    else:
        verdict = "REGRESSION"
    logging.info(
        f"VERDICT: {verdict} | score={score:.4f} | baseline={baseline_eval:.4f} | delta={delta:+.4f}"
    )

    final_dir = f"{OUTPUT_DIR}/final"
    model.save_pretrained(final_dir)
    logging.info(f"Saved final model to {final_dir}")

    if SMOKE_TEST:
        logging.info("SMOKE_TEST=1: skipping Hub push")
        return

    # Best-effort upload: a failed push is logged with its traceback but
    # does not abort the run, since the model is already saved locally.
    try:
        commit_url = model.push_to_hub(HUB_MODEL_ID)
        logging.info(f"Pushed model to {commit_url.rsplit('/commit/', 1)[0]}")
    except Exception:
        import traceback

        logging.error(f"Hub push failed:\n{traceback.format_exc()}")
# Script entry point: run the training pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()