Sentence Similarity
sentence-transformers
Safetensors
English
static-embedding
chess
retrieval
exploratory
Instructions to use oneryalcin/static-embedding-chess with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- sentence-transformers
How to use oneryalcin/static-embedding-chess with sentence-transformers:
```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("oneryalcin/static-embedding-chess")

sentences = [
    "That is a happy person",
    "That is a happy dog",
    "That is a very happy person",
    "Today is a sunny day"
]
embeddings = model.encode(sentences)

similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [4, 4]
```
- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| # /// script | |
| # requires-python = ">=3.10" | |
| # dependencies = ["chess", "datasets>=2.19", "tqdm"] | |
| # /// | |
| """Deterministic chess→English converter for puzzles. | |
| Generates a standardized English-readable description of each puzzle WITHOUT | |
| any LLM. Uses python-chess for UCI→SAN conversion (with board context), regex | |
| for decamelizing themes, and a fixed template. | |
| For each puzzle, produces a doc like: | |
| "White to move. Short middlegame puzzle with crushing, fork, hanging piece | |
| motifs. Opening: Kings Pawn Game. Moves: Bxg3 Rxe7 Qb1+ Nc1 Qxc1+ | |
| Qxc1" | |
| Pretrained English cross-encoders have seen SAN notation in chess web content | |
| during pretraining, so this doc is semantically meaningful to them — unlike | |
| the raw UCI form (`f2g3`) which gets fragmented into character pieces. | |
| Output: parquet at models/puzzles_english.parquet with columns: | |
| PuzzleId, anchor_en (themes + opening rendered as English), doc_en | |
| Run: | |
| SMOKE_TEST=1 uv run --exclude-newer=2026-05-12 convert_to_english.py | |
| uv run --exclude-newer=2026-05-12 convert_to_english.py | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import re | |
| import sys | |
| import chess | |
| from datasets import Dataset, load_dataset | |
| from tqdm import tqdm | |
# Switch stdout to line-buffered mode.
sys.stdout.reconfigure(line_buffering=True)

# Destination of the generated parquet file.
OUTPUT_PATH = "models/puzzles_english.parquet"

# SMOKE_TEST=1 in the environment restricts the run to a small slice.
SMOKE_TEST = os.getenv("SMOKE_TEST") == "1"

# Puzzle-length theme tag → English wording used in the generated text.
LENGTH_MAP = dict(
    oneMove="single-move",
    short="short",
    long="long",
    veryLong="very long",
)

# Theme tags naming the game phase; they are emitted verbatim.
PHASE_TAGS = {"opening", "middlegame", "endgame"}

# Theme tags naming the puzzle length (exactly the keys of LENGTH_MAP).
LENGTH_TAGS = set(LENGTH_MAP)

# Matches `mateIn1`, `mateIn2`, ...; group 1 captures the move count.
MATE_IN_PATTERN = re.compile(r"^mateIn(\d+)$")
| # Specific mate-pattern names (their English form is just decamel) | |
# camelCase → "camel case" via regex.
# A word boundary is inserted:
#   * before an uppercase letter that follows a lowercase letter OR a digit
#     (`backRank` → `back Rank`, `F2F7` → `F2 F7`), and
#   * between an uppercase run and a following capitalised word
#     (`HTTPServer` → `HTTP Server`).
# The digit case is required for square-pair tags such as `attackingF2F7`;
# without it the two squares stay glued together as `f2f7`.
_CAMEL_BOUNDARY = re.compile(r"(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])")


def decamelize(tag: str) -> str:
    """Turn a camelCase tag into lowercase, space-separated words.

    `backRankMate` → 'back rank mate'. `attackingF2F7` → 'attacking f2 f7'.

    Args:
        tag: A camelCase identifier.

    Returns:
        The tag with a single space inserted at every word boundary matched
        by `_CAMEL_BOUNDARY`, lowercased. An empty tag yields an empty string.
    """
    return _CAMEL_BOUNDARY.sub(" ", tag).lower()
def themes_to_english(themes: list[str]) -> tuple[str, str, str, list[str]]:
    """Split theme tags into structural parts and free-form motifs.

    Each tag is classified in this order:
      * a tag in `PHASE_TAGS` becomes the phase (kept verbatim);
      * a tag in `LENGTH_TAGS` becomes the length, worded via `LENGTH_MAP`;
      * a tag matching `MATE_IN_PATTERN` is remembered as the mate depth;
      * anything else is decamelized and collected as a motif.
    When a category occurs more than once, the last occurrence wins.

    Args:
        themes: Theme tags of one puzzle; may be empty.

    Returns:
        A 4-tuple `(phase, length, "", motifs)`. The third slot is always an
        empty string (the side to move is derived from the FEN elsewhere).
        `motifs` keeps the input order, with "mate in N" appended last when a
        mate-in-N tag was seen. Empty input gives `("", "", "", [])`.
    """
    if not themes:
        return ("", "", "", [])

    phase, length = "", ""
    mate_depth = None
    motifs = []

    for theme in themes:
        if theme in PHASE_TAGS:
            phase = theme
            continue
        if theme in LENGTH_TAGS:
            length = LENGTH_MAP[theme]
            continue
        mate_match = MATE_IN_PATTERN.match(theme)
        if mate_match:
            mate_depth = int(mate_match.group(1))
        else:
            motifs.append(decamelize(theme))

    # The mate depth is phrased in natural language and listed after all
    # other motifs, regardless of where the tag appeared.
    if mate_depth is not None:
        motifs.append(f"mate in {mate_depth}")

    return phase, length, "", motifs
def opening_tags_to_english(opening_tags: list[str]) -> str:
    """Render the most specific opening tag as plain words.

    `['Kings_Pawn_Game', 'Kings_Pawn_Game_Leonardis_Variation']`
    → 'Kings Pawn Game Leonardis Variation'.

    The longest tag is treated as the most specific one; on a length tie the
    earliest tag wins. Underscores become spaces and nothing else is altered
    (apostrophes are not restored).

    Args:
        opening_tags: Opening tags of one puzzle; may be empty.

    Returns:
        The chosen tag with underscores replaced by spaces, or "" when there
        are no tags.
    """
    if not opening_tags:
        return ""

    most_specific = None
    for tag in opening_tags:
        # Strict comparison keeps the first of several equally long tags.
        if most_specific is None or len(tag) > len(most_specific):
            most_specific = tag
    return " ".join(most_specific.split("_"))
def uci_to_san_sequence(fen: str, uci_moves: str) -> str:
    """Translate a whitespace-separated UCI move list into SAN.

    The moves are replayed on a board built from `fen`, so every SAN string
    is generated against the position in which the move is played.

    Args:
        fen: Starting position.
        uci_moves: Whitespace-separated UCI moves, e.g. "e2e4 e7e5".

    Returns:
        The SAN moves joined by single spaces. If a move cannot be parsed,
        rendered or played, conversion stops there and only the moves
        converted so far are returned. If the board cannot be set up at all,
        `uci_moves` is returned unchanged as a fallback.
    """
    try:
        position = chess.Board(fen)
        tokens = uci_moves.split()
    except Exception:
        # Unusable starting position/input: fall back to the raw UCI text.
        return uci_moves

    converted = []
    for token in tokens:
        try:
            step = chess.Move.from_uci(token)
            # SAN must be computed before the move is pushed.
            converted.append(position.san(step))
            position.push(step)
        except Exception:
            # Invalid move: drop it and everything after it.
            break
    return " ".join(converted)
def side_to_move(fen: str) -> str:
    """Name the side whose turn it is according to a FEN string.

    Args:
        fen: A FEN string; its second whitespace-separated field is read.

    Returns:
        "White" when the second field is exactly "w"; "Black" in every other
        case, including a FEN with fewer than two fields.
    """
    fields = fen.split()
    white_to_move = len(fields) >= 2 and fields[1] == "w"
    return "White" if white_to_move else "Black"
def build_english_doc(row: dict) -> str:
    """Compose the English description ("doc" side) of one puzzle row.

    The text is assembled from up to four space-separated pieces:
      1. "<Side> to move."
      2. "<length> <phase> puzzle[ with <motif>, <motif>, ... motifs]."
         (first letter capitalised; length and phase are omitted when empty)
      3. "Opening: <opening>."  (only when an opening is available)
      4. "Moves: <SAN moves>"   (only when at least one move was converted)

    Args:
        row: Puzzle record providing "FEN", "Themes" and "Moves", and
            optionally "OpeningTags".

    Returns:
        The assembled description.
    """
    fen = row["FEN"]
    # NOTE(review): the side is read straight from the FEN's active-colour
    # field. If the FEN is the position *before* the first listed move (as in
    # the Lichess puzzle export, to be confirmed), this names the side that
    # plays the first move in "Moves", not necessarily the solving side.
    side = side_to_move(fen)
    phase, length, _, motifs = themes_to_english(row["Themes"] or [])
    opening = opening_tags_to_english(row.get("OpeningTags") or [])
    san = uci_to_san_sequence(fen, row["Moves"])

    # e.g. "short middlegame puzzle"; empty qualifiers are skipped.
    headline = " ".join(word for word in (length, phase, "puzzle") if word)
    if motifs:
        headline += f" with {', '.join(motifs)} motifs"

    sentences = [f"{side} to move.", headline.capitalize() + "."]
    if opening:
        sentences.append(f"Opening: {opening}.")
    if san:
        sentences.append(f"Moves: {san}")
    return " ".join(sentences)
def build_english_anchor(row: dict) -> str:
    """Compose the English query ("anchor" side) of one puzzle row.

    Only themes and opening are used; the position and moves are not. The
    pieces appear in the order: comma-separated motifs, length, phase,
    opening. Empty pieces are left out. Used as the query for
    retrieval/reranker training.

    Args:
        row: Puzzle record providing "Themes" and optionally "OpeningTags".

    Returns:
        The pieces joined by single spaces, stripped of surrounding
        whitespace.
    """
    phase, length, _, motifs = themes_to_english(row["Themes"] or [])
    opening = opening_tags_to_english(row.get("OpeningTags") or [])

    pieces = [", ".join(motifs)] if motifs else []
    pieces.extend(text for text in (length, phase, opening) if text)
    return " ".join(pieces).strip()
def main():
    """Convert the Lichess puzzle dataset to English text and save it.

    Steps:
      1. Load the "train" split of `Lichess/chess-puzzles` (restricted to at
         most the first 2,000 rows when `SMOKE_TEST` is set).
      2. Map every row that has themes to `PuzzleId`, `anchor_en` and
         `doc_en`; rows without themes are dropped.
      3. Write the result to `OUTPUT_PATH` as parquet.
      4. Print a few sample conversions.

    The file is written before the samples are printed, and sample indices
    beyond the end of the output are skipped, so a small result can neither
    raise IndexError nor lose the converted data.
    """
    print("Loading puzzles...")
    puzzles = load_dataset("Lichess/chess-puzzles", split="train")
    if SMOKE_TEST:
        puzzles = puzzles.select(range(min(2_000, len(puzzles))))
    print(f" {len(puzzles):,} rows")
    print("Converting to English (deterministic)...")

    def proc(batch):
        """Convert one column-oriented batch; rows without themes are skipped."""
        # Fetch every column once instead of once per row.
        columns = {key: batch[key] for key in batch}
        ids, anchors, docs = [], [], []
        for i in range(len(columns["PuzzleId"])):
            r = {key: values[i] for key, values in columns.items()}
            if not r["Themes"]:
                continue
            ids.append(r["PuzzleId"])
            anchors.append(build_english_anchor(r))
            docs.append(build_english_doc(r))
        return {"PuzzleId": ids, "anchor_en": anchors, "doc_en": docs}

    out = puzzles.map(
        proc, batched=True, batch_size=10_000,
        remove_columns=puzzles.column_names,
        num_proc=4,
    )
    print(f" produced {len(out):,} English-converted rows")

    # Persist first so that nothing below can discard the converted data.
    # Creating the directory is harmless if the writer already does so.
    out_dir = os.path.dirname(OUTPUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out.to_parquet(OUTPUT_PATH)

    print("\n=== Sample conversions ===")
    for i in (0, 100, 1000):
        if i >= len(out):
            # Indices are ascending, so later samples are out of range too.
            break
        r = out[i]
        print(f"\nPuzzleId: {r['PuzzleId']}")
        print(f" anchor: {r['anchor_en']!r}")
        print(f" doc: {r['doc_en'][:200]!r}")

    print(f"\nSaved to {OUTPUT_PATH} ({os.path.getsize(OUTPUT_PATH) / 1e6:.1f} MB)")


if __name__ == "__main__":
    main()