Sentence Similarity
sentence-transformers
Safetensors
English
static-embedding
chess
retrieval
exploratory
Instructions to use oneryalcin/static-embedding-chess with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- sentence-transformers
How to use oneryalcin/static-embedding-chess with sentence-transformers:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("oneryalcin/static-embedding-chess")

sentences = [
    "That is a happy person",
    "That is a happy dog",
    "That is a very happy person",
    "Today is a sunny day"
]
embeddings = model.encode(sentences)

similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [4, 4]
- Notebooks
- Google Colab
- Kaggle
File size: 6,184 Bytes
f8392aa
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "datasets>=2.19.0",
# "openai>=1.0",
# "sentence-transformers[train]>=5.5.0",
# "tqdm",
# "numpy",
# ]
# ///
"""Generate natural-language definitions for each Lichess theme via DeepSeek,
then embed those definitions with a general sentence-transformer (MPNet).
The resulting (theme_token, definition_embedding) pairs form a "chess-aware
teacher" — an English description of each chess concept that MPNet CAN
understand semantically. We can then distill those embeddings into our
StaticEmbedding model's token table.
Solves the "MPNet doesn't know chess" problem: MPNet can't read UCI moves,
but it CAN read English ("A tactical motif where one piece attacks two pieces
simultaneously" → semantically near "A tactic where you create a double
attack threatening two pieces at once"). Token-level semantic structure
emerges from the LLM bridge.
Run:
SMOKE_TEST=1 uv run --exclude-newer=2026-05-12 generate_theme_defs.py
uv run --exclude-newer=2026-05-12 generate_theme_defs.py
"""
import json
import os
import subprocess
import sys
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
from datasets import Dataset, load_dataset
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
# DeepSeek chat model that writes the theme definitions.
MODEL = "deepseek-v4-flash"
# Sentence-transformer used to embed the generated definitions (the "teacher").
TEACHER_MODEL = "sentence-transformers/all-mpnet-base-v2"
# Parquet file that receives the theme / definition / embedding columns.
OUTPUT_PATH = "models/theme_definitions.parquet"
# Set SMOKE_TEST=1 in the environment to scan fewer puzzles and define only
# the first 10 themes.
SMOKE_TEST = os.environ.get("SMOKE_TEST") == "1"
# Number of threads issuing definition requests concurrently.
PARALLEL_WORKERS = 4
# System prompt sent with every definition request: asks for exactly one
# English sentence of 10-25 words per theme name, with no extra text, and
# shows three worked examples (fork, backRankMate, zugzwang).
SYSTEM_PROMPT = """You write concise dictionary-style definitions of chess
concepts. Given a theme/concept name (often in camelCase from Lichess.org's
puzzle tagging system), write a single English sentence of 10-25 words
explaining the concept. Be specific and use the standard chess vocabulary that
would appear in any chess textbook.
Output ONLY the definition sentence. No labels, no quotes, no commentary.
Examples:
Input: fork
Output: A tactical motif where a single piece attacks two or more enemy pieces simultaneously, forcing a material gain.
Input: backRankMate
Output: A checkmate delivered along the opponent's back rank, typically with a rook or queen, when the king is trapped by its own pawns.
Input: zugzwang
Output: A position in which any move worsens the player's position, so being forced to move becomes a disadvantage.
"""
def get_deepseek_key():
    """Return the DeepSeek API key, or None when no key can be found.

    The macOS keychain is queried first through the ``security`` command
    (generic password stored under the service name ``deepseek-api``). When
    that lookup cannot run, times out, exits non-zero, or prints nothing, the
    ``DEEPSEEK_API_KEY`` environment variable is used instead.

    Returns:
        The key string, or None when neither source provides one.
    """
    try:
        r = subprocess.run(
            ["security", "find-generic-password", "-s", "deepseek-api", "-w"],
            capture_output=True, text=True, timeout=5,
        )
    except (OSError, subprocess.TimeoutExpired):
        # The `security` executable may be missing (FileNotFoundError is an
        # OSError) or the call may exceed the 5 s timeout; neither should
        # prevent the environment-variable fallback from being tried.
        r = None

    if r is not None and r.returncode == 0:
        key = r.stdout.strip()
        if key:
            return key
    return os.environ.get("DEEPSEEK_API_KEY")
def define_theme(client, theme, debug=False):
    """Request a one-sentence definition of ``theme`` from the chat model.

    Args:
        client: Object exposing ``chat.completions.create``.
        theme: Theme name, sent verbatim as the user message.
        debug: When true, print the exception type and message on failure.

    Returns:
        The reply text with surrounding whitespace stripped, or None when the
        reply is empty or any exception is raised while requesting or reading
        it.
    """
    try:
        create = client.chat.completions.create
        request = {
            "model": MODEL,
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": theme},
            ],
            "temperature": 0.2,
            # DeepSeek-v4-flash spends tokens on reasoning_content; obscure
            # mate-pattern names need lots.
            "max_tokens": 1500,
            "timeout": 30,
        }
        completion = create(**request)
        text = completion.choices[0].message.content
        if not text:
            return None
        return text.strip()
    except Exception as err:
        # Best-effort: a failed theme is reported as None so the caller can
        # count it and carry on with the remaining themes.
        if debug:
            print(f" EXC for {theme!r}: {type(err).__name__}: {err}")
        return None
def main():
    """Generate, embed and save definitions for the Lichess puzzle themes.

    Steps:
      1. Resolve the DeepSeek API key and build an OpenAI client pointed at
         the DeepSeek base URL.
      2. Stream the "Lichess/chess-puzzles" train split and count every entry
         of each row's "Themes" field to obtain the set of theme names.
      3. Request a definition for every theme on a thread pool.
      4. Embed the successful definitions with TEACHER_MODEL.
      5. Print cosine similarities for a few hand-picked theme pairs.
      6. Write theme / definition / embedding columns to OUTPUT_PATH.

    Exits the process with a message when no API key is available or when no
    definition could be generated.
    """
    key = get_deepseek_key()
    if not key:
        # The key lookup consults both the keychain and the environment.
        sys.exit("No DeepSeek API key found in keychain or DEEPSEEK_API_KEY")
    client = OpenAI(api_key=key, base_url="https://api.deepseek.com/v1")

    print("Enumerating themes from Lichess puzzles...")
    puzzles = load_dataset("Lichess/chess-puzzles", split="train", streaming=True)
    counter = Counter()
    sample_size = 50_000 if SMOKE_TEST else 1_000_000
    for i, r in enumerate(puzzles):
        if i >= sample_size:
            break
        # A missing/empty "Themes" value contributes nothing.
        for t in (r["Themes"] or []):
            counter[t] += 1
    themes = sorted(counter.keys())
    print(f" {len(themes)} unique themes")
    if SMOKE_TEST:
        themes = themes[:10]
        print(f" SMOKE_TEST=1: limited to {len(themes)}")

    print(f"\nGenerating definitions via {MODEL}...")
    defs = {}
    with ThreadPoolExecutor(max_workers=PARALLEL_WORKERS) as ex:
        futs = {ex.submit(define_theme, client, t, True): t for t in themes}
        for f in tqdm(as_completed(futs), total=len(futs)):
            t = futs[f]
            defs[t] = f.result()
    failed = [t for t, d in defs.items() if not d]
    if failed:
        print(f" {len(failed)} themes failed: {failed[:5]}")
    print(f" {len(defs) - len(failed)}/{len(defs)} succeeded")

    print("\nSample definitions:")
    for t in themes[:8]:
        if defs[t]:
            print(f" {t:>20s} -> {defs[t]}")

    # Keep the sorted theme order; drop themes whose request failed.
    valid = [(t, defs[t]) for t in themes if defs[t]]
    if not valid:
        # Without a single definition there is nothing to embed, compare or
        # save; stop with a clear message instead of carrying an empty batch
        # into the embedding and normalization steps.
        sys.exit("No theme definitions were generated; nothing to embed")

    print(f"\nEmbedding {len(valid)} definitions with {TEACHER_MODEL}...")
    teacher = SentenceTransformer(TEACHER_MODEL)
    sentences = [d for _, d in valid]
    embs = teacher.encode(sentences, batch_size=64, show_progress_bar=True, convert_to_numpy=True)

    # Sanity: do related themes have similar embeddings?
    # Rows are scaled to unit length so the matrix product is cosine similarity.
    emb_norm = embs / np.linalg.norm(embs, axis=1, keepdims=True)
    sim = emb_norm @ emb_norm.T
    print("\nSanity check: pairwise similarities for related themes")
    name_to_idx = {t: i for i, (t, _) in enumerate(valid)}
    for a, b in [
        ("fork", "skewer"), ("fork", "pin"), ("backRankMate", "smotheredMate"),
        ("kingsideAttack", "queensideAttack"), ("endgame", "middlegame"),
        ("fork", "promotion"),  # not directly related
    ]:
        # Pairs are skipped when either theme has no definition.
        if a in name_to_idx and b in name_to_idx:
            print(f" {a!r:>20} <-> {b!r:25} = {sim[name_to_idx[a], name_to_idx[b]]:+.3f}")

    out = Dataset.from_dict({
        "theme": [t for t, _ in valid],
        "definition": [d for _, d in valid],
        "embedding": embs.tolist(),
    })
    # OUTPUT_PATH may have no directory component; fall back to the cwd.
    os.makedirs(os.path.dirname(OUTPUT_PATH) or ".", exist_ok=True)
    out.to_parquet(OUTPUT_PATH)
    print(f"\nSaved {len(out)} theme definitions to {OUTPUT_PATH}")


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|