Spaces:
Sleeping
Sleeping
File size: 11,836 Bytes
fc6dcab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 |
# Sampling with Tournament
# input:
# original: original word
# alternatives: substitute candidates
# similarity: similarity scores
# context: 1-token left context
# key: secret key for hashing
# m: number of tournament rounds. Set to 2
# c: number of candidates per round (not including original): c is set to the number of alternatives = K
# alpha: scaling factor like temperature. Set to 1
# Process:
# 1. Normalize similarity scores to [0,1] though softmax with scaling factor alpha
# 2. Create random score for each candidate with m rounds: so we have a matrix of shape (K, m)
# 3. Each score is generated by hashing key, context, candidate, round index
# 4. For the first round, sampling from normalized similarity scores M = K^m candidates (each candidate can appear multiple times) and divide into groups of size K
# 5. For each group, select the candidate with the highest random score to go to the next round. If they are tied, just randomly select one of the tied candidates. For example, K = 3, we have 1 1 0 score, so break the 1 1 tie randomly.
# 6: For next round, use the winners from the previous round, but the scores from the matrix for that round. Repeat until we have one winner.
# Output: return winner candidate index
import hmac
import hashlib
import math
from typing import List, Tuple
def _softmax(xs: List[float], alpha: float = 1.0) -> List[float]:
# Temperature-scaled softmax normalized to sum 1
if not xs:
return []
# subtract max for numerical stability
m = max(xs)
zs = [math.exp(alpha * (x - m)) for x in xs]
s = sum(zs)
return [z / s for z in zs]
def _hash_bytes(key: str, payload: str) -> bytes:
return hmac.new(key.encode("utf-8"), payload.encode("utf-8"), hashlib.sha256).digest()
def _hash_to_uniform01(key: str, payload: str) -> float:
# Map 8 bytes to [0,1) as a 64-bit integer / 2**64
b = _hash_bytes(key, payload)[:8]
n = int.from_bytes(b, "big", signed=False)
return n / 2**64
def _ctx_str(context: List[str]) -> str:
# Use last token (you can widen to last-4 if you like)
toks = (context or [])
return " ".join(toks[-1:]).lower()
def _per_round_score(key: str, ctx: str, candidate: str, round_idx: int) -> int:
# Deterministic "random" score following Bernoulli(0.5): 0 or 1
payload = f"score::{ctx}::{candidate}::r{round_idx}"
uniform_score = _hash_to_uniform01(key, payload)
# Convert uniform [0,1) to Bernoulli(0.5): 0 if < 0.5, 1 if >= 0.5
return 1 if uniform_score >= 0.5 else 0
def _sample_categorical_from_uniform(u: float, probs: List[float]) -> int:
# Inverse-CDF sampling from probs using a uniform u in [0,1)
c = 0.0
for i, p in enumerate(probs):
c += p
if u < c:
return i
return len(probs) - 1 # fallback on last due to float sums
def _draw_M_candidates(key: str, ctx: str, probs: List[float], K: int, c: int, m: int) -> List[int]:
"""
Draw M = c^m candidates (indices 0..K-1) with replacement from probs,
using a deterministic stream of uniforms keyed by (key, ctx).
"""
M = c ** m
picks = []
for draw_idx in range(M):
u = _hash_to_uniform01(key, f"draw::{ctx}::m{m}::c{c}::i{draw_idx}")
j = _sample_categorical_from_uniform(u, probs)
picks.append(j)
return picks
def _run_tournament_round(
key: str,
ctx: str,
picks: List[int],
candidates: List[str],
round_idx: int,
group_size: int,
) -> List[int]:
"""
Split 'picks' into consecutive groups of size 'group_size'.
For each group, select the winner = argmax per-round-score;
tie-break deterministically.
Returns the list of winning indices (into candidates list).
"""
assert len(picks) % group_size == 0, "Group partition must divide evenly."
winners = []
# Add detailed logging for all rounds
# print(f"\n=== ROUND {round_idx} DETAILS ===")
# print(f"Total picks: {len(picks)}")
# print(f"Group size: {group_size}")
# print(f"Number of groups: {len(picks) // group_size}")
# print(f"Picks array: {picks}")
# print(f"Candidates: {candidates}")
for g in range(0, len(picks), group_size):
group = picks[g:g+group_size] # indices into candidates list
group_num = g // group_size + 1
# Compute per-round scores
scored: List[Tuple[int, int]] = []
for idx in group:
cand = candidates[idx]
s = _per_round_score(key, ctx, cand, round_idx)
scored.append((s, idx))
# Add detailed logging for all rounds
# print(f"\n--- Group {group_num} ---")
# print(f"Group indices: {group}")
# print(f"Group candidates: {[candidates[idx] for idx in group]}")
# print("Scores:")
# for score, idx in scored:
# print(f" {candidates[idx]}: {score}")
# Find max score; tie-break with a deterministic secondary key
max_s = max(s for s, _ in scored)
tied = [idx for (s, idx) in scored if s == max_s]
if len(tied) == 1:
winner_idx = tied[0]
winners.append(winner_idx)
# print(f"Winner: {candidates[winner_idx]} (score: {max_s})")
else:
# Random tie-breaker: pick one randomly from tied candidates
import random
# Create a more robust seed by combining key, context, round, and group info
seed_string = f"{key}:{ctx}:r{round_idx}:g{group_num}"
# Convert string to integer seed using a more reliable method
seed_value = sum(ord(c) * (i + 1) for i, c in enumerate(seed_string))
random.seed(seed_value)
winner_idx = random.choice(tied)
random.seed() # Reset seed to avoid affecting other random operations
winners.append(winner_idx)
# print(f"TIE! Winners: {[candidates[idx] for idx in tied]}")
# print(f"Random tie-breaker winner: {candidates[winner_idx]} (score: {max_s})")
# print(f" (Seed used: {seed_string} -> {seed_value})")
# Add summary logging for all rounds
# print(f"\nRound {round_idx} winners: {[candidates[idx] for idx in winners]}")
# print(f"Winner indices: {winners}")
# print("=" * 50)
return winners
def _show_score_matrix(key: str, ctx: str, candidates: List[str], m: int):
"""
Display the complete score matrix for all candidates across all rounds.
Each candidate gets a Bernoulli(0.5) score for each round.
"""
# print(f"\n=== SCORE MATRIX (Bernoulli 0.5) ===")
# print(f"Format: candidate -> [round1_score, round2_score, ...]")
# for i, candidate in enumerate(candidates):
# scores = []
# for round_idx in range(1, m + 1):
# score = _per_round_score(key, ctx, candidate, round_idx)
# scores.append(score)
# print(f"{candidate:8} -> {scores}")
# print("=" * 50)
pass
def tournament_randomize(
original: str,
alternatives: List[str],
similarity: List[float],
context: List[str],
key: str,
m: int = 2,
c: int = 2,
alpha: float = 1.0,
) -> int:
"""
Tournament sampling among alternatives only (K = len(alternatives)).
Returns the WINNER INDEX (0..K-1) into 'alternatives'.
Steps:
1) softmax(similarity, alpha) -> probs over alternatives
2) build per-round scores for each candidate via HMAC (on the fly)
3) Round 1: draw M = c^m picks from probs; group into size c; pick per-group winners by scores of round 1
4) Next rounds: group winners into size c; pick winners using that round's scores
5) Repeat until one winner remains
"""
assert len(alternatives) == len(similarity) > 0, "Need at least one alternative with a similarity score."
assert m >= 1, "Tournament rounds m must be >= 1"
assert c >= 2, "Number of competitors per match c must be >= 2"
K = len(alternatives)
ctx = _ctx_str(context)
probs = _softmax(similarity, alpha=alpha)
# Add detailed logging for setup
# print(f"\n=== TOURNAMENT SETUP ===")
# print(f"Original word: '{original}'")
# print(f"Alternatives: {alternatives}")
# print(f"Similarity scores: {similarity}")
# print(f"Context: {context}")
# print(f"Key: {key}")
# print(f"Tournament rounds (m): {m}")
# print(f"Competitors per match (c): {c}")
# print(f"Alpha (temperature): {alpha}")
# print(f"K (number of alternatives): {K}")
# print(f"Context string: '{ctx}'")
# print(f"Normalized probabilities: {[f'{p:.6f}' for p in probs]}")
# print(f"Expected picks (M = c^m): {c**m}")
# First round: M = c^m draws from probs
picks = _draw_M_candidates(key, ctx, probs, K, c, m) # indices 0..K-1 into alternatives
# Show the picks array for the first round
# print(f"Generated picks array: {picks}")
# print(f"Picks correspond to words: {[alternatives[idx] for idx in picks]}")
# We treat 'candidates' as the alternatives list; indices map directly
candidates = alternatives
# Show the complete score matrix for all candidates across all rounds
_show_score_matrix(key, ctx, candidates, m)
# Run m rounds. Each round groups current list into blocks of size c,
# picks one per group using per-round scores.
current = picks
for r in range(1, m + 1):
# group_size is now c instead of K
current = _run_tournament_round(
key=key,
ctx=ctx,
picks=current,
candidates=candidates,
round_idx=r,
group_size=c,
)
# After each round, the number of survivors shrinks by factor c.
# After m rounds, we must have exactly one winner.
assert len(current) == 1, f"Expected a single winner after {m} rounds, got {len(current)}"
final_winner_idx = current[0]
# print(f"\n=== FINAL RESULT ===")
# print(f"Winner index: {final_winner_idx}")
# print(f"Winner word: '{candidates[final_winner_idx]}'")
return final_winner_idx
# Convenience wrapper that returns the selected word
def tournament_select_word(
original: str,
alternatives: List[str],
similarity: List[float],
context: List[str],
key: str,
m: int = 2,
c: int = 2,
alpha: float = 1.0,
) -> str:
idx = tournament_randomize(original, alternatives, similarity, context, key, m, c, alpha)
return alternatives[idx]
# # Sample running
# original = "big"
# alts = ["large", "huge", "massive"] # K = 3
# sims = [0.82, 0.55, 0.60] # any similarity scores
# ctx = ["a"] # 1-token left context (e.g., "... a ___")
# key = "super-secret-key"
# m = 2
# c = 2
# alpha = 1.0
# winner_idx = tournament_randomize(original, alts, sims, ctx, key, m, c, alpha)
# print("Winner index:", winner_idx, "->", alts[winner_idx])
# # Or directly:
# print("Winner word:", tournament_select_word(original, alts, sims, ctx, key, m, c, alpha))
# # Test with a different key
# key2 = "different-secret-key"
# winner_idx2 = tournament_randomize(original, alts, sims, ctx, key2, m, c, alpha)
# print("Winner index with different key:", winner_idx2, "->", alts[winner_idx2])
# # Test with a different key
# key3 = "key224242"
# winner_idx3 = tournament_randomize(original, alts, sims, ctx, key3, m, c, alpha)
# print("Winner index with different key:", winner_idx3, "->", alts[winner_idx3])
|