Telugu BPE Tokenizer
Byte Pair Encoding tokenizer for Telugu.
Stats:
- Vocabulary size: 5,000 tokens
- Compression ratio: 3.24x
- Training corpus: 2,000 texts
Usage:
import ast
import json

# Load the learned merge table. JSON object keys are stringified byte-pair
# tuples such as "(104, 101)"; parse them back into real tuples with
# ast.literal_eval — never eval(), which would execute arbitrary code
# embedded in the file.
with open('telugu_merges.json') as f:
    merges = {ast.literal_eval(k): v for k, v in json.load(f).items()}

# Load the vocabulary: token id -> raw bytes (stored hex-encoded in JSON).
with open('telugu_vocab.json') as f:
    vocab = {int(k): bytes.fromhex(v) for k, v in json.load(f).items()}
def get_stats(ids):
    """Count occurrences of each adjacent token pair in *ids*.

    Returns a dict mapping (left, right) token-id pairs to how many
    times the pair appears consecutively.
    """
    freq = {}
    for pair in zip(ids, ids[1:]):
        if pair in freq:
            freq[pair] += 1
        else:
            freq[pair] = 1
    return freq
def merge(ids, pair, idx):
    """Replace every non-overlapping occurrence of *pair* in *ids* with *idx*.

    Scans left to right, consuming matches greedily, so overlapping
    occurrences collapse (e.g. [1, 1, 1] with pair (1, 1) -> [idx, 1]).
    Returns a new list; *ids* is not modified.
    """
    first, second = pair
    out = []
    pos = 0
    n = len(ids)
    while pos < n:
        if pos + 1 < n and ids[pos] == first and ids[pos + 1] == second:
            out.append(idx)
            pos += 2  # skip both members of the matched pair
        else:
            out.append(ids[pos])
            pos += 1
    return out
def encode(text):
    """Encode *text* into BPE token ids using the module-level `merges` table.

    Starts from the raw UTF-8 bytes and repeatedly applies the learned
    merge with the lowest rank until no learnable pair remains.
    """
    tokens = list(text.encode("utf-8"))
    while len(tokens) >= 2:
        pair_counts = get_stats(tokens)
        # Pick the adjacent pair learned earliest (lowest merge rank);
        # pairs never learned rank as +inf and are thus never chosen.
        best = min(pair_counts, key=lambda p: merges.get(p, float("inf")))
        if best not in merges:
            break  # nothing mergeable left
        tokens = merge(tokens, best, merges[best])
    return tokens
def decode(ids):
    """Decode token ids back into a string via the module-level `vocab` table.

    Each id maps to a byte sequence; invalid UTF-8 in the joined output
    becomes U+FFFD replacement characters rather than raising.
    """
    pieces = [vocab[token_id] for token_id in ids]
    return b"".join(pieces).decode("utf-8", errors="replace")
# Round-trip demo: encoding then decoding should reproduce the input text.
sample = "నమస్కారం"
print(decode(encode(sample)))
Examples:
- నమస్కారం → 4 tokens
- తెలుగు భాష → 5 tokens
Inference Providers
NEW
This model isn't deployed by any Inference Provider.
🙋
Ask for provider support