# Byte-lingua-code / offline_unigram_prob.py
# 2ira's picture
# offline_compression_graph_code
# 72c0672 verified
import json
from collections import defaultdict
import numpy as np
def unigram_first_byte_prob(corpus_path: str, output_path: str):
    """Estimate a byte-level unigram distribution from a JSONL corpus.

    Every non-blank line of ``corpus_path`` is parsed as a JSON object and
    the UTF-8 encoding of its ``'text'`` field is tallied byte by byte (all
    bytes of the text are counted, not only the first one).  The relative
    frequency of each byte value 0-255 is printed as a tab-separated table
    and written to ``output_path`` as a JSON list of 256 floats.

    Args:
        corpus_path: Path to a UTF-8 JSONL file whose records each carry a
            ``'text'`` string.
        output_path: Where the JSON list of probabilities is written.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'text'`` field.
    """
    vocab_size = 256
    byte_counts = np.zeros(vocab_size, dtype=np.int64)

    # Stream the corpus one record at a time so memory use does not grow
    # with corpus size.
    with open(corpus_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            utf8_bytes = json.loads(line)['text'].encode('utf-8')
            if not utf8_bytes:
                continue
            # Vectorised histogram of this record's byte values.
            byte_counts += np.bincount(
                np.frombuffer(utf8_bytes, dtype=np.uint8),
                minlength=vocab_size,
            )

    total_bytes = int(byte_counts.sum())
    if total_bytes:
        unigram_prob = (byte_counts / total_bytes).tolist()
    else:
        # Empty corpus: keep an all-zero table instead of dividing by zero.
        unigram_prob = [0.0] * vocab_size

    # Lookup tables for the printed report, built once instead of per row.
    escaped_chars = {9: '\\t', 10: '\\n', 13: '\\r'}
    descriptions = {
        0: 'NUL (Null)',
        9: 'HT (Horizontal Tab)',
        10: 'LF (Line Feed)',
        13: 'CR (Carriage Return)',
        27: 'ESC (Escape)',
        32: 'SPACE',
        127: 'DEL (Delete)',
    }

    for byte, p in enumerate(unigram_prob):
        if 32 <= byte <= 126:
            # Printable ASCII is shown as the character itself.
            char = chr(byte)
        else:
            # Tab/LF/CR get an escaped form; every other value, including
            # bytes >= 128, is labelled '<control>'.
            char = escaped_chars.get(byte, '<control>')
        desc = descriptions.get(byte, '')
        print(f"{byte}\t{p:.6f}\t\t{char}\t\t{desc}")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(unigram_prob, f)
def smoothing_unigram_prob(unigram_prob_path: str, smoothing_factor: float, output_path: str):
    """Interpolate a stored unigram distribution with the uniform distribution.

    Loads a JSON list of probabilities from ``unigram_prob_path``, computes
    ``p * (1 - smoothing_factor) + smoothing_factor / vocab_size`` for every
    entry (``vocab_size`` being the list length), casts the result to
    float32 and writes it to ``output_path`` as a JSON list.

    Args:
        unigram_prob_path: Path to the JSON list of input probabilities.
        smoothing_factor: Weight given to the uniform distribution; must lie
            in ``[0, 1]``.
        output_path: Where the smoothed JSON list is written.

    Raises:
        ValueError: If ``smoothing_factor`` is outside ``[0, 1]`` (or NaN),
            or if the loaded distribution is empty.
    """
    # Written as a negated chained comparison so NaN is rejected as well.
    if not 0.0 <= smoothing_factor <= 1.0:
        raise ValueError(
            f"smoothing_factor must be in [0, 1], got {smoothing_factor!r}"
        )

    with open(unigram_prob_path, 'r', encoding='utf-8') as f:
        unigram_prob = json.load(f)

    unigram_prob_np = np.array(unigram_prob, dtype=np.float64)
    vocab_size = unigram_prob_np.shape[0]
    if vocab_size == 0:
        raise ValueError(
            f"no probabilities found in {unigram_prob_path!r}; cannot smooth "
            "an empty distribution"
        )

    # Mix in float64, then narrow to float32 for the stored table.
    smoothed_prob_np = unigram_prob_np * (1 - smoothing_factor) + smoothing_factor / vocab_size
    smoothed_prob = smoothed_prob_np.astype(np.float32).tolist()

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(smoothed_prob, f)
if __name__ == '__main__':
    # Script entry point: smooth an existing unigram table and store the
    # result next to it, with the smoothing factor embedded in the file name.
    raw_table_path = '/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/ac_unigram_probs/opencoder13G_unigram_prob.json'
    # The '{}' placeholder is filled with the smoothing factor below.
    smoothed_table_template = '/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/ac_unigram_probs/opencoder13G_unigram_prob_smooth{}.json'
    uniform_weight = 0.2

    # Disabled step, kept for reference:
    # unigram_first_byte_prob(
    #     'subsample_opencoder.jsonl',
    #     raw_table_path
    # )

    smoothing_unigram_prob(
        unigram_prob_path=raw_table_path,
        smoothing_factor=uniform_weight,
        output_path=smoothed_table_template.format(uniform_weight),
    )