2ira
/

Byte-lingua-code

Model card Files Files and versions

Byte-lingua-code / offline_unigram_prob.py

2ira's picture

offline_compression_graph_code

72c0672 verified 4 months ago

history blame contribute delete

2.62 kB

	import json
	from collections import defaultdict
	import numpy as np

	def unigram_first_byte_prob(corpus_path: str, output_path: str):
	with open(corpus_path, 'r', encoding='utf-8') as f:
	utf8_byte_lines = [json.loads(line)['text'].encode('utf-8') for line in f]

	byte_counts = defaultdict(int)
	total_bytes = 0
	for utf8_bytes in utf8_byte_lines:
	for byte in utf8_bytes:
	byte_counts[byte] += 1
	total_bytes += 1

	unigram_prob = [0.0] * 256
	for byte, count in byte_counts.items():
	unigram_prob[byte] = count / total_bytes

	for byte in range(256):
	p = unigram_prob[byte]

	# Get ASCII character representation (if printable)
	if 32 <= byte <= 126:
	char = chr(byte)
	elif byte in [9, 10, 13]: # Tab, Line Feed, Carriage Return
	char = {9: '\\t', 10: '\\n', 13: '\\r'}.get(byte)
	else:
	char = '<control>' # Non-printable control character

	# Get common name/description for selected bytes
	desc = {
	0: 'NUL (Null)',
	9: 'HT (Horizontal Tab)',
	10: 'LF (Line Feed)',
	13: 'CR (Carriage Return)',
	27: 'ESC (Escape)',
	32: 'SPACE',
	127: 'DEL (Delete)'
	}.get(byte, '')

	print(f"{byte}\t{p:.6f}\t\t{char}\t\t{desc}")

	with open(output_path, 'w') as f:
	json.dump(unigram_prob, f)

	def smoothing_unigram_prob(unigram_prob_path: str, smoothing_factor: float, output_path: str):
	with open(unigram_prob_path, 'r') as f:
	unigram_prob = json.load(f)
	unigram_prob_np = np.array(unigram_prob, dtype=np.float64)
	vocab_size = unigram_prob_np.shape[0]
	# Calculate smoothed probability
	smoothed_prob_np = unigram_prob_np * (1 - smoothing_factor) + smoothing_factor / vocab_size

	smoothed_prob = smoothed_prob_np.astype(np.float32).tolist()
	# Save smoothed probability
	with open(output_path, 'w') as f:
	json.dump(smoothed_prob, f)

	if __name__ == '__main__':
	unigram_prob_path = '/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/ac_unigram_probs/opencoder13G_unigram_prob.json'
	smoothing_factor = 0.2
	smoothed_unigram_prob_path = '/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/ac_unigram_probs/opencoder13G_unigram_prob_smooth{}.json'
	# unigram_first_byte_prob(
	# 'subsample_opencoder.jsonl',
	# unigram_prob_path
	# )
	smoothing_unigram_prob(
	unigram_prob_path,
	smoothing_factor,
	smoothed_unigram_prob_path.format(smoothing_factor)
	)