| import json |
| from collections import defaultdict |
| import numpy as np |
|
|
def unigram_first_byte_prob(corpus_path: str, output_path: str) -> None:
    """Estimate the unigram distribution over UTF-8 byte values of a JSONL corpus.

    Every non-blank line of ``corpus_path`` is parsed as a JSON object and its
    ``'text'`` field is encoded to UTF-8. Despite the function name, *all*
    bytes of each text are counted, not only the first one. The resulting
    256-entry probability table (index = byte value) is printed as a
    tab-separated report and written to ``output_path`` as a JSON list.

    Args:
        corpus_path: Path to a UTF-8 JSON Lines file; each record must have a
            ``'text'`` key holding a string.
        output_path: Destination path for the JSON list of 256 floats.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'text'`` field.
    """
    # Function-scope import so the block does not depend on edits elsewhere.
    from collections import Counter

    # Stream the corpus one line at a time instead of materialising every
    # encoded document in memory; Counter.update counts the bytes in C.
    byte_counts = Counter()
    with open(corpus_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            byte_counts.update(json.loads(line)['text'].encode('utf-8'))

    total_bytes = sum(byte_counts.values())
    if total_bytes:
        unigram_prob = [byte_counts[b] / total_bytes for b in range(256)]
    else:
        # Empty corpus: nothing was observed, so every entry stays at zero.
        unigram_prob = [0.0] * 256

    # Lookup tables for the report, built once rather than per byte.
    escapes = {9: '\\t', 10: '\\n', 13: '\\r'}
    descriptions = {
        0: 'NUL (Null)',
        9: 'HT (Horizontal Tab)',
        10: 'LF (Line Feed)',
        13: 'CR (Carriage Return)',
        27: 'ESC (Escape)',
        32: 'SPACE',
        127: 'DEL (Delete)',
    }

    for byte, p in enumerate(unigram_prob):
        if 32 <= byte <= 126:
            # Printable ASCII is shown as the character itself.
            char = chr(byte)
        else:
            # Tab/LF/CR get an escaped spelling; everything else a placeholder.
            char = escapes.get(byte, '<control>')
        desc = descriptions.get(byte, '')
        print(f"{byte}\t{p:.6f}\t\t{char}\t\t{desc}")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(unigram_prob, f)
|
|
def smoothing_unigram_prob(unigram_prob_path: str, smoothing_factor: float, output_path: str) -> None:
    """Blend a stored unigram distribution with the uniform distribution.

    Loads a JSON list of probabilities from ``unigram_prob_path`` and computes
    ``p * (1 - smoothing_factor) + smoothing_factor / vocab_size`` for each
    entry, where ``vocab_size`` is the length of the list. The arithmetic is
    done in float64; the result is rounded to float32 before being written to
    ``output_path`` as a JSON list.

    Args:
        unigram_prob_path: Path to a JSON file containing a list of numbers.
        smoothing_factor: Weight given to the uniform distribution; must lie
            in the closed interval [0, 1].
        output_path: Destination path for the smoothed JSON list.

    Raises:
        ValueError: If ``smoothing_factor`` is outside [0, 1] (including NaN).
    """
    # Outside [0, 1] the mixture can produce negative or >1 "probabilities";
    # fail loudly instead of silently writing an invalid table.
    if not 0.0 <= smoothing_factor <= 1.0:
        raise ValueError(
            f"smoothing_factor must be within [0, 1], got {smoothing_factor!r}"
        )

    with open(unigram_prob_path, 'r', encoding='utf-8') as f:
        unigram_prob = json.load(f)

    unigram_prob_np = np.array(unigram_prob, dtype=np.float64)
    vocab_size = unigram_prob_np.shape[0]

    # Linear interpolation between the empirical and the uniform distribution.
    smoothed_prob_np = unigram_prob_np * (1 - smoothing_factor) + smoothing_factor / vocab_size

    # Round to float32 precision before serialising (values are still emitted
    # as plain Python floats by tolist()).
    smoothed_prob = smoothed_prob_np.astype(np.float32).tolist()

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(smoothed_prob, f)
|
|
if __name__ == '__main__':
    # Weight passed to smoothing_unigram_prob; it is also substituted into the
    # '{}' placeholder of the output file name below.
    smoothing_factor = 0.2

    # Unigram table to read, and the template for the smoothed table to write.
    unigram_prob_path = '/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/ac_unigram_probs/opencoder13G_unigram_prob.json'
    smoothed_unigram_prob_path = '/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/ac_unigram_probs/opencoder13G_unigram_prob_smooth{}.json'

    smoothing_unigram_prob(
        unigram_prob_path=unigram_prob_path,
        smoothing_factor=smoothing_factor,
        output_path=smoothed_unigram_prob_path.format(smoothing_factor),
    )
|
|