File size: 2,622 Bytes

72c0672

import json
from collections import Counter, defaultdict

import numpy as np

def unigram_first_byte_prob(corpus_path: str, output_path: str) -> None:
    """Estimate the unigram distribution over UTF-8 byte values of a JSONL corpus.

    Every non-blank line of ``corpus_path`` is parsed as a JSON object whose
    ``'text'`` field is encoded to UTF-8. All bytes of every text are counted
    (despite the function name, not only the first byte of each text), the
    counts are normalized by the total number of bytes, a human-readable
    table of the 256 probabilities is printed to stdout, and the
    probabilities are written to ``output_path`` as a JSON list of 256
    floats indexed by byte value.

    The corpus is streamed line by line so memory use does not grow with the
    corpus size. An empty corpus yields an all-zero distribution.

    Args:
        corpus_path: Path to a UTF-8 JSONL file; each record needs a 'text' key.
        output_path: Destination path for the JSON list of probabilities.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no 'text' field.
    """
    byte_counts = Counter()
    with open(corpus_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines are not valid JSON; skip them instead of crashing.
            if not line.strip():
                continue
            # Iterating a bytes object yields ints in 0..255, so the counter
            # is keyed directly by byte value.
            byte_counts.update(json.loads(line)['text'].encode('utf-8'))

    total_bytes = sum(byte_counts.values())

    unigram_prob = [0.0] * 256
    # The counter is empty when total_bytes is 0, so no division by zero occurs.
    for byte, count in byte_counts.items():
        unigram_prob[byte] = count / total_bytes

    # Lookup tables for the printed report, built once rather than per byte.
    escapes = {9: '\\t', 10: '\\n', 13: '\\r'}
    descriptions = {
        0: 'NUL (Null)',
        9: 'HT (Horizontal Tab)',
        10: 'LF (Line Feed)',
        13: 'CR (Carriage Return)',
        27: 'ESC (Escape)',
        32: 'SPACE',
        127: 'DEL (Delete)',
    }

    for byte, p in enumerate(unigram_prob):
        if 32 <= byte <= 126:
            # Printable ASCII is shown as the character itself.
            char = chr(byte)
        else:
            # Tab/LF/CR get an escaped form; every other byte (including
            # values >= 128) is shown with the '<control>' placeholder.
            char = escapes.get(byte, '<control>')
        desc = descriptions.get(byte, '')
        print(f"{byte}\t{p:.6f}\t\t{char}\t\t{desc}")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(unigram_prob, f)

def smoothing_unigram_prob(unigram_prob_path: str, smoothing_factor: float, output_path: str):
    """Blend a stored unigram distribution with a uniform one and save it.

    Reads a JSON list of probabilities from ``unigram_prob_path``, computes
    ``p * (1 - smoothing_factor) + smoothing_factor / vocab_size`` for each
    entry (``vocab_size`` being the length of the list), converts the result
    to float32 and writes it to ``output_path`` as a JSON list.
    """
    with open(unigram_prob_path, 'r') as src:
        raw_probs = json.load(src)

    probs = np.array(raw_probs, dtype=np.float64)
    num_entries = probs.shape[0]

    # Weight kept by the original distribution vs. mass given to each entry.
    keep_weight = 1 - smoothing_factor
    uniform_share = smoothing_factor / num_entries
    blended = probs * keep_weight + uniform_share

    # The arithmetic runs in float64; the stored values are rounded to float32.
    result = blended.astype(np.float32).tolist()
    with open(output_path, 'w') as dst:
        json.dump(result, dst)

if __name__ == '__main__':
    # Script entry point: smooth a previously computed unigram distribution
    # and store the result next to it, with the factor in the file name.
    smoothing_factor = 0.2
    unigram_prob_path = '/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/ac_unigram_probs/opencoder13G_unigram_prob.json'
    smoothed_unigram_prob_path = '/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/ac_unigram_probs/opencoder13G_unigram_prob_smooth{}.json'

    # Disabled step that produces the raw distribution from the corpus:
    # unigram_first_byte_prob(
    #     'subsample_opencoder.jsonl',
    #     unigram_prob_path
    # )

    smoothed_output = smoothed_unigram_prob_path.format(smoothing_factor)
    smoothing_unigram_prob(unigram_prob_path, smoothing_factor, smoothed_output)