Spaces:
Sleeping
Sleeping
File size: 6,275 Bytes
e4d5fc0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
import re
from collections import Counter

import pandas as pd
# Data section start-->
# Load the CSV files containing the Telugu text corpora.
file_paths = [
'/Users/anvesh/codebase/llm/data/telugu_books/telugu_books.csv',
'/Users/anvesh/codebase/llm/data/telugu_news/1_telugu_news.csv',
'/Users/anvesh/codebase/llm/data/telugu_news/2_telugu_news.csv'
]
# Combine data from all files; each file keeps its text in either a
# 'text' or a 'body' column.
telugu_texts = []
for file_path in file_paths:
    df = pd.read_csv(file_path)
    for column in ('text', 'body'):
        if column in df.columns:
            telugu_texts.append(' '.join(df[column].astype(str).tolist()))
            break
    else:
        # Previously a file with neither column was skipped silently.
        print(f"Warning: no 'text' or 'body' column in {file_path}; file skipped")
# Concatenate all texts, then strip English letters, digits, quotes,
# line breaks and non-breaking spaces so only Telugu script remains.
telugu_text = ' '.join(telugu_texts)
telugu_text = re.sub(r'[A-Za-z0-9\'"]', '', telugu_text)
telugu_text = re.sub(r'[\r\n\xa0]', '', telugu_text)
print('telugu_text before utf-8 encoding:', telugu_text[:100])  # typo "befores" fixed
# Whitespace-separated word vocabulary vs. unique character inventory.
vocabulary_size = len(set(telugu_text.split()))
print('Original text size:', len(telugu_text))
print('Vocabulary size of telugu_text:', vocabulary_size)
unique_characters = set(telugu_text)
unique_count = len(unique_characters)
print('Unique character count in telugu_text:', unique_count)
# Data section end-->
# utf-8 encoding section start -->
import encode_parallel_telugu as encode_parallel
import time
# Re-load the corpus via the project helper and UTF-8 encode it in
# parallel workers, timing the whole step.
tokens = encode_parallel.load_telugu_texts()
# perf_counter is monotonic and high-resolution — the right clock for
# measuring elapsed time (time.time can jump with system clock changes).
start_time = time.perf_counter()
# Encode the tokens in parallel and get concatenated results
encoded_tokens = encode_parallel.encode_tokens_parallel(tokens, chunk_size=1_000_000, max_workers=10)
end_time = time.perf_counter()
print('encoded_tokens:', encoded_tokens[:100])
print(f"Time taken to encode and process tokens in parallel: {end_time - start_time:.4f} seconds")
# Single length report (the original printed the length twice).
print('length of encoded_text:', len(encoded_tokens))
print('unique characters in encoded_text:', set(encoded_tokens))
print('unique characters in encoded_text:', len(set(encoded_tokens)))
# utf-8 encoding section end -->
# BPE section start -->
#### **BPE implementation**
# Train BPE on the UTF-8 byte-level token stream produced above.
tokens = encoded_tokens
def get_stats(ids):
    """Count occurrences of each adjacent token pair in *ids*.

    Args:
        ids: sequence of token ids (ints, or any hashable tokens).

    Returns:
        A Counter (dict subclass) mapping (id_i, id_{i+1}) -> frequency.
        Inputs with fewer than two tokens yield an empty mapping.
    """
    # Counter over the pairwise zip replaces the manual dict-increment loop.
    return Counter(zip(ids, ids[1:]))
def merge(ids, pair, idx):
    """Replace every occurrence of the adjacent token *pair* in *ids*
    with the single new token id *idx*, returning a new list.

    Matching is non-overlapping and left-to-right: after a replacement
    the scan resumes past both consumed tokens.
    """
    first, second = pair
    merged = []
    pos = 0
    limit = len(ids)
    while pos < limit:
        # A pair can only start before the final position.
        if pos + 1 < limit and ids[pos] == first and ids[pos + 1] == second:
            merged.append(idx)
            pos += 2  # consume both members of the pair
        else:
            merged.append(ids[pos])
            pos += 1
    return merged
# ---
vocab_size = 500  # the desired final vocabulary size
num_merges = vocab_size - 256  ## our unique tokens are 194, for our sample text.
ids = list(tokens)  # copy so we don't destroy the original list
merges = {}  # (int, int) -> int
from tqdm import tqdm  # Import tqdm for progress bar
for i in tqdm(range(num_merges), desc="Merging tokens"):
    stats = get_stats(ids)
    if not stats:
        # Fewer than two tokens remain, so there is nothing left to
        # merge; max() over an empty mapping would raise ValueError.
        break
    # Greedily merge the most frequent adjacent pair.
    pair = max(stats, key=stats.get)
    idx = 256 + i
    ids = merge(ids, pair, idx)
    merges[pair] = idx  # remember pair -> new token id (used by the decoder)
print("tokens length:", len(tokens))
print("ids length:", len(ids))
print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
print(f"token size: {len(set(tokens))}")
# BPE section end -->
# Building the vocabulary section start -->
# All 127 code points of the Telugu Unicode block (U+0C00..U+0C7E).
telugu_unicode_chars = [chr(i) for i in range(0x0C00, 0x0C7F)] # Telugu Unicode range
# Add these characters to the vocabulary
import json
# Seed the vocab from the learned merges: keys are (int, int) pairs,
# values are the merged token ids (>= 256).
vocab = {token: idx for token, idx in merges.items()}
# Add unique Telugu characters to the vocabulary, keyed by their UTF-8
# byte sequences. NOTE(review): the vocab now mixes tuple keys (merges)
# with bytes keys (base characters) — downstream readers must handle both.
for idx, char in enumerate([chr(i).encode('utf-8') for i in range(0x0C00, 0x0C7F)]):
    if idx < 256: # Ensure we only add up to 256 characters
        # The range yields 127 characters, so this guard is currently
        # always true; ids 0..126 are assigned in enumeration order.
        vocab[char] = idx # Map the character to its index
# Hard-coded ids for space and period. These don't collide with the
# 0..126 character ids above today, but NOTE(review): they would clash
# if the base-character range ever grew past 254 entries — confirm.
vocab[b' '] = 255
vocab[b'.'] = 254
# Save merges and vocab to a file. json cannot serialize tuple or bytes
# keys, so every key is stringified via str(); the reader section below
# parses these strings back into Python literals.
with open('merges_vocab.json', 'w') as f:
    json.dump({'merges': {str(k): v for k, v in merges.items()}, 'vocab': {str(k): v for k, v in vocab.items()}}, f)
# Building the vocabulary section end -->
# Reading the merges and vocab from a file section start -->
import ast
import json
from collections import defaultdict
# Read the merges and vocab data back from the JSON file written above.
with open('merges_vocab.json', 'r') as f:
    data = json.load(f)
# Group the deserialized entries: each vocab entry kept as its own
# single-item dict, mirroring the original "distributed" layout.
distributed_data = defaultdict(list)
for key, value in data['vocab'].items():
    distributed_data['vocab'].append({key: value})
# Optionally, print the distributed data for verification
print(distributed_data)
# Rebuild a single dict. JSON stringified the original keys
# (e.g. "b'\xe0...'" for bytes, "(257, 258)" for merge pairs), so parse
# each string back into a Python literal. ast.literal_eval replaces the
# original eval(): it accepts only literals, so a tampered vocab file
# cannot execute arbitrary code.
formatted_vocab = {}
for item in distributed_data['vocab']:
    for k, v in item.items():
        if ',' not in k:
            # Single (non-tuple) key: wrap in a 1-tuple, as before.
            formatted_vocab[(ast.literal_eval(k),)] = v
        else:
            formatted_vocab[ast.literal_eval(k)] = v
# Dicts are not sliceable — the original `formatted_vocab[:50]` raised
# TypeError. Show the first 50 items instead.
print(list(formatted_vocab.items())[:50])
# inverting the vocab: token id -> key (bytes 1-tuple or merge pair)
inverted_vocab = {v: k for k, v in formatted_vocab.items()}
# Reading the merges and vocab from a file section end -->
# Expanding the vocab section start -->
def convert_to_bytes(value):
    # Resolve a single token to raw bytes: already-bytes values pass
    # through unchanged; other values (token ids) are looked up in the
    # module-level `inverted_vocab` and expanded recursively via
    # process_tuple — a merged token maps to a pair of smaller token ids
    # that eventually bottom out at bytes.
    if isinstance(value, bytes):
        return value
    elif value in inverted_vocab:
        return process_tuple(inverted_vocab[value])
    else:
        # Unknown token id: report it and return None. NOTE(review):
        # the None ends up inside the caller's output tuple — confirm
        # downstream consumers tolerate that.
        print(f'value not found in inverted_vocab: {value}')
        return None
def process_tuple(value_tuple):
    # Expand every element of a vocab entry tuple to bytes, flattening
    # any nested tuples produced by recursive expansion into one level.
    converted_values = []
    for v in value_tuple:
        result = convert_to_bytes(v)
        if isinstance(result, tuple):
            converted_values.extend(result)
        else:
            converted_values.append(result)
    return tuple(converted_values)
# Fully expand every vocab entry once so decoding becomes a flat lookup:
# token id -> tuple of raw byte strings.
decoder_map = {k: process_tuple(v) for k, v in inverted_vocab.items()}
|