# BPE-Tokenizer / tokenizer_backup.py
import pandas as pd
import re
# Data section start-->
# Load the CSV files
file_paths = [
'/Users/anvesh/codebase/llm/data/telugu_books/telugu_books.csv',
'/Users/anvesh/codebase/llm/data/telugu_news/1_telugu_news.csv',
'/Users/anvesh/codebase/llm/data/telugu_news/2_telugu_news.csv'
]
# Combine data from all files
telugu_texts = []
for file_path in file_paths:
    df = pd.read_csv(file_path)
    if 'text' in df.columns:
        telugu_texts.append(' '.join(df['text'].astype(str).tolist()))
    elif 'body' in df.columns:
        telugu_texts.append(' '.join(df['body'].astype(str).tolist()))
# Concatenate all texts and remove all English, numerical values, and quotes
telugu_text = ' '.join(telugu_texts)
telugu_text = re.sub(r'[A-Za-z0-9\'"]', '', telugu_text) # Remove English letters, numbers, and quotes
telugu_text = re.sub(r'[\r\n\xa0]', '', telugu_text) # Remove line breaks and non-breaking spaces
print('telugu_text before utf-8 encoding:', telugu_text[:100])
vocabulary_size = len(set(telugu_text.split()))
print('Original text size:', len(telugu_text))
print('Vocabulary size of telugu_text:', vocabulary_size)
unique_characters = set(telugu_text)
unique_count = len(unique_characters)
print('Original text size:', len(telugu_text))
print('Unique character count in telugu_text:', unique_count)
# Data section end-->
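# The two re.sub() calls above are the whole cleaning step. A quick sanity check
# on a made-up sample string (not taken from the datasets) shows what survives:
# Telugu characters and plain spaces stay; English letters, digits, quotes,
# line breaks, and non-breaking spaces are stripped.
_sample = 'తెలుగు abc 123 "నమస్తే"\r\n\xa0!'
_sample = re.sub(r'[A-Za-z0-9\'"]', '', _sample)
_sample = re.sub(r'[\r\n\xa0]', '', _sample)
print('cleaned sample:', _sample)  # -> 'తెలుగు   నమస్తే!'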
# utf-8 encoding section start -->
import encode_parallel_telugu as encode_parallel
import time
tokens = encode_parallel.load_telugu_texts()
# Start the timer
start_time = time.time()
# Encode the tokens in parallel and get concatenated results
encoded_tokens = encode_parallel.encode_tokens_parallel(tokens, chunk_size=1_000_000, max_workers=10)
print('encoded_tokens:', encoded_tokens[:100])
print(len(encoded_tokens))
# End the timer
end_time = time.time()
print(f"Time taken to encode and process tokens in parallel: {end_time - start_time:.4f} seconds")
print('length of encoded_text:', len(encoded_tokens))
print('unique characters in encoded_text:', set(encoded_tokens))
print('unique character count in encoded_text:', len(set(encoded_tokens)))
# utf-8 encoding section end -->
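# encode_parallel_telugu is a local helper module whose source is not part of
# this file. The sketch below is one plausible way its encode_tokens_parallel()
# could work: split the text into fixed-size chunks, UTF-8-encode each chunk in
# a worker process, and concatenate the byte values. Names, signatures, and
# behaviour here are assumptions for illustration, not the module's actual code.
from concurrent.futures import ProcessPoolExecutor

def _encode_chunk(chunk):
    # UTF-8 encode one chunk and return its byte values as ints in 0-255
    return list(chunk.encode('utf-8'))

def encode_tokens_parallel_sketch(text, chunk_size=1_000_000, max_workers=10):
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    encoded = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        for chunk_bytes in executor.map(_encode_chunk, chunks):
            encoded.extend(chunk_bytes)
    return encoded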
# BPE section start -->
# BPE implementation
tokens = encoded_tokens
def get_stats(ids):
    # Count how often each adjacent pair of token ids occurs
    counts = {}
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts
def merge(ids, pair, idx):
    # Replace every occurrence of `pair` in `ids` with the new token id `idx`
    new_ids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
            new_ids.append(idx)
            i += 2
        else:
            new_ids.append(ids[i])
            i += 1
    return new_ids
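# Tiny sanity check of the two helpers on a toy id sequence (the values are
# made up for illustration and are not taken from the Telugu data).
_toy_ids = [1, 2, 2, 3, 2, 3]
assert get_stats(_toy_ids) == {(1, 2): 1, (2, 2): 1, (2, 3): 2, (3, 2): 1}
assert merge(_toy_ids, (2, 3), 256) == [1, 2, 256, 256]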
# ---
vocab_size = 500 # the desired final vocabulary size
num_merges = vocab_size - 256  # 256 ids are reserved for raw bytes; our sample text uses only 194 unique byte values
ids = list(tokens) # copy so we don't destroy the original list
merges = {} # (int, int) -> int
from tqdm import tqdm # Import tqdm for progress bar
for i in tqdm(range(num_merges), desc="Merging tokens"):
    stats = get_stats(ids)
    pair = max(stats, key=stats.get)  # most frequent adjacent pair
    idx = 256 + i
    # print(f"merging {pair} into a new token {idx}")
    ids = merge(ids, pair, idx)
    merges[pair] = idx  # record which pair produced this new token id
print("tokens length:", len(tokens))
print("ids length:", len(ids))
print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
print(f"token size: {len(set(tokens))}")
# print(ids)
# BPE section end -->
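# The loop above only learns the merge table; to tokenize new text with it, the
# merges have to be re-applied in the order they were learned. The helper below
# is a sketch of that step added for illustration (it is not part of the
# original script) and reuses get_stats()/merge() from above.
def encode_with_merges(text, merges):
    ids = list(text.encode('utf-8'))
    while len(ids) >= 2:
        stats = get_stats(ids)
        # pick the candidate pair that was learned earliest
        pair = min(stats, key=lambda p: merges.get(p, float('inf')))
        if pair not in merges:
            break  # no learned merge applies any more
        ids = merge(ids, pair, merges[pair])
    return ids

# Example usage (any Telugu string works the same way):
# print(encode_with_merges('తెలుగు', merges))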
# Building the vocabulary section start -->
telugu_unicode_chars = [chr(i) for i in range(0x0C00, 0x0C7F)] # Telugu Unicode range
# Start the vocabulary from the learned merges, then add the Telugu characters
import json
vocab = {token: idx for token, idx in merges.items()}
# Add unique Telugu characters to the vocabulary
for idx, char in enumerate([chr(i).encode('utf-8') for i in range(0x0C00, 0x0C7F)]):
    if idx < 256:  # Ensure we only add up to 256 characters
        vocab[char] = idx  # Map the character to its index
vocab[b' '] = 255
vocab[b'.'] = 254
# Save merges and vocab to a file (keys are stringified so they are JSON-serialisable)
with open('merges_vocab.json', 'w') as f:
    json.dump({'merges': {str(k): v for k, v in merges.items()},
               'vocab': {str(k): v for k, v in vocab.items()}}, f)
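# Note on the JSON round-trip: tuple and bytes keys cannot be JSON object keys,
# so they are stored via str() above, e.g. (256, 257) -> "(256, 257)" and
# b'\xe0\xb0\x85' -> "b'\\xe0\\xb0\\x85'". The reading section below undoes
# this with eval().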
# Building the vocabulary section end -->
# Reading the merges and vocab from a file section start -->
import json
from collections import defaultdict
# Read the merges and vocab data from the JSON file
with open('merges_vocab.json', 'r') as f:
    data = json.load(f)
# Collect the re-loaded entries in a defaultdict of lists
distributed_data = defaultdict(list)
# Distribute the vocab data (the merges loop is left commented out)
# for key, value in data['merges'].items():
#     distributed_data['merges'].append({key: value})
for key, value in data['vocab'].items():
    distributed_data['vocab'].append({key: value})
# Optionally, print the distributed data for verification
print(distributed_data)
distributed_data['vocab']
# Convert the list of dictionaries to a single dictionary
formatted_vocab = {}
for item in distributed_data['vocab']:
    for k, v in item.items():
        if ',' not in k:
            # single-token keys (e.g. "b'...'") are wrapped in a 1-tuple
            formatted_vocab[(eval(k),)] = v
        else:
            # pair keys like "(256, 257)" evaluate back to tuples
            formatted_vocab[eval(k)] = v
print(list(formatted_vocab.items())[:50])  # dicts are not sliceable, so slice a list of items
# inverting the vocab
inverted_vocab = {v: k for k, v in formatted_vocab.items()}
inverted_vocab
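# For orientation, inverted_vocab entries look roughly like the following
# (the concrete values are illustrative, not taken from the actual run):
#   0   -> (b'\xe0\xb0\x80',)   # the UTF-8 bytes of a single Telugu character
#   256 -> (224, 176)           # a merged pair of lower-level token ids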
# Reading the merges and vocab from a file section end -->
# Expanding the vocab section start -->
def convert_to_bytes(value):
    # Resolve a token id (or raw bytes) down to bytes, recursing through the vocab
    if isinstance(value, bytes):
        return value
    elif value in inverted_vocab:
        return process_tuple(inverted_vocab[value])
    else:
        print(f'value not found in inverted_vocab: {value}')
        return None
def process_tuple(value_tuple):
    # Expand every element of the tuple into bytes, flattening nested tuples
    converted_values = []
    for v in value_tuple:
        result = convert_to_bytes(v)
        if isinstance(result, tuple):
            converted_values.extend(result)
        else:
            converted_values.append(result)
    return tuple(converted_values)
decoder_map = {k: process_tuple(v) for k, v in inverted_vocab.items()}
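# With decoder_map in place, decoding is the reverse of encoding: expand each
# token id into its byte sequence and UTF-8-decode the result. The helper below
# is a sketch added for illustration (not part of the original script); it
# assumes every id that appears has an entry in decoder_map whose tuple
# elements are bytes objects.
def decode_ids(ids):
    parts = []
    for token_id in ids:
        for b in decoder_map.get(token_id, ()):
            if b is not None:
                parts.append(b)
    return b''.join(parts).decode('utf-8', errors='replace')

# Example usage:
# print(decode_ids(ids[:100]))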