# BPE-Tokenizer / tokenizer_backup.py
import pandas as pd
import re
# Data section start-->
# Load the CSV files
file_paths = [
'/Users/anvesh/codebase/llm/data/telugu_books/telugu_books.csv',
'/Users/anvesh/codebase/llm/data/telugu_news/1_telugu_news.csv',
'/Users/anvesh/codebase/llm/data/telugu_news/2_telugu_news.csv'
]
# Combine data from all files
telugu_texts = []
for file_path in file_paths:
    df = pd.read_csv(file_path)
    if 'text' in df.columns:
        telugu_texts.append(' '.join(df['text'].astype(str).tolist()))
    elif 'body' in df.columns:
        telugu_texts.append(' '.join(df['body'].astype(str).tolist()))
# Concatenate all texts and remove all English, numerical values, and quotes
telugu_text = ' '.join(telugu_texts)
telugu_text = re.sub(r'[A-Za-z0-9\'"]', '', telugu_text) # Remove English letters, numbers, and quotes
telugu_text = re.sub(r'[\r\n\xa0]', '', telugu_text) # Remove line breaks and non-breaking spaces
print('telugu_text before utf-8 encoding:', telugu_text[:100])
vocabulary_size = len(set(telugu_text.split()))
print('Original text size:', len(telugu_text))
print('Vocabulary size of telugu_text:', vocabulary_size)
unique_characters = set(telugu_text)
unique_count = len(unique_characters)
print('Original text size:', len(telugu_text))
print('Unique character count in telugu_text:', unique_count)
# Data section end-->
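# The two re.sub() calls above are the whole cleaning step. A quick sanity check
# on a made-up sample string (not taken from the datasets) shows what survives:
# Telugu characters and plain spaces stay; English letters, digits, quotes,
# line breaks, and non-breaking spaces are stripped.
_sample = 'తెలుగు abc 123 "నమస్తే"\r\n\xa0!'
_sample = re.sub(r'[A-Za-z0-9\'"]', '', _sample)
_sample = re.sub(r'[\r\n\xa0]', '', _sample)
print('cleaned sample:', _sample)  # -> 'తెలుగు   నమస్తే!'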
# utf-8 encoding section start -->
import encode_parallel_telugu as encode_parallel
import time
tokens = encode_parallel.load_telugu_texts()
# Start the timer
start_time = time.time()
# Encode the tokens in parallel and get concatenated results
encoded_tokens = encode_parallel.encode_tokens_parallel(tokens, chunk_size=1_000_000, max_workers=10)
print('encoded_tokens:', encoded_tokens[:100])
print(len(encoded_tokens))
# End the timer
end_time = time.time()
print(f"Time taken to encode and process tokens in parallel: {end_time - start_time:.4f} seconds")
print('length of encoded_text:', len(encoded_tokens))
print('unique characters in encoded_text:', set(encoded_tokens))
print('unique character count in encoded_text:', len(set(encoded_tokens)))
# utf-8 encoding section end -->
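# encode_parallel_telugu is a local helper module whose source is not part of
# this file. The sketch below is one plausible way its encode_tokens_parallel()
# could work: split the text into fixed-size chunks, UTF-8-encode each chunk in
# a worker process, and concatenate the byte values. Names, signatures, and
# behaviour here are assumptions for illustration, not the module's actual code.
from concurrent.futures import ProcessPoolExecutor

def _encode_chunk(chunk):
    # UTF-8 encode one chunk and return its byte values as ints in 0-255
    return list(chunk.encode('utf-8'))

def encode_tokens_parallel_sketch(text, chunk_size=1_000_000, max_workers=10):
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    encoded = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        for chunk_bytes in executor.map(_encode_chunk, chunks):
            encoded.extend(chunk_bytes)
    return encoded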
# BPE section start -->
# BPE implementation
tokens = encoded_tokens
def get_stats(ids):
    # Count how often each adjacent pair of token ids occurs
    counts = {}
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts
def merge(ids, pair, idx):
    # Replace every occurrence of `pair` in `ids` with the new token id `idx`
    new_ids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
            new_ids.append(idx)
            i += 2
        else:
            new_ids.append(ids[i])
            i += 1
    return new_ids
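# Tiny sanity check of the two helpers on a toy id sequence (the values are
# made up for illustration and are not taken from the Telugu data).
_toy_ids = [1, 2, 2, 3, 2, 3]
assert get_stats(_toy_ids) == {(1, 2): 1, (2, 2): 1, (2, 3): 2, (3, 2): 1}
assert merge(_toy_ids, (2, 3), 256) == [1, 2, 256, 256]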
# ---
vocab_size = 500 # the desired final vocabulary size
num_merges = vocab_size - 256  # 256 ids are reserved for raw bytes; our sample text uses only 194 unique byte values
ids = list(tokens) # copy so we don't destroy the original list
merges = {} # (int, int) -> int
from tqdm import tqdm # Import tqdm for progress bar
for i in tqdm(range(num_merges), desc="Merging tokens"):
    stats = get_stats(ids)
    pair = max(stats, key=stats.get)  # most frequent adjacent pair
    idx = 256 + i
    # print(f"merging {pair} into a new token {idx}")
    ids = merge(ids, pair, idx)
    merges[pair] = idx  # record which pair produced this new token id
print("tokens length:", len(tokens))
print("ids length:", len(ids))
print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
print(f"token size: {len(set(tokens))}")
# print(ids)
# BPE section end -->
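# The loop above only learns the merge table; to tokenize new text with it, the
# merges have to be re-applied in the order they were learned. The helper below
# is a sketch of that step added for illustration (it is not part of the
# original script) and reuses get_stats()/merge() from above.
def encode_with_merges(text, merges):
    ids = list(text.encode('utf-8'))
    while len(ids) >= 2:
        stats = get_stats(ids)
        # pick the candidate pair that was learned earliest
        pair = min(stats, key=lambda p: merges.get(p, float('inf')))
        if pair not in merges:
            break  # no learned merge applies any more
        ids = merge(ids, pair, merges[pair])
    return ids

# Example usage (any Telugu string works the same way):
# print(encode_with_merges('తెలుగు', merges))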
# Building the vocabulary section start -->
telugu_unicode_chars = [chr(i) for i in range(0x0C00, 0x0C7F)] # Telugu Unicode range
# Start the vocabulary from the learned merges, then add the Telugu characters
import json
vocab = {token: idx for token, idx in merges.items()}
# Add unique Telugu characters to the vocabulary
for idx, char in enumerate([chr(i).encode('utf-8') for i in range(0x0C00, 0x0C7F)]):
    if idx < 256:  # Ensure we only add up to 256 characters
        vocab[char] = idx  # Map the character to its index
vocab[b' '] = 255
vocab[b'.'] = 254
# Save merges and vocab to a file (keys are stringified so they are JSON-serialisable)
with open('merges_vocab.json', 'w') as f:
    json.dump({'merges': {str(k): v for k, v in merges.items()},
               'vocab': {str(k): v for k, v in vocab.items()}}, f)
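# Note on the JSON round-trip: tuple and bytes keys cannot be JSON object keys,
# so they are stored via str() above, e.g. (256, 257) -> "(256, 257)" and
# b'\xe0\xb0\x85' -> "b'\\xe0\\xb0\\x85'". The reading section below undoes
# this with eval().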
# Building the vocabulary section end -->
# Reading the merges and vocab from a file section start -->
import json
from collections import defaultdict
# Read the merges and vocab data from the JSON file
with open('merges_vocab.json', 'r') as f:
    data = json.load(f)
# Collect the re-loaded entries in a defaultdict of lists
distributed_data = defaultdict(list)
# Distribute the vocab data (the merges loop is left commented out)
# for key, value in data['merges'].items():
#     distributed_data['merges'].append({key: value})
for key, value in data['vocab'].items():
    distributed_data['vocab'].append({key: value})
# Optionally, print the distributed data for verification
print(distributed_data)
distributed_data['vocab']
# Convert the list of dictionaries to a single dictionary
formatted_vocab = {}
for item in distributed_data['vocab']:
    for k, v in item.items():
        if ',' not in k:
            # single-token keys (e.g. "b'...'") are wrapped in a 1-tuple
            formatted_vocab[(eval(k),)] = v
        else:
            # pair keys like "(256, 257)" evaluate back to tuples
            formatted_vocab[eval(k)] = v
print(list(formatted_vocab.items())[:50])  # dicts are not sliceable, so slice a list of items
# inverting the vocab
inverted_vocab = {v: k for k, v in formatted_vocab.items()}
inverted_vocab
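# For orientation, inverted_vocab entries look roughly like the following
# (the concrete values are illustrative, not taken from the actual run):
#   0   -> (b'\xe0\xb0\x80',)   # the UTF-8 bytes of a single Telugu character
#   256 -> (224, 176)           # a merged pair of lower-level token ids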
# Reading the merges and vocab from a file section end -->
# Expanding the vocab section start -->
def convert_to_bytes(value):
    # Resolve a token id (or raw bytes) down to bytes, recursing through the vocab
    if isinstance(value, bytes):
        return value
    elif value in inverted_vocab:
        return process_tuple(inverted_vocab[value])
    else:
        print(f'value not found in inverted_vocab: {value}')
        return None
def process_tuple(value_tuple):
    # Expand every element of the tuple into bytes, flattening nested tuples
    converted_values = []
    for v in value_tuple:
        result = convert_to_bytes(v)
        if isinstance(result, tuple):
            converted_values.extend(result)
        else:
            converted_values.append(result)
    return tuple(converted_values)
decoder_map = {k: process_tuple(v) for k, v in inverted_vocab.items()}
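# With decoder_map in place, decoding is the reverse of encoding: expand each
# token id into its byte sequence and UTF-8-decode the result. The helper below
# is a sketch added for illustration (not part of the original script); it
# assumes every id that appears has an entry in decoder_map whose tuple
# elements are bytes objects.
def decode_ids(ids):
    parts = []
    for token_id in ids:
        for b in decoder_map.get(token_id, ()):
            if b is not None:
                parts.append(b)
    return b''.join(parts).decode('utf-8', errors='replace')

# Example usage:
# print(decode_ids(ids[:100]))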