Spaces:
Sleeping
Sleeping
Commit ·
e4d5fc0
1
Parent(s): d4b4f47
updated
Browse files- __pycache__/consecutive_tokens.cpython-312.pyc +0 -0
- __pycache__/encoder_parallel_telugu.cpython-312.pyc +0 -0
- __pycache__/tokenizer.cpython-312.pyc +0 -0
- app.py +20 -2
- consecutive_tokens.py +51 -0
- encoder_parallel_telugu.py +69 -0
- merges_vocab.json +1 -0
- tokenizer.py +135 -0
- tokenizer_backup.py +194 -0
__pycache__/consecutive_tokens.cpython-312.pyc
ADDED
|
Binary file (2.55 kB). View file
|
|
|
__pycache__/encoder_parallel_telugu.cpython-312.pyc
ADDED
|
Binary file (3.25 kB). View file
|
|
|
__pycache__/tokenizer.cpython-312.pyc
ADDED
|
Binary file (6.88 kB). View file
|
|
|
app.py
CHANGED
|
@@ -1,8 +1,26 @@
|
|
| 1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
def encode(text):
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
def decode(text):
|
| 8 |
# Placeholder for decoding logic
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
import encoder_parallel_telugu as encode_parallel
|
| 3 |
+
from consecutive_tokens import get_consecutive_tokens, search_consecutive_tokens
|
| 4 |
+
import tokenizer
|
| 5 |
|
| 6 |
def encode(text):
    """Encode *text* against the stored BPE vocabulary and return a display string.

    Empty input short-circuits to a prompt message; otherwise each character is
    UTF-8 encoded, greedily matched (longest window first) against the expanded
    vocabulary, and the resulting token ids are returned formatted for the UI.
    """
    if text == "":
        return "Enter text to encode..."
    # one UTF-8 byte-string per character of the input
    byte_tokens = [character.encode('utf-8') for character in text]
    window_map = get_consecutive_tokens(byte_tokens, window_size=4)
    # Reading vocabulary from file
    formatted_vocab = tokenizer.read_vocab_from_file()
    # id -> token tuple
    inverted_vocab = {v: k for k, v in formatted_vocab.items()}
    # expand merged entries down to tuples of raw bytes
    decoder_map = tokenizer.expand_vocab(inverted_vocab)
    # token tuple -> id again, after expansion
    re_inverted_vocab = {k: v for v, k in decoder_map.items()}

    encoded_ids = search_consecutive_tokens(window_map, re_inverted_vocab)
    print(encoded_ids)
    return f"Encoded: {encoded_ids}"
|
| 24 |
|
| 25 |
def decode(text):
|
| 26 |
# Placeholder for decoding logic
|
consecutive_tokens.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import OrderedDict
|
| 2 |
+
|
| 3 |
+
def get_consecutive_tokens(li, window_size=4):
    """Map each start index of *li* to its candidate windows, longest first.

    For position i the value is [tuple(li[i:i+window_size]), ..., tuple(li[i:i+1])].
    Near the end of the list windows are truncated, so duplicates may appear.
    Returns [] for empty input.
    """
    if len(li) == 0:
        return []
    # build each position's windows directly in longest-first order
    windows = {}
    for start in range(len(li)):
        windows[start] = [tuple(li[start:start + width])
                          for width in range(window_size, 0, -1)]
    return windows
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def search_consecutive_tokens(ordered_dict, encoded_token_dict):
    """Greedily encode using the longest window that exists in the vocabulary.

    ordered_dict: position -> candidate windows, longest first (as produced by
        get_consecutive_tokens).
    encoded_token_dict: tuple-of-tokens -> encoded id.

    Walks the positions left to right; when a window matches, its id is emitted
    and the consumed positions are skipped. Positions with no matching window
    (not even their single-token tuple) are dropped silently — same behavior as
    before, just without the dead `j = 0` statement that preceded the break.
    """
    final_encoded_tokens = []
    keys = list(ordered_dict.keys())
    i = 0
    while i < len(keys):
        candidates = ordered_dict[keys[i]]
        for window in candidates:
            if window in encoded_token_dict:
                final_encoded_tokens.append(encoded_token_dict[window])
                i += len(window)  # skip the tokens consumed by this match
                break
        else:
            i += 1  # no window matched: drop this token and move on
    return final_encoded_tokens
|
| 39 |
+
|
| 40 |
+
if __name__ == "__main__":
    # quick smoke test of the windowing + greedy lookup on a toy list
    text = "తెలుగు భాష ఒక ద్రావిడ భాష."
    window_map = get_consecutive_tokens([1, 2, 3, 4, 5])
    print(window_map)

    sample_vocab = {(1, 2): 9, (3,): 10, (4, 5): 11}
    print(search_consecutive_tokens(window_map, sample_vocab))
|
encoder_parallel_telugu.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import concurrent.futures
|
| 3 |
+
from tqdm import tqdm
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import re
|
| 6 |
+
|
| 7 |
+
def encode_chunk(chunk):
    """UTF-8 encode every token in *chunk*, returning a list of bytes objects."""
    return [bytes(token_text, 'utf-8') for token_text in chunk]
|
| 11 |
+
|
| 12 |
+
def encode_tokens_parallel(tokens, chunk_size=1_000_000, max_workers=10):
    """UTF-8 encode *tokens* across worker processes and return one flat list.

    The input is cut into chunks of *chunk_size* tokens, each chunk is encoded
    by encode_chunk in a ProcessPoolExecutor, and the per-chunk results are
    concatenated in order. A tqdm bar reports chunk-level progress.
    """
    chunks = [tokens[start:start + chunk_size]
              for start in range(0, len(tokens), chunk_size)]

    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        # executor.map preserves chunk order; tqdm wraps it for progress display
        encoded_chunks = list(tqdm(executor.map(encode_chunk, chunks),
                                   total=len(chunks),
                                   desc="Processing Chunks"))

    # flatten [[bytes, ...], ...] into a single list of bytes tokens
    return [encoded for chunk in encoded_chunks for encoded in chunk]
|
| 29 |
+
|
| 30 |
+
def load_telugu_texts(file_paths=None):
    """Load and clean the Telugu corpus, returning one concatenated string.

    Parameters
    ----------
    file_paths : list[str] | None
        CSV files to read. Defaults to the author's local corpus files
        (backward-compatible: calling with no arguments behaves as before).
        Each CSV is expected to carry its text in a 'text' or 'body' column;
        files with neither column are skipped.

    Returns
    -------
    str
        The concatenated text restricted to the Telugu Unicode block
        (U+0C00–U+0C7F) plus the characters @, #, $ and %.
        NOTE: the filter regex also strips ASCII spaces — existing behavior,
        kept as-is.
    """
    if file_paths is None:
        file_paths = [
            '/Users/anvesh/codebase/llm/data/telugu_books/telugu_books.csv',
            '/Users/anvesh/codebase/llm/data/telugu_news/1_telugu_news.csv',
            '/Users/anvesh/codebase/llm/data/telugu_news/2_telugu_news.csv'
        ]

    # Combine data from all files
    telugu_texts = []
    for file_path in file_paths:
        df = pd.read_csv(file_path)
        if 'text' in df.columns:
            telugu_texts.append(' '.join(df['text'].astype(str).tolist()))
        elif 'body' in df.columns:
            telugu_texts.append(' '.join(df['body'].astype(str).tolist()))

    # Concatenate, then drop everything outside the Telugu block except @#$%
    telugu_text = ' '.join(telugu_texts)
    telugu_text = re.sub(r'[^\u0C00-\u0C7F@#$%]', '', telugu_text)
    telugu_text = re.sub(r'[\r\n\xa0]', '', telugu_text)  # line breaks / NBSP
    return telugu_text
|
| 50 |
+
|
| 51 |
+
# Main script
if __name__ == '__main__':
    # Load and clean the corpus, then time the parallel encoding pass
    tokens = load_telugu_texts()

    started = time.time()
    encoded_tokens = encode_tokens_parallel(tokens, chunk_size=1_000_000, max_workers=10)
    print(encoded_tokens[:100])
    print(len(encoded_tokens))
    finished = time.time()

    elapsed = finished - started
    print(f"Time taken to encode and process tokens in parallel: {elapsed:.4f} seconds")
    print("Encoding and processing completed!")
|
merges_vocab.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"merges": {"(b'\\xe0\\xb0\\xa8', b'\\xe0\\xb0\\xbf')": 256}, "vocab": {"(b'\\xe0\\xb0\\xa8', b'\\xe0\\xb0\\xbf')": 256, "b'\\xe0\\xb0\\x80'": 0, "b'\\xe0\\xb0\\x81'": 1, "b'\\xe0\\xb0\\x82'": 2, "b'\\xe0\\xb0\\x83'": 3, "b'\\xe0\\xb0\\x84'": 4, "b'\\xe0\\xb0\\x85'": 5, "b'\\xe0\\xb0\\x86'": 6, "b'\\xe0\\xb0\\x87'": 7, "b'\\xe0\\xb0\\x88'": 8, "b'\\xe0\\xb0\\x89'": 9, "b'\\xe0\\xb0\\x8a'": 10, "b'\\xe0\\xb0\\x8b'": 11, "b'\\xe0\\xb0\\x8c'": 12, "b'\\xe0\\xb0\\x8d'": 13, "b'\\xe0\\xb0\\x8e'": 14, "b'\\xe0\\xb0\\x8f'": 15, "b'\\xe0\\xb0\\x90'": 16, "b'\\xe0\\xb0\\x91'": 17, "b'\\xe0\\xb0\\x92'": 18, "b'\\xe0\\xb0\\x93'": 19, "b'\\xe0\\xb0\\x94'": 20, "b'\\xe0\\xb0\\x95'": 21, "b'\\xe0\\xb0\\x96'": 22, "b'\\xe0\\xb0\\x97'": 23, "b'\\xe0\\xb0\\x98'": 24, "b'\\xe0\\xb0\\x99'": 25, "b'\\xe0\\xb0\\x9a'": 26, "b'\\xe0\\xb0\\x9b'": 27, "b'\\xe0\\xb0\\x9c'": 28, "b'\\xe0\\xb0\\x9d'": 29, "b'\\xe0\\xb0\\x9e'": 30, "b'\\xe0\\xb0\\x9f'": 31, "b'\\xe0\\xb0\\xa0'": 32, "b'\\xe0\\xb0\\xa1'": 33, "b'\\xe0\\xb0\\xa2'": 34, "b'\\xe0\\xb0\\xa3'": 35, "b'\\xe0\\xb0\\xa4'": 36, "b'\\xe0\\xb0\\xa5'": 37, "b'\\xe0\\xb0\\xa6'": 38, "b'\\xe0\\xb0\\xa7'": 39, "b'\\xe0\\xb0\\xa8'": 40, "b'\\xe0\\xb0\\xa9'": 41, "b'\\xe0\\xb0\\xaa'": 42, "b'\\xe0\\xb0\\xab'": 43, "b'\\xe0\\xb0\\xac'": 44, "b'\\xe0\\xb0\\xad'": 45, "b'\\xe0\\xb0\\xae'": 46, "b'\\xe0\\xb0\\xaf'": 47, "b'\\xe0\\xb0\\xb0'": 48, "b'\\xe0\\xb0\\xb1'": 49, "b'\\xe0\\xb0\\xb2'": 50, "b'\\xe0\\xb0\\xb3'": 51, "b'\\xe0\\xb0\\xb4'": 52, "b'\\xe0\\xb0\\xb5'": 53, "b'\\xe0\\xb0\\xb6'": 54, "b'\\xe0\\xb0\\xb7'": 55, "b'\\xe0\\xb0\\xb8'": 56, "b'\\xe0\\xb0\\xb9'": 57, "b'\\xe0\\xb0\\xba'": 58, "b'\\xe0\\xb0\\xbb'": 59, "b'\\xe0\\xb0\\xbc'": 60, "b'\\xe0\\xb0\\xbd'": 61, "b'\\xe0\\xb0\\xbe'": 62, "b'\\xe0\\xb0\\xbf'": 63, "b'\\xe0\\xb1\\x80'": 64, "b'\\xe0\\xb1\\x81'": 65, "b'\\xe0\\xb1\\x82'": 66, "b'\\xe0\\xb1\\x83'": 67, "b'\\xe0\\xb1\\x84'": 68, "b'\\xe0\\xb1\\x85'": 69, "b'\\xe0\\xb1\\x86'": 70, "b'\\xe0\\xb1\\x87'": 71, 
"b'\\xe0\\xb1\\x88'": 72, "b'\\xe0\\xb1\\x89'": 73, "b'\\xe0\\xb1\\x8a'": 74, "b'\\xe0\\xb1\\x8b'": 75, "b'\\xe0\\xb1\\x8c'": 76, "b'\\xe0\\xb1\\x8d'": 77, "b'\\xe0\\xb1\\x8e'": 78, "b'\\xe0\\xb1\\x8f'": 79, "b'\\xe0\\xb1\\x90'": 80, "b'\\xe0\\xb1\\x91'": 81, "b'\\xe0\\xb1\\x92'": 82, "b'\\xe0\\xb1\\x93'": 83, "b'\\xe0\\xb1\\x94'": 84, "b'\\xe0\\xb1\\x95'": 85, "b'\\xe0\\xb1\\x96'": 86, "b'\\xe0\\xb1\\x97'": 87, "b'\\xe0\\xb1\\x98'": 88, "b'\\xe0\\xb1\\x99'": 89, "b'\\xe0\\xb1\\x9a'": 90, "b'\\xe0\\xb1\\x9b'": 91, "b'\\xe0\\xb1\\x9c'": 92, "b'\\xe0\\xb1\\x9d'": 93, "b'\\xe0\\xb1\\x9e'": 94, "b'\\xe0\\xb1\\x9f'": 95, "b'\\xe0\\xb1\\xa0'": 96, "b'\\xe0\\xb1\\xa1'": 97, "b'\\xe0\\xb1\\xa2'": 98, "b'\\xe0\\xb1\\xa3'": 99, "b'\\xe0\\xb1\\xa4'": 100, "b'\\xe0\\xb1\\xa5'": 101, "b'\\xe0\\xb1\\xa6'": 102, "b'\\xe0\\xb1\\xa7'": 103, "b'\\xe0\\xb1\\xa8'": 104, "b'\\xe0\\xb1\\xa9'": 105, "b'\\xe0\\xb1\\xaa'": 106, "b'\\xe0\\xb1\\xab'": 107, "b'\\xe0\\xb1\\xac'": 108, "b'\\xe0\\xb1\\xad'": 109, "b'\\xe0\\xb1\\xae'": 110, "b'\\xe0\\xb1\\xaf'": 111, "b'\\xe0\\xb1\\xb0'": 112, "b'\\xe0\\xb1\\xb1'": 113, "b'\\xe0\\xb1\\xb2'": 114, "b'\\xe0\\xb1\\xb3'": 115, "b'\\xe0\\xb1\\xb4'": 116, "b'\\xe0\\xb1\\xb5'": 117, "b'\\xe0\\xb1\\xb6'": 118, "b'\\xe0\\xb1\\xb7'": 119, "b'\\xe0\\xb1\\xb8'": 120, "b'\\xe0\\xb1\\xb9'": 121, "b'\\xe0\\xb1\\xba'": 122, "b'\\xe0\\xb1\\xbb'": 123, "b'\\xe0\\xb1\\xbc'": 124, "b'\\xe0\\xb1\\xbd'": 125, "b'\\xe0\\xb1\\xbe'": 126, "b' '": 255, "b'.'": 254}}
|
tokenizer.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import re
|
| 3 |
+
import encoder_parallel_telugu as encode_parallel
|
| 4 |
+
import time
|
| 5 |
+
import json
|
| 6 |
+
from collections import defaultdict
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
|
| 9 |
+
def load_and_encode_tokens():
    """Load the Telugu corpus, UTF-8 encode it in parallel, and log statistics.

    Returns the flat list of encoded byte tokens produced by
    encode_parallel.encode_tokens_parallel.
    """
    raw_text = encode_parallel.load_telugu_texts()
    started = time.time()
    encoded_tokens = encode_parallel.encode_tokens_parallel(raw_text, chunk_size=1_000_000, max_workers=10)
    print('encoded_tokens:', encoded_tokens[:100])
    print(len(encoded_tokens))
    finished = time.time()
    print(f"Time taken to encode and process tokens in parallel: {finished - started:.4f} seconds")
    print('length of encoded_text:', len(encoded_tokens))
    unique_tokens = set(encoded_tokens)
    print('unique characters in decoded_text:', {tok.decode('utf-8') for tok in unique_tokens})
    print('unique characters in encoded_text:', len(unique_tokens))
    return encoded_tokens
|
| 22 |
+
|
| 23 |
+
def get_stats(ids):
    """Count occurrences of each adjacent token pair in *ids*."""
    counts = {}
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        counts[key] = counts.get(key, 0) + 1
    return counts
|
| 28 |
+
|
| 29 |
+
def merge(ids, pair, idx):
    """Return *ids* with each non-overlapping occurrence of *pair* replaced by *idx*.

    Scans left to right; after a replacement the scan resumes past both
    consumed tokens, so overlapping matches are not collapsed twice.
    """
    merged = []
    pos = 0
    last = len(ids) - 1
    while pos < len(ids):
        if pos < last and (ids[pos], ids[pos + 1]) == (pair[0], pair[1]):
            merged.append(idx)
            pos += 2
        else:
            merged.append(ids[pos])
            pos += 1
    return merged
|
| 40 |
+
|
| 41 |
+
def bpe_process(encoded_tokens, vocab_size=500, encoded_tokens_length=10_00_000):
    """Run byte-pair-encoding merges over a prefix of *encoded_tokens*.

    Parameters
    ----------
    encoded_tokens : list
        Base tokens (bytes), e.g. the output of the parallel encoder.
    vocab_size : int
        Desired final vocabulary size; vocab_size - 256 merges are attempted
        (ids below 256 are reserved for the base vocabulary).
    encoded_tokens_length : int
        Only the first this-many tokens are processed.

    Returns
    -------
    dict
        merges: (token, token) -> new token id, ids starting at 256.
    """
    num_merges = vocab_size - 256
    encoded_tokens = encoded_tokens[:encoded_tokens_length]
    ids = list(encoded_tokens)  # copy so we don't destroy the original list
    merges = {}  # (token, token) -> int

    for i in tqdm(range(num_merges), desc="Merging tokens"):
        stats = get_stats(ids)
        if not stats:
            # fewer than two tokens remain: nothing left to merge.
            # Previously max() would raise ValueError on an empty stats dict.
            break
        pair = max(stats, key=stats.get)
        idx = 256 + i
        ids = merge(ids, pair, idx)
        merges[pair] = idx  # record the pair and its new token id

    print("tokens length:", len(encoded_tokens))
    print("ids length:", len(ids))
    print("by paired tokens length:", len(set(ids)))
    # NOTE: raises ZeroDivisionError for empty input, same as before
    print(f"compression ratio: {len(encoded_tokens) / len(ids):.2f}X")

    return merges
|
| 61 |
+
|
| 62 |
+
def build_vocabulary(merges):
    """Build the token vocabulary and persist it with *merges* to merges_vocab.json.

    The vocabulary is the merge table plus every code point of the Telugu
    Unicode block (U+0C00–U+0C7E) keyed by its UTF-8 bytes, with b' ' and b'.'
    pinned to ids 255 and 254. (The previously computed but unused
    telugu_unicode_chars list has been removed.)
    """
    vocab = {token: idx for token, idx in merges.items()}

    # Map each Telugu character's UTF-8 byte sequence to a small id
    for idx, char in enumerate(chr(cp).encode('utf-8') for cp in range(0x0C00, 0x0C7F)):
        if idx < 256:  # keep base ids within a single byte's range
            vocab[char] = idx

    vocab[b' '] = 255
    vocab[b'.'] = 254

    # JSON cannot carry bytes/tuple keys, so keys are stringified with str()
    with open('merges_vocab.json', 'w') as f:
        json.dump({'merges': {str(k): v for k, v in merges.items()}, 'vocab': {str(k): v for k, v in vocab.items()}}, f)
|
| 75 |
+
|
| 76 |
+
def read_vocab_from_file():
    """Load merges_vocab.json and rebuild the vocab with tuple-of-bytes keys.

    Keys were serialized with str(); entries without a comma (single byte
    tokens like "b'x'") are wrapped into 1-tuples so every key is a tuple.

    Returns
    -------
    dict
        tuple-of-bytes -> token id.
    """
    import ast  # local import: only needed for key parsing here

    with open('merges_vocab.json', 'r') as f:
        data = json.load(f)

    # Build the dict directly — the previous defaultdict round-trip added
    # nothing. literal_eval replaces eval: the file could be tampered with.
    formatted_vocab = {}
    for key, value in data['vocab'].items():
        parsed = ast.literal_eval(key)
        if ',' not in key:
            formatted_vocab[(parsed,)] = value  # single token -> 1-tuple key
        else:
            formatted_vocab[parsed] = value
    return formatted_vocab
|
| 94 |
+
|
| 95 |
+
def expand_vocab(inverted_vocab):
    """Recursively expand every vocab entry down to a flat tuple of raw bytes.

    inverted_vocab maps token id -> tuple whose members are either bytes or
    other token ids; each id is resolved through the map until only bytes
    remain. Unresolvable members are reported and kept as None.
    """

    def flatten_value(value):
        # bytes are terminal; ids recurse through the map
        if isinstance(value, bytes):
            return value
        if value in inverted_vocab:
            return flatten_tuple(inverted_vocab[value])
        print(f'value not found in inverted_vocab: {value}')
        return None

    def flatten_tuple(value_tuple):
        flat = []
        for member in value_tuple:
            expanded = flatten_value(member)
            if isinstance(expanded, tuple):
                flat.extend(expanded)
            else:
                flat.append(expanded)
        return tuple(flat)

    decoder_map = {key: flatten_tuple(members) for key, members in inverted_vocab.items()}
    print("sample decoder map:", {k: decoder_map[k] for k in list(decoder_map)[:5]})
    return decoder_map
|
| 118 |
+
|
| 119 |
+
# # Main execution
|
| 120 |
+
# if __name__ == "__main__":
|
| 121 |
+
# # 1. Load and encode tokens
|
| 122 |
+
# encoded_tokens = load_and_encode_tokens()
|
| 123 |
+
# # 2. Process BPE
|
| 124 |
+
# merges = bpe_process(encoded_tokens,vocab_size=257, encoded_tokens_length=20_00_000)
|
| 125 |
+
# # 3. Build vocabulary
|
| 126 |
+
# build_vocabulary(merges)
|
| 127 |
+
# # 4. Read vocabulary from file
|
| 128 |
+
# formatted_vocab = read_vocab_from_file()
|
| 129 |
+
# # 5. Invert vocabulary
|
| 130 |
+
# inverted_vocab = {v: k for k, v in formatted_vocab.items()}
|
| 131 |
+
# # 6. Expand vocabulary
|
| 132 |
+
# decoder_map = expand_vocab(inverted_vocab)
|
| 133 |
+
# # 7. Invert back again
|
| 134 |
+
# re_inverted_vocab = {k: v for v, k in decoder_map.items()}
|
| 135 |
+
# print(re_inverted_vocab)
|
tokenizer_backup.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
# Data section start-->
|
| 5 |
+
# Load the CSV files
|
| 6 |
+
file_paths = [
|
| 7 |
+
'/Users/anvesh/codebase/llm/data/telugu_books/telugu_books.csv',
|
| 8 |
+
'/Users/anvesh/codebase/llm/data/telugu_news/1_telugu_news.csv',
|
| 9 |
+
'/Users/anvesh/codebase/llm/data/telugu_news/2_telugu_news.csv'
|
| 10 |
+
]
|
| 11 |
+
|
| 12 |
+
# Combine data from all files
|
| 13 |
+
telugu_texts = []
|
| 14 |
+
for file_path in file_paths:
|
| 15 |
+
df = pd.read_csv(file_path)
|
| 16 |
+
if 'text' in df.columns:
|
| 17 |
+
telugu_texts.append(' '.join(df['text'].astype(str).tolist()))
|
| 18 |
+
elif 'body' in df.columns:
|
| 19 |
+
telugu_texts.append(' '.join(df['body'].astype(str).tolist()))
|
| 20 |
+
|
| 21 |
+
# Concatenate all texts and remove all English, numerical values, and quotes
|
| 22 |
+
telugu_text = ' '.join(telugu_texts)
|
| 23 |
+
telugu_text = re.sub(r'[A-Za-z0-9\'"]', '', telugu_text) # Remove English letters, numbers, and quotes
|
| 24 |
+
telugu_text = re.sub(r'[\r\n\xa0]', '', telugu_text) # Remove line breaks and non-breaking spaces
|
| 25 |
+
|
| 26 |
+
print('telugu_text befores utf-8 encoding:', telugu_text[:100])
|
| 27 |
+
|
| 28 |
+
vocabulary_size = len(set(telugu_text.split()))
|
| 29 |
+
print('Original text size:', len(telugu_text))
|
| 30 |
+
print('Vocabulary size of telugu_text:', vocabulary_size)
|
| 31 |
+
|
| 32 |
+
unique_characters = set(telugu_text)
|
| 33 |
+
unique_count = len(unique_characters)
|
| 34 |
+
print('Original text size:', len(telugu_text))
|
| 35 |
+
print('Unique character count in telugu_text:', unique_count)
|
| 36 |
+
|
| 37 |
+
# Data section end-->
|
| 38 |
+
|
| 39 |
+
# utf-8 encoding section start -->
|
| 40 |
+
import encode_parallel_telugu as encode_parallel
|
| 41 |
+
import time
|
| 42 |
+
|
| 43 |
+
tokens = encode_parallel.load_telugu_texts()
|
| 44 |
+
# Start the timer
|
| 45 |
+
start_time = time.time()
|
| 46 |
+
# Encode the tokens in parallel and get concatenated results
|
| 47 |
+
encoded_tokens = encode_parallel.encode_tokens_parallel(tokens, chunk_size=1_000_000, max_workers=10)
|
| 48 |
+
print('encoded_tokens:', encoded_tokens[:100])
|
| 49 |
+
print(len(encoded_tokens))
|
| 50 |
+
# End the timer
|
| 51 |
+
end_time = time.time()
|
| 52 |
+
print(f"Time taken to encode and process tokens in parallel: {end_time - start_time:.4f} seconds")
|
| 53 |
+
|
| 54 |
+
print('length of encoded_text:', len(encoded_tokens))
|
| 55 |
+
print('unique characters in encoded_text:', set(encoded_tokens))
|
| 56 |
+
print('unique characters in encoded_text:', len(set(encoded_tokens)))
|
| 57 |
+
# utf-8 encoding section end -->
|
| 58 |
+
|
| 59 |
+
# BPE section start -->
|
| 60 |
+
#### **BPE implementation**
|
| 61 |
+
|
| 62 |
+
tokens = encoded_tokens
|
| 63 |
+
|
| 64 |
+
def get_stats(ids):
    """Tally how often each adjacent token pair occurs in *ids*."""
    pair_counts = {}
    for pair in zip(ids[:-1], ids[1:]):
        pair_counts[pair] = pair_counts.get(pair, 0) + 1
    return pair_counts
| 69 |
+
|
| 70 |
+
def merge(ids, pair, idx):
    """Collapse each non-overlapping occurrence of *pair* in *ids* into *idx*."""
    result = []
    i = 0
    n = len(ids)
    while i < n:
        if i + 1 < n and ids[i] == pair[0] and ids[i + 1] == pair[1]:
            result.append(idx)
            i += 2  # skip both members of the matched pair
        else:
            result.append(ids[i])
            i += 1
    return result
|
| 81 |
+
|
| 82 |
+
# ---
|
| 83 |
+
vocab_size = 500 # the desired final vocabulary size
|
| 84 |
+
num_merges = vocab_size - 256 ## our unique tokens are 194, for our sample text.
|
| 85 |
+
ids = list(tokens) # copy so we don't destroy the original list
|
| 86 |
+
|
| 87 |
+
merges = {} # (int, int) -> int
|
| 88 |
+
from tqdm import tqdm # Import tqdm for progress bar
|
| 89 |
+
|
| 90 |
+
for i in tqdm(range(num_merges), desc="Merging tokens"):
|
| 91 |
+
stats = get_stats(ids)
|
| 92 |
+
pair = max(stats, key=stats.get)
|
| 93 |
+
idx = 256 + i
|
| 94 |
+
# print(f"merging {pair} into a new token {idx}")
|
| 95 |
+
ids = merge(ids, pair, idx)
|
| 96 |
+
merges[pair] = idx # merge has a pair of tokens and the new token index
|
| 97 |
+
|
| 98 |
+
print("tokens length:", len(tokens))
|
| 99 |
+
print("ids length:", len(ids))
|
| 100 |
+
print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
|
| 101 |
+
print(f"token size: {len(set(tokens))}")
|
| 102 |
+
|
| 103 |
+
# print(ids)
|
| 104 |
+
# BPE section end -->
|
| 105 |
+
|
| 106 |
+
# Building the vocabulary section start -->
|
| 107 |
+
telugu_unicode_chars = [chr(i) for i in range(0x0C00, 0x0C7F)] # Telugu Unicode range
|
| 108 |
+
|
| 109 |
+
# Add these characters to the vocabulary
|
| 110 |
+
import json
|
| 111 |
+
vocab = {token: idx for token, idx in merges.items()}
|
| 112 |
+
# Add unique Telugu characters to the vocabulary
|
| 113 |
+
for idx, char in enumerate([chr(i).encode('utf-8') for i in range(0x0C00, 0x0C7F)]):
|
| 114 |
+
if idx < 256: # Ensure we only add up to 256 characters
|
| 115 |
+
vocab[char] = idx # Map the character to its index
|
| 116 |
+
|
| 117 |
+
vocab[b' '] = 255
|
| 118 |
+
vocab[b'.'] = 254
|
| 119 |
+
# Save merges and vocab to a file
|
| 120 |
+
# with open('merges_vocab.json', 'w') as f:
|
| 121 |
+
# json.dump({'merges': merges, 'vocab': vocab}, f)
|
| 122 |
+
|
| 123 |
+
# saving the merges and vocab to a file
|
| 124 |
+
with open('merges_vocab.json', 'w') as f:
|
| 125 |
+
json.dump({'merges': {str(k): v for k, v in merges.items()}, 'vocab': {str(k): v for k, v in vocab.items()}}, f)
|
| 126 |
+
|
| 127 |
+
# Building the vocabulary section end -->
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# Reading the merges and vocab from a file section start -->
|
| 131 |
+
import json
|
| 132 |
+
from collections import defaultdict
|
| 133 |
+
|
| 134 |
+
# Read the merges and vocab data from the JSON file
|
| 135 |
+
with open('merges_vocab.json', 'r') as f:
|
| 136 |
+
data = json.load(f)
|
| 137 |
+
|
| 138 |
+
# Create a defaultdict to store the data in a distributed manner
|
| 139 |
+
distributed_data = defaultdict(list)
|
| 140 |
+
|
| 141 |
+
# Distribute the merges and vocab data
|
| 142 |
+
# for key, value in data['merges'].items():
|
| 143 |
+
# distributed_data['merges'].append({key: value})
|
| 144 |
+
|
| 145 |
+
for key, value in data['vocab'].items():
|
| 146 |
+
distributed_data['vocab'].append({key: value})
|
| 147 |
+
|
| 148 |
+
# Optionally, print the distributed data for verification
|
| 149 |
+
print(distributed_data)
|
| 150 |
+
distributed_data['vocab']
|
| 151 |
+
# Convert the list of dictionaries to a single dictionary with tuple keys
formatted_vocab = {}
for item in distributed_data['vocab']:
    for k, v in item.items():
        # NOTE(review): eval on file-sourced keys — ast.literal_eval would be safer
        if ',' not in k:
            formatted_vocab[(eval(k),)] = v  # single token: wrap into a 1-tuple
        else:
            formatted_vocab[eval(k)] = v
# BUG FIX: dicts don't support slicing — formatted_vocab[:50] raised TypeError.
# Show the first 50 entries instead.
print(dict(list(formatted_vocab.items())[:50]))
|
| 160 |
+
# inverting the vocab
|
| 161 |
+
inverted_vocab = {v: k for k, v in formatted_vocab.items()}
|
| 162 |
+
inverted_vocab
|
| 163 |
+
|
| 164 |
+
# Reading the merges and vocab from a file section end -->
|
| 165 |
+
|
| 166 |
+
# Expanding the vocab section start -->
|
| 167 |
+
def convert_to_bytes(value):
|
| 168 |
+
if isinstance(value, bytes):
|
| 169 |
+
return value
|
| 170 |
+
elif value in inverted_vocab:
|
| 171 |
+
return process_tuple(inverted_vocab[value])
|
| 172 |
+
else:
|
| 173 |
+
print(f'value not found in inverted_vocab: {value}')
|
| 174 |
+
return None
|
| 175 |
+
|
| 176 |
+
def process_tuple(value_tuple):
|
| 177 |
+
# print(f'value_tuple: {value_tuple}')
|
| 178 |
+
# for vi in value_tuple:
|
| 179 |
+
# print(f'v: {vi}')
|
| 180 |
+
converted_values = []
|
| 181 |
+
for v in value_tuple:
|
| 182 |
+
result = convert_to_bytes(v)
|
| 183 |
+
if isinstance(result, tuple):
|
| 184 |
+
converted_values.extend(result)
|
| 185 |
+
else:
|
| 186 |
+
converted_values.append(result)
|
| 187 |
+
return tuple(converted_values)
|
| 188 |
+
|
| 189 |
+
decoder_map = {k: process_tuple(v) for k, v in inverted_vocab.items()}
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
|