File size: 6,275 Bytes
e4d5fc0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import pandas as pd
import re

# Data section  start--> 
# Source CSVs for the Telugu corpus (books + two news dumps).
file_paths = [
    '/Users/anvesh/codebase/llm/data/telugu_books/telugu_books.csv',
    '/Users/anvesh/codebase/llm/data/telugu_news/1_telugu_news.csv',
    '/Users/anvesh/codebase/llm/data/telugu_news/2_telugu_news.csv'
]

# Pull the text column out of every file: 'text' is preferred, 'body' is
# the fallback; a file with neither column contributes nothing.
telugu_texts = []
for file_path in file_paths:
    df = pd.read_csv(file_path)
    for column in ('text', 'body'):
        if column in df.columns:
            telugu_texts.append(' '.join(df[column].astype(str).tolist()))
            break

# Concatenate all texts and remove all English, numerical values, and quotes,
# then strip line breaks and non-breaking spaces (order matches the original
# two-pass cleanup).
telugu_text = ' '.join(telugu_texts)
for pattern in (r'[A-Za-z0-9\'"]', r'[\r\n\xa0]'):
    telugu_text = re.sub(pattern, '', telugu_text)

print('telugu_text befores utf-8 encoding:', telugu_text[:100])

# Whitespace-delimited "word" vocabulary size of the cleaned corpus.
vocabulary_size = len(set(telugu_text.split()))
print('Original text size:', len(telugu_text))
print('Vocabulary size of telugu_text:', vocabulary_size)

# Distinct character inventory of the cleaned corpus.
unique_characters = set(telugu_text)
unique_count = len(unique_characters)
print('Original text size:', len(telugu_text))
print('Unique character count in telugu_text:', unique_count)

# Data section  end--> 

# utf-8 encoding section start -->
import encode_parallel_telugu as encode_parallel
import time

# Load the raw Telugu text through the helper module.
tokens = encode_parallel.load_telugu_texts()

# Time the parallel UTF-8 encoding pass over the corpus.
start_time = time.time()
encoded_tokens = encode_parallel.encode_tokens_parallel(
    tokens, chunk_size=1_000_000, max_workers=10
)
print('encoded_tokens:', encoded_tokens[:100])
print(len(encoded_tokens))
end_time = time.time()
print(f"Time taken to encode and process tokens in parallel: {end_time - start_time:.4f} seconds")

print('length of encoded_text:', len(encoded_tokens))
# Compute the distinct byte values once and reuse for both prints.
byte_values = set(encoded_tokens)
print('unique characters in encoded_text:', byte_values)
print('unique characters in encoded_text:', len(byte_values))
# utf-8 encoding section end -->

# BPE section start -->
#### **BPE implementation**

# Alias: the byte-level token stream produced by the encoding section above.
tokens = encoded_tokens

def get_stats(ids):
    """Count occurrences of each adjacent token pair in *ids*.

    Returns a dict mapping (left, right) pairs to their frequency;
    an input shorter than two tokens yields an empty dict.
    """
    pair_counts = {}
    for left, right in zip(ids, ids[1:]):
        pair_counts[(left, right)] = pair_counts.get((left, right), 0) + 1
    return pair_counts

def merge(ids, pair, idx):
    """Replace every non-overlapping occurrence of *pair* in *ids* with *idx*.

    Scans left to right, so overlapping matches resolve greedily.
    Returns a new list; *ids* is left untouched.
    """
    first, second = pair
    merged = []
    position = 0
    length = len(ids)
    while position < length:
        matched = (
            position + 1 < length
            and ids[position] == first
            and ids[position + 1] == second
        )
        if matched:
            merged.append(idx)
            position += 2
        else:
            merged.append(ids[position])
            position += 1
    return merged

# ---
vocab_size = 500 # the desired final vocabulary size
num_merges = vocab_size - 256 ## our unique tokens are 194, for our sample text.
ids = list(tokens) # copy so we don't destroy the original list

merges = {} # (int, int) -> int
from tqdm import tqdm  # Import tqdm for progress bar

# Classic BPE training: repeatedly fuse the most frequent adjacent pair
# into a fresh token id, recording each fusion for later decoding.
for merge_step in tqdm(range(num_merges), desc="Merging tokens"):
    pair_counts = get_stats(ids)
    best_pair = max(pair_counts, key=pair_counts.get)
    new_token = 256 + merge_step
    ids = merge(ids, best_pair, new_token)
    merges[best_pair] = new_token  # pair of tokens -> its replacement id

print("tokens length:", len(tokens))
print("ids length:", len(ids))
print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
print(f"token size: {len(set(tokens))}")

# BPE section end -->

# Building the vocabulary section start -->
telugu_unicode_chars = [chr(i) for i in range(0x0C00, 0x0C7F)]  # Telugu Unicode range

import json

# Seed the vocabulary with the learned merges ((int, int) pair -> token id).
vocab = dict(merges)

# Map each UTF-8-encoded Telugu character (as bytes) to a small index.
for idx, encoded_char in enumerate(chr(i).encode('utf-8') for i in range(0x0C00, 0x0C7F)):
    if idx < 256:  # Ensure we only add up to 256 characters
        vocab[encoded_char] = idx

# Reserved single-byte entries for space and period.
vocab[b' '] = 255
vocab[b'.'] = 254

# saving the merges and vocab to a file.  Keys are stringified because
# JSON cannot serialize tuple or bytes keys directly.
with open('merges_vocab.json', 'w') as f:
    json.dump(
        {
            'merges': {str(k): v for k, v in merges.items()},
            'vocab': {str(k): v for k, v in vocab.items()},
        },
        f,
    )

# Building the vocabulary section end -->


# Reading the merges and vocab from a file section start -->
import ast
import itertools
import json
from collections import defaultdict

# Read the merges and vocab data from the JSON file
with open('merges_vocab.json', 'r') as f:
    data = json.load(f)

# Re-group the flat JSON vocab dict into a list of single-entry dicts
# (kept for compatibility with the original layout of distributed_data).
distributed_data = defaultdict(list)
for key, value in data['vocab'].items():
    distributed_data['vocab'].append({key: value})

# Optionally, print the distributed data for verification
print(distributed_data)

# Rebuild real keys from their stringified form.  Keys were produced by
# str() on the original vocab keys, so each is either a bytes literal
# ("b'...'") or a tuple ("(a, b)").  ast.literal_eval replaces eval():
# it only parses Python literals, so malformed or hostile file content
# cannot execute code.  Type-checking the parsed value (instead of the
# old "',' in k" heuristic) also handles a bytes key whose repr happens
# to contain a comma.  Non-tuple keys are wrapped in a 1-tuple so every
# key of formatted_vocab is uniformly a tuple.
formatted_vocab = {}
for item in distributed_data['vocab']:
    for k, v in item.items():
        parsed_key = ast.literal_eval(k)
        if isinstance(parsed_key, tuple):
            formatted_vocab[parsed_key] = v
        else:
            formatted_vocab[(parsed_key,)] = v

# BUG FIX: dicts do not support slicing — formatted_vocab[:50] raised
# "TypeError: unhashable type: 'slice'".  Show the first 50 entries instead.
print(dict(itertools.islice(formatted_vocab.items(), 50)))

# inverting the vocab: token id -> tuple key
inverted_vocab = {v: k for k, v in formatted_vocab.items()}

# Reading the merges and vocab from a file section end -->

# Expanding the vocab section start -->
def convert_to_bytes(value):
    """Resolve *value* to raw bytes, expanding token ids recursively.

    Bytes pass through unchanged.  A token id present in the module-level
    inverted_vocab is expanded (mutually recursively) via process_tuple.
    Anything else is reported and mapped to None.
    """
    if isinstance(value, bytes):
        return value
    if value in inverted_vocab:
        return process_tuple(inverted_vocab[value])
    print(f'value not found in inverted_vocab: {value}')
    return None

def process_tuple(value_tuple):
    """Flatten *value_tuple* into a tuple of resolved byte strings.

    Each element is resolved with convert_to_bytes; tuples coming back
    from recursive expansion are spliced in flat rather than nested.
    """
    flattened = []
    for element in value_tuple:
        expanded = convert_to_bytes(element)
        if isinstance(expanded, tuple):
            flattened.extend(expanded)
        else:
            flattened.append(expanded)
    return tuple(flattened)

# Fully expand every vocab entry up front: token id -> flat tuple of byte strings.
decoder_map = {k: process_tuple(v) for k, v in inverted_vocab.items()}