anveshplus committed on
Commit
e4d5fc0
·
1 Parent(s): d4b4f47
__pycache__/consecutive_tokens.cpython-312.pyc ADDED
Binary file (2.55 kB). View file
 
__pycache__/encoder_parallel_telugu.cpython-312.pyc ADDED
Binary file (3.25 kB). View file
 
__pycache__/tokenizer.cpython-312.pyc ADDED
Binary file (6.88 kB). View file
 
app.py CHANGED
@@ -1,8 +1,26 @@
1
  import streamlit as st
 
 
 
2
 
3
  def encode(text):
4
- # Placeholder for encoding logic
5
- return f"Encoded: {text}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  def decode(text):
8
  # Placeholder for decoding logic
 
1
  import streamlit as st
2
+ import encoder_parallel_telugu as encode_parallel
3
+ from consecutive_tokens import get_consecutive_tokens, search_consecutive_tokens
4
+ import tokenizer
5
 
def encode(text):
    """Encode *text* with the BPE vocabulary and return a display string.

    Args:
        text: Raw input string (expected to be Telugu characters).

    Returns:
        A human-readable "Encoded: ..." string with the token ids, or a
        prompt message when *text* is empty.
    """
    if text == "":
        return "Enter text to encode..."
    # Each character becomes its UTF-8 byte sequence.
    byte_tokens = [token.encode('utf-8') for token in text]
    # Candidate multi-token windows (longest-first) per position.
    # (fixed typo: was `consective_tokens`)
    consecutive_tokens = get_consecutive_tokens(byte_tokens, window_size=4)
    # Reading vocabulary from file
    formatted_vocab = tokenizer.read_vocab_from_file()
    # Invert vocabulary: id -> tuple of bytes
    inverted_vocab = {v: k for k, v in formatted_vocab.items()}
    # Expand merged ids recursively down to raw byte tuples
    decoder_map = tokenizer.expand_vocab(inverted_vocab)
    # Invert back again after expansion: byte tuple -> id
    re_inverted_vocab = {k: v for v, k in decoder_map.items()}

    # Greedy longest-match lookup of the candidate windows.
    # (removed leftover debug print of the result)
    encoded_ids = search_consecutive_tokens(consecutive_tokens, re_inverted_vocab)
    return f"Encoded: {encoded_ids}"
24
 
25
  def decode(text):
26
  # Placeholder for decoding logic
consecutive_tokens.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+
def get_consecutive_tokens(li, window_size=4):
    """For each start index of *li*, collect the windows of length
    1..window_size beginning there, ordered longest-first.

    Returns a dict {start_index: [windows longest-first]}, or [] for an
    empty input (note the differing return type — callers handle both).
    Windows near the end of the sequence are clipped, so shorter
    (possibly duplicated) tuples appear for trailing positions.
    """
    if not li:
        return []
    per_position = {}
    for start in range(len(li)):
        candidates = [tuple(li[start:start + size])
                      for size in range(1, window_size + 1)]
        # Longest window first so greedy matching prefers bigger merges.
        per_position[start] = candidates[::-1]
    return per_position
17
+
18
+
19
+
def search_consecutive_tokens(ordered_dict, encoded_token_dict):
    """Greedily encode candidate windows into token ids.

    For each position (in key order) the first candidate present in
    *encoded_token_dict* wins, and the scan jumps past the matched span.
    Positions where no candidate matches are skipped silently — their
    token is dropped from the output.
    """
    encoded = []
    keys = list(ordered_dict.keys())
    pos = 0
    while pos < len(keys):
        candidates = ordered_dict[keys[pos]]
        for candidate in candidates:
            if candidate in encoded_token_dict:
                encoded.append(encoded_token_dict[candidate])
                pos += len(candidate)  # jump past the matched span
                break
        else:
            pos += 1  # nothing matched here; drop this token
    return encoded
39
+
if __name__ == "__main__":
    # Smoke test with a small integer sequence; the Telugu sample text
    # is kept for reference but not exercised here.
    text = "తెలుగు భాష ఒక ద్రావిడ భాష."
    op_li = get_consecutive_tokens([1, 2, 3, 4, 5])
    print(op_li)

    # Renamed from `dict` — do not shadow the builtin.
    vocab_map = {(1, 2): 9, (3,): 10, (4, 5): 11}
    print(search_consecutive_tokens(op_li, vocab_map))

    # encoded_tokens = encode_tokens_parallel(text, chunk_size=1_000_000, max_workers=2)
    # encoded_tokens = [token.encode('utf-8') for token in text]
    # decoded_tokens = [i.decode('utf-8') for i in encoded_tokens]
    # print(get_consecutive_tokens(decoded_tokens))
encoder_parallel_telugu.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import concurrent.futures
3
+ from tqdm import tqdm
4
+ import pandas as pd
5
+ import re
6
+
# Function to encode a chunk of tokens into UTF-8 and return as bytes
def encode_chunk(chunk):
    """Return the UTF-8 byte encoding of every token in *chunk*."""
    encoded = []
    for token in chunk:
        encoded.append(token.encode('utf-8'))
    return encoded
11
+
# Main function to handle parallel encoding and return concatenated results
def encode_tokens_parallel(tokens, chunk_size=1_000_000, max_workers=10):
    """UTF-8 encode *tokens* across processes and return one flat list.

    The sequence is sliced into chunk_size-sized pieces; each chunk is
    encoded by a worker process and the results are concatenated in order.
    """
    # Slice the token sequence into fixed-size chunks.
    chunks = [tokens[start:start + chunk_size]
              for start in range(0, len(tokens), chunk_size)]

    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        # tqdm wraps the lazy map so chunk completion shows as progress.
        encoded_chunks = list(tqdm(executor.map(encode_chunk, chunks),
                                   total=len(chunks),
                                   desc="Processing Chunks"))

    # Flatten the per-chunk results into a single list of byte tokens.
    return [token for chunk in encoded_chunks for token in chunk]
29
+
def load_telugu_texts(file_paths=None):
    """Load and clean the Telugu corpus from CSV files.

    Args:
        file_paths: Optional list of CSV paths; defaults to the local
            books/news datasets (backward compatible with the old
            zero-argument call). Each CSV must have a 'text' or 'body'
            column; files with neither are silently skipped.

    Returns:
        One concatenated string containing only characters of the Telugu
        Unicode block (U+0C00-U+0C7F) plus @, #, $ and % — note the
        regex below deliberately KEEPS those four special characters
        (the old comment claiming they were removed was wrong).
    """
    if file_paths is None:
        file_paths = [
            '/Users/anvesh/codebase/llm/data/telugu_books/telugu_books.csv',
            '/Users/anvesh/codebase/llm/data/telugu_news/1_telugu_news.csv',
            '/Users/anvesh/codebase/llm/data/telugu_news/2_telugu_news.csv'
        ]

    # Combine data from all files
    telugu_texts = []
    for file_path in file_paths:
        df = pd.read_csv(file_path)
        if 'text' in df.columns:
            telugu_texts.append(' '.join(df['text'].astype(str).tolist()))
        elif 'body' in df.columns:
            telugu_texts.append(' '.join(df['body'].astype(str).tolist()))

    # Concatenate all texts, then keep only Telugu-block characters and @#$%.
    telugu_text = ' '.join(telugu_texts)
    telugu_text = re.sub(r'[^\u0C00-\u0C7F@#$%]', '', telugu_text)  # drop everything outside U+0C00-U+0C7F except @ # $ %
    telugu_text = re.sub(r'[\r\n\xa0]', '', telugu_text)  # Remove line breaks and non-breaking spaces
    return telugu_text
50
+
# Main script
if __name__ == '__main__':
    # Load the Telugu texts
    corpus = load_telugu_texts()

    # Time the parallel encoding pass.
    start_time = time.time()
    encoded_tokens = encode_tokens_parallel(corpus, chunk_size=1_000_000, max_workers=10)
    print(encoded_tokens[:100])
    print(len(encoded_tokens))
    end_time = time.time()

    # Calculate the time taken
    time_taken = end_time - start_time

    print(f"Time taken to encode and process tokens in parallel: {time_taken:.4f} seconds")
    print("Encoding and processing completed!")
merges_vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"merges": {"(b'\\xe0\\xb0\\xa8', b'\\xe0\\xb0\\xbf')": 256}, "vocab": {"(b'\\xe0\\xb0\\xa8', b'\\xe0\\xb0\\xbf')": 256, "b'\\xe0\\xb0\\x80'": 0, "b'\\xe0\\xb0\\x81'": 1, "b'\\xe0\\xb0\\x82'": 2, "b'\\xe0\\xb0\\x83'": 3, "b'\\xe0\\xb0\\x84'": 4, "b'\\xe0\\xb0\\x85'": 5, "b'\\xe0\\xb0\\x86'": 6, "b'\\xe0\\xb0\\x87'": 7, "b'\\xe0\\xb0\\x88'": 8, "b'\\xe0\\xb0\\x89'": 9, "b'\\xe0\\xb0\\x8a'": 10, "b'\\xe0\\xb0\\x8b'": 11, "b'\\xe0\\xb0\\x8c'": 12, "b'\\xe0\\xb0\\x8d'": 13, "b'\\xe0\\xb0\\x8e'": 14, "b'\\xe0\\xb0\\x8f'": 15, "b'\\xe0\\xb0\\x90'": 16, "b'\\xe0\\xb0\\x91'": 17, "b'\\xe0\\xb0\\x92'": 18, "b'\\xe0\\xb0\\x93'": 19, "b'\\xe0\\xb0\\x94'": 20, "b'\\xe0\\xb0\\x95'": 21, "b'\\xe0\\xb0\\x96'": 22, "b'\\xe0\\xb0\\x97'": 23, "b'\\xe0\\xb0\\x98'": 24, "b'\\xe0\\xb0\\x99'": 25, "b'\\xe0\\xb0\\x9a'": 26, "b'\\xe0\\xb0\\x9b'": 27, "b'\\xe0\\xb0\\x9c'": 28, "b'\\xe0\\xb0\\x9d'": 29, "b'\\xe0\\xb0\\x9e'": 30, "b'\\xe0\\xb0\\x9f'": 31, "b'\\xe0\\xb0\\xa0'": 32, "b'\\xe0\\xb0\\xa1'": 33, "b'\\xe0\\xb0\\xa2'": 34, "b'\\xe0\\xb0\\xa3'": 35, "b'\\xe0\\xb0\\xa4'": 36, "b'\\xe0\\xb0\\xa5'": 37, "b'\\xe0\\xb0\\xa6'": 38, "b'\\xe0\\xb0\\xa7'": 39, "b'\\xe0\\xb0\\xa8'": 40, "b'\\xe0\\xb0\\xa9'": 41, "b'\\xe0\\xb0\\xaa'": 42, "b'\\xe0\\xb0\\xab'": 43, "b'\\xe0\\xb0\\xac'": 44, "b'\\xe0\\xb0\\xad'": 45, "b'\\xe0\\xb0\\xae'": 46, "b'\\xe0\\xb0\\xaf'": 47, "b'\\xe0\\xb0\\xb0'": 48, "b'\\xe0\\xb0\\xb1'": 49, "b'\\xe0\\xb0\\xb2'": 50, "b'\\xe0\\xb0\\xb3'": 51, "b'\\xe0\\xb0\\xb4'": 52, "b'\\xe0\\xb0\\xb5'": 53, "b'\\xe0\\xb0\\xb6'": 54, "b'\\xe0\\xb0\\xb7'": 55, "b'\\xe0\\xb0\\xb8'": 56, "b'\\xe0\\xb0\\xb9'": 57, "b'\\xe0\\xb0\\xba'": 58, "b'\\xe0\\xb0\\xbb'": 59, "b'\\xe0\\xb0\\xbc'": 60, "b'\\xe0\\xb0\\xbd'": 61, "b'\\xe0\\xb0\\xbe'": 62, "b'\\xe0\\xb0\\xbf'": 63, "b'\\xe0\\xb1\\x80'": 64, "b'\\xe0\\xb1\\x81'": 65, "b'\\xe0\\xb1\\x82'": 66, "b'\\xe0\\xb1\\x83'": 67, "b'\\xe0\\xb1\\x84'": 68, "b'\\xe0\\xb1\\x85'": 69, "b'\\xe0\\xb1\\x86'": 70, "b'\\xe0\\xb1\\x87'": 71, 
"b'\\xe0\\xb1\\x88'": 72, "b'\\xe0\\xb1\\x89'": 73, "b'\\xe0\\xb1\\x8a'": 74, "b'\\xe0\\xb1\\x8b'": 75, "b'\\xe0\\xb1\\x8c'": 76, "b'\\xe0\\xb1\\x8d'": 77, "b'\\xe0\\xb1\\x8e'": 78, "b'\\xe0\\xb1\\x8f'": 79, "b'\\xe0\\xb1\\x90'": 80, "b'\\xe0\\xb1\\x91'": 81, "b'\\xe0\\xb1\\x92'": 82, "b'\\xe0\\xb1\\x93'": 83, "b'\\xe0\\xb1\\x94'": 84, "b'\\xe0\\xb1\\x95'": 85, "b'\\xe0\\xb1\\x96'": 86, "b'\\xe0\\xb1\\x97'": 87, "b'\\xe0\\xb1\\x98'": 88, "b'\\xe0\\xb1\\x99'": 89, "b'\\xe0\\xb1\\x9a'": 90, "b'\\xe0\\xb1\\x9b'": 91, "b'\\xe0\\xb1\\x9c'": 92, "b'\\xe0\\xb1\\x9d'": 93, "b'\\xe0\\xb1\\x9e'": 94, "b'\\xe0\\xb1\\x9f'": 95, "b'\\xe0\\xb1\\xa0'": 96, "b'\\xe0\\xb1\\xa1'": 97, "b'\\xe0\\xb1\\xa2'": 98, "b'\\xe0\\xb1\\xa3'": 99, "b'\\xe0\\xb1\\xa4'": 100, "b'\\xe0\\xb1\\xa5'": 101, "b'\\xe0\\xb1\\xa6'": 102, "b'\\xe0\\xb1\\xa7'": 103, "b'\\xe0\\xb1\\xa8'": 104, "b'\\xe0\\xb1\\xa9'": 105, "b'\\xe0\\xb1\\xaa'": 106, "b'\\xe0\\xb1\\xab'": 107, "b'\\xe0\\xb1\\xac'": 108, "b'\\xe0\\xb1\\xad'": 109, "b'\\xe0\\xb1\\xae'": 110, "b'\\xe0\\xb1\\xaf'": 111, "b'\\xe0\\xb1\\xb0'": 112, "b'\\xe0\\xb1\\xb1'": 113, "b'\\xe0\\xb1\\xb2'": 114, "b'\\xe0\\xb1\\xb3'": 115, "b'\\xe0\\xb1\\xb4'": 116, "b'\\xe0\\xb1\\xb5'": 117, "b'\\xe0\\xb1\\xb6'": 118, "b'\\xe0\\xb1\\xb7'": 119, "b'\\xe0\\xb1\\xb8'": 120, "b'\\xe0\\xb1\\xb9'": 121, "b'\\xe0\\xb1\\xba'": 122, "b'\\xe0\\xb1\\xbb'": 123, "b'\\xe0\\xb1\\xbc'": 124, "b'\\xe0\\xb1\\xbd'": 125, "b'\\xe0\\xb1\\xbe'": 126, "b' '": 255, "b'.'": 254}}
tokenizer.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ import encoder_parallel_telugu as encode_parallel
4
+ import time
5
+ import json
6
+ from collections import defaultdict
7
+ from tqdm import tqdm
8
+
def load_and_encode_tokens():
    """Load the Telugu corpus, UTF-8 encode it in parallel, print stats.

    Returns the flat list of byte tokens.
    """
    raw_text = encode_parallel.load_telugu_texts()
    started = time.time()
    encoded_tokens = encode_parallel.encode_tokens_parallel(raw_text, chunk_size=1_000_000, max_workers=10)
    print('encoded_tokens:', encoded_tokens[:100])
    print(len(encoded_tokens))
    finished = time.time()
    print(f"Time taken to encode and process tokens in parallel: {finished - started:.4f} seconds")
    print('length of encoded_text:', len(encoded_tokens))
    print('unique characters in decoded_text:', {token.decode('utf-8') for token in set(encoded_tokens)})
    # print('unique characters in encoded_text:', set(encoded_tokens))
    print('unique characters in encoded_text:', len(set(encoded_tokens)))
    return encoded_tokens
22
+
def get_stats(ids):
    """Return a dict mapping each adjacent (left, right) pair in *ids*
    to its number of occurrences."""
    pair_counts = {}
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        pair_counts[key] = pair_counts.get(key, 0) + 1
    return pair_counts
28
+
def merge(ids, pair, idx):
    """Replace every non-overlapping left-to-right occurrence of *pair*
    in *ids* with the single token *idx*."""
    merged = []
    pos = 0
    total = len(ids)
    while pos < total:
        if pos < total - 1 and ids[pos] == pair[0] and ids[pos + 1] == pair[1]:
            merged.append(idx)
            pos += 2  # consume both members of the pair
        else:
            merged.append(ids[pos])
            pos += 1
    return merged
40
+
def bpe_process(encoded_tokens, vocab_size=500, encoded_tokens_length=10_00_000):
    """Run byte-pair-encoding merges over a prefix of *encoded_tokens*.

    Args:
        encoded_tokens: Sequence of base tokens (bytes objects).
        vocab_size: Target vocabulary size; vocab_size - 256 merges run.
        encoded_tokens_length: Only the first this-many tokens are used.

    Returns:
        Dict mapping each merged (token, token) pair to its new id.
    """
    num_merges = vocab_size - 256  # our unique tokens are 194, for our sample text.
    encoded_tokens = encoded_tokens[:encoded_tokens_length]
    ids = list(encoded_tokens)  # copy so we don't destroy the original list
    merges = {}  # (int, int) -> int

    for i in tqdm(range(num_merges), desc="Merging tokens"):
        stats = get_stats(ids)
        if not stats:
            # Fewer than two tokens remain — max() on an empty dict would
            # raise ValueError, so stop merging early.
            break
        pair = max(stats, key=stats.get)
        idx = 256 + i
        ids = merge(ids, pair, idx)
        merges[pair] = idx  # merge has a pair of tokens and the new token index

    print("tokens length:", len(encoded_tokens))
    print("ids length:", len(ids))
    print("by paired tokens length:", len(set(ids)))
    print(f"compression ratio: {len(encoded_tokens) / len(ids):.2f}X")
    # print(f"token size: {len(set(encoded_tokens))}")

    return merges
61
+
def build_vocabulary(merges):
    """Build the base vocabulary (Telugu-block characters plus space and
    dot) merged with *merges*, and persist both to merges_vocab.json.

    Args:
        merges: Dict of BPE merges, (token, token) pair -> new id.
    """
    # (removed unused local `telugu_unicode_chars`)
    vocab = dict(merges)

    # Map each character of the Telugu Unicode block (U+0C00-U+0C7F),
    # UTF-8 encoded, to its offset within the block.
    for idx, char in enumerate([chr(i).encode('utf-8') for i in range(0x0C00, 0x0C7F)]):
        if idx < 256:  # Ensure we only add up to 256 characters
            vocab[char] = idx  # Map the character to its index

    vocab[b' '] = 255
    vocab[b'.'] = 254

    # bytes/tuple keys are not JSON-serializable, so stringify them.
    with open('merges_vocab.json', 'w') as f:
        json.dump({'merges': {str(k): v for k, v in merges.items()},
                   'vocab': {str(k): v for k, v in vocab.items()}}, f)
75
+
def read_vocab_from_file():
    """Load merges_vocab.json and return {tuple_of_bytes: id}.

    Keys were serialized with str(); they are parsed back here. Single
    bytes tokens are wrapped into 1-tuples so every key has a uniform
    tuple shape.

    Uses ast.literal_eval instead of eval: it is safe on untrusted file
    contents, and it also fixes the old ','-in-key heuristic, which
    misclassified a bytes literal that itself contained a comma.
    """
    import ast

    with open('merges_vocab.json', 'r') as f:
        data = json.load(f)

    formatted_vocab = {}
    for key, value in data['vocab'].items():
        parsed = ast.literal_eval(key)  # bytes or tuple of bytes
        if isinstance(parsed, tuple):
            formatted_vocab[parsed] = value
        else:
            formatted_vocab[(parsed,)] = value
    return formatted_vocab
94
+
def expand_vocab(inverted_vocab):
    """Expand each id's value down to a flat tuple of raw bytes.

    Values may reference other ids (merged tokens); those references are
    resolved recursively. Unresolvable ids are reported on stdout and
    become None entries in the result.
    """
    def resolve(value):
        # Raw bytes are already terminal.
        if isinstance(value, bytes):
            return value
        if value in inverted_vocab:
            return flatten(inverted_vocab[value])
        print(f'value not found in inverted_vocab: {value}')
        return None

    def flatten(value_tuple):
        # Resolve each element and splice nested tuples in place.
        pieces = []
        for element in value_tuple:
            resolved = resolve(element)
            if isinstance(resolved, tuple):
                pieces.extend(resolved)
            else:
                pieces.append(resolved)
        return tuple(pieces)

    decoder_map = {token_id: flatten(parts) for token_id, parts in inverted_vocab.items()}
    print("sample decoder map:", {k: decoder_map[k] for k in list(decoder_map)[:5]})
    return decoder_map
118
+
119
+ # # Main execution
120
+ # if __name__ == "__main__":
121
+ # # 1. Load and encode tokens
122
+ # encoded_tokens = load_and_encode_tokens()
123
+ # # 2. Process BPE
124
+ # merges = bpe_process(encoded_tokens,vocab_size=257, encoded_tokens_length=20_00_000)
125
+ # # 3. Build vocabulary
126
+ # build_vocabulary(merges)
127
+ # # 4. Read vocabulary from file
128
+ # formatted_vocab = read_vocab_from_file()
129
+ # # 5. Invert vocabulary
130
+ # inverted_vocab = {v: k for k, v in formatted_vocab.items()}
131
+ # # 6. Expand vocabulary
132
+ # decoder_map = expand_vocab(inverted_vocab)
133
+ # # 7. Invert back again
134
+ # re_inverted_vocab = {k: v for v, k in decoder_map.items()}
135
+ # print(re_inverted_vocab)
tokenizer_backup.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import pandas as pd
import re

# Data section start-->
# Load the CSV files (hard-coded local corpus paths).
file_paths = [
    '/Users/anvesh/codebase/llm/data/telugu_books/telugu_books.csv',
    '/Users/anvesh/codebase/llm/data/telugu_news/1_telugu_news.csv',
    '/Users/anvesh/codebase/llm/data/telugu_news/2_telugu_news.csv'
]

# Combine data from all files; each CSV holds its text in either a
# 'text' or a 'body' column.
telugu_texts = []
for file_path in file_paths:
    df = pd.read_csv(file_path)
    if 'text' in df.columns:
        telugu_texts.append(' '.join(df['text'].astype(str).tolist()))
    elif 'body' in df.columns:
        telugu_texts.append(' '.join(df['body'].astype(str).tolist()))

# Concatenate all texts and remove all English, numerical values, and quotes
telugu_text = ' '.join(telugu_texts)
telugu_text = re.sub(r'[A-Za-z0-9\'"]', '', telugu_text)  # Remove English letters, numbers, and quotes
telugu_text = re.sub(r'[\r\n\xa0]', '', telugu_text)  # Remove line breaks and non-breaking spaces

# (fixed typo in message: was "befores")
print('telugu_text before utf-8 encoding:', telugu_text[:100])

# Whitespace-split (word-level) vocabulary size, pre-BPE.
vocabulary_size = len(set(telugu_text.split()))
print('Original text size:', len(telugu_text))
print('Vocabulary size of telugu_text:', vocabulary_size)

unique_characters = set(telugu_text)
unique_count = len(unique_characters)
print('Original text size:', len(telugu_text))
print('Unique character count in telugu_text:', unique_count)

# Data section end-->
38
+
# utf-8 encoding section start -->
# BUG FIX: the original imported `encode_parallel_telugu`, but the module
# added in this commit is `encoder_parallel_telugu` — the old name raised
# ImportError.
import encoder_parallel_telugu as encode_parallel
import time

tokens = encode_parallel.load_telugu_texts()
# Start the timer
start_time = time.time()
# Encode the tokens in parallel and get concatenated results
encoded_tokens = encode_parallel.encode_tokens_parallel(tokens, chunk_size=1_000_000, max_workers=10)
print('encoded_tokens:', encoded_tokens[:100])
print(len(encoded_tokens))
# End the timer
end_time = time.time()
print(f"Time taken to encode and process tokens in parallel: {end_time - start_time:.4f} seconds")

print('length of encoded_text:', len(encoded_tokens))
print('unique characters in encoded_text:', set(encoded_tokens))
print('unique characters in encoded_text:', len(set(encoded_tokens)))
# utf-8 encoding section end -->
58
+
# BPE section start -->
#### **BPE implementation**

tokens = encoded_tokens

def get_stats(ids):
    """Count how often each adjacent (left, right) pair occurs in *ids*."""
    pair_counts = {}
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        pair_counts[key] = pair_counts.get(key, 0) + 1
    return pair_counts
69
+
def merge(ids, pair, idx):
    """Replace each non-overlapping left-to-right occurrence of *pair*
    in *ids* with the single token *idx*."""
    merged = []
    pos = 0
    total = len(ids)
    while pos < total:
        if pos < total - 1 and ids[pos] == pair[0] and ids[pos + 1] == pair[1]:
            merged.append(idx)
            pos += 2  # consume both members of the pair
        else:
            merged.append(ids[pos])
            pos += 1
    return merged
81
+
# ---
vocab_size = 500  # the desired final vocabulary size
num_merges = vocab_size - 256  ## our unique tokens are 194, for our sample text.
ids = list(tokens)  # copy so we don't destroy the original list

merges = {}  # (int, int) -> int
from tqdm import tqdm  # Import tqdm for progress bar

# Repeatedly merge the most frequent adjacent pair into a fresh token id.
for merge_round in tqdm(range(num_merges), desc="Merging tokens"):
    pair_counts = get_stats(ids)
    best_pair = max(pair_counts, key=pair_counts.get)
    new_id = 256 + merge_round
    # print(f"merging {best_pair} into a new token {new_id}")
    ids = merge(ids, best_pair, new_id)
    merges[best_pair] = new_id  # merge has a pair of tokens and the new token index

print("tokens length:", len(tokens))
print("ids length:", len(ids))
print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
print(f"token size: {len(set(tokens))}")

# print(ids)
# BPE section end -->
105
+
# Building the vocabulary section start -->
telugu_unicode_chars = [chr(i) for i in range(0x0C00, 0x0C7F)]  # Telugu Unicode range

# Add these characters to the vocabulary
import json
vocab = dict(merges)
# Map each Telugu-block character, UTF-8 encoded, to its block offset.
for offset, encoded_char in enumerate(chr(cp).encode('utf-8') for cp in range(0x0C00, 0x0C7F)):
    if offset < 256:  # Ensure we only add up to 256 characters
        vocab[encoded_char] = offset  # Map the character to its index

vocab[b' '] = 255
vocab[b'.'] = 254
# Save merges and vocab to a file
# with open('merges_vocab.json', 'w') as f:
#     json.dump({'merges': merges, 'vocab': vocab}, f)

# saving the merges and vocab to a file — keys are stringified because
# bytes/tuples are not JSON-serializable
with open('merges_vocab.json', 'w') as f:
    json.dump({'merges': {str(k): v for k, v in merges.items()},
               'vocab': {str(k): v for k, v in vocab.items()}}, f)

# Building the vocabulary section end -->
128
+
129
+
# Reading the merges and vocab from a file section start -->
import json
from collections import defaultdict

# Read the merges and vocab data from the JSON file
with open('merges_vocab.json', 'r') as f:
    data = json.load(f)

# Create a defaultdict to store the data in a distributed manner
distributed_data = defaultdict(list)

# Distribute the merges and vocab data
# for key, value in data['merges'].items():
#     distributed_data['merges'].append({key: value})

for key, value in data['vocab'].items():
    distributed_data['vocab'].append({key: value})

# Optionally, print the distributed data for verification
print(distributed_data)
# Convert the list of dictionaries to a single dictionary
formatted_vocab = {}
for item in distributed_data['vocab']:
    for k, v in item.items():
        # NOTE(review): eval on file contents is unsafe if the JSON could
        # be tampered with; the file is generated locally by this script.
        if ',' not in k:
            formatted_vocab[(eval(k),)] = v
        else:
            formatted_vocab[eval(k)] = v
# BUG FIX: dicts are not sliceable — `formatted_vocab[:50]` raised
# TypeError; show the first 50 entries instead.
print(list(formatted_vocab.items())[:50])
# inverting the vocab (removed the two no-op bare expression statements)
inverted_vocab = {v: k for k, v in formatted_vocab.items()}

# Reading the merges and vocab from a file section end -->
165
+
# Expanding the vocab section start -->
def convert_to_bytes(value):
    """Resolve *value* to bytes, recursing through merged-token ids.

    Unresolvable ids are reported on stdout and yield None.
    """
    if isinstance(value, bytes):
        return value
    if value in inverted_vocab:
        return process_tuple(inverted_vocab[value])
    print(f'value not found in inverted_vocab: {value}')
    return None

def process_tuple(value_tuple):
    """Flatten a tuple of ids/bytes into a tuple of raw bytes."""
    flattened = []
    for element in value_tuple:
        resolved = convert_to_bytes(element)
        if isinstance(resolved, tuple):
            flattened.extend(resolved)
        else:
            flattened.append(resolved)
    return tuple(flattened)

decoder_map = {token_id: process_tuple(parts) for token_id, parts in inverted_vocab.items()}