Spaces:
Sleeping
Sleeping
File size: 6,275 Bytes
e4d5fc0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
import re
from collections import Counter

import pandas as pd
# Data section start-->
# Load the CSV files containing the Telugu text corpora.
file_paths = [
'/Users/anvesh/codebase/llm/data/telugu_books/telugu_books.csv',
'/Users/anvesh/codebase/llm/data/telugu_news/1_telugu_news.csv',
'/Users/anvesh/codebase/llm/data/telugu_news/2_telugu_news.csv'
]
# Combine data from all files; each file keeps its text in either a
# 'text' or a 'body' column.
telugu_texts = []
for file_path in file_paths:
    df = pd.read_csv(file_path)
    for column in ('text', 'body'):
        if column in df.columns:
            telugu_texts.append(' '.join(df[column].astype(str).tolist()))
            break
    else:
        # Previously a file with neither column was skipped silently.
        print(f"Warning: no 'text' or 'body' column in {file_path}; file skipped")
# Concatenate all texts, then strip English letters, digits, quotes,
# line breaks and non-breaking spaces so only Telugu script remains.
telugu_text = ' '.join(telugu_texts)
telugu_text = re.sub(r'[A-Za-z0-9\'"]', '', telugu_text)
telugu_text = re.sub(r'[\r\n\xa0]', '', telugu_text)
print('telugu_text before utf-8 encoding:', telugu_text[:100])  # typo "befores" fixed
# Whitespace-separated word vocabulary vs. unique character inventory.
vocabulary_size = len(set(telugu_text.split()))
print('Original text size:', len(telugu_text))
print('Vocabulary size of telugu_text:', vocabulary_size)
unique_characters = set(telugu_text)
unique_count = len(unique_characters)
print('Unique character count in telugu_text:', unique_count)
# Data section end-->
# utf-8 encoding section start -->
import encode_parallel_telugu as encode_parallel
import time
# Re-load the corpus via the project helper and UTF-8 encode it in
# parallel workers, timing the whole step.
tokens = encode_parallel.load_telugu_texts()
# perf_counter is monotonic and high-resolution — the right clock for
# measuring elapsed time (time.time can jump with system clock changes).
start_time = time.perf_counter()
# Encode the tokens in parallel and get concatenated results
encoded_tokens = encode_parallel.encode_tokens_parallel(tokens, chunk_size=1_000_000, max_workers=10)
end_time = time.perf_counter()
print('encoded_tokens:', encoded_tokens[:100])
print(f"Time taken to encode and process tokens in parallel: {end_time - start_time:.4f} seconds")
# Single length report (the original printed the length twice).
print('length of encoded_text:', len(encoded_tokens))
print('unique characters in encoded_text:', set(encoded_tokens))
print('unique characters in encoded_text:', len(set(encoded_tokens)))
# utf-8 encoding section end -->
# BPE section start -->
#### **BPE implementation**
# Train BPE on the UTF-8 byte-level token stream produced above.
tokens = encoded_tokens
def get_stats(ids):
    """Count occurrences of each adjacent token pair in *ids*.

    Args:
        ids: sequence of token ids (ints, or any hashable tokens).

    Returns:
        A Counter (dict subclass) mapping (id_i, id_{i+1}) -> frequency.
        Inputs with fewer than two tokens yield an empty mapping.
    """
    # Counter over the pairwise zip replaces the manual dict-increment loop.
    return Counter(zip(ids, ids[1:]))
def merge(ids, pair, idx):
    """Replace every occurrence of the adjacent token *pair* in *ids*
    with the single new token id *idx*, returning a new list.

    Matching is non-overlapping and left-to-right: after a replacement
    the scan resumes past both consumed tokens.
    """
    first, second = pair
    merged = []
    pos = 0
    limit = len(ids)
    while pos < limit:
        # A pair can only start before the final position.
        if pos + 1 < limit and ids[pos] == first and ids[pos + 1] == second:
            merged.append(idx)
            pos += 2  # consume both members of the pair
        else:
            merged.append(ids[pos])
            pos += 1
    return merged
# ---
vocab_size = 500  # the desired final vocabulary size
num_merges = vocab_size - 256  ## our unique tokens are 194, for our sample text.
ids = list(tokens)  # copy so we don't destroy the original list
merges = {}  # (int, int) -> int
from tqdm import tqdm  # Import tqdm for progress bar
for i in tqdm(range(num_merges), desc="Merging tokens"):
    stats = get_stats(ids)
    if not stats:
        # Fewer than two tokens remain, so there is nothing left to
        # merge; max() over an empty mapping would raise ValueError.
        break
    # Greedily merge the most frequent adjacent pair.
    pair = max(stats, key=stats.get)
    idx = 256 + i
    ids = merge(ids, pair, idx)
    merges[pair] = idx  # remember pair -> new token id (used by the decoder)
print("tokens length:", len(tokens))
print("ids length:", len(ids))
print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
print(f"token size: {len(set(tokens))}")
# BPE section end -->
# Building the vocabulary section start -->
# All 127 code points of the Telugu Unicode block (U+0C00..U+0C7E).
telugu_unicode_chars = [chr(i) for i in range(0x0C00, 0x0C7F)] # Telugu Unicode range
# Add these characters to the vocabulary
import json
# Seed the vocab from the learned merges: keys are (int, int) pairs,
# values are the merged token ids (>= 256).
vocab = {token: idx for token, idx in merges.items()}
# Add unique Telugu characters to the vocabulary, keyed by their UTF-8
# byte sequences. NOTE(review): the vocab now mixes tuple keys (merges)
# with bytes keys (base characters) — downstream readers must handle both.
for idx, char in enumerate([chr(i).encode('utf-8') for i in range(0x0C00, 0x0C7F)]):
    if idx < 256: # Ensure we only add up to 256 characters
        # The range yields 127 characters, so this guard is currently
        # always true; ids 0..126 are assigned in enumeration order.
        vocab[char] = idx # Map the character to its index
# Hard-coded ids for space and period. These don't collide with the
# 0..126 character ids above today, but NOTE(review): they would clash
# if the base-character range ever grew past 254 entries — confirm.
vocab[b' '] = 255
vocab[b'.'] = 254
# Save merges and vocab to a file. json cannot serialize tuple or bytes
# keys, so every key is stringified via str(); the reader section below
# parses these strings back into Python literals.
with open('merges_vocab.json', 'w') as f:
    json.dump({'merges': {str(k): v for k, v in merges.items()}, 'vocab': {str(k): v for k, v in vocab.items()}}, f)
# Building the vocabulary section end -->
# Reading the merges and vocab from a file section start -->
import ast
import json
from collections import defaultdict
# Read the merges and vocab data back from the JSON file written above.
with open('merges_vocab.json', 'r') as f:
    data = json.load(f)
# Group the deserialized entries: each vocab entry kept as its own
# single-item dict, mirroring the original "distributed" layout.
distributed_data = defaultdict(list)
for key, value in data['vocab'].items():
    distributed_data['vocab'].append({key: value})
# Optionally, print the distributed data for verification
print(distributed_data)
# Rebuild a single dict. JSON stringified the original keys
# (e.g. "b'\xe0...'" for bytes, "(257, 258)" for merge pairs), so parse
# each string back into a Python literal. ast.literal_eval replaces the
# original eval(): it accepts only literals, so a tampered vocab file
# cannot execute arbitrary code.
formatted_vocab = {}
for item in distributed_data['vocab']:
    for k, v in item.items():
        if ',' not in k:
            # Single (non-tuple) key: wrap in a 1-tuple, as before.
            formatted_vocab[(ast.literal_eval(k),)] = v
        else:
            formatted_vocab[ast.literal_eval(k)] = v
# Dicts are not sliceable — the original `formatted_vocab[:50]` raised
# TypeError. Show the first 50 items instead.
print(list(formatted_vocab.items())[:50])
# inverting the vocab: token id -> key (bytes 1-tuple or merge pair)
inverted_vocab = {v: k for k, v in formatted_vocab.items()}
# Reading the merges and vocab from a file section end -->
# Expanding the vocab section start -->
def convert_to_bytes(value):
    # Resolve a single token to raw bytes: already-bytes values pass
    # through unchanged; other values (token ids) are looked up in the
    # module-level `inverted_vocab` and expanded recursively via
    # process_tuple — a merged token maps to a pair of smaller token ids
    # that eventually bottom out at bytes.
    if isinstance(value, bytes):
        return value
    elif value in inverted_vocab:
        return process_tuple(inverted_vocab[value])
    else:
        # Unknown token id: report it and return None. NOTE(review):
        # the None ends up inside the caller's output tuple — confirm
        # downstream consumers tolerate that.
        print(f'value not found in inverted_vocab: {value}')
        return None
def process_tuple(value_tuple):
    # Expand every element of a vocab entry tuple to bytes, flattening
    # any nested tuples produced by recursive expansion into one level.
    converted_values = []
    for v in value_tuple:
        result = convert_to_bytes(v)
        if isinstance(result, tuple):
            converted_values.extend(result)
        else:
            converted_values.append(result)
    return tuple(converted_values)
# Fully expand every vocab entry once so decoding becomes a flat lookup:
# token id -> tuple of raw byte strings.
decoder_map = {k: process_tuple(v) for k, v in inverted_vocab.items()}
|