Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import re | |
# Data section start-->
# Load the Telugu corpora from CSV and combine them into one cleaned string.
file_paths = [
    '/Users/anvesh/codebase/llm/data/telugu_books/telugu_books.csv',
    '/Users/anvesh/codebase/llm/data/telugu_news/1_telugu_news.csv',
    '/Users/anvesh/codebase/llm/data/telugu_news/2_telugu_news.csv'
]

# Combine data from all files; the books file stores text in a 'text'
# column, while the news files use a 'body' column.
telugu_texts = []
for file_path in file_paths:
    df = pd.read_csv(file_path)
    if 'text' in df.columns:
        telugu_texts.append(' '.join(df['text'].astype(str).tolist()))
    elif 'body' in df.columns:
        telugu_texts.append(' '.join(df['body'].astype(str).tolist()))

# Concatenate all texts, then strip English letters, digits, and quotes,
# followed by line breaks and non-breaking spaces, leaving Telugu script
# (plus remaining punctuation/whitespace).
telugu_text = ' '.join(telugu_texts)
telugu_text = re.sub(r'[A-Za-z0-9\'"]', '', telugu_text)
telugu_text = re.sub(r'[\r\n\xa0]', '', telugu_text)

print('telugu_text before utf-8 encoding:', telugu_text[:100])

# Word-level vocabulary size (whitespace-delimited tokens).
vocabulary_size = len(set(telugu_text.split()))
print('Original text size:', len(telugu_text))
print('Vocabulary size of telugu_text:', vocabulary_size)

# Character-level statistics.
unique_characters = set(telugu_text)
unique_count = len(unique_characters)
print('Unique character count in telugu_text:', unique_count)
# Data section end-->
# utf-8 encoding section start -->
import encode_parallel_telugu as encode_parallel
import time

# Reload the Telugu corpus via the parallel encoding helper module.
raw_tokens = encode_parallel.load_telugu_texts()

# Start the timer around the parallel utf-8 encoding step.
start_time = time.time()

# Encode the tokens in parallel and get concatenated results.
encoded_tokens = encode_parallel.encode_tokens_parallel(
    raw_tokens, chunk_size=1_000_000, max_workers=10
)
print('encoded_tokens:', encoded_tokens[:100])
print(len(encoded_tokens))

# Stop the timer and report encoding statistics.
end_time = time.time()
print(f"Time taken to encode and process tokens in parallel: {end_time - start_time:.4f} seconds")
print('length of encoded_text:', len(encoded_tokens))
print('unique characters in encoded_text:', set(encoded_tokens))
print('unique characters in encoded_text:', len(set(encoded_tokens)))
# utf-8 encoding section end -->
# BPE section start -->
#### **BPE implementation**
# Train BPE over the utf-8 encoded token stream produced above.
tokens = encoded_tokens
def get_stats(ids):
    """Count how often each adjacent (left, right) token pair occurs in ids."""
    counts = {}
    for left, right in zip(ids, ids[1:]):
        pair = (left, right)
        counts[pair] = counts.get(pair, 0) + 1
    return counts
def merge(ids, pair, idx):
    """Return a copy of ids with every non-overlapping `pair` collapsed to `idx`.

    Scans left to right; whenever ids[pos], ids[pos+1] matches the pair,
    emits the new token id and skips both elements.
    """
    merged = []
    pos = 0
    last = len(ids) - 1
    while pos < len(ids):
        at_pair = pos < last and ids[pos] == pair[0] and ids[pos + 1] == pair[1]
        if at_pair:
            merged.append(idx)
            pos += 2
        else:
            merged.append(ids[pos])
            pos += 1
    return merged
# ---
vocab_size = 500  # the desired final vocabulary size
num_merges = vocab_size - 256  # ids 0..255 are reserved for raw bytes; learn the rest

ids = list(tokens)  # copy so we don't destroy the original list
merges = {}  # (int, int) -> int

from tqdm import tqdm  # Import tqdm for progress bar

for i in tqdm(range(num_merges), desc="Merging tokens"):
    stats = get_stats(ids)
    # Guard: once fewer than two tokens remain there are no pairs left and
    # max() on an empty dict would raise ValueError — stop merging early.
    if not stats:
        break
    pair = max(stats, key=stats.get)  # most frequent adjacent pair
    idx = 256 + i  # next unused token id
    # print(f"merging {pair} into a new token {idx}")
    ids = merge(ids, pair, idx)
    merges[pair] = idx  # merge has a pair of tokens and the new token index

print("tokens length:", len(tokens))
print("ids length:", len(ids))
print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
print(f"token size: {len(set(tokens))}")
# print(ids)
# BPE section end -->
# Building the vocabulary section start -->
import json

# Telugu Unicode range (end-exclusive, so U+0C00 .. U+0C7E).
telugu_unicode_chars = [chr(code_point) for code_point in range(0x0C00, 0x0C7F)]

# Seed the vocabulary with the learned merge table: (pair) -> token id.
vocab = dict(merges)

# Add each utf-8 encoded Telugu character, mapped to its position index.
encoded_telugu_chars = [chr(code_point).encode('utf-8') for code_point in range(0x0C00, 0x0C7F)]
for position, encoded_char in enumerate(encoded_telugu_chars):
    if position < 256:  # Ensure we only add up to 256 characters
        vocab[encoded_char] = position

# Reserve fixed ids for the space and period characters.
vocab[b' '] = 255
vocab[b'.'] = 254

# saving the merges and vocab to a file (keys stringified for JSON)
with open('merges_vocab.json', 'w') as f:
    json.dump(
        {
            'merges': {str(k): v for k, v in merges.items()},
            'vocab': {str(k): v for k, v in vocab.items()},
        },
        f,
    )
# Building the vocabulary section end -->
# Reading the merges and vocab from a file section start -->
import ast
import json
from collections import defaultdict

# Read the merges and vocab data back from the JSON file.
with open('merges_vocab.json', 'r') as f:
    data = json.load(f)

# Collect the vocab entries as a list of single-item dicts.
distributed_data = defaultdict(list)
for key, value in data['vocab'].items():
    distributed_data['vocab'].append({key: value})

# Optionally, print the distributed data for verification.
print(distributed_data)

# Rebuild real keys from their stringified form. Keys were written with
# str(), so they are literals like "(97, 98)" (merge pairs) or "b'\xe0...'"
# (single encoded characters). ast.literal_eval parses these safely —
# unlike eval, it cannot execute arbitrary code from the file.
formatted_vocab = {}
for item in distributed_data['vocab']:
    for k, v in item.items():
        if ',' not in k:
            # Single-character entry: wrap in a 1-tuple for a uniform key shape.
            formatted_vocab[(ast.literal_eval(k),)] = v
        else:
            formatted_vocab[ast.literal_eval(k)] = v

# Dicts are not sliceable; show the first 50 entries instead.
print(list(formatted_vocab.items())[:50])

# inverting the vocab: token id -> tuple key
inverted_vocab = {v: k for k, v in formatted_vocab.items()}
# Reading the merges and vocab from a file section end -->
| # Expanding the vocab section start --> | |
def convert_to_bytes(value):
    """Resolve a token to raw bytes, recursively expanding via inverted_vocab.

    Bytes pass through unchanged; known token ids are expanded through
    process_tuple; unknown values are reported and yield None.
    """
    if isinstance(value, bytes):
        return value  # already raw bytes, nothing to expand
    if value in inverted_vocab:
        return process_tuple(inverted_vocab[value])
    print(f'value not found in inverted_vocab: {value}')
    return None
def process_tuple(value_tuple):
    """Expand every element of value_tuple down to bytes, flattening nested tuples."""
    expanded = []
    for element in value_tuple:
        converted = convert_to_bytes(element)
        if isinstance(converted, tuple):
            expanded.extend(converted)
        else:
            expanded.append(converted)
    return tuple(expanded)
| decoder_map = {k: process_tuple(v) for k, v in inverted_vocab.items()} | |