Spaces:

anveshplus
/

BPE-Tokenizer

Sleeping

File size: 2,915 Bytes

e4d5fc0

import time
import concurrent.futures
from tqdm import tqdm
import pandas as pd
import re

# Function to encode a chunk of tokens into UTF-8 and return as bytes
def encode_chunk(chunk):
    """Return the UTF-8 encoding of every token in ``chunk``.

    Args:
        chunk: Iterable whose items expose ``encode`` (for example ``str``
            objects). Iterating over a ``str`` yields one character per
            token, so each character is encoded separately.

    Returns:
        A list holding one ``bytes`` object per token, in input order.
    """
    encoded = []
    for piece in chunk:
        encoded.append(piece.encode('utf-8'))
    return encoded

# Main function to handle parallel encoding and return concatenated results
def encode_tokens_parallel(tokens, chunk_size=1_000_000, max_workers=10):
    """Encode ``tokens`` to UTF-8 using a pool of worker processes.

    ``tokens`` is cut into consecutive slices of at most ``chunk_size`` items.
    Each slice is handed to ``encode_chunk`` in a ``ProcessPoolExecutor`` and
    the per-slice results are flattened back into a single list. A tqdm
    progress bar advances once per finished slice.

    Args:
        tokens: Sliceable sequence to encode (a ``str``, in which case every
            character is one token, or a list of ``str``).
        chunk_size: Maximum number of items per slice. Must be at least 1.
        max_workers: Maximum number of worker processes in the pool.

    Returns:
        A flat list with one ``bytes`` object per item of ``tokens``, in the
        same order as the input.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A negative step would produce an empty range and silently drop every
    # token; a zero step fails inside range() with an unrelated message.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in submission order, so the flattened
        # output keeps the original token order.
        encoded_chunks = list(
            tqdm(
                executor.map(encode_chunk, chunks),
                total=len(chunks),
                desc="Processing Chunks",
            )
        )

    return [token for chunk in encoded_chunks for token in chunk]

def load_telugu_texts(file_paths=None):
    """Load CSV corpora and return one string of Telugu-only characters.

    For every CSV file the ``text`` column is used when present, otherwise
    the ``body`` column; files with neither column are skipped. All cell
    values are converted to ``str``, joined, and then every character that
    is not in the Unicode Telugu block (U+0C00-U+0C7F) and is not one of
    ``@``, ``#``, ``$``, ``%`` is removed. That filter also strips spaces,
    line breaks, non-breaking spaces, digits and Latin letters, so the
    result is one unbroken run of characters.

    Args:
        file_paths: Optional iterable of CSV paths to read. When ``None``,
            the default corpus locations listed below are used.

    Returns:
        The cleaned, concatenated text as a single ``str`` (empty when no
        file contributes any text).
    """
    if file_paths is None:
        file_paths = [
            '/Users/anvesh/codebase/llm/data/telugu_books/telugu_books.csv',
            '/Users/anvesh/codebase/llm/data/telugu_news/1_telugu_news.csv',
            '/Users/anvesh/codebase/llm/data/telugu_news/2_telugu_news.csv',
        ]

    # Combine data from all files
    telugu_texts = []
    for file_path in file_paths:
        df = pd.read_csv(file_path)
        # 'text' takes priority over 'body'; only the first match is used.
        for column in ('text', 'body'):
            if column in df.columns:
                telugu_texts.append(' '.join(df[column].astype(str).tolist()))
                break

    telugu_text = ' '.join(telugu_texts)

    # One pass is enough: '\r', '\n' and '\xa0' are outside the kept set, so
    # a separate substitution for them would never find anything to remove.
    # NOTE(review): the character class keeps '@', '#', '$' and '%'; if they
    # are meant to be stripped as well, drop them from the class.
    return re.sub(r'[^\u0C00-\u0C7F@#$%]', '', telugu_text)

# Main script
if __name__ == '__main__':
    # Build the cleaned corpus first; loading is not part of the timing below
    corpus = load_telugu_texts()

    # The timed region covers the parallel encoding and the two diagnostic
    # prints that follow it
    started = time.time()
    encoded = encode_tokens_parallel(corpus, chunk_size=1_000_000, max_workers=10)
    print(encoded[:100])
    print(len(encoded))
    elapsed = time.time() - started

    print(f"Time taken to encode and process tokens in parallel: {elapsed:.4f} seconds")
    print("Encoding and processing completed!")