Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from typing import List, Dict, Tuple | |
| import numpy as np | |
def get_stats(ids):
    """Count adjacent token pairs in *ids*.

    Returns a dict mapping each (left, right) pair of consecutive tokens
    to the number of times it occurs. Empty/one-element input yields {}.
    """
    pair_counts = {}
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        if key in pair_counts:
            pair_counts[key] += 1
        else:
            pair_counts[key] = 1
    return pair_counts
def merge(ids, pair, idx):
    """Replace every non-overlapping occurrence of *pair* in *ids* with *idx*.

    Scans left to right, so in a run like [1, 1, 1] with pair (1, 1) only
    the leftmost occurrence is merged. Returns a new list; *ids* is untouched.
    """
    out = []
    n = len(ids)
    pos = 0
    while pos < n:
        if pos + 1 < n and (ids[pos], ids[pos + 1]) == pair:
            out.append(idx)
            pos += 2  # consumed both members of the pair
        else:
            out.append(ids[pos])
            pos += 1
    return out
# Train a BPE merge table, by default from the preprocessed Telugu corpus.
def train_bpe(vocab_size: int = 350, text: str = None,
              path: str = 'telugu_preprocessed_file.txt'):
    """Learn byte-pair-encoding merges.

    Args:
        vocab_size: target vocabulary size; vocab_size - 256 merges are
            learned on top of the 256 raw byte values.
        text: training text. If None (the default, preserving the original
            behavior), the corpus is read from *path*.
        path: UTF-8 text file to read when *text* is not supplied.

    Returns:
        Dict mapping (first, second) token pairs to their merged token id,
        in the order the merges were learned (ids start at 256).

    Raises:
        FileNotFoundError: if *text* is None and *path* does not exist.
    """
    if text is None:
        # Read the preprocessed Telugu text from disk.
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()
    # Work directly on the raw UTF-8 byte stream; merged ids start at 256.
    ids = list(text.encode('utf-8'))
    num_merges = vocab_size - 256
    merges = {}
    for i in range(num_merges):
        stats = get_stats(ids)
        if not stats:  # no adjacent pairs left — nothing more to merge
            break
        # Greedily merge the currently most frequent pair.
        pair = max(stats, key=stats.get)
        idx = 256 + i
        print(f"merging {pair} into a new token {idx}")  # Optional: for monitoring training
        ids = merge(ids, pair, idx)
        merges[pair] = idx
    return merges
# Train the tokenizer at import time. NOTE: this reads
# 'telugu_preprocessed_file.txt' from the working directory and raises
# FileNotFoundError if the corpus is missing — likely the cause of the
# "Runtime error" status shown on the Space page.
merges = train_bpe()
class OptimizedBPETokenizer:
    """BPE tokenizer driven by a trained (pair -> merged-token-id) table."""

    def __init__(self, merges: Dict[Tuple[int, int], int]):
        """Build forward and reverse lookup structures from the merge table."""
        self.merges = merges
        # Reverse map: merged token id -> the pair it was built from.
        self.idx_to_pair = {idx: pair for pair, idx in merges.items()}
        # Lookup table: first token -> {second token -> merged id}
        # (kept for compatibility with code that may inspect it).
        self.merge_lookup = {}
        for (first, second), idx in merges.items():
            self.merge_lookup.setdefault(first, {})[second] = idx

    def encode(self, text: str, chunk_size: int = 1000000) -> List[int]:
        """Encode *text* into token ids by applying trained merges.

        Bug fix: the previous implementation always tried the globally most
        frequent pair and stopped as soon as that single pair was untrained,
        even when other trained merges were still applicable elsewhere in
        the sequence. We now pick, among the pairs actually present, the one
        with the LOWEST merge index — replaying merges in the order the
        vocabulary was built, which is the standard BPE encoding rule.

        Non-string input returns [] (defensive guard kept from the original).
        *chunk_size* is retained for interface compatibility; it is unused.
        """
        if not isinstance(text, str):
            return []
        ids = [int(b) for b in text.encode('utf-8')]
        while len(ids) >= 2:
            candidates = set(zip(ids, ids[1:]))
            # Earliest-trained applicable merge; inf means "not trained".
            pair = min(candidates, key=lambda p: self.merges.get(p, float('inf')))
            if pair not in self.merges:
                break  # no trained merge applies anywhere in the sequence
            ids = self._apply_merge(ids, pair, self.merges[pair])
        return ids

    def decode(self, ids: List[int]) -> str:
        """Decode token ids back into a UTF-8 string.

        Raises KeyError on an unknown merged id and UnicodeDecodeError if
        the expanded bytes are not valid UTF-8 (same as the original).
        """
        raw = []
        for token in ids:
            raw.extend(self._expand_token(token))
        return bytes(raw).decode('utf-8')

    def _expand_token(self, token: int) -> List[int]:
        """Recursively expand one token id into its underlying byte values."""
        if token < 256:
            return [token]  # raw byte — already terminal
        first, second = self.idx_to_pair[token]
        return self._expand_token(first) + self._expand_token(second)

    @staticmethod
    def _apply_merge(ids: List[int], pair: Tuple[int, int], idx: int) -> List[int]:
        """Replace non-overlapping occurrences of *pair* in *ids* with *idx*."""
        out = []
        i = 0
        while i < len(ids):
            if i + 1 < len(ids) and (ids[i], ids[i + 1]) == pair:
                out.append(idx)
                i += 2
            else:
                out.append(ids[i])
                i += 1
        return out
# Initialize the module-level tokenizer shared by the Gradio callbacks below.
tokenizer = OptimizedBPETokenizer(merges)
def encode_text(text: str) -> str:
    """Gradio callback: encode *text* and report the tokens and their count.

    Returns a human-readable string in every case — a prompt for empty
    input, the token list on success, or the error message on failure.
    """
    if not text:
        return "Please enter text to encode"
    try:
        encoded = tokenizer.encode(text)
    except Exception as e:
        return f"Encoding error: {str(e)}"
    return f"Encoded tokens: {encoded}\nToken count: {len(encoded)}"
def decode_tokens(text: str) -> str:
    """Gradio callback: parse comma-separated token ids and decode them.

    Accepts optional surrounding brackets (e.g. "[256, 257]"). Any parse or
    decode failure is reported as an error string rather than raised.
    """
    if not text:
        return "Please enter tokens to decode"
    try:
        token_ids = [int(piece) for piece in text.strip('[]').split(',')]
        return f"Decoded text: {tokenizer.decode(token_ids)}"
    except Exception as e:
        return f"Error: Please provide valid integers for decoding. Details: {str(e)}"
# Create the Gradio interface: two side-by-side panels (encode / decode)
# wired to the callbacks above, plus clickable examples for each.
with gr.Blocks(title="Telugu BPE Tokenizer") as iface:
    gr.Markdown("# Telugu BPE Tokenizer")
    gr.Markdown("A byte-pair encoding tokenizer trained on Telugu text.")
    with gr.Row():
        # Encoding Section: free text in, token ids out.
        with gr.Column():
            gr.Markdown("### Encode Text")
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter Telugu text to encode..."
            )
            encode_button = gr.Button("Encode")
            encode_output = gr.Textbox(label="Encoding Result")
        # Decoding Section: comma-separated token ids in, text out.
        with gr.Column():
            gr.Markdown("### Decode Tokens")
            input_tokens = gr.Textbox(
                label="Input Tokens",
                placeholder="Enter comma-separated tokens (e.g., 256,257,258)"
            )
            decode_button = gr.Button("Decode")
            decode_output = gr.Textbox(label="Decoding Result")
    # Set up the button click events.
    encode_button.click(
        fn=encode_text,
        inputs=input_text,
        outputs=encode_output
    )
    decode_button.click(
        fn=decode_tokens,
        inputs=input_tokens,
        outputs=decode_output
    )
    # Add examples. NOTE(review): the decoding example tokens 256-258 are
    # only valid if training produced at least 3 merges — verify at runtime.
    with gr.Row():
        with gr.Column():
            gr.Examples(
                examples=[
                    ["నమస్కారం"],
                    ["తెలుగు భాష"],
                ],
                inputs=input_text,
                outputs=encode_output,
                fn=encode_text,
                label="Encoding Examples"
            )
        with gr.Column():
            gr.Examples(
                examples=[
                    ["256,257,258"],  # Example tokens
                ],
                inputs=input_tokens,
                outputs=decode_output,
                fn=decode_tokens,
                label="Decoding Examples"
            )

# Launch the web UI only when run as a script (not on import).
if __name__ == "__main__":
    iface.launch()