Spaces:

ChaitraSaiK
/

Byte_pair_Encoder

Runtime error

App Files Files Community

ChaitraSaiK committed on Jan 11, 2025

Commit

b3797cd

1 Parent(s): 4984d4a

removed 350_file, third commit

Browse files

Files changed (2) hide show

app.py +152 -75
bpe_vocab_350_merges.pkl +0 -0

app.py CHANGED Viewed

@@ -1,8 +1,54 @@
 import gradio as gr
-import pickle
 from typing import List, Dict, Tuple
 import numpy as np
 class OptimizedBPETokenizer:
     def __init__(self, merges: Dict[Tuple[int, int], int]):
         self.merges = merges
@@ -19,98 +65,129 @@ class OptimizedBPETokenizer:
         if not isinstance(text, str):
             return []
-        ids = np.array(list(text.encode('utf-8')), dtype=np.uint16)
-        result = []
-        for i in range(0, len(ids), chunk_size):
-            chunk = ids[i:i + chunk_size]
-            processed_chunk = self._encode_chunk(chunk)
-            result.extend(processed_chunk)
-        return result
-    def _encode_chunk(self, ids: np.ndarray) -> List[int]:
-        output = []
-        i = 0
-        while i < len(ids):
-            if i < len(ids) - 1:
-                first, second = ids[i], ids[i + 1]
-                if first in self.merge_lookup and second in self.merge_lookup[first]:
-                    output.append(self.merge_lookup[first][second])
-                    i += 2
-                    continue
-            output.append(ids[i])
-            i += 1
-        return output
-    def decode(self, ids: List[int], chunk_size: int = 1000000) -> str:
-        byte_tokens = []
-        for i in range(0, len(ids), chunk_size):
-            chunk = ids[i:i + chunk_size]
-            decoded_chunk = self._decode_chunk(chunk)
-            byte_tokens.extend(decoded_chunk)
-        return bytes(byte_tokens).decode('utf-8')
-    def _decode_chunk(self, ids: List[int]) -> List[int]:
         result = []
         for token in ids:
             if token < 256:
                 result.append(token)
             else:
-                result.extend(self._expand_token(token))
-        return result
     def _expand_token(self, token: int) -> List[int]:
         if token < 256:
             return [token]
         pair = self.idx_to_pair[token]
-        expanded = []
-        for t in pair:
-            expanded.extend(self._expand_token(t))
-        return expanded
-# Load the pre-trained merges
-with open("bpe_vocab_350_merges.pkl", "rb") as f:
-    merges = pickle.load(f)
 tokenizer = OptimizedBPETokenizer(merges)
-def process_text(text: str, operation: str) -> str:
-    if operation == "Encode":
         tokens = tokenizer.encode(text)
         return f"Encoded tokens: {tokens}\nToken count: {len(tokens)}"
-    else:  # Decode
-        try:
-            # Convert string of numbers to list of integers
-            tokens = [int(x) for x in text.strip('[]').split(',')]
-            decoded_text = tokenizer.decode(tokens)
-            return f"Decoded text: {decoded_text}"
-        except:
-            return "Error: Please provide a valid list of integers for decoding"
-# Create the Gradio interface
-iface = gr.Interface(
-    fn=process_text,
-    inputs=[
-        gr.Textbox(label="Input Text", placeholder="Enter text to encode or tokens to decode..."),
-        gr.Radio(["Encode", "Decode"], label="Operation", value="Encode")
-    ],
-    outputs=gr.Textbox(label="Output"),
-    title="Telugu BPE Tokenizer",
-    description="A byte-pair encoding tokenizer trained on Telugu text. For encoding, enter Telugu text. For decoding, enter a list of integers (e.g., [256, 257, 258])."
-)
-# if __name__ == "__main__":
-#     # Test encoding
-#     test_text = "నమస్కారం"  # Telugu "Hello"
-#     encoded = tokenizer.encode(test_text)
-#     print(f"Test Encode: '{test_text}' -> {encoded}")
-#     # Test decoding
-#     decoded = tokenizer.decode(encoded)
-#     print(f"Test Decode: {encoded} -> '{decoded}'")
-    # Launch the interface
-iface.launch()

 import gradio as gr
 from typing import List, Dict, Tuple
 import numpy as np
+def get_stats(ids):
+    counts = {}
+    for pair in zip(ids, ids[1:]):
+        counts[pair] = counts.get(pair, 0) + 1
+    return counts
+def merge(ids, pair, idx):
+    newids = []
+    i = 0
+    while i < len(ids):
+        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
+            newids.append(idx)
+            i += 2
+        else:
+            newids.append(ids[i])
+            i += 1
+    return newids
+# Read the Telugu text file and train BPE
+def train_bpe(vocab_size: int = 350):
+    # Read the preprocessed Telugu text
+    with open('telugu_preprocessed_file.txt', 'r', encoding='utf-8') as f:
+        text = f.read()
+    # Convert initial text to bytes
+    tokens = list(text.encode('utf-8'))
+    # Train merges
+    num_merges = vocab_size - 256
+    ids = list(tokens)
+    merges = {}
+    for i in range(num_merges):
+        stats = get_stats(ids)
+        if not stats:  # If no more pairs to merge
+            break
+        pair = max(stats, key=stats.get)
+        idx = 256 + i
+        print(f"merging {pair} into a new token {idx}")  # Optional: for monitoring training
+        ids = merge(ids, pair, idx)
+        merges[pair] = idx
+    return merges
+# Train the tokenizer
+# This runs when the module is loaded: train_bpe() reads the corpus file and
+# prints one line per learned merge.
+merges = train_bpe()
 class OptimizedBPETokenizer:
     def __init__(self, merges: Dict[Tuple[int, int], int]):
         self.merges = merges
         if not isinstance(text, str):
             return []
+        # Convert to regular integers instead of numpy types
+        ids = [int(x) for x in text.encode('utf-8')]
+        # Apply merges
+        while True:
+            stats = get_stats(ids)
+            if not stats:
+                break
+            pair = max(stats, key=stats.get)
+            if pair not in self.merges:
+                break
+            ids = merge(ids, pair, self.merges[pair])
+        return ids
+    def decode(self, ids: List[int]) -> str:
         result = []
         for token in ids:
             if token < 256:
                 result.append(token)
             else:
+                # Expand merged tokens
+                pair = self.idx_to_pair[token]
+                result.extend(self._expand_token(pair[0]))
+                result.extend(self._expand_token(pair[1]))
+        return bytes(result).decode('utf-8')
     def _expand_token(self, token: int) -> List[int]:
         if token < 256:
             return [token]
         pair = self.idx_to_pair[token]
+        result = []
+        result.extend(self._expand_token(pair[0]))
+        result.extend(self._expand_token(pair[1]))
+        return result
+# Initialize tokenizer
+# Module-level instance used by the encode_text / decode_tokens callbacks.
 tokenizer = OptimizedBPETokenizer(merges)
+def encode_text(text: str) -> str:
+    """Function to handle encoding"""
+    if not text:
+        return "Please enter text to encode"
+    try:
         tokens = tokenizer.encode(text)
         return f"Encoded tokens: {tokens}\nToken count: {len(tokens)}"
+    except Exception as e:
+        return f"Encoding error: {str(e)}"
+def decode_tokens(text: str) -> str:
+    """Function to handle decoding"""
+    if not text:
+        return "Please enter tokens to decode"
+    try:
+        tokens = [int(x) for x in text.strip('[]').split(',')]
+        decoded_text = tokenizer.decode(tokens)
+        return f"Decoded text: {decoded_text}"
+    except Exception as e:
+        return f"Error: Please provide valid integers for decoding. Details: {str(e)}"
+# Create the Gradio interface
+# Layout: a title and description, one row with an encode column and a
+# decode column, then a second row holding example inputs for each column.
+with gr.Blocks(title="Telugu BPE Tokenizer") as iface:
+    gr.Markdown("# Telugu BPE Tokenizer")
+    gr.Markdown("A byte-pair encoding tokenizer trained on Telugu text.")
+    with gr.Row():
+        # Encoding Section
+        with gr.Column():
+            gr.Markdown("### Encode Text")
+            input_text = gr.Textbox(
+                label="Input Text",
+                placeholder="Enter Telugu text to encode..."
+            )
+            encode_button = gr.Button("Encode")
+            encode_output = gr.Textbox(label="Encoding Result")
+        # Decoding Section
+        with gr.Column():
+            gr.Markdown("### Decode Tokens")
+            input_tokens = gr.Textbox(
+                label="Input Tokens",
+                placeholder="Enter comma-separated tokens (e.g., 256,257,258)"
+            )
+            decode_button = gr.Button("Decode")
+            decode_output = gr.Textbox(label="Decoding Result")
+    # Set up the button click events
+    # Each button passes its column's input textbox to the matching handler
+    # and writes the returned string to that column's result textbox.
+    encode_button.click(
+        fn=encode_text,
+        inputs=input_text,
+        outputs=encode_output
+    )
+    decode_button.click(
+        fn=decode_tokens,
+        inputs=input_tokens,
+        outputs=decode_output
+    )
+    # Add examples
+    with gr.Row():
+        with gr.Column():
+            gr.Examples(
+                examples=[
+                    ["నమస్కారం"],
+                    ["తెలుగు భాష"],
+                ],
+                inputs=input_text,
+                outputs=encode_output,
+                fn=encode_text,
+                label="Encoding Examples"
+            )
+        with gr.Column():
+            # NOTE(review): which text these ids decode to depends on the
+            # merges learned at startup; they may not form valid UTF-8.
+            gr.Examples(
+                examples=[
+                    ["256,257,258"],  # Example tokens
+                ],
+                inputs=input_tokens,
+                outputs=decode_output,
+                fn=decode_tokens,
+                label="Decoding Examples"
+            )
+# Only start the web server when this file is run as a script.
+if __name__ == "__main__":
+    iface.launch()

bpe_vocab_350_merges.pkl DELETED Viewed

Binary file (984 Bytes)