Spaces:

santhoshv6
/

Kannada_BPE_Tokenizer

Sleeping

App Files Files Community

Santhosh V committed on Nov 16, 2025

Commit

82a1d74

1 Parent(s): 243e07f

initial push

Browse files

Files changed (4) hide show

README.md +13 -7
app.py +147 -0
kannada_bpe_final.pkl +3 -0
requirements.txt +1 -0

README.md CHANGED Viewed

@@ -1,14 +1,20 @@
 ---
 title: Kannada BPE Tokenizer
-emoji: 📚
-colorFrom: purple
-colorTo: purple
 sdk: gradio
-sdk_version: 5.49.1
 app_file: app.py
 pinned: false
-license: mit
-short_description: Kannada BPE Tokenizer
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Kannada BPE Tokenizer
+emoji: 🔤
+colorFrom: blue
+colorTo: green
 sdk: gradio
+sdk_version: 4.7.1
 app_file: app.py
 pinned: false
 ---
+# Kannada BPE Tokenizer
+From-scratch implementation of Byte-Pair Encoding for the Kannada language.
+- 6,000 token vocabulary
+- 9.301x compression ratio
+- 100% accurate encoding/decoding
+Try it out with Kannada text!

app.py ADDED Viewed

	@@ -0,0 +1,147 @@

+# app.py
+import gradio as gr
+import pickle
+# Load the trained tokenizer state (a pickled mapping) from the file next to this script.
+# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this path must
+# only ever point at the trusted artifact committed with the app, never at user-supplied data.
+with open('kannada_bpe_final.pkl', 'rb') as f:
+    tokenizer_data = pickle.load(f)
+class KannadaBPE:
+    def __init__(self, data):
+        self.vocab = data['vocab']
+        self.merges = data['merges']
+        self.pattern = data.get('pattern', r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\u0C80-\u0CFF]+| ?\w+| ?[0-9]+| ?[^\s\w]+|\s+(?!\S)|\s+""")
+        import re
+        self.compiled_pattern = re.compile(self.pattern)
+    def _get_stats(self, ids):
+        counts = {}
+        for pair in zip(ids, ids[1:]):
+            counts[pair] = counts.get(pair, 0) + 1
+        return counts
+    def _merge(self, ids, pair, new_id):
+        new_ids = []
+        i = 0
+        while i < len(ids):
+            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
+                new_ids.append(new_id)
+                i += 2
+            else:
+                new_ids.append(ids[i])
+                i += 1
+        return new_ids
+    def encode(self, text):
+        import re
+        text_chunks = re.findall(self.compiled_pattern, text)
+        all_tokens = []
+        for chunk in text_chunks:
+            tokens = list(chunk.encode('utf-8'))
+            while len(tokens) >= 2:
+                stats = self._get_stats(tokens)
+                pair = min(stats, key=lambda p: self.merges.get(p, float('inf')))
+                if pair not in self.merges:
+                    break
+                new_id = self.merges[pair]
+                tokens = self._merge(tokens, pair, new_id)
+            all_tokens.extend(tokens)
+        return all_tokens
+    def decode(self, ids):
+        tokens = b"".join([self.vocab[idx] for idx in ids])
+        return tokens.decode('utf-8', errors='replace')
+# Single module-level tokenizer instance; tokenize_text() below uses it for every request.
+tokenizer = KannadaBPE(tokenizer_data)
+def tokenize_text(input_text):
+    """Tokenize input and show results"""
+    if not input_text:
+        return "Please enter some text", "", "", ""
+    # Encode
+    tokens = tokenizer.encode(input_text)
+    # Decode
+    decoded = tokenizer.decode(tokens)
+    # Calculate compression
+    original_bytes = len(input_text.encode('utf-8'))
+    num_tokens = len(tokens)
+    compression_ratio = original_bytes / num_tokens if num_tokens > 0 else 0
+    # Format output
+    token_ids_str = str(tokens[:50]) + ("..." if len(tokens) > 50 else "")
+    stats = f"""
+**Statistics:**
+- Original bytes: {original_bytes:,}
+- Number of tokens: {num_tokens:,}
+- Compression ratio: {compression_ratio:.3f}x
+- Match: {'✅ Perfect' if input_text == decoded else '❌ Mismatch'}
+"""
+    return token_ids_str, decoded, stats, f"{num_tokens:,} tokens"
+# Examples
+# Sample inputs offered in the UI; each entry is a one-element list because the
+# examples widget feeds a single input component (the text box).
+examples = [
+    ["ನಮಸ್ಕಾರ, ಇದು ಕನ್ನಡ ಭಾಷೆಯ ಪರೀಕ್ಷೆ"],
+    ["ಕರ್ನಾಟಕ ರಾಜ್ಯದ ರಾಜಧಾನಿ ಬೆಂಗಳೂರು"],
+    ["Hello123 World456 Mixed ಕನ್ನಡ text"],
+    ["ಕನ್ನಡ ಸಂಖ್ಯೆಗಳು: ೧೨೩೪೫"],
+]
+# Gradio Interface
+# Layout, top to bottom: intro text, input/stats row, token-ID/decoded-text row,
+# clickable examples, model info, then the button wiring.
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🔤 Kannada BPE Tokenizer
+    **From-scratch Byte-Pair Encoding tokenizer for Kannada**
+    - Vocabulary: 6,000 tokens
+    - Compression: 9.301x average
+    - Training: 500k samples from CulturaX-Kn
+    Try tokenizing Kannada or mixed-language text below!
+    """)
+    with gr.Row():
+        # Left column: text entry and the trigger button.
+        with gr.Column():
+            input_text = gr.Textbox(
+                label="Input Text",
+                placeholder="Enter Kannada or mixed text...",
+                lines=5
+            )
+            tokenize_btn = gr.Button("🚀 Tokenize", variant="primary")
+        # Right column: read-only token count and the Markdown statistics panel.
+        with gr.Column():
+            token_count = gr.Textbox(label="Token Count", interactive=False)
+            stats_output = gr.Markdown(label="Statistics")
+    with gr.Row():
+        token_ids = gr.Textbox(label="Token IDs (first 50)", lines=3, interactive=False)
+        decoded_text = gr.Textbox(label="Decoded Text", lines=3, interactive=False)
+    gr.Examples(
+        examples=examples,
+        inputs=input_text,
+        label="Try these examples:"
+    )
+    # NOTE(review): both links below are "#" placeholders - fill in real URLs or remove.
+    gr.Markdown("""
+    ### 📊 Model Info
+    - **Training Time**: 502 minutes (~8.4 hours)
+    - **Dataset**: CulturaX-Kn (500K samples)
+    - **Algorithm**: Byte-Pair Encoding (BPE) from scratch
+    - **Language**: Kannada (Indian language)
+    ⭐ [GitHub Repository](#) | 📝 [Training Notebook](#)
+    """)
+    # The outputs list must stay in the same order as the 4-tuple returned by tokenize_text.
+    tokenize_btn.click(
+        fn=tokenize_text,
+        inputs=input_text,
+        outputs=[token_ids, decoded_text, stats_output, token_count]
+    )
+# Launch the app; this runs whenever the module is executed (there is no __main__ guard).
+demo.launch()

kannada_bpe_final.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:564c97b84ff5b221dc952a18f38a95356fd3dd2294b5c23a3f4d230a0a2b142c
+size 165687

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ gradio==4.7.1