Santhosh V committed on
Commit
82a1d74
·
1 Parent(s): 243e07f

initial push

Browse files
Files changed (4) hide show
  1. README.md +13 -7
  2. app.py +147 -0
  3. kannada_bpe_final.pkl +3 -0
  4. requirements.txt +1 -0
README.md CHANGED
@@ -1,14 +1,20 @@
1
  ---
2
  title: Kannada BPE Tokenizer
3
- emoji: 📚
4
- colorFrom: purple
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
10
- license: mit
11
- short_description: Kannada BPE Tokenizer
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Kannada BPE Tokenizer
3
+ emoji: 🔤
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 4.7.1
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
+ # Kannada BPE Tokenizer
13
+
14
+ From-scratch implementation of Byte-Pair Encoding for the Kannada language.
15
+
16
+ - 6,000 token vocabulary
17
+ - 9.301x compression ratio
18
+ - 100% accurate encoding/decoding
19
+
20
+ Try it out with Kannada text!
app.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import gradio as gr
3
+ import pickle
4
+
5
# Load tokenizer
# The pickle is read once at import time; the resulting object is later
# indexed with the keys 'vocab' and 'merges' (and optionally 'pattern').
# NOTE(security): pickle.load can execute arbitrary code embedded in the
# file. Only load a pickle that ships with this app and is trusted.
with open('kannada_bpe_final.pkl', 'rb') as f:
    tokenizer_data = pickle.load(f)
8
+
9
class KannadaBPE:
    """Byte-level BPE tokenizer rebuilt from a saved training state.

    The ``data`` mapping passed to the constructor must provide:

    - ``'vocab'``: mapping of token id -> ``bytes`` for that token
      (the values are concatenated with ``b"".join`` in ``decode``).
    - ``'merges'``: mapping of ``(id, id)`` pair -> merged token id.
    - ``'pattern'`` (optional): regex source used to pre-split text into
      chunks; a default covering Kannada, word, digit, punctuation and
      whitespace runs is used when the key is absent.
    """

    def __init__(self, data):
        """Store the vocab/merge tables and compile the split pattern once."""
        # Kept local so the class does not rely on a module-level `import re`.
        import re

        self.vocab = data['vocab']
        self.merges = data['merges']
        self.pattern = data.get('pattern', r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\u0C80-\u0CFF]+| ?\w+| ?[0-9]+| ?[^\s\w]+|\s+(?!\S)|\s+""")
        self.compiled_pattern = re.compile(self.pattern)

    def _get_stats(self, ids):
        """Return a dict mapping each adjacent id pair in *ids* to its count."""
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def _merge(self, ids, pair, new_id):
        """Return a copy of *ids* with every occurrence of *pair* replaced.

        Scans left to right; a matched pair consumes both ids, so
        overlapping occurrences are not merged twice.
        """
        new_ids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                new_ids.append(new_id)
                i += 2
            else:
                new_ids.append(ids[i])
                i += 1
        return new_ids

    def encode(self, text):
        """Encode *text* into a list of token ids.

        The text is split into chunks with the compiled pattern; each chunk
        is turned into its UTF-8 bytes and merged independently, so merges
        never cross a chunk boundary.
        """
        merges = self.merges
        all_tokens = []
        for chunk in self.compiled_pattern.findall(text):
            tokens = list(chunk.encode('utf-8'))
            while len(tokens) >= 2:
                stats = self._get_stats(tokens)
                # Pick the present pair with the smallest merged id; pairs
                # without a merge rule rank last (infinity).
                # NOTE(review): presumably ids were assigned in training
                # order, making the lowest id the earliest merge - confirm.
                pair = min(stats, key=lambda p: merges.get(p, float('inf')))
                if pair not in merges:
                    break  # nothing left in this chunk can be merged
                tokens = self._merge(tokens, pair, merges[pair])
            all_tokens.extend(tokens)
        return all_tokens

    def decode(self, ids):
        """Decode token ids back to text.

        Looks up each id in ``self.vocab`` (an unknown id raises
        ``KeyError``), joins the bytes, and decodes as UTF-8; invalid byte
        sequences are replaced rather than raising.
        """
        tokens = b"".join([self.vocab[idx] for idx in ids])
        return tokens.decode('utf-8', errors='replace')
54
+
55
# Single module-level tokenizer instance, built from the unpickled data and
# used by tokenize_text for every request.
tokenizer = KannadaBPE(tokenizer_data)
56
+
57
def tokenize_text(input_text):
    """Tokenize input and show results.

    Encodes *input_text* with the module-level ``tokenizer``, decodes the
    ids again to check the round trip, and formats the results for display.

    Returns a 4-tuple of strings, in this order:
      1. the first 50 token ids (followed by "..." when there are more),
      2. the decoded text,
      3. a Markdown statistics block (byte count, token count,
         bytes-per-token ratio, round-trip match),
      4. the token count label.

    For empty or missing input a prompt message is returned in the first
    slot and the other three are empty strings.
    """
    if not input_text:
        return "Please enter some text", "", "", ""

    # Encode, then decode again so the round trip can be verified.
    tokens = tokenizer.encode(input_text)
    decoded = tokenizer.decode(tokens)

    # Compression is measured as UTF-8 bytes per token.
    original_bytes = len(input_text.encode('utf-8'))
    num_tokens = len(tokens)
    compression_ratio = original_bytes / num_tokens if num_tokens > 0 else 0

    # Only the first 50 ids are shown to keep the output readable.
    token_ids_str = str(tokens[:50]) + ("..." if num_tokens > 50 else "")

    match_label = '✅ Perfect' if input_text == decoded else '❌ Mismatch'
    # Built line by line so the Markdown carries no accidental indentation.
    stats = (
        "\n"
        "**Statistics:**\n"
        f"- Original bytes: {original_bytes:,}\n"
        f"- Number of tokens: {num_tokens:,}\n"
        f"- Compression ratio: {compression_ratio:.3f}x\n"
        f"- Match: {match_label}\n"
    )

    return token_ids_str, decoded, stats, f"{num_tokens:,} tokens"
85
+
86
# Examples
# Each entry is a one-element list: the value for the single input textbox.
examples = [
    ["ನಮಸ್ಕಾರ, ಇದು ಕನ್ನಡ ಭಾಷೆಯ ಪರೀಕ್ಷೆ"],
    ["ಕರ್ನಾಟಕ ರಾಜ್ಯದ ರಾಜಧಾನಿ ಬೆಂಗಳೂರು"],
    ["Hello123 World456 Mixed ಕನ್ನಡ text"],
    ["ಕನ್ನಡ ಸಂಖ್ಯೆಗಳು: ೧೨೩೪೫"],
]

# Gradio Interface
# NOTE(review): the nesting of the layout blocks below was reconstructed from
# a flattened diff view - confirm the row/column placement against app.py.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🔤 Kannada BPE Tokenizer

**From-scratch Byte-Pair Encoding tokenizer for Kannada**

- Vocabulary: 6,000 tokens
- Compression: 9.301x average
- Training: 500k samples from CulturaX-Kn

Try tokenizing Kannada or mixed-language text below!
""")

    # Top row: input on the left, summary outputs on the right.
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter Kannada or mixed text...",
                lines=5
            )
            tokenize_btn = gr.Button("🚀 Tokenize", variant="primary")

        with gr.Column():
            token_count = gr.Textbox(label="Token Count", interactive=False)
            stats_output = gr.Markdown(label="Statistics")

    # Second row: raw token ids next to the round-tripped text.
    with gr.Row():
        token_ids = gr.Textbox(label="Token IDs (first 50)", lines=3, interactive=False)
        decoded_text = gr.Textbox(label="Decoded Text", lines=3, interactive=False)

    gr.Examples(
        examples=examples,
        inputs=input_text,
        label="Try these examples:"
    )

    gr.Markdown("""
### 📊 Model Info
- **Training Time**: 502 minutes (~8.4 hours)
- **Dataset**: CulturaX-Kn (500K samples)
- **Algorithm**: Byte-Pair Encoding (BPE) from scratch
- **Language**: Kannada (Indian language)

⭐ [GitHub Repository](#) | 📝 [Training Notebook](#)
""")

    # The output order must match the 4-tuple returned by tokenize_text.
    tokenize_btn.click(
        fn=tokenize_text,
        inputs=input_text,
        outputs=[token_ids, decoded_text, stats_output, token_count]
    )

demo.launch()
kannada_bpe_final.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:564c97b84ff5b221dc952a18f38a95356fd3dd2294b5c23a3f4d230a0a2b142c
3
+ size 165687
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ gradio==4.7.1