ChaitraSaiK committed on
Commit
4984d4a
·
1 Parent(s): 21a6d06

first commit

Browse files
Files changed (3) hide show
  1. .gitattributes +0 -35
  2. app.py +30 -29
  3. bpe_vocab_350_merges.pkl +0 -0
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
- import numpy as np
3
  import pickle
4
  from typing import List, Dict, Tuple
 
5
 
6
  class OptimizedBPETokenizer:
7
  def __init__(self, merges: Dict[Tuple[int, int], int]):
@@ -71,45 +71,46 @@ class OptimizedBPETokenizer:
71
  expanded.extend(self._expand_token(t))
72
  return expanded
73
 
74
- # Load the BPE merges
75
- def load_tokenizer():
76
- try:
77
- with open("bpe_vocab_350_merges.pkl", "rb") as f:
78
- merges = pickle.load(f)
79
- return OptimizedBPETokenizer(merges)
80
- except FileNotFoundError:
81
- raise Exception("Tokenizer merges file not found!")
82
 
83
- tokenizer = load_tokenizer()
84
 
85
- def process_text(text, mode):
86
- if mode == "Encode":
87
- encoded = tokenizer.encode(text)
88
- return f"Encoded tokens: {encoded}\nToken count: {len(encoded)}"
89
  else: # Decode
90
  try:
91
  # Convert string of numbers to list of integers
92
- tokens = [int(t) for t in text.strip('[]').split(',')]
93
- decoded = tokenizer.decode(tokens)
94
- return decoded
95
  except:
96
- return "Error: Please provide tokens as comma-separated numbers"
97
 
98
- # Create the interface
99
  iface = gr.Interface(
100
  fn=process_text,
101
  inputs=[
102
- gr.Textbox(label="Input Text", lines=5),
103
- gr.Radio(["Encode", "Decode"], label="Mode", value="Encode")
104
  ],
105
- outputs=gr.Textbox(label="Output", lines=5),
106
  title="Telugu BPE Tokenizer",
107
- description="Encode Telugu text into BPE tokens or decode tokens back to text.",
108
- examples=[
109
- ["నమస్కారం", "Encode"],
110
- ["[224, 176, 184, 224, 176, 184]", "Decode"]
111
- ]
112
  )
113
 
114
- if __name__ == "__main__":
115
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
 
2
  import pickle
3
  from typing import List, Dict, Tuple
4
+ import numpy as np
5
 
6
  class OptimizedBPETokenizer:
7
  def __init__(self, merges: Dict[Tuple[int, int], int]):
 
71
  expanded.extend(self._expand_token(t))
72
  return expanded
73
 
74
# Load the pre-trained BPE merge table and build the tokenizer once at
# import time so every request reuses the same instance.
#
# SECURITY NOTE: pickle.load executes arbitrary code if the file is
# untrusted. This .pkl ships with the app itself, so it is trusted here —
# do not point this at user-supplied files.
MERGES_PATH = "bpe_vocab_350_merges.pkl"
try:
    with open(MERGES_PATH, "rb") as f:
        merges = pickle.load(f)
except FileNotFoundError as err:
    # Fail fast with an actionable message instead of an opaque traceback
    # deep inside the first request.
    raise FileNotFoundError(
        f"Tokenizer merges file not found: {MERGES_PATH!r}. "
        "Make sure it is deployed alongside app.py."
    ) from err

tokenizer = OptimizedBPETokenizer(merges)
79
 
80
def process_text(text: str, operation: str) -> str:
    """Encode Telugu text into BPE token ids, or decode ids back to text.

    Args:
        text: In "Encode" mode, raw Telugu text. In any other mode,
            a comma-separated list of integer token ids, optionally
            wrapped in brackets, e.g. "[256, 257, 258]" or "256,257".
        operation: "Encode" to tokenize; anything else decodes.

    Returns:
        A human-readable result string, or an error message when the
        decode input cannot be parsed/decoded.
    """
    if operation == "Encode":
        tokens = tokenizer.encode(text)
        return f"Encoded tokens: {tokens}\nToken count: {len(tokens)}"
    # Decode: parse "[1, 2, 3]" / "1,2,3" into a list of ints.
    try:
        tokens = [int(x) for x in text.strip('[]').split(',')]
        decoded_text = tokenizer.decode(tokens)
        return f"Decoded text: {decoded_text}"
    except Exception:
        # This is a UI boundary, so malformed input must become a friendly
        # message — but a bare `except:` (as before) would also swallow
        # KeyboardInterrupt/SystemExit; `except Exception` does not.
        return "Error: Please provide a valid list of integers for decoding"
92
 
93
# Build the Gradio UI: a text box plus an Encode/Decode selector feeding
# process_text, with a single text box for the result.
_text_box = gr.Textbox(
    label="Input Text",
    placeholder="Enter text to encode or tokens to decode...",
)
_operation_selector = gr.Radio(
    ["Encode", "Decode"],
    label="Operation",
    value="Encode",
)

iface = gr.Interface(
    fn=process_text,
    inputs=[_text_box, _operation_selector],
    outputs=gr.Textbox(label="Output"),
    title="Telugu BPE Tokenizer",
    description="A byte-pair encoding tokenizer trained on Telugu text. For encoding, enter Telugu text. For decoding, enter a list of integers (e.g., [256, 257, 258]).",
)
104
 
105
# Start the Gradio app. Left unconditional (no __main__ guard) to preserve
# the current behavior of launching on import.
# NOTE(review): an `if __name__ == "__main__":` guard would be the usual
# idiom if this module is ever imported elsewhere — confirm the deployment
# runtime (Hugging Face Spaces) still starts the app with the guard before
# adding it. The commented-out encode/decode smoke test that lived here was
# dead code and has been removed; recover it from version control if needed.
iface.launch()
bpe_vocab_350_merges.pkl CHANGED
Binary files a/bpe_vocab_350_merges.pkl and b/bpe_vocab_350_merges.pkl differ