HSinghHuggingFace commited on
Commit
d83c04d
·
1 Parent(s): 4edb757

Hindi language tokenizer

Browse files
Files changed (5) hide show
  1. README.md +27 -8
  2. app.py +138 -0
  3. hindi_tokenizer.py +72 -0
  4. output/hindi_encoder.json +0 -0
  5. requirements.txt +4 -0
README.md CHANGED
@@ -1,14 +1,33 @@
1
  ---
2
- title: Hindi Tokenizer
3
- emoji: 🌖
4
- colorFrom: purple
5
- colorTo: purple
6
  sdk: streamlit
7
- sdk_version: 1.41.1
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
- short_description: hindi text tokenizer using HuggingFace Tokenizer library
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Hindi BPE Tokenizer
3
+ colorFrom: blue
4
+ colorTo: red
 
5
  sdk: streamlit
6
+ sdk_version: 1.31.1
7
  app_file: app.py
8
  pinned: false
 
 
9
  ---
10
 
11
+ # Hindi BPE Tokenizer
12
+
13
+ A Streamlit web application for encoding Hindi text to BPE tokens and decoding tokens back to text.
14
+
15
+ ## Features
16
+
17
+ - Encode Hindi text to BPE tokens and token IDs
18
+ - Decode token IDs back to Hindi text
19
+ - Pre-trained on 5,000,000 lines of Hindi text
20
+ - Vocabulary size: 4,500 tokens
21
+ - Includes special tokens: `<pad>`, `<unk>`, `<s>`, `</s>`
22
+
23
+ ## Usage
24
+
25
+ 1. **Encoding**: Enter Hindi text in the left panel and click "Encode"
26
+ 2. **Decoding**: Enter comma-separated token IDs in the right panel and click "Decode"
27
+
28
+ ## Technical Details
29
+
30
+ - BPE (Byte Pair Encoding) tokenizer
31
+ - Trained on IndicCorp Hindi dataset
32
+ - Compression ratio > 3.2
33
+ - Preserves Hindi Unicode range (\\u0900-\\u097F)
app.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pathlib import Path
3
+ from hindi_tokenizer import load_tokenizer, encode_text, decode_text
4
+
5
def load_hindi_tokenizer():
    """Load the trained Hindi BPE tokenizer from output/hindi_encoder.json.

    Returns:
        Tokenizer: The tokenizer restored from disk via load_tokenizer().

    Side effects:
        Shows a Streamlit error and stops script execution if the
        configuration file is missing or cannot be loaded.
    """
    output_dir = Path(__file__).parent / "output"
    config_path = output_dir / "hindi_encoder.json"

    if not config_path.exists():
        st.error("Error: Tokenizer configuration not found! Please train the tokenizer first.")
        st.stop()

    try:
        return load_tokenizer(str(config_path))
    except Exception as e:
        # Fixed message: "Error-1" was a leftover debug label shown to users.
        st.error(f"Error loading tokenizer: {e}")
        st.stop()
19
+
20
+
21
def main():
    """Render the Streamlit UI: encode Hindi text to BPE token IDs and back.

    Layout: two side-by-side panels (encode left, decode right) plus an
    informational footer describing the tokenizer.
    """
    st.set_page_config(
        page_title="Hindi BPE Tokenizer",
        page_icon="🇮🇳",
        layout="wide"
    )

    st.title("Hindi BPE Tokenizer")
    st.markdown("A web interface for encoding and decoding Hindi text using BPE tokenization")

    # Load tokenizer. load_hindi_tokenizer() reports its own errors and
    # stops; this guard only covers anything unexpected beyond that.
    try:
        tokenizer = load_hindi_tokenizer()
    except Exception as e:
        st.error(f"Error loading tokenizer: {e}")
        st.stop()

    # Two side-by-side panels.
    encode_col, decode_col = st.columns(2)

    # --- Encoding panel ---
    with encode_col:
        st.header("Encode Hindi Text")
        st.markdown("Convert Hindi text into token IDs")

        input_text = st.text_area(
            "Enter Hindi Text",
            placeholder="यहाँ हिंदी टेक्स्ट लिखें...",
            height=150,
            key="encode_input"
        )

        if st.button("Encode", key="encode_button"):
            if input_text.strip():
                try:
                    token_ids, tokens = encode_text(tokenizer, input_text)

                    st.subheader("Results:")
                    st.markdown("**Tokens:**")
                    st.write(tokens)

                    st.markdown("**Token IDs:**")
                    st.write(token_ids)

                    # Comma-separated form is easy to paste into the decode panel.
                    st.markdown("**Token IDs (comma-separated):**")
                    st.code(", ".join(map(str, token_ids)))

                except Exception as e:
                    st.error(f"Error during encoding: {e}")
            else:
                st.warning("Please enter some text to encode")

    # --- Decoding panel ---
    with decode_col:
        st.header("Decode Token IDs")
        st.markdown("Convert token IDs back to Hindi text")

        input_ids = st.text_area(
            "Enter Token IDs (comma-separated)",
            placeholder="2517, 2074, 340, 4, 201...",
            height=150,
            key="decode_input"
        )

        if st.button("Decode", key="decode_button"):
            if input_ids.strip():
                try:
                    # Parse comma-separated IDs. Empty segments are skipped so
                    # a trailing comma does not raise; the loop variable no
                    # longer shadows the builtin `id`.
                    token_ids = [
                        int(part) for part in input_ids.split(",") if part.strip()
                    ]

                    decoded_text = decode_text(tokenizer, token_ids)

                    st.subheader("Results:")
                    st.markdown("**Decoded Text:**")
                    st.write(decoded_text)

                    # Separate box so the result is easy to select and copy.
                    st.text_area(
                        "Decoded Text (copyable)",
                        value=decoded_text,
                        height=100,
                        key="decoded_output"
                    )

                except ValueError:
                    st.error("Invalid input format. Please enter comma-separated numbers.")
                except Exception as e:
                    st.error(f"Error during decoding: {e}")
            else:
                st.warning("Please enter token IDs to decode")

    # --- Informational footer ---
    st.markdown("---")
    st.markdown("### About the Tokenizer")

    info_col1, info_col2 = st.columns(2)

    # Markdown content is kept flush-left: indented lines inside a
    # triple-quoted string would render as a markdown code block.
    with info_col1:
        st.markdown("""
**Tokenizer Details:**
- Type: Byte Pair Encoding (BPE)
- Vocabulary Size: 4,500 tokens
- Special Tokens: `<pad>`, `<unk>`, `<s>`, `</s>`
- Minimum Token Frequency: 2
""")

    with info_col2:
        st.markdown("""
**Preprocessing:**
- Retains Hindi Unicode (\\u0900-\\u097F)
- Removes digits and special characters
- Normalizes punctuation
- Cleans whitespace
""")


if __name__ == "__main__":
    main()
hindi_tokenizer.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import requests
3
+ from pathlib import Path
4
+ from tokenizers import Tokenizer
5
+ from tokenizers.models import BPE
6
+ from tokenizers.trainers import BpeTrainer
7
+ from tokenizers.pre_tokenizers import Whitespace
8
+ from tqdm import tqdm
9
+
10
def preprocess_hindi_text(text):
    """Clean raw Hindi text for tokenization.

    Keeps Devanagari characters and basic punctuation, strips digits
    (both ASCII and Devanagari), converts the danda (।) to a period,
    and collapses whitespace runs.

    Args:
        text (str): Raw Hindi text input.

    Returns:
        str: Cleaned and normalized text.
    """
    # Drop everything outside Devanagari, whitespace, and allowed punctuation.
    cleaned = re.sub(r"[^\u0900-\u097F\s।,.!?\-]", "", text)
    # Remove ASCII (0-9) and Devanagari (०-९) digits.
    cleaned = re.sub(r"[0-9०-९]", "", cleaned)
    # Normalize the danda sentence terminator to a period.
    cleaned = cleaned.replace("।", ".")
    # Collapse whitespace runs and trim the ends.
    return re.sub(r"\s+", " ", cleaned).strip()
28
+
29
+
30
def encode_text(tokenizer, text):
    """Encode Hindi text into BPE token IDs.

    The input is first cleaned with preprocess_hindi_text(), then run
    through the tokenizer.

    Args:
        tokenizer (Tokenizer): Trained BPE tokenizer.
        text (str): Hindi text to encode.

    Returns:
        tuple: (token_ids, tokens) produced by the tokenizer.
    """
    normalized = preprocess_hindi_text(text)
    encoded = tokenizer.encode(normalized)
    return encoded.ids, encoded.tokens
47
+
48
def decode_text(tokenizer, token_ids):
    """Decode a sequence of token IDs back into Hindi text.

    Args:
        tokenizer (Tokenizer): Trained BPE tokenizer.
        token_ids (list): Token IDs to decode.

    Returns:
        str: The decoded Hindi text.
    """
    # Pure delegation to the tokenizer's own decoder.
    return tokenizer.decode(token_ids)
60
+
61
+
62
def load_tokenizer(config_path):
    """Restore a previously trained tokenizer from its saved configuration.

    Args:
        config_path (str): Path to the tokenizer configuration file
            (JSON produced by Tokenizer.save).

    Returns:
        Tokenizer: The tokenizer loaded from the file.
    """
    # Tokenizer.from_file handles all parsing of the saved JSON config.
    return Tokenizer.from_file(config_path)
output/hindi_encoder.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit==1.31.1
2
+ tokenizers==0.21.0
3
+ requests==2.31.0
4
+ tqdm==4.66.1