HSinghHuggingFace commited on
Commit
d83c04d
·
1 Parent(s): 4edb757

Hindi language tokenizer

Browse files
Files changed (5) hide show
  1. README.md +27 -8
  2. app.py +138 -0
  3. hindi_tokenizer.py +72 -0
  4. output/hindi_encoder.json +0 -0
  5. requirements.txt +4 -0
README.md CHANGED
@@ -1,14 +1,33 @@
1
  ---
2
- title: Hindi Tokenizer
3
- emoji: 🌖
4
- colorFrom: purple
5
- colorTo: purple
6
  sdk: streamlit
7
- sdk_version: 1.41.1
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
- short_description: hindi text tokenizer using HuggingFace Tokenizer library
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Hindi BPE Tokenizer
3
+ colorFrom: blue
4
+ colorTo: red
 
5
  sdk: streamlit
6
+ sdk_version: 1.31.1
7
  app_file: app.py
8
  pinned: false
 
 
9
  ---
10
 
11
+ # Hindi BPE Tokenizer
12
+
13
+ A Streamlit web application for encoding Hindi text to BPE tokens and decoding tokens back to text.
14
+
15
+ ## Features
16
+
17
+ - Encode Hindi text to BPE tokens and token IDs
18
+ - Decode token IDs back to Hindi text
19
+ - Pre-trained on 5,000,000 lines of Hindi text
20
+ - Vocabulary size: 4,500 tokens
21
+ - Includes special tokens: `<pad>`, `<unk>`, `<s>`, `</s>`
22
+
23
+ ## Usage
24
+
25
+ 1. **Encoding**: Enter Hindi text in the left panel and click "Encode"
26
+ 2. **Decoding**: Enter comma-separated token IDs in the right panel and click "Decode"
27
+
28
+ ## Technical Details
29
+
30
+ - BPE (Byte Pair Encoding) tokenizer
31
+ - Trained on IndicCorp Hindi dataset
32
+ - Compression ratio > 3.2
33
+ - Preserves Hindi Unicode range (\\u0900-\\u097F)
app.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pathlib import Path
3
+ from hindi_tokenizer import load_tokenizer, encode_text, decode_text
4
+
5
def load_hindi_tokenizer():
    """Load the trained Hindi BPE tokenizer from output/hindi_encoder.json.

    Returns:
        Tokenizer: The tokenizer restored from disk via load_tokenizer().

    Side effects:
        Shows a Streamlit error and stops script execution if the
        configuration file is missing or cannot be loaded.
    """
    output_dir = Path(__file__).parent / "output"
    config_path = output_dir / "hindi_encoder.json"

    if not config_path.exists():
        st.error("Error: Tokenizer configuration not found! Please train the tokenizer first.")
        st.stop()

    try:
        return load_tokenizer(str(config_path))
    except Exception as e:
        # Fixed message: "Error-1" was a leftover debug label shown to users.
        st.error(f"Error loading tokenizer: {e}")
        st.stop()
19
+
20
+
21
def main():
    """Render the Streamlit UI: encode Hindi text to BPE token IDs and back.

    Layout: two side-by-side panels (encode left, decode right) plus an
    informational footer describing the tokenizer.
    """
    st.set_page_config(
        page_title="Hindi BPE Tokenizer",
        page_icon="🇮🇳",
        layout="wide"
    )

    st.title("Hindi BPE Tokenizer")
    st.markdown("A web interface for encoding and decoding Hindi text using BPE tokenization")

    # Load tokenizer. load_hindi_tokenizer() reports its own errors and
    # stops; this guard only covers anything unexpected beyond that.
    try:
        tokenizer = load_hindi_tokenizer()
    except Exception as e:
        st.error(f"Error loading tokenizer: {e}")
        st.stop()

    # Two side-by-side panels.
    encode_col, decode_col = st.columns(2)

    # --- Encoding panel ---
    with encode_col:
        st.header("Encode Hindi Text")
        st.markdown("Convert Hindi text into token IDs")

        input_text = st.text_area(
            "Enter Hindi Text",
            placeholder="यहाँ हिंदी टेक्स्ट लिखें...",
            height=150,
            key="encode_input"
        )

        if st.button("Encode", key="encode_button"):
            if input_text.strip():
                try:
                    token_ids, tokens = encode_text(tokenizer, input_text)

                    st.subheader("Results:")
                    st.markdown("**Tokens:**")
                    st.write(tokens)

                    st.markdown("**Token IDs:**")
                    st.write(token_ids)

                    # Comma-separated form is easy to paste into the decode panel.
                    st.markdown("**Token IDs (comma-separated):**")
                    st.code(", ".join(map(str, token_ids)))

                except Exception as e:
                    st.error(f"Error during encoding: {e}")
            else:
                st.warning("Please enter some text to encode")

    # --- Decoding panel ---
    with decode_col:
        st.header("Decode Token IDs")
        st.markdown("Convert token IDs back to Hindi text")

        input_ids = st.text_area(
            "Enter Token IDs (comma-separated)",
            placeholder="2517, 2074, 340, 4, 201...",
            height=150,
            key="decode_input"
        )

        if st.button("Decode", key="decode_button"):
            if input_ids.strip():
                try:
                    # Parse comma-separated IDs. Empty segments are skipped so
                    # a trailing comma does not raise; the loop variable no
                    # longer shadows the builtin `id`.
                    token_ids = [
                        int(part) for part in input_ids.split(",") if part.strip()
                    ]

                    decoded_text = decode_text(tokenizer, token_ids)

                    st.subheader("Results:")
                    st.markdown("**Decoded Text:**")
                    st.write(decoded_text)

                    # Separate box so the result is easy to select and copy.
                    st.text_area(
                        "Decoded Text (copyable)",
                        value=decoded_text,
                        height=100,
                        key="decoded_output"
                    )

                except ValueError:
                    st.error("Invalid input format. Please enter comma-separated numbers.")
                except Exception as e:
                    st.error(f"Error during decoding: {e}")
            else:
                st.warning("Please enter token IDs to decode")

    # --- Informational footer ---
    st.markdown("---")
    st.markdown("### About the Tokenizer")

    info_col1, info_col2 = st.columns(2)

    # Markdown content is kept flush-left: indented lines inside a
    # triple-quoted string would render as a markdown code block.
    with info_col1:
        st.markdown("""
**Tokenizer Details:**
- Type: Byte Pair Encoding (BPE)
- Vocabulary Size: 4,500 tokens
- Special Tokens: `<pad>`, `<unk>`, `<s>`, `</s>`
- Minimum Token Frequency: 2
""")

    with info_col2:
        st.markdown("""
**Preprocessing:**
- Retains Hindi Unicode (\\u0900-\\u097F)
- Removes digits and special characters
- Normalizes punctuation
- Cleans whitespace
""")


if __name__ == "__main__":
    main()
hindi_tokenizer.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import requests
3
+ from pathlib import Path
4
+ from tokenizers import Tokenizer
5
+ from tokenizers.models import BPE
6
+ from tokenizers.trainers import BpeTrainer
7
+ from tokenizers.pre_tokenizers import Whitespace
8
+ from tqdm import tqdm
9
+
10
def preprocess_hindi_text(text):
    """Clean raw Hindi text for tokenization.

    Keeps Devanagari characters and basic punctuation, strips digits
    (both ASCII and Devanagari), converts the danda (।) to a period,
    and collapses whitespace runs.

    Args:
        text (str): Raw Hindi text input.

    Returns:
        str: Cleaned and normalized text.
    """
    # Drop everything outside Devanagari, whitespace, and allowed punctuation.
    cleaned = re.sub(r"[^\u0900-\u097F\s।,.!?\-]", "", text)
    # Remove ASCII (0-9) and Devanagari (०-९) digits.
    cleaned = re.sub(r"[0-9०-९]", "", cleaned)
    # Normalize the danda sentence terminator to a period.
    cleaned = cleaned.replace("।", ".")
    # Collapse whitespace runs and trim the ends.
    return re.sub(r"\s+", " ", cleaned).strip()
28
+
29
+
30
def encode_text(tokenizer, text):
    """Encode Hindi text into BPE token IDs.

    The input is first cleaned with preprocess_hindi_text(), then run
    through the tokenizer.

    Args:
        tokenizer (Tokenizer): Trained BPE tokenizer.
        text (str): Hindi text to encode.

    Returns:
        tuple: (token_ids, tokens) produced by the tokenizer.
    """
    normalized = preprocess_hindi_text(text)
    encoded = tokenizer.encode(normalized)
    return encoded.ids, encoded.tokens
47
+
48
def decode_text(tokenizer, token_ids):
    """Decode a sequence of token IDs back into Hindi text.

    Args:
        tokenizer (Tokenizer): Trained BPE tokenizer.
        token_ids (list): Token IDs to decode.

    Returns:
        str: The decoded Hindi text.
    """
    # Pure delegation to the tokenizer's own decoder.
    return tokenizer.decode(token_ids)
60
+
61
+
62
def load_tokenizer(config_path):
    """Restore a previously trained tokenizer from its saved configuration.

    Args:
        config_path (str): Path to the tokenizer configuration file
            (JSON produced by Tokenizer.save).

    Returns:
        Tokenizer: The tokenizer loaded from the file.
    """
    # Tokenizer.from_file handles all parsing of the saved JSON config.
    return Tokenizer.from_file(config_path)
output/hindi_encoder.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit==1.31.1
2
+ tokenizers==0.21.0
3
+ requests==2.31.0
4
+ tqdm==4.66.1