Spaces:

jatingocodeo
/

Hindi-BPE

Sleeping

jatingocodeo committed on Jan 11, 2025

Commit

f61c187

verified ·

1 Parent(s): ca6a8e2

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,9 +1,25 @@
 import gradio as gr
 from src.hindi_bpe import HindiBPE
 # Initialize the tokenizer
 tokenizer = HindiBPE(max_vocab_size=5000, target_compression=3.2)
 def process_text(text: str, mode: str) -> str:
     """Process text using the tokenizer"""
     if not text.strip():
@@ -27,11 +43,12 @@ iface = gr.Interface(
         gr.Radio(["Encode", "Encode & Decode"], label="Operation", value="Encode & Decode")
     ],
     outputs=gr.Textbox(label="Result"),
-    title="Hindi BPE Tokenizer",
-    description="""This is a Byte Pair Encoding (BPE) tokenizer specifically designed for Hindi text.
     Features:
     - Vocabulary size: < 5000 tokens
     - Compression ratio: ≥ 3.2
     - Proper handling of Hindi Unicode characters and combining marks""",
     examples=[
         ["नमस्ते भारत", "Encode & Decode"],

 import gradio as gr
 from src.hindi_bpe import HindiBPE
+import pickle
+import os
 # Initialize the tokenizer
 tokenizer = HindiBPE(max_vocab_size=5000, target_compression=3.2)
+# Load production model state
+model_file = 'hindi_bpe_model.pkl'
+if os.path.exists(model_file):
+    print("Loading production model...")
+    with open(model_file, 'rb') as f:
+        state = pickle.load(f)
+        tokenizer.vocab = state['vocab']
+        tokenizer.inverse_vocab = state['inverse_vocab']
+        tokenizer.bpe_ranks = state['bpe_ranks']
+    print(f"Model loaded successfully!")
+    print(f"Vocabulary size: {len(tokenizer.vocab)} tokens")
+else:
+    raise FileNotFoundError("Production model not found! Please run train_bpe.py first and copy the model file.")
 def process_text(text: str, mode: str) -> str:
     """Process text using the tokenizer"""
     if not text.strip():
         gr.Radio(["Encode", "Encode & Decode"], label="Operation", value="Encode & Decode")
     ],
     outputs=gr.Textbox(label="Result"),
+    title="Hindi BPE Tokenizer (Production Model)",
+    description="""This is a production-grade Byte Pair Encoding (BPE) tokenizer trained on 1 million Hindi sentences.
     Features:
     - Vocabulary size: < 5000 tokens
     - Compression ratio: ≥ 3.2
+    - Trained on 1M sentences
     - Proper handling of Hindi Unicode characters and combining marks""",
     examples=[
         ["नमस्ते भारत", "Encode & Decode"],