Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from src.hindi_bpe import HindiBPE | |
| import pickle | |
| import os | |
# Build the tokenizer shell; its learned tables are filled in from the
# pickled production state below.
tokenizer = HindiBPE(max_vocab_size=5000, target_compression=3.2)

# Load production model state
model_file = 'hindi_bpe_model.pkl'
try:
    # Open directly instead of checking os.path.exists() first, so there is
    # no window between the check and the open in which the file can vanish.
    model_stream = open(model_file, 'rb')
except FileNotFoundError:
    raise FileNotFoundError(
        "Production model not found! Please run train_bpe.py first and copy the model file."
    ) from None

with model_stream as f:
    print("Loading production model...")
    # SECURITY: pickle.load executes arbitrary code embedded in the file.
    # Only ship a model file produced by a trusted training run.
    state = pickle.load(f)

# Read every required entry before touching the tokenizer, so a malformed
# state file raises KeyError without leaving the tokenizer half-updated.
vocab = state['vocab']
inverse_vocab = state['inverse_vocab']
bpe_ranks = state['bpe_ranks']

tokenizer.vocab = vocab
tokenizer.inverse_vocab = inverse_vocab
tokenizer.bpe_ranks = bpe_ranks

print("Model loaded successfully!")
print(f"Vocabulary size: {len(tokenizer.vocab)} tokens")
def process_text(text: str, mode: str) -> str:
    """Tokenize *text* and describe the result for display in the UI.

    Args:
        text: The text to tokenize. ``None``, an empty string, or a
            whitespace-only string is treated as "no input".
        mode: ``"Encode"`` returns the token list; any other value runs the
            encode-then-decode round trip.

    Returns:
        A prompt asking for input when *text* is blank; otherwise either the
        encoded tokens or a three-line round-trip report (original text,
        decoded text, and whether the two are equal).
    """
    # NOTE(review): a cleared component or a direct API call may presumably
    # deliver None rather than "" — guard so that case does not raise.
    if text is None or not text.strip():
        return "Please enter some text."

    # Both modes need the encoding, so compute it once up front.
    encoded = tokenizer.encode(text)
    if mode == "Encode":
        return f"Encoded tokens: {encoded}"

    # Round trip: decode the tokens back and compare with the input.
    decoded = tokenizer.decode(encoded)
    verdict = '✓' if text == decoded else '✗'
    return f"Original: {text}\nDecoded: {decoded}\nMatches: {verdict}"
# Assemble the Gradio UI: a text input plus an operation selector feed
# process_text, and its string result is shown in a single output box.
iface = gr.Interface(
    title="Hindi BPE Tokenizer (Production Model)",
    description="""This is a production-grade Byte Pair Encoding (BPE) tokenizer trained on 1 million Hindi sentences.
Features:
- Vocabulary size: < 5000 tokens
- Compression ratio: ≥ 3.2
- Trained on 1M sentences
- Proper handling of Hindi Unicode characters and combining marks""",
    fn=process_text,
    inputs=[
        gr.Textbox(label="Enter Hindi Text", placeholder="नमस्ते भारत"),
        gr.Radio(["Encode", "Encode & Decode"], label="Operation", value="Encode & Decode"),
    ],
    outputs=gr.Textbox(label="Result"),
    # Every sample sentence is paired with the round-trip operation.
    examples=[
        [sentence, "Encode & Decode"]
        for sentence in (
            "नमस्ते भारत",
            "मैं हिंदी सीख रहा हूं",
            "यह एक परीक्षण वाक्य है",
            "भारत एक विशाल देश है",
            "मुझे हिंदी भाषा बहुत पसंद है",
        )
    ],
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()