Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from telugu_bpe import TeluguBPE | |
| import os | |
# Tokenizer instance shared by the request handler defined later in this file.
bpe = TeluguBPE(vocab_size=5000)

# Resolve the model file relative to this script rather than the working
# directory, so the lookup does not depend on where the app is started from.
current_dir = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(current_dir, "telugu_bpe_model.json")

# Prefer the pre-trained model; if the file is missing, bootstrap a tiny
# model from a few sample sentences and save it to the same path.
try:
    bpe.load_model(model_path)
except FileNotFoundError:
    print(f"Error: Model file not found at {model_path}")
    # Fallback corpus: only reached when no model file exists on disk.
    sample_text = """
నమస్కారం తెలుగు భాష చాలా అందమైన భాష
తెలుగు భారతదేశంలోని ద్రావిడ భాషల్లో ఒకటి
తెలుగు అక్షరమాల లో 56 అక్షరాలు ఉన్నాయి
"""
    processed_text = bpe.preprocess_telugu_text(sample_text)
    bpe.learn_bpe(processed_text)
    bpe.save_model(model_path)
    print("Created a new model with sample text")
else:
    print("Model loaded successfully!")
def process_text(input_text: str) -> dict:
    """Tokenize Telugu text with the module-level BPE model and report stats.

    Args:
        input_text: Raw text entered in the interface.

    Returns:
        On success, a dict with the keys "Preprocessed Text", "Tokens",
        "Character Count", "Token Count", "Compression Ratio" (characters
        per token, two decimals, suffixed with "x") and "Vocabulary Size".
        For empty or whitespace-only input, or when any processing step
        raises, a dict holding a single "Error" message instead.
    """
    # Guard clause: nothing to tokenize for missing or blank input.
    if not (input_text and input_text.strip()):
        return {"Error": "Please enter some Telugu text"}

    try:
        normalized = bpe.preprocess_telugu_text(input_text)
        tokens = bpe.encode(normalized)

        n_chars = len(normalized)
        n_tokens = len(tokens)

        # Avoid dividing by zero when the encoder yields no tokens.
        if n_tokens > 0:
            ratio = n_chars / n_tokens
        else:
            ratio = 0

        stats = {
            "Preprocessed Text": normalized,
            "Tokens": tokens,
            "Character Count": n_chars,
            "Token Count": n_tokens,
            "Compression Ratio": f"{ratio:.2f}x",
            "Vocabulary Size": len(bpe.vocab),
        }
    except Exception as exc:
        # UI boundary: show the failure in the JSON panel instead of raising.
        return {"Error": "An error occurred: " + str(exc)}

    return stats
# Create Gradio interface

# Markdown text passed as the interface description.
_DESCRIPTION = """
## Telugu Byte Pair Encoding (BPE) Tokenizer
This tokenizer is specifically designed for Telugu text processing with a vocabulary size of ~5000 tokens.
### Features:
- Telugu-specific preprocessing
- BPE tokenization
- Compression statistics
- Character and token counts
### How to use:
1. Enter Telugu text in the input box
2. Get tokenized output and statistics
### Example inputs provided below ⬇️
"""

# Sample inputs; each inner list supplies the value for the single textbox.
_EXAMPLES = [
    ["నమస్కారం"],
    ["తెలుగు భాష చాలా అందమైన భాష"],
    ["నేను తెలుగులో మాట్లాడగలను"],
    ["తెలుగు అక్షరమాల లో 56 అక్షరాలు ఉన్నాయి"],
]

# One text input wired to process_text, with its dict result shown as JSON.
demo = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(
            label="Input Telugu Text",
            placeholder="Enter Telugu text here...",
            value="నమస్కారం",
            lines=4,
        )
    ],
    outputs=gr.JSON(label="Tokenization Results"),
    title="Telugu BPE Tokenizer",
    description=_DESCRIPTION,
    examples=_EXAMPLES,
    cache_examples=True,
    allow_flagging="never",
    theme=gr.themes.Soft(),
)
# Launch configuration for Hugging Face Spaces
if __name__ == "__main__":
    # Fixed host/port with no public share link requested.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)