# OpenLID-v3 language-identification demo (Hugging Face Space).
| import fasttext | |
| from huggingface_hub import hf_hub_download | |
| import regex | |
| import gradio as gr | |
| import os | |
| import asyncio | |
| import atexit | |
# Constants
MAX_INPUT_LENGTH = 10000  # Maximum characters accepted per request (UI and server-side)

# Preprocessing patterns (compiled with the third-party `regex` module,
# which supports the \p{...} Unicode property classes used below).
# Matches any character that is neither a word character nor a space
# separator, plus all decimal digits; such characters are stripped out.
NONWORD_REPLACE_STR = r"[^\p{Word}\p{Zs}]|\d"
NONWORD_REPLACE_PATTERN = regex.compile(NONWORD_REPLACE_STR)
# Matches runs of two or more whitespace characters (collapsed to one space).
SPACE_PATTERN = regex.compile(r"\s\s+")
def preprocess(text):
    """Normalize raw user text into the form the LID model expects.

    Flattens the text to one lower-cased line, collapses whitespace runs,
    and strips punctuation, symbols, and digits.
    """
    # Single lower-cased line: newlines become spaces.
    normalized = text.strip().replace('\n', ' ').lower()
    # Collapse any run of whitespace to a single space.
    normalized = SPACE_PATTERN.sub(" ", normalized)
    # Drop everything that is neither a word character nor a space separator.
    return NONWORD_REPLACE_PATTERN.sub("", normalized)
# Load model once at startup
print("Loading OpenLID-v3 model...")
# Fetch the fastText binary from the Hugging Face Hub (hf_hub_download
# returns a local file path; presumably served from the local cache on
# subsequent runs — standard hub behavior).
model_path = hf_hub_download(
    repo_id="HPLT/OpenLID-v3",
    filename="openlid-v3.bin"
)
# Module-level singleton shared by all prediction requests.
model = fasttext.load_model(model_path)
print("Model loaded successfully!")
def predict_language(text, top_k=3, threshold=0.5):
    """Identify the language(s) of ``text`` using the OpenLID-v3 model.

    Args:
        text: Input text to analyze.
        top_k: Number of top predictions to return (clamped to 1-10).
        threshold: Confidence threshold (0.0-1.0); candidates below it
            are dropped.

    Returns:
        Markdown-formatted prediction lines, or a human-readable message
        when the input is unusable or no candidate clears the threshold.
    """
    # Check input length first, before any preprocessing work.
    if len(text) > MAX_INPUT_LENGTH:
        return f"**Error**: Input too long ({len(text):,} characters). Maximum allowed is {MAX_INPUT_LENGTH:,} characters."

    if not text or not text.strip():
        return "Please enter some text to analyze."

    # Preprocess (lowercase, collapse whitespace, strip punctuation/digits).
    processed_text = preprocess(text)
    if not processed_text.strip():
        return "Text contains no valid characters for language identification."

    # fastText requires an integer k; Gradio sliders may deliver floats.
    # Clamp to the 1-10 range advertised in the UI.
    k = max(1, min(int(top_k), 10))

    # Get predictions from the shared module-level model.
    labels, scores = model.predict(
        text=processed_text,
        k=k,
        threshold=threshold,
        on_unicode_error="strict",
    )

    # If every candidate fell below the threshold, say so instead of
    # silently returning an empty string.
    if not labels:
        return f"No prediction reached the confidence threshold of {threshold:.2f}. Try lowering the threshold."

    # Format results as Markdown, one prediction per paragraph.
    results = []
    for label, score in zip(labels, scores):
        # Strip fastText's "__label__" prefix, e.g. "__label__eng_Latn" -> "eng_Latn".
        lang_code = label.replace("__label__", "")
        confidence = float(score) * 100
        results.append(f"**{lang_code}**: {confidence:.2f}%")
    return "\n\n".join(results)
# Best-effort teardown: stop and close any lingering asyncio event loop so
# the process does not emit async errors on interpreter shutdown.
def cleanup():
    """Stop and close the current asyncio event loop, swallowing all errors."""
    try:
        event_loop = asyncio.get_event_loop()
    except Exception:
        return  # no loop obtainable; nothing to tear down
    try:
        if event_loop.is_running():
            event_loop.stop()
        if not event_loop.is_closed():
            event_loop.close()
    except Exception:
        pass  # shutdown cleanup is deliberately best-effort


atexit.register(cleanup)
# Create Gradio interface
with gr.Blocks(title="OpenLID-v3 Language Identification") as demo:
    # Raw HTML header; target="_blank" opens the model card in a new tab.
    gr.HTML("""
    <h1>OpenLID-v3 Language Identifier</h1>
    <p>Identify the language of any text with state-of-the-art accuracy.<br>
    Supports 194+ language varieties.</p>
    <p><em>Model: <a href="https://huggingface.co/HPLT/OpenLID-v3" target="_blank" rel="noopener noreferrer">HPLT/OpenLID-v3</a></em></p>
    """)
    with gr.Row():
        with gr.Column():
            # Input length is enforced here in the UI as well as
            # server-side in predict_language.
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to identify its language...",
                lines=5,
                max_lines=10,
                max_length=MAX_INPUT_LENGTH  # Also enforce in UI
            )
            with gr.Row():
                # Number of candidate languages to show (fastText's k).
                top_k = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=3,
                    step=1,
                    label="Top-K Predictions"
                )
                # Minimum confidence a prediction must reach to be listed.
                threshold = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    step=0.05,
                    label="Confidence Threshold"
                )
            submit_btn = gr.Button("Identify Language", variant="primary")
        with gr.Column():
            # Predictions are rendered as Markdown paragraphs.
            output = gr.Markdown(label="Predictions")
    # Examples with Kabyle and Occitan as defaults
    gr.Examples(
        examples=[
            ["Asebter-a yura s wudem awurman d amagrad s tutlayt taqbaylit."],
            ["L'interès es d'utilizar un sistèma liure, personalizable e en occitan."],
            ["Maskinsjefen er oppteken av å løfta fram dei maritime utdanningane."],
            ["The quick brown fox jumps over the lazy dog."],
            ["Le renard brun rapide saute par-dessus le chien paresseux."],
            ["El rápido zorro marrón salta sobre el perro perezoso."],
            ["Быстрая коричневая лисица прыгает через ленивую собаку."],
            ["快速的棕色狐狸跳过了懒惰的狗。"],
        ],
        inputs=input_text,
        label="Try these examples (Kabyle and Occitan featured)"
    )
    gr.Markdown(f"""
    ### Tips for best results:
    - Text is automatically preprocessed (lowercased, normalized)
    - Longer texts generally give more accurate predictions
    - The model supports 194+ language varieties
    - Use higher thresholds to filter out uncertain predictions
    - **Maximum input length: {MAX_INPUT_LENGTH:,} characters**
    """)
    # Event handlers: button click and pressing Enter in the textbox both
    # invoke the same prediction function with the same inputs.
    submit_btn.click(
        fn=predict_language,
        inputs=[input_text, top_k, threshold],
        outputs=output
    )
    input_text.submit(
        fn=predict_language,
        inputs=[input_text, top_k, threshold],
        outputs=output
    )
if __name__ == "__main__":
    # Collect launch options in one place. HF Spaces injects PORT into the
    # environment; fall back to Gradio's conventional 7860 for local runs.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": int(os.environ.get("PORT", 7860)),
        "ssr_mode": False,  # Disable experimental SSR to prevent the error
        "share": False,
        "show_error": True,
    }
    try:
        demo.launch(**launch_options)
    except KeyboardInterrupt:
        print("\nShutting down gracefully...")
    finally:
        # Run the loop teardown explicitly as well as via atexit.
        cleanup()