"""Gradio demo app for language identification with the OpenLID-v3 fastText model."""

import asyncio
import atexit
import os

import fasttext
import gradio as gr
import regex
from huggingface_hub import hf_hub_download

# Constants
MAX_INPUT_LENGTH = 10000  # Maximum characters allowed
MAX_TOP_K = 10  # Upper bound for the number of predictions returned

# Preprocessing patterns
# Matches every character that is neither a word character nor a space
# separator, plus every digit; matches are deleted during preprocessing.
NONWORD_REPLACE_STR = r"[^\p{Word}\p{Zs}]|\d"
NONWORD_REPLACE_PATTERN = regex.compile(NONWORD_REPLACE_STR)
# Matches runs of two or more whitespace characters.
SPACE_PATTERN = regex.compile(r"\s\s+")


def preprocess(text):
    """Preprocess text for language identification.

    Strips the text, turns newlines into spaces, lowercases it, collapses
    whitespace runs into a single space and finally removes non-word
    characters and digits.

    Args:
        text: Raw input string.

    Returns:
        The normalized string that is fed to the model.
    """
    text = text.strip().replace('\n', ' ').lower()
    # NOTE: the order of the two substitutions is kept deliberately; changing
    # it would change the text the model sees.
    text = SPACE_PATTERN.sub(" ", text)
    text = NONWORD_REPLACE_PATTERN.sub("", text)
    return text


# Load model once at startup
print("Loading OpenLID-v3 model...")
model_path = hf_hub_download(
    repo_id="HPLT/OpenLID-v3",
    filename="openlid-v3.bin"
)
model = fasttext.load_model(model_path)
print("Model loaded successfully!")


def predict_language(text, top_k=3, threshold=0.5):
    """
    Predict language of input text.

    Args:
        text: Input text to analyze (``None`` is treated as empty)
        top_k: Number of top predictions to return (clamped to 1-10)
        threshold: Confidence threshold (clamped to 0.0-1.0)

    Returns:
        A Markdown string: one ``**code**: NN.NN%`` entry per prediction,
        or a human-readable message when the input is rejected or no
        prediction reaches the threshold.
    """
    # A cleared textbox may deliver None; normalize so len()/strip() are safe.
    text = text or ""

    # Check input length first
    if len(text) > MAX_INPUT_LENGTH:
        return f"**Error**: Input too long ({len(text):,} characters). Maximum allowed is {MAX_INPUT_LENGTH:,} characters."

    if not text.strip():
        return "Please enter some text to analyze."

    # Preprocess
    processed_text = preprocess(text)

    if not processed_text.strip():
        return "Text contains no valid characters for language identification."

    # The slider may hand over a float and API callers may send anything;
    # the model needs an integer k >= 1 and a threshold within [0, 1].
    k = max(1, min(int(top_k), MAX_TOP_K))
    threshold = max(0.0, min(float(threshold), 1.0))

    # Get predictions
    labels, scores = model.predict(
        text=processed_text,
        k=k,
        threshold=threshold,
        on_unicode_error="strict",
    )

    # Format results: remove the __label__ prefix and show a percentage
    results = [
        f"**{label.replace('__label__', '')}**: {float(score) * 100:.2f}%"
        for label, score in zip(labels, scores)
    ]

    if not results:
        # Every candidate fell below the threshold; say so instead of
        # rendering an empty output panel.
        return "No language met the confidence threshold. Try lowering the threshold."

    return "\n\n".join(results)


# Cleanup function to prevent async errors on shutdown
def cleanup():
    """Stop or close the current asyncio event loop, ignoring any failure.

    Registered with ``atexit`` and also called when the server exits, so it
    may run more than once; a loop that is already closed is left alone.
    """
    try:
        loop = asyncio.get_event_loop()
        if loop.is_closed():
            return
        if loop.is_running():
            # A running loop cannot be closed (close() raises RuntimeError),
            # so only request the stop.
            loop.stop()
        else:
            loop.close()
    except Exception:
        # Deliberately best-effort: this runs during interpreter shutdown,
        # where there may be no usable loop and nothing sensible to report.
        pass


atexit.register(cleanup)

# Create Gradio interface
with gr.Blocks(title="OpenLID-v3 Language Identification") as demo:
    # Use HTML with target="_blank" to open in new tab
    # NOTE(review): the string below contains no anchor tag as it stands;
    # confirm whether a link to the model card is meant to be here.
    gr.HTML("""

OpenLID-v3 Language Identifier

Identify the language of any text with state-of-the-art accuracy.
Supports 194+ language varieties.

Model: HPLT/OpenLID-v3

""")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to identify its language...",
                lines=5,
                max_lines=10,
                max_length=MAX_INPUT_LENGTH  # Also enforce in UI
            )

            with gr.Row():
                top_k = gr.Slider(
                    minimum=1,
                    maximum=MAX_TOP_K,
                    value=3,
                    step=1,
                    label="Top-K Predictions"
                )
                threshold = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    step=0.05,
                    label="Confidence Threshold"
                )

            submit_btn = gr.Button("Identify Language", variant="primary")

        with gr.Column():
            output = gr.Markdown(label="Predictions")

    # Examples with Kabyle and Occitan as defaults
    gr.Examples(
        examples=[
            ["Asebter-a yura s wudem awurman d amagrad s tutlayt taqbaylit."],
            ["L'interès es d'utilizar un sistèma liure, personalizable e en occitan."],
            ["Maskinsjefen er oppteken av å løfta fram dei maritime utdanningane."],
            ["The quick brown fox jumps over the lazy dog."],
            ["Le renard brun rapide saute par-dessus le chien paresseux."],
            ["El rápido zorro marrón salta sobre el perro perezoso."],
            ["Быстрая коричневая лисица прыгает через ленивую собаку."],
            ["快速的棕色狐狸跳过了懒惰的狗。"],
        ],
        inputs=input_text,
        label="Try these examples (Kabyle and Occitan featured)"
    )

    gr.Markdown(f"""
    ### Tips for best results:
    - Text is automatically preprocessed (lowercased, normalized)
    - Longer texts generally give more accurate predictions
    - The model supports 194+ language varieties
    - Use higher thresholds to filter out uncertain predictions
    - **Maximum input length: {MAX_INPUT_LENGTH:,} characters**
    """)

    # Event handlers: the button click and pressing Enter in the textbox
    # both run the same prediction with the same inputs and output.
    for trigger in (submit_btn.click, input_text.submit):
        trigger(
            fn=predict_language,
            inputs=[input_text, top_k, threshold],
            outputs=output
        )

if __name__ == "__main__":
    # Get port from environment (HF Spaces sets this)
    port = int(os.environ.get("PORT", 7860))

    try:
        demo.launch(
            server_name="0.0.0.0",
            server_port=port,
            ssr_mode=False,  # Disable experimental SSR to prevent the error
            share=False,
            show_error=True
        )
    except KeyboardInterrupt:
        print("\nShutting down gracefully...")
    finally:
        cleanup()