# Source: Hugging Face Space "boffire/OpenLID-v3_test" (status when captured: Sleeping)
# File size: 5,864 Bytes

import fasttext
from huggingface_hub import hf_hub_download
import regex
import gradio as gr
import os
import asyncio
import atexit

# Upper bound on the number of characters accepted per request.
MAX_INPUT_LENGTH = 10_000

# Text-normalisation patterns used by preprocess().
# NONWORD_REPLACE_STR matches any character that is neither a Unicode word
# character nor a space separator, as well as every digit.
NONWORD_REPLACE_STR = r"[^\p{Word}\p{Zs}]|\d"
NONWORD_REPLACE_PATTERN = regex.compile(NONWORD_REPLACE_STR)
# Runs of two or more whitespace characters.
SPACE_PATTERN = regex.compile(r"\s\s+")

def preprocess(text):
    """Normalise raw text into the form passed to the language identifier.

    The steps run in this order: trim surrounding whitespace, turn newlines
    into spaces, lowercase, collapse runs of two or more whitespace
    characters into a single space, then delete every character matched by
    NONWORD_REPLACE_PATTERN (non-word/non-space characters and digits).
    """
    flattened = text.strip().replace('\n', ' ')
    lowered = flattened.lower()
    collapsed = SPACE_PATTERN.sub(" ", lowered)
    return NONWORD_REPLACE_PATTERN.sub("", collapsed)

# The model file is fetched via huggingface_hub and loaded a single time at
# import, so every prediction request reuses the same `model` object.
print("Loading OpenLID-v3 model...")
model_path = hf_hub_download(repo_id="HPLT/OpenLID-v3", filename="openlid-v3.bin")
model = fasttext.load_model(model_path)
print("Model loaded successfully!")

def predict_language(text, top_k=3, threshold=0.5):
    """
    Predict the language of the input text and format the result as Markdown.

    Args:
        text: Input text to analyze. ``None``, empty and whitespace-only
            input is answered with a prompt instead of a prediction.
        top_k: Number of top predictions to return; coerced to an integer
            and clamped to the range 1-10.
        threshold: Confidence threshold (0.0-1.0) passed to the model.

    Returns:
        A Markdown string. On success it holds one
        ``**<label>**: <confidence>%`` entry per prediction, separated by
        blank lines. Otherwise it holds a user-facing message (missing
        input, input too long, nothing left after preprocessing, or no
        prediction above the threshold).
    """
    # Test for missing input before measuring it: len(None) would raise
    # TypeError if the caller passes no text at all.
    if not text or not text.strip():
        return "Please enter some text to analyze."

    if len(text) > MAX_INPUT_LENGTH:
        return f"**Error**: Input too long ({len(text):,} characters). Maximum allowed is {MAX_INPUT_LENGTH:,} characters."

    processed_text = preprocess(text)

    # Preprocessing deletes punctuation and digits, so e.g. "123 !!!" ends
    # up empty even though the raw input was not.
    if not processed_text.strip():
        return "Text contains no valid characters for language identification."

    # top_k may arrive as a float (e.g. 3.0) from a UI slider, while the
    # model call expects an integer; clamp so k stays within the documented
    # 1-10 range.
    k = max(1, min(int(top_k), 10))

    labels, scores = model.predict(
        text=processed_text,
        k=k,
        threshold=threshold,
        on_unicode_error="strict",
    )

    # Without this the function would return "" and the output panel would
    # stay blank, giving the user no hint about what happened.
    if len(labels) == 0:
        return (
            f"No language reached the confidence threshold ({threshold:.2f}). "
            "Try lowering the threshold."
        )

    # Drop the "__label__" marker from each label and show the score as a
    # percentage.
    results = [
        f"**{label.replace('__label__', '')}**: {float(score) * 100:.2f}%"
        for label, score in zip(labels, scores)
    ]

    return "\n\n".join(results)

# Cleanup function to prevent async errors on shutdown
def cleanup():
    """Best-effort shutdown of the current thread's asyncio event loop.

    A loop that is still running is only asked to stop; a loop that is idle
    is closed. Calling this more than once is harmless: an already closed
    loop is left alone. The function is registered with ``atexit`` below.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # No event loop is available for this thread: nothing to clean up.
        return

    if loop.is_closed():
        return

    try:
        if loop.is_running():
            # stop() only requests that the loop exit; close() on a loop
            # that is still running raises RuntimeError, so closing is left
            # to whoever is running the loop.
            loop.stop()
        else:
            loop.close()
    except RuntimeError:
        # The loop changed state between the checks and the call; shutdown
        # is best-effort, so there is nothing further to do.
        pass


atexit.register(cleanup)

# Gradio UI: header, input column (text box, sliders, button), output column,
# example sentences, usage tips, and the event wiring.
with gr.Blocks(title="OpenLID-v3 Language Identification") as demo:
    # Raw HTML is used so the model link can carry target="_blank".
    gr.HTML("""
    <h1>OpenLID-v3 Language Identifier</h1>
    <p>Identify the language of any text with state-of-the-art accuracy.<br>
    Supports 194+ language varieties.</p>
    <p><em>Model: <a href="https://huggingface.co/HPLT/OpenLID-v3" target="_blank" rel="noopener noreferrer">HPLT/OpenLID-v3</a></em></p>
    """)

    with gr.Row():
        # Left column: everything the user fills in.
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to identify its language...",
                lines=5,
                max_lines=10,
                # Same limit that predict_language() checks.
                max_length=MAX_INPUT_LENGTH,
            )
            with gr.Row():
                top_k = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Top-K Predictions")
                threshold = gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.05, label="Confidence Threshold")
            submit_btn = gr.Button("Identify Language", variant="primary")

        # Right column: the formatted predictions.
        with gr.Column():
            output = gr.Markdown(label="Predictions")

    # Clickable sample sentences; each one fills the input text box.
    gr.Examples(
        examples=[
            ["Asebter-a yura s wudem awurman d amagrad s tutlayt taqbaylit."],
            ["L'interès es d'utilizar un sistèma liure, personalizable e en occitan."],
            ["Maskinsjefen er oppteken av å løfta fram dei maritime utdanningane."],
            ["The quick brown fox jumps over the lazy dog."],
            ["Le renard brun rapide saute par-dessus le chien paresseux."],
            ["El rápido zorro marrón salta sobre el perro perezoso."],
            ["Быстрая коричневая лисица прыгает через ленивую собаку."],
            ["快速的棕色狐狸跳过了懒惰的狗。"],
        ],
        inputs=input_text,
        label="Try these examples (Kabyle and Occitan featured)",
    )

    gr.Markdown(f"""
    ### Tips for best results:
    - Text is automatically preprocessed (lowercased, normalized)
    - Longer texts generally give more accurate predictions
    - The model supports 194+ language varieties
    - Use higher thresholds to filter out uncertain predictions
    - **Maximum input length: {MAX_INPUT_LENGTH:,} characters**
    """)

    # Clicking the button and submitting the text box both run the same
    # prediction with the same inputs and output.
    for bind_event in (submit_btn.click, input_text.submit):
        bind_event(
            fn=predict_language,
            inputs=[input_text, top_k, threshold],
            outputs=output,
        )

if __name__ == "__main__":
    # The port is read from the PORT environment variable, defaulting to 7860.
    port = int(os.environ.get("PORT", 7860))

    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": port,
        # Experimental SSR is switched off to avoid an error seen with it on.
        "ssr_mode": False,
        "share": False,
        "show_error": True,
    }

    try:
        demo.launch(**launch_options)
    except KeyboardInterrupt:
        print("\nShutting down gracefully...")
    finally:
        # Runs whether launch() returned, was interrupted, or raised.
        cleanup()