File size: 5,864 Bytes
9349334
 
 
 
 
36e285c
 
9349334
41a4e04
 
 
9349334
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41a4e04
 
 
 
9349334
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36e285c
 
 
 
 
 
 
 
 
 
 
 
 
9349334
 
2181426
 
 
 
 
 
9349334
 
 
 
 
 
 
 
41a4e04
 
9349334
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36e285c
9349334
 
 
 
 
 
 
 
 
 
 
41a4e04
9349334
 
 
 
 
41a4e04
9349334
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36e285c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import fasttext
from huggingface_hub import hf_hub_download
import regex
import gradio as gr
import os
import asyncio
import atexit

# Constants
# Upper bound on accepted input size, in characters. Checked in
# predict_language() and also passed to the input textbox as max_length.
MAX_INPUT_LENGTH = 10000  # Maximum characters allowed

# Preprocessing patterns
# Matches a single character that is neither a Unicode word character
# (\p{Word}) nor a space separator (\p{Zs}), or any digit (\d). The \p{...}
# property syntax is why the third-party `regex` module is used here.
NONWORD_REPLACE_STR = r"[^\p{Word}\p{Zs}]|\d"
NONWORD_REPLACE_PATTERN = regex.compile(NONWORD_REPLACE_STR)
# Matches a run of two or more whitespace characters.
SPACE_PATTERN = regex.compile(r"\s\s+")

def preprocess(text):
    """Normalize raw text before it is handed to the language identifier.

    The steps run in this fixed order:

    1. trim surrounding whitespace, turn newlines into spaces, lowercase;
    2. collapse every run matched by SPACE_PATTERN into a single space;
    3. delete every character matched by NONWORD_REPLACE_PATTERN.

    Returns the normalized string.
    """
    flattened = text.strip().replace('\n', ' ')
    lowered = flattened.lower()
    collapsed = SPACE_PATTERN.sub(" ", lowered)
    return NONWORD_REPLACE_PATTERN.sub("", collapsed)

# Load model once at startup
# These statements run at import time, so the single `model` object created
# here is the one predict_language() uses for every request.
print("Loading OpenLID-v3 model...")
# Fetch "openlid-v3.bin" from the HPLT/OpenLID-v3 repository on the Hugging
# Face Hub; the returned value is passed to fasttext.load_model() as the path
# of the model file.
model_path = hf_hub_download(
    repo_id="HPLT/OpenLID-v3", 
    filename="openlid-v3.bin"
)
model = fasttext.load_model(model_path)
print("Model loaded successfully!")

def predict_language(text, top_k=3, threshold=0.5):
    """
    Predict the language of the input text and format the result as Markdown.

    Args:
        text: Input text to analyze. ``None``, empty, or whitespace-only
            input produces a prompt message instead of a prediction.
        top_k: Number of top predictions to return. Coerced to an integer
            and clamped to the range 1-10.
        threshold: Confidence threshold (0.0-1.0) passed to the model.

    Returns:
        A Markdown string. On success it holds one
        ``**<label>**: <confidence>%`` entry per prediction, separated by
        blank lines. Otherwise it holds a human-readable message (input too
        long, nothing to analyze, no usable characters, or no prediction
        reached the threshold).
    """
    # Check input length first. The truthiness guard keeps a None value from
    # reaching len(), which would raise TypeError.
    if text and len(text) > MAX_INPUT_LENGTH:
        return f"**Error**: Input too long ({len(text):,} characters). Maximum allowed is {MAX_INPUT_LENGTH:,} characters."

    if not text or not text.strip():
        return "Please enter some text to analyze."

    # Preprocess
    processed_text = preprocess(text)

    if not processed_text.strip():
        return "Text contains no valid characters for language identification."

    # A UI slider may hand over a float (e.g. 3.0), so coerce to int. Clamp
    # to the documented 1-10 range so zero or negative values cannot reach
    # the model.
    k = max(1, min(int(top_k), 10))

    # Get predictions
    labels, scores = model.predict(
        text=processed_text,
        k=k,
        threshold=threshold,
        on_unicode_error="strict",
    )

    # Nothing cleared the threshold: say so instead of returning an empty
    # string, which would leave the output area blank.
    if not labels:
        return "No language met the confidence threshold. Try lowering the threshold."

    # Format results: remove the __label__ prefix and show a percentage.
    results = [
        f"**{label.replace('__label__', '')}**: {float(score) * 100:.2f}%"
        for label, score in zip(labels, scores)
    ]

    return "\n\n".join(results)

# Cleanup function to prevent async errors on shutdown
def cleanup():
    """
    Best-effort teardown of the event loop returned by asyncio.get_event_loop().

    A running loop is only asked to stop; a loop that is not running and not
    yet closed is closed. Calling this more than once is harmless because an
    already-closed loop is left alone. Failures are logged at debug level
    instead of being raised, so shutdown is never interrupted by this helper.
    """
    import logging

    try:
        loop = asyncio.get_event_loop()
        if loop.is_running():
            # close() on a running loop raises RuntimeError, so only
            # request a stop and leave the loop open.
            loop.stop()
        elif not loop.is_closed():
            loop.close()
    except Exception:
        # Deliberately broad: this runs from atexit and from a `finally`
        # clause, where raising would mask the real shutdown path. Record
        # the failure rather than swallowing it silently.
        logging.getLogger(__name__).debug(
            "Event loop cleanup skipped", exc_info=True
        )

atexit.register(cleanup)

# Create Gradio interface
# Layout: a header, then one row with two columns (inputs and controls on the
# left, the Markdown prediction output on the right), followed by clickable
# examples and a tips section. Component creation order inside the context
# managers defines the on-page order.
with gr.Blocks(title="OpenLID-v3 Language Identification") as demo:
    # Use HTML with target="_blank" to open in new tab
    gr.HTML("""
    <h1>OpenLID-v3 Language Identifier</h1>
    <p>Identify the language of any text with state-of-the-art accuracy.<br>
    Supports 194+ language varieties.</p>
    <p><em>Model: <a href="https://huggingface.co/HPLT/OpenLID-v3" target="_blank" rel="noopener noreferrer">HPLT/OpenLID-v3</a></em></p>
    """)
    
    with gr.Row():
        # Left column: text input, tuning sliders, and the submit button.
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to identify its language...",
                lines=5,
                max_lines=10,
                max_length=MAX_INPUT_LENGTH  # Also enforce in UI
            )
            # Slider defaults (3 and 0.5) match the defaults of
            # predict_language(top_k=3, threshold=0.5).
            with gr.Row():
                top_k = gr.Slider(
                    minimum=1, 
                    maximum=10, 
                    value=3, 
                    step=1, 
                    label="Top-K Predictions"
                )
                threshold = gr.Slider(
                    minimum=0.0, 
                    maximum=1.0, 
                    value=0.5, 
                    step=0.05, 
                    label="Confidence Threshold"
                )
            submit_btn = gr.Button("Identify Language", variant="primary")
        
        # Right column: predict_language() returns a Markdown string, which
        # is rendered here.
        with gr.Column():
            output = gr.Markdown(label="Predictions")
    
    # Examples with Kabyle and Occitan as defaults
    # Each example is a one-element list because the only bound input is
    # input_text.
    gr.Examples(
        examples=[
            ["Asebter-a yura s wudem awurman d amagrad s tutlayt taqbaylit."],
            ["L'interès es d'utilizar un sistèma liure, personalizable e en occitan."],
            ["Maskinsjefen er oppteken av å løfta fram dei maritime utdanningane."],
            ["The quick brown fox jumps over the lazy dog."],
            ["Le renard brun rapide saute par-dessus le chien paresseux."],
            ["El rápido zorro marrón salta sobre el perro perezoso."],
            ["Быстрая коричневая лисица прыгает через ленивую собаку."],
            ["快速的棕色狐狸跳过了懒惰的狗。"],
        ],
        inputs=input_text,
        label="Try these examples (Kabyle and Occitan featured)"
    )
    
    # f-string so the displayed limit always reflects MAX_INPUT_LENGTH.
    gr.Markdown(f"""
    ### Tips for best results:
    - Text is automatically preprocessed (lowercased, normalized)
    - Longer texts generally give more accurate predictions
    - The model supports 194+ language varieties
    - Use higher thresholds to filter out uncertain predictions
    - **Maximum input length: {MAX_INPUT_LENGTH:,} characters**
    """)
    
    # Event handlers
    # Clicking the button and submitting the textbox both run
    # predict_language with the same inputs and write to the same output.
    submit_btn.click(
        fn=predict_language,
        inputs=[input_text, top_k, threshold],
        outputs=output
    )
    
    input_text.submit(
        fn=predict_language,
        inputs=[input_text, top_k, threshold],
        outputs=output
    )

if __name__ == "__main__":
    # Listening port comes from the PORT environment variable when it is
    # set; otherwise fall back to 7860.
    port = int(os.getenv("PORT", "7860"))

    launch_options = dict(
        server_name="0.0.0.0",
        server_port=port,
        ssr_mode=False,  # Disable experimental SSR to prevent the error
        share=False,
        show_error=True,
    )

    try:
        demo.launch(**launch_options)
    except KeyboardInterrupt:
        print("\nShutting down gracefully...")
    finally:
        # Always attempt event-loop cleanup, however launch() ended.
        cleanup()