# OpenLID-v3_test / app.py
# Web-page header residue kept for provenance: "boffire's picture",
# commit "Update app.py", 36e285c (verified).
import fasttext
from huggingface_hub import hf_hub_download
import regex
import gradio as gr
import os
import asyncio
import atexit
# --- Limits ---
# Largest number of characters accepted for a single prediction request.
MAX_INPUT_LENGTH = 10_000

# --- Preprocessing patterns ---
# Two or more consecutive whitespace characters.
SPACE_PATTERN = regex.compile(r"\s\s+")
# Any character that is neither a word character (\p{Word}) nor a space
# separator (\p{Zs}), plus every digit.
NONWORD_REPLACE_STR = r"[^\p{Word}\p{Zs}]|\d"
NONWORD_REPLACE_PATTERN = regex.compile(NONWORD_REPLACE_STR)
def preprocess(text):
    """Normalize raw text before it is handed to the language identifier.

    The steps run in this order: trim both ends, turn newlines into spaces,
    lowercase, squeeze every run of two or more whitespace characters into a
    single space, then delete every character matched by
    ``NONWORD_REPLACE_PATTERN``.
    """
    flattened = text.strip().replace('\n', ' ')
    lowered = flattened.lower()
    squeezed = SPACE_PATTERN.sub(" ", lowered)
    return NONWORD_REPLACE_PATTERN.sub("", squeezed)
# Resolve the OpenLID-v3 model file from the Hugging Face Hub and load it a
# single time at import, so every prediction reuses the same `model` object.
print("Loading OpenLID-v3 model...")
model_path = hf_hub_download(repo_id="HPLT/OpenLID-v3", filename="openlid-v3.bin")
model = fasttext.load_model(model_path)
print("Model loaded successfully!")
def predict_language(text, top_k=3, threshold=0.5):
    """
    Predict the language of the input text.

    Args:
        text: Input text to analyze. ``None``, empty, or whitespace-only
            input yields a prompt message instead of an error.
        top_k: Number of top predictions to return. Coerced to an integer
            and clamped to the range 1-10.
        threshold: Confidence threshold (0.0-1.0) passed to the model.

    Returns:
        A Markdown string with one ``**<code>**: <confidence>%`` entry per
        prediction, separated by blank lines, or a human-readable message
        when the input is missing, too long, empty after preprocessing, or
        when the model returns no prediction.
    """
    # Reject missing/blank input first so that ``None`` never reaches len().
    if not text or not text.strip():
        return "Please enter some text to analyze."

    # Enforce the size limit on the raw input, before any processing.
    if len(text) > MAX_INPUT_LENGTH:
        return f"**Error**: Input too long ({len(text):,} characters). Maximum allowed is {MAX_INPUT_LENGTH:,} characters."

    # Preprocess
    processed_text = preprocess(text)
    if not processed_text.strip():
        return "Text contains no valid characters for language identification."

    # UI sliders and API callers may deliver a float (e.g. 3.0) or a value
    # outside the documented range; normalize to an int within 1-10.
    k = max(1, min(int(top_k), 10))

    # Get predictions
    labels, scores = model.predict(
        text=processed_text,
        k=k,
        threshold=threshold,
        on_unicode_error="strict",
    )

    # Format results: strip the __label__ prefix and show a percentage.
    results = []
    for label, score in zip(labels, scores):
        lang_code = label.replace("__label__", "")
        confidence = float(score) * 100
        results.append(f"**{lang_code}**: {confidence:.2f}%")

    # An empty result would render as a blank panel; explain it instead.
    if not results:
        return "No language predicted above the confidence threshold. Try lowering the threshold."
    return "\n\n".join(results)
# Cleanup function to prevent async errors on shutdown
def cleanup():
    """Best-effort shutdown of the current asyncio event loop.

    Safe to call more than once: a loop that is already closed is left
    alone. A running loop is only asked to stop, because ``close()`` on a
    loop that is still running raises ``RuntimeError``. A loop that is idle
    and still open is closed.

    Returns:
        None. Failures to obtain or close the loop are deliberately ignored
        so that shutdown itself never raises.
    """
    import warnings  # local import: only needed on this shutdown path

    try:
        with warnings.catch_warnings():
            # Depending on the Python version, asking for the event loop
            # when none is current either warns or raises; neither should
            # surface during shutdown.
            warnings.simplefilter("ignore", DeprecationWarning)
            loop = asyncio.get_event_loop()
    except RuntimeError:
        # No event loop is available here: nothing to clean up.
        return

    try:
        if loop.is_running():
            # stop() only takes effect once the loop finishes its current
            # iteration, so the loop cannot be closed from here.
            loop.stop()
        elif not loop.is_closed():
            loop.close()
    except (RuntimeError, OSError):
        # Best-effort: shutdown must not fail because the loop refused to
        # stop or close.
        pass


atexit.register(cleanup)
# Build the Gradio UI: one column for the input controls, one for the output.
with gr.Blocks(title="OpenLID-v3 Language Identification") as demo:
    # Raw HTML header so the model link can carry target="_blank"
    # (opens in a new tab).
    gr.HTML("""
<h1>OpenLID-v3 Language Identifier</h1>
<p>Identify the language of any text with state-of-the-art accuracy.<br>
Supports 194+ language varieties.</p>
<p><em>Model: <a href="https://huggingface.co/HPLT/OpenLID-v3" target="_blank" rel="noopener noreferrer">HPLT/OpenLID-v3</a></em></p>
""")

    with gr.Row():
        # Input column: text box, the two tuning sliders, and the button.
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to identify its language...",
                lines=5,
                max_lines=10,
                max_length=MAX_INPUT_LENGTH,  # Also enforce in UI
            )
            with gr.Row():
                top_k = gr.Slider(
                    label="Top-K Predictions",
                    minimum=1,
                    maximum=10,
                    step=1,
                    value=3,
                )
                threshold = gr.Slider(
                    label="Confidence Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.5,
                )
            submit_btn = gr.Button("Identify Language", variant="primary")

        # Output column: predictions rendered as Markdown.
        with gr.Column():
            output = gr.Markdown(label="Predictions")

    # Clickable sample sentences; the Kabyle and Occitan ones come first.
    example_sentences = [
        "Asebter-a yura s wudem awurman d amagrad s tutlayt taqbaylit.",
        "L'interès es d'utilizar un sistèma liure, personalizable e en occitan.",
        "Maskinsjefen er oppteken av å løfta fram dei maritime utdanningane.",
        "The quick brown fox jumps over the lazy dog.",
        "Le renard brun rapide saute par-dessus le chien paresseux.",
        "El rápido zorro marrón salta sobre el perro perezoso.",
        "Быстрая коричневая лисица прыгает через ленивую собаку.",
        "快速的棕色狐狸跳过了懒惰的狗。",
    ]
    gr.Examples(
        examples=[[sentence] for sentence in example_sentences],
        inputs=input_text,
        label="Try these examples (Kabyle and Occitan featured)",
    )

    gr.Markdown(f"""
### Tips for best results:
- Text is automatically preprocessed (lowercased, normalized)
- Longer texts generally give more accurate predictions
- The model supports 194+ language varieties
- Use higher thresholds to filter out uncertain predictions
- **Maximum input length: {MAX_INPUT_LENGTH:,} characters**
""")

    # Event handlers: the button click and pressing Enter in the text box
    # both run the same prediction with the same inputs and output.
    for trigger in (submit_btn.click, input_text.submit):
        trigger(
            fn=predict_language,
            inputs=[input_text, top_k, threshold],
            outputs=output,
        )
if __name__ == "__main__":
    # Use the PORT environment variable when present (HF Spaces sets this),
    # otherwise fall back to 7860.
    port = int(os.getenv("PORT", "7860"))
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=port,
        ssr_mode=False,  # Disable experimental SSR to prevent the error
        share=False,
        show_error=True,
    )
    try:
        demo.launch(**launch_options)
    except KeyboardInterrupt:
        print("\nShutting down gracefully...")
    finally:
        # Runs on every exit path, in addition to the atexit hook.
        cleanup()