# app.py — OpenLID-v3 language identification demo (Hugging Face Space)
import fasttext
from huggingface_hub import hf_hub_download
import regex
import gradio as gr
import os
import asyncio
import atexit
# Constants
MAX_INPUT_LENGTH = 10000 # Maximum characters allowed

# Preprocessing patterns
# Matches any character that is neither a word character nor a space
# separator, plus every digit — all of these are stripped before language
# identification (requires the third-party `regex` module for \p{...}).
NONWORD_REPLACE_STR = r"[^\p{Word}\p{Zs}]|\d"
NONWORD_REPLACE_PATTERN = regex.compile(NONWORD_REPLACE_STR)
# Matches runs of two or more whitespace characters (collapsed to one space).
SPACE_PATTERN = regex.compile(r"\s\s+")
def preprocess(text):
    """Normalize text for fasttext language identification.

    Lowercases the text, strips punctuation/symbols/digits, and collapses
    every whitespace run (including newlines, which fasttext's predict
    rejects) to a single space.

    Args:
        text: Raw input string.

    Returns:
        The normalized string, trimmed of leading/trailing whitespace.
    """
    text = text.lower()
    # Remove non-word characters and digits FIRST: deleting them can leave
    # adjacent whitespace runs (e.g. "a - b" -> "a  b") that the collapse
    # below must clean up. The original order left those double spaces in.
    text = NONWORD_REPLACE_PATTERN.sub("", text)
    # Collapse ALL whitespace runs; the original \s\s+ pattern missed a
    # lone tab, and this also converts newlines to spaces in one pass.
    text = regex.sub(r"\s+", " ", text)
    return text.strip()
# Load model once at startup so every request reuses the same in-memory
# model instance. hf_hub_download caches the file locally after the first
# run, so restarts don't re-download the weights.
print("Loading OpenLID-v3 model...")
model_path = hf_hub_download(
    repo_id="HPLT/OpenLID-v3",
    filename="openlid-v3.bin"
)
model = fasttext.load_model(model_path)
print("Model loaded successfully!")
def predict_language(text, top_k=3, threshold=0.5):
    """
    Predict the language of input text with the OpenLID-v3 model.

    Args:
        text: Input text to analyze.
        top_k: Number of top predictions to return (clamped to 1-10).
        threshold: Confidence threshold (0.0-1.0); candidates below it
            are dropped by fasttext.

    Returns:
        A markdown-formatted list of predictions, or a human-readable
        error/help message for invalid, empty, or filtered-out input.
    """
    # Check input length first, before any processing.
    if len(text) > MAX_INPUT_LENGTH:
        return f"**Error**: Input too long ({len(text):,} characters). Maximum allowed is {MAX_INPUT_LENGTH:,} characters."

    if not text or not text.strip():
        return "Please enter some text to analyze."

    # Preprocess (lowercase, strip punctuation/digits, collapse whitespace).
    processed_text = preprocess(text)
    if not processed_text.strip():
        return "Text contains no valid characters for language identification."

    # Gradio sliders can deliver floats; fasttext requires an integer k.
    # Clamp to [1, 10] to match the UI's advertised range.
    k = max(1, min(int(top_k), 10))

    labels, scores = model.predict(
        text=processed_text,
        k=k,
        threshold=threshold,
        on_unicode_error="strict",
    )

    # A high threshold can filter out every candidate; tell the user why
    # instead of silently returning an empty string.
    if not labels:
        return "No prediction met the confidence threshold. Try lowering the threshold."

    # Format results as markdown lines.
    results = []
    for label, score in zip(labels, scores):
        # Remove fasttext's __label__ prefix and show confidence as percent.
        lang_code = label.replace("__label__", "")
        confidence = float(score) * 100
        results.append(f"**{lang_code}**: {confidence:.2f}%")

    return "\n\n".join(results)
# Best-effort teardown of the asyncio event loop, registered for exit so
# Gradio's background machinery doesn't emit async errors on shutdown.
def cleanup():
    """Stop and close any lingering asyncio event loop.

    Runs at interpreter exit (and again from the __main__ finally block);
    every failure here is deliberately swallowed — there is nothing useful
    to do about a broken loop while the process is already going away.
    """
    try:
        active_loop = asyncio.get_event_loop()
        if active_loop.is_running():
            active_loop.stop()
        if not active_loop.is_closed():
            active_loop.close()
    except Exception:
        # Loop may be absent, already closed, or owned by another thread.
        pass


atexit.register(cleanup)
# Create Gradio interface. All UI construction happens inside the Blocks
# context manager; the event wiring at the bottom connects both the button
# and the textbox's Enter key to predict_language.
with gr.Blocks(title="OpenLID-v3 Language Identification") as demo:
    # Header rendered as raw HTML so the model link can use
    # target="_blank" and open in a new tab.
    gr.HTML("""
    <h1>OpenLID-v3 Language Identifier</h1>
    <p>Identify the language of any text with state-of-the-art accuracy.<br>
    Supports 194+ language varieties.</p>
    <p><em>Model: <a href="https://huggingface.co/HPLT/OpenLID-v3" target="_blank" rel="noopener noreferrer">HPLT/OpenLID-v3</a></em></p>
    """)

    with gr.Row():
        with gr.Column():
            # Free-text input; max_length mirrors the server-side limit
            # that predict_language also enforces.
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to identify its language...",
                lines=5,
                max_lines=10,
                max_length=MAX_INPUT_LENGTH  # Also enforce in UI
            )

            with gr.Row():
                # Number of candidate languages to show (forwarded as k
                # to fasttext's predict).
                top_k = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=3,
                    step=1,
                    label="Top-K Predictions"
                )
                # Minimum confidence for a candidate to be listed.
                threshold = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    step=0.05,
                    label="Confidence Threshold"
                )

            submit_btn = gr.Button("Identify Language", variant="primary")

        with gr.Column():
            # predict_language returns markdown text, rendered here.
            output = gr.Markdown(label="Predictions")

    # Examples with Kabyle and Occitan as defaults.
    gr.Examples(
        examples=[
            ["Asebter-a yura s wudem awurman d amagrad s tutlayt taqbaylit."],
            ["L'interès es d'utilizar un sistèma liure, personalizable e en occitan."],
            ["Maskinsjefen er oppteken av å løfta fram dei maritime utdanningane."],
            ["The quick brown fox jumps over the lazy dog."],
            ["Le renard brun rapide saute par-dessus le chien paresseux."],
            ["El rápido zorro marrón salta sobre el perro perezoso."],
            ["Быстрая коричневая лисица прыгает через ленивую собаку."],
            ["快速的棕色狐狸跳过了懒惰的狗。"],
        ],
        inputs=input_text,
        label="Try these examples (Kabyle and Occitan featured)"
    )

    # Usage tips; f-string interpolates the shared input-length limit so
    # the UI text stays in sync with the enforced constant.
    gr.Markdown(f"""
    ### Tips for best results:
    - Text is automatically preprocessed (lowercased, normalized)
    - Longer texts generally give more accurate predictions
    - The model supports 194+ language varieties
    - Use higher thresholds to filter out uncertain predictions
    - **Maximum input length: {MAX_INPUT_LENGTH:,} characters**
    """)

    # Event handlers: button click and Enter in the textbox run the same
    # prediction with the same inputs.
    submit_btn.click(
        fn=predict_language,
        inputs=[input_text, top_k, threshold],
        outputs=output
    )
    input_text.submit(
        fn=predict_language,
        inputs=[input_text, top_k, threshold],
        outputs=output
    )
if __name__ == "__main__":
    # Get port from environment (HF Spaces sets this); fall back to
    # Gradio's default 7860 for local runs.
    port = int(os.environ.get("PORT", 7860))
    try:
        demo.launch(
            server_name="0.0.0.0",  # bind all interfaces (required in a container)
            server_port=port,
            ssr_mode=False,  # Disable experimental SSR to prevent the error
            share=False,
            show_error=True
        )
    except KeyboardInterrupt:
        print("\nShutting down gracefully...")
    finally:
        # Also registered via atexit, but calling it here makes Ctrl+C
        # shutdown deterministic; cleanup() is safe to run twice.
        # (Removed a stray trailing "|" artifact that broke the syntax.)
        cleanup()