# OpenLID-v3 language-identification demo (Hugging Face Space).
| import fasttext | |
| from huggingface_hub import hf_hub_download | |
| import regex | |
| import gradio as gr | |
| import os | |
| import asyncio | |
| import atexit | |
# Constants
MAX_INPUT_LENGTH = 10000  # Maximum characters accepted per request (UI and server-side)

# Preprocessing patterns (compiled with the third-party `regex` module,
# which supports the \p{...} Unicode property classes used below).
# Matches any character that is neither a word character nor a space
# separator, plus all decimal digits; such characters are stripped out.
NONWORD_REPLACE_STR = r"[^\p{Word}\p{Zs}]|\d"
NONWORD_REPLACE_PATTERN = regex.compile(NONWORD_REPLACE_STR)
# Matches runs of two or more whitespace characters (collapsed to one space).
SPACE_PATTERN = regex.compile(r"\s\s+")
def preprocess(text):
    """Normalize raw user text into the form the LID model expects.

    Flattens the text to one lower-cased line, collapses whitespace runs,
    and strips punctuation, symbols, and digits.
    """
    # Single lower-cased line: newlines become spaces.
    normalized = text.strip().replace('\n', ' ').lower()
    # Collapse any run of whitespace to a single space.
    normalized = SPACE_PATTERN.sub(" ", normalized)
    # Drop everything that is neither a word character nor a space separator.
    return NONWORD_REPLACE_PATTERN.sub("", normalized)
# Load model once at startup
print("Loading OpenLID-v3 model...")
# Fetch the fastText binary from the Hugging Face Hub (hf_hub_download
# returns a local file path; presumably served from the local cache on
# subsequent runs — standard hub behavior).
model_path = hf_hub_download(
    repo_id="HPLT/OpenLID-v3",
    filename="openlid-v3.bin"
)
# Module-level singleton shared by all prediction requests.
model = fasttext.load_model(model_path)
print("Model loaded successfully!")
def predict_language(text, top_k=3, threshold=0.5):
    """Identify the language(s) of ``text`` using the OpenLID-v3 model.

    Args:
        text: Input text to analyze.
        top_k: Number of top predictions to return (clamped to 1-10).
        threshold: Confidence threshold (0.0-1.0); candidates below it
            are dropped.

    Returns:
        Markdown-formatted prediction lines, or a human-readable message
        when the input is unusable or no candidate clears the threshold.
    """
    # Check input length first, before any preprocessing work.
    if len(text) > MAX_INPUT_LENGTH:
        return f"**Error**: Input too long ({len(text):,} characters). Maximum allowed is {MAX_INPUT_LENGTH:,} characters."

    if not text or not text.strip():
        return "Please enter some text to analyze."

    # Preprocess (lowercase, collapse whitespace, strip punctuation/digits).
    processed_text = preprocess(text)
    if not processed_text.strip():
        return "Text contains no valid characters for language identification."

    # fastText requires an integer k; Gradio sliders may deliver floats.
    # Clamp to the 1-10 range advertised in the UI.
    k = max(1, min(int(top_k), 10))

    # Get predictions from the shared module-level model.
    labels, scores = model.predict(
        text=processed_text,
        k=k,
        threshold=threshold,
        on_unicode_error="strict",
    )

    # If every candidate fell below the threshold, say so instead of
    # silently returning an empty string.
    if not labels:
        return f"No prediction reached the confidence threshold of {threshold:.2f}. Try lowering the threshold."

    # Format results as Markdown, one prediction per paragraph.
    results = []
    for label, score in zip(labels, scores):
        # Strip fastText's "__label__" prefix, e.g. "__label__eng_Latn" -> "eng_Latn".
        lang_code = label.replace("__label__", "")
        confidence = float(score) * 100
        results.append(f"**{lang_code}**: {confidence:.2f}%")
    return "\n\n".join(results)
# Best-effort teardown: stop and close any lingering asyncio event loop so
# the process does not emit async errors on interpreter shutdown.
def cleanup():
    """Stop and close the current asyncio event loop, swallowing all errors."""
    try:
        event_loop = asyncio.get_event_loop()
    except Exception:
        return  # no loop obtainable; nothing to tear down
    try:
        if event_loop.is_running():
            event_loop.stop()
        if not event_loop.is_closed():
            event_loop.close()
    except Exception:
        pass  # shutdown cleanup is deliberately best-effort


atexit.register(cleanup)
# Create Gradio interface
with gr.Blocks(title="OpenLID-v3 Language Identification") as demo:
    # Raw HTML header; target="_blank" opens the model card in a new tab.
    gr.HTML("""
    <h1>OpenLID-v3 Language Identifier</h1>
    <p>Identify the language of any text with state-of-the-art accuracy.<br>
    Supports 194+ language varieties.</p>
    <p><em>Model: <a href="https://huggingface.co/HPLT/OpenLID-v3" target="_blank" rel="noopener noreferrer">HPLT/OpenLID-v3</a></em></p>
    """)
    with gr.Row():
        with gr.Column():
            # Input length is enforced here in the UI as well as
            # server-side in predict_language.
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to identify its language...",
                lines=5,
                max_lines=10,
                max_length=MAX_INPUT_LENGTH  # Also enforce in UI
            )
            with gr.Row():
                # Number of candidate languages to show (fastText's k).
                top_k = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=3,
                    step=1,
                    label="Top-K Predictions"
                )
                # Minimum confidence a prediction must reach to be listed.
                threshold = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    step=0.05,
                    label="Confidence Threshold"
                )
            submit_btn = gr.Button("Identify Language", variant="primary")
        with gr.Column():
            # Predictions are rendered as Markdown paragraphs.
            output = gr.Markdown(label="Predictions")
    # Examples with Kabyle and Occitan as defaults
    gr.Examples(
        examples=[
            ["Asebter-a yura s wudem awurman d amagrad s tutlayt taqbaylit."],
            ["L'interès es d'utilizar un sistèma liure, personalizable e en occitan."],
            ["Maskinsjefen er oppteken av å løfta fram dei maritime utdanningane."],
            ["The quick brown fox jumps over the lazy dog."],
            ["Le renard brun rapide saute par-dessus le chien paresseux."],
            ["El rápido zorro marrón salta sobre el perro perezoso."],
            ["Быстрая коричневая лисица прыгает через ленивую собаку."],
            ["快速的棕色狐狸跳过了懒惰的狗。"],
        ],
        inputs=input_text,
        label="Try these examples (Kabyle and Occitan featured)"
    )
    gr.Markdown(f"""
    ### Tips for best results:
    - Text is automatically preprocessed (lowercased, normalized)
    - Longer texts generally give more accurate predictions
    - The model supports 194+ language varieties
    - Use higher thresholds to filter out uncertain predictions
    - **Maximum input length: {MAX_INPUT_LENGTH:,} characters**
    """)
    # Event handlers: button click and pressing Enter in the textbox both
    # invoke the same prediction function with the same inputs.
    submit_btn.click(
        fn=predict_language,
        inputs=[input_text, top_k, threshold],
        outputs=output
    )
    input_text.submit(
        fn=predict_language,
        inputs=[input_text, top_k, threshold],
        outputs=output
    )
if __name__ == "__main__":
    # Collect launch options in one place. HF Spaces injects PORT into the
    # environment; fall back to Gradio's conventional 7860 for local runs.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": int(os.environ.get("PORT", 7860)),
        "ssr_mode": False,  # Disable experimental SSR to prevent the error
        "share": False,
        "show_error": True,
    }
    try:
        demo.launch(**launch_options)
    except KeyboardInterrupt:
        print("\nShutting down gracefully...")
    finally:
        # Run the loop teardown explicitly as well as via atexit.
        cleanup()