File size: 2,430 Bytes
8b1d1ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# app.py — Gradio demo for the multilingual language classifier (Hugging Face Space)
# Loads the fine-tuned XLM-RoBERTa model from the Hub and serves an interactive UI.
import os
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hub repository to load; the MODEL_ID environment variable overrides the default.
MODEL_ID = os.getenv("MODEL_ID", default="SashaSk/xlm-roberta-language-id")

# Display names keyed by the ISO language code the model emits (20 entries).
# classify() falls back to the bare code for anything missing from this table.
LANG_NAMES = {
    "ar": "Arabic",
    "bg": "Bulgarian",
    "de": "German",
    "el": "Greek",
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "hi": "Hindi",
    "it": "Italian",
    "ja": "Japanese",
    "nl": "Dutch",
    "pl": "Polish",
    "pt": "Portuguese",
    "ru": "Russian",
    "sw": "Swahili",
    "th": "Thai",
    "tr": "Turkish",
    "ur": "Urdu",
    "vi": "Vietnamese",
    "zh": "Chinese",
}

# Load the tokenizer and classifier once at import time so every request reuses them.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
model.eval()  # switch to evaluation mode; the app only runs inference
# Maps class index -> language code, as stored in the model config.
id2label = model.config.id2label


def classify(text: str):
    """Score *text* against every language label the model knows.

    Returns a dict mapping ``"<Language name> (<code>)"`` to its softmax
    probability, the form consumed by gr.Label. Empty or whitespace-only
    input returns an empty dict without running the model.
    """
    if not (text and text.strip()):
        return {}

    with torch.no_grad():
        encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
        logits = model(**encoded).logits
        # A single input was encoded, so take row 0 of the batch dimension.
        scores = torch.softmax(logits, dim=-1)[0]

    # Pair each class index's language code with its probability.
    coded = ((id2label[idx], score) for idx, score in enumerate(scores.tolist()))
    # Codes missing from LANG_NAMES are shown using the bare code as the name.
    return {f"{LANG_NAMES.get(code, code)} ({code})": score for code, score in coded}


# Sample inputs offered in the UI; each sentence is wrapped in its own
# one-element list (one value per example row for the single text input).
EXAMPLES = [
    [sentence]
    for sentence in (
        "Bonjour, comment allez-vous aujourd'hui ?",
        "¿Dónde está la biblioteca más cercana?",
        "こんにちは、お元気ですか?",
        "Привет, как у тебя дела?",
        "مرحبا، كيف حالك اليوم؟",
        "Guten Tag, schön Sie kennenzulernen.",
    )
]

# Build the page: a heading, then a row holding the input box and the prediction label.
with gr.Blocks(title="Multilingual Language Classifier") as demo:
    gr.Markdown(
        value=(
            "# 🌍 Multilingual Language Classifier\n"
            "Fine-tuned **XLM-RoBERTa** detecting one of **20 languages** — "
            "**99.6%** test accuracy. Type or pick an example below."
        )
    )
    with gr.Row():
        inp = gr.Textbox(
            lines=3,
            label="Text",
            placeholder="Type text in any of the 20 supported languages…",
        )
        # gr.Label is limited to five classes even though classify() scores every label.
        out = gr.Label(label="Predicted language (top 5)", num_top_classes=5)

    # Re-run the classifier each time the text box value changes.
    inp.change(fn=classify, inputs=inp, outputs=out)
    gr.Examples(examples=EXAMPLES, inputs=inp)


# Start the server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()