YAML Metadata Warning: the repo card has empty or missing YAML metadata.
Check out the Hub documentation for more information.
langid-mmbert-small-8gpu
Language identification model based on jhu-clsp/mmBERT-small for 14 classes:
ko, no, da, sv, fi, nl, en, fr, de, es, pt, it, ja, UNKNOWN
Inference Guide
This project supports two inference styles:
`pipeline("text-classification")` for quick usage. `AutoTokenizer` + `AutoModelForSequenceClassification` for explicit forward-pass control.
Both can use a fast UNKNOWN pre-check:
import re

# Matches http(s) links as well as bare "www." URLs, case-insensitively.
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+", flags=re.IGNORECASE)

def fast_detect_unknown(text: str) -> bool:
    """Cheap heuristic pre-filter: return True when *text* should be UNKNOWN.

    Flags empty/whitespace-only input, anything containing a URL,
    digit-dominated strings, symbol-heavy strings, low-alphabetic
    gibberish, and long compact hash/ID-like tokens — none of which
    are worth sending through the classifier.
    """
    stripped = text.strip()
    # Whitespace-only input carries no language signal.
    if not stripped:
        return True
    # Any URL anywhere in the text maps straight to UNKNOWN.
    if URL_PATTERN.search(stripped):
        return True

    length = len(stripped)
    n_alpha = sum(1 for ch in stripped if ch.isalpha())
    n_digit = sum(1 for ch in stripped if ch.isdigit())
    n_space = sum(1 for ch in stripped if ch.isspace())
    n_symbol = length - n_alpha - n_digit - n_space
    denom = max(1, length - n_space)  # guard against division by zero
    alpha_ratio = n_alpha / denom

    # Mostly numeric strings (ids, phone numbers, etc.).
    if n_digit / denom >= 0.8:
        return True
    # Symbol-dominated text is rarely valid language content.
    if n_symbol / denom >= 0.45:
        return True
    # Barely any letters in a non-trivial string -> gibberish-like.
    if length >= 6 and alpha_ratio < 0.2:
        return True
    # A single long token mixing letters with digits/symbols looks like
    # a hash, username, or key rather than a word.
    if (
        " " not in stripped
        and length >= 12
        and alpha_ratio < 0.45
        and (n_digit > 0 or n_symbol > 0)
    ):
        return True
    return False
Option A: Pipeline
import torch
from transformers import pipeline

model_id = "chiennv/langid-mmbert-small-8gpu"

# pipeline() takes a GPU index (0, 1, ...) or -1 to stay on CPU.
gpu_or_cpu = 0 if torch.cuda.is_available() else -1
clf = pipeline(
    "text-classification",
    model=model_id,
    tokenizer=model_id,
    top_k=1,
    device=gpu_or_cpu,
)

text = "Bonjour tout le monde"
if fast_detect_unknown(text):
    # Short-circuit: the heuristic already says this is not real language.
    print({"label": "UNKNOWN", "score": 1.0})
else:
    best = clf(text)[0][0]  # top_k=1 -> [[{"label": ..., "score": ...}]]
    print({"label": best["label"], "score": round(best["score"], 4)})
Option B: AutoModelForSequenceClassification Only
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_id = "chiennv/langid-mmbert-small-8gpu"
tokenizer = AutoTokenizer.from_pretrained(model_id)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Half precision on GPU cuts memory and speeds up the forward pass;
# CPU inference stays in full precision.
dtype = torch.float16 if device.type == "cuda" else torch.float32
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, torch_dtype=dtype
).to(device)
model.eval()

text = "Bonjour tout le monde"
if fast_detect_unknown(text):
    # Heuristic pre-check short-circuits the model entirely.
    print({"label": "UNKNOWN", "score": 1.0})
else:
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        logits = model(**encoded).logits
    probs = torch.softmax(logits, dim=-1).squeeze(0)
    pred_id = int(torch.argmax(probs).item())
    print({
        "label": model.config.id2label[pred_id],
        "score": round(float(probs[pred_id].item()), 4),
    })
Run local infer.py
python infer.py
GPU Notes
- Check CUDA availability:
python -c "import torch; print(torch.cuda.is_available(), torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'no-gpu')"
- The AutoModel example above automatically uses GPU + FP16 when CUDA is available.
- Downloads last month
- -
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support