Spaces:
Sleeping
Sleeping
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
| import torch | |
| import re | |
# Hugging Face Hub identifier of the language-identification checkpoint.
MODEL_NAME = "papluca/xlm-roberta-base-language-detection"

# Both objects are created once at import time and shared by every call
# that needs the model-based fallback.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# Compiled single-character patterns, one per Indic script. Each character
# class spans one Unicode block, so `.search()` succeeds as soon as the text
# contains at least one code point from that block.

# Tamil block: U+0B80 - U+0BFF
TAMIL_RANGE = re.compile(r"[\u0B80-\u0BFF]")

# Telugu block: U+0C00 - U+0C7F
TELUGU_RANGE = re.compile(r"[\u0C00-\u0C7F]")

# Kannada block: U+0C80 - U+0CFF
KANNADA_RANGE = re.compile(r"[\u0C80-\u0CFF]")

# Malayalam block: U+0D00 - U+0D7F
MALAYALAM_RANGE = re.compile(r"[\u0D00-\u0D7F]")

# Devanagari block: U+0900 - U+097F
DEVANAGARI_RANGE = re.compile(r"[\u0900-\u097F]")
def detect_language(text: str) -> str:
    """Return a language code for ``text``.

    Script checks run first and act as hard overrides: as soon as ``text``
    contains a character matched by one of the Indic script patterns, the
    code paired with that pattern is returned and the model is not
    consulted. The patterns are tried in a fixed order (Tamil, Telugu,
    Kannada, Malayalam, Devanagari), so for mixed-script text the earliest
    entry in that order wins.

    When no script pattern matches, the text is tokenized (with
    truncation enabled) and classified by the module-level model; the
    label of the highest-scoring class is returned.

    Args:
        text: The text to identify.

    Returns:
        ``"ta"``, ``"te"``, ``"kn"``, ``"ml"`` or ``"hi"`` for a script
        override, otherwise the entry of ``model.config.id2label`` for the
        top-scoring class.
    """
    # Order matters: it reproduces the precedence of the override checks.
    script_overrides = (
        (TAMIL_RANGE, "ta"),
        (TELUGU_RANGE, "te"),
        (KANNADA_RANGE, "kn"),
        (MALAYALAM_RANGE, "ml"),
        (DEVANAGARI_RANGE, "hi"),
    )
    for script_pattern, language_code in script_overrides:
        if script_pattern.search(text):
            return language_code

    # No Indic script found: fall back to the classifier.
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only, no gradient tracking needed
        scores = model(**encoded).logits
    best_index = torch.argmax(scores, dim=1).item()
    return model.config.id2label[best_index]