chatbot / language.py
Kamaljeyaram07's picture
Update language.py
b931e4a verified
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import re
# Checkpoint identifier passed to both `from_pretrained` calls below; the
# resulting objects back the ML fallback path of `detect_language`.
MODEL_NAME = "papluca/xlm-roberta-base-language-detection"
# NOTE: both objects are created at import time, so importing this module
# runs the two `from_pretrained` calls (presumably fetching or reading cached
# weights -- confirm against the deployment environment). They are shared
# module-level singletons reused by every `detect_language` call.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# Unicode ranges for Indic scripts.
# Each pattern is a single-character class, so `.search(text)` succeeds as
# soon as the text contains at least one code point from that range.
# U+0B80-U+0BFF
TAMIL_RANGE = re.compile(r"[\u0B80-\u0BFF]")
# U+0C00-U+0C7F
TELUGU_RANGE = re.compile(r"[\u0C00-\u0C7F]")
# U+0C80-U+0CFF
KANNADA_RANGE = re.compile(r"[\u0C80-\u0CFF]")
# U+0D00-U+0D7F
MALAYALAM_RANGE = re.compile(r"[\u0D00-\u0D7F]")
# U+0900-U+097F; `detect_language` maps any hit in this range to "hi".
DEVANAGARI_RANGE = re.compile(r"[\u0900-\u097F]")
def detect_language(text: str) -> str:
    """Return a language code for *text*.

    Script-based overrides are tried first: if the text contains at least
    one character from a known Indic script range, the matching code is
    returned immediately and the model is never invoked. The ranges are
    checked in a fixed order (Tamil, Telugu, Kannada, Malayalam,
    Devanagari), so for mixed-script text the first range that matches in
    that order wins, regardless of where the character sits in the string.

    When no override applies, the text is classified with the module-level
    ``tokenizer`` and ``model`` and the label of the highest-scoring class
    is returned.

    Args:
        text: The text to classify.

    Returns:
        ``"ta"``, ``"te"``, ``"kn"``, ``"ml"`` or ``"hi"`` for a script
        override; otherwise the ``model.config.id2label`` entry for the
        predicted class (presumably also a short language code -- confirm
        against the checkpoint's label set).
    """
    # Hard overrides (most important): order here is the priority order.
    script_overrides = (
        (TAMIL_RANGE, "ta"),
        (TELUGU_RANGE, "te"),
        (KANNADA_RANGE, "kn"),
        (MALAYALAM_RANGE, "ml"),
        (DEVANAGARI_RANGE, "hi"),
    )
    for script_pattern, language_code in script_overrides:
        if script_pattern.search(text):
            return language_code

    # Fallback to ML detection.
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only; no gradients needed
        class_scores = model(**encoded).logits
    # A single text was encoded, so argmax over dim=1 yields one class
    # index, which `.item()` turns into a plain Python number for the lookup.
    predicted_index = torch.argmax(class_scores, dim=1).item()
    return model.config.id2label[predicted_index]