# Spaces: sgAtdbd / Hateshield-bn
# Sleeping
# File size: 2,657 Bytes
# 8ad9255

from langdetect import detect, DetectorFactory, LangDetectException
import re

# langdetect's detection is non-deterministic unless seeded; pin the seed once
# at import time so the same text yields consistent results across calls/runs.
DetectorFactory.seed = 0

def detect_language(text: str) -> str:
    """
    Classify text as "english", "bengali", "mixed", or "unknown".

    Script counting is tried first: characters in the Bengali Unicode block
    (U+0980-U+09FF) versus ASCII letters. langdetect is consulted only when
    the text contains no Bengali characters.

    Returns:
        "unknown" for empty input or fewer than 3 characters after stripping;
        "bengali" / "english" when both scripts occur and one of them makes up
        more than 80% of the counted characters, otherwise "mixed";
        "bengali" when only Bengali characters occur;
        for the remaining cases the langdetect verdict ('en' or 'bn'), falling
        back to "english" if ASCII letters are present and "unknown" if not.
    """
    # Too little content to make a judgement
    if not text or len(text.strip()) < 3:
        return "unknown"

    bn_count = len(re.findall(r'[\u0980-\u09FF]', text))
    en_count = len(re.findall(r'[a-zA-Z]', text))

    # Both scripts present: decide by share of the counted characters
    if bn_count and en_count:
        counted = bn_count + en_count
        if bn_count / counted > 0.8:
            return "bengali"
        if en_count / counted > 0.8:
            return "english"
        return "mixed"

    # Bengali script only
    if bn_count:
        return "bengali"

    # No Bengali characters: ask langdetect. The label used when it fails or
    # reports some other language depends on whether ASCII letters were seen.
    fallback = "english" if en_count else "unknown"
    try:
        guess = detect(text)
    except LangDetectException:
        return fallback
    return {"en": "english", "bn": "bengali"}.get(guess, fallback)

def get_language_script_info(text: str) -> dict:
    """
    Count how many characters of text belong to each script category.
    Useful for debugging and fine-tuning.

    Every character is counted in exactly one category, so the four counts
    always add up to "total_length".

    Args:
        text: The string to analyse.

    Returns:
        A dict with integer values for the keys:
        "bengali_characters" - characters in the Bengali block U+0980-U+09FF
            (this includes Bengali digits),
        "english_characters" - ASCII letters a-z / A-Z,
        "digits" - decimal digits outside the Bengali block,
        "other_characters" - everything else (whitespace, punctuation, ...),
        "total_length" - len(text).
    """
    bengali_chars = len(re.findall(r'[\u0980-\u09FF]', text))
    english_chars = len(re.findall(r'[a-zA-Z]', text))
    # \d matches any Unicode decimal digit, which includes the Bengali digits
    # U+09E6-U+09EF already counted as Bengali above. Skip the Bengali block
    # here so no character is counted twice and other_chars cannot go negative.
    digits = len(re.findall(r'(?![\u0980-\u09FF])\d', text))
    other_chars = len(text) - bengali_chars - english_chars - digits

    return {
        "bengali_characters": bengali_chars,
        "english_characters": english_chars,
        "digits": digits,
        "other_characters": other_chars,
        "total_length": len(text)
    }