Spaces:
Sleeping
Sleeping
| from langdetect import detect, DetectorFactory, LangDetectException | |
| import re | |
# Fix langdetect's seed at import time so that repeated detections give
# consistent results.
DetectorFactory.seed = 0
def detect_language(text: str) -> str:
    """Classify text as English, Bengali, a mix of the two, or unknown.

    The decision is driven by which scripts appear in the text:

    * Bengali-block code points (U+0980-U+09FF) and ASCII Latin letters
      are counted. When both occur the text is "mixed", unless one script
      accounts for more than 80% of those letters, in which case that
      language is returned.
    * When only one of the two scripts occurs, that language is returned.
    * When neither occurs, langdetect is consulted; any result other than
      'en' or 'bn', or a detection failure, yields "unknown".

    Args:
        text: The text to classify. Falsy values are accepted.

    Returns:
        One of "english", "bengali", "mixed" or "unknown". Text shorter
        than 3 characters after stripping whitespace is always "unknown".
    """
    if not text or len(text.strip()) < 3:
        return "unknown"

    # Count each script once; the counts double as presence flags.
    bengali_chars = len(re.findall(r'[\u0980-\u09FF]', text))
    english_chars = len(re.findall(r'[a-zA-Z]', text))

    if bengali_chars and english_chars:
        # Both counts are non-zero here, so the divisions are safe.
        total_chars = bengali_chars + english_chars
        if bengali_chars / total_chars > 0.8:
            return "bengali"
        if english_chars / total_chars > 0.8:
            return "english"
        return "mixed"

    if bengali_chars:
        return "bengali"

    if english_chars:
        # langdetect is deliberately not consulted here: Latin letters with
        # no Bengali code points are labelled "english" whatever language
        # it would report, and a 'bn' verdict would not be meaningful for
        # text that contains no Bengali characters.
        return "english"

    # Neither script is present (digits, punctuation, other scripts):
    # let langdetect decide, and treat anything else as unknown.
    try:
        detected = detect(text)
    except LangDetectException:
        return "unknown"
    if detected == 'en':
        return "english"
    if detected == 'bn':
        return "bengali"
    return "unknown"
def get_language_script_info(text: str) -> dict:
    """Count the characters of each script category in text.

    Every character falls into exactly one bucket, so the four counts
    always add up to ``total_length``.

    Args:
        text: The text to inspect.

    Returns:
        A dict with these integer entries:
        "bengali_characters": code points in the Bengali block
            (U+0980-U+09FF), including Bengali digits and signs.
        "english_characters": ASCII letters a-z and A-Z.
        "digits": decimal digits outside the Bengali block.
        "other_characters": everything else (spaces, punctuation,
            other scripts).
        "total_length": ``len(text)``.
    """
    bengali_chars = len(re.findall(r'[\u0980-\u09FF]', text))
    english_chars = len(re.findall(r'[a-zA-Z]', text))
    # On str patterns \d matches any Unicode decimal digit, which includes
    # the Bengali digits U+09E6-U+09EF already counted above. Skip the
    # Bengali block here so no character is counted twice and
    # other_characters cannot go negative.
    digits = len(re.findall(r'(?![\u0980-\u09FF])\d', text))
    other_chars = len(text) - bengali_chars - english_chars - digits
    return {
        "bengali_characters": bengali_chars,
        "english_characters": english_chars,
        "digits": digits,
        "other_characters": other_chars,
        "total_length": len(text),
    }