"""
Advanced Language Detection Module for AumCore AI
Version: 2.0.1
Author: AumCore AI
"""
import re
from typing import Optional, Tuple, Dict
import langdetect
from langdetect import DetectorFactory, lang_detect_exception
# Ensure consistent results
DetectorFactory.seed = 0
class LanguageDetector:
    """Detect the language of short texts with a staged approach.

    Stages are tried in order and the first decisive answer wins:

    1. Script: a Unicode range that belongs to exactly one language in
       ``SCRIPT_RANGES`` (e.g. Devanagari -> ``hi``).
    2. Keywords: whole-word matches against ``LANGUAGE_KEYWORDS``.
    3. Statistics: the ``langdetect`` library, gated by
       ``confidence_threshold``.
    """

    # Unicode code-point ranges per language. Ranges shared by several
    # languages (Latin) cannot identify a language on their own and are
    # ignored by the script stage.
    SCRIPT_RANGES = {
        'hi': [(0x0900, 0x097F)],  # Devanagari
        'en': [(0x0041, 0x007A)],  # Basic Latin
        'es': [(0x0041, 0x007A)],  # Spanish uses Latin
        'fr': [(0x0041, 0x007A)],  # French uses Latin
    }

    # Common words/phrases for quick detection
    LANGUAGE_KEYWORDS = {
        'hi': [
            'नमस्ते', 'धन्यवाद', 'कैसे', 'हैं', 'आप', 'मैं', 'हूँ',
            'क्या', 'जी', 'हाँ', 'नहीं', 'ठीक', 'अच्छा'
        ],
        'en': [
            'hello', 'thanks', 'how', 'are', 'you', 'i', 'am',
            'what', 'yes', 'no', 'okay', 'good', 'please'
        ],
        'es': ['hola', 'gracias', 'cómo', 'estás'],
        'fr': ['bonjour', 'merci', 'comment', 'allez-vous']
    }

    # A token is a run of word characters. The Devanagari block is listed
    # explicitly because its combining vowel signs are not matched by ``\w``.
    _TOKEN_PATTERN = re.compile(r'[\w\u0900-\u097F]+')

    def __init__(self, confidence_threshold: float = 0.6):
        """Create a detector.

        Args:
            confidence_threshold: Minimum probability the statistical stage
                must report before its answer is accepted.
        """
        self.confidence_threshold = confidence_threshold

    def detect_input_language(self, text: str, fallback: str = 'en') -> str:
        """Detect the language of ``text``.

        Args:
            text: Input text to analyze.
            fallback: Language code returned when the text is empty, shorter
                than two non-blank characters, or no stage is decisive.

        Returns:
            Language code (en, hi, es, fr, etc.).
        """
        if not text or len(text.strip()) < 2:
            return fallback

        clean_text = self._preprocess_text(text)
        stages = (
            self._detect_by_script,
            self._detect_by_keywords,
            self._detect_by_langdetect,
        )
        for stage in stages:
            try:
                result = stage(clean_text)
            except Exception:
                # Detection is best-effort: a failing stage must not break
                # the caller, so move on to the next one.
                continue
            if result and result != 'unknown':
                return result
        return fallback

    def _preprocess_text(self, text: str) -> str:
        """Strip URLs, e-mail addresses and punctuation from ``text``."""
        text = re.sub(r'http\S+|www\S+', '', text)
        text = re.sub(r'\S+@\S+', '', text)
        # Replace everything that is not a word character, whitespace or
        # Devanagari with a space.
        text = re.sub(r'[^\w\s\u0900-\u097F]', ' ', text)
        return text.strip()

    def _normalize(self, text: str) -> str:
        """Return lower-cased tokens of ``text`` joined and padded by spaces.

        The padding lets callers test whole-word (or whole-phrase) presence
        with a plain substring check, e.g. ``" hola " in normalized``.
        """
        tokens = self._TOKEN_PATTERN.findall(text.lower())
        return ' ' + ' '.join(tokens) + ' '

    def _count_keyword_hits(self, normalized_text: str, keywords) -> int:
        """Count keywords that occur as whole words in ``normalized_text``."""
        hits = 0
        for keyword in keywords:
            needle = self._normalize(keyword)
            # Skip keywords that normalize to nothing (pure punctuation).
            if needle.strip() and needle in normalized_text:
                hits += 1
        return hits

    def _detect_by_script(self, text: str) -> Optional[str]:
        """Detect language by a script that only one language uses.

        Only the first 100 characters are inspected. Ranges claimed by more
        than one language are skipped so that Latin text falls through to
        the keyword and statistical stages instead of always being 'en'.
        """
        sample = text[:100]

        # Map each range to the languages that claim it (insertion-ordered).
        claimants = {}
        for lang_code, ranges in self.SCRIPT_RANGES.items():
            for bounds in ranges:
                claimants.setdefault(tuple(bounds), []).append(lang_code)

        for (start, end), lang_codes in claimants.items():
            if len(lang_codes) != 1:
                continue  # ambiguous script, cannot decide here
            if any(start <= ord(char) <= end for char in sample):
                return lang_codes[0]
        return None

    def _detect_by_keywords(self, text: str) -> Optional[str]:
        """Detect language by whole-word keyword matches.

        Returns the language with strictly the most keyword hits, or None
        when nothing matches or several languages tie (the next stage then
        decides).
        """
        normalized = self._normalize(text)
        scores = {
            lang_code: self._count_keyword_hits(normalized, keywords)
            for lang_code, keywords in self.LANGUAGE_KEYWORDS.items()
        }
        best = max(scores.values(), default=0)
        if best == 0:
            return None
        leaders = [code for code, score in scores.items() if score == best]
        return leaders[0] if len(leaders) == 1 else None

    def _detect_by_langdetect(self, text: str) -> Optional[str]:
        """Use the langdetect library for statistical detection.

        Returns the most probable language when its probability reaches
        ``confidence_threshold``; otherwise None.
        """
        try:
            from langdetect import detect_langs
            from langdetect.lang_detect_exception import LangDetectException
        except ImportError:
            # Library unavailable: this stage simply has no opinion.
            return None

        try:
            candidates = detect_langs(text)
        except LangDetectException:
            # Raised for text without usable features (digits, symbols).
            # No retry via langdetect.detect(): it would skip the confidence
            # threshold and is expected to fail on the same input.
            return None

        if not candidates:
            return None
        best = max(candidates, key=lambda candidate: candidate.prob)
        if best.prob >= self.confidence_threshold:
            return best.lang
        return None

    def get_detection_confidence(self, text: str, language: str) -> float:
        """Estimate how well ``text`` matches the keywords of ``language``.

        Args:
            text: Text that was analyzed.
            language: Language code whose keyword list is checked.

        Returns:
            0.0 for empty text; 0.5 when no keywords are known for
            ``language``; otherwise twice the fraction of that language's
            keywords found as whole words, capped at 1.0 and rounded to
            two decimals.
        """
        if not text:
            return 0.0
        keywords = self.LANGUAGE_KEYWORDS.get(language, [])
        if not keywords:
            return 0.5  # Default medium confidence
        hits = self._count_keyword_hits(self._normalize(text), keywords)
        return round(min(1.0, hits / len(keywords) * 2), 2)
# Shared module-level detector, created with the default confidence
# threshold; the detect_input_language() wrapper below delegates to it.
detector = LanguageDetector()
# ==========================================
# MAIN FUNCTIONS FOR IMPORT
# ==========================================
def detect_input_language(text: str) -> str:
    """Return the language code detected for ``text``.

    Function-style entry point kept for backward compatibility; the work is
    done by the shared module-level ``detector`` instance.
    """
    language_code = detector.detect_input_language(text)
    return language_code
def get_system_prompt(language: str = "en", username: Optional[str] = None) -> str:
    """Return the system prompt for a language, optionally personalized.

    Args:
        language: Language code (en, hi, es, fr). Matching ignores case and
            any region suffix, so "EN" and "en-US" both select English.
            Unknown or empty codes fall back to English.
        username: Optional name inserted in parentheses after "AumCore AI".

    Returns:
        System prompt string.
    """
    # Reduce e.g. "en-US" / "EN_us" / None to the primary subtag "en".
    code = str(language or "en").strip().lower().replace("_", "-").split("-")[0]
    identity = f"AumCore AI ({username})" if username else "AumCore AI"

    prompts = {
        "en": (
            f"You are {identity}, an advanced AI assistant specializing in programming,\n"
            "system design, and technical solutions. Provide detailed, accurate, and\n"
            "professional responses."
        ),
        "hi": (
            f"आप {identity} हैं, एक उन्नत AI सहायक जो प्रोग्रामिंग, सिस्टम डिज़ाइन और\n"
            "तकनीकी समाधानों में विशेषज्ञ है। विस्तृत, सटीक और पेशेवर प्रतिक्रियाएँ दें।"
        ),
        "es": (
            f"Eres {identity}, un asistente de IA avanzado especializado en programación,\n"
            "diseño de sistemas y soluciones técnicas. Proporciona respuestas detalladas,\n"
            "precisas y profesionales."
        ),
        # The French prompt previously contained Spanish words copied from
        # the "es" entry; it is now French throughout.
        "fr": (
            f"Vous êtes {identity}, un assistant IA avancé spécialisé en programmation,\n"
            "conception de systèmes et solutions techniques. Fournissez des réponses détaillées,\n"
            "précises et professionnelles."
        ),
    }
    return prompts.get(code, prompts["en"])
def detect_with_confidence(text: str) -> Tuple[str, float]:
    """Return the detected language code of ``text`` and a confidence score.

    A fresh ``LanguageDetector`` with default settings is created per call;
    the score is that detector's ``get_detection_confidence`` for the
    language it detected.
    """
    engine = LanguageDetector()
    code = engine.detect_input_language(text)
    return code, engine.get_detection_confidence(text, code)
def generate_basic_code(task):
    """Return a canned Markdown code snippet for ``task`` (temporary stub).

    The task description is lower-cased and searched for trigger substrings;
    the first rule with a matching trigger supplies the snippet, and a
    hello-world snippet is returned when no rule matches.
    """
    needle = task.lower()
    # Ordered rules: (trigger substrings, snippet). Earlier rules win.
    rules = (
        (
            ('drive', 'mount'),
            "```python\nfrom google.colab import drive\ndrive.mount('/content/gdrive')\n```",
        ),
        (
            ('web', 'app'),
            "```python\nfrom fastapi import FastAPI\napp = FastAPI()\n@app.get('/')\ndef home(): return {'message': 'Hello'}\n```",
        ),
    )
    for triggers, snippet in rules:
        if any(trigger in needle for trigger in triggers):
            return snippet
    return "```python\nprint('Hello from AumCore AI')\n```"
# Module metadata
__version__ = "2.0.1"
__author__ = "AumCore AI"

# Names exported by a star-import of this module.
__all__ = [
    'detect_input_language',
    'get_system_prompt',
    'detect_with_confidence',
    'LanguageDetector',
    'generate_basic_code',
]