AumCoreAI committed on
Commit
d62cc6f
·
verified ·
1 Parent(s): 7052b49

Update core/language_detector.py

Browse files
Files changed (1) hide show
  1. core/language_detector.py +23 -172
core/language_detector.py CHANGED
@@ -1,179 +1,30 @@
1
- """
2
- Advanced Language Detection Module for AumCore AI
3
- Version: 2.0.0
4
- Author: AumCore AI
5
- """
6
 
7
- import re
8
- from typing import Optional, Tuple, Dict
9
- import langdetect
10
- from langdetect import DetectorFactory, lang_detect_exception
11
-
12
- # Ensure consistent results
13
- DetectorFactory.seed = 0
14
-
15
class LanguageDetector:
    """Detect the language of short user input with a staged approach.

    The stages run in order and the first one that yields a language code
    wins:

    1. Script check: a character from a script that only one configured
       language uses (e.g. Devanagari for Hindi) decides immediately.
    2. Keyword check: whole-word matches against ``LANGUAGE_KEYWORDS``.
    3. Statistical check: the optional ``langdetect`` package.
    """

    # Inclusive Unicode code-point ranges of the script each language uses.
    # A range listed for several languages cannot tell them apart, so the
    # script stage ignores it.
    SCRIPT_RANGES = {
        'hi': [(0x0900, 0x097F)],  # Devanagari
        'en': [(0x0041, 0x007A)],  # Basic Latin
        'es': [(0x0041, 0x007A)],  # Spanish uses Latin
        'fr': [(0x0041, 0x007A)],  # French uses Latin
    }

    # Common words/phrases for quick detection
    LANGUAGE_KEYWORDS = {
        'hi': [
            'नमस्ते', 'धन्यवाद', 'कैसे', 'हैं', 'आप', 'मैं', 'हूँ',
            'क्या', 'जी', 'हाँ', 'नहीं', 'ठीक', 'अच्छा'
        ],
        'en': [
            'hello', 'thanks', 'how', 'are', 'you', 'i', 'am',
            'what', 'yes', 'no', 'okay', 'good', 'please'
        ],
        'es': ['hola', 'gracias', 'cómo', 'estás'],
        'fr': ['bonjour', 'merci', 'comment', 'allez-vous']
    }

    # A "word" is a run of word characters; the Devanagari block is listed
    # explicitly because its vowel signs are combining marks that ``\w``
    # does not match.
    _TOKEN_PATTERN = re.compile(r'[\w\u0900-\u097F]+')

    def __init__(self, confidence_threshold: float = 0.6):
        """Store the minimum probability required from the statistical stage."""
        self.confidence_threshold = confidence_threshold

    def detect_input_language(self, text: str, fallback: str = 'en') -> str:
        """Detect the language of *text*.

        Args:
            text: Input text to analyze.
            fallback: Language code returned when no stage can decide.

        Returns:
            Language code (en, hi, es, fr, or whatever ``langdetect``
            reports), or *fallback* when detection is inconclusive.
        """
        if not text or len(text.strip()) < 2:
            return fallback

        clean_text = self._preprocess_text(text)
        if not clean_text:
            # Nothing but URLs / e-mail addresses / punctuation was supplied.
            return fallback

        stages = (
            self._detect_by_script,
            self._detect_by_keywords,
            self._detect_by_langdetect,
        )
        for stage in stages:
            try:
                result = stage(clean_text)
            except Exception:
                # Detection is best-effort: a failing stage must not prevent
                # the remaining stages (or the fallback) from being used.
                continue
            if result and result != 'unknown':
                return result

        return fallback

    def _preprocess_text(self, text: str) -> str:
        """Strip URLs, e-mail addresses and punctuation from *text*."""
        text = re.sub(r'http\S+|www\S+', '', text)
        text = re.sub(r'\S+@\S+', '', text)
        # Replace everything that is not a word character, whitespace or
        # Devanagari with a space so neighbouring words stay separated.
        text = re.sub(r'[^\w\s\u0900-\u097F]', ' ', text)
        return text.strip()

    @classmethod
    def _normalize_words(cls, text: str) -> str:
        """Return *text* lower-cased as space-separated words, space-padded.

        Padding both ends lets callers test for whole words / phrases with a
        plain substring check such as ``' hola ' in normalized``.
        """
        return ' ' + ' '.join(cls._TOKEN_PATTERN.findall(text.lower())) + ' '

    def _count_keyword_hits(self, padded_text: str, language: str) -> int:
        """Count keywords of *language* present as whole words in *padded_text*."""
        hits = 0
        for keyword in self.LANGUAGE_KEYWORDS.get(language, []):
            phrase = self._normalize_words(keyword)
            # Skip keywords that contain no word characters at all; they
            # would otherwise match every text.
            if phrase.strip() and phrase in padded_text:
                hits += 1
        return hits

    def _detect_by_script(self, text: str) -> Optional[str]:
        """Detect language by a script that only one language uses.

        Only the first 100 characters are inspected. Ranges shared by several
        languages (the Latin range here) are skipped, because returning the
        first language listed for them would mask every other Latin-script
        language.
        """
        sample = text[:100]

        # Map each range to the languages that claim it.
        owners = {}
        for lang_code, ranges in self.SCRIPT_RANGES.items():
            for code_range in ranges:
                owners.setdefault(code_range, []).append(lang_code)

        for (start, end), languages in owners.items():
            if len(languages) != 1:
                continue
            if any(start <= ord(char) <= end for char in sample):
                return languages[0]
        return None

    def _detect_by_keywords(self, text: str) -> Optional[str]:
        """Detect language by whole-word keyword matches.

        Returns the language with the most matching keywords (the first one
        listed in ``LANGUAGE_KEYWORDS`` on a tie), or None when nothing
        matches.
        """
        padded_text = self._normalize_words(text)

        best_lang, best_hits = None, 0
        for lang_code in self.LANGUAGE_KEYWORDS:
            hits = self._count_keyword_hits(padded_text, lang_code)
            if hits > best_hits:
                best_lang, best_hits = lang_code, hits
        return best_lang

    def _detect_by_langdetect(self, text: str) -> Optional[str]:
        """Use the langdetect library for statistical detection.

        Returns None when langdetect is not installed, cannot analyse the
        text, or its best guess is below ``confidence_threshold``.
        """
        try:
            from langdetect import detect_langs
            from langdetect.lang_detect_exception import LangDetectException
        except ImportError:
            return None

        try:
            candidates = detect_langs(text)
        except LangDetectException:
            return None
        if not candidates:
            return None

        best = max(candidates, key=lambda candidate: candidate.prob)
        if best.prob >= self.confidence_threshold:
            return best.lang
        return None

    def get_detection_confidence(self, text: str, language: str) -> float:
        """Estimate how strongly *text* matches the keywords of *language*.

        Returns:
            0.0 for empty text; 0.5 when no keywords are configured for
            *language*; otherwise twice the fraction of that language's
            keywords found as whole words, capped at 1.0 and rounded to two
            decimals.
        """
        if not text:
            return 0.0

        keywords = self.LANGUAGE_KEYWORDS.get(language, [])
        if not keywords:
            return 0.5  # Default medium confidence

        matches = self._count_keyword_hits(self._normalize_words(text), language)
        confidence = min(1.0, matches / len(keywords) * 2)
        return round(confidence, 2)
142
-
143
- # Global instance for easy import
144
- detector = LanguageDetector()
145
-
146
- # Simplified function for backward compatibility
147
def detect_input_language(text: str, fallback: str = 'en') -> str:
    """Detect the language of *text* using the shared module-level detector.

    Kept for backward compatibility with callers that use the function form.

    Args:
        text: Input text to analyze.
        fallback: Language code returned when detection is inconclusive.

    Returns:
        The detected language code, or *fallback*.
    """
    return detector.detect_input_language(text, fallback=fallback)
150
-
151
- # Optional: Additional utility function
152
def detect_with_confidence(text: str) -> Tuple[str, float]:
    """Detect the language of *text* and score the detection.

    Uses the shared module-level ``detector`` (as the plain
    ``detect_input_language`` wrapper does) instead of building a new
    detector on every call.

    Returns:
        A ``(language_code, confidence)`` tuple, where the confidence comes
        from ``LanguageDetector.get_detection_confidence``.
    """
    language = detector.detect_input_language(text)
    confidence = detector.get_detection_confidence(text, language)
    return language, confidence
158
-
159
- # Keep the old function if it's being used elsewhere
160
def generate_basic_code(task):
    """Generate basic code templates - TEMPORARY SIMPLE VERSION.

    Args:
        task: Free-text description of what the user wants. Matching is a
            case-insensitive substring check. ``None`` or a non-string value
            is treated like an empty description.

    Returns:
        A Markdown-fenced Python snippet: a Google Drive mount for tasks
        mentioning "drive"/"mount", a minimal FastAPI app for tasks
        mentioning "web"/"app", and a hello-world print otherwise.
    """
    # Non-string input used to raise AttributeError on .lower(); fall through
    # to the default template instead.
    task_lower = task.lower() if isinstance(task, str) else ""

    if 'drive' in task_lower or 'mount' in task_lower:
        return "```python\nfrom google.colab import drive\ndrive.mount('/content/gdrive')\n```"
    if 'web' in task_lower or 'app' in task_lower:
        return "```python\nfrom fastapi import FastAPI\napp = FastAPI()\n@app.get('/')\ndef home(): return {'message': 'Hello'}\n```"
    return "```python\nprint('Hello from AumCore AI')\n```"
170
-
171
- # Module metadata
172
- __version__ = "2.0.0"
173
- __author__ = "AumCore AI"
174
- __all__ = [
175
- 'detect_input_language',
176
- 'detect_with_confidence',
177
- 'LanguageDetector',
178
- 'generate_basic_code'
179
- ]
 
1
+ # Add this function to your language_detector.py file
 
 
 
 
2
 
3
# System prompts keyed by lower-case primary language code. Built once at
# import time rather than on every call.
_SYSTEM_PROMPTS = {
    "en": (
        "You are AumCore AI, an advanced AI assistant specializing in programming,\n"
        "system design, and technical solutions. Provide detailed, accurate, and\n"
        "professional responses."
    ),
    "hi": (
        "आप AumCore AI हैं, एक उन्नत AI सहायक जो प्रोग्रामिंग, सिस्टम डिज़ाइन और\n"
        "तकनीकी समाधानों में विशेषज्ञ है। विस्तृत, सटीक और पेशेवर प्रतिक्रियाएँ दें।"
    ),
    "es": (
        "Eres AumCore AI, un asistente de IA avanzado especializado en programación,\n"
        "diseño de sistemas y soluciones técnicas. Proporciona respuestas detalladas,\n"
        "precisas y profesionales."
    ),
    "fr": (
        "Vous êtes AumCore AI, un assistant IA avancé spécialisé en programmation,\n"
        "conception de systèmes et solutions techniques. Fournissez des réponses détaillées,\n"
        "précises et professionnelles."
    ),
}


def get_system_prompt(language: str = "en") -> str:
    """
    Get system prompt based on language

    The code is matched case-insensitively, surrounding whitespace is
    ignored, and a region suffix is dropped, so "EN", " hi " and "es-MX" /
    "fr_CA" select the English, Hindi, Spanish and French prompts.

    Args:
        language: Language code (en, hi, etc.)

    Returns:
        System prompt string; the English prompt when the code is unknown,
        empty, or not a string.
    """
    if not isinstance(language, str):
        return _SYSTEM_PROMPTS["en"]

    # Reduce e.g. "hi_IN" or "en-US" to the primary subtag "hi" / "en".
    primary = language.strip().lower().replace("_", "-").split("-", 1)[0]
    return _SYSTEM_PROMPTS.get(primary, _SYSTEM_PROMPTS["en"])