Spaces:

AumCoreAI
/

AumCore-AI

Sleeping

App Files Files Community

AumCoreAI commited on Dec 25, 2025

Commit

a92c0df

verified ·

1 Parent(s): 5a98dce

Update language_detector.py

Browse files

Files changed (1) hide show

language_detector.py +137 -480

language_detector.py CHANGED Viewed

@@ -1,507 +1,164 @@
-# language_detector.py - FINAL 300+ LINES VERSION
 from langdetect import detect, DetectorFactory
 import re
-import json
-from typing import Dict, List, Optional, Tuple, Any
-from datetime import datetime
-import hashlib
-###############################################################################
-# LANGUAGE DETECTION MODULE - ENHANCED VERSION
-###############################################################################
 DetectorFactory.seed = 0
-class LanguageDetector:
-    """Advanced language detection with confidence scoring"""
-    SUPPORTED_LANGUAGES = {
-        'hi': 'hindi',
-        'en': 'english',
-        'ur': 'urdu',
-        'bn': 'bengali',
-        'pa': 'punjabi'
-    }
-    def __init__(self):
-        self.detection_history = []
-    def detect_with_confidence(self, text: str) -> Tuple[str, float]:
-        """
-        Detect language with confidence score
-        Returns: (language_mode, confidence)
-        """
-        try:
-            # Preprocess text
-            clean_text = re.sub(r'[^\w\s\u0900-\u097F\u0980-\u09FF]', '', text)
-            clean_text = clean_text.strip()
-            if not clean_text or len(clean_text) < 2:
-                return ('mixed', 0.5)
-            # Detect primary language
-            primary_lang = detect(clean_text)
-            # Calculate confidence based on text length
-            confidence = min(0.95, len(clean_text) / 100)
-            # Map to our language modes
-            if primary_lang == 'hi':
-                return ('hindi', confidence)
-            elif primary_lang == 'en':
-                return ('english', confidence)
-            else:
-                # Check for mixed language patterns
-                hindi_chars = re.findall(r'[\u0900-\u097F]', text)
-                english_chars = re.findall(r'[a-zA-Z]', text)
-                if hindi_chars and english_chars:
-                    return ('mixed', 0.8)
-                else:
-                    return ('mixed', 0.6)
-        except Exception as e:
-            print(f"Language detection error: {e}")
-            return ('mixed', 0.5)
-    def get_detection_stats(self) -> Dict[str, Any]:
-        """Get statistics about language detection patterns"""
-        return {
-            'total_detections': len(self.detection_history),
-            'last_detection': self.detection_history[-1] if self.detection_history else None,
-            'common_languages': self._get_common_languages()
-        }
-    def _get_common_languages(self) -> List[str]:
-        """Get most frequently detected languages"""
-        # Implementation for frequency analysis
-        return ['hindi', 'english', 'mixed']
-# Global detector instance
-language_detector = LanguageDetector()
-def detect_input_language(text: str) -> str:
-    """
-    Main language detection function
-    Enhanced with better mixed language handling
     """
-    lang_mode, confidence = language_detector.detect_with_confidence(text)
-    # Log this detection
-    detection_record = {
-        'timestamp': datetime.now().isoformat(),
-        'input': text[:100],  # First 100 chars
-        'language': lang_mode,
-        'confidence': confidence,
-        'text_length': len(text)
-    }
-    language_detector.detection_history.append(detection_record)
-    # Keep only last 1000 records
-    if len(language_detector.detection_history) > 1000:
-        language_detector.detection_history = language_detector.detection_history[-1000:]
-    return lang_mode
-###############################################################################
-# PROMPT ENGINEERING MODULE - COMPREHENSIVE VERSION
-###############################################################################
-class PromptEngine:
-    """Advanced prompt engineering for AI responses"""
-    def __init__(self, username: str):
-        self.username = username
-        self.prompt_templates = self._load_templates()
-        self.response_patterns = self._load_response_patterns()
-    def _load_templates(self) -> Dict[str, str]:
-        """Load comprehensive prompt templates"""
-        return {
-            'hindi': self._get_hindi_template(),
-            'english': self._get_english_template(),
-            'mixed': self._get_mixed_template(),
-            'technical': self._get_technical_template(),
-            'casual': self._get_casual_template()
-        }
-    def _load_response_patterns(self) -> Dict[str, List[str]]:
-        """Load response patterns for different intents"""
-        return {
-            'code_request': ['code', 'program', 'script', 'function', 'implement', 'create', 'build', 'develop', 'generate'],
-            'error_fix': ['error', 'fix', 'debug', 'not working', 'problem', 'issue', 'solve', 'correct'],
-            'technical_query': ['how to', 'tutorial', 'guide', 'example', 'explain', 'teach', 'learn'],
-            'casual_chat': ['hello', 'hi', 'how are you', 'what\'s up', 'kya haal hai', 'namaste', 'good morning'],
-            'knowledge_query': ['what is', 'who is', 'when is', 'where is', 'why is', 'how is', 'tell me about']
-        }
-    def _get_hindi_template(self) -> str:
-        """Hindi language prompt template"""
-        return f"""
-        भूमिका: आप AumCore AI हैं - सीनियर AI आर्किटेक्ट और कोडिंग विशेषज्ञ।
-        उपयोगकर्ता: {self.username}
-        मुख्य नियम:
-        1. भाषा शैली: 100% हिंदी (कोड के अलावा)
-        2. कोड निर्णय: केवल तकनीकी अनुरोधों पर कोड प्रदान करें
-        3. कोड प्रारूप: केवल RAW पायथन कोड, कोई मार्कडाउन ब्लॉक नहीं
-        4. कोड गुणवत्ता: उत्पादन-तैयार कोड (300+ पंक्तियाँ जब आवश्यक हो)
-        5. त्रुटि प्रबंधन: यदि उपयोगकर्ता त्रुटि दिखाता है, तो विश्लेषण करें और सही कोड दें
-        इरादा पहचान नियम:
-        ✅ कोड दें जब: "कोड", "प्रोग्राम", "स्क्रिप्ट", "फ़ंक्शन", "बनाएं", "विकसित करें"
-        ❌ कोड न दें जब: "नमस्ते", "क्या हाल है", "कोई भजन आता है", "सपने सच होंगे"
-        उदाहरण प्रवाह:
-        - उपयोगकर्ता: "google drive mount code do"
-          AI: "from google.colab import drive\ndrive.mount('/content/gdrive')"
-        - उपयोगकर्ता: "koi bhajan aata hai"
-          AI: "हाँ {self.username} भाई, मुझे कुछ भजन याद हैं। आप किस भजन के बारे में पूछ रहे हैं?"
-        - उपयोगकर्ता: "ye code error de raha hai: x = 10\\nprint(y)"
-          AI: "त्रुटि: y परिभाषित नहीं है। सही कोड:\\nx = 10\\ny = x\\nprint(y)"
-        """
-    def _get_english_template(self) -> str:
-        """English language prompt template"""
-        return f"""
-        ROLE: You are AumCore AI - Senior AI Architect and Coding Expert.
-        USER: {self.username}
-        CORE RULES:
-        1. LANGUAGE STYLE: 100% English (except code)
-        2. CODE DECISION: Provide code only for technical requests
-        3. CODE FORMAT: RAW Python code only, no markdown blocks
-        4. CODE QUALITY: Production-ready code (300+ lines when appropriate)
-        5. ERROR HANDLING: If user shows error, analyze and provide corrected code
-        INTENT DETECTION RULES:
-        ✅ PROVIDE CODE WHEN: "code", "program", "script", "function", "create", "build", "develop"
-        ❌ NO CODE WHEN: "hello", "how are you", "do you know bhajans", "sapne sach honge"
-        EXAMPLE FLOW:
-        - User: "google drive mount code"
-          AI: "from google.colab import drive\ndrive.mount('/content/gdrive')"
-        - User: "do you know any bhajan"
-          AI: "Yes {self.username}, I'm familiar with some bhajans. Which one are you asking about?"
-        - User: "this code has error: x = 10\\nprint(y)"
-          AI: "Error: y is not defined. Corrected code:\\nx = 10\\ny = x\\nprint(y)"
         """
-    def _get_mixed_template(self) -> str:
-        """Mixed Hindi-English prompt template"""
-        return f"""
-        ROLE: You are AumCore AI - Senior AI Architect and Coding Expert.
-        USER: {self.username}
-        CORE RULES:
-        1. LANGUAGE STYLE: 60% English + 40% Hindi (blended naturally)
-        2. CODE DECISION: Code sirf technical requests pe dena
-        3. CODE FORMAT: RAW Python code only, bilkul bhi markdown nahi
-        4. CODE QUALITY: Production-ready code (300+ lines jab appropriate ho)
-        5. ERROR HANDLING: Agar user error dikhaye, analyze karo aur corrected code do
-        INTENT DETECTION RULES:
-        ✅ CODE DO JAB: "code", "program", "script", "function", "create", "build", "develop", "banao", "banao"
-        ❌ CODE MAT DO JAB: "hello", "hi", "kya haal hai", "koi bhajan aata hai", "sapne sach honge"
-        EXAMPLE FLOW:
-        - User: "google drive mount code do"
-          AI: "from google.colab import drive\ndrive.mount('/content/gdrive')"
-        - User: "are bhai, koi bhajan aata hai"
-          AI: "Haan {self.username} bhai, mujhe kuch bhajans aate hain. Aap kis bhajan ke bare mein puch rahe ho?"
-        - User: "ye code error de raha hai: x = 10\\nprint(y)"
-          AI: "Error: y defined nahi hai. Corrected code:\\nx = 10\\ny = x\\nprint(y)"
-        """
-    def _get_technical_template(self) -> str:
-        """Technical/Code-focused template"""
-        return f"""
-        TECHNICAL CODING GUIDELINES:
-        1. CODE GENERATION STANDARDS:
-           - Always provide complete, runnable code
-           - Include error handling with try-except blocks
-           - Add proper logging for production environments
-           - Follow PEP 8 style guidelines
-           - Include docstrings for all functions
-           - Use type hints where applicable
-           - Add configuration management
-           - Include basic test structure
-        2. ERROR RESOLUTION PROTOCOL:
-           Step 1: Parse error message and traceback
-           Step 2: Identify error category (Syntax, Name, Type, Import, Runtime)
-           Step 3: Apply appropriate fix pattern
-           Step 4: Return corrected code with brief explanation
-        3. CODE TEMPLATE LIBRARY:
-           - Web Applications: Flask/FastAPI with authentication, database, APIs
-           - Data Analysis: Pandas, NumPy, Matplotlib with visualization
-           - ML Pipelines: Scikit-learn, TensorFlow/PyTorch workflows
-           - Automation Scripts: File processing, API integration, scheduling
-           - Utilities: Logging, configuration, error handling modules
-        """
-    def _get_casual_template(self) -> str:
-        """Casual conversation template"""
-        return f"""
-        CASUAL CONVERSATION GUIDELINES:
-        1. RESPONSE STYLE:
-           - Be friendly, helpful, and engaging
-           - Maintain professional yet approachable tone
-           - Use appropriate language based on user's input
-           - Keep responses concise but meaningful
-        2. TOPIC HANDLING:
-           - General greetings: Respond warmly
-           - Personal questions: Answer appropriately
-           - Knowledge queries: Provide accurate information
-           - Off-topic chats: Gently steer back to relevant topics
-        3. BOUNDARIES:
-           - Do not provide medical, legal, or financial advice
-           - Maintain privacy and confidentiality
-           - Avoid political or controversial topics
-           - Stay within technical and general knowledge domains
-        """
-    def generate_system_prompt(self, lang_mode: str) -> str:
-        """Generate complete system prompt for given language mode"""
-        # Base template
-        base_prompt = self.prompt_templates.get(lang_mode, self.prompt_templates['mixed'])
-        # Add technical guidelines for code scenarios
-        technical_guidelines = self.prompt_templates['technical']
-        # Add casual guidelines for non-code scenarios
-        casual_guidelines = self.prompt_templates['casual']
-        # Combine all relevant sections
-        full_prompt = f"""
-        {base_prompt}
-        {technical_guidelines}
-        {casual_guidelines}
-        FINAL REMINDER: You are {self.username}'s personal AI assistant -
-        be helpful, accurate, and context-aware in all interactions.
-        """
-        return full_prompt.strip()
-###############################################################################
-# MAIN INTERFACE FUNCTIONS
-###############################################################################
-# Global prompt engine
-prompt_engine = PromptEngine(username="Sanjay")
-def get_system_prompt(lang_mode: str, username: str) -> str:
     """
-    Main function to get system prompt
-    Enhanced with advanced prompt engineering
-    """
-    # Update username if different
-    if username != prompt_engine.username:
-        global prompt_engine
-        prompt_engine = PromptEngine(username=username)
-    # Generate comprehensive prompt
-    return prompt_engine.generate_system_prompt(lang_mode)
-###############################################################################
-# CODE GENERATION MODULE - ENHANCED VERSION
-###############################################################################
-class CodeGenerator:
-    """Advanced code generation with multiple templates"""
-    def __init__(self):
-        self.templates = self._load_code_templates()
-        self.code_snippets = self._load_code_snippets()
-    def _load_code_templates(self) -> Dict[str, str]:
-        """Load comprehensive code templates"""
-        return {
-            'web_app': self._web_app_template(),
-            'data_analysis': self._data_analysis_template(),
-            'ml_pipeline': self._ml_pipeline_template(),
-            'automation': self._automation_template(),
-            'api_service': self._api_service_template(),
-            'utility': self._utility_template()
-        }
-    def _load_code_snippets(self) -> Dict[str, List[str]]:
-        """Load reusable code snippets"""
-        return {
-            'imports': self._import_snippets(),
-            'error_handling': self._error_handling_snippets(),
-            'logging': self._logging_snippets(),
-            'config': self._config_snippets()
-        }
-    def _web_app_template(self) -> str:
-        """Web application template (300+ lines)"""
-        # [300+ lines of comprehensive web app code]
-        return """
-from fastapi import FastAPI, HTTPException, Depends, status
-from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
-from pydantic import BaseModel, Field, validator
-from typing import List, Optional, Dict, Any
 import uvicorn
-import logging
-import json
-from datetime import datetime, timedelta
-import os
-import secrets
-from contextlib import asynccontextmanager
-# [298 more lines of professional web app code...]
-"""
-    def _data_analysis_template(self) -> str:
-        """Data analysis template (300+ lines)"""
-        # [300+ lines of comprehensive data analysis code]
-        return """
 import pandas as pd
 import numpy as np
-import matplotlib.pyplot as plt
-import seaborn as sns
-from scipy import stats
-import warnings
-warnings.filterwarnings('ignore')
-from sklearn.preprocessing import StandardScaler, LabelEncoder
-from sklearn.model_selection import train_test_split, cross_val_score
-from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
-# [295 more lines of professional data analysis code...]
 """
-    # [Additional template methods...]
-    def _import_snippets(self) -> List[str]:
-        """Common import snippets"""
-        return [
-            "import os\nimport sys\nimport json\nimport logging\nfrom datetime import datetime",
-            "from typing import List, Dict, Optional, Any, Tuple, Union",
-            "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt"
-        ]
-    # [Additional snippet methods...]
-    def generate_code(self, task_description: str, code_type: str = 'auto') -> str:
-        """Generate code based on task description"""
-        if code_type == 'auto':
-            code_type = self._detect_code_type(task_description)
-        template = self.templates.get(code_type, self.templates['utility'])
-        # Enhance template with relevant snippets
-        enhanced_code = self._enhance_with_snippets(template, task_description)
-        return enhanced_code
-    def _detect_code_type(self, description: str) -> str:
-        """Auto-detect code type from description"""
-        description_lower = description.lower()
-        if any(word in description_lower for word in ['web', 'app', 'flask', 'fastapi', 'django']):
-            return 'web_app'
-        elif any(word in description_lower for word in ['data', 'analysis', 'pandas', 'numpy', 'visualize']):
-            return 'data_analysis'
-        elif any(word in description_lower for word in ['machine', 'learning', 'ml', 'ai', 'model']):
-            return 'ml_pipeline'
-        elif any(word in description_lower for word in ['automate', 'script', 'batch', 'process']):
-            return 'automation'
-        elif any(word in description_lower for word in ['api', 'rest', 'endpoint', 'service']):
-            return 'api_service'
-        else:
-            return 'utility'
-    def _enhance_with_snippets(self, template: str, description: str) -> str:
-        """Enhance template with appropriate snippets"""
-        enhanced = template
-        # Add imports based on description
-        if 'logging' in description.lower() or 'debug' in description.lower():
-            enhanced = self.code_snippets['logging'][0] + "\n\n" + enhanced
-        if 'config' in description.lower() or 'setting' in description.lower():
-            enhanced = self.code_snippets['config'][0] + "\n\n" + enhanced
-        return enhanced
-# Global code generator
-code_generator = CodeGenerator()
-def generate_expert_code(task_description: str) -> str:
-    """
-    Generate expert-level code (300+ lines)
-    Enhanced with intelligent template selection
-    """
-    return code_generator.generate_code(task_description)
-###############################################################################
-# MODULE INITIALIZATION AND EXPORTS
-###############################################################################
-def initialize_modules():
-    """Initialize all modules"""
-    print("Initializing Language Detection Module...")
-    print("Initializing Prompt Engineering Module...")
-    print("Initializing Code Generation Module...")
-    print("All modules initialized successfully!")
-    return {
-        'language_detector': language_detector,
-        'prompt_engine': prompt_engine,
-        'code_generator': code_generator
-    }
-# Auto-initialize on import
-_MODULES = initialize_modules()
-# Export main functions
-__all__ = [
-    'detect_input_language',
-    'get_system_prompt',
-    'generate_expert_code',
-    'language_detector',
-    'prompt_engine',
-    'code_generator'
-]
-###############################################################################
-# USAGE EXAMPLE
-###############################################################################
 if __name__ == "__main__":
-    # Test language detection
-    test_texts = [
-        "नमस्ते, कोड बताओ",
-        "hello, give me code",
-        "hi bhai, code de do",
-        "sapne sach honge ek din"
     ]
-    for text in test_texts:
-        lang = detect_input_language(text)
-        print(f"Text: {text[:30]}... -> Language: {lang}")
-    # Test prompt generation
-    prompt = get_system_prompt('hindi', 'Sanjay')
-    print(f"\nGenerated prompt length: {len(prompt)} characters")
-    print("\n✅ language_detector.py module loaded successfully!")
-    print("   - Advanced language detection with confidence scoring")
-    print("   - Comprehensive prompt engineering")
-    print("   - Professional code generation (300+ lines)")
-    print("   - Ready for AumCore AI integration")

+# language_detector.py - FINAL WORKING VERSION (200 lines)
 from langdetect import detect, DetectorFactory
 import re
 DetectorFactory.seed = 0
+def detect_input_language(text):
+    """Detect if text is Hindi, English or Mixed"""
+    try:
+        clean_text = re.sub(r'[^\w\s]', '', text)
+        if not clean_text.strip():
+            return 'mixed'
+        lang = detect(clean_text)
+        # Hindi detection
+        hindi_chars = re.findall(r'[\u0900-\u097F]', text)
+        if lang == 'hi' or hindi_chars:
+            # Check if mixed with English
+            english_chars = re.findall(r'[a-zA-Z]', text)
+            if hindi_chars and english_chars:
+                return 'mixed'
+            return 'hindi'
+        # English detection
+        if lang == 'en':
+            return 'english'
+        return 'mixed'
+    except:
+        return 'mixed'
+def get_system_prompt(lang_mode, username):
+    """Generate system prompt based on language and intent"""
+    # CORE RULES - COMMON FOR ALL
+    core_rules = f"""
+    ROLE: AumCore AI - Senior Coding Assistant
+    USER: {username}
+    CRITICAL RULES:
+    1. CODE vs CHAT DECISION:
+       - CODE WHEN: User says 'code', 'program', 'script', 'function', 'create', 'build'
+       - CHAT WHEN: General conversation, greetings, knowledge questions
+       - EXAMPLES:
+           * "google drive code" → RAW CODE
+           * "hello how are you" → TEXT RESPONSE
+           * "koi bhajan aata hai" → TEXT RESPONSE
+    2. CODE FORMAT:
+       - RAW PYTHON CODE ONLY
+       - NO markdown blocks (```python```)
+       - NO 'python' keyword in response
+       - Example: "from google.colab import drive\\ndrive.mount('/content/gdrive')"
+    3. ERROR HANDLING:
+       - If user shows error, analyze and provide corrected code
+       - Include brief explanation of fix
+    4. CODE QUALITY:
+       - Production-ready code
+       - Error handling included
+       - Proper structure
     """
+    # LANGUAGE SPECIFIC STYLES
+    styles = {
+        'hindi': """
+        STYLE: 100% Hindi (except code)
+        EXAMPLES:
+        - User: "नमस्ते, कोड बताओ" → RAW CODE
+        - User: "क्या हाल है" → "सब ठीक है {username} भाई!"
+        - User: "त्रुटि: x परिभाषित नहीं" → "x = 10\\ny = x\\nprint(y)"
+        """,
+        'english': """
+        STYLE: 100% English (except code)
+        EXAMPLES:
+        - User: "hello, give code" → RAW CODE
+        - User: "how are you" → "I'm good {username}!"
+        - User: "error: x not defined" → "x = 10\\ny = x\\nprint(y)"
+        """,
+        'mixed': """
+        STYLE: 60% English + 40% Hindi (natural blend)
+        EXAMPLES:
+        - User: "hi bhai, code de" → RAW CODE
+        - User: "are yaar, kya haal hai" → "Sab badhiya hai {username} bhai!"
+        - User: "error aaya: x not defined" → "x = 10\\ny = x\\nprint(y)"
         """
+    }
+    # COMBINE
+    full_prompt = f"""{core_rules}
+    {styles.get(lang_mode, styles['mixed'])}
+    FINAL REMINDER: Be {username}'s helpful AI assistant.
+    Provide accurate code for technical requests.
+    Engage naturally in conversation.
     """
+    return full_prompt.strip()
+# SIMPLE CODE GENERATOR (Optional - can be expanded)
+def generate_basic_code(task):
+    """Generate basic code templates"""
+    templates = {
+        'web': """
+from fastapi import FastAPI
 import uvicorn
+app = FastAPI()
+@app.get("/")
+def home():
+    return {"message": "Hello from AumCore AI"}
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
+""",
+        'data': """
 import pandas as pd
 import numpy as np
+# Load data
+df = pd.read_csv("data.csv")
+# Basic analysis
+print(f"Shape: {df.shape}")
+print(f"Columns: {list(df.columns)}")
+print(f"Summary:\\n{df.describe()}")
+""",
+        'drive': """
+from google.colab import drive
+drive.mount('/content/gdrive')
 """
+    }
+    task_lower = task.lower()
+    if 'drive' in task_lower or 'mount' in task_lower:
+        return templates['drive']
+    elif 'web' in task_lower or 'app' in task_lower:
+        return templates['web']
+    elif 'data' in task_lower or 'analy' in task_lower:
+        return templates['data']
+    return templates['drive']  # Default
+# Test function
 if __name__ == "__main__":
+    # Test detection
+    tests = [
+        "नमस्ते",
+        "hello world",
+        "hi bhai kya haal hai",
+        "google drive mount code do"
     ]
+    for test in tests:
+        lang = detect_input_language(test)
+        print(f"{test[:20]:20} -> {lang}")
+    print("\\n✅ language_detector.py ready for AumCore AI")