"""Gradio chat app for Atlas-Chat with Arabizi <-> Arabic script conversion.

Flow for one user turn:
1. Detect whether the message is Arabizi (Darija in Latin letters + digits).
2. If so, transliterate it to Arabic script through the Hugging Face
   Inference API (falling back to the raw text on any failure).
3. Generate a reply with the locally loaded Atlas-Chat model.
4. If the input was Arabizi, convert the reply back to Arabizi with
   table-driven word and character mappings.
"""

import os
import re

import gradio as gr
import requests
import torch
from huggingface_hub import login
from transformers import pipeline

# Authenticate with Hugging Face
if "HF_TOKEN" in os.environ:
    login(token=os.environ["HF_TOKEN"])

# Global variable to store the Atlas-Chat model (loaded lazily on first use)
atlas_pipe = None

TRANSLITERATION_API_URL = (
    "https://api-inference.huggingface.co/models/"
    "atlasia/Transliteration-Moroccan-Darija"
)

# Matches any character from the Arabic Unicode blocks.
_ARABIC_SCRIPT_RE = re.compile(
    r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
)
# Alphanumeric runs (Unicode-aware, underscore excluded).
_TOKEN_RE = re.compile(r"[^\W_]+")
# English ordinals ("3rd", "7th") mix digits and letters but are not Arabizi.
_ORDINAL_RE = re.compile(r"\d+(?:st|nd|rd|th)")
# Digits that stand in for Arabic letters in Arabizi.
_ARABIZI_DIGITS = frozenset("2356789")

# Common Arabizi words and phrases
ARABIZI_PATTERNS = (
    'wach', 'wash', 'ach', 'achno', 'chno', 'shno', 'shkoun', 'chkoun',
    'kif', 'kifash', 'ki', 'kayf', 'kien', 'kima',
    'feen', 'fin', 'fen', 'fain', 'mnin',
    'imta', 'meta', 'waqt', 'mata', 'emta',
    'hna', 'ahna', 'ana', 'nta', 'nti', 'ntuma', 'ntouma',
    'howa', 'hiya', 'huma', 'houma', 'hoa', 'hia',
    'had', 'hadchi', 'hada', 'hadi', 'hadou', 'hadouk',
    'bghit', 'bghiti', 'bgha', 'bghina', 'bghitiou',
    'galt', 'galti', 'gal', 'galet', 'galou',
    'rah', 'raha', 'rahi', 'rahom', 'rahin',
    'kan', 'kanu', 'kana', 'kanet', 'kano',
    'ghadi', 'ghad', 'gha', 'ghadia', 'ghadiyin',
    'daba', 'dak', 'dakchi', 'dik', 'dok',
    'bzf', 'bzzaf', 'bezzaf', 'bzaaaaf',
    'chway', 'chwiya', 'shwiya', 'chwia',
    'khoya', 'khuya', 'akhi', 'kho',
    'khti', 'khtiya', 'ukhti', 'kht',
    'mama', 'baba', 'lwaldin', 'lwalidin',
    'salam', 'salamu aleikum', 'slm',
    'yallah', 'yalla', 'hya', 'aji',
    'mabghitsh', 'mabghach', 'makansh', 'machi',
    'walakin', 'walaken', 'ama', 'mais',
    'kayn', 'makaynsh', 'chi', 'tayi',
)
# Single words are looked up per token; multi-word entries are matched as
# whole phrases against the normalised token stream.
_ARABIZI_WORDS = frozenset(p for p in ARABIZI_PATTERNS if " " not in p)
_ARABIZI_PHRASES = tuple(p for p in ARABIZI_PATTERNS if " " in p)

# COMPREHENSIVE WORD MAPPINGS (Arabic -> Arabizi).
# Each key appears once; where the table used to list a key twice, the value
# Python actually used (the last one) is the one kept here.
_WORD_MAPPINGS = {
    # Pronouns
    'أنا': 'ana', 'نتا': 'nta', 'نتي': 'nti', 'هوا': 'howa',
    'حنا': 'hna', 'أحنا': 'ahna', 'نتوما': 'ntuma', 'هوما': 'huma',
    # Question words
    'شكون': 'shkoun', 'أشنو': 'achno', 'شنو': 'chno', 'واش': 'wach',
    'كيفاش': 'kifash', 'كيف': 'kif', 'فين': 'feen', 'منين': 'mnin',
    'إمتا': 'imta', 'متا': 'meta', 'علاش': '3lach', 'أش': 'ach',
    # Verbs
    'بغيت': 'bghit', 'بغيتي': 'bghiti', 'بغا': 'bgha', 'بغينا': 'bghina',
    'كان': 'kan', 'كانا': 'kana', 'كانت': 'kanet', 'كانو': 'kanu',
    'قلت': 'galt', 'قلتي': 'galti', 'قال': 'gal', 'قالت': 'galet',
    'راح': 'rah', 'راها': 'raha', 'راهي': 'rahi', 'راهم': 'rahom',
    'غادي': 'ghadi', 'غاد': 'ghad', 'غا': 'gha',
    # Demonstratives
    'هاد': 'had', 'هادا': 'hada', 'هادي': 'hadi', 'هادشي': 'hadchi',
    'داك': 'dak', 'ديك': 'dik', 'داكشي': 'dakchi',
    # Quantifiers / negation / time
    'بزاف': 'bzzaf', 'شوياة': 'chwiya', 'كولشي': 'kolchi',
    'ماشي': 'machi', 'مابغيتش': 'mabghitsh',
    'دابا': 'daba', 'توا': 'tawa', 'غدا': 'ghda',
    # Family / greetings
    'ماما': 'mama', 'بابا': 'baba', 'خويا': 'khoya', 'ختي': 'khti',
    'سلام': 'salam', 'يالاه': 'yallah', 'هيا': 'hya',
    # Morocco / general vocabulary
    'المغرب': 'lmaghrib', 'مغرب': 'maghrib', 'طاجين': 'tajine',
    'أتاي': 'atay', 'خوبز': 'khobz',
    'كاين': 'kayn', 'ماكاينش': 'makaynsh', 'شي': 'chi',
    'زوين': 'zwin', 'زوينا': 'zwina', 'مزيان': 'mzyan', 'مزيانا': 'mzyana',
    'كاينين': 'kaynin', 'مطعم': 'ma63am', 'مطاعم': 'ma6a3im',
    'مشهور': 'mashhur', 'مشهورين': 'mashhurin', 'وسط': 'wost',
    'المدينة': 'lmdina', 'مدينة': 'mdina',
    # Nationalities / cuisines
    'إيطالي': 'italiy', 'ياباني': 'yabani', 'مغربي': 'maghribi',
    'فرنسي': 'fransi', 'أمريكي': 'amriki', 'صيني': 'sini', 'هندي': 'hindi',
    # Food and drink
    'لحم': 'la7m', 'دجاج': 'djaj', 'حوت': '7ut', 'خضرة': 'khodra',
    'فواكه': 'fawakeh', 'جبن': 'jben', 'زبدة': 'zebda', 'حليب': '7lib',
    'قهوة': 'qahwa', 'شاي': 'atay', 'ماء': 'ma', 'عصير': '3asir',
    'خبز': 'khobz', 'رز': 'roz', 'مكرونة': 'makarona', 'بطاطا': 'batata',
    'طماطم': 'toma6im', 'بصل': 'basal', 'ثوم': 'tum', 'فلفل': 'felfel',
    'ملح': 'mel7', 'سكر': 'sokkar', 'زيت': 'zit', 'خل': 'khall',
}

# One alternation for all mapped words. Longest first is belt-and-braces:
# the surrounding \b anchors already force whole-word matches.
_WORD_MAPPING_RE = re.compile(
    r'\b(?:'
    + '|'.join(
        re.escape(word)
        for word in sorted(_WORD_MAPPINGS, key=len, reverse=True)
    )
    + r')\b'
)

# CHARACTER MAPPINGS (Arabic -> Arabizi)
_CHAR_MAPPINGS = {
    'ا': 'a', 'ب': 'b', 'ت': 't', 'ث': 'th', 'ج': 'j', 'ح': '7',
    'خ': 'kh', 'د': 'd', 'ذ': 'dh', 'ر': 'r', 'ز': 'z', 'س': 's',
    'ش': 'sh', 'ص': 's', 'ض': 'd', 'ط': '6', 'ظ': 'z', 'ع': '3',
    'غ': 'gh', 'ف': 'f', 'ق': '9', 'ك': 'k', 'ل': 'l', 'م': 'm',
    'ن': 'n', 'ه': 'h', 'و': 'w', 'ي': 'y', 'ء': '2', 'آ': 'aa',
    'أ': 'a', 'إ': 'i', 'ة': 'a', 'ى': 'a',
    # Hamza carriers and the extra letters used when writing Darija.
    '\u0624': '2',  # waw with hamza
    '\u0626': '2',  # yeh with hamza
    '\u06AF': 'g',  # gaf
    '\u06AD': 'g',  # ng / Moroccan gaf
    '\u067E': 'p',  # peh
    '\u06A4': 'v',  # veh
    # Punctuation
    '؟': '?', '،': ',', '؛': ';',
    # Short vowels and tanwin
    '\u064E': 'a',   # fatha
    '\u064F': 'o',   # damma
    '\u0650': 'i',   # kasra
    '\u064B': 'an',  # fathatan
    '\u064C': 'on',  # dammatan
    '\u064D': 'in',  # kasratan
    # Marks with no Latin counterpart are dropped so they do not end up as
    # stray combining characters on Latin letters.
    '\u0651': '',    # shadda
    '\u0652': '',    # sukun
    '\u0640': '',    # tatweel
}

_DESCRIPTION = """
**مرحبا بك في أطلس شات!** Welcome to Atlas-Chat! 🇲🇦

**🚀 Powered by Hugging Face Inference API:**
- **Arabic Script (العربية)** → Direct conversation
- **Arabizi (3arabi bi 7oruf latin)** → API conversion → Arabizi response
- **English** → Direct conversation

**⚡ Features:**
- Professional AI Arabizi conversion via API
- No local model conflicts
- Fast and reliable responses
- Comprehensive language detection

**جرب هذه الأسئلة / Try these questions:**
"""


def load_atlas_model():
    """Load the Atlas-Chat model locally on first call and cache it.

    Returns:
        The text-generation pipeline stored in the module-level
        ``atlas_pipe``; later calls return the same object.
    """
    global atlas_pipe
    if atlas_pipe is None:
        print("🏔️ Loading Atlas-Chat-2B model...")
        atlas_pipe = pipeline(
            "text-generation",
            model="MBZUAI-Paris/Atlas-Chat-2B",
            model_kwargs={"torch_dtype": torch.bfloat16},
            device="cuda" if torch.cuda.is_available() else "cpu",
        )
        print("✅ Atlas-Chat model loaded!")
    return atlas_pipe


def _is_arabizi_digit_token(token):
    """Return True if *token* uses a digit as a letter (e.g. ``3lach``)."""
    return (
        any(ch in _ARABIZI_DIGITS for ch in token)
        and any(ch.isalpha() for ch in token)
        and not _ORDINAL_RE.fullmatch(token)
    )


def detect_arabizi(text):
    """Detect whether *text* is written in Arabizi (Latin script + digits).

    Known Arabizi words are matched on whole tokens, not substrings, so that
    English words such as "machine" or "finish" do not trigger on 'chi' or
    'fin'. Digits count as Arabizi letters only when they sit inside a token
    that also contains letters, so plain numbers and years are ignored.

    Args:
        text: The user message.

    Returns:
        True if the text looks like Arabizi; False for empty or very short
        input, for anything containing Arabic script, and otherwise when the
        heuristics below do not fire.
    """
    if not text or len(text.strip()) < 2:
        return False

    # Arabic script present: by definition NOT Arabizi.
    if _ARABIC_SCRIPT_RE.search(text):
        return False

    tokens = _TOKEN_RE.findall(text.lower())
    has_arabizi_numbers = any(_is_arabizi_digit_token(tok) for tok in tokens)

    # Pad so multi-word phrases can be matched on token boundaries.
    padded = f" {' '.join(tokens)} "
    has_arabizi_words = any(tok in _ARABIZI_WORDS for tok in tokens) or any(
        f" {phrase} " in padded for phrase in _ARABIZI_PHRASES
    )

    # Decision logic
    if has_arabizi_numbers and has_arabizi_words:
        return True

    alpha_count = sum(ch.isalpha() for ch in text)
    if has_arabizi_numbers and alpha_count > len(text) * 0.6:
        return True
    return has_arabizi_words and alpha_count > len(text) * 0.7


def _extract_generated_text(result):
    """Pull the text out of an Inference API JSON payload.

    Accepts a bare string, a dict, or a list whose first element is either of
    those. Both ``generated_text`` and ``translation_text`` keys are
    recognised.

    Returns:
        The stripped text, or None when the payload has no usable text.
    """
    if isinstance(result, list):
        if not result:
            return None
        result = result[0]
    if isinstance(result, str):
        return result.strip()
    if isinstance(result, dict):
        for key in ("generated_text", "translation_text"):
            value = result.get(key)
            if isinstance(value, str):
                return value.strip()
    return None


def arabizi_to_arabic_api(arabizi_text):
    """Convert Arabizi text to Arabic using the Hugging Face Inference API.

    This is best-effort: on a missing token, HTTP error, timeout, malformed
    or empty response, the original text is returned unchanged so the chat
    can continue.

    Args:
        arabizi_text: Text in Arabizi.

    Returns:
        The transliterated text, or *arabizi_text* itself on any failure.
        Never returns None.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        print("❌ HF_TOKEN not found, falling back to original text")
        return arabizi_text

    headers = {"Authorization": f"Bearer {token}"}
    payload = {
        "inputs": arabizi_text,
        "parameters": {
            "max_length": 512,
            "num_beams": 4,
            "early_stopping": True,
        },
    }

    try:
        response = requests.post(
            TRANSLITERATION_API_URL, headers=headers, json=payload, timeout=30
        )

        if response.status_code == 503:
            print("⏳ Model is loading, falling back to original text")
            return arabizi_text
        if response.status_code != 200:
            print(f"❌ API error {response.status_code}: {response.text}")
            return arabizi_text

        result = response.json()
    except requests.exceptions.Timeout:
        print("⏰ API timeout, falling back to original text")
        return arabizi_text
    except requests.exceptions.RequestException as e:
        print(f"❌ API request failed: {e}")
        return arabizi_text
    except Exception as e:
        # Deliberate catch-all: conversion must never break the chat turn.
        print(f"❌ Unexpected error in API conversion: {e}")
        return arabizi_text

    converted = _extract_generated_text(result)
    if not converted:
        # Covers unknown payload shapes and empty generations alike.
        print(f"❌ Unexpected API response format: {result}")
        return arabizi_text
    return converted


def arabic_to_arabizi(arabic_text):
    """Convert Arabic script to Arabizi using hard-coded mappings.

    Whole words found in the word table are replaced first; every remaining
    Arabic character is then transliterated one by one. Non-Arabic text
    passes through untouched.

    Args:
        arabic_text: Text that may contain Arabic script.

    Returns:
        The transliterated, stripped text; falsy input is returned as is.
    """
    if not arabic_text:
        return arabic_text

    # Step 1: whole-word replacements (most specific).
    result = _WORD_MAPPING_RE.sub(
        lambda match: _WORD_MAPPINGS[match.group(0)], arabic_text
    )

    # Step 2: character-level fallback for everything else.
    for arabic_char, arabizi_char in _CHAR_MAPPINGS.items():
        result = result.replace(arabic_char, arabizi_char)

    return result.strip()


def chat_with_atlas(message, history):
    """Generate a reply from Atlas-Chat, converting Arabizi both ways.

    Args:
        message: The user's latest message.
        history: Conversation history supplied by Gradio. It is not sent to
            the model; each turn is answered on its own.

    Returns:
        The model reply, in Arabizi when the input was Arabizi; a greeting
        for empty input; or a user-facing error string if generation fails.
    """
    if not message or not message.strip():
        return "ahlan wa sahlan! kifash n9der n3awnek? / مرحبا! كيفاش نقدر نعاونك؟"

    # Computed before the try so the error branch can reuse it.
    is_arabizi_input = detect_arabizi(message)

    try:
        # Load Atlas-Chat model
        atlas_model = load_atlas_model()

        print("\n" + "=" * 50)
        print("🔍 ATLAS-CHAT DEBUG LOG")
        print("=" * 50)
        print(f"📥 INPUT: '{message}'")
        print(f"🔍 ARABIZI: {is_arabizi_input}")

        # Prepare input for the model
        if is_arabizi_input:
            print("🔄 Converting Arabizi→Arabic via API...")
            model_input = arabizi_to_arabic_api(message)
            print(f"✅ ARABIC: '{model_input}'")
        else:
            print("➡️ No conversion needed")
            model_input = message

        print("🤖 Sending to Atlas-Chat...")

        # Generate response using Atlas-Chat
        messages = [{"role": "user", "content": model_input}]
        outputs = atlas_model(
            messages,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True,
            pad_token_id=atlas_model.tokenizer.eos_token_id,
        )

        # The pipeline returns the whole conversation; the reply is last.
        response = outputs[0]["generated_text"][-1]["content"].strip()
        print(f"✅ RESPONSE: '{response[:100]}{'...' if len(response) > 100 else ''}'")

        # Convert response back to Arabizi if input was Arabizi
        if is_arabizi_input:
            print("🔄 Converting Arabic→Arabizi...")
            arabizi_response = arabic_to_arabizi(response)
            print(f"✅ FINAL: '{arabizi_response[:100]}{'...' if len(arabizi_response) > 100 else ''}'")
            print("=" * 50 + "\n")
            return arabizi_response

        print("=" * 50 + "\n")
        return response

    except Exception as e:
        print(f"\n❌ ERROR: {str(e)}")
        print("=" * 50 + "\n")

        # Return error in appropriate language
        if is_arabizi_input:
            return f"sorry, kan chi mochkil: {str(e)}. 3awd jar'b!"
        return f"عذراً، واجهت خطأ: {str(e)}. جرب مرة أخرى! / Sorry, error occurred: {str(e)}. Try again!"


# Create the Gradio interface
demo = gr.ChatInterface(
    fn=chat_with_atlas,
    title="🏔️ Atlas-Chat: AI-Powered Moroccan Arabic Assistant",
    description=_DESCRIPTION,
    examples=[
        "شكون لي صنعك؟",
        "shkoun li sna3ek?",
        "اشنو هو الطاجين؟",
        "achno howa tajine?",
        "شنو كيتسمى المنتخب المغربي؟",
        "chno kaytsma lmontakhab lmaghribi?",
        "What is Morocco famous for?",
        "كيفاش نقدر نتعلم الدارجة؟",
        "kifash n9der nt3elem darija?",
        "wach kayn atay f lmaghrib?",
        "3lach lmaghrib zwien bzzaf?",
        "kifash nsali tajine?",
        "chno homa l2aklat lmaghribiya?",
        "kayn chi restaurants zwinin f casa?",
        "mr7ba! kif dayr?",
    ],
    cache_examples=False,
)

# Launch the app
if __name__ == "__main__":
    demo.launch()