|
|
import gradio as gr |
|
|
import torch |
|
|
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM |
|
|
import re |
|
|
|
|
|
|
|
|
# Lazily-initialized global model handles. They stay None until the first
# call to load_models(), so importing this module is cheap.
atlas_pipe = None  # transformers text-generation pipeline (Atlas-Chat-2B)


transliteration_tokenizer = None  # tokenizer for the Arabizi->Arabic seq2seq model


transliteration_model = None  # Arabizi->Arabic seq2seq model
|
|
|
|
|
def load_models():
    """Load and cache the Atlas-Chat and Transliteration models.

    Both models are kept in module-level globals, so only the first call
    actually downloads/initializes them; later calls return the cached
    instances immediately.

    Returns:
        tuple: (atlas_pipe, transliteration_tokenizer, transliteration_model)
    """
    global atlas_pipe, transliteration_tokenizer, transliteration_model

    if atlas_pipe is None:
        # NOTE: original log strings were encoding-corrupted; restored here.
        print("🏔️ Loading Atlas-Chat-2B model...")
        atlas_pipe = pipeline(
            "text-generation",
            model="MBZUAI-Paris/Atlas-Chat-2B",
            # bfloat16 halves memory use versus float32.
            model_kwargs={"torch_dtype": torch.bfloat16},
            device="cuda" if torch.cuda.is_available() else "cpu"
        )
        print("✅ Atlas-Chat model loaded!")

    if transliteration_tokenizer is None or transliteration_model is None:
        print("🔄 Loading Transliteration model...")
        transliteration_tokenizer = AutoTokenizer.from_pretrained("atlasia/Transliteration-Moroccan-Darija")
        transliteration_model = AutoModelForSeq2SeqLM.from_pretrained("atlasia/Transliteration-Moroccan-Darija")
        print("✅ Transliteration model loaded!")

    return atlas_pipe, transliteration_tokenizer, transliteration_model
|
|
|
|
|
def detect_arabizi(text):
    """
    Heuristically decide whether *text* is Arabizi — Moroccan Darija
    written in Latin letters, with digits standing in for Arabic sounds.

    Returns True when the text looks like Arabizi; False for Arabic-script
    input, near-empty strings, and everything else.
    """
    # Too short to classify meaningfully.
    if not text or len(text.strip()) < 2:
        return False

    # Any Arabic-script character means this is not Arabizi.
    if re.search(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]', text):
        return False

    # Digits used as stand-ins for Arabic letters (3 = ain, 7 = haa, ...).
    digit_hits = any(d in text for d in ('2', '3', '7', '9', '5', '6', '8'))

    # Frequent Darija words/stems as usually spelled in Latin script.
    darija_vocab = (
        'wach', 'wash', 'ach', 'achno', 'chno', 'shno', 'shkoun', 'chkoun',
        'kif', 'kifash', 'ki', 'kayf', 'kien', 'kima',
        'feen', 'fin', 'fen', 'fain', 'mnin',
        'imta', 'meta', 'waqt', 'mata', 'emta',
        'hna', 'ahna', 'ana', 'nta', 'nti', 'ntuma', 'ntouma',
        'howa', 'hiya', 'huma', 'houma', 'hoa', 'hia',
        'had', 'hadchi', 'hada', 'hadi', 'hadou', 'hadouk',
        'bghit', 'bghiti', 'bgha', 'bghina', 'bghitiou',
        'galt', 'galti', 'gal', 'galet', 'galou',
        'rah', 'raha', 'rahi', 'rahom', 'rahin',
        'kan', 'kanu', 'kana', 'kanet', 'kano',
        'ghadi', 'ghad', 'gha', 'ghadia', 'ghadiyin',
        'daba', 'dak', 'dakchi', 'dik', 'dok',
        'bzf', 'bzzaf', 'bezzaf', 'bzaaaaf',
        'chway', 'chwiya', 'shwiya', 'chwia',
        'khoya', 'khuya', 'akhi', 'kho',
        'khti', 'khtiya', 'ukhti', 'kht',
        'mama', 'baba', 'lwaldin', 'lwalidin',
        'salam', 'salamu aleikum', 'slm',
        'yallah', 'yalla', 'hya', 'aji',
        'mabghitsh', 'mabghach', 'makansh', 'machi',
        'walakin', 'walaken', 'ama', 'mais',
        'kayn', 'makaynsh', 'chi', 'tayi'
    )
    lowered = text.lower()
    word_hits = any(term in lowered for term in darija_vocab)

    # Combine signals: both together is a sure match; otherwise one signal
    # plus a mostly-alphabetic text is enough.
    alpha_count = sum(1 for ch in text if ch.isalpha())
    if digit_hits and word_hits:
        return True
    if digit_hits and alpha_count > len(text) * 0.6:
        return True
    if word_hits and alpha_count > len(text) * 0.7:
        return True

    return False
|
|
|
|
|
def arabizi_to_arabic_ai(arabizi_text):
    """
    Convert Arabizi text to Arabic script with the dedicated seq2seq
    transliteration model.

    Best-effort: on any failure the original text is returned unchanged so
    the caller always gets usable input for the chat model.
    """
    try:
        _, tokenizer, model = load_models()

        input_tokens = tokenizer(arabizi_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Beam search gives noticeably better transliterations than greedy
        # decoding; no_grad since this is pure inference.
        with torch.no_grad():
            output_tokens = model.generate(
                **input_tokens,
                max_length=512,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=2
            )

        arabic_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

        return arabic_text.strip()

    except Exception as e:
        # NOTE: original message was encoding-corrupted; restored here.
        print(f"❌ Error in Arabizi->Arabic conversion: {e}")
        return arabizi_text
|
|
|
|
|
def arabic_to_arabizi(arabic_text):
    """
    Convert Arabic-script text to Arabizi (Latin letters + digits).

    Whole-word Darija mappings are applied first so common words keep their
    conventional Latin spelling; any remaining Arabic characters are then
    transliterated one-by-one. Kept as a rule-based fallback since no
    reverse (Arabic->Arabizi) seq2seq model is available.

    NOTE(review): the Arabic literals below were reconstructed from an
    encoding-corrupted source using their paired Arabizi values; spellings
    should be confirmed by a Darija speaker. The corrupted source also
    contained a duplicate key for 'makaynsh'/'makainch', deduplicated here.
    """
    if not arabic_text:
        return arabic_text

    # Common Darija words with their conventional Arabizi spellings.
    word_mappings = {
        # pronouns
        'أنا': 'ana', 'نتا': 'nta', 'نتي': 'nti', 'هوا': 'howa', 'هيا': 'hiya',
        'حنا': 'hna', 'أحنا': 'ahna', 'نتوما': 'ntuma', 'هوما': 'huma',
        # question words
        'شكون': 'shkoun', 'أشنو': 'achno', 'شنو': 'chno', 'واش': 'wach',
        'كيفاش': 'kifash', 'كيف': 'kif', 'فين': 'feen', 'منين': 'mnin',
        'إمتا': 'imta', 'متا': 'meta', 'علاش': '3lach', 'أش': 'ach',
        # common verbs
        'بغيت': 'bghit', 'بغيتي': 'bghiti', 'بغا': 'bgha', 'بغينا': 'bghina',
        'كان': 'kan', 'كانا': 'kana', 'كانت': 'kanet', 'كانو': 'kanu',
        'قلت': 'galt', 'قلتي': 'galti', 'قال': 'gal', 'قالت': 'galet',
        'راح': 'rah', 'راها': 'raha', 'راهي': 'rahi', 'راهم': 'rahom',
        'غادي': 'ghadi', 'غاد': 'ghad', 'غا': 'gha',
        # demonstratives
        'هاد': 'had', 'هادا': 'hada', 'هادي': 'hadi', 'هادشي': 'hadchi',
        'داك': 'dak', 'ديك': 'dik', 'داكشي': 'dakchi',
        # quantity / negation
        'بزاف': 'bzzaf', 'شوية': 'chwiya', 'كولشي': 'kolchi',
        'ماشي': 'machi', 'مابغيتش': 'mabghitsh', 'ماكاينش': 'makaynsh',
        'دابا': 'daba', 'توا': 'tawa', 'غدا': 'ghda',
        # family / greetings
        'ماما': 'mama', 'بابا': 'baba', 'خويا': 'khoya', 'ختي': 'khti',
        'سلام': 'salam', 'يالاه': 'yallah',
        # places / food
        'المغرب': 'lmaghrib', 'مغرب': 'maghrib',
        'طاجين': 'tajine', 'أتاي': 'atay', 'خوبز': 'khobz',
        'كاين': 'kayn', 'شي': 'chi',
        # adjectives
        'زوين': 'zwin', 'زوينا': 'zwina', 'مزيان': 'mzyan', 'مزيانا': 'mzyana'
    }

    # Per-character fallback transliteration. Digits follow the Arabic
    # chat-alphabet convention: 7 = ح, 3 = ع, 9 = ق, 6 = ط, 2 = ء.
    char_mappings = {
        'ا': 'a', 'ب': 'b', 'ت': 't', 'ث': 'th', 'ج': 'j', 'ح': '7',
        'خ': 'kh', 'د': 'd', 'ذ': 'dh', 'ر': 'r', 'ز': 'z', 'س': 's',
        'ش': 'sh', 'ص': 's', 'ض': 'd', 'ط': '6', 'ظ': 'z', 'ع': '3',
        'غ': 'gh', 'ف': 'f', 'ق': '9', 'ك': 'k', 'ل': 'l', 'م': 'm',
        'ن': 'n', 'ه': 'h', 'و': 'w', 'ي': 'y', 'ء': '2',
        'آ': 'aa', 'أ': 'a', 'إ': 'i', 'ة': 'a', 'ى': 'a',
        '؟': '?', '،': ',', '؛': ';', '：': ':', '！': '!',
        'َ': 'a', 'ُ': 'o', 'ِ': 'i', 'ً': 'an', 'ٌ': 'on', 'ٍ': 'in'
    }

    result = arabic_text

    # Whole words first, so e.g. "شكون" becomes "shkoun" rather than a
    # letter-by-letter rendering.
    for arabic_word, arabizi_word in word_mappings.items():
        result = re.sub(r'\b' + re.escape(arabic_word) + r'\b', arabizi_word, result)

    # Then transliterate any remaining Arabic characters.
    for arabic_char, arabizi_char in char_mappings.items():
        result = result.replace(arabic_char, arabizi_char)

    return result.strip()
|
|
|
|
|
def chat_with_atlas(message, history):
    """Gradio chat handler: route the user message through Atlas-Chat,
    transparently converting Arabizi input and output.

    Args:
        message: The user's latest message.
        history: Prior chat turns (supplied by gr.ChatInterface; unused).

    Returns:
        str: The model's reply, in the same script the user wrote in.
    """
    if not message.strip():
        # Friendly bilingual greeting for empty input.
        # NOTE: Arabic half was encoding-corrupted in the source; restored.
        return "ahlan wa sahlan! kifash n9der n3awnek? / مرحبا! كيفاش نقدر نعاونك؟"

    try:
        atlas_model, _, _ = load_models()

        # Decide which script the user wrote in.
        is_arabizi_input = detect_arabizi(message)

        if is_arabizi_input:
            # Atlas-Chat works best on Arabic script, so convert first.
            print(f"🔄 Converting Arabizi: '{message}'")
            arabic_input = arabizi_to_arabic_ai(message)
            print(f"✅ Converted to Arabic: '{arabic_input}'")
            model_input = arabic_input
        else:
            model_input = message

        messages = [{"role": "user", "content": model_input}]

        outputs = atlas_model(
            messages,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True,
            pad_token_id=atlas_model.tokenizer.eos_token_id
        )

        # The pipeline returns the whole conversation; the last turn is the reply.
        response = outputs[0]["generated_text"][-1]["content"].strip()
        print(f"🤖 Atlas response: '{response}'")

        # Mirror the user's script: Arabizi in -> Arabizi out.
        if is_arabizi_input:
            arabizi_response = arabic_to_arabizi(response)
            print(f"🔄 Converted to Arabizi: '{arabizi_response}'")
            return arabizi_response
        else:
            return response

    except Exception as e:
        print(f"❌ Error in chat: {e}")

        # Apologize in the same script the user wrote in.
        if detect_arabizi(message):
            return f"sorry, kan chi mochkil: {str(e)}. 3awd jar'b!"
        else:
            return f"عذراً، واجهت خطأ: {str(e)}. جرب مرة أخرى! / Sorry, error occurred: {str(e)}. Try again!"
|
|
|
|
|
|
|
|
# Chat UI. Title, description, and the Arabic example prompts were
# encoding-corrupted in the source; they are restored here (each Arabic
# example is paired with its Arabizi twin on the following line —
# NOTE(review): reconstructed spellings should be confirmed).
demo = gr.ChatInterface(
    fn=chat_with_atlas,
    title="🏔️ Atlas-Chat: Advanced Moroccan Arabic AI",
    description="""
**مرحبا بك في أطلس شات المطور!** Welcome to Advanced Atlas-Chat! 🇲🇦

**🧠 AI-Powered Language Detection & Conversion:**
- **Arabic Script (العربية)** → AI responds in Arabic
- **Arabizi (3arabi bi 7oruf latin)** → AI-powered conversion → Arabizi response
- **English** → AI responds in English

**⚡ NEW: Professional Arabizi Conversion**
- Uses specialized AI model trained on Moroccan Darija
- Perfect understanding of context: "kayn chi" → "كاين شي"
- Handles complex phrases accurately

**جرب هذه الأسئلة / Try these questions:**
""",
    examples=[
        "شكون لي صنعك؟",
        "shkoun li sna3ek?",
        "اشنو هو الطاجين؟",
        "achno howa tajine?",
        "شنو كيتسمى المنتخب المغربي؟",
        "chno kaytsma lmontakhab lmaghribi?",
        "What is Morocco famous for?",
        "كيفاش نقدر نتعلم الدارجة؟",
        "kifash n9der nt3elem darija?",
        "wach kayn atay f lmaghrib?",
        "3lach lmaghrib zwien bzzaf?",
        "kifash nsali tajine?",
        "chno homa l2aklat lmaghribiya?",
        "kayn chi restaurants zwinin f casa?",
        "mr7ba! kif dayr?"
    ],
    cache_examples=False
)
|
|
|
|
|
|
|
|
if __name__ == "__main__":


    # Start the Gradio app; models load lazily on the first chat message.
    demo.launch()