"""Streamlit app that labels comments as Arabic/Darija, French or mixed.

Each comment is classified with a joblib-loaded model plus a few regex
rules, optionally translated to Arabic through the Hugging Face Inference
API, and optionally converted to speech with gTTS.
"""
import os

# Must run before the remaining imports so that any library imported below
# sees no CUDA devices (presumably to force CPU-only execution — confirm).
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import hashlib
import html
import io
import random
import re
from collections import Counter

import joblib
import streamlit as st
from gtts import gTTS
from huggingface_hub import InferenceClient

st.set_page_config(page_title="Darija Comment Filter", page_icon="🇲🇦", layout="wide")

st.markdown(""" """, unsafe_allow_html=True)


@st.cache_resource
def load_models():
    """Load the language classifier and create the inference client.

    Wrapped in ``st.cache_resource`` so Streamlit reuses the returned objects
    instead of rebuilding them on every script rerun.

    Returns:
        A ``(classifier, client)`` tuple.
    """
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only ship a model file from a trusted source.
    clf = joblib.load("codeswitch_model.joblib")
    cli = InferenceClient()
    return clf, cli


classifier, client = load_models()

# Any character in the Unicode Arabic block.
ARABIC_RE = re.compile(r'[\u0600-\u06FF]')
# A run that starts with an Arabic character and continues until the first
# character at or below U+0040 (controls, space, digits, ASCII punctuation).
ARABIC_RUN_RE = re.compile(r'([\u0600-\u06FF][^\u0000-\u0040]*)')
# Accented letters and ligatures treated as a sign of French text.
FRENCH_SIGNS = re.compile(r'[àâäçéèêëîïôöùûüÿœæ]', re.IGNORECASE)
FRENCH_STOP = set(
    "je tu il elle nous vous les des une pour dans sur avec qui que est sont "
    "merci bonjour ça ce cette".split()
)
# Darija words written in Latin script.
LATIN_DARIJA = re.compile(
    r'\b(wach|labas|bzzaf|bghit|kayn|mzyan|safi|wakha|daba|hada|howa|hiya'
    r'|ntuma|ana|walakin|3lash|kifash|zwina|mashi|nta|nti|rani|khas|bayna'
    r'|tayban|makayn|fhmt|t3rf|wqe3|dyal|had|lli|fin|fash|bach|ach)\b',
    re.IGNORECASE,
)

COLORS = ["#e74c3c", "#e67e22", "#2ecc71", "#1abc9c", "#3498db",
          "#9b59b6", "#e91e63", "#00bcd4", "#ff5722", "#f39c12"]
NAMES = ["أمين", "سارة", "يوسف", "فاطمة", "عمر", "نور",
         "خالد", "ليلى", "رشيد", "هند", "كريم", "سلمى"]
TIMES = ["2m", "5m", "12m", "27m", "1h", "2h", "3h", "5h", "8h", "1d", "2d", "3d"]

# label -> (three colour values, display text).
# NOTE(review): the colour fields are not referenced by the markup that
# badge() returns in this file — confirm whether styling markup is missing.
BADGE_STYLES = {
    "AR": ("#0a2e1a", "#22c55e", "#166534", "🇲🇦 Darija"),
    "FR": ("#0a1929", "#60a5fa", "#1e3a5f", "🇫🇷 Français"),
    "MIXED": ("#1a0f00", "#fb923c", "#7c2d12", "🔀 Mixed"),
}


def av_color(s: str) -> str:
    """Return a colour from COLORS chosen deterministically from ``s``.

    The first four hex digits of the MD5 digest of ``s`` select the index, so
    the same string always maps to the same colour.
    """
    digest = hashlib.md5(s.encode()).hexdigest()
    return COLORS[int(digest[:4], 16) % len(COLORS)]


def initials(n: str) -> str:
    """Return upper-cased initials for the name ``n``.

    With two or more whitespace-separated words, the first characters of the
    first two words are used; otherwise the first two characters of ``n``.
    """
    words = n.split()
    if len(words) >= 2:
        return (words[0][0] + words[1][0]).upper()
    return n[:2].upper()


def detect(text: str):
    """Classify ``text`` and override the model with rule-based hints.

    The classifier label is upgraded to ``"MIXED"`` when it says ``"AR"`` but
    the text shows a French hint (accented character or stop word), or when
    it says ``"FR"`` but the text contains Arabic script or a Latin-script
    Darija word. Any other label is returned unchanged.
    """
    lbl = classifier.predict([text])[0]
    has_arabic = bool(ARABIC_RE.search(text))
    has_french = bool(FRENCH_SIGNS.search(text)) or any(
        w in FRENCH_STOP for w in text.lower().split()
    )
    has_darija = bool(LATIN_DARIJA.search(text))
    if lbl == "AR" and has_french:
        return "MIXED"
    if lbl == "FR" and (has_arabic or has_darija):
        return "MIXED"
    return lbl


def looks_french(t: str) -> bool:
    """Return True if ``t`` has a French accent or at least two stop words."""
    if FRENCH_SIGNS.search(t):
        return True
    return sum(1 for w in t.lower().split() if w in FRENCH_STOP) >= 2


def translate(text: str, src: str = "fra_Latn") -> str:
    """Translate ``text`` to Arabic (``arb_Arab``) via the inference client.

    Args:
        text: Text to translate.
        src: Source language code passed as ``src_lang``.

    Returns:
        The translated string, or ``"[Translation unavailable]"`` if the
        request or the handling of its result raises.
    """
    try:
        r = client.translation(
            text,
            model="facebook/nllb-200-distilled-600M",
            src_lang=src,
            tgt_lang="arb_Arab",
        )
        # The client may hand back a plain string or an object carrying the
        # text in ``translation_text``; fall back to str() for anything else.
        return r if isinstance(r, str) else getattr(r, 'translation_text', str(r))
    except Exception:
        # Best effort: a failed translation must not break the page. Catching
        # Exception (not a bare except) lets BaseException-derived control
        # flow such as KeyboardInterrupt propagate.
        return "[Translation unavailable]"


def full_translate(text: str, label):
    """Produce the Arabic rendering of a comment according to its label.

    Returns:
        A ``(text, method)`` tuple where ``method`` is ``"ar-kept"`` (label
        ``"AR"``, text unchanged), ``"fr-ar"`` (label ``"FR"``, whole text
        translated) or ``"mixed-ar"`` (any other label: the text is split
        into Arabic-script runs and the rest, and only the parts that look
        French are translated).
    """
    if label == "AR":
        return text, "ar-kept"
    if label == "FR":
        return translate(text), "fr-ar"

    out = []
    for part in ARABIC_RUN_RE.split(text):
        part = part.strip()
        if not part:
            continue
        if ARABIC_RE.search(part):
            out.append(part)
        elif looks_french(part):
            out.append(translate(part))
        else:
            out.append(part)
    return ' '.join(out), "mixed-ar"


def audio(text: str):
    """Return Arabic text-to-speech audio for ``text`` as bytes.

    Returns ``None`` if speech generation raises.
    """
    try:
        tts = gTTS(text=text, lang='ar', slow=False)
        buf = io.BytesIO()
        tts.write_to_fp(buf)
        return buf.getvalue()
    except Exception:
        # Best effort: the comment is still shown without an audio player.
        return None


def badge(label) -> str:
    """Return the display text for a language label.

    Labels without an entry in BADGE_STYLES are rendered as themselves.
    """
    d = BADGE_STYLES.get(label, ("#111", "#888", "#444", label))
    return f'{d[3]}'


def _split_comments(raw: str) -> list:
    """Return the stripped, non-empty lines of ``raw``."""
    return [line.strip() for line in raw.splitlines() if line.strip()]


# Header
st.markdown("""
🇲🇦
Darija Comment Filter
Language detection · Arabic translation · Audio — built for Moroccan social media
""", unsafe_allow_html=True)

# Input layout
col_in, col_opt = st.columns([3, 1], gap="large")

with col_in:
    st.markdown("\n\nPaste comments — one per line\n\n", unsafe_allow_html=True)
    # Widgets get a real label (hidden via label_visibility) because Streamlit
    # discourages empty labels for accessibility reasons.
    text_in = st.text_area(
        "Comments",
        placeholder="C'est vraiment incroyable !\nكيف داير اليوم؟\nwach labas, Ça va bien ?",
        height=180,
        label_visibility="collapsed",
    )
    st.markdown("\n\nOr upload a .txt file\n\n", unsafe_allow_html=True)
    uploaded = st.file_uploader("Upload a .txt file", type=["txt"], label_visibility="collapsed")

with col_opt:
    st.markdown("\n\nFilter by language\n\n", unsafe_allow_html=True)
    f_ar = st.checkbox("🇲🇦 Darija / Arabic", value=True)
    f_fr = st.checkbox("🇫🇷 French", value=True)
    f_mix = st.checkbox("🔀 Mixed", value=True)
    st.markdown("\n\nOptions\n\n", unsafe_allow_html=True)
    show_trans = st.checkbox("Show Arabic translation", value=True)
    gen_audio = st.checkbox("Generate audio", value=True)

# NOTE(review): the source's indentation was lost; the spacer and button are
# placed below both columns here — confirm against the intended layout.
st.markdown("\n", unsafe_allow_html=True)
run = st.button("Analyze Comments")

if run:
    comments = []
    if uploaded is not None:
        try:
            # utf-8-sig also accepts plain UTF-8 and drops a leading BOM that
            # would otherwise end up inside the first comment.
            raw_upload = uploaded.read().decode("utf-8-sig")
        except UnicodeDecodeError:
            st.error("The uploaded file is not valid UTF-8 text.")
            st.stop()
        comments = _split_comments(raw_upload)
    elif text_in.strip():
        comments = _split_comments(text_in)

    if not comments:
        st.warning("Please enter at least one comment.")
        st.stop()

    with st.spinner("Detecting languages..."):
        detected = [(c, detect(c)) for c in comments]

    # Counter returns 0 for labels that never occurred.
    counts = Counter(lbl for _, lbl in detected)

    st.markdown("\n", unsafe_allow_html=True)
    m1, m2, m3, m4 = st.columns(4)
    m1.metric("Total", len(comments))
    m2.metric("Darija", counts["AR"])
    m3.metric("French", counts["FR"])
    m4.metric("Mixed", counts["MIXED"])
    st.markdown("\n", unsafe_allow_html=True)

    sel = {lbl for lbl, on in (("AR", f_ar), ("FR", f_fr), ("MIXED", f_mix)) if on}
    filtered = [(c, lbl) for c, lbl in detected if lbl in sel]

    if not filtered:
        st.warning("No comments match the selected filters.")
        st.stop()

    st.markdown(f"""
Post Comments SHOWING {len(filtered)} OF {len(comments)}
""", unsafe_allow_html=True)

    # A private generator with a fixed seed yields the same fake usernames,
    # timestamps and counters on every rerun (identical to random.seed(42))
    # without reseeding the process-wide generator.
    rng = random.Random(42)

    for i, (text, label) in enumerate(filtered):
        uname = rng.choice(NAMES) + str(rng.randint(10, 99))
        ts = rng.choice(TIMES)
        # NOTE(review): not referenced by the markup visible below —
        # presumably meant for avatar styling; confirm.
        col = av_color(uname)
        ini = initials(uname)
        likes = rng.randint(3, 240)
        reps = rng.randint(0, 40)

        translated, method, ab = "", "", None
        if show_trans:
            with st.spinner(f"Translating {i+1}/{len(filtered)}..."):
                translated, method = full_translate(text, label)

        # Skip speech when there is nothing to read (translation switched
        # off) or when the comment was kept as-is.
        if gen_audio and translated and method != "ar-kept":
            ab = audio(translated)

        # Comment text comes from the user and translations from a remote
        # service; escape both before embedding them in markup rendered with
        # unsafe_allow_html=True.
        safe_text = html.escape(text)
        safe_translated = html.escape(translated)

        if show_trans and method == "ar-kept":
            tblock = "\nArabic / Darija — no translation needed\n"
        elif show_trans:
            mlabel = "Translated from French" if method == "fr-ar" else "Translated from Mixed"
            tblock = f"\n{mlabel}\n{safe_translated}\n"
        else:
            tblock = ""

        st.markdown(
            f"\n{ini}\n{uname}{badge(label)}\n{ts} ago\n{safe_text}\n{tblock}\n"
            f"👍 {likes} 💬 {reps} ↩️ Reply\n",
            unsafe_allow_html=True,
        )

        if ab:
            st.audio(ab, format="audio/mp3")

# NOTE(review): footer placed at top level (shown whenever the script runs to
# the end) — confirm it was not meant to sit inside the `if run:` branch.
st.markdown("""
Classifier: Char TF-IDF + LinearSVC + Hybrid Rules  ·  96.1% accuracy  ·  κ = 0.94   |   Translation: NLLB-200 (Meta AI) via HF Inference API  ·  BLEU 0.59
""", unsafe_allow_html=True)