| import os |
| os.environ["CUDA_VISIBLE_DEVICES"] = "" |
|
|
| import streamlit as st |
| import joblib |
| import re |
| import io |
| import random |
| import hashlib |
| from gtts import gTTS |
| from huggingface_hub import InferenceClient |
|
|
| st.set_page_config(page_title="Darija Comment Filter", page_icon="🇲🇦", layout="wide") |
|
|
| st.markdown(""" |
| <style> |
| @import url('https://fonts.googleapis.com/css2?family=Noto+Sans+Arabic:wght@300;400;600&family=DM+Sans:wght@300;400;500;600;700&display=swap'); |
| html, body, [class*="css"] { font-family:'DM Sans',sans-serif!important; background:#0d1117!important; color:#c9d1d9!important; } |
| .stApp { background:#0d1117!important; } |
| .block-container { padding:2rem 2rem 4rem!important; max-width:1000px!important; } |
| #MainMenu, footer, header { visibility:hidden!important; } |
| .stTextArea textarea { background:#161b22!important; border:1px solid #30363d!important; color:#c9d1d9!important; border-radius:10px!important; font-size:14px!important; } |
| .stTextArea textarea:focus { border-color:#388bfd!important; } |
| [data-testid="stCheckbox"] label { color:#8b949e!important; font-size:13px!important; } |
| .stButton>button { background:#238636!important; color:white!important; border:none!important; border-radius:8px!important; font-weight:600!important; font-size:15px!important; padding:12px 28px!important; width:100%!important; } |
| .stButton>button:hover { background:#2ea043!important; } |
| [data-testid="stMetric"] { background:#161b22!important; border:1px solid #30363d!important; border-radius:12px!important; padding:16px!important; } |
| [data-testid="stMetricValue"] { color:#f0f6fc!important; font-size:28px!important; font-weight:700!important; } |
| [data-testid="stMetricLabel"] { color:#8b949e!important; font-size:11px!important; letter-spacing:1.5px!important; text-transform:uppercase!important; } |
| </style> |
| """, unsafe_allow_html=True) |
|
|
| @st.cache_resource |
| def load_models(): |
| clf = joblib.load("codeswitch_model.joblib") |
| cli = InferenceClient() |
| return clf, cli |
|
|
| classifier, client = load_models() |
|
|
| ARABIC_RE = re.compile(r'[\u0600-\u06FF]') |
| FRENCH_SIGNS = re.compile(r'[àâäçéèêëîïôöùûüÿœæ]', re.IGNORECASE) |
| FRENCH_STOP = set("je tu il elle nous vous les des une pour dans sur avec qui que est sont merci bonjour ça ce cette".split()) |
| LATIN_DARIJA = re.compile(r'\b(wach|labas|bzzaf|bghit|kayn|mzyan|safi|wakha|daba|hada|howa|hiya|ntuma|ana|walakin|3lash|kifash|zwina|mashi|nta|nti|rani|khas|bayna|tayban|makayn|fhmt|t3rf|wqe3|dyal|had|lli|fin|fash|bach|ach)\b', re.IGNORECASE) |
|
|
| COLORS = ["#e74c3c","#e67e22","#2ecc71","#1abc9c","#3498db","#9b59b6","#e91e63","#00bcd4","#ff5722","#f39c12"] |
| NAMES = ["أمين","سارة","يوسف","فاطمة","عمر","نور","خالد","ليلى","رشيد","هند","كريم","سلمى"] |
| TIMES = ["2m","5m","12m","27m","1h","2h","3h","5h","8h","1d","2d","3d"] |
|
|
| def av_color(s): return COLORS[int(hashlib.md5(s.encode()).hexdigest()[:4],16) % len(COLORS)] |
| def initials(n): p=n.split(); return (p[0][0]+p[1][0]).upper() if len(p)>=2 else n[:2].upper() |
|
|
| def detect(text): |
| lbl = classifier.predict([text])[0] |
| ar = bool(ARABIC_RE.search(text)) |
| fr = bool(FRENCH_SIGNS.search(text)) or sum(1 for w in text.lower().split() if w in FRENCH_STOP)>=1 |
| dj = bool(LATIN_DARIJA.search(text)) |
| if lbl=="AR" and fr: return "MIXED" |
| if lbl=="FR" and (ar or dj): return "MIXED" |
| return lbl |
|
|
| def looks_french(t): |
| if FRENCH_SIGNS.search(t): return True |
| return sum(1 for w in t.lower().split() if w in FRENCH_STOP)>=2 |
|
|
| def translate(text, src="fra_Latn"): |
| try: |
| r = client.translation(text, model="facebook/nllb-200-distilled-600M", src_lang=src, tgt_lang="arb_Arab") |
| return r if isinstance(r,str) else getattr(r,'translation_text', str(r)) |
| except: return "[Translation unavailable]" |
|
|
| def full_translate(text, label): |
| if label=="AR": return text, "ar-kept" |
| if label=="FR": return translate(text), "fr-ar" |
| parts = re.split(r'([\u0600-\u06FF][^\u0000-\u0040]*)', text) |
| out = [] |
| for p in parts: |
| p=p.strip() |
| if not p: continue |
| if ARABIC_RE.search(p): out.append(p) |
| elif looks_french(p): out.append(translate(p)) |
| else: out.append(p) |
| return ' '.join(out), "mixed-ar" |
|
|
| def audio(text): |
| try: |
| tts=gTTS(text=text,lang='ar',slow=False); buf=io.BytesIO(); tts.write_to_fp(buf); buf.seek(0); return buf.read() |
| except: return None |
|
|
| def badge(label): |
| d={"AR":("#0a2e1a","#22c55e","#166534","🇲🇦 Darija"),"FR":("#0a1929","#60a5fa","#1e3a5f","🇫🇷 Français"),"MIXED":("#1a0f00","#fb923c","#7c2d12","🔀 Mixed")}.get(label,("#111","#888","#444",label)) |
| return f'<span style="background:{d[0]};color:{d[1]};border:1px solid {d[2]};padding:2px 10px;border-radius:20px;font-size:11px;font-weight:700">{d[3]}</span>' |
|
|
| |
| st.markdown(""" |
| <div style="display:flex;align-items:center;gap:14px;padding-bottom:20px;border-bottom:1px solid #21262d;margin-bottom:28px"> |
| <div style="width:44px;height:44px;background:#238636;border-radius:10px;display:flex;align-items:center;justify-content:center;font-size:24px">🇲🇦</div> |
| <div> |
| <div style="font-size:22px;font-weight:700;color:#f0f6fc">Darija Comment Filter</div> |
| <div style="font-size:13px;color:#6b7280;margin-top:3px">Language detection · Arabic translation · Audio — built for Moroccan social media</div> |
| </div> |
| </div>""", unsafe_allow_html=True) |
|
|
| |
| col_in, col_opt = st.columns([3,1], gap="large") |
| with col_in: |
| st.markdown('<p style="font-size:11px;color:#8b949e;letter-spacing:1.2px;text-transform:uppercase;margin-bottom:6px">Paste comments — one per line</p>', unsafe_allow_html=True) |
| text_in = st.text_area("", placeholder="C'est vraiment incroyable !\nكيف داير اليوم؟\nwach labas, Ça va bien ?", height=180, label_visibility="collapsed") |
| st.markdown('<p style="font-size:11px;color:#8b949e;letter-spacing:1.2px;text-transform:uppercase;margin:12px 0 4px">Or upload a .txt file</p>', unsafe_allow_html=True) |
| uploaded = st.file_uploader("", type=["txt"], label_visibility="collapsed") |
|
|
| with col_opt: |
| st.markdown('<p style="font-size:11px;color:#8b949e;letter-spacing:1.2px;text-transform:uppercase;margin-bottom:10px">Filter by language</p>', unsafe_allow_html=True) |
| f_ar = st.checkbox("🇲🇦 Darija / Arabic", value=True) |
| f_fr = st.checkbox("🇫🇷 French", value=True) |
| f_mix = st.checkbox("🔀 Mixed", value=True) |
| st.markdown('<p style="font-size:11px;color:#8b949e;letter-spacing:1.2px;text-transform:uppercase;margin:18px 0 10px">Options</p>', unsafe_allow_html=True) |
| show_trans = st.checkbox("Show Arabic translation", value=True) |
| gen_audio = st.checkbox("Generate audio", value=True) |
|
|
| st.markdown("<br>", unsafe_allow_html=True) |
| run = st.button("Analyze Comments") |
|
|
| if run: |
| comments = [] |
| if uploaded: |
| comments = [l.strip() for l in uploaded.read().decode("utf-8").splitlines() if l.strip()] |
| elif text_in.strip(): |
| comments = [l.strip() for l in text_in.strip().splitlines() if l.strip()] |
|
|
| if not comments: |
| st.warning("Please enter at least one comment.") |
| st.stop() |
|
|
| with st.spinner("Detecting languages..."): |
| detected = [(c, detect(c)) for c in comments] |
|
|
| counts = {"AR":0,"FR":0,"MIXED":0} |
| for _,l in detected: |
| if l in counts: counts[l]+=1 |
|
|
| st.markdown("<br>", unsafe_allow_html=True) |
| m1,m2,m3,m4 = st.columns(4) |
| m1.metric("Total", len(comments)) |
| m2.metric("Darija", counts["AR"]) |
| m3.metric("French", counts["FR"]) |
| m4.metric("Mixed", counts["MIXED"]) |
| st.markdown("<br>", unsafe_allow_html=True) |
|
|
| sel = [l for l,on in [("AR",f_ar),("FR",f_fr),("MIXED",f_mix)] if on] |
| filtered = [(c,l) for c,l in detected if l in sel] |
|
|
| if not filtered: |
| st.warning("No comments match the selected filters.") |
| st.stop() |
|
|
| st.markdown(f""" |
| <div style="display:flex;align-items:center;margin-bottom:16px;padding-bottom:14px;border-bottom:1px solid #21262d"> |
| <span style="font-size:17px;font-weight:700;color:#f0f6fc">Post Comments</span> |
| <span style="margin-left:auto;font-size:11px;color:#6b7280;letter-spacing:0.5px">SHOWING {len(filtered)} OF {len(comments)}</span> |
| </div>""", unsafe_allow_html=True) |
|
|
| random.seed(42) |
| for i,(text,label) in enumerate(filtered): |
| uname = random.choice(NAMES)+str(random.randint(10,99)) |
| ts = random.choice(TIMES) |
| col = av_color(uname) |
| ini = initials(uname) |
| likes = random.randint(3,240) |
| reps = random.randint(0,40) |
|
|
| translated, method, ab = "", "", None |
| if show_trans: |
| with st.spinner(f"Translating {i+1}/{len(filtered)}..."): |
| translated, method = full_translate(text, label) |
| if gen_audio and method!="ar-kept": |
| ab = audio(translated) |
|
|
| if show_trans and method=="ar-kept": |
| tblock = '<div style="margin-top:10px;padding:8px 12px;background:#0a2e1a;border-radius:8px;font-size:12px;color:#22c55e">Arabic / Darija — no translation needed</div>' |
| elif show_trans: |
| mlabel = "Translated from French" if method=="fr-ar" else "Translated from Mixed" |
| tblock = f'<div style="margin-top:12px;padding:12px 16px;background:#0d1117;border-radius:10px;border-left:3px solid {col}"><div style="font-size:10px;color:#6b7280;margin-bottom:8px;letter-spacing:0.5px">{mlabel}</div><div style="font-family:Noto Sans Arabic,sans-serif;font-size:16px;color:#f0f6fc;direction:rtl;text-align:right;line-height:2">{translated}</div></div>' |
| else: |
| tblock = "" |
|
|
| st.markdown(f""" |
| <div style="background:#161b22;border:1px solid #30363d;border-radius:14px;padding:18px 20px;margin:10px 0"> |
| <div style="display:flex;align-items:center;gap:12px;margin-bottom:12px"> |
| <div style="width:42px;height:42px;border-radius:50%;background:{col};display:flex;align-items:center;justify-content:center;font-weight:700;font-size:15px;color:#fff;flex-shrink:0">{ini}</div> |
| <div style="flex:1"> |
| <div style="display:flex;align-items:center;gap:8px;flex-wrap:wrap"> |
| <span style="font-weight:600;font-size:14px;color:#e6edf3">{uname}</span>{badge(label)} |
| </div> |
| <div style="font-size:11px;color:#6b7280;margin-top:2px">{ts} ago</div> |
| </div> |
| </div> |
| <div style="font-size:15px;color:#c9d1d9;line-height:1.75">{text}</div> |
| {tblock} |
| <div style="display:flex;gap:20px;margin-top:14px;padding-top:12px;border-top:1px solid #21262d"> |
| <span style="font-size:12px;color:#6b7280">👍 {likes}</span> |
| <span style="font-size:12px;color:#6b7280">💬 {reps}</span> |
| <span style="font-size:12px;color:#6b7280">↩️ Reply</span> |
| </div> |
| </div>""", unsafe_allow_html=True) |
|
|
| if ab: |
| st.audio(ab, format="audio/mp3") |
|
|
| st.markdown(""" |
| <div style="margin-top:48px;padding-top:16px;border-top:1px solid #21262d;font-size:11px;color:#484f58;text-align:center"> |
| Classifier: Char TF-IDF + LinearSVC + Hybrid Rules · 96.1% accuracy · κ = 0.94 |
| | Translation: NLLB-200 (Meta AI) via HF Inference API · BLEU 0.59 |
| </div>""", unsafe_allow_html=True) |
|
|