import os os.environ["CUDA_VISIBLE_DEVICES"] = "" import streamlit as st import joblib import re import io import random import hashlib from gtts import gTTS from huggingface_hub import InferenceClient st.set_page_config(page_title="Darija Comment Filter", page_icon="🇲🇦", layout="wide") st.markdown(""" """, unsafe_allow_html=True) @st.cache_resource def load_models(): clf = joblib.load("codeswitch_model.joblib") cli = InferenceClient() return clf, cli classifier, client = load_models() ARABIC_RE = re.compile(r'[\u0600-\u06FF]') FRENCH_SIGNS = re.compile(r'[àâäçéèêëîïôöùûüÿœæ]', re.IGNORECASE) FRENCH_STOP = set("je tu il elle nous vous les des une pour dans sur avec qui que est sont merci bonjour ça ce cette".split()) LATIN_DARIJA = re.compile(r'\b(wach|labas|bzzaf|bghit|kayn|mzyan|safi|wakha|daba|hada|howa|hiya|ntuma|ana|walakin|3lash|kifash|zwina|mashi|nta|nti|rani|khas|bayna|tayban|makayn|fhmt|t3rf|wqe3|dyal|had|lli|fin|fash|bach|ach)\b', re.IGNORECASE) COLORS = ["#e74c3c","#e67e22","#2ecc71","#1abc9c","#3498db","#9b59b6","#e91e63","#00bcd4","#ff5722","#f39c12"] NAMES = ["أمين","سارة","يوسف","فاطمة","عمر","نور","خالد","ليلى","رشيد","هند","كريم","سلمى"] TIMES = ["2m","5m","12m","27m","1h","2h","3h","5h","8h","1d","2d","3d"] def av_color(s): return COLORS[int(hashlib.md5(s.encode()).hexdigest()[:4],16) % len(COLORS)] def initials(n): p=n.split(); return (p[0][0]+p[1][0]).upper() if len(p)>=2 else n[:2].upper() def detect(text): lbl = classifier.predict([text])[0] ar = bool(ARABIC_RE.search(text)) fr = bool(FRENCH_SIGNS.search(text)) or sum(1 for w in text.lower().split() if w in FRENCH_STOP)>=1 dj = bool(LATIN_DARIJA.search(text)) if lbl=="AR" and fr: return "MIXED" if lbl=="FR" and (ar or dj): return "MIXED" return lbl def looks_french(t): if FRENCH_SIGNS.search(t): return True return sum(1 for w in t.lower().split() if w in FRENCH_STOP)>=2 def translate(text, src="fra_Latn"): try: r = client.translation(text, model="facebook/nllb-200-distilled-600M", src_lang=src, tgt_lang="arb_Arab") return r if isinstance(r,str) else getattr(r,'translation_text', str(r)) except: return "[Translation unavailable]" def full_translate(text, label): if label=="AR": return text, "ar-kept" if label=="FR": return translate(text), "fr-ar" parts = re.split(r'([\u0600-\u06FF][^\u0000-\u0040]*)', text) out = [] for p in parts: p=p.strip() if not p: continue if ARABIC_RE.search(p): out.append(p) elif looks_french(p): out.append(translate(p)) else: out.append(p) return ' '.join(out), "mixed-ar" def audio(text): try: tts=gTTS(text=text,lang='ar',slow=False); buf=io.BytesIO(); tts.write_to_fp(buf); buf.seek(0); return buf.read() except: return None def badge(label): d={"AR":("#0a2e1a","#22c55e","#166534","🇲🇦 Darija"),"FR":("#0a1929","#60a5fa","#1e3a5f","🇫🇷 Français"),"MIXED":("#1a0f00","#fb923c","#7c2d12","🔀 Mixed")}.get(label,("#111","#888","#444",label)) return f'{d[3]}' # Header st.markdown("""
Paste comments — one per line
', unsafe_allow_html=True) text_in = st.text_area("", placeholder="C'est vraiment incroyable !\nكيف داير اليوم؟\nwach labas, Ça va bien ?", height=180, label_visibility="collapsed") st.markdown('Or upload a .txt file
', unsafe_allow_html=True) uploaded = st.file_uploader("", type=["txt"], label_visibility="collapsed") with col_opt: st.markdown('Filter by language
', unsafe_allow_html=True) f_ar = st.checkbox("🇲🇦 Darija / Arabic", value=True) f_fr = st.checkbox("🇫🇷 French", value=True) f_mix = st.checkbox("🔀 Mixed", value=True) st.markdown('Options
', unsafe_allow_html=True) show_trans = st.checkbox("Show Arabic translation", value=True) gen_audio = st.checkbox("Generate audio", value=True) st.markdown("