"""Gradio service exposing a single /analyze route with two modes:

- "rating":  zero-shot XNLI classification of story text into content
  categories (violence, sexual content, gambling, ...) mapped to a
  minimum age rating.
- "abusive": per-sentence abusive-language detection with a fine-tuned
  IndoBERT model, annotating flagged sentences in the returned text.
"""

import os
import re

import gradio as gr
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# ======================
# ENV
# ======================
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
DEBUG = True


def log(msg):
    """Print *msg* when DEBUG is on (flush=True so container logs stream live)."""
    if DEBUG:
        print(msg, flush=True)


# ======================
# MODEL 1 — RATING & CATEGORY (zero-shot classification)
# ======================
classifier = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",
    framework="pt",
    use_fast=False,
)

# ======================
# MODEL 2 — ABUSIVE (IndoBERT, loaded from a local checkpoint directory)
# ======================
ABUSIVE_MODEL_PATH = "./indoBERT_abusive"

abusive_tokenizer = AutoTokenizer.from_pretrained(ABUSIVE_MODEL_PATH)
abusive_model = AutoModelForSequenceClassification.from_pretrained(ABUSIVE_MODEL_PATH)
abusive_model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
abusive_model.to(device)

# Class index -> label. "Tidak Abusif" = not abusive, "Abusif" = abusive.
id2label = {0: "Tidak Abusif", 1: "Abusif"}

# ======================
# CONFIG
# ======================
# Candidate hypotheses (Indonesian) scored by the zero-shot model, per category.
SEMANTIC_EVENT = {
    "sadisme": [
        "seseorang membunuh manusia lain dengan cara kejam",
        "orang tua membunuh anaknya sendiri",
        "tubuh manusia dipotong dan dimasak",
        "manusia dikorbankan untuk kepentingan lain",
        "kekerasan dengan menghancurkan atau menghilangkan anggota tubuh",
        "pembunuhan dengan cara yang tidak lazim seperti mutilasi atau penyiksaan",
        "kekerasan yang mengakibatkan korban cacat",
    ],
    "kekerasan fisik": [
        "seseorang melukai tubuh manusia lain",
        "manusia diserang secara fisik",
        "tindakan kekerasan yang menyebabkan kematian",
        "perundungan dengan menggunakan aktivitas fisik",
    ],
    "kekerasan verbal": [
        "teriakan marah dan ancaman keras",
        "perkataan kasar yang menakut-nakuti",
        "kalimat jorok yang mengarah ke seksualitas",
        "penghinaan terhadap fisik",
    ],
    "seksual": [
        "hubungan intim antara pria dan wanita",
        "perkataan dan kalimat vulgar terhadap bagian genital pria dan wanita",
        "kalimat yang mengarah ke aktivitas seksual",
    ],
    "perjudian": [
        "bertaruh harta atau barang untuk mendapatkan keuntungan",
        "bertaruh nasib dengan permainan atau pertarungan",
    ],
    "narkoba": [
        "mengonsumsi obat terlarang",
        "menjual dan mendistribusi obat terlarang",
        "kalimat yang mengandung jenis-jenis obat terlarang",
    ],
}

# Minimum zero-shot score a category must reach to count as detected.
THRESHOLD = {
    "sadisme": 0.65,
    "kekerasan fisik": 0.80,
    "kekerasan verbal": 0.80,
    "seksual": 0.80,
    "perjudian": 0.85,
    "narkoba": 0.80,
}

# Sentence boundary: split after '.', '!' or '?' followed by whitespace.
# Compiled once and shared by split_kalimat() and filter_abusive()
# (previously the same pattern was duplicated in both places).
_SENTENCE_SPLIT = re.compile(r'(?<=[.!?])\s+')


# ======================
# UTIL
# ======================
def split_kalimat(text):
    """Split *text* into sentences, keeping only those at least 20 chars long."""
    stripped = (k.strip() for k in _SENTENCE_SPLIT.split(text))
    return [s for s in stripped if len(s) >= 20]


def sliding_window(kalimat, window=4):
    """Join consecutive sentences into overlapping windows of *window* sentences.

    NOTE(review): when there are fewer than *window* sentences, the sentences
    are returned individually (not joined into one window) — presumably an
    intentional fallback for short texts; confirm with the original author.
    """
    if len(kalimat) < window:
        return kalimat
    return [
        " ".join(kalimat[i:i + window])
        for i in range(len(kalimat) - window + 1)
    ]


def rating_usia(kategori):
    """Map the set of detected categories to a minimum age rating (21/17/13/0)."""
    if "sadisme" in kategori:
        return 21
    if any(k in kategori for k in ["seksual", "perjudian", "narkoba"]):
        return 17
    if any(k in kategori for k in ["kekerasan fisik", "kekerasan verbal"]):
        return 13
    return 0


# ======================
# CORE ANALYSIS — rating mode
# ======================
def analyze_text(judul, isi):
    """Score *isi* (body text) against every category and derive an age rating.

    Returns (age_rating, comma-separated sorted category names).
    *judul* (title) is only used for logging.
    """
    kalimat = split_kalimat(isi)
    windows = sliding_window(kalimat)
    detected = set()

    log("\n" + "=" * 80)
    log(f"JUDUL: {judul}")

    for w in windows:
        log("\n[WINDOW]")
        log(w)

        for kategori, desc in SEMANTIC_EVENT.items():
            # Each category is scored against all of its hypothesis sentences;
            # the best-matching hypothesis decides.
            res = classifier(
                w,
                desc,
                hypothesis_template="Teks ini menggambarkan {}."
            )
            score = max(res["scores"])
            threshold = THRESHOLD[kategori]

            log(f"  - {kategori} | score={score:.3f} | threshold={threshold}")

            if score < threshold:
                log("    ❌ FAIL: threshold")
                continue

            log("    ✅ ACCEPTED")
            detected.add(kategori)

    usia = rating_usia(detected)

    log(f"\nFINAL KATEGORI: {list(detected)}")
    log(f"RATING USIA: {usia}")
    log("=" * 80)

    return usia, ", ".join(sorted(detected))


# ======================
# ABUSIVE FILTER — abusive mode
# ======================
def filter_abusive(isi):
    """Classify every sentence of *isi*; append a warning note per abusive one.

    The text is returned paragraph by paragraph, with the warning notes for a
    paragraph appended right after it, all joined by blank lines.
    """
    log("\n" + "=" * 80)
    log("[ABUSIVE] START")
    log(f"[ABUSIVE] INPUT:\n{isi}")

    paragraphs = re.split(r'\n+', isi)
    output = []

    for p_idx, para in enumerate(paragraphs):
        para = para.strip()
        if not para:
            continue

        log(f"\n[ABUSIVE] PARAGRAPH {p_idx}: {para}")
        sentences = _SENTENCE_SPLIT.split(para)
        notes = []

        for s_idx, s in enumerate(sentences):
            log(f"[ABUSIVE] SENTENCE {s_idx}: {s}")

            inputs = abusive_tokenizer(
                s,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=128
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                out = abusive_model(**inputs)
                probs = F.softmax(out.logits, dim=-1)
                pred = torch.argmax(probs, dim=-1).item()

            label = id2label[pred]
            log(f"[ABUSIVE] PRED={label}")

            if label == "Abusif":
                note = f'Kalimat "{s}" mengandung kalimat abusif, tidak baik diucapkan'
                notes.append(note)
                log(f"[ABUSIVE] ⚠️ {note}")

        output.append(para)
        output.extend(notes)

    result = "\n\n".join(output)

    log("\n[ABUSIVE] OUTPUT:")
    log(result)
    log("=" * 80)

    return result


# ======================
# ROUTER — /analyze (mode-based dispatch)
# ======================
def analyze_router(judul, isi, mode):
    """Dispatch to rating or abusive analysis; returns a JSON-serializable dict."""
    log(f"\n[ROUTER] MODE = {mode}")

    if mode == "rating":
        usia, kategori = analyze_text(judul, isi)
        return {
            "rating_umur": usia,
            "kategori": kategori
        }

    if mode == "abusive":
        filtered = filter_abusive(isi)
        return {
            "filtered_text": filtered
        }

    return {
        "error": "Invalid mode"
    }


# ======================
# GRADIO API — route stays /analyze
# ======================
demo = gr.Interface(
    fn=analyze_router,
    inputs=[
        gr.Textbox(),
        gr.Textbox(),
        gr.Radio(["rating", "abusive"])
    ],
    outputs=gr.JSON(),
    api_name="analyze"
)

# ======================
# ENTRY POINT
# ======================
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,
        show_error=True
    )