Spaces:
Sleeping
Sleeping
| import re | |
| import gradio as gr | |
| from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification | |
| import torch | |
| import torch.nn.functional as F | |
| import os | |
# ======================
# ENVIRONMENT
# ======================
# Set Gradio's analytics environment flag to "False" for this process.
os.environ.update({"GRADIO_ANALYTICS_ENABLED": "False"})

# Master switch for the diagnostic output produced by log().
DEBUG = True


def log(msg):
    """Print ``msg`` to stdout and flush immediately, but only while DEBUG is truthy."""
    if not DEBUG:
        return
    print(msg, flush=True)
# ======================
# MODEL 1 - RATING & CATEGORY (ORIGINAL)
# ======================
# Zero-shot classifier; analyze_text() uses it to score text windows against
# the descriptions stored in SEMANTIC_EVENT.
# NOTE(review): no device is passed here, whereas model 2 is moved to CUDA
# explicitly when available - confirm that leaving this pipeline on the
# library's default device is intended.
_ZERO_SHOT_OPTIONS = {
    "model": "joeddav/xlm-roberta-large-xnli",
    "framework": "pt",
    "use_fast": False,
}
classifier = pipeline("zero-shot-classification", **_ZERO_SHOT_OPTIONS)
# ======================
# MODEL 2 - ABUSIVE LANGUAGE (INDOBERT)
# ======================
# Local directory the tokenizer and the classification model are loaded from.
ABUSIVE_MODEL_PATH = "./indoBERT_abusive"

# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

abusive_tokenizer = AutoTokenizer.from_pretrained(ABUSIVE_MODEL_PATH)
abusive_model = AutoModelForSequenceClassification.from_pretrained(
    ABUSIVE_MODEL_PATH
)
abusive_model.to(device)
abusive_model.eval()  # inference only: the model is never trained in this file

# Maps the predicted class index to the label text used by filter_abusive().
# NOTE(review): assumes index 1 is the abusive class of this checkpoint -
# confirm against the checkpoint's own config.
id2label = {
    0: "Tidak Abusif",
    1: "Abusif",
}
# ======================
# CONFIG (ORIGINAL)
# ======================
# For every content category: the natural-language descriptions that
# analyze_text() hands to the zero-shot classifier as candidate labels.
# The keys must match the keys of THRESHOLD.
SEMANTIC_EVENT = {
    # Sadism
    "sadisme": [
        "seseorang membunuh manusia lain dengan cara kejam",
        "orang tua membunuh anaknya sendiri",
        "tubuh manusia dipotong dan dimasak",
        "manusia dikorbankan untuk kepentingan lain",
        "kekerasan dengan menghancurkan atau menghilangkan anggota tubuh",
        "pembunuhan dengan cara yang tidak lazim seperti mutilasi atau penyiksaan",
        "kekerasan yang mengakibatkan korban cacat",
    ],
    # Physical violence
    "kekerasan fisik": [
        "seseorang melukai tubuh manusia lain",
        "manusia diserang secara fisik",
        "tindakan kekerasan yang menyebabkan kematian",
        "perundungan dengan menggunakan aktivitas fisik",
    ],
    # Verbal violence
    "kekerasan verbal": [
        "teriakan marah dan ancaman keras",
        "perkataan kasar yang menakut-nakuti",
        "kalimat jorok yang mengarah ke seksualitas",
        "penghinaan terhadap fisik",
    ],
    # Sexual content
    "seksual": [
        "hubungan intim antara pria dan wanita",
        "perkataan dan kalimat vulgar terhadap bagian genital pria dan wanita",
        "kalimat yang mengarah ke aktivitas seksual",
    ],
    # Gambling
    "perjudian": [
        "bertaruh harta atau barang untuk mendapatkan keuntungan",
        "bertaruh nasib dengan permainan atau pertarungan",
    ],
    # Drugs
    "narkoba": [
        "mengonsumsi obat terlarang",
        "menjual dan mendistribusi obat terlarang",
        "kalimat yang mengandung jenis-jenis obat terlarang",
    ],
}
# Minimum classifier score a category has to reach in analyze_text() before
# it is reported. Keyed by the same category names as SEMANTIC_EVENT.
THRESHOLD = {
    "sadisme": 0.65,
    "kekerasan fisik": 0.8,
    "kekerasan verbal": 0.8,
    "seksual": 0.8,
    "perjudian": 0.85,
    "narkoba": 0.8,
}
# ======================
# UTILITIES (ORIGINAL)
# ======================
def split_kalimat(text):
    """Split ``text`` into sentences and keep only the substantial ones.

    A sentence boundary is any run of whitespace that directly follows
    ``.``, ``!`` or ``?``. Every piece is stripped, and pieces shorter than
    20 characters after stripping are discarded.

    Args:
        text: The string to split.

    Returns:
        A list with the remaining sentences, in their original order.
    """
    pieces = (raw.strip() for raw in re.split(r'(?<=[.!?])\s+', text))
    return [piece for piece in pieces if len(piece) >= 20]
def sliding_window(kalimat, window=4):
    """Group consecutive sentences into overlapping, space-joined windows.

    Args:
        kalimat: List of sentence strings.
        window: Number of sentences per window.

    Returns:
        One string per window position, each made of ``window`` consecutive
        sentences joined with a single space and advancing one sentence at
        a time. When there are fewer than ``window`` sentences, the input
        list itself is returned, so every sentence acts as its own window.
    """
    last_start = len(kalimat) - window
    if last_start < 0:
        # Not enough sentences for even one full window.
        return kalimat

    joined = []
    for start in range(last_start + 1):
        joined.append(" ".join(kalimat[start:start + window]))
    return joined
def rating_usia(kategori):
    """Return the age rating implied by the detected categories.

    Args:
        kategori: Collection of category names (membership is tested
            with ``in``).

    Returns:
        21 if "sadisme" is present; otherwise 17 if any of "seksual",
        "perjudian" or "narkoba" is present; otherwise 13 if
        "kekerasan fisik" or "kekerasan verbal" is present; otherwise 0.
    """
    # Ordered from the strictest rating down; the first matching tier wins.
    tiers = (
        (21, ("sadisme",)),
        (17, ("seksual", "perjudian", "narkoba")),
        (13, ("kekerasan fisik", "kekerasan verbal")),
    )
    for usia, pemicu in tiers:
        if any(nama in kategori for nama in pemicu):
            return usia
    return 0
# ======================
# CORE ANALYSIS + DEBUG (ORIGINAL - DO NOT CHANGE)
# ======================
def analyze_text(judul, isi):
    """Detect content categories in ``isi`` and derive an age rating.

    The text is split into sentences, grouped into sliding windows, and
    every window is scored by the zero-shot classifier against each
    category's descriptions in SEMANTIC_EVENT. A category counts as
    detected as soon as the best description score of any window is not
    below that category's THRESHOLD.

    Args:
        judul: Title of the text; only written to the debug log.
        isi: Body text to analyse.

    Returns:
        A tuple ``(usia, kategori)``: the rating from rating_usia() and the
        detected category names, sorted and joined with ", " (an empty
        string when nothing was detected).
    """
    segments = sliding_window(split_kalimat(isi))
    found = set()
    rule = "=" * 80

    log("\n" + rule)
    log(f"JUDUL: {judul}")

    for segment in segments:
        log("\n[WINDOW]")
        log(segment)

        for kategori, hypotheses in SEMANTIC_EVENT.items():
            # NOTE(review): the classifier is called without multi_label, so
            # the scores are presumably normalised across the descriptions of
            # this one category rather than being independent probabilities -
            # confirm the thresholds were tuned with that in mind.
            outcome = classifier(
                segment,
                hypotheses,
                hypothesis_template="Teks ini menggambarkan {}.",
            )
            best = max(outcome["scores"])
            minimum = THRESHOLD[kategori]
            log(f" - {kategori} | score={best:.3f} | threshold={minimum}")

            if best < minimum:
                log(" β FAIL: threshold")
            else:
                log(" β ACCEPTED")
                found.add(kategori)

    usia = rating_usia(found)
    log(f"\nFINAL KATEGORI: {list(found)}")
    log(f"RATING USIA: {usia}")
    log(rule)

    return usia, ", ".join(sorted(found))
# ======================
# ABUSIVE FILTER (SEPARATE + FULL LOGGING)
# ======================
def _predict_abusive_label(kalimat):
    """Run the abusive-language model on one sentence and return its label text."""
    encoded = abusive_tokenizer(
        kalimat,
        return_tensors="pt",
        truncation=True,  # together with max_length: at most 128 tokens are scored
        padding=True,
        max_length=128,
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = abusive_model(**encoded).logits

    probabilities = F.softmax(logits, dim=-1)
    return id2label[torch.argmax(probabilities, dim=-1).item()]


def filter_abusive(isi):
    """Annotate ``isi`` with a warning for every sentence predicted abusive.

    The text is split into paragraphs on newlines; blank paragraphs are
    dropped. Each remaining paragraph is split into sentences (whitespace
    after ``.``, ``!`` or ``?``) and every sentence is classified on its own.

    Args:
        isi: Text to check.

    Returns:
        The stripped paragraphs, each followed by one warning note per
        sentence labelled "Abusif", all joined by a blank line.
    """
    rule = "=" * 80
    log("\n" + rule)
    log("[ABUSIVE] START")
    log(f"[ABUSIVE] INPUT:\n{isi}")

    blocks = []
    # Indices count every split result, including the blank ones skipped below.
    for nomor_paragraf, mentah in enumerate(re.split(r'\n+', isi)):
        paragraf = mentah.strip()
        if paragraf == "":
            continue

        log(f"\n[ABUSIVE] PARAGRAPH {nomor_paragraf}: {paragraf}")

        warnings = []
        for nomor_kalimat, s in enumerate(re.split(r'(?<=[.!?])\s+', paragraf)):
            log(f"[ABUSIVE] SENTENCE {nomor_kalimat}: {s}")

            label = _predict_abusive_label(s)
            log(f"[ABUSIVE] PRED={label}")

            if label != "Abusif":
                continue

            note = f'Kalimat "{s}" mengandung kalimat abusif, tidak baik diucapkan'
            warnings.append(note)
            log(f"[ABUSIVE] β οΈ {note}")

        # The paragraph comes first, immediately followed by its warnings.
        blocks += [paragraf, *warnings]

    result = "\n\n".join(blocks)

    log("\n[ABUSIVE] OUTPUT:")
    log(result)
    log(rule)

    return result
# ======================
# ROUTER /analyze (MODE BASED - UNCHANGED)
# ======================
def analyze_router(judul, isi, mode):
    """Dispatch one request to the analysis selected by ``mode``.

    Args:
        judul: Title of the text; passed on to analyze_text() in "rating"
            mode and ignored otherwise.
        isi: Body text to analyse. ``None`` is treated as an empty string.
        mode: ``"rating"`` for the age-rating analysis or ``"abusive"`` for
            the abusive-sentence filter.

    Returns:
        A dict. For "rating": ``{"rating_umur": ..., "kategori": ...}``.
        For "abusive": ``{"filtered_text": ...}``. For any other mode
        (including ``None``): ``{"error": "Invalid mode"}``.
    """
    log(f"\n[ROUTER] MODE = {mode}")

    # Both analyses hand the text to re.split(), which raises TypeError on
    # None. A caller may send no text at all (e.g. a null field in an API
    # request), so treat that exactly like an empty string.
    if isi is None:
        isi = ""

    if mode == "rating":
        usia, kategori = analyze_text(judul, isi)
        return {
            "rating_umur": usia,
            "kategori": kategori,
        }

    if mode == "abusive":
        return {
            "filtered_text": filter_abusive(isi),
        }

    return {
        "error": "Invalid mode",
    }
# ======================
# GRADIO API - ROUTE STAYS /analyze
# ======================
# Input components in the positional order analyze_router() takes them:
# judul, isi, mode.
_router_inputs = [
    gr.Textbox(),
    gr.Textbox(),
    gr.Radio(["rating", "abusive"]),
]

# The router's result dict is shown as JSON; the endpoint is named "analyze".
demo = gr.Interface(
    fn=analyze_router,
    inputs=_router_inputs,
    outputs=gr.JSON(),
    api_name="analyze",
)
# ======================
# ENTRY POINT
# ======================
if __name__ == "__main__":
    # Serve on all network interfaces at port 7860, with Gradio's ssr_mode
    # turned off and errors reported to the client.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "ssr_mode": False,
        "show_error": True,
    }
    demo.launch(**launch_options)