import re
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
import os
# ======================
# ENV
# ======================
# Opt out of Gradio usage analytics before any Gradio object is created.
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
# Global switch for verbose console logging via log().
DEBUG = True
def log(msg):
    """Print *msg* unbuffered when the module-level DEBUG flag is on."""
    if not DEBUG:
        return
    print(msg, flush=True)
# ======================
# MODEL 1 - RATING & CATEGORY (original)
# ======================
# Zero-shot multilingual NLI classifier used by analyze_text() to score
# text windows against the SEMANTIC_EVENT hypothesis sentences.
classifier = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",
    framework="pt",   # force the PyTorch backend
    use_fast=False    # use the non-fast (SentencePiece-based) tokenizer
)
# ======================
# MODEL 2 - ABUSIVE (INDOBERT)
# ======================
# Fine-tuned IndoBERT binary sequence classifier, loaded from a local
# directory bundled with the Space.
ABUSIVE_MODEL_PATH = "./indoBERT_abusive"
abusive_tokenizer = AutoTokenizer.from_pretrained(ABUSIVE_MODEL_PATH)
abusive_model = AutoModelForSequenceClassification.from_pretrained(ABUSIVE_MODEL_PATH)
abusive_model.eval()  # inference mode: disables dropout etc.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
abusive_model.to(device)
# Mapping from the model's output class index to its human-readable label.
id2label = {0: "Tidak Abusif", 1: "Abusif"}
# ======================
# CONFIG (original)
# ======================
# Candidate hypothesis sentences (Indonesian) per content category.
# Each list is passed as the label set to the zero-shot classifier; the
# maximum entailment score across a category's sentences is compared
# against that category's THRESHOLD entry.
SEMANTIC_EVENT = {
    "sadisme": [
        "seseorang membunuh manusia lain dengan cara kejam",
        "orang tua membunuh anaknya sendiri",
        "tubuh manusia dipotong dan dimasak",
        "manusia dikorbankan untuk kepentingan lain",
        "kekerasan dengan menghancurkan atau menghilangkan anggota tubuh",
        "pembunuhan dengan cara yang tidak lazim seperti mutilasi atau penyiksaan",
        "kekerasan yang mengakibatkan korban cacat"
    ],
    "kekerasan fisik": [
        "seseorang melukai tubuh manusia lain",
        "manusia diserang secara fisik",
        "tindakan kekerasan yang menyebabkan kematian",
        "perundungan dengan menggunakan aktivitas fisik"
    ],
    "kekerasan verbal": [
        "teriakan marah dan ancaman keras",
        "perkataan kasar yang menakut-nakuti",
        "kalimat jorok yang mengarah ke seksualitas",
        "penghinaan terhadap fisik"
    ],
    "seksual": [
        "hubungan intim antara pria dan wanita",
        "perkataan dan kalimat vulgar terhadap bagian genital pria dan wanita",
        "kalimat yang mengarah ke aktivitas seksual"
    ],
    "perjudian": [
        "bertaruh harta atau barang untuk mendapatkan keuntungan",
        "bertaruh nasib dengan permainan atau pertarungan"
    ],
    "narkoba": [
        "mengonsumsi obat terlarang",
        "menjual dan mendistribusi obat terlarang",
        "kalimat yang mengandung jenis-jenis obat terlarang"
    ]
}
# Per-category minimum zero-shot score for a category to count as detected.
# "sadisme" is deliberately lower (more sensitive); "perjudian" is strictest.
THRESHOLD = {
    "sadisme": 0.65,
    "kekerasan fisik": 0.80,
    "kekerasan verbal": 0.80,
    "seksual": 0.80,
    "perjudian": 0.85,
    "narkoba": 0.80
}
# ======================
# UTIL (original)
# ======================
def split_kalimat(text):
    """Split *text* into sentences, keeping only those of 20+ characters.

    Sentence boundaries are whitespace runs that follow '.', '!' or '?'.
    Very short fragments (under 20 chars after stripping) are dropped to
    avoid classifying noise.
    """
    kalimat = []
    for fragment in re.split(r'(?<=[.!?])\s+', text):
        fragment = fragment.strip()
        if len(fragment) >= 20:
            kalimat.append(fragment)
    return kalimat
def sliding_window(kalimat, window=4):
    """Join consecutive sentences into overlapping windows of *window* items.

    NOTE(review): when fewer than *window* sentences are supplied, the
    sentence list is returned unchanged (individual sentences, not one
    joined window). Preserved as-is; confirm this asymmetry is intended.
    """
    total = len(kalimat)
    if total < window:
        return kalimat
    windows = []
    for start in range(total - window + 1):
        windows.append(" ".join(kalimat[start:start + window]))
    return windows
def rating_usia(kategori):
    """Map the detected categories to a minimum age rating.

    Tiers are checked from most to least restrictive; the first tier with
    any matching category wins. Returns 0 when nothing matched.
    """
    tiers = (
        (21, ["sadisme"]),
        (17, ["seksual", "perjudian", "narkoba"]),
        (13, ["kekerasan fisik", "kekerasan verbal"]),
    )
    for usia, labels in tiers:
        if any(label in kategori for label in labels):
            return usia
    return 0
# ======================
# CORE ANALYSIS + DEBUG (original - do not change)
# ======================
def analyze_text(judul, isi):
    """Rate the story *isi* (title *judul*) for age-restricted content.

    Splits the body into sentences, groups them into overlapping windows,
    and runs each window through the zero-shot classifier once per
    SEMANTIC_EVENT category. A category counts as detected when its best
    hypothesis score reaches that category's THRESHOLD.

    Returns (age_rating, comma-joined sorted category names).
    """
    kalimat = split_kalimat(isi)
    windows = sliding_window(kalimat)
    detected = set()
    log("\n" + "=" * 80)
    log(f"JUDUL: {judul}")
    for w in windows:
        log("\n[WINDOW]")
        log(w)
        for kategori, desc in SEMANTIC_EVENT.items():
            res = classifier(
                w,
                desc,
                hypothesis_template="Teks ini menggambarkan {}."
            )
            # Best entailment score across this category's hypotheses.
            score = max(res["scores"])
            threshold = THRESHOLD[kategori]
            log(f" - {kategori} | score={score:.3f} | threshold={threshold}")
            if score < threshold:
                log(" β FAIL: threshold")
                continue
            # NOTE(review): the status glyphs in these log strings appear
            # mojibake-damaged in the source; kept as found.
            log(" β ACCEPTED")
            detected.add(kategori)
    usia = rating_usia(detected)
    log(f"\nFINAL KATEGORI: {list(detected)}")
    log(f"RATING USIA: {usia}")
    log("=" * 80)
    return usia, ", ".join(sorted(detected))
# ======================
# FILTER ABUSIVE (separate pass + full logging)
# ======================
def filter_abusive(isi):
    """Scan *isi* paragraph by paragraph, flagging abusive sentences.

    Every non-empty paragraph is kept verbatim; for each sentence the
    IndoBERT classifier labels "Abusif", an explanatory note (Indonesian)
    is appended directly after that paragraph. Paragraphs and notes are
    re-joined with blank lines.
    """
    log("\n" + "=" * 80)
    log("[ABUSIVE] START")
    log(f"[ABUSIVE] INPUT:\n{isi}")
    annotated = []
    for p_idx, raw_paragraph in enumerate(re.split(r'\n+', isi)):
        para = raw_paragraph.strip()
        if not para:
            continue
        log(f"\n[ABUSIVE] PARAGRAPH {p_idx}: {para}")
        warnings = []
        for s_idx, s in enumerate(re.split(r'(?<=[.!?])\s+', para)):
            log(f"[ABUSIVE] SENTENCE {s_idx}: {s}")
            label = _predict_abusive_label(s)
            log(f"[ABUSIVE] PRED={label}")
            if label == "Abusif":
                note = f'Kalimat "{s}" mengandung kalimat abusif, tidak baik diucapkan'
                warnings.append(note)
                log(f"[ABUSIVE] β οΈ {note}")
        annotated.append(para)
        annotated.extend(warnings)
    result = "\n\n".join(annotated)
    log("\n[ABUSIVE] OUTPUT:")
    log(result)
    log("=" * 80)
    return result


def _predict_abusive_label(sentence):
    """Classify one sentence with the IndoBERT model; return its label."""
    encoded = abusive_tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    )
    encoded = {k: v.to(device) for k, v in encoded.items()}
    with torch.no_grad():
        logits = abusive_model(**encoded).logits
    probs = F.softmax(logits, dim=-1)
    return id2label[torch.argmax(probs, dim=-1).item()]
# ======================
# ROUTER /ANALYZE (mode based - unchanged route)
# ======================
def analyze_router(judul, isi, mode):
    """Dispatch an /analyze request to the handler selected by *mode*.

    mode == "rating"  -> age rating + categories from analyze_text()
    mode == "abusive" -> annotated text from filter_abusive()
    anything else     -> an error payload
    """
    log(f"\n[ROUTER] MODE = {mode}")
    if mode == "rating":
        usia, kategori = analyze_text(judul, isi)
        return {"rating_umur": usia, "kategori": kategori}
    if mode == "abusive":
        return {"filtered_text": filter_abusive(isi)}
    return {"error": "Invalid mode"}
# ======================
# GRADIO API - route stays /analyze
# ======================
# Single JSON endpoint: (judul, isi, mode) -> dict from analyze_router.
demo = gr.Interface(
    fn=analyze_router,
    inputs=[
        gr.Textbox(),  # judul (title)
        gr.Textbox(),  # isi (body text)
        gr.Radio(["rating", "abusive"])  # mode selector
    ],
    outputs=gr.JSON(),
    api_name="analyze"  # keeps the API route name stable for clients
)
# ======================
# ENTRY POINT
# ======================
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (container-friendly)
        server_port=7860,       # conventional Hugging Face Spaces port
        ssr_mode=False,
        show_error=True         # surface server-side errors to API callers
    )