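# app.py - Gradio Space exposing a single /analyze endpoint with two modes:
#   "rating"  -> zero-shot age rating + content categories (XLM-R XNLI)
#   "abusive" -> per-sentence abusive-language flagging (fine-tuned IndoBERT)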
import re
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
import os
# ======================
# ENV
# ======================
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
DEBUG = True
def log(msg):
    if DEBUG:
        print(msg, flush=True)
# ======================
# MODEL 1 - RATING & CATEGORY (ORIGINAL)
# ======================
classifier = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",
    framework="pt",
    use_fast=False
)
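# The zero-shot pipeline treats each candidate description as an NLI hypothesis
# and returns a dict with "sequence", "labels", and "scores", where the scores
# are sorted in descending order, so max(res["scores"]) below is the top match.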
# ======================
# MODEL 2 - ABUSIVE (IndoBERT)
# ======================
ABUSIVE_MODEL_PATH = "./indoBERT_abusive"
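# Loads a fine-tuned IndoBERT checkpoint bundled with the repo; this directory
# must contain both the tokenizer and the model files (config.json, weights).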
abusive_tokenizer = AutoTokenizer.from_pretrained(ABUSIVE_MODEL_PATH)
abusive_model = AutoModelForSequenceClassification.from_pretrained(ABUSIVE_MODEL_PATH)
abusive_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
abusive_model.to(device)
id2label = {0: "Tidak Abusif", 1: "Abusif"}
# ======================
# CONFIG (ORIGINAL)
# ======================
SEMANTIC_EVENT = {
    "sadisme": [
        "seseorang membunuh manusia lain dengan cara kejam",
        "orang tua membunuh anaknya sendiri",
        "tubuh manusia dipotong dan dimasak",
        "manusia dikorbankan untuk kepentingan lain",
        "kekerasan dengan menghancurkan atau menghilangkan anggota tubuh",
        "pembunuhan dengan cara yang tidak lazim seperti mutilasi atau penyiksaan",
        "kekerasan yang mengakibatkan korban cacat"
    ],
    "kekerasan fisik": [
        "seseorang melukai tubuh manusia lain",
        "manusia diserang secara fisik",
        "tindakan kekerasan yang menyebabkan kematian",
        "perundungan dengan menggunakan aktivitas fisik"
    ],
    "kekerasan verbal": [
        "teriakan marah dan ancaman keras",
        "perkataan kasar yang menakut-nakuti",
        "kalimat jorok yang mengarah ke seksualitas",
        "penghinaan terhadap fisik"
    ],
    "seksual": [
        "hubungan intim antara pria dan wanita",
        "perkataan dan kalimat vulgar terhadap bagian genital pria dan wanita",
        "kalimat yang mengarah ke aktivitas seksual"
    ],
    "perjudian": [
        "bertaruh harta atau barang untuk mendapatkan keuntungan",
        "bertaruh nasib dengan permainan atau pertarungan"
    ],
    "narkoba": [
        "mengonsumsi obat terlarang",
        "menjual dan mendistribusi obat terlarang",
        "kalimat yang mengandung jenis-jenis obat terlarang"
    ]
}
THRESHOLD = {
    "sadisme": 0.65,
    "kekerasan fisik": 0.80,
    "kekerasan verbal": 0.80,
    "seksual": 0.80,
    "perjudian": 0.85,
    "narkoba": 0.80
}
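# Per-category acceptance thresholds for the zero-shot scores. "sadisme" has
# the lowest bar (0.65), so the most severe category triggers most readily;
# e.g. a window scoring 0.70 would fire "sadisme" but not "perjudian" (0.85).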
# ======================
# UTILITIES (ORIGINAL)
# ======================
def split_kalimat(text):
    return [
        k.strip()
        for k in re.split(r'(?<=[.!?])\s+', text)
        if len(k.strip()) >= 20
    ]
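# Splits text into sentences on ./!/? boundaries and drops fragments shorter
# than 20 characters. Illustrative: split_kalimat("Kalimat pertama yang cukup
# panjang. Pendek.") -> ["Kalimat pertama yang cukup panjang."]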
def sliding_window(kalimat, window=4):
    if len(kalimat) < window:
        return kalimat
    return [
        " ".join(kalimat[i:i+window])
        for i in range(len(kalimat) - window + 1)
    ]
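# Groups consecutive sentences so the classifier sees multi-sentence context.
# Illustrative: sliding_window(["s1", "s2", "s3", "s4", "s5"], window=4)
# -> ["s1 s2 s3 s4", "s2 s3 s4 s5"]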
def rating_usia(kategori):
    if "sadisme" in kategori:
        return 21
    if any(k in kategori for k in ["seksual", "perjudian", "narkoba"]):
        return 17
    if any(k in kategori for k in ["kekerasan fisik", "kekerasan verbal"]):
        return 13
    return 0
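# Severity tiers: sadism -> 21+, sexual/gambling/drugs -> 17+, physical or
# verbal violence -> 13+, otherwise 0 (all ages). The harshest match wins
# because the checks run from the most to the least restrictive rating.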
# ======================
# CORE ANALYSIS + DEBUG (ORIGINAL - DO NOT MODIFY)
# ======================
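# Scans every sliding window against every category; a category is detected
# when its best-matching description (the top zero-shot score) clears that
# category's threshold for at least one window.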
def analyze_text(judul, isi):
    kalimat = split_kalimat(isi)
    windows = sliding_window(kalimat)
    detected = set()
    log("\n" + "=" * 80)
    log(f"JUDUL: {judul}")
    for w in windows:
        log("\n[WINDOW]")
        log(w)
        for kategori, desc in SEMANTIC_EVENT.items():
            res = classifier(
                w,
                desc,
                hypothesis_template="Teks ini menggambarkan {}."
            )
            score = max(res["scores"])
            threshold = THRESHOLD[kategori]
            log(f"  - {kategori} | score={score:.3f} | threshold={threshold}")
            if score < threshold:
                log("    ❌ FAIL: threshold")
                continue
            log("    ✅ ACCEPTED")
            detected.add(kategori)
    usia = rating_usia(detected)
    log(f"\nFINAL KATEGORI: {list(detected)}")
    log(f"RATING USIA: {usia}")
    log("=" * 80)
    return usia, ", ".join(sorted(detected))
# ======================
# ABUSIVE FILTER (SEPARATE + FULL LOGGING)
# ======================
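# Classifies each sentence of each paragraph with the IndoBERT model. Flagged
# sentences are kept in place; one warning note per flagged sentence is
# appended after its paragraph, and paragraphs are re-joined with blank lines.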
def filter_abusive(isi):
    log("\n" + "=" * 80)
    log("[ABUSIVE] START")
    log(f"[ABUSIVE] INPUT:\n{isi}")
    paragraphs = re.split(r'\n+', isi)
    output = []
    for p_idx, para in enumerate(paragraphs):
        para = para.strip()
        if not para:
            continue
        log(f"\n[ABUSIVE] PARAGRAPH {p_idx}: {para}")
        sentences = re.split(r'(?<=[.!?])\s+', para)
        notes = []
        for s_idx, s in enumerate(sentences):
            log(f"[ABUSIVE] SENTENCE {s_idx}: {s}")
            inputs = abusive_tokenizer(
                s,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=128
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                out = abusive_model(**inputs)
            probs = F.softmax(out.logits, dim=-1)
            pred = torch.argmax(probs, dim=-1).item()
            label = id2label[pred]
            log(f"[ABUSIVE] PRED={label}")
            if label == "Abusif":
                note = f'Kalimat "{s}" mengandung kalimat abusif, tidak baik diucapkan'
                notes.append(note)
                log(f"[ABUSIVE] ⚠️ {note}")
        output.append(para)
        output.extend(notes)
    result = "\n\n".join(output)
    log("\n[ABUSIVE] OUTPUT:")
    log(result)
    log("=" * 80)
    return result
# ======================
# /analyze ROUTER (MODE-BASED - UNCHANGED)
# ======================
def analyze_router(judul, isi, mode):
    log(f"\n[ROUTER] MODE = {mode}")
    if mode == "rating":
        usia, kategori = analyze_text(judul, isi)
        return {
            "rating_umur": usia,
            "kategori": kategori
        }
    if mode == "abusive":
        filtered = filter_abusive(isi)
        return {
            "filtered_text": filtered
        }
    return {
        "error": "Invalid mode"
    }
# ======================
# GRADIO API - ROUTE STAYS /analyze
# ======================
demo = gr.Interface(
    fn=analyze_router,
    inputs=[
        gr.Textbox(),
        gr.Textbox(),
        gr.Radio(["rating", "abusive"])
    ],
    outputs=gr.JSON(),
    api_name="analyze"
)
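# Hedged client sketch (assumes the gradio_client package; the Space id
# "user/space" is a hypothetical placeholder, not part of this repo):
#
#     from gradio_client import Client
#     client = Client("user/space")
#     out = client.predict("Judul", "Isi cerita...", "rating", api_name="/analyze")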
# ======================
# ENTRY POINT
# ======================
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,
        show_error=True
    )