File size: 7,005 Bytes
b658d34
 
9b47349
 
 
1453698
 
edb4f04
7739de6
edb4f04
1453698
9b47349
edb4f04
 
100b7f8
 
edb4f04
7c5f4f3
eb3e6ed
7c5f4f3
b658d34
 
 
7c5f4f3
49c2874
b658d34
 
7c5f4f3
7739de6
9b47349
 
 
 
 
 
 
 
 
 
 
 
 
eb3e6ed
7c5f4f3
b658d34
 
369c39e
7c5f4f3
369c39e
 
 
 
 
b658d34
7c5f4f3
369c39e
 
 
 
b658d34
369c39e
 
 
 
 
 
 
 
 
 
 
 
 
0f01ac2
369c39e
 
 
 
 
 
b658d34
 
7c5f4f3
5d110a7
369c39e
d8eac9e
 
0f01ac2
369c39e
7c5f4f3
 
 
7739de6
7c5f4f3
b658d34
 
7c5f4f3
 
1453698
b658d34
 
eb3e6ed
 
 
 
 
 
 
 
b658d34
 
 
 
 
7c5f4f3
b658d34
 
 
7c5f4f3
6c83ef5
7c5f4f3
e5c4375
b658d34
eb3e6ed
 
b658d34
 
eb3e6ed
 
 
 
 
 
 
100b7f8
 
 
 
 
 
 
eb3e6ed
 
 
 
 
 
 
 
 
 
116837f
9b47349
eb3e6ed
 
 
 
9b47349
edb4f04
9b47349
03c686d
9b47349
7739de6
6c83ef5
 
 
 
9b47349
 
 
6c83ef5
9b47349
 
 
 
6c83ef5
9b47349
7739de6
9b47349
6c83ef5
 
 
9b47349
 
 
 
 
 
 
 
edb4f04
9b47349
 
 
 
edb4f04
6c83ef5
 
 
 
 
 
 
7c5f4f3
e5c4375
03c686d
b658d34
03c686d
6c83ef5
 
 
 
 
edb4f04
9b47349
6c83ef5
9b47349
eb3e6ed
6c83ef5
 
9b47349
7739de6
 
 
 
 
9b47349
 
03c686d
7739de6
03c686d
7739de6
9b47349
7739de6
 
 
b658d34
7c5f4f3
6c83ef5
7c5f4f3
b658d34
eb3e6ed
7739de6
 
 
 
b658d34
9b47349
eb3e6ed
b658d34
 
7739de6
 
 
b658d34
eb3e6ed
a169def
edb4f04
1453698
 
6c83ef5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
import re
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
import os

# ======================
# ENV
# ======================
# Opt out of Gradio's usage telemetry before the app is constructed.
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
# Global switch for verbose stdout logging via log() below.
DEBUG = True

def log(msg):
    """Print *msg* to stdout (flushed immediately) when DEBUG is enabled."""
    if not DEBUG:
        return
    print(msg, flush=True)

# ======================
# MODEL 1 — RATING & CATEGORY (original)
# ======================
# Multilingual zero-shot classifier (XLM-RoBERTa large fine-tuned on XNLI),
# used by analyze_text() to score text windows against the SEMANTIC_EVENT
# hypothesis sentences. use_fast=False forces the slow (SentencePiece-based)
# tokenizer.
classifier = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",
    framework="pt",
    use_fast=False
)

# ======================
# MODEL 2 — ABUSIVE (IndoBERT)
# ======================
# Local fine-tuned IndoBERT checkpoint for binary abusive-language
# classification; the directory must contain both tokenizer and model files.
ABUSIVE_MODEL_PATH = "./indoBERT_abusive"

abusive_tokenizer = AutoTokenizer.from_pretrained(ABUSIVE_MODEL_PATH)
abusive_model = AutoModelForSequenceClassification.from_pretrained(ABUSIVE_MODEL_PATH)
abusive_model.eval()  # inference only: disable dropout / training-mode layers

# Run on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
abusive_model.to(device)

# Output-index -> human-readable label
# (0 = "Tidak Abusif" / not abusive, 1 = "Abusif" / abusive).
id2label = {0: "Tidak Abusif", 1: "Abusif"}

# ======================
# CONFIG (original)
# ======================
# Hypothesis sentences (in Indonesian) per content category. Each window of
# text is scored by the zero-shot classifier against a category's sentences
# using the template "Teks ini menggambarkan {}."; the best score is compared
# to that category's THRESHOLD below.
SEMANTIC_EVENT = {
    "sadisme": [
        "seseorang membunuh manusia lain dengan cara kejam",
        "orang tua membunuh anaknya sendiri",
        "tubuh manusia dipotong dan dimasak",
        "manusia dikorbankan untuk kepentingan lain",
        "kekerasan dengan menghancurkan atau menghilangkan anggota tubuh",
        "pembunuhan dengan cara yang tidak lazim seperti mutilasi atau penyiksaan",
        "kekerasan yang mengakibatkan korban cacat"
    ],
    "kekerasan fisik": [
        "seseorang melukai tubuh manusia lain",
        "manusia diserang secara fisik",
        "tindakan kekerasan yang menyebabkan kematian",
        "perundungan dengan menggunakan aktivitas fisik"
    ],
    "kekerasan verbal": [
        "teriakan marah dan ancaman keras",
        "perkataan kasar yang menakut-nakuti",
        "kalimat jorok yang mengarah ke seksualitas",
        "penghinaan terhadap fisik"
    ],
    "seksual": [
        "hubungan intim antara pria dan wanita",
        "perkataan dan kalimat vulgar terhadap bagian genital pria dan wanita",
        "kalimat yang mengarah ke aktivitas seksual"
    ],
    "perjudian": [
        "bertaruh harta atau barang untuk mendapatkan keuntungan",
        "bertaruh nasib dengan permainan atau pertarungan"
    ],
    "narkoba": [
        "mengonsumsi obat terlarang",
        "menjual dan mendistribusi obat terlarang",
        "kalimat yang mengandung jenis-jenis obat terlarang"
    ]
}

# Per-category minimum zero-shot score required before a category is
# accepted for a window (see analyze_text).
THRESHOLD = {
    "sadisme": 0.65,
    "kekerasan fisik": 0.80,
    "kekerasan verbal": 0.80,
    "seksual": 0.80,
    "perjudian": 0.85,
    "narkoba": 0.80
}

# ======================
# UTILITIES (original)
# ======================
def split_kalimat(text):
    """Split *text* into sentences on ., !, ? boundaries.

    Only stripped sentences of at least 20 characters are kept, which
    drops very short fragments that carry little classification signal.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)
    kept = []
    for raw in sentences:
        sentence = raw.strip()
        if len(sentence) >= 20:
            kept.append(sentence)
    return kept

def sliding_window(kalimat, window=4):
    """Return overlapping windows of *window* consecutive sentences.

    Each window is the space-joined concatenation of *window* adjacent
    sentences. NOTE(review): when fewer than *window* sentences are given
    the list is returned as-is (sentences NOT joined) — behavior kept
    intentionally identical to the original.
    """
    total = len(kalimat)
    if total < window:
        return kalimat
    windows = []
    for start in range(total - window + 1):
        windows.append(" ".join(kalimat[start:start + window]))
    return windows

def rating_usia(kategori):
    """Map detected category names to a minimum age rating.

    Precedence: "sadisme" -> 21; sexual/gambling/drug content -> 17;
    physical/verbal violence -> 13; otherwise 0 (all ages).
    """
    if "sadisme" in kategori:
        return 21
    for nama in ("seksual", "perjudian", "narkoba"):
        if nama in kategori:
            return 17
    for nama in ("kekerasan fisik", "kekerasan verbal"):
        if nama in kategori:
            return 13
    return 0

# ======================
# CORE ANALYSIS + DEBUG (original — do not change)
# ======================
def analyze_text(judul, isi):
    """Detect content categories in *isi* and derive an age rating.

    The text is split into sentences, grouped into sliding windows, and
    every window is scored by the zero-shot classifier against each
    SEMANTIC_EVENT category. A category counts as detected when its best
    hypothesis score reaches that category's THRESHOLD.

    Returns a tuple ``(usia, kategori)``: the minimum age (0/13/17/21)
    and a comma-separated, sorted list of detected category names.
    """
    windows = sliding_window(split_kalimat(isi))
    detected = set()

    log("\n" + "=" * 80)
    log(f"JUDUL: {judul}")

    for w in windows:
        log("\n[WINDOW]")
        log(w)

        for kategori, hypotheses in SEMANTIC_EVENT.items():
            hasil = classifier(
                w,
                hypotheses,
                hypothesis_template="Teks ini menggambarkan {}."
            )
            score = max(hasil["scores"])
            batas = THRESHOLD[kategori]

            log(f"  - {kategori} | score={score:.3f} | threshold={batas}")

            if score >= batas:
                log("    βœ… ACCEPTED")
                detected.add(kategori)
            else:
                log("    ❌ FAIL: threshold")

    usia = rating_usia(detected)
    log(f"\nFINAL KATEGORI: {list(detected)}")
    log(f"RATING USIA: {usia}")
    log("=" * 80)

    return usia, ", ".join(sorted(detected))

# ======================
# ABUSIVE FILTER (separate, with full logging)
# ======================
def filter_abusive(isi):
    """Annotate *isi* with warnings for sentences classified as abusive.

    Each paragraph is split into sentences and every sentence is run
    through the IndoBERT abusive classifier. The paragraph is kept
    verbatim; for each abusive sentence a warning note is appended after
    its paragraph. Returns the paragraphs and notes joined by blank lines.
    """
    log("\n" + "=" * 80)
    log("[ABUSIVE] START")
    log(f"[ABUSIVE] INPUT:\n{isi}")

    def _predict(sentence):
        # Tokenize one sentence and return the argmax class index (0 or 1).
        enc = abusive_tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        )
        enc = {name: tensor.to(device) for name, tensor in enc.items()}
        with torch.no_grad():
            logits = abusive_model(**enc).logits
            probs = F.softmax(logits, dim=-1)
            return torch.argmax(probs, dim=-1).item()

    output = []
    for p_idx, para in enumerate(re.split(r'\n+', isi)):
        para = para.strip()
        if not para:
            continue

        log(f"\n[ABUSIVE] PARAGRAPH {p_idx}: {para}")
        notes = []

        for s_idx, s in enumerate(re.split(r'(?<=[.!?])\s+', para)):
            log(f"[ABUSIVE]   SENTENCE {s_idx}: {s}")
            label = id2label[_predict(s)]
            log(f"[ABUSIVE]     PRED={label}")

            if label == "Abusif":
                note = f'Kalimat "{s}" mengandung kalimat abusif, tidak baik diucapkan'
                notes.append(note)
                log(f"[ABUSIVE]     ⚠️ {note}")

        output.append(para)
        output.extend(notes)

    result = "\n\n".join(output)
    log("\n[ABUSIVE] OUTPUT:")
    log(result)
    log("=" * 80)

    return result

# ======================
# /ANALYZE ROUTER (mode based — unchanged)
# ======================
def analyze_router(judul, isi, mode):
    """Route an /analyze request to the right pipeline based on *mode*.

    mode == "rating"  -> {"rating_umur": int, "kategori": str}
    mode == "abusive" -> {"filtered_text": str}
    anything else     -> {"error": "Invalid mode"}
    """
    log(f"\n[ROUTER] MODE = {mode}")

    if mode == "rating":
        usia, kategori = analyze_text(judul, isi)
        return {"rating_umur": usia, "kategori": kategori}

    if mode == "abusive":
        return {"filtered_text": filter_abusive(isi)}

    return {"error": "Invalid mode"}

# ======================
# GRADIO API — route stays /analyze
# ======================
# Single Gradio interface exposing analyze_router as the /analyze API
# endpoint: inputs are (judul, isi, mode) and the output is a JSON dict.
demo = gr.Interface(
    fn=analyze_router,
    inputs=[
        gr.Textbox(),
        gr.Textbox(),
        gr.Radio(["rating", "abusive"])
    ],
    outputs=gr.JSON(),
    api_name="analyze"
)

# ======================
# ENTRY POINT
# ======================
if __name__ == "__main__":
    # Bind to all interfaces on port 7860 (conventional Gradio port, e.g.
    # for Hugging Face Spaces). ssr_mode=False disables server-side
    # rendering; show_error=True surfaces exceptions in the API response.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,
        show_error=True
    )