Prompt-Builder / scripts /gen_drawio.py
ArielJoe's picture
feat: add flowchart diagrams (proses evaluasi, use case, PII) + simplify field-fit ML layer
dd30e82
Raw
History Blame Contribute Delete
25.2 kB
"""
Generator diagram activity draw.io (.drawio) untuk tiap detektor.
Gaya: judul 30px, titik mulai hitam, end-state bercincin, BELAH KETUPAT (rhombus)
untuk keputusan, kotak membulat untuk aksi. Setiap node diukur agar pas (fit)
dengan teks di dalamnya. Kata bahasa Inggris dibungkus <i>...</i> agar miring.
Output: docs/diagrams/<Nama>.drawio (XML mxGraph, divalidasi well-formed).
Jalankan: python scripts/gen_drawio.py
"""
from __future__ import annotations
import math
import re
import xml.etree.ElementTree as ET
from pathlib import Path
# Output directory for the generated files: "docs/diagrams" under the parent
# of the directory that contains this script.
OUT = Path(__file__).resolve().parent.parent / "docs" / "diagrams"
# draw.io cell style strings, one per kind of shape emitted by this script.
# START: solid black ellipse (initial node).
START = "ellipse;whiteSpace=wrap;html=1;fillColor=#000000;strokeColor=#000000;"
# END: ellipse drawn with the "endState" shape (final node).
END = "ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=#000000;"
# DEC: white rhombus (decision).
DEC = "rhombus;whiteSpace=wrap;html=1;fillColor=#ffffff;strokeColor=#000000;"
# ACT: white rounded rectangle (action).
ACT = "rounded=1;whiteSpace=wrap;html=1;arcSize=20;fillColor=#ffffff;strokeColor=#000000;"
# BAR: solid black rectangle (used by ner() for the fork/join bars).
BAR = "rounded=0;whiteSpace=wrap;html=1;fillColor=#000000;strokeColor=#000000;"
# TITLE: borderless, centred, bold 30px text.
TITLE = ("text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;"
         "whiteSpace=wrap;fontSize=30;fontStyle=1;")
# NOTE: borderless, left-aligned, italic 12px text.
NOTE = ("text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;"
        "whiteSpace=wrap;fontSize=12;fontStyle=2;")
# EDGE: orthogonal connector with a block arrowhead; labels are HTML (html=1).
EDGE = "edgeStyle=orthogonalEdgeStyle;rounded=0;html=1;endArrow=block;"
CX = 460  # x coordinate of the centre of the spine
RX = 720  # left edge of the right-hand branch boxes
LXR = 250  # right edge of the left-hand branch boxes
GAP = 42  # vertical gap added below each spine node
CW = 6.7  # average character width at fontSize 12
def strip_tags(text: str) -> str:
    """Return *text* without HTML tags, for measuring the displayed label length."""
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', text)
def esc(s: str) -> str:
    """Escape *s* so it can be embedded in a double-quoted XML attribute.

    ``&``, ``<`` and ``>`` become entities, so inline markup such as
    ``<i>...</i>`` is stored as ``&lt;i&gt;...`` in the file and reaches
    draw.io as HTML once the XML is decoded.  Double quotes are escaped as
    well, because every caller places the result inside ``value="..."`` or
    ``name="..."``; an unescaped quote would terminate the attribute early.
    Newlines are written as the character reference ``&#10;`` so they survive
    XML attribute-value normalisation.

    Args:
        s: Raw label text, possibly containing inline HTML tags.

    Returns:
        The escaped text, safe to place between double quotes in XML.
    """
    # "&" must be handled first, otherwise the entities produced by the later
    # replacements would be escaped a second time.
    s = s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
    s = s.replace('"', "&quot;")
    return s.replace("\n", "&#10;")
def fit_act(text, max_w=280, pad=14, lh=17):
    """Compute a (width, height) for an action box that fits *text*.

    Tags are stripped before measuring.  The width follows the longest
    newline-separated segment (estimated with ``CW`` per character), is at
    least 130 and at most *max_w*.  The height assumes each segment wraps at
    the number of characters that fit inside the padded width, and is at
    least 40.

    Args:
        text: Label text; may contain HTML tags and ``\\n`` line breaks.
        max_w: Upper bound for the box width.
        pad: Padding applied on each side, horizontally and vertically.
        lh: Height of one text line.

    Returns:
        Tuple ``(width, height)`` of ints.
    """
    seg_lengths = [len(seg) for seg in strip_tags(text).split("\n")]
    both_pads = 2 * pad
    natural_w = int(max(seg_lengths) * CW) + both_pads
    width = min(max(natural_w, 130), max_w)
    # Characters that fit on one line inside the padded box (never below 1).
    chars_per_line = max(1, int((width - both_pads) / CW))
    line_count = 0
    for n_chars in seg_lengths:
        line_count += max(1, math.ceil(n_chars / chars_per_line))
    height = max(40, line_count * lh + both_pads)
    return width, height
def fit_dec(text, pad=14, lh=17):
    """Compute a (width, height) for a decision rhombus that fits *text*.

    A rhombus needs extra room because the usable text area is only its
    centre.  The text width is clamped to 100..210, then 140 is added to the
    width and 96 to the wrapped text height.

    Args:
        text: Label text; may contain HTML tags and ``\\n`` line breaks.
        pad: Accepted for signature symmetry with ``fit_act``; not used here.
        lh: Height of one text line.

    Returns:
        Tuple ``(width, height)`` of ints.
    """
    seg_lengths = [len(seg) for seg in strip_tags(text).split("\n")]
    content_w = int(max(seg_lengths) * CW)
    content_w = max(content_w, 100)
    content_w = min(content_w, 210)
    chars_per_line = max(1, int(content_w / CW))
    line_count = sum(
        max(1, math.ceil(n_chars / chars_per_line)) for n_chars in seg_lengths
    )
    return int(content_w + 140), int(line_count * lh + 96)
def fit_note(text):
    """Return (width, height) for a one-line note: 6.6 per visible char + 8, height 22."""
    visible = strip_tags(text)
    width = int(len(visible) * 6.6) + 8
    return width, 22
class Dia:
    """Builder for one draw.io diagram page.

    Every vertex and edge is appended to ``cells`` as an ``<mxCell>`` XML
    fragment, in call order.  Vertex geometry is remembered in ``geo`` so
    branch nodes and loop edges can be aligned with nodes placed earlier.
    Spine nodes are centred on ``CX`` and stacked downwards using the running
    cursor ``y``.
    """

    def __init__(self, title: str, page_w=1000):
        """Create an empty diagram containing only the title cell.

        Args:
            title: Diagram title; may contain HTML tags for the title cell.
            page_w: Page width written to the ``mxGraphModel`` element.
        """
        # Plain-text name of the diagram (tags removed).
        self.title = strip_tags(title)
        self.page_w = page_w
        self.cells: list[str] = []
        # cell id -> (x, y, width, height)
        self.geo: dict[str, tuple[int, int, int, int]] = {}
        # Layout cursor: top y of the next spine node.
        self.y = 96
        # Edge ids handed out so far; used to keep cell ids unique.
        self._edge_ids: set[str] = set()
        # The title cell keeps the HTML version of the title.
        self.cells.append(
            f'<mxCell id="t" value="{esc(title)}" style="{TITLE}" vertex="1" parent="1">'
            f'<mxGeometry x="240" y="20" width="500" height="46" as="geometry"/></mxCell>')

    def _add(self, cid, style, label, x, y, w, h):
        """Record the geometry of vertex *cid*, append its cell and return *cid*."""
        self.geo[cid] = (x, y, w, h)
        self.cells.append(
            f'<mxCell id="{cid}" value="{esc(label)}" style="{style}" vertex="1" parent="1">'
            f'<mxGeometry x="{x}" y="{y}" width="{w}" height="{h}" as="geometry"/></mxCell>')
        return cid

    def cy(self, ref):
        """Return the vertical centre of the previously added vertex *ref*."""
        _, top, _, height = self.geo[ref]
        return top + height // 2

    # spine (centred on CX, automatic y)
    def start(self, cid="s"):
        """Add the 30x30 start dot on the spine and advance the cursor."""
        self._add(cid, START, "", CX - 15, self.y, 30, 30)
        self.y += 30 + GAP
        return cid

    def act(self, cid, label, max_w=280):
        """Add an action box on the spine, sized to *label*, and advance the cursor."""
        w, h = fit_act(label, max_w)
        self._add(cid, ACT, label, CX - w // 2, self.y, w, h)
        self.y += h + GAP
        return cid

    def dec(self, cid, label):
        """Add a decision rhombus on the spine, sized to *label*, and advance the cursor."""
        w, h = fit_dec(label)
        self._add(cid, DEC, label, CX - w // 2, self.y, w, h)
        self.y += h + GAP
        return cid

    def end(self, cid="e"):
        """Add the 30x30 end-state node on the spine and advance the cursor."""
        self._add(cid, END, "", CX - 15, self.y, 30, 30)
        self.y += 30 + GAP
        return cid

    # branches (vertically centred on an existing node; cursor is not moved)
    def right(self, cid, label, ref, max_w=250):
        """Add an action box whose left edge is at ``RX``, centred on *ref*."""
        w, h = fit_act(label, max_w)
        return self._add(cid, ACT, label, RX, self.cy(ref) - h // 2, w, h)

    def left(self, cid, label, ref, max_w=210):
        """Add an action box whose right edge is at ``LXR``, centred on *ref*."""
        w, h = fit_act(label, max_w)
        return self._add(cid, ACT, label, LXR - w, self.cy(ref) - h // 2, w, h)

    def side_end(self, cid, ref):
        """Add an end-state node at ``RX + 60``, centred on *ref*."""
        return self._add(cid, END, "", RX + 60, self.cy(ref) - 15, 30, 30)

    def note(self, cid, label):
        """Add a free-text note at x=30, just above the current cursor.

        Returns the cell id, like the other vertex builders.
        """
        w, h = fit_note(label)
        return self._add(cid, NOTE, label, 30, self.y - GAP + 4, w, h)

    def raw(self, cid, style, label, x, y, w, h):
        """Add a vertex with explicit style and geometry; the cursor is untouched."""
        return self._add(cid, style, label, x, y, w, h)

    # edges
    def _edge_id(self, src, tgt):
        """Return an unused cell id for an edge from *src* to *tgt*.

        The first edge between a pair gets ``e_{src}_{tgt}``; further edges
        between the same pair (or a clash with a vertex id) get a numeric
        suffix, so no two cells share an id.
        """
        base = f"e_{src}_{tgt}"
        eid, n = base, 1
        while eid in self._edge_ids or eid in self.geo:
            n += 1
            eid = f"{base}_{n}"
        self._edge_ids.add(eid)
        return eid

    def _edge(self, src, tgt, label, kw, pts=None):
        """Append one edge cell and return its id.

        *kw* holds extra ``key=value`` style entries appended to ``EDGE`` in
        the given order.  *pts* is ``None`` for a plain edge, or a sequence of
        ``(x, y)`` waypoints.
        """
        style = EDGE + "".join(f"{k}={v};" for k, v in kw.items())
        val = f' value="{esc(label)}"' if label else ""
        if pts is None:
            geometry = '<mxGeometry relative="1" as="geometry"/>'
        else:
            ptxml = "".join(f'<mxPoint x="{x}" y="{y}"/>' for x, y in pts)
            geometry = ('<mxGeometry relative="1" as="geometry">'
                        f'<Array as="points">{ptxml}</Array></mxGeometry>')
        eid = self._edge_id(src, tgt)
        self.cells.append(
            f'<mxCell id="{eid}"{val} style="{style}" edge="1" parent="1" '
            f'source="{src}" target="{tgt}">{geometry}</mxCell>')
        return eid

    def e(self, src, tgt, label="", **kw):
        """Connect *src* to *tgt*; extra keyword arguments become style entries."""
        return self._edge(src, tgt, label, kw)

    def e_pts(self, src, tgt, pts, label="", **kw):
        """Connect *src* to *tgt* through the explicit waypoints *pts*."""
        return self._edge(src, tgt, label, kw, pts)

    def loop_left(self, src, tgt, label="ya", lane=200):
        """Route an edge out of the left side of *src*, along x=*lane*, into the left of *tgt*."""
        return self.e_pts(src, tgt, [(lane, self.cy(src)), (lane, self.cy(tgt))], label,
                          exitX=0, exitY=0.5, entryX=0, entryY=0.5)

    def loop_right(self, src, tgt, label="ya", lane=960):
        """Route an edge out of the right side of *src*, along x=*lane*, into the right of *tgt*."""
        return self.e_pts(src, tgt, [(lane, self.cy(src)), (lane, self.cy(tgt))], label,
                          exitX=1, exitY=0.5, entryX=1, entryY=0.5)

    def xml(self) -> str:
        """Serialise the diagram as a complete ``<mxfile>`` document.

        The page height is the current cursor plus 40.
        """
        ph = self.y + 40
        body = "".join(self.cells)
        return (
            '<mxfile host="app.diagrams.net" type="device">'
            f'<diagram name="{esc(self.title)}" id="d">'
            f'<mxGraphModel dx="900" dy="700" grid="1" gridSize="10" guides="1" '
            f'tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" '
            f'pageWidth="{self.page_w}" pageHeight="{ph}" background="#ffffff" '
            f'math="0" shadow="0">'
            f'<root><mxCell id="0"/><mxCell id="1" parent="0"/>{body}</root>'
            '</mxGraphModel></diagram></mxfile>')
def empty_branch(d: Dia, dref):
    """Attach the shared "return an empty list" exit to decision *dref*.

    Adds an action box and an end-state node to the right of *dref*, then
    wires ``dref --ya--> action --> end``.
    """
    action_id = d.right("aEmpty", "Kembalikan daftar kosong", dref)
    terminal_id = d.side_end("eEmpty", dref)
    for src, tgt, label in ((dref, action_id, "ya"), (action_id, terminal_id, "")):
        d.e(src, tgt, label)
# ============================================================ PROSES EVALUASI
def proses_evaluasi():
    """Build the "Proses Evaluasi Prompt" flowchart.

    Returns:
        The populated ``Dia``; call ``.xml()`` on it to serialise.
    """
    dia = Dia("Proses Evaluasi Prompt", page_w=1120)

    # Spine nodes from top to bottom; each call moves the layout cursor down.
    dia.start()
    spine = (
        (dia.act, "a1", "[Antarmuka] Pengguna mengisi field prompt"),
        (dia.act, "a2", "[Antarmuka] Hitung badge field wajib kosong secara client-side"),
        (dia.dec, "d1", "Ada field yang berubah?"),
        (dia.act, "a3", "[Antarmuka] Kirim POST /api/evaluate untuk field berubah\n"
                        "(satu field per request)"),
        (dia.act, "a4", "[Orkestrator] Validasi dan parse JSON {fields}"),
        (dia.act, "a5", "[Orkestrator] Jalankan deteksi bahasa + 9 detektor\n"
                        "pada field terisi"),
        (dia.act, "a6", "[Orkestrator] Gabungkan temuan dan dedup level kata"),
        (dia.act, "a7", "[Antarmuka] Simpan temuan server ke cache field"),
        (dia.act, "a8", "[Antarmuka] Gabungkan cache issue + badge missing;\n"
                        "susun prompt akhir dari field lokal"),
        (dia.act, "a9", "[Antarmuka] Tampilkan sorotan, badge, banner catatan,\n"
                        "dan prompt akhir"),
        (dia.dec, "d2", "Saran perbaikan diterapkan?"),
        (dia.act, "a10", "[Antarmuka] Ubah isi field, kosongkan cache field,\n"
                         "jadwalkan evaluasi ulang"),
        (dia.dec, "d3", "Prompt akhir disalin?"),
    )
    for add_node, cid, label in spine:
        add_node(cid, label)
    dia.end()

    for src, tgt in (("s", "a1"), ("a1", "a2"), ("a2", "d1")):
        dia.e(src, tgt)

    # Side branch taken when no field changed; it rejoins the spine at a8.
    dia.right("bNoChange", "Render ulang dari cache yang masih valid", "d1")
    for src, tgt, label in (
        ("d1", "bNoChange", "tidak"),
        ("bNoChange", "a8", ""),
        ("d1", "a3", "ya"),
        ("a3", "a4", ""),
        ("a4", "a5", ""),
        ("a5", "a6", ""),
        ("a6", "a7", ""),
        ("a7", "a8", ""),
        ("a8", "a9", ""),
        ("a9", "d2", ""),
        ("d2", "a10", "ya"),
    ):
        dia.e(src, tgt, label)

    # Back-edges use explicit lanes on the left and right of the spine.
    dia.loop_left("a10", "d1", "", lane=150)
    dia.e("d2", "d3", "tidak")
    dia.loop_right("d3", "a9", "tidak", lane=1030)
    dia.e("d3", "e", "ya")
    return dia
# ============================================================ USE CASE
def use_case():
    """Build the use-case diagram "Use Case Sistem Prompt Builder".

    All shapes are placed with explicit coordinates via ``Dia.raw``: a system
    boundary, one actor, six use cases and nine detector use cases.

    Returns:
        The populated ``Dia``; call ``.xml()`` on it to serialise.
    """
    d = Dia("Use Case Sistem Prompt Builder", page_w=1220)
    boundary = ("rounded=0;whiteSpace=wrap;html=1;fillColor=none;"
                "strokeColor=#000000;fontStyle=1;verticalAlign=top;spacingTop=10;")
    actor = ("shape=umlActor;verticalLabelPosition=bottom;verticalAlign=top;"
             "html=1;outlineConnect=0;fillColor=#ffffff;strokeColor=#000000;")
    usecase = "ellipse;whiteSpace=wrap;html=1;fillColor=#ffffff;strokeColor=#000000;"
    # Edge labels are rendered as HTML (EDGE contains html=1).  A literal
    # "<<include>>" would reach draw.io as markup, where "<include>" is parsed
    # as an unknown tag and only "<>" stays visible.  Writing the angle
    # brackets as HTML entities makes the stereotype display as intended.
    include = "&lt;&lt;include&gt;&gt;"
    extend = "&lt;&lt;extend&gt;&gt;"
    d.raw("sys", boundary, "Aplikasi Prompt Builder", 250, 95, 900, 820)
    d.raw("actor", actor, "Mahasiswa", 70, 390, 90, 150)
    d.raw("uc1", usecase, "Mengisi field\nPrompt Builder", 310, 170, 230, 85)
    d.raw("uc2", usecase, "Mengevaluasi\nprompt", 310, 300, 230, 85)
    d.raw("uc3", usecase, "Meninjau hasil\nevaluasi", 310, 430, 230, 85)
    d.raw("uc4", usecase, "Menerapkan saran\nperbaikan", 310, 560, 230, 85)
    d.raw("uc5", usecase, "Menyalin\nprompt akhir", 310, 690, 230, 85)
    d.raw("uc6", usecase, "Menjalankan\nsembilan detektor", 620, 300, 250, 85)
    detector_names = [
        ("d1", "Deteksi PII"),
        ("d2", "Deteksi Word Quality"),
        ("d3", "Deteksi Konten Berisiko"),
        ("d4", "Deteksi NER"),
        ("d5", "Deteksi Profanity"),
        ("d6", "Deteksi Filler"),
        ("d7", "Deteksi Special Char"),
        ("d8", "Deteksi Syntax"),
        ("d9", "Deteksi Field-Fit"),
    ]
    # Detector use cases form one column, 82 apart, each included by uc6.
    y = 125
    for cid, label in detector_names:
        d.raw(cid, usecase, label, 910, y, 190, 58)
        d.e("uc6", cid, include, dashed=1)
        y += 82
    for cid in ("uc1", "uc2", "uc3", "uc4", "uc5"):
        d.e("actor", cid)
    d.e("uc2", "uc6", include, dashed=1)
    d.e("uc3", "uc2", extend, dashed=1)
    d.e("uc4", "uc2", extend, dashed=1)
    # raw() does not move the cursor; set it so xml() computes the page height.
    d.y = 940
    return d
# ============================================================ PII
def pii():
    """Build the activity diagram titled "Detektor PII (Data Pribadi)".

    Returns:
        The populated ``Dia``; call ``.xml()`` on it to serialise.
    """
    dia = Dia("Detektor <i>PII</i> (Data Pribadi)")
    dia.e(dia.start(), dia.dec("d1", "Teks kosong?"))
    empty_branch(dia, "d1")

    # Nodes in placement order; right() boxes align with an earlier decision.
    nodes = (
        (dia.act, "a1", "Cari kandidat lewat pola <i>regex</i> PII Indonesia\n"
                        "(NIK, NPWP, BPJS, SIM, plat nomor, email, HP,\n"
                        "rekening, kartu kredit, IP, alamat)"),
        (dia.act, "a2", "Ekstrak nilai dari <i>capturing group</i>\n"
                        "dan pangkas spasi/tanda baca tepi"),
        (dia.dec, "d2", "Punya validator struktural?"),
        (dia.dec, "d3", "Validasi lolos?"),
        (dia.act, "a3", "Naikkan <i>confidence score</i> (+0,15)"),
        (dia.right, "aDrop", "Buang kandidat", "d3"),
        (dia.dec, "d4", "<i>confidence</i> >= ambang minimum?"),
        (dia.act, "a4", "Catat <i>PIIEntity</i> (label, span, konteks)"),
        (dia.right, "aLow", "Abaikan kandidat", "d4"),
        (dia.dec, "d5", "Masih ada kandidat?"),
        (dia.act, "a5", "Selesaikan span tumpang-tindih\n"
                        "(confidence tertinggi menang)"),
        (dia.act, "a6", "Urutkan menurut posisi"),
        (dia.act, "a7", "Kembalikan daftar <i>PIIEntity</i>"),
    )
    for place, *args in nodes:
        place(*args)
    dia.end()

    wiring = (
        (dia.e, "d1", "a1", "tidak"),
        (dia.e, "a1", "a2"),
        (dia.e, "a2", "d2"),
        (dia.e, "d2", "d3", "ya"),
        (dia.e, "d3", "a3", "ya"),
        (dia.e, "d3", "aDrop", "tidak"),
        (dia.e, "aDrop", "d5"),
        (dia.e, "d2", "d4", "tidak"),
        (dia.e, "a3", "d4"),
        (dia.e, "d4", "a4", "ya"),
        (dia.e, "a4", "d5"),
        (dia.e, "d4", "aLow", "tidak"),
        (dia.e, "aLow", "d5"),
        (dia.loop_left, "d5", "a1", "ya"),
        (dia.e, "d5", "a5", "tidak"),
        (dia.e, "a5", "a6"),
        (dia.e, "a6", "a7"),
        (dia.e, "a7", "e"),
    )
    for connect, *args in wiring:
        connect(*args)
    return dia
# ============================================================ WORD QUALITY
def word_quality():
    """Build the activity diagram titled "Detektor Word Quality".

    Returns:
        The populated ``Dia``; call ``.xml()`` on it to serialise.
    """
    dia = Dia("Detektor <i>Word Quality</i>")
    dia.e(dia.start(), dia.dec("d1", "Teks kosong?"))
    empty_branch(dia, "d1")

    spine = (
        (dia.act, "a1", "Tokenisasi teks"),
        (dia.act, "a2", "Ambil <i>token</i> berikutnya"),
        (dia.dec, "d2", "<i>Token</i> pendek / non-<i>ASCII</i> / pola dilewati?"),
        (dia.act, "a3", "Kumpulkan konteks kata sebelum & sesudah"),
        (dia.dec, "d3", "<i>SLANG</i>? (kamus + pola <i>regex</i>)"),
        (dia.dec, "d4", "<i>ALAY</i>? (<i>l33tspeak</i>)"),
        (dia.dec, "d5", "<i>TYPO</i>? (<i>SymSpell</i> + <i>skeleton</i>)"),
        (dia.dec, "d6", "Masih ada <i>token</i>?"),
        (dia.dec, "d7", "Jumlah <i>SLANG</i> ≥ 2 pada kalimat?"),
        (dia.dec, "d8", "<i>Layer-2 ML</i> aktif & ada <i>TYPO</i>?"),
        (dia.act, "a8", "Urutkan menurut posisi & kembalikan <i>WordIssue</i>"),
    )
    for add_node, cid, label in spine:
        add_node(cid, label)
    dia.end()

    # Branch boxes and edges, executed strictly in this order.
    script = (
        (dia.e, "d1", "a1", "tidak"),
        (dia.e, "a1", "a2"),
        (dia.e, "a2", "d2"),
        (dia.right, "bSkip", "Lewati <i>token</i>", "d2"),
        (dia.e, "d2", "bSkip", "ya"),
        (dia.e, "bSkip", "d6"),
        (dia.e, "d2", "a3", "tidak"),
        (dia.e, "a3", "d3"),
        (dia.right, "bSlang", "Tandai <i>SLANG</i>", "d3"),
        (dia.e, "d3", "bSlang", "ya"),
        (dia.e, "bSlang", "d6"),
        (dia.e, "d3", "d4", "tidak"),
        (dia.right, "bAlay", "Tandai <i>ALAY</i>", "d4"),
        (dia.e, "d4", "bAlay", "ya"),
        (dia.e, "bAlay", "d6"),
        (dia.e, "d4", "d5", "tidak"),
        (dia.right, "bTypo", "Tandai <i>TYPO</i>", "d5"),
        (dia.e, "d5", "bTypo", "ya"),
        (dia.e, "bTypo", "d6"),
        (dia.left, "bClean", "Tidak ada temuan", "d5"),
        (dia.e, "d5", "bClean", "bersih"),
        (dia.e, "bClean", "d6"),
        (dia.loop_left, "d6", "a2", "ya"),
        (dia.e, "d6", "d7", "tidak"),
        (dia.right, "bRe",
         "Reklasifikasi <i>TYPO</i> <i>confidence</i>-rendah → <i>SLANG</i>", "d7"),
        (dia.e, "d7", "bRe", "ya"),
        (dia.e, "bRe", "d8"),
        (dia.e, "d7", "d8", "tidak"),
        (dia.right, "bML",
         "<i>Fill-mask</i> <i>IndoBERT</i> kontekstual:\n"
         "(a) gugurkan <i>TYPO</i> bila kata asli masuk akal\n"
         "(b) <i>rerank</i> kandidat koreksi sesuai konteks", "d8"),
        (dia.e, "d8", "bML", "ya"),
        (dia.e, "bML", "a8"),
        (dia.e, "d8", "a8", "tidak"),
        (dia.e, "a8", "e"),
    )
    for step, *args in script:
        step(*args)
    return dia
# ============================================================ KONTEN BERISIKO
def konten_berisiko():
    """Build the activity diagram titled "Detektor Konten Berisiko".

    Returns:
        The populated ``Dia``; call ``.xml()`` on it to serialise.
    """
    dia = Dia("Detektor Konten Berisiko")
    dia.e(dia.start(), dia.dec("d1", "Teks kosong?"))
    empty_branch(dia, "d1")

    spine = (
        (dia.act, "a1", "Bentuk varian teks: asli, normalisasi <i>slang</i>, "
                        "<i>de-obfuscation</i>, <i>de-obfuscation</i> + normalisasi"),
        (dia.act, "a2", "Ambil varian teks berikutnya"),
        (dia.act, "a3", "Cocokkan pola <i>regex</i> Indonesia (<i>INJECTION</i>, "
                        "<i>self-harm</i>, konten eksplisit, diskriminasi, "
                        "<i>ACADEMIC_DISHONESTY</i>, <i>HARMFUL</i>)"),
        (dia.dec, "d2", "Pola cocok?"),
        (dia.act, "a4", "Petakan <i>offset</i> kembali ke teks asli"),
        (dia.dec, "d3", "<i>Evidence</i> sudah dicatat?"),
        (dia.act, "a5", "Catat <i>RiskyContentFinding</i> (kode, <i>severity</i>, "
                        "<i>evidence</i>, saran)"),
        (dia.dec, "d4", "Masih ada varian?"),
        (dia.act, "a6", "Urutkan <i>HIGH</i> → <i>MEDIUM</i> → <i>LOW</i>"),
    )
    for add_node, cid, label in spine:
        add_node(cid, label)
    dia.end()

    script = (
        (dia.e, "d1", "a1", "tidak"),
        (dia.e, "a1", "a2"),
        (dia.e, "a2", "a3"),
        (dia.e, "a3", "d2"),
        (dia.e, "d2", "a4", "ya"),
        (dia.e, "d2", "d4", "tidak"),
        (dia.e, "a4", "d3"),
        (dia.right, "bSeen", "Lewati (duplikat)", "d3"),
        (dia.e, "d3", "bSeen", "ya"),
        (dia.e, "bSeen", "d4"),
        (dia.e, "d3", "a5", "tidak"),
        (dia.e, "a5", "d4"),
        (dia.loop_left, "d4", "a2", "ya"),
        (dia.e, "d4", "a6", "tidak"),
        (dia.e, "a6", "e"),
    )
    for step, *args in script:
        step(*args)
    return dia
# ============================================================ NER (fork/join)
def ner():
    """Build the activity diagram titled "Detektor NER" (with a fork/join section).

    Three parallel action boxes sit between a fork bar and a join bar, all
    placed with explicit coordinates; the remaining steps follow on the spine.

    Returns:
        The populated ``Dia``; call ``.xml()`` on it to serialise.
    """
    dia = Dia("Detektor <i>NER</i>", page_w=1020)
    dia.e(dia.start(), dia.dec("d1", "Teks kosong?"))
    empty_branch(dia, "d1")

    # Vertical layout of the fork/join section, derived from the cursor.
    bar_x, bar_w, bar_h = 200, 620, 10
    lane_gap = 48
    fork_y = dia.y
    lanes_y = fork_y + bar_h + lane_gap
    join_y = lanes_y + 80 + lane_gap

    # (cell id, label, x, height, relative anchor on the fork/join bars)
    lanes = (
        ("b1", "Prediksi entitas via <i>transformer XLM-R</i> (bila model dimuat)",
         150, 70, 0.2),
        ("b2", "Deteksi <i>rule</i> <i>regex</i> Indonesia "
               "(PT/CV, kementerian, kota, Rp, tanggal, gelar+nama)",
         400, 80, 0.5),
        ("b3", "Cocokkan daftar nama orang", 650, 70, 0.8),
    )

    dia.raw("fork", BAR, "", bar_x, fork_y, bar_w, bar_h)
    for cid, label, x, height, _ in lanes:
        dia.raw(cid, ACT, label, x, lanes_y, 220, height)
    dia.raw("join", BAR, "", bar_x, join_y, bar_w, bar_h)
    # raw() never moves the cursor, so continue the spine below the join bar.
    dia.y = join_y + bar_h + GAP

    dia.e("d1", "fork", "tidak")
    for cid, _, _, _, anchor in lanes:
        dia.e("fork", cid, exitX=anchor, exitY=1)
    for cid, _, _, _, anchor in lanes:
        dia.e(cid, "join", entryX=anchor, entryY=0)

    pipeline = [
        dia.act(cid, label)
        for cid, label in (
            ("a1", "Gabungkan entitas <i>ML</i> + <i>rule</i> + nama"),
            ("a2", "Filter entitas tak masuk akal & konteks tidak valid"),
            ("a3", "Filter kebutuhan <i>prompt</i> "
                   "(buang <i>stopword</i> & <i>token</i> tunggal huruf kecil)"),
            ("a4", "Urutkan menurut posisi & kembalikan <i>NEREntity</i>"),
        )
    ]
    chain = ["join", *pipeline, dia.end()]
    for src, tgt in zip(chain, chain[1:]):
        dia.e(src, tgt)
    return dia
# ============================================================ PROFANITY
def profanity():
    """Build the activity diagram titled "Detektor Profanity".

    Returns:
        The populated ``Dia``; call ``.xml()`` on it to serialise.
    """
    dia = Dia("Detektor <i>Profanity</i>")
    dia.e(dia.start(), dia.dec("d1", "Teks kosong?"))
    empty_branch(dia, "d1")

    spine = (
        (dia.act, "a1", "<i>Layer 1</i> — <i>pre-pass</i>: "
                        "tangkap kata dieja per huruf (mis. 'a n j i n g')"),
        (dia.act, "a2", "Ambil <i>token</i> berikutnya"),
        (dia.dec, "d2", "<i>Span</i> sudah tertangkap?"),
        (dia.act, "a3", "Klasifikasi <i>token</i> "
                        "(normalisasi <i>leet</i>/<i>strip</i>/<i>collapse</i> "
                        "+ <i>skeleton</i> konsonan)"),
        (dia.dec, "d3", "Tergolong kasar?"),
        (dia.dec, "d4", "Masih ada <i>token</i>?"),
        (dia.dec, "d5", "<i>Layer 1</i> kosong & <i>ML</i> aktif & ≥ 3 kata?"),
        (dia.act, "a4", "Urutkan menurut posisi & kembalikan temuan"),
    )
    for add_node, cid, label in spine:
        add_node(cid, label)
    dia.end()

    script = (
        (dia.e, "d1", "a1", "tidak"),
        (dia.e, "a1", "a2"),
        (dia.e, "a2", "d2"),
        (dia.right, "bSkip", "Lewati <i>token</i>", "d2"),
        (dia.e, "d2", "bSkip", "ya"),
        (dia.e, "bSkip", "d4"),
        (dia.e, "d2", "a3", "tidak"),
        (dia.e, "a3", "d3"),
        (dia.right, "bMark",
         "Catat <i>ProfanityFinding</i> (<i>HIGH</i>/<i>MEDIUM</i>)", "d3"),
        (dia.e, "d3", "bMark", "ya"),
        (dia.e, "bMark", "d4"),
        (dia.e, "d3", "d4", "tidak"),
        (dia.loop_left, "d4", "a2", "ya"),
        (dia.e, "d4", "d5", "tidak"),
        (dia.right, "bML",
         "<i>Layer 2</i> — <i>classifier</i> toksisitas Indonesia "
         "(<i>advice-only</i>, sorot teks penuh)", "d5"),
        (dia.e, "d5", "bML", "ya"),
        (dia.e, "bML", "a4"),
        (dia.e, "d5", "a4", "tidak"),
        (dia.e, "a4", "e"),
    )
    for step, *args in script:
        step(*args)
    return dia
# ============================================================ FILLER
def filler():
    """Build the activity diagram titled "Detektor Filler".

    Returns:
        The populated ``Dia``; call ``.xml()`` on it to serialise.
    """
    dia = Dia("Detektor <i>Filler</i>")
    dia.e(dia.start(), dia.dec("d1", "Teks kosong?"))
    empty_branch(dia, "d1")

    spine = (
        (dia.act, "a1", "Siapkan teks pindai: teks asli + <i>norm_text</i>\n"
                        "Word Quality bila tersedia"),
        (dia.act, "a2", "Terjemahkan <i>l33t</i> -> huruf normal"),
        (dia.act, "a3",
         "Ambil pola <i>regex</i> berikutnya "
         "(<i>GREETING_AI, GREETING_ONLY, THANKS, APOLOGY, EMPTY_OPENER, "
         "EMOTIONAL_FILLER, VAGUE_REFERENCE, VAGUE_PARTICLE, "
         "UNNECESSARY_PREAMBLE, HESITATION</i>)"),
        (dia.act, "a4", "Cari kecocokan berikutnya;\n"
                        "petakan offset ke teks asli bila dari <i>norm_text</i>"),
        (dia.dec, "d2", "<i>Span</i> tumpang-tindih dengan temuan lain?"),
        (dia.dec, "d3", "Masih ada kecocokan?"),
        (dia.dec, "d4", "Masih ada pola?"),
        (dia.dec, "d5", "Masih ada teks pindai?"),
        (dia.act, "a5", "Urutkan menurut posisi & kembalikan temuan"),
    )
    for add_node, cid, label in spine:
        add_node(cid, label)
    dia.end()

    script = (
        (dia.e, "d1", "a1", "tidak"),
        (dia.e, "a1", "a2"),
        (dia.e, "a2", "a3"),
        (dia.e, "a3", "a4"),
        (dia.e, "a4", "d2"),
        (dia.right, "bSkip", "Lewati", "d2"),
        (dia.e, "d2", "bSkip", "ya"),
        (dia.e, "bSkip", "d3"),
        (dia.left, "bMark", "Catat <i>FillerFinding</i>", "d2"),
        (dia.e, "d2", "bMark", "tidak"),
        (dia.e, "bMark", "d3"),
        (dia.loop_left, "d3", "a4", "ya"),
        (dia.e, "d3", "d4", "tidak"),
        (dia.loop_right, "d4", "a3", "ya"),
        (dia.e, "d4", "d5", "tidak"),
    )
    for step, *args in script:
        step(*args)

    # The outermost loop uses its own lane (x=120) on the left.
    dia.loop_left("d5", "a2", "ya", lane=120)
    dia.e("d5", "a5", "tidak")
    dia.e("a5", "e")
    return dia
# ============================================================ SPECIAL CHAR
def special_char():
    """Build the activity diagram titled "Detektor Special Char".

    The diagram is a straight chain: a setup step, nine category steps and a
    final return step, with a side note next to the first step.

    Returns:
        The populated ``Dia``; call ``.xml()`` on it to serialise.
    """
    categories = (
        ("c1", "<i>Zero-width characters</i> → hapus"),
        ("c2", "Kontrol arah teks (<i>BiDi</i>) → hapus"),
        ("c3", "<i>Unicode Tag</i> (teks tersembunyi) → hapus"),
        ("c4", "Homoglif (huruf menyamar) → perbaiki ke <i>Latin</i>"),
        ("c5", "Spasi non-standar → ganti spasi biasa"),
        ("c6", "Karakter kontrol → hapus"),
        ("c7", "<i>Smart quote</i> -> ganti kutip ASCII"),
        ("c8", "Baris kosong berlebih -> jadikan satu baris kosong"),
        ("c9", "Spasi ganda & tanda baca berulang -> rapikan"),
    )

    dia = Dia("Detektor <i>Special Char</i>")
    origin = dia.start()
    upstream = dia.act("a0", "Siapkan teks & himpunan <i>span</i> terpakai")
    dia.note("nScan", "Scan kategori berurutan (dedup <i>span</i>)")
    dia.e(origin, upstream)

    # Each category box is added and immediately linked to its predecessor.
    for cid, label in categories:
        dia.e(upstream, dia.act(cid, label))
        upstream = cid

    result = dia.act("a1", "Kembalikan daftar <i>SpecialCharFinding</i> "
                           "beserta teks pengganti (<i>replacement</i>)")
    terminal = dia.end()
    dia.e(upstream, result)
    dia.e(result, terminal)
    return dia
# ============================================================ SYNTAX
def syntax():
    """Build the activity diagram titled "Detektor Syntax".

    Returns:
        The populated ``Dia``; call ``.xml()`` on it to serialise.
    """
    dia = Dia("Detektor <i>Syntax</i>")
    dia.e(dia.start(), dia.dec("d1", "Teks kosong / model tak dimuat?"))
    empty_branch(dia, "d1")

    spine = (
        (dia.act, "a1", "Pisahkan teks menjadi kalimat (dibatasi jumlah)"),
        (dia.act, "a2", "Ambil kalimat berikutnya"),
        (dia.dec, "d2", "Jumlah kata ≥ minimum?"),
        (dia.act, "a3", "Bangun permutasi acak urutan kata"),
        (dia.act, "a4", "Hitung skor <i>PLL</i> kalimat asli + permutasi "
                        "via <i>IndoBERT MLM</i> (satu <i>forward pass</i>)"),
        (dia.act, "a5", "Hitung rasio permutasi yang lebih wajar daripada urutan asli"),
        (dia.dec, "d3", "rasio ≥ ambang?"),
        (dia.dec, "d4", "Masih ada kalimat?"),
        (dia.act, "a6", "Kembalikan daftar <i>SyntaxFinding</i>"),
    )
    for add_node, cid, label in spine:
        add_node(cid, label)
    dia.end()

    script = (
        (dia.e, "d1", "a1", "tidak"),
        (dia.e, "a1", "a2"),
        (dia.e, "a2", "d2"),
        (dia.right, "bSkip", "Lewati kalimat", "d2"),
        (dia.e, "d2", "bSkip", "tidak"),
        (dia.e, "bSkip", "d4"),
        (dia.e, "d2", "a3", "ya"),
        (dia.e, "a3", "a4"),
        (dia.e, "a4", "a5"),
        (dia.e, "a5", "d3"),
        (dia.right, "bMark",
         "Catat <i>UNUSUAL_WORD_ORDER</i> (<i>severity LOW</i>)", "d3"),
        (dia.e, "d3", "bMark", "ya"),
        (dia.e, "bMark", "d4"),
        (dia.e, "d3", "d4", "tidak"),
        (dia.loop_left, "d4", "a2", "ya"),
        (dia.e, "d4", "a6", "tidak"),
        (dia.e, "a6", "e"),
    )
    for step, *args in script:
        step(*args)
    return dia
# ============================================================ FIELD-FIT
def field_fit():
    """Build the activity diagram titled "Detektor Field-Fit".

    Returns:
        The populated ``Dia``; call ``.xml()`` on it to serialise.
    """
    dia = Dia("Detektor <i>Field-Fit</i>")
    dia.e(dia.start(), dia.dec("d1", "Teks kosong / bahasa bukan id?"))
    empty_branch(dia, "d1")

    # The note is positioned relative to the cursor, so it must come before d3.
    dia.note("n1", "<i>ML embedding</i> (semua field)")
    spine = (
        (dia.dec, "d3", "Model <i>embedding</i> aktif?"),
        (dia.act, "a1", "<i>Embed</i> isi field (ternormalisasi)"),
        (dia.act, "a2", "Hitung <i>cosine</i> ke centroid prototipe 9 field"),
        (dia.dec, "d4", "Field lain unggul ≥ <i>margin</i> & <i>cos</i> ≥ ambang?"),
        (dia.act, "a3", "Saran pindah ke field termirip"),
        (dia.act, "a4", "Kembalikan daftar <i>FieldFitFinding</i>"),
    )
    for add_node, cid, label in spine:
        add_node(cid, label)
    dia.end()

    for src, tgt, label in (
        ("d1", "d3", "tidak"),
        ("d3", "a1", "ya"),
        ("a1", "a2", ""),
        ("a2", "d4", ""),
        ("d4", "a3", "ya"),
        ("a3", "a4", ""),
        ("d4", "a4", "tidak"),
    ):
        dia.e(src, tgt, label)

    # The "model inactive" path goes straight to the return step via a left lane.
    dia.loop_left("d3", "a4", "tidak", lane=205)
    dia.e("a4", "e")
    return dia
def main():
    """Generate every diagram, check it is well-formed XML and write it to ``OUT``.

    Each diagram is written as ``<name>.drawio`` (UTF-8) and one status line
    is printed per file.

    Raises:
        xml.etree.ElementTree.ParseError: If a generated document is not
            well-formed; the file for that diagram is not written.
    """
    diagrams = {
        "Proses Evaluasi": proses_evaluasi,
        "Use Case": use_case,
        "PII": pii,
        "Word Quality": word_quality,
        "Konten Berisiko": konten_berisiko,
        "NER": ner,
        "Profanity": profanity,
        "Filler": filler,
        "Special Char": special_char,
        "Syntax": syntax,
        "Field-Fit": field_fit,
    }
    # write_text() does not create missing directories, so make sure the
    # output directory exists (e.g. on a fresh checkout without docs/diagrams).
    OUT.mkdir(parents=True, exist_ok=True)
    for name, fn in diagrams.items():
        xml = fn().xml()
        # Parse only to validate; the parsed tree itself is not needed.
        ET.fromstring(xml)
        (OUT / f"{name}.drawio").write_text(xml, encoding="utf-8")
        print(f"OK {name}.drawio ({len(xml)} bytes)")


if __name__ == "__main__":
    main()