RangGaraga committed on
Commit
3ddfa48
·
verified ·
1 Parent(s): 25444cd

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +498 -0
app.py ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ import fitz
4
+ import tempfile
5
+ import os
6
+ import re
7
+ import time
8
+
9
+
10
# Load the NER model at import time and run one warmup inference so that the
# first user request does not pay the model-initialisation cost.
# The module-level `time` import is reused here; a second aliased import of
# the same module is unnecessary.
print("[STARTUP] Loading NER model...")
_t0 = time.time()
ner = pipeline("ner", model="cahya/NusaBert-ner-v1.3", aggregation_strategy="simple")
print(f"[STARTUP] Model loaded in {time.time()-_t0:.2f}s — running warmup...")
ner("PT Bank Indonesia")  # warmup: force the model to be fully loaded into memory
print(f"[STARTUP] Warmup selesai. Total: {time.time()-_t0:.2f}s")
18
+
19
+
20
+ # Label mapping
21
+ LABEL_MAP = {
22
+ "CRD": "Kardinal", "DAT": "Tanggal", "EVT": "Peristiwa",
23
+ "FAC": "Fasilitas", "GPE": "Entitas Geopolitik", "LAW": "Peraturan / Undang-Undang",
24
+ "LOC": "Lokasi", "MON": "Uang", "NOR": "Organisasi Politik",
25
+ "ORD": "Ordinal", "ORG": "Organisasi", "PER": "Orang",
26
+ "PRC": "Persentase", "PRD": "Produk", "QTY": "Kuantitas",
27
+ "REG": "Agama", "TIM": "Waktu", "WOA": "Karya Seni", "LAN": "Bahasa",
28
+ }
29
+
30
+ RTF_COLOR_TABLE = [
31
+ (255,191,191),(174,221,255),(141,199,255),(184,255,184),(153,242,204),
32
+ (217,255,165),(255,229,140),(255,204,102),(204,255,218),(242,204,255),
33
+ (224,191,255),(255,242,178),(199,242,242),(255,186,186),(255,217,165),
34
+ (191,223,255),(230,218,255),(255,230,218),(204,255,242),
35
+ ]
36
+
37
+ LABEL_RTF_COLOR_IDX = {
38
+ "PER":1,"ORG":2,"NOR":3,"LOC":4,"GPE":5,"FAC":6,"DAT":7,"TIM":8,
39
+ "MON":9,"CRD":10,"ORD":11,"PRC":12,"QTY":13,"LAW":14,"EVT":15,
40
+ "PRD":16,"REG":17,"WOA":18,"LAN":19,
41
+ }
42
+
43
+ LABEL_HEX = {
44
+ "PER":"#FFBFBF","ORG":"#AEDDFF","NOR":"#8DC7FF","LOC":"#B8FFB8",
45
+ "GPE":"#99F2CC","FAC":"#D9FFA5","DAT":"#FFE58C","TIM":"#FFCC66",
46
+ "MON":"#CCFFDA","CRD":"#F2CCFF","ORD":"#E0BFFF","PRC":"#FFF2B2",
47
+ "QTY":"#C7F2F2","LAW":"#FFBABA","EVT":"#FFD9A5","PRD":"#BFDFFF",
48
+ "REG":"#E6DAFF","WOA":"#FFE6DA","LAN":"#CCFFF2",
49
+ }
50
+
51
+ MAX_PDF_PAGES = 5
52
+ # NusaBERT accepts at most 512 tokens ≈ ~400 words ≈ ~2500 characters, so 2000 characters per chunk is a safe size
53
+ MAX_CHUNK_CHARS = 2000
54
+ OVERLAP_CHARS = 150
55
+
56
+ EXAMPLES = [
57
+ ("Contoh 1 – RUPS & Dana Cadangan",
58
+ "Berdasarkan Rapat Umum Pemegang Saham (RUPS) pada tanggal 24 Juni 2024 yang disahkan "
59
+ "oleh notaris Ashoya Ratam, S.H., M.Kn., Risalah No.124/VI/2024, Perusahaan memutuskan "
60
+ "antara lain menyisihkan 5% dari laba bersih untuk tahun yang berakhir 31 Desember 2023 "
61
+ "atau sebesar Rp5.299.075.507 sebagai dana cadangan jaminan."),
62
+ ("Contoh 2 – Akta Jual Beli Saham PEFINDO",
63
+ "Berdasarkan Akta Notaris Melinda, S.Sos., S.H., M.Kn dengan No. 17 tanggal 21 Januari "
64
+ "2025, Perusahaan dan Dana Pensiun Pertamina telah menandatangani Akta Jual Beli saham "
65
+ "dan Perusahaan telah melakukan pembayaran penuh untuk pembelian 5.170 lembar saham "
66
+ "PEFINDO yang dimiliki Dana Pensiun Pertamina. Dengan demikian total kepemilikan saham "
67
+ "Perusahaan pada tanggal 21 Januari 2025 menjadi sebanyak 37.548 lembar saham atau sama "
68
+ "dengan 31,92% kepemilikan di PEFINDO."),
69
+ ("Contoh 3 – Fasilitas Kredit Bank Permata",
70
+ "Pada tanggal 12 Desember 2022, PEI, entitas anak, dan PT Bank Permata Tbk "
71
+ "menandatangani perjanjian fasilitas money market dengan fasilitas kredit maksimum "
72
+ "sebesar Rp50.000.000.000. Pinjaman ini digunakan untuk keperluan stand by facility "
73
+ "dengan jangka waktu penarikan antara 3 (tiga) hari sampai dengan 3 (tiga) bulan "
74
+ "semenjak tanggal penarikan pinjaman dilakukan."),
75
+ ("Contoh 4 – Dividen PEFINDO Biro Kredit",
76
+ "Berdasarkan Rapat Umum Pemegang Saham Tahunan tanggal 28 Juni 2024, pemegang saham "
77
+ "PEFINDO Biro Kredit menyetujui pembagian dividen untuk Perusahaan sebesar Rp6.637.962.683."),
78
+ ("Contoh 5 – Regulasi Bursa Karbon",
79
+ "Peraturan Presiden RI No. 98 Tahun 2021 tentang Penyelenggaraan Nilai Ekonomi Karbon "
80
+ "untuk Pencapaian Target Kontribusi yang Ditetapkan Secara Nasional dan Pengendalian "
81
+ "Emisi Gas Rumah Kaca dalam Pembangunan Nasional mengatur mengenai mekanisme pencapaian "
82
+ "NDC. Undang-undang RI No. 4 Tahun 2023 tentang Pengembangan dan Penguatan Sektor "
83
+ "Keuangan menegaskan bahwa tugas pengaturan dan pengawasan bursa karbon dilakukan oleh "
84
+ "Otoritas Jasa Keuangan."),
85
+ ]
86
+
87
+
88
+ # ── Helpers ──────────────────────────────────────────────────────────────────
89
+
90
def clean_word(word: str) -> str:
    """Return *word* with tokenizer marker characters removed.

    Every ``▁`` is turned into a space, every ``##`` is dropped, and the
    result is trimmed of surrounding whitespace. (These look like
    SentencePiece / WordPiece markers — confirm against the tokenizer.)
    """
    spaced = word.replace("▁", " ")
    joined = spaced.replace("##", "")
    return joined.strip()
92
+
93
def get_label_id(raw_label: str) -> str:
    """Normalise a model label such as ``B-PER`` or ``I_ORG`` to its bare tag.

    Args:
        raw_label: Label string as reported by the NER pipeline.

    Returns:
        The upper-cased tag with any leading BIO marker removed, e.g. ``"PER"``.
    """
    label = raw_label.strip()
    # Strip BIO markers only where they are a prefix. Removing "B-" / "I_"
    # anywhere in the string corrupts tags that merely contain those
    # characters (e.g. "WEB_SITE" used to become "WESITE").
    while label.startswith(("B-", "I-", "B_", "I_")):
        label = label[2:]
    # Keep only the part after the last hyphen, as before.
    return label.split("-")[-1].upper().strip()
96
+
97
def escape_rtf(text: str) -> str:
    """Escape *text* so it can be embedded in the body of an RTF document.

    Backslashes and braces are backslash-escaped. Every non-ASCII character is
    written as one ``\\uN?`` escape per UTF-16 code unit, where ``N`` is the
    code unit as a *signed* 16-bit decimal, which is what the RTF ``\\u``
    control word expects. Characters outside the Basic Multilingual Plane
    therefore become two escapes (a surrogate pair).

    Args:
        text: Plain text to escape.

    Returns:
        A pure-ASCII string that is safe inside an RTF group.
    """
    out = []
    for ch in text:
        if ch == "\\":
            out.append("\\\\")
        elif ch == "{":
            out.append("\\{")
        elif ch == "}":
            out.append("\\}")
        elif ord(ch) > 127:
            # "surrogatepass" keeps a lone surrogate in the input from raising.
            units = ch.encode("utf-16-le", "surrogatepass")
            for i in range(0, len(units), 2):
                # signed=True maps code units above 32767 to the negative
                # values RTF requires (e.g. U+FB01 -> -1279).
                value = int.from_bytes(units[i:i + 2], "little", signed=True)
                out.append(f"\\u{value}?")
        else:
            out.append(ch)
    return "".join(out)
106
+
107
def build_rtf_color_table(colors=None) -> str:
    """Build the RTF ``\\colortbl`` group for the highlight palette.

    The empty entry written as the leading ``;`` is colour index 0 ("auto"),
    so ``colors[i]`` lands at index ``i + 1``. That matches the 1-based
    indices in ``LABEL_RTF_COLOR_IDX``. An extra explicit black entry used to
    sit at index 1, which shifted every label to its neighbour's colour and
    highlighted ``PER`` in black.

    Args:
        colors: Optional iterable of ``(r, g, b)`` tuples. Defaults to the
            module-level ``RTF_COLOR_TABLE``.

    Returns:
        The complete ``{\\colortbl ...}`` group as a string.
    """
    palette = RTF_COLOR_TABLE if colors is None else colors
    entries = "".join(f"\\red{r}\\green{g}\\blue{b};" for r, g, b in palette)
    return "{\\colortbl ;" + entries + "}"
112
+
113
def text_to_rtf_with_highlights(text: str, entity_map: dict) -> str:
    """Render *text* as RTF body content with entity occurrences highlighted.

    Args:
        text: The plain document text.
        entity_map: Mapping of entity string to label id. Keys are matched
            case-insensitively as literal substrings of *text*.

    Returns:
        RTF-escaped text in which each matched occurrence is wrapped in a
        ``{\\highlightN ...}`` group. Longer entities are placed first, and an
        occurrence that overlaps an already placed one is skipped.
    """
    longest_first = sorted(entity_map.items(), key=lambda kv: len(kv[0]), reverse=True)
    taken = [False] * len(text)  # one flag per character already highlighted
    matches = []
    for needle, tag in longest_first:
        if not needle:
            continue
        finder = re.compile(re.escape(needle), re.IGNORECASE)
        for hit in finder.finditer(text):
            begin, stop = hit.span()
            if True in taken[begin:stop]:
                continue
            matches.append((begin, stop, tag))
            taken[begin:stop] = [True] * (stop - begin)
    matches.sort(key=lambda span: span[0])

    # Stitch together escaped plain runs and highlighted runs in text order.
    pieces = []
    pos = 0
    for begin, stop, tag in matches:
        if pos < begin:
            pieces.append(escape_rtf(text[pos:begin]))
        idx = LABEL_RTF_COLOR_IDX.get(tag, 1)
        body = escape_rtf(text[begin:stop])
        pieces.append(f"{{\\highlight{idx} {body}}}")
        pos = stop
    if pos < len(text):
        pieces.append(escape_rtf(text[pos:]))
    return "".join(pieces)
141
+
142
def wrap_rtf(content: str) -> str:
    """Wrap already-escaped RTF body *content* in a complete RTF document.

    Line breaks in *content* (``\\r\\n``, ``\\r`` or ``\\n``) become ``\\par``
    paragraph marks. The header declares the font table, the highlight colour
    table, the page size and margins (in twips), and the default font, size
    and line spacing.

    Args:
        content: RTF body text, e.g. from ``text_to_rtf_with_highlights``.

    Returns:
        The full RTF document as a string.
    """
    color_table = build_rtf_color_table()
    content_rtf = content.replace("\r\n", "\n").replace("\r", "\n").replace("\n", "\\par\n")
    return (
        "{\\rtf1\\ansi\\ansicpg1252\\deff0\n"
        "{\\fonttbl{\\f0\\froman\\fcharset0 Times New Roman;}}\n"
        f"{color_table}\n"
        # The page-size control words are \paperw / \paperh. The previous
        # \wpaper / \wpaperh are not RTF control words.
        "\\widowctrl\\paperw12240\\paperh15840\n"
        "\\margl1800\\margr1800\\margt1440\\margb1440\n"
        "\\f0\\fs24\\sl360\\slmult1\n"
        f"{content_rtf}"
        "}"
    )
155
+
156
+ def _build_debug_html(lines: list) -> str:
157
+ rows = "".join(
158
+ f'<div style="padding:6px 10px; border-bottom:1px solid rgba(255,255,255,0.07); '
159
+ f'font-size:13px; color:#cbd5e1; font-family:monospace;">{line}</div>'
160
+ for line in lines
161
+ )
162
+ return f"""
163
+ <div style="margin-top:12px; background:rgba(15,23,42,0.85); border:1px solid rgba(99,102,241,0.35);
164
+ border-radius:12px; overflow:hidden;">
165
+ <div style="background:linear-gradient(135deg,#1e40af,#6d28d9); padding:8px 14px;
166
+ font-size:11px; font-weight:700; letter-spacing:0.1em; color:#fff;">
167
+ πŸ” DEBUG β€” WAKTU PER LANGKAH
168
+ </div>
169
+ {rows}
170
+ </div>"""
171
+
172
+
173
+ # ── NER Teks ─────────────────────────────────────────────────────────────────
174
+
175
def run_ner(text: str):
    """Run NER over free text and render the distinct entities as an HTML table.

    Args:
        text: Raw user input from the textbox.

    Returns:
        An HTML string: a hint paragraph when the input is blank or nothing
        was recognised, otherwise a table with one row per distinct entity
        (compared case-insensitively) showing its label and model score.
    """
    # Function-scope import keeps this block self-contained.
    import html

    if not text or not text.strip():
        return "<p style='color:#94a3b8; font-style:italic; padding:16px;'>Masukkan teks terlebih dahulu.</p>"

    not_found = "<p style='color:#94a3b8; font-style:italic; padding:16px;'>Tidak ada entitas yang ditemukan.</p>"
    results = ner(text.strip())
    if not results:
        return not_found

    rows = []
    seen_words = set()
    for ent in results:
        raw_label = ent["entity_group"]
        label_id = get_label_id(raw_label)
        label_idn = LABEL_MAP.get(label_id, raw_label)
        word = clean_word(ent["word"])
        if not word:
            continue
        word_key = word.lower()
        if word_key in seen_words:
            continue
        seen_words.add(word_key)
        hex_color = LABEL_HEX.get(label_id, "#e2e8f0")
        score = f"{ent['score']:.2%}"
        row_num = len(rows) + 1
        row_bg = "#f8faff" if row_num % 2 == 0 else "#ffffff"
        # The entity text comes from user input, so it is HTML-escaped before
        # being placed in the markup.
        rows.append(f"""
        <tr style="background:{row_bg};">
          <td style="padding:9px 14px;border-bottom:1px solid #e8edf5;text-align:center;color:#64748b;font-size:12px;">{row_num}</td>
          <td style="padding:9px 14px;border-bottom:1px solid #e8edf5;font-weight:600;color:#1e293b;">{html.escape(word)}</td>
          <td style="padding:9px 14px;border-bottom:1px solid #e8edf5;">
            <span style="background:{hex_color};padding:3px 10px;border-radius:20px;font-size:12px;font-weight:600;color:#1e293b;">{html.escape(label_idn)}</span>
          </td>
          <td style="padding:9px 14px;border-bottom:1px solid #e8edf5;font-size:12px;color:#64748b;text-align:center;">{score}</td>
        </tr>""")

    if not rows:
        return not_found

    rows_html = "".join(rows)
    return f"""
    <div style="overflow-x:auto;margin-top:4px;border-radius:12px;border:1px solid #e2e8f0;box-shadow:0 2px 12px rgba(0,0,0,0.06);">
      <table style="width:100%;border-collapse:collapse;font-size:14px;font-family:'Segoe UI',sans-serif;">
        <thead>
          <tr style="background:linear-gradient(135deg,#1e40af,#6d28d9);">
            <th style="padding:12px 14px;color:#fff;width:55px;font-weight:600;font-size:12px;letter-spacing:0.05em;">NO</th>
            <th style="padding:12px 14px;color:#fff;text-align:left;font-weight:600;font-size:12px;letter-spacing:0.05em;">KATA / FRASA</th>
            <th style="padding:12px 14px;color:#fff;text-align:left;font-weight:600;font-size:12px;letter-spacing:0.05em;">ENTITAS</th>
            <th style="padding:12px 14px;color:#fff;width:90px;font-weight:600;font-size:12px;letter-spacing:0.05em;">SKOR</th>
          </tr>
        </thead>
        <tbody>{rows_html}</tbody>
      </table>
    </div>"""
225
+
226
+
227
+ # ── NER PDF → RTF ─────────────────────────────────────────────────────────────
228
+
229
def run_ner_pdf(pdf_file):
    """Extract text from a PDF, run NER on it and write a highlighted RTF file.

    Args:
        pdf_file: Either a filesystem path (``str``) or an object exposing the
            path as ``.name``; ``None`` when nothing was uploaded.

    Returns:
        A 4-tuple ``(out_path, legend_html, warn_html, debug_html)``.
        ``out_path`` is the path of the generated ``.rtf`` temp file, or
        ``None`` when nothing was produced (no upload, too many pages, or no
        entities). ``warn_html`` then carries the message for the user.
        ``debug_html`` holds per-step timings once processing has started.
    """
    if pdf_file is None:
        return None, "", "<p style='color:#94a3b8;font-style:italic;padding:16px;'>Unggah file PDF terlebih dahulu.</p>", ""

    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    doc = fitz.open(pdf_path)
    # try/finally guarantees the document handle is released on every path,
    # including an exception raised while extracting page text.
    try:
        page_count = len(doc)

        if page_count > MAX_PDF_PAGES:
            return None, "", f"""
            <div style="background:#fff1f2;border:1.5px solid #fda4af;border-radius:12px;
                        padding:16px 20px;display:flex;align-items:center;gap:12px;">
              <span style="font-size:24px;">⚠️</span>
              <div>
                <p style="margin:0;font-weight:700;color:#be123c;font-size:15px;">PDF Terlalu Banyak Halaman</p>
                <p style="margin:4px 0 0;color:#9f1239;font-size:13px;">
                  File Anda memiliki <strong>{page_count} halaman</strong>.
                  Maksimal <strong>{MAX_PDF_PAGES} halaman</strong>.
                </p>
              </div>
            </div>""", ""

        debug_lines = []
        t_total = time.time()

        # ── STEP 1: extract text ──
        t0 = time.time()
        pages_text = [page.get_text() for page in doc]
    finally:
        doc.close()

    full_text = "\n\n".join(pages_text)
    char_count = len(full_text)
    t1 = time.time()
    debug_lines.append(f"✅ <b>Langkah 1</b> — Ekstrak teks PDF: <b>{t1-t0:.3f}s</b> | {char_count:,} karakter | {page_count} halaman")

    # ── STEP 2: split into overlapping chunks ──
    t0 = time.time()
    chunks = []
    start = 0
    while start < len(full_text):
        end = min(start + MAX_CHUNK_CHARS, len(full_text))
        chunks.append(full_text[start:end])
        if end == len(full_text):
            break
        # Step back so an entity straddling a chunk boundary is still seen whole.
        start = end - OVERLAP_CHARS
    t1 = time.time()
    debug_lines.append(f"✅ <b>Langkah 2</b> — Chunking teks: <b>{t1-t0:.4f}s</b> | {len(chunks)} chunk @ {MAX_CHUNK_CHARS} karakter")

    # ── STEP 3: NER model (usually the slowest step) ──
    t0 = time.time()
    all_ner_results = []
    chunk_times = []
    for chunk in chunks:
        tc = time.time()
        piece = chunk.strip()
        # Whitespace-only chunks (e.g. pages with no text layer) have nothing
        # to tag, so the model is not called for them.
        if piece:
            all_ner_results.extend(ner(piece))
        # A time is recorded for every chunk so chunk_times stays index-aligned
        # with chunks in the per-chunk detail below.
        chunk_times.append(time.time() - tc)
    t1 = time.time()
    avg_chunk = sum(chunk_times) / len(chunk_times) if chunk_times else 0
    debug_lines.append(
        f"✅ <b>Langkah 3</b> — NER model: <b>{t1-t0:.2f}s</b> total | "
        f"{len(chunks)} chunk | avg/chunk: {avg_chunk:.2f}s | {len(all_ner_results)} entitas raw"
    )
    # Per-chunk detail
    for i, ct in enumerate(chunk_times):
        debug_lines.append(f"&nbsp;&nbsp;&nbsp;↳ chunk {i+1}/{len(chunks)}: {ct:.2f}s ({len(chunks[i])} karakter)")

    if not all_ner_results:
        dh = _build_debug_html(debug_lines)
        return None, "", "<p style='color:#94a3b8;padding:16px;'>Tidak ada entitas ditemukan.</p>", dh

    # ── STEP 4: build entity_map (lower-cased entity text -> label id) ──
    t0 = time.time()
    entity_map: dict[str, str] = {}
    for ent in all_ner_results:
        word = clean_word(ent["word"])
        if len(word) < 2:
            continue
        label_id = get_label_id(ent["entity_group"])
        # The first label seen for a given entity text wins.
        entity_map.setdefault(word.lower(), label_id)
    t1 = time.time()
    debug_lines.append(f"✅ <b>Langkah 4</b> — Bangun entity_map: <b>{t1-t0:.4f}s</b> | {len(entity_map)} entitas unik")

    if not entity_map:
        dh = _build_debug_html(debug_lines)
        return None, "", "<p style='color:#94a3b8;padding:16px;'>Tidak ada entitas ditemukan.</p>", dh

    # ── STEP 5: highlight -> RTF content ──
    t0 = time.time()
    rtf_content = text_to_rtf_with_highlights(full_text, entity_map)
    t1 = time.time()
    debug_lines.append(f"✅ <b>Langkah 5</b> — Highlight & build RTF content: <b>{t1-t0:.4f}s</b>")

    # ── STEP 6: write the .rtf file ──
    t0 = time.time()
    rtf_full = wrap_rtf(rtf_content)
    # The temp file is intentionally left on disk: its path is returned so the
    # UI can offer it for download.
    out_fd, out_path = tempfile.mkstemp(suffix=".rtf")
    with os.fdopen(out_fd, "w", encoding="ascii", errors="replace") as f:
        f.write(rtf_full)
    t1 = time.time()
    debug_lines.append(f"✅ <b>Langkah 6</b> — Tulis file RTF: <b>{t1-t0:.4f}s</b> | {len(rtf_full):,} bytes")

    t_end = time.time()
    debug_lines.append(f"⏱ <b>TOTAL WAKTU: {t_end-t_total:.2f}s</b>")

    debug_html = _build_debug_html(debug_lines)
    found_labels: set[str] = set(entity_map.values())

    # ── Legend ──
    legend_parts = []
    for label_id in sorted(found_labels):
        label_idn = LABEL_MAP.get(label_id, label_id)
        hex_color = LABEL_HEX.get(label_id, "#e2e8f0")
        legend_parts.append(f"""
        <tr>
          <td style="padding:8px 14px;border-bottom:1px solid #e8edf5;">
            <span style="display:inline-block;width:24px;height:24px;border-radius:6px;
                         background:{hex_color};vertical-align:middle;border:1px solid rgba(0,0,0,0.1);"></span>
          </td>
          <td style="padding:8px 14px;border-bottom:1px solid #e8edf5;font-weight:600;color:#1e293b;font-size:13px;">{label_id}</td>
          <td style="padding:8px 14px;border-bottom:1px solid #e8edf5;color:#475569;font-size:13px;">{label_idn}</td>
        </tr>""")
    legend_rows = "".join(legend_parts)

    legend_html = f"""
    <div style="margin-top:4px;">
      <p style="font-size:13px;color:#64748b;margin-bottom:10px;">
        Ditemukan <strong style='color:#1e40af;'>{len(found_labels)}</strong> jenis entitas
        dari <strong style='color:#1e40af;'>{page_count}</strong> halaman PDF.
        Buka file <strong>.rtf</strong> di <strong>Microsoft Word</strong> atau <strong>LibreOffice Writer</strong>.
      </p>
      <div style="overflow-x:auto;border-radius:12px;border:1px solid #e2e8f0;box-shadow:0 2px 12px rgba(0,0,0,0.06);">
        <table style="width:100%;border-collapse:collapse;font-family:'Segoe UI',sans-serif;">
          <thead>
            <tr style="background:linear-gradient(135deg,#1e40af,#6d28d9);">
              <th style="padding:11px 14px;color:#fff;font-size:12px;letter-spacing:0.05em;width:50px;">WARNA</th>
              <th style="padding:11px 14px;color:#fff;font-size:12px;letter-spacing:0.05em;text-align:left;">KODE</th>
              <th style="padding:11px 14px;color:#fff;font-size:12px;letter-spacing:0.05em;text-align:left;">NAMA ENTITAS</th>
            </tr>
          </thead>
          <tbody>{legend_rows}</tbody>
        </table>
      </div>
    </div>"""

    return out_path, legend_html, "", debug_html
376
+
377
+
378
+ # ── CSS ───────────────────────────────────────────────────────────────────────
379
+
380
+ CUSTOM_CSS = """
381
+ @import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@700;900&family=DM+Sans:wght@400;500;600&display=swap');
382
+ .gradio-container{max-width:100%!important;padding:0!important;background:linear-gradient(160deg,#0f172a 0%,#1e1b4b 40%,#0f172a 100%)!important;min-height:100vh;}
383
+ #hero-header{background:linear-gradient(135deg,#1e40af 0%,#6d28d9 50%,#be185d 100%);padding:40px 48px 36px;margin:0 0 24px;border-radius:16px;text-align:center;position:relative;overflow:hidden;}
384
+ #hero-header::before{content:'';position:absolute;inset:0;background:url("data:image/svg+xml,%3Csvg width='60' height='60' viewBox='0 0 60 60' xmlns='http://www.w3.org/2000/svg'%3E%3Cg fill='none' fill-rule='evenodd'%3E%3Cg fill='%23ffffff' fill-opacity='0.04'%3E%3Cpath d='M36 34v-4h-2v4h-4v2h4v4h2v-4h4v-2h-4zm0-30V0h-2v4h-4v2h4v4h2V6h4V4h-4zM6 34v-4H4v4H0v2h4v4h2v-4h4v-2H6zM6 4V0H4v4H0v2h4v4h2V6h4V4H6z'/%3E%3C/g%3E%3C/g%3E%3C/svg%3E");}
385
+ .tab-nav{background:rgba(255,255,255,0.05)!important;border-radius:12px!important;padding:4px!important;border:1px solid rgba(255,255,255,0.1)!important;margin:0!important;}
386
+ .tab-nav button{background:transparent!important;color:#94a3b8!important;border-radius:8px!important;padding:10px 24px!important;font-family:'DM Sans',sans-serif!important;font-weight:600!important;font-size:14px!important;transition:all 0.2s!important;}
387
+ .tab-nav button.selected{background:linear-gradient(135deg,#1e40af,#6d28d9)!important;color:#ffffff!important;box-shadow:0 4px 12px rgba(109,40,217,0.4)!important;}
388
+ label span{color:#cbd5e1!important;font-family:'DM Sans',sans-serif!important;font-weight:600!important;font-size:12px!important;letter-spacing:0.07em!important;text-transform:uppercase!important;}
389
+ textarea,.gr-textbox textarea{background:rgba(15,23,42,0.7)!important;border:1.5px solid rgba(255,255,255,0.12)!important;border-radius:10px!important;color:#e2e8f0!important;font-family:'DM Sans',sans-serif!important;font-size:14px!important;padding:14px!important;transition:border 0.2s!important;}
390
+ textarea:focus{border-color:#6d28d9!important;outline:none!important;}
391
+ .example-btn{display:block!important;width:100%!important;text-align:left!important;padding:12px 16px!important;margin-bottom:8px!important;background:rgba(30,64,175,0.15)!important;border:1px solid rgba(99,102,241,0.30)!important;border-radius:10px!important;cursor:pointer!important;font-size:13px!important;line-height:1.6!important;color:#cbd5e1!important;white-space:normal!important;height:auto!important;font-family:'DM Sans',sans-serif!important;transition:all 0.2s!important;}
392
+ .example-btn:hover{background:rgba(109,40,217,0.25)!important;border-color:#6d28d9!important;color:#e2e8f0!important;transform:translateX(3px);}
393
+ #analyze-btn,#analyze-pdf-btn{background:linear-gradient(135deg,#1e40af,#6d28d9)!important;color:white!important;font-weight:700!important;font-size:15px!important;border-radius:10px!important;padding:12px 0!important;margin-top:8px!important;font-family:'DM Sans',sans-serif!important;letter-spacing:0.03em!important;box-shadow:0 4px 16px rgba(109,40,217,0.35)!important;transition:all 0.2s!important;border:none!important;}
394
+ #analyze-btn:hover,#analyze-pdf-btn:hover{transform:translateY(-2px)!important;box-shadow:0 6px 24px rgba(109,40,217,0.50)!important;}
395
+ .section-heading{font-family:'DM Sans',sans-serif;font-weight:700;font-size:11px;letter-spacing:0.12em;text-transform:uppercase;color:#ffffff!important;margin-bottom:12px;display:flex;align-items:center;gap:8px;}
396
+ .section-heading::before{content:'';display:inline-block;width:18px;height:2px;background:linear-gradient(90deg,#6d28d9,#be185d);border-radius:2px;}
397
+ .gr-file{background:rgba(15,23,42,0.7)!important;border:1.5px dashed rgba(99,102,241,0.40)!important;border-radius:10px!important;color:#94a3b8!important;}
398
+ .gr-file-download{background:rgba(30,64,175,0.2)!important;border:1px solid rgba(99,102,241,0.4)!important;border-radius:10px!important;color:#a5b4fc!important;font-family:'DM Sans',sans-serif!important;}
399
+ #center-col{max-width:780px!important;margin:0 auto!important;width:100%!important;padding:0 8px!important;}
400
+ #footer{text-align:center;padding:20px;color:rgba(148,163,184,0.5);font-family:'DM Sans',sans-serif;font-size:12px;letter-spacing:0.04em;}
401
+ .gradio-container h3{color:#e2e8f0!important;}
402
+ #center-col p{color:#94a3b8!important;}
403
+ """
404
+
405
+ HERO_HTML = """
406
+ <div id="hero-header">
407
+ <p style="font-family:'DM Sans',sans-serif;font-size:12px;font-weight:700;letter-spacing:0.20em;text-transform:uppercase;color:rgba(165,180,252,0.8);margin:0 0 10px;">Tugas Kelompok Β· NLP &amp; Text Mining</p>
408
+ <h1 style="font-family:'Playfair Display',serif;font-size:clamp(32px,5vw,56px);font-weight:900;color:#ffffff;margin:0 0 8px;line-height:1.1;text-shadow:0 2px 20px rgba(109,40,217,0.5);">
409
+ NER <span style="color:#a78bfa;">for</span> Financial Statements
410
+ </h1>
411
+ <p style="font-family:'DM Sans',sans-serif;font-size:clamp(13px,2vw,16px);color:rgba(203,213,225,0.85);margin:0 auto 18px;max-width:640px;line-height:1.6;">
412
+ Implementasi Named Entity Recognition pada Kumpulan<br>Laporan-laporan Keuangan Bahasa Indonesia
413
+ </p>
414
+ <div style="display:inline-flex;align-items:center;gap:8px;background:rgba(0,0,0,0.25);border:1px solid rgba(255,255,255,0.15);border-radius:20px;padding:6px 16px 6px 8px;">
415
+ <span style="background:linear-gradient(135deg,#1e40af,#6d28d9);border-radius:12px;padding:3px 10px;font-size:11px;font-weight:700;color:#fff;letter-spacing:0.05em;">MODEL</span>
416
+ <span style="font-family:monospace;font-size:13px;color:#a5b4fc;">cahya/NusaBert-ner-v1.3</span>
417
+ </div>
418
+ </div>
419
+ """
420
+
421
+
422
+ # ── Gradio UI ─────────────────────────────────────────────────────────────────
423
+
424
# Build the two-tab Gradio interface (free-text analysis and PDF analysis),
# wire the event handlers, and launch the app when run as a script.
# NOTE(review): the nesting below is reconstructed from a diff view that lost
# indentation — confirm the placement of `pdf_debug` against the original.
with gr.Blocks(title="NER for Financial Statements", css=CUSTOM_CSS) as demo:

    gr.HTML(HERO_HTML)

    with gr.Tabs(elem_classes="tab-nav"):

        with gr.Tab("Analisis Teks"):
            with gr.Column(elem_id="center-col"):
                gr.HTML('<div class="section-heading">Contoh Teks</div>')
                example_btns = []
                for title, body in EXAMPLES:
                    btn = gr.Button(f"📌 {title}\n\n{body}", elem_classes="example-btn")
                    example_btns.append((btn, body))

                gr.HTML('<div class="section-heading" style="margin-top:20px;">Input Teks</div>')
                text_input = gr.Textbox(lines=9, placeholder="Ketik atau tempel teks laporan keuangan di sini…", label="", show_label=False)
                analyze_btn = gr.Button("Lakukan Analisis", elem_id="analyze-btn")

                gr.HTML('<div class="section-heading" style="margin-top:20px;">Hasil Analisis Entitas</div>')
                text_output = gr.HTML(value="<p style='color:#64748b;font-size:14px;font-family:DM Sans,sans-serif;padding:20px;text-align:center;'>Masukkan teks lalu klik Lakukan Analisis.</p>")

        with gr.Tab("Analisis PDF"):
            with gr.Column(elem_id="center-col"):
                # The page limit is taken from MAX_PDF_PAGES so the help text
                # cannot drift from the limit that run_ner_pdf enforces.
                gr.HTML(
                    f'''<div style="margin:0 0 16px;">
                    <p style="color:#94a3b8;font-family:DM Sans,sans-serif;font-size:13px;margin:0 0 8px;">
                    Unggah PDF laporan keuangan (maks. <strong style="color:#a78bfa;">{MAX_PDF_PAGES} halaman</strong>).
                    Output: file <strong style="color:#a78bfa;">.rtf</strong> dengan stabilo warna —
                    bisa dibuka di <strong>Microsoft Word</strong> atau <strong>LibreOffice</strong>.
                    </p>
                    <div style="background:rgba(251,191,36,0.1);border:1px solid rgba(251,191,36,0.4);
                    border-radius:8px;padding:8px 14px;display:flex;align-items:center;gap:8px;">
                    <span style="font-size:16px;">⚡</span>
                    <span style="font-size:12px;color:#fbbf24;font-family:DM Sans,sans-serif;">
                    <strong>Pertama kali digunakan:</strong> model NER perlu load ke memori (~2 menit).
                    Request berikutnya akan jauh lebih cepat (~5 detik).
                    </span>
                    </div>
                    </div>'''
                )
                gr.HTML('<div class="section-heading">Unggah PDF</div>')
                pdf_input = gr.File(label="", file_types=[".pdf"], type="filepath")
                analyze_pdf_btn = gr.Button("Highlight Entitas → Download RTF", elem_id="analyze-pdf-btn")
                pdf_warning = gr.HTML(value="")

                # Hidden until a run actually produces an RTF file.
                with gr.Column(visible=False) as pdf_results_col:
                    gr.HTML('<div class="section-heading" style="margin-top:20px;">Download Hasil RTF</div>')
                    pdf_output = gr.File(label="", file_types=[".rtf"], interactive=False)
                    gr.HTML('<div class="section-heading" style="margin-top:20px;">Keterangan Warna Entitas</div>')
                    pdf_legend = gr.HTML(value="")
                # Debug panel — always shown after processing, so it sits
                # outside the hidden results column.
                pdf_debug = gr.HTML(value="")

    gr.HTML('<div id="footer">NER for Financial Statements 2026</div>')

    # Wiring
    for btn, body in example_btns:
        # b=body binds the current example text; a plain closure would make
        # every button return the last example.
        btn.click(fn=lambda b=body: b, inputs=[], outputs=text_input)

    analyze_btn.click(fn=run_ner, inputs=text_input, outputs=text_output)
    text_input.submit(fn=run_ner, inputs=text_input, outputs=text_output)

    def handle_pdf(pdf_file):
        """Run the PDF pipeline and reveal the results column only on success."""
        out_path, legend_html, warn_html, debug_html = run_ner_pdf(pdf_file)
        show = gr.Column(visible=out_path is not None)
        return out_path, legend_html, warn_html, debug_html, show

    analyze_pdf_btn.click(
        fn=handle_pdf,
        inputs=pdf_input,
        outputs=[pdf_output, pdf_legend, pdf_warning, pdf_debug, pdf_results_col],
    )

if __name__ == "__main__":
    demo.launch()