"""Gradio demo: named-entity recognition for Indonesian financial statements.

Two tabs are exposed:

* "Analisis Teks"  - runs NER on pasted text and shows a table of entities.
* "Analisis File"  - runs NER on an uploaded .pdf/.txt file and shows the
  document text with entities highlighted plus a colour legend.

NOTE(review): in the reviewed copy of this file the HTML tags inside string
literals had been stripped. The markup below is a reconstruction guided by the
selectors in CUSTOM_CSS (#hero-header, .section-heading, ...); all visible
text is preserved. Confirm the exact styling against the original design.
"""

import html
import os
import re
import time

import fitz
import gradio as gr
from transformers import pipeline

# Load model (module-level on purpose: loaded once, shared by all requests).
ner = pipeline("ner", model="cahya/NusaBert-ner-v1.3", aggregation_strategy="simple")

# Label mapping: model label id -> Indonesian display name.
LABEL_MAP = {
    "CRD": "Kardinal",
    "DAT": "Tanggal",
    "EVT": "Peristiwa",
    "FAC": "Fasilitas",
    "GPE": "Entitas Geopolitik",
    "LAW": "Peraturan / Undang-Undang",
    "LOC": "Lokasi",
    "MON": "Uang",
    "NOR": "Organisasi Politik",
    "ORD": "Ordinal",
    "ORG": "Organisasi",
    "PER": "Orang",
    "PRC": "Persentase",
    "PRD": "Produk",
    "QTY": "Kuantitas",
    "REG": "Agama",
    "TIM": "Waktu",
    "WOA": "Karya Seni",
    "LAN": "Bahasa",
}

# Label id -> highlight colour.
LABEL_HEX = {
    "PER": "#FFBFBF", "ORG": "#AEDDFF", "NOR": "#8DC7FF", "LOC": "#B8FFB8",
    "GPE": "#99F2CC", "FAC": "#D9FFA5", "DAT": "#FFE58C", "TIM": "#FFCC66",
    "MON": "#CCFFDA", "CRD": "#F2CCFF", "ORD": "#E0BFFF", "PRC": "#FFF2B2",
    "QTY": "#C7F2F2", "LAW": "#FFBABA", "EVT": "#FFD9A5", "PRD": "#BFDFFF",
    "REG": "#E6DAFF", "WOA": "#FFE6DA", "LAN": "#CCFFF2",
}

# Colour used when a label has no entry in LABEL_HEX.
_DEFAULT_HEX = "#e2e8f0"

MAX_PDF_PAGES = 5
MAX_CHUNK_CHARS = 2000
OVERLAP_CHARS = 150

EXAMPLES = [
    ("Contoh 1 – RUPS & Dana Cadangan",
     "Berdasarkan Rapat Umum Pemegang Saham (RUPS) pada tanggal 24 Juni 2024 yang disahkan "
     "oleh notaris Ashoya Ratam, S.H., M.Kn., Risalah No.124/VI/2024, Perusahaan memutuskan "
     "antara lain menyisihkan 5% dari laba bersih untuk tahun yang berakhir 31 Desember 2023 "
     "atau sebesar Rp5.299.075.507 sebagai dana cadangan jaminan."),
    ("Contoh 2 – Akta Jual Beli Saham PEFINDO",
     "Berdasarkan Akta Notaris Melinda, S.Sos., S.H., M.Kn dengan No. 17 tanggal 21 Januari "
     "2025, Perusahaan dan Dana Pensiun Pertamina telah menandatangani Akta Jual Beli saham "
     "dan Perusahaan telah melakukan pembayaran penuh untuk pembelian 5.170 lembar saham "
     "PEFINDO yang dimiliki Dana Pensiun Pertamina. Dengan demikian total kepemilikan saham "
     "Perusahaan pada tanggal 21 Januari 2025 menjadi sebanyak 37.548 lembar saham atau sama "
     "dengan 31,92% kepemilikan di PEFINDO."),
    ("Contoh 3 – Fasilitas Kredit Bank Permata",
     "Pada tanggal 12 Desember 2022, PEI, entitas anak, dan PT Bank Permata Tbk "
     "menandatangani perjanjian fasilitas money market dengan fasilitas kredit maksimum "
     "sebesar Rp50.000.000.000. Pinjaman ini digunakan untuk keperluan stand by facility "
     "dengan jangka waktu penarikan antara 3 (tiga) hari sampai dengan 3 (tiga) bulan "
     "semenjak tanggal penarikan pinjaman dilakukan."),
    ("Contoh 4 – Dividen PEFINDO Biro Kredit",
     "Berdasarkan Rapat Umum Pemegang Saham Tahunan tanggal 28 Juni 2024, pemegang saham "
     "PEFINDO Biro Kredit menyetujui pembagian dividen untuk Perusahaan sebesar Rp6.637.962.683."),
    ("Contoh 5 – Regulasi Bursa Karbon",
     "Peraturan Presiden RI No. 98 Tahun 2021 tentang Penyelenggaraan Nilai Ekonomi Karbon "
     "untuk Pencapaian Target Kontribusi yang Ditetapkan Secara Nasional dan Pengendalian "
     "Emisi Gas Rumah Kaca dalam Pembangunan Nasional mengatur mengenai mekanisme pencapaian "
     "NDC. Undang-undang RI No. 4 Tahun 2023 tentang Pengembangan dan Penguatan Sektor "
     "Keuangan menegaskan bahwa tugas pengaturan dan pengawasan bursa karbon dilakukan oleh "
     "Otoritas Jasa Keuangan."),
]


# Helpers
def clean_word(word: str) -> str:
    """Strip sub-word tokenizer markers ("▁", "##") and outer whitespace."""
    return word.replace("▁", " ").replace("##", "").strip()


def get_label_id(raw_label: str) -> str:
    """Reduce a raw model label (e.g. "B-PER") to its upper-case id ("PER")."""
    label_id = raw_label.replace("B-", "").replace("I-", "").replace("B_", "").replace("I_", "")
    return label_id.split("-")[-1].upper().strip()


def _message_html(message: str) -> str:
    """Wrap a short, trusted status message in the placeholder markup."""
    return (
        '<div style="padding:16px;text-align:center;">'
        f'<p style="color:#94a3b8;margin:0;">{message}</p>'
        "</div>"
    )


def _chunk_text(text: str, size: int = MAX_CHUNK_CHARS, overlap: int = OVERLAP_CHARS) -> list:
    """Split *text* into windows of at most *size* chars overlapping by *overlap*.

    Raises:
        ValueError: if ``overlap >= size`` (the window would never advance).
    """
    if overlap >= size:
        raise ValueError("overlap must be smaller than chunk size")
    chunks = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + size, total)
        chunks.append(text[start:end])
        if end == total:
            break
        # Step back so an entity cut by the window edge is seen whole next time.
        start = end - overlap
    return chunks


def _run_ner_chunked(text: str) -> list:
    """Run the NER pipeline over *text* chunk by chunk and concatenate results."""
    results = []
    for chunk in _chunk_text(text):
        piece = chunk.strip()
        if piece:  # never send an empty string to the model
            results.extend(ner(piece))
    return results


def _escape_with_breaks(fragment: str) -> str:
    """HTML-escape *fragment* and turn newlines into ``<br>``."""
    return html.escape(fragment).replace("\n", "<br>")


def highlight_html(text: str, entity_map: dict) -> str:
    """Return *text* as HTML with every known entity wrapped in a coloured span.

    Args:
        text: Plain document text.
        entity_map: Mapping of lower-cased entity string -> label id.

    Returns:
        An HTML fragment. Matching is case-insensitive; longer entities are
        placed first so they take precedence over shorter overlapping ones.
        All document text is HTML-escaped.
    """
    by_length = sorted(entity_map.items(), key=lambda item: len(item[0]), reverse=True)
    spans = []
    used = [False] * len(text)  # per-character "already highlighted" mask
    for entity_lower, label_id in by_length:
        if not entity_lower:
            continue
        pattern = re.compile(re.escape(entity_lower), re.IGNORECASE)
        for match in pattern.finditer(text):
            s, e = match.span()
            if any(used[s:e]):
                continue
            spans.append((s, e, label_id))
            used[s:e] = [True] * (e - s)

    spans.sort(key=lambda span: span[0])
    parts = []
    cursor = 0
    for s, e, label_id in spans:
        if cursor < s:
            parts.append(_escape_with_breaks(text[cursor:s]))
        hex_color = LABEL_HEX.get(label_id, _DEFAULT_HEX)
        label_idn = html.escape(LABEL_MAP.get(label_id, label_id), quote=True)
        word = _escape_with_breaks(text[s:e])
        parts.append(
            f'<span title="{label_idn}" style="background:{hex_color};color:#0f172a;'
            f'padding:1px 4px;border-radius:4px;">{word}</span>'
        )
        cursor = e
    if cursor < len(text):
        parts.append(_escape_with_breaks(text[cursor:]))

    return (
        '<div style="background:#ffffff;color:#1e293b;padding:16px;'
        'border-radius:10px;line-height:1.8;font-size:14px;">'
        + "".join(parts)
        + "</div>"
    )


# NER on pasted text
def run_ner(text: str):
    """Run NER on *text* and return an HTML table of the distinct entities.

    Entities are de-duplicated case-insensitively (first occurrence wins).
    Returns a placeholder message when the input is blank or nothing is found.
    """
    if not text or not text.strip():
        return _message_html("Masukkan teks terlebih dahulu.")

    results = _run_ner_chunked(text.strip())
    if not results:
        return _message_html("Tidak ada entitas yang ditemukan.")

    cell = "padding:8px 12px;color:#1e293b;"
    rows = []
    seen_words = set()
    for ent in results:
        raw_label = ent["entity_group"]
        label_id = get_label_id(raw_label)
        label_idn = LABEL_MAP.get(label_id, raw_label)
        word = clean_word(ent["word"])
        if not word:
            continue
        word_key = word.lower()
        if word_key in seen_words:
            continue
        seen_words.add(word_key)

        hex_color = LABEL_HEX.get(label_id, _DEFAULT_HEX)
        score = f"{ent['score']:.2%}"
        row_num = len(rows) + 1
        row_bg = "#f8faff" if row_num % 2 == 0 else "#ffffff"
        rows.append(
            f'<tr style="background:{row_bg};">'
            f'<td style="{cell}">{row_num}</td>'
            f'<td style="{cell}">{html.escape(word)}</td>'
            f'<td style="{cell}"><span style="background:{hex_color};'
            f'padding:2px 8px;border-radius:6px;">{html.escape(label_idn)}</span></td>'
            f'<td style="{cell}">{score}</td>'
            "</tr>"
        )

    if not rows:
        return _message_html("Tidak ada entitas yang ditemukan.")

    head = "padding:10px 12px;text-align:left;color:#ffffff;background:#1e40af;"
    rows_html = "".join(rows)
    return (
        '<table style="width:100%;border-collapse:collapse;font-size:14px;">'
        "<thead><tr>"
        f'<th style="{head}">NO</th>'
        f'<th style="{head}">KATA / FRASA</th>'
        f'<th style="{head}">ENTITAS</th>'
        f'<th style="{head}">SKOR</th>'
        "</tr></thead>"
        f"<tbody>{rows_html}</tbody>"
        "</table>"
    )


def _badge_html(label_id: str) -> str:
    """Return one coloured legend badge for *label_id*."""
    hex_color = LABEL_HEX.get(label_id, _DEFAULT_HEX)
    label_idn = html.escape(LABEL_MAP.get(label_id, label_id))
    return (
        f'<span style="background:{hex_color};color:#0f172a;padding:4px 10px;'
        f'border-radius:999px;font-size:12px;font-weight:600;">{label_idn}</span>'
    )


def run_ner_file(upload_file):
    """Run NER on an uploaded .txt or .pdf file.

    Args:
        upload_file: A file path string, or an object exposing ``.name``.

    Returns:
        ``(highlighted_html, legend_html)``. On any problem (no file, too many
        pages, unsupported extension, no entities) the first item is a message
        and the second is an empty string.
    """
    if upload_file is None:
        return _message_html("Unggah file terlebih dahulu."), ""

    file_path = upload_file if isinstance(upload_file, str) else upload_file.name
    ext = os.path.splitext(file_path)[-1].lower()

    if ext == ".txt":
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            full_text = f.read()
    elif ext == ".pdf":
        # Context manager guarantees the document is closed even if text
        # extraction raises.
        with fitz.open(file_path) as doc:
            page_count = len(doc)
            if page_count > MAX_PDF_PAGES:
                return (
                    _message_html(
                        f"PDF terlalu banyak halaman "
                        f"({page_count}). Maks {MAX_PDF_PAGES} halaman."
                    ),
                    "",
                )
            full_text = "\n\n".join(page.get_text() for page in doc)
    else:
        return _message_html("Format tidak didukung."), ""

    if not full_text.strip():
        return _message_html("Tidak ada entitas ditemukan."), ""

    # NER over overlapping chunks
    all_ner_results = _run_ner_chunked(full_text)
    if not all_ner_results:
        return _message_html("Tidak ada entitas ditemukan."), ""

    # Build entity_map (first label seen for a given surface form wins).
    entity_map: dict[str, str] = {}
    for ent in all_ner_results:
        word = clean_word(ent["word"])
        if len(word) < 2:  # single characters would match all over the text
            continue
        entity_map.setdefault(word.lower(), get_label_id(ent["entity_group"]))

    if not entity_map:
        return _message_html("Tidak ada entitas ditemukan."), ""

    highlighted = highlight_html(full_text, entity_map)

    # Badge legend: one badge per label actually present in the document.
    found_labels = set(entity_map.values())
    badges = "".join(_badge_html(label_id) for label_id in sorted(found_labels))
    legend_html = f'<div style="display:flex;flex-wrap:wrap;gap:8px;">{badges}</div>'
    return highlighted, legend_html


# CSS
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@700;900&family=DM+Sans:wght@400;500;600&display=swap');
.gradio-container{max-width:100%!important;padding:0!important;background:linear-gradient(160deg,#0f172a 0%,#1e1b4b 40%,#0f172a 100%)!important;min-height:100vh;}
#hero-header{background:linear-gradient(135deg,#1e40af 0%,#6d28d9 50%,#be185d 100%);padding:40px 48px 36px;margin:0 0 24px;border-radius:16px;text-align:center;position:relative;overflow:hidden;}
#hero-header::before{content:'';position:absolute;inset:0;background:url("data:image/svg+xml,%3Csvg width='60' height='60' viewBox='0 0 60 60' xmlns='http://www.w3.org/2000/svg'%3E%3Cg fill='none' fill-rule='evenodd'%3E%3Cg fill='%23ffffff' fill-opacity='0.04'%3E%3Cpath d='M36 34v-4h-2v4h-4v2h4v4h2v-4h4v-2h-4zm0-30V0h-2v4h-4v2h4v4h2V6h4V4h-4zM6 34v-4H4v4H0v2h4v4h2v-4h4v-2H6zM6 4V0H4v4H0v2h4v4h2V6h4V4H6z'/%3E%3C/g%3E%3C/g%3E%3C/svg%3E");}
.tab-nav{background:rgba(255,255,255,0.05)!important;border-radius:12px!important;padding:4px!important;border:1px solid rgba(255,255,255,0.1)!important;margin:0!important;}
.tab-nav button{background:transparent!important;color:#94a3b8!important;border-radius:8px!important;padding:10px 24px!important;font-family:'DM Sans',sans-serif!important;font-weight:600!important;font-size:14px!important;transition:all 0.2s!important;}
.tab-nav button.selected{background:linear-gradient(135deg,#1e40af,#6d28d9)!important;color:#ffffff!important;box-shadow:0 4px 12px rgba(109,40,217,0.4)!important;}
label span{color:#cbd5e1!important;font-family:'DM Sans',sans-serif!important;font-weight:600!important;font-size:12px!important;letter-spacing:0.07em!important;text-transform:uppercase!important;}
textarea,.gr-textbox textarea{background:rgba(15,23,42,0.7)!important;border:1.5px solid rgba(255,255,255,0.12)!important;border-radius:10px!important;color:#e2e8f0!important;font-family:'DM Sans',sans-serif!important;font-size:14px!important;padding:14px!important;transition:border 0.2s!important;}
textarea:focus{border-color:#6d28d9!important;outline:none!important;}
.example-btn{display:block!important;width:100%!important;text-align:left!important;padding:12px 16px!important;margin-bottom:8px!important;background:rgba(30,64,175,0.15)!important;border:1px solid rgba(99,102,241,0.30)!important;border-radius:10px!important;cursor:pointer!important;font-size:13px!important;line-height:1.6!important;color:#cbd5e1!important;white-space:normal!important;height:auto!important;font-family:'DM Sans',sans-serif!important;transition:all 0.2s!important;}
.example-btn:hover{background:rgba(109,40,217,0.25)!important;border-color:#6d28d9!important;color:#e2e8f0!important;transform:translateX(3px);}
#analyze-btn,#analyze-pdf-btn{background:linear-gradient(135deg,#1e40af,#6d28d9)!important;color:white!important;font-weight:700!important;font-size:15px!important;border-radius:10px!important;padding:12px 0!important;margin-top:8px!important;font-family:'DM Sans',sans-serif!important;letter-spacing:0.03em!important;box-shadow:0 4px 16px rgba(109,40,217,0.35)!important;transition:all 0.2s!important;border:none!important;}
#analyze-btn:hover,#analyze-pdf-btn:hover{transform:translateY(-2px)!important;box-shadow:0 6px 24px rgba(109,40,217,0.50)!important;}
.section-heading{font-family:'DM Sans',sans-serif;font-weight:700;font-size:11px;letter-spacing:0.12em;text-transform:uppercase;color:#ffffff!important;margin-bottom:12px;display:flex;align-items:center;gap:8px;}
.section-heading::before{content:'';display:inline-block;width:18px;height:2px;background:linear-gradient(90deg,#6d28d9,#be185d);border-radius:2px;}
.gr-file{background:rgba(15,23,42,0.7)!important;border:1.5px dashed rgba(99,102,241,0.40)!important;border-radius:10px!important;color:#94a3b8!important;}
.gr-file-download{background:rgba(30,64,175,0.2)!important;border:1px solid rgba(99,102,241,0.4)!important;border-radius:10px!important;color:#a5b4fc!important;font-family:'DM Sans',sans-serif!important;}
#center-col{max-width:780px!important;margin:0 auto!important;width:100%!important;padding:0 8px!important;}
#footer{text-align:center;padding:20px;color:rgba(148,163,184,0.5);font-family:'DM Sans',sans-serif;font-size:12px;letter-spacing:0.04em;}
.gradio-container h3{color:#e2e8f0!important;}
#center-col p{color:#94a3b8!important;}
"""

HERO_HTML = """
<div id="hero-header">
  <div style="font-family:'DM Sans',sans-serif;font-size:12px;font-weight:600;letter-spacing:0.12em;text-transform:uppercase;color:rgba(255,255,255,0.8);position:relative;">
    Tugas Kelompok · NLP &amp; Text Mining
  </div>
  <h1 style="font-family:'Playfair Display',serif;font-weight:900;font-size:36px;color:#ffffff;margin:12px 0;position:relative;">
    NER for Financial Statements
  </h1>
  <p style="font-family:'DM Sans',sans-serif;font-size:15px;color:rgba(255,255,255,0.85);margin:0 0 16px;position:relative;">
    Implementasi Named Entity Recognition pada Kumpulan<br>
    Laporan-laporan Keuangan Bahasa Indonesia
  </p>
  <div style="font-family:'DM Sans',sans-serif;font-size:12px;font-weight:600;letter-spacing:0.08em;color:rgba(255,255,255,0.9);position:relative;">
    MODEL cahya/NusaBert-ner-v1.3
  </div>
</div>
"""


def _section_heading(title: str) -> str:
    """Return the markup for a section heading styled by ``.section-heading``."""
    return f'<div class="section-heading">{title}</div>'


# Gradio UI
with gr.Blocks(title="NER for Financial Statements") as demo:
    gr.HTML(HERO_HTML)

    with gr.Tabs(elem_classes="tab-nav"):
        with gr.Tab("Analisis Teks"):
            with gr.Column(elem_id="center-col"):
                gr.HTML(_section_heading("Contoh Teks"))
                example_btns = []
                for title, body in EXAMPLES:
                    btn = gr.Button(f"📌 {title}\n\n{body}", elem_classes="example-btn")
                    example_btns.append((btn, body))

                gr.HTML(_section_heading("Input Teks"))
                text_input = gr.Textbox(
                    lines=9,
                    placeholder="Ketik atau tempel teks laporan keuangan di sini…",
                    label="",
                    show_label=False,
                )
                analyze_btn = gr.Button("Lakukan Analisis", elem_id="analyze-btn")

                gr.HTML(_section_heading("Hasil Analisis Entitas"))
                text_output = gr.HTML(
                    value=_message_html("Masukkan teks lalu klik Lakukan Analisis.")
                )

        with gr.Tab("Analisis File"):
            with gr.Column(elem_id="center-col"):
                # Page limit is taken from MAX_PDF_PAGES so the hint cannot
                # drift from the check in run_ner_file.
                gr.HTML(
                    _message_html(
                        f"Unggah file .pdf (maks {MAX_PDF_PAGES} halaman) atau .txt."
                    )
                )
                gr.HTML(_section_heading("Unggah File"))
                pdf_input = gr.File(label="", file_types=[".pdf", ".txt"], type="filepath")
                analyze_pdf_btn = gr.Button(
                    "Analisis & Highlight Entitas", elem_id="analyze-pdf-btn"
                )

                gr.HTML(_section_heading("Entitas Ditemukan"))
                pdf_legend = gr.HTML(value="")
                gr.HTML(_section_heading("Teks Ter-highlight"))
                pdf_output = gr.HTML(value="")

    # NOTE(review): presumably the #footer element; its content was not
    # visible in the reviewed copy, so it is left empty - restore from VCS.
    gr.HTML('')

    # Wiring
    for btn, body in example_btns:
        # Bind body as a default to avoid the late-binding closure pitfall.
        btn.click(fn=lambda b=body: b, inputs=[], outputs=text_input)

    analyze_btn.click(fn=run_ner, inputs=text_input, outputs=text_output)
    text_input.submit(fn=run_ner, inputs=text_input, outputs=text_output)
    analyze_pdf_btn.click(
        fn=run_ner_file,
        inputs=pdf_input,
        outputs=[pdf_output, pdf_legend],
    )


if __name__ == "__main__":
    demo.launch(css=CUSTOM_CSS)