| import os
|
| import sys
|
| import subprocess
|
| import numpy as np
|
| from datasets import load_dataset
|
| from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
|
| import torch
|
| import gradio as gr
|
| import pandas as pd
|
|
|
# Select the compute device: prefer CUDA when a GPU is available, else CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"Menggunakan perangkat: {device}")
|
|
|
|
|
# Load the IndoNLU NERGrit corpus; abort the app if it cannot be fetched.
try:
    dataset = load_dataset("indonlp/indonlu", "nergrit", trust_remote_code=True)
except Exception as e:
    print(f"Gagal memuat dataset: {e}")
    sys.exit(1)

# Sanity-check the expected splits before using the dataset.
if any(split not in dataset for split in ("train", "test")):
    print("Dataset tidak memiliki split train/test yang diharapkan.")
    sys.exit(1)

# Sanity-check the expected columns on the training split.
train_columns = dataset["train"].column_names
if "tokens" not in train_columns or "ner_tags" not in train_columns:
    print("Dataset tidak memiliki kolom 'tokens' atau 'ner_tags'.")
    sys.exit(1)
|
|
|
|
|
# Derive the label vocabulary and the id<->label lookup tables from the
# dataset's feature schema; abort if the schema is not what we expect.
try:
    label_list = dataset["train"].features["ner_tags"].feature.names
    id2label = dict(enumerate(label_list))
    label2id = {name: idx for idx, name in enumerate(label_list)}
except Exception as e:
    print(f"Gagal mendapatkan label: {e}")
    sys.exit(1)
|
|
|
|
|
# Load the fine-tuned tokenizer and token-classification model from disk,
# then move the model onto the selected device. Abort with a hint if the
# model directory is missing or incomplete.
try:
    model_dir = "./ner_model"
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForTokenClassification.from_pretrained(
        model_dir,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
    )
    model.to(device)
except Exception as e:
    print(f"Gagal memuat model atau tokenizer dari './ner_model': {e}")
    print("Pastikan folder './ner_model' ada dan berisi model yang telah dilatih.")
    sys.exit(1)
|
|
|
|
|
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align NER labels to subword tokens.

    Only the first subword of each word keeps the word's label; special
    tokens and continuation subwords get -100 so the loss ignores them.
    Returns the tokenizer output with an added "labels" field.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        word_ids = encoded.word_ids(batch_index=batch_idx)
        sample_labels = []
        prev_word_idx = None
        for word_idx in word_ids:
            # Label only the first subword of each word; mask everything else.
            if word_idx is not None and word_idx != prev_word_idx:
                sample_labels.append(word_labels[word_idx])
            else:
                sample_labels.append(-100)
            prev_word_idx = word_idx
        aligned_labels.append(sample_labels)
    encoded["labels"] = aligned_labels
    return encoded
|
|
|
|
|
# Tokenize the entire dataset with label alignment (batched for speed).
# NOTE(review): `tokenized_dataset` is not referenced anywhere later in this
# file — the Gradio app below only calls `predict_entities`. Kept as-is in
# case a later training/evaluation step consumes it; confirm before removing.
try:
    tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
except Exception as e:
    print(f"Gagal menokenisasi dataset: {e}")
    sys.exit(1)
|
|
|
|
|
def predict_entities(input_text):
    """Run NER inference on `input_text` and return a (Token, Entity) table.

    Always returns a pandas DataFrame. The original returned a bare string
    for empty input, but this function is wired to a gr.Dataframe output
    component, which cannot render a plain string — so the guidance message
    is now wrapped in a one-row DataFrame instead.

    Parameters:
        input_text: raw Indonesian text typed by the user.

    Returns:
        pd.DataFrame with columns "Token" and "Entity", one row per
        non-special token (subword pieces like "##do" appear as rows).
    """
    if not input_text.strip():
        # Empty/whitespace input: show the guidance message in the table.
        return pd.DataFrame([{"Token": "Masukkan teks untuk diprediksi.", "Entity": ""}])

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Inference only: eval mode and no gradient tracking.
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    predictions = outputs.logits.argmax(dim=2)[0].cpu().numpy()

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    # Cast numpy ints to Python ints before the id2label lookup.
    labels = [id2label[int(pred)] for pred in predictions]

    # Drop BERT special tokens; keep subword pieces, matching the UI legend.
    special_tokens = {"[CLS]", "[SEP]"}
    rows = [
        {"Token": token, "Entity": label}
        for token, label in zip(tokens, labels)
        if token not in special_tokens
    ]
    return pd.DataFrame(rows)
|
|
|
|
|
# Build the Gradio interface: title, label legend, input row with action
# buttons, a results table, example texts, an ethics note, and event wiring.
with gr.Blocks() as demo:
    gr.Markdown("# Named Entity Recognition (NER) dengan IndoBERT")
    gr.Markdown("Masukkan teks dalam bahasa Indonesia untuk mendeteksi entitas seperti PERSON, ORGANISATION, PLACE, dll.")

    # Legend for the BIO tagging scheme so users can read the output table.
    gr.Markdown("## Keterangan Label Entitas")
    gr.Markdown("""
- **O**: Token bukan entitas (contoh: "dan", "mengunjungi").
- **B-PERSON**: Awal nama orang (contoh: "Joko" dalam "Joko Widodo").
- **I-PERSON**: Lanjutan nama orang (contoh: "Widodo" atau "##do" dalam "Joko Widodo").
- **B-PLACE**: Awal nama tempat (contoh: "Bali").
- **I-PLACE**: Lanjutan nama tempat (contoh: "Indonesia" dalam "Bali, Indonesia").
""")

    # Input row: multi-line textbox plus the predict/clear buttons.
    # NOTE(review): original indentation was lost in extraction; buttons are
    # assumed to sit inside the Row next to the textbox — confirm layout.
    with gr.Row():
        text_input = gr.Textbox(
            label="Masukkan Teks",
            placeholder="Contoh: Joko Widodo menghadiri acara di Universitas Indonesia pada tanggal 14 Juni 2025",
            lines=3
        )
        submit_button = gr.Button("Prediksi")
        clear_button = gr.Button("Bersihkan")

    # Token/entity rows produced by predict_entities are rendered here.
    output_table = gr.Dataframe(label="Hasil Prediksi")

    gr.Markdown("## Contoh Teks")
    gr.Markdown("- SBY berkunjung ke Bali bersama Jokowi.\n- Universitas Gadjah Mada menyelenggarakan seminar pada 10 Maret 2025.")

    gr.Markdown("## Pertimbangan Keamanan Data, Privasi, dan Etika")
    gr.Markdown("""
- **Keamanan Data**: Dataset bersumber dari berita publik, tidak mengandung informasi sensitif seperti alamat atau nomor identitas.
- **Privasi**: Input pengguna tidak disimpan, menjaga privasi.
- **Etika AI**: Dataset mencakup berbagai topik berita (politik, olahraga, budaya), mengurangi risiko bias terhadap entitas tertentu.
""")

    # Event wiring: predict fills the table; clear resets only the textbox.
    submit_button.click(fn=predict_entities, inputs=text_input, outputs=output_table)
    clear_button.click(fn=lambda: "", inputs=None, outputs=text_input)

# Start the web server (blocking call).
demo.launch()