"""Gradio demo: Indonesian Named Entity Recognition with IndoELECTRA.

Loads the ChristopherA08/IndoELECTRA base model, applies fine-tuned
weights from ./model.pth (if present), and serves a simple Blocks UI.
"""

import os

import gradio as gr
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Label mapping - the order MUST match training!
# Order taken from the TSV file: O appears first, then B-Person, I-Person,
# B-Place, I-Place, B-Organisation, I-Organisation.
LABEL_LIST = ['O', 'B-Person', 'I-Person', 'B-Place', 'I-Place', 'B-Organisation', 'I-Organisation']
LABEL_TO_ID = {l: i for i, l in enumerate(LABEL_LIST)}
ID_TO_LABEL = {i: l for l, i in LABEL_TO_ID.items()}

# Base model checkpoint and path to the fine-tuned weights.
BASE_MODEL = "ChristopherA08/IndoELECTRA"
MODEL_PATH = "./model.pth"

print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Load the model architecture with a token-classification head sized to our labels.
model = AutoModelForTokenClassification.from_pretrained(
    BASE_MODEL,
    num_labels=len(LABEL_LIST),
    label2id=LABEL_TO_ID,
    id2label=ID_TO_LABEL
)

# Resize embeddings (important! tokenizer vocab may differ from the checkpoint).
model.resize_token_embeddings(len(tokenizer))

# Load fine-tuned weights from the .pth file, if it exists.
if os.path.exists(MODEL_PATH):
    print(f"Loading weights from {MODEL_PATH}...")
    # weights_only=True restricts unpickling to tensors/containers, closing the
    # arbitrary-code-execution hole of loading a full pickle (PyTorch >= 1.13).
    state_dict = torch.load(MODEL_PATH, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict)
    print(" Model loaded successfully!")
else:
    print(f" Warning: {MODEL_PATH} not found. Using base model.")

# Explicitly switch to inference mode (disables dropout) rather than relying
# on from_pretrained's implicit eval state.
model.eval()


def predict_ner(text):
    """
    Run Named Entity Recognition on whitespace-tokenized input text.

    Args:
        text: Raw input string; it is split on whitespace into word tokens
            before subword tokenization (truncated to 128 subword tokens).

    Returns:
        A Markdown string listing each word with its predicted label,
        or a warning message when the input is empty/blank.
    """
    if not text.strip():
        return " Masukkan teks terlebih dahulu!"

    # Tokenize: pre-split on whitespace, then subword-tokenize.
    tokens = text.split()
    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt",
                       truncation=True, max_length=128)

    # Predict a label id for every subword position.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1).squeeze().tolist()
    if isinstance(predictions, int):
        # A single-position sequence squeezes down to a bare int.
        predictions = [predictions]

    # Align subwords back to words: take only the first subword's label
    # for each word; skip special tokens (word_idx is None).
    word_ids = inputs.word_ids()
    results = []
    prev_word = None
    for idx, word_idx in enumerate(word_ids):
        if word_idx is None or word_idx == prev_word:
            continue
        label = model.config.id2label[predictions[idx]]
        results.append((tokens[word_idx], label))
        prev_word = word_idx

    # Format output vertically; join avoids quadratic string concatenation.
    return "".join(
        f"**{i}.** `{token}` → **{label}**\n\n"
        for i, (token, label) in enumerate(results, 1)
    )


# Example texts - DISABLED for now due to a vocab size issue.
examples = None

# Build the Gradio interface with Blocks for finer layout control.
with gr.Blocks(title="IndoELECTRA NER") as demo:
    gr.Markdown("# IndoELECTRA Named Entity Recognition")
    gr.Markdown("Model NER untuk Bahasa Indonesia | Dataset: SINGGALANG | Base Model: ChristopherA08/IndoELECTRA")
    with gr.Row():
        input_text = gr.Textbox(
            lines=3,
            placeholder="Contoh: Doni sedang menempuh pendidikan di ITERA",
            label="Input Teks"
        )
    with gr.Row():
        submit_btn = gr.Button("Submit", variant="primary")
        clear_btn = gr.Button("Clear")
    with gr.Row():
        output_text = gr.Markdown(label="Hasil NER")
    gr.Markdown("---\n© 2025 NLP Project")

    submit_btn.click(fn=predict_ner, inputs=input_text, outputs=output_text)
    # Reset both the input textbox and the rendered Markdown output.
    clear_btn.click(fn=lambda: ("", ""), inputs=None, outputs=[input_text, output_text])

if __name__ == "__main__":
    demo.launch()