# Update app.py (commit 46a7372) — HuggingFace web-UI header lines
# ("ecaaa09's picture / Update app.py / 46a7372") converted to a comment
# so this file parses as valid Python.
import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import os
# Label mapping — the order MUST match training!
# Order comes from the TSV file: O appears first, then B-Person, I-Person,
# B-Place, I-Place, B-Organisation, I-Organisation.
LABEL_LIST = ['O', 'B-Person', 'I-Person', 'B-Place', 'I-Place', 'B-Organisation', 'I-Organisation']
LABEL_TO_ID = {label: idx for idx, label in enumerate(LABEL_LIST)}
ID_TO_LABEL = dict(enumerate(LABEL_LIST))
# Base checkpoint on the HF Hub and the local fine-tuned weights file.
BASE_MODEL = "ChristopherA08/IndoELECTRA"
MODEL_PATH = "./model.pth"
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Load the model architecture with the NER label mapping attached, so that
# model.config.id2label can translate predicted ids back to tag strings.
model = AutoModelForTokenClassification.from_pretrained(
    BASE_MODEL,
    num_labels=len(LABEL_LIST),
    label2id=LABEL_TO_ID,
    id2label=ID_TO_LABEL,
)

# Resize embeddings (important!) — the fine-tuned checkpoint expects this
# tokenizer's vocab size, which may differ from the base checkpoint's.
model.resize_token_embeddings(len(tokenizer))

# Load fine-tuned weights from the local .pth file, if present.
if os.path.exists(MODEL_PATH):
    print(f"Loading weights from {MODEL_PATH}...")
    # weights_only=True restricts torch.load to tensor deserialization and
    # avoids arbitrary-code pickle execution — the safe way to load a
    # plain state_dict (and the default in recent torch releases).
    state_dict = torch.load(MODEL_PATH, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict)
    print(" Model loaded successfully!")
else:
    print(f" Warning: {MODEL_PATH} not found. Using base model.")

# Inference only: make eval mode explicit (disables dropout).
model.eval()
def predict_ner(text):
    """Run named-entity recognition on a whitespace-tokenized Indonesian text.

    Returns a Markdown string with one numbered ``token → label`` entry per
    input word, or a prompt message when the input is empty/blank.
    """
    if not text.strip():
        return " Masukkan teks terlebih dahulu!"

    # Pre-split on whitespace so predictions can be mapped back to words.
    words = text.split()
    encoded = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=128,
    )

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**encoded).logits
    pred_ids = torch.argmax(logits, dim=-1).squeeze().tolist()
    if isinstance(pred_ids, int):  # squeeze() collapses a single token to a scalar
        pred_ids = [pred_ids]

    # Keep only the first sub-token of each word; skip special tokens (None)
    # and continuation sub-tokens (same word index as the previous position).
    labeled = []
    last_word = None
    for pos, word_pos in enumerate(encoded.word_ids()):
        if word_pos is None or word_pos == last_word:
            continue
        labeled.append((words[word_pos], model.config.id2label[pred_ids[pos]]))
        last_word = word_pos

    # Vertical Markdown output, numbered from 1.
    lines = [
        f"**{num}.** `{word}` → **{tag}**\n\n"
        for num, (word, tag) in enumerate(labeled, 1)
    ]
    return "".join(lines)
# Example texts — DISABLED for now because of a vocab-size issue.
examples = None

# Build the Gradio UI with Blocks for finer layout control.
with gr.Blocks(title="IndoELECTRA NER") as demo:
    gr.Markdown("# IndoELECTRA Named Entity Recognition")
    gr.Markdown("Model NER untuk Bahasa Indonesia | Dataset: SINGGALANG | Base Model: ChristopherA08/IndoELECTRA")

    with gr.Row():
        input_text = gr.Textbox(
            lines=3,
            placeholder="Contoh: Doni sedang menempuh pendidikan di ITERA",
            label="Input Teks",
        )
    with gr.Row():
        submit_btn = gr.Button("Submit", variant="primary")
        clear_btn = gr.Button("Clear")
    with gr.Row():
        output_text = gr.Markdown(label="Hasil NER")

    gr.Markdown("---\n© 2025 NLP Project")

    # Wire the buttons: Submit runs NER, Clear empties both input and output.
    submit_btn.click(fn=predict_ner, inputs=input_text, outputs=output_text)
    clear_btn.click(fn=lambda: ("", ""), inputs=None, outputs=[input_text, output_text])

if __name__ == "__main__":
    demo.launch()