# Update app.py (commit 46a7372) — HuggingFace web-UI header lines
# ("ecaaa09's picture / Update app.py / 46a7372") converted to a comment
# so this file parses as valid Python.
import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import os
# Label mapping — the order MUST match training!
# Order comes from the TSV file: O appears first, then B-Person, I-Person,
# B-Place, I-Place, B-Organisation, I-Organisation.
LABEL_LIST = ['O', 'B-Person', 'I-Person', 'B-Place', 'I-Place', 'B-Organisation', 'I-Organisation']
LABEL_TO_ID = {label: idx for idx, label in enumerate(LABEL_LIST)}
ID_TO_LABEL = dict(enumerate(LABEL_LIST))
# Base checkpoint on the HF Hub and the local fine-tuned weights file.
BASE_MODEL = "ChristopherA08/IndoELECTRA"
MODEL_PATH = "./model.pth"
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Load the model architecture with the NER label mapping attached, so that
# model.config.id2label can translate predicted ids back to tag strings.
model = AutoModelForTokenClassification.from_pretrained(
    BASE_MODEL,
    num_labels=len(LABEL_LIST),
    label2id=LABEL_TO_ID,
    id2label=ID_TO_LABEL,
)

# Resize embeddings (important!) — the fine-tuned checkpoint expects this
# tokenizer's vocab size, which may differ from the base checkpoint's.
model.resize_token_embeddings(len(tokenizer))

# Load fine-tuned weights from the local .pth file, if present.
if os.path.exists(MODEL_PATH):
    print(f"Loading weights from {MODEL_PATH}...")
    # weights_only=True restricts torch.load to tensor deserialization and
    # avoids arbitrary-code pickle execution — the safe way to load a
    # plain state_dict (and the default in recent torch releases).
    state_dict = torch.load(MODEL_PATH, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict)
    print(" Model loaded successfully!")
else:
    print(f" Warning: {MODEL_PATH} not found. Using base model.")

# Inference only: make eval mode explicit (disables dropout).
model.eval()
def predict_ner(text):
    """Run named-entity recognition on a whitespace-tokenized Indonesian text.

    Returns a Markdown string with one numbered ``token → label`` entry per
    input word, or a prompt message when the input is empty/blank.
    """
    if not text.strip():
        return " Masukkan teks terlebih dahulu!"

    # Pre-split on whitespace so predictions can be mapped back to words.
    words = text.split()
    encoded = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=128,
    )

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**encoded).logits
    pred_ids = torch.argmax(logits, dim=-1).squeeze().tolist()
    if isinstance(pred_ids, int):  # squeeze() collapses a single token to a scalar
        pred_ids = [pred_ids]

    # Keep only the first sub-token of each word; skip special tokens (None)
    # and continuation sub-tokens (same word index as the previous position).
    labeled = []
    last_word = None
    for pos, word_pos in enumerate(encoded.word_ids()):
        if word_pos is None or word_pos == last_word:
            continue
        labeled.append((words[word_pos], model.config.id2label[pred_ids[pos]]))
        last_word = word_pos

    # Vertical Markdown output, numbered from 1.
    lines = [
        f"**{num}.** `{word}` → **{tag}**\n\n"
        for num, (word, tag) in enumerate(labeled, 1)
    ]
    return "".join(lines)
# Example texts — DISABLED for now because of a vocab-size issue.
examples = None

# Build the Gradio UI with Blocks for finer layout control.
with gr.Blocks(title="IndoELECTRA NER") as demo:
    gr.Markdown("# IndoELECTRA Named Entity Recognition")
    gr.Markdown("Model NER untuk Bahasa Indonesia | Dataset: SINGGALANG | Base Model: ChristopherA08/IndoELECTRA")

    with gr.Row():
        input_text = gr.Textbox(
            lines=3,
            placeholder="Contoh: Doni sedang menempuh pendidikan di ITERA",
            label="Input Teks",
        )
    with gr.Row():
        submit_btn = gr.Button("Submit", variant="primary")
        clear_btn = gr.Button("Clear")
    with gr.Row():
        output_text = gr.Markdown(label="Hasil NER")

    gr.Markdown("---\n© 2025 NLP Project")

    # Wire the buttons: Submit runs NER, Clear empties both input and output.
    submit_btn.click(fn=predict_ner, inputs=input_text, outputs=output_text)
    clear_btn.click(fn=lambda: ("", ""), inputs=None, outputs=[input_text, output_text])

if __name__ == "__main__":
    demo.launch()