# NOTE: Hugging Face Spaces page chrome ("Spaces: Sleeping") removed —
# the app.py source begins below.
| import gradio as gr | |
| from transformers import AutoTokenizer, AutoModelForTokenClassification | |
| import torch | |
| import os | |
# SINGGALANG NER tagset. The order is significant: it must match the label
# order used during training (O first, then each BIO pair, as in the TSV).
LABEL_LIST = ['O', 'B-Person', 'I-Person', 'B-Place', 'I-Place', 'B-Organisation', 'I-Organisation']
LABEL_TO_ID = {label: idx for idx, label in enumerate(LABEL_LIST)}
ID_TO_LABEL = dict(enumerate(LABEL_LIST))

# Backbone checkpoint and location of the fine-tuned weight file.
BASE_MODEL = "ChristopherA08/IndoELECTRA"
MODEL_PATH = "./model.pth"
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Instantiate a token-classification head on the base encoder, wired to the
# same label inventory (count + id<->label maps) the checkpoint was trained with.
model = AutoModelForTokenClassification.from_pretrained(
    BASE_MODEL,
    num_labels=len(LABEL_LIST),
    label2id=LABEL_TO_ID,
    id2label=ID_TO_LABEL
)

# Resize embeddings to the tokenizer's vocab size (important: a size mismatch
# would make the state-dict load below fail).
model.resize_token_embeddings(len(tokenizer))

# Restore the fine-tuned weights from the .pth state dict, if present.
if os.path.exists(MODEL_PATH):
    print(f"Loading weights from {MODEL_PATH}...")
    # weights_only=True: the file is a plain state dict, so refuse to unpickle
    # arbitrary objects — plain torch.load on an untrusted checkpoint can
    # execute arbitrary code during unpickling.
    state_dict = torch.load(MODEL_PATH, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict)
    print(" Model loaded successfully!")
else:
    print(f" Warning: {MODEL_PATH} not found. Using base model.")
def predict_ner(text):
    """Run NER over *text* and return the word/label pairs as markdown."""
    if not text.strip():
        return " Masukkan teks terlebih dahulu!"

    # Whitespace pre-tokenization so predictions can be mapped back to words.
    words = text.split()
    encoding = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True, max_length=128)

    # Forward pass without gradient tracking; take the argmax label per token.
    with torch.no_grad():
        logits = model(**encoding).logits
    pred_ids = torch.argmax(logits, dim=-1).squeeze().tolist()
    if isinstance(pred_ids, int):
        pred_ids = [pred_ids]

    # Keep one prediction per word: the first sub-token of each word
    # (special tokens map to None and are dropped).
    labelled = []
    last_word = None
    for pos, widx in enumerate(encoding.word_ids()):
        if widx is not None and widx != last_word:
            labelled.append((words[widx], model.config.id2label[pred_ids[pos]]))
            last_word = widx

    # Render one numbered markdown line per word.
    rendered = [f"**{num}.** `{word}` → **{tag}**\n\n" for num, (word, tag) in enumerate(labelled, 1)]
    return "".join(rendered)
# Example texts are disabled for now (tokenizer/model vocab size issue).
examples = None

# Assemble the UI with gr.Blocks for finer layout control than gr.Interface.
with gr.Blocks(title="IndoELECTRA NER") as demo:
    gr.Markdown("# IndoELECTRA Named Entity Recognition")
    gr.Markdown("Model NER untuk Bahasa Indonesia | Dataset: SINGGALANG | Base Model: ChristopherA08/IndoELECTRA")

    with gr.Row():
        text_in = gr.Textbox(
            lines=3,
            placeholder="Contoh: Doni sedang menempuh pendidikan di ITERA",
            label="Input Teks",
        )
    with gr.Row():
        run_button = gr.Button("Submit", variant="primary")
        reset_button = gr.Button("Clear")
    with gr.Row():
        result_md = gr.Markdown(label="Hasil NER")

    gr.Markdown("---\n© 2025 NLP Project")

    # Wire the buttons: run the NER pipeline, or blank out input and output.
    run_button.click(fn=predict_ner, inputs=text_in, outputs=result_md)
    reset_button.click(fn=lambda: ("", ""), inputs=None, outputs=[text_in, result_md])

if __name__ == "__main__":
    demo.launch()