project1 / app.py
Asanaly
Update app.py
70a40ec verified
raw
history blame
3.21 kB
# app.py
import gradio as gr
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification
# ============================
# 1. Model and tokenizer
# ============================
# Label set for Kazakh NER in BIO form. The position of a label in this list
# is the class id it is looked up by when decoding model predictions.
label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

model_checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# The head size is derived from label_list so the two can never drift apart.
# NOTE(review): this is the base multilingual checkpoint rather than one
# fine-tuned for NER, so the token-classification head is presumably freshly
# initialised -- confirm whether a fine-tuned checkpoint should be loaded here.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
)
# ============================
# 2. NER function (merges multi-word entities)
# ============================
def predict_ner_entities(text):
    """Tag whitespace-separated words and merge them into named entities.

    The text is split on whitespace, every word is classified with the
    module-level ``model``/``tokenizer``, and consecutive ``B-X``/``I-X``
    words are joined into a single entity.

    Args:
        text: Raw input string.

    Returns:
        A list of ``(entity_text, entity_type)`` tuples in order of
        appearance, where ``entity_type`` is the label without its ``B-``/
        ``I-`` prefix. Empty or whitespace-only input yields ``[]``.

    Note:
        Input longer than the tokenizer's maximum length is truncated, so
        words past that limit are not tagged.
    """
    words = text.split()
    if not words:
        # Nothing to tag; also avoids handing the tokenizer an empty word list.
        return []

    encoding = tokenizer(
        words,
        return_tensors="pt",
        is_split_into_words=True,
        truncation=True,  # over-long input would otherwise fail in the model
    )
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(**encoding).logits
    predictions = logits.argmax(dim=-1)[0].tolist()
    word_ids = encoding.word_ids(batch_index=0)

    entities = []
    current_entity = []
    current_label = None
    previous_word_idx = None
    for token_idx, word_idx in enumerate(word_ids):
        # Skip special tokens (word id None) and every sub-token after the
        # first one of a word: each word must be labelled exactly once,
        # otherwise a word split into several pieces is added repeatedly.
        if word_idx is None or word_idx == previous_word_idx:
            continue
        previous_word_idx = word_idx

        label = label_list[predictions[token_idx]]
        word = words[word_idx]

        if label.startswith("B-"):
            # A new entity starts; flush the one that was being built.
            if current_entity:
                entities.append((" ".join(current_entity), current_label))
            current_entity = [word]
            current_label = label[2:]
        elif label.startswith("I-") and current_label == label[2:]:
            # Continuation of the entity currently being built.
            current_entity.append(word)
        else:
            # "O", or an "I-" tag that does not continue the open entity.
            if current_entity:
                entities.append((" ".join(current_entity), current_label))
            current_entity = []
            current_label = None

    # Flush an entity that runs up to the end of the input.
    if current_entity:
        entities.append((" ".join(current_entity), current_label))
    return entities
# ============================
# 3. Formatting - plain-text output
# ============================
def format_ner(text):
    """Render the entities found in ``text`` as plain text, one line per type.

    Entities returned by ``predict_ner_entities`` are grouped under PER, ORG
    and LOC (in that order). Each non-empty group becomes a line of the form
    ``"<TYPE>: <entity>; <entity>"``. A fixed message is returned when no
    entities were found.
    """
    found = predict_ner_entities(text)
    if not found:
        return "Атаулар табылған жоқ"

    grouped = {"PER": [], "ORG": [], "LOC": []}
    for phrase, tag in found:
        # First category whose name the tag ends with receives the phrase.
        for category in grouped:
            if tag.endswith(category):
                grouped[category].append(phrase)
                break

    lines = [
        f"{category}: {'; '.join(phrases)}"
        for category, phrases in grouped.items()
        if phrases
    ]
    return "\n".join(lines).strip()
# ============================
# 4. Gradio interface
# ============================
# One multi-line text box in, one text box out, wired to format_ner.
iface = gr.Interface(
    title="Қазақ тіліндегі NER",
    description="PER – адам, ORG – ұйым, LOC – орын. Бірнеше сөйлемді бірден өңдейді.",
    fn=format_ner,
    inputs=gr.Textbox(
        placeholder="Қазақ мәтінін осында енгізіңіз...",
        lines=15,
    ),
    outputs=gr.Textbox(label="Анықталған атаулар"),
)
# ============================
# 5. Launch
# ============================
iface.launch()