Spaces:

student2222333051
/

project1

Sleeping

Asanaly

Update app.py

70a40ec verified 5 months ago

3.21 kB

	# app.py

	import gradio as gr
	import torch
	import numpy as np
	from transformers import AutoTokenizer, AutoModelForTokenClassification

	# ============================
	# 1. Модель мен токенизатор
	# ============================
	# Label set for Kazakh NER (BIO scheme); a predicted class id indexes into this list.
	label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

	model_checkpoint = "bert-base-multilingual-cased"
	tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
	# Size the classification head from the label list so the two cannot drift apart.
	# NOTE(review): this is the base pretrained checkpoint, so the token-classification
	# head appears to be newly initialised rather than fine-tuned for Kazakh NER --
	# confirm whether a fine-tuned checkpoint should be loaded here instead.
	model = AutoModelForTokenClassification.from_pretrained(
	    model_checkpoint, num_labels=len(label_list)
	)

	# ============================
	# 2. NER функциясы (бірнеше сөзді біріктіру)
	# ============================
	def predict_ner_entities(text):
	    """Extract named entities from whitespace-separated text.

	    The text is split on whitespace, each word is tagged with the module-level
	    ``tokenizer``/``model``, and a ``B-`` tag followed by ``I-`` tags of the same
	    type is merged into a single multi-word entity.

	    Args:
	        text: Raw input string.

	    Returns:
	        A list of ``(entity_text, entity_type)`` tuples in order of appearance,
	        where ``entity_type`` is the label with its ``B-``/``I-`` prefix removed.
	        The list is empty when the input contains no words.
	    """
	    words = text.split()
	    if not words:
	        # Nothing to tag; avoid handing an empty word list to the tokenizer.
	        return []

	    # truncation=True clips the input to the tokenizer's maximum length instead of
	    # feeding an over-long sequence to the model; words past the limit are not tagged.
	    encoding = tokenizer(
	        words,
	        return_tensors="pt",
	        is_split_into_words=True,
	        truncation=True,
	    )
	    # Inference only: skip building the autograd graph.
	    with torch.no_grad():
	        logits = model(**encoding).logits
	    predictions = np.argmax(logits.numpy(), axis=2)[0]
	    word_ids = encoding.word_ids(batch_index=0)

	    entities = []
	    current_entity = []
	    current_label = None
	    previous_word_idx = None

	    for token_idx, word_idx in enumerate(word_ids):
	        # Special tokens map to None. Continuation sub-tokens repeat the index of
	        # their word, so only the first sub-token of each word decides its label;
	        # otherwise a word split into several pieces would be added repeatedly.
	        if word_idx is None or word_idx == previous_word_idx:
	            continue
	        previous_word_idx = word_idx

	        label = label_list[predictions[token_idx]]
	        word = words[word_idx]

	        if label.startswith("B-"):
	            # A new entity begins: flush the one in progress, if any.
	            if current_entity:
	                entities.append((" ".join(current_entity), current_label))
	            current_entity = [word]
	            current_label = label[2:]
	        elif label.startswith("I-") and current_label == label[2:]:
	            current_entity.append(word)
	        else:
	            # "O", or an "I-" tag that does not continue the open entity.
	            if current_entity:
	                entities.append((" ".join(current_entity), current_label))
	            current_entity = []
	            current_label = None

	    # Flush an entity that runs to the end of the input.
	    if current_entity:
	        entities.append((" ".join(current_entity), current_label))
	    return entities

	# ============================
	# 3. Форматтау – қарапайым текстпен шығару
	# ============================
	def format_ner(text):
	    """Render the entities found in *text* as plain text grouped by type.

	    Calls ``predict_ner_entities`` and produces one line per non-empty group in
	    the order PER, ORG, LOC, formatted as ``"<TYPE>: <entity>; <entity>"``.

	    Args:
	        text: Raw input string passed through to ``predict_ner_entities``.

	    Returns:
	        The grouped listing with surrounding whitespace stripped, or a fixed
	        "nothing found" message when no entities were returned.
	    """
	    found = predict_ner_entities(text)
	    if not found:
	        return "Атаулар табылған жоқ"

	    grouped = {"PER": [], "ORG": [], "LOC": []}
	    for phrase, tag in found:
	        # The first group whose name the tag ends with receives the phrase.
	        for kind in grouped:
	            if tag.endswith(kind):
	                grouped[kind].append(phrase)
	                break

	    rendered = "".join(
	        f"{kind}: {'; '.join(phrases)}\n"
	        for kind, phrases in grouped.items()
	        if phrases
	    )
	    return rendered.strip()

	# ============================
	# 4. Gradio интерфейсі
	# ============================
	# Input and output widgets for the demo page.
	input_box = gr.Textbox(lines=15, placeholder="Қазақ мәтінін осында енгізіңіз...")
	output_box = gr.Textbox(label="Анықталған атаулар")

	# Wire the formatter to the widgets.
	iface = gr.Interface(
	    fn=format_ner,
	    inputs=input_box,
	    outputs=output_box,
	    title="Қазақ тіліндегі NER",
	    description="PER – адам, ORG – ұйым, LOC – орын. Бірнеше сөйлемді бірден өңдейді.",
	)

	# ============================
	# 5. Launch
	# ============================
	iface.launch()