Spaces:
Sleeping
Sleeping
| # app.py | |
| import gradio as gr | |
| import torch | |
| import numpy as np | |
| from transformers import AutoTokenizer, AutoModelForTokenClassification | |
| # ============================ | |
| # 1. Модель мен токенизатор | |
| # ============================ | |
model_checkpoint = "bert-base-multilingual-cased"

# Label set for Kazakh NER (BIO scheme). predict_ner_entities indexes this
# list with the predicted class id, so the order here defines the id mapping.
label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# The size of the classification head and the id<->label maps are derived from
# label_list so they cannot drift apart if the label set is edited.
# NOTE(review): this checkpoint name looks like the base multilingual BERT
# model, not an NER fine-tune; if so, the token-classification head is freshly
# initialised and predictions are not meaningful until a fine-tuned checkpoint
# is used — confirm.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={label: index for index, label in enumerate(label_list)},
)
| # ============================ | |
| # 2. NER функциясы (бірнеше сөзді біріктіру) | |
| # ============================ | |
def predict_ner_entities(text):
    """Extract named entities from whitespace-separated text.

    The text is split on whitespace, passed through the module-level
    ``tokenizer`` and ``model``, and every word receives the label predicted
    for its first sub-token. A ``B-X`` word starts an entity of type ``X``;
    directly following ``I-X`` words extend it; anything else closes it.

    Args:
        text: Raw input string. ``None`` is treated like an empty string.

    Returns:
        A list of ``(entity_text, entity_type)`` tuples in order of
        appearance, where ``entity_type`` is the label without its
        ``B-``/``I-`` prefix. The list is empty when the text has no words.
    """
    # Split once up front instead of re-splitting the text for every token.
    words = (text or "").split()
    if not words:
        # Nothing to tag, so skip tokenisation and the forward pass.
        return []

    encoding = tokenizer(
        words,
        return_tensors="pt",
        is_split_into_words=True,
        # Cut overly long inputs at the tokenizer's maximum length instead of
        # letting the model fail on them; words past the cut are not tagged.
        truncation=True,
    )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(**encoding).logits
    predictions = logits.argmax(dim=-1)[0].tolist()
    word_ids = encoding.word_ids(batch_index=0)

    entities = []
    current_words = []
    current_label = None
    previous_word_idx = None

    for token_pos, word_idx in enumerate(word_ids):
        if word_idx is None:
            # Special tokens do not map to any input word.
            continue
        if word_idx == previous_word_idx:
            # Later sub-tokens of a word already handled. Processing them
            # again would append the same word to the entity repeatedly.
            continue
        previous_word_idx = word_idx

        label = label_list[predictions[token_pos]]
        word = words[word_idx]

        if label.startswith("B-"):
            # A new entity begins: flush the one in progress, if any.
            if current_words:
                entities.append((" ".join(current_words), current_label))
            current_words = [word]
            current_label = label[2:]
        elif label.startswith("I-") and current_label == label[2:]:
            current_words.append(word)
        else:
            # "O", or an "I-" tag that does not continue the open entity.
            if current_words:
                entities.append((" ".join(current_words), current_label))
            current_words = []
            current_label = None

    if current_words:
        entities.append((" ".join(current_words), current_label))
    return entities
| # ============================ | |
| # 3. Форматтау – қарапайым текстпен шығару | |
| # ============================ | |
def format_ner(text):
    """Render the entities found in *text* as plain text grouped by type.

    Calls ``predict_ner_entities`` and returns one ``"<TYPE>: a; b"`` line
    per entity type that has matches, in the order PER, ORG, LOC. When no
    entities are returned at all, a fixed Kazakh "nothing found" message is
    returned instead.
    """
    found = predict_ner_entities(text)
    if not found:
        return "Атаулар табылған жоқ"

    grouped = {"PER": [], "ORG": [], "LOC": []}
    for phrase, tag in found:
        # The first type whose name the tag ends with receives the phrase;
        # tags matching none of the types are ignored.
        bucket = next((name for name in grouped if tag.endswith(name)), None)
        if bucket is not None:
            grouped[bucket].append(phrase)

    lines = [
        f"{name}: {'; '.join(phrases)}"
        for name, phrases in grouped.items()
        if phrases
    ]
    return "\n".join(lines).strip()
| # ============================ | |
| # 4. Gradio интерфейсі | |
| # ============================ | |
# The input and output widgets are built first so the Interface call below
# only wires the pieces together.
_input_box = gr.Textbox(lines=15, placeholder="Қазақ мәтінін осында енгізіңіз...")
_output_box = gr.Textbox(label="Анықталған атаулар")

iface = gr.Interface(
    fn=format_ner,
    inputs=_input_box,
    outputs=_output_box,
    title="Қазақ тіліндегі NER",
    description="PER – адам, ORG – ұйым, LOC – орын. Бірнеше сөйлемді бірден өңдейді.",
)

# ============================
# 5. Launch
# ============================
iface.launch()