Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from langdetect import detect | |
| # ============================ | |
| # Natasha для русского | |
| # ============================ | |
| from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsNERTagger, Doc | |
# Natasha components are created once at import time so that every request
# reuses the same loaded objects instead of rebuilding them per call.
segmenter = Segmenter()      # splits a Doc into sentences and tokens
morph_vocab = MorphVocab()   # NOTE(review): not referenced by the code visible here

# The NER tagger is constructed on top of the news embedding.
embedding = NewsEmbedding()
ner_tagger = NewsNERTagger(embedding)
| # ============================ | |
| # HuggingFace для английского | |
| # ============================ | |
| from transformers import pipeline | |
# One checkpoint id is used for both the model weights and the tokenizer,
# so it is named once to keep the two arguments in sync.
_ENGLISH_NER_CHECKPOINT = "dbmdz/bert-large-cased-finetuned-conll03-english"

# aggregation_strategy="simple" makes the pipeline return grouped entities
# (each result carries an "entity_group" and a "word" field).
english_ner = pipeline(
    task="ner",
    model=_ENGLISH_NER_CHECKPOINT,
    tokenizer=_ENGLISH_NER_CHECKPOINT,
    aggregation_strategy="simple",
)
| # ============================ | |
| # Функция распознавания сущностей | |
| # ============================ | |
def recognize_entities_auto(text):
    """Detect the language of *text* and extract its named entities.

    Text detected as English (``detect`` returns ``"en"``) is tagged with the
    ``english_ner`` pipeline. Any other detected language, and text whose
    language cannot be detected, is processed with the Natasha tagger.

    Args:
        text: User-supplied text. ``None``, empty and whitespace-only input
            yields an empty result without invoking any model.

    Returns:
        A list of ``(entity_text, label)`` tuples where ``label`` is one of
        ``"PER"``, ``"ORG"`` or ``"LOC"``. Entities are grouped by label in
        that order; duplicates within a label are dropped, keeping the
        first occurrence.
    """
    # Nothing to analyse: skip language detection and both models.
    if not text or not text.strip():
        return []

    try:
        lang = detect(text)
    except Exception:
        # Best effort: if detection fails (e.g. the text has no usable
        # features), fall back to the Russian pipeline instead of failing
        # the request. ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        lang = "ru"

    entities = {"PER": [], "ORG": [], "LOC": []}

    if lang == "en":
        for res in english_ner(text):
            label = res["entity_group"]
            # "GPE" is folded into "LOC" so only three output labels exist.
            if label == "GPE":
                label = "LOC"
            # Drop leftover "##" sub-word continuation markers.
            word = res["word"].replace("##", "").strip()
            # Other labels are ignored, as are entities that are empty
            # after cleaning.
            if label in entities and word:
                entities[label].append(word)
    else:
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_ner(ner_tagger)
        for span in doc.spans:
            span_text = span.text.strip()
            if span.type in entities and span_text:
                entities[span.type].append(span_text)

    # dict.fromkeys removes duplicates while preserving first-seen order;
    # the result is flattened into the (text, label) pairs that
    # gr.HighlightedText consumes.
    return [
        (item, label)
        for label, items in entities.items()
        for item in dict.fromkeys(items)
    ]
| # ============================ | |
| # Gradio интерфейс | |
| # ============================ | |
# Input: a multi-line text box for the user's text.
_text_input = gr.Textbox(
    lines=15,
    placeholder="Введите русский или английский текст здесь...",
)

# Output: one highlighted chip per entity, coloured by its label.
_entity_colors = {"PER": "#faa", "ORG": "#afa", "LOC": "#aaf"}
_entity_output = gr.HighlightedText(
    label="Выделенные сущности",
    color_map=_entity_colors,
)

iface = gr.Interface(
    fn=recognize_entities_auto,
    inputs=_text_input,
    outputs=_entity_output,
    title="Автоматический NER для русского и английского текста",
    description="PER – человек, ORG – организация, LOC – место. Текст любого языка обрабатывается автоматически.",
)

# Start the web app as soon as the script is executed.
iface.launch()