"""Gradio demo: automatic named-entity recognition for Russian and English.

The input language is detected with ``langdetect``. English text is tagged by
a HuggingFace token-classification pipeline; everything else is handed to the
Natasha (Russian) NER tagger. Only PER / ORG / LOC entities are reported.
"""

import gradio as gr
from langdetect import DetectorFactory, detect
from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsNERTagger, Doc
from transformers import pipeline

# langdetect is randomized by default, so short or ambiguous inputs could be
# routed to a different model on every call. Pinning the seed makes the
# language decision reproducible.
DetectorFactory.seed = 0

# ============================
# Natasha for Russian
# ============================
segmenter = Segmenter()
morph_vocab = MorphVocab()
embedding = NewsEmbedding()
ner_tagger = NewsNERTagger(embedding)

# ============================
# HuggingFace for English
# ============================
english_ner = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    tokenizer="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
)

# Entity types shown in the UI, in display order.
ENTITY_LABELS = ("PER", "ORG", "LOC")

# Labels from the English pipeline that are folded into a displayed label.
_ENGLISH_LABEL_ALIASES = {"GPE": "LOC"}


# ============================
# Entity recognition function
# ============================
def recognize_entities_auto(text):
    """Extract PER / ORG / LOC entities from Russian or English text.

    Args:
        text: The user's text. Empty, whitespace-only or ``None`` input yields
            an empty result without invoking any model.

    Returns:
        A list of ``(entity_text, label)`` tuples for ``gr.HighlightedText``.
        Entities are grouped by label in the order PER, ORG, LOC; within each
        group duplicates are removed and first-occurrence order is kept.
    """
    if not text or not text.strip():
        return []

    # Detect the language. Detection is best-effort: on any failure fall back
    # to the Russian pipeline, but let KeyboardInterrupt / SystemExit through
    # (a bare ``except:`` would swallow those as well).
    try:
        lang = detect(text)
    except Exception:
        lang = "ru"

    entities = {label: [] for label in ENTITY_LABELS}

    if lang == "en":
        for res in english_ner(text):
            group = res["entity_group"]
            label = _ENGLISH_LABEL_ALIASES.get(group, group)
            # Strip WordPiece continuation markers left in the token text.
            word = res["word"].replace("##", "").strip()
            if label in entities and word:
                entities[label].append(word)
    else:
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_ner(ner_tagger)
        for span in doc.spans:
            span_text = span.text.strip()
            if span.type in entities and span_text:
                entities[span.type].append(span_text)

    # dict.fromkeys drops duplicates while preserving first-seen order.
    return [
        (item, label)
        for label, items in entities.items()
        for item in dict.fromkeys(items)
    ]


# ============================
# Gradio interface
# ============================
iface = gr.Interface(
    fn=recognize_entities_auto,
    inputs=gr.Textbox(lines=15, placeholder="Введите русский или английский текст здесь..."),
    outputs=gr.HighlightedText(label="Выделенные сущности", color_map={"PER":"#faa", "ORG":"#afa", "LOC":"#aaf"}),
    title="Автоматический NER для русского и английского текста",
    description="PER – человек, ORG – организация, LOC – место. Текст любого языка обрабатывается автоматически.",
)

# Launched at import time, as in the original script.
iface.launch()