# Hugging Face Space: student2222333051/project1
# Status: Sleeping
# File size: 2,962 Bytes

import gradio as gr
from langdetect import detect

# ============================
# Natasha для русского
# ============================
from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsNERTagger, Doc

# Natasha components are built once at import time and shared by every call.
# The NER tagger is constructed on top of the news embedding.
embedding = NewsEmbedding()
ner_tagger = NewsNERTagger(embedding)
# Segmenter used to split a Doc before NER tagging.
segmenter = Segmenter()
# NOTE(review): morph_vocab is not referenced by the visible code; kept so
# any other code relying on the module-level name keeps working.
morph_vocab = MorphVocab()

# ============================
# HuggingFace для английского
# ============================
from transformers import pipeline

# The same checkpoint supplies both the model weights and the tokenizer.
_ENGLISH_NER_CHECKPOINT = "dbmdz/bert-large-cased-finetuned-conll03-english"

# Token-classification pipeline for English text; "simple" aggregation makes
# the pipeline return grouped entities (read below via 'entity_group').
english_ner = pipeline(
    "ner",
    model=_ENGLISH_NER_CHECKPOINT,
    tokenizer=_ENGLISH_NER_CHECKPOINT,
    aggregation_strategy="simple",
)

# ============================
# Функция распознавания сущностей
# ============================
def recognize_entities_auto(text):
    """Detect the language of *text* and extract its named entities.

    Text detected as English is processed by the Hugging Face ``english_ner``
    pipeline; any other detected language, and any text whose language cannot
    be detected, is processed by the Natasha (Russian) pipeline.

    Args:
        text: User-supplied text. ``None``, empty, or whitespace-only input
            yields an empty result without invoking any model.

    Returns:
        A list of ``(entity_text, label)`` tuples where ``label`` is one of
        ``"PER"``, ``"ORG"`` or ``"LOC"``. Entities are de-duplicated per
        label (first occurrence wins) and grouped by label in the order
        PER, ORG, LOC.
    """
    # Nothing to analyse: skip language detection and both models.
    if not text or not text.strip():
        return []

    # Detect the language. Detection is best-effort: on any failure fall back
    # to the Russian pipeline, but do not swallow KeyboardInterrupt/SystemExit
    # the way a bare ``except:`` would.
    try:
        lang = detect(text)
    except Exception:
        lang = "ru"

    entities = {"PER": [], "ORG": [], "LOC": []}

    if lang == "en":
        for res in english_ner(text):
            label = res["entity_group"]
            # Geo-political entities are reported together with locations.
            if label == "GPE":
                label = "LOC"
            if label not in entities:
                continue
            # Remove WordPiece continuation markers left in the grouped word.
            word = res["word"].replace("##", "").strip()
            if word:
                entities[label].append(word)
    else:
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_ner(ner_tagger)
        for span in doc.spans:
            span_text = span.text.strip()
            if span.type in entities and span_text:
                entities[span.type].append(span_text)

    # Build the (text, label) pairs for gr.HighlightedText; dict.fromkeys
    # drops duplicates while preserving first-seen order.
    return [
        (item, label)
        for label, items in entities.items()
        for item in dict.fromkeys(items)
    ]

# ============================
# Gradio интерфейс
# ============================
# Input widget: a 15-line text box for the text to analyse.
_text_input = gr.Textbox(
    lines=15,
    placeholder="Введите русский или английский текст здесь...",
)

# Output widget: one colour per entity label returned by the NER function.
_entity_colors = {"PER": "#faa", "ORG": "#afa", "LOC": "#aaf"}
_entity_output = gr.HighlightedText(
    label="Выделенные сущности",
    color_map=_entity_colors,
)

# Wire the NER function to the widgets and start the app.
iface = gr.Interface(
    fn=recognize_entities_auto,
    inputs=_text_input,
    outputs=_entity_output,
    title="Автоматический NER для русского и английского текста",
    description="PER – человек, ORG – организация, LOC – место. Текст любого языка обрабатывается автоматически.",
)

iface.launch()