"""Gradio app for automatic named-entity recognition in Russian and English text."""

import gradio as gr
from langdetect import detect

# ============================
# Natasha for Russian
# ============================
from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsNERTagger, Doc

# Natasha components are created once at import time and shared by every call
# to recognize_entities_auto.
segmenter = Segmenter()  # passed to Doc.segment() for the non-English branch
morph_vocab = MorphVocab()  # NOTE(review): not referenced in the visible code — confirm it is still needed
embedding = NewsEmbedding()  # consumed by the NER tagger constructed on the next line
ner_tagger = NewsNERTagger(embedding)  # passed to Doc.tag_ner() for the non-English branch

# ============================
# HuggingFace for English
# ============================
from transformers import pipeline

# English NER pipeline, built once at import time. The same checkpoint id is
# used for both the model weights and the tokenizer.
_ENGLISH_NER_CHECKPOINT = "dbmdz/bert-large-cased-finetuned-conll03-english"

english_ner = pipeline(
    "ner",
    model=_ENGLISH_NER_CHECKPOINT,
    tokenizer=_ENGLISH_NER_CHECKPOINT,
    # Results are read via the "entity_group" key in recognize_entities_auto.
    aggregation_strategy="simple",
)

# ============================
# Entity recognition function
# ============================
def recognize_entities_auto(text):
    """Extract person, organisation and location names from the given text.

    The language is guessed with ``langdetect``. Text detected as English is
    tagged by the HuggingFace ``english_ner`` pipeline; any other detected
    language, and any text whose language cannot be detected, is tagged by
    the Natasha NER tagger.

    Args:
        text: User-supplied text to analyse.

    Returns:
        A list of ``(entity_text, label)`` tuples, the value handed to the
        ``gr.HighlightedText`` output. ``label`` is ``"PER"``, ``"ORG"`` or
        ``"LOC"``; entities are grouped by label in that order, with
        duplicates inside each label removed (first occurrence wins).
        Empty, whitespace-only or ``None`` input returns an empty list.
    """
    # Nothing to tag: return early instead of running language detection
    # and model inference on blank input.
    if not text or not text.strip():
        return []

    # Language detection is best-effort: on failure fall back to Russian.
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
    # still propagate.
    try:
        lang = detect(text)
    except Exception:
        lang = "ru"

    # Collected entity strings per supported label.
    entities = {"PER": [], "ORG": [], "LOC": []}

    if lang == "en":
        for res in english_ner(text):
            label = res["entity_group"]
            # Fold geopolitical entities into locations.
            if label == "GPE":
                label = "LOC"
            if label not in entities:
                continue
            # Remove "##" sub-token markers that may remain in the word.
            word = res["word"].replace("##", "").strip()
            if word:  # skip fragments that are empty after cleaning
                entities[label].append(word)
    else:
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_ner(ner_tagger)
        for span in doc.spans:
            if span.type in entities:
                entities[span.type].append(span.text.strip())

    # Build the (text, label) pairs for Gradio, dropping duplicates within
    # each label while preserving first-occurrence order.
    highlighted = []
    for label, items in entities.items():
        highlighted.extend((item, label) for item in dict.fromkeys(items))

    return highlighted

# ============================
# Gradio interface
# ============================
# Web UI: a multi-line text box as input, and the (text, label) pairs returned
# by recognize_entities_auto rendered as colour-coded highlighted text.
iface = gr.Interface(
    fn=recognize_entities_auto,
    inputs=gr.Textbox(lines=15, placeholder="Введите русский или английский текст здесь..."),
    outputs=gr.HighlightedText(label="Выделенные сущности", color_map={"PER":"#faa", "ORG":"#afa", "LOC":"#aaf"}),
    title="Автоматический NER для русского и английского текста",
    description="PER – человек, ORG – организация, LOC – место. Текст любого языка обрабатывается автоматически."
)

# Launch only when executed as a script, so importing this module (for
# example to reuse recognize_entities_auto) does not start a web server.
if __name__ == "__main__":
    iface.launch()