Spaces:
Sleeping
Sleeping
File size: 2,962 Bytes
75464af 65e51b4 75464af 3065429 75464af 3065429 75464af 3065429 3196524 3065429 75464af 3196524 a62422b 75464af a62422b 3196524 3065429 65e51b4 3065429 3196524 901c892 3196524 901c892 65e51b4 3065429 67cfabf 3065429 65e51b4 3065429 67cfabf 70a40ec 3196524 3065429 3196524 a62422b 75464af f79fc96 75464af 3065429 a62422b 65e51b4 a62422b 75464af |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
import gradio as gr
from langdetect import detect
# ============================
# Natasha for Russian
# ============================
from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsNERTagger, Doc
# Module-level Natasha objects: created once when the module is loaded and
# shared by every call to recognize_entities_auto().
segmenter = Segmenter()  # handed to Doc.segment() before NER tagging
# NOTE(review): morph_vocab is not referenced anywhere in the visible code —
# confirm whether it is still needed before removing it.
morph_vocab = MorphVocab()
embedding = NewsEmbedding()  # embedding object consumed by the tagger below
ner_tagger = NewsNERTagger(embedding)  # handed to Doc.tag_ner()
# ============================
# HuggingFace for English
# ============================
from transformers import pipeline

# One checkpoint id is used for both the model and its tokenizer.
_ENGLISH_NER_CHECKPOINT = "dbmdz/bert-large-cased-finetuned-conll03-english"

# Built once at module load; recognize_entities_auto() reads the
# 'entity_group' and 'word' fields of each result it returns.
english_ner = pipeline(
    "ner",
    model=_ENGLISH_NER_CHECKPOINT,
    tokenizer=_ENGLISH_NER_CHECKPOINT,
    aggregation_strategy="simple",
)
# ============================
# Entity recognition function
# ============================
def recognize_entities_auto(text):
    """Extract unique PER/ORG/LOC entities from Russian or English text.

    The language is guessed with ``detect``. Text detected as English
    (``"en"``) is run through the ``english_ner`` pipeline; any other
    detected language (and any detection failure) falls back to the
    Natasha tagger.

    Args:
        text: User-supplied text. ``None``, an empty string, or
            whitespace-only input yields an empty result without calling
            the language detector or either model.

    Returns:
        A list of ``(entity_text, label)`` tuples suitable for
        ``gr.HighlightedText``. Tuples are grouped by label in the order
        PER, ORG, LOC; inside each group entities keep first-occurrence
        order and duplicates are dropped.
    """
    # Nothing to analyse: skip language detection and the models entirely.
    if not text or not text.strip():
        return []

    # Detect the language. A detector failure is treated as "Russian", but
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        lang = detect(text)
    except Exception:
        lang = "ru"

    entities = {"PER": [], "ORG": [], "LOC": []}

    if lang == "en":
        # Pipeline label -> output bucket; GPE is folded into LOC.
        english_labels = {"PER": "PER", "ORG": "ORG", "LOC": "LOC", "GPE": "LOC"}
        for res in english_ner(text):
            label = english_labels.get(res["entity_group"])
            if label is None:
                continue
            # Strip leftover WordPiece continuation markers.
            word = res["word"].replace("##", "").strip()
            if word:  # do not report an entity that is empty after cleanup
                entities[label].append(word)
    else:
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_ner(ner_tagger)
        for span in doc.spans:
            if span.type in entities:
                entities[span.type].append(span.text.strip())

    # dict.fromkeys removes duplicates while preserving first-seen order.
    return [
        (item, label)
        for label, items in entities.items()
        for item in dict.fromkeys(items)
    ]
# ============================
# Gradio interface
# ============================
# Input: a multi-line text box for the user's text.
_text_input = gr.Textbox(
    lines=15,
    placeholder="Введите русский или английский текст здесь...",
)

# Output: highlighted spans, one colour per entity label.
_entity_colors = {"PER": "#faa", "ORG": "#afa", "LOC": "#aaf"}
_entity_output = gr.HighlightedText(
    label="Выделенные сущности",
    color_map=_entity_colors,
)

iface = gr.Interface(
    fn=recognize_entities_auto,
    inputs=_text_input,
    outputs=_entity_output,
    title="Автоматический NER для русского и английского текста",
    description="PER – человек, ORG – организация, LOC – место. Текст любого языка обрабатывается автоматически.",
)

# Start the web app as soon as the module is executed.
iface.launch()
|