# project1 / app.py
# student2222333051's picture
# Update app.py
# a62422b verified
import gradio as gr
from langdetect import detect
# ============================
# Natasha для русского
# ============================
from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsNERTagger, Doc
# Natasha components for the non-English path; created once at module level
# and reused by every call to recognize_entities_auto.
# Splits a Doc into sentences/tokens (passed to Doc.segment()).
segmenter = Segmenter()
# NOTE(review): morph_vocab is not referenced anywhere in the visible code.
morph_vocab = MorphVocab()
# Embedding object handed to the NER tagger constructed on the next line.
embedding = NewsEmbedding()
# NER tagger passed to Doc.tag_ner() inside recognize_entities_auto.
ner_tagger = NewsNERTagger(embedding)
# ============================
# HuggingFace для английского
# ============================
from transformers import pipeline
# Token-classification pipeline used for text detected as English; built once
# at module level and called as english_ner(text) in recognize_entities_auto.
english_ner = pipeline(
"ner",
# The same checkpoint name is used for both the model and its tokenizer.
model="dbmdz/bert-large-cased-finetuned-conll03-english",
tokenizer="dbmdz/bert-large-cased-finetuned-conll03-english",
# recognize_entities_auto reads the 'entity_group' and 'word' keys of each
# result; presumably this strategy is what produces grouped entities with
# those keys — confirm against the transformers documentation.
aggregation_strategy="simple"
)
# ============================
# Функция распознавания сущностей
# ============================
def recognize_entities_auto(text):
    """Extract unique named entities from English or Russian text.

    The language is guessed with ``detect``. Text detected as English is
    tagged by the module-level ``english_ner`` pipeline; any other detected
    language, and text whose language cannot be detected, goes through the
    Natasha tagger.

    Args:
        text: Raw user input. ``None``, empty or whitespace-only input
            yields an empty result without running detection or any model.

    Returns:
        A list of ``(entity_text, label)`` tuples, where ``label`` is
        ``"PER"``, ``"ORG"`` or ``"LOC"``. Entities are grouped by label in
        that order, de-duplicated, and keep first-seen order inside each
        group. Entities that are blank after stripping are dropped.
    """
    # Nothing to analyse: skip language detection and both models entirely.
    if not text or not text.strip():
        return []

    # Detect the language; fall back to the Russian path when detection
    # raises. ``except Exception`` (not a bare ``except``) keeps
    # KeyboardInterrupt and SystemExit propagating.
    try:
        lang = detect(text)
    except Exception:
        lang = "ru"

    # Insertion order of these keys fixes the grouping order of the output.
    entities = {"PER": [], "ORG": [], "LOC": []}

    if lang == "en":
        for res in english_ner(text):
            label = res["entity_group"]
            # Geo-political entities are reported as locations.
            if label == "GPE":
                label = "LOC"
            if label not in entities:
                continue
            # Remove WordPiece continuation markers left in the word.
            word = res["word"].replace("##", "").strip()
            if word:
                entities[label].append(word)
    else:
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_ner(ner_tagger)
        for span in doc.spans:
            if span.type not in entities:
                continue
            span_text = span.text.strip()
            if span_text:
                entities[span.type].append(span_text)

    # dict.fromkeys removes duplicates while preserving first-seen order.
    return [
        (item, label)
        for label, items in entities.items()
        for item in dict.fromkeys(items)
    ]
# ============================
# Gradio интерфейс
# ============================
# Gradio UI: a multi-line text box as input, and the (text, label) pairs
# returned by recognize_entities_auto rendered as highlighted text.
iface = gr.Interface(
fn=recognize_entities_auto,
inputs=gr.Textbox(lines=15, placeholder="Введите русский или английский текст здесь..."),
# color_map keys are the three labels recognize_entities_auto can return.
outputs=gr.HighlightedText(label="Выделенные сущности", color_map={"PER":"#faa", "ORG":"#afa", "LOC":"#aaf"}),
title="Автоматический NER для русского и английского текста",
description="PER – человек, ORG – организация, LOC – место. Текст любого языка обрабатывается автоматически."
)
# Launches the app whenever this module is executed; there is no
# `if __name__ == "__main__"` guard, so importing the module also launches it.
iface.launch()