|
|
import html
import re
import time

import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
|
|
|
|
|
# Hugging Face model ids offered in the model dropdown, mapped to a short
# human-readable note.
# This is a dict on purpose: the UI calls ``MODELS.keys()``, which a set
# literal does not provide, and a dict keeps the declaration order stable.
# NOTE(review): the notes are inferred from the model ids only -- confirm
# against the model cards before showing them to users.
MODELS = {
    "Leo97/KoELECTRA-small-v3-modu-ner": "Korean (KoELECTRA small v3)",
    "Babelscape/wikineural-multilingual-ner": "Multilingual (WikiNEural)",
    "CAMeL-Lab/bert-base-arabic-camelbert-mix-ner": "Arabic (CAMeLBERT mix)",
}
|
|
|
|
|
# Model loaded at start-up and pre-selected in the model dropdown.
DEFAULT_MODEL = "Leo97/KoELECTRA-small-v3-modu-ner"

# Background colour (CSS hex) used to highlight each entity label.
# Short and long spellings of the same label share a colour; labels that are
# not listed here are rendered with a neutral grey by the highlighting code.
ENTITY_COLORS = dict(
    PER="#FF6B6B",
    ORG="#4ECDC4",
    LOC="#FFD166",
    MISC="#06D6A0",
    PERSON="#FF6B6B",
    ORGANIZATION="#4ECDC4",
    LOCATION="#FFD166",
    DATE="#118AB2",
    TIME="#073B4C",
)

# Maximum number of characters analysed per request and read per uploaded file.
MAX_CHARS = 2000
|
|
|
|
|
def load_model(model_name):
    """Build a token-classification (NER) pipeline for the given model.

    Args:
        model_name: Model id or path passed to ``from_pretrained`` for both
            the tokenizer and the model.

    Returns:
        A ``transformers`` "ner" pipeline configured with
        ``aggregation_strategy="simple"`` and ``device=-1``.

    Raises:
        RuntimeError: If the tokenizer, the model or the pipeline cannot be
            created. The original error is attached as ``__cause__``.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForTokenClassification.from_pretrained(model_name)
        return pipeline(
            "ner",
            model=model,
            tokenizer=tokenizer,
            aggregation_strategy="simple",
            device=-1,
        )
    except Exception as e:
        # RuntimeError is still caught by callers that use ``except Exception``;
        # ``from e`` preserves the underlying traceback for debugging.
        raise RuntimeError(f"Ошибка загрузки модели: {e}") from e
|
|
|
|
|
|
|
|
# Load the default model once at import time.
# ``pipe`` holds the active pipeline and ``current_model_name`` the id it was
# built from; both stay ``None`` when loading fails, and the failure is only
# reported on stdout so the application can still start.
pipe = None
current_model_name = None
try:
    pipe = load_model(DEFAULT_MODEL)
    current_model_name = DEFAULT_MODEL
except Exception as e:
    print(f"Warning: {e}")
|
|
|
|
|
|
|
|
def _entity_rows(entities):
    """Convert raw pipeline entities into JSON-friendly rows for the UI."""
    return [
        {
            "Текст": entity["word"],
            "Тип": entity["entity_group"],
            # float() first so the rounded value is a plain Python float
            # (assumption: the pipeline may hand back NumPy scalars).
            "Уверенность": round(float(entity["score"]), 3),
            "Позиция": f"{entity['start']}-{entity['end']}",
        }
        for entity in entities
    ]


def _highlight_html(text, entities):
    """Render ``text`` as HTML with every entity wrapped in a coloured span."""
    parts = []
    cursor = 0
    for entity in sorted(entities, key=lambda item: item["start"]):
        start, end = entity["start"], entity["end"]
        if start > cursor:
            # The text comes from the user: escape it before embedding in HTML.
            parts.append(html.escape(text[cursor:start]))
        color = ENTITY_COLORS.get(entity["entity_group"], "#CCCCCC")
        label = html.escape(str(entity["entity_group"]))
        parts.append(
            f'<span style="background-color: {color}; padding: 2px 4px; '
            f'border-radius: 3px; margin: 2px;" title="{label} '
            f'(уверенность: {float(entity["score"]):.2f})">'
            f'{html.escape(text[start:end])}</span>'
        )
        cursor = end
    if cursor < len(text):
        parts.append(html.escape(text[cursor:]))
    return (
        '<div style="line-height: 1.8; font-size: 16px;">'
        + "".join(parts)
        + "</div>"
    )


def extract_entities(text, model_choice):
    """Run NER on ``text`` with the selected model and format the results.

    The module-level ``pipe`` / ``current_model_name`` are replaced when no
    pipeline is loaded yet or when ``model_choice`` differs from the model
    that is currently loaded.

    Args:
        text: Input text. It is stripped and cut to ``MAX_CHARS`` characters
            before analysis; entity positions refer to that processed text.
        model_choice: Model id to analyse the text with.

    Returns:
        A 5-tuple ``(status, rows, html, stats, latency)``:
        ``status`` is a user-facing message, ``rows`` a list of dicts (one per
        entity, empty when nothing was found), ``html`` the highlighted text,
        ``stats`` a "LABEL: count | ..." summary and ``latency`` the inference
        time in milliseconds as text. For empty input or on any error the
        last four items are ``None`` and ``status`` describes the problem.
    """
    global pipe, current_model_name

    if not text or not text.strip():
        return "⚠️ Введите текст для анализа", None, None, None, None

    text = text.strip()[:MAX_CHARS]

    # Also (re)load when the start-up load failed and left ``pipe`` as None,
    # even if the requested name happens to equal ``current_model_name``.
    if pipe is None or model_choice != current_model_name:
        try:
            pipe = load_model(model_choice)
            current_model_name = model_choice
        except Exception as e:
            # load_model already prefixes its message with the
            # "model loading error" text, so it is not repeated here.
            return f"❌ {e}", None, None, None, None

    try:
        started = time.perf_counter()
        entities = pipe(text)
        latency = round((time.perf_counter() - started) * 1000, 1)

        if entities:
            formatted_result = _entity_rows(entities)
            html_output = _highlight_html(text, entities)
        else:
            # Keep the payload a list so consumers can rely on its type.
            formatted_result = []
            html_output = "<p>Сущности не обнаружены</p>"

        stats = {}
        for entity in entities:
            etype = entity["entity_group"]
            stats[etype] = stats.get(etype, 0) + 1
        stats_text = " | ".join(f"{k}: {v}" for k, v in stats.items())

        return (
            "✅ Анализ завершен",
            formatted_result,
            html_output,
            stats_text,
            f"{latency} мс",
        )
    except Exception as e:
        # UI boundary: report the failure in the status field instead of
        # letting the request crash.
        return f"❌ Ошибка: {e}", None, None, None, None
|
|
|
|
|
|
|
|
# Placeholder inserted for each known entity label; any other label is
# replaced by "[<label>]".
_ANONYMIZATION_PLACEHOLDERS = {
    "PER": "[ЛИЦО]",
    "PERSON": "[ЛИЦО]",
    "ORG": "[ОРГАНИЗАЦИЯ]",
    "ORGANIZATION": "[ОРГАНИЗАЦИЯ]",
    "LOC": "[МЕСТО]",
    "LOCATION": "[МЕСТО]",
}


def _entity_span(entity):
    """Return ``(start, end, label)`` for one entity record.

    Two record shapes are understood: a raw pipeline entity with ``start``,
    ``end`` and ``entity_group`` keys, and a result-table row with a
    ``"Позиция"`` value of the form ``"start-end"`` and a ``"Тип"`` label.
    """
    if "start" in entity and "end" in entity:
        return int(entity["start"]), int(entity["end"]), entity["entity_group"]
    start, _, end = str(entity["Позиция"]).partition("-")
    return int(start), int(end), entity["Тип"]


def anonymize_text(text, entities):
    """Replace every entity span in ``text`` with a placeholder (demo only).

    Args:
        text: The text the entity positions refer to.
        entities: Iterable of entity records, either raw pipeline entities or
            result-table rows (see ``_entity_span``).

    Returns:
        ``text`` with each entity span replaced by a bracketed placeholder.
        ``text`` is returned unchanged when ``entities`` is empty, ``None`` or
        a plain string (a message rather than a list of records).
    """
    if not entities or isinstance(entities, str):
        return text

    spans = sorted(
        (_entity_span(entity) for entity in entities),
        key=lambda span: span[0],
        reverse=True,
    )

    result = text
    # Replace from right to left so earlier offsets stay valid while the
    # string changes length.
    for start, end, label in spans:
        replacement = _ANONYMIZATION_PLACEHOLDERS.get(label, f"[{label}]")
        result = result[:start] + replacement + result[end:]
    return result
|
|
|
|
|
def batch_process(files):
    """Run entity extraction with the default model on several text files.

    Args:
        files: Uploaded files, given either as objects exposing the path in a
            ``name`` attribute or as plain path strings. Only the first
            ``MAX_CHARS`` characters of each file are read (UTF-8).

    Returns:
        A ``(status, results)`` pair. ``results`` holds one dict per file:
        ``{"Файл", "Сущности", "Статистика"}`` on success or
        ``{"Файл", "Ошибка"}`` when the file could not be read or analysed.
        With no files the status asks for an upload and ``results`` is empty.
    """
    if not files:
        return "⚠️ Загрузите файлы", []

    results = []
    for file_info in files:
        # Resolve the path before the try block so the error branch can
        # always report it, whichever form the upload arrived in.
        path = getattr(file_info, "name", file_info)
        try:
            with open(path, "r", encoding="utf-8") as f:
                text = f.read(MAX_CHARS)
            status, entities, _, stats, _ = extract_entities(text, DEFAULT_MODEL)
            if entities is None:
                # extract_entities reports empty input and failures through
                # its status with a None payload; surface that message
                # instead of recording "0 entities".
                results.append({"Файл": path, "Ошибка": status})
                continue
            results.append({
                "Файл": path,
                # Only a list is a real entity collection; anything else
                # (e.g. a message string) counts as no entities.
                "Сущности": len(entities) if isinstance(entities, list) else 0,
                "Статистика": stats,
            })
        except Exception as e:
            results.append({
                "Файл": path,
                "Ошибка": str(e),
            })

    return "✅ Обработка завершена", results
|
|
|
|
|
def _rows_to_entities(rows):
    """Rebuild ``start``/``end``/``entity_group`` dicts from result rows."""
    entities = []
    for row in rows:
        start, _, end = str(row["Позиция"]).partition("-")
        entities.append({
            "start": int(start),
            "end": int(end),
            "entity_group": row["Тип"],
        })
    return entities


def _anonymize_from_ui(text, rows):
    """Anonymize the analysed text using the rows shown in the JSON result."""
    if not rows or isinstance(rows, str):
        return "Сначала выполните анализ"
    # Row positions refer to the stripped, truncated text that
    # extract_entities analysed, so apply the same normalisation here.
    # NOTE(review): if the text box was edited after the analysis, the stored
    # positions no longer match -- re-run the analysis first.
    analysed_text = (text or "").strip()[:MAX_CHARS]
    return anonymize_text(analysed_text, _rows_to_entities(rows))


# Gradio UI: one tab for interactive analysis and one for batch file
# processing. NOTE(review): the nesting below was reconstructed from the
# component order -- confirm the layout against the running app.
with gr.Blocks(title="NER — Извлечение сущностей", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔍 Извлечение именованных сущностей (NER)
    **Распознавание имен, организаций, локаций и других сущностей в тексте**

    """)

    with gr.Tabs():
        with gr.TabItem("📝 Анализ текста"):
            with gr.Row():
                # Left column: model choice, input text, actions, examples.
                with gr.Column(scale=2):
                    model_dropdown = gr.Dropdown(
                        # sorted() works whether MODELS is a set or a dict
                        # and gives a stable order in both cases.
                        choices=sorted(MODELS),
                        value=DEFAULT_MODEL,
                        label="Выберите модель",
                        info="Разные модели поддерживают разные языки и типы сущностей",
                    )

                    text_input = gr.Textbox(
                        label="Введите текст для анализа",
                        placeholder="Пример: Компания Microsoft, основанная Биллом Гейтсом, находится в Редмонде, штат Вашингтон.",
                        lines=8,
                        max_length=MAX_CHARS,
                    )

                    with gr.Row():
                        analyze_btn = gr.Button("🔎 Анализировать", variant="primary")
                        clear_btn = gr.Button("🗑️ Очистить")

                    gr.Examples(
                        examples=[
                            ["Apple решила открыть новый офис в Париже, где Тим Кук встретится с президентом Франции."],
                            ["Вчера в Москве прошла встреча представителей Google и Яндекса, обсудили ИИ с Илоном Маском."],
                            ["Президент США Джо Байден выступил в Белом доме на встрече с генеральным директором Tesla."],
                            ["The Eiffel Tower in Paris is visited by millions of tourists every year from all over the world."],
                        ],
                        inputs=text_input,
                        label="Примеры текстов",
                    )

                # Right column: status, results, statistics, anonymization.
                with gr.Column(scale=3):
                    status = gr.Textbox(label="Статус")

                    with gr.Tab("📊 Структурированный"):
                        result_json = gr.JSON(label="Найденные сущности")

                    with gr.Tab("🎨 Визуализация"):
                        result_html = gr.HTML(label="Текст с подсветкой сущностей")

                    stats_output = gr.Textbox(label="Статистика")

                    with gr.Row():
                        latency_output = gr.Textbox(label="Время обработки")

                    with gr.Accordion("🛡️ Анонимизация текста", open=False):
                        anonymized_text = gr.Textbox(
                            label="Анонимизированный текст",
                            lines=4,
                            interactive=False,
                        )
                        anonymize_btn = gr.Button("Анонимизировать")

            analyze_btn.click(
                fn=extract_entities,
                inputs=[text_input, model_dropdown],
                outputs=[status, result_json, result_html, stats_output, latency_output],
            )

            # Also reset the status so no stale message stays after clearing.
            clear_btn.click(
                fn=lambda: ["", "", None, None, None, None, ""],
                outputs=[
                    text_input,
                    status,
                    result_json,
                    result_html,
                    stats_output,
                    latency_output,
                    anonymized_text,
                ],
            )

            anonymize_btn.click(
                fn=_anonymize_from_ui,
                inputs=[text_input, result_json],
                outputs=anonymized_text,
            )

        with gr.TabItem("📁 Пакетная обработка"):
            gr.Markdown("### Загрузите текстовые файлы (.txt)")

            file_input = gr.File(
                label="Выберите файлы",
                file_count="multiple",
                file_types=[".txt"],
            )

            batch_btn = gr.Button("🚀 Обработать файлы", variant="primary")

            batch_status = gr.Textbox(label="Статус обработки")
            batch_results = gr.JSON(label="Результаты")

            batch_btn.click(
                fn=batch_process,
                inputs=file_input,
                outputs=[batch_status, batch_results],
            )
|
|
|
|
|
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    # "0.0.0.0" is passed as the server name (listen on all interfaces);
    # share=False is passed so no share link is requested.
    launch_options = {"server_name": "0.0.0.0", "share": False}
    demo.launch(**launch_options)