Spaces:

student2222333051
/

project1

Sleeping

App Files Files Community

Asanaly committed on Nov 26, 2025

Commit

65e51b4

verified ·

1 Parent(s): f79fc96

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -77

app.py CHANGED Viewed

@@ -1,102 +1,69 @@
-# app.py
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForTokenClassification
-import torch
-import numpy as np
 # ============================
 # Модели
 # ============================
-MODEL_DICT = {
-    "Russian": "DeepPavlov/rubert-base-cased",
     "English": "dbmdz/bert-large-cased-finetuned-conll03-english"
 }
-# ============================
-# Загрузка модели
-# ============================
-def load_model(model_name):
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForTokenClassification.from_pretrained(model_name)
-    labels = model.config.id2label
-    return tokenizer, model, labels
 # ============================
-# NER функция
 # ============================
-def get_entities(text, tokenizer, model, labels):
-    words = text.split()
-    inputs = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True, padding=True)
-    outputs = model(**inputs).logits
-    predictions = torch.argmax(outputs, dim=2).numpy()[0]
-    word_ids = inputs.word_ids(batch_index=0)
-    entities = []
-    current_entity = []
-    current_label = None
-    for idx, word_idx in enumerate(word_ids):
-        if word_idx is None:
-            continue
-        label = labels[predictions[idx]]
-        word = words[word_idx]
-        if label.startswith("B-"):
-            if current_entity:
-                entities.append((" ".join(current_entity), current_label))
-            current_entity = [word]
-            current_label = label[2:]
-        elif label.startswith("I-") and current_label == label[2:]:
-            current_entity.append(word)
-        else:
-            if current_entity:
-                entities.append((" ".join(current_entity), current_label))
-                current_entity = []
-                current_label = None
-    if current_entity:
-        entities.append((" ".join(current_entity), current_label))
-    return entities
-# ============================
-# Форматирование результата
-# ============================
-def format_entities(text, model_choice):
-    tokenizer, model, labels = load_model(MODEL_DICT[model_choice])
-    entities = get_entities(text, tokenizer, model, labels)
-    if not entities:
-        return "No entities found."
     output = {"PER": [], "ORG": [], "LOC": []}
-    for word, label in entities:
-        # Стандартные метки PER/ORG/LOC
-        if label in ["PER", "PERSON"]:
-            output["PER"].append(word)
-        elif label in ["ORG", "ORGANIZATION"]:
-            output["ORG"].append(word)
-        elif label in ["LOC", "GPE", "LOCATION"]:
-            output["LOC"].append(word)
-    result = ""
-    for key, words in output.items():
-        if words:
-            result += f"{key}: {'; '.join(words)}\n"
-    return result.strip()
 # ============================
 # Gradio интерфейс
 # ============================
 iface = gr.Interface(
-    fn=format_entities,
-    inputs=[
-        gr.Textbox(lines=15, placeholder="Введите текст здесь..."),
-        gr.Dropdown(choices=list(MODEL_DICT.keys()), label="Выберите модель")
-    ],
-    outputs=gr.Textbox(label="Recognized entities (PER/ORG/LOC)"),
-    title="NER для текста",
-    description="PER – person, ORG – organization, LOC – location. Можно вводить несколько предложений."
 )
 iface.launch()

 import gradio as gr
+from transformers import pipeline
+from langdetect import detect
 # ============================
 # Модели
 # ============================
+MODELS = {
+    "Russian": "DeepPavlov/ner_rubert",
     "English": "dbmdz/bert-large-cased-finetuned-conll03-english"
 }
+# Build both NER pipelines once, at import time, so auto_ner can look one up
+# by language name on every request.
+ner_pipelines = {
+    "Russian": pipeline("ner", model=MODELS["Russian"], tokenizer=MODELS["Russian"], aggregation_strategy="simple"),
+    "English": pipeline("ner", model=MODELS["English"], tokenizer=MODELS["English"], aggregation_strategy="simple")
+}
 # ============================
+# Функция распознавания
 # ============================
+def auto_ner(text):
+    try:
+        lang = detect(text)
+    except:
+        lang = "ru"  # По умолчанию русский
+    if lang == "en":
+        model_choice = "English"
+    else:
+        model_choice = "Russian"
+    ner = ner_pipelines[model_choice]
+    results = ner(text)
+    if not results:
+        return "Сущности не найдены."
     output = {"PER": [], "ORG": [], "LOC": []}
+    for item in results:
+        entity = item['word']
+        label = item['entity_group']
+        if label == "PER":
+            output["PER"].append(entity)
+        elif label == "ORG":
+            output["ORG"].append(entity)
+        elif label == "LOC" or label == "GPE":
+            output["LOC"].append(entity)
+    result_text = ""
+    for key, values in output.items():
+        if values:
+            # Убираем дубли и соединяем через ;
+            result_text += f"{key}: {'; '.join(list(dict.fromkeys(values)))}\n"
+    return result_text.strip()
 # ============================
 # Gradio interface: a single multi-line text box in, a single text box out, handled by auto_ner
 # ============================
 iface = gr.Interface(
+    fn=auto_ner,
+    inputs=gr.Textbox(lines=15, placeholder="Введите текст здесь (русский или английский, можно несколько предложений)..."),
+    outputs=gr.Textbox(label="Распознанные сущности (PER/ORG/LOC)"),
+    title="Автоматический NER для русского и английского текста",
+    description="PER – человек, ORG – организация, LOC – место. Текст любого языка обрабатывается автоматически."
 )
 iface.launch()