Spaces:

student2222333051
/

project1

Sleeping

App Files Community

Asanaly committed on Nov 26, 2025

Commit

f79fc96

verified ·

1 Parent(s): 2d0991b

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -22

app.py CHANGED Viewed

@@ -6,19 +6,26 @@ import torch
 import numpy as np
 # ============================
-# 1. Модель и токенизатор
 # ============================
-MODEL_NAME = "dbmdz/bert-large-cased-finetuned-conll03-english"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
-# Метки CoNLL
-LABELS = model.config.id2label  # 0: O, 1: B-MISC, 2: I-MISC, 3: B-PER, ...
 # ============================
-# 2. NER функция
 # ============================
-def get_entities(text):
     words = text.split()
     inputs = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True, padding=True)
     outputs = model(**inputs).logits
@@ -32,7 +39,7 @@ def get_entities(text):
     for idx, word_idx in enumerate(word_ids):
         if word_idx is None:
             continue
-        label = LABELS[predictions[idx]]
         word = words[word_idx]
         if label.startswith("B-"):
@@ -53,21 +60,22 @@ def get_entities(text):
     return entities
 # ============================
-# 3. Форматирование на казахском
 # ============================
-def format_entities(text):
-    entities = get_entities(text)
     if not entities:
-        return "Атаулар табылған жоқ."
     output = {"PER": [], "ORG": [], "LOC": []}
     for word, label in entities:
-        # Метки CoNLL: PER, ORG, LOC/GPE
-        if label in ["PER"]:
             output["PER"].append(word)
-        elif label in ["ORG"]:
             output["ORG"].append(word)
-        elif label in ["LOC", "GPE"]:
             output["LOC"].append(word)
     result = ""
@@ -77,14 +85,18 @@ def format_entities(text):
     return result.strip()
 # ============================
-# 4. Gradio интерфейс
 # ============================
 iface = gr.Interface(
     fn=format_entities,
-    inputs=gr.Textbox(lines=15, placeholder="Введите текст на русском..."),
-    outputs=gr.Textbox(label="Анықталған атаулар (қазақша)"),
-    title="NER для русского текста (метки на казахском)",
-    description="PER – адам, ORG – ұйым, LOC – орын. Несколько предложений обрабатываются сразу."
 )
 iface.launch()

 import numpy as np
 # ============================
+# Модели
 # ============================
+MODEL_DICT = {
+    "Russian": "DeepPavlov/rubert-base-cased",
+    "English": "dbmdz/bert-large-cased-finetuned-conll03-english"
+}
+# ============================
+# Загрузка модели
+# ============================
+def load_model(model_name):
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForTokenClassification.from_pretrained(model_name)
+    labels = model.config.id2label
+    return tokenizer, model, labels
 # ============================
+# NER функция
 # ============================
+def get_entities(text, tokenizer, model, labels):
     words = text.split()
     inputs = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True, padding=True)
     outputs = model(**inputs).logits
     for idx, word_idx in enumerate(word_ids):
         if word_idx is None:
             continue
+        label = labels[predictions[idx]]
         word = words[word_idx]
         if label.startswith("B-"):
     return entities
 # ============================
+# Форматирование результата
 # ============================
+def format_entities(text, model_choice):
+    tokenizer, model, labels = load_model(MODEL_DICT[model_choice])
+    entities = get_entities(text, tokenizer, model, labels)
     if not entities:
+        return "No entities found."
     output = {"PER": [], "ORG": [], "LOC": []}
     for word, label in entities:
+        # Стандартные метки PER/ORG/LOC
+        if label in ["PER", "PERSON"]:
             output["PER"].append(word)
+        elif label in ["ORG", "ORGANIZATION"]:
             output["ORG"].append(word)
+        elif label in ["LOC", "GPE", "LOCATION"]:
             output["LOC"].append(word)
     result = ""
     return result.strip()
 # ============================
+# Gradio интерфейс
 # ============================
 iface = gr.Interface(
     fn=format_entities,
+    inputs=[
+        gr.Textbox(lines=15, placeholder="Введите текст здесь..."),
+        gr.Dropdown(choices=list(MODEL_DICT.keys()), label="Выберите модель")
+    ],
+    outputs=gr.Textbox(label="Recognized entities (PER/ORG/LOC)"),
+    title="NER для текста",
+    description="PER – person, ORG – organization, LOC – location. Можно вводить несколько предложений."
 )
 iface.launch()