Spaces:

student2222333051
/

project1

Sleeping

App Files Files Community

Asanaly committed on Nov 26, 2025

Commit

2d0991b

verified ·

1 Parent(s): 1a00b01

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -47

app.py CHANGED Viewed

@@ -1,28 +1,29 @@
 # app.py
 import gradio as gr
 import torch
 import numpy as np
-from transformers import AutoTokenizer, AutoModelForTokenClassification
 # ============================
-# 1. Модель мен токенизатор
 # ============================
-model_checkpoint = "bert-base-multilingual-cased"
-tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
-model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=7)
-# NER үшін label тізімі
-label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
 # ============================
-# 2. NER функциясы (B/I тегтерін біріктіру)
 # ============================
-def predict_ner_entities(text):
-    tokens = tokenizer(text, return_tensors="pt", is_split_into_words=False)
-    outputs = model(**tokens).logits
-    predictions = np.argmax(outputs.detach().numpy(), axis=2)[0]
-    word_ids = tokens.word_ids(batch_index=0)
     entities = []
     current_entity = []
@@ -31,15 +32,8 @@ def predict_ner_entities(text):
     for idx, word_idx in enumerate(word_ids):
         if word_idx is None:
             continue
-        label = label_list[predictions[idx]]
-        word = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0][idx])
-        # Токендерді біріктіру
-        if word.startswith("##"):
-            word = word[2:]
-            if current_entity:
-                current_entity[-1] += word
-            continue
         if label.startswith("B-"):
             if current_entity:
@@ -53,47 +47,44 @@ def predict_ner_entities(text):
                 entities.append((" ".join(current_entity), current_label))
                 current_entity = []
                 current_label = None
     if current_entity:
         entities.append((" ".join(current_entity), current_label))
     return entities
 # ============================
-# 3. Форматтау – қарапайым текстпен шығару
 # ============================
-def format_ner(text):
-    entities = predict_ner_entities(text)
     if not entities:
-        return "Атаулар табылған жоқ"
-    output_dict = {"PER": [], "ORG": [], "LOC": []}
     for word, label in entities:
-        if label == "PER":
-            output_dict["PER"].append(word)
-        elif label == "ORG":
-            output_dict["ORG"].append(word)
-        elif label == "LOC":
-            output_dict["LOC"].append(word)
-    output_text = ""
-    for key, words in output_dict.items():
         if words:
-            output_text += f"{key}: {'; '.join(words)}\n"
-    return output_text.strip()
 # ============================
-# 4. Gradio интерфейсі
 # ============================
 iface = gr.Interface(
-    fn=format_ner,
-    inputs=gr.Textbox(lines=15, placeholder="Қазақ мәтінін осында енгізіңіз..."),
-    outputs=gr.Textbox(label="Анықталған атаулар"),
-    title="Қазақ тіліндегі NER",
-    description="PER – адам, ORG – ұйым, LOC – орын. Бірнеше сөйлемді бірден өңдейді."
 )
-# ============================
-# 5. Іске қосу
-# ============================
 iface.launch()

 # app.py
 import gradio as gr
+from transformers import AutoTokenizer, AutoModelForTokenClassification
 import torch
 import numpy as np
 # ============================
+# 1. Модель и токенизатор
 # ============================
+MODEL_NAME = "dbmdz/bert-large-cased-finetuned-conll03-english"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
+# Метки CoNLL
+LABELS = model.config.id2label  # 0: O, 1: B-MISC, 2: I-MISC, 3: B-PER, ...
 # ============================
+# 2. NER функция
 # ============================
+def get_entities(text):
+    words = text.split()
+    inputs = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True, padding=True)
+    outputs = model(**inputs).logits
+    predictions = torch.argmax(outputs, dim=2).numpy()[0]
+    word_ids = inputs.word_ids(batch_index=0)
     entities = []
     current_entity = []
     for idx, word_idx in enumerate(word_ids):
         if word_idx is None:
             continue
+        label = LABELS[predictions[idx]]
+        word = words[word_idx]
         if label.startswith("B-"):
             if current_entity:
                 entities.append((" ".join(current_entity), current_label))
                 current_entity = []
                 current_label = None
     if current_entity:
         entities.append((" ".join(current_entity), current_label))
     return entities
 # ============================
+# 3. Форматирование на казахском
 # ============================
+def format_entities(text):
+    entities = get_entities(text)
     if not entities:
+        return "Атаулар табылған жоқ."
+    output = {"PER": [], "ORG": [], "LOC": []}
     for word, label in entities:
+        # Метки CoNLL: PER, ORG, LOC/GPE
+        if label in ["PER"]:
+            output["PER"].append(word)
+        elif label in ["ORG"]:
+            output["ORG"].append(word)
+        elif label in ["LOC", "GPE"]:
+            output["LOC"].append(word)
+    result = ""
+    for key, words in output.items():
         if words:
+            result += f"{key}: {'; '.join(words)}\n"
+    return result.strip()
 # ============================
+# 4. Gradio интерфейс
 # ============================
 iface = gr.Interface(
+    fn=format_entities,
+    inputs=gr.Textbox(lines=15, placeholder="Введите текст на русском..."),
+    outputs=gr.Textbox(label="Анықталған атаулар (қазақша)"),
+    title="NER для русского текста (метки на казахском)",
+    description="PER – адам, ORG – ұйым, LOC – орын. Несколько предложений обрабатываются сразу."
 )
 iface.launch()