Spaces:

student2222333051
/

project1

Sleeping

App Files Files Community

Asanaly committed on Nov 26, 2025

Commit

1a00b01

verified ·

1 Parent(s): 70a40ec

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -7

app.py CHANGED Viewed

@@ -12,14 +12,14 @@ model_checkpoint = "bert-base-multilingual-cased"
 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
 model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=7)
-# Қазақ NER үшін label тізімі
 label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
 # ============================
-# 2. NER функциясы (бірнеше сөзді біріктіру)
 # ============================
 def predict_ner_entities(text):
-    tokens = tokenizer(text.split(), return_tensors="pt", is_split_into_words=True)
     outputs = model(**tokens).logits
     predictions = np.argmax(outputs.detach().numpy(), axis=2)[0]
     word_ids = tokens.word_ids(batch_index=0)
@@ -32,7 +32,14 @@ def predict_ner_entities(text):
         if word_idx is None:
             continue
         label = label_list[predictions[idx]]
-        word = text.split()[word_idx]
         if label.startswith("B-"):
             if current_entity:
@@ -46,8 +53,10 @@ def predict_ner_entities(text):
                 entities.append((" ".join(current_entity), current_label))
                 current_entity = []
                 current_label = None
     if current_entity:
         entities.append((" ".join(current_entity), current_label))
     return entities
 # ============================
@@ -60,11 +69,11 @@ def format_ner(text):
     output_dict = {"PER": [], "ORG": [], "LOC": []}
     for word, label in entities:
-        if label.endswith("PER"):
             output_dict["PER"].append(word)
-        elif label.endswith("ORG"):
             output_dict["ORG"].append(word)
-        elif label.endswith("LOC"):
             output_dict["LOC"].append(word)
     output_text = ""

 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
 model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=7)
+# NER үшін label тізімі
 label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
 # ============================
+# 2. NER функциясы (B/I тегтерін біріктіру)
 # ============================
 def predict_ner_entities(text):
+    tokens = tokenizer(text, return_tensors="pt", is_split_into_words=False)
     outputs = model(**tokens).logits
     predictions = np.argmax(outputs.detach().numpy(), axis=2)[0]
     word_ids = tokens.word_ids(batch_index=0)
         if word_idx is None:
             continue
         label = label_list[predictions[idx]]
+        word = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0][idx])
+        # Токендерді біріктіру
+        if word.startswith("##"):
+            word = word[2:]
+            if current_entity:
+                current_entity[-1] += word
+            continue
         if label.startswith("B-"):
             if current_entity:
                 entities.append((" ".join(current_entity), current_label))
                 current_entity = []
                 current_label = None
     if current_entity:
         entities.append((" ".join(current_entity), current_label))
     return entities
 # ============================
     output_dict = {"PER": [], "ORG": [], "LOC": []}
     for word, label in entities:
+        if label == "PER":
             output_dict["PER"].append(word)
+        elif label == "ORG":
             output_dict["ORG"].append(word)
+        elif label == "LOC":
             output_dict["LOC"].append(word)
     output_text = ""