Spaces:

student2222333051
/

project1

Sleeping

App Files Files Community

Asanaly committed on Nov 26, 2025

Commit

70a40ec

verified ·

1 Parent(s): db3da85

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -21

app.py CHANGED Viewed

@@ -16,33 +16,48 @@ model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_la
 label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
 # ============================
-# 2. NER функциясы
 # ============================
def predict_ner(text):
    """Tag the whitespace-separated words of ``text`` and keep the named ones.

    The text is split on whitespace, run through the module-level
    ``tokenizer`` and ``model``, and every word whose predicted label is not
    ``"O"`` is reported once.

    Args:
        text: Raw input string.

    Returns:
        A list of ``(word, label)`` tuples in order of appearance, where
        ``label`` is an entry of ``label_list`` other than ``"O"``.
    """
    words = text.split()
    encoding = tokenizer(words, return_tensors="pt", is_split_into_words=True)
    logits = model(**encoding).logits
    label_ids = np.argmax(logits.detach().numpy(), axis=2)[0]

    tagged = []
    visited = set()
    for position, word_index in enumerate(encoding.word_ids(batch_index=0)):
        # Entries without a word index are skipped, and each word is
        # labelled only from the first token that maps to it.
        if word_index is None or word_index in visited:
            continue
        visited.add(word_index)
        tag = label_list[label_ids[position]]
        if tag != "O":
            tagged.append((words[word_index], tag))
    return tagged
 # ============================
-# 3. Атауларды текстпен шығару
 # ============================
 def format_ner(text):
-    entities = predict_ner(text)
     if not entities:
         return "Атаулар табылған жоқ"
     output_dict = {"PER": [], "ORG": [], "LOC": []}
     for word, label in entities:
         if label.endswith("PER"):
@@ -51,12 +66,11 @@ def format_ner(text):
             output_dict["ORG"].append(word)
         elif label.endswith("LOC"):
             output_dict["LOC"].append(word)
-    # Бір қатарға қосып шығару
     output_text = ""
     for key, words in output_dict.items():
         if words:
-            output_text += f"{key}: {' '.join(words)}\n"
     return output_text.strip()
 # ============================
@@ -64,14 +78,13 @@ def format_ner(text):
 # ============================
 # Build the input and output widgets first so the Interface call stays short.
 _input_box = gr.Textbox(lines=10, placeholder="Қазақ мәтінін осында енгізіңіз...")
 _output_box = gr.Textbox(label="Анықталған атаулар")
 iface = gr.Interface(
     fn=format_ner,
     inputs=_input_box,
     outputs=_output_box,
     title="Қазақ тіліндегі NER",
     description="PER – адам, ORG – ұйым, LOC – орын. Атаулар қарапайым текстпен шығарылады.",
 )
 # ============================
 # 5. Launch
 # ============================
 iface.launch()

 label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
 # ============================
+# 2. NER функциясы (бірнеше сөзді біріктіру)
 # ============================
def predict_ner_entities(text):
    """Tag ``text`` and merge consecutive B-/I- labelled words into entities.

    The text is split on whitespace and run through the module-level
    ``tokenizer`` and ``model``. A ``B-X`` label opens a new entity of type
    ``X``; a following ``I-X`` label of the same type extends it; any other
    label (``"O"`` or an ``I-`` tag that does not match the open entity)
    closes the open entity.

    Args:
        text: Raw input string.

    Returns:
        A list of ``(entity_text, entity_type)`` tuples in order of
        appearance. ``entity_text`` is the entity's words joined by single
        spaces and ``entity_type`` is the label without its ``B-``/``I-``
        prefix. Empty or whitespace-only input yields ``[]``.
    """
    # Split once: the same word list feeds the tokenizer and the lookups below.
    words = text.split()
    if not words:
        # Nothing to tag, so skip the tokenizer/model round trip entirely.
        return []

    tokens = tokenizer(words, return_tensors="pt", is_split_into_words=True)
    outputs = model(**tokens).logits
    predictions = np.argmax(outputs.detach().numpy(), axis=2)[0]
    word_ids = tokens.word_ids(batch_index=0)

    entities = []
    current_entity = []
    current_label = None
    seen_words = set()
    for idx, word_idx in enumerate(word_ids):
        # word_ids has one entry per token. A word that the tokenizer splits
        # into several tokens would otherwise be appended once per token, so
        # each word is labelled only from the first token that maps to it.
        if word_idx is None or word_idx in seen_words:
            continue
        seen_words.add(word_idx)

        label = label_list[predictions[idx]]
        word = words[word_idx]
        if label.startswith("B-"):
            if current_entity:
                entities.append((" ".join(current_entity), current_label))
            current_entity = [word]
            current_label = label[2:]
        elif label.startswith("I-") and current_label == label[2:]:
            current_entity.append(word)
        elif current_entity:
            # "O" or a non-matching I- tag ends the entity being collected.
            entities.append((" ".join(current_entity), current_label))
            current_entity = []
            current_label = None

    # Flush an entity that runs to the end of the text.
    if current_entity:
        entities.append((" ".join(current_entity), current_label))
    return entities
 # ============================
+# 3. Форматтау – қарапайым текстпен шығару
 # ============================
 def format_ner(text):
     """Return the entities found in ``text`` as plain text grouped by type.
 
     Entities come from ``predict_ner_entities`` and are grouped under
     ``PER``, ``ORG`` and ``LOC``. Each non-empty group becomes one line of
     the form ``KEY: entity1; entity2``.
 
     Args:
         text: Raw input string.
 
     Returns:
         The grouped lines joined by newlines, or the fixed
         "no entities found" message when nothing was detected.
     """
     entities = predict_ner_entities(text)
     if not entities:
         return "Атаулар табылған жоқ"
     output_dict = {"PER": [], "ORG": [], "LOC": []}
     for word, label in entities:
         # Match on the suffix so bare ("PER") and prefixed ("B-PER") labels
         # both land in the group named after their own type.
         for key, bucket in output_dict.items():
             if label.endswith(key):
                 bucket.append(word)
                 break
     lines = [
         f"{key}: {'; '.join(words)}"
         for key, words in output_dict.items()
         if words
     ]
     return "\n".join(lines)
 # ============================
 # ============================
 # Build the input and output widgets first so the Interface call stays short.
 _input_box = gr.Textbox(lines=15, placeholder="Қазақ мәтінін осында енгізіңіз...")
 _output_box = gr.Textbox(label="Анықталған атаулар")
 iface = gr.Interface(
     fn=format_ner,
     inputs=_input_box,
     outputs=_output_box,
     title="Қазақ тіліндегі NER",
     description="PER – адам, ORG – ұйым, LOC – орын. Бірнеше сөйлемді бірден өңдейді.",
 )
 # ============================
 # 5. Launch
 # ============================
 iface.launch()