Spaces:
Sleeping
Sleeping
Asanaly
committed on
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,14 +12,14 @@ model_checkpoint = "bert-base-multilingual-cased"
|
|
| 12 |
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
|
| 13 |
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=7)
|
| 14 |
|
| 15 |
-
#
|
| 16 |
label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
|
| 17 |
|
| 18 |
# ============================
|
| 19 |
-
# 2. NER функциясы (
|
| 20 |
# ============================
|
| 21 |
def predict_ner_entities(text):
|
| 22 |
-
tokens = tokenizer(text
|
| 23 |
outputs = model(**tokens).logits
|
| 24 |
predictions = np.argmax(outputs.detach().numpy(), axis=2)[0]
|
| 25 |
word_ids = tokens.word_ids(batch_index=0)
|
|
@@ -32,7 +32,14 @@ def predict_ner_entities(text):
|
|
| 32 |
if word_idx is None:
|
| 33 |
continue
|
| 34 |
label = label_list[predictions[idx]]
|
| 35 |
-
word =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
if label.startswith("B-"):
|
| 38 |
if current_entity:
|
|
@@ -46,8 +53,10 @@ def predict_ner_entities(text):
|
|
| 46 |
entities.append((" ".join(current_entity), current_label))
|
| 47 |
current_entity = []
|
| 48 |
current_label = None
|
|
|
|
| 49 |
if current_entity:
|
| 50 |
entities.append((" ".join(current_entity), current_label))
|
|
|
|
| 51 |
return entities
|
| 52 |
|
| 53 |
# ============================
|
|
@@ -60,11 +69,11 @@ def format_ner(text):
|
|
| 60 |
|
| 61 |
output_dict = {"PER": [], "ORG": [], "LOC": []}
|
| 62 |
for word, label in entities:
|
| 63 |
-
if label
|
| 64 |
output_dict["PER"].append(word)
|
| 65 |
-
elif label
|
| 66 |
output_dict["ORG"].append(word)
|
| 67 |
-
elif label
|
| 68 |
output_dict["LOC"].append(word)
|
| 69 |
|
| 70 |
output_text = ""
|
|
|
|
| 12 |
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
|
| 13 |
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=7)
|
| 14 |
|
| 15 |
+
# NER үшін label тізімі
|
| 16 |
label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
|
| 17 |
|
| 18 |
# ============================
|
| 19 |
+
# 2. NER функциясы (B/I тегтерін біріктіру)
|
| 20 |
# ============================
|
| 21 |
def predict_ner_entities(text):
|
| 22 |
+
tokens = tokenizer(text, return_tensors="pt", is_split_into_words=False)
|
| 23 |
outputs = model(**tokens).logits
|
| 24 |
predictions = np.argmax(outputs.detach().numpy(), axis=2)[0]
|
| 25 |
word_ids = tokens.word_ids(batch_index=0)
|
|
|
|
| 32 |
if word_idx is None:
|
| 33 |
continue
|
| 34 |
label = label_list[predictions[idx]]
|
| 35 |
+
word = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0][idx])
|
| 36 |
+
|
| 37 |
+
# Токендерді біріктіру
|
| 38 |
+
if word.startswith("##"):
|
| 39 |
+
word = word[2:]
|
| 40 |
+
if current_entity:
|
| 41 |
+
current_entity[-1] += word
|
| 42 |
+
continue
|
| 43 |
|
| 44 |
if label.startswith("B-"):
|
| 45 |
if current_entity:
|
|
|
|
| 53 |
entities.append((" ".join(current_entity), current_label))
|
| 54 |
current_entity = []
|
| 55 |
current_label = None
|
| 56 |
+
|
| 57 |
if current_entity:
|
| 58 |
entities.append((" ".join(current_entity), current_label))
|
| 59 |
+
|
| 60 |
return entities
|
| 61 |
|
| 62 |
# ============================
|
|
|
|
| 69 |
|
| 70 |
output_dict = {"PER": [], "ORG": [], "LOC": []}
|
| 71 |
for word, label in entities:
|
| 72 |
+
if label == "PER":
|
| 73 |
output_dict["PER"].append(word)
|
| 74 |
+
elif label == "ORG":
|
| 75 |
output_dict["ORG"].append(word)
|
| 76 |
+
elif label == "LOC":
|
| 77 |
output_dict["LOC"].append(word)
|
| 78 |
|
| 79 |
output_text = ""
|