Asanaly committed on
Commit
1a00b01
·
verified ·
1 Parent(s): 70a40ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -7
app.py CHANGED
@@ -12,14 +12,14 @@ model_checkpoint = "bert-base-multilingual-cased"
12
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
13
  model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=7)
14
 
15
- # Қазақ NER үшін label тізімі
16
  label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
17
 
18
  # ============================
19
- # 2. NER функциясы (бірнеше сөзді біріктіру)
20
  # ============================
21
  def predict_ner_entities(text):
22
- tokens = tokenizer(text.split(), return_tensors="pt", is_split_into_words=True)
23
  outputs = model(**tokens).logits
24
  predictions = np.argmax(outputs.detach().numpy(), axis=2)[0]
25
  word_ids = tokens.word_ids(batch_index=0)
@@ -32,7 +32,14 @@ def predict_ner_entities(text):
32
  if word_idx is None:
33
  continue
34
  label = label_list[predictions[idx]]
35
- word = text.split()[word_idx]
 
 
 
 
 
 
 
36
 
37
  if label.startswith("B-"):
38
  if current_entity:
@@ -46,8 +53,10 @@ def predict_ner_entities(text):
46
  entities.append((" ".join(current_entity), current_label))
47
  current_entity = []
48
  current_label = None
 
49
  if current_entity:
50
  entities.append((" ".join(current_entity), current_label))
 
51
  return entities
52
 
53
  # ============================
@@ -60,11 +69,11 @@ def format_ner(text):
60
 
61
  output_dict = {"PER": [], "ORG": [], "LOC": []}
62
  for word, label in entities:
63
- if label.endswith("PER"):
64
  output_dict["PER"].append(word)
65
- elif label.endswith("ORG"):
66
  output_dict["ORG"].append(word)
67
- elif label.endswith("LOC"):
68
  output_dict["LOC"].append(word)
69
 
70
  output_text = ""
 
12
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
13
  model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=7)
14
 
15
+ # NER үшін label тізімі
16
  label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
17
 
18
  # ============================
19
+ # 2. NER функциясы (B/I тегтерін біріктіру)
20
  # ============================
21
  def predict_ner_entities(text):
22
+ tokens = tokenizer(text, return_tensors="pt", is_split_into_words=False)
23
  outputs = model(**tokens).logits
24
  predictions = np.argmax(outputs.detach().numpy(), axis=2)[0]
25
  word_ids = tokens.word_ids(batch_index=0)
 
32
  if word_idx is None:
33
  continue
34
  label = label_list[predictions[idx]]
35
+ word = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0][idx])
36
+
37
+ # Токендерді біріктіру
38
+ if word.startswith("##"):
39
+ word = word[2:]
40
+ if current_entity:
41
+ current_entity[-1] += word
42
+ continue
43
 
44
  if label.startswith("B-"):
45
  if current_entity:
 
53
  entities.append((" ".join(current_entity), current_label))
54
  current_entity = []
55
  current_label = None
56
+
57
  if current_entity:
58
  entities.append((" ".join(current_entity), current_label))
59
+
60
  return entities
61
 
62
  # ============================
 
69
 
70
  output_dict = {"PER": [], "ORG": [], "LOC": []}
71
  for word, label in entities:
72
+ if label == "PER":
73
  output_dict["PER"].append(word)
74
+ elif label == "ORG":
75
  output_dict["ORG"].append(word)
76
+ elif label == "LOC":
77
  output_dict["LOC"].append(word)
78
 
79
  output_text = ""