Asanaly committed on
Commit
2d0991b
·
verified ·
1 Parent(s): 1a00b01

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -47
app.py CHANGED
@@ -1,28 +1,29 @@
1
  # app.py
2
 
3
  import gradio as gr
 
4
  import torch
5
  import numpy as np
6
- from transformers import AutoTokenizer, AutoModelForTokenClassification
7
 
8
  # ============================
9
- # 1. Модель мен токенизатор
10
  # ============================
11
- model_checkpoint = "bert-base-multilingual-cased"
12
- tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
13
- model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=7)
14
 
15
- # NER үшін label тізімі
16
- label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
17
 
18
  # ============================
19
- # 2. NER функциясы (B/I тегтерін біріктіру)
20
  # ============================
21
- def predict_ner_entities(text):
22
- tokens = tokenizer(text, return_tensors="pt", is_split_into_words=False)
23
- outputs = model(**tokens).logits
24
- predictions = np.argmax(outputs.detach().numpy(), axis=2)[0]
25
- word_ids = tokens.word_ids(batch_index=0)
 
26
 
27
  entities = []
28
  current_entity = []
@@ -31,15 +32,8 @@ def predict_ner_entities(text):
31
  for idx, word_idx in enumerate(word_ids):
32
  if word_idx is None:
33
  continue
34
- label = label_list[predictions[idx]]
35
- word = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0][idx])
36
-
37
- # Токендерді біріктіру
38
- if word.startswith("##"):
39
- word = word[2:]
40
- if current_entity:
41
- current_entity[-1] += word
42
- continue
43
 
44
  if label.startswith("B-"):
45
  if current_entity:
@@ -53,47 +47,44 @@ def predict_ner_entities(text):
53
  entities.append((" ".join(current_entity), current_label))
54
  current_entity = []
55
  current_label = None
56
-
57
  if current_entity:
58
  entities.append((" ".join(current_entity), current_label))
59
 
60
  return entities
61
 
62
  # ============================
63
- # 3. Форматтау қарапайым текстпен шығару
64
  # ============================
65
- def format_ner(text):
66
- entities = predict_ner_entities(text)
67
  if not entities:
68
- return "Атаулар табылған жоқ"
69
 
70
- output_dict = {"PER": [], "ORG": [], "LOC": []}
71
  for word, label in entities:
72
- if label == "PER":
73
- output_dict["PER"].append(word)
74
- elif label == "ORG":
75
- output_dict["ORG"].append(word)
76
- elif label == "LOC":
77
- output_dict["LOC"].append(word)
 
78
 
79
- output_text = ""
80
- for key, words in output_dict.items():
81
  if words:
82
- output_text += f"{key}: {'; '.join(words)}\n"
83
- return output_text.strip()
84
 
85
  # ============================
86
- # 4. Gradio интерфейсі
87
  # ============================
88
  iface = gr.Interface(
89
- fn=format_ner,
90
- inputs=gr.Textbox(lines=15, placeholder="Қазақ мәтінін осында енгізіңіз..."),
91
- outputs=gr.Textbox(label="Анықталған атаулар"),
92
- title="Қазақ тіліндегі NER",
93
- description="PER – адам, ORG – ұйым, LOC – орын. Бірнеше сөйлемді бірден өңдейді."
94
  )
95
 
96
- # ============================
97
- # 5. Іске қосу
98
- # ============================
99
  iface.launch()
 
1
  # app.py
2
 
3
  import gradio as gr
4
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
5
  import torch
6
  import numpy as np
 
7
 
8
  # ============================
9
+ # 1. Модель и токенизатор
10
  # ============================
11
+ MODEL_NAME = "dbmdz/bert-large-cased-finetuned-conll03-english"
12
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
13
+ model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
14
 
15
+ # Метки CoNLL
16
+ LABELS = model.config.id2label # 0: O, 1: B-MISC, 2: I-MISC, 3: B-PER, ...
17
 
18
  # ============================
19
+ # 2. NER функция
20
  # ============================
21
+ def get_entities(text):
22
+ words = text.split()
23
+ inputs = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True, padding=True)
24
+ outputs = model(**inputs).logits
25
+ predictions = torch.argmax(outputs, dim=2).numpy()[0]
26
+ word_ids = inputs.word_ids(batch_index=0)
27
 
28
  entities = []
29
  current_entity = []
 
32
  for idx, word_idx in enumerate(word_ids):
33
  if word_idx is None:
34
  continue
35
+ label = LABELS[predictions[idx]]
36
+ word = words[word_idx]
 
 
 
 
 
 
 
37
 
38
  if label.startswith("B-"):
39
  if current_entity:
 
47
  entities.append((" ".join(current_entity), current_label))
48
  current_entity = []
49
  current_label = None
 
50
  if current_entity:
51
  entities.append((" ".join(current_entity), current_label))
52
 
53
  return entities
54
 
55
  # ============================
56
+ # 3. Форматирование на казахском
57
  # ============================
58
+ def format_entities(text):
59
+ entities = get_entities(text)
60
  if not entities:
61
+ return "Атаулар табылған жоқ."
62
 
63
+ output = {"PER": [], "ORG": [], "LOC": []}
64
  for word, label in entities:
65
+ # Метки CoNLL: PER, ORG, LOC/GPE
66
+ if label in ["PER"]:
67
+ output["PER"].append(word)
68
+ elif label in ["ORG"]:
69
+ output["ORG"].append(word)
70
+ elif label in ["LOC", "GPE"]:
71
+ output["LOC"].append(word)
72
 
73
+ result = ""
74
+ for key, words in output.items():
75
  if words:
76
+ result += f"{key}: {'; '.join(words)}\n"
77
+ return result.strip()
78
 
79
  # ============================
80
+ # 4. Gradio интерфейс
81
  # ============================
82
  iface = gr.Interface(
83
+ fn=format_entities,
84
+ inputs=gr.Textbox(lines=15, placeholder="Введите текст на русском..."),
85
+ outputs=gr.Textbox(label="Анықталған атаулар (қазақша)"),
86
+ title="NER для русского текста (метки на казахском)",
87
+ description="PER – адам, ORG – ұйым, LOC – орын. Несколько предложений обрабатываются сразу."
88
  )
89
 
 
 
 
90
  iface.launch()