Asanaly committed on
Commit
f79fc96
·
verified ·
1 Parent(s): 2d0991b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -22
app.py CHANGED
@@ -6,19 +6,26 @@ import torch
6
  import numpy as np
7
 
8
  # ============================
9
- # 1. Модель и токенизатор
10
  # ============================
11
- MODEL_NAME = "dbmdz/bert-large-cased-finetuned-conll03-english"
12
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
13
- model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
 
14
 
15
- # Метки CoNLL
16
- LABELS = model.config.id2label # 0: O, 1: B-MISC, 2: I-MISC, 3: B-PER, ...
 
 
 
 
 
 
17
 
18
  # ============================
19
- # 2. NER функция
20
  # ============================
21
- def get_entities(text):
22
  words = text.split()
23
  inputs = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True, padding=True)
24
  outputs = model(**inputs).logits
@@ -32,7 +39,7 @@ def get_entities(text):
32
  for idx, word_idx in enumerate(word_ids):
33
  if word_idx is None:
34
  continue
35
- label = LABELS[predictions[idx]]
36
  word = words[word_idx]
37
 
38
  if label.startswith("B-"):
@@ -53,21 +60,22 @@ def get_entities(text):
53
  return entities
54
 
55
  # ============================
56
- # 3. Форматирование на казахском
57
  # ============================
58
- def format_entities(text):
59
- entities = get_entities(text)
 
60
  if not entities:
61
- return "Атаулар табылған жоқ."
62
 
63
  output = {"PER": [], "ORG": [], "LOC": []}
64
  for word, label in entities:
65
- # Метки CoNLL: PER, ORG, LOC/GPE
66
- if label in ["PER"]:
67
  output["PER"].append(word)
68
- elif label in ["ORG"]:
69
  output["ORG"].append(word)
70
- elif label in ["LOC", "GPE"]:
71
  output["LOC"].append(word)
72
 
73
  result = ""
@@ -77,14 +85,18 @@ def format_entities(text):
77
  return result.strip()
78
 
79
  # ============================
80
- # 4. Gradio интерфейс
81
  # ============================
82
  iface = gr.Interface(
83
  fn=format_entities,
84
- inputs=gr.Textbox(lines=15, placeholder="Введите текст на русском..."),
85
- outputs=gr.Textbox(label="Анықталған атаулар (қазақша)"),
86
- title="NER для русского текста (метки на казахском)",
87
- description="PER – адам, ORG – ұйым, LOC – орын. Несколько предложений обрабатываются сразу."
 
 
 
88
  )
89
 
90
  iface.launch()
 
 
6
  import numpy as np
7
 
8
  # ============================
9
+ # Модели
10
  # ============================
11
+ MODEL_DICT = {
12
+ "Russian": "DeepPavlov/rubert-base-cased",
13
+ "English": "dbmdz/bert-large-cased-finetuned-conll03-english"
14
+ }
15
 
16
+ # ============================
17
+ # Загрузка модели
18
+ # ============================
19
+ def load_model(model_name):
20
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
21
+ model = AutoModelForTokenClassification.from_pretrained(model_name)
22
+ labels = model.config.id2label
23
+ return tokenizer, model, labels
24
 
25
  # ============================
26
+ # NER функция
27
  # ============================
28
+ def get_entities(text, tokenizer, model, labels):
29
  words = text.split()
30
  inputs = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True, padding=True)
31
  outputs = model(**inputs).logits
 
39
  for idx, word_idx in enumerate(word_ids):
40
  if word_idx is None:
41
  continue
42
+ label = labels[predictions[idx]]
43
  word = words[word_idx]
44
 
45
  if label.startswith("B-"):
 
60
  return entities
61
 
62
  # ============================
63
+ # Форматирование результата
64
  # ============================
65
+ def format_entities(text, model_choice):
66
+ tokenizer, model, labels = load_model(MODEL_DICT[model_choice])
67
+ entities = get_entities(text, tokenizer, model, labels)
68
  if not entities:
69
+ return "No entities found."
70
 
71
  output = {"PER": [], "ORG": [], "LOC": []}
72
  for word, label in entities:
73
+ # Стандартные метки PER/ORG/LOC
74
+ if label in ["PER", "PERSON"]:
75
  output["PER"].append(word)
76
+ elif label in ["ORG", "ORGANIZATION"]:
77
  output["ORG"].append(word)
78
+ elif label in ["LOC", "GPE", "LOCATION"]:
79
  output["LOC"].append(word)
80
 
81
  result = ""
 
85
  return result.strip()
86
 
87
  # ============================
88
+ # Gradio интерфейс
89
  # ============================
90
  iface = gr.Interface(
91
  fn=format_entities,
92
+ inputs=[
93
+ gr.Textbox(lines=15, placeholder="Введите текст здесь..."),
94
+ gr.Dropdown(choices=list(MODEL_DICT.keys()), label="Выберите модель")
95
+ ],
96
+ outputs=gr.Textbox(label="Recognized entities (PER/ORG/LOC)"),
97
+ title="NER для текста",
98
+ description="PER – person, ORG – organization, LOC – location. Можно вводить несколько предложений."
99
  )
100
 
101
  iface.launch()
102
+