Asanaly committed on
Commit
65e51b4
·
verified ·
1 Parent(s): f79fc96

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -77
app.py CHANGED
@@ -1,102 +1,69 @@
1
- # app.py
2
-
3
  import gradio as gr
4
- from transformers import AutoTokenizer, AutoModelForTokenClassification
5
- import torch
6
- import numpy as np
7
 
8
  # ============================
9
  # Модели
10
  # ============================
11
# Maps the language name shown in the UI to the checkpoint id that is passed
# to ``from_pretrained``.
# NOTE(review): the Russian entry looks like a general-purpose encoder rather
# than a checkpoint fine-tuned for NER -- confirm it ships a trained
# token-classification head.
MODEL_DICT = dict(
    Russian="DeepPavlov/rubert-base-cased",
    English="dbmdz/bert-large-cased-finetuned-conll03-english",
)
15
 
16
- # ============================
17
- # Загрузка модели
18
- # ============================
19
# Checkpoints that have already been loaded, keyed by model name, so that a
# request does not reload the tokenizer and weights every time.
_LOADED_MODELS = {}


def load_model(model_name):
    """Return the tokenizer, model and label map for *model_name*.

    The first call for a given name loads the tokenizer and the
    token-classification model via ``from_pretrained``; later calls return
    the same objects from an in-process cache.

    Args:
        model_name: Checkpoint identifier accepted by ``from_pretrained``.

    Returns:
        A ``(tokenizer, model, labels)`` tuple, where ``labels`` is the
        model config's ``id2label`` mapping.
    """
    cached = _LOADED_MODELS.get(model_name)
    if cached is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForTokenClassification.from_pretrained(model_name)
        cached = (tokenizer, model, model.config.id2label)
        _LOADED_MODELS[model_name] = cached
    return cached
24
 
25
  # ============================
26
- # NER функция
27
  # ============================
28
def get_entities(text, tokenizer, model, labels):
    """Extract named entities from *text* with a token-classification model.

    The text is split on whitespace and tokenized as pre-split words. Each
    word is tagged with the prediction of its first sub-token, and runs of
    consecutive words that share an entity type are merged into one entity.

    Args:
        text: Raw input string.
        tokenizer: Callable tokenizer that accepts a list of words with
            ``is_split_into_words=True`` and whose result can be unpacked
            into the model and exposes ``word_ids(batch_index=...)``.
        model: Callable returning an object with a ``logits`` tensor that is
            indexed as (batch, token, label).
        labels: Mapping from predicted class index to a tag such as
            ``"B-PER"``, ``"I-PER"`` or ``"O"``.

    Returns:
        A list of ``(entity_text, entity_type)`` tuples in order of
        appearance; empty when the text contains no words.
    """
    words = text.split()
    if not words:
        # Nothing to tag; avoid handing an empty batch to the tokenizer.
        return []

    inputs = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        logits = model(**inputs).logits
    predictions = torch.argmax(logits, dim=2)[0].tolist()
    word_ids = inputs.word_ids(batch_index=0)

    entities = []
    current_entity = []
    current_label = None
    previous_word_idx = None

    for idx, word_idx in enumerate(word_ids):
        # Skip special/padding positions and every sub-token after the first
        # one of a word; otherwise a word split into several pieces would be
        # appended once per piece.
        if word_idx is None or word_idx == previous_word_idx:
            continue
        previous_word_idx = word_idx

        label = labels[predictions[idx]]
        word = words[word_idx]

        if label.startswith("I-") and current_label == label[2:]:
            # Continuation of the entity that is currently open.
            current_entity.append(word)
        elif label.startswith(("B-", "I-")):
            # A B- tag, or an I- tag with no matching open entity, starts a
            # new entity instead of being dropped.
            if current_entity:
                entities.append((" ".join(current_entity), current_label))
            current_entity = [word]
            current_label = label[2:]
        else:
            if current_entity:
                entities.append((" ".join(current_entity), current_label))
            current_entity = []
            current_label = None

    if current_entity:
        entities.append((" ".join(current_entity), current_label))

    return entities
61
-
62
- # ============================
63
- # Форматирование результата
64
- # ============================
65
def format_entities(text, model_choice):
    """Run NER on *text* and format the PER/ORG/LOC entities for display.

    Args:
        text: Text entered by the user.
        model_choice: Key of ``MODEL_DICT`` selected in the UI; may be
            ``None`` when the dropdown was left empty.

    Returns:
        One line per non-empty category in the form ``"PER: a; b"``, or a
        short message when no model is selected or nothing was recognized.
    """
    model_name = MODEL_DICT.get(model_choice)
    if model_name is None:
        # The dropdown has no default value, so the first submit can arrive
        # without a selection; answer with a hint instead of a KeyError.
        return "Please select a model."

    tokenizer, model, labels = load_model(model_name)
    entities = get_entities(text, tokenizer, model, labels)

    # Different checkpoints name the same categories differently; fold the
    # known aliases into the three categories that are displayed.
    category_for = {
        "PER": "PER",
        "PERSON": "PER",
        "ORG": "ORG",
        "ORGANIZATION": "ORG",
        "LOC": "LOC",
        "GPE": "LOC",
        "LOCATION": "LOC",
    }

    output = {"PER": [], "ORG": [], "LOC": []}
    for word, label in entities:
        category = category_for.get(label)
        if category is not None:
            output[category].append(word)

    lines = [f"{key}: {'; '.join(words)}" for key, words in output.items() if words]
    if not lines:
        # Covers both "no entities at all" and "only unmapped labels", which
        # previously produced an empty string.
        return "No entities found."
    return "\n".join(lines)
 
 
 
 
 
 
 
 
86
 
87
  # ============================
88
  # Gradio интерфейс
89
  # ============================
90
# Build the UI: a multi-line text box and a model selector feed
# format_entities, whose string result is shown in a read-only text box.
_text_input = gr.Textbox(lines=15, placeholder="Введите текст здесь...")
_model_selector = gr.Dropdown(choices=list(MODEL_DICT.keys()), label="Выберите модель")
_entities_output = gr.Textbox(label="Recognized entities (PER/ORG/LOC)")

iface = gr.Interface(
    fn=format_entities,
    inputs=[_text_input, _model_selector],
    outputs=_entities_output,
    title="NER для текста",
    description="PER – person, ORG – organization, LOC – location. Можно вводить несколько предложений.",
)

# Start the web app.
iface.launch()
102
-
 
 
 
1
  import gradio as gr
2
+ from transformers import pipeline
3
+ from langdetect import detect
 
4
 
5
  # ============================
6
  # Модели
7
  # ============================
8
# Language name -> checkpoint id used for both the model and its tokenizer.
# NOTE(review): confirm that the Russian repo id resolves on the model hub.
MODELS = {
    "Russian": "DeepPavlov/ner_rubert",
    "English": "dbmdz/bert-large-cased-finetuned-conll03-english",
}

# Build one NER pipeline per language up front, at import time, so requests
# reuse them instead of constructing a pipeline per call.
ner_pipelines = {
    language: pipeline(
        "ner",
        model=checkpoint,
        tokenizer=checkpoint,
        aggregation_strategy="simple",
    )
    for language, checkpoint in MODELS.items()
}
 
 
 
18
 
19
  # ============================
20
+ # Функция распознавания
21
  # ============================
22
def auto_ner(text):
    """Detect the language of *text* and list its PER/ORG/LOC entities.

    English text is sent to the English pipeline; everything else, including
    text whose language cannot be detected, goes to the Russian pipeline.

    Args:
        text: Text entered by the user.

    Returns:
        One line per non-empty category in the form ``"PER: a; b"`` with
        duplicates removed (first occurrence kept), or
        ``"Сущности не найдены."`` when nothing displayable was recognized.
    """
    # Imported here so the specific exception type is available without
    # changing the module-level imports.
    from langdetect.lang_detect_exception import LangDetectException

    if not text or not text.strip():
        # Blank input: nothing to detect or tag, skip the model call.
        return "Сущности не найдены."

    try:
        lang = detect(text)
    except LangDetectException:
        # Detection fails on text without usable features (e.g. digits
        # only); fall back to Russian.
        lang = "ru"

    model_choice = "English" if lang == "en" else "Russian"
    results = ner_pipelines[model_choice](text)

    # Entity groups reported by the pipelines -> displayed category.
    category_for = {"PER": "PER", "ORG": "ORG", "LOC": "LOC", "GPE": "LOC"}

    output = {"PER": [], "ORG": [], "LOC": []}
    for item in results:
        category = category_for.get(item["entity_group"])
        if category is not None:
            output[category].append(item["word"])

    # dict.fromkeys drops duplicates while keeping first-seen order.
    lines = [
        f"{key}: {'; '.join(dict.fromkeys(values))}"
        for key, values in output.items()
        if values
    ]
    if not lines:
        # Either no entities at all, or only groups that are not displayed
        # (which previously produced an empty string).
        return "Сущности не найдены."
    return "\n".join(lines).strip()
57
 
58
  # ============================
59
  # Gradio интерфейс
60
  # ============================
61
# Build the UI: a single multi-line text box feeds auto_ner, whose string
# result is shown in a read-only text box.
_text_input = gr.Textbox(
    lines=15,
    placeholder="Введите текст здесь (русский или английский, можно несколько предложений)...",
)
_entities_output = gr.Textbox(label="Распознанные сущности (PER/ORG/LOC)")

iface = gr.Interface(
    fn=auto_ner,
    inputs=_text_input,
    outputs=_entities_output,
    title="Автоматический NER для русского и английского текста",
    description="PER – человек, ORG – организация, LOC – место. Текст любого языка обрабатывается автоматически.",
)

# Start the web app.
iface.launch()