Asanaly committed on
Commit
3065429
·
verified ·
1 Parent(s): b3123ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -40
app.py CHANGED
@@ -1,65 +1,73 @@
1
  import gradio as gr
2
- from transformers import pipeline
3
  from langdetect import detect
4
 
5
  # ============================
6
- # Публичные модели
7
  # ============================
8
- MODELS = {
9
- "Russian": "blinoff/bert-base-russian-ner",
10
- "English": "dbmdz/bert-large-cased-finetuned-conll03-english"
11
- }
 
 
12
 
13
- # Создаем пайплайны NER заранее
14
- ner_pipelines = {
15
- "Russian": pipeline("ner", model=MODELS["Russian"], tokenizer=MODELS["Russian"], aggregation_strategy="simple"),
16
- "English": pipeline("ner", model=MODELS["English"], tokenizer=MODELS["English"], aggregation_strategy="simple")
17
- }
 
 
 
 
 
18
 
19
  # ============================
20
- # Функция распознавания
21
  # ============================
22
- def auto_ner(text):
 
23
  try:
24
  lang = detect(text)
25
  except:
26
- lang = "ru" # По умолчанию русский
 
 
27
 
28
  if lang == "en":
29
- model_choice = "English"
 
 
 
 
 
 
 
30
  else:
31
- model_choice = "Russian"
32
-
33
- ner = ner_pipelines[model_choice]
34
- results = ner(text)
35
-
36
- if not results:
37
- return "Сущности не найдены."
38
-
39
- output = {"PER": [], "ORG": [], "LOC": []}
40
- for item in results:
41
- entity = item['word']
42
- label = item['entity_group']
43
 
44
- if label == "PER":
45
- output["PER"].append(entity)
46
- elif label == "ORG":
47
- output["ORG"].append(entity)
48
- elif label == "LOC" or label == "GPE":
49
- output["LOC"].append(entity)
 
50
 
51
- result_text = ""
52
- for key, values in output.items():
53
- if values:
54
- result_text += f"{key}: {'; '.join(list(dict.fromkeys(values)))}\n"
55
- return result_text.strip()
56
 
57
  # ============================
58
  # Gradio интерфейс
59
  # ============================
60
  iface = gr.Interface(
61
- fn=auto_ner,
62
- inputs=gr.Textbox(lines=15, placeholder="Введите текст здесь (русский или английский, несколько предложений)..."),
63
  outputs=gr.Textbox(label="Распознанные сущности (PER/ORG/LOC)"),
64
  title="Автоматический NER для русского и английского текста",
65
  description="PER – человек, ORG – организация, LOC – место. Текст любого языка обрабатывается автоматически."
 
1
  import gradio as gr
 
2
  from langdetect import detect
3
 
4
  # ============================
5
+ # Natasha для русского
6
  # ============================
7
+ from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsNERTagger, Doc
8
+
9
+ segmenter = Segmenter()
10
+ morph_vocab = MorphVocab()
11
+ embedding = NewsEmbedding()
12
+ ner_tagger = NewsNERTagger(embedding)
13
 
14
+ # ============================
15
+ # HuggingFace для английского
16
+ # ============================
17
+ from transformers import pipeline
18
+ english_ner = pipeline(
19
+ "ner",
20
+ model="dbmdz/bert-large-cased-finetuned-conll03-english",
21
+ tokenizer="dbmdz/bert-large-cased-finetuned-conll03-english",
22
+ aggregation_strategy="simple"
23
+ )
24
 
25
  # ============================
26
+ # Функция распознавания сущностей
27
  # ============================
28
+ def recognize_entities_auto(text):
29
+ # Определяем язык
30
  try:
31
  lang = detect(text)
32
  except:
33
+ lang = "ru"
34
+
35
+ entities = {"PER": [], "ORG": [], "LOC": []}
36
 
37
  if lang == "en":
38
+ results = english_ner(text)
39
+ for res in results:
40
+ label = res['entity_group']
41
+ word = res['word']
42
+ if label in ["PER", "ORG", "LOC", "GPE"]:
43
+ if label == "GPE":
44
+ label = "LOC"
45
+ entities[label].append(word)
46
  else:
47
+ doc = Doc(text)
48
+ doc.segment(segmenter)
49
+ doc.tag_ner(ner_tagger)
50
+ for span in doc.spans:
51
+ label = span.type
52
+ if label in ["PER", "ORG", "LOC"]:
53
+ entities[label].append(span.text)
 
 
 
 
 
54
 
55
+ # Формируем текстовый вывод
56
+ output_text = ""
57
+ for key, items in entities.items():
58
+ if items:
59
+ # Убираем дубликаты
60
+ unique_items = list(dict.fromkeys(items))
61
+ output_text += f"{key}: {'; '.join(unique_items)}\n"
62
 
63
+ return output_text.strip() if output_text else "Сущности не найдены."
 
 
 
 
64
 
65
  # ============================
66
  # Gradio интерфейс
67
  # ============================
68
  iface = gr.Interface(
69
+ fn=recognize_entities_auto,
70
+ inputs=gr.Textbox(lines=15, placeholder="Введите русский или английский текст здесь..."),
71
  outputs=gr.Textbox(label="Распознанные сущности (PER/ORG/LOC)"),
72
  title="Автоматический NER для русского и английского текста",
73
  description="PER – человек, ORG – организация, LOC – место. Текст любого языка обрабатывается автоматически."