mechtnet committed on
Commit
46a289d
·
verified ·
1 Parent(s): fc6671c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -16
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import gradio as gr
2
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
3
  import json
@@ -5,10 +6,14 @@ import os
5
  from sentence_transformers import SentenceTransformer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
  import numpy as np
 
 
 
 
8
 
9
  def analyze_text():
10
  try:
11
- # Читаем исходный текст
12
  file_path = os.path.join(os.getcwd(), "test.txt")
13
  if not os.path.exists(file_path):
14
  return "Ошибка: файл 'test.txt' не найден."
@@ -21,66 +26,132 @@ def analyze_text():
21
  # 2. Разбиение на блоки
22
  blocks = split_into_blocks(cleaned_text)
23
 
24
- # 3. Семантический анализ
25
  semantic_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
26
  embeddings = semantic_model.encode(blocks)
27
 
28
- # 4. Эмоциональный анализ
29
  model_name = "seara/rubert-tiny2-russian-emotion-detection-ru-go-emotions"
30
  tokenizer = AutoTokenizer.from_pretrained(model_name)
31
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
32
  emotion_classifier = pipeline("text-classification", tokenizer=tokenizer, model=model, return_all_scores=True)
33
- emotion_results = analyze_emotions(blocks, emotion_classifier)
 
 
 
34
 
35
- # 5. Перекрестный анализ
36
  results = cross_analysis(blocks, embeddings, emotion_results)
37
 
38
- # 6. Сохранение результатов
39
- save_results(results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- # Возвращаем результаты в интерфейс
42
- formatted_results = json.dumps(results, ensure_ascii=False, indent=2)
43
  return formatted_results
44
 
45
  except Exception as e:
46
  return f"Произошла ошибка: {str(e)}"
47
 
48
  def preprocess_text(text):
 
49
  lines = []
50
  for line in text.split('\n'):
51
  line = line.strip()
52
- # Удаляем метки в квадратных скобках, заменяя их пустой строкой
53
  if line.startswith('[') and line.endswith(']'):
54
- lines.append('') # Добавляем пустую строку для разделения блоков
55
  elif line and 'Название:' not in line and 'URL:' not in line and '===' not in line:
56
  lines.append(line)
57
  cleaned_text = '\n'.join(lines)
58
  return cleaned_text
59
 
60
  def split_into_blocks(text):
61
- blocks = text.strip().split('\n\n') # Разделяем по двум символам новой строки
 
62
  return blocks
63
 
64
- def analyze_emotions(blocks, emotion_classifier):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  results = []
66
  for block in blocks:
67
  predictions = emotion_classifier(block, truncation=True)[0]
68
  emotions = {}
69
  for pred in predictions:
70
  emotion_label = pred['label']
71
- emotions[emotion_label] = round(float(pred['score']), 3)
72
- # Определяем доминирующую эмоцию
73
  dominant_emotion = max(emotions, key=emotions.get)
74
  dominant_score = emotions[dominant_emotion]
 
 
 
 
75
  results.append({
76
  'text': block,
77
- 'dominant_emotion': dominant_emotion,
78
  'score': dominant_score,
79
  'emotions': emotions
80
  })
81
  return results
82
 
 
 
 
 
 
 
83
  def cross_analysis(blocks, embeddings, emotion_results):
 
84
  similarity_matrix = cosine_similarity(embeddings)
85
  for i, result in enumerate(emotion_results):
86
  similarities = similarity_matrix[i]
@@ -89,7 +160,40 @@ def cross_analysis(blocks, embeddings, emotion_results):
89
  result['similarity_scores'] = [round(float(similarities[j]), 3) for j in similar_indices]
90
  return emotion_results
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  def save_results(results, filename='results.json'):
 
93
  with open(filename, 'w', encoding='utf-8') as f:
94
  json.dump(results, f, ensure_ascii=False, indent=2)
95
 
 
1
+ # Импорт необходимых библиотек
2
  import gradio as gr
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
4
  import json
 
6
  from sentence_transformers import SentenceTransformer
7
  from sklearn.metrics.pairwise import cosine_similarity
8
  import numpy as np
9
+ import stanza
10
+
11
+ # Инициализация пайплайна Stanza для русского языка
12
+ nlp = stanza.Pipeline('ru', processors='tokenize,pos,lemma,depparse', use_gpu=False)
13
 
14
  def analyze_text():
15
  try:
16
+ # Читаем исходный текст из файла 'test.txt'
17
  file_path = os.path.join(os.getcwd(), "test.txt")
18
  if not os.path.exists(file_path):
19
  return "Ошибка: файл 'test.txt' не найден."
 
26
  # 2. Разбиение на блоки
27
  blocks = split_into_blocks(cleaned_text)
28
 
29
+ # 3. Семантический анализ (векторизация блоков)
30
  semantic_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
31
  embeddings = semantic_model.encode(blocks)
32
 
33
+ # 4. Эмоциональный анализ блоков с использованием модели с 27 эмоциями
34
  model_name = "seara/rubert-tiny2-russian-emotion-detection-ru-go-emotions"
35
  tokenizer = AutoTokenizer.from_pretrained(model_name)
36
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
37
  emotion_classifier = pipeline("text-classification", tokenizer=tokenizer, model=model, return_all_scores=True)
38
+ emotion_results = analyze_emotions(blocks, emotion_classifier, threshold=0.5)
39
+
40
+ # 4.1. Эмоциональный анализ всего текста
41
+ total_result = analyze_total_text(blocks, emotion_classifier)
42
 
43
+ # 5. Перекрестный анализ (поиск схожих блоков)
44
  results = cross_analysis(blocks, embeddings, emotion_results)
45
 
46
+ # 6. Тематическое моделирование текста (LDA)
47
+ topics = topic_modeling(blocks, num_topics=3)
48
+
49
+ # 7. Лексический и синтаксический анализ блоков с использованием Stanza
50
+ for result in results:
51
+ analysis = lexical_syntactic_analysis(result['text'])
52
+ result['lexical_analysis'] = analysis
53
+
54
+ # 8. Визуализация эмоций (опционально)
55
+ # plot_emotions(results)
56
+
57
+ # Собираем все результаты в один словарь
58
+ output = {
59
+ 'block_analysis': results,
60
+ 'total_emotion': total_result['dominant_emotion'],
61
+ 'total_emotions': total_result['emotions'],
62
+ 'topics': topics
63
+ }
64
+
65
+ # 9. Сохранение результатов в файл 'results.json'
66
+ save_results(output)
67
 
68
+ # Возвращаем результаты в интерфейс Gradio
69
+ formatted_results = json.dumps(output, ensure_ascii=False, indent=2)
70
  return formatted_results
71
 
72
  except Exception as e:
73
  return f"Произошла ошибка: {str(e)}"
74
 
75
  def preprocess_text(text):
76
+ # Предобработка текста
77
  lines = []
78
  for line in text.split('\n'):
79
  line = line.strip()
 
80
  if line.startswith('[') and line.endswith(']'):
81
+ lines.append('')
82
  elif line and 'Название:' not in line and 'URL:' not in line and '===' not in line:
83
  lines.append(line)
84
  cleaned_text = '\n'.join(lines)
85
  return cleaned_text
86
 
87
  def split_into_blocks(text):
88
+ # Разбиваем текст на блоки
89
+ blocks = text.strip().split('\n\n')
90
  return blocks
91
 
92
+ def analyze_emotions(blocks, emotion_classifier, threshold=0.5):
93
+ # Эмоциональный анализ блоков
94
+ emotion_translation = {
95
+ "admiration": "восхищение",
96
+ "amusement": "развлечение",
97
+ "anger": "злость",
98
+ "annoyance": "раздражение",
99
+ "approval": "одобрение",
100
+ "caring": "забота",
101
+ "confusion": "замешательство",
102
+ "curiosity": "любопытство",
103
+ "desire": "желание",
104
+ "disappointment": "разочарование",
105
+ "disapproval": "неодобрение",
106
+ "disgust": "отвращение",
107
+ "embarrassment": "смущение",
108
+ "excitement": "волнение",
109
+ "fear": "страх",
110
+ "gratitude": "благодарность",
111
+ "grief": "горе",
112
+ "joy": "радость",
113
+ "love": "любовь",
114
+ "nervousness": "нервозность",
115
+ "optimism": "оптимизм",
116
+ "pride": "гордость",
117
+ "realization": "осознание",
118
+ "relief": "облегчение",
119
+ "remorse": "раскаяние",
120
+ "sadness": "печаль",
121
+ "surprise": "удивление",
122
+ "neutral": "нейтрально"
123
+ }
124
+
125
  results = []
126
  for block in blocks:
127
  predictions = emotion_classifier(block, truncation=True)[0]
128
  emotions = {}
129
  for pred in predictions:
130
  emotion_label = pred['label']
131
+ emotion_label_ru = emotion_translation.get(emotion_label, emotion_label)
132
+ emotions[emotion_label_ru] = round(float(pred['score']), 3)
133
  dominant_emotion = max(emotions, key=emotions.get)
134
  dominant_score = emotions[dominant_emotion]
135
+ if dominant_score >= threshold:
136
+ result_emotion = dominant_emotion
137
+ else:
138
+ result_emotion = "Смешанные эмоции"
139
  results.append({
140
  'text': block,
141
+ 'dominant_emotion': result_emotion,
142
  'score': dominant_score,
143
  'emotions': emotions
144
  })
145
  return results
146
 
147
+ def analyze_total_text(blocks, emotion_classifier):
148
+ # Эмоциональный анализ всего текста
149
+ total_text = ' '.join(blocks)
150
+ total_result = analyze_emotions([total_text], emotion_classifier)[0]
151
+ return total_result
152
+
153
  def cross_analysis(blocks, embeddings, emotion_results):
154
+ # Перекрестный анализ
155
  similarity_matrix = cosine_similarity(embeddings)
156
  for i, result in enumerate(emotion_results):
157
  similarities = similarity_matrix[i]
 
160
  result['similarity_scores'] = [round(float(similarities[j]), 3) for j in similar_indices]
161
  return emotion_results
162
 
163
+ def topic_modeling(blocks, num_topics=3):
164
+ # Тематическое моделирование
165
+ from gensim import corpora, models
166
+ from nltk.tokenize import word_tokenize
167
+
168
+ texts = [word_tokenize(block.lower()) for block in blocks]
169
+ dictionary = corpora.Dictionary(texts)
170
+ corpus = [dictionary.doc2bow(text) for text in texts]
171
+ lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
172
+ topics = lda_model.print_topics()
173
+ topics_list = []
174
+ for idx, topic in topics:
175
+ topics_list.append(f"Тема {idx+1}: {topic}")
176
+ return topics_list
177
+
178
+ def lexical_syntactic_analysis(block):
179
+ # Лексический и синтаксический анализ с использованием Stanza
180
+ doc = nlp(block)
181
+ tokens = []
182
+ for sentence in doc.sentences:
183
+ for word in sentence.words:
184
+ tokens.append({
185
+ 'text': word.text,
186
+ 'lemma': word.lemma,
187
+ 'upos': word.upos,
188
+ 'xpos': word.xpos,
189
+ 'feats': word.feats,
190
+ 'head': word.head,
191
+ 'deprel': word.deprel
192
+ })
193
+ return tokens
194
+
195
  def save_results(results, filename='results.json'):
196
+ # Сохранение результатов
197
  with open(filename, 'w', encoding='utf-8') as f:
198
  json.dump(results, f, ensure_ascii=False, indent=2)
199