mechtnet commited on
Commit
1c5484e
·
verified ·
1 Parent(s): ef0f719

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +280 -190
app.py CHANGED
@@ -1,216 +1,307 @@
1
  import gradio as gr
2
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 
3
  import os
4
  import json
5
- import torch
6
-
7
- def analyze_text(text, original_file_name):
8
- try:
9
- # 1. Предобработка текста
10
- cleaned_text = preprocess_text(text)
11
-
12
- # 2. Разбивка на строки
13
- lines = split_into_lines(cleaned_text)
14
 
15
- # 3. Эмоциональный анализ строк
16
- emotions = analyze_emotions(lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- # 4. Группировка строк по эмоциям
19
- quotes_by_emotion = group_lines_by_emotion(emotions)
 
 
 
 
 
 
 
20
 
21
- # 5. Формирование итогового результата
22
- output = {
23
- 'quotes_by_mood': quotes_by_emotion
24
- }
 
 
 
 
25
 
26
- # Сохранение результата
27
- output_file_name = f"{os.path.splitext(original_file_name)[0]}_analysis.json"
28
- output_file_path = os.path.join(os.getcwd(), output_file_name)
29
- with open(output_file_path, 'w', encoding='utf-8') as f:
30
- json.dump(output, f, ensure_ascii=False, indent=2)
 
31
 
32
- return output_file_path
 
 
 
 
 
 
 
 
 
 
 
33
 
34
- except Exception as e:
35
- print(f"Произошла ошибка: {str(e)}")
36
- return None
37
 
38
- def preprocess_text(text):
39
- lines = []
40
- for line in text.split('\n'):
41
- line = line.strip()
42
- # Пропускаем строки, начинающиеся с 'Название:' или 'URL:'
43
- if line.startswith('Название:') or line.startswith('URL:'):
44
- continue
45
- # Пропускаем строки, состоящие только из разделителей
46
- elif all(char in '=*-_' for char in line) and len(line) > 0:
47
- continue
48
- # Пропускаем строки в квадратных скобках
49
- elif line.startswith('[') and line.endswith(']'):
50
- continue
51
- elif line:
52
- lines.append(line)
53
- cleaned_text = '\n'.join(lines)
54
- return cleaned_text
55
-
56
- def split_into_lines(text):
57
- lines = [line.strip() for line in text.split('\n') if line.strip()]
58
- return lines
59
-
60
- def analyze_emotions(lines):
61
- model_name = "cointegrated/rubert-tiny2-cedr-emotion-detection"
62
- tokenizer = AutoTokenizer.from_pretrained(model_name)
63
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
64
- device = "cuda" if torch.cuda.is_available() else "cpu"
65
- model = model.to(device)
66
-
67
- # Получаем список меток эмоций из конфигурации модели
68
- id2label = model.config.id2label
69
-
70
- emotion_translation = {
71
- 'disappointment': 'разочарование',
72
- 'sadness': 'грусть',
73
- 'neutral': 'нейтральность',
74
- 'joy': 'радость',
75
- 'surprise': 'удивление',
76
- 'fear': 'страх',
77
- 'anger': 'злость'
78
- }
79
 
80
- emotions = []
81
- # Анализируем одиночные строки и пары
82
- for i in range(len(lines)):
83
- line = lines[i]
84
- if len(line.strip()) == 0:
85
- continue
 
 
86
 
87
- # Анализируем текущую строку и получаем все оценки
88
- inputs = tokenizer(line, return_tensors="pt", truncation=True, max_length=512).to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  with torch.no_grad():
90
  outputs = model(**inputs)
91
  scores = torch.nn.functional.softmax(outputs.logits, dim=1)[0]
92
- predictions = []
93
- for idx, score in enumerate(scores):
94
- label = id2label[idx]
95
- predictions.append({
96
- 'label': label,
97
- 'score': float(score)
98
- })
99
 
100
- # Сортируем предсказания по убыванию score
101
- predictions = sorted(predictions, key=lambda x: x['score'], reverse=True)
102
-
103
- # Получаем основную эмоцию (с максимальным score)
104
- main_emotion = predictions[0]
105
- emotion_label = main_emotion['label']
106
- main_score = main_emotion['score']
107
-
108
- # Если есть следующая строка, анализируем пару
109
- if i < len(lines) - 1:
110
- pair = line + " " + lines[i + 1]
111
- inputs_pair = tokenizer(pair, return_tensors="pt", truncation=True, max_length=512).to(device)
112
- with torch.no_grad():
113
- outputs_pair = model(**inputs_pair)
114
- scores_pair = torch.nn.functional.softmax(outputs_pair.logits, dim=1)[0]
115
- predictions_pair = []
116
- for idx, score in enumerate(scores_pair):
117
- label = id2label[idx]
118
- predictions_pair.append({
119
- 'label': label,
120
- 'score': float(score)
121
- })
122
- predictions_pair = sorted(predictions_pair, key=lambda x: x['score'], reverse=True)
123
- pair_emotion = predictions_pair[0]
124
-
125
- # Если эмоция пары сильнее, используем её
126
- if pair_emotion['score'] > main_score:
127
- emotion_label = pair_emotion['label']
128
- main_score = pair_emotion['score']
129
- predictions = predictions_pair
130
-
131
- # Переводим метку эмоции на русский
132
- emotion_label_ru = emotion_translation.get(emotion_label, emotion_label)
133
-
134
- # Формируем список всех эмоций с их оценками для отладки
135
- all_emotions = [{
136
- 'emotion': emotion_translation.get(pred['label'], pred['label']),
137
- 'score': round(float(pred['score']), 3)
138
- } for pred in predictions]
139
-
140
- emotions.append({
141
- 'line': line,
142
- 'emotion': emotion_label_ru,
143
- 'score': round(float(main_score), 3),
144
- 'all_emotions': all_emotions
145
- })
146
-
147
- return emotions
148
-
149
- def group_lines_by_emotion(emotions, threshold=0.3, top_n=3):
150
- mood_quotes = {}
151
- i = 0
152
- negative_emotions = ['разочарование', 'грусть', 'страх', 'злость']
153
-
154
- while i < len(emotions):
155
- item = emotions[i]
156
- emotion = item['emotion']
157
- score = item['score']
158
- line = item['line']
159
-
160
- # Если есть следующая строка
161
- if i < len(emotions) - 1:
162
- next_item = emotions[i + 1]
163
- combined_line = f"{line}\n{next_item['line']}"
164
 
165
- # Проверяем связь между строками
166
- current_is_negative = emotion in negative_emotions
167
- next_is_negative = next_item['emotion'] in negative_emotions
168
- emotions_match = (
169
- (current_is_negative and next_is_negative) or
170
- emotion == next_item['emotion'] or
171
- any(e['score'] > threshold for e in item['all_emotions'] if e['emotion'] in negative_emotions)
172
- )
173
 
174
- if emotions_match:
175
- combined_score = max(score, next_item['score'])
176
- if emotion not in mood_quotes:
177
- mood_quotes[emotion] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
- if combined_score >= threshold:
180
- mood_quotes[emotion].append({
181
- 'quote': combined_line,
182
- 'score': combined_score,
183
- 'emotions': item['all_emotions']
184
- })
185
- i += 2 # Пропускаем следующую строку
186
- continue
187
-
188
- # Если нет пары или не удалось объединить
189
- if score >= threshold:
190
- if emotion not in mood_quotes:
191
- mood_quotes[emotion] = []
192
- mood_quotes[emotion].append({
193
- 'quote': line,
194
- 'score': score,
195
- 'emotions': item['all_emotions']
196
  })
197
- i += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
- # Отбираем топовые цитаты для каждой эмоции
200
- for emotion in mood_quotes:
201
- mood_quotes[emotion] = sorted(mood_quotes[emotion], key=lambda x: x['score'], reverse=True)[:top_n]
202
-
203
- return mood_quotes
 
 
 
 
 
 
204
 
205
  def analyze_files(file_paths):
206
  result_files = []
207
  for file_path in file_paths:
208
  file_name = os.path.basename(file_path)
209
- with open(file_path, 'r', encoding='utf-8') as f:
210
- text = f.read()
211
- output_file_path = analyze_text(text, file_name)
212
- if output_file_path:
213
- result_files.append(output_file_path)
 
 
 
214
  return result_files
215
 
216
  # Создание интерфейса Gradio
@@ -218,10 +309,9 @@ demo = gr.Interface(
218
  fn=analyze_files,
219
  inputs=gr.File(label="Загрузите .txt файлы", file_count="multiple", type="filepath"),
220
  outputs=gr.File(label="Скачайте результаты", file_count="multiple"),
221
- title="Анализ текста по настроению",
222
- description="Загрузите .txt файлы для анализа и скачайте результаты."
223
  )
224
 
225
- # Запуск приложения
226
  if __name__ == "__main__":
227
  demo.launch()
 
1
  import gradio as gr
2
+ from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
3
+ import torch
4
  import os
5
  import json
6
+ import numpy as np
7
+ from collections import defaultdict
8
+ from sklearn.cluster import DBSCAN
 
 
 
 
 
 
9
 
10
# Configuration
# Model registry: each entry names a Hugging Face checkpoint and how it is
# used. The 'task' field controls instantiation in TextAnalyzer._load_models:
# "feature-extraction" -> AutoModel (plain encoder, used for embeddings),
# anything else -> AutoModelForSequenceClassification.
MODELS = {
    # Base emotion detection (CEDR emotion labels)
    'emotion': {
        'name': "cointegrated/rubert-tiny2-cedr-emotion-detection",
        'task': "emotion"
    },
    # Sentiment / overall mood analysis
    'sentiment': {
        'name': "seara/rubert-base-cased-russian-sentiment",
        'task': "sentiment"
    },
    # Context / topic understanding: consumed only via embeddings, so it must
    # load as a plain encoder. Fix: the previous value "sequence-classification"
    # made _load_models attach an untrained classification head, and
    # _get_context_embedding then fell back to its randomly initialized logits.
    'context': {
        'name': "DeepPavlov/rubert-base-cased",
        'task': "feature-extraction"
    }
}

# English emotion label -> Russian display name
EMOTION_TRANSLATION = {
    'disappointment': 'разочарование',
    'sadness': 'грусть',
    'neutral': 'нейтральность',
    'joy': 'радость',
    'surprise': 'удивление',
    'fear': 'страх',
    'anger': 'злость'
}

# Tunable thresholds for block splitting, scoring and clustering
ANALYSIS_PARAMS = {
    'min_block_lines': 2,        # minimum lines for a block to be kept
    'metaphor_threshold': 0.5,   # NOTE(review): currently unused — confirm before removing
    'emotion_threshold': 0.3,    # minimum score to keep a block for an emotion
    'clustering_eps': 0.5,       # DBSCAN neighborhood radius
    'clustering_min_samples': 2, # DBSCAN minimum cluster size
    'top_n_quotes': 3            # quotes kept per emotion
}
47
 
48
class TextAnalyzer:
    """Bundles the three pretrained models (emotion, sentiment, context)
    and exposes per-block analysis over all of them."""

    def __init__(self):
        self.models = {}       # model_type -> loaded model
        self.tokenizers = {}   # model_type -> matching tokenizer
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self._load_models()

    def _load_models(self):
        """Load tokenizer + model for every entry in MODELS.

        Entries whose 'task' is "feature-extraction" are loaded as plain
        encoders (AutoModel); everything else gets a classification head.
        Loading is best-effort: a failure is printed and the entry is left
        missing, so later lookups in self.models raise KeyError.
        """
        for model_type, config in MODELS.items():
            try:
                print(f"Loading {model_type} model...")
                self.tokenizers[model_type] = AutoTokenizer.from_pretrained(config['name'])
                if config['task'] == "feature-extraction":
                    self.models[model_type] = AutoModel.from_pretrained(config['name']).to(self.device)
                else:
                    self.models[model_type] = AutoModelForSequenceClassification.from_pretrained(config['name']).to(self.device)
                print(f"{model_type} model loaded successfully")
            except Exception as e:
                print(f"Error loading {model_type} model: {str(e)}")

    def analyze_text_block(self, text_block):
        """Analyze one block of text.

        Returns None for blank input, otherwise a dict with the raw text,
        sorted emotion scores, a scalar sentiment and the context embedding
        (numpy array; mean-pooled hidden state — presumably (1, hidden_size)).
        """
        if not text_block.strip():
            return None

        # Emotion analysis
        emotions = self._analyze_emotions(text_block)

        # Sentiment analysis
        sentiment = self._analyze_sentiment(text_block)

        # Context embedding
        context_embedding = self._get_context_embedding(text_block)

        return {
            'text': text_block,
            'emotions': emotions,
            'sentiment': sentiment,
            'context_embedding': context_embedding
        }

    def _analyze_emotions(self, text):
        """Score *text* against every emotion label.

        Returns a list of {'label', 'score'} dicts sorted by descending
        score, with labels translated to Russian via EMOTION_TRANSLATION.
        """
        model = self.models['emotion']
        tokenizer = self.tokenizers['emotion']
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)

        with torch.no_grad():
            outputs = model(**inputs)
            scores = torch.nn.functional.softmax(outputs.logits, dim=1)[0]

        emotions = []
        for idx, score in enumerate(scores):
            label = model.config.id2label[idx]
            emotion_label = EMOTION_TRANSLATION.get(label, label)
            emotions.append({
                'label': emotion_label,
                'score': float(score)
            })
        return sorted(emotions, key=lambda x: x['score'], reverse=True)

    def _analyze_sentiment(self, text):
        """Return a scalar sentiment: P(positive) - P(negative).

        Fix: label indices are resolved through the model's id2label mapping
        instead of assuming index 1 is positive and index 0 is negative —
        a fixed-index assumption silently computes the wrong difference when
        the checkpoint orders its labels differently (e.g. with a neutral
        class first). The old fixed-index arithmetic is kept only as a
        fallback for models without recognizable label names.
        """
        model = self.models['sentiment']
        tokenizer = self.tokenizers['sentiment']
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)

        with torch.no_grad():
            outputs = model(**inputs)
            scores = torch.nn.functional.softmax(outputs.logits, dim=1)[0]

        label_to_idx = {label.lower(): idx for idx, label in model.config.id2label.items()}
        if 'positive' in label_to_idx and 'negative' in label_to_idx:
            return float(scores[label_to_idx['positive']]) - float(scores[label_to_idx['negative']])
        # Fallback: original fixed-index heuristic
        return float(scores[1]) - float(scores[0])

    def _get_context_embedding(self, text):
        """Return a numpy embedding for *text*.

        Prefers the mean-pooled last hidden state (available when the
        context model is a plain encoder); falls back to raw logits when
        only a classification head is exposed.
        """
        model = self.models['context']
        tokenizer = self.tokenizers['context']
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)

        with torch.no_grad():
            outputs = model(**inputs)
            if hasattr(outputs, 'last_hidden_state'):
                # Mean-pool token embeddings into one vector per block
                embedding = outputs.last_hidden_state.mean(dim=1)
            else:
                embedding = outputs.logits

        return embedding.cpu().numpy()
131
+
132
class TextBlockAnalyzer:
    """Splits a text into multi-line blocks, analyzes each block with
    TextAnalyzer and aggregates emotions, sentiment and embedding clusters."""

    def __init__(self):
        self.analyzer = TextAnalyzer()
        # cluster label -> list of block analyses (filled by _cluster_blocks)
        self.emotion_clusters = defaultdict(list)

    def analyze_text(self, text):
        """Full pipeline: split into blocks, analyze each, cluster, format."""
        blocks = self._split_into_blocks(text)

        block_analyses = []
        for block in blocks:
            analysis = self.analyzer.analyze_text_block(block)
            if analysis:
                block_analyses.append(analysis)

        # Group blocks by embedding proximity
        self._cluster_blocks(block_analyses)

        return self._format_results(block_analyses)

    def _split_into_blocks(self, text):
        """Split *text* on blank lines and '['-prefixed annotation lines.

        Only blocks with at least ANALYSIS_PARAMS['min_block_lines'] lines
        are kept; shorter fragments are discarded.
        """
        blocks = []
        current_block = []

        for line in text.split('\n'):
            line = line.strip()
            if not line or line.startswith('['):
                if len(current_block) >= ANALYSIS_PARAMS['min_block_lines']:
                    blocks.append('\n'.join(current_block))
                current_block = []
                continue
            current_block.append(line)

        # Flush the trailing block
        if len(current_block) >= ANALYSIS_PARAMS['min_block_lines']:
            blocks.append('\n'.join(current_block))

        return blocks

    def _cluster_blocks(self, block_analyses):
        """DBSCAN-cluster blocks by context embedding into
        self.emotion_clusters (DBSCAN noise label is -1).

        Fix: the cluster map is reset on every call — it is instance state,
        and without the reset a reused analyzer accumulated blocks from
        previous texts under colliding cluster labels.
        """
        self.emotion_clusters = defaultdict(list)
        if not block_analyses:
            return

        # One embedding row per block
        embeddings = np.array([analysis['context_embedding'][0] for analysis in block_analyses])

        clustering = DBSCAN(
            eps=ANALYSIS_PARAMS['clustering_eps'],
            min_samples=ANALYSIS_PARAMS['clustering_min_samples']
        ).fit(embeddings)

        for idx, label in enumerate(clustering.labels_):
            self.emotion_clusters[label].append(block_analyses[idx])

    def _format_results(self, block_analyses):
        """Assemble the final result dict: top quotes per emotion, overall
        mood statistics, per-block progression and cluster summaries."""
        emotional_blocks = defaultdict(list)

        # Bucket each block under its strongest emotion, above the threshold
        for block in block_analyses:
            primary_emotion = max(block['emotions'], key=lambda x: x['score'])
            if primary_emotion['score'] >= ANALYSIS_PARAMS['emotion_threshold']:
                emotional_blocks[primary_emotion['label']].append({
                    'text': block['text'],
                    'score': primary_emotion['score'],
                    'sentiment': block['sentiment']
                })

        # Keep only the top-N quotes per emotion
        for emotion in emotional_blocks:
            emotional_blocks[emotion] = sorted(
                emotional_blocks[emotion],
                key=lambda x: x['score'],
                reverse=True
            )[:ANALYSIS_PARAMS['top_n_quotes']]

        return {
            # Plain dict so the result never carries defaultdict semantics
            'emotional_blocks': dict(emotional_blocks),
            'general_mood': self._determine_general_mood(block_analyses),
            'emotional_progression': self._analyze_emotional_progression(block_analyses),
            'clusters': self._format_clusters()
        }

    def _determine_general_mood(self, block_analyses):
        """Mean and variance of the per-block sentiment scores."""
        if not block_analyses:
            return {'average_sentiment': 0, 'sentiment_variance': 0}

        sentiments = [block['sentiment'] for block in block_analyses]
        return {
            'average_sentiment': float(np.mean(sentiments)),
            'sentiment_variance': float(np.var(sentiments))
        }

    def _analyze_emotional_progression(self, block_analyses):
        """Per-block dominant emotion and intensity, in document order."""
        progression = []
        for block in block_analyses:
            primary_emotion = max(block['emotions'], key=lambda x: x['score'])
            progression.append({
                'text': block['text'],
                'emotion': primary_emotion['label'],
                'intensity': float(primary_emotion['score'])
            })
        return progression

    def _format_clusters(self):
        """Summarize each DBSCAN cluster (noise label -1 is skipped) by its
        dominant emotion, average score and member texts."""
        clusters_info = []
        for label, blocks in self.emotion_clusters.items():
            if label == -1:  # skip outliers
                continue

            # Sum primary-emotion scores per emotion across the cluster
            cluster_emotions = defaultdict(float)
            for block in blocks:
                primary_emotion = max(block['emotions'], key=lambda x: x['score'])
                cluster_emotions[primary_emotion['label']] += primary_emotion['score']

            # Dominant emotion of the cluster
            dominant_emotion = max(cluster_emotions.items(), key=lambda x: x[1])

            clusters_info.append({
                'cluster_id': int(label),
                'dominant_emotion': dominant_emotion[0],
                'emotion_score': float(dominant_emotion[1] / len(blocks)),
                'blocks': [block['text'] for block in blocks]
            })

        return clusters_info
261
+
262
def analyze_text_file(text, original_file_name):
    """Run the full analysis pipeline on *text* and dump the result to JSON.

    The output file is written to the current working directory as
    "<stem>_analysis.json". Returns its path, or None on any failure
    (the error is printed, not raised, so batch processing continues).
    """
    try:
        # Fresh analyzer per call; analyze and assemble the report
        analysis_results = TextBlockAnalyzer().analyze_text(text)

        output = {
            'file_name': original_file_name,
            'analysis': {
                'emotional_blocks': analysis_results['emotional_blocks'],
                'general_mood': analysis_results['general_mood'],
                'emotional_progression': analysis_results['emotional_progression'],
                'emotional_clusters': analysis_results['clusters']
            }
        }

        # Persist the report next to the working directory
        stem = os.path.splitext(original_file_name)[0]
        output_file_path = os.path.join(os.getcwd(), f"{stem}_analysis.json")
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(output, f, ensure_ascii=False, indent=2)

        return output_file_path

    except Exception as e:
        print(f"Произошла ошибка: {str(e)}")
        return None
292
 
293
def analyze_files(file_paths):
    """Analyze every uploaded text file and return the result-file paths.

    Files that fail to read or analyze are reported to stdout and skipped;
    the remaining results are still returned.
    """
    result_files = []
    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                contents = f.read()
            result_path = analyze_text_file(contents, file_name)
            if result_path:
                result_files.append(result_path)
        except Exception as e:
            print(f"Ошибка при обработке файла {file_name}: {str(e)}")
    return result_files
306
 
307
  # Создание интерфейса Gradio
 
309
  fn=analyze_files,
310
  inputs=gr.File(label="Загрузите .txt файлы", file_count="multiple", type="filepath"),
311
  outputs=gr.File(label="Скачайте результаты", file_count="multiple"),
312
+ title="Расширенный анализ текста по настроению",
313
+ description="Загрузите .txt файлы для многоуровневого анализа эмоций и настроения."
314
  )
315
 
 
316
  if __name__ == "__main__":
317
  demo.launch()