mechtnet commited on
Commit
1c5484e
·
verified ·
1 Parent(s): ef0f719

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +280 -190
app.py CHANGED
@@ -1,216 +1,307 @@
1
  import gradio as gr
2
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 
3
  import os
4
  import json
5
- import torch
6
-
7
- def analyze_text(text, original_file_name):
8
- try:
9
- # 1. Предобработка текста
10
- cleaned_text = preprocess_text(text)
11
-
12
- # 2. Разбивка на строки
13
- lines = split_into_lines(cleaned_text)
14
 
15
- # 3. Эмоциональный анализ строк
16
- emotions = analyze_emotions(lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- # 4. Группировка строк по эмоциям
19
- quotes_by_emotion = group_lines_by_emotion(emotions)
 
 
 
 
 
 
 
20
 
21
- # 5. Формирование итогового результата
22
- output = {
23
- 'quotes_by_mood': quotes_by_emotion
24
- }
 
 
 
 
25
 
26
- # Сохранение результата
27
- output_file_name = f"{os.path.splitext(original_file_name)[0]}_analysis.json"
28
- output_file_path = os.path.join(os.getcwd(), output_file_name)
29
- with open(output_file_path, 'w', encoding='utf-8') as f:
30
- json.dump(output, f, ensure_ascii=False, indent=2)
 
31
 
32
- return output_file_path
 
 
 
 
 
 
 
 
 
 
 
33
 
34
- except Exception as e:
35
- print(f"Произошла ошибка: {str(e)}")
36
- return None
37
 
38
- def preprocess_text(text):
39
- lines = []
40
- for line in text.split('\n'):
41
- line = line.strip()
42
- # Пропускаем строки, начинающиеся с 'Название:' или 'URL:'
43
- if line.startswith('Название:') or line.startswith('URL:'):
44
- continue
45
- # Пропускаем строки, состоящие только из разделителей
46
- elif all(char in '=*-_' for char in line) and len(line) > 0:
47
- continue
48
- # Пропускаем строки в квадратных скобках
49
- elif line.startswith('[') and line.endswith(']'):
50
- continue
51
- elif line:
52
- lines.append(line)
53
- cleaned_text = '\n'.join(lines)
54
- return cleaned_text
55
-
56
- def split_into_lines(text):
57
- lines = [line.strip() for line in text.split('\n') if line.strip()]
58
- return lines
59
-
60
- def analyze_emotions(lines):
61
- model_name = "cointegrated/rubert-tiny2-cedr-emotion-detection"
62
- tokenizer = AutoTokenizer.from_pretrained(model_name)
63
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
64
- device = "cuda" if torch.cuda.is_available() else "cpu"
65
- model = model.to(device)
66
-
67
- # Получаем список меток эмоций из конфигурации модели
68
- id2label = model.config.id2label
69
-
70
- emotion_translation = {
71
- 'disappointment': 'разочарование',
72
- 'sadness': 'грусть',
73
- 'neutral': 'нейтральность',
74
- 'joy': 'радость',
75
- 'surprise': 'удивление',
76
- 'fear': 'страх',
77
- 'anger': 'злость'
78
- }
79
 
80
- emotions = []
81
- # Анализируем одиночные строки и пары
82
- for i in range(len(lines)):
83
- line = lines[i]
84
- if len(line.strip()) == 0:
85
- continue
 
 
86
 
87
- # Анализируем текущую строку и получаем все оценки
88
- inputs = tokenizer(line, return_tensors="pt", truncation=True, max_length=512).to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  with torch.no_grad():
90
  outputs = model(**inputs)
91
  scores = torch.nn.functional.softmax(outputs.logits, dim=1)[0]
92
- predictions = []
93
- for idx, score in enumerate(scores):
94
- label = id2label[idx]
95
- predictions.append({
96
- 'label': label,
97
- 'score': float(score)
98
- })
99
 
100
- # Сортируем предсказания по убыванию score
101
- predictions = sorted(predictions, key=lambda x: x['score'], reverse=True)
102
-
103
- # Получаем основную эмоцию (с максимальным score)
104
- main_emotion = predictions[0]
105
- emotion_label = main_emotion['label']
106
- main_score = main_emotion['score']
107
-
108
- # Если есть следующая строка, анализируем пару
109
- if i < len(lines) - 1:
110
- pair = line + " " + lines[i + 1]
111
- inputs_pair = tokenizer(pair, return_tensors="pt", truncation=True, max_length=512).to(device)
112
- with torch.no_grad():
113
- outputs_pair = model(**inputs_pair)
114
- scores_pair = torch.nn.functional.softmax(outputs_pair.logits, dim=1)[0]
115
- predictions_pair = []
116
- for idx, score in enumerate(scores_pair):
117
- label = id2label[idx]
118
- predictions_pair.append({
119
- 'label': label,
120
- 'score': float(score)
121
- })
122
- predictions_pair = sorted(predictions_pair, key=lambda x: x['score'], reverse=True)
123
- pair_emotion = predictions_pair[0]
124
-
125
- # Если эмоция пары сильнее, используем её
126
- if pair_emotion['score'] > main_score:
127
- emotion_label = pair_emotion['label']
128
- main_score = pair_emotion['score']
129
- predictions = predictions_pair
130
-
131
- # Переводим метку эмоции на русский
132
- emotion_label_ru = emotion_translation.get(emotion_label, emotion_label)
133
-
134
- # Формируем список всех эмоций с их оценками для отладки
135
- all_emotions = [{
136
- 'emotion': emotion_translation.get(pred['label'], pred['label']),
137
- 'score': round(float(pred['score']), 3)
138
- } for pred in predictions]
139
-
140
- emotions.append({
141
- 'line': line,
142
- 'emotion': emotion_label_ru,
143
- 'score': round(float(main_score), 3),
144
- 'all_emotions': all_emotions
145
- })
146
-
147
- return emotions
148
-
149
- def group_lines_by_emotion(emotions, threshold=0.3, top_n=3):
150
- mood_quotes = {}
151
- i = 0
152
- negative_emotions = ['разочарование', 'грусть', 'страх', 'злость']
153
-
154
- while i < len(emotions):
155
- item = emotions[i]
156
- emotion = item['emotion']
157
- score = item['score']
158
- line = item['line']
159
-
160
- # Если есть следующая строка
161
- if i < len(emotions) - 1:
162
- next_item = emotions[i + 1]
163
- combined_line = f"{line}\n{next_item['line']}"
164
 
165
- # Проверяем связь между строками
166
- current_is_negative = emotion in negative_emotions
167
- next_is_negative = next_item['emotion'] in negative_emotions
168
- emotions_match = (
169
- (current_is_negative and next_is_negative) or
170
- emotion == next_item['emotion'] or
171
- any(e['score'] > threshold for e in item['all_emotions'] if e['emotion'] in negative_emotions)
172
- )
173
 
174
- if emotions_match:
175
- combined_score = max(score, next_item['score'])
176
- if emotion not in mood_quotes:
177
- mood_quotes[emotion] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
- if combined_score >= threshold:
180
- mood_quotes[emotion].append({
181
- 'quote': combined_line,
182
- 'score': combined_score,
183
- 'emotions': item['all_emotions']
184
- })
185
- i += 2 # Пропускаем следующую строку
186
- continue
187
-
188
- # Если нет пары или не удалось объединить
189
- if score >= threshold:
190
- if emotion not in mood_quotes:
191
- mood_quotes[emotion] = []
192
- mood_quotes[emotion].append({
193
- 'quote': line,
194
- 'score': score,
195
- 'emotions': item['all_emotions']
196
  })
197
- i += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
- # Отбираем топовые цитаты для каждой эмоции
200
- for emotion in mood_quotes:
201
- mood_quotes[emotion] = sorted(mood_quotes[emotion], key=lambda x: x['score'], reverse=True)[:top_n]
202
-
203
- return mood_quotes
 
 
 
 
 
 
204
 
205
  def analyze_files(file_paths):
206
  result_files = []
207
  for file_path in file_paths:
208
  file_name = os.path.basename(file_path)
209
- with open(file_path, 'r', encoding='utf-8') as f:
210
- text = f.read()
211
- output_file_path = analyze_text(text, file_name)
212
- if output_file_path:
213
- result_files.append(output_file_path)
 
 
 
214
  return result_files
215
 
216
  # Создание интерфейса Gradio
@@ -218,10 +309,9 @@ demo = gr.Interface(
218
  fn=analyze_files,
219
  inputs=gr.File(label="Загрузите .txt файлы", file_count="multiple", type="filepath"),
220
  outputs=gr.File(label="Скачайте результаты", file_count="multiple"),
221
- title="Анализ текста по настроению",
222
- description="Загрузите .txt файлы для анализа и скачайте результаты."
223
  )
224
 
225
- # Запуск приложения
226
  if __name__ == "__main__":
227
  demo.launch()
 
1
  import gradio as gr
2
+ from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
3
+ import torch
4
  import os
5
  import json
6
+ import numpy as np
7
+ from collections import defaultdict
8
+ from sklearn.cluster import DBSCAN
 
 
 
 
 
 
9
 
10
# Configuration
# Model registry: each entry names a Hugging Face checkpoint and how it is
# used. The 'task' field controls instantiation in TextAnalyzer._load_models:
# "feature-extraction" -> AutoModel (plain encoder, used for embeddings),
# anything else -> AutoModelForSequenceClassification.
MODELS = {
    # Base emotion detection (CEDR emotion labels)
    'emotion': {
        'name': "cointegrated/rubert-tiny2-cedr-emotion-detection",
        'task': "emotion"
    },
    # Sentiment / overall mood analysis
    'sentiment': {
        'name': "seara/rubert-base-cased-russian-sentiment",
        'task': "sentiment"
    },
    # Context / topic understanding: consumed only via embeddings, so it must
    # load as a plain encoder. Fix: the previous value "sequence-classification"
    # made _load_models attach an untrained classification head, and
    # _get_context_embedding then fell back to its randomly initialized logits.
    'context': {
        'name': "DeepPavlov/rubert-base-cased",
        'task': "feature-extraction"
    }
}

# English emotion label -> Russian display name
EMOTION_TRANSLATION = {
    'disappointment': 'разочарование',
    'sadness': 'грусть',
    'neutral': 'нейтральность',
    'joy': 'радость',
    'surprise': 'удивление',
    'fear': 'страх',
    'anger': 'злость'
}

# Tunable thresholds for block splitting, scoring and clustering
ANALYSIS_PARAMS = {
    'min_block_lines': 2,        # minimum lines for a block to be kept
    'metaphor_threshold': 0.5,   # NOTE(review): currently unused — confirm before removing
    'emotion_threshold': 0.3,    # minimum score to keep a block for an emotion
    'clustering_eps': 0.5,       # DBSCAN neighborhood radius
    'clustering_min_samples': 2, # DBSCAN minimum cluster size
    'top_n_quotes': 3            # quotes kept per emotion
}
47
 
48
class TextAnalyzer:
    """Bundles the three pretrained models (emotion, sentiment, context)
    and exposes per-block analysis over all of them."""

    def __init__(self):
        self.models = {}       # model_type -> loaded model
        self.tokenizers = {}   # model_type -> matching tokenizer
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self._load_models()

    def _load_models(self):
        """Load tokenizer + model for every entry in MODELS.

        Entries whose 'task' is "feature-extraction" are loaded as plain
        encoders (AutoModel); everything else gets a classification head.
        Loading is best-effort: a failure is printed and the entry is left
        missing, so later lookups in self.models raise KeyError.
        """
        for model_type, config in MODELS.items():
            try:
                print(f"Loading {model_type} model...")
                self.tokenizers[model_type] = AutoTokenizer.from_pretrained(config['name'])
                if config['task'] == "feature-extraction":
                    self.models[model_type] = AutoModel.from_pretrained(config['name']).to(self.device)
                else:
                    self.models[model_type] = AutoModelForSequenceClassification.from_pretrained(config['name']).to(self.device)
                print(f"{model_type} model loaded successfully")
            except Exception as e:
                print(f"Error loading {model_type} model: {str(e)}")

    def analyze_text_block(self, text_block):
        """Analyze one block of text.

        Returns None for blank input, otherwise a dict with the raw text,
        sorted emotion scores, a scalar sentiment and the context embedding
        (numpy array; mean-pooled hidden state — presumably (1, hidden_size)).
        """
        if not text_block.strip():
            return None

        # Emotion analysis
        emotions = self._analyze_emotions(text_block)

        # Sentiment analysis
        sentiment = self._analyze_sentiment(text_block)

        # Context embedding
        context_embedding = self._get_context_embedding(text_block)

        return {
            'text': text_block,
            'emotions': emotions,
            'sentiment': sentiment,
            'context_embedding': context_embedding
        }

    def _analyze_emotions(self, text):
        """Score *text* against every emotion label.

        Returns a list of {'label', 'score'} dicts sorted by descending
        score, with labels translated to Russian via EMOTION_TRANSLATION.
        """
        model = self.models['emotion']
        tokenizer = self.tokenizers['emotion']
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)

        with torch.no_grad():
            outputs = model(**inputs)
            scores = torch.nn.functional.softmax(outputs.logits, dim=1)[0]

        emotions = []
        for idx, score in enumerate(scores):
            label = model.config.id2label[idx]
            emotion_label = EMOTION_TRANSLATION.get(label, label)
            emotions.append({
                'label': emotion_label,
                'score': float(score)
            })
        return sorted(emotions, key=lambda x: x['score'], reverse=True)

    def _analyze_sentiment(self, text):
        """Return a scalar sentiment: P(positive) - P(negative).

        Fix: label indices are resolved through the model's id2label mapping
        instead of assuming index 1 is positive and index 0 is negative —
        a fixed-index assumption silently computes the wrong difference when
        the checkpoint orders its labels differently (e.g. with a neutral
        class first). The old fixed-index arithmetic is kept only as a
        fallback for models without recognizable label names.
        """
        model = self.models['sentiment']
        tokenizer = self.tokenizers['sentiment']
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)

        with torch.no_grad():
            outputs = model(**inputs)
            scores = torch.nn.functional.softmax(outputs.logits, dim=1)[0]

        label_to_idx = {label.lower(): idx for idx, label in model.config.id2label.items()}
        if 'positive' in label_to_idx and 'negative' in label_to_idx:
            return float(scores[label_to_idx['positive']]) - float(scores[label_to_idx['negative']])
        # Fallback: original fixed-index heuristic
        return float(scores[1]) - float(scores[0])

    def _get_context_embedding(self, text):
        """Return a numpy embedding for *text*.

        Prefers the mean-pooled last hidden state (available when the
        context model is a plain encoder); falls back to raw logits when
        only a classification head is exposed.
        """
        model = self.models['context']
        tokenizer = self.tokenizers['context']
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)

        with torch.no_grad():
            outputs = model(**inputs)
            if hasattr(outputs, 'last_hidden_state'):
                # Mean-pool token embeddings into one vector per block
                embedding = outputs.last_hidden_state.mean(dim=1)
            else:
                embedding = outputs.logits

        return embedding.cpu().numpy()
131
+
132
class TextBlockAnalyzer:
    """Splits a text into multi-line blocks, analyzes each block with
    TextAnalyzer and aggregates emotions, sentiment and embedding clusters."""

    def __init__(self):
        self.analyzer = TextAnalyzer()
        # cluster label -> list of block analyses (filled by _cluster_blocks)
        self.emotion_clusters = defaultdict(list)

    def analyze_text(self, text):
        """Full pipeline: split into blocks, analyze each, cluster, format."""
        blocks = self._split_into_blocks(text)

        block_analyses = []
        for block in blocks:
            analysis = self.analyzer.analyze_text_block(block)
            if analysis:
                block_analyses.append(analysis)

        # Group blocks by embedding proximity
        self._cluster_blocks(block_analyses)

        return self._format_results(block_analyses)

    def _split_into_blocks(self, text):
        """Split *text* on blank lines and '['-prefixed annotation lines.

        Only blocks with at least ANALYSIS_PARAMS['min_block_lines'] lines
        are kept; shorter fragments are discarded.
        """
        blocks = []
        current_block = []

        for line in text.split('\n'):
            line = line.strip()
            if not line or line.startswith('['):
                if len(current_block) >= ANALYSIS_PARAMS['min_block_lines']:
                    blocks.append('\n'.join(current_block))
                current_block = []
                continue
            current_block.append(line)

        # Flush the trailing block
        if len(current_block) >= ANALYSIS_PARAMS['min_block_lines']:
            blocks.append('\n'.join(current_block))

        return blocks

    def _cluster_blocks(self, block_analyses):
        """DBSCAN-cluster blocks by context embedding into
        self.emotion_clusters (DBSCAN noise label is -1).

        Fix: the cluster map is reset on every call — it is instance state,
        and without the reset a reused analyzer accumulated blocks from
        previous texts under colliding cluster labels.
        """
        self.emotion_clusters = defaultdict(list)
        if not block_analyses:
            return

        # One embedding row per block
        embeddings = np.array([analysis['context_embedding'][0] for analysis in block_analyses])

        clustering = DBSCAN(
            eps=ANALYSIS_PARAMS['clustering_eps'],
            min_samples=ANALYSIS_PARAMS['clustering_min_samples']
        ).fit(embeddings)

        for idx, label in enumerate(clustering.labels_):
            self.emotion_clusters[label].append(block_analyses[idx])

    def _format_results(self, block_analyses):
        """Assemble the final result dict: top quotes per emotion, overall
        mood statistics, per-block progression and cluster summaries."""
        emotional_blocks = defaultdict(list)

        # Bucket each block under its strongest emotion, above the threshold
        for block in block_analyses:
            primary_emotion = max(block['emotions'], key=lambda x: x['score'])
            if primary_emotion['score'] >= ANALYSIS_PARAMS['emotion_threshold']:
                emotional_blocks[primary_emotion['label']].append({
                    'text': block['text'],
                    'score': primary_emotion['score'],
                    'sentiment': block['sentiment']
                })

        # Keep only the top-N quotes per emotion
        for emotion in emotional_blocks:
            emotional_blocks[emotion] = sorted(
                emotional_blocks[emotion],
                key=lambda x: x['score'],
                reverse=True
            )[:ANALYSIS_PARAMS['top_n_quotes']]

        return {
            # Plain dict so the result never carries defaultdict semantics
            'emotional_blocks': dict(emotional_blocks),
            'general_mood': self._determine_general_mood(block_analyses),
            'emotional_progression': self._analyze_emotional_progression(block_analyses),
            'clusters': self._format_clusters()
        }

    def _determine_general_mood(self, block_analyses):
        """Mean and variance of the per-block sentiment scores."""
        if not block_analyses:
            return {'average_sentiment': 0, 'sentiment_variance': 0}

        sentiments = [block['sentiment'] for block in block_analyses]
        return {
            'average_sentiment': float(np.mean(sentiments)),
            'sentiment_variance': float(np.var(sentiments))
        }

    def _analyze_emotional_progression(self, block_analyses):
        """Per-block dominant emotion and intensity, in document order."""
        progression = []
        for block in block_analyses:
            primary_emotion = max(block['emotions'], key=lambda x: x['score'])
            progression.append({
                'text': block['text'],
                'emotion': primary_emotion['label'],
                'intensity': float(primary_emotion['score'])
            })
        return progression

    def _format_clusters(self):
        """Summarize each DBSCAN cluster (noise label -1 is skipped) by its
        dominant emotion, average score and member texts."""
        clusters_info = []
        for label, blocks in self.emotion_clusters.items():
            if label == -1:  # skip outliers
                continue

            # Sum primary-emotion scores per emotion across the cluster
            cluster_emotions = defaultdict(float)
            for block in blocks:
                primary_emotion = max(block['emotions'], key=lambda x: x['score'])
                cluster_emotions[primary_emotion['label']] += primary_emotion['score']

            # Dominant emotion of the cluster
            dominant_emotion = max(cluster_emotions.items(), key=lambda x: x[1])

            clusters_info.append({
                'cluster_id': int(label),
                'dominant_emotion': dominant_emotion[0],
                'emotion_score': float(dominant_emotion[1] / len(blocks)),
                'blocks': [block['text'] for block in blocks]
            })

        return clusters_info
261
+
262
def analyze_text_file(text, original_file_name):
    """Run the full analysis pipeline on *text* and dump the result to JSON.

    The output file is written to the current working directory as
    "<stem>_analysis.json". Returns its path, or None on any failure
    (the error is printed, not raised, so batch processing continues).
    """
    try:
        # Fresh analyzer per call; analyze and assemble the report
        analysis_results = TextBlockAnalyzer().analyze_text(text)

        output = {
            'file_name': original_file_name,
            'analysis': {
                'emotional_blocks': analysis_results['emotional_blocks'],
                'general_mood': analysis_results['general_mood'],
                'emotional_progression': analysis_results['emotional_progression'],
                'emotional_clusters': analysis_results['clusters']
            }
        }

        # Persist the report next to the working directory
        stem = os.path.splitext(original_file_name)[0]
        output_file_path = os.path.join(os.getcwd(), f"{stem}_analysis.json")
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(output, f, ensure_ascii=False, indent=2)

        return output_file_path

    except Exception as e:
        print(f"Произошла ошибка: {str(e)}")
        return None
292
 
293
def analyze_files(file_paths):
    """Analyze every uploaded text file and return the result-file paths.

    Files that fail to read or analyze are reported to stdout and skipped;
    the remaining results are still returned.
    """
    result_files = []
    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                contents = f.read()
            result_path = analyze_text_file(contents, file_name)
            if result_path:
                result_files.append(result_path)
        except Exception as e:
            print(f"Ошибка при обработке файла {file_name}: {str(e)}")
    return result_files
306
 
307
  # Создание интерфейса Gradio
 
309
  fn=analyze_files,
310
  inputs=gr.File(label="Загрузите .txt файлы", file_count="multiple", type="filepath"),
311
  outputs=gr.File(label="Скачайте результаты", file_count="multiple"),
312
+ title="Расширенный анализ текста по настроению",
313
+ description="Загрузите .txt файлы для многоуровневого анализа эмоций и настроения."
314
  )
315
 
 
316
  if __name__ == "__main__":
317
  demo.launch()