Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
|
| 3 |
import json
|
|
@@ -5,10 +6,14 @@ import os
|
|
| 5 |
from sentence_transformers import SentenceTransformer
|
| 6 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 7 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def analyze_text():
|
| 10 |
try:
|
| 11 |
-
# Читаем исходный текст
|
| 12 |
file_path = os.path.join(os.getcwd(), "test.txt")
|
| 13 |
if not os.path.exists(file_path):
|
| 14 |
return "Ошибка: файл 'test.txt' не найден."
|
|
@@ -21,66 +26,132 @@ def analyze_text():
|
|
| 21 |
# 2. Разбиение на блоки
|
| 22 |
blocks = split_into_blocks(cleaned_text)
|
| 23 |
|
| 24 |
-
# 3. Семантический анализ
|
| 25 |
semantic_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
|
| 26 |
embeddings = semantic_model.encode(blocks)
|
| 27 |
|
| 28 |
-
# 4. Эмоциональный анализ
|
| 29 |
model_name = "seara/rubert-tiny2-russian-emotion-detection-ru-go-emotions"
|
| 30 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 31 |
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
| 32 |
emotion_classifier = pipeline("text-classification", tokenizer=tokenizer, model=model, return_all_scores=True)
|
| 33 |
-
emotion_results = analyze_emotions(blocks, emotion_classifier)
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
# 5. Перекрестный анализ
|
| 36 |
results = cross_analysis(blocks, embeddings, emotion_results)
|
| 37 |
|
| 38 |
-
# 6.
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
# Возвращаем результаты в интерфейс
|
| 42 |
-
formatted_results = json.dumps(
|
| 43 |
return formatted_results
|
| 44 |
|
| 45 |
except Exception as e:
|
| 46 |
return f"Произошла ошибка: {str(e)}"
|
| 47 |
|
| 48 |
def preprocess_text(text):
|
|
|
|
| 49 |
lines = []
|
| 50 |
for line in text.split('\n'):
|
| 51 |
line = line.strip()
|
| 52 |
-
# Удаляем метки в квадратных скобках, заменяя их пустой строкой
|
| 53 |
if line.startswith('[') and line.endswith(']'):
|
| 54 |
-
lines.append('')
|
| 55 |
elif line and 'Название:' not in line and 'URL:' not in line and '===' not in line:
|
| 56 |
lines.append(line)
|
| 57 |
cleaned_text = '\n'.join(lines)
|
| 58 |
return cleaned_text
|
| 59 |
|
| 60 |
def split_into_blocks(text):
|
| 61 |
-
|
|
|
|
| 62 |
return blocks
|
| 63 |
|
| 64 |
-
def analyze_emotions(blocks, emotion_classifier):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
results = []
|
| 66 |
for block in blocks:
|
| 67 |
predictions = emotion_classifier(block, truncation=True)[0]
|
| 68 |
emotions = {}
|
| 69 |
for pred in predictions:
|
| 70 |
emotion_label = pred['label']
|
| 71 |
-
|
| 72 |
-
|
| 73 |
dominant_emotion = max(emotions, key=emotions.get)
|
| 74 |
dominant_score = emotions[dominant_emotion]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
results.append({
|
| 76 |
'text': block,
|
| 77 |
-
'dominant_emotion':
|
| 78 |
'score': dominant_score,
|
| 79 |
'emotions': emotions
|
| 80 |
})
|
| 81 |
return results
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
def cross_analysis(blocks, embeddings, emotion_results):
|
|
|
|
| 84 |
similarity_matrix = cosine_similarity(embeddings)
|
| 85 |
for i, result in enumerate(emotion_results):
|
| 86 |
similarities = similarity_matrix[i]
|
|
@@ -89,7 +160,40 @@ def cross_analysis(blocks, embeddings, emotion_results):
|
|
| 89 |
result['similarity_scores'] = [round(float(similarities[j]), 3) for j in similar_indices]
|
| 90 |
return emotion_results
|
| 91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
def save_results(results, filename='results.json'):
|
|
|
|
| 93 |
with open(filename, 'w', encoding='utf-8') as f:
|
| 94 |
json.dump(results, f, ensure_ascii=False, indent=2)
|
| 95 |
|
|
|
|
| 1 |
+
# Импорт необходимых библиотек
|
| 2 |
import gradio as gr
|
| 3 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
|
| 4 |
import json
|
|
|
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 8 |
import numpy as np
|
| 9 |
+
import stanza
|
| 10 |
+
|
| 11 |
+
# Инициализация пайплайна Stanza для русского языка
|
| 12 |
+
nlp = stanza.Pipeline('ru', processors='tokenize,pos,lemma,depparse', use_gpu=False)
|
| 13 |
|
| 14 |
def analyze_text():
|
| 15 |
try:
|
| 16 |
+
# Читаем исходный текст из файла 'test.txt'
|
| 17 |
file_path = os.path.join(os.getcwd(), "test.txt")
|
| 18 |
if not os.path.exists(file_path):
|
| 19 |
return "Ошибка: файл 'test.txt' не найден."
|
|
|
|
| 26 |
# 2. Разбиение на блоки
|
| 27 |
blocks = split_into_blocks(cleaned_text)
|
| 28 |
|
| 29 |
+
# 3. Семантический анализ (векторизация блоков)
|
| 30 |
semantic_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
|
| 31 |
embeddings = semantic_model.encode(blocks)
|
| 32 |
|
| 33 |
+
# 4. Эмоциональный анализ блоков с использованием модели с 27 эмоциями
|
| 34 |
model_name = "seara/rubert-tiny2-russian-emotion-detection-ru-go-emotions"
|
| 35 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 36 |
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
| 37 |
emotion_classifier = pipeline("text-classification", tokenizer=tokenizer, model=model, return_all_scores=True)
|
| 38 |
+
emotion_results = analyze_emotions(blocks, emotion_classifier, threshold=0.5)
|
| 39 |
+
|
| 40 |
+
# 4.1. Эмоциональный анализ всего текста
|
| 41 |
+
total_result = analyze_total_text(blocks, emotion_classifier)
|
| 42 |
|
| 43 |
+
# 5. Перекрестный анализ (поиск схожих блоков)
|
| 44 |
results = cross_analysis(blocks, embeddings, emotion_results)
|
| 45 |
|
| 46 |
+
# 6. Тематическое моделирование текста (LDA)
|
| 47 |
+
topics = topic_modeling(blocks, num_topics=3)
|
| 48 |
+
|
| 49 |
+
# 7. Лексический и синтаксический анализ блоков с использованием Stanza
|
| 50 |
+
for result in results:
|
| 51 |
+
analysis = lexical_syntactic_analysis(result['text'])
|
| 52 |
+
result['lexical_analysis'] = analysis
|
| 53 |
+
|
| 54 |
+
# 8. Визуализация эмоций (опционально)
|
| 55 |
+
# plot_emotions(results)
|
| 56 |
+
|
| 57 |
+
# Собираем все результаты в один словарь
|
| 58 |
+
output = {
|
| 59 |
+
'block_analysis': results,
|
| 60 |
+
'total_emotion': total_result['dominant_emotion'],
|
| 61 |
+
'total_emotions': total_result['emotions'],
|
| 62 |
+
'topics': topics
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
# 9. Сохранение результатов в файл 'results.json'
|
| 66 |
+
save_results(output)
|
| 67 |
|
| 68 |
+
# Возвращаем результаты в интерфейс Gradio
|
| 69 |
+
formatted_results = json.dumps(output, ensure_ascii=False, indent=2)
|
| 70 |
return formatted_results
|
| 71 |
|
| 72 |
except Exception as e:
|
| 73 |
return f"Произошла ошибка: {str(e)}"
|
| 74 |
|
| 75 |
def preprocess_text(text):
    """Clean raw input text before block splitting.

    Lines wrapped entirely in square brackets (e.g. ``[music]`` tags) are
    replaced with an empty line so paragraph boundaries survive; lines
    carrying source metadata markers ('Название:', 'URL:', '===') and
    blank lines are dropped. Returns the cleaned text joined with '\n'.
    """
    kept = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if stripped.startswith('[') and stripped.endswith(']'):
            # Bracketed tag: keep an empty placeholder to preserve structure.
            kept.append('')
            continue
        has_metadata = ('Название:' in stripped
                        or 'URL:' in stripped
                        or '===' in stripped)
        if stripped and not has_metadata:
            kept.append(stripped)
    return '\n'.join(kept)
|
| 86 |
|
| 87 |
def split_into_blocks(text):
    """Split cleaned text into paragraph blocks.

    Blocks are separated by blank lines ('\n\n'). Fragments that are
    empty or whitespace-only — produced by runs of consecutive blank
    lines — are filtered out, so downstream consumers (the sentence
    transformer and the emotion classifier) never receive an empty
    input block.
    """
    fragments = (part.strip() for part in text.strip().split('\n\n'))
    return [fragment for fragment in fragments if fragment]
|
| 91 |
|
| 92 |
+
def analyze_emotions(blocks, emotion_classifier, threshold=0.5):
    """Classify the emotion of each text block.

    Every block is scored against the full GoEmotions label set by
    ``emotion_classifier``; labels are translated to Russian for display.
    When the top score falls below ``threshold`` the block is reported
    as mixed emotions instead of forcing a single label.

    Returns a list of dicts with keys 'text', 'dominant_emotion',
    'score' and 'emotions' (the full label->score map).
    """
    # English GoEmotions labels -> Russian display names.
    emotion_translation = {
        "admiration": "восхищение",
        "amusement": "развлечение",
        "anger": "злость",
        "annoyance": "раздражение",
        "approval": "одобрение",
        "caring": "забота",
        "confusion": "замешательство",
        "curiosity": "любопытство",
        "desire": "желание",
        "disappointment": "разочарование",
        "disapproval": "неодобрение",
        "disgust": "отвращение",
        "embarrassment": "смущение",
        "excitement": "волнение",
        "fear": "страх",
        "gratitude": "благодарность",
        "grief": "горе",
        "joy": "радость",
        "love": "любовь",
        "nervousness": "нервозность",
        "optimism": "оптимизм",
        "pride": "гордость",
        "realization": "осознание",
        "relief": "облегчение",
        "remorse": "раскаяние",
        "sadness": "печаль",
        "surprise": "удивление",
        "neutral": "нейтрально"
    }

    analyzed = []
    for block in blocks:
        predictions = emotion_classifier(block, truncation=True)[0]
        # Translate each label (fall back to the raw label) and round scores.
        scores = {
            emotion_translation.get(pred['label'], pred['label']):
                round(float(pred['score']), 3)
            for pred in predictions
        }
        top_emotion = max(scores, key=scores.get)
        top_score = scores[top_emotion]
        # A weak maximum means no single emotion dominates the block.
        label = top_emotion if top_score >= threshold else "Смешанные эмоции"
        analyzed.append({
            'text': block,
            'dominant_emotion': label,
            'score': top_score,
            'emotions': scores
        })
    return analyzed
|
| 146 |
|
| 147 |
+
def analyze_total_text(blocks, emotion_classifier):
    """Classify the emotion of the whole document at once.

    Joins every block into a single space-separated text, reuses
    analyze_emotions on that one combined text, and returns its single
    result dict.
    """
    combined = ' '.join(blocks)
    (overall,) = analyze_emotions([combined], emotion_classifier)
    return overall
|
| 152 |
+
|
| 153 |
def cross_analysis(blocks, embeddings, emotion_results):
|
| 154 |
+
# Перекрестный анализ
|
| 155 |
similarity_matrix = cosine_similarity(embeddings)
|
| 156 |
for i, result in enumerate(emotion_results):
|
| 157 |
similarities = similarity_matrix[i]
|
|
|
|
| 160 |
result['similarity_scores'] = [round(float(similarities[j]), 3) for j in similar_indices]
|
| 161 |
return emotion_results
|
| 162 |
|
| 163 |
+
def topic_modeling(blocks, num_topics=3):
    """Fit an LDA topic model over the text blocks.

    Tokenizes each block with NLTK, builds a gensim dictionary and
    bag-of-words corpus, trains an LdaModel, and returns a list of
    human-readable topic description strings.

    NOTE(review): ``word_tokenize`` needs the NLTK 'punkt' resource to
    be available at runtime — confirm it is downloaded in deployment.
    """
    # Imported lazily so the heavy deps load only when topics are requested.
    from gensim import corpora, models
    from nltk.tokenize import word_tokenize

    tokenized = [word_tokenize(block.lower()) for block in blocks]
    vocabulary = corpora.Dictionary(tokenized)
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in tokenized]
    lda = models.LdaModel(bow_corpus, num_topics=num_topics,
                          id2word=vocabulary, passes=10)
    return [f"Тема {topic_id + 1}: {description}"
            for topic_id, description in lda.print_topics()]
|
| 177 |
+
|
| 178 |
+
def lexical_syntactic_analysis(block):
    """Tokenize and parse one text block with the Stanza pipeline.

    Returns a flat list of per-word dicts: surface form, lemma,
    universal/treebank POS tags, morphological features, and the
    dependency head index and relation.

    NOTE(review): relies on the module-level ``nlp`` Stanza pipeline
    initialized at import time.
    """
    parsed = nlp(block)
    return [
        {
            'text': word.text,
            'lemma': word.lemma,
            'upos': word.upos,
            'xpos': word.xpos,
            'feats': word.feats,
            'head': word.head,
            'deprel': word.deprel
        }
        for sentence in parsed.sentences
        for word in sentence.words
    ]
|
| 194 |
+
|
| 195 |
def save_results(results, filename='results.json'):
    """Persist analysis results as pretty-printed UTF-8 JSON.

    ``ensure_ascii=False`` keeps Russian labels readable in the file
    instead of escaping them to \\uXXXX sequences.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(results, out_file, ensure_ascii=False, indent=2)
|
| 199 |
|