Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
|
| 3 |
import json
|
|
@@ -5,10 +6,14 @@ import os
|
|
| 5 |
from sentence_transformers import SentenceTransformer
|
| 6 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 7 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def analyze_text():
|
| 10 |
try:
|
| 11 |
-
# Читаем исходный текст
|
| 12 |
file_path = os.path.join(os.getcwd(), "test.txt")
|
| 13 |
if not os.path.exists(file_path):
|
| 14 |
return "Ошибка: файл 'test.txt' не найден."
|
|
@@ -21,66 +26,132 @@ def analyze_text():
|
|
| 21 |
# 2. Разбиение на блоки
|
| 22 |
blocks = split_into_blocks(cleaned_text)
|
| 23 |
|
| 24 |
-
# 3. Семантический анализ
|
| 25 |
semantic_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
|
| 26 |
embeddings = semantic_model.encode(blocks)
|
| 27 |
|
| 28 |
-
# 4. Эмоциональный анализ
|
| 29 |
model_name = "seara/rubert-tiny2-russian-emotion-detection-ru-go-emotions"
|
| 30 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 31 |
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
| 32 |
emotion_classifier = pipeline("text-classification", tokenizer=tokenizer, model=model, return_all_scores=True)
|
| 33 |
-
emotion_results = analyze_emotions(blocks, emotion_classifier)
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
# 5. Перекрестный анализ
|
| 36 |
results = cross_analysis(blocks, embeddings, emotion_results)
|
| 37 |
|
| 38 |
-
# 6.
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
# Возвращаем результаты в интерфейс
|
| 42 |
-
formatted_results = json.dumps(
|
| 43 |
return formatted_results
|
| 44 |
|
| 45 |
except Exception as e:
|
| 46 |
return f"Произошла ошибка: {str(e)}"
|
| 47 |
|
| 48 |
def preprocess_text(text):
|
|
|
|
| 49 |
lines = []
|
| 50 |
for line in text.split('\n'):
|
| 51 |
line = line.strip()
|
| 52 |
-
# Удаляем метки в квадратных скобках, заменяя их пустой строкой
|
| 53 |
if line.startswith('[') and line.endswith(']'):
|
| 54 |
-
lines.append('')
|
| 55 |
elif line and 'Название:' not in line and 'URL:' not in line and '===' not in line:
|
| 56 |
lines.append(line)
|
| 57 |
cleaned_text = '\n'.join(lines)
|
| 58 |
return cleaned_text
|
| 59 |
|
| 60 |
def split_into_blocks(text):
|
| 61 |
-
|
|
|
|
| 62 |
return blocks
|
| 63 |
|
| 64 |
-
def analyze_emotions(blocks, emotion_classifier):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
results = []
|
| 66 |
for block in blocks:
|
| 67 |
predictions = emotion_classifier(block, truncation=True)[0]
|
| 68 |
emotions = {}
|
| 69 |
for pred in predictions:
|
| 70 |
emotion_label = pred['label']
|
| 71 |
-
|
| 72 |
-
|
| 73 |
dominant_emotion = max(emotions, key=emotions.get)
|
| 74 |
dominant_score = emotions[dominant_emotion]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
results.append({
|
| 76 |
'text': block,
|
| 77 |
-
'dominant_emotion':
|
| 78 |
'score': dominant_score,
|
| 79 |
'emotions': emotions
|
| 80 |
})
|
| 81 |
return results
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
def cross_analysis(blocks, embeddings, emotion_results):
|
|
|
|
| 84 |
similarity_matrix = cosine_similarity(embeddings)
|
| 85 |
for i, result in enumerate(emotion_results):
|
| 86 |
similarities = similarity_matrix[i]
|
|
@@ -89,7 +160,40 @@ def cross_analysis(blocks, embeddings, emotion_results):
|
|
| 89 |
result['similarity_scores'] = [round(float(similarities[j]), 3) for j in similar_indices]
|
| 90 |
return emotion_results
|
| 91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
def save_results(results, filename='results.json'):
|
|
|
|
| 93 |
with open(filename, 'w', encoding='utf-8') as f:
|
| 94 |
json.dump(results, f, ensure_ascii=False, indent=2)
|
| 95 |
|
|
|
|
| 1 |
+
# Импорт необходимых библиотек
|
| 2 |
import gradio as gr
|
| 3 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
|
| 4 |
import json
|
|
|
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 8 |
import numpy as np
|
| 9 |
+
import stanza
|
| 10 |
+
|
| 11 |
+
# Инициализация пайплайна Stanza для русского языка
|
| 12 |
+
nlp = stanza.Pipeline('ru', processors='tokenize,pos,lemma,depparse', use_gpu=False)
|
| 13 |
|
| 14 |
def analyze_text():
|
| 15 |
try:
|
| 16 |
+
# Читаем исходный текст из файла 'test.txt'
|
| 17 |
file_path = os.path.join(os.getcwd(), "test.txt")
|
| 18 |
if not os.path.exists(file_path):
|
| 19 |
return "Ошибка: файл 'test.txt' не найден."
|
|
|
|
| 26 |
# 2. Разбиение на блоки
|
| 27 |
blocks = split_into_blocks(cleaned_text)
|
| 28 |
|
| 29 |
+
# 3. Семантический анализ (векторизация блоков)
|
| 30 |
semantic_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
|
| 31 |
embeddings = semantic_model.encode(blocks)
|
| 32 |
|
| 33 |
+
# 4. Эмоциональный анализ блоков с использованием модели с 27 эмоциями
|
| 34 |
model_name = "seara/rubert-tiny2-russian-emotion-detection-ru-go-emotions"
|
| 35 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 36 |
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
| 37 |
emotion_classifier = pipeline("text-classification", tokenizer=tokenizer, model=model, return_all_scores=True)
|
| 38 |
+
emotion_results = analyze_emotions(blocks, emotion_classifier, threshold=0.5)
|
| 39 |
+
|
| 40 |
+
# 4.1. Эмоциональный анализ всего текста
|
| 41 |
+
total_result = analyze_total_text(blocks, emotion_classifier)
|
| 42 |
|
| 43 |
+
# 5. Перекрестный анализ (поиск схожих блоков)
|
| 44 |
results = cross_analysis(blocks, embeddings, emotion_results)
|
| 45 |
|
| 46 |
+
# 6. Тематическое моделирование текста (LDA)
|
| 47 |
+
topics = topic_modeling(blocks, num_topics=3)
|
| 48 |
+
|
| 49 |
+
# 7. Лексический и синтаксический анализ блоков с использованием Stanza
|
| 50 |
+
for result in results:
|
| 51 |
+
analysis = lexical_syntactic_analysis(result['text'])
|
| 52 |
+
result['lexical_analysis'] = analysis
|
| 53 |
+
|
| 54 |
+
# 8. Визуализация эмоций (опционально)
|
| 55 |
+
# plot_emotions(results)
|
| 56 |
+
|
| 57 |
+
# Собираем все результаты в один словарь
|
| 58 |
+
output = {
|
| 59 |
+
'block_analysis': results,
|
| 60 |
+
'total_emotion': total_result['dominant_emotion'],
|
| 61 |
+
'total_emotions': total_result['emotions'],
|
| 62 |
+
'topics': topics
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
# 9. Сохранение результатов в файл 'results.json'
|
| 66 |
+
save_results(output)
|
| 67 |
|
| 68 |
+
# Возвращаем результаты в интерфейс Gradio
|
| 69 |
+
formatted_results = json.dumps(output, ensure_ascii=False, indent=2)
|
| 70 |
return formatted_results
|
| 71 |
|
| 72 |
except Exception as e:
|
| 73 |
return f"Произошла ошибка: {str(e)}"
|
| 74 |
|
| 75 |
def preprocess_text(text):
    """Clean raw input text before block splitting.

    Lines wrapped entirely in square brackets (e.g. ``[music]`` tags) are
    replaced with an empty line so paragraph boundaries survive; lines
    carrying source metadata markers ('Название:', 'URL:', '===') and
    blank lines are dropped. Returns the cleaned text joined with '\n'.
    """
    kept = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if stripped.startswith('[') and stripped.endswith(']'):
            # Bracketed tag: keep an empty placeholder to preserve structure.
            kept.append('')
            continue
        has_metadata = ('Название:' in stripped
                        or 'URL:' in stripped
                        or '===' in stripped)
        if stripped and not has_metadata:
            kept.append(stripped)
    return '\n'.join(kept)
|
| 86 |
|
| 87 |
def split_into_blocks(text):
    """Split cleaned text into paragraph blocks.

    Blocks are separated by blank lines ('\n\n'). Fragments that are
    empty or whitespace-only — produced by runs of consecutive blank
    lines — are filtered out, so downstream consumers (the sentence
    transformer and the emotion classifier) never receive an empty
    input block.
    """
    fragments = (part.strip() for part in text.strip().split('\n\n'))
    return [fragment for fragment in fragments if fragment]
|
| 91 |
|
| 92 |
+
def analyze_emotions(blocks, emotion_classifier, threshold=0.5):
    """Classify the emotion of each text block.

    Every block is scored against the full GoEmotions label set by
    ``emotion_classifier``; labels are translated to Russian for display.
    When the top score falls below ``threshold`` the block is reported
    as mixed emotions instead of forcing a single label.

    Returns a list of dicts with keys 'text', 'dominant_emotion',
    'score' and 'emotions' (the full label->score map).
    """
    # English GoEmotions labels -> Russian display names.
    emotion_translation = {
        "admiration": "восхищение",
        "amusement": "развлечение",
        "anger": "злость",
        "annoyance": "раздражение",
        "approval": "одобрение",
        "caring": "забота",
        "confusion": "замешательство",
        "curiosity": "любопытство",
        "desire": "желание",
        "disappointment": "разочарование",
        "disapproval": "неодобрение",
        "disgust": "отвращение",
        "embarrassment": "смущение",
        "excitement": "волнение",
        "fear": "страх",
        "gratitude": "благодарность",
        "grief": "горе",
        "joy": "радость",
        "love": "любовь",
        "nervousness": "нервозность",
        "optimism": "оптимизм",
        "pride": "гордость",
        "realization": "осознание",
        "relief": "облегчение",
        "remorse": "раскаяние",
        "sadness": "печаль",
        "surprise": "удивление",
        "neutral": "нейтрально"
    }

    analyzed = []
    for block in blocks:
        predictions = emotion_classifier(block, truncation=True)[0]
        # Translate each label (fall back to the raw label) and round scores.
        scores = {
            emotion_translation.get(pred['label'], pred['label']):
                round(float(pred['score']), 3)
            for pred in predictions
        }
        top_emotion = max(scores, key=scores.get)
        top_score = scores[top_emotion]
        # A weak maximum means no single emotion dominates the block.
        label = top_emotion if top_score >= threshold else "Смешанные эмоции"
        analyzed.append({
            'text': block,
            'dominant_emotion': label,
            'score': top_score,
            'emotions': scores
        })
    return analyzed
|
| 146 |
|
| 147 |
+
def analyze_total_text(blocks, emotion_classifier):
    """Classify the emotion of the whole document at once.

    Joins every block into a single space-separated text, reuses
    analyze_emotions on that one combined text, and returns its single
    result dict.
    """
    combined = ' '.join(blocks)
    (overall,) = analyze_emotions([combined], emotion_classifier)
    return overall
|
| 152 |
+
|
| 153 |
def cross_analysis(blocks, embeddings, emotion_results):
|
| 154 |
+
# Перекрестный анализ
|
| 155 |
similarity_matrix = cosine_similarity(embeddings)
|
| 156 |
for i, result in enumerate(emotion_results):
|
| 157 |
similarities = similarity_matrix[i]
|
|
|
|
| 160 |
result['similarity_scores'] = [round(float(similarities[j]), 3) for j in similar_indices]
|
| 161 |
return emotion_results
|
| 162 |
|
| 163 |
+
def topic_modeling(blocks, num_topics=3):
    """Fit an LDA topic model over the text blocks.

    Tokenizes each block with NLTK, builds a gensim dictionary and
    bag-of-words corpus, trains an LdaModel, and returns a list of
    human-readable topic description strings.

    NOTE(review): ``word_tokenize`` needs the NLTK 'punkt' resource to
    be available at runtime — confirm it is downloaded in deployment.
    """
    # Imported lazily so the heavy deps load only when topics are requested.
    from gensim import corpora, models
    from nltk.tokenize import word_tokenize

    tokenized = [word_tokenize(block.lower()) for block in blocks]
    vocabulary = corpora.Dictionary(tokenized)
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in tokenized]
    lda = models.LdaModel(bow_corpus, num_topics=num_topics,
                          id2word=vocabulary, passes=10)
    return [f"Тема {topic_id + 1}: {description}"
            for topic_id, description in lda.print_topics()]
|
| 177 |
+
|
| 178 |
+
def lexical_syntactic_analysis(block):
    """Tokenize and parse one text block with the Stanza pipeline.

    Returns a flat list of per-word dicts: surface form, lemma,
    universal/treebank POS tags, morphological features, and the
    dependency head index and relation.

    NOTE(review): relies on the module-level ``nlp`` Stanza pipeline
    initialized at import time.
    """
    parsed = nlp(block)
    return [
        {
            'text': word.text,
            'lemma': word.lemma,
            'upos': word.upos,
            'xpos': word.xpos,
            'feats': word.feats,
            'head': word.head,
            'deprel': word.deprel
        }
        for sentence in parsed.sentences
        for word in sentence.words
    ]
|
| 194 |
+
|
| 195 |
def save_results(results, filename='results.json'):
    """Persist analysis results as pretty-printed UTF-8 JSON.

    ``ensure_ascii=False`` keeps Russian labels readable in the file
    instead of escaping them to \\uXXXX sequences.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(results, out_file, ensure_ascii=False, indent=2)
|
| 199 |
|