# sentimentanalyzer01's picture
# Update app.py
# 8b2ae78 verified
import os
import sys
import json
import pickle
import torch
import torch.nn as nn
import numpy as np
import re
from typing import Dict, List, Any, Optional
from collections import defaultdict
import networkx as nx
import pymorphy3
from fastapi import FastAPI, Request, Form, HTTPException
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.templating import Jinja2Templates
import uvicorn
from transformers import BertTokenizer, BertModel
import warnings
# Silence library deprecation/user warnings in the server logs.
warnings.filterwarnings('ignore')
# Pick the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Используется устройство: {device}")
# ============================================================
# ВСПОМОГАТЕЛЬНЫЕ ФУНКЦИИ
# ============================================================
def clean_russian_text(text):
    """Normalize Russian social-media text for emotion classification.

    Lowercases, strips URLs and e-mail addresses, rewrites common emoticons
    into sentiment placeholder tokens, removes unexpected symbols and
    collapses whitespace.

    Args:
        text: raw input; any non-string yields "".

    Returns:
        The cleaned, single-spaced lowercase string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # drop URLs
    text = re.sub(r'\S+@\S+', '', text)                  # drop e-mail addresses
    # BUGFIX: multi-character emoticons must be rewritten BEFORE the bare
    # brackets, otherwise the ')' inside ';)' is consumed first and the wink
    # never matches.  Keys are lowercase because the text was already
    # lowercased above (':D' could previously never match).
    smileys = {
        ':)': ' смайлик_радость ',
        ':(': ' смайлик_грусть ',
        ':d': ' смайлик_смех ',
        ';)': ' смайлик_подмигивание ',
        ')': ' смайлик_радость ',
        '(': ' смайлик_грусть ',
    }
    for smiley, replacement in smileys.items():
        text = text.replace(smiley, replacement)
    # Keep word chars, whitespace, Cyrillic and basic punctuation only.
    text = re.sub(r'[^\w\sа-яё.,!?;:)(-]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# ============================================================
# КЛАССЫ МОДЕЛЕЙ (упрощенные для инференса)
# ============================================================
class EmotionLSTM(nn.Module):
    """Bidirectional LSTM text classifier over padded token-id sequences.

    Architecture: embedding -> stacked BiLSTM -> 3-layer MLP head that
    produces `num_classes` logits.
    """

    def __init__(self, vocab_size, embed_dim=100, hidden_dim=256,
                 num_classes=5, dropout=0.3, num_layers=2):
        super().__init__()
        # padding_idx=0 keeps the <PAD> embedding frozen at zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # Inter-layer dropout is only legal on a stacked LSTM.
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0,
        )
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
        )

    def forward(self, x, return_confidence=False):
        """Return logits for token-id batch `x` (batch, seq_len).

        When `return_confidence` is True, also return each sample's maximum
        softmax probability as a confidence score.
        """
        _, (h_n, _) = self.lstm(self.embedding(x))
        # Concatenate the final forward and backward hidden states of the
        # top LSTM layer.
        pooled = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
        logits = self.classifier(self.dropout(pooled))
        if not return_confidence:
            return logits
        confidence = torch.softmax(logits, dim=1).max(dim=1).values
        return logits, confidence
class EmotionBERT(nn.Module):
    """Pretrained BERT encoder with an MLP classification head on [CLS]."""

    def __init__(self, bert_model_name, num_classes, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        bert_dim = self.bert.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(bert_dim, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),
        )

    def forward(self, input_ids, attention_mask, return_confidence=False):
        """Return logits; optionally also the max-softmax confidence."""
        encoded = self.bert(input_ids, attention_mask, return_dict=True)
        # First token of the last hidden layer is the [CLS] representation.
        cls_vec = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_vec)
        if not return_confidence:
            return logits
        confidence = torch.softmax(logits, dim=1).max(dim=1).values
        return logits, confidence
class OntologyEmotionModel:
    """Rule/ontology layer applied on top of a neural emotion prediction.

    Holds a small emotion ontology (valence/arousal per emotion) and a
    lemma-based lexicon of intensifiers, diminishers and negations.
    Attribute names are part of the pickle contract — do not rename.
    """

    def __init__(self, emotions: List[str]):
        self.emotions = emotions
        self.morph = pymorphy3.MorphAnalyzer()
        self.ontology_graph = nx.DiGraph()
        self.sentiment_lexicon = {}
        # NOTE(review): rule weights are stored but not yet used to rescale
        # confidences (see adjust_prediction_with_rules).
        self.linguistic_rules = {
            'усилители': {'words': ['очень', 'сильно', 'крайне'], 'weight': 0.3},
            'ослабители': {'words': ['слегка', 'немного', 'чуть-чуть'], 'weight': -0.2},
            'отрицания': {'words': ['не', 'ни', 'нет'], 'weight': -0.5},
        }
        self.init_ontology_level1()

    def init_ontology_level1(self):
        """Populate the ontology graph with one valence/arousal node per emotion."""
        self.emotion_definitions = {
            'радость': {'valence': 'positive', 'arousal': 'high'},
            'грусть': {'valence': 'negative', 'arousal': 'low'},
            'злость': {'valence': 'negative', 'arousal': 'high'},
            'страх': {'valence': 'negative', 'arousal': 'high'},
            'сарказм': {'valence': 'negative', 'arousal': 'high'},
        }
        for emotion in self.emotions:
            definition = self.emotion_definitions.get(emotion)
            if definition is not None:
                self.ontology_graph.add_node(emotion, **definition)

    def apply_linguistic_rules(self, text: str) -> Dict:
        """Lemmatize `text` and report which lexicon rules it triggers."""
        lemmas = [self.morph.parse(token)[0].normal_form
                  for token in text.lower().split()]
        lemma_set = set(lemmas)
        triggered = [f"{category}: {word}"
                     for category, rule in self.linguistic_rules.items()
                     for word in rule['words']
                     if word in lemma_set]
        return {'rules_applied': triggered, 'lemmas': lemmas}

    def adjust_prediction_with_rules(self, prediction: Dict, rule_analysis: Dict) -> Dict:
        """Pass-through that attaches the triggered rules to the prediction."""
        return {
            'emotion': prediction['emotion'],
            'confidence': prediction['confidence'],
            'rules_applied': rule_analysis['rules_applied'],
        }

    def get_ontology_analysis(self, text: str, model_prediction: Dict) -> Dict:
        """Run the rule layer and return both the raw and adjusted results."""
        rules = self.apply_linguistic_rules(text)
        return {
            'rule_analysis': rules,
            'adjusted_prediction': self.adjust_prediction_with_rules(model_prediction, rules),
        }

    def get_statistics(self) -> Dict:
        """Size statistics for introspection endpoints."""
        return {
            'ontology_nodes': len(self.ontology_graph.nodes),
            'linguistic_rules': len(self.linguistic_rules),
        }
class CascadeEmotionClassifier:
    """Two-stage emotion classifier.

    The fast LSTM runs first; if its ontology-adjusted confidence falls
    below `threshold`, the slower BERT model is consulted instead.
    """

    def __init__(self, lstm_model, bert_model, vocab, tokenizer,
                 label_encoder, ontology_model, threshold=0.95, device='cpu',
                 max_length_lstm=100, max_length_bert=128):
        self.lstm_model = lstm_model
        self.bert_model = bert_model
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.label_encoder = label_encoder
        self.ontology_model = ontology_model
        self.threshold = threshold
        self.device = device
        self.max_length_lstm = max_length_lstm
        self.max_length_bert = max_length_bert
        # Inference-only: disable dropout and move models to the device.
        self.lstm_model.eval()
        self.bert_model.eval()
        self.lstm_model.to(device)
        self.bert_model.to(device)
        # Usage counters exposed via the /stats endpoint.
        self.stats = {'total': 0, 'lstm': 0, 'bert': 0}

    def text_to_sequence(self, text):
        """Map text to a fixed-length list of vocab ids (pad with <PAD>=0,
        unknown words to <UNK>=1, truncate at max_length_lstm)."""
        words = str(text).split()[:self.max_length_lstm]
        sequence = [self.vocab.get(word, self.vocab.get('<UNK>', 1)) for word in words]
        if len(sequence) < self.max_length_lstm:
            sequence += [self.vocab.get('<PAD>', 0)] * (self.max_length_lstm - len(sequence))
        return sequence[:self.max_length_lstm]

    def _run_lstm(self, text_clean):
        """LSTM forward pass -> {'emotion', 'confidence', 'probabilities'}."""
        seq = torch.LongTensor([self.text_to_sequence(text_clean)]).to(self.device)
        with torch.no_grad():
            logits, conf = self.lstm_model(seq, return_confidence=True)
            probs = torch.softmax(logits, dim=1)
        emotion = self.label_encoder.inverse_transform([probs.argmax().item()])[0]
        return {
            'emotion': emotion,
            'confidence': conf.item(),
            'probabilities': probs[0].cpu().numpy().tolist(),
        }

    def _run_bert(self, text_clean):
        """BERT forward pass -> {'emotion', 'confidence', 'probabilities'}."""
        enc = self.tokenizer(
            text_clean,
            truncation=True,
            padding=True,
            max_length=self.max_length_bert,
            return_tensors='pt'
        ).to(self.device)
        with torch.no_grad():
            logits, conf = self.bert_model(
                enc['input_ids'],
                enc['attention_mask'],
                return_confidence=True
            )
            probs = torch.softmax(logits, dim=1)
        emotion = self.label_encoder.inverse_transform([probs.argmax().item()])[0]
        return {
            'emotion': emotion,
            'confidence': conf.item(),
            'probabilities': probs[0].cpu().numpy().tolist(),
        }

    def predict(self, text):
        """Classify `text`, preferring the LSTM and escalating to BERT when
        the LSTM's adjusted confidence is below the threshold.

        Returns a dict with the predicted emotion, confidence, which model
        answered, the triggered linguistic rules, and per-class
        probabilities from the model that actually answered.
        """
        self.stats['total'] += 1
        text_clean = clean_russian_text(text)
        raw = self._run_lstm(text_clean)
        onto = self.ontology_model.get_ontology_analysis(text_clean, raw)
        adjusted = onto['adjusted_prediction']
        if adjusted['confidence'] >= self.threshold:
            self.stats['lstm'] += 1
            used_model = "LSTM с онтологией"
        else:
            self.stats['bert'] += 1
            raw = self._run_bert(text_clean)
            onto = self.ontology_model.get_ontology_analysis(text_clean, raw)
            adjusted = onto['adjusted_prediction']
            used_model = "BERT с онтологией"
        return {
            'text': text,
            'predicted_emotion': adjusted['emotion'],
            'confidence': float(adjusted['confidence']),
            'used_model': used_model,
            'rules_applied': onto['rule_analysis']['rules_applied'],
            # BUGFIX: the BERT branch previously still reported the LSTM's
            # class probabilities; `raw` now always belongs to the model
            # that produced the final answer.
            'class_probabilities': {
                emo: float(prob)
                for emo, prob in zip(self.label_encoder.classes_,
                                     adjusted.get('probabilities', raw['probabilities']))
            },
        }
# ============================================================
# ЗАГРУЗКА МОДЕЛИ
# ============================================================
def load_model():
    """Load every cascade component from the local `model/` directory.

    Expects: model_info.json, vocab.json, label_encoder.pkl,
    ontology_model.pkl, lstm_model.pth, bert_model.pth and the saved
    tokenizer files.

    Returns:
        (CascadeEmotionClassifier, model_info dict)
    """
    print("Загрузка модели...")
    model_dir = 'model'
    # Model hyperparameters and metadata.
    with open(f'{model_dir}/model_info.json', 'r', encoding='utf-8') as f:
        model_info = json.load(f)
    # Token -> id mapping for the LSTM branch.
    with open(f'{model_dir}/vocab.json', 'r', encoding='utf-8') as f:
        vocab = json.load(f)
    # NOTE(review): pickle.load is only acceptable because these files ship
    # with the app; never load pickles from untrusted sources.
    with open(f'{model_dir}/label_encoder.pkl', 'rb') as f:
        label_encoder = pickle.load(f)
    with open(f'{model_dir}/ontology_model.pkl', 'rb') as f:
        ontology_model = pickle.load(f)
    # Rebuild the LSTM skeleton and restore its trained weights.
    lstm_model = EmotionLSTM(
        vocab_size=len(vocab),
        embed_dim=model_info.get('embed_dim', 100),
        hidden_dim=256,
        num_classes=model_info['num_classes'],
        dropout=0.3,
        num_layers=2
    )
    # weights_only=True restricts torch.load to tensor data (safer).
    checkpoint = torch.load(f'{model_dir}/lstm_model.pth', map_location=device, weights_only=True)
    lstm_model.load_state_dict(checkpoint['model_state_dict'])
    # Rebuild BERT and restore its fine-tuned weights.
    bert_model = EmotionBERT(
        bert_model_name=model_info['bert_model_name'],
        num_classes=model_info['num_classes'],
        dropout=0.3
    )
    bert_model.load_state_dict(torch.load(f'{model_dir}/bert_model.pth', map_location=device, weights_only=True))
    # Tokenizer files were saved alongside the model.
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    # Wire everything into the two-stage cascade.
    cascade = CascadeEmotionClassifier(
        lstm_model=lstm_model,
        bert_model=bert_model,
        vocab=vocab,
        tokenizer=tokenizer,
        label_encoder=label_encoder,
        ontology_model=ontology_model,
        threshold=model_info.get('threshold', 0.95),
        device=device,
        max_length_lstm=model_info.get('max_length_lstm', 100),
        max_length_bert=model_info.get('max_length_bert', 128)
    )
    print("✅ Модель успешно загружена!")
    return cascade, model_info
# ============================================================
# FASTAPI ПРИЛОЖЕНИЕ
# ============================================================
# ASGI application; all routes below are registered on this instance.
app = FastAPI(title="Emotion Analysis with BERT and Ontology")
# Jinja2 templates (index.html) live in ./templates.
templates = Jinja2Templates(directory="templates")
# Globals populated once on startup; remain None until the model has loaded.
classifier = None
model_info = None
# NOTE(review): @app.on_event is deprecated in recent FastAPI in favor of
# lifespan handlers; it still works here — consider migrating.
@app.on_event("startup")
async def startup_event():
    """Load the cascade model once, when the server process starts."""
    global classifier, model_info
    classifier, model_info = load_model()
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Render the main page with the known emotion classes (if loaded)."""
    emotion_classes = (
        classifier.label_encoder.classes_.tolist() if classifier else []
    )
    context = {"request": request, "classes": emotion_classes}
    return templates.TemplateResponse("index.html", context)
@app.post("/predict")
async def predict(text: str = Form(...)):
    """Run the cascade on the submitted text and return UI-ready fragments."""
    if not classifier:
        raise HTTPException(status_code=503, detail="Модель еще не загружена")
    if not text or len(text.strip()) < 3:
        return JSONResponse({
            "error": "Текст слишком короткий. Введите хотя бы 3 символа."
        }, status_code=400)

    def _rule_tag(rule):
        # One styled <span> per triggered linguistic rule.
        if ':' in rule:
            cat, val = rule.split(':', 1)
            return f"<span class='rule-tag rule-{cat.strip()}'>{cat}: {val.strip()}</span>"
        return f"<span class='rule-tag'>{rule}</span>"

    def _prob_row(emotion, prob):
        # Probability bar markup for a single emotion class.
        percentage = prob * 100
        return f"""
            <div class="prob-item">
                <span class="prob-label">{emotion}</span>
                <div class="prob-bar-container">
                    <div class="prob-bar" style="width: {percentage}%"></div>
                </div>
                <span class="prob-value">{percentage:.1f}%</span>
            </div>
            """

    try:
        result = classifier.predict(text)
        rules_html = "".join(_rule_tag(r) for r in result['rules_applied'][:10])
        probs_html = "".join(
            _prob_row(emotion, prob)
            for emotion, prob in result['class_probabilities'].items()
        )
        shown = result['text']
        if len(shown) > 200:
            shown = shown[:200] + "..."
        return JSONResponse({
            "success": True,
            "text": shown,
            "emotion": result['predicted_emotion'],
            "confidence": f"{result['confidence']*100:.1f}%",
            "used_model": result['used_model'],
            "rules": rules_html if rules_html else "Нет примененных правил",
            "probabilities": probs_html
        })
    except Exception as e:
        return JSONResponse({
            "error": f"Ошибка при анализе: {str(e)}"
        }, status_code=500)
@app.get("/stats")
async def get_stats():
    """Report how often each cascade stage produced the final answer."""
    if not classifier:
        raise HTTPException(status_code=503, detail="Модель еще не загружена")
    stats = classifier.stats
    total = max(stats['total'], 1)  # avoid division by zero before first request
    return JSONResponse({
        "total_predictions": stats['total'],
        "lstm_used": stats['lstm'],
        "bert_used": stats['bert'],
        "lstm_percentage": f"{(stats['lstm'] / total) * 100:.1f}%",
        "bert_percentage": f"{(stats['bert'] / total) * 100:.1f}%"
    })
@app.get("/health")
async def health_check():
    """Liveness probe: reports whether the model has finished loading."""
    model_ready = classifier is not None
    return {"status": "healthy", "model_loaded": model_ready}
# Entry point: run the ASGI app directly (HF Spaces' default port is 7860).
if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=port)