# sentimentanalyzer01's picture
# Update app.py
# 8b2ae78 verified
import os
import sys
import json
import pickle
import torch
import torch.nn as nn
import numpy as np
import re
from typing import Dict, List, Any, Optional
from collections import defaultdict
import networkx as nx
import pymorphy3
from fastapi import FastAPI, Request, Form, HTTPException
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.templating import Jinja2Templates
import uvicorn
from transformers import BertTokenizer, BertModel
import warnings
# Silence library deprecation/user warnings in the server logs.
warnings.filterwarnings('ignore')
# Pick the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Используется устройство: {device}")
# ============================================================
# ВСПОМОГАТЕЛЬНЫЕ ФУНКЦИИ
# ============================================================
def clean_russian_text(text):
    """Normalize Russian social-media text for emotion classification.

    Lowercases, strips URLs and e-mail addresses, rewrites common emoticons
    into sentiment placeholder tokens, removes unexpected symbols and
    collapses whitespace.

    Args:
        text: raw input; any non-string yields "".

    Returns:
        The cleaned, single-spaced lowercase string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # drop URLs
    text = re.sub(r'\S+@\S+', '', text)                  # drop e-mail addresses
    # BUGFIX: multi-character emoticons must be rewritten BEFORE the bare
    # brackets, otherwise the ')' inside ';)' is consumed first and the wink
    # never matches.  Keys are lowercase because the text was already
    # lowercased above (':D' could previously never match).
    smileys = {
        ':)': ' смайлик_радость ',
        ':(': ' смайлик_грусть ',
        ':d': ' смайлик_смех ',
        ';)': ' смайлик_подмигивание ',
        ')': ' смайлик_радость ',
        '(': ' смайлик_грусть ',
    }
    for smiley, replacement in smileys.items():
        text = text.replace(smiley, replacement)
    # Keep word chars, whitespace, Cyrillic and basic punctuation only.
    text = re.sub(r'[^\w\sа-яё.,!?;:)(-]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# ============================================================
# КЛАССЫ МОДЕЛЕЙ (упрощенные для инференса)
# ============================================================
class EmotionLSTM(nn.Module):
    """Bidirectional LSTM text classifier over padded token-id sequences.

    Architecture: embedding -> stacked BiLSTM -> 3-layer MLP head that
    produces `num_classes` logits.
    """

    def __init__(self, vocab_size, embed_dim=100, hidden_dim=256,
                 num_classes=5, dropout=0.3, num_layers=2):
        super().__init__()
        # padding_idx=0 keeps the <PAD> embedding frozen at zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # Inter-layer dropout is only legal on a stacked LSTM.
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0,
        )
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
        )

    def forward(self, x, return_confidence=False):
        """Return logits for token-id batch `x` (batch, seq_len).

        When `return_confidence` is True, also return each sample's maximum
        softmax probability as a confidence score.
        """
        _, (h_n, _) = self.lstm(self.embedding(x))
        # Concatenate the final forward and backward hidden states of the
        # top LSTM layer.
        pooled = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
        logits = self.classifier(self.dropout(pooled))
        if not return_confidence:
            return logits
        confidence = torch.softmax(logits, dim=1).max(dim=1).values
        return logits, confidence
class EmotionBERT(nn.Module):
    """Pretrained BERT encoder with an MLP classification head on [CLS]."""

    def __init__(self, bert_model_name, num_classes, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        bert_dim = self.bert.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(bert_dim, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),
        )

    def forward(self, input_ids, attention_mask, return_confidence=False):
        """Return logits; optionally also the max-softmax confidence."""
        encoded = self.bert(input_ids, attention_mask, return_dict=True)
        # First token of the last hidden layer is the [CLS] representation.
        cls_vec = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_vec)
        if not return_confidence:
            return logits
        confidence = torch.softmax(logits, dim=1).max(dim=1).values
        return logits, confidence
class OntologyEmotionModel:
    """Rule/ontology layer applied on top of a neural emotion prediction.

    Holds a small emotion ontology (valence/arousal per emotion) and a
    lemma-based lexicon of intensifiers, diminishers and negations.
    Attribute names are part of the pickle contract — do not rename.
    """

    def __init__(self, emotions: List[str]):
        self.emotions = emotions
        self.morph = pymorphy3.MorphAnalyzer()
        self.ontology_graph = nx.DiGraph()
        self.sentiment_lexicon = {}
        # NOTE(review): rule weights are stored but not yet used to rescale
        # confidences (see adjust_prediction_with_rules).
        self.linguistic_rules = {
            'усилители': {'words': ['очень', 'сильно', 'крайне'], 'weight': 0.3},
            'ослабители': {'words': ['слегка', 'немного', 'чуть-чуть'], 'weight': -0.2},
            'отрицания': {'words': ['не', 'ни', 'нет'], 'weight': -0.5},
        }
        self.init_ontology_level1()

    def init_ontology_level1(self):
        """Populate the ontology graph with one valence/arousal node per emotion."""
        self.emotion_definitions = {
            'радость': {'valence': 'positive', 'arousal': 'high'},
            'грусть': {'valence': 'negative', 'arousal': 'low'},
            'злость': {'valence': 'negative', 'arousal': 'high'},
            'страх': {'valence': 'negative', 'arousal': 'high'},
            'сарказм': {'valence': 'negative', 'arousal': 'high'},
        }
        for emotion in self.emotions:
            definition = self.emotion_definitions.get(emotion)
            if definition is not None:
                self.ontology_graph.add_node(emotion, **definition)

    def apply_linguistic_rules(self, text: str) -> Dict:
        """Lemmatize `text` and report which lexicon rules it triggers."""
        lemmas = [self.morph.parse(token)[0].normal_form
                  for token in text.lower().split()]
        lemma_set = set(lemmas)
        triggered = [f"{category}: {word}"
                     for category, rule in self.linguistic_rules.items()
                     for word in rule['words']
                     if word in lemma_set]
        return {'rules_applied': triggered, 'lemmas': lemmas}

    def adjust_prediction_with_rules(self, prediction: Dict, rule_analysis: Dict) -> Dict:
        """Pass-through that attaches the triggered rules to the prediction."""
        return {
            'emotion': prediction['emotion'],
            'confidence': prediction['confidence'],
            'rules_applied': rule_analysis['rules_applied'],
        }

    def get_ontology_analysis(self, text: str, model_prediction: Dict) -> Dict:
        """Run the rule layer and return both the raw and adjusted results."""
        rules = self.apply_linguistic_rules(text)
        return {
            'rule_analysis': rules,
            'adjusted_prediction': self.adjust_prediction_with_rules(model_prediction, rules),
        }

    def get_statistics(self) -> Dict:
        """Size statistics for introspection endpoints."""
        return {
            'ontology_nodes': len(self.ontology_graph.nodes),
            'linguistic_rules': len(self.linguistic_rules),
        }
class CascadeEmotionClassifier:
    """Two-stage emotion classifier.

    The fast LSTM runs first; if its ontology-adjusted confidence falls
    below `threshold`, the slower BERT model is consulted instead.
    """

    def __init__(self, lstm_model, bert_model, vocab, tokenizer,
                 label_encoder, ontology_model, threshold=0.95, device='cpu',
                 max_length_lstm=100, max_length_bert=128):
        self.lstm_model = lstm_model
        self.bert_model = bert_model
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.label_encoder = label_encoder
        self.ontology_model = ontology_model
        self.threshold = threshold
        self.device = device
        self.max_length_lstm = max_length_lstm
        self.max_length_bert = max_length_bert
        # Inference-only: disable dropout and move models to the device.
        self.lstm_model.eval()
        self.bert_model.eval()
        self.lstm_model.to(device)
        self.bert_model.to(device)
        # Usage counters exposed via the /stats endpoint.
        self.stats = {'total': 0, 'lstm': 0, 'bert': 0}

    def text_to_sequence(self, text):
        """Map text to a fixed-length list of vocab ids (pad with <PAD>=0,
        unknown words to <UNK>=1, truncate at max_length_lstm)."""
        words = str(text).split()[:self.max_length_lstm]
        sequence = [self.vocab.get(word, self.vocab.get('<UNK>', 1)) for word in words]
        if len(sequence) < self.max_length_lstm:
            sequence += [self.vocab.get('<PAD>', 0)] * (self.max_length_lstm - len(sequence))
        return sequence[:self.max_length_lstm]

    def _run_lstm(self, text_clean):
        """LSTM forward pass -> {'emotion', 'confidence', 'probabilities'}."""
        seq = torch.LongTensor([self.text_to_sequence(text_clean)]).to(self.device)
        with torch.no_grad():
            logits, conf = self.lstm_model(seq, return_confidence=True)
            probs = torch.softmax(logits, dim=1)
        emotion = self.label_encoder.inverse_transform([probs.argmax().item()])[0]
        return {
            'emotion': emotion,
            'confidence': conf.item(),
            'probabilities': probs[0].cpu().numpy().tolist(),
        }

    def _run_bert(self, text_clean):
        """BERT forward pass -> {'emotion', 'confidence', 'probabilities'}."""
        enc = self.tokenizer(
            text_clean,
            truncation=True,
            padding=True,
            max_length=self.max_length_bert,
            return_tensors='pt'
        ).to(self.device)
        with torch.no_grad():
            logits, conf = self.bert_model(
                enc['input_ids'],
                enc['attention_mask'],
                return_confidence=True
            )
            probs = torch.softmax(logits, dim=1)
        emotion = self.label_encoder.inverse_transform([probs.argmax().item()])[0]
        return {
            'emotion': emotion,
            'confidence': conf.item(),
            'probabilities': probs[0].cpu().numpy().tolist(),
        }

    def predict(self, text):
        """Classify `text`, preferring the LSTM and escalating to BERT when
        the LSTM's adjusted confidence is below the threshold.

        Returns a dict with the predicted emotion, confidence, which model
        answered, the triggered linguistic rules, and per-class
        probabilities from the model that actually answered.
        """
        self.stats['total'] += 1
        text_clean = clean_russian_text(text)
        raw = self._run_lstm(text_clean)
        onto = self.ontology_model.get_ontology_analysis(text_clean, raw)
        adjusted = onto['adjusted_prediction']
        if adjusted['confidence'] >= self.threshold:
            self.stats['lstm'] += 1
            used_model = "LSTM с онтологией"
        else:
            self.stats['bert'] += 1
            raw = self._run_bert(text_clean)
            onto = self.ontology_model.get_ontology_analysis(text_clean, raw)
            adjusted = onto['adjusted_prediction']
            used_model = "BERT с онтологией"
        return {
            'text': text,
            'predicted_emotion': adjusted['emotion'],
            'confidence': float(adjusted['confidence']),
            'used_model': used_model,
            'rules_applied': onto['rule_analysis']['rules_applied'],
            # BUGFIX: the BERT branch previously still reported the LSTM's
            # class probabilities; `raw` now always belongs to the model
            # that produced the final answer.
            'class_probabilities': {
                emo: float(prob)
                for emo, prob in zip(self.label_encoder.classes_,
                                     adjusted.get('probabilities', raw['probabilities']))
            },
        }
# ============================================================
# ЗАГРУЗКА МОДЕЛИ
# ============================================================
def load_model():
    """Load every cascade component from the local `model/` directory.

    Expects: model_info.json, vocab.json, label_encoder.pkl,
    ontology_model.pkl, lstm_model.pth, bert_model.pth and the saved
    tokenizer files.

    Returns:
        (CascadeEmotionClassifier, model_info dict)
    """
    print("Загрузка модели...")
    model_dir = 'model'
    # Model hyperparameters and metadata.
    with open(f'{model_dir}/model_info.json', 'r', encoding='utf-8') as f:
        model_info = json.load(f)
    # Token -> id mapping for the LSTM branch.
    with open(f'{model_dir}/vocab.json', 'r', encoding='utf-8') as f:
        vocab = json.load(f)
    # NOTE(review): pickle.load is only acceptable because these files ship
    # with the app; never load pickles from untrusted sources.
    with open(f'{model_dir}/label_encoder.pkl', 'rb') as f:
        label_encoder = pickle.load(f)
    with open(f'{model_dir}/ontology_model.pkl', 'rb') as f:
        ontology_model = pickle.load(f)
    # Rebuild the LSTM skeleton and restore its trained weights.
    lstm_model = EmotionLSTM(
        vocab_size=len(vocab),
        embed_dim=model_info.get('embed_dim', 100),
        hidden_dim=256,
        num_classes=model_info['num_classes'],
        dropout=0.3,
        num_layers=2
    )
    # weights_only=True restricts torch.load to tensor data (safer).
    checkpoint = torch.load(f'{model_dir}/lstm_model.pth', map_location=device, weights_only=True)
    lstm_model.load_state_dict(checkpoint['model_state_dict'])
    # Rebuild BERT and restore its fine-tuned weights.
    bert_model = EmotionBERT(
        bert_model_name=model_info['bert_model_name'],
        num_classes=model_info['num_classes'],
        dropout=0.3
    )
    bert_model.load_state_dict(torch.load(f'{model_dir}/bert_model.pth', map_location=device, weights_only=True))
    # Tokenizer files were saved alongside the model.
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    # Wire everything into the two-stage cascade.
    cascade = CascadeEmotionClassifier(
        lstm_model=lstm_model,
        bert_model=bert_model,
        vocab=vocab,
        tokenizer=tokenizer,
        label_encoder=label_encoder,
        ontology_model=ontology_model,
        threshold=model_info.get('threshold', 0.95),
        device=device,
        max_length_lstm=model_info.get('max_length_lstm', 100),
        max_length_bert=model_info.get('max_length_bert', 128)
    )
    print("✅ Модель успешно загружена!")
    return cascade, model_info
# ============================================================
# FASTAPI ПРИЛОЖЕНИЕ
# ============================================================
# ASGI application; all routes below are registered on this instance.
app = FastAPI(title="Emotion Analysis with BERT and Ontology")
# Jinja2 templates (index.html) live in ./templates.
templates = Jinja2Templates(directory="templates")
# Globals populated once on startup; remain None until the model has loaded.
classifier = None
model_info = None
# NOTE(review): @app.on_event is deprecated in recent FastAPI in favor of
# lifespan handlers; it still works here — consider migrating.
@app.on_event("startup")
async def startup_event():
    """Load the cascade model once, when the server process starts."""
    global classifier, model_info
    classifier, model_info = load_model()
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Render the main page with the known emotion classes (if loaded)."""
    emotion_classes = (
        classifier.label_encoder.classes_.tolist() if classifier else []
    )
    context = {"request": request, "classes": emotion_classes}
    return templates.TemplateResponse("index.html", context)
@app.post("/predict")
async def predict(text: str = Form(...)):
    """Run the cascade on the submitted text and return UI-ready fragments."""
    if not classifier:
        raise HTTPException(status_code=503, detail="Модель еще не загружена")
    if not text or len(text.strip()) < 3:
        return JSONResponse({
            "error": "Текст слишком короткий. Введите хотя бы 3 символа."
        }, status_code=400)

    def _rule_tag(rule):
        # One styled <span> per triggered linguistic rule.
        if ':' in rule:
            cat, val = rule.split(':', 1)
            return f"<span class='rule-tag rule-{cat.strip()}'>{cat}: {val.strip()}</span>"
        return f"<span class='rule-tag'>{rule}</span>"

    def _prob_row(emotion, prob):
        # Probability bar markup for a single emotion class.
        percentage = prob * 100
        return f"""
            <div class="prob-item">
                <span class="prob-label">{emotion}</span>
                <div class="prob-bar-container">
                    <div class="prob-bar" style="width: {percentage}%"></div>
                </div>
                <span class="prob-value">{percentage:.1f}%</span>
            </div>
            """

    try:
        result = classifier.predict(text)
        rules_html = "".join(_rule_tag(r) for r in result['rules_applied'][:10])
        probs_html = "".join(
            _prob_row(emotion, prob)
            for emotion, prob in result['class_probabilities'].items()
        )
        shown = result['text']
        if len(shown) > 200:
            shown = shown[:200] + "..."
        return JSONResponse({
            "success": True,
            "text": shown,
            "emotion": result['predicted_emotion'],
            "confidence": f"{result['confidence']*100:.1f}%",
            "used_model": result['used_model'],
            "rules": rules_html if rules_html else "Нет примененных правил",
            "probabilities": probs_html
        })
    except Exception as e:
        return JSONResponse({
            "error": f"Ошибка при анализе: {str(e)}"
        }, status_code=500)
@app.get("/stats")
async def get_stats():
    """Report how often each cascade stage produced the final answer."""
    if not classifier:
        raise HTTPException(status_code=503, detail="Модель еще не загружена")
    stats = classifier.stats
    total = max(stats['total'], 1)  # avoid division by zero before first request
    return JSONResponse({
        "total_predictions": stats['total'],
        "lstm_used": stats['lstm'],
        "bert_used": stats['bert'],
        "lstm_percentage": f"{(stats['lstm'] / total) * 100:.1f}%",
        "bert_percentage": f"{(stats['bert'] / total) * 100:.1f}%"
    })
@app.get("/health")
async def health_check():
    """Liveness probe: reports whether the model has finished loading."""
    model_ready = classifier is not None
    return {"status": "healthy", "model_loaded": model_ready}
# Entry point: run the ASGI app directly (HF Spaces' default port is 7860).
if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=port)