# Emotion analysis web service: cascade LSTM -> BERT emotion classifier with ontology rules (FastAPI).
| import os | |
| import sys | |
| import json | |
| import pickle | |
| import torch | |
| import torch.nn as nn | |
| import numpy as np | |
| import re | |
| from typing import Dict, List, Any, Optional | |
| from collections import defaultdict | |
| import networkx as nx | |
| import pymorphy3 | |
| from fastapi import FastAPI, Request, Form, HTTPException | |
| from fastapi.responses import HTMLResponse, JSONResponse | |
| from fastapi.templating import Jinja2Templates | |
| import uvicorn | |
| from transformers import BertTokenizer, BertModel | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
# Select the compute device once at import time (GPU when available).
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Используется устройство: {device}")
| # ============================================================ | |
| # ВСПОМОГАТЕЛЬНЫЕ ФУНКЦИИ | |
| # ============================================================ | |
def clean_russian_text(text):
    """Normalize raw Russian text for the emotion models.

    Lowercases, strips URLs and e-mail addresses, maps common emoticons to
    word tokens, removes characters outside the allowed set and collapses
    whitespace.

    Args:
        text: Arbitrary input; non-string values yield "".

    Returns:
        The cleaned, lowercased string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    # BUGFIX: keys must be lowercase because the text was lowercased above —
    # the original ':D' key could never match after lower().
    smileys = {
        ':)': ' смайлик_радость ', ')': ' смайлик_радость ',
        ':(': ' смайлик_грусть ', '(': ' смайлик_грусть ',
        ':d': ' смайлик_смех ', ';)': ' смайлик_подмигивание ',
    }
    for smiley, replacement in smileys.items():
        text = text.replace(smiley, replacement)
    # Keep word chars, whitespace, Cyrillic and basic punctuation only.
    text = re.sub(r'[^\w\sа-яё.,!?;:)(-]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
| # ============================================================ | |
| # КЛАССЫ МОДЕЛЕЙ (упрощенные для инференса) | |
| # ============================================================ | |
class EmotionLSTM(nn.Module):
    """Bidirectional LSTM emotion classifier over token-id sequences."""

    def __init__(self, vocab_size, embed_dim=100, hidden_dim=256,
                 num_classes=5, dropout=0.3, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # Inter-layer dropout is only meaningful for stacked (>1) LSTMs.
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(embed_dim, hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            bidirectional=True,
                            dropout=inter_layer_dropout)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
        )

    def forward(self, x, return_confidence=False):
        """Classify a batch of token-id sequences.

        Args:
            x: LongTensor of shape (batch, seq_len).
            return_confidence: when True, also return the max softmax
                probability per sample.

        Returns:
            logits of shape (batch, num_classes), optionally with a
            (batch,) confidence tensor.
        """
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        # Concatenate the final forward and backward hidden states.
        pooled = torch.cat((hidden[-2], hidden[-1]), dim=1)
        logits = self.classifier(self.dropout(pooled))
        if not return_confidence:
            return logits
        confidence = torch.softmax(logits, dim=1).max(dim=1).values
        return logits, confidence
class EmotionBERT(nn.Module):
    """BERT encoder with an MLP classification head for emotions."""

    def __init__(self, bert_model_name, num_classes, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        bert_dim = self.bert.config.hidden_size
        head_layers = [
            nn.Dropout(dropout),
            nn.Linear(bert_dim, 256), nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128), nn.ReLU(),
            nn.Linear(128, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, return_confidence=False):
        """Classify tokenized input; optionally return max-prob confidence."""
        encoded = self.bert(input_ids, attention_mask, return_dict=True)
        # Use the [CLS] position as the sequence summary vector.
        cls_vec = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_vec)
        if not return_confidence:
            return logits
        confidence = torch.softmax(logits, dim=1).max(dim=1).values
        return logits, confidence
class OntologyEmotionModel:
    """Ontology/rule layer applied on top of neural emotion predictions.

    Keeps a small emotion ontology graph plus lemma-level linguistic rules
    (intensifiers, diminishers, negations) detected via pymorphy3.
    """

    def __init__(self, emotions: List[str]):
        self.emotions = emotions
        self.morph = pymorphy3.MorphAnalyzer()
        self.ontology_graph = nx.DiGraph()
        self.sentiment_lexicon = {}
        self.linguistic_rules = {
            'усилители': {'words': ['очень', 'сильно', 'крайне'], 'weight': 0.3},
            'ослабители': {'words': ['слегка', 'немного', 'чуть-чуть'], 'weight': -0.2},
            'отрицания': {'words': ['не', 'ни', 'нет'], 'weight': -0.5},
        }
        self.init_ontology_level1()

    def init_ontology_level1(self):
        """Create one graph node per known emotion with valence/arousal tags."""
        self.emotion_definitions = {
            'радость': {'valence': 'positive', 'arousal': 'high'},
            'грусть': {'valence': 'negative', 'arousal': 'low'},
            'злость': {'valence': 'negative', 'arousal': 'high'},
            'страх': {'valence': 'negative', 'arousal': 'high'},
            'сарказм': {'valence': 'negative', 'arousal': 'high'},
        }
        known = (e for e in self.emotions if e in self.emotion_definitions)
        for emotion in known:
            self.ontology_graph.add_node(emotion, **self.emotion_definitions[emotion])

    def apply_linguistic_rules(self, text: str) -> Dict:
        """Lemmatize the text and list every rule word found in it."""
        lemmas = [self.morph.parse(token)[0].normal_form
                  for token in text.lower().split()]
        lemma_set = set(lemmas)  # O(1) membership tests below
        rules_applied = [
            f"{category}: {word}"
            for category, rule in self.linguistic_rules.items()
            for word in rule['words']
            if word in lemma_set
        ]
        return {'rules_applied': rules_applied, 'lemmas': lemmas}

    def adjust_prediction_with_rules(self, prediction: Dict, rule_analysis: Dict) -> Dict:
        """Attach the matched rules to the prediction (scores unchanged)."""
        return {
            'emotion': prediction['emotion'],
            'confidence': prediction['confidence'],
            'rules_applied': rule_analysis['rules_applied'],
        }

    def get_ontology_analysis(self, text: str, model_prediction: Dict) -> Dict:
        """Full rule pipeline: detect rules, then adjust the prediction."""
        rule_analysis = self.apply_linguistic_rules(text)
        return {
            'rule_analysis': rule_analysis,
            'adjusted_prediction': self.adjust_prediction_with_rules(
                model_prediction, rule_analysis),
        }

    def get_statistics(self) -> Dict:
        """Basic size counters for the ontology graph and rule set."""
        return {
            'ontology_nodes': len(self.ontology_graph.nodes),
            'linguistic_rules': len(self.linguistic_rules),
        }
class CascadeEmotionClassifier:
    """Two-stage emotion classifier with an ontology post-processing layer.

    The fast LSTM produces the first prediction; when its (ontology-adjusted)
    confidence falls below ``threshold``, the heavier BERT model is consulted
    instead. Usage counters are kept in ``self.stats``.
    """

    def __init__(self, lstm_model, bert_model, vocab, tokenizer,
                 label_encoder, ontology_model, threshold=0.95, device='cpu',
                 max_length_lstm=100, max_length_bert=128):
        self.lstm_model = lstm_model
        self.bert_model = bert_model
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.label_encoder = label_encoder
        self.ontology_model = ontology_model
        self.threshold = threshold
        self.device = device
        self.max_length_lstm = max_length_lstm
        self.max_length_bert = max_length_bert
        # Inference only: freeze dropout and move both models to the device.
        self.lstm_model.eval()
        self.bert_model.eval()
        self.lstm_model.to(device)
        self.bert_model.to(device)
        # Counters: total requests and which stage produced the answer.
        self.stats = {'total': 0, 'lstm': 0, 'bert': 0}

    def text_to_sequence(self, text):
        """Convert text to a fixed-length list of vocab ids (pad/truncate)."""
        words = str(text).split()[:self.max_length_lstm]
        sequence = [self.vocab.get(word, self.vocab.get('<UNK>', 1)) for word in words]
        if len(sequence) < self.max_length_lstm:
            sequence += [self.vocab.get('<PAD>', 0)] * (self.max_length_lstm - len(sequence))
        return sequence[:self.max_length_lstm]

    def _run_lstm(self, text_clean):
        """Run the LSTM stage; returns emotion/confidence/probabilities."""
        seq = torch.LongTensor([self.text_to_sequence(text_clean)]).to(self.device)
        with torch.no_grad():
            logits, conf = self.lstm_model(seq, return_confidence=True)
            probs = torch.softmax(logits, dim=1)
        pred = probs.argmax().item()
        return {
            'emotion': self.label_encoder.inverse_transform([pred])[0],
            'confidence': conf.item(),
            'probabilities': probs[0].cpu().numpy().tolist()
        }

    def _run_bert(self, text_clean):
        """Run the BERT stage; returns emotion/confidence/probabilities."""
        enc = self.tokenizer(
            text_clean,
            truncation=True,
            padding=True,
            max_length=self.max_length_bert,
            return_tensors='pt'
        ).to(self.device)
        with torch.no_grad():
            logits, conf = self.bert_model(
                enc['input_ids'],
                enc['attention_mask'],
                return_confidence=True
            )
            probs = torch.softmax(logits, dim=1)
        pred = probs.argmax().item()
        return {
            'emotion': self.label_encoder.inverse_transform([pred])[0],
            'confidence': conf.item(),
            'probabilities': probs[0].cpu().numpy().tolist()
        }

    def predict(self, text):
        """Classify ``text``, falling back from LSTM to BERT below threshold.

        Returns:
            dict with the original text, predicted emotion, confidence,
            which model answered, the applied linguistic rules and
            per-class probabilities.
        """
        self.stats['total'] += 1
        text_clean = clean_russian_text(text)

        pred_dict = self._run_lstm(text_clean)
        onto = self.ontology_model.get_ontology_analysis(text_clean, pred_dict)
        adjusted = onto['adjusted_prediction']

        if adjusted['confidence'] >= self.threshold:
            self.stats['lstm'] += 1
            used_model = "LSTM с онтологией"
        else:
            self.stats['bert'] += 1
            pred_dict = self._run_bert(text_clean)
            onto = self.ontology_model.get_ontology_analysis(text_clean, pred_dict)
            adjusted = onto['adjusted_prediction']
            used_model = "BERT с онтологией"

        # BUGFIX: probabilities come from the model that actually produced
        # the final answer. Previously the BERT branch silently fell back to
        # the LSTM's probabilities because the ontology-adjusted dict carries
        # no 'probabilities' key.
        probabilities = adjusted.get('probabilities', pred_dict['probabilities'])
        return {
            'text': text,
            'predicted_emotion': adjusted['emotion'],
            'confidence': float(adjusted['confidence']),
            'used_model': used_model,
            'rules_applied': onto['rule_analysis']['rules_applied'],
            'class_probabilities': {
                emo: float(prob)
                for emo, prob in zip(self.label_encoder.classes_, probabilities)
            }
        }
| # ============================================================ | |
| # ЗАГРУЗКА МОДЕЛИ | |
| # ============================================================ | |
def load_model():
    """Load all saved artifacts from ./model and build the cascade classifier.

    Reads model metadata, vocabulary, label encoder, ontology model, both
    neural checkpoints and the BERT tokenizer, then wires them into a
    CascadeEmotionClassifier.

    Returns:
        tuple: (CascadeEmotionClassifier, dict) — ready classifier and
        the model metadata dict.
    """
    print("Загрузка модели...")
    model_dir = 'model'

    def _load_json(filename):
        # Small helper: UTF-8 JSON artifact reader.
        with open(f'{model_dir}/{filename}', 'r', encoding='utf-8') as f:
            return json.load(f)

    def _load_pickle(filename):
        # NOTE(review): pickle is only safe for trusted, bundled artifacts.
        with open(f'{model_dir}/{filename}', 'rb') as f:
            return pickle.load(f)

    model_info = _load_json('model_info.json')
    vocab = _load_json('vocab.json')
    label_encoder = _load_pickle('label_encoder.pkl')
    ontology_model = _load_pickle('ontology_model.pkl')

    # Rebuild the LSTM with the saved hyperparameters and restore weights.
    lstm_model = EmotionLSTM(
        vocab_size=len(vocab),
        embed_dim=model_info.get('embed_dim', 100),
        hidden_dim=256,
        num_classes=model_info['num_classes'],
        dropout=0.3,
        num_layers=2,
    )
    lstm_checkpoint = torch.load(f'{model_dir}/lstm_model.pth',
                                 map_location=device, weights_only=True)
    lstm_model.load_state_dict(lstm_checkpoint['model_state_dict'])

    # Rebuild BERT; its checkpoint stores the state dict directly.
    bert_model = EmotionBERT(
        bert_model_name=model_info['bert_model_name'],
        num_classes=model_info['num_classes'],
        dropout=0.3,
    )
    bert_state = torch.load(f'{model_dir}/bert_model.pth',
                            map_location=device, weights_only=True)
    bert_model.load_state_dict(bert_state)

    tokenizer = BertTokenizer.from_pretrained(model_dir)

    cascade = CascadeEmotionClassifier(
        lstm_model=lstm_model,
        bert_model=bert_model,
        vocab=vocab,
        tokenizer=tokenizer,
        label_encoder=label_encoder,
        ontology_model=ontology_model,
        threshold=model_info.get('threshold', 0.95),
        device=device,
        max_length_lstm=model_info.get('max_length_lstm', 100),
        max_length_bert=model_info.get('max_length_bert', 128),
    )
    print("✅ Модель успешно загружена!")
    return cascade, model_info
| # ============================================================ | |
| # FASTAPI ПРИЛОЖЕНИЕ | |
| # ============================================================ | |
# FastAPI application object; the title shows in the auto-generated docs.
app = FastAPI(title="Emotion Analysis with BERT and Ontology")
# Jinja2 templates rendered from ./templates (index.html for the home page).
templates = Jinja2Templates(directory="templates")
# Module-level model singletons: stay None until startup_event() runs;
# the endpoints check them before serving.
classifier = None
model_info = None
async def startup_event():
    """Load the model once at application startup and publish it globally.

    NOTE(review): no @app.on_event("startup") decorator is visible in this
    file — unless this handler is registered with the app elsewhere, it
    never runs and `classifier` stays None. Confirm registration.
    """
    global classifier, model_info
    classifier, model_info = load_model()
async def home(request: Request):
    """Render the main page with the list of known emotion classes."""
    known_classes = []
    if classifier:
        known_classes = classifier.label_encoder.classes_.tolist()
    context = {"request": request, "classes": known_classes}
    return templates.TemplateResponse("index.html", context)
async def predict(text: str = Form(...)):
    """Classify the posted text and return display-ready JSON fragments.

    Responds 503 while the model is loading, 400 for too-short input,
    500 (with the error message) on any analysis failure.
    """
    if not classifier:
        raise HTTPException(status_code=503, detail="Модель еще не загружена")
    if not text or len(text.strip()) < 3:
        return JSONResponse({
            "error": "Текст слишком короткий. Введите хотя бы 3 символа."
        }, status_code=400)
    try:
        result = classifier.predict(text)

        def _rule_tag(rule):
            # Category-specific CSS class when the rule is "категория: слово".
            if ':' not in rule:
                return f"<span class='rule-tag'>{rule}</span>"
            cat, val = rule.split(':', 1)
            return f"<span class='rule-tag rule-{cat.strip()}'>{cat}: {val.strip()}</span>"

        # At most 10 rule tags are rendered.
        rules_display = [_rule_tag(r) for r in result['rules_applied'][:10]]

        def _prob_bar(emotion, prob):
            percentage = prob * 100
            return f"""
            <div class="prob-item">
                <span class="prob-label">{emotion}</span>
                <div class="prob-bar-container">
                    <div class="prob-bar" style="width: {percentage}%"></div>
                </div>
                <span class="prob-value">{percentage:.1f}%</span>
            </div>
            """

        probs_display = [_prob_bar(e, p) for e, p in result['class_probabilities'].items()]

        # Truncate very long inputs for display.
        shown_text = result['text']
        if len(shown_text) > 200:
            shown_text = shown_text[:200] + "..."

        return JSONResponse({
            "success": True,
            "text": shown_text,
            "emotion": result['predicted_emotion'],
            "confidence": f"{result['confidence']*100:.1f}%",
            "used_model": result['used_model'],
            "rules": "".join(rules_display) if rules_display else "Нет примененных правил",
            "probabilities": "".join(probs_display)
        })
    except Exception as e:
        return JSONResponse({
            "error": f"Ошибка при анализе: {str(e)}"
        }, status_code=500)
async def get_stats():
    """Report how often each cascade stage produced the final answer."""
    if not classifier:
        raise HTTPException(status_code=503, detail="Модель еще не загружена")
    stats = classifier.stats
    total = max(stats['total'], 1)  # guard against division by zero
    return JSONResponse({
        "total_predictions": stats['total'],
        "lstm_used": stats['lstm'],
        "bert_used": stats['bert'],
        "lstm_percentage": f"{(stats['lstm'] / total) * 100:.1f}%",
        "bert_percentage": f"{(stats['bert'] / total) * 100:.1f}%"
    })
async def health_check():
    """Liveness probe: reports whether the model has finished loading."""
    payload = {"status": "healthy"}
    payload["model_loaded"] = classifier is not None
    return payload
# Entry point: register handlers explicitly, then launch the server.
if __name__ == "__main__":
    # BUGFIX: the route/event decorators are missing from the handler
    # definitions above, so without explicit registration the app serves
    # no routes and never loads the model.
    # NOTE(review): paths reconstructed — confirm against templates/index.html.
    app.add_event_handler("startup", startup_event)
    app.get("/", response_class=HTMLResponse)(home)
    app.post("/predict")(predict)
    app.get("/stats")(get_stats)
    app.get("/health")(health_check)

    # Hugging Face Spaces provides PORT; default to 7860 locally.
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=port)