import gradio as gr
import tensorflow as tf
import numpy as np
import pickle
import torch
from transformers import AutoTokenizer, AutoModel

# additional imports needed for text preprocessing
import re
import string
import emoji
import pymorphy2
import joblib
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# ---------------------------
# LOAD BERT
# ---------------------------
MODEL_NAME = 'sberbank-ai/ruBert-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME)
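
# NOTE: hypothetical helper, not part of the original file. It shows one
# common way to turn ruBert token embeddings into a fixed-size vector:
# mean pooling over the last hidden state. Adjust to match the rest of
# the pipeline if the real app pools differently.
def embed_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True,
                       padding=True, max_length=512)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = bert_model(**inputs)
    # last_hidden_state: (1, seq_len, hidden_size) -> (hidden_size,)
    return outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()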

# ---------------------------
# LOAD SCALER AND KERAS MODEL
# ---------------------------
# joblib.load accepts a path directly; no need to open the file manually
scaler = joblib.load("scaler.joblib")

keras_model = tf.keras.models.load_model("tf.keras", compile=False)


EMOTIONS = ["страх", "гнев", "грусть", "радость"]  # fear, anger, sadness, joy

# ---------------------------------------
# EMOJI HANDLING FUNCTIONS (naive defaults; replace with your own)
# ---------------------------------------
def remove_duplicate_emojis(text):
    # naive default: collapse runs of the same emoji into one occurrence
    out = []
    for ch in text:
        if not (emoji.is_emoji(ch) and out and out[-1] == ch):
            out.append(ch)
    return ''.join(out)

def is_emoji_spam(text, threshold=0.5):
    # naive default: spam if more than half of the characters are emojis
    return bool(text) and sum(emoji.is_emoji(c) for c in text) / len(text) > threshold

def remove_all_emojis(text):
    return emoji.replace_emoji(text, replace='')


# ---------------------------
# TEXT PREPROCESSING
# ---------------------------
# Create the analyzer once at module level: pymorphy2.MorphAnalyzer() is
# expensive to instantiate, so avoid building it on every call.
try:
    morph = pymorphy2.MorphAnalyzer()
except Exception:
    morph = None  # no lemmatization if pymorphy2 dictionaries are unavailable

def preprocess_text(text):
    """Clean a raw comment: strip URLs, mentions, punctuation and digits,
    demojize, drop stopwords, and lemmatize the remaining tokens."""

    text = remove_duplicate_emojis(text)
    if is_emoji_spam(text):
        text = remove_all_emojis(text)

    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    text = emoji.demojize(text)

    text = re.sub(r'\d+', '', text)

    try:
        tokens = word_tokenize(text, language="russian")
    except LookupError:
        # NLTK's punkt data is missing; fall back to whitespace splitting
        tokens = text.split()

    try:
        stop_words = set(stopwords.words('russian'))
    except LookupError:
        # NLTK's stopwords corpus is missing; skip stopword filtering
        stop_words = set()

    tokens = [
        word for word in tokens
        if (word.isalpha() or (word.startswith(':') and word.endswith(':')))
        and word not in stop_words
        and len(word) > 2
    ]

    if morph is not None:
        tokens = [morph.parse(word)[0].normal_form for word in tokens]

    return ' '.join(tokens)
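
# ---------------------------
# INFERENCE AND UI (sketch)
# ---------------------------
# Hypothetical glue code, not in the original section: one plausible way to
# wire the pieces above together. The feature shape and output mapping are
# assumptions; align them with the rest of the app before relying on this.
def predict(text):
    cleaned = preprocess_text(text)
    features = embed_text(cleaned).reshape(1, -1)  # embed_text sketched above
    features = scaler.transform(features)
    probs = keras_model.predict(features)[0]
    return {emotion: float(p) for emotion, p in zip(EMOTIONS, probs)}

demo = gr.Interface(fn=predict, inputs="text", outputs="label")

if __name__ == "__main__":
    demo.launch()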