Spaces:

EagleOfEmpire
/

EmotionClassifier

Runtime error

App Files Files Community

EagleOfEmpire committed on Dec 9, 2025

Commit

20f7681

verified ·

1 Parent(s): 4f6201a

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -2

app.py CHANGED Viewed

@@ -27,8 +27,42 @@ EMOTIONS = ["neutral", "joy", "sadness", "anger", "fear", "surprise"]
 # ВАША ПРЕДОБРАБОТКА ТЕКСТА
 # ---------------------------
 def preprocess_text(text):
-    # !!! ВСТАВЬТЕ ВАШ КОД СЮДА !!!
-    # полностью всю функцию preprocess_text
     return text

 # ВАША ПРЕДОБРАБОТКА ТЕКСТА
 # ---------------------------
 def preprocess_text(text):
     """Normalize raw text into a space-separated string of lemmatized tokens.

     Pipeline, in order: emoji clean-up, lower-casing, removal of URLs,
     @mentions / #hashtags, ASCII punctuation and digits, tokenization,
     stop-word / short-token filtering, and lemmatization.

     Tokenization, stop-word loading and lemmatization are best-effort:
     if the underlying library call fails, the step falls back to a
     simpler behavior instead of raising.

     Args:
         text: The input to clean. It is coerced with ``str()`` before the
             textual steps; the emoji helpers receive it as passed.

     Returns:
         The surviving tokens joined by single spaces (may be empty).
     """
     # Emoji clean-up is delegated to helpers defined elsewhere in the file:
     # collapse repeated emojis first, then drop all of them if the text is
     # still flagged as emoji spam.
     text = remove_duplicate_emojis(text)
     if is_emoji_spam(text):
         text = remove_all_emojis(text)

     text = str(text).lower()
     # Drop anything starting with http / www / https up to the next whitespace.
     text = re.sub(r'http\S+|www\S+|https\S+', '', text)
     # Drop @mentions and #hashtags (\w+ = letters, digits, underscore).
     text = re.sub(r'@\w+|#\w+', '', text)
     # Strip ASCII punctuation in a single pass.
     text = text.translate(str.maketrans('', '', string.punctuation))
     # Convert emojis to text labels only AFTER punctuation removal, so the
     # delimiters of the generated labels are not stripped.
     text = emoji.demojize(text)
     text = re.sub(r'\d+', '', text)

     # Tokenize; fall back to whitespace splitting if the tokenizer fails
     # (for example when its language data is unavailable).
     try:
         tokens = word_tokenize(text, language="russian")
     except Exception:
         tokens = text.split()

     # Load Russian stop words; continue without them if loading fails.
     try:
         stop_words = set(stopwords.words('russian'))
     except Exception:
         stop_words = set()

     # Keep purely alphabetic tokens and colon-wrapped emoji labels, and
     # discard stop words and tokens of two characters or fewer.
     tokens = [
         word
         for word in tokens
         if (word.isalpha() or (word.startswith(':') and word.endswith(':')))
         and word not in stop_words
         and len(word) > 2
     ]

     # Lemmatize with pymorphy2; on any failure keep the unlemmatized tokens.
     # NOTE(review): the analyzer is rebuilt on every call; consider creating
     # it once at module level if this function is called frequently.
     try:
         lemmatizer = pymorphy2.MorphAnalyzer()
         tokens = [lemmatizer.parse(word)[0].normal_form for word in tokens]
     except Exception:
         pass

     return ' '.join(tokens)