init
Browse files- app.py +58 -0
- class_dict.json +1 -0
- kdnv_preprocess.py +109 -0
- models/index_cosine.faiss +0 -0
- models/index_dot.faiss +0 -0
- models/index_l2.faiss +0 -0
- requirements.txt +8 -0
app.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from kdnv_preprocess import data_preprocessing
|
| 3 |
+
from sentence_transformers import SentenceTransformer
|
| 4 |
+
import faiss
|
| 5 |
+
import numpy as np
|
| 6 |
+
import json
|
| 7 |
+
from collections import Counter
|
| 8 |
+
|
| 9 |
+
@st.cache_resource
def load_model() -> SentenceTransformer:
    """Load and cache the multilingual sentence-embedding model.

    Cached with st.cache_resource so the model is created only once
    per Streamlit process, not on every rerun.
    """
    model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    return SentenceTransformer(model_name)
|
| 12 |
+
|
| 13 |
+
@st.cache_resource
def load_index() -> dict:
    """Read and cache the three FAISS nearest-neighbour indices.

    Returns a dict keyed by metric name: 'L2', 'Dot' (inner product)
    and 'Cos' (cosine). Cached once per Streamlit process.
    """
    index_paths = {
        'L2': "models/index_l2.faiss",
        'Dot': "models/index_dot.faiss",
        'Cos': "models/index_cosine.faiss",
    }
    return {name: faiss.read_index(path) for name, path in index_paths.items()}
|
| 21 |
+
|
| 22 |
+
# Load heavyweight resources once; both loaders are cached across reruns.
model = load_model()
indices = load_index()

# Mapping from FAISS row id (stored as a string key) to a class label.
# encoding specified explicitly so the app does not depend on the OS
# locale default (would break on e.g. Windows cp1251).
with open('class_dict.json', 'r', encoding='utf-8') as file:
    class_dict = json.load(file)

st.header('Кальянный угадыватель')
st.caption('для Кобза')
st.divider()

with st.form(key='pred'):
    text = st.text_area(label='Введи сюда описание табака')
    button = st.form_submit_button('Узнать предсказание')

if button:
    # Clean/lemmatize the query, then embed it as a (1, dim) float32
    # matrix — the shape FAISS expects for search().
    text = data_preprocessing(text)
    prompt_embedding = model.encode(text).astype('float32')
    prompt_embedding = prompt_embedding[np.newaxis, :]

    # Nearest neighbour under each of the three similarity metrics.
    _, indices_result_l2 = indices['L2'].search(prompt_embedding, 1)
    _, indices_result_dot = indices['Dot'].search(prompt_embedding, 1)
    _, indices_result_cosine = indices['Cos'].search(prompt_embedding, 1)

    pred_l2 = class_dict[str(indices_result_l2[0][0])]
    pred_dot = class_dict[str(indices_result_dot[0][0])]
    pred_cosine = class_dict[str(indices_result_cosine[0][0])]

    # Majority vote over the three metrics; when all three disagree
    # (every label occurs exactly once), fall back to the L2 answer.
    predictions = [pred_l2, pred_dot, pred_cosine]
    prediction_counts = Counter(predictions)
    final_prediction = prediction_counts.most_common(1)[0][0]
    if len(prediction_counts) == len(predictions):
        final_prediction = pred_l2

    st.subheader(f'Я считаю, что это: {final_prediction}')
|
class_dict.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"0": "Cereal", "1": "Ginger", "2": "Melons", "3": "Raspberry", "4": "Tobacco", "5": "Currant", "6": "Tropic mix", "7": "Mixed berries", "8": "Blueberry", "9": "Tea", "10": "Passionfruit", "11": "Strawberry", "12": "Mixed fruits", "13": "Fruit/berry desserts", "14": "Aloe", "15": "Cucumber", "16": "Mixed citrus", "17": "Plum", "18": "Orange", "19": "Mango", "20": "Fruit/berry yogurt", "21": "Rice", "22": "Chocolate", "23": "Grapefruit", "24": "Sweets", "25": "Gum", "26": "Coctail", "27": "Cloudberry", "28": "Lemon", "29": "Calamansi", "30": "Milk/Cream", "31": "Cola", "32": "Gin", "33": "Violet", "34": "Pomegranate", "35": "Grass", "36": "Prickly Pear", "37": "Lavander", "38": "Whiskey", "39": "Honey", "40": "Wild strawberry", "41": "Feijoa", "42": "Cranberry", "43": "Blackberry", "44": "Pineapple", "45": "Papaya", "46": "Melon", "47": "Pear", "48": "Banana", "49": "Apple", "50": "Nectarine", "51": "Grape", "52": "Cream", "53": "Waffles", "54": "Fruit/berry coctail", "55": "Kiwi", "56": "Coffee", "57": "Energy drink", "58": "Lemongrass", "59": "Mint", "60": "Cinnamon", "61": "Cookies", "62": "Estragon", "63": "Hazelnut", "64": "Lime", "65": "Basil", "66": "Elderberry", "67": "Bergamot", "68": "Coconut", "69": "Cherry", "70": "Mixed florals", "71": "Ice Cream", "72": "Cactus", "73": "Peach", "74": "Guava", "75": "Fir", "76": "Pistachio", "77": "Gooseberry", "78": "Salbei", "79": "Lychee", "80": "Cooling", "81": "Wildberry", "82": "Jackfruit", "83": "Watermelon", "84": "Chocomint", "85": "Almond", "86": "Cake", "87": "Liqueur", "88": "Mandarin", "89": "Baikal", "90": "Soursop", "91": "Root beer", "92": "Vanilla", "93": "Corn", "94": "Curry", "95": "Orange flower", "96": "Wine", "97": "Beer", "98": "Piones", "99": "Yudzu", "100": "Saffron", "101": "Wood", "102": "Anise", "103": "Rum", "104": "Maple", "105": "Marula", "106": "Quince", "107": "Sea Buckthorn"}
|
kdnv_preprocess.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import inspect

# Compatibility shim: pymorphy2 still calls inspect.getargspec(), which
# was removed in Python 3.11. Recreate it on top of getfullargspec()
# when it is absent, returning the same 4-tuple layout the old API had.
if not hasattr(inspect, 'getargspec'):
    def getargspec(func):
        full = inspect.getfullargspec(func)
        return full.args, full.varargs, full.varkw, full.defaults

    inspect.getargspec = getargspec
|
| 9 |
+
|
| 10 |
+
import re
import string
import subprocess
import sys

import numpy as np
import torch
from torch import Tensor
import pymorphy2
import nltk
from nltk.corpus import stopwords
import spacy

# Stopword corpus is needed at import time; download is a no-op if cached.
nltk.download('stopwords')

# Load the small Russian spaCy pipeline (parser/NER disabled — only
# tokenization is used here). Install the model on first run if missing.
try:
    nlp = spacy.load("ru_core_news_sm", disable=["parser", "ner"])
except OSError:
    # Use sys.executable so the model is installed into the *current*
    # interpreter's environment, not whatever "python" resolves to on
    # PATH; check=True surfaces a failed download immediately.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "ru_core_news_sm"],
        check=True,
    )
    nlp = spacy.load("ru_core_news_sm", disable=["parser", "ner"])

# Russian stop-words used by data_preprocessing().
stop_words = set(stopwords.words('russian'))

# pymorphy2 morphological analyzer (lemmatizer) shared by data_preprocessing().
morph = pymorphy2.MorphAnalyzer()
|
| 38 |
+
|
| 39 |
+
def data_preprocessing(text: str) -> str:
    """Normalize a Russian product description for embedding.

    Steps: lowercase, strip HTML tags, drop punctuation and digits,
    then remove stop-words and lemmatize every remaining token.

    Args:
        text (str): raw input string.

    Returns:
        str: space-joined string of lemmatized, stop-word-free tokens.
    """
    text = text.lower()

    # Normalize newlines and non-breaking spaces *before* stripping HTML:
    # the regex uses '.' without DOTALL, so a tag broken across a line
    # break would otherwise survive.
    text = text.replace('\n', ' ').replace('\xa0', ' ')

    # Remove HTML tags.
    text = re.sub(r'<.*?>', '', text)

    # Drop punctuation and digits in a single pass.
    text = ''.join(
        c for c in text if c not in string.punctuation and not c.isdigit()
    )

    # Tokenize with spaCy, drop stop-words, lemmatize with pymorphy2.
    doc = nlp(text)
    return ' '.join(
        morph.parse(token.text)[0].normal_form
        for token in doc
        if token.text not in stop_words and not token.is_digit
    )
|
| 57 |
+
|
| 58 |
+
def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
    """Keep only the (word, count) pairs whose count is strictly above *n*."""
    return [pair for pair in sorted_words if pair[1] > n]
|
| 60 |
+
|
| 61 |
+
def padding(review_int: list, seq_len: int) -> np.array:
    """Left-pad (or trim) each token sequence to exactly *seq_len*.

    Args:
        review_int (list): list of token-id lists.
        seq_len (int): target length; longer sequences keep their first
            seq_len tokens, shorter ones are left-padded with zeros.

    Returns:
        np.array: int array of shape (len(review_int), seq_len).
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for row, tokens in enumerate(review_int):
        if len(tokens) > seq_len:
            # Too long: keep the leading seq_len tokens.
            padded = tokens[:seq_len]
        else:
            # Too short (or exact): prepend zeros up to seq_len.
            padded = [0] * (seq_len - len(tokens)) + list(tokens)
        features[row, :] = np.array(padded)
    return features
|
| 81 |
+
|
| 82 |
+
def preprocess_single_string(
        input_string: str,
        seq_len: int,
        vocab_to_int: dict,
        verbose: bool = False
) -> Tensor:
    """Run the full preprocessing pipeline on one raw string.

    Cleans and lemmatizes the text, maps each word to its integer id
    (silently skipping out-of-vocabulary words), then left-pads the
    sequence to *seq_len*.

    Args:
        input_string (str): raw text to preprocess.
        seq_len (int): target sequence length; longer inputs are trimmed,
            shorter ones are left-padded with zeros.
        vocab_to_int (dict): word corpus mapping {'word': int index}.
        verbose (bool): if True, print every out-of-vocabulary word.

    Returns:
        Tensor: 1-D tensor of length seq_len with the padded token ids.
    """
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        if word in vocab_to_int:
            result_list.append(vocab_to_int[word])
        elif verbose:
            # Out-of-vocabulary words are dropped; optionally reported.
            print(f"'{word}': not in dictionary!")
    result_padded = padding([result_list], seq_len)[0]
    return Tensor(result_padded)
|
models/index_cosine.faiss
ADDED
|
Binary file (332 kB). View file
|
|
|
models/index_dot.faiss
ADDED
|
Binary file (332 kB). View file
|
|
|
models/index_l2.faiss
ADDED
|
Binary file (332 kB). View file
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
faiss-cpu
|
| 2 |
+
nltk
|
| 3 |
+
numpy
|
| 4 |
+
pymorphy2
|
| 5 |
+
sentence_transformers
|
| 6 |
+
spacy
|
| 7 |
+
streamlit
|
| 8 |
+
torch
|