annafilina MARI-posa committed on
Commit
7934772
·
0 Parent(s):

Duplicate from MARI-posa/FindMyBook

Browse files

Co-authored-by: Maria <MARI-posa@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ all+.csv filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: FindMyBook
3
+ emoji: 📈
4
+ colorFrom: purple
5
+ colorTo: pink
6
+ sdk: streamlit
7
+ sdk_version: 1.21.0
8
+ app_file: stri.py
9
+ pinned: false
10
+ duplicated_from: MARI-posa/FindMyBook
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
all+.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b79018ba5f2577b5108959e89e426e1869ba6e566495b8f800f14ffe60aad418
3
+ size 57131790
book_embeddings.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9ed0a393395706781e1cb5d80546a8673c47639b7db96a6f5e1a4dd6d5fbced
3
+ size 805276449
book_embeddings256.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96567b92c365d75bdacf525bccd2f901ac57098112e1428482a966ac2478bf9f
3
+ size 49487624
book_embeddings32.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40717110f053c22c3aa9e023c4e4e3773bda27390b1a2bd2f8a205496bc2fae1
3
+ size 49221494
book_embeddings512.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19b399d56a5eb48491ac7262b0fc442a4cfe25bcf8ab3d6fe21ee4655b2278d5
3
+ size 49487624
book_embeddingsN.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0826b4c501e65f4cca3449d2b05283326c56bb0cda692d19be8ccece1366153
3
+ size 46007197
book_train.csv ADDED
The diff for this file is too large to render. See raw diff
 
books_6000.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit==1.23.1
2
+ torch==2.0.1
3
+ numpy==1.23.5
4
+ pandas==1.5.3
5
+ transformers==4.30.0
6
+ regex==2022.10.31
stri.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ import numpy as np
4
+ import pandas as pd
5
+ from PIL import Image
6
+ from transformers import AutoTokenizer, AutoModel
7
+ import re
8
+ import pickle
9
+ import requests
10
+ from io import BytesIO
11
+
12
st.title("Книжные рекомендации")

# Hugging Face checkpoint used to embed the user's query.
model_name = "cointegrated/rubert-tiny2"


@st.cache_resource
def _load_encoder(name: str):
    """Load the tokenizer and the encoder once per server process.

    Streamlit re-executes this script on every widget interaction; without
    caching the weights would be re-read from disk on each rerun.

    Returns:
        A ``(tokenizer, model)`` pair for the checkpoint *name*.
    """
    cached_tokenizer = AutoTokenizer.from_pretrained(name)
    cached_model = AutoModel.from_pretrained(name, output_hidden_states=True)
    return cached_tokenizer, cached_model


@st.cache_data
def _load_books(path: str) -> pd.DataFrame:
    """Read the book catalogue from *path* and apply the row filters.

    Rows with any missing value are dropped, annotations shorter than
    40 whitespace-separated words are discarded, duplicate titles keep
    their first occurrence, and the index is rebuilt as 0..n-1.
    """
    frame = pd.read_csv(path)
    frame = frame.dropna()
    long_enough = frame['annotation'].apply(lambda x: len(x.split()) >= 40)
    # Non-inplace calls avoid mutating a filtered slice of the original frame.
    frame = frame[long_enough].drop_duplicates(subset='title', keep='first')
    return frame.reset_index(drop=True)


tokenizer, model = _load_encoder(model_name)

# NOTE(review): the rest of the script looks rows up by position against the
# pickled embeddings, so the filters above presumably have to match the ones
# used when those embeddings were produced -- confirm before changing them.
books = _load_books('all+.csv')
26
+
27
+
28
def data_preprocessing(text: str) -> str:
    """Replace links, @mentions, #hashtags and HTML tags in *text* with a space.

    Each match is substituted by a single space; whitespace is not
    collapsed afterwards, so runs of spaces may remain.

    Args:
        text: Raw string to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'http\S+', " ", text)  # links
    text = re.sub(r'@\w+', ' ', text)  # user mentions
    text = re.sub(r'#\w+', ' ', text)  # hashtags
    # DOTALL lets a tag that is wrapped across a line break still match;
    # without it "." stops at "\n" and such tags are left in the text.
    text = re.sub(r'<.*?>', ' ', text, flags=re.DOTALL)
    return text
34
+
35
+
36
# Clean the text columns that are shown to the user.
for i in ['author', 'title', 'annotation']:
    books[i] = books[i].apply(data_preprocessing)

annot = books['annotation']

# Token length the query is padded / truncated to before encoding.
length = 512


def _embed_query(text: str) -> torch.Tensor:
    """Return the L2-normalised first-token embedding of *text*.

    The first token of the last hidden layer is used as the sentence
    vector; for a single input string the result has one row.
    """
    query_tokens = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=length,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,  # explicit, instead of relying on the implicit default
        return_tensors='pt',
    )
    with torch.no_grad():
        query_outputs = model(**query_tokens)
    first_token_state = query_outputs.hidden_states[-1][:, 0, :]
    return torch.nn.functional.normalize(first_token_state)


def _as_matrix(embeddings) -> torch.Tensor:
    """Return the unpickled embeddings as one CPU float tensor, one row per book.

    Accepts either a single tensor or a sequence of per-book tensors/arrays.
    """
    if isinstance(embeddings, torch.Tensor):
        matrix = embeddings
    else:
        matrix = torch.stack([torch.as_tensor(item) for item in embeddings])
    matrix = matrix.detach().cpu().float()
    # assumes each book is stored as a single vector (possibly with a leading
    # singleton batch axis) -- TODO confirm against the script that wrote the pickle
    return matrix.reshape(matrix.shape[0], -1)


def _load_cover(url: str):
    """Download and decode a cover image; return ``None`` if that fails."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
    except (requests.RequestException, OSError):
        # Best effort on purpose: one dead URL must not break the whole page.
        return None


# User query.
query = st.text_input("Введите запрос")

# Kept outside the button branch: a widget created only after the click
# disappears again on the rerun that its own change triggers.
num_books_per_page = st.selectbox("Количество книг на странице:", [3, 5, 10], index=0)

if st.button('Сгенерировать'):
    if not query.strip():
        st.warning("Введите запрос")
        st.stop()

    # NOTE(security): pickle.load can execute arbitrary code; this must only
    # ever read the embeddings file shipped with the app, never user uploads.
    with open("book_embeddingsN.pkl", "rb") as f:
        book_embeddings = pickle.load(f)

    query_embedding = _embed_query(query)

    # Cosine similarity between the query and every book embedding.
    cosine_similarities = torch.nn.functional.cosine_similarity(
        query_embedding,
        _as_matrix(book_embeddings),
        dim=-1,
    )
    cosine_similarities = cosine_similarities.numpy()

    indices = np.argsort(cosine_similarities)[::-1]  # most similar first

    # NOTE(review): position i of the embeddings is presumably row i of
    # `books`; verify both were built with the same filters.
    for i in indices[:num_books_per_page]:
        book = books.iloc[int(i)]
        cols = st.columns(2)  # left: cover and score, right: text
        cols[1].write("## " + book['title'])
        cols[1].markdown("**Автор:** " + book['author'])
        cols[1].markdown("**Аннотация:** " + book['annotation'])
        image = _load_cover(book['image_url'])
        if image is not None:
            cols[0].image(image)
        cols[0].write(cosine_similarities[i])
        cols[1].write("---")