Upload 7 files
Browse files- .gitattributes +1 -0
- app.py +57 -0
- requirements.txt +68 -0
- resources/DF_FINAL.csv +3 -0
- resources/corpus_embeddings_rub.pth +3 -0
- resources/functions.py +40 -0
- resources/img.jpeg +0 -0
- resources/parcing.ipynb +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
resources/DF_FINAL.csv filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
|
| 6 |
+
from resources.functions import recommend
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Streamlit front end for the movie search: loads the movie table and the
# pre-computed corpus embeddings, asks for a free-text query and renders the
# best matching movies returned by `recommend`.

# Placeholder shown when a movie's poster value cannot be rendered.
FALLBACK_POSTER_URL = 'https://cdnn11.img.sputnik.by/img/104126/36/1041263627_235:441:1472:1802_1920x0_80_0_0_fc2acc893b618b7c650d661fafe178b8.jpg'


@st.cache_resource
def load_movies():
    """Read the movie table once per server process.

    Streamlit re-executes the whole script on every widget interaction, so an
    uncached `read_csv` would re-parse the file on each click. The cached
    object is shared between reruns and must be treated as read-only.
    """
    return pd.read_csv('resources/DF_FINAL.csv')


@st.cache_resource
def load_embeddings():
    """Load the pre-computed corpus embeddings once per server process.

    `map_location='cpu'` keeps the load working on hosts without a GPU;
    `recommend` moves the tensor to the CPU anyway.
    """
    return torch.load('resources/corpus_embeddings_rub.pth', map_location='cpu')


# The original markup never closed the <h1> tag.
st.markdown(
    "<h1 style='text-align: center;'>Глупый поиск фильмов</h1>",
    unsafe_allow_html=True,
)

df = load_movies()
emb = load_embeddings()

st.write(
    '<p style="text-align: center; font-family: Arial, sans-serif; font-size: 20px; color: white;">'
    f'Количество фильмов для поиска {len(df)}</p>',
    unsafe_allow_html=True,
)

# TODO: genre filtering (a multiselect over the comma-separated df['ganres']
# values, applied through resources.functions.filter) is currently disabled.

st.header(':wrench: Панель инструментов')
top_k = st.selectbox("Сколько фильмов предложить?", options=[5, 10, 15, 20])

text = st.text_input('Что будем искать?')
button = st.button('Начать поиск', type="primary")

if text and button:
    # `recommend` returns one hit list per query and only one query is sent.
    # Slicing (instead of indexing over range(top_k)) keeps the page working
    # when fewer than `top_k` hits come back.
    hits = recommend(text, emb, top_k)[0][:top_k]
    st.write(
        '<p style="font-family: Arial, sans-serif; font-size: 24px; color: blue; font-weight: bold;">'
        f'<strong>Всего подобранных рекомендаций {len(hits)}</strong></p>',
        unsafe_allow_html=True,
    )
    st.write('\n')

    for hit in hits:
        # Look the movie row up once instead of once per displayed field.
        movie = df.loc[hit['corpus_id']]
        col1, col2 = st.columns([3, 4])
        with col1:
            try:
                st.image(movie['poster'], width=300)
            except Exception:
                # Best effort: a missing or unusable poster value falls back
                # to the placeholder image instead of breaking the page.
                st.image(FALLBACK_POSTER_URL, width=300)
        with col2:
            st.write(f"***Название:*** {movie['title']}")
            st.write(f"***Жанр:*** {movie['ganres']}")
            st.write(f"***Описание:*** {movie['description']}")
            st.write(f"***Год:*** {movie['year']}")
            st.write(f"***Актерский состав:*** {movie['cast']}")
            st.write(f"***Косинусное сходство:*** {round(hit['score'], 2)}")
            st.write(f"***Ссылка на фильм : {movie['url']}***")

        # Horizontal rule separating consecutive results.
        st.markdown(
            "<hr style='border: 2px solid #000; margin-top: 10px; margin-bottom: 10px;'>",
            unsafe_allow_html=True,
        )
|
requirements.txt
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
altair==5.2.0
|
| 2 |
+
attrs==23.2.0
|
| 3 |
+
blinker==1.7.0
|
| 4 |
+
cachetools==5.3.2
|
| 5 |
+
certifi==2024.2.2
|
| 6 |
+
charset-normalizer==3.3.2
|
| 7 |
+
click==8.1.7
|
| 8 |
+
DAWG-Python==0.7.2
|
| 9 |
+
docopt==0.6.2
|
| 10 |
+
filelock==3.13.1
|
| 11 |
+
fsspec==2024.2.0
|
| 12 |
+
gitdb==4.0.11
|
| 13 |
+
GitPython==3.1.41
|
| 14 |
+
huggingface-hub==0.20.3
|
| 15 |
+
idna==3.6
|
| 16 |
+
importlib-metadata==7.0.1
|
| 17 |
+
Jinja2==3.1.3
|
| 18 |
+
joblib==1.3.2
|
| 19 |
+
jsonschema==4.21.1
|
| 20 |
+
jsonschema-specifications==2023.12.1
|
| 21 |
+
markdown-it-py==3.0.0
|
| 22 |
+
MarkupSafe==2.1.5
|
| 23 |
+
mdurl==0.1.2
|
| 24 |
+
mpmath==1.3.0
|
| 25 |
+
networkx==3.2.1
|
| 26 |
+
nltk==3.8.1
|
| 27 |
+
numpy==1.26.4
|
| 28 |
+
packaging==23.2
|
| 29 |
+
pandas==2.2.0
|
| 30 |
+
pillow==10.2.0
|
| 31 |
+
protobuf==4.25.2
|
| 32 |
+
pyarrow==15.0.0
|
| 33 |
+
pydeck==0.8.1b0
|
| 34 |
+
Pygments==2.17.2
|
| 35 |
+
pymorphy2==0.9.1
|
| 36 |
+
pymorphy2-dicts-ru==2.4.417127.4579844
|
| 37 |
+
python-dateutil==2.8.2
|
| 38 |
+
pytz==2024.1
|
| 39 |
+
PyYAML==6.0.1
|
| 40 |
+
referencing==0.33.0
|
| 41 |
+
regex==2023.12.25
|
| 42 |
+
requests==2.31.0
|
| 43 |
+
rich==13.7.0
|
| 44 |
+
rpds-py==0.17.1
|
| 45 |
+
safetensors==0.4.2
|
| 46 |
+
scikit-learn==1.4.0
|
| 47 |
+
scipy==1.12.0
|
| 48 |
+
sentence-transformers==2.3.1
|
| 49 |
+
sentencepiece==0.1.99
|
| 50 |
+
six==1.16.0
|
| 51 |
+
smmap==5.0.1
|
| 52 |
+
streamlit==1.31.0
|
| 53 |
+
sympy==1.12
|
| 54 |
+
tenacity==8.2.3
|
| 55 |
+
threadpoolctl==3.2.0
|
| 56 |
+
tokenizers==0.15.1
|
| 57 |
+
toml==0.10.2
|
| 58 |
+
toolz==0.12.1
|
| 59 |
+
torch==2.2.0
|
| 60 |
+
tornado==6.4
|
| 61 |
+
tqdm==4.66.1
|
| 62 |
+
transformers==4.37.2
|
| 63 |
+
typing_extensions==4.9.0
|
| 64 |
+
tzdata==2023.4
|
| 65 |
+
tzlocal==5.2
|
| 66 |
+
urllib3==2.2.0
|
| 67 |
+
validators==0.22.0
|
| 68 |
+
zipp==3.17.0
|
resources/DF_FINAL.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c028d793b0c651c6d043309a3b272946c80762dfa8eb0565927a36cc897b9032
|
| 3 |
+
size 118068895
|
resources/corpus_embeddings_rub.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0b94a30ec61d3dab95c9c4f05896cbb9d298b2079535af4f61e739e1df9a703
|
| 3 |
+
size 56046434
|
resources/functions.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import string
|
| 3 |
+
import pandas as pd
|
| 4 |
+
# import numpy as np
|
| 5 |
+
# import torch
|
| 6 |
+
import nltk
|
| 7 |
+
import pymorphy2
|
| 8 |
+
from nltk.corpus import stopwords
|
| 9 |
+
nltk.download('stopwords')
|
| 10 |
+
from sentence_transformers import SentenceTransformer, util
|
| 11 |
+
|
| 12 |
+
# NLTK's Russian stop-word list as a set; used by data_preprocessing_hard
# to drop stop words from the text.
stop_words = set(stopwords.words('russian'))
|
| 13 |
+
# pymorphy2 analyzer, created once at import time; used for lemmatization.
morph = pymorphy2.MorphAnalyzer()
|
| 14 |
+
# Sentence-embedding model used by recommend() to encode queries.
# NOTE(review): presumably the same model that produced the stored corpus
# embeddings - confirm against the notebook that built them.
model = SentenceTransformer('cointegrated/rubert-tiny2')
|
| 15 |
+
|
| 16 |
+
def data_preprocessing_hard(text: str) -> str:
    """Normalize raw text into space-separated Russian lemmas.

    The text is cast to ``str`` and lowercased, ``<...>`` tags are stripped,
    "ё" is folded into "е", every character that is not a Cyrillic letter or
    whitespace is dropped, Russian stop words are removed and the remaining
    words are lemmatized with pymorphy2.

    Args:
        text: Raw input; non-string values are converted with ``str()``.

    Returns:
        The lemmas joined by single spaces. The result is an empty string
        when nothing survives the filtering (e.g. Latin-only input).
    """
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)
    # "ё" (U+0451) lies outside the а-я range, so the filter below used to
    # delete it and truncate words ("актёр" -> "актр"). Fold it into "е".
    # NOTE(review): the stored corpus embeddings were presumably built with
    # the original pipeline - confirm the two stay compatible.
    text = text.replace('ё', 'е')
    # After lower() only а-я can remain; this also removes digits and all
    # punctuation, so no separate punctuation pass is needed.
    text = re.sub(r'[^а-я\s]', '', text)
    # Stop words are matched on surface forms, i.e. before lemmatization.
    return ' '.join(
        morph.parse(word)[0].normal_form
        for word in text.split()
        if word not in stop_words
    )
|
| 26 |
+
|
| 27 |
+
def filter(df: pd.DataFrame, ganre_list: list):
    """Return the index labels of rows that have at least one wanted genre.

    The name shadows the ``filter`` builtin; it is kept because callers
    import it under this name.

    Args:
        df: Table with a ``ganres`` column. A cell may be a comma-separated
            string of genres or an already split collection of genres.
        ganre_list: Genres to keep; a row matches when any of its genres is
            in this list.

    Returns:
        A list with the index labels of the matching rows, in table order.
    """
    wanted = set(ganre_list)

    def has_wanted_genre(cell) -> bool:
        if isinstance(cell, str):
            # Iterating the raw string would walk single characters, so
            # split it into genre names first.
            genres = (part.strip() for part in cell.split(','))
        elif isinstance(cell, (list, tuple, set, frozenset)):
            genres = cell
        else:
            # Missing values (None / NaN) have no genres.
            return False
        return any(genre in wanted for genre in genres)

    mask = df['ganres'].apply(has_wanted_genre).astype(bool)
    return df.index[mask.to_numpy()].to_list()
|
| 31 |
+
|
| 32 |
+
def recommend(text: str, embeddings, top_k):
    """Find the corpus entries most similar to a free-text query.

    The query is cleaned with ``data_preprocessing_hard`` and encoded with
    the module-level sentence-transformer model. Query and corpus embeddings
    are moved to the CPU and normalized before being compared by dot product.

    Args:
        text: Raw user query.
        embeddings: Tensor with the pre-computed corpus embeddings.
        top_k: Maximum number of hits to return for the query.

    Returns:
        The result of ``util.semantic_search``: one list per query (a single
        one here) whose items are dicts with ``corpus_id`` and ``score`` keys.
    """
    query_embeddings = model.encode(
        [data_preprocessing_hard(text)], convert_to_tensor=True
    )
    embeddings = util.normalize_embeddings(embeddings.to("cpu"))
    query_embeddings = util.normalize_embeddings(query_embeddings.to("cpu"))
    # top_k must be passed by keyword: the third positional parameter of
    # semantic_search is query_chunk_size, so the positional call left top_k
    # at its default and the requested number of hits was ignored.
    return util.semantic_search(
        query_embeddings,
        embeddings,
        top_k=top_k,
        score_function=util.dot_score,
    )
|
resources/img.jpeg
ADDED
|
resources/parcing.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|