Spaces:

ElijahDi
/

nn_ext

Sleeping

App Files Files Community

ElijahDi committed on Feb 8, 2024

Commit

cb99840

verified ·

1 Parent(s): 2328918

Upload 7 files

Browse files

Files changed (8) hide show

.gitattributes +1 -0
app.py +57 -0
requirements.txt +68 -0
resources/DF_FINAL.csv +3 -0
resources/corpus_embeddings_rub.pth +3 -0
resources/functions.py +40 -0
resources/img.jpeg +0 -0
resources/parcing.ipynb +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+resources/DF_FINAL.csv filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import torch
+from resources.functions import recommend
+st.markdown(f"<h1 style='text-align: center;'>Глупый поиск фильмов", unsafe_allow_html=True)
+df = pd.read_csv('resources/DF_FINAL.csv')
+emb = torch.load('resources/corpus_embeddings_rub.pth')
+st.write(f'<p style="text-align: center; font-family: Arial, sans-serif; font-size: 20px; color: white;">Количество фильмов \
+         для поиска {len(df)}</p>', unsafe_allow_html=True)
+# genre_lists = df['ganres'].apply(lambda x: x.split(', ') if isinstance(x, str) else [])
+# all_genres = list(set([genre for sublist in genre_lists for genre in sublist]))
+# unique_genres = sorted(all_genres[1:])
+st.header(':wrench: Панель инструментов')
+# choice_g = st.multiselect("Выберите жанры", options=unique_genres)
+top_k = st.selectbox("Сколько фильмов предложить?", options=[5, 10, 15, 20])
+text = st.text_input('Что будем искать?')
+button = st.button('Начать поиск', type="primary")
+if text and button:
+    # if len(choice_g) == 0:
+    #     choice_g = all_genres
+    # filt_ind = filter(df, choice_g)
+    hits = recommend(text, emb, top_k)
+    st.write(f'<p style="font-family: Arial, sans-serif; font-size: 24px; color: blue; font-weight: bold;"><strong>Всего подобранных \
+         рекомендаций {len(hits[0])}</strong></p>', unsafe_allow_html=True)
+    st.write('\n')
+    for i in range(top_k):
+        col1, col2 = st.columns([3, 4])
+        with col1:
+            try:
+                st.image(df['poster'][hits[0][i]['corpus_id']], width=300)
+            except:
+                st.image('https://cdnn11.img.sputnik.by/img/104126/36/1041263627_235:441:1472:1802_1920x0_80_0_0_fc2acc893b618b7c650d661fafe178b8.jpg', width=300)
+        with col2:
+            st.write(f"***Название:*** {df['title'][hits[0][i]['corpus_id']]}")
+            st.write(f"***Жанр:*** {(df['ganres'][hits[0][i]['corpus_id']])}")
+            st.write(f"***Описание:*** {df['description'][hits[0][i]['corpus_id']]}")
+            st.write(f"***Год:*** {df['year'][hits[0][i]['corpus_id']]}")
+            st.write(f"***Актерский состав:*** {df['cast'][hits[0][i]['corpus_id']]}")
+            st.write(f"***Косинусное сходство:*** {round(hits[0][i]['score'], 2)}")
+            st.write(f"***Ссылка на фильм : {df['url'][hits[0][i]['corpus_id']]}***")
+        st.markdown(
+        "<hr style='border: 2px solid #000; margin-top: 10px; margin-bottom: 10px;'>",
+        unsafe_allow_html=True
+        )

requirements.txt ADDED Viewed

	@@ -0,0 +1,68 @@

+altair==5.2.0
+attrs==23.2.0
+blinker==1.7.0
+cachetools==5.3.2
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+DAWG-Python==0.7.2
+docopt==0.6.2
+filelock==3.13.1
+fsspec==2024.2.0
+gitdb==4.0.11
+GitPython==3.1.41
+huggingface-hub==0.20.3
+idna==3.6
+importlib-metadata==7.0.1
+Jinja2==3.1.3
+joblib==1.3.2
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+mdurl==0.1.2
+mpmath==1.3.0
+networkx==3.2.1
+nltk==3.8.1
+numpy==1.26.4
+packaging==23.2
+pandas==2.2.0
+pillow==10.2.0
+protobuf==4.25.2
+pyarrow==15.0.0
+pydeck==0.8.1b0
+Pygments==2.17.2
+pymorphy2==0.9.1
+pymorphy2-dicts-ru==2.4.417127.4579844
+python-dateutil==2.8.2
+pytz==2024.1
+PyYAML==6.0.1
+referencing==0.33.0
+regex==2023.12.25
+requests==2.31.0
+rich==13.7.0
+rpds-py==0.17.1
+safetensors==0.4.2
+scikit-learn==1.4.0
+scipy==1.12.0
+sentence-transformers==2.3.1
+sentencepiece==0.1.99
+six==1.16.0
+smmap==5.0.1
+streamlit==1.31.0
+sympy==1.12
+tenacity==8.2.3
+threadpoolctl==3.2.0
+tokenizers==0.15.1
+toml==0.10.2
+toolz==0.12.1
+torch==2.2.0
+tornado==6.4
+tqdm==4.66.1
+transformers==4.37.2
+typing_extensions==4.9.0
+tzdata==2023.4
+tzlocal==5.2
+urllib3==2.2.0
+validators==0.22.0
+zipp==3.17.0

resources/DF_FINAL.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c028d793b0c651c6d043309a3b272946c80762dfa8eb0565927a36cc897b9032
+size 118068895

resources/corpus_embeddings_rub.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d0b94a30ec61d3dab95c9c4f05896cbb9d298b2079535af4f61e739e1df9a703
+size 56046434

resources/functions.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import re
+import string
+import pandas as pd
+# import numpy as np
+# import torch
+import nltk
+import pymorphy2
+from nltk.corpus import stopwords
+nltk.download('stopwords')  # runs at import time so stopwords.words() below can find the corpus
+from sentence_transformers import SentenceTransformer, util
+stop_words = set(stopwords.words('russian'))  # Russian stop words; a set for fast membership tests
+morph = pymorphy2.MorphAnalyzer()  # shared analyzer used by data_preprocessing_hard for lemmatization
+model = SentenceTransformer('cointegrated/rubert-tiny2')  # sentence encoder used by recommend() for queries
+def data_preprocessing_hard(text: str) -> str:
+    text = str(text)
+    text = text.lower()
+    text = re.sub('<.*?>', '', text)
+    text = re.sub(r'[^а-яА-Я\s]', '', text)
+    text = ''.join([c for c in text if c not in string.punctuation])
+    text = ' '.join([word for word in text.split() if word not in stop_words])
+    # text = ''.join([char for char in text if not char.isdigit()])
+    text = ' '.join([morph.parse(word)[0].normal_form for word in text.split()])
+    return text
+def filter(df: pd.DataFrame, ganre_list: list):
+    filtered_df = df[df['ganres'].apply(lambda x: any(g in ganre_list for g in(x)))]
+    filt_ind = filtered_df.index.to_list()
+    return filt_ind
+def recommend(text: str, embeddings, top_k):
+    query_embeddings = model.encode([data_preprocessing_hard(text)], convert_to_tensor=True)
+    embeddings = embeddings.to("cpu")
+    embeddings = util.normalize_embeddings(embeddings)
+    query_embeddings = query_embeddings.to("cpu")
+    query_embeddings = util.normalize_embeddings(query_embeddings)
+    hits = util.semantic_search(query_embeddings, embeddings, top_k, score_function=util.dot_score)
+    return hits

resources/img.jpeg ADDED Viewed

resources/parcing.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff