Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| import faiss | |
| import string | |
| import re | |
| import joblib | |
| import time | |
| import base64 | |
def get_base64_of_bin_file(bin_file):
    """Return the contents of ``bin_file`` as a base64-encoded text string.

    Args:
        bin_file: Path of the file to read; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded to ``str``.
    """
    with open(bin_file, 'rb') as handle:
        encoded = base64.b64encode(handle.read())
    return encoded.decode()
# Path to the local background image
img_file = 'fon3.jpg'  # Make sure the path is correct relative to this script
# Encode the image as base64 so it can be embedded in the CSS below as a data URI
img_base64 = get_base64_of_bin_file(img_file)
# Page-wide CSS. This is an f-string, so literal CSS braces are doubled ({{ }});
# the only interpolated value is {img_base64}.
page_bg_img = f"""
<style>
[data-testid="stAppViewContainer"] {{
background: linear-gradient(rgba(255, 255, 255, 0.5), rgba(255, 255, 255, 0.5)), url("data:image/jpeg;base64,{img_base64}");
background-size: cover;
background-repeat: no-repeat;
background-attachment: fixed;
filter: brightness(1.1); /* Adjust the brightness here */
}}
.custom-title {{
font-size: 70px;
font-weight: bold;
color: #120c01;
text-align: center;
text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5);
}}
</style>
"""
# Inject the stylesheet; unsafe_allow_html=True is passed so the <style> tag is emitted as HTML
st.markdown(page_bg_img, unsafe_allow_html=True)
# Load the data and models
@st.cache_resource
def load_data_models():
    """Load the series table, the stored embeddings, the FAISS index and the models.

    Streamlit re-executes the whole script on every widget interaction, so
    the loader is wrapped in ``st.cache_resource``: the files are read and
    the sentence-transformer is instantiated once, and later reruns receive
    the same objects. Because the objects are shared rather than copied,
    callers must not mutate them in place.

    Returns:
        A tuple ``(data, combined_embeddings, index, embedder, lsa)``:
        the DataFrame read from ``data/series_edited.csv``, the array loaded
        from ``embeddings/combined_embeddings_2.npy``, the FAISS index read
        from ``embeddings/faiss_index_2.bin``, the
        ``paraphrase-multilingual-mpnet-base-v2`` SentenceTransformer, and
        the object unpickled from ``embeddings/lsa_model.pkl``.
    """
    data = pd.read_csv('data/series_edited.csv')
    combined_embeddings = np.load('embeddings/combined_embeddings_2.npy')
    index = faiss.read_index('embeddings/faiss_index_2.bin')
    embedder = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
    # NOTE: joblib.load unpickles the file, so only trusted artifacts belong here.
    lsa = joblib.load('embeddings/lsa_model.pkl')
    return data, combined_embeddings, index, embedder, lsa


data, combined_embeddings, index, embedder, lsa = load_data_models()
# Text preprocessing function
def clean_text(text):
    """Normalize raw text before it is embedded.

    Steps, in order: replace ``\\r\\n`` and non-breaking spaces with a plain
    space, drop every character that is not a Latin/Cyrillic letter, a
    digit, whitespace or one of ``.,!?;:``, remove URL-like tokens, strip
    ASCII punctuation, and lowercase the result.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    without_breaks = re.sub(r'\r\n', ' ', text).replace('\xa0', ' ')
    allowed_only = re.sub(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\s]', '', without_breaks)
    # URL removal runs after the character filter, so it sees tokens such as
    # "https:example.com" (slashes already dropped) and removes them whole.
    without_urls = re.sub(r'http\S+|www\S+|https\S+', r'', allowed_only)
    punctuation_table = str.maketrans('', '', string.punctuation)
    return without_urls.translate(punctuation_table).lower()
# Find the series that best match the user's query
def search_series(user_query, top_k, description_weight, actors_weight, genre_weight):
    """Search the FAISS index for the entries closest to ``user_query``.

    The query is cleaned with ``clean_text`` and embedded once. The same
    embedding is then scaled by each of the three weights and the scaled
    copies are concatenated (description, actors, genre). The result is
    passed through ``lsa.transform``, L2-normalized and searched in
    ``index``.

    Args:
        user_query: Free-text description typed by the user.
        top_k: Number of nearest neighbours to request from the index.
        description_weight: Scale applied to the first copy of the embedding.
        actors_weight: Scale applied to the second copy of the embedding.
        genre_weight: Scale applied to the third copy of the embedding.

    Returns:
        A pair ``(ids, scores)``: the first row of the label array and the
        first row of the distance array returned by ``index.search``. FAISS
        fills missing neighbours with id ``-1`` when fewer than ``top_k``
        vectors are available. The caller displays the scores as cosine
        similarity; that presumably relies on an inner-product index over
        normalized vectors — confirm against how the index was built.
    """
    user_query = clean_text(user_query)
    query_embedding = embedder.encode([user_query], convert_to_tensor=True).cpu().numpy()
    weighted_query_embedding = np.concatenate(
        (
            query_embedding * description_weight,
            query_embedding * actors_weight,
            query_embedding * genre_weight,
        ),
        axis=1,
    )
    weighted_query_embedding = lsa.transform(weighted_query_embedding)

    # Normalization. When all three weights are 0 the vector is all zeros and
    # its norm is 0; dividing by 1 instead keeps the query finite (no NaN).
    norms = np.linalg.norm(weighted_query_embedding, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    weighted_query_embedding = weighted_query_embedding / norms

    # FAISS works on C-contiguous float32 matrices; the transform above may
    # have returned float64.
    weighted_query_embedding = np.ascontiguousarray(weighted_query_embedding, dtype=np.float32)

    D, I = index.search(weighted_query_embedding, top_k)
    return I[0], D[0]
# Seed session_state with the weights and with the "original" weights used for
# resetting. Keys that already exist (set during an earlier run of the script)
# are left untouched.
_WEIGHT_DEFAULTS = (
    ('description_weight', 0.7),
    ('actors_weight', 0.15),
    ('genre_weight', 0.15),
    ('original_description_weight', 0.7),
    ('original_actors_weight', 0.15),
    ('original_genre_weight', 0.15),
)
for _state_key, _default_value in _WEIGHT_DEFAULTS:
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _default_value
# Reset the weights to their initial values
def reset_weights():
    """Restore the three weights in ``st.session_state`` to their originals.

    The values are copied from the ``original_*`` entries that the script
    stores in session_state for this purpose, rather than being repeated
    here as literals. This keeps the reset in step with the totals that
    ``update_weights`` reads from the same keys.
    """
    for weight_key, original_key in (
        ('description_weight', 'original_description_weight'),
        ('actors_weight', 'original_actors_weight'),
        ('genre_weight', 'original_genre_weight'),
    ):
        st.session_state[weight_key] = st.session_state[original_key]
# Rescale the weights proportionally
def update_weights():
    """Scale the three weights so that their sum equals the original total.

    Reads the ``original_*`` weights and the current weights from
    ``st.session_state`` and multiplies each current weight by
    ``original_total / current_total``. Nothing is changed when either
    total is zero.
    """
    state = st.session_state
    original_total = state['original_description_weight'] + state['original_actors_weight'] + state['original_genre_weight']
    current_total = state['description_weight'] + state['actors_weight'] + state['genre_weight']
    if original_total == 0 or current_total == 0:
        return
    scale = original_total / current_total
    for weight_key in ('description_weight', 'actors_weight', 'genre_weight'):
        state[weight_key] *= scale
# Sidebar sliders for tuning the weights (range 0.0-1.0, step 0.01); each one
# starts from the value currently held in session_state
description_weight = st.sidebar.slider("Вес описания", 0.0, 1.0, st.session_state['description_weight'], step=0.01)
actors_weight = st.sidebar.slider("Вес актеров", 0.0, 1.0, st.session_state['actors_weight'], step=0.01)
genre_weight = st.sidebar.slider("Вес жанра", 0.0, 1.0, st.session_state['genre_weight'], step=0.01)
# Store the slider values back into session_state
st.session_state['description_weight'] = description_weight
st.session_state['actors_weight'] = actors_weight
st.session_state['genre_weight'] = genre_weight
# Handler for the weight-reset button
# NOTE(review): reset_weights() runs after the sliders above were already
# created in this run, so the sliders presumably show the reset values only on
# the next rerun — confirm in the running app.
if st.sidebar.button("Обновить веса"):
    reset_weights()
# Rescale the weights proportionally
# NOTE(review): indentation was lost in the source; this call is assumed to sit
# at top level (run on every script execution), not inside the `if` — confirm.
update_weights()
# Page title as raw HTML, using the "custom-title" CSS class
st.markdown('<h1 class="custom-title">📽️FIND MY SHOW📽️</h1>', unsafe_allow_html=True)
# st.title('✨FIND MY SHOW✨')
st.header('Поиск сериала по описанию')
# Free-text description of the series the user is looking for
input_text = st.text_area('Введите описание сериала')
# How many results to show: 1 to 20, default 5
top_k = st.slider("Количество результатов", min_value=1, max_value=20, value=5)
def _shorten_description(description, max_words=50):
    """Return ``description`` limited to ``max_words`` words.

    Args:
        description: The description cell of a result row. Missing values
            (NaN/None) become an empty string; other non-string values are
            converted with ``str``.
        max_words: Maximum number of whitespace-separated words to keep.

    Returns:
        The text unchanged when it has at most ``max_words`` words,
        otherwise the first ``max_words`` words joined by single spaces and
        followed by ``'...'``.
    """
    if not isinstance(description, str):
        description = '' if pd.isna(description) else str(description)
    words = description.split()
    if len(words) <= max_words:
        return description
    return ' '.join(words[:max_words]) + '...'


# Run the search when the button is pressed and render one card per result
if st.button('Поиск'):
    if not input_text.strip():
        # Nothing to search for: ask for a description instead of embedding
        # an empty string.
        st.warning('Введите описание сериала')
    else:
        # perf_counter is a monotonic clock meant for measuring intervals
        start_time = time.perf_counter()
        indices, distances = search_series(input_text, top_k, description_weight, actors_weight, genre_weight)
        search_time = time.perf_counter() - start_time
        st.write("Результаты поиска:")
        for idx, dist in zip(indices, distances):
            if idx < 0:
                # FAISS pads with -1 when it has fewer than top_k neighbours;
                # data.iloc[-1] would silently show the last row instead.
                continue
            results = data.iloc[idx]
            st.write("---")
            st.image(results['image_url'], width=400)
            st.write(f"**Название:** **{results['tvshow_title']}**")
            st.write(f"**Жанр:** {results['genre']}")
            # Shorten into a local variable rather than assigning into the row
            description = _shorten_description(results['description'])
            st.write(f"**Описание:** {description}")
            st.write(f"**Рейтинг КП :** {round(results['kinopoisk_rating'], 2)}, **Рейтинг IMDb :** {round(results['imdb'], 2)}")
            st.write(f"**Косинусное сходство:** {round(dist, 3)}")
            st.write(f'**Время поиска:** {search_time:.3f} секунд')
            st.markdown(f"[Читать далее]({results['page_url']})", unsafe_allow_html=True)