FindMyBook

Build error

App Files Files Community

annafilina committed on Jun 16, 2023

Commit

11cb079

1 Parent(s): 0162a45

Update stri.py

Browse files

Files changed (1) hide show

stri.py +28 -85

stri.py CHANGED Viewed

@@ -1,87 +1,30 @@
 import streamlit as st
-import torch
-import numpy as np
 import pandas as pd
-from PIL import Image
-from transformers import AutoTokenizer, AutoModel
-import re
-import pickle
-import requests
-from io import BytesIO
-st.title("Книжные рекомендации")
-# Загрузка модели и токенизатора
-model_name = "symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModel.from_pretrained(model_name, output_hidden_states=True)
-# Загрузка датасета и аннотаций к книгам
-books = pd.read_csv('all+.csv')
-books.dropna(inplace=True)
-books = books[books['annotation'].apply(lambda x: len(x.split()) >= 40)]
-books.drop_duplicates(subset='title', keep='first', inplace=True)
-books = books.reset_index(drop=True)
-def data_preprocessing(text: str) -> str:
-    text = re.sub(r'http\S+', " ", text)  # удаляем ссылки
-    text = re.sub(r'@\w+', ' ', text)  # удаляем упоминания пользователей
-    text = re.sub(r'#\w+', ' ', text)  # удаляем хэштеги
-    text = re.sub(r'<.*?>', ' ', text)  # html tags
-    return text
-for i in ['author', 'title', 'annotation']:
-    books[i] = books[i].apply(data_preprocessing)
-annot = books['annotation']
-# Получение эмбеддингов аннотаций каждой книги в датасете
-length = 512
-# Определение запроса пользователя
-query = st.text_input("Введите запрос")
-if st.button('Сгенерировать'):
-    with open("book_embeddingsN.pkl", "rb") as f:
-        book_embeddings = pickle.load(f)
-    query_tokens = tokenizer.encode_plus(
-            query,
-            add_special_tokens=True,
-            max_length=length, # Ограничение на максимальную длину входной последовательности
-            pad_to_max_length=True, # Дополним последовательность нулями до максимальной длины
-            return_tensors='pt' # Вернём тензоры PyTorch
-        )
-    with torch.no_grad():
-            query_outputs = model(**query_tokens)
-            query_hidden_states = query_outputs.hidden_states[-1][:,0,:]
-            query_hidden_states = torch.nn.functional.normalize(query_hidden_states)
-    # Вычисление косинусного расстояния между эмбеддингом запроса и каждой аннотацией
-    cosine_similarities = torch.nn.functional.cosine_similarity(
-        query_embedding.squeeze(0),
-        torch.stack(book_embeddings.cpu())
-    )
-    cosine_similarities = cosine_similarities.numpy()
-    indices = np.argsort(cosine_similarities)[::-1]  # Сортировка по убыванию
-    num_books_per_page = st.selectbox("Количество книг на странице:", [3, 5, 10], index=0)
-    for i in indices[:num_books_per_page]:
-        cols = st.columns(2)  # Создание двух столбцов для размещения информации и изображения
-        cols[1].write("## " + books['title'][i])
-        cols[1].markdown("**Автор:** " + books['author'][i])
-        cols[1].markdown("**Аннотация:** " + books['annotation'][i])
-        image_url = books['image_url'][i]
-        response = requests.get(image_url)
-        image = Image.open(BytesIO(response.content))
-        cols[0].image(image)
-        cols[0].write(cosine_similarities[i])
-        cols[1].write("---")

 import streamlit as st
 import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+# Read the CSV file
+df = pd.read_csv('all+++.csv')
+# Display the CSV file
+st.title('CSV File Overview')
+st.dataframe(df)
+# Bar plot for genres
+st.title('Genre Bar Plot')
+genre_counts = df['genre'].value_counts()
+plt.figure(figsize=(10, 6))
+sns.barplot(x=genre_counts.index, y=genre_counts.values)
+plt.xlabel('Genre')
+plt.ylabel('Count')
+plt.xticks(rotation=45)
+st.pyplot()
+# Distribution plot for annotation lengths
+st.title('Annotation Length Distribution')
+annotation_lengths = df['annotation'].str.len()
+plt.figure(figsize=(10, 6))
+sns.histplot(annotation_lengths, kde=True)
+plt.xlabel('Annotation Length')
+plt.ylabel('Count')
+st.pyplot()