CSAT committed on
Commit
22bd453
·
verified ·
1 Parent(s): 5ad083d

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +168 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pickle
3
+ import polars as pl
4
+ import re
5
+ from sklearn.feature_extraction.text import TfidfVectorizer
6
+ from sklearn.neighbors import NearestNeighbors
7
+ import matplotlib.pyplot as plt
8
+ import seaborn as sns
9
+
10
# Configure the browser tab title, icon, and wide layout.
st.set_page_config(
    page_title="Book Recommendation System",
    page_icon="📚",  # restored from a mis-encoded byte sequence
    layout="wide",
)

# Page heading and a one-line description of what the app does.
st.title("📚 Book Recommendation System")
st.markdown("Enter a book summary and genres to get personalized book recommendations!")
20
+
21
@st.cache_resource
def load_models():
    """Unpickle the TF-IDF vectorizer and the KNN model from disk.

    The two artifacts are read, in order, from ``tfidf_vectorizer.pkl`` and
    ``knn_model.pkl`` using paths relative to the current working directory.

    NOTE: ``pickle.load`` can execute arbitrary code while deserializing, so
    these files must only ever come from a trusted source.

    Returns:
        A ``(tfidf, knn_model)`` tuple of the unpickled objects.
    """
    artifacts = []
    for path in ('tfidf_vectorizer.pkl', 'knn_model.pkl'):
        with open(path, 'rb') as fh:
            artifacts.append(pickle.load(fh))

    tfidf, knn_model = artifacts
    return tfidf, knn_model
32
+
33
@st.cache_data
def load_data():
    """Read the Goodreads CSV and derive the text feature columns.

    Rows with a null ``name``, ``summary`` or ``genres`` are dropped. Two
    columns are then added:

    * ``combined_features``: ``summary`` and ``genres`` joined by one space.
    * ``processed_features``: ``combined_features`` passed through
      ``preprocess_text``.

    Returns:
        The collected polars DataFrame with both derived columns.
    """
    required_columns = ['name', 'summary', 'genres']
    combined = (pl.col('summary') + ' ' + pl.col('genres')).alias('combined_features')

    # The scan is lazy; nothing is read until ``collect()`` runs.
    books = (
        pl.scan_csv('goodreadsV5.csv')
        .drop_nulls(subset=required_columns)
        .with_columns([combined])
        .collect()
    )

    # ``preprocess_text`` is a plain Python function, so it is applied
    # element by element on the materialized frame.
    processed = (
        pl.col('combined_features')
        .map_elements(preprocess_text, return_dtype=pl.Utf8)
        .alias('processed_features')
    )
    return books.with_columns([processed])
52
+
53
# Matches every character that is not an ASCII letter, a digit, or whitespace.
# Compiled once because the function below is invoked for every dataset row.
_NON_ALNUM_SPACE_RE = re.compile(r'[^a-zA-Z0-9\s]')


def preprocess_text(text):
    """Lowercase *text* and delete everything except letters, digits, whitespace.

    Removed characters are deleted, not replaced by a space, so
    ``"sci-fi,fantasy"`` becomes ``"scififantasy"``. The same function is
    applied to the dataset rows and to the user's query, so both sides are
    normalized identically.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; whitespace (including newlines) is preserved.
    """
    return _NON_ALNUM_SPACE_RE.sub('', text.lower())
56
+
57
def recommend_books_knn_out_of_dataset(input_summary, input_genres, top_n=5):
    """Recommend dataset books closest to a book that is not in the dataset.

    The summary and genres are joined with a space, normalized with
    ``preprocess_text``, vectorized with the module-level ``tfidf`` object and
    looked up in the module-level ``knn_model``. Matching rows are read from
    the module-level ``df_cleaned`` frame.

    Args:
        input_summary: Free-text summary of the query book.
        input_genres: Genres of the query book, as free text.
        top_n: Number of neighbours to request from the KNN model.

    Returns:
        A list of dicts, in the order returned by the KNN model, each with the
        keys ``title``, ``summary``, ``genres`` and ``similarity_score``.
        The score is ``1 - distance``.
        NOTE(review): that is a cosine similarity only if the pickled model
        was fitted with the cosine metric, which is not visible here; confirm.
    """
    # Combine and preprocess the input book's features
    combined_input = f"{input_summary} {input_genres}"
    processed_input = preprocess_text(combined_input)

    # Transform the input with the loaded vectorizer and query the KNN model
    input_vector = tfidf.transform([processed_input])
    distances, indices = knn_model.kneighbors(input_vector, n_neighbors=top_n)

    # Flatten each result array once instead of once per loop iteration.
    recommendations = []
    for idx, distance in zip(indices.flatten(), distances.flatten()):
        row = int(idx)  # plain int for row indexing, not a NumPy scalar
        recommendations.append({
            "title": df_cleaned['name'][row],
            "summary": df_cleaned['summary'][row],
            "genres": df_cleaned['genres'][row],
            "similarity_score": float(1 - distance),  # Convert distance to similarity
        })

    return recommendations
81
+
82
# Load the pickled models and the dataset once at start-up. Any failure is
# shown on the page and recorded in ``models_loaded`` so the rest of the
# script can still render.
try:
    tfidf, knn_model = load_models()
    df_cleaned = load_data()
except Exception as e:
    st.error(f"Error loading models or data: {e}")
    models_loaded = False
else:
    models_loaded = True
90
+
91
# Sidebar: everything the user can enter lives here.
sidebar = st.sidebar
sidebar.header("Input Parameters")

# Free-text description of the book to match against.
input_summary = sidebar.text_area(
    "Book Summary",
    placeholder="Enter a brief summary of the book...",
    height=150,
)

# Genres, typed as free text.
input_genres = sidebar.text_input(
    "Genres",
    placeholder="E.g., fantasy, adventure, mystery",
)

# How many neighbours to ask for (1 to 10, default 5).
num_recommendations = sidebar.slider(
    "Number of Recommendations",
    min_value=1,
    max_value=10,
    value=5,
)
107
+
108
# Run a query when the button is pressed, provided the models and data loaded.
if st.sidebar.button("Get Recommendations") and models_loaded:
    # Treat whitespace-only input as empty: it carries no terms to match on.
    if input_summary.strip() and input_genres.strip():
        with st.spinner("Finding the perfect books for you..."):
            # Get recommendations
            recommendations = recommend_books_knn_out_of_dataset(
                input_summary,
                input_genres,
                top_n=num_recommendations,
            )

        # Display recommendations
        st.header("Recommended Books")

        # Lay the book cards out in at most three columns, wrapping around.
        cols = st.columns(min(3, num_recommendations))

        for i, book in enumerate(recommendations):
            # Index modulo the real column count so this can never overrun.
            with cols[i % len(cols)]:
                st.subheader(book["title"])
                st.markdown(f"**Genres:** {book['genres']}")
                st.markdown(f"**Similarity Score:** {book['similarity_score']:.2f}")
                with st.expander("Summary"):
                    st.write(book["summary"])
                st.divider()

        # Visualization of similarity scores
        st.header("Similarity Scores")
        fig, ax = plt.subplots(figsize=(10, 5))

        book_titles = [book["title"] for book in recommendations]
        similarity_scores = [book["similarity_score"] for book in recommendations]

        # Create horizontal bar chart
        # NOTE(review): books sharing a title end up in one bar; confirm
        # whether duplicate titles occur in the dataset.
        sns.barplot(x=similarity_scores, y=book_titles, palette="viridis", ax=ax)
        ax.set_xlabel("Similarity Score")
        ax.set_ylabel("Book Title")
        ax.set_title("Book Recommendation Similarity Scores")

        st.pyplot(fig)
        # pyplot keeps every figure alive until closed; release it so figures
        # do not pile up across Streamlit reruns.
        plt.close(fig)

    else:
        st.warning("Please enter both a summary and genres to get recommendations.")
152
+
153
# Sidebar "About" box describing how recommendations are produced.
st.sidebar.markdown("---")
st.sidebar.header("About")
st.sidebar.info(
    """
    This app uses TF-IDF vectorization and K-Nearest Neighbors to recommend books
    based on your input summary and genres.

    The recommendations are based on textual similarity between your input and
    our database of books from Goodreads.
    """
)

# Page footer (emoji restored from a mis-encoded byte sequence).
st.markdown("---")
st.markdown("📚 Book Recommendation System | Created with Streamlit")
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ polars
3
+ scikit-learn
4
+ matplotlib
5
+ seaborn
6
+ requests