bookrec / app.py
Shekharmeena's picture
Update app.py
e50ce20 verified
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Gradio web application for the book recommendation system.
Improved version that ensures diverse book recommendations.
"""
import gradio as gr
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import os
# Download NLTK resources (needed for Hugging Face Spaces).
# 'punkt_tab' is the tokenizer data that recent NLTK releases load for
# word_tokenize; 'punkt' is kept for older releases that still use it.
# download() reports an unavailable package through its return value rather
# than raising, so requesting both is safe on any NLTK version.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource, quiet=True)
# Text preprocessing
# Matches every character that is neither an ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Lazily-built NLTK helpers shared by all preprocess_text() calls. They are
# created on first use so that defining this module does not require the
# corpora to be present yet.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the cached (stop-word set, lemmatizer) pair, building it once."""
    if not _TEXT_TOOLS:
        # Both values are built before the cache is touched, so a failure
        # (e.g. missing corpus) leaves the cache empty and retryable.
        _TEXT_TOOLS.update(
            stop_words=frozenset(stopwords.words('english')),
            lemmatizer=WordNetLemmatizer(),
        )
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['lemmatizer']


def preprocess_text(text):
    """
    Normalize free text for TF-IDF feature extraction.

    The text is lowercased, stripped of everything except ASCII letters and
    whitespace, tokenized, filtered against the English stop-word list and
    lemmatized. The stop-word set and lemmatizer are created once and reused
    across calls instead of being rebuilt for every row.

    Args:
        text (str): Input text

    Returns:
        str: Space-joined processed tokens, or "" when `text` is not a string
        (for example a NaN cell).
    """
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer = _get_text_tools()

    # Lowercase first so the stop-word comparison below is case-insensitive.
    cleaned = _NON_LETTER_RE.sub('', text.lower())

    tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(cleaned)
        if token not in stop_words
    ]
    return ' '.join(tokens)
# Module-level model state. All three names start as None; initialize_model()
# assigns them and get_recommendations() reads df and tfidf_matrix.
df = None  # DataFrame read from books_summary.csv
tfidf_matrix = None  # result of fitting the vectorizer on df['combined_features']
tfidf_vectorizer = None  # the TfidfVectorizer instance used to build tfidf_matrix
def initialize_model():
    """
    Load the book data and build the TF-IDF features used for recommendations.

    Reads 'books_summary.csv', fills missing summaries/categories with empty
    strings, preprocesses both columns, concatenates them into
    'combined_features' and fits a TF-IDF vectorizer on the result. The
    module-level df, tfidf_matrix and tfidf_vectorizer are assigned as each
    step completes. Any exception is printed and reported through the return
    value instead of being raised.

    Returns:
        bool: True if initialization successful, False otherwise
    """
    global df, tfidf_matrix, tfidf_vectorizer

    text_columns = ('summaries', 'categories')

    try:
        print("Loading data...")
        df = pd.read_csv('books_summary.csv')
        print(f"Data loaded successfully with shape: {df.shape}")

        # Reported for diagnostics only; duplicate rows are not removed here.
        duplicate_count = df.duplicated().sum()
        print(f"Number of duplicate rows: {duplicate_count}")

        # Replace missing text with empty strings before preprocessing.
        for column in text_columns:
            df[column] = df[column].fillna('')

        print("Preprocessing text data...")
        for column in text_columns:
            df[f'processed_{column}'] = df[column].apply(preprocess_text)

        # One text field per book: processed summary followed by categories.
        df['combined_features'] = (
            df['processed_summaries'] + ' ' + df['processed_categories']
        )

        print("Extracting TF-IDF features...")
        tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
        tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])
        print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
    except Exception as e:
        print(f"Error initializing model: {e}")
        return False

    return True
# Initialize the model when the app starts. The resulting flag is checked by
# recommend_books_interface() before it asks for recommendations.
model_initialized = initialize_model()
def _find_book_position(titles, query):
    """Return the row position of the best title match for `query`, or None.

    An exact case-insensitive match wins; otherwise the first title that
    contains the query as a literal substring is used.
    """
    lowered = titles.fillna('').astype(str).str.lower()
    needle = query.lower()

    exact = np.flatnonzero((lowered == needle).to_numpy(dtype=bool))
    if exact.size:
        return int(exact[0])

    # regex=False: the query is user-typed text, not a pattern, so characters
    # such as "+", "(" or "." must be matched literally.
    partial = np.flatnonzero(
        lowered.str.contains(needle, regex=False).to_numpy(dtype=bool)
    )
    if partial.size:
        position = int(partial[0])
        print(f"Exact match not found, using closest match: {titles.iloc[position]}")
        return position
    return None


def get_recommendations(book_title, top_n=5):
    """
    Get book recommendations based on similarity.

    The input title is matched case-insensitively (exact match first, then
    literal substring match). The matched book's TF-IDF row is compared with
    every row via cosine similarity, and the highest-scoring books with
    distinct titles are returned. All text placed into the HTML is escaped.

    Args:
        book_title (str): Title of the book
        top_n (int): Number of recommendations to return; values <= 0 yield
            no recommendations

    Returns:
        str: HTML-formatted recommendations, or an HTML message when the
        model is not initialized or the title cannot be found
    """
    # Local import keeps this block self-contained.
    from html import escape

    if df is None or tfidf_matrix is None:
        return "<p>Error: Model not initialized properly. Please try again later.</p>"

    query = book_title.strip() if isinstance(book_title, str) else ""
    titles = df['book_name']

    # An empty query would substring-match every title, so treat it as a miss.
    book_pos = _find_book_position(titles, query) if query else None
    if book_pos is None:
        return (
            f"<p>Book '{escape(query)}' not found in the dataset. "
            "Please try another title.</p>"
        )

    input_title = str(titles.iloc[book_pos])

    # Similarity of the input book to every book (including itself).
    scores = cosine_similarity(tfidf_matrix[book_pos], tfidf_matrix).flatten()
    # Stable descending order: ties keep their original row order.
    ranked_positions = np.argsort(-scores, kind="stable")

    # Collect the top N books, skipping the input book and repeated titles.
    recommendations = []
    seen_titles = {input_title.lower()}
    for position in ranked_positions:
        if len(recommendations) >= top_n:
            break
        title = str(titles.iloc[position])
        title_key = title.lower()
        if title_key in seen_titles:
            continue
        seen_titles.add(title_key)
        row = df.iloc[position]
        recommendations.append({
            'index': int(position),
            'title': title,
            'similarity': float(scores[position]),
            'categories': str(row['categories']),
            'summary': str(row['summaries']),
        })

    # Format recommendations as HTML.
    parts = [f"<h3>Recommendations based on: {escape(input_title)}</h3>"]
    if recommendations:
        parts.append("<ul>")
        for book in recommendations:
            similarity = round(book['similarity'] * 100, 2)
            summary = book['summary']
            # Truncate before escaping so an HTML entity is never cut in half.
            if len(summary) > 200:
                summary = summary[:200] + "..."
            parts.append(
                f"<li><strong>{escape(book['title'])}</strong> "
                f"(Similarity: {similarity}%)<br>"
            )
            parts.append(f"<em>Categories:</em> {escape(book['categories'])}<br>")
            parts.append(f"<em>Summary:</em> {escape(summary)}</li><br>")
        parts.append("</ul>")
    else:
        parts.append("<p>No similar books found.</p>")
    return "".join(parts)
# Gradio interface function
def recommend_books_interface(book_title):
    """
    Interface function for Gradio.

    Validates the input, checks that the model was initialized, and delegates
    to get_recommendations(). Any exception raised while generating
    recommendations is logged with its traceback and turned into an HTML
    error message; the exception text is escaped before being embedded.

    Args:
        book_title (str): Title of the book

    Returns:
        str: HTML-formatted recommendations, or an HTML message describing
        why none could be produced
    """
    from html import escape
    import traceback

    # Missing, non-string and whitespace-only inputs all get the same prompt.
    if not isinstance(book_title, str) or not book_title.strip():
        return "<p>Please enter a book title.</p>"
    if not model_initialized:
        return "<p>Error: Model not initialized properly. Please check the logs.</p>"

    try:
        return get_recommendations(book_title.strip())
    except Exception as e:
        print(f"Error in recommendation: {traceback.format_exc()}")
        # The message may echo user input, so escape it before embedding.
        return (
            "<p>An error occurred while generating recommendations: "
            f"{escape(str(e))}</p>"
        )
# Define the interface: a single text box in, rendered HTML out.
# NOTE(review): the example titles are presumably present in the dataset —
# confirm against books_summary.csv.
example_titles = [
    "1984",
    "The Midnight Library",
    "Atomic Habits",
]
title_input = gr.Textbox(label="Enter a book title")
html_output = gr.HTML()

iface = gr.Interface(
    fn=recommend_books_interface,
    inputs=title_input,
    outputs=html_output,
    title="Book Recommendation System",
    description="Enter a book title to get 5 similar book recommendations based on content (summaries and categories).",
    examples=example_titles,
)

# Launch the interface.
# NOTE(review): share=True requests a public share link — confirm this is
# intended for the hosting environment.
iface.launch(share=True)