Spaces:
Build error
Build error
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| """ | |
| Gradio web application for the book recommendation system. | |
Improved version that ensures diverse book recommendations.
| """ | |
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.stem import WordNetLemmatizer | |
| import re | |
| import os | |
# Download NLTK resources (needed for Hugging Face Spaces).
# quiet=True keeps the build log clean; each call is a no-op when cached.
nltk.download('punkt', quiet=True)
# NLTK >= 3.8.2 loads 'punkt_tab' data for word_tokenize; fetch both so the
# app keeps working across NLTK versions.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
# Open Multilingual Wordnet data — WordNetLemmatizer lookups may need it on
# newer NLTK installs.
nltk.download('omw-1.4', quiet=True)
# Text preprocessing.
# Singletons built once on first use instead of once per call — the function
# is applied to every row of the DataFrame, and constructing the stopword set
# and lemmatizer per call is pure overhead. Lazily initialized so the corpora
# only need to be present when the function actually runs.
_STOP_WORDS = None
_LEMMATIZER = None


def preprocess_text(text):
    """
    Preprocess text data by removing special characters, converting to
    lowercase, removing stopwords, and lemmatizing.

    Args:
        text (str): Input text. Non-string input (e.g. NaN) yields "".

    Returns:
        str: Preprocessed text (space-joined lemmatized tokens).
    """
    global _STOP_WORDS, _LEMMATIZER
    if not isinstance(text, str):
        return ""
    if _STOP_WORDS is None:
        _STOP_WORDS = set(stopwords.words('english'))
        _LEMMATIZER = WordNetLemmatizer()
    # Lowercase, then keep letters and whitespace only.
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    tokens = nltk.word_tokenize(text)
    # Drop stopwords and lemmatize in one pass.
    tokens = [_LEMMATIZER.lemmatize(token) for token in tokens
              if token not in _STOP_WORDS]
    return ' '.join(tokens)
# Model state, populated once by initialize_model().
df = None
tfidf_matrix = None
tfidf_vectorizer = None


def initialize_model():
    """
    Load the dataset, preprocess its text columns, and build the TF-IDF matrix.

    Returns:
        bool: True if initialization succeeded, False otherwise.
    """
    global df, tfidf_matrix, tfidf_vectorizer
    try:
        print("Loading data...")
        df = pd.read_csv('books_summary.csv')
        print(f"Data loaded successfully with shape: {df.shape}")
        # Report (but do not drop) duplicate rows in the raw data.
        duplicates = df.duplicated().sum()
        print(f"Number of duplicate rows: {duplicates}")
        # Blank out missing text fields so string operations never see NaN.
        for column in ('summaries', 'categories'):
            df[column] = df[column].fillna('')
        print("Preprocessing text data...")
        df['processed_summaries'] = df['summaries'].apply(preprocess_text)
        df['processed_categories'] = df['categories'].apply(preprocess_text)
        # One document per book: cleaned summary followed by cleaned categories.
        df['combined_features'] = df['processed_summaries'] + ' ' + df['processed_categories']
        print("Extracting TF-IDF features...")
        tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
        tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])
        print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
        return True
    except Exception as e:
        # Boundary handler: log and report failure so the UI can degrade.
        print(f"Error initializing model: {e}")
        return False


# Build the model once at import time so the Gradio app is ready to serve.
model_initialized = initialize_model()
def get_recommendations(book_title, top_n=5):
    """
    Get book recommendations based on TF-IDF cosine similarity.

    Args:
        book_title (str): Title of the book (exact or partial, case-insensitive).
        top_n (int): Number of recommendations to return.

    Returns:
        str: HTML-formatted recommendations, or an HTML error message.
    """
    global df, tfidf_matrix
    if df is None or tfidf_matrix is None:
        return "<p>Error: Model not initialized properly. Please try again later.</p>"
    # First, attempt to find exact title match (case-insensitive)
    book_matches = df[df['book_name'].str.lower() == book_title.lower()]
    if book_matches.empty:
        # Fall back to a substring match. regex=False treats the query as a
        # literal string — without it, titles containing regex metacharacters
        # (e.g. "(500) Days") raise re.error. na=False keeps rows with a
        # missing book_name out of the mask instead of propagating NaN.
        book_matches = df[df['book_name'].str.lower().str.contains(
            book_title.lower(), regex=False, na=False)]
        if book_matches.empty:
            return f"<p>Book '{book_title}' not found in the dataset. Please try another title.</p>"
        # Use the first partial match
        input_book = book_matches.iloc[0]
        print(f"Exact match not found, using closest match: {input_book['book_name']}")
    else:
        # Use the first exact match
        input_book = book_matches.iloc[0]
    # The row label doubles as the positional row of tfidf_matrix — valid
    # because read_csv produced a default RangeIndex. NOTE(review): confirm
    # this still holds if the loading code ever drops or re-sorts rows.
    book_idx = input_book.name
    # Similarity of the input book to every book in the corpus.
    input_tfidf = tfidf_matrix[book_idx]
    similarity_scores = cosine_similarity(input_tfidf, tfidf_matrix).flatten()
    # (row position, score, title) triples for ranking.
    book_data = list(zip(
        range(len(similarity_scores)),
        similarity_scores,
        df['book_name']
    ))
    # Sort by similarity score (descending)
    sorted_books = sorted(book_data, key=lambda x: x[1], reverse=True)
    # Collect the top N, skipping the input book and duplicate titles so the
    # recommendation list stays diverse.
    recommendations = []
    seen_titles = {input_book['book_name'].lower()}
    for idx, score, title in sorted_books:
        if title.lower() in seen_titles:
            continue
        seen_titles.add(title.lower())
        recommendations.append({
            'index': idx,
            'title': title,
            'similarity': score,
            'categories': df.iloc[idx]['categories'],
            'summary': df.iloc[idx]['summaries']
        })
        if len(recommendations) >= top_n:
            break
    # Format recommendations as HTML
    recommendations_html = f"<h3>Recommendations based on: {input_book['book_name']}</h3>"
    if recommendations:
        recommendations_html += "<ul>"
        for book in recommendations:
            similarity = round(book['similarity'] * 100, 2)
            recommendations_html += f"<li><strong>{book['title']}</strong> (Similarity: {similarity}%)<br>"
            recommendations_html += f"<em>Categories:</em> {book['categories']}<br>"
            # Truncate long summaries so list items stay compact.
            summary = book['summary']
            if len(summary) > 200:
                summary = summary[:200] + "..."
            recommendations_html += f"<em>Summary:</em> {summary}</li><br>"
        recommendations_html += "</ul>"
    else:
        recommendations_html += "<p>No similar books found.</p>"
    return recommendations_html
# Gradio interface function
def recommend_books_interface(book_title):
    """
    Gradio callback: validate the query, then delegate to get_recommendations.

    Args:
        book_title (str): Title of the book entered by the user.

    Returns:
        str: HTML-formatted recommendations, or an HTML error message.
    """
    # Guard clauses: empty/blank query, or model that failed to initialize.
    if not book_title or not book_title.strip():
        return "<p>Please enter a book title.</p>"
    if not model_initialized:
        return "<p>Error: Model not initialized properly. Please check the logs.</p>"
    try:
        return get_recommendations(book_title)
    except Exception as e:
        # Log the full traceback server-side; show a short message to the user.
        import traceback
        print(f"Error in recommendation: {traceback.format_exc()}")
        return f"<p>An error occurred while generating recommendations: {str(e)}</p>"
# Define the interface with verified examples from the dataset
iface = gr.Interface(
    fn=recommend_books_interface,
    inputs=gr.Textbox(label="Enter a book title"),
    outputs=gr.HTML(),
    title="Book Recommendation System",
    description="Enter a book title to get 5 similar book recommendations based on content (summaries and categories).",
    examples=[
        "1984",
        "The Midnight Library",
        "Atomic Habits"
    ]
)

# Launch only when executed as a script, so the module can be imported
# (e.g. by tests or another runner) without starting a server.
if __name__ == "__main__":
    # share=True requests a public tunnel link when run locally; hosted
    # platforms such as Hugging Face Spaces ignore it.
    iface.launch(share=True)