Spaces:
Build error
Build error
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| """ | |
| Gradio web application for the book recommendation system. | |
Improved version that ensures diverse book recommendations.
| """ | |
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.stem import WordNetLemmatizer | |
| import re | |
| import os | |
# Download NLTK resources (needed for Hugging Face Spaces).
# quiet=True keeps the build log clean; each call is a no-op when cached.
nltk.download('punkt', quiet=True)
# NLTK >= 3.8.2 loads 'punkt_tab' data for word_tokenize; fetch both so the
# app keeps working across NLTK versions.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
# Open Multilingual Wordnet data — WordNetLemmatizer lookups may need it on
# newer NLTK installs.
nltk.download('omw-1.4', quiet=True)
# Text preprocessing.
# Singletons built once on first use instead of once per call — the function
# is applied to every row of the DataFrame, and constructing the stopword set
# and lemmatizer per call is pure overhead. Lazily initialized so the corpora
# only need to be present when the function actually runs.
_STOP_WORDS = None
_LEMMATIZER = None


def preprocess_text(text):
    """
    Preprocess text data by removing special characters, converting to
    lowercase, removing stopwords, and lemmatizing.

    Args:
        text (str): Input text. Non-string input (e.g. NaN) yields "".

    Returns:
        str: Preprocessed text (space-joined lemmatized tokens).
    """
    global _STOP_WORDS, _LEMMATIZER
    if not isinstance(text, str):
        return ""
    if _STOP_WORDS is None:
        _STOP_WORDS = set(stopwords.words('english'))
        _LEMMATIZER = WordNetLemmatizer()
    # Lowercase, then keep letters and whitespace only.
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    tokens = nltk.word_tokenize(text)
    # Drop stopwords and lemmatize in one pass.
    tokens = [_LEMMATIZER.lemmatize(token) for token in tokens
              if token not in _STOP_WORDS]
    return ' '.join(tokens)
# Model state, populated once by initialize_model().
df = None
tfidf_matrix = None
tfidf_vectorizer = None


def initialize_model():
    """
    Load the dataset, preprocess its text columns, and build the TF-IDF matrix.

    Returns:
        bool: True if initialization succeeded, False otherwise.
    """
    global df, tfidf_matrix, tfidf_vectorizer
    try:
        print("Loading data...")
        df = pd.read_csv('books_summary.csv')
        print(f"Data loaded successfully with shape: {df.shape}")
        # Report (but do not drop) duplicate rows in the raw data.
        duplicates = df.duplicated().sum()
        print(f"Number of duplicate rows: {duplicates}")
        # Blank out missing text fields so string operations never see NaN.
        for column in ('summaries', 'categories'):
            df[column] = df[column].fillna('')
        print("Preprocessing text data...")
        df['processed_summaries'] = df['summaries'].apply(preprocess_text)
        df['processed_categories'] = df['categories'].apply(preprocess_text)
        # One document per book: cleaned summary followed by cleaned categories.
        df['combined_features'] = df['processed_summaries'] + ' ' + df['processed_categories']
        print("Extracting TF-IDF features...")
        tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
        tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])
        print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
        return True
    except Exception as e:
        # Boundary handler: log and report failure so the UI can degrade.
        print(f"Error initializing model: {e}")
        return False


# Build the model once at import time so the Gradio app is ready to serve.
model_initialized = initialize_model()
def get_recommendations(book_title, top_n=5):
    """
    Get book recommendations based on TF-IDF cosine similarity.

    Args:
        book_title (str): Title of the book (exact or partial, case-insensitive).
        top_n (int): Number of recommendations to return.

    Returns:
        str: HTML-formatted recommendations, or an HTML error message.
    """
    global df, tfidf_matrix
    if df is None or tfidf_matrix is None:
        return "<p>Error: Model not initialized properly. Please try again later.</p>"
    # First, attempt to find exact title match (case-insensitive)
    book_matches = df[df['book_name'].str.lower() == book_title.lower()]
    if book_matches.empty:
        # Fall back to a substring match. regex=False treats the query as a
        # literal string — without it, titles containing regex metacharacters
        # (e.g. "(500) Days") raise re.error. na=False keeps rows with a
        # missing book_name out of the mask instead of propagating NaN.
        book_matches = df[df['book_name'].str.lower().str.contains(
            book_title.lower(), regex=False, na=False)]
        if book_matches.empty:
            return f"<p>Book '{book_title}' not found in the dataset. Please try another title.</p>"
        # Use the first partial match
        input_book = book_matches.iloc[0]
        print(f"Exact match not found, using closest match: {input_book['book_name']}")
    else:
        # Use the first exact match
        input_book = book_matches.iloc[0]
    # The row label doubles as the positional row of tfidf_matrix — valid
    # because read_csv produced a default RangeIndex. NOTE(review): confirm
    # this still holds if the loading code ever drops or re-sorts rows.
    book_idx = input_book.name
    # Similarity of the input book to every book in the corpus.
    input_tfidf = tfidf_matrix[book_idx]
    similarity_scores = cosine_similarity(input_tfidf, tfidf_matrix).flatten()
    # (row position, score, title) triples for ranking.
    book_data = list(zip(
        range(len(similarity_scores)),
        similarity_scores,
        df['book_name']
    ))
    # Sort by similarity score (descending)
    sorted_books = sorted(book_data, key=lambda x: x[1], reverse=True)
    # Collect the top N, skipping the input book and duplicate titles so the
    # recommendation list stays diverse.
    recommendations = []
    seen_titles = {input_book['book_name'].lower()}
    for idx, score, title in sorted_books:
        if title.lower() in seen_titles:
            continue
        seen_titles.add(title.lower())
        recommendations.append({
            'index': idx,
            'title': title,
            'similarity': score,
            'categories': df.iloc[idx]['categories'],
            'summary': df.iloc[idx]['summaries']
        })
        if len(recommendations) >= top_n:
            break
    # Format recommendations as HTML
    recommendations_html = f"<h3>Recommendations based on: {input_book['book_name']}</h3>"
    if recommendations:
        recommendations_html += "<ul>"
        for book in recommendations:
            similarity = round(book['similarity'] * 100, 2)
            recommendations_html += f"<li><strong>{book['title']}</strong> (Similarity: {similarity}%)<br>"
            recommendations_html += f"<em>Categories:</em> {book['categories']}<br>"
            # Truncate long summaries so list items stay compact.
            summary = book['summary']
            if len(summary) > 200:
                summary = summary[:200] + "..."
            recommendations_html += f"<em>Summary:</em> {summary}</li><br>"
        recommendations_html += "</ul>"
    else:
        recommendations_html += "<p>No similar books found.</p>"
    return recommendations_html
# Gradio interface function
def recommend_books_interface(book_title):
    """
    Gradio callback: validate the query, then delegate to get_recommendations.

    Args:
        book_title (str): Title of the book entered by the user.

    Returns:
        str: HTML-formatted recommendations, or an HTML error message.
    """
    # Guard clauses: empty/blank query, or model that failed to initialize.
    if not book_title or not book_title.strip():
        return "<p>Please enter a book title.</p>"
    if not model_initialized:
        return "<p>Error: Model not initialized properly. Please check the logs.</p>"
    try:
        return get_recommendations(book_title)
    except Exception as e:
        # Log the full traceback server-side; show a short message to the user.
        import traceback
        print(f"Error in recommendation: {traceback.format_exc()}")
        return f"<p>An error occurred while generating recommendations: {str(e)}</p>"
# Define the interface with verified examples from the dataset
iface = gr.Interface(
    fn=recommend_books_interface,
    inputs=gr.Textbox(label="Enter a book title"),
    outputs=gr.HTML(),
    title="Book Recommendation System",
    description="Enter a book title to get 5 similar book recommendations based on content (summaries and categories).",
    examples=[
        "1984",
        "The Midnight Library",
        "Atomic Habits"
    ]
)

# Launch only when executed as a script, so the module can be imported
# (e.g. by tests or another runner) without starting a server.
if __name__ == "__main__":
    # share=True requests a public tunnel link when run locally; hosted
    # platforms such as Hugging Face Spaces ignore it.
    iface.launch(share=True)