# bookrec/utils.py
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Data Loading
def load_data(file_path):
"""
Load the book dataset from a CSV file.
Args:
file_path (str): Path to the CSV file
Returns:
pd.DataFrame: Loaded DataFrame
"""
try:
df = pd.read_csv(file_path)
print(f"Data loaded successfully with shape: {df.shape}")
return df
except Exception as e:
print(f"Error loading data: {e}")
return None
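
# Illustrative usage (the path below is a placeholder, not part of the repo):
#
#   df = load_data("data/books.csv")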
# Data Exploration
def explore_data(df):
"""
Perform basic data exploration and return summary statistics.
Args:
df (pd.DataFrame): DataFrame to explore
Returns:
dict: Dictionary containing data summary
"""
summary = {
"shape": df.shape,
"columns": df.columns.tolist(),
"missing_values": df.isnull().sum().to_dict(),
"sample_data": df.head(5).to_dict()
}
return summary
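
# Illustrative usage, assuming `df` came from load_data():
#
#   summary = explore_data(df)
#   print(summary["shape"], summary["missing_values"])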
# Text Preprocessing
def preprocess_text(text):
"""
Preprocess text data by removing special characters, converting to lowercase,
removing stopwords, and lemmatizing.
Args:
text (str): Input text
Returns:
str: Preprocessed text
"""
    if isinstance(text, str):
        # Download required NLTK resources on first use (no-ops once installed)
        for resource_path, resource_name in [
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
            ('corpora/wordnet', 'wordnet'),
        ]:
            try:
                nltk.data.find(resource_path)
            except LookupError:
                nltk.download(resource_name)
# Convert to lowercase
text = text.lower()
# Remove special characters, numbers, etc.
text = re.sub(r'[^a-zA-Z\s]', '', text)
# Tokenize
tokens = nltk.word_tokenize(text)
# Remove stopwords
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words]
# Lemmatize
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(token) for token in tokens]
# Join tokens back into a string
processed_text = ' '.join(tokens)
return processed_text
else:
return ""
# Data Preprocessing
def preprocess_data(df):
"""
Preprocess the DataFrame by cleaning the text columns and handling missing values.
Args:
df (pd.DataFrame): Input DataFrame
Returns:
pd.DataFrame: Preprocessed DataFrame
"""
# Create a copy to avoid modifying the original DataFrame
processed_df = df.copy()
# Handle missing values
processed_df['summaries'] = processed_df['summaries'].fillna('')
processed_df['categories'] = processed_df['categories'].fillna('')
# Preprocess summaries and categories
print("Preprocessing summaries...")
processed_df['processed_summaries'] = processed_df['summaries'].apply(preprocess_text)
print("Preprocessing categories...")
processed_df['processed_categories'] = processed_df['categories'].apply(preprocess_text)
# Combine features (summaries and categories)
processed_df['combined_features'] = processed_df['processed_summaries'] + ' ' + processed_df['processed_categories']
return processed_df
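
# Illustrative usage:
#
#   processed_df = preprocess_data(df)
#   print(processed_df['combined_features'].head())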
# Feature Engineering
def extract_features(df, feature_column='combined_features'):
"""
Extract TF-IDF features from the specified text column.
Args:
df (pd.DataFrame): Input DataFrame
feature_column (str): Column name to extract features from
Returns:
tuple: (TF-IDF matrix, TF-IDF vectorizer)
"""
# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df[feature_column])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
return tfidf_matrix, tfidf_vectorizer
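
# Illustrative usage:
#
#   tfidf_matrix, vectorizer = extract_features(processed_df)
#   print(vectorizer.get_feature_names_out()[:10])  # sample of the learned vocabulary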
# Similarity Calculation
def calculate_similarity(tfidf_matrix):
"""
Calculate cosine similarity matrix from TF-IDF features.
Args:
tfidf_matrix: TF-IDF feature matrix
Returns:
numpy.ndarray: Cosine similarity matrix
"""
# Calculate cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
return cosine_sim
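
# Note: TfidfVectorizer L2-normalizes rows by default, so the cosine similarity
# above is equivalent to a plain dot product. An equivalent (slightly faster)
# alternative would be:
#
#   from sklearn.metrics.pairwise import linear_kernel
#   cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)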
# Recommendation Generation
def recommend_books(book_title, df, cosine_sim, top_n=5):
"""
Recommend similar books based on cosine similarity.
Args:
book_title (str): Title of the book to find recommendations for
df (pd.DataFrame): DataFrame containing book information
cosine_sim (numpy.ndarray): Cosine similarity matrix
top_n (int): Number of recommendations to return
    Returns:
        list or dict: List of recommended book dictionaries, or a dict with an
            'error' key if no matching title is found
    """
# Find matches for the book title (case-insensitive)
book_matches = df[df['book_name'].str.lower() == book_title.lower()]
if book_matches.empty:
        # Fall back to a literal substring match (regex=False so punctuation in
        # titles is not treated as a regex pattern; na=False skips missing names)
        book_matches = df[df['book_name'].str.lower().str.contains(book_title.lower(), regex=False, na=False)]
if book_matches.empty:
return {"error": f"Book '{book_title}' not found in the dataset."}
else:
# Use the first partial match
input_book = book_matches.iloc[0]
print(f"Exact match not found, using closest match: {input_book['book_name']}")
else:
# Use the first exact match
input_book = book_matches.iloc[0]
    # Get the book's positional index (robust even if the DataFrame index
    # is not a default RangeIndex)
    book_idx = df.index.get_loc(input_book.name)
# Create a list with (index, similarity, title) for all books
book_data = []
for i, similarity in enumerate(cosine_sim[book_idx]):
book_data.append((i, similarity, df.iloc[i]['book_name']))
# Sort by similarity score (descending)
book_data = sorted(book_data, key=lambda x: x[1], reverse=True)
# Find the top N recommendations (excluding books with the same title)
recommendations = []
seen_titles = set([input_book['book_name'].lower()])
    for idx, similarity, title in book_data:
        if title.lower() in seen_titles:
            continue
        seen_titles.add(title.lower())
        summary = df.iloc[idx]['summaries']
        if not isinstance(summary, str):
            summary = ''  # guard against missing summaries in the raw data
        recommendations.append({
            "title": title,
            "summary": summary[:200] + "..." if len(summary) > 200 else summary,
            "categories": df.iloc[idx]['categories'],
            "similarity_score": round(similarity * 100, 2)
        })
        if len(recommendations) >= top_n:
            break
return recommendations
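
# Illustrative usage ("Dune" is a placeholder; use any title from the dataset):
#
#   recs = recommend_books("Dune", processed_df, cosine_sim, top_n=3)
#   for rec in recs:
#       print(rec["title"], rec["similarity_score"])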
# Model Training and Saving
def train_and_save_model(file_path, model_dir='model'):
"""
Train the recommendation model and save it for later use.
Args:
file_path (str): Path to the CSV file
model_dir (str): Directory to save the model
    Returns:
        dict: Status information, or a dict with an 'error' key on failure
    """
# Create model directory if it doesn't exist
os.makedirs(model_dir, exist_ok=True)
# Load and preprocess data
df = load_data(file_path)
if df is None:
return {"error": "Failed to load data."}
# Explore data
data_summary = explore_data(df)
# Preprocess data
processed_df = preprocess_data(df)
# Extract features
tfidf_matrix, tfidf_vectorizer = extract_features(processed_df)
# Calculate similarity
cosine_sim = calculate_similarity(tfidf_matrix)
    # Persist only the artifacts needed at inference time (the TF-IDF matrix
    # itself is not saved; recommend_books() only needs the similarity matrix)
# Save processed DataFrame
processed_df.to_csv(os.path.join(model_dir, 'processed_data.csv'), index=False)
# Save other model artifacts
with open(os.path.join(model_dir, 'model_artifacts.pkl'), 'wb') as f:
pickle.dump({
"tfidf_vectorizer": tfidf_vectorizer,
"cosine_sim": cosine_sim,
"data_summary": data_summary
}, f)
print(f"Model trained and saved to {model_dir}")
return {"status": "success", "model_dir": model_dir}
# Model Loading
def load_model(model_dir='model'):
"""
Load the saved recommendation model.
Args:
model_dir (str): Directory where the model is saved
    Returns:
        dict: Loaded artifacts ('tfidf_vectorizer', 'cosine_sim',
            'data_summary', 'processed_df'), or None on failure
    """
try:
# Load processed DataFrame
processed_df = pd.read_csv(os.path.join(model_dir, 'processed_data.csv'))
# Load other model artifacts
with open(os.path.join(model_dir, 'model_artifacts.pkl'), 'rb') as f:
model_artifacts = pickle.load(f)
model_artifacts["processed_df"] = processed_df
print(f"Model loaded from {model_dir}")
return model_artifacts
except Exception as e:
print(f"Error loading model: {e}")
return None
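
# Minimal end-to-end sketch (the CSV path and the query title are placeholders;
# the dataset is assumed to have 'book_name', 'summaries', and 'categories' columns):
if __name__ == "__main__":
    result = train_and_save_model("data/books.csv")
    if result.get("status") == "success":
        artifacts = load_model(result["model_dir"])
        if artifacts is not None:
            recs = recommend_books(
                "Dune",
                artifacts["processed_df"],
                artifacts["cosine_sim"],
                top_n=5,
            )
            print(recs)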