# bookrec/utils.py
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Data Loading
def load_data(file_path):
"""
Load the book dataset from a CSV file.
Args:
file_path (str): Path to the CSV file
Returns:
pd.DataFrame: Loaded DataFrame
"""
try:
df = pd.read_csv(file_path)
print(f"Data loaded successfully with shape: {df.shape}")
return df
except Exception as e:
print(f"Error loading data: {e}")
return None
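
# Illustrative usage (the path below is a placeholder, not part of the repo):
#
#   df = load_data("data/books.csv")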
# Data Exploration
def explore_data(df):
"""
Perform basic data exploration and return summary statistics.
Args:
df (pd.DataFrame): DataFrame to explore
Returns:
dict: Dictionary containing data summary
"""
summary = {
"shape": df.shape,
"columns": df.columns.tolist(),
"missing_values": df.isnull().sum().to_dict(),
"sample_data": df.head(5).to_dict()
}
return summary
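
# Illustrative usage, assuming `df` came from load_data():
#
#   summary = explore_data(df)
#   print(summary["shape"], summary["missing_values"])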
# Text Preprocessing
def preprocess_text(text):
"""
Preprocess text data by removing special characters, converting to lowercase,
removing stopwords, and lemmatizing.
Args:
text (str): Input text
Returns:
str: Preprocessed text
"""
    if isinstance(text, str):
        # Download required NLTK resources on first use (no-ops once installed)
        for resource_path, resource_name in [
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
            ('corpora/wordnet', 'wordnet'),
        ]:
            try:
                nltk.data.find(resource_path)
            except LookupError:
                nltk.download(resource_name)
# Convert to lowercase
text = text.lower()
# Remove special characters, numbers, etc.
text = re.sub(r'[^a-zA-Z\s]', '', text)
# Tokenize
tokens = nltk.word_tokenize(text)
# Remove stopwords
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words]
# Lemmatize
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(token) for token in tokens]
# Join tokens back into a string
processed_text = ' '.join(tokens)
return processed_text
else:
return ""
# Data Preprocessing
def preprocess_data(df):
"""
Preprocess the DataFrame by cleaning the text columns and handling missing values.
Args:
df (pd.DataFrame): Input DataFrame
Returns:
pd.DataFrame: Preprocessed DataFrame
"""
# Create a copy to avoid modifying the original DataFrame
processed_df = df.copy()
# Handle missing values
processed_df['summaries'] = processed_df['summaries'].fillna('')
processed_df['categories'] = processed_df['categories'].fillna('')
# Preprocess summaries and categories
print("Preprocessing summaries...")
processed_df['processed_summaries'] = processed_df['summaries'].apply(preprocess_text)
print("Preprocessing categories...")
processed_df['processed_categories'] = processed_df['categories'].apply(preprocess_text)
# Combine features (summaries and categories)
processed_df['combined_features'] = processed_df['processed_summaries'] + ' ' + processed_df['processed_categories']
return processed_df
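
# Illustrative usage:
#
#   processed_df = preprocess_data(df)
#   print(processed_df['combined_features'].head())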
# Feature Engineering
def extract_features(df, feature_column='combined_features'):
"""
Extract TF-IDF features from the specified text column.
Args:
df (pd.DataFrame): Input DataFrame
feature_column (str): Column name to extract features from
Returns:
tuple: (TF-IDF matrix, TF-IDF vectorizer)
"""
# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df[feature_column])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
return tfidf_matrix, tfidf_vectorizer
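
# Illustrative usage:
#
#   tfidf_matrix, vectorizer = extract_features(processed_df)
#   print(vectorizer.get_feature_names_out()[:10])  # sample of the learned vocabulary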
# Similarity Calculation
def calculate_similarity(tfidf_matrix):
"""
Calculate cosine similarity matrix from TF-IDF features.
Args:
tfidf_matrix: TF-IDF feature matrix
Returns:
numpy.ndarray: Cosine similarity matrix
"""
# Calculate cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
return cosine_sim
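
# Note: TfidfVectorizer L2-normalizes rows by default, so the cosine similarity
# above is equivalent to a plain dot product. An equivalent (slightly faster)
# alternative would be:
#
#   from sklearn.metrics.pairwise import linear_kernel
#   cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)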
# Recommendation Generation
def recommend_books(book_title, df, cosine_sim, top_n=5):
"""
Recommend similar books based on cosine similarity.
Args:
book_title (str): Title of the book to find recommendations for
df (pd.DataFrame): DataFrame containing book information
cosine_sim (numpy.ndarray): Cosine similarity matrix
top_n (int): Number of recommendations to return
    Returns:
        list or dict: List of recommended book dictionaries, or a dict with an
            'error' key if no matching title is found
    """
# Find matches for the book title (case-insensitive)
book_matches = df[df['book_name'].str.lower() == book_title.lower()]
if book_matches.empty:
        # Fall back to a literal substring match (regex=False so punctuation in
        # titles is not treated as a regex pattern; na=False skips missing names)
        book_matches = df[df['book_name'].str.lower().str.contains(book_title.lower(), regex=False, na=False)]
if book_matches.empty:
return {"error": f"Book '{book_title}' not found in the dataset."}
else:
# Use the first partial match
input_book = book_matches.iloc[0]
print(f"Exact match not found, using closest match: {input_book['book_name']}")
else:
# Use the first exact match
input_book = book_matches.iloc[0]
    # Get the book's positional index (robust even if the DataFrame index
    # is not a default RangeIndex)
    book_idx = df.index.get_loc(input_book.name)
# Create a list with (index, similarity, title) for all books
book_data = []
for i, similarity in enumerate(cosine_sim[book_idx]):
book_data.append((i, similarity, df.iloc[i]['book_name']))
# Sort by similarity score (descending)
book_data = sorted(book_data, key=lambda x: x[1], reverse=True)
# Find the top N recommendations (excluding books with the same title)
recommendations = []
seen_titles = set([input_book['book_name'].lower()])
    for idx, similarity, title in book_data:
        if title.lower() in seen_titles:
            continue
        seen_titles.add(title.lower())
        summary = df.iloc[idx]['summaries']
        if not isinstance(summary, str):
            summary = ''  # guard against missing summaries in the raw data
        recommendations.append({
            "title": title,
            "summary": summary[:200] + "..." if len(summary) > 200 else summary,
            "categories": df.iloc[idx]['categories'],
            "similarity_score": round(similarity * 100, 2)
        })
        if len(recommendations) >= top_n:
            break
return recommendations
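
# Illustrative usage ("Dune" is a placeholder; use any title from the dataset):
#
#   recs = recommend_books("Dune", processed_df, cosine_sim, top_n=3)
#   for rec in recs:
#       print(rec["title"], rec["similarity_score"])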
# Model Training and Saving
def train_and_save_model(file_path, model_dir='model'):
"""
Train the recommendation model and save it for later use.
Args:
file_path (str): Path to the CSV file
model_dir (str): Directory to save the model
    Returns:
        dict: Status information, or a dict with an 'error' key on failure
    """
# Create model directory if it doesn't exist
os.makedirs(model_dir, exist_ok=True)
# Load and preprocess data
df = load_data(file_path)
if df is None:
return {"error": "Failed to load data."}
# Explore data
data_summary = explore_data(df)
# Preprocess data
processed_df = preprocess_data(df)
# Extract features
tfidf_matrix, tfidf_vectorizer = extract_features(processed_df)
# Calculate similarity
cosine_sim = calculate_similarity(tfidf_matrix)
    # Persist only the artifacts needed at inference time (the TF-IDF matrix
    # itself is not saved; recommend_books() only needs the similarity matrix)
# Save processed DataFrame
processed_df.to_csv(os.path.join(model_dir, 'processed_data.csv'), index=False)
# Save other model artifacts
with open(os.path.join(model_dir, 'model_artifacts.pkl'), 'wb') as f:
pickle.dump({
"tfidf_vectorizer": tfidf_vectorizer,
"cosine_sim": cosine_sim,
"data_summary": data_summary
}, f)
print(f"Model trained and saved to {model_dir}")
return {"status": "success", "model_dir": model_dir}
# Model Loading
def load_model(model_dir='model'):
"""
Load the saved recommendation model.
Args:
model_dir (str): Directory where the model is saved
    Returns:
        dict: Loaded artifacts ('tfidf_vectorizer', 'cosine_sim',
            'data_summary', 'processed_df'), or None on failure
    """
try:
# Load processed DataFrame
processed_df = pd.read_csv(os.path.join(model_dir, 'processed_data.csv'))
# Load other model artifacts
with open(os.path.join(model_dir, 'model_artifacts.pkl'), 'rb') as f:
model_artifacts = pickle.load(f)
model_artifacts["processed_df"] = processed_df
print(f"Model loaded from {model_dir}")
return model_artifacts
except Exception as e:
print(f"Error loading model: {e}")
return None
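
# Minimal end-to-end sketch (the CSV path and the query title are placeholders;
# the dataset is assumed to have 'book_name', 'summaries', and 'categories' columns):
if __name__ == "__main__":
    result = train_and_save_model("data/books.csv")
    if result.get("status") == "success":
        artifacts = load_model(result["model_dir"])
        if artifacts is not None:
            recs = recommend_books(
                "Dune",
                artifacts["processed_df"],
                artifacts["cosine_sim"],
                top_n=5,
            )
            print(recs)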