import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
import pickle


# Data Loading
def load_data(file_path):
    """
    Load the book dataset from a CSV file.

    Args:
        file_path (str): Path to the CSV file

    Returns:
        pd.DataFrame: Loaded DataFrame, or None if loading fails
    """
    try:
        df = pd.read_csv(file_path)
        print(f"Data loaded successfully with shape: {df.shape}")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None


# Data Exploration
def explore_data(df):
    """
    Perform basic data exploration and return summary statistics.

    Args:
        df (pd.DataFrame): DataFrame to explore

    Returns:
        dict: Dictionary containing the data summary
    """
    summary = {
        "shape": df.shape,
        "columns": df.columns.tolist(),
        "missing_values": df.isnull().sum().to_dict(),
        "sample_data": df.head(5).to_dict()
    }
    return summary


# Text Preprocessing
def _ensure_nltk_resources():
    """Download the required NLTK resources if they are not already present."""
    for resource_path, package in [
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet'),
    ]:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)


def preprocess_text(text):
    """
    Preprocess text by removing special characters, converting to lowercase,
    removing stopwords, and lemmatizing.

    Args:
        text (str): Input text

    Returns:
        str: Preprocessed text (an empty string for non-string input)
    """
    if not isinstance(text, str):
        return ""

    _ensure_nltk_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove everything except letters and whitespace (numbers, punctuation, etc.)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into a single string
    return ' '.join(tokens)
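
# Illustrative sketch of what preprocess_text produces (the exact output can
# vary with the installed NLTK data):
#
#     preprocess_text("The 3 Musketeers were riding again!")
#     # -> 'musketeer riding'
#
# "The", "were", and "again" are dropped as stopwords, "3" and "!" are removed
# by the regex, and "Musketeers" is lemmatized to "musketeer".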

# Data Preprocessing
def preprocess_data(df):
    """
    Preprocess the DataFrame by cleaning the text columns and handling
    missing values.

    Args:
        df (pd.DataFrame): Input DataFrame

    Returns:
        pd.DataFrame: Preprocessed DataFrame
    """
    # Create a copy to avoid modifying the original DataFrame
    processed_df = df.copy()

    # Handle missing values
    processed_df['summaries'] = processed_df['summaries'].fillna('')
    processed_df['categories'] = processed_df['categories'].fillna('')

    # Preprocess summaries and categories
    print("Preprocessing summaries...")
    processed_df['processed_summaries'] = processed_df['summaries'].apply(preprocess_text)
    print("Preprocessing categories...")
    processed_df['processed_categories'] = processed_df['categories'].apply(preprocess_text)

    # Combine features (summaries and categories) into a single text column
    processed_df['combined_features'] = (
        processed_df['processed_summaries'] + ' ' + processed_df['processed_categories']
    )

    return processed_df


# Feature Engineering
def extract_features(df, feature_column='combined_features'):
    """
    Extract TF-IDF features from the specified text column.

    Args:
        df (pd.DataFrame): Input DataFrame
        feature_column (str): Column name to extract features from

    Returns:
        tuple: (TF-IDF matrix, fitted TF-IDF vectorizer)
    """
    # Initialize the TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

    # Fit the vectorizer and transform the text data
    tfidf_matrix = tfidf_vectorizer.fit_transform(df[feature_column])
    print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

    return tfidf_matrix, tfidf_vectorizer


# Similarity Calculation
def calculate_similarity(tfidf_matrix):
    """
    Calculate the cosine similarity matrix from TF-IDF features.

    Args:
        tfidf_matrix: TF-IDF feature matrix

    Returns:
        numpy.ndarray: Cosine similarity matrix
    """
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return cosine_sim
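
# For intuition: the cosine similarity of two vectors a and b is
# a . b / (||a|| * ||b||). A toy check, separate from the pipeline:
#
#     a = np.array([1.0, 0.0, 1.0])
#     b = np.array([1.0, 1.0, 0.0])
#     cos = a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))  # ~0.5
#
# TfidfVectorizer L2-normalizes its rows by default (norm='l2'), so for the
# matrix above the denominator is 1 and cosine similarity reduces to a plain
# dot product between rows.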

# Recommendation Generation
def recommend_books(book_title, df, cosine_sim, top_n=5):
    """
    Recommend similar books based on cosine similarity.

    Args:
        book_title (str): Title of the book to find recommendations for
        df (pd.DataFrame): DataFrame containing book information
        cosine_sim (numpy.ndarray): Cosine similarity matrix
        top_n (int): Number of recommendations to return

    Returns:
        list: List of recommended book dictionaries, or a dict with an
        "error" key if the title is not found
    """
    # Find matches for the book title (case-insensitive)
    book_matches = df[df['book_name'].str.lower() == book_title.lower()]

    if book_matches.empty:
        # Fall back to a partial match (plain substring, not a regex)
        book_matches = df[df['book_name'].str.lower().str.contains(
            book_title.lower(), regex=False, na=False)]
        if book_matches.empty:
            return {"error": f"Book '{book_title}' not found in the dataset."}
        # Use the first partial match
        input_book = book_matches.iloc[0]
        print(f"Exact match not found, using closest match: {input_book['book_name']}")
    else:
        # Use the first exact match
        input_book = book_matches.iloc[0]

    # Get the book's positional index (cosine_sim is indexed by position,
    # not by DataFrame label)
    book_idx = df.index.get_loc(input_book.name)

    # Build a list of (index, similarity, title) tuples for all books
    book_data = []
    for i, similarity in enumerate(cosine_sim[book_idx]):
        book_data.append((i, similarity, df.iloc[i]['book_name']))

    # Sort by similarity score (descending)
    book_data = sorted(book_data, key=lambda x: x[1], reverse=True)

    # Collect the top N recommendations, skipping books that share a title
    # with the input book or with an earlier recommendation
    recommendations = []
    seen_titles = {input_book['book_name'].lower()}
    for idx, similarity, title in book_data:
        if title.lower() not in seen_titles:
            seen_titles.add(title.lower())
            summary = df.iloc[idx]['summaries']
            recommendations.append({
                "title": title,
                "summary": summary[:200] + "..." if len(summary) > 200 else summary,
                "categories": df.iloc[idx]['categories'],
                "similarity_score": round(similarity * 100, 2)
            })
            if len(recommendations) >= top_n:
                break

    return recommendations


# Model Training and Saving
def train_and_save_model(file_path, model_dir='model'):
    """
    Train the recommendation model and save it for later use.

    Args:
        file_path (str): Path to the CSV file
        model_dir (str): Directory to save the model

    Returns:
        dict: Status information
    """
    # Create the model directory if it doesn't exist
    os.makedirs(model_dir, exist_ok=True)

    # Load and preprocess data
    df = load_data(file_path)
    if df is None:
        return {"error": "Failed to load data."}

    # Explore data
    data_summary = explore_data(df)

    # Preprocess data
    processed_df = preprocess_data(df)

    # Extract features
    tfidf_matrix, tfidf_vectorizer = extract_features(processed_df)

    # Calculate similarity
    cosine_sim = calculate_similarity(tfidf_matrix)

    # Save the processed DataFrame
    processed_df.to_csv(os.path.join(model_dir, 'processed_data.csv'), index=False)

    # Save the remaining model artifacts
    with open(os.path.join(model_dir, 'model_artifacts.pkl'), 'wb') as f:
        pickle.dump({
            "tfidf_vectorizer": tfidf_vectorizer,
            "cosine_sim": cosine_sim,
            "data_summary": data_summary
        }, f)

    print(f"Model trained and saved to {model_dir}")
    return {"status": "success", "model_dir": model_dir}


# Model Loading
def load_model(model_dir='model'):
    """
    Load the saved recommendation model.

    Args:
        model_dir (str): Directory where the model is saved

    Returns:
        dict: Loaded model artifacts, or None if loading fails
    """
    try:
        # Load the processed DataFrame
        processed_df = pd.read_csv(os.path.join(model_dir, 'processed_data.csv'))

        # Re-fill NaNs introduced by the CSV round trip (empty strings are
        # read back as NaN, which would break the len() checks downstream)
        for column in ('summaries', 'categories'):
            processed_df[column] = processed_df[column].fillna('')

        # Load the remaining model artifacts
        with open(os.path.join(model_dir, 'model_artifacts.pkl'), 'rb') as f:
            model_artifacts = pickle.load(f)

        model_artifacts["processed_df"] = processed_df
        print(f"Model loaded from {model_dir}")
        return model_artifacts
    except Exception as e:
        print(f"Error loading model: {e}")
        return None
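
if __name__ == "__main__":
    # Minimal usage sketch. The CSV path and book title below are placeholders;
    # the dataset is assumed to have 'book_name', 'summaries', and 'categories'
    # columns, as the functions above expect.
    result = train_and_save_model('books.csv')  # placeholder path
    if result.get("status") == "success":
        model = load_model(result["model_dir"])
        if model is not None:
            recommendations = recommend_books(
                "Example Title",  # placeholder title
                model["processed_df"],
                model["cosine_sim"],
                top_n=5,
            )
            print(recommendations)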