import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
import pickle

# Data Loading
def load_data(file_path):
    """
    Load the book dataset from a CSV file.

    Args:
        file_path (str): Path to the CSV file

    Returns:
        pd.DataFrame: Loaded DataFrame, or None if loading fails
    """
    try:
        df = pd.read_csv(file_path)
        print(f"Data loaded successfully with shape: {df.shape}")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# Data Exploration
def explore_data(df):
    """
    Perform basic data exploration and return summary statistics.

    Args:
        df (pd.DataFrame): DataFrame to explore

    Returns:
        dict: Dictionary containing the data summary
    """
    summary = {
        "shape": df.shape,
        "columns": df.columns.tolist(),
        "missing_values": df.isnull().sum().to_dict(),
        "sample_data": df.head(5).to_dict()
    }
    return summary

# Text Preprocessing
_NLTK_READY = False


def _ensure_nltk_resources():
    """Download the required NLTK resources on first use (checked once per process)."""
    global _NLTK_READY
    if _NLTK_READY:
        return
    for resource_path, resource_name in [
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet'),
    ]:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(resource_name)
    _NLTK_READY = True


def preprocess_text(text):
    """
    Preprocess text by removing special characters, converting to lowercase,
    removing stopwords, and lemmatizing.

    Args:
        text (str): Input text

    Returns:
        str: Preprocessed text (an empty string for non-string input)
    """
    if not isinstance(text, str):
        return ""
    # Make sure the NLTK data is available before tokenizing
    _ensure_nltk_resources()
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, numbers, etc.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Join tokens back into a single string
    return ' '.join(tokens)
# Data Preprocessing
def preprocess_data(df):
    """
    Preprocess the DataFrame by cleaning the text columns and handling missing values.

    Args:
        df (pd.DataFrame): Input DataFrame

    Returns:
        pd.DataFrame: Preprocessed DataFrame
    """
    # Create a copy to avoid modifying the original DataFrame
    processed_df = df.copy()
    # Handle missing values
    processed_df['summaries'] = processed_df['summaries'].fillna('')
    processed_df['categories'] = processed_df['categories'].fillna('')
    # Preprocess summaries and categories
    print("Preprocessing summaries...")
    processed_df['processed_summaries'] = processed_df['summaries'].apply(preprocess_text)
    print("Preprocessing categories...")
    processed_df['processed_categories'] = processed_df['categories'].apply(preprocess_text)
    # Combine features (summaries and categories)
    processed_df['combined_features'] = (
        processed_df['processed_summaries'] + ' ' + processed_df['processed_categories']
    )
    return processed_df

# Feature Engineering
def extract_features(df, feature_column='combined_features'):
    """
    Extract TF-IDF features from the specified text column.

    Args:
        df (pd.DataFrame): Input DataFrame
        feature_column (str): Column name to extract features from

    Returns:
        tuple: (TF-IDF matrix, fitted TF-IDF vectorizer)
    """
    # Initialize the TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    # Fit and transform the text data
    tfidf_matrix = tfidf_vectorizer.fit_transform(df[feature_column])
    print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
    return tfidf_matrix, tfidf_vectorizer

# Similarity Calculation
def calculate_similarity(tfidf_matrix):
    """
    Calculate the cosine similarity matrix from TF-IDF features.

    Args:
        tfidf_matrix: TF-IDF feature matrix

    Returns:
        numpy.ndarray: Cosine similarity matrix
    """
    # Calculate pairwise cosine similarity between all books
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return cosine_sim
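
# Note: for n books this produces a dense n x n array, so memory grows
# quadratically with catalogue size. cosine_sim[i, j] is the similarity
# between books i and j (1.0 on the diagonal for any non-empty document).
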
# Recommendation Generation
def recommend_books(book_title, df, cosine_sim, top_n=5):
    """
    Recommend similar books based on cosine similarity.

    Args:
        book_title (str): Title of the book to find recommendations for
        df (pd.DataFrame): DataFrame containing book information
        cosine_sim (numpy.ndarray): Cosine similarity matrix
        top_n (int): Number of recommendations to return

    Returns:
        list: List of recommended book dictionaries, or a dict with an
            "error" key if the title is not found
    """
    # Find matches for the book title (case-insensitive)
    book_matches = df[df['book_name'].str.lower() == book_title.lower()]
    if book_matches.empty:
        # Fall back to a partial match; regex=False so punctuation in the
        # title is matched literally rather than as a regular expression
        book_matches = df[df['book_name'].str.lower().str.contains(book_title.lower(), regex=False)]
        if book_matches.empty:
            return {"error": f"Book '{book_title}' not found in the dataset."}
        # Use the first partial match
        input_book = book_matches.iloc[0]
        print(f"Exact match not found, using closest match: {input_book['book_name']}")
    else:
        # Use the first exact match
        input_book = book_matches.iloc[0]
    # Get the book's row index (assumes a default RangeIndex aligned with cosine_sim)
    book_idx = input_book.name
    # Build a list of (index, similarity, title) tuples for all books
    book_data = []
    for i, similarity in enumerate(cosine_sim[book_idx]):
        book_data.append((i, similarity, df.iloc[i]['book_name']))
    # Sort by similarity score (descending)
    book_data = sorted(book_data, key=lambda x: x[1], reverse=True)
    # Collect the top N recommendations, skipping books with the same title
    recommendations = []
    seen_titles = {input_book['book_name'].lower()}
    for idx, similarity, title in book_data:
        if title.lower() in seen_titles:
            continue
        seen_titles.add(title.lower())
        # Cast to str so NaN summaries (e.g. after a CSV round-trip) don't break len()
        summary = str(df.iloc[idx]['summaries'])
        recommendations.append({
            "title": title,
            "summary": summary[:200] + "..." if len(summary) > 200 else summary,
            "categories": df.iloc[idx]['categories'],
            "similarity_score": round(similarity * 100, 2)
        })
        if len(recommendations) >= top_n:
            break
    return recommendations
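
# Illustrative call (titles and scores here are hypothetical, not from a real run):
#   recommend_books("dune", df, cosine_sim, top_n=2)
#   -> [{"title": "...", "summary": "...", "categories": "...", "similarity_score": 41.3}, ...]
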
# Model Training and Saving
def train_and_save_model(file_path, model_dir='model'):
    """
    Train the recommendation model and save its artifacts for later use.

    Args:
        file_path (str): Path to the CSV file
        model_dir (str): Directory to save the model

    Returns:
        dict: Status information
    """
    # Create the model directory if it doesn't exist
    os.makedirs(model_dir, exist_ok=True)
    # Load and preprocess data
    df = load_data(file_path)
    if df is None:
        return {"error": "Failed to load data."}
    # Explore data
    data_summary = explore_data(df)
    # Preprocess data
    processed_df = preprocess_data(df)
    # Extract features
    tfidf_matrix, tfidf_vectorizer = extract_features(processed_df)
    # Calculate similarity
    cosine_sim = calculate_similarity(tfidf_matrix)
    # Save the processed DataFrame
    processed_df.to_csv(os.path.join(model_dir, 'processed_data.csv'), index=False)
    # Save the remaining artifacts; the TF-IDF matrix itself is not persisted,
    # since the precomputed similarity matrix is all recommend_books needs
    with open(os.path.join(model_dir, 'model_artifacts.pkl'), 'wb') as f:
        pickle.dump({
            "tfidf_vectorizer": tfidf_vectorizer,
            "cosine_sim": cosine_sim,
            "data_summary": data_summary
        }, f)
    print(f"Model trained and saved to {model_dir}")
    return {"status": "success", "model_dir": model_dir}

# Model Loading
def load_model(model_dir='model'):
    """
    Load the saved recommendation model.

    Args:
        model_dir (str): Directory where the model is saved

    Returns:
        dict: Loaded model artifacts, or None if loading fails
    """
    try:
        # Load the processed DataFrame
        processed_df = pd.read_csv(os.path.join(model_dir, 'processed_data.csv'))
        # Load the pickled model artifacts
        with open(os.path.join(model_dir, 'model_artifacts.pkl'), 'rb') as f:
            model_artifacts = pickle.load(f)
        model_artifacts["processed_df"] = processed_df
        print(f"Model loaded from {model_dir}")
        return model_artifacts
    except Exception as e:
        print(f"Error loading model: {e}")
        return None
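
# Usage sketch: end-to-end training and inference. The CSV path and query
# title below are assumptions; the file must contain 'book_name', 'summaries'
# and 'categories' columns, as the functions above expect.
if __name__ == "__main__":
    result = train_and_save_model('books.csv')  # hypothetical dataset path
    if result.get("status") == "success":
        model = load_model(result["model_dir"])
        if model is not None:
            recommendations = recommend_books(
                "The Hobbit",  # hypothetical query title
                model["processed_df"],
                model["cosine_sim"],
                top_n=5,
            )
            print(recommendations)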