import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
import pickle


# Data Loading
def load_data(file_path):
    """
    Load the book dataset from a CSV file.

    Args:
        file_path (str): Path to the CSV file

    Returns:
        pd.DataFrame: Loaded DataFrame, or None if loading fails
    """
    try:
        df = pd.read_csv(file_path)
        print(f"Data loaded successfully with shape: {df.shape}")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None


# Data Exploration
def explore_data(df):
    """
    Perform basic data exploration and return summary statistics.

    Args:
        df (pd.DataFrame): DataFrame to explore

    Returns:
        dict: Dictionary containing the data summary
    """
    summary = {
        "shape": df.shape,
        "columns": df.columns.tolist(),
        "missing_values": df.isnull().sum().to_dict(),
        "sample_data": df.head(5).to_dict()
    }
    return summary


# Text Preprocessing
def _ensure_nltk_resources():
    """Download the required NLTK resources if they are not already present."""
    for resource_path, package in [
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet'),
    ]:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)


def preprocess_text(text):
    """
    Preprocess text by removing special characters, converting to lowercase,
    removing stopwords, and lemmatizing.

    Args:
        text (str): Input text

    Returns:
        str: Preprocessed text (an empty string for non-string input)
    """
    if not isinstance(text, str):
        return ""

    _ensure_nltk_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove everything except letters and whitespace (numbers, punctuation, etc.)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into a single string
    return ' '.join(tokens)
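
# Illustrative sketch of what preprocess_text produces (the exact output can
# vary with the installed NLTK data):
#
#     preprocess_text("The 3 Musketeers were riding again!")
#     # -> 'musketeer riding'
#
# "The", "were", and "again" are dropped as stopwords, "3" and "!" are removed
# by the regex, and "Musketeers" is lemmatized to "musketeer".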

# Data Preprocessing
def preprocess_data(df):
    """
    Preprocess the DataFrame by cleaning the text columns and handling
    missing values.

    Args:
        df (pd.DataFrame): Input DataFrame

    Returns:
        pd.DataFrame: Preprocessed DataFrame
    """
    # Create a copy to avoid modifying the original DataFrame
    processed_df = df.copy()

    # Handle missing values
    processed_df['summaries'] = processed_df['summaries'].fillna('')
    processed_df['categories'] = processed_df['categories'].fillna('')

    # Preprocess summaries and categories
    print("Preprocessing summaries...")
    processed_df['processed_summaries'] = processed_df['summaries'].apply(preprocess_text)
    print("Preprocessing categories...")
    processed_df['processed_categories'] = processed_df['categories'].apply(preprocess_text)

    # Combine features (summaries and categories) into a single text column
    processed_df['combined_features'] = (
        processed_df['processed_summaries'] + ' ' + processed_df['processed_categories']
    )

    return processed_df


# Feature Engineering
def extract_features(df, feature_column='combined_features'):
    """
    Extract TF-IDF features from the specified text column.

    Args:
        df (pd.DataFrame): Input DataFrame
        feature_column (str): Column name to extract features from

    Returns:
        tuple: (TF-IDF matrix, fitted TF-IDF vectorizer)
    """
    # Initialize the TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

    # Fit the vectorizer and transform the text data
    tfidf_matrix = tfidf_vectorizer.fit_transform(df[feature_column])
    print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

    return tfidf_matrix, tfidf_vectorizer


# Similarity Calculation
def calculate_similarity(tfidf_matrix):
    """
    Calculate the cosine similarity matrix from TF-IDF features.

    Args:
        tfidf_matrix: TF-IDF feature matrix

    Returns:
        numpy.ndarray: Cosine similarity matrix
    """
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return cosine_sim
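
# For intuition: the cosine similarity of two vectors a and b is
# a . b / (||a|| * ||b||). A toy check, separate from the pipeline:
#
#     a = np.array([1.0, 0.0, 1.0])
#     b = np.array([1.0, 1.0, 0.0])
#     cos = a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))  # ~0.5
#
# TfidfVectorizer L2-normalizes its rows by default (norm='l2'), so for the
# matrix above the denominator is 1 and cosine similarity reduces to a plain
# dot product between rows.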

# Recommendation Generation
def recommend_books(book_title, df, cosine_sim, top_n=5):
    """
    Recommend similar books based on cosine similarity.

    Args:
        book_title (str): Title of the book to find recommendations for
        df (pd.DataFrame): DataFrame containing book information
        cosine_sim (numpy.ndarray): Cosine similarity matrix
        top_n (int): Number of recommendations to return

    Returns:
        list: List of recommended book dictionaries, or a dict with an
        "error" key if the title is not found
    """
    # Find matches for the book title (case-insensitive)
    book_matches = df[df['book_name'].str.lower() == book_title.lower()]

    if book_matches.empty:
        # Fall back to a partial match (plain substring, not a regex)
        book_matches = df[df['book_name'].str.lower().str.contains(
            book_title.lower(), regex=False, na=False)]
        if book_matches.empty:
            return {"error": f"Book '{book_title}' not found in the dataset."}
        # Use the first partial match
        input_book = book_matches.iloc[0]
        print(f"Exact match not found, using closest match: {input_book['book_name']}")
    else:
        # Use the first exact match
        input_book = book_matches.iloc[0]

    # Get the book's positional index (cosine_sim is indexed by position,
    # not by DataFrame label)
    book_idx = df.index.get_loc(input_book.name)

    # Build a list of (index, similarity, title) tuples for all books
    book_data = []
    for i, similarity in enumerate(cosine_sim[book_idx]):
        book_data.append((i, similarity, df.iloc[i]['book_name']))

    # Sort by similarity score (descending)
    book_data = sorted(book_data, key=lambda x: x[1], reverse=True)

    # Collect the top N recommendations, skipping books that share a title
    # with the input book or with an earlier recommendation
    recommendations = []
    seen_titles = {input_book['book_name'].lower()}
    for idx, similarity, title in book_data:
        if title.lower() not in seen_titles:
            seen_titles.add(title.lower())
            summary = df.iloc[idx]['summaries']
            recommendations.append({
                "title": title,
                "summary": summary[:200] + "..." if len(summary) > 200 else summary,
                "categories": df.iloc[idx]['categories'],
                "similarity_score": round(similarity * 100, 2)
            })
            if len(recommendations) >= top_n:
                break

    return recommendations


# Model Training and Saving
def train_and_save_model(file_path, model_dir='model'):
    """
    Train the recommendation model and save it for later use.

    Args:
        file_path (str): Path to the CSV file
        model_dir (str): Directory to save the model

    Returns:
        dict: Status information
    """
    # Create the model directory if it doesn't exist
    os.makedirs(model_dir, exist_ok=True)

    # Load and preprocess data
    df = load_data(file_path)
    if df is None:
        return {"error": "Failed to load data."}

    # Explore data
    data_summary = explore_data(df)

    # Preprocess data
    processed_df = preprocess_data(df)

    # Extract features
    tfidf_matrix, tfidf_vectorizer = extract_features(processed_df)

    # Calculate similarity
    cosine_sim = calculate_similarity(tfidf_matrix)

    # Save the processed DataFrame
    processed_df.to_csv(os.path.join(model_dir, 'processed_data.csv'), index=False)

    # Save the remaining model artifacts
    with open(os.path.join(model_dir, 'model_artifacts.pkl'), 'wb') as f:
        pickle.dump({
            "tfidf_vectorizer": tfidf_vectorizer,
            "cosine_sim": cosine_sim,
            "data_summary": data_summary
        }, f)

    print(f"Model trained and saved to {model_dir}")
    return {"status": "success", "model_dir": model_dir}


# Model Loading
def load_model(model_dir='model'):
    """
    Load the saved recommendation model.

    Args:
        model_dir (str): Directory where the model is saved

    Returns:
        dict: Loaded model artifacts, or None if loading fails
    """
    try:
        # Load the processed DataFrame
        processed_df = pd.read_csv(os.path.join(model_dir, 'processed_data.csv'))

        # Re-fill NaNs introduced by the CSV round trip (empty strings are
        # read back as NaN, which would break the len() checks downstream)
        for column in ('summaries', 'categories'):
            processed_df[column] = processed_df[column].fillna('')

        # Load the remaining model artifacts
        with open(os.path.join(model_dir, 'model_artifacts.pkl'), 'rb') as f:
            model_artifacts = pickle.load(f)

        model_artifacts["processed_df"] = processed_df
        print(f"Model loaded from {model_dir}")
        return model_artifacts
    except Exception as e:
        print(f"Error loading model: {e}")
        return None
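
if __name__ == "__main__":
    # Minimal usage sketch. The CSV path and book title below are placeholders;
    # the dataset is assumed to have 'book_name', 'summaries', and 'categories'
    # columns, as the functions above expect.
    result = train_and_save_model('books.csv')  # placeholder path
    if result.get("status") == "success":
        model = load_model(result["model_dir"])
        if model is not None:
            recommendations = recommend_books(
                "Example Title",  # placeholder title
                model["processed_df"],
                model["cosine_sim"],
                top_n=5,
            )
            print(recommendations)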