import gradio as gr
import pickle
import json
import numpy as np
import pandas as pd
import traceback
import nltk
import os
import re
import logging
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import word_tokenize
from huggingface_hub import hf_hub_download
import joblib
from typing import Tuple, Dict, Any
# Download the 'punkt' tokenizer data needed by nltk.word_tokenize (quiet: no console spam)
nltk.download('punkt', quiet=True)
# Module-level logger; DEBUG level so preprocessing shape messages are visible
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
# Directory containing this file — all local encoder/vectorizer artifacts are loaded from here
current_dir = os.path.dirname(os.path.abspath(__file__))
# Hugging Face access token for the private model repo; None if the env var is unset
access_token = os.getenv('HUGGINGFACE_HUB_TOKEN')
# Load all necessary files first
def load_models_and_encoders():
    """Load the trained Random Forest model and all preprocessing artifacts.

    The model pickle is fetched from the Hugging Face Hub; the label/MfrID/SPSC
    encoders, TF-IDF vectorizer and SVD model are loaded from files next to
    this script.

    Returns:
        dict: Keys 'rf_model', 'label_encoder', 'mfrid_encoder',
        'spsc_encoder', 'tfidf_vectorizer', 'svd_model'.

    Raises:
        Exception: Re-raised (after logging) if any artifact fails to load.
    """
    try:
        # Set the repository ID for the trained model
        model_repo_id = 'apoppie/random_forest_model_20241005_202610'
        # Download the trained Random Forest model from the Hugging Face Hub.
        # NOTE: `use_auth_token`/`resume_download` are deprecated in
        # huggingface_hub — `token` is the supported parameter, and downloads
        # resume automatically.
        model_file = hf_hub_download(
            repo_id=model_repo_id,
            filename='random_forest_model_20241005_145919.pkl',
            token=access_token
        )
        # SECURITY NOTE: unpickling executes arbitrary code — only load
        # pickles from repos you control (this one is our own model repo).
        with open(model_file, 'rb') as f:
            rf_model = pickle.load(f)
        logger.info("Random Forest model loaded successfully.")
        # Label encoder: maps class indices back to class names
        label_encoder_path = os.path.join(current_dir, 'label_encoder.joblib')
        label_encoder = joblib.load(label_encoder_path)
        logger.info("Label encoder loaded successfully.")
        # MfrID encoder: list or dict mapping manufacturer IDs to encodings
        mfrid_encoder_path = os.path.join(current_dir, 'MfrID_encoder.pkl')
        with open(mfrid_encoder_path, 'rb') as f:
            mfrid_encoder = pickle.load(f)
        logger.info(f"MfrID encoder loaded. Type: {type(mfrid_encoder)}, Size: {len(mfrid_encoder)}")
        # SPSC encoder: dict with a 'bits' entry or a bare int bit-width
        # (consumed by binarize_spsc)
        spsc_encoder_path = os.path.join(current_dir, 'SPSC_encoder.pkl')
        with open(spsc_encoder_path, 'rb') as f:
            spsc_encoder = pickle.load(f)
        logger.info(f"SPSC encoder loaded. Type: {type(spsc_encoder)}")
        # TF-IDF vectorizer fitted during training
        tfidf_vectorizer_path = os.path.join(current_dir, 'tfidf_vectorizer_20241006_123542.joblib')
        tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
        logger.info("TF-IDF vectorizer loaded successfully.")
        # SVD model used to reduce the TF-IDF matrix dimensionality
        svd_model_path = os.path.join(current_dir, 'svd_model.joblib')
        svd_model = joblib.load(svd_model_path)
        logger.info("SVD model loaded successfully.")
        return {
            'rf_model': rf_model,
            'label_encoder': label_encoder,
            'mfrid_encoder': mfrid_encoder,
            'spsc_encoder': spsc_encoder,
            'tfidf_vectorizer': tfidf_vectorizer,
            'svd_model': svd_model
        }
    except Exception as e:
        logger.error(f"Error loading models and encoders: {str(e)}")
        raise
# Load every artifact once at import time so request handlers can reuse them.
models = load_models_and_encoders()

# Promote the artifacts to module-level globals used throughout the app.
rf_model = models['rf_model']
label_encoder = models['label_encoder']
mfrid_encoder = models['mfrid_encoder']
spsc_encoder = models['spsc_encoder']
tfidf_vectorizer = models['tfidf_vectorizer']
svd_model = models['svd_model']

# Normalize the MfrID encoder into a plain lookup table (MfrID -> encoding):
# a dict is used as-is, a list becomes {value: position}.
if isinstance(mfrid_encoder, dict):
    mfrid_to_index = mfrid_encoder
elif isinstance(mfrid_encoder, list):
    mfrid_to_index = {mfrid: i for i, mfrid in enumerate(mfrid_encoder)}
else:
    raise ValueError(f"Unexpected type for MfrID encoder: {type(mfrid_encoder)}")
# Custom download link button
def create_download_link(csv_string):
    """Return an HTML anchor that downloads *csv_string* as a CSV file.

    BUG FIX: the original body returned the plain text 'Download CSV' and
    ignored its argument entirely, so the UI showed dead text instead of a
    working link. The CSV is now embedded as a base64 data URI, matching the
    later create_download_link in the Gradio section (which shadows this one).
    """
    b64 = base64.b64encode(csv_string.encode('utf-8')).decode()
    return f'<a href="data:file/csv;base64,{b64}" download="predictions.csv">Download CSV</a>'
def test_different_thresholds(json_file, thresholds=(0.2, 0.3, 0.4, 0.5)):
    """Run predictions at several confidence thresholds and summarize each.

    Args:
        json_file: JSON upload accepted by predict_from_json.
        thresholds: Confidence thresholds to evaluate (default unchanged;
            now a tuple to avoid the mutable-default-argument pitfall).

    Returns:
        pd.DataFrame: Indexed by threshold percentage with columns
        "Human Review %" and "Confident Predictions %"; on failure, a
        DataFrame with an 'Error' column for each threshold.
    """
    results = {}
    try:
        for threshold in thresholds:
            df, _, _ = predict_from_json(json_file, confidence_threshold=threshold)
            if df is not None:
                # Fraction of rows the model deferred to a human reviewer
                human_review = (df['Review'] == "Needs Human Review").mean()
                results[threshold] = {
                    'human_review_percentage': round(human_review * 100, 2),
                    'confident_predictions': round((1 - human_review) * 100, 2)
                }
        # BUG FIX: if every threshold failed, `results` is empty and the
        # column assignment below would raise an opaque ValueError; surface
        # a clear message instead (handled by the except branch).
        if not results:
            raise ValueError("No thresholds produced predictions; check the input file format.")
        # Format the summary: percentage index, readable column names
        results_df = pd.DataFrame(results).T
        results_df.index = [f"{int(x*100)}%" for x in results_df.index]
        results_df.index.name = "Threshold"
        results_df.columns = ["Human Review %", "Confident Predictions %"]
        return results_df
    except Exception as e:
        logger.error(f"Error in threshold testing: {str(e)}")
        # Return a formatted error DataFrame so the UI still renders a table
        error_df = pd.DataFrame({
            'Threshold': [f"{int(x*100)}%" for x in thresholds],
            'Error': [str(e)] * len(thresholds)
        }).set_index('Threshold')
        return error_df
def parse_json_content(content):
    """Parse a JSON string into a list of records.

    A single JSON object is wrapped in a one-element list so callers can
    always iterate uniformly. On a parse failure, an empty list and an
    error message are returned instead of raising.
    """
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        logger.error("Failed to parse JSON content")
        return [], "Failed to parse JSON content. Please check the format."
    records = parsed if isinstance(parsed, list) else [parsed]
    return records, "JSON parsing successful."
def preprocess_features(df: pd.DataFrame) -> dict:
    """Normalize the raw input columns exactly as done during training.

    Args:
        df: Input frame with 'Description', 'SPSC' and 'MfrID' columns.

    Returns:
        dict: Keys 'Description', 'SPSC', 'MfrID', each a list of strings
        ready for the downstream encoders.
    """
    # Descriptions: nulls become empty strings, everything stringified
    descriptions = df['Description'].fillna('').astype(str)
    # SPSC: nulls become 0; cast through int first so floats like 5.0
    # render as "5" rather than "5.0"
    spsc_codes = df['SPSC'].fillna(0).astype(int).astype(str)
    # MfrID: nulls become the sentinel 'Unknown'
    manufacturer_ids = df['MfrID'].fillna('Unknown').astype(str)
    return {
        'Description': descriptions.tolist(),
        'SPSC': spsc_codes.tolist(),
        'MfrID': manufacturer_ids.tolist(),
    }
def preprocess_descriptions(descriptions):
    """Project raw description strings into the reduced TF-IDF/SVD space.

    Uses the module-level tfidf_vectorizer and svd_model. On failure a
    zero matrix of the correct width is returned so prediction can still
    proceed (every description contributes nothing).
    """
    try:
        reduced = svd_model.transform(tfidf_vectorizer.transform(descriptions))
    except Exception as e:
        logger.error(f"Error in preprocess_descriptions: {str(e)}")
        fallback = np.zeros((len(descriptions), svd_model.n_components))
        return fallback, f"Error in description preprocessing: {str(e)}"
    return reduced, f"Descriptions preprocessed. Shape: {reduced.shape}"
def binarize_spsc(spsc_codes):
    """Encode each SPSC code as a fixed-width list of binary digits.

    The bit width comes from the module-level spsc_encoder: a dict's
    'bits' entry, a bare int, or a 32-bit fallback when the type is
    unrecognized. Codes that cannot be encoded become all-zero vectors.
    """
    # Resolve the bit width from whatever form the encoder takes
    if isinstance(spsc_encoder, dict):
        bits = spsc_encoder['bits']
    elif isinstance(spsc_encoder, int):
        bits = spsc_encoder
    else:
        bits = 32  # Default to 32 bits if we can't determine the correct number
        logger.warning(f"Unexpected type for SPSC encoder: {type(spsc_encoder)}. Using default 32 bits.")
    encoded = []
    for v in spsc_codes:
        try:
            v = int(v) if not pd.isna(v) else 0
            encoded.append([int(digit) for digit in format(v, f'0{bits}b')])
        except Exception as e:
            logger.error(f"Error binarizing SPSC code {v}: {str(e)}")
            encoded.append([0] * bits)
    return encoded, f"SPSC codes binarized. Shape: {np.array(encoded).shape}"
def encode_mfrid(mfrid_list, mfrid_encoder):
    """Map each MfrID to its binary encoding via *mfrid_encoder*.

    Missing (NaN/None) or unknown IDs become all-zero vectors with the
    same width as the known encodings.

    Args:
        mfrid_list (list): MfrID values to encode.
        mfrid_encoder (dict): Maps str(MfrID) to its binary encoding.

    Returns:
        tuple: (encoded rows, validation message with the result shape)
    """
    # Width of one encoding row; 0 if the encoder is empty
    width = len(next(iter(mfrid_encoder.values()))) if mfrid_encoder else 0
    encoded = []
    for raw_id in mfrid_list:
        if pd.isna(raw_id) or str(raw_id) not in mfrid_encoder:
            encoded.append([0] * width)
        else:
            encoded.append(mfrid_encoder[str(raw_id)])
    return encoded, f"MfrID encoded. Shape: {np.array(encoded).shape}"
def predict_from_json(json_file, confidence_threshold=0.3):
    """Run the full prediction pipeline on a JSON upload.

    Args:
        json_file: Open file-like object or a filesystem path to JSON
            records with ID, Description, SPSC and MfrID fields
            (assumed from the columns this function reads — confirm with callers).
        confidence_threshold (float): Minimum top-class probability below
            which a row is flagged "Needs Human Review".

    Returns:
        tuple: (results DataFrame, CSV string, newline-joined validation
        log); on failure, (None, error message, validation log).
    """
    validation_steps = []
    try:
        # Accept both an open (binary) file object and a plain path
        if hasattr(json_file, 'read'):
            content = json_file.read().decode('utf-8')
        else:
            with open(json_file, 'r') as f:
                content = f.read()
        data = json.loads(content)
        if not isinstance(data, list):
            data = [data]
        df = pd.DataFrame(data)

        # Normalize raw columns the same way training did
        features = preprocess_features(df)
        validation_steps.append("Features preprocessed consistently with training")

        # Encode each feature family, accumulating validation messages
        tfidf_features, desc_msg = preprocess_descriptions(features['Description'])
        validation_steps.append(desc_msg)
        spsc_features, spsc_msg = binarize_spsc(features['SPSC'])
        validation_steps.append(spsc_msg)
        mfrid_features, mfrid_msg = encode_mfrid(features['MfrID'], mfrid_to_index)
        validation_steps.append(mfrid_msg)

        # Assemble the final feature matrix in training column order
        X = np.hstack([
            tfidf_features,
            np.array(spsc_features),
            np.array(mfrid_features)
        ])

        predictions = rf_model.predict(X)
        probabilities = rf_model.predict_proba(X)
        max_probabilities = np.max(probabilities, axis=1)

        # Index of the second-highest-probability class per row
        runner_up = [np.argsort(row)[-2] for row in probabilities]

        output_df = pd.DataFrame({
            'SKU': df['ID'],
            'MfrID': df['MfrID'],
            'Predicted_Class': label_encoder.inverse_transform(predictions),
            # Accept the prediction only when confidence clears the threshold
            'Review': [
                label_encoder.inverse_transform([pred])[0] if prob >= confidence_threshold
                else "Needs Human Review"
                for pred, prob in zip(predictions, max_probabilities)
            ],
            'Confidence Score': [f"{prob*100:.2f}%" for prob in max_probabilities],
            'Second_Best_Class': [
                label_encoder.inverse_transform([idx])[0] for idx in runner_up
            ],
            'Second_Best_Score': [
                f"{probabilities[i][idx]*100:.2f}%" for i, idx in enumerate(runner_up)
            ],
        })
        validation_steps.append("Predictions completed successfully.")
        return output_df, output_df.to_csv(index=False), "\n".join(validation_steps)
    except Exception as e:
        logger.error("An error occurred", exc_info=True)
        validation_steps.append(f"Error: {str(e)}")
        return None, f"An error occurred:\n{str(e)}", "\n".join(validation_steps)
# Diagnostic code
def analyze_prediction_confidence(model, X, predictions, confidences):
print(f"Number of predictions: {len(predictions)}")
print(f"Average confidence: {np.mean(confidences):.4f}")
print(f"Confidence distribution:")
print(pd.Series(confidences).describe())
# Compare feature distributions
print("\nFeature statistics:")
print(pd.DataFrame(X).describe())
return pd.DataFrame({
'prediction': predictions,
'confidence': confidences
}).sort_values('confidence', ascending=False)
def validate_features(X, expected_shape):
    """Validate the feature matrix width before prediction.

    Returns False on a column-count mismatch. Columns that are entirely
    zero only produce a warning — the matrix is still considered valid.
    """
    expected_cols = expected_shape[1]
    if X.shape[1] != expected_cols:
        logger.error(f"Feature mismatch: Expected {expected_cols}, got {X.shape[1]}")
        return False
    # Warn (do not fail) on columns with no signal at all
    zero_columns = np.all(X == 0, axis=0)
    if np.any(zero_columns):
        logger.warning(f"Found {np.sum(zero_columns)} all-zero features")
    return True
# GRADIO Interface
import gradio as gr
import base64
import pandas as pd
import numpy as np
def create_download_link(csv_string):
    """Return an HTML anchor that downloads *csv_string* as predictions.csv.

    The CSV text is embedded in a base64 data URI so no server-side file
    is needed.
    """
    csv_bytes = csv_string.encode('utf-8')
    b64 = base64.b64encode(csv_bytes).decode()
    href = f'data:file/csv;base64,{b64}'
    # BUG FIX: the original built `href` but never used it and returned
    # plain text with no markup — the UI showed dead text instead of a
    # working download link. Wrap it in an anchor element.
    return f'<a href="{href}" download="predictions.csv">Download Predictions CSV</a>'
def predict_and_display(json_file, confidence_threshold):
    """Gradio handler: run predictions and produce the three UI outputs.

    Returns (DataFrame or None, download-link HTML, status/validation text).
    """
    try:
        # Guard: nothing uploaded yet
        if json_file is None:
            return None, "", "No file uploaded. Please upload a JSON file."
        df, csv_string, validation_steps = predict_from_json(json_file, confidence_threshold)
        if df is None:
            # predict_from_json reported a failure via its log text
            return None, "", f"Error processing predictions: {validation_steps}"
        return df, create_download_link(csv_string), validation_steps
    except Exception as e:
        logger.error(f"Error in predict_and_display: {str(e)}")
        return None, "", f"Error processing predictions: {str(e)}"
def create_diagnostic_output(json_file):
    """Build a plain-text diagnostic report for an uploaded JSON file.

    Sections: threshold sensitivity analysis, confidence-score statistics,
    review/accept distribution, and the pipeline's validation log.
    """
    try:
        # Threshold sweep first, then one detailed run at the default 0.3
        threshold_results = test_different_thresholds(json_file)
        df, _, validation_steps = predict_from_json(json_file, confidence_threshold=0.3)
        if df is None:
            return "Error: Unable to process predictions. Please check the input file format."
        # Strip the trailing '%' so describe() works on numbers
        confidence_values = df['Confidence Score'].str.rstrip('%').astype(float)
        sections = [
            "=== Threshold Analysis ===",
            "Shows percentage of predictions requiring human review at different confidence thresholds:",
            threshold_results.to_string(),
            "\n=== Confidence Distribution ===",
            confidence_values.describe().to_string(),
            "\n=== Prediction Distribution ===",
            df['Review'].value_counts().to_string(),
            "\n=== Validation Steps ===",
            validation_steps,
        ]
        return "\n".join(sections)
    except Exception as e:
        logger.error(f"Error in diagnostic output: {str(e)}")
        return f"Error generating diagnostic report: {str(e)}\n\nPlease check the input file format and try again."
def build_interface():
    """Construct the Gradio Blocks UI: prediction, diagnostics and help tabs.

    Returns:
        gr.Blocks: The assembled (not yet launched) interface.

    Raises:
        gr.Error: If interface construction fails.
    """
    try:
        with gr.Blocks(title="Camcor Item Prediction Model") as demo:
            gr.Markdown("""
            # Camcor Item Prediction Model
            Upload a JSON file containing product information to get predictions.
            """)
            with gr.Tabs():
                # Predictions Tab
                with gr.Tab("Make Predictions"):
                    # Two-column layout: inputs left, results right
                    with gr.Row():
                        # Left column for inputs
                        with gr.Column(scale=1):
                            input_file = gr.File(
                                label="Upload JSON File",
                                file_types=['.json']
                            )
                            confidence = gr.Slider(
                                minimum=0.1,
                                maximum=1.0,
                                value=0.3,
                                step=0.1,
                                label="Confidence Threshold",
                                info="Lower values will result in more predictions, higher values in more human review cases"
                            )
                            predict_btn = gr.Button("Make Predictions", variant="primary")
                            # Validation output in left column
                            validation_output = gr.Textbox(
                                label="Validation Steps",
                                lines=3,
                                max_lines=3,
                                interactive=False
                            )
                            download_link = gr.HTML(
                                label="Download Predictions",
                                value=""
                            )
                        # Right column for predictions, taking more space
                        with gr.Column(scale=2):
                            predictions_df = gr.DataFrame(
                                headers=[
                                    'SKU',
                                    'MfrID',
                                    'Predicted_Class',
                                    # BUG FIX: was the typo 'Confidence CLass';
                                    # headers must match predict_from_json's
                                    # output columns, whose 4th column is 'Review'
                                    'Review',
                                    'Confidence Score',
                                    'Second_Best_Class',
                                    'Second_Best_Score'
                                ],
                                label="Predictions",
                                value=None,
                                wrap=True,
                                height=600,
                                interactive=False
                            )
                # Diagnostics Tab
                with gr.Tab("Diagnostics"):
                    with gr.Row():
                        diagnostic_file = gr.File(
                            label="Upload JSON for Analysis",
                            file_types=['.json']
                        )
                        analyze_btn = gr.Button("Run Diagnostic Analysis", variant="secondary")
                    diagnostic_output = gr.Textbox(
                        label="Diagnostic Results",
                        lines=20,
                        max_lines=30
                    )
                # Help Tab
                with gr.Tab("Help"):
                    gr.Markdown("""
                    ### Using the Prediction Model
                    1. Upload a JSON file containing items to classify
                    2. Adjust the confidence threshold if needed
                    3. Click "Make Predictions" to get results
                    4. Download results using the provided link
                    ### Understanding the Results
                    - **SKU**: Unique identifier for the item
                    - **MfrID**: Manufacturer ID
                    - **Predicted_Class**: Primary classification prediction
                    - **Review**: Final classification or "Needs Human Review"
                    - **Confidence Score**: Confidence level of primary prediction
                    - **Second_Best_Class**: Alternative classification
                    - **Second_Best_Score**: Confidence level of alternative
                    """)
            # Updated CSS for better layout and readability
            demo.css = """
            /* General layout improvements */
            .container {
            max-width: 100% !important;
            padding: 0 !important;
            margin: 0 !important;
            }
            /* DataFrame styling */
            .table-wrap {
            overflow: visible !important;
            max-height: none !important;
            border: none !important;
            }
            table {
            width: 100% !important;
            border-collapse: collapse !important;
            }
            th {
            background: #f3f4f6 !important;
            padding: 12px 8px !important;
            text-align: left !important;
            font-weight: 600 !important;
            color: #374151 !important;
            border-bottom: 2px solid #d1d5db !important;
            }
            td {
            padding: 12px 8px !important;
            border-bottom: 1px solid #e5e7eb !important;
            color: #4b5563 !important;
            }
            tr:nth-child(even) {
            background-color: #f9fafb !important;
            }
            /* Button styling */
            button.primary {
            background-color: #2563eb !important;
            color: white !important;
            padding: 10px 20px !important;
            border-radius: 6px !important;
            font-weight: 500 !important;
            }
            /* Validation output styling */
            .validation-box {
            background-color: #f3f4f6 !important;
            border: 1px solid #e5e7eb !important;
            border-radius: 6px !important;
            padding: 12px !important;
            margin: 12px 0 !important;
            }
            /* Download link styling */
            .download-link {
            margin: 12px 0 !important;
            padding: 8px !important;
            text-align: center !important;
            background-color: #f3f4f6 !important;
            border-radius: 6px !important;
            }
            /* Tab styling */
            .tab-nav {
            border-bottom: 2px solid #e5e7eb !important;
            padding-bottom: 0 !important;
            }
            .tab-nav button {
            padding: 12px 20px !important;
            margin-right: 4px !important;
            border-radius: 6px 6px 0 0 !important;
            }
            .tab-nav button.selected {
            background-color: #f3f4f6 !important;
            border-bottom: 2px solid #2563eb !important;
            }
            """
            # Wire the buttons to their handlers
            predict_btn.click(
                fn=predict_and_display,
                inputs=[input_file, confidence],
                outputs=[predictions_df, download_link, validation_output]
            )
            analyze_btn.click(
                create_diagnostic_output,
                inputs=[diagnostic_file],
                outputs=[diagnostic_output]
            )
        return demo
    except Exception as e:
        logger.error(f"Error building interface: {str(e)}")
        raise gr.Error(f"Failed to initialize interface: {str(e)}")
def main():
    """Entry point: build the Gradio interface and launch the server."""
    try:
        app = build_interface()
        # Bind all interfaces so the app is reachable inside a container;
        # keep sharing off and surface errors in the UI.
        app.launch(
            server_name="0.0.0.0",
            server_port=7860,
            show_error=True,
            share=False
        )
    except Exception as e:
        logger.error(f"Application failed to start: {str(e)}")
        raise


if __name__ == "__main__":
    main()