import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from NoCodeTextClassifier.EDA import Informations, Visualizations
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization  
from NoCodeTextClassifier.models import Models
import os
import pickle
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import io

# Set page config
st.set_page_config(page_title="Text Classification App", page_icon="📝", layout="wide")

# Utility functions
def save_artifacts(obj, folder_name, file_name):
    """Pickle ``obj`` to ``folder_name/file_name``, creating the folder if needed.

    Returns:
        True when the file was written. Any exception is reported to the page
        through ``st.error`` and converted into a False return value instead
        of propagating to the caller.
    """
    try:
        os.makedirs(folder_name, exist_ok=True)
        destination = os.path.join(folder_name, file_name)
        with open(destination, 'wb') as handle:
            pickle.dump(obj, handle)
    except Exception as e:
        st.error(f"Error saving {file_name}: {str(e)}")
        return False
    else:
        return True

def load_artifacts(folder_name, file_name):
    """Unpickle and return the object stored at ``folder_name/file_name``.

    Returns:
        The unpickled object, or None when the file is missing or cannot be
        loaded. Both failure cases are reported through ``st.error`` rather
        than raised.
    """
    try:
        artifact_path = os.path.join(folder_name, file_name)
        with open(artifact_path, 'rb') as handle:
            loaded = pickle.load(handle)
    except FileNotFoundError:
        st.error(f"File {file_name} not found in {folder_name} folder")
        return None
    except Exception as e:
        st.error(f"Error loading {file_name}: {str(e)}")
        return None
    return loaded

def load_model(model_name):
    """Unpickle and return the model stored as ``models/<model_name>``.

    The ``models`` folder is resolved relative to the current working
    directory.

    Returns:
        The unpickled model, or None when the file is missing or cannot be
        loaded. Both failure cases are reported through ``st.error`` rather
        than raised.
    """
    try:
        model_path = os.path.join('models', model_name)
        with open(model_path, 'rb') as handle:
            trained_model = pickle.load(handle)
    except FileNotFoundError:
        st.error(f"Model {model_name} not found. Please train a model first.")
        return None
    except Exception as e:
        st.error(f"Error loading model {model_name}: {str(e)}")
        return None
    return trained_model

def safe_read_csv(uploaded_file, encoding_options=('utf-8', 'latin1', 'iso-8859-1', 'cp1252')):
    """Read an uploaded CSV into a DataFrame, trying several text encodings.

    The file content is read once and then decoded with each entry of
    ``encoding_options`` in order until ``pandas.read_csv`` succeeds. If no
    candidate works, the file object itself is handed to ``pandas.read_csv``
    as a last resort. Progress and failures are reported on the page with
    ``st.success`` / ``st.warning`` / ``st.error``; nothing is raised.

    Args:
        uploaded_file: Seekable file-like object whose ``read()`` returns
            bytes (or already-decoded str, in which case no decoding is done).
        encoding_options: Iterable of codec names to try, in order. The
            default is an immutable tuple so it cannot be shared and mutated
            between calls; lists passed by callers still work.

    Returns:
        The parsed ``pandas.DataFrame``, or None when every attempt failed.
    """
    content = None
    for encoding in encoding_options:
        try:
            if content is None:
                # Read the payload a single time and reuse it for every
                # candidate encoding instead of re-reading the whole upload
                # on each iteration. If reading fails, ``content`` stays None
                # and the next iteration retries, as before.
                uploaded_file.seek(0)
                content = uploaded_file.read()

            text = content.decode(encoding) if isinstance(content, bytes) else content

            # Use StringIO to create a file-like object for pandas
            df = pd.read_csv(io.StringIO(text))
            st.success(f"File loaded successfully with {encoding} encoding")
            return df

        except UnicodeDecodeError:
            # NOTE: 'latin1' / 'iso-8859-1' map every possible byte, so with
            # the default order decoding can only fail for 'utf-8'; later
            # entries are reached only when parsing (not decoding) failed.
            continue
        except Exception as e:
            st.warning(f"Failed to read with {encoding} encoding: {str(e)}")
            continue

    # If all encodings fail, try pandas default
    try:
        uploaded_file.seek(0)
        df = pd.read_csv(uploaded_file)
        st.success("File loaded with default encoding")
        return df
    except Exception as e:
        st.error(f"All encoding attempts failed. Error: {str(e)}")
        return None

def predict_text(model_name, text, vectorizer_type="tfidf"):
    """Classify a single piece of text with a previously trained model.

    Loads three pickled objects on every call: the model ``models/<model_name>``
    (via ``load_model``), the vectorizer
    ``artifacts/<vectorizer_type>_vectorizer.pkl`` and the label encoder
    ``artifacts/encoder.pkl`` (both via ``load_artifacts``). The text is
    cleaned with ``TextCleaner.clean_text``, vectorized, classified, and the
    numeric prediction is mapped back to its original label.

    Args:
        model_name: File name of the pickled model inside the ``models`` folder.
        text: Raw text to classify.
        vectorizer_type: Prefix of the vectorizer file to load. It must be the
            vectorizer the model was trained with; this function cannot
            verify that.

    Returns:
        ``(predicted_label, prediction_proba)``. ``prediction_proba`` is the
        first row of ``model.predict_proba`` or None when the model has no
        usable ``predict_proba``. ``(None, None)`` is returned when any
        artifact is missing or an error occurs; errors are shown with
        ``st.error`` instead of being raised.
    """
    try:
        # Load model
        model = load_model(model_name)
        if model is None:
            return None, None

        # Load vectorizer
        vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
        vectorizer = load_artifacts("artifacts", vectorizer_file)
        if vectorizer is None:
            return None, None

        # Load label encoder
        encoder = load_artifacts("artifacts", "encoder.pkl")
        if encoder is None:
            return None, None

        # Clean and vectorize text
        text_cleaner = TextCleaner()
        clean_text = text_cleaner.clean_text(text)

        # Transform (not fit) so the features line up with the training vocabulary
        text_vector = vectorizer.transform([clean_text])

        # Make prediction
        prediction = model.predict(text_vector)
        prediction_proba = None

        # Probabilities are optional: some estimators expose predict_proba
        # but raise when it is called, so a failure here is tolerated.
        if hasattr(model, 'predict_proba'):
            try:
                prediction_proba = model.predict_proba(text_vector)[0]
            except Exception:
                # Catch Exception rather than using a bare ``except`` so that
                # KeyboardInterrupt / SystemExit are not swallowed.
                prediction_proba = None

        # Decode prediction back to the original class label
        predicted_label = encoder.inverse_transform(prediction)[0]

        return predicted_label, prediction_proba

    except Exception as e:
        st.error(f"Error during prediction: {str(e)}")
        return None, None

# ---- Page header ----
st.title('📝 No Code Text Classification App')
st.write('Understand the behavior of your text data and train a model to classify the text data')

# ---- Sidebar: page navigation ----
st.sidebar.title("Navigation")
section = st.sidebar.radio(
    "Choose Section",
    ["Data Analysis", "Train Model", "Predictions"],
)

# ---- Sidebar: dataset upload (the test file is optional) ----
st.sidebar.subheader("📁 Upload Your Dataset")
train_data = st.sidebar.file_uploader(
    "Upload training data",
    type=["csv"],
    key="train_upload",
)
test_data = st.sidebar.file_uploader(
    "Upload test data (optional)",
    type=["csv"],
    key="test_upload",
)

# ---- Session-state defaults ----
# Seed each key only when it is absent so values already stored in the
# session are never overwritten here.
_SESSION_DEFAULTS = {
    "vectorizer_type": "tfidf",
    "train_df": None,
    "info": None,
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default

# Process uploaded data
# Runs whenever a training file is present in the uploader: the CSV is parsed,
# the user picks the text/target columns in the sidebar, and the derived
# columns 'clean_text', 'text_length' and 'target' are added to the DataFrame.
# Results are published through st.session_state ('train_df', 'test_df', 'info').
if train_data is not None:
    try:
        # Use safe CSV reading function (returns None instead of raising)
        train_df = safe_read_csv(train_data)

        # NOTE: when parsing fails (train_df is None) the session-state values
        # are left untouched, so a previously stored train_df/info remains.
        if train_df is not None:
            st.session_state.train_df = train_df

            # The optional test set is only stored; nothing in the visible part
            # of this file reads st.session_state.test_df afterwards.
            if test_data is not None:
                test_df = safe_read_csv(test_data)
                st.session_state.test_df = test_df
            else:
                st.session_state.test_df = None

            st.sidebar.success("✅ Data loaded successfully!")
            st.write("Training Data Preview:")
            st.write(train_df.head(3))

            # Column pickers; their selections are also available in
            # st.session_state under the keys "text_col" and "target_col".
            columns = train_df.columns.tolist()
            text_data = st.sidebar.selectbox("Choose the text column:", columns, key="text_col")
            target = st.sidebar.selectbox("Choose the target column:", columns, key="target_col")

            if text_data and target:
                try:
                    # Process data: derived columns come from the project's
                    # Informations helper (semantics defined in NoCodeTextClassifier.EDA).
                    info = Informations(train_df, text_data, target)
                    train_df['clean_text'] = info.clean_text()
                    train_df['text_length'] = info.text_length()

                    # Handle label encoding manually.
                    # The encoded labels are written to a column literally named
                    # 'target'; if the user's target column already has that
                    # name, its original values are replaced by the encoding.
                    from sklearn.preprocessing import LabelEncoder
                    label_encoder = LabelEncoder()
                    train_df['target'] = label_encoder.fit_transform(train_df[target])

                    # Save label encoder for later use: predict_text() loads
                    # artifacts/encoder.pkl to map predictions back to labels.
                    # The file is overwritten each time this branch runs.
                    if save_artifacts(label_encoder, "artifacts", "encoder.pkl"):
                        st.sidebar.success("✅ Data processed successfully!")

                    st.session_state.train_df = train_df
                    st.session_state.info = info

                except Exception as e:
                    # Any processing failure clears the shared state so the
                    # pages below fall back to their "please upload" warning.
                    st.error(f"Error processing data: {str(e)}")
                    st.session_state.train_df = None
                    st.session_state.info = None

    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        st.session_state.train_df = None
        st.session_state.info = None

# Get data from session state
# Re-bind the module-level names from session state so the page sections below
# always see the stored values (None when nothing usable has been loaded).
train_df = st.session_state.get('train_df')
info = st.session_state.get('info')

# ---------------------------------------------------------------------------
# Page dispatch: exactly one of the three sections below is rendered,
# depending on the sidebar radio selection held in `section`.
# ---------------------------------------------------------------------------

# Data Analysis Section
if section == "Data Analysis":
    if train_data is not None and train_df is not None:
        try:
            st.subheader("📊 Get Insights from the Data")

            # Headline numbers about the dataset
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Data Shape", f"{info.shape()[0]} rows × {info.shape()[1]} cols")
            with col2:
                st.metric("Classes", len(train_df['target'].unique()))
            with col3:
                st.metric("Missing Values", info.missing_values())

            st.write("**Class Distribution:**", info.class_imbalanced())

            st.write("**Processed Data Preview:**")
            st.write(train_df[['clean_text', 'text_length', 'target']].head(3))

            st.markdown("**Text Length Analysis**")
            st.write(info.analysis_text_length('text_length'))

            # Calculate correlation manually: off-diagonal cell of the 2x2 matrix
            correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
            st.write(f"**Correlation between Text Length and Target:** {correlation:.4f}")

            st.subheader("📈 Visualizations")

            try:
                columns = train_df.columns.tolist()

                # Prefer the columns the user explicitly picked in the sidebar
                # (widget keys "text_col" / "target_col"). Guessing from column
                # names alone can pick the wrong column, e.g. 'context' or the
                # derived 'text_length' both contain the substring 'text'.
                text_col = st.session_state.get("text_col")
                if text_col not in columns:
                    text_col = next((col for col in columns if 'text' in col.lower() or col in ['message', 'content', 'review']), columns[0])

                target_col = st.session_state.get("target_col")
                if target_col not in columns:
                    target_col = next((col for col in columns if col in ['label', 'target', 'class', 'category']), columns[-1])

                vis = Visualizations(train_df, text_col, target_col)
                vis.class_distribution()
                vis.text_length_distribution()
            except Exception as e:
                st.error(f"Error generating visualizations: {str(e)}")

        except Exception as e:
            st.error(f"Error in data analysis: {str(e)}")
    else:
        st.warning("⚠️ Please upload training data to get insights")

# Train Model Section
elif section == "Train Model":
    if train_data is not None and train_df is not None:
        try:
            st.subheader("🤖 Train a Model")

            # Create two columns for model selection
            col1, col2 = st.columns(2)

            with col1:
                st.markdown("**Select Model:**")
                model = st.radio("Choose the Model", [
                    "Logistic Regression", "Decision Tree",
                    "Random Forest", "Linear SVC", "SVC",
                    "Multinomial Naive Bayes", "Gaussian Naive Bayes"
                ])

            with col2:
                st.markdown("**Select Vectorizer:**")
                vectorizer_choice = st.radio("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])

            # Initialize vectorizer. The choice is kept in a local variable and
            # only committed to session state / disk once training succeeds
            # (see below); merely toggling the radio must not change which
            # vectorizer the Predictions page pairs with an existing model.
            if vectorizer_choice == "Tfidf Vectorizer":
                vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
                selected_vectorizer_type = "tfidf"
            else:
                vectorizer = CountVectorizer(max_features=10000, stop_words='english')
                selected_vectorizer_type = "count"

            st.write("**Training Data Preview:**")
            st.write(train_df[['clean_text', 'target']].head(3))

            # Vectorize text data
            with st.spinner("Vectorizing text data..."):
                X = vectorizer.fit_transform(train_df['clean_text'])
                y = train_df['target']

            # Split data
            X_train, X_test, y_train, y_test = process.split_data(X, y)
            st.write(f"**Data split** - Train: {X_train.shape}, Test: {X_test.shape}")

            if st.button("🚀 Start Training", type="primary"):
                with st.spinner("Training model..."):
                    try:
                        models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

                        # Radio label -> name of the Models method that trains it.
                        # Resolved lazily with getattr so only the selected
                        # method is looked up, exactly as an if/elif chain would.
                        trainer_methods = {
                            "Logistic Regression": "LogisticRegression",
                            "Decision Tree": "DecisionTree",
                            "Linear SVC": "LinearSVC",
                            "SVC": "SVC",
                            "Multinomial Naive Bayes": "MultinomialNB",
                            "Random Forest": "RandomForestClassifier",
                            "Gaussian Naive Bayes": "GaussianNB",
                        }
                        trainer_name = trainer_methods.get(model)
                        if trainer_name is not None:
                            # NOTE(review): presumably this also persists the
                            # model under models/ (the Predictions page lists
                            # .pkl files there) - confirm in NoCodeTextClassifier.models.
                            getattr(models, trainer_name)()

                        # Persist the fitted vectorizer and record its type only
                        # now, so the artifact on disk and the session setting
                        # always correspond to the most recently trained model.
                        vectorizer_filename = f"{selected_vectorizer_type}_vectorizer.pkl"
                        if save_artifacts(vectorizer, "artifacts", vectorizer_filename):
                            st.session_state.vectorizer_type = selected_vectorizer_type
                            st.success("🎉 Model training completed!")
                            st.info("You can now use the 'Predictions' section to classify new text.")

                    except Exception as e:
                        st.error(f"Error during model training: {str(e)}")

        except Exception as e:
            st.error(f"Error in model training: {str(e)}")
    else:
        st.warning("⚠️ Please upload training data to train a model")

# Predictions Section
elif section == "Predictions":
    st.subheader("🔮 Perform Predictions on New Text")

    # Check if models exist
    if os.path.exists("models") and os.listdir("models"):
        # Text input for prediction
        text_input = st.text_area("Enter the text to classify:", height=100, placeholder="Type your text here...")

        # Model selection: only pickled models are offered
        available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]

        if available_models:
            selected_model = st.selectbox("Choose the trained model:", available_models)

            # Prediction button
            if st.button("🎯 Predict", key="single_predict", type="primary"):
                if text_input.strip():
                    with st.spinner("Making prediction..."):
                        predicted_label, prediction_proba = predict_text(
                            selected_model,
                            text_input,
                            st.session_state.get('vectorizer_type', 'tfidf')
                        )

                        # predict_text reports its own errors and returns None on failure
                        if predicted_label is not None:
                            st.success("✅ Prediction completed!")

                            # Display results
                            st.markdown("### 📊 Prediction Results")

                            col1, col2 = st.columns([2, 1])
                            with col1:
                                st.markdown(f"**Input Text:** {text_input}")
                            with col2:
                                st.markdown(f"**Predicted Class:** `{predicted_label}`")

                            # Display probabilities if available
                            if prediction_proba is not None:
                                st.markdown("**Class Probabilities:**")

                                # Load encoder to get class names
                                encoder = load_artifacts("artifacts", "encoder.pkl")
                                if encoder is not None:
                                    classes = encoder.classes_
                                    if len(classes) != len(prediction_proba):
                                        # encoder.pkl is rewritten whenever a dataset is
                                        # processed, so it may no longer belong to this
                                        # model; building the table would raise.
                                        st.warning("⚠️ Class probabilities cannot be shown: the saved label encoder does not match this model.")
                                    else:
                                        prob_df = pd.DataFrame({
                                            'Class': classes,
                                            'Probability': prediction_proba
                                        }).sort_values('Probability', ascending=False)

                                        col1, col2 = st.columns(2)
                                        with col1:
                                            st.bar_chart(prob_df.set_index('Class'))
                                        with col2:
                                            st.dataframe(prob_df, use_container_width=True)
                else:
                    st.warning("⚠️ Please enter some text to classify")
        else:
            st.warning("⚠️ No trained models found. Please train a model first.")
    else:
        st.warning("⚠️ No trained models found. Please go to 'Train Model' section to train a model first.")

    # Option to classify multiple texts
    st.markdown("---")
    st.subheader("📊 Batch Predictions")

    uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'], key="batch_upload")

    if uploaded_file is not None:
        try:
            batch_df = safe_read_csv(uploaded_file)

            if batch_df is not None:
                st.write("**Uploaded data preview:**")
                st.write(batch_df.head())

                # Select text column
                text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())

                # Same availability checks (and warnings) as the single-text
                # flow above, so an empty model list is never offered.
                if os.path.exists("models") and os.listdir("models"):
                    available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]

                    if available_models:
                        batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")

                        if st.button("🚀 Run Batch Predictions", key="batch_predict", type="primary"):
                            with st.spinner("Processing batch predictions..."):
                                predictions = []
                                progress_bar = st.progress(0)

                                # Loop invariants hoisted out of the per-row loop
                                total_rows = len(batch_df)
                                batch_vectorizer_type = st.session_state.get('vectorizer_type', 'tfidf')

                                for idx, text in enumerate(batch_df[text_column]):
                                    # str() so non-string cells (numbers, NaN) can be cleaned
                                    pred, _ = predict_text(
                                        batch_model,
                                        str(text),
                                        batch_vectorizer_type
                                    )
                                    # A failed row is marked instead of aborting the batch
                                    predictions.append(pred if pred is not None else "Error")
                                    progress_bar.progress((idx + 1) / total_rows)

                                batch_df['Predicted_Class'] = predictions

                                st.success("✅ Batch predictions completed!")
                                st.write("**Results:**")
                                st.write(batch_df[[text_column, 'Predicted_Class']])

                                # Download results
                                csv = batch_df.to_csv(index=False)
                                st.download_button(
                                    label="📥 Download predictions as CSV",
                                    data=csv,
                                    file_name="batch_predictions.csv",
                                    mime="text/csv"
                                )
                    else:
                        st.warning("⚠️ No trained models found. Please train a model first.")
                else:
                    st.warning("⚠️ No trained models found. Please go to 'Train Model' section to train a model first.")

        except Exception as e:
            st.error(f"Error in batch prediction: {str(e)}")