import os

import pandas as pd
import requests
import streamlit as st
import urllib3
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Suppress SSL warnings for local development
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "data")
DATA_PATH = os.path.join(DATA_DIR, "Telco-Customer-Churn.csv")
# Direct download URL from the IBM GitHub repository
DATA_URL = (
    "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/"
    "master/data/Telco-Customer-Churn.csv"
)

CATEGORICAL_COLS = [
    "gender",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
]
NUMERIC_COLS = ["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]


def download_data_if_needed():
    """Download the Telco Customer Churn dataset from IBM's GitHub if it is missing locally."""
    if os.path.exists(DATA_PATH):
        return
    os.makedirs(DATA_DIR, exist_ok=True)
    with st.spinner("📥 Downloading Telco Customer Churn dataset from IBM (one-time only)..."):
        try:
            # verify=False works around local SSL certificate issues on some machines
            response = requests.get(DATA_URL, timeout=60, verify=False)
            response.raise_for_status()
            with open(DATA_PATH, "wb") as f:
                f.write(response.content)
            st.success("✅ Dataset downloaded successfully!")
        except Exception as e:
            st.error(f"Failed to download dataset: {e}")
            st.info(
                "**Alternative:** Download manually from "
                "https://github.com/IBM/telco-customer-churn-on-icp4d/tree/master/data "
                "and place 'Telco-Customer-Churn.csv' in the data/ folder."
            )
            raise RuntimeError(f"Could not download dataset: {e}") from e


@st.cache_data
def load_raw_data() -> pd.DataFrame:
    download_data_if_needed()
    df = pd.read_csv(DATA_PATH)
    # TotalCharges contains blank strings for brand-new customers; coerce to
    # numeric and impute the resulting NaNs with the median
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
    df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())
    df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})
    return df


@st.cache_data
def get_encoded_data() -> tuple[pd.DataFrame, dict[str, LabelEncoder]]:
    """Return the encoded DataFrame and a dict of fitted LabelEncoders (for inverse transforms)."""
    df = load_raw_data().copy()
    encoders: dict[str, LabelEncoder] = {}
    for col in CATEGORICAL_COLS:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        encoders[col] = le
    return df, encoders


@st.cache_data
def get_train_test(test_size: float = 0.2, random_state: int = 42):
    df, encoders = get_encoded_data()
    feature_cols = CATEGORICAL_COLS + NUMERIC_COLS
    X = df[feature_cols]
    y = df["Churn"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    return X_train, X_test, y_train, y_test, encoders, feature_cols

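
# NOTE: the label-encoded features above imply an arbitrary ordering between
# categories, which tree-based models tolerate; the one-hot variant below
# avoids that for linear models such as Logistic Regression. The returned
# encoders support round-trips, e.g.
# encoders["Contract"].inverse_transform([0, 1, 2]) recovers the original
# contract labels for readable display in charts.
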
@st.cache_data
def get_onehot_train_test(test_size: float = 0.2, random_state: int = 42):
    """One-hot encoded data for Logistic Regression.

    Produces the same split indices as get_train_test, because the
    test_size, random_state, and stratify target all match.
    """
    df = load_raw_data().copy()
    df_oh = pd.get_dummies(df, columns=CATEGORICAL_COLS, drop_first=True)
    feature_cols_oh = [c for c in df_oh.columns if c not in ["customerID", "Churn"]]
    X = df_oh[feature_cols_oh]
    y = df_oh["Churn"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    return X_train, X_test, y_train, y_test, feature_cols_oh


@st.cache_data
def get_scaled_train_test(test_size: float = 0.2, random_state: int = 42):
    """Return scaled features (needed for SGDClassifier, Logistic Regression, and Naive Bayes)."""
    X_train, X_test, y_train, y_test, encoders, feature_cols = get_train_test(
        test_size, random_state
    )
    scaler = StandardScaler()
    X_train_sc = pd.DataFrame(
        scaler.fit_transform(X_train), columns=feature_cols, index=X_train.index
    )
    X_test_sc = pd.DataFrame(
        scaler.transform(X_test), columns=feature_cols, index=X_test.index
    )
    return X_train_sc, X_test_sc, y_train, y_test, encoders, feature_cols, scaler
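

# --- Usage sketch ---
# A minimal, hypothetical example of how a page in the app might consume these
# loaders; the model choice (RandomForestClassifier) is an assumption, not part
# of this module. Outside a running Streamlit session the st.* calls log
# warnings instead of rendering, but the functions still execute.
if __name__ == "__main__":
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test, encoders, feature_cols = get_train_test()
    print(f"Train: {X_train.shape}, Test: {X_test.shape}")
    print(f"Churn rate (train): {y_train.mean():.3f}")

    # Label-encoded features are a reasonable fit for a tree-based baseline
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    print(f"Test accuracy: {accuracy_score(y_test, model.predict(X_test)):.3f}")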