import gradio as gr
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import plotly.express as px
import plotly.graph_objects as go
import warnings
from sklearn.metrics import precision_score, recall_score, f1_score

# Silences every warning for the rest of the process (library warnings
# raised during training and evaluation below are hidden as well).
warnings.filterwarnings("ignore")

# Load the tone dataset through the Hugging Face `datasets` loader.
print("Loading dataset...")
ds = load_dataset("uhoui/text-tone-classifier")

# Optional: download csv (colab)
# df = ds['train'].to_pandas()
# df.to_csv("text_tone_classifier.csv", index=False)

df = pd.DataFrame(ds["train"])

# Log the dataset size, its columns and how many rows each label has.
print(f"Dataset size: {len(df)} entries")
print(f"Columns: {df.columns}")

label_counts = df['label'].value_counts()
print("\nClass distribution:")
print(label_counts)

# Encode the labels as integer ids; `label_encoder` is kept so predictions
# can be mapped back to the original label values later.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])
num_classes = len(label_encoder.classes_)
# Log the learned classes themselves; printing the encoder object only
# shows its repr, which says nothing about the labels.
print(f"Label classes: {list(label_encoder.classes_)}")
print(f"Number of classes: {num_classes}")

# Train/test split: hold out 20% of the rows, with a fixed seed and no
# stratification.
_split_options = {"test_size": 0.2, "random_state": 42, "stratify": None}
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label_encoded'], **_split_options
)

# TF-IDF feature extraction: the vectorizer (max_features=5000) is fitted on
# the training texts only and then reused, unchanged, for the test texts.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# SMOTE: oversample the training features to handle class imbalance.
# Falls back to the original (un-resampled) training data when the smallest
# class is too small or when resampling raises a ValueError.
print("Handling class imbalance (via SMOTE)...")
try:
    # Count samples per encoded label once; ignore labels with no samples in
    # the training split (bincount reports those as zero).
    class_counts = np.bincount(y_train)
    smallest_class_size = int(class_counts[class_counts > 0].min())
    # A class with n samples has only n - 1 other samples available as
    # neighbours, so cap k accordingly (never above 5).
    k_neighbors = min(5, smallest_class_size - 1)

    if k_neighbors > 0:
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)
        print(f"After SMOTE: {X_train_resampled.shape}")
    else:
        print("Classes too small for SMOTE, using original data.")
        X_train_resampled, y_train_resampled = X_train_tfidf, y_train
except ValueError as e:
    print(f"SMOTE error: {e}. Using original data.")
    X_train_resampled, y_train_resampled = X_train_tfidf, y_train

# Logistic regression classifier trained on the (possibly resampled)
# TF-IDF training features.
# Author's tuning notes: raising max_iter above 200 did not improve anything;
# do not set C low -- keep it at 100 or more, 200 worked better.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument -- confirm against the installed version before upgrading.
model = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    C=200,
    max_iter=200,
    n_jobs=-1,
).fit(X_train_resampled, y_train_resampled)

# Evaluate the model on the held-out test split. Precision, recall and F1
# are averaged across classes with the 'weighted' setting.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Report each score itself as a percentage. (Printing `1 - score` under these
# labels would show the complement, e.g. the error rate as "Accuracy".)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")

def predict_tone(text):
    """Classify `text` and build everything the UI displays for it.

    Returns a 4-tuple:
      * the predicted label, decoded back through `label_encoder`;
      * a dict mapping every label the model was trained on to its
        probability, ordered from most to least probable;
      * a Plotly figure with a horizontal bar for each of the five most
        probable labels;
      * a list of up to three randomly sampled dataset texts that carry the
        predicted label.
    """
    features = tfidf.transform([text])
    probabilities = model.predict_proba(features)[0]

    # Probabilities are aligned with model.classes_ (encoded label ids).
    class_ids = model.classes_
    best_id = class_ids[np.argmax(probabilities)]
    pred_class = label_encoder.inverse_transform([best_id])[0]

    # Pair each decoded label with its probability, highest first.
    class_names = label_encoder.inverse_transform(class_ids)
    ranked = sorted(
        ((name, float(p)) for name, p in zip(class_names, probabilities)),
        key=lambda pair: pair[1],
        reverse=True,
    )
    sorted_results = dict(ranked)

    top_n = 5  # Top 5, adjust later if needed
    leaders = ranked[:top_n]
    top_labels = [name for name, _ in leaders]
    top_probs = [p for _, p in leaders]

    # Bar opacity follows the probability (offset by 0.3, capped at 1.0).
    colors = [f"rgba(64, 128, 255, {min(1.0, p + 0.3)})" for p in top_probs]

    bars = go.Bar(
        x=top_probs,
        y=top_labels,
        orientation='h',
        marker_color=colors,
        text=[f"{p:.1%}" for p in top_probs],
        textposition='auto',
    )
    layout_options = {
        "title": "Emotion Probability",
        "xaxis_title": "Probability",
        "yaxis_title": "Emotion",
        "height": 400,
        "margin": dict(l=20, r=20, t=40, b=20),
        "xaxis": dict(range=[0, 1]),
    }
    fig = go.Figure()
    fig.add_trace(bars)
    fig.update_layout(**layout_options)

    # Fetch examples: random dataset texts labelled with the predicted tone.
    matching_texts = df.loc[df['label'] == pred_class, 'text']
    example_texts = matching_texts.sample(min(3, len(matching_texts))).tolist()

    return pred_class, sorted_results, fig, example_texts

def get_tone_examples(tone):
    """Return up to five randomly sampled dataset texts labelled `tone`."""
    matching_texts = df.loc[df['label'] == tone, 'text']
    sample_size = min(5, len(matching_texts))
    return matching_texts.sample(sample_size).tolist()

# Gradio callback for the "Analyze Tone" button
def analyze_tone(text, selected_tone=None):
    """Produce the four UI outputs for the given text and dropdown selection.

    Args:
        text: The text to classify. May be empty, None or whitespace only.
        selected_tone: Optional tone chosen in the dropdown.

    Returns:
        A 4-tuple ``(message, probabilities, figure, examples)``:
          * no usable text and no tone selected -> a prompt message with
            empty outputs;
          * no usable text but a tone selected -> example texts for that
            tone, with no probabilities or figure;
          * otherwise -> the prediction message together with the outputs
            of ``predict_tone``.
    """
    # Treat None, "" and whitespace-only input alike: there is nothing to
    # classify.
    if not text or not text.strip():
        # The tone check must live inside the empty-text branch; testing it
        # after an unconditional early return would make it unreachable.
        if selected_tone:
            examples = get_tone_examples(selected_tone)
            return f"Examples of '{selected_tone}' tone:", {}, None, examples
        return "Enter the text to analyze:", {}, None, []

    predicted_tone, all_probs, fig, examples = predict_tone(text)

    message = f"The tone is: **{predicted_tone}**"

    return message, all_probs, fig, examples

# Gradio interface creation: builds the Blocks layout (Soft theme, blue
# primary hue) and wires the button and dropdown events to the callbacks.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    # Page title and usage hint.
    gr.Markdown("# Text Tone Sentimental Analyzer")
    gr.Markdown("Be mindful of punctuation as it affects results. Slang is unaccounted for due to dataset constraints.")
    
    # Input row: text box and button (scale 3) next to the tone dropdown
    # (scale 2).
    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Enter your text here",
                placeholder="Example: The satisfaction of completing a difficult puzzle is indescribable.",
                lines=5
            )
            analyze_button = gr.Button("Analyze Tone", variant="primary")
            
        with gr.Column(scale=2):
            # Example tones dropdown: one choice per distinct label in the
            # dataset, sorted.
            tone_dropdown = gr.Dropdown(
                choices=sorted(df['label'].unique().tolist()),
                label="Select a tone to view an example below."
            )

    # Spacer between the input row and the results.
    gr.Markdown("<br>", elem_id="line-break-1")
    
    # Result message (first value returned by analyze_tone).
    with gr.Row():
        with gr.Column(scale=1):
            result_message = gr.Markdown()
        
    # Probability chart (scale 2) next to the full probability dict (scale 1).
    with gr.Row():
        with gr.Column(scale=2):
            plot_output = gr.Plot(label="Tone Probabilities")
        with gr.Column(scale=1):
            all_probs_output = gr.JSON(label="All Probabilities")
            
    # Single-column table that receives the example texts.
    with gr.Row():
        examples_output = gr.Dataframe(
            headers=["Examples of similar texts"],
            datatype=["str"],
            label="Example texts with similar tone"
        )
    
    # Button click: analyze_tone receives the text and the dropdown value;
    # the outputs are listed in the order analyze_tone returns them
    # (message, probabilities, figure, examples).
    analyze_button.click(
        fn=analyze_tone,
        inputs=[text_input, tone_dropdown],  
        outputs=[result_message, all_probs_output, plot_output, examples_output]
    )
    
    # Dropdown change: get_tone_examples receives the selected tone and its
    # return value is sent to the examples table.
    tone_dropdown.change(
        fn=get_tone_examples,
        inputs=tone_dropdown,  
        outputs=examples_output
    )

# Main: launch the Gradio app only when this file is run as a script.
# The dataset loading and model training above are module-level statements,
# so they run on import as well.
if __name__ == "__main__":
    demo.launch()