import torch
import torch.nn as nn
import torch.nn.functional as F
import gradio as gr
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModel
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import cv2

class GatedAttentionFusion(nn.Module):
    """Blend an image embedding and a text embedding into one fused vector.

    Both inputs are linearly projected to ``hidden_dim``. A sigmoid gate,
    computed from the concatenated projections, weights the image projection
    by ``g`` and the text projection by ``1 - g``. The blend is then passed
    through multi-head attention (query = key = value), added back to itself
    as a residual, and layer-normalised.

    Args:
        img_dim: Size of the last dimension of the image features.
        text_dim: Size of the last dimension of the text features.
        hidden_dim: Shared projection width and size of the fused output.
    """

    def __init__(self, img_dim=512, text_dim=768, hidden_dim=256):
        super().__init__()
        # Per-modality projections into the shared hidden space.
        self.img_proj = nn.Linear(img_dim, hidden_dim)
        self.text_proj = nn.Linear(text_dim, hidden_dim)

        # Element-wise gate in (0, 1) derived from both projections.
        self.gate = nn.Sequential(
            nn.Linear(2 * hidden_dim, hidden_dim),
            nn.Sigmoid(),
        )

        self.cross_attention = nn.MultiheadAttention(hidden_dim, 8, dropout=0.1)
        self.layer_norm = nn.LayerNorm(hidden_dim)

    def forward(self, img_feat, text_feat):
        """Fuse one batch of image and text features.

        Args:
            img_feat: Image features; assumed ``(batch, img_dim)`` -- TODO confirm
                against callers.
            text_feat: Text features; assumed ``(batch, text_dim)``.

        Returns:
            A tuple ``(fused, attention_weights)`` where ``fused`` has
            ``hidden_dim`` as its last dimension and ``attention_weights`` is
            whatever ``nn.MultiheadAttention`` returns as its second output.
        """
        visual = self.img_proj(img_feat)
        textual = self.text_proj(text_feat)

        mix = self.gate(torch.cat((visual, textual), dim=-1))
        blended = visual * mix + textual * (1 - mix)

        # A leading axis is added so the attention layer receives a 3-D input;
        # with the default (non batch-first) layout that axis is the sequence
        # axis, i.e. a sequence of length one.
        tokens = blended.unsqueeze(0)
        attended, attention_weights = self.cross_attention(tokens, tokens, tokens)

        # Residual connection around the attention, then normalisation.
        residual_sum = attended.squeeze(0) + tokens.squeeze(0)
        return self.layer_norm(residual_sum), attention_weights

class SentimentClassifier(nn.Module):
    """Classification head that maps a fused feature vector to class logits.

    The input is widened to 512 features, passed through a self-attention
    layer with a residual connection, and then reduced by an MLP to
    ``num_classes`` logits.

    Args:
        input_dim: Size of the last dimension of the incoming features.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim=256, num_classes=3):
        super().__init__()

        # Widen + normalise + regularise the incoming features.
        self.feature_enhancer = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(0.3),
        )

        self.self_attention = nn.MultiheadAttention(512, 8, dropout=0.1)

        # 512 -> 256 -> 128 -> num_classes
        self.classifier = nn.Sequential(
            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Compute logits for a batch of fused features.

        Args:
            x: Features whose last dimension is ``input_dim``; assumed
                ``(batch, input_dim)`` -- TODO confirm against callers.

        Returns:
            A tuple ``(logits, attn_weights)``; ``attn_weights`` is the second
            output of the internal ``nn.MultiheadAttention`` call.
        """
        widened = self.feature_enhancer(x)

        # The attention layer is fed a 3-D tensor with a new leading axis.
        seq = widened.unsqueeze(0)
        attended, attn_weights = self.self_attention(seq, seq, seq)

        # Residual: enhanced features plus their attended counterpart.
        combined = seq.squeeze(0) + attended.squeeze(0)
        return self.classifier(combined), attn_weights

class MovieSentimentAnalyzer:
    """Poster + review sentiment pipeline built on CLIP and BERT encoders.

    The constructor loads the pretrained ``openai/clip-vit-base-patch32`` and
    ``bert-base-uncased`` checkpoints, creates a ``GatedAttentionFusion`` and a
    ``SentimentClassifier``, moves everything to CUDA when available (CPU
    otherwise) and switches all modules to eval mode.

    NOTE(review): the fusion module and classifier are created with their
    default initialisation; this class does not load trained weights for
    them, so their outputs are only meaningful if weights are loaded elsewhere.
    """

    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Pretrained encoders.
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

        self.text_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.text_model = AutoModel.from_pretrained("bert-base-uncased")

        # Custom heads defined in this file.
        self.fusion_module = GatedAttentionFusion()
        self.classifier = SentimentClassifier()

        # Same device + inference mode for every module.
        for module in (self.clip_model, self.text_model, self.fusion_module, self.classifier):
            module.to(self.device)
            module.eval()

        # Class index -> human-readable label.
        self.labels = ['Not Recommended', 'Average', 'Highly Recommended']

    def extract_image_features(self, image):
        """Return CLIP image features for ``image``.

        Args:
            image: Anything ``CLIPProcessor`` accepts as ``images=``
                (the app passes a PIL image).

        Returns:
            The tensor produced by ``CLIPModel.get_image_features``, on
            ``self.device`` and computed without gradient tracking.
        """
        inputs = self.clip_processor(images=image, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            image_features = self.clip_model.get_image_features(**inputs)

        return image_features

    def extract_text_features(self, text):
        """Return a mean-pooled BERT embedding for ``text``.

        The tokenizer is called with truncation to 512 tokens and padding
        enabled. Pooling is weighted by the attention mask so that padded
        positions (present when several texts of different lengths are
        encoded together) do not dilute the average; for a single text this
        equals the plain mean over tokens.

        Args:
            text: A string, or whatever batch form the tokenizer accepts.

        Returns:
            Tensor with one pooled vector per input text, on ``self.device``.
        """
        inputs = self.text_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.text_model(**inputs)
            hidden = outputs.last_hidden_state

            mask = inputs.get("attention_mask")
            if mask is None:
                # No mask available: fall back to the unweighted mean.
                text_features = hidden.mean(dim=1)
            else:
                weights = mask.unsqueeze(-1).to(hidden.dtype)
                # clamp avoids a 0/0 should a row contain no real tokens.
                token_counts = weights.sum(dim=1).clamp(min=1.0)
                text_features = (hidden * weights).sum(dim=1) / token_counts

        return text_features

    def predict_sentiment(self, image, text):
        """Run the full pipeline for one poster/review pair.

        Args:
            image: Poster image forwarded to ``extract_image_features``.
            text: Review text forwarded to ``extract_text_features``.

        Returns:
            ``(probabilities, fusion_attention, classification_attention)``:
            softmax probabilities over ``self.labels`` plus the attention
            weights returned by the fusion module and by the classifier.
        """
        img_features = self.extract_image_features(image)
        text_features = self.extract_text_features(text)

        with torch.no_grad():
            fused_features, fusion_attention = self.fusion_module(img_features, text_features)
            logits, classification_attention = self.classifier(fused_features)
            probabilities = F.softmax(logits, dim=-1)

        return probabilities, fusion_attention, classification_attention

    def generate_gradcam(self, image, text):
        """Return the poster overlaid with a placeholder heatmap.

        NOTE: this is a mock. The heatmap is blurred random noise and does
        not depend on the model or on ``text`` (which is unused).

        Args:
            image: PIL image of the poster, in any mode.
            text: Unused; kept for interface compatibility.

        Returns:
            A 224x224 RGB PIL image: 60% poster, 40% JET-coloured heatmap.
        """
        # Force three channels so RGBA / greyscale posters blend with the
        # 3-channel heatmap instead of failing to broadcast.
        img_array = np.array(image.convert("RGB").resize((224, 224)))
        height, width = img_array.shape[:2]

        # Mock attention map: smoothed uniform noise, min-max scaled to [0, 1].
        attention_map = np.random.random((height, width))
        attention_map = cv2.GaussianBlur(attention_map, (21, 21), 0)
        lowest = attention_map.min()
        span = attention_map.max() - lowest
        if span > 0:
            attention_map = (attention_map - lowest) / span
        else:
            # Completely flat map: avoid dividing by zero.
            attention_map = np.zeros_like(attention_map)

        heatmap = cv2.applyColorMap(np.uint8(255 * attention_map), cv2.COLORMAP_JET)
        # OpenCV colour maps are BGR; the poster array is RGB.
        heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)

        overlay = np.uint8(0.6 * img_array + 0.4 * heatmap)

        return Image.fromarray(overlay)

    def create_attention_visualization(self, text, attention_weights):
        """Return per-word highlight intensities for ``text``.

        NOTE: this is a mock. Intensities come from random weights;
        ``attention_weights`` is accepted but not used.

        Args:
            text: Review text; split on whitespace into words.
            attention_weights: Unused; kept for interface compatibility.

        Returns:
            The string ``"No text provided"`` when ``text`` contains no words,
            otherwise a list of ``(word, intensity)`` tuples with each
            intensity in ``[0.0, 1.0]``.
        """
        words = text.split()
        if not words:
            return "No text provided"

        mock_weights = np.random.random(len(words))
        mock_weights = mock_weights / mock_weights.sum()

        # Weights sum to 1, so scale by 3 and cap at 1.0 to make them visible.
        return [
            (word, float(min(1.0, weight * 3)))
            for word, weight in zip(words, mock_weights)
        ]

# Module-level analyzer shared by analyze_movie_sentiment for every request.
# NOTE: constructing it runs MovieSentimentAnalyzer.__init__, which loads the
# pretrained CLIP and BERT checkpoints, so that cost is paid once at import time.
analyzer = MovieSentimentAnalyzer()

def _error_response(message):
    """Return the five-slot output tuple shown when no analysis is available."""
    return ({"Error": 1.0}, None, None, message, None)


def _build_confidence_figure(prob_dict):
    """Draw the label -> probability bar chart and return the figure."""
    # A standalone Figure is not registered in pyplot's global figure list,
    # so nothing accumulates across requests and no pyplot state is shared
    # between concurrently running handlers.
    from matplotlib.figure import Figure

    fig = Figure(figsize=(8, 5))
    ax = fig.subplots()
    labels = list(prob_dict.keys())
    values = list(prob_dict.values())
    colors = ['#ff6b6b', '#feca57', '#48dbfb']

    bars = ax.bar(labels, values, color=colors, alpha=0.8)
    ax.set_ylabel('Recommendation Score')
    ax.set_title('Movie Sentiment Analysis')
    ax.set_ylim(0, 1)

    # Print each value just above its bar.
    for bar, value in zip(bars, values):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height + 0.01,
                f'{value:.3f}', ha='center', va='bottom')

    fig.tight_layout()
    return fig


def analyze_movie_sentiment(image, text):
    """Gradio callback: analyse a poster/review pair and build all outputs.

    Args:
        image: Poster image from the UI, or ``None`` when nothing was uploaded.
        text: Review text from the UI; ``None``, empty and whitespace-only
            values are all treated as "no review".

    Returns:
        A 5-tuple ``(label_scores, figure, heatmap_image, explanation,
        text_highlights)``. On missing input or on any exception raised during
        analysis, the tuple is ``({"Error": 1.0}, None, None, message, None)``.
    """
    import logging

    # `not text` guards against None so .strip() is never called on it.
    if image is None or not text or not text.strip():
        return _error_response("Please upload a movie poster and enter your review.")

    try:
        probabilities, _fusion_attn, class_attn = analyzer.predict_sentiment(image, text)

        # First (only) batch row -> {label: probability}.
        prob_dict = {
            label: float(probabilities[0][i])
            for i, label in enumerate(analyzer.labels)
        }

        gradcam_image = analyzer.generate_gradcam(image, text)
        text_attention = analyzer.create_attention_visualization(text, class_attn)

        fig = _build_confidence_figure(prob_dict)

        # Generate explanation
        predicted_label = max(prob_dict, key=prob_dict.get)
        confidence = prob_dict[predicted_label]

        explanation = f"""
**Movie Analysis Results:**

🎯 **Recommendation**: {predicted_label}
⭐ **Confidence**: {confidence:.1%}

**Analysis Summary:**
The model analyzed the movie poster/image and your review text to determine the overall sentiment.
Visual elements like color scheme, composition, and textual sentiment patterns were considered.

**Score Breakdown:**
• Highly Recommended: {prob_dict['Highly Recommended']:.1%}
• Average: {prob_dict['Average']:.1%}  
• Not Recommended: {prob_dict['Not Recommended']:.1%}
        """

        return prob_dict, fig, gradcam_image, explanation, text_attention

    except Exception as e:
        # UI boundary: keep the app responsive, but record the traceback
        # instead of discarding it.
        logging.getLogger(__name__).exception("Movie sentiment analysis failed")
        return _error_response(f"Analysis error: {str(e)}")

def create_interface():
    """Assemble the Gradio Blocks UI for the movie sentiment demo.

    Layout: a header, a two-column row (inputs + recommendation label on the
    left, confidence plot + explanation on the right), a second row with the
    poster heatmap and the highlighted review text, example reviews, and a
    footer. The analyze button is wired to ``analyze_movie_sentiment``.

    Returns:
        The constructed ``gr.Blocks`` object; it is not launched here.
    """
    page_css = """
        .gradio-container {
            max-width: 1200px !important;
        }
        .main-header {
            text-align: center;
            margin-bottom: 30px;
        }
        """
    header_html = """
        <div class="main-header">
            <h1>🎬 Movie Sentiment Analysis</h1>
            <p>AI-powered analysis of movie posters and reviews for recommendation insights</p>
        </div>
        """
    example_reviews_md = """
        **Positive Example:**
        "This movie exceeded all my expectations! The visual effects were breathtaking and the storyline was incredibly engaging. Definitely worth watching!"
        
        **Negative Example:**
        "I found this film quite disappointing. The pacing was slow and the plot felt predictable. Not what I was hoping for."
        """
    footer_html = """
        <div style="text-align: center; margin-top: 40px; padding: 20px; border-top: 1px solid #ddd;">
            <p><strong>🎬 Movie Industry AI Analysis</strong></p>
            <p>Powered by CLIP + BERT with cross-modal attention for movie recommendation</p>
        </div>
        """

    with gr.Blocks(theme=gr.themes.Soft(), title="Movie Sentiment Analysis", css=page_css) as app:
        gr.HTML(header_html)

        # Row 1: inputs and label on the left, plot and explanation on the right.
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📥 Input")
                poster_in = gr.Image(type="pil", label="🎬 Upload Movie Poster", height=300)
                review_in = gr.Textbox(
                    label="📝 Movie Review",
                    placeholder="Enter your movie review or thoughts here...",
                    lines=4,
                )
                run_button = gr.Button("🔍 Analyze Movie", variant="primary", size="lg")

                gr.Markdown("### 📊 Results")
                label_out = gr.Label(label="🎯 Recommendation", num_top_classes=3)

            with gr.Column(scale=1):
                gr.Markdown("### 📈 Confidence Scores")
                plot_out = gr.Plot(label="Analysis Results")

                gr.Markdown("### 💭 Analysis Summary")
                summary_out = gr.Textbox(label="Detailed Results", lines=8, max_lines=15)

        # Row 2: visual and textual attention views.
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 🔥 Visual Attention")
                heatmap_out = gr.Image(label="Poster Analysis Heatmap", height=300)

            with gr.Column():
                gr.Markdown("### 📝 Text Attention")
                highlights_out = gr.HighlightedText(label="Key Words", combine_adjacent=True)

        # Example text suggestions
        gr.Markdown("### 🎯 Example Reviews")
        gr.Markdown(example_reviews_md)

        # Output order must match the tuple returned by analyze_movie_sentiment.
        callback_outputs = [label_out, plot_out, heatmap_out, summary_out, highlights_out]
        run_button.click(
            fn=analyze_movie_sentiment,
            inputs=[poster_in, review_in],
            outputs=callback_outputs,
        )

        gr.HTML(footer_html)

    return app

if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all interfaces at
    # port 7860 with a Gradio share link and error display enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
        "show_error": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)