import gradio as gr
import os
from huggingface_hub import InferenceClient
import tempfile
import shutil
from pathlib import Path
from typing import Optional
import time

# -------------------------
# Utilities
# -------------------------

def cleanup_temp_files(max_age_seconds: float = 300):
    """Delete stale ``*.mp4`` files from the system temporary directory.

    Only files directly inside ``tempfile.gettempdir()`` are examined (the
    glob is not recursive). Every ``.mp4`` found there is a candidate, not
    just files written by this app.

    Cleanup is best-effort: a file that cannot be inspected or removed is
    skipped, and any other failure is printed rather than raised.

    Args:
        max_age_seconds: Files whose modification time is more than this many
            seconds in the past are removed. Defaults to 300 (five minutes).
    """
    try:
        # Compute the threshold once so every file is judged against the same instant.
        cutoff = time.time() - max_age_seconds
        for file_path in Path(tempfile.gettempdir()).glob("*.mp4"):
            try:
                if file_path.stat().st_mtime < cutoff:
                    file_path.unlink(missing_ok=True)
            except OSError:
                # The file vanished, is locked, or is not ours to delete; keep going.
                continue
    except Exception as e:
        print(f"Cleanup error: {e}")

def _client_from_token(token: Optional[str]) -> InferenceClient:
    """Build a fal-ai ``InferenceClient`` authenticated with the given token.

    Args:
        token: The signed-in user's OAuth access token.

    Returns:
        An ``InferenceClient`` using provider ``fal-ai`` and ``token`` as API key.

    Raises:
        gr.Error: If ``token`` is ``None`` or empty.
    """
    if token:
        # bill_to is deliberately left unset: with a user OAuth token the
        # intent is that the user is billed, not Hugging Face.
        return InferenceClient(provider="fal-ai", api_key=token)
    raise gr.Error("Please sign in first. This app requires your Hugging Face login.")

def _save_bytes_as_temp_mp4(data: bytes) -> str:
    """Save video bytes to temporary MP4 file"""
    temp_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    try:
        temp_file.write(data)
        temp_file.flush()
        return temp_file.name
    finally:
        temp_file.close()

def _read_image_bytes(image) -> bytes:
    """Return the bytes to upload for ``image``.

    A string is treated as a file path and its contents are returned
    unchanged. A PIL image is serialized as PNG. Anything else is passed to
    ``PIL.Image.fromarray`` first (presumably a numpy array - TODO confirm
    against the ``gr.Image`` ``type`` in use).
    """
    if isinstance(image, str):
        with open(image, "rb") as image_file:
            return image_file.read()

    # Imported lazily so PIL is only required on this code path.
    import io
    from PIL import Image as PILImage

    pil_image = image if isinstance(image, PILImage.Image) else PILImage.fromarray(image)
    buffer = io.BytesIO()
    pil_image.save(buffer, format="PNG")
    return buffer.getvalue()


def generate_video_with_auth(
    image,
    prompt,
    profile: gr.OAuthProfile | None,
    progress=gr.Progress(),
    oauth_token: gr.OAuthToken | None = None,
):
    """
    Generate a video from an image using the Ovi model with authentication check.

    Failures are reported through the returned status message rather than
    raised, so the UI always receives a ``(video, status)`` pair.

    Args:
        image: Input image (file path, PIL Image, or array-like accepted by
            ``PIL.Image.fromarray``)
        prompt: Text prompt describing the desired motion/animation
        profile: OAuth profile of the signed-in user; ``None`` when signed out
        progress: Gradio progress tracker
        oauth_token: OAuth token of the signed-in user. ``gr.OAuthProfile``
            only carries name/username/profile/picture, so the access token
            used to call the provider has to come from this object.

    Returns:
        Tuple of (video_path, status_message); video_path is None on failure.
    """
    try:
        # Check authentication: both the profile and a usable token are required.
        access_token = oauth_token.token if oauth_token is not None else None
        if profile is None or not access_token:
            return None, "❌ Sign in with Hugging Face to continue. This app uses your inference provider credits."

        if image is None:
            return None, "❌ Please upload an image first!"

        if not prompt or prompt.strip() == "":
            return None, "❌ Please enter a prompt describing the desired motion!"

        progress(0.2, desc="Processing image...")

        cleanup_temp_files()

        input_image = _read_image_bytes(image)

        progress(0.4, desc="Generating video with AI...")

        # Use the shared helper so the user's own token (never bill_to) is
        # applied the same way everywhere.
        client = _client_from_token(access_token)

        # Generate video using the inference client
        try:
            video = client.image_to_video(
                input_image,
                prompt=prompt,
                model="chetwinlow1/Ovi",
            )
        except Exception as e:
            # Duck-type on the attached response instead of importing a specific
            # HTTP library: an import failure here would mask the real error.
            status_code = getattr(getattr(e, "response", None), "status_code", None)
            if status_code == 403:
                return None, "❌ Access denied by provider (403). Make sure your HF account has credits/permission for provider 'fal-ai' and model 'chetwinlow1/Ovi'."
            raise

        progress(0.9, desc="Finalizing video...")

        # Save the video to a temporary file
        video_path = _save_bytes_as_temp_mp4(video)

        progress(1.0, desc="Complete!")

        # Only add an ellipsis when the prompt was actually shortened.
        shown_prompt = prompt if len(prompt) <= 60 else f"{prompt[:60]}..."
        return video_path, f"✅ Video generated successfully! Prompt: '{shown_prompt}'"

    except gr.Error as e:
        return None, f"❌ {str(e)}"
    except Exception as e:
        return None, f"❌ Generation failed. If this keeps happening, check your provider quota or try again later. Error: {str(e)}"

def clear_all():
    """Return the empty values that reset both inputs and both outputs.

    Returns:
        A 4-tuple ``(None, "", None, "")``.
    """
    blank_media = None
    blank_text = ""
    return blank_media, blank_text, blank_media, blank_text

# Stylesheet handed to gr.Blocks(css=...) below.
# .notice, .info-box and .special-tokens-box style the gr.HTML banners;
# .status-box is attached to the status textbox through elem_classes.
# NOTE(review): .container and .header-link are not referenced by any markup
# visible in this file - confirm whether they are still needed.
custom_css = """
.container {
    max-width: 1200px;
    margin: auto;
}
.header-link {
    text-decoration: none;
    color: #2196F3;
    font-weight: bold;
}
.header-link:hover {
    text-decoration: underline;
}
.status-box {
    padding: 10px;
    border-radius: 5px;
    margin-top: 10px;
}
.notice {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 14px 16px;
    border-radius: 12px;
    margin: 18px auto 6px;
    max-width: 860px;
    text-align: center;
    font-size: 0.98rem;
}
.info-box {
    background-color: #f0f7ff;
    border-left: 4px solid #4285f4;
    padding: 1em;
    margin: 1em 0;
    border-radius: 4px;
}
.special-tokens-box {
    background: linear-gradient(135deg, #ffeaa7 0%, #fdcb6e 100%);
    padding: 1em;
    margin: 1em 0;
    border-radius: 8px;
    border-left: 4px solid #e17055;
}
"""

# Create the Gradio interface.
# Components are declared top to bottom in the order they should appear:
# header/notice, login button, tips, special-token help, the two-column
# input/output area, the "How to Use" accordion, resources, and finally the
# event wiring for the two buttons.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="Ovi Image-to-Video Generator (Paid)") as demo:

    # Header with payment notice
    gr.HTML(
        """
        <div style="text-align:center; padding:2em 1em 1em;">
            <h1 style="font-size:2.2em; margin-bottom:6px;">🎬 Ovi: Image-to-Video with Audio</h1>
            <p style="color:#777; margin:0 0 8px;">Generate synchronized video and audio from images</p>
            <div class="notice">
                <b>Heads up:</b> This is a paid app that uses <b>your</b> inference provider credits when you run generations.
                Free users get <b>$0.10 in included credits</b>. <b>PRO users</b> get <b>$2 in included credits</b> 
                and can continue using beyond that (with billing). 
                <a href='http://huggingface.co/subscribe/pro?source=ovi' target='_blank' style='color:#fff; text-decoration:underline; font-weight:bold;'>Subscribe to PRO</a> 
                for more credits. Please sign in with your Hugging Face account to continue.
            </div>
            <p style="font-size: 0.9em; color: #999; margin-top: 10px;">
                Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color:#667eea; text-decoration:underline;">anycoder</a>
            </p>
        </div>
        """
    )

    # Short tagline and link to the inference-provider documentation
    gr.Markdown(
        """
        ### Transform your static images into dynamic videos with synchronized audio using AI!
        
        Powered by **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation** via [HuggingFace Inference Providers](https://huggingface.co/docs/huggingface_hub/guides/inference)
        """
    )

    # Add login button - required for OAuth
    login_btn = gr.LoginButton("Sign in with Hugging Face")

    # Static usage tips (styled by .info-box)
    gr.HTML(
        """
        <div class="info-box">
            <strong>💡 Tips for best results:</strong>
            <ul>
                <li>Use clear, well-lit images with a single main subject</li>
                <li>Write specific prompts describing the desired motion or action</li>
                <li>Keep prompts concise and focused on movement and audio elements</li>
                <li>Processing generates 5-second videos at 24 FPS with synchronized audio</li>
                <li>Processing may take 30-60 seconds depending on server load</li>
            </ul>
        </div>
        """
    )

    # Explanation of the prompt markup tokens (styled by .special-tokens-box).
    # The angle brackets are HTML-escaped so they render literally.
    gr.HTML(
        """
        <div class="special-tokens-box">
            <strong>✨ Special Tokens for Enhanced Control:</strong>
            <ul>
                <li><strong>Speech:</strong> <code>&lt;S&gt;Your speech content here&lt;E&gt;</code> - Text enclosed in these tags will be converted to speech</li>
                <li><strong>Audio Description:</strong> <code>&lt;AUDCAP&gt;Audio description here&lt;ENDAUDCAP&gt;</code> - Describes the audio or sound effects present in the video</li>
            </ul>
            <br>
            <strong>📝 Example Prompt:</strong><br>
            <code>Dogs bark loudly at a man wearing a red shirt. The man says &lt;S&gt;Please stop barking at me!&lt;E&gt;. &lt;AUDCAP&gt;Dogs barking, angry man yelling in stern voice&lt;ENDAUDCAP&gt;.</code>
        </div>
        """
    )

    # Main working area: inputs on the left, result and model info on the right
    with gr.Row():
        with gr.Column(scale=1):
            # type="pil" means the handler receives a PIL image, not a path
            image_input = gr.Image(
                label="📸 Upload Image",
                type="pil",
                sources=["upload", "clipboard"],
                height=400,
            )

            prompt_input = gr.Textbox(
                label="✍️ Text Prompt",
                placeholder="Describe the motion and audio you want... (e.g., 'A person walking forward while talking')",
                lines=4,
                max_lines=6
            )

            with gr.Row():
                generate_btn = gr.Button(
                    "🎬 Generate Video",
                    variant="primary",
                    scale=2
                )

                clear_btn = gr.Button(
                    "🗑️ Clear",
                    variant="secondary",
                    scale=1
                )

            # Read-only field that shows the handler's status message
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                visible=True,
                elem_classes=["status-box"]
            )

            # One clickable example that fills the image and prompt inputs.
            # NOTE(review): "5.png" is a relative path - presumably shipped
            # next to this script; confirm the file exists in the deployment.
            gr.Examples(
                examples=[
                    [
                        "5.png",
                        'A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the "CHOICE FM" logo and various social media handles like "@ilovechoicefm" with "RALEIGH" below it. The man intently addresses the microphone, articulating, <S>is talent. It\'s all about authenticity. You gotta be who you really are, especially if you\'re working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>'
                    ]
                ],
                inputs=[image_input, prompt_input],
                label="Example Prompts",
            )

        with gr.Column(scale=1):
            video_output = gr.Video(
                label="🎥 Generated Video",
                height=400,
                autoplay=True,
                show_download_button=True
            )

            # Static model description and pricing summary
            gr.Markdown(
                """
                ### About Ovi Model
                
                **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation**
                
                Developed by Chetwin Low, Weimin Wang (Character AI) & Calder Katyal (Yale University)
                
                🌟 **Key Features:**
                - 🎬 **Video+Audio Generation**: Generates synchronized video and audio content simultaneously
                - 📝 **Flexible Input**: Supports text-only or text+image conditioning  
                - ⏱️ **5-second Videos**: Generates 5-second videos at 24 FPS
                - 📐 **Multiple Aspect Ratios**: Supports 720×720 area at various ratios (9:16, 16:9, 1:1, etc)
                
                Ovi is a veo-3 like model that simultaneously generates both video and audio content from text or text+image inputs.
                
                ---
                
                ### 💳 Pricing Information
                
                This app uses the Hugging Face Inference API (provider: fal-ai) which charges based on usage:
                - **Free users**: $0.10 in included credits
                - **PRO users**: $2 in included credits + ability to continue with billing
                
                [Subscribe to PRO](http://huggingface.co/subscribe/pro?source=ovi) for more credits and features!
                """
            )

    # How to Use section (collapsed by default)
    with gr.Accordion("📖 How to Use", open=False):
        gr.Markdown(
            """
            ### Getting Started:
            1. **Sign in** with your Hugging Face account using the button above
            2. **Upload** your image - any photo or illustration
            3. **Describe** the motion and audio you want in the prompt
            4. **Use special tokens** for speech and audio descriptions (optional but recommended)
            5. **Generate** and watch your image come to life with synchronized audio!
            
            ### Special Tokens Guide:
            
            **Speech Token**: `<S>text<E>`
            - Use this to add spoken dialogue to your video
            - Example: `The person says <S>Hello, how are you?<E>`
            
            **Audio Description Token**: `<AUDCAP>description<ENDAUDCAP>`
            - Use this to describe background sounds and audio effects
            - Example: `<AUDCAP>Birds chirping, gentle wind blowing<ENDAUDCAP>`
            
            ### Tips for Better Results:
            - Be specific and descriptive in your prompts
            - Combine visual motion descriptions with audio elements
            - Use high-quality input images for better results
            - Experiment with different prompts and special tokens
            - Processing takes 30-60 seconds per generation
            
            ### ⚠️ Important Notes:
            - This is a **paid app** that uses your inference provider credits
            - Each generation consumes credits based on processing time
            - Free accounts have limited credits ($0.10)
            - PRO accounts get more credits ($2) and can continue with billing
            - Videos are 5 seconds long at 24 FPS
            - Supports multiple aspect ratios (9:16, 16:9, 1:1, etc)
            """
        )

    # Footer: external links and a summary of the model/provider settings
    gr.Markdown(
        """
        ---
        
        ### 🔗 Resources
        
        - [Ovi Model Card](https://huggingface.co/chetwinlow1/Ovi)
        - [Character AI](https://character.ai)
        - [Hugging Face Inference API Docs](https://huggingface.co/docs/huggingface_hub/guides/inference)
        - [Subscribe to PRO](http://huggingface.co/subscribe/pro?source=ovi)
        
        ### 📊 Model Specifications
        
        - **Provider**: fal-ai
        - **Model**: chetwinlow1/Ovi
        - **Output**: 5-second videos at 24 FPS with audio
        - **Input**: Image + Text prompt
        - **Resolution**: 720×720 area (various aspect ratios)
        """
    )

    # Event handlers with authentication.
    # Only the image and prompt are listed as inputs; the handler's OAuth and
    # progress parameters are not wired here and are expected to be supplied
    # by Gradio from the function signature.
    # NOTE(review): queue=False takes this event off the request queue, while
    # the handler reports progress and show_progress="full" is requested.
    # Gradio's progress updates are presumably delivered through the queue -
    # confirm that queue=False is intentional for a 30-60 second job.
    generate_btn.click(
        fn=generate_video_with_auth,
        inputs=[image_input, prompt_input],
        outputs=[video_output, status_output],
        show_progress="full",
        queue=False,
        api_name=False,
        show_api=False,
    )

    # Reset image, prompt, video and status; the output order matches the
    # 4-tuple returned by clear_all.
    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[image_input, prompt_input, video_output, status_output],
        queue=False,
    )

# Script entry point: tidy up leftovers from earlier runs, then serve the UI
if __name__ == "__main__":
    try:
        cleanup_temp_files()
        # Drop any example cache left in the working directory by a previous run
        cached_examples_dir = "gradio_cached_examples"
        if os.path.exists(cached_examples_dir):
            shutil.rmtree(cached_examples_dir, ignore_errors=True)
    except Exception as cleanup_exc:
        # Startup cleanup is best-effort; never block the launch on it
        print(f"Initial cleanup error: {cleanup_exc}")

    queue_options = {
        "status_update_rate": "auto",
        "api_open": False,
        "default_concurrency_limit": None,
    }
    launch_options = {
        "show_api": False,
        "share": False,
        "show_error": True,
        "enable_monitoring": False,
        "quiet": True,
    }
    demo.queue(**queue_options)
    demo.launch(**launch_options)