"""Streamlit app for chatting with the Gemma-3n E4B vision-language model.

Upload an image, ask a question about it, and the gated
``google/gemma-3n-E4B-it`` model (accessed via a HuggingFace token) answers.
"""

import streamlit as st
from transformers import pipeline
from PIL import Image
import torch
import os

# Point all HF caches at writable locations to avoid permission issues in
# containerized Spaces. NOTE(review): TRANSFORMERS_CACHE is deprecated in
# recent transformers versions; HF_HOME/HF_HUB_CACHE cover it — kept for
# backward compatibility with older installs.
os.environ["TRANSFORMERS_CACHE"] = "/app/cache/transformers"
os.environ["HF_HOME"] = "/app/cache/hf"
os.environ["HF_HUB_CACHE"] = "/app/cache/hf"

# The gated Gemma model requires an authenticated HuggingFace token,
# supplied via the HF_TOKEN environment variable (e.g. a Space secret).
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token

# Configure the Streamlit page before any other st.* call.
st.set_page_config(
    page_title="Gemma-3n E4B Vision-Language Model",
    page_icon="🤖",
    layout="wide"
)


@st.cache_resource
def load_model():
    """Load the image-text-to-text pipeline, cached across Streamlit reruns.

    Returns:
        The transformers pipeline, or None when the token is missing or
        loading fails (errors are surfaced in the Streamlit UI).
    """
    try:
        # The gated model cannot be downloaded without a token.
        if not hf_token:
            st.error("HF_TOKEN not found in environment variables")
            return None

        # Use the high-level pipeline API, which is more compatible than
        # wiring up processor + model manually.
        pipe = pipeline(
            "image-text-to-text",
            model="google/gemma-3n-E4B-it",
            # fp16 only makes sense on GPU; CPU inference stays in fp32.
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else "cpu",
            token=hf_token  # Pass token directly to pipeline
        )
        return pipe
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        st.error("Make sure you have access to the model and your token is valid.")
        return None


def generate_response(pipe, image, text_prompt, max_tokens=100):
    """Run the model on an (image, question) pair and return the answer text.

    Args:
        pipe: loaded image-text-to-text pipeline from :func:`load_model`.
        image: PIL.Image the question refers to.
        text_prompt: the user's question about the image.
        max_tokens: maximum number of new tokens to generate.

    Returns:
        The generated answer as a string (best effort), or an error message
        string if generation raised.
    """
    try:
        # Chat-style message format expected by image-text-to-text pipelines.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": text_prompt}
                ]
            }
        ]

        response = pipe(messages, max_new_tokens=max_tokens)

        # The pipeline returns [{'generated_text': ...}]. Depending on the
        # transformers version, 'generated_text' is either a plain string or
        # the full chat transcript (a list of role/content message dicts).
        if isinstance(response, list) and len(response) > 0:
            first = response[0]
            if isinstance(first, dict) and 'generated_text' in first:
                generated = first['generated_text']
                # Fix: when the transcript is returned, surface only the
                # assistant's final reply instead of stringifying the whole
                # conversation (which would echo the prompt back to the user).
                if isinstance(generated, list) and generated:
                    last = generated[-1]
                    if isinstance(last, dict) and 'content' in last:
                        return last['content']
                return generated
            elif isinstance(first, str):
                return first
        return str(response)
    except Exception as e:
        return f"Error generating response: {str(e)}"


def main():
    """Render the Streamlit UI: token checks, model load, input/output columns."""
    st.title("🤖 Gemma-3n E4B Vision-Language Model")
    st.markdown("Upload an image and ask questions about it!")

    # Fail fast with setup instructions if the token is missing.
    if not hf_token:
        st.error("❌ HuggingFace token not found in environment variables.")
        st.markdown("""
        **To fix this:**
        1. Go to your Space settings (⚙️ icon)
        2. Navigate to "Repository secrets"
        3. Add a secret with name: `HF_TOKEN`
        4. Value: Your HuggingFace token
        5. Restart the Space
        """)
        return
    else:
        st.success("✅ HuggingFace token found!")

    # Sidebar: setup checklist for the user.
    st.sidebar.markdown("### 📋 Setup Status")
    # Fix: plain string — the original f-string had no placeholders.
    st.sidebar.markdown("""
    ✅ **Token**: Found in environment

    Make sure you have:
    1. ✅ Access to the gated model
    2. ✅ Added your HF token to Space secrets
    3. ✅ Token has proper permissions
    """)

    # Load the model (cached by st.cache_resource, so only slow on first run).
    with st.spinner("Loading model... This may take a few minutes on first run."):
        pipe = load_model()

    if pipe is None:
        st.error("Failed to load model. Please check your setup and try again.")
        return

    st.success("Model loaded successfully!")

    # Two-column layout: inputs on the left, model output on the right.
    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("📤 Input")

        # Image upload
        uploaded_file = st.file_uploader(
            "Choose an image...",
            type=['png', 'jpg', 'jpeg', 'gif', 'bmp'],
            help="Upload an image to analyze"
        )

        # Text input
        text_prompt = st.text_area(
            "Ask a question about the image:",
            placeholder="What do you see in this image?",
            height=100
        )

        # Generation parameters
        max_tokens = st.slider(
            "Max tokens to generate:",
            min_value=10,
            max_value=200,
            value=100,
            help="Maximum number of tokens to generate"
        )

        # Generate button
        generate_btn = st.button("🚀 Generate Response", type="primary")

    with col2:
        st.subheader("📤 Output")

        if uploaded_file is not None:
            # Fix: normalize to RGB so palette/RGBA/GIF/BMP uploads (all
            # accepted by the uploader above) don't break model preprocessing.
            image = Image.open(uploaded_file).convert("RGB")
            # Fix: use_column_width is deprecated/removed in recent Streamlit.
            st.image(image, caption="Uploaded image", use_container_width=True)

            # Generate response when button is clicked
            if generate_btn:
                if not text_prompt.strip():
                    st.warning("Please enter a question about the image.")
                else:
                    with st.spinner("Generating response..."):
                        response = generate_response(
                            pipe, image, text_prompt, max_tokens
                        )
                    st.subheader("🤖 Model Response:")
                    st.write(response)
        else:
            st.info("👆 Please upload an image to get started")

    # Example section
    st.markdown("---")
    st.subheader("💡 Example Questions to Try:")
    st.markdown("""
    - What objects do you see in this image?
    - Describe the scene in detail
    - What colors are present in the image?
    - What is the main subject of this image?
    - Can you identify any text in this image?
    """)

    # Footer
    st.markdown("---")
    st.markdown(
        "Built with ❤️ using [Streamlit](https://streamlit.io) and "
        "[Hugging Face Transformers](https://huggingface.co/transformers/)"
    )


if __name__ == "__main__":
    main()