# app.py
import streamlit as st
from PIL import Image
import torch

# Import TinyLLaVA modules (use a local clone of the TinyLLaVA repo!)
from tinyllava.model.builder import load_pretrained_model
from tinyllava.utils import disable_torch_init
from tinyllava.mm_utils import (
    process_images,
    tokenizer_image_token,
    get_model_name_from_path,
)
from tinyllava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from tinyllava.conversation import conv_templates

MODEL_PATH = "bczhou/TinyLLaVA-3.1B"
# TinyLLaVA-3.1B is built on Phi-2; its model card uses the "phi"
# conversation template.
CONV_MODE = "phi"
device = torch.device("cpu")

# Streamlit UI (set_page_config must be the first Streamlit call)
st.set_page_config(page_title="TinyLLaVA 3.1B (Streamlit)", layout="centered")
st.title("🦙 TinyLLaVA 3.1B — Vision-Language Q&A")


@st.cache_resource  # load once, not on every Streamlit rerun
def load_model():
    # Disable torch's default (slow) parameter init for faster loading
    disable_torch_init()
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        model_path=MODEL_PATH,
        model_base=None,
        model_name=get_model_name_from_path(MODEL_PATH),
    )
    # The builder loads fp16 weights; cast to fp32 for CPU inference
    model.to(device=device, dtype=torch.float32)
    model.eval()
    return tokenizer, model, image_processor, context_len


tokenizer, model, image_processor, context_len = load_model()

uploaded_file = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])
prompt = st.text_input("Ask a question about the image:")

if uploaded_file is not None and prompt:
    image = Image.open(uploaded_file).convert("RGB")

    # Preprocess the image into a model-ready tensor
    image_tensor = process_images([image], image_processor, model.config)
    image_tensor = image_tensor.to(device, dtype=torch.float32)

    # Build the prompt: the <image> placeholder must appear in the text,
    # wrapped in the model's conversation template
    conv = conv_templates[CONV_MODE].copy()
    conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + prompt)
    conv.append_message(conv.roles[1], None)
    prompt_text = conv.get_prompt()

    # tokenizer_image_token returns token ids with the <image> placeholder
    # mapped to IMAGE_TOKEN_INDEX; it is not a string to re-tokenize
    input_ids = (
        tokenizer_image_token(
            prompt_text, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
        )
        .unsqueeze(0)
        .to(device)
    )

    # Run inference
    with st.spinner("Generating answer..."):
        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=image_tensor,
                do_sample=True,
                temperature=0.2,
                max_new_tokens=200,
                use_cache=True,
            )
        # LLaVA-style generate returns only the newly generated tokens
        out_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

    st.subheader("Answer:")
    st.write(out_text)
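
# ---------------------------------------------------------------------------
# How to run (a sketch; this assumes the `tinyllava` package comes from a
# local, editable install of the TinyLLaVA repo, e.g. DLCV-BUAA/TinyLLaVABench
# on GitHub, and that app.py sits alongside it):
#
#   pip install streamlit torch pillow
#   git clone https://github.com/DLCV-BUAA/TinyLLaVABench
#   cd TinyLLaVABench && pip install -e .
#   streamlit run app.py
#
# The first run downloads the TinyLLaVA-3.1B weights from the Hugging Face
# Hub; generation on CPU is functional but slow.
# ---------------------------------------------------------------------------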