Spaces:
Sleeping
Sleeping
File size: 1,766 Bytes
d78fda0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# app.py
import streamlit as st
from PIL import Image
import torch
# Import TinyLLaVA modules (use local copy!)
from tinyllava.model.builder import load_pretrained_model
from tinyllava.utils import disable_torch_init
from tinyllava.mm_utils import (
process_images,
tokenizer_image_token,
get_model_name_from_path
)
# Disable torch default init for speed
disable_torch_init()

# Load TinyLLaVA 3.1B
MODEL_PATH = "bczhou/TinyLLaVA-3.1B"

# Inference runs on CPU.
device = torch.device("cpu")


# Streamlit re-executes this script from the top on every widget interaction.
# Loading the checkpoint at module level therefore reloaded the full model on
# each rerun; st.cache_resource builds it once per server process and hands the
# same objects back afterwards.
# show_spinner=False keeps the cached call from emitting a Streamlit element
# before st.set_page_config() runs further down the script.
@st.cache_resource(show_spinner=False)
def _load_model():
    """Load the TinyLLaVA checkpoint once and move the model to ``device``.

    Returns:
        The ``(tokenizer, model, image_processor, context_len)`` tuple produced
        by ``load_pretrained_model`` for ``MODEL_PATH``.
    """
    loaded_tokenizer, loaded_model, loaded_processor, loaded_context_len = (
        load_pretrained_model(
            model_path=MODEL_PATH,
            model_base=None,
            model_name="TinyLLaVA-3.1B",
        )
    )
    loaded_model.to(device)
    return loaded_tokenizer, loaded_model, loaded_processor, loaded_context_len


# Module-level names used by the rest of the script.
tokenizer, model, image_processor, context_len = _load_model()
# Streamlit UI: page configuration, title, and the two inputs (image + question)
# consumed by the inference step below.
st.set_page_config(page_title="TinyLLaVA 3.1B (Streamlit)", layout="centered")
# The title literal was mojibake (UTF-8 bytes read through a single-byte Greek
# code page). It is written with escapes so it survives any file re-encoding.
# NOTE(review): the llama emoji (U+1F999) and em dash (U+2014) are the most
# plausible originals given the surviving lead bytes - confirm with the author.
st.title("\U0001F999 TinyLLaVA 3.1B \u2014 Vision-Language Q&A")
uploaded_file = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])
prompt = st.text_input("Ask a question about the image:")
# Answer the question once both an image and a non-empty prompt are available.
if uploaded_file is not None and prompt:
    image = Image.open(uploaded_file).convert("RGB")

    # Process image
    image_tensor = process_images([image], image_processor, model.config)
    image_tensor = image_tensor.to(device)

    # Process prompt.
    # NOTE(review): this relies on the LLaVA-style helper signature
    #   tokenizer_image_token(prompt, tokenizer, image_token_index=..., return_tensors=None)
    # which splits the text on the literal "<image>" placeholder and returns
    # token ids - confirm against the local tinyllava copy.
    # The previous code passed context_len in the image_token_index slot, ran
    # the returned ids through the tokenizer a second time, and added an extra
    # batch dimension on top of the one the tokenizer call produced.
    image_prompt = prompt if "<image>" in prompt else "<image>\n" + prompt
    # NOTE(review): the upstream eval script additionally wraps the question
    # in a conversation template; the raw question is sent here, as before.
    input_ids = (
        tokenizer_image_token(image_prompt, tokenizer, return_tensors="pt")
        .unsqueeze(0)  # (seq,) -> (1, seq): generate() expects a batch axis
        .to(device)
    )

    # Run inference without autograd bookkeeping.
    with st.spinner("Generating answer..."):
        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=image_tensor,
                do_sample=True,
                temperature=0.2,
                max_new_tokens=200,
            )

    # Depending on the model version, generate() may echo the prompt ids
    # (including the negative image sentinel, which cannot be decoded) ahead
    # of the answer. Drop that prefix only when it is actually present.
    prompt_len = input_ids.shape[1]
    generated = output_ids[0]
    if generated.shape[0] >= prompt_len and torch.equal(
        generated[:prompt_len], input_ids[0]
    ):
        generated = generated[prompt_len:]
    out_text = tokenizer.decode(generated, skip_special_tokens=True).strip()

    st.subheader("Answer:")
    st.write(out_text)
|