Spaces:

WaysAheadGlobal
/

VLM

Sleeping

App Files Community

WaysAheadGlobal committed on Jun 28

Commit

9aa78bc

verified ·

1 Parent(s): d8f06bf

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -9

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import streamlit as st
 from PIL import Image
 import torch
-# Import TinyLLaVA modules (use local copy!)
 from tinyllava.model.builder import load_pretrained_model
 from tinyllava.utils import disable_torch_init
 from tinyllava.mm_utils import (
@@ -13,11 +13,13 @@ from tinyllava.mm_utils import (
     get_model_name_from_path
 )
-# Disable torch default init for speed
 disable_torch_init()
-# Load TinyLLaVA 3.1B
 MODEL_PATH = "bczhou/TinyLLaVA-3.1B"
 tokenizer, model, image_processor, context_len = load_pretrained_model(
     model_path=MODEL_PATH,
     model_base=None,
@@ -31,9 +33,8 @@ model.to(device)
 st.set_page_config(page_title="TinyLLaVA 3.1B (Streamlit)", layout="centered")
 st.title("🦙 TinyLLaVA 3.1B — Vision-Language Q&A")
-uploaded_file = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])
-prompt = st.text_input("Ask a question about the image:")
 if uploaded_file is not None and prompt:
     image = Image.open(uploaded_file).convert("RGB")
@@ -42,12 +43,12 @@ if uploaded_file is not None and prompt:
     image_tensor = process_images([image], image_processor, model.config)
     image_tensor = image_tensor.to(device)
-    # Process prompt
     prompt_text = tokenizer_image_token(prompt, tokenizer, context_len)
     inputs = tokenizer([prompt_text])
     input_ids = torch.tensor(inputs.input_ids).unsqueeze(0).to(device)
-    # Run inference
     with st.spinner("Generating answer..."):
         output_ids = model.generate(
             input_ids,
@@ -58,5 +59,5 @@ if uploaded_file is not None and prompt:
         )
         out_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-    st.subheader("Answer:")
     st.write(out_text)

 from PIL import Image
 import torch
+# ✅ Local TinyLLaVA from real LLaVA repo
 from tinyllava.model.builder import load_pretrained_model
 from tinyllava.utils import disable_torch_init
 from tinyllava.mm_utils import (
     get_model_name_from_path
 )
+# Disable torch default init for faster startup
 disable_torch_init()
+# Load TinyLLaVA 3.1B (best small version)
 MODEL_PATH = "bczhou/TinyLLaVA-3.1B"
+# Loads tokenizer, model, image processor, context length
 tokenizer, model, image_processor, context_len = load_pretrained_model(
     model_path=MODEL_PATH,
     model_base=None,
 st.set_page_config(page_title="TinyLLaVA 3.1B (Streamlit)", layout="centered")
 st.title("🦙 TinyLLaVA 3.1B — Vision-Language Q&A")
+uploaded_file = st.file_uploader("📷 Upload an image", type=["jpg", "png", "jpeg"])
+prompt = st.text_input("💬 Ask a question about the image:")
 if uploaded_file is not None and prompt:
     image = Image.open(uploaded_file).convert("RGB")
     image_tensor = process_images([image], image_processor, model.config)
     image_tensor = image_tensor.to(device)
+    # Build prompt with image tokens
     prompt_text = tokenizer_image_token(prompt, tokenizer, context_len)
     inputs = tokenizer([prompt_text])
     input_ids = torch.tensor(inputs.input_ids).unsqueeze(0).to(device)
+    # Generate
     with st.spinner("Generating answer..."):
         output_ids = model.generate(
             input_ids,
         )
         out_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+    st.subheader("📝 Answer:")
     st.write(out_text)