Spaces:

WaysAheadGlobal
/

Blip

Build error

App Files Files Community

WaysAheadGlobal committed on Jun 28

Commit

ef81d40

verified ·

1 Parent(s): 4265501

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -52

app.py CHANGED Viewed

@@ -1,62 +1,60 @@
 # app.py
 import gradio as gr
 import torch
-import cv2
 from PIL import Image
-from transformers import LlavaProcessor, LlavaForConditionalGeneration
-# Load LLaVA model (MiniGPT-4 style)
-model_id = "llava-hf/llava-1.5-7b-hf"
-processor = LlavaProcessor.from_pretrained(model_id)
-model = LlavaForConditionalGeneration.from_pretrained(model_id)
 device = torch.device("cpu")
 model.to(device)
-# Function: read webcam, yield frame + LLaVA caption every few seconds
-def webcam_llava():
-    cap = cv2.VideoCapture(0)
-    if not cap.isOpened():
-        raise RuntimeError("Webcam could not be opened.")
-    while True:
-        ret, frame = cap.read()
-        if not ret:
-            break
-        # Convert BGR to RGB PIL
-        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        pil_image = Image.fromarray(rgb_frame)
-        # --- Compose prompt for LLaVA ---
-        prompt = "<image>\nUSER: Describe this scene in detail.\nASSISTANT:"
-        inputs = processor(prompt, pil_image, return_tensors="pt").to(device)
-        # Generate
-        output = model.generate(**inputs, max_new_tokens=200)
-        caption = processor.decode(output[0], skip_special_tokens=True)
-        # Yield current frame + caption
-        yield rgb_frame, caption
-        # Wait before next frame (adjust as needed)
-        cv2.waitKey(10000)  # 10 seconds for CPU safety
-    cap.release()
-# Gradio app
-with gr.Blocks() as demo:
-    gr.Markdown("# 🎥 LLaVA MiniGPT-4 Webcam Captioning\n_(CPU, slow but descriptive)_")
-    webcam_display = gr.Image(label="Live Webcam")
-    description = gr.Textbox(label="LLaVA Caption")
-    demo.load(
-        fn=webcam_llava,
-        inputs=None,
-        outputs=[webcam_display, description],
-        every=1
-    )
-demo.launch()

 # app.py
 import gradio as gr
+from tinyllava.model.builder import load_pretrained_model
+from tinyllava.utils import disable_torch_init
+from tinyllava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path
 import torch
 from PIL import Image
+# --- Call tinyllava's disable_torch_init() before the model is built ---
+disable_torch_init()
+# --- Load TinyLLaVA 3.1B: tokenizer, model, image processor, context length ---
+model_path = "bczhou/TinyLLaVA-3.1B"  # model ID passed to load_pretrained_model
+tokenizer, model, image_processor, context_len = load_pretrained_model(
+    model_path=model_path,
+    model_base=None,  # no separate base checkpoint is supplied here
+    model_name="TinyLLaVA-3.1B"
+)
 device = torch.device("cpu")
 model.to(device)
+# --- Gradio handler ---
+def describe_image(image, prompt):
+    # TinyLLaVA wants PIL
+    image = Image.fromarray(image)
+    image_tensor = process_images([image], image_processor, model.config)
+    image_tensor = image_tensor.to(device)
+    prompt = tokenizer_image_token(prompt, tokenizer, context_len)
+    inputs = tokenizer([prompt])
+    input_ids = torch.tensor(inputs.input_ids).unsqueeze(0).to(device)
+    with torch.no_grad():
+        output_ids = model.generate(
+            input_ids,
+            images=image_tensor,
+            do_sample=True,
+            temperature=0.2,
+            max_new_tokens=200
+        )
+    out_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+    return out_text
+iface = gr.Interface(
+    fn=describe_image,
+    inputs=[
+        gr.Image(type="numpy", label="Image"),
+        gr.Textbox(label="Your question", placeholder="What's happening in this image?")
+    ],
+    outputs=gr.Textbox(label="TinyLLaVA Answer"),
+    title="🦙 TinyLLaVA-3.1B — Vision-Language Q&A",
+    description="A lightweight LLaVA variant that runs on CPU Spaces. Upload an image, ask a question."
+)
+if __name__ == "__main__":
+    iface.launch()