EveryonesGPT_Vision_Instruct

Sleeping

App Files Files Community

HayatoHongoEveryonesAI committed on Jan 14

Commit

cf3c4b2

1 Parent(s): 6b1d95e

update

Browse files

Files changed (2) hide show

__pycache__/vlm_inference.cpython-310.pyc +0 -0
app.py +72 -24

__pycache__/vlm_inference.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/vlm_inference.cpython-310.pyc and b/__pycache__/vlm_inference.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -2,7 +2,6 @@
 import gradio as gr
 import spaces
 import torch
-from PIL import Image
 from vlm_inference import (
     load_vlm_model,
@@ -13,37 +12,42 @@ from vlm_inference import (
 # =====================================================
 # Load model on CPU (ZeroGPU)
 # =====================================================
-model = load_vlm_model()   # CPU load, eval
 # =====================================================
-# GPU inference (VLM only)
 # =====================================================
 @spaces.GPU
-def chat_fn(
-    message,
-    history,
     image,
     temperature,
     top_p,
     top_k,
 ):
     if image is None:
-        return "Please upload an image."
     device = "cuda"
     model_gpu = model.to(device)
     image_tensor = image_processor(
         images=image.convert("RGB"),
         return_tensors="pt"
     )["pixel_values"].to(device)
     prompt = (
-        f"{message}"
     )
-    def stream():
         for chunk in vlm_infer_stream(
             model=model_gpu,
             image_tensor=image_tensor,
@@ -54,27 +58,71 @@ def chat_fn(
             top_k=top_k if top_k > 0 else None,
         ):
             yield chunk
         model_gpu.to("cpu")
         torch.cuda.empty_cache()
-    return stream()
 # =====================================================
-# UI
 # =====================================================
-demo = gr.ChatInterface(
-    fn=chat_fn,
-    multimodal=True,
-    title="EveryonesGPT Vision (CLIP)",
-    description="Vision-only VLM demo (CLIP ViT-L/14)",
-    additional_inputs=[
-        gr.Image(type="pil", label="Image"),
-        gr.Slider(0.1, 2.0, value=0.5, step=0.05, label="Temperature"),
-        gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p"),
-        gr.Slider(0, 200, value=0, step=1, label="Top-k"),
-    ],
-)
 demo.launch()

 import gradio as gr
 import spaces
 import torch
 from vlm_inference import (
     load_vlm_model,
 # =====================================================
 # Load model on CPU (ZeroGPU)
 # =====================================================
+# Loaded once at import time; it is moved to CUDA only inside the
+# @spaces.GPU-decorated inference function and moved back to CPU afterwards.
+model = load_vlm_model()
+# NOTE(review): presumably a torch.nn.Module, so eval() would switch it to
+# inference mode — confirm against load_vlm_model in vlm_inference.
+model.eval()
 # =====================================================
+# GPU inference (single-turn VLM)
 # =====================================================
 @spaces.GPU
+def infer_once(
     image,
+    text,
     temperature,
     top_p,
     top_k,
 ):
+    # Single-turn streaming inference: moves the model to CUDA, preprocesses
+    # the image, wraps `text` in a <user>/<assistant> prompt, and yields each
+    # chunk produced by vlm_infer_stream.
+    # NOTE(review): this is a diff view; unchanged lines between hunks are not
+    # shown, so some keyword arguments of the vlm_infer_stream call below may
+    # be elided here.
     if image is None:
+        # This is a generator function, so the warning is yielded as the only
+        # chunk and the bare return then ends the stream.
+        yield "⚠️ Please upload an image."
+        return
     device = "cuda"
     model_gpu = model.to(device)
+    # --- image tensor ---
+    # The input is converted to RGB before preprocessing; only the
+    # "pixel_values" entry of the processor output is used.
     image_tensor = image_processor(
         images=image.convert("RGB"),
         return_tensors="pt"
     )["pixel_values"].to(device)
+    # --- prompt (identical to the Colab version) ---
     prompt = (
+        "<user>\n"
+        f"{text}\n"
+        "<assistant>\n"
     )
+    try:
         for chunk in vlm_infer_stream(
             model=model_gpu,
             image_tensor=image_tensor,
+            # A top_k of 0 (or less) is passed on as None.
             top_k=top_k if top_k > 0 else None,
         ):
             yield chunk
+    finally:
+        # Runs on normal completion, on error, and when the consumer closes
+        # the generator early, so the model is always moved back to CPU and
+        # the CUDA cache is emptied.
         model_gpu.to("cpu")
         torch.cuda.empty_cache()
+# =====================================================
+# UI logic (history is display-only)
+# =====================================================
+def submit(
+    image,
+    text,
+    history,
+    temperature,
+    top_p,
+    top_k,
+):
+    """Run one single-turn inference and stream the growing chat history.
+
+    Gradio streams updates only when the event handler itself is a generator
+    function. Returning a generator object as an output value is not
+    streamed, so this handler yields directly instead of returning a nested
+    generator.
+
+    Args:
+        image: Image forwarded unchanged to ``infer_once``.
+        text: User prompt; shown as the user side of the new chat turn.
+        history: Existing list of ``(user, assistant)`` pairs, or ``None``.
+            It is copied, never mutated.
+        temperature: Sampling temperature forwarded to ``infer_once``.
+        top_p: Top-p value forwarded to ``infer_once``.
+        top_k: Top-k value forwarded to ``infer_once``.
+
+    Yields:
+        ``(history, history)``: one value per registered output component.
+        The first item shows the prompt with an empty reply; one further
+        item follows for every chunk streamed by ``infer_once``.
+    """
+    # Work on a copy so the caller's list is not modified behind its back.
+    turns = list(history or [])
+    turns.append((text, ""))
+    # Show the user's prompt immediately, before the first chunk arrives.
+    yield turns, turns
+
+    reply = ""
+    for chunk in infer_once(image, text, temperature, top_p, top_k):
+        reply += chunk
+        turns[-1] = (text, reply)
+        yield turns, turns
 # =====================================================
+# Gradio UI
 # =====================================================
+with gr.Blocks(title="EveryonesGPT Vision (Single-turn)") as demo:
+    # Two-column layout: inputs and sampling controls on the left, the chat
+    # output on the right. Components appear in the order they are created.
+    gr.Markdown("## 🖼️ EveryonesGPT Vision\nSingle-turn VLM (Colab-compatible)")
+    with gr.Row():
+        with gr.Column(scale=1):
+            # type="pil" hands the handler a PIL image; infer_once calls
+            # image.convert("RGB") on it.
+            image_input = gr.Image(type="pil", label="Image")
+            text_input = gr.Textbox(
+                label="Prompt",
+                placeholder="Describe the image or ask a question",
+                lines=3,
+            )
+            temperature = gr.Slider(0.1, 2.0, value=0.5, step=0.05, label="Temperature")
+            top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p")
+            # The default of 0 is passed to the model as None by infer_once
+            # (its `top_k > 0` check).
+            top_k = gr.Slider(0, 200, value=0, step=1, label="Top-k")
+            submit_btn = gr.Button("Run")
+        with gr.Column(scale=1):
+            chatbot = gr.Chatbot(label="Output (history is display-only)")
+            state = gr.State([])
+    # The inputs are listed in the same order as submit's parameters:
+    # (image, text, history, temperature, top_p, top_k).
+    # NOTE(review): `chatbot` is listed twice in outputs and `state` is not an
+    # output, so nothing the handler produces is written back to `state` —
+    # confirm this is intended.
+    submit_btn.click(
+        fn=submit,
+        inputs=[
+            image_input,
+            text_input,
+            state,
+            temperature,
+            top_p,
+            top_k,
+        ],
+        outputs=[chatbot, chatbot],
+    )
+# Starts the Gradio app when this module is run.
 demo.launch()