HayatoHongoEveryonesAI committed on
Commit
cf3c4b2
·
1 Parent(s): 6b1d95e
Files changed (2) hide show
  1. __pycache__/vlm_inference.cpython-310.pyc +0 -0
  2. app.py +72 -24
__pycache__/vlm_inference.cpython-310.pyc CHANGED
Binary files a/__pycache__/vlm_inference.cpython-310.pyc and b/__pycache__/vlm_inference.cpython-310.pyc differ
 
app.py CHANGED
@@ -2,7 +2,6 @@
2
  import gradio as gr
3
  import spaces
4
  import torch
5
- from PIL import Image
6
 
7
  from vlm_inference import (
8
  load_vlm_model,
@@ -13,37 +12,42 @@ from vlm_inference import (
13
  # =====================================================
14
  # Load model on CPU (ZeroGPU)
15
  # =====================================================
16
- model = load_vlm_model() # CPU load, eval
 
17
 
18
 
19
  # =====================================================
20
- # GPU inference (VLM only)
21
  # =====================================================
22
  @spaces.GPU
23
- def chat_fn(
24
- message,
25
- history,
26
  image,
 
27
  temperature,
28
  top_p,
29
  top_k,
30
  ):
31
  if image is None:
32
- return "Please upload an image."
 
33
 
34
  device = "cuda"
35
  model_gpu = model.to(device)
36
 
 
37
  image_tensor = image_processor(
38
  images=image.convert("RGB"),
39
  return_tensors="pt"
40
  )["pixel_values"].to(device)
41
 
 
42
  prompt = (
43
- f"{message}"
 
 
44
  )
45
 
46
- def stream():
47
  for chunk in vlm_infer_stream(
48
  model=model_gpu,
49
  image_tensor=image_tensor,
@@ -54,27 +58,71 @@ def chat_fn(
54
  top_k=top_k if top_k > 0 else None,
55
  ):
56
  yield chunk
57
-
58
  model_gpu.to("cpu")
59
  torch.cuda.empty_cache()
60
 
61
- return stream()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
 
64
  # =====================================================
65
- # UI
66
  # =====================================================
67
- demo = gr.ChatInterface(
68
- fn=chat_fn,
69
- multimodal=True,
70
- title="EveryonesGPT Vision (CLIP)",
71
- description="Vision-only VLM demo (CLIP ViT-L/14)",
72
- additional_inputs=[
73
- gr.Image(type="pil", label="Image"),
74
- gr.Slider(0.1, 2.0, value=0.5, step=0.05, label="Temperature"),
75
- gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p"),
76
- gr.Slider(0, 200, value=0, step=1, label="Top-k"),
77
- ],
78
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
  demo.launch()
 
2
  import gradio as gr
3
  import spaces
4
  import torch
 
5
 
6
  from vlm_inference import (
7
  load_vlm_model,
 
12
# =====================================================
# Load model on CPU (ZeroGPU)
# =====================================================
# The model is loaded once at import time and kept on CPU; under the
# ZeroGPU setup the GPU is only attached inside @spaces.GPU functions.
model = load_vlm_model()
model.eval()  # inference mode (torch eval()) — presumably a torch Module; verify in vlm_inference
17
 
18
 
19
  # =====================================================
20
+ # GPU inference (single-turn VLM)
21
  # =====================================================
22
  @spaces.GPU
23
+ def infer_once(
 
 
24
  image,
25
+ text,
26
  temperature,
27
  top_p,
28
  top_k,
29
  ):
30
  if image is None:
31
+ yield "⚠️ Please upload an image."
32
+ return
33
 
34
  device = "cuda"
35
  model_gpu = model.to(device)
36
 
37
+ # --- image tensor ---
38
  image_tensor = image_processor(
39
  images=image.convert("RGB"),
40
  return_tensors="pt"
41
  )["pixel_values"].to(device)
42
 
43
+ # --- prompt (Colabと同一) ---
44
  prompt = (
45
+ "<user>\n"
46
+ f"{text}\n"
47
+ "<assistant>\n"
48
  )
49
 
50
+ try:
51
  for chunk in vlm_infer_stream(
52
  model=model_gpu,
53
  image_tensor=image_tensor,
 
58
  top_k=top_k if top_k > 0 else None,
59
  ):
60
  yield chunk
61
+ finally:
62
  model_gpu.to("cpu")
63
  torch.cuda.empty_cache()
64
 
65
+
66
+ # =====================================================
67
+ # UI logic (history is display-only)
68
+ # =====================================================
69
# =====================================================
# UI logic (history is display-only)
# =====================================================
def submit(
    image,
    text,
    history,
    temperature,
    top_p,
    top_k,
):
    """Run one single-turn VLM inference and stream the growing answer.

    Appends ``(text, partial_answer)`` to the display-only ``history`` and
    yields updated ``(chatbot, state)`` pairs as chunks arrive from
    ``infer_once``.

    Args:
        image: PIL image from the gr.Image input (may be None; infer_once
            handles that case).
        text: prompt string from the textbox.
        history: list of (user, assistant) tuples, or None on first call.
        temperature / top_p / top_k: sampling controls forwarded verbatim.

    Yields:
        (history, history): one tuple per streamed chunk, for the
        [chatbot, state] outputs.
    """
    history = history or []
    history.append((text, ""))

    # BUGFIX: Gradio streams output only when the event handler itself is
    # a generator function. The previous version returned an *inner*
    # generator object (`return history, stream()`), which assigned the raw
    # generator to a component instead of iterating it — so nothing
    # streamed. Yield directly from this function instead.
    acc = ""
    for chunk in infer_once(image, text, temperature, top_p, top_k):
        acc += chunk
        history[-1] = (text, acc)
        yield history, history


# =====================================================
# Gradio UI
# =====================================================
with gr.Blocks(title="EveryonesGPT Vision (Single-turn)") as demo:
    gr.Markdown("## 🖼️ EveryonesGPT Vision\nSingle-turn VLM (Colab-compatible)")

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="Image")
            text_input = gr.Textbox(
                label="Prompt",
                placeholder="Describe the image or ask a question",
                lines=3,
            )

            temperature = gr.Slider(0.1, 2.0, value=0.5, step=0.05, label="Temperature")
            top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p")
            top_k = gr.Slider(0, 200, value=0, step=1, label="Top-k")

            submit_btn = gr.Button("Run")

        with gr.Column(scale=1):
            chatbot = gr.Chatbot(label="Output (history is display-only)")
            state = gr.State([])

    submit_btn.click(
        fn=submit,
        inputs=[
            image_input,
            text_input,
            state,
            temperature,
            top_p,
            top_k,
        ],
        # BUGFIX: was [chatbot, chatbot] — both yielded values went to the
        # same component and `state` was never written back, so the
        # declared state input stayed empty forever. Route the second
        # value to `state` so history persists across clicks.
        outputs=[chatbot, state],
    )

demo.launch()