stpete2 committed on
Commit
c7af37d
·
verified ·
1 Parent(s): a536dc5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -30
app.py CHANGED
@@ -1,29 +1,31 @@
1
  import gradio as gr
2
  import torch
3
  from PIL import Image
4
- from transformers import AutoTokenizer
5
 
6
  # =========================
7
  # Model config
8
  # =========================
9
  MODEL_ID = "vikhyatk/moondream2"
 
10
  DEVICE = "cpu"
11
 
12
  # =========================
13
- # Load model (IMPORTANT)
14
  # =========================
15
  tokenizer = AutoTokenizer.from_pretrained(
16
  MODEL_ID,
 
17
  trust_remote_code=True
18
  )
19
 
20
- model = torch.load(
21
- torch.hub.load_state_dict_from_url(
22
- f"https://huggingface.co/{MODEL_ID}/resolve/main/moondream.pt",
23
- map_location=DEVICE
24
- )
 
25
  )
26
-
27
  model.eval()
28
 
29
  # =========================
@@ -32,43 +34,61 @@ model.eval()
32
  def understand_image(image, prompt):
33
  if image is None:
34
  return "Please upload an image."
35
-
36
- image = image.convert("RGB")
37
-
38
- with torch.no_grad():
 
 
 
 
 
39
  answer = model.answer_question(
40
- image,
41
  prompt,
42
  tokenizer
43
  )
44
-
45
- return answer
46
-
 
47
 
48
  # =========================
49
  # Gradio UI
50
  # =========================
51
  with gr.Blocks() as demo:
52
- gr.Markdown("# 🌓 Moondream2 Image Understanding (Free Tier)")
53
  gr.Markdown(
54
- "⚠️ Uploaded images are processed in memory and not stored permanently."
55
  )
56
-
57
  with gr.Row():
58
- image_input = gr.Image(type="pil", label="Upload Image")
59
- text_input = gr.Textbox(
60
- label="Question",
61
- placeholder="What is in this image?"
62
- )
63
-
64
- output = gr.Textbox(label="Answer")
65
-
66
- btn = gr.Button("Run")
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  btn.click(
68
  understand_image,
69
  inputs=[image_input, text_input],
70
  outputs=output
71
  )
72
 
73
- demo.launch()
74
-
 
1
  import gradio as gr
2
  import torch
3
  from PIL import Image
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
5
 
6
  # =========================
7
  # Model config
8
  # =========================
9
  MODEL_ID = "vikhyatk/moondream2"
10
+ REVISION = "2024-08-26" # 安定版のリビジョン
11
  DEVICE = "cpu"
12
 
13
  # =========================
14
+ # Load model (FIXED)
15
  # =========================
16
  tokenizer = AutoTokenizer.from_pretrained(
17
  MODEL_ID,
18
+ revision=REVISION,
19
  trust_remote_code=True
20
  )
21
 
22
+ model = AutoModelForCausalLM.from_pretrained(
23
+ MODEL_ID,
24
+ revision=REVISION,
25
+ trust_remote_code=True,
26
+ torch_dtype=torch.float32, # CPUの場合はfloat32
27
+ device_map={"": DEVICE}
28
  )
 
29
  model.eval()
30
 
31
  # =========================
 
34
def understand_image(image, prompt):
    """Answer a free-form question about an uploaded image with Moondream2.

    Args:
        image: PIL image from the Gradio input, or None if nothing was uploaded.
        prompt: The user's question about the image.

    Returns:
        The model's answer as a string, or a human-readable message for
        missing input / runtime failures (Gradio renders the return value).
    """
    if image is None:
        return "Please upload an image."

    if not prompt or prompt.strip() == "":
        return "Please enter a question."

    try:
        image = image.convert("RGB")

        # Inference only: no_grad avoids building an autograd graph, which
        # wastes memory and time on CPU. (The pre-refactor version had this;
        # it was dropped in the rewrite — restored here.)
        with torch.no_grad():
            enc_image = model.encode_image(image)
            answer = model.answer_question(
                enc_image,
                prompt,
                tokenizer
            )
        return answer

    except Exception as e:
        # Surface the failure in the UI instead of crashing the worker.
        return f"Error: {str(e)}"
55
 
56
# =========================
# Gradio UI
# =========================
with gr.Blocks() as demo:
    # Title and CPU-latency notice shown above the controls.
    gr.Markdown("# 🌓 Moondream2 Image Understanding")
    gr.Markdown(
        "⚠️ This space runs on CPU. Processing may take a few seconds."
    )

    with gr.Row():
        # Left column: the image, the question, and the trigger button.
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            text_input = gr.Textbox(
                label="Question",
                placeholder="What is in this image?",
                value="Describe this image.",
            )
            run_button = gr.Button("Run", variant="primary")

        # Right column: the model's answer.
        with gr.Column():
            answer_box = gr.Textbox(label="Answer", lines=5)

    # Canned questions a user can click to fill the question box.
    gr.Examples(
        examples=[
            ["What objects are in this image?"],
            ["Describe the scene in detail."],
            ["What colors do you see?"],
        ],
        inputs=text_input,
    )

    run_button.click(
        understand_image,
        inputs=[image_input, text_input],
        outputs=answer_box,
    )

demo.launch()