airzy1 committed on
Commit
489e707
·
verified ·
1 Parent(s): 31b1ba6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -22
app.py CHANGED
@@ -2,18 +2,23 @@ import os
2
  import json
3
  import re
4
 
5
- # Use persistent storage on Spaces instead of /tmp
6
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
7
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"
8
- os.environ["HF_HOME"] = "/data/.huggingface"
9
- os.environ["HF_HUB_CACHE"] = "/data/.huggingface/hub"
10
- os.environ["TRANSFORMERS_CACHE"] = "/data/.huggingface/transformers"
 
 
 
 
 
11
 
12
  import spaces
13
  import torch
14
  import gradio as gr
15
  from PIL import Image
16
  from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
 
17
 
18
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
19
  MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
@@ -31,6 +36,8 @@ def load_model():
31
  processor = AutoProcessor.from_pretrained(
32
  MODEL_ID,
33
  token=HF_TOKEN if HF_TOKEN else None,
 
 
34
  )
35
 
36
  print("loading model:", MODEL_ID)
@@ -91,7 +98,7 @@ Rules:
91
  """
92
 
93
 
94
- @spaces.GPU(size="large", duration=160)
95
  def analyze_pantry(image: Image.Image):
96
  if image is None:
97
  return {"error": "Please upload a pantry image."}
@@ -101,12 +108,7 @@ def analyze_pantry(image: Image.Image):
101
  messages = [
102
  {
103
  "role": "system",
104
- "content": [
105
- {
106
- "type": "text",
107
- "text": "You extract pantry items from photos and respond with JSON only."
108
- }
109
- ],
110
  },
111
  {
112
  "role": "user",
@@ -117,18 +119,22 @@ def analyze_pantry(image: Image.Image):
117
  },
118
  ]
119
 
120
- inputs = processor.apply_chat_template(
121
  messages,
 
122
  add_generation_prompt=True,
123
- tokenize=True,
124
- return_dict=True,
 
 
 
 
 
 
125
  return_tensors="pt",
126
  )
127
 
128
- inputs = {
129
- k: v.to(model.device) if hasattr(v, "to") else v
130
- for k, v in inputs.items()
131
- }
132
 
133
  with torch.inference_mode():
134
  output_ids = model.generate(
@@ -137,11 +143,11 @@ def analyze_pantry(image: Image.Image):
137
  do_sample=False,
138
  )
139
 
140
- prompt_len = inputs["input_ids"].shape[-1]
141
- generated_text = processor.decode(
142
- output_ids[0][prompt_len:],
143
  skip_special_tokens=True,
144
- ).strip()
 
145
 
146
  print("generated_text:", generated_text)
147
 
 
2
  import json
3
  import re
4
 
 
5
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
6
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"
7
+
8
+ # Writable cache path for Spaces WITHOUT persistent storage
9
+ os.environ["HF_HOME"] = "/tmp/hf"
10
+ os.environ["HF_HUB_CACHE"] = "/tmp/hf/hub"
11
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf/transformers"
12
+
13
+ os.makedirs("/tmp/hf/hub", exist_ok=True)
14
+ os.makedirs("/tmp/hf/transformers", exist_ok=True)
15
 
16
  import spaces
17
  import torch
18
  import gradio as gr
19
  from PIL import Image
20
  from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
21
+ from qwen_vl_utils import process_vision_info
22
 
23
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
24
  MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
 
36
  processor = AutoProcessor.from_pretrained(
37
  MODEL_ID,
38
  token=HF_TOKEN if HF_TOKEN else None,
39
+ min_pixels=256 * 28 * 28,
40
+ max_pixels=1280 * 28 * 28,
41
  )
42
 
43
  print("loading model:", MODEL_ID)
 
98
  """
99
 
100
 
101
+ @spaces.GPU(size="xlarge", duration=160)
102
  def analyze_pantry(image: Image.Image):
103
  if image is None:
104
  return {"error": "Please upload a pantry image."}
 
108
  messages = [
109
  {
110
  "role": "system",
111
+ "content": [{"type": "text", "text": "You extract pantry items from photos and respond with JSON only."}],
 
 
 
 
 
112
  },
113
  {
114
  "role": "user",
 
119
  },
120
  ]
121
 
122
+ text = processor.apply_chat_template(
123
  messages,
124
+ tokenize=False,
125
  add_generation_prompt=True,
126
+ )
127
+ image_inputs, video_inputs = process_vision_info(messages)
128
+
129
+ inputs = processor(
130
+ text=[text],
131
+ images=image_inputs,
132
+ videos=video_inputs,
133
+ padding=True,
134
  return_tensors="pt",
135
  )
136
 
137
+ inputs = {k: v.to(model.device) if hasattr(v, "to") else v for k, v in inputs.items()}
 
 
 
138
 
139
  with torch.inference_mode():
140
  output_ids = model.generate(
 
143
  do_sample=False,
144
  )
145
 
146
+ generated_text = processor.batch_decode(
147
+ [output_ids[0][inputs["input_ids"].shape[-1]:]],
 
148
  skip_special_tokens=True,
149
+ clean_up_tokenization_spaces=False,
150
+ )[0].strip()
151
 
152
  print("generated_text:", generated_text)
153