import os
import json
import re
from typing import Any, Dict, Tuple
import torch
import gradio as gr
import spaces
from PIL import Image, ImageOps
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
# env / cache setup
# NOTE(review): these variables are assigned *after* `transformers` is imported
# above; some HF libraries read HF_HOME/TRANSFORMERS_CACHE at import time —
# confirm the cache paths actually take effect, or move them before the imports.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Reduce CUDA memory fragmentation for large-model inference.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:64"
# cache for Spaces (redirect all HF caches into /tmp)
os.environ["HF_HOME"] = "/tmp/hf"
os.environ["HF_HUB_CACHE"] = "/tmp/hf/hub"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf/transformers"
os.makedirs("/tmp/hf/hub", exist_ok=True)
os.makedirs("/tmp/hf/transformers", exist_ok=True)
# Trade a little float32 matmul precision for speed (TF32 on supported GPUs).
torch.set_float32_matmul_precision("high")
# Optional HF auth token; empty string means "no token".
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# THE MODEL Qwen3-VL
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
# Lazily populated globals; see load_model().
processor = None
model = None
def load_model() -> None:
    """Lazily initialize the module-level Qwen3-VL processor and model.

    Idempotent: returns immediately when both globals are already set, so it
    is safe to call on every request.
    """
    global processor, model
    if processor is not None and model is not None:
        return
    auth_token = HF_TOKEN or None
    print("Loading processor...")
    processor = AutoProcessor.from_pretrained(MODEL_ID, token=auth_token)
    print("Loading model...")
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        token=auth_token,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
    )
    print("Setting eval mode...")
    model.eval()
    print("Model ready")
def normalize_image(image: Image.Image) -> Image.Image:
    """Apply the EXIF orientation tag and force RGB mode.

    Ensures the model always receives an upright, 3-channel image regardless
    of how the photo was taken or encoded.
    """
    upright = ImageOps.exif_transpose(image)
    return upright.convert("RGB")
def extract_json(text: str) -> Any:
    """Best-effort extraction of a JSON value from model output.

    Strips common markdown code fences, then tries in order:
    1. parse the whole text as JSON;
    2. parse the first embedded JSON object (``{...}``);
    3. parse the first embedded JSON array (``[...]``) — the default prompt
       asks for a list, so array-shaped answers must be recoverable too.

    Args:
        text: Raw model output; ``None`` is treated as empty.

    Returns:
        The parsed JSON value (dict or list), or ``{"raw_output": text}``
        when nothing parses.
    """
    text = (text or "").strip()
    # Strip common markdown fences (```json ... ```).
    text = re.sub(r"^\s*```(?:json)?\s*", "", text, flags=re.I)
    text = re.sub(r"\s*```\s*$", "", text, flags=re.I)
    try:
        return json.loads(text)
    except ValueError:  # json.JSONDecodeError subclasses ValueError
        pass
    # Try the first embedded JSON object, then the first embedded array.
    for pattern in (r"\{.*\}", r"\[.*\]"):
        match = re.search(pattern, text, flags=re.S)
        if match:
            try:
                return json.loads(match.group(0))
            except ValueError:
                pass
    return {"raw_output": text}
# Default prompts shown in the UI; users may edit them before analyzing.
DEFAULT_SYSTEM_PROMPT = "Analyze this pantry image in detail, list all items"
# The format example must itself be valid JSON — the original
# `{["item1", "item2"]}` is unparseable and steers the model toward output
# that extract_json() cannot recover.
DEFAULT_PROMPT = """
Return only valid JSON.
List each pantry item once.
Use this format:
{"items": ["item1", "item2"]}
"""
@spaces.GPU(size="large", duration=60)
def analyze_pantry(image: Image.Image, system_prompt: str, prompt: str) -> Tuple[Image.Image, Dict[str, Any]]:
    """Run Qwen3-VL on a pantry photo and return the parsed item list.

    Args:
        image: User-uploaded photo; ``None`` when nothing was uploaded.
        system_prompt: System message steering the model.
        prompt: User instruction sent alongside the image.

    Returns:
        Tuple of (EXIF-normalized RGB image actually fed to the model,
        parsed JSON payload — or an error / raw-output dict on failure).
    """
    if image is None:
        return None, {"error": "Upload an image first."}
    load_model()
    prepared = normalize_image(image)
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": system_prompt}
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": prepared},
                {"type": "text", "text": prompt},
            ],
        },
    ]
    # Qwen3-VL official Transformers usage: the processor renders the chat
    # template and tokenizes text + image in a single call.
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)
    # (removed leftover debug print that dumped the full input tensor dict
    #  to the logs on every request)
    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=False,  # deterministic decoding for structured output
            repetition_penalty=1.1,
            no_repeat_ngram_size=3,
        )
    # Decode only the newly generated tokens (skip the echoed prompt).
    prompt_len = inputs["input_ids"].shape[-1]
    generated_text = processor.batch_decode(
        [output_ids[0][prompt_len:]],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()
    print("generated_text:", generated_text)
    parsed = extract_json(generated_text)
    # Keep the raw model text alongside structured results for debugging,
    # unless extract_json already fell back to {"raw_output": ...}.
    if isinstance(parsed, dict) and "raw_output" not in parsed:
        parsed["_raw_output"] = generated_text
    return prepared, parsed
# Gradio UI: pantry image + editable prompts in; normalized image + parsed
# JSON out. All components are wired to analyze_pantry via the button click.
with gr.Blocks() as demo:
    gr.Markdown("# Pantry Scanner")
    with gr.Row():
        image_input = gr.Image(type="pil", label="Pantry image")
        system_prompt_input = gr.Textbox(
            value=DEFAULT_SYSTEM_PROMPT,
            label="System prompt",
            lines=3,
        )
        prompt_input = gr.Textbox(
            value=DEFAULT_PROMPT,
            label="Prompt",
            lines=6,
        )
    with gr.Row():
        analyze_btn = gr.Button("Analyze", variant="primary")
    with gr.Row():
        # Displays the EXIF-normalized image actually sent to the model.
        prepared_output = gr.Image(type="pil", label="Feeding image")
        output_json = gr.JSON(label="Detected items")
    analyze_btn.click(
        analyze_pantry,
        inputs=[image_input, system_prompt_input, prompt_input],
        outputs=[prepared_output, output_json],
    )
# Bound the request queue so the GPU worker is not overwhelmed.
demo.queue(max_size=8)
demo.launch()