Spaces:
Running on Zero
Running on Zero
| import logging | |
| import torch | |
| from transformers import Qwen3VLForConditionalGeneration, AutoProcessor | |
| from PIL import Image | |
logger = logging.getLogger(__name__)

# Checkpoint shared by the model weights and the matching processor.
_QWEN3VL_CHECKPOINT = "Qwen/Qwen3-VL-4B-Instruct"

# Both globals stay None unless CUDA is available when this module is imported.
model = None
processor = None

if not torch.cuda.is_available():
    logger.info("Qwen3VL was not loaded because no GPU is available.")
else:
    logger.info("Loading Qwen3VL model.")
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        _QWEN3VL_CHECKPOINT,
        dtype=torch.bfloat16,
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(_QWEN3VL_CHECKPOINT)
    logger.info("Qwen3VL model loaded.")
def expand_prompt(image: Image.Image, user_prompt: str) -> str:
    """
    Expand the user prompt using the Qwen3VL model.

    The model is shown ``image`` and asked to describe the thing named by
    ``user_prompt`` with a short prompt, leaving out surrounding objects and
    the background.

    Args:
        image: The image to use for the prompt expansion.
        user_prompt: The user prompt to expand. It is interpolated verbatim
            into the instruction sent to the model.

    Returns:
        The expanded prompt: the decoded text of the newly generated tokens
        only, with special tokens skipped.

    Raises:
        RuntimeError: If the module-level ``model`` or ``processor`` is None,
            which is the case when no GPU was available at import time.
    """
    # Fail with a clear message instead of an AttributeError on None when the
    # model was never loaded.
    if model is None or processor is None:
        raise RuntimeError(
            "Qwen3VL model is not loaded (no GPU was available when the "
            "module was imported); cannot expand the prompt."
        )

    # Single-turn chat message: an image placeholder followed by the
    # instruction text. The actual image is supplied to the processor below.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": f"Describe the {user_prompt} in this image with a short prompt. Don't use surrounding objects in the description. Also don't describe the background, like what it is sitting on or what it is on top of, etc..."},
            ],
        }
    ]

    # Render the chat template to a string first; tokenization happens in the
    # processor call together with the image.
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = processor(
        text=[text],
        images=[image],
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512,
        )

    # Skip the first prompt_length positions of each sequence so that only
    # the tokens produced by generate() are decoded.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids_trimmed = generated_ids[:, prompt_length:]

    # Batch of one: take the first (only) decoded string.
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
    )[0]
    return output_text