File size: 1,663 Bytes
2323b4d 0b79131 2323b4d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import torch
import gradio as gr
from PIL import Image
from consts import BASE_MODEL
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from qwen_vl_utils import process_vision_info
import time
# Name of the preferred torch device: "cuda" when torch reports CUDA as
# available, otherwise "cpu".
# NOTE(review): within the visible code this value is never passed to the
# model or the inputs -- confirm whether explicit placement was intended.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained model and its matching processor once, at import time,
# from the checkpoint identified by BASE_MODEL.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    BASE_MODEL,
)
processor = AutoProcessor.from_pretrained(
    BASE_MODEL,
)
def query_local(image: Image.Image, question: str) -> str:
    """Answer a question about an image with the locally loaded model.

    Builds a single user turn containing the image and the question, renders
    it through the processor's chat template, runs ``model.generate`` and
    decodes only the tokens produced after the prompt. Progress and timing
    information is printed to stdout.

    Args:
        image: The image the question refers to.
        question: The text placed next to the image in the user turn.

    Returns:
        The decoded text for the first (and only) sequence in the batch.

    Raises:
        ValueError: If ``image`` is ``None``.
    """
    start_time = time.time()
    print(f"starting local inference at: {start_time}")

    # Explicit None check: relying on object truthiness is fragile for
    # image-like values (e.g. arrays raise on bool()).
    if image is None:
        raise ValueError("Missing image")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        }
    ]

    # Render the conversation as a prompt string (not token ids) that ends
    # with the generation prompt.
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    images, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=text,
        images=images,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Keep the input tensors on the same device as the model weights so that
    # generation works regardless of where the model was placed.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=256)
    print("inputs generated")

    # generate() returns prompt + continuation; drop the prompt prefix of
    # each sequence so only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    print("trimmed")

    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    print("decoded")
    print(f"local {time.time() - start_time} --- ")
    return output_text[0]
|