# Hugging Face Space file header (web-UI residue, preserved as a comment):
# app.py — commit 5e746b8 (verified), last updated by developer0hye
import gradio as gr
import spaces
import torch
import os
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info # pip install qwen-vl-utils[decord]==0.0.8
# =============================================================================
# Qwen2.5-VL-7B-Instruct: model & processor
# =============================================================================
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
# Recommended: use FlashAttention-2 for faster, lighter attention
# (uncomment depending on the environment — requires flash-attn to be installed):
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     MODEL_ID,
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )
# Default load: dtype chosen automatically, weights placed across available devices.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)
model.eval()  # inference only — disables dropout etc.
# Image resolution is auto-adjusted (defaults used). If needed, min/max_pixels
# can be passed to bound the per-image visual-token cost.
processor = AutoProcessor.from_pretrained(MODEL_ID)
# e.g.: min_pixels = 256*28*28; max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=min_pixels, max_pixels=max_pixels)
# =============================================================================
# Inference (image-only UI, text๋Š” ์„ ํƒ)
# =============================================================================
@spaces.GPU
def qwen_vl_inference(image_path: str | None, text_input: str | None = None) -> str:
    """Answer a question about an uploaded image with Qwen2.5-VL.

    Args:
        image_path: Local filesystem path of the uploaded image (Gradio
            ``type="filepath"``), or ``None`` if nothing was uploaded.
        text_input: Optional user question. Falls back to a generic
            "Describe this image." prompt when empty or whitespace-only.

    Returns:
        The model's decoded answer, or a hint string when no image was given.
    """
    if image_path is None:
        return "Please upload an image first."
    # The official Qwen examples pass local files as file:// URIs.
    file_uri = f"file://{os.path.abspath(image_path)}"
    # Fix: a whitespace-only question previously stripped down to "" and was
    # sent as an empty prompt; now it also falls back to the default question.
    user_text = (text_input or "").strip() or "Describe this image."
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": file_uri},
                {"type": "text", "text": user_text},
            ],
        }
    ]
    # Text/vision preprocessing: render the chat template, then extract the
    # image/video inputs referenced by the messages.
    chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Move tensors to the model device (safe even with device_map="auto").
    inputs = {k: (v.to(model.device) if isinstance(v, torch.Tensor) else v) for k, v in inputs.items()}
    # Generation: inference_mode avoids autograd bookkeeping (less memory, faster).
    with torch.inference_mode():
        gen_ids = model.generate(**inputs, max_new_tokens=512)
    # Strip the prompt tokens from each sequence before decoding.
    trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], gen_ids)]
    output = processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return output
# =============================================================================
# Gradio UI (Gradio 5)
# =============================================================================
# Markdown blurb rendered at the top of the demo page.
DESCRIPTION = "[Qwen2.5-VL-7B-Instruct demo](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) — upload an image and ask anything about it."

# Fixed-height, scrollable output box (targeted via elem_id="output_text").
css = """
#output_text {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
# -----------------------------------------------------------------------------
# Gradio UI (Gradio 5): image + question on the left, model output on the right.
# NOTE(review): the original file's indentation was lost; the nesting below is
# a reconstruction — confirm the intended placement of gr.Examples.
# -----------------------------------------------------------------------------
with gr.Blocks(css=css, theme="origin") as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        with gr.Column(scale=1):
            # type="filepath" hands qwen_vl_inference a local path string.
            input_image = gr.Image(label="Upload Image", type="filepath")
            text_input = gr.Textbox(label="Question")
            submit_btn = gr.Button("Submit")
        with gr.Column(scale=1):
            # elem_id ties into the #output_text CSS block above.
            output_text = gr.Textbox(label="Model Output", elem_id="output_text")
    # cache_examples=True runs qwen_vl_inference at startup to pre-compute the
    # example output — requires "example.webp" to exist and GPU access at build time.
    gr.Examples(
        examples=[["example.webp", "Explain this image"]],
        inputs=[input_image, text_input],
        outputs=output_text,
        fn=qwen_vl_inference,
        cache_examples=True,
        label="Try an example"
    )
    submit_btn.click(qwen_vl_inference, [input_image, text_input], [output_text])

if __name__ == "__main__":
    demo.launch()