import spaces, ffmpeg, os, sys, torch
import gradio as gr
from transformers import (
Qwen2_5_VLForConditionalGeneration,
AutoProcessor,
BitsAndBytesConfig,
)
from qwen_vl_utils import process_vision_info
from loguru import logger
logger.remove()
logger.add(
sys.stderr,
format="<d>{time:YYYY-MM-DD ddd HH:mm:ss}</d> | <lvl>{level}</lvl> | <lvl>{message}</lvl>",
)
# --- Installing Flash Attention for ZeroGPU is special --- #
import subprocess
subprocess.run(
"pip install flash-attn --no-build-isolation",
env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
shell=True,
)
# --- now we have Flash Attention --- #
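# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE skips compiling flash-attn's CUDA
# kernels at install time, which would otherwise fail on ZeroGPU since no GPU
# is attached while the Space is starting up.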
# Set target DEVICE and DTYPE
# For maximum memory efficiency, use bfloat16 if your GPU supports it, otherwise float16.
DTYPE = (
torch.bfloat16
if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
else torch.float16
)
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Use "auto" to let accelerate handle device placement (GPU, CPU, disk)
DEVICE = "auto"
logger.info(f"Device: {DEVICE}, dtype: {DTYPE}")
def get_fps_ffmpeg(video_path: str):
probe = ffmpeg.probe(video_path)
# Find the first video stream
video_stream = next(
(stream for stream in probe["streams"] if stream["codec_type"] == "video"), None
)
if video_stream is None:
raise ValueError("No video stream found")
# Frame rate is given as a string fraction, e.g., '30000/1001'
r_frame_rate = video_stream["r_frame_rate"]
num, denom = map(int, r_frame_rate.split("/"))
return num / denom
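# Example: an NTSC stream reports r_frame_rate "30000/1001", which this
# function returns as 30000/1001 ≈ 29.97.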
def load_model(
    model_name: str = "chancharikm/qwen2.5-vl-7b-cam-motion-preview",
    use_flash_attention: bool = True,
    apply_quantization: bool = True,
):
    # flash_attention_2 is recommended for better acceleration and memory saving,
    # especially in multi-image and video scenarios.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Load model weights in 4-bit
        bnb_4bit_quant_type="nf4",  # Use NF4 quantization (or "fp4")
        bnb_4bit_compute_dtype=DTYPE,  # Perform computations in bfloat16/float16
        bnb_4bit_use_double_quant=True,  # Optional: nested quantization for slightly more memory saving
    )
    kwargs = dict(
        torch_dtype=DTYPE,
        device_map=DEVICE,
        low_cpu_mem_usage=True,
        quantization_config=bnb_config if apply_quantization else None,
    )
    if use_flash_attention:
        kwargs["attn_implementation"] = "flash_attention_2"
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_name, **kwargs)
    # Set model to evaluation mode for inference (disables dropout, etc.)
    model.eval()
    return model
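# Rough memory math: a 7B-parameter model needs ~14 GB for weights in
# bfloat16 (2 bytes/param) but only ~3.5 GB in 4-bit NF4 (0.5 bytes/param),
# plus activations and vision-tower overhead on top.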
def load_processor(model_name="Qwen/Qwen2.5-VL-7B-Instruct"):
    # device_map / torch_dtype apply to model weights, not processors,
    # so only use_fast is passed here.
    return AutoProcessor.from_pretrained(model_name, use_fast=True)
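# Note: the processor comes from the base Qwen/Qwen2.5-VL-7B-Instruct repo,
# while the model weights come from the cam-motion fine-tune above; presumably
# the fine-tuned checkpoint reuses the base model's processor files.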
# No global model/processor: `inference` loads them per request so the UI's
# flash-attention and quantization options take effect on every call.
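# @spaces.GPU requests a ZeroGPU device for each call of the decorated function;
# duration=120 lets a single call hold the GPU for up to 120 seconds.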
@spaces.GPU(duration=120)
def inference(
video_path: str,
prompt: str = "Describe the camera motion in this video.",
use_flash_attention: bool = True,
apply_quantization: bool = True,
):
    # Load the processor and model fresh on each request so the UI's
    # flash-attention and quantization choices take effect.
    processor = load_processor()
    model = load_model(
        use_flash_attention=use_flash_attention, apply_quantization=apply_quantization
    )
    # The model was trained at 8.0 FPS, which its authors recommend for optimal
    # inference; note that this app passes the video's native FPS instead.
fps = get_fps_ffmpeg(video_path)
logger.info(f"{os.path.basename(video_path)} FPS: {fps}")
messages = [
{
"role": "user",
"content": [
{
"type": "video",
"video": video_path,
"fps": fps,
},
{"type": "text", "text": prompt},
],
}
]
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs, video_kwargs = process_vision_info(
messages, return_video_kwargs=True
)
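    # process_vision_info decodes and samples the video frames; with
    # return_video_kwargs=True it also returns processor kwargs (e.g. fps)
    # needed to align frame timing.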
# This prevents PyTorch from building the computation graph for gradients,
# saving a significant amount of memory for intermediate activations.
with torch.no_grad():
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
            **video_kwargs,  # carries the fps metadata from process_vision_info
        )
        inputs = inputs.to(model.device)
# Inference
generated_ids = model.generate(**inputs, max_new_tokens=128)
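        # generate() returns the prompt and completion concatenated, so slice off
        # the prompt tokens below and decode only the newly generated text.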
generated_ids_trimmed = [
out_ids[len(in_ids) :]
for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed,
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)
return output_text
demo = gr.Interface(
fn=inference,
inputs=[
gr.Video(label="Input Video"),
gr.Textbox(label="Prompt", value="Describe the camera motion in this video."),
gr.Checkbox(label="Use Flash Attention", value=False),
gr.Checkbox(label="Apply Quantization", value=True),
],
outputs=gr.JSON(label="Output JSON"),
title="",
api_name="video_inference",
)
demo.launch(
    mcp_server=True,
    app_kwargs={"docs_url": "/docs"},  # add FastAPI Swagger API docs
)
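
# Example client call (a sketch: assumes the Space is reachable at the default
# local URL and a recent gradio_client; "clip.mp4" is a placeholder path):
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860/")
#   result = client.predict(
#       handle_file("clip.mp4"),                      # Input Video
#       "Describe the camera motion in this video.",  # Prompt
#       False,                                        # Use Flash Attention
#       True,                                         # Apply Quantization
#       api_name="/video_inference",
#   )
#   print(result)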