Update README.md
README.md (CHANGED)
````diff
@@ -202,108 +202,6 @@ for t in range(31):
     t += 1
 ```
 
-Here we show a code snippet to show you how to do **common video (multi-turn) qa** with `transformers` and the above utils:
-```python
-import functools, torch
-from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl
-apply_liger_kernel_to_qwen2_vl() # important. our model is trained with this. keep consistency
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, LogitsProcessor, logging
-from livecc_utils import prepare_multiturn_multimodal_inputs_for_generation, get_smart_resized_clip, get_smart_resized_video_reader
-from qwen_vl_utils import process_vision_info
-
-class LiveCCDemoInfer:
-    fps = 2
-    initial_fps_frames = 6
-    streaming_fps_frames = 2
-    initial_time_interval = initial_fps_frames / fps
-    streaming_time_interval = streaming_fps_frames / fps
-    frame_time_interval = 1 / fps
-
-    def __init__(self, model_path: str = None, device: str = 'cuda'):
-        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
-            model_path, torch_dtype="auto",
-            device_map=device,
-            attn_implementation='flash_attention_2'
-        )
-        self.processor = AutoProcessor.from_pretrained(model_path, use_fast=False)
-        self.streaming_eos_token_id = self.processor.tokenizer(' ...').input_ids[-1]
-        self.model.prepare_inputs_for_generation = functools.partial(prepare_multiturn_multimodal_inputs_for_generation, self.model)
-        message = {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": 'livecc'},
-            ]
-        }
-        texts = self.processor.apply_chat_template([message], tokenize=False)
-        self.system_prompt_offset = texts.index('<|im_start|>user')
-
-    def video_qa(
-        self,
-        message: str,
-        state: dict,
-        do_sample: bool = False,
-        repetition_penalty: float = 1.05,
-        **kwargs,
-    ):
-        """
-        state: dict, (maybe) with keys:
-            video_path: str, video path
-            video_timestamp: float, current video timestamp
-            last_timestamp: float, last processed video timestamp
-            last_video_pts_index: int, last processed video frame index
-            video_pts: np.ndarray, video pts
-            last_history: list, last processed history
-            past_key_values: llm past_key_values
-            past_ids: past generated ids
-        """
-        video_path = state.get('video_path', None)
-        conversation = []
-        past_ids = state.get('past_ids', None)
-        content = [{"type": "text", "text": message}]
-        if past_ids is None and video_path: # only use once
-            content.insert(0, {"type": "video", "video": video_path})
-        conversation.append({"role": "user", "content": content})
-        image_inputs, video_inputs = process_vision_info(conversation)
-        texts = self.processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True, return_tensors='pt')
-        if past_ids is not None:
-            texts = '<|im_end|>\n' + texts[self.system_prompt_offset:]
-        inputs = self.processor(
-            text=texts,
-            images=image_inputs,
-            videos=video_inputs,
-            return_tensors="pt",
-            return_attention_mask=False
-        )
-        inputs.to(self.model.device)
-        if past_ids is not None:
-            inputs['input_ids'] = torch.cat([past_ids, inputs.input_ids], dim=1)
-        outputs = self.model.generate(
-            **inputs, past_key_values=state.get('past_key_values', None),
-            return_dict_in_generate=True, do_sample=do_sample,
-            repetition_penalty=repetition_penalty,
-            max_new_tokens=512,
-        )
-        state['past_key_values'] = outputs.past_key_values
-        state['past_ids'] = outputs.sequences[:, :-1]
-        response = self.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True)
-        return response, state
-
-model_path = 'chenjoya/LiveCC-7B-Base'
-# download a test video at: https://github.com/showlab/livecc/blob/main/demo/sources/howto_fix_laptop_mute_1080p.mp4
-video_path = "demo/sources/howto_fix_laptop_mute_1080p.mp4"
-
-infer = LiveCCDemoInfer(model_path=model_path)
-state = {'video_path': video_path}
-# first round
-query1 = 'What is the video?'
-response1, state = infer.video_qa(message=query1, state=state)
-print(f'Q1: {query1}\nA1: {response1}')
-# second round
-query2 = 'How do you know that?'
-response2, state = infer.video_qa(message=query2, state=state)
-print(f'Q2: {query2}\nA2: {response2}')
-```
-
 ## Limitations
 
 - This model is only performed video-ASR streaming pre-training, so it may not support well in common video qa.
````