chenjoya committed (verified)
Commit 71585cc · 1 Parent(s): 57c8d4e

Update README.md

Files changed (1)
  1. README.md +0 -102
README.md CHANGED
@@ -202,108 +202,6 @@ for t in range(31):
      t += 1
  ```
 
- Here is a code snippet showing how to do **common video (multi-turn) QA** with `transformers` and the above utils:
- ```python
- import functools, torch
- from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl
- apply_liger_kernel_to_qwen2_vl() # important. our model is trained with this. keep consistency
- from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, LogitsProcessor, logging
- from livecc_utils import prepare_multiturn_multimodal_inputs_for_generation, get_smart_resized_clip, get_smart_resized_video_reader
- from qwen_vl_utils import process_vision_info
-
- class LiveCCDemoInfer:
-     fps = 2
-     initial_fps_frames = 6
-     streaming_fps_frames = 2
-     initial_time_interval = initial_fps_frames / fps
-     streaming_time_interval = streaming_fps_frames / fps
-     frame_time_interval = 1 / fps
-
-     def __init__(self, model_path: str = None, device: str = 'cuda'):
-         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
-             model_path, torch_dtype="auto",
-             device_map=device,
-             attn_implementation='flash_attention_2'
-         )
-         self.processor = AutoProcessor.from_pretrained(model_path, use_fast=False)
-         self.streaming_eos_token_id = self.processor.tokenizer(' ...').input_ids[-1]
-         self.model.prepare_inputs_for_generation = functools.partial(prepare_multiturn_multimodal_inputs_for_generation, self.model)
-         message = {
-             "role": "user",
-             "content": [
-                 {"type": "text", "text": 'livecc'},
-             ]
-         }
-         texts = self.processor.apply_chat_template([message], tokenize=False)
-         self.system_prompt_offset = texts.index('<|im_start|>user')
-
-     def video_qa(
-         self,
-         message: str,
-         state: dict,
-         do_sample: bool = False,
-         repetition_penalty: float = 1.05,
-         **kwargs,
-     ):
-         """
-         state: dict, (maybe) with keys:
-             video_path: str, video path
-             video_timestamp: float, current video timestamp
-             last_timestamp: float, last processed video timestamp
-             last_video_pts_index: int, last processed video frame index
-             video_pts: np.ndarray, video pts
-             last_history: list, last processed history
-             past_key_values: llm past_key_values
-             past_ids: past generated ids
-         """
-         video_path = state.get('video_path', None)
-         conversation = []
-         past_ids = state.get('past_ids', None)
-         content = [{"type": "text", "text": message}]
-         if past_ids is None and video_path: # only use once
-             content.insert(0, {"type": "video", "video": video_path})
-         conversation.append({"role": "user", "content": content})
-         image_inputs, video_inputs = process_vision_info(conversation)
-         texts = self.processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True, return_tensors='pt')
-         if past_ids is not None:
-             texts = '<|im_end|>\n' + texts[self.system_prompt_offset:]
-         inputs = self.processor(
-             text=texts,
-             images=image_inputs,
-             videos=video_inputs,
-             return_tensors="pt",
-             return_attention_mask=False
-         )
-         inputs.to(self.model.device)
-         if past_ids is not None:
-             inputs['input_ids'] = torch.cat([past_ids, inputs.input_ids], dim=1)
-         outputs = self.model.generate(
-             **inputs, past_key_values=state.get('past_key_values', None),
-             return_dict_in_generate=True, do_sample=do_sample,
-             repetition_penalty=repetition_penalty,
-             max_new_tokens=512,
-         )
-         state['past_key_values'] = outputs.past_key_values
-         state['past_ids'] = outputs.sequences[:, :-1]
-         response = self.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True)
-         return response, state
-
- model_path = 'chenjoya/LiveCC-7B-Base'
- # download a test video at: https://github.com/showlab/livecc/blob/main/demo/sources/howto_fix_laptop_mute_1080p.mp4
- video_path = "demo/sources/howto_fix_laptop_mute_1080p.mp4"
-
- infer = LiveCCDemoInfer(model_path=model_path)
- state = {'video_path': video_path}
- # first round
- query1 = 'What is the video?'
- response1, state = infer.video_qa(message=query1, state=state)
- print(f'Q1: {query1}\nA1: {response1}')
- # second round
- query2 = 'How do you know that?'
- response2, state = infer.video_qa(message=query2, state=state)
- print(f'Q2: {query2}\nA2: {response2}')
- ```
-
  ## Limitations
 
  - This model has only undergone video-ASR streaming pre-training, so it may not perform well on common video QA.
 