VeuReu committed on
Commit
bc0dfdc
·
verified ·
1 Parent(s): 24e9ba3

Upload 2 files

Browse files
Files changed (1) hide show
  1. vision_tools.py +8 -6
vision_tools.py CHANGED
@@ -39,7 +39,7 @@ from scenedetect.detectors import ContentDetector
39
 
40
  import os, base64, requests, subprocess, contextlib, time
41
 
42
- from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
43
  from PIL import Image
44
 
45
  from libs.audio_tools_ana_2 import process_audio_for_video
@@ -261,11 +261,13 @@ def describe_montage_sequence(
261
  path_model = "BSC-LT/salamandra-7b-vision"
262
 
263
  processor = AutoProcessor.from_pretrained(path_model)
264
- model = LlavaOnevisionForConditionalGeneration.from_pretrained(
 
 
265
  path_model,
266
- torch_dtype=torch.float16,
267
  low_cpu_mem_usage=True
268
- ).to("cuda")
269
 
270
  img = cv2.imread(montage_path, cv2.IMREAD_COLOR)
271
  if img is None:
@@ -305,9 +307,9 @@ def describe_montage_sequence(
305
  inputs = processor(images=batch, text=prompt_batch, return_tensors="pt")
306
  for k, v in inputs.items():
307
  if v.dtype.is_floating_point:
308
- inputs[k] = v.to("cuda", torch.float16)
309
  else:
310
- inputs[k] = v.to("cuda")
311
 
312
  output = model.generate(**inputs, max_new_tokens=1024)
313
  text = processor.decode(output[0], skip_special_tokens=True)
 
39
 
40
  import os, base64, requests, subprocess, contextlib, time
41
 
42
+ from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
43
  from PIL import Image
44
 
45
  from libs.audio_tools_ana_2 import process_audio_for_video
 
261
  path_model = "BSC-LT/salamandra-7b-vision"
262
 
263
  processor = AutoProcessor.from_pretrained(path_model)
264
+ device = "cuda" if torch.cuda.is_available() else "cpu"
265
+ dtype = torch.float16 if device == "cuda" else torch.float32
266
+ model = LlavaOnevisionForConditionalGeneration.from_pretrained(
267
  path_model,
268
+ torch_dtype=dtype,
269
  low_cpu_mem_usage=True
270
+ ).to(device)
271
 
272
  img = cv2.imread(montage_path, cv2.IMREAD_COLOR)
273
  if img is None:
 
307
  inputs = processor(images=batch, text=prompt_batch, return_tensors="pt")
308
  for k, v in inputs.items():
309
  if v.dtype.is_floating_point:
310
+ inputs[k] = v.to(device, dtype)
311
  else:
312
+ inputs[k] = v.to(device)
313
 
314
  output = model.generate(**inputs, max_new_tokens=1024)
315
  text = processor.decode(output[0], skip_special_tokens=True)