Upload 2 files
Browse files- vision_tools.py +8 -6
vision_tools.py
CHANGED
|
@@ -39,7 +39,7 @@ from scenedetect.detectors import ContentDetector
|
|
| 39 |
|
| 40 |
import os, base64, requests, subprocess, contextlib, time
|
| 41 |
|
| 42 |
-
from transformers import AutoProcessor,
|
| 43 |
from PIL import Image
|
| 44 |
|
| 45 |
from libs.audio_tools_ana_2 import process_audio_for_video
|
|
@@ -261,11 +261,13 @@ def describe_montage_sequence(
|
|
| 261 |
path_model = "BSC-LT/salamandra-7b-vision"
|
| 262 |
|
| 263 |
processor = AutoProcessor.from_pretrained(path_model)
|
| 264 |
-
|
|
|
|
|
|
|
| 265 |
path_model,
|
| 266 |
-
torch_dtype=
|
| 267 |
low_cpu_mem_usage=True
|
| 268 |
-
).to(
|
| 269 |
|
| 270 |
img = cv2.imread(montage_path, cv2.IMREAD_COLOR)
|
| 271 |
if img is None:
|
|
@@ -305,9 +307,9 @@ def describe_montage_sequence(
|
|
| 305 |
inputs = processor(images=batch, text=prompt_batch, return_tensors="pt")
|
| 306 |
for k, v in inputs.items():
|
| 307 |
if v.dtype.is_floating_point:
|
| 308 |
-
inputs[k] = v.to(
|
| 309 |
else:
|
| 310 |
-
inputs[k] = v.to(
|
| 311 |
|
| 312 |
output = model.generate(**inputs, max_new_tokens=1024)
|
| 313 |
text = processor.decode(output[0], skip_special_tokens=True)
|
|
|
|
| 39 |
|
| 40 |
import os, base64, requests, subprocess, contextlib, time
|
| 41 |
|
| 42 |
+
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
|
| 43 |
from PIL import Image
|
| 44 |
|
| 45 |
from libs.audio_tools_ana_2 import process_audio_for_video
|
|
|
|
| 261 |
path_model = "BSC-LT/salamandra-7b-vision"
|
| 262 |
|
| 263 |
processor = AutoProcessor.from_pretrained(path_model)
|
| 264 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 265 |
+
dtype = torch.float16 if device == "cuda" else torch.float32
|
| 266 |
+
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
|
| 267 |
path_model,
|
| 268 |
+
torch_dtype=dtype,
|
| 269 |
low_cpu_mem_usage=True
|
| 270 |
+
).to(device)
|
| 271 |
|
| 272 |
img = cv2.imread(montage_path, cv2.IMREAD_COLOR)
|
| 273 |
if img is None:
|
|
|
|
| 307 |
inputs = processor(images=batch, text=prompt_batch, return_tensors="pt")
|
| 308 |
for k, v in inputs.items():
|
| 309 |
if v.dtype.is_floating_point:
|
| 310 |
+
inputs[k] = v.to(device, dtype)
|
| 311 |
else:
|
| 312 |
+
inputs[k] = v.to(device)
|
| 313 |
|
| 314 |
output = model.generate(**inputs, max_new_tokens=1024)
|
| 315 |
text = processor.decode(output[0], skip_special_tokens=True)
|