File size: 2,554 Bytes
80ceab0
 
 
 
 
ef7643d
cf5f08b
ef7643d
80ceab0
 
ef7643d
 
80ceab0
ef7643d
 
cf5f08b
ef7643d
 
 
 
 
 
80ceab0
ef7643d
 
 
80ceab0
 
 
 
 
 
 
 
ef7643d
 
80ceab0
 
 
 
 
 
 
ef7643d
 
80ceab0
 
 
 
 
 
 
 
ef7643d
 
80ceab0
 
 
 
 
 
 
ef7643d
 
80ceab0
cf5f08b
 
 
 
 
 
ef7643d
 
cf5f08b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from .base import BaseVideoModel
from packaging import version
import torch
from typing import Optional, Union, Dict


# IMP: Add required versions here
# Minimum transformers release that provides the model classes imported below.
transformers_required_version = version.parse("5.0.0")

# Conditional imports based on transformers version
import transformers
# NOTE(review): BitsAndBytesConfig is imported but not referenced in this
# module — presumably re-exported for callers or used by the model wrappers;
# confirm before removing.
from transformers import BitsAndBytesConfig

# Check transformers version
transformers_version = version.parse(transformers.__version__)

# transformers v5 condition
if transformers_version >= transformers_required_version:
    # Only pull in the concrete model wrappers when the installed
    # transformers is new enough to support them.
    from .qwen2_5vl import Qwen2_5VLModel
    from .qwen3vl import Qwen3VLModel
    from .internvl import InternVLModel
    from .llava_video import LLaVAVideoModel

    # Flag consumers can check instead of catching the import-time error.
    TRANSFORMERS_MODELS_AVAILABLE = True
else:
    # NOTE(review): the message says the models "will not be available", but
    # raising here aborts the whole module import — nothing in this package
    # becomes usable. An ImportError (or a warning plus
    # TRANSFORMERS_MODELS_AVAILABLE = False) may be what was intended; confirm.
    raise ValueError(f"Transformers v5 models require transformers>=5.0.0, but found {transformers.__version__}. Transformers v5 models will not be available. Please upgrade to transformers>=5.0.0 or switch conda environments to use Transformers v5 models.")


# Function to get the model by mapping model ID to the correct model class
def load_model(
    model_path: str,
    dtype: Optional[Union[torch.dtype, str]] = torch.bfloat16,
    device_map: Optional[Union[str, Dict]] = "auto",
    attn_implementation: Optional[str] = "flash_attention_2",
    load_8bit: Optional[bool] = False,
    load_4bit: Optional[bool] = False,
) -> BaseVideoModel:
    """Instantiate the video-model wrapper whose family matches *model_path*.

    Dispatch is by substring of the model path/ID: ``"LLaVA-Video"`` →
    LLaVAVideoModel, ``"Qwen3"`` → Qwen3VLModel, any other ``"Qwen"`` →
    Qwen2_5VLModel, ``"Intern"`` → InternVLModel.

    Args:
        model_path: Hugging Face model ID or local checkpoint path.
        dtype: Torch dtype (or string alias) the weights are loaded in.
        device_map: Device placement spec forwarded to the model loader.
        attn_implementation: Attention backend identifier.
        load_8bit: Load weights with 8-bit quantization.
        load_4bit: Load weights with 4-bit quantization.

    Returns:
        A constructed model wrapper deriving from :class:`BaseVideoModel`.

    Raises:
        ValueError: If *model_path* matches none of the known families.
            (Previously this case silently returned ``None``, deferring the
            failure to the caller's first attribute access.)
    """
    # All wrappers share the same constructor signature; build the keyword
    # set once instead of repeating it per branch.
    load_kwargs = dict(
        dtype=dtype,
        device_map=device_map,
        attn_implementation=attn_implementation,
        load_8bit=load_8bit,
        load_4bit=load_4bit,
    )
    if "LLaVA-Video" in model_path:
        return LLaVAVideoModel(model_path, **load_kwargs)
    # "Qwen3" is checked before the generic "Qwen" match, preserving the
    # original nested if/else ordering ("Qwen3" in s implies "Qwen" in s).
    if "Qwen3" in model_path:
        return Qwen3VLModel(model_path, **load_kwargs)
    if "Qwen" in model_path:
        return Qwen2_5VLModel(model_path, **load_kwargs)
    if "Intern" in model_path:
        return InternVLModel(model_path, **load_kwargs)
    # Fail loudly on an unknown family rather than implicitly returning None.
    raise ValueError(
        f"Unrecognized model path {model_path!r}: expected it to contain one of "
        "'LLaVA-Video', 'Qwen3', 'Qwen', or 'Intern'."
    )