Commit
·
9fba121
1
Parent(s):
e914bd1
update
Browse files
- ixc_utils.py +4 -4
- modeling_internlm_xcomposer2.py +3 -1
ixc_utils.py
CHANGED
|
@@ -66,7 +66,7 @@ def Video_transform(img, hd_num=25):
|
|
| 66 |
|
| 67 |
return img
|
| 68 |
|
| 69 |
-
def frame2img(imgs):
|
| 70 |
new_imgs = []
|
| 71 |
for img in imgs:
|
| 72 |
w, h = img.size
|
|
@@ -83,7 +83,8 @@ def frame2img(imgs):
|
|
| 83 |
new_w = 0
|
| 84 |
new_h = 0
|
| 85 |
pad = 40
|
| 86 |
-
|
|
|
|
| 87 |
if w > h:
|
| 88 |
for im in imgs:
|
| 89 |
w,h = im.size
|
|
@@ -135,6 +136,5 @@ def load_video(video_path, num_frm=32, start=None, end=None):
|
|
| 135 |
indices = [int(i*step_size) for i in range(num_frm)]
|
| 136 |
images = [images[i] for i in indices]
|
| 137 |
images = [Image.fromarray(arr) for arr in images]
|
| 138 |
-
|
| 139 |
-
return image
|
| 140 |
|
|
|
|
| 66 |
|
| 67 |
return img
|
| 68 |
|
| 69 |
+
def frame2img(imgs, font_path):
|
| 70 |
new_imgs = []
|
| 71 |
for img in imgs:
|
| 72 |
w, h = img.size
|
|
|
|
| 83 |
new_w = 0
|
| 84 |
new_h = 0
|
| 85 |
pad = 40
|
| 86 |
+
print (font_path)
|
| 87 |
+
font = ImageFont.truetype(os.path.join(font_path, "SimHei.ttf"), pad)
|
| 88 |
if w > h:
|
| 89 |
for im in imgs:
|
| 90 |
w,h = im.size
|
|
|
|
| 136 |
indices = [int(i*step_size) for i in range(num_frm)]
|
| 137 |
images = [images[i] for i in indices]
|
| 138 |
images = [Image.fromarray(arr) for arr in images]
|
| 139 |
+
return images
|
|
|
|
| 140 |
|
modeling_internlm_xcomposer2.py
CHANGED
|
@@ -45,7 +45,7 @@ import torchvision.transforms as transforms
|
|
| 45 |
from torchvision.transforms.functional import InterpolationMode
|
| 46 |
|
| 47 |
from .build_mlp import build_vision_projector, build_vision_tower
|
| 48 |
-
from .ixc_utils import Image_transform, Video_transform, load_video
|
| 49 |
from .configuration_internlm_xcomposer2 import InternLMXcomposer2Config
|
| 50 |
from .modeling_internlm2 import (InternLM2_INPUTS_DOCSTRING, InternLM2Model,
|
| 51 |
InternLM2PreTrainedModel)
|
|
@@ -102,6 +102,7 @@ class InternLMXComposer2ForCausalLM(InternLM2PreTrainedModel):
|
|
| 102 |
config.hidden_size, config.vocab_size, bias=False)
|
| 103 |
self.tokenizer = None
|
| 104 |
self.hd_num = 25
|
|
|
|
| 105 |
|
| 106 |
self.max_length = config.max_length
|
| 107 |
print(f'Set max length to {self.max_length}')
|
|
@@ -163,6 +164,7 @@ class InternLMXComposer2ForCausalLM(InternLM2PreTrainedModel):
|
|
| 163 |
image = Image_transform(image, hd_num = hd_num)
|
| 164 |
elif ext.lower() in video_extensions:
|
| 165 |
image = load_video(image)
|
|
|
|
| 166 |
image = Video_transform(image, hd_num = hd_num)
|
| 167 |
else:
|
| 168 |
print ('Unknow input format', image)
|
|
|
|
| 45 |
from torchvision.transforms.functional import InterpolationMode
|
| 46 |
|
| 47 |
from .build_mlp import build_vision_projector, build_vision_tower
|
| 48 |
+
from .ixc_utils import Image_transform, Video_transform, load_video, frame2img
|
| 49 |
from .configuration_internlm_xcomposer2 import InternLMXcomposer2Config
|
| 50 |
from .modeling_internlm2 import (InternLM2_INPUTS_DOCSTRING, InternLM2Model,
|
| 51 |
InternLM2PreTrainedModel)
|
|
|
|
| 102 |
config.hidden_size, config.vocab_size, bias=False)
|
| 103 |
self.tokenizer = None
|
| 104 |
self.hd_num = 25
|
| 105 |
+
self._path = config._name_or_path
|
| 106 |
|
| 107 |
self.max_length = config.max_length
|
| 108 |
print(f'Set max length to {self.max_length}')
|
|
|
|
| 164 |
image = Image_transform(image, hd_num = hd_num)
|
| 165 |
elif ext.lower() in video_extensions:
|
| 166 |
image = load_video(image)
|
| 167 |
+
image = frame2img(image, self._path)
|
| 168 |
image = Video_transform(image, hd_num = hd_num)
|
| 169 |
else:
|
| 170 |
print ('Unknow input format', image)
|