InternVL3_5-8B

Running on Zero

App Files Community

developer0hye committed on Aug 29

Commit

6802457

verified ·

1 Parent(s): 1c13a5e

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -12

app.py CHANGED Viewed

@@ -100,10 +100,10 @@ def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
     ])
     return frame_indices
-def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=8):
     """
     InternVL 예시 코드 참고: 여러 프레임을 추출하여 dynamic_preprocess 적용.
-    여기서는 기본적으로 num_segments=8로 설정.
     """
     vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
     max_frame = len(vr) - 1
@@ -130,15 +130,17 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=8
 # =============================================================================
 # InternVL 모델 로딩
 # =============================================================================
-MODEL_ID = "OpenGVLab/InternVL2_5-8B"
 model = AutoModel.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     use_flash_attn=True,
-    trust_remote_code=True
-).eval().cuda()
 tokenizer = AutoTokenizer.from_pretrained(
     MODEL_ID,
@@ -147,7 +149,7 @@ tokenizer = AutoTokenizer.from_pretrained(
 )
 # Gradio 상단에 표시할 설명 문구
-DESCRIPTION = "[InternVL2_5-8B Demo](https://github.com/OpenGVLab/InternVL) - Using the InternVL2_5-8B"
 image_extensions = Image.registered_extensions()
 video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")
@@ -234,27 +236,29 @@ def internvl_inference(media_input, text_input=None):
         pixel_values = load_image(media_path, max_num=12)
         pixel_values = pixel_values.to(torch.bfloat16).cuda()  # (N, 3, H, W)
         # InternVL 대화
-        question = f"<image>\n{text_input}" if text_input else "<image>\n"
         generation_config = dict(max_new_tokens=1024, do_sample=True)
         response = model.chat(
             tokenizer,
             pixel_values,
             question,
-            generation_config
         )
         return response
     elif media_type == "video":
-        # 영상: 예시로 첫 8프레임에 대해 처리
         pixel_values, num_patches_list = load_video(
             media_path,
-            num_segments=8,
             max_num=1
         )
         pixel_values = pixel_values.to(torch.bfloat16).cuda()
         question_prefix = "".join([f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list))])
-        question = question_prefix + (text_input if text_input else "")
         generation_config = dict(max_new_tokens=1024, do_sample=True)
         # 영상에서도 동일한 chat() 함수 사용
@@ -263,7 +267,9 @@ def internvl_inference(media_input, text_input=None):
             pixel_values,
             question,
             generation_config,
-            num_patches_list=num_patches_list
         )
         return response

     ])
     return frame_indices
+def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
     """
     InternVL 예시 코드 참고: 여러 프레임을 추출하여 dynamic_preprocess 적용.
+    여기서는 기본적으로 num_segments=32로 설정.
     """
     vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
     max_frame = len(vr) - 1
 # =============================================================================
 # InternVL 모델 로딩
 # =============================================================================
+MODEL_ID = "OpenGVLab/InternVL3_5-8B"
 model = AutoModel.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.bfloat16,
+    load_in_8bit=False,
     low_cpu_mem_usage=True,
     use_flash_attn=True,
+    trust_remote_code=True,
+    device_map="auto"
+).eval()
 tokenizer = AutoTokenizer.from_pretrained(
     MODEL_ID,
 )
 # Gradio 상단에 표시할 설명 문구
+DESCRIPTION = "[InternVL3.5-8B Demo](https://github.com/OpenGVLab/InternVL) - Using the InternVL3.5-8B"
 image_extensions = Image.registered_extensions()
 video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")
         pixel_values = load_image(media_path, max_num=12)
         pixel_values = pixel_values.to(torch.bfloat16).cuda()  # (N, 3, H, W)
         # InternVL 대화
+        question = f"<image>\n{text_input}" if text_input else "<image>\nPlease describe the image."
         generation_config = dict(max_new_tokens=1024, do_sample=True)
         response = model.chat(
             tokenizer,
             pixel_values,
             question,
+            generation_config,
+            history=None,
+            return_history=False
         )
         return response
     elif media_type == "video":
+        # 영상: 예시로 32프레임에 대해 처리
         pixel_values, num_patches_list = load_video(
             media_path,
+            num_segments=32,
             max_num=1
         )
         pixel_values = pixel_values.to(torch.bfloat16).cuda()
         question_prefix = "".join([f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list))])
+        question = question_prefix + (text_input if text_input else "Describe this video in detail.")
         generation_config = dict(max_new_tokens=1024, do_sample=True)
         # 영상에서도 동일한 chat() 함수 사용
             pixel_values,
             question,
             generation_config,
+            num_patches_list=num_patches_list,
+            history=None,
+            return_history=False
         )
         return response