uncensored-com committed on
Commit
000f3a6
·
verified ·
1 Parent(s): 23fc12f

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +10 -25
handler.py CHANGED
@@ -10,14 +10,14 @@ from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration
10
 
11
  class EndpointHandler:
12
  def __init__(self, path=""):
13
- # Load Model
14
  model_id = "LanguageBind/Video-LLaVA-7B-hf"
15
  print(f"Loading model: {model_id}...")
16
 
 
17
  self.processor = VideoLlavaProcessor.from_pretrained(model_id)
18
  self.model = VideoLlavaForConditionalGeneration.from_pretrained(
19
  model_id,
20
- torch_dtype=torch.float16,
21
  device_map="auto",
22
  low_cpu_mem_usage=True
23
  )
@@ -30,7 +30,7 @@ class EndpointHandler:
30
  temp_path = temp_file.name
31
  temp_file.close()
32
  try:
33
- # Added timeout (30s) to prevent hanging on bad URLs
34
  response = requests.get(video_url, stream=True, timeout=30)
35
  if response.status_code != 200:
36
  raise ValueError(f"Failed to download: {response.status_code}")
@@ -55,11 +55,9 @@ class EndpointHandler:
55
  if i >= start_index and i in indices:
56
  frames.append(frame)
57
 
58
- # Guard clause: If video is corrupted or empty
59
  if not frames:
60
  raise ValueError("Video decoding failed: No frames found.")
61
 
62
- # Return as list of numpy arrays
63
  return [x.to_ndarray(format="rgb24") for x in frames]
64
 
65
  def __call__(self, data):
@@ -70,40 +68,34 @@ class EndpointHandler:
70
  video_path = None
71
 
72
  try:
73
- # 1. EXTRACT DATA
74
  inputs = data.pop("inputs", "Describe this video.")
75
  video_url = data.pop("video", None)
76
  parameters = data.pop("parameters", {})
77
 
78
- # Default to 8 frames because LanguageBind is trained on 8 frames.
79
- # Only change this if you are sure the model handles interpolation.
80
  num_frames = parameters.pop("num_frames", 8)
81
 
82
- # Clean parameters for generation (pass everything else to the model)
83
  gen_kwargs = {
84
  "max_new_tokens": parameters.pop("max_new_tokens", 500),
85
- "temperature": parameters.pop("temperature", 0.7),
86
- "top_p": parameters.pop("top_p", 0.9),
87
  "do_sample": True
88
  }
89
- gen_kwargs.update(parameters) # Merge any other params user sent
90
 
91
  if not video_url:
92
  return {"error": "Missing 'video' URL."}
93
 
94
- # 2. DOWNLOAD
95
  print(f"Downloading: {video_url}")
96
  video_path = self.download_video(video_url)
97
  container = av.open(video_path)
98
 
99
- # 3. SMART FRAME SAMPLING
100
  total_frames = container.streams.video[0].frames
101
  if total_frames == 0:
102
- # Fallback for videos with missing metadata
103
  total_frames = sum(1 for _ in container.decode(video=0))
104
  container.seek(0)
105
 
106
- # Clamp frames to available count
107
  frames_to_use = min(total_frames, num_frames)
108
  if frames_to_use < 1: frames_to_use = 1
109
 
@@ -111,20 +103,18 @@ class EndpointHandler:
111
  clip = self.read_video_pyav(container, indices)
112
  print(f"Processed {len(clip)} frames.")
113
 
114
- # 4. PREPARE PROMPT
115
- # Check if user already added the template to avoid double-templating
116
  if "USER:" in inputs:
117
  full_prompt = inputs
118
  else:
119
- full_prompt = f"USER: <video>\n{inputs}\nASSISTANT:"
120
 
 
121
  model_inputs = self.processor(
122
  text=full_prompt,
123
  videos=clip,
124
  return_tensors="pt"
125
  ).to(self.model.device)
126
 
127
- # 5. GENERATE
128
  print(f"Generating with params: {gen_kwargs}")
129
  with torch.inference_mode():
130
  generate_ids = self.model.generate(
@@ -134,13 +124,11 @@ class EndpointHandler:
134
 
135
  result = self.processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
136
 
137
- # Clean output based on prompt structure
138
  if "ASSISTANT:" in result:
139
  final_output = result.split("ASSISTANT:")[-1].strip()
140
  else:
141
  final_output = result
142
 
143
- # LOG TIME
144
  duration = time.time() - start_time
145
  print(f"✅ Success! Total time: {duration:.2f} seconds.")
146
  print(f"Result preview: {final_output[:50]}...")
@@ -153,11 +141,8 @@ class EndpointHandler:
153
  return {"error": str(e)}
154
 
155
  finally:
156
- # 6. CLEANUP (Crucial for long-running endpoints)
157
  if container: container.close()
158
  if video_path and os.path.exists(video_path):
159
  os.unlink(video_path)
160
-
161
- # Clear GPU memory
162
  torch.cuda.empty_cache()
163
  gc.collect()
 
10
 
11
  class EndpointHandler:
12
  def __init__(self, path=""):
 
13
  model_id = "LanguageBind/Video-LLaVA-7B-hf"
14
  print(f"Loading model: {model_id}...")
15
 
16
+ # 1. Use bfloat16 (Matches your local script for better precision)
17
  self.processor = VideoLlavaProcessor.from_pretrained(model_id)
18
  self.model = VideoLlavaForConditionalGeneration.from_pretrained(
19
  model_id,
20
+ torch_dtype=torch.bfloat16,
21
  device_map="auto",
22
  low_cpu_mem_usage=True
23
  )
 
30
  temp_path = temp_file.name
31
  temp_file.close()
32
  try:
33
+ # 30s timeout prevents hanging
34
  response = requests.get(video_url, stream=True, timeout=30)
35
  if response.status_code != 200:
36
  raise ValueError(f"Failed to download: {response.status_code}")
 
55
  if i >= start_index and i in indices:
56
  frames.append(frame)
57
 
 
58
  if not frames:
59
  raise ValueError("Video decoding failed: No frames found.")
60
 
 
61
  return [x.to_ndarray(format="rgb24") for x in frames]
62
 
63
  def __call__(self, data):
 
68
  video_path = None
69
 
70
  try:
 
71
  inputs = data.pop("inputs", "Describe this video.")
72
  video_url = data.pop("video", None)
73
  parameters = data.pop("parameters", {})
74
 
75
+ # Default to 8 frames (Native to this model architecture)
 
76
  num_frames = parameters.pop("num_frames", 8)
77
 
78
+ # 2. Configuration that matches your script's logic
79
  gen_kwargs = {
80
  "max_new_tokens": parameters.pop("max_new_tokens", 500),
81
+ "temperature": parameters.pop("temperature", 0.1), # Defaulted to your 0.1
82
+ "top_p": parameters.pop("top_p", 0.9), # Defaulted to your 0.9
83
  "do_sample": True
84
  }
85
+ gen_kwargs.update(parameters)
86
 
87
  if not video_url:
88
  return {"error": "Missing 'video' URL."}
89
 
 
90
  print(f"Downloading: {video_url}")
91
  video_path = self.download_video(video_url)
92
  container = av.open(video_path)
93
 
 
94
  total_frames = container.streams.video[0].frames
95
  if total_frames == 0:
 
96
  total_frames = sum(1 for _ in container.decode(video=0))
97
  container.seek(0)
98
 
 
99
  frames_to_use = min(total_frames, num_frames)
100
  if frames_to_use < 1: frames_to_use = 1
101
 
 
103
  clip = self.read_video_pyav(container, indices)
104
  print(f"Processed {len(clip)} frames.")
105
 
 
 
106
  if "USER:" in inputs:
107
  full_prompt = inputs
108
  else:
109
+ full_prompt = f"USER: <video>{inputs} ASSISTANT:"
110
 
111
+ # 3. Ensure input tensors are also bfloat16
112
  model_inputs = self.processor(
113
  text=full_prompt,
114
  videos=clip,
115
  return_tensors="pt"
116
  ).to(self.model.device)
117
 
 
118
  print(f"Generating with params: {gen_kwargs}")
119
  with torch.inference_mode():
120
  generate_ids = self.model.generate(
 
124
 
125
  result = self.processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
126
 
 
127
  if "ASSISTANT:" in result:
128
  final_output = result.split("ASSISTANT:")[-1].strip()
129
  else:
130
  final_output = result
131
 
 
132
  duration = time.time() - start_time
133
  print(f"✅ Success! Total time: {duration:.2f} seconds.")
134
  print(f"Result preview: {final_output[:50]}...")
 
141
  return {"error": str(e)}
142
 
143
  finally:
 
144
  if container: container.close()
145
  if video_path and os.path.exists(video_path):
146
  os.unlink(video_path)
 
 
147
  torch.cuda.empty_cache()
148
  gc.collect()