uncensored-com
/

video-llava-7b-deployable

Model card Files and versions

xet

Community

uncensored-com committed on Dec 1, 2025

Commit

fcd38b0

verified ·

1 Parent(s): 542ecdb

Update handler.py

Browse files

Files changed (1) hide show

handler.py +52 -79

handler.py CHANGED Viewed

@@ -8,37 +8,21 @@ from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration
 class EndpointHandler:
     def __init__(self, path=""):
-        # 1. LOAD MODEL
         model_id = "LanguageBind/Video-LLaVA-7B-hf"
         print(f"Loading model: {model_id}...")
         self.processor = VideoLlavaProcessor.from_pretrained(model_id)
         self.model = VideoLlavaForConditionalGeneration.from_pretrained(
             model_id,
-            torch_dtype=torch.float16, # bfloat16 is better if your GPU supports it (A10G/A100), otherwise float16
             device_map="auto",
             low_cpu_mem_usage=True
         )
         self.model.eval()
         print("Model loaded successfully.")
-    def read_video_pyav(self, container, indices):
-        '''
-        Decode the video with PyAV decoder.
-        '''
-        frames = []
-        container.seek(0)
-        start_index = indices[0]
-        end_index = indices[-1]
-        for i, frame in enumerate(container.decode(video=0)):
-            if i > end_index:
-                break
-            if i >= start_index and i in indices:
-                frames.append(frame)
-        return np.stack([x.to_ndarray(format="rgb24") for x in frames])
     def download_video(self, video_url):
-        # Your specific download logic
         suffix = os.path.splitext(video_url)[1] or '.mp4'
         temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
         temp_path = temp_file.name
@@ -51,98 +35,87 @@ class EndpointHandler:
                         f.write(chunk)
             return temp_path
         except Exception as e:
-            if os.path.exists(temp_path):
-                os.unlink(temp_path)
             raise e
-def __call__(self, data):
-        """
-        The endpoint calls this function when you send a JSON request.
-        Expected JSON: {"inputs": "text prompt", "video": "url", "parameters": {...}}
-        """
-        print("\n--- NEW REQUEST RECEIVED ---") # LOG
-        # 1. EXTRACT DATA
-        inputs = data.pop("inputs", "What is happening in this video?")
-        video_url = data.pop("video", None)
-        parameters = data.pop("parameters", {})
-        # Log the inputs so you can verify overrides in the dashboard logs
-        print(f"Input Prompt: {inputs}")
-        print(f"Video URL: {video_url}")
-        print(f"Raw Parameters: {parameters}")
-        # Default parameters from your script
-        max_new_tokens = parameters.get("max_new_tokens", 500)
-        temperature = parameters.get("temperature", 0.1)
-        top_p = parameters.get("top_p", 0.9)
-        num_frames = parameters.get("num_frames", 10)
-        # Log effective parameters
-        print(f"Effective Config -> Frames: {num_frames}, Max Tokens: {max_new_tokens}, Temp: {temperature}")
-        if not video_url:
-            print("Error: No video URL provided.")
-            return {"error": "No 'video' key provided in the payload."}
-        video_path = None
-        container = None
-        try:
-            # 2. DOWNLOAD & PROCESS
-            print(f"Downloading video from {video_url}...")
             video_path = self.download_video(video_url)
             container = av.open(video_path)
             total_frames = container.streams.video[0].frames
             if total_frames == 0:
                 total_frames = sum(1 for _ in container.decode(video=0))
                 container.seek(0)
-            print(f"Video Info -> Total Frames: {total_frames}")
-            frames_to_use = min(total_frames, num_frames) if total_frames > 0 else num_frames
-            indices = np.linspace(0, total_frames - 1, frames_to_use, dtype=int)
             clip = self.read_video_pyav(container, indices)
-            print(f"Extracted {len(clip)} frames for inference.")
-            # 3. PREPARE PROMPT
             full_prompt = f"USER: <video>{inputs} ASSISTANT:"
             model_inputs = self.processor(
                 text=full_prompt,
-                videos=clip,
                 return_tensors="pt"
             ).to(self.model.device)
-            # 4. GENERATE
-            print("Starting generation...")
             with torch.inference_mode():
                 generate_ids = self.model.generate(
                     **model_inputs,
-                    max_length=max_new_tokens,
                     temperature=temperature,
-                    top_p=top_p,
                     do_sample=True if temperature > 0 else False
                 )
-            result = self.processor.batch_decode(
-                generate_ids,
-                skip_special_tokens=True,
-                clean_up_tokenization_spaces=False
-            )[0]
             final_output = result.split("ASSISTANT:")[-1].strip()
-            print(f"Generation complete. Result: {final_output[:50]}...") # Log first 50 chars
             return [{"generated_text": final_output}]
         except Exception as e:
-            print(f"CRITICAL ERROR: {str(e)}")
             return {"error": str(e)}
         finally:
-            if container: container.close()
-            if video_path and os.path.exists(video_path):
-                os.unlink(video_path)

 class EndpointHandler:
     def __init__(self, path=""):
+        # Load Model
         model_id = "LanguageBind/Video-LLaVA-7B-hf"
         print(f"Loading model: {model_id}...")
         self.processor = VideoLlavaProcessor.from_pretrained(model_id)
         self.model = VideoLlavaForConditionalGeneration.from_pretrained(
             model_id,
+            torch_dtype=torch.float16,
             device_map="auto",
             low_cpu_mem_usage=True
         )
         self.model.eval()
         print("Model loaded successfully.")
     def download_video(self, video_url):
         suffix = os.path.splitext(video_url)[1] or '.mp4'
         temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
         temp_path = temp_file.name
                         f.write(chunk)
             return temp_path
         except Exception as e:
+            if os.path.exists(temp_path): os.unlink(temp_path)
             raise e
+    def read_video_pyav(self, container, indices):
+        frames = []
+        container.seek(0)
+        start_index = indices[0]
+        end_index = indices[-1]
+        for i, frame in enumerate(container.decode(video=0)):
+            if i > end_index:
+                break
+            if i >= start_index and i in indices:
+                frames.append(frame)
+        # Return as list of numpy arrays, which acts like a "list of images" for the processor
+        return [x.to_ndarray(format="rgb24") for x in frames]
+    def __call__(self, data):
+        print("\n--- NEW REQUEST ---")
+        try:
+            # 1. EXTRACT DATA
+            inputs = data.pop("inputs", "What is happening in this video?")
+            video_url = data.pop("video", None)
+            parameters = data.pop("parameters", {})
+            num_frames = parameters.get("num_frames", 8)
+            max_new_tokens = parameters.get("max_new_tokens", 250)
+            temperature = parameters.get("temperature", 0.1)
+            if not video_url:
+                return {"error": "Missing 'video' URL."}
+            # 2. DOWNLOAD
+            print(f"Downloading: {video_url}")
             video_path = self.download_video(video_url)
             container = av.open(video_path)
+            # 3. SAMPLE FRAMES
             total_frames = container.streams.video[0].frames
             if total_frames == 0:
                 total_frames = sum(1 for _ in container.decode(video=0))
                 container.seek(0)
+            # Ensure we don't request more frames than exist
+            frames_to_use = min(total_frames, num_frames)
+            if frames_to_use < 1: frames_to_use = 1
+            indices = np.linspace(0, total_frames - 1, frames_to_use, dtype=int)
             clip = self.read_video_pyav(container, indices)
+            print(f"Processed {len(clip)} frames.")
+            # 4. PREPARE INPUTS
+            # Note: VideoLlava expects specific prompt formatting
             full_prompt = f"USER: <video>{inputs} ASSISTANT:"
             model_inputs = self.processor(
                 text=full_prompt,
+                videos=clip,
                 return_tensors="pt"
             ).to(self.model.device)
+            # 5. GENERATE
+            print("Generating...")
             with torch.inference_mode():
                 generate_ids = self.model.generate(
                     **model_inputs,
+                    max_new_tokens=max_new_tokens,
                     temperature=temperature,
                     do_sample=True if temperature > 0 else False
                 )
+            result = self.processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
             final_output = result.split("ASSISTANT:")[-1].strip()
+            print(f"Result: {final_output[:50]}...")
             return [{"generated_text": final_output}]
         except Exception as e:
+            import traceback
+            traceback.print_exc()
             return {"error": str(e)}
         finally:
+            if 'container' in locals() and container: container.close()
+            if 'video_path' in locals() and video_path and os.path.exists(video_path):
+                os.unlink(video_path)