Update app.py
app.py CHANGED
@@ -49,6 +49,29 @@ CUSTOM_CSS = """
 }
 """
 
+MODEL_PATH = "IVUL-KAUST/VideoAuto-R1-Qwen3-VL-8B"
+
+
+# ============================================================================
+# Global Model Variables
+# ============================================================================
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Load model
+model = (
+    Qwen3VLForConditionalGeneration.from_pretrained(
+        MODEL_PATH,
+        dtype="bfloat16",
+        attn_implementation="sdpa",
+    )
+    .to("cuda")
+    .eval()
+)
+
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+
 
 # ============================================================================
 # Utility Functions
@@ -82,233 +105,206 @@ def detect_media_type(file_path: str | None) -> str | None:
     return "video"
 
 
[old lines 85-94: content lost in the rendered diff]
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-        # Load model
-        self.model = Qwen3VLForConditionalGeneration.from_pretrained(
-            model_path,
-            dtype="bfloat16",
-            attn_implementation="sdpa",
-        ).to('cuda').eval()
-
-        self.processor = AutoProcessor.from_pretrained(model_path)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
-        self.system_prompt = COT_SYSTEM_PROMPT_ANSWER_TWICE
-
-    def process_image(
-        self,
-        image_path: str,
-        image_min_pixels: int = 128 * 28 * 28,
-        image_max_pixels: int = 16384 * 28 * 28,
-    ) -> dict | None:
-        """
-        Process image file to base64 format.
-
-        Args:
-            image_path: Path to image file
-            image_min_pixels: Minimum pixel count
-            image_max_pixels: Maximum pixel count
-
-        Returns:
-            Dictionary with image data or None
-        """
-        if image_path is None:
-            return None
-
-        image = Image.open(image_path).convert("RGB")
-        buffer = BytesIO()
-        image.save(buffer, format="JPEG")
-        base64_bytes = base64.b64encode(buffer.getvalue())
-        base64_string = base64_bytes.decode("utf-8")
-
-        return {
-            "type": "image",
-            "image": f"data:image/jpeg;base64,{base64_string}",
-            "min_pixels": image_min_pixels,
-            "max_pixels": image_max_pixels,
-        }
-
-    def process_video(
-        self,
-        video_path: str,
-        video_min_pixels: int = 16 * 28 * 28,
-        video_max_pixels: int = 768 * 28 * 28,
-        video_total_pixels: int = 128000 * 28 * 28,
-        min_frames: int = 4,
-        max_frames: int = 64,
-        fps: float = 2.0,
-    ) -> dict | None:
-        """
-        Process video file configuration.
-
-        Args:
-            video_path: Path to video file
-            video_min_pixels: Minimum pixels per frame
-            video_max_pixels: Maximum pixels per frame
-            video_total_pixels: Total pixels across all frames
-            min_frames: Minimum number of frames
-            max_frames: Maximum number of frames
-            fps: Frames per second for sampling
-
-        Returns:
-            Dictionary with video configuration or None
-        """
-        if video_path is None:
-            return None
-
-        return {
-            "type": "video",
-            "video": video_path,
-            "min_pixels": video_min_pixels,
-            "max_pixels": video_max_pixels,
-            "total_pixels": video_total_pixels,
-            "min_frames": min_frames,
-            "max_frames": max_frames,
-            "fps": fps,
-        }
-
-    @spaces.GPU(duration=120)
-    def generate(
-        self,
-        media_input: str | None,
-        prompt: str,
-        early_exit_thresh: float,
-        temperature: float,
-        max_new_tokens: int = 4096,
-    ) -> dict:
-        """
-        Generate response with adaptive inference.
-
-        Args:
-            media_input: Path to media file
-            prompt: Text prompt
-            early_exit_thresh: Confidence threshold for early exit
-            temperature: Sampling temperature
-            max_new_tokens: Maximum tokens to generate
-
-        Returns:
-            Dictionary containing response and metadata
-        """
-        # if self.model.device.type != "cuda":
-        #     self.model.to("cuda")
-
-        # Prepare message
-        message = [{"role": "system", "content": self.system_prompt}]
-        content_parts = []
-
-        # Process media input
-        if media_input is not None:
-            media_type = detect_media_type(media_input)
-
-            if media_type == "video":
-                video_dict = self.process_video(media_input)
-                if video_dict:
-                    content_parts.append(video_dict)
-            elif media_type == "image":
-                image_dict = self.process_image(media_input)
-                if image_dict:
-                    content_parts.append(image_dict)
-
-        # Add text prompt
-        content_parts.append({"type": "text", "text": prompt})
-        message.append({"role": "user", "content": content_parts})
-
-        # Apply chat template
-        text = self.processor.apply_chat_template([message], tokenize=False, add_generation_prompt=True)
-
-        # Process vision inputs
-        image_inputs, video_inputs, video_kwargs = process_vision_info(
-            [message],
-            image_patch_size=16,
-            return_video_kwargs=True,
-            return_video_metadata=True,
-        )
[old lines 236-311: content lost in the rendered diff]
+def process_image(
+    image_path: str,
+    image_min_pixels: int = 128 * 28 * 28,
+    image_max_pixels: int = 16384 * 28 * 28,
+) -> dict | None:
+    """
+    Process image file to base64 format.
+
+    Args:
+        image_path: Path to image file
+        image_min_pixels: Minimum pixel count
+        image_max_pixels: Maximum pixel count
+
+    Returns:
+        Dictionary with image data or None
+    """
+    if image_path is None:
+        return None
+
+    image = Image.open(image_path).convert("RGB")
+    buffer = BytesIO()
+    image.save(buffer, format="JPEG")
+    base64_bytes = base64.b64encode(buffer.getvalue())
+    base64_string = base64_bytes.decode("utf-8")
+
+    return {
+        "type": "image",
+        "image": f"data:image/jpeg;base64,{base64_string}",
+        "min_pixels": image_min_pixels,
+        "max_pixels": image_max_pixels,
+    }
+
+
+def process_video(
+    video_path: str,
+    video_min_pixels: int = 16 * 28 * 28,
+    video_max_pixels: int = 768 * 28 * 28,
+    video_total_pixels: int = 128000 * 28 * 28,
+    min_frames: int = 4,
+    max_frames: int = 64,
+    fps: float = 2.0,
+) -> dict | None:
+    """
+    Process video file configuration.
+
+    Args:
+        video_path: Path to video file
+        video_min_pixels: Minimum pixels per frame
+        video_max_pixels: Maximum pixels per frame
+        video_total_pixels: Total pixels across all frames
+        min_frames: Minimum number of frames
+        max_frames: Maximum number of frames
+        fps: Frames per second for sampling
+
+    Returns:
+        Dictionary with video configuration or None
+    """
+    if video_path is None:
+        return None
+
+    return {
+        "type": "video",
+        "video": video_path,
+        "min_pixels": video_min_pixels,
+        "max_pixels": video_max_pixels,
+        "total_pixels": video_total_pixels,
+        "min_frames": min_frames,
+        "max_frames": max_frames,
+        "fps": fps,
+    }
+
+
+@spaces.GPU(duration=180)
+def generate(
+    media_input: str | None,
+    prompt: str,
+    early_exit_thresh: float,
+    temperature: float,
+    max_new_tokens: int = 4096,
+) -> dict:
+    """
+    Generate response with adaptive inference.
+
+    Args:
+        media_input: Path to media file
+        prompt: Text prompt
+        early_exit_thresh: Confidence threshold for early exit
+        temperature: Sampling temperature
+        max_new_tokens: Maximum tokens to generate
+
+    Returns:
+        Dictionary containing response and metadata
+    """
+    # Prepare message
+    message = [{"role": "system", "content": COT_SYSTEM_PROMPT_ANSWER_TWICE}]
+    content_parts = []
+
+    # Process media input
+    if media_input is not None:
+        media_type = detect_media_type(media_input)
+
+        if media_type == "video":
+            video_dict = process_video(media_input)
+            if video_dict:
+                content_parts.append(video_dict)
+        elif media_type == "image":
+            image_dict = process_image(media_input)
+            if image_dict:
+                content_parts.append(image_dict)
+
+    # Add text prompt
+    content_parts.append({"type": "text", "text": prompt})
+    message.append({"role": "user", "content": content_parts})
+
+    # Apply chat template
+    text = processor.apply_chat_template([message], tokenize=False, add_generation_prompt=True)
+
+    # Process vision inputs
+    image_inputs, video_inputs, video_kwargs = process_vision_info(
+        [message],
+        image_patch_size=16,
+        return_video_kwargs=True,
+        return_video_metadata=True,
+    )
+
+    if video_inputs is not None:
+        video_inputs, video_metadatas = zip(*video_inputs)
+        video_inputs = list(video_inputs)
+        video_metadatas = list(video_metadatas)
+    else:
+        video_metadatas = None
+
+    # Prepare inputs
+    inputs = processor(
+        text=text,
+        images=image_inputs,
+        videos=video_inputs,
+        video_metadata=video_metadatas,
+        do_resize=False,
+        padding=True,
+        return_tensors="pt",
+        **video_kwargs,
+    )
+    inputs = inputs.to(device)
+
+    # Generation configuration
+    gen_kwargs = {
+        "max_new_tokens": max_new_tokens,
+        "temperature": temperature if temperature > 0 else None,
+        "do_sample": temperature > 0,
+        "top_p": 0.9 if temperature > 0 else None,
+        "num_beams": 1,
+        "use_cache": True,
+        "return_dict_in_generate": True,
+        "output_scores": True,
+    }
+
+    # Generate response
+    with torch.no_grad():
+        gen_out = model.generate(
+            **inputs,
+            eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=tokenizer.pad_token_id,
+            **gen_kwargs,
+        )
+
+    # Decode output
+    generated_ids = gen_out.sequences[0][len(inputs.input_ids[0]) :]
+    answer = processor.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+
+    # Compute confidence
+    first_box_probs = compute_first_boxed_answer_probs(
+        b=0,
+        gen_ids=generated_ids,
+        gen_out=gen_out,
+        ans=answer,
+        task="",
+        tokenizer=tokenizer,
+    )
+
+    # Parse response
+    first_answer = answer.split("<think>")[0]
+    second_answer = answer.split("</think>")[-1] if "</think>" in answer else first_answer
+    reasoning = answer.split("<think>")[-1].split("</think>")[0] if "<think>" in answer else "N/A"
+
+    # Determine inference mode
+    if first_box_probs >= early_exit_thresh:
+        need_cot = False
+        reasoning = False
+    else:
+        need_cot = True
+
+    return {
+        "full_response": answer,
+        "first_answer": first_answer,
+        "confidence": f"{first_box_probs:.4f}",
+        "need_cot": need_cot,
+        "reasoning": reasoning,
+        "second_answer": second_answer,
+    }
 
 
 # ============================================================================
@@ -361,18 +357,18 @@ def chat_generate(
 
     # Initialize system prompt
     if len(messages_state) == 0:
-        messages_state.append({"role": "system", "content":
+        messages_state.append({"role": "system", "content": COT_SYSTEM_PROMPT_ANSWER_TWICE})
 
     # Prepare user message
     content_parts = []
     if media_path is not None:
         mtype = detect_media_type(media_path)
         if mtype == "video":
-            vd =
+            vd = process_video(media_path)
             if vd:
                 content_parts.append(vd)
         elif mtype == "image":
-            imd =
+            imd = process_image(media_path)
             if imd:
                 content_parts.append(imd)
 
@@ -380,7 +376,7 @@ def chat_generate(
     messages_state.append({"role": "user", "content": content_parts})
 
     # Generate response
-    result =
+    result = generate(media_path, user_text, early_exit_thresh, temperature)
 
     # Format assistant response
     first_ans = (result.get("first_answer") or "").strip()
@@ -465,155 +461,145 @@ EXAMPLES = [
 # Gradio Interface
 # ============================================================================
 
[old lines 468-470: content lost in the rendered diff]
-with gr.Blocks(title="VideoAuto-R1 Demo") as demo:
-    gr.Markdown("# VideoAuto-R1 (Qwen3-VL-8B) Demo")
[old lines 473-501: content lost in the rendered diff]
-        temperature = gr.Slider(
-            minimum=0.0,
-            maximum=2.0,
-            value=0.0,
-            step=0.1,
-            label="Temperature",
-        )
-
-        # Right column: Chat interface
-        with gr.Column(scale=7):
-            chatbot = gr.Chatbot(
-                label="Chat",
-                elem_id="chatbot",
-                height=600,
-                sanitize_html=False,
-            )
-            textbox = gr.Textbox(
-                show_label=False,
-                placeholder="Enter text and press ENTER",
-                lines=2,
-            )
[old lines 523-546: content lost in the rendered diff]
-            early_exit_thresh,
-            temperature,
-        ],
-        outputs=[messages_state, chatbot_state, last_media_state, textbox, send_btn],
-    ).then(
-        fn=lambda cs: cs,
-        inputs=[chatbot_state],
-        outputs=[chatbot],
-    )
-
-    textbox.submit(
-        fn=chat_generate,
-        inputs=[
-            media_input,
-            textbox,
-            messages_state,
-            chatbot_state,
-            last_media_state,
-            early_exit_thresh,
-            temperature,
-        ],
-        outputs=[messages_state, chatbot_state, last_media_state, textbox, send_btn],
-    ).then(
-        fn=lambda cs: cs,
-        inputs=[chatbot_state],
-        outputs=[chatbot],
-    )
-
[old lines 576-581: content lost in the rendered diff]
-            chatbot_state,
-            last_media_state,
-            media_input,
-            image_preview,
-            video_preview,
-            textbox,
-            send_btn,
-        ],
-    ).then(
-        fn=lambda cs: cs,
-        inputs=[chatbot_state],
-        outputs=[chatbot],
-    )
-
[old lines 596-609: content lost in the rendered diff]
-if __name__ == "__main__":
-    # Initialize model
-    demo_model = Qwen3VLAutoThinkDemo()
[old lines 613-619: content lost in the rendered diff]
+demo = gr.Blocks(title="VideoAuto-R1 Demo")
+
+with demo:
+    gr.Markdown("# VideoAuto-R1 (Qwen3-VL-8B) Demo")
+
+    # Display system prompt
+    with gr.Accordion("System Prompt", open=False):
+        gr.Markdown(f"```\n{COT_SYSTEM_PROMPT_ANSWER_TWICE}\n```")
+
+    # State variables
+    messages_state = gr.State([])
+    chatbot_state = gr.State([])
+    last_media_state = gr.State(None)
+
+    with gr.Row():
+        # Left column: Media input and settings
+        with gr.Column(scale=3):
+            media_input = gr.File(
+                label="Upload Image or Video",
+                file_types=["image", "video"],
+                type="filepath",
+            )
+            image_preview = gr.Image(label="Image Preview", visible=False)
+            video_preview = gr.Video(label="Video Preview", visible=False)
+
+            with gr.Accordion("Advanced Settings", open=True):
+                early_exit_thresh = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.98,
+                    step=0.01,
+                    label="Early Exit Threshold",
+                )
+                temperature = gr.Slider(
+                    minimum=0.0,
+                    maximum=2.0,
+                    value=0.0,
+                    step=0.1,
+                    label="Temperature",
+                )
+
+        # Right column: Chat interface
+        with gr.Column(scale=7):
+            chatbot = gr.Chatbot(
+                label="Chat",
+                elem_id="chatbot",
+                height=600,
+                sanitize_html=False,
+            )
+            textbox = gr.Textbox(
+                show_label=False,
+                placeholder="Enter text and press ENTER",
+                lines=2,
+            )
+            with gr.Row():
+                send_btn = gr.Button("Send", variant="primary")
+                clear_btn = gr.Button("Clear")
+
+    gr.Markdown("Please click the **Clear** button before starting a new conversation or trying a new example.")
+
+    # Event handlers
+    media_input.change(
+        fn=update_preview,
+        inputs=[media_input],
+        outputs=[image_preview, video_preview],
+    )
+
+    # Send button click: generate response and disable input controls
+    send_btn.click(
+        fn=chat_generate,
+        inputs=[
+            media_input,
+            textbox,
+            messages_state,
+            chatbot_state,
+            last_media_state,
+            early_exit_thresh,
+            temperature,
+        ],
+        outputs=[messages_state, chatbot_state, last_media_state, textbox, send_btn],
+    ).then(
+        fn=lambda cs: cs,
+        inputs=[chatbot_state],
+        outputs=[chatbot],
+    )
+
+    # Textbox submit: generate response and disable input controls
+    textbox.submit(
+        fn=chat_generate,
+        inputs=[
+            media_input,
+            textbox,
+            messages_state,
+            chatbot_state,
+            last_media_state,
+            early_exit_thresh,
+            temperature,
+        ],
+        outputs=[messages_state, chatbot_state, last_media_state, textbox, send_btn],
+    ).then(
+        fn=lambda cs: cs,
+        inputs=[chatbot_state],
+        outputs=[chatbot],
+    )
+
+    # Clear button: reset all states and re-enable input controls
+    clear_btn.click(
+        fn=clear_history,
+        inputs=[],
+        outputs=[
+            messages_state,
+            chatbot_state,
+            last_media_state,
+            media_input,
+            image_preview,
+            video_preview,
+            textbox,
+            send_btn,
+        ],
+    ).then(
+        fn=lambda cs: cs,
+        inputs=[chatbot_state],
+        outputs=[chatbot],
+    )
+
+    gr.Examples(
+        examples=EXAMPLES,
+        inputs=[media_input, textbox],
+        label="Examples",
+        cache_examples=False,
+    )
+
+
+# Launch demo
+demo.launch(
+    share=True,
+    server_name="0.0.0.0",
+    server_port=7860,
+    allowed_paths=["assets"],
+    debug=True,
+    css=CUSTOM_CSS,
+)
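The main refactor in this commit replaces the removed Qwen3VLAutoThinkDemo class with module-level globals, which is the usual shape for a ZeroGPU Space: weights are loaded once at import time, and the GPU is attached only while a @spaces.GPU-decorated function runs (the bump from duration=120 to duration=180 simply widens that per-call reservation window). A minimal sketch of the pattern, reusing the names from the diff; the text-only ask helper is illustrative, not part of the app:

import spaces
import torch
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

MODEL_PATH = "IVUL-KAUST/VideoAuto-R1-Qwen3-VL-8B"

# Loaded once at import time; ZeroGPU permits moving weights to CUDA here.
model = (
    Qwen3VLForConditionalGeneration.from_pretrained(
        MODEL_PATH,
        dtype="bfloat16",
        attn_implementation="sdpa",
    )
    .to("cuda")
    .eval()
)
processor = AutoProcessor.from_pretrained(MODEL_PATH)


@spaces.GPU(duration=180)  # GPU is reserved only while this function executes
def ask(prompt: str) -> str:
    # "ask" is a made-up helper for illustration; the app's real entry point
    # is generate(), which also handles images/videos and confidence scoring.
    messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=64)
    return processor.decode(out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)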
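The adaptive piece of the new generate() is the early-exit rule: the system prompt (COT_SYSTEM_PROMPT_ANSWER_TWICE) makes the model answer first and reason afterwards, compute_first_boxed_answer_probs scores the confidence of that first boxed answer, and the threshold decides whether the quick answer is kept or the chain-of-thought answer is used. A toy illustration of the selection logic; pick_answer and its values are made up:

def pick_answer(first_answer: str, second_answer: str,
                confidence: float, early_exit_thresh: float = 0.98) -> tuple[str, bool]:
    """Return the chosen answer and whether chain-of-thought was needed."""
    if confidence >= early_exit_thresh:
        # Confident enough: keep the direct answer, skip the <think> block.
        return first_answer, False
    # Not confident: fall back to the answer produced after reasoning.
    return second_answer, True


print(pick_answer("cat", "dog", confidence=0.995))  # ('cat', False)
print(pick_answer("cat", "dog", confidence=0.42))   # ('dog', True)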
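All three UI handlers (send_btn.click, textbox.submit, clear_btn.click) use the same Gradio chaining idiom: the first callback updates the gr.State holders, then a chained .then(fn=lambda cs: cs, ...) mirrors the stored history into the visible Chatbot. A self-contained sketch of that idiom, assuming tuple-style chat history; the respond echo callback stands in for the app's chat_generate:

import gradio as gr

with gr.Blocks() as demo:
    chatbot_state = gr.State([])        # source of truth for the conversation
    chatbot = gr.Chatbot(label="Chat")  # what the user actually sees
    textbox = gr.Textbox(show_label=False)

    def respond(msg, history):
        # Stand-in for chat_generate: append an echo reply to the state.
        history = history + [(msg, f"echo: {msg}")]
        return history, ""  # updated state, cleared textbox

    textbox.submit(
        fn=respond,
        inputs=[textbox, chatbot_state],
        outputs=[chatbot_state, textbox],
    ).then(
        fn=lambda cs: cs,  # copy state into the visible component
        inputs=[chatbot_state],
        outputs=[chatbot],
    )

demo.launch()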