KC123hello committed on
Commit
c49d4d8
·
verified ·
1 Parent(s): 7365249

Upload 2 files

Browse files
Files changed (2) hide show
  1. gradio/gradio_app.py +186 -0
  2. gradio/run_caption.py +221 -0
gradio/gradio_app.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import subprocess
3
+ import os
4
+ import tempfile
5
+ import json
6
+
7
+ def generate_caption(image, epsilon, sparsity, attack_algo, num_iters):
8
+ """
9
+ Generate caption for the uploaded image using the model in RobustMMFMEnv.
10
+
11
+ Args:
12
+ image: The uploaded image from Gradio
13
+
14
+ Returns:
15
+ tuple: (original_caption, adversarial_caption, original_image, adversarial_image, perturbation_image)
16
+ """
17
+ if image is None:
18
+ return "Please upload an image first.", "", None, None, None
19
+
20
+ try:
21
+ # Save the uploaded image to a temporary file
22
+ with tempfile.NamedTemporaryFile(mode='wb', suffix='.jpg', delete=False) as tmp_file:
23
+ tmp_image_path = tmp_file.name
24
+ # Save the image
25
+ from PIL import Image
26
+ import numpy as np
27
+
28
+ if isinstance(image, np.ndarray):
29
+ img = Image.fromarray(image)
30
+ img.save(tmp_image_path)
31
+ else:
32
+ image.save(tmp_image_path)
33
+
34
+ # Prepare the command to run in RobustMMFMEnv
35
+ # This is a placeholder - you'll need to create the actual script
36
+ conda_env = "RobustMMFMEnv"
37
+ script_path = os.path.join(os.path.dirname(__file__), "run_caption.py")
38
+
39
+ # Run the caption generation script in the RobustMMFMEnv conda environment
40
+ cmd = [
41
+ "conda", "run", "-n", conda_env,
42
+ "python", script_path,
43
+ "--image_path", tmp_image_path,
44
+ "--epsilon", str(epsilon),
45
+ "--num_iters", str(num_iters),
46
+ "--sparsity", str(sparsity),
47
+ "--attack_algo", attack_algo
48
+ ]
49
+
50
+ result = subprocess.run(
51
+ cmd,
52
+ capture_output=True,
53
+ text=True,
54
+ timeout=60 # 60 seconds timeout
55
+ )
56
+
57
+ # Clean up temporary file
58
+ os.unlink(tmp_image_path)
59
+
60
+ if result.returncode == 0:
61
+ # Parse the output
62
+ output = result.stdout.strip()
63
+ #return output if output else "No caption generated."
64
+
65
+ try:
66
+ # Parse the dictionary output
67
+ import ast
68
+ result_dict = ast.literal_eval(output)
69
+
70
+ original = result_dict.get('original_caption', '').strip()
71
+ adversarial = result_dict.get('adversarial_caption', '').strip()
72
+
73
+ orig_img_path = result_dict.get('original_image_path')
74
+ adv_img_path = result_dict.get('adversarial_image_path')
75
+ pert_img_path = result_dict.get('perturbation_image_path')
76
+
77
+ orig_image = None
78
+ adv_image = None
79
+ pert_image = None
80
+
81
+ if orig_img_path and os.path.exists(orig_img_path):
82
+ orig_image = np.array(Image.open(orig_img_path))
83
+ try:
84
+ os.unlink(orig_img_path)
85
+ except:
86
+ pass
87
+
88
+ if adv_img_path and os.path.exists(adv_img_path):
89
+ adv_image = np.array(Image.open(adv_img_path))
90
+ try:
91
+ os.unlink(adv_img_path)
92
+ except:
93
+ pass
94
+
95
+ if pert_img_path and os.path.exists(pert_img_path):
96
+ pert_image = np.array(Image.open(pert_img_path))
97
+ try:
98
+ os.unlink(pert_img_path)
99
+ except:
100
+ pass
101
+
102
+ return original, adversarial, orig_image, adv_image, pert_image # Return 5 values
103
+
104
+ except (ValueError, SyntaxError) as e:
105
+ print(f"Failed to parse output: {e}", flush=True)
106
+ # If parsing fails, try to return raw output
107
+ return f"Parse error: {str(e)}", "", None, None, None
108
+ else:
109
+ error_msg = result.stderr.strip()
110
+ return f"Error generating caption: {error_msg}", "", None, None, None
111
+
112
+ except subprocess.TimeoutExpired:
113
+ return "Error: Caption generation timed out (>60s)", "", None, None, None
114
+ except Exception as e:
115
+ return f"Error: {str(e)}", "", None, None, None
116
+
117
+ # Create the Gradio interface
118
+ with gr.Blocks(title="Image Captioning") as demo:
119
+ gr.Markdown("# Evaluating Robustness of Multimodal Models Against Adversarial Perturbations")
120
+ gr.Markdown("Upload an image to generate the adversarial image and caption using the APGD/SAIF algorithm.")
121
+
122
+ with gr.Row():
123
+ with gr.Column():
124
+ image_input = gr.Image(
125
+ label="Upload Image",
126
+ type="numpy"
127
+ )
128
+
129
+ attack_algo = gr.Dropdown(
130
+ choices=["APGD", "SAIF"],
131
+ value="APGD",
132
+ label="Adversarial Attack Algorithm",
133
+ interactive=True
134
+ )
135
+
136
+ epsilon = gr.Slider(
137
+ minimum=1, maximum=255, value=8, step=1, interactive=True,
138
+ label="Epsilon (max perturbation, 0-255 scale)"
139
+ )
140
+ sparsity = gr.Slider(
141
+ minimum=0, maximum=10000, value=0, step=100, interactive=True,
142
+ label="Sparsity (L1 norm of the perturbation, for SAIF only)"
143
+ )
144
+ num_iters = gr.Slider(
145
+ minimum=1, maximum=100, value=8, step=1, interactive=True,
146
+ label="Number of Iterations"
147
+ )
148
+
149
+ with gr.Row():
150
+ with gr.Column():
151
+ generate_btn = gr.Button("Generate Captions", variant="primary")
152
+
153
+ with gr.Row():
154
+ with gr.Column():
155
+ orig_image_output = gr.Image(label="Original Image")
156
+ orig_caption_output = gr.Textbox(
157
+ label="Generated Original Caption",
158
+ lines=5,
159
+ placeholder="Caption will appear here..."
160
+ )
161
+ with gr.Column():
162
+ pert_image_output = gr.Image(label="Perturbation (10x magnified)")
163
+ with gr.Column():
164
+ adv_image_output = gr.Image(label="Adversarial Image")
165
+ adv_caption_output = gr.Textbox(
166
+ label="Generated Adversarial Caption",
167
+ lines=5,
168
+ placeholder="Caption will appear here..."
169
+ )
170
+
171
+ # Set up the button click event
172
+ generate_btn.click(
173
+ fn=generate_caption,
174
+ inputs=[image_input, epsilon, sparsity, attack_algo, num_iters],
175
+ outputs=[orig_caption_output, adv_caption_output, orig_image_output, adv_image_output, pert_image_output]
176
+ )
177
+
178
+
179
+ if __name__ == "__main__":
180
+ demo.launch(
181
+ server_name="0.0.0.0",
182
+ server_port=7860,
183
+ share=True,
184
+ debug=True,
185
+ show_error=True
186
+ )
gradio/run_caption.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Script to generate captions for images using the VLM model.
3
+ This script runs in the RobustMMFMEnv conda environment.
4
+ """
5
+
6
+ import argparse
7
+ import sys
8
+ import os
9
+ import warnings
10
+
11
+
12
+ warnings.filterwarnings('ignore')
13
+
14
+
15
+ # Add the parent directory to the path to import vlm_eval modules
16
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
17
+
18
def generate_caption(image_path, epsilon, sparsity, attack_algo, num_iters, model_name="open_flamingo", num_shots=0, targeted=False):
    """
    Generate original and adversarial captions for a single image.

    Args:
        image_path: Path to the image file.
        epsilon: Max perturbation magnitude on a 0-255 scale.
        sparsity: L1 budget for SAIF (ignored by APGD).
        attack_algo: "APGD" or "SAIF".
        num_iters: Number of attack iterations.
        model_name: Name of the model to use (currently unused).
        num_shots: Number of shots for few-shot learning (currently unused).
        targeted: If True, attack towards the fixed target string instead of
            away from the original caption.

    Returns:
        dict: Captions plus temp-file paths of the original, adversarial and
        (10x magnified) perturbation images. On failure, a dict whose
        "original_caption" carries the error message and whose paths are None.
    """
    try:
        # Heavy imports stay inside the function so any import failure is
        # reported through the error-dict path below.
        from PIL import Image
        import torch
        import numpy as np
        import tempfile
        from open_flamingo.eval.models.of_eval_model_adv import EvalModelAdv
        from open_flamingo.eval.coco_metric import postprocess_captioning_generation
        from vlm_eval.attacks.apgd import APGD
        from vlm_eval.attacks.saif import SAIF

        # Model arguments
        model_args = {
            "lm_path": "togethercomputer/RedPajama-INCITE-Base-3B-v1",
            "lm_tokenizer_path": "togethercomputer/RedPajama-INCITE-Base-3B-v1",
            "vision_encoder_path": "ViT-L-14",
            "vision_encoder_pretrained": "openai",
            "checkpoint_path": "/home/kc/.cache/huggingface/hub/models--openflamingo--OpenFlamingo-4B-vitl-rpj3b/snapshots/df8d3f7e75bcf891ce2fbf5253a12f524692d9c2/checkpoint.pt",
            "cross_attn_every_n_layers": "2",
            "precision": "float16",
        }

        eval_model = EvalModelAdv(model_args, adversarial=True)
        eval_model.set_device(0 if torch.cuda.is_available() else -1)

        image = Image.open(image_path).convert("RGB")
        image = eval_model._prepare_images([[image]])

        prompt = eval_model.get_caption_prompt()

        # Caption the clean image first.
        orig_caption = eval_model.get_outputs(
            batch_images=image,
            batch_text=[prompt],  # batch of size 1
            min_generation_length=0,
            max_generation_length=20,
            num_beams=3,
            length_penalty=-2.0,
        )

        # Build the text the attack optimizes against.
        # BUG FIX: `targeted` was previously reset to False unconditionally
        # here, which made the parameter dead; the caller's value is now honored.
        target_str = "a dog"  # target caption used when targeted=True
        adv_caption = target_str if targeted else orig_caption[0]
        prompt_adv = eval_model.get_caption_prompt(adv_caption)

        # Bind the adversarial prompt to the model before running the attack.
        eval_model.set_inputs(
            batch_text=[prompt_adv],
            past_key_values=None,
            to_device=True,
        )

        # Run the requested attack.
        if attack_algo == "APGD":
            attack = APGD(
                eval_model if not targeted else lambda x: -eval_model(x),
                norm="linf",
                eps=epsilon / 255.0,
                mask_out=None,
                initial_stepsize=1.0,
            )
            adv_image = attack.perturb(
                image.to(eval_model.device, dtype=eval_model.cast_dtype),
                iterations=num_iters,
                pert_init=None,
                verbose=False,
            )
        elif attack_algo == "SAIF":
            attack = SAIF(
                model=eval_model,
                targeted=targeted,
                img_range=(0, 1),
                steps=num_iters,
                mask_out=None,
                eps=epsilon / 255.0,
                k=sparsity,
                ver=False
            )
            adv_image, _ = attack(
                x=image.to(eval_model.device, dtype=eval_model.cast_dtype),
            )
        else:
            raise ValueError(f"Unsupported attack algorithm: {attack_algo}")

        adv_image = adv_image.detach().cpu()

        # Caption the adversarial image using the clean prompt.
        adv_caption_output = eval_model.get_outputs(
            batch_images=adv_image,
            batch_text=[prompt],
            min_generation_length=0,
            max_generation_length=20,
            num_beams=3,
            length_penalty=-2.0,
        )
        new_predictions = [
            postprocess_captioning_generation(out).replace('"', "") for out in adv_caption_output
        ]

        # NOTE(review): assumes the vision input is 3x224x224 (ViT-L-14
        # default) — confirm if the vision encoder ever changes.
        orig_img_np = image.view(1, 3, 224, 224).squeeze(0).cpu().permute(1, 2, 0).numpy()
        adv_img_np = adv_image.view(1, 3, 224, 224).squeeze(0).cpu().permute(1, 2, 0).numpy()

        # Perturbation (adversarial minus original), magnified 10x for display.
        perturbation_magnified = (adv_img_np - orig_img_np) * 10

        def _to_uint8(arr):
            # Min-max normalize to [0, 255] for display.
            # BUG FIX: guard against a constant array (e.g. an all-zero
            # perturbation), which previously divided by zero.
            lo, hi = arr.min(), arr.max()
            if hi == lo:
                return np.zeros(arr.shape, dtype=np.uint8)
            return ((arr - lo) / (hi - lo) * 255).astype(np.uint8)

        orig_img_np = _to_uint8(orig_img_np)
        adv_img_np = _to_uint8(adv_img_np)
        pert_img_np = _to_uint8(perturbation_magnified)

        # Save images to temporary files; the caller (gradio_app) deletes them.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.png', delete=False) as f:
            orig_img_path = f.name
        Image.fromarray(orig_img_np).save(orig_img_path)

        with tempfile.NamedTemporaryFile(mode='wb', suffix='.png', delete=False) as f:
            adv_img_path = f.name
        Image.fromarray(adv_img_np).save(adv_img_path)

        with tempfile.NamedTemporaryFile(mode='wb', suffix='.png', delete=False) as f:
            pert_img_path = f.name
        Image.fromarray(pert_img_np).save(pert_img_path)

        return {
            "original_caption": orig_caption[0],
            "adversarial_caption": new_predictions[0],
            "original_image_path": orig_img_path,  # Return file paths
            "adversarial_image_path": adv_img_path,
            "perturbation_image_path": pert_img_path
        }

    except Exception as e:
        import traceback
        error_msg = f"Error in caption generation: {str(e)}\n{traceback.format_exc()}"
        print(error_msg, file=sys.stderr, flush=True)
        # Return a dict with error information so the caller can still parse
        # a consistent structure from stdout.
        return {
            "original_caption": f"Error: {str(e)}",
            "adversarial_caption": "",
            "original_image_path": None,
            "adversarial_image_path": None,
            "perturbation_image_path": None
        }
196
+
197
def main():
    """CLI entry point: parse arguments, run the attack, print the result dict."""
    parser = argparse.ArgumentParser(description="Generate caption for an image")
    parser.add_argument("--image_path", type=str, required=True, help="Path to the image")
    parser.add_argument("--model", type=str, default="open_flamingo", help="Model to use")
    parser.add_argument("--shots", type=int, default=0, help="Number of shots")
    parser.add_argument("--epsilon", type=float, default=8.0, help="Epsilon for adversarial attack")
    parser.add_argument("--sparsity", type=int, default=0, help="Sparsity for SAIF attack")
    parser.add_argument("--attack_algo", type=str, default="APGD", help="Adversarial attack algorithm (APGD or SAIF)")
    parser.add_argument("--num_iters", type=int, default=100, help="Number of iterations for adversarial attack")
    args = parser.parse_args()

    caption = generate_caption(
        args.image_path,
        args.epsilon,
        args.sparsity,
        args.attack_algo,
        args.num_iters,
        args.model,
        args.shots,
    )

    # Guard clause: bail out with a non-zero status if nothing came back.
    if not caption:
        print("Failed to generate caption", file=sys.stderr)
        sys.exit(1)

    # The parent process (gradio_app) parses this printed dict from stdout.
    print(caption)
    sys.exit(0)


if __name__ == "__main__":
    main()