import gradio as gr import torch from transformers import BlipProcessor, BlipForConditionalGeneration from PIL import Image # Initialize the BLIP model and processor processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") model.eval() # Check if CUDA is available device = "cuda" if torch.cuda.is_available() else "cpu" model.to(device) # Define function for image captioning def analyze_scene_api(image): try: # Preprocess the image processed_image = processor(images=image, return_tensors="pt").pixel_values.to(device) with torch.no_grad(): output = model.generate(processed_image) caption = processor.decode(output[0], skip_special_tokens=True) return {"caption": caption} except Exception as e: return {"error": str(e)} # Create Gradio Interface with API mode interface = gr.Interface( fn=analyze_scene_api, inputs=gr.Image(type="pil"), outputs="json", title="BLIP API for Image Captioning", description="Send an image to get a caption response in JSON format." ) if __name__ == "__main__": # Launch Gradio interface in API mode interface.launch(server_name="0.0.0.0", server_port=7860)