Spaces:
Runtime error
Runtime error
File size: 6,163 Bytes
eb5a9e1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
"""
Caption Generator Plugin
Generates descriptive captions for images using BLIP-2.
"""
from typing import Dict, Any
from pathlib import Path
import numpy as np
from PIL import Image
from loguru import logger
from plugins.base import BasePlugin, PluginMetadata
class CaptionGeneratorPlugin(BasePlugin):
    """
    Generate captions for images using BLIP-2.

    Creates natural language descriptions of image content. If the BLIP-2
    model cannot be loaded, falls back to the smaller BLIP base model.
    """

    def __init__(self):
        """Initialize CaptionGeneratorPlugin with no model loaded yet."""
        super().__init__()
        self.model = None       # captioning model, loaded lazily in initialize()
        self.processor = None   # matching processor/tokenizer for the model
        self.max_length = 50    # default maximum caption length passed to generate()

    @property
    def metadata(self) -> PluginMetadata:
        """Return plugin metadata."""
        return PluginMetadata(
            name="caption_generator",
            version="0.1.0",
            description="Generates image captions using BLIP-2",
            author="AI Dev Collective",
            requires=["transformers", "torch"],
            category="captioning",
            priority=20,
        )

    def initialize(self) -> None:
        """Load the BLIP-2 model, falling back to BLIP base on failure.

        Sets ``self.processor``, ``self.model`` and ``self._initialized``
        on success.

        Raises:
            Exception: When both the BLIP-2 and the BLIP base fallback
                loads fail (the fallback error is re-raised).
        """
        try:
            # Import here to avoid loading transformers if the plugin
            # is never used.
            from transformers import (
                Blip2Processor,
                Blip2ForConditionalGeneration
            )

            logger.info("Loading BLIP-2 model...")
            # Use smaller BLIP-2 model for faster inference
            model_name = "Salesforce/blip2-opt-2.7b"

            # Load processor and model
            self.processor = Blip2Processor.from_pretrained(model_name)
            self.model = Blip2ForConditionalGeneration.from_pretrained(
                model_name
            )

            # Set to eval mode
            self.model.eval()

            # Move to CPU (GPU support can be added later)
            device = "cpu"
            self.model.to(device)

            self._initialized = True
            logger.info(
                f"BLIP-2 model loaded successfully on {device}"
            )
        except Exception as e:
            logger.error(f"Failed to initialize CaptionGeneratorPlugin: {e}")
            # Discard any partially-loaded BLIP-2 state (e.g. processor
            # assigned but model load failed) before trying the fallback,
            # so we never end up with a mismatched processor/model pair.
            self.model = None
            self.processor = None
            # Fallback: try smaller BLIP model
            try:
                logger.info("Trying smaller BLIP model...")
                from transformers import BlipProcessor, BlipForConditionalGeneration

                model_name = "Salesforce/blip-image-captioning-base"
                self.processor = BlipProcessor.from_pretrained(model_name)
                self.model = BlipForConditionalGeneration.from_pretrained(
                    model_name
                )
                self.model.eval()
                self.model.to("cpu")

                self._initialized = True
                logger.info("BLIP base model loaded successfully")
            except Exception as fallback_error:
                logger.error(f"Fallback also failed: {fallback_error}")
                raise

    @staticmethod
    def _to_pil(media: Any) -> Image.Image:
        """Coerce *media* (PIL Image or numpy array) into a PIL Image.

        Float-like arrays whose values lie in [0, 1] are rescaled to
        [0, 255]. Values are clipped before the uint8 cast so that
        out-of-range inputs saturate instead of wrapping around.

        Args:
            media: PIL Image (returned as-is) or numpy array.

        Returns:
            PIL Image suitable for the captioning processor.
        """
        if not isinstance(media, np.ndarray):
            return media
        arr = media
        # Guard .size: calling .max() on an empty array raises ValueError.
        if arr.size and arr.max() <= 1:
            arr = arr * 255
        # Clip to the valid byte range to avoid uint8 wrap-around on
        # out-of-range values.
        arr = np.clip(arr, 0, 255).astype(np.uint8)
        return Image.fromarray(arr)

    def _generate_caption(
        self,
        image: Image.Image,
        max_length: int = 50
    ) -> str:
        """
        Generate caption for image.

        Args:
            image: PIL Image
            max_length: Maximum caption length

        Returns:
            Generated caption string (stripped of surrounding whitespace).
        """
        import torch

        # Prepare inputs
        inputs = self.processor(
            images=image,
            return_tensors="pt"
        )

        # Generate caption; no_grad avoids building autograd state
        # during pure inference.
        with torch.no_grad():
            generated_ids = self.model.generate(
                **inputs,
                max_length=max_length,
                num_beams=5,
                early_stopping=True
            )

        # Decode caption
        caption = self.processor.decode(
            generated_ids[0],
            skip_special_tokens=True
        )

        return caption.strip()

    def analyze(
        self,
        media: Any,
        media_path: Path
    ) -> Dict[str, Any]:
        """
        Generate caption for the image.

        Args:
            media: PIL Image or numpy array
            media_path: Path to image file (unused here; part of the
                plugin interface)

        Returns:
            Dictionary with the caption and simple statistics on success
            (``status == "success"``), or an error dictionary
            (``status == "failed"`` or ``"error"`` key) on failure.
        """
        try:
            # Lazily load the model on first use.
            if not self._initialized:
                self.initialize()

            # Validate input
            if not self.validate_input(media):
                return {"error": "Invalid input type"}

            # Convert to PIL Image if numpy array
            image = self._to_pil(media)

            # Generate caption
            caption = self._generate_caption(image, self.max_length)

            result = {
                "caption": caption,
                "word_count": len(caption.split()),
                "character_count": len(caption),
                "max_length": self.max_length,
                "status": "success",
            }

            # Only add an ellipsis when the preview is actually truncated.
            preview = caption if len(caption) <= 50 else caption[:50] + "..."
            logger.debug(f"Caption generated: '{preview}'")
            return result
        except Exception as e:
            logger.error(f"Caption generation failed: {e}")
            return {
                "error": str(e),
                "status": "failed"
            }

    def cleanup(self) -> None:
        """Clean up model resources so they can be garbage-collected."""
        if self.model is not None:
            del self.model
            self.model = None
        if self.processor is not None:
            del self.processor
            self.processor = None
        logger.info("CaptionGeneratorPlugin cleanup complete")
|