Spaces:
Running
Running
| import torch | |
| from transformers import BlipProcessor, BlipForConditionalGeneration | |
| MODEL_ID = "Salesforce/blip-image-captioning-large" | |
| DEVICE = torch.device("cpu") | |
| # Prompt templates (kept short & stable for BLIP) | |
| PROMPTS = { | |
| "Short Caption": "a photo of", | |
| "Detailed Caption": "this image shows" | |
| } | |
| def load_model(): | |
| processor = BlipProcessor.from_pretrained(MODEL_ID) | |
| model = BlipForConditionalGeneration.from_pretrained(MODEL_ID) | |
| model.to(DEVICE) | |
| model.eval() | |
| return model, processor | |
| def _finalize_sentence(text: str) -> str: | |
| """ | |
| Ensures: | |
| - no trailing commas / conjunctions | |
| - sentence ends with a dot | |
| """ | |
| text = text.strip() | |
| # Remove dangling conjunctions | |
| for suffix in [",", "and", "and a", "and the"]: | |
| if text.lower().endswith(suffix): | |
| text = text[: -len(suffix)].strip() | |
| # Ensure final punctuation | |
| if not text.endswith((".", "!", "?")): | |
| text += "." | |
| return text | |
| def generate_caption( | |
| model, | |
| processor, | |
| image, | |
| style | |
| ): | |
| prompt = PROMPTS.get(style, "this image shows") | |
| inputs = processor( | |
| images=image, | |
| text=prompt, | |
| return_tensors="pt" | |
| ).to(DEVICE) | |
| # Style-specific decoding configuration | |
| if style == "Detailed Caption": | |
| generation_kwargs = dict( | |
| min_length=55, | |
| max_length=110, | |
| num_beams=4, | |
| do_sample=False, | |
| repetition_penalty=1.25, | |
| length_penalty=1.1, | |
| no_repeat_ngram_size=3, | |
| early_stopping=True | |
| ) | |
| else: # Short Caption | |
| generation_kwargs = dict( | |
| min_length=18, | |
| max_length=40, | |
| num_beams=3, | |
| do_sample=False, | |
| repetition_penalty=1.15, | |
| no_repeat_ngram_size=3, | |
| early_stopping=True | |
| ) | |
| with torch.inference_mode(): | |
| output_ids = model.generate( | |
| **inputs, | |
| **generation_kwargs | |
| ) | |
| caption = processor.decode( | |
| output_ids[0], | |
| skip_special_tokens=True | |
| ) | |
| return _finalize_sentence(caption) |