File size: 2,136 Bytes
e714957
 
 
658ebc4
e714957
 
f24f5b3
e714957
 
f24f5b3
e714957
 
 
94efef4
 
 
 
 
e714957
f24f5b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e714957
 
 
 
 
 
f4a1ca1
e714957
 
 
 
 
 
 
d9e2463
f24f5b3
d9e2463
f24f5b3
 
 
d9e2463
 
 
f24f5b3
d9e2463
 
 
f24f5b3
d9e2463
f24f5b3
 
d9e2463
 
f24f5b3
 
 
d9e2463
e714957
 
 
 
d9e2463
e714957
 
d9e2463
e714957
 
 
d9e2463
f24f5b3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# Hugging Face hub id of the BLIP large image-captioning checkpoint.
MODEL_ID = "Salesforce/blip-image-captioning-large"
# Inference is pinned to CPU; no CUDA/MPS device selection is attempted here.
DEVICE = torch.device("cpu")

# Prompt templates (kept short & stable for BLIP).
# Keys are the style names looked up by generate_caption(); values are the
# conditioning text prefixes passed to the processor.
PROMPTS = {
    "Short Caption": "a photo of",
    "Detailed Caption": "this image shows"
}

def load_model():
    """Load the BLIP captioning model and its processor.

    Downloads (or reads from cache) the checkpoint named by MODEL_ID,
    moves the model to DEVICE, and switches it to eval mode.

    Returns:
        tuple: ``(model, processor)`` ready for inference.
    """
    blip_processor = BlipProcessor.from_pretrained(MODEL_ID)
    blip_model = BlipForConditionalGeneration.from_pretrained(MODEL_ID).to(DEVICE)
    blip_model.eval()
    return blip_model, blip_processor

def _finalize_sentence(text: str) -> str:
    """
    Ensures:
    - no trailing commas / conjunctions
    - sentence ends with a dot
    """
    text = text.strip()

    # Remove dangling conjunctions
    for suffix in [",", "and", "and a", "and the"]:
        if text.lower().endswith(suffix):
            text = text[: -len(suffix)].strip()

    # Ensure final punctuation
    if not text.endswith((".", "!", "?")):
        text += "."

    return text


def generate_caption(
    model,
    processor,
    image,
    style
):
    """Generate one caption for *image* in the requested *style*.

    Looks up the conditioning prompt for *style* in PROMPTS (falling back
    to "this image shows" for unknown styles), runs beam-search decoding
    with style-specific generation settings, and post-processes the text
    into a finished sentence.

    Args:
        model: A loaded BlipForConditionalGeneration instance.
        processor: The matching BlipProcessor.
        image: Input image accepted by the processor (e.g. a PIL image).
        style: Style key, e.g. "Short Caption" or "Detailed Caption".

    Returns:
        str: The finalized caption text.
    """
    prompt_text = PROMPTS.get(style, "this image shows")

    batch = processor(
        images=image,
        text=prompt_text,
        return_tensors="pt"
    ).to(DEVICE)

    # Decoding configuration per style: the detailed preset allows longer
    # outputs with more beams and a mild length bonus.
    if style == "Detailed Caption":
        decode_cfg = {
            "min_length": 55,
            "max_length": 110,
            "num_beams": 4,
            "do_sample": False,
            "repetition_penalty": 1.25,
            "length_penalty": 1.1,
            "no_repeat_ngram_size": 3,
            "early_stopping": True,
        }
    else:  # Short Caption
        decode_cfg = {
            "min_length": 18,
            "max_length": 40,
            "num_beams": 3,
            "do_sample": False,
            "repetition_penalty": 1.15,
            "no_repeat_ngram_size": 3,
            "early_stopping": True,
        }

    with torch.inference_mode():
        token_ids = model.generate(**batch, **decode_cfg)

    raw_caption = processor.decode(token_ids[0], skip_special_tokens=True)
    return _finalize_sentence(raw_caption)