Style Changes

- core/model_loader.py  +34 -25
- ui/layout.py  +5 -7
core/model_loader.py
CHANGED

@@ -4,14 +4,10 @@ from transformers import BlipProcessor, BlipForConditionalGeneration
 MODEL_ID = "Salesforce/blip-image-captioning-large"
 DEVICE = torch.device("cpu")
 
-# Prompt templates
+# Prompt templates (kept short & stable for BLIP)
 PROMPTS = {
     "Short Caption": "a photo of",
-    "Detailed Caption": "this image shows",
-    "Creative Caption": "this artistic scene depicts",
-    "Image Explanation": (
-        "this image shows a complete and detailed scene depicting"
-    )
+    "Detailed Caption": "this image shows"
 }
 
 def load_model():
@@ -21,6 +17,26 @@ def load_model():
     model.eval()
     return model, processor
 
+def _finalize_sentence(text: str) -> str:
+    """
+    Ensures:
+    - no trailing commas / conjunctions
+    - sentence ends with a dot
+    """
+    text = text.strip()
+
+    # Remove dangling conjunctions
+    for suffix in [",", "and", "and a", "and the"]:
+        if text.lower().endswith(suffix):
+            text = text[: -len(suffix)].strip()
+
+    # Ensure final punctuation
+    if not text.endswith((".", "!", "?")):
+        text += "."
+
+    return text
+
+
 def generate_caption(
     model,
     processor,
@@ -36,34 +52,27 @@ def generate_caption(
     ).to(DEVICE)
 
     # Style-specific decoding configuration
-    if style == "
+    if style == "Detailed Caption":
         generation_kwargs = dict(
-            min_length=
-            max_length=
-            num_beams=
+            min_length=55,
+            max_length=110,
+            num_beams=4,
             do_sample=False,
             repetition_penalty=1.25,
             length_penalty=1.1,
+            no_repeat_ngram_size=3,
             early_stopping=True
         )
 
-
+    else:  # Short Caption
         generation_kwargs = dict(
-            min_length=
-            max_length=
+            min_length=18,
+            max_length=40,
             num_beams=3,
             do_sample=False,
-            repetition_penalty=1.
-
-
-    else:
-        generation_kwargs = dict(
-            min_length=20,
-            max_length=50,
-            do_sample=True,
-            top_p=0.9,
-            temperature=0.8,
-            repetition_penalty=1.1
+            repetition_penalty=1.15,
+            no_repeat_ngram_size=3,
+            early_stopping=True
        )
 
     with torch.inference_mode():
@@ -77,4 +86,4 @@ def generate_caption(
         skip_special_tokens=True
     )
 
-    return caption
+    return _finalize_sentence(caption)
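For reference, this is how the two surviving decoding branches might be exercised end to end. A minimal sketch, assuming generate_caption takes a PIL image after model and processor, followed by the style name (the full parameter list is cut off in the hunk above):

from PIL import Image

from core.model_loader import load_model, generate_caption

model, processor = load_model()
image = Image.open("./assets/fridge.jpg").convert("RGB")

# "Detailed Caption" takes the deterministic beam-search path:
# 4 beams, a 55-110 token window, no_repeat_ngram_size=3
detailed = generate_caption(model, processor, image, style="Detailed Caption")

# "Short Caption" replaces the old top_p/temperature sampling fallback
# with a shorter beam search: 3 beams, 18-40 tokens, both deterministic now
short = generate_caption(model, processor, image, style="Short Caption")

print(short)
print(detailed)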
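The new _finalize_sentence helper is pure string handling, so its effect on captions cut short by max_length can be checked in isolation (expected output in the trailing comments):

from core.model_loader import _finalize_sentence

# A dangling conjunction left by truncation is trimmed, then a period is appended
print(_finalize_sentence("a kitchen with a fridge and"))  # a kitchen with a fridge.

# A trailing comma is handled the same way
print(_finalize_sentence("two zebras grazing,"))          # two zebras grazing.

# Text already ending in ., ! or ? passes through unchanged
print(_finalize_sentence("a giraffe in a field."))        # a giraffe in a field.

One caveat: endswith matches raw suffixes, so a caption ending in a word such as "band" would also lose its final "and"; checking for a word boundary (e.g. endswith(" and")) would be stricter.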
ui/layout.py
CHANGED

@@ -47,9 +47,7 @@ def build_ui(model, processor):
         style_select = gr.Dropdown(
             choices=[
                 "Short Caption",
-                "Detailed Caption",
-                "Creative Caption",
-                "Image Explanation"
+                "Detailed Caption"
             ],
             value="Detailed Caption",
             label="Caption Style"
@@ -69,11 +67,11 @@ def build_ui(model, processor):
 
     gr.Examples(
         examples=[
-            ["./assets/zebra.jpg", "
-            ["./assets/cat.jpg", "
+            ["./assets/zebra.jpg", "Short Caption"],
+            ["./assets/cat.jpg", "Short Caption"],
             ["./assets/fridge.jpg", "Detailed Caption"],
-            ["./assets/marriage.jpg", "
-            ["./assets/giraffe.jpg", "
+            ["./assets/marriage.jpg", "Detailed Caption"],
+            ["./assets/giraffe.jpg", "Detailed Caption"]
         ],
         inputs=[image_input, style_select]
     )
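For completeness, a hypothetical sketch of the Space entry point that wires the two modules together. Neither app.py nor the return type of build_ui appears in this diff, so both are assumptions:

# app.py (hypothetical; not part of this commit)
from core.model_loader import load_model
from ui.layout import build_ui

model, processor = load_model()      # CPU-only, per DEVICE in model_loader

demo = build_ui(model, processor)    # assumed to return a gr.Blocks
demo.launch()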