Update app.py
app.py CHANGED
@@ -1,45 +1,45 @@
 import gradio as gr
-from transformers import CLIPProcessor, CLIPModel
+from transformers import CLIPProcessor, CLIPModel
 from PIL import Image
 import torch
 
-
-
-clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
-
-# Load GPT-2 (or any captioning LLM)
-lm_tokenizer = AutoTokenizer.from_pretrained("gpt2")
-lm_model = AutoModelForCausalLM.from_pretrained("gpt2")
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
 def generate_caption(image):
     if image is None:
         return "No image uploaded."
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Candidate text prompts
+    texts = [
+        "a photo of a cat",
+        "a photo of a dog",
+        "a photo of a man",
+        "a photo of a woman",
+        "a photo of a laptop",
+        "a photo of a smartphone",
+        "a photo of a city",
+        "a photo of a landscape",
+        "a photo of food",
+        "a photo of a car"
+    ]
+
+    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
+    outputs = model(**inputs)
+
+    logits_per_image = outputs.logits_per_image  # image-text similarity scores
+    probs = logits_per_image.softmax(dim=1)  # convert to probabilities
+
+    best_match = torch.argmax(probs).item()
+    caption = texts[best_match]
+    return f"Best match: {caption} (Confidence: {probs[0][best_match].item():.2f})"
 
 iface = gr.Interface(
     fn=generate_caption,
     inputs=gr.Image(type="pil"),
     outputs=gr.Textbox(label="Generated Caption"),
-    title="Image Captioning with CLIP
-    description="
+    title="Image Captioning with CLIP",
+    description="Upload an image and get a dynamically generated caption using CLIP."
 )
 
 iface.launch()
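The updated generate_caption performs zero-shot matching with CLIP rather than free-form caption generation: the image is scored against each candidate prompt, logits_per_image holds the image-text similarity scores, and softmax turns them into probabilities, of which the highest-scoring prompt is reported. Below is a minimal sketch of the same scoring outside the Gradio app; the shortened prompt list and the example.jpg path are placeholders, not part of the Space.

import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Shortened candidate prompts (placeholder; the app uses a longer list)
texts = ["a photo of a cat", "a photo of a dog", "a photo of a city"]

image = Image.open("example.jpg")  # placeholder image path

with torch.no_grad():  # inference only, no gradients needed
    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
    outputs = model(**inputs)

# logits_per_image has shape (num_images, num_texts); softmax over the text axis
probs = outputs.logits_per_image.softmax(dim=1)

# Rank all candidate prompts for the single input image
top = torch.topk(probs[0], k=len(texts))
for score, idx in zip(top.values.tolist(), top.indices.tolist()):
    print(f"{texts[idx]}: {score:.2f}")

Running python app.py after this change launches the same logic behind the Gradio interface defined at the bottom of the file.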