RohitCSharp committed on
Commit 209652e · verified · 1 Parent(s): 23dfb62

Update app.py

Files changed (1)
  1. app.py +28 -28
app.py CHANGED
@@ -1,45 +1,45 @@
 import gradio as gr
-from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModelForCausalLM
+from transformers import CLIPProcessor, CLIPModel
 from PIL import Image
 import torch
 
-# Load CLIP model
-clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
-clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
-
-# Load GPT-2 (or any captioning LLM)
-lm_tokenizer = AutoTokenizer.from_pretrained("gpt2")
-lm_model = AutoModelForCausalLM.from_pretrained("gpt2")
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
 def generate_caption(image):
     if image is None:
         return "No image uploaded."
 
-    # Create basic prompt ideas for CLIP to compare
-    concepts = ["cat", "dog", "person", "landscape", "food", "technology", "vehicle", "building", "nature"]
-    prompts = [f"an image of a {c}" for c in concepts]
-
-    # Use CLIP to find the best concept
-    inputs = clip_processor(text=prompts, images=image, return_tensors="pt", padding=True)
-    outputs = clip_model(**inputs)
-    probs = outputs.logits_per_image.softmax(dim=1)
-    best = torch.argmax(probs).item()
-    selected_concept = concepts[best]
-
-    # Use the concept as seed for GPT caption generation
-    gpt_prompt = f"This is an image of a {selected_concept}. It shows"
-    input_ids = lm_tokenizer.encode(gpt_prompt, return_tensors="pt")
-    gpt_output = lm_model.generate(input_ids, max_length=30, do_sample=True, top_k=50, top_p=0.95)
-    generated_text = lm_tokenizer.decode(gpt_output[0], skip_special_tokens=True)
-
-    return generated_text
+    # Candidate text prompts
+    texts = [
+        "a photo of a cat",
+        "a photo of a dog",
+        "a photo of a man",
+        "a photo of a woman",
+        "a photo of a laptop",
+        "a photo of a smartphone",
+        "a photo of a city",
+        "a photo of a landscape",
+        "a photo of food",
+        "a photo of a car"
+    ]
+
+    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
+    outputs = model(**inputs)
+
+    logits_per_image = outputs.logits_per_image  # image-text similarity scores
+    probs = logits_per_image.softmax(dim=1)      # convert to probabilities
+
+    best_match = torch.argmax(probs).item()
+    caption = texts[best_match]
+    return f"Best match: {caption} (Confidence: {probs[0][best_match].item():.2f})"
 
 iface = gr.Interface(
     fn=generate_caption,
     inputs=gr.Image(type="pil"),
     outputs=gr.Textbox(label="Generated Caption"),
-    title="Image Captioning with CLIP + GPT",
-    description="CLIP guesses image context, GPT generates free-text caption."
+    title="Image Captioning with CLIP",
+    description="Upload an image and get a dynamically generated caption using CLIP."
 )
 
 iface.launch()
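
For reference, the updated app.py performs zero-shot classification with CLIP: the uploaded image is scored against a fixed list of text prompts and the highest-probability prompt is returned as the caption. A minimal standalone sketch of that scoring step, outside Gradio and assuming a hypothetical local image example.jpg with a shortened prompt list, could look like:

    from PIL import Image
    import torch
    from transformers import CLIPModel, CLIPProcessor

    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    image = Image.open("example.jpg")  # hypothetical local image path
    texts = ["a photo of a cat", "a photo of a dog", "a photo of a car"]

    # Encode prompts and image together; CLIP produces image-text similarity logits
    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    probs = outputs.logits_per_image.softmax(dim=1)  # shape: (1, len(texts))
    best = probs.argmax(dim=1).item()
    print(f"Best match: {texts[best]} (Confidence: {probs[0][best].item():.2f})")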