RohitCSharp committed on
Commit 23dfb62 · verified · 1 Parent(s): 42db30a

Update app.py

Files changed (1)
  1. app.py +37 -18
app.py CHANGED
@@ -1,26 +1,45 @@
  import gradio as gr
- from transformers import CLIPProcessor, CLIPModel
+ from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModelForCausalLM
  from PIL import Image
  import torch
 
- # Load CLIP
+ # Load CLIP model
  clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
  clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
- # Prompt template
+ # Load GPT-2 (or any captioning LLM)
+ lm_tokenizer = AutoTokenizer.from_pretrained("gpt2")
+ lm_model = AutoModelForCausalLM.from_pretrained("gpt2")
+
  def generate_caption(image):
-     inputs = clip_processor(images=image, return_tensors="pt")
-     outputs = clip_model.get_image_features(**inputs)
-
-     # Convert image features into a dummy "caption" using top concept labels
-     # (In actual implementation, this could be passed to GPT-like models)
-     # Here we simulate a caption
-     return "A photo showing something relevant to the content."
-
- demo = gr.Interface(fn=generate_caption,
-                     inputs=gr.Image(type="pil"),
-                     outputs="text",
-                     title="Image Captioning with CLIP & GPT-style Generation",
-                     description="Upload an image to get a descriptive caption. Based on CLIP for vision understanding.")
-
- demo.launch()
+     if image is None:
+         return "No image uploaded."
+
+     # Create basic prompt ideas for CLIP to compare
+     concepts = ["cat", "dog", "person", "landscape", "food", "technology", "vehicle", "building", "nature"]
+     prompts = [f"an image of a {c}" for c in concepts]
+
+     # Use CLIP to find the best concept
+     inputs = clip_processor(text=prompts, images=image, return_tensors="pt", padding=True)
+     outputs = clip_model(**inputs)
+     probs = outputs.logits_per_image.softmax(dim=1)
+     best = torch.argmax(probs).item()
+     selected_concept = concepts[best]
+
+     # Use the concept as seed for GPT caption generation
+     gpt_prompt = f"This is an image of a {selected_concept}. It shows"
+     input_ids = lm_tokenizer.encode(gpt_prompt, return_tensors="pt")
+     gpt_output = lm_model.generate(input_ids, max_length=30, do_sample=True, top_k=50, top_p=0.95)
+     generated_text = lm_tokenizer.decode(gpt_output[0], skip_special_tokens=True)
+
+     return generated_text
+
+ iface = gr.Interface(
+     fn=generate_caption,
+     inputs=gr.Image(type="pil"),
+     outputs=gr.Textbox(label="Generated Caption"),
+     title="Image Captioning with CLIP + GPT",
+     description="CLIP guesses image context, GPT generates free-text caption."
+ )
+
+ iface.launch()
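
For anyone sanity-checking the scoring step added in this update, here is a small standalone sketch (not part of the commit) that prints CLIP's zero-shot probabilities for the same concept list generate_caption uses to pick its seed word; the image path "sample.jpg" is a hypothetical placeholder.

# Sketch: inspect CLIP's zero-shot concept scores for one image, i.e. the
# step the updated generate_caption() runs before handing a concept to GPT-2.
# "sample.jpg" is a hypothetical local test image.
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

concepts = ["cat", "dog", "person", "landscape", "food",
            "technology", "vehicle", "building", "nature"]
image = Image.open("sample.jpg").convert("RGB")  # hypothetical path

inputs = clip_processor(text=[f"an image of a {c}" for c in concepts],
                        images=image, return_tensors="pt", padding=True)
with torch.no_grad():
    probs = clip_model(**inputs).logits_per_image.softmax(dim=1).squeeze(0)

# Print each concept with its probability, highest first
for p, c in sorted(zip(probs.tolist(), concepts), reverse=True):
    print(f"{c:<12} {p:.3f}")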