Yatheshr committed on
Commit
02f82f5
·
verified ·
1 Parent(s): 85e5aa2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -13
app.py CHANGED
@@ -1,16 +1,22 @@
1
- # 1. Import Libraries
 
 
 
2
  import gradio as gr
3
  from transformers import CLIPProcessor, CLIPModel
4
  from PIL import Image
5
  import torch
6
 
7
- # 2. Load the Pre-trained Model
8
  model_name = "openai/clip-vit-base-patch16"
9
  processor = CLIPProcessor.from_pretrained(model_name)
10
  model = CLIPModel.from_pretrained(model_name)
11
 
12
- # 3. Define the Prediction Function
13
  def classify_image_text(image, text):
 
 
 
14
  # Process the inputs
15
  inputs = processor(text=[text], images=image, return_tensors="pt", padding=True)
16
 
@@ -19,19 +25,26 @@ def classify_image_text(image, text):
19
  outputs = model(**inputs)
20
 
21
  # Calculate similarity between image and text
22
- logits_per_image = outputs.logits_per_image
23
- probs = logits_per_image.softmax(dim=1) # Convert logits to probabilities
 
24
 
25
- # Return the prediction
26
- return {text: probs.item()}
 
27
 
28
- # 4. Create the Gradio Interface
29
  iface = gr.Interface(
30
  fn=classify_image_text,
31
- inputs=[gr.Image(type="pil"), gr.Textbox(label="Enter description")],
32
- outputs=gr.Label(),
33
- live=True
 
 
 
 
 
34
  )
35
 
36
- # 5. Launch the App
37
- iface.launch()
 
1
+ # 1. Install Required Libraries (run this in terminal or notebook once)
2
+ # pip install gradio transformers torch torchvision pillow
3
+
4
+ # 2. Import Libraries
5
  import gradio as gr
6
  from transformers import CLIPProcessor, CLIPModel
7
  from PIL import Image
8
  import torch
9
 
10
+ # 3. Load the Pre-trained Model
11
  model_name = "openai/clip-vit-base-patch16"
12
  processor = CLIPProcessor.from_pretrained(model_name)
13
  model = CLIPModel.from_pretrained(model_name)
14
 
15
+ # 4. Define the Prediction Function
16
  def classify_image_text(image, text):
17
+ if not image or not text:
18
+ return "Please provide both image and description."
19
+
20
  # Process the inputs
21
  inputs = processor(text=[text], images=image, return_tensors="pt", padding=True)
22
 
 
25
  outputs = model(**inputs)
26
 
27
  # Calculate similarity between image and text
28
+ logits_per_image = outputs.logits_per_image # shape: [1, 1]
29
+ probs = logits_per_image.softmax(dim=1) # shape: [1, 1]
30
+ score = probs[0][0].item() # Get scalar score
31
 
32
+ # Return readable percentage
33
+ match_percentage = round(score * 100, 2)
34
+ return f"Match Confidence: {match_percentage}%"
35
 
36
+ # 5. Create the Gradio Interface
37
  iface = gr.Interface(
38
  fn=classify_image_text,
39
+ inputs=[
40
+ gr.Image(type="pil", label="Upload an Image"),
41
+ gr.Textbox(lines=2, placeholder="Describe the image...", label="Your Description")
42
+ ],
43
+ outputs=gr.Label(label="Result"),
44
+ title="CLIP Image-Text Matcher",
45
+ description="Upload an image and enter a description. This app will tell you how well your text matches the image.",
46
+ allow_flagging="never"
47
  )
48
 
49
+ # 6. Launch the App
50
+ iface.launch()