Spaces:

Yatheshr
/

Image_Text_Process_Using_Keras

Sleeping

App Files Files Community

Yatheshr committed on May 24, 2025

Commit

02f82f5

verified ·

1 Parent(s): 85e5aa2

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -13

app.py CHANGED Viewed

@@ -1,16 +1,22 @@
-# 1. Import Libraries
 import gradio as gr
 from transformers import CLIPProcessor, CLIPModel
 from PIL import Image
 import torch
-# 2. Load the Pre-trained Model
 model_name = "openai/clip-vit-base-patch16"
 processor = CLIPProcessor.from_pretrained(model_name)
 model = CLIPModel.from_pretrained(model_name)
-# 3. Define the Prediction Function
 def classify_image_text(image, text):
     # Process the inputs
     inputs = processor(text=[text], images=image, return_tensors="pt", padding=True)
@@ -19,19 +25,26 @@ def classify_image_text(image, text):
         outputs = model(**inputs)
     # Calculate similarity between image and text
-    logits_per_image = outputs.logits_per_image
-    probs = logits_per_image.softmax(dim=1)  # Convert logits to probabilities
-    # Return the prediction
-    return {text: probs.item()}
-# 4. Create the Gradio Interface
 iface = gr.Interface(
     fn=classify_image_text,
-    inputs=[gr.Image(type="pil"), gr.Textbox(label="Enter description")],
-    outputs=gr.Label(),
-    live=True
 )
-# 5. Launch the App
-iface.launch()

+# 1. Install Required Libraries (run this in terminal or notebook once)
+# pip install gradio transformers torch torchvision pillow
+# 2. Import Libraries
 import gradio as gr
 from transformers import CLIPProcessor, CLIPModel
 from PIL import Image
 import torch
+# 3. Load the Pre-trained Model
 model_name = "openai/clip-vit-base-patch16"
 processor = CLIPProcessor.from_pretrained(model_name)
 model = CLIPModel.from_pretrained(model_name)
+# 4. Define the Prediction Function
 def classify_image_text(image, text):
+    if not image or not text:
+        return "Please provide both image and description."
     # Process the inputs
     inputs = processor(text=[text], images=image, return_tensors="pt", padding=True)
         outputs = model(**inputs)
     # Calculate similarity between image and text
+    logits_per_image = outputs.logits_per_image  # shape: [1, 1]
+    probs = logits_per_image.softmax(dim=1)      # shape: [1, 1]
+    score = probs[0][0].item()                   # Get scalar score
+    # Return readable percentage
+    match_percentage = round(score * 100, 2)
+    return f"Match Confidence: {match_percentage}%"
+# 5. Create the Gradio Interface
 iface = gr.Interface(
     fn=classify_image_text,
+    inputs=[
+        gr.Image(type="pil", label="Upload an Image"),
+        gr.Textbox(lines=2, placeholder="Describe the image...", label="Your Description")
+    ],
+    outputs=gr.Label(label="Result"),
+    title="CLIP Image-Text Matcher",
+    description="Upload an image and enter a description. This app will tell you how well your text matches the image.",
+    allow_flagging="never"
 )
+# 6. Launch the App
+iface.launch()