nikkoyabut commited on
Commit
f925e95
·
verified ·
1 Parent(s): 891c569

Upload 8 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ notebook/clip_architecture.png filter=lfs diff=lfs merge=lfs -text
37
+ notebook/CLIP.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,72 @@
1
  ---
2
- title: Clip Zero Shot Classifier
3
- emoji: 👁
4
- colorFrom: gray
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 5.26.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: CLIP Zero-Shot Classifier
3
+ emoji: 🖼️
4
+ colorFrom: indigo
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: "4.24.0"
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ # 🖼️ CLIP Zero-Shot Classifier
13
+
14
+ This interactive web app demonstrates a **zero-shot image classification** system using **OpenAI's CLIP model** (`ViT-B/32`) and a custom Gradio interface.
15
+
16
+ ## 🚀 What It Does
17
+
18
+ CLIP can understand images and text in the same embedding space. With this app, you can:
19
+ - Upload an image
20
+ - Enter any number of labels (comma-separated)
21
+ - Get predictions on how likely the image matches each label — **even without training!**
22
+
23
+ ## 💡 How It Works
24
+
25
+ 1. The input image is preprocessed and encoded using CLIP.
26
+ 2. Your custom labels are tokenized and also encoded.
27
+ 3. The cosine similarity between image and text embeddings is computed.
28
+ 4. The results are displayed with a probability score and a visual bar indicator.
29
+
30
+ ## 📦 Technologies Used
31
+
32
+ - [Gradio](https://www.gradio.app/) — for the interactive web interface
33
+ - [OpenAI CLIP](https://github.com/openai/CLIP) — the core model for zero-shot classification
34
+ - PyTorch — model backend
35
+ - Hugging Face Spaces — for easy and free deployment
36
+
37
+ ## 📷 Example Use Cases
38
+
39
+ - Test if an image matches multiple tags
40
+ - Quickly validate custom labels
41
+ - Educational demos for multimodal ML
42
+
43
+ ## 🛠️ How to Use
44
+
45
+ 1. Upload an image.
46
+ 2. Type in labels like: `a cat, a dog, a diagram, a spacecraft`
47
+ 3. Click **Classify**.
48
+ 4. See prediction probabilities and visual bars for each label.
49
+
50
+ ## 📍 Notes
51
+
52
+ - You can enter *any text labels* — even abstract or creative ones!
53
+ - Works best on natural images (e.g., animals, objects, scenes)
54
+
55
+ ## 📓 Notebook
56
+
57
+ You can explore the companion Jupyter notebook here:
58
+ [📘 Open clip_inspect.ipynb](./notebook/clip_inspect.ipynb)
59
+
60
+ ---
61
+
62
+ ## 👤 About Me
63
+
64
+ I'm **Nikko**, a Machine Learning Engineer and AI enthusiast with a Master's degree in Artificial Intelligence from the University of the Philippines Diliman. With over a decade of experience in ICT consulting and telecommunications, I now specialize in **vision-language models**, **LLMs**, and **generative AI applications**.
65
+
66
+ I'm passionate about creating systems where AI and humans can collaborate seamlessly — working toward a future where **smart cities** and intelligent automation become reality.
67
+
68
+ Feel free to connect with me on [LinkedIn](https://www.linkedin.com/in/nikkoyabut/).
69
+
70
+ ---
71
+
72
+ Made with ❤️ using CLIP + Gradio
app.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app.py
#
# Gradio app: zero-shot image classification with OpenAI's CLIP (ViT-B/32).

# 🛠️ Setup
# pip install -q gradio torch ftfy regex tqdm git+https://github.com/openai/CLIP.git matplotlib

# 📦 Imports
import gradio as gr
import torch
import clip  # OpenAI CLIP package (installed from GitHub — see Setup above)
from PIL import Image
import numpy as np
from typing import List, Tuple, Union

# 🚀 Load CLIP Model
# Pick the GPU when available; the same `device` string is reused for every
# tensor created in predict() below.
device: str = "cuda" if torch.cuda.is_available() else "cpu"
# `preprocess` is the image transform that matches this checkpoint; it must be
# applied to every input image before encoding.
model, preprocess = clip.load("ViT-B/32", device=device)
17
+
18
+
19
+ def predict(image: Image.Image, label_text: str) -> List[List[Union[str, float]]]:
20
+ """
21
+ Perform zero-shot classification using the CLIP model.
22
+
23
+ Args:
24
+ image (PIL.Image.Image): Input image.
25
+ label_text (str): Comma-separated labels to classify against.
26
+
27
+ Returns:
28
+ List[List[Union[str, float]]]: A list of results with label, probability, and confidence bar HTML.
29
+ """
30
+ labels: List[str] = [label.strip() for label in label_text.split(",") if label.strip()]
31
+ if not image or not labels:
32
+ return []
33
+
34
+ # Preprocess inputs
35
+ image_input: torch.Tensor = preprocess(image).unsqueeze(0).to(device)
36
+ text_inputs: torch.Tensor = clip.tokenize(labels).to(device)
37
+
38
+ # Run model
39
+ with torch.no_grad():
40
+ image_features: torch.Tensor = model.encode_image(image_input)
41
+ text_features: torch.Tensor = model.encode_text(text_inputs)
42
+ logits_per_image, _ = model(image_input, text_inputs)
43
+ probs: np.ndarray = logits_per_image.softmax(dim=-1).cpu().numpy()[0]
44
+
45
+ # Create table with bar visualization
46
+ results: List[List[Union[str, float]]] = []
47
+ for label, prob in zip(labels, probs):
48
+ bar_html: str = (
49
+ f'<div style="background-color:#4caf50;width:{prob * 100:.1f}%;height:20px;"></div>'
50
+ )
51
+ results.append([label, f"{prob * 100:.2f}%", bar_html])
52
+
53
+ return results
54
+
55
+
56
# 🎨 Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("## CLIP Zero-Shot Classifier")

    with gr.Row():
        # Inputs: an image plus free-form, comma-separated text labels.
        image = gr.Image(type="pil", label="Upload Image")
        label_text = gr.Textbox(
            lines=2,
            label="Enter comma-separated labels",
            placeholder="e.g., a cat, a dog, a diagram"
        )

    # Image Examples — clicking one fills the image input above.
    with gr.Row():
        gr.Examples(
            examples=[
                ["images/boy.jpg"],
                ["images/dog.jpg"],
                ["images/boy_dog.jpg"]
            ],
            inputs=[image],
            label="🖼️ Click to select example image"
        )

    # Label Text Examples — clicking one fills the textbox above.
    gr.Examples(
        examples=[
            ["boy, girl, dog, cat"],
            ["a boy with a dog, a boy with a cat, a girl with a dog, a girl with a cat"],
            ["a cat, a dog, a diagram"]
        ],
        inputs=[label_text],
        label="📝 Click to autofill example labels"
    )

    submit = gr.Button("Classify")

    # Output table: one row per label; the third column is rendered as raw
    # HTML (the confidence bar produced by predict()).
    output = gr.Dataframe(
        headers=["Label", "Probability", "Confidence Bar"],
        datatype=["str", "str", "html"],
        row_count=5,
        interactive=False
    )

    submit.click(fn=predict, inputs=[image, label_text], outputs=output)

if __name__ == "__main__":
    # NOTE(review): share=True requests a public Gradio tunnel when run
    # locally; Hugging Face Spaces ignores the flag — confirm the public
    # link is actually wanted for local runs.
    demo.launch(share=True)
images/boy.jpg ADDED
images/boy_dog.jpg ADDED
images/dog.jpg ADDED
notebook/CLIP.png ADDED

Git LFS Details

  • SHA256: 308a3ca4503f1c7a07803916c369d78c4ef501e5ab7fc727da9b5e1d2f9ec85b
  • Pointer size: 131 Bytes
  • Size of remote file: 252 kB
notebook/clip_architecture.png ADDED

Git LFS Details

  • SHA256: 0cfdaa3c4d98a4ba3d7afb811560f43414ba93abaec2a69748d16c823034a6d6
  • Pointer size: 132 Bytes
  • Size of remote file: 2.65 MB
notebook/clip_inspect.ipynb ADDED
The diff for this file is too large to render. See raw diff