ProfRom committed on
Commit 4cde0da · verified · 1 Parent(s): 15db435

Adapa - Sanity Check 3

Files changed (1)
  1. app.py +62 -14
app.py CHANGED
@@ -2,23 +2,71 @@
 import gradio as gr
 from transformers import pipeline
 
-# Load image captioning pipeline
-captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
+# BLIP captioning
+caption_pipeline = pipeline(
+    task="image-to-text",
+    model="Salesforce/blip-image-captioning-base"
+)
 
-def generate_caption(image):
-    if image is None:
-        return "Please upload an image."
-    result = captioner(image)
-    return result[0]['generated_text']
+# BLIP VQA
+vqa_pipeline = pipeline(
+    task="visual-question-answering",
+    model="Salesforce/blip-vqa-base"
+)
+
+# CLIP zero-shot classification
+clip_pipeline = pipeline(
+    task="zero-shot-image-classification",
+    model="openai/clip-vit-base-patch32"
+)
+
+def process_image(image, question, labels):
+    # Caption
+    caption_result = caption_pipeline(image)
+    caption = caption_result[0]["generated_text"]
+
+    # VQA
+    if question and question.strip():
+        vqa_result = vqa_pipeline(image=image, question=question)
+        vqa_answer = vqa_result[0]["answer"]
+    else:
+        vqa_answer = "No question provided."
+
+    # CLIP Classification
+    if labels and labels.strip():
+        candidate_labels = [l.strip() for l in labels.split(",") if l.strip()]
+        if candidate_labels:
+            # NOTE: use 'images=' or positional arg
+            clip_result = clip_pipeline(images=image, candidate_labels=candidate_labels)
+            clip_output = "\n".join(
+                f"{item['label']}: {round(item['score'] * 100, 1)}%"
+                for item in clip_result
+            )
+        else:
+            clip_output = "No valid labels provided."
+    else:
+        clip_output = "No labels provided."
+
+    return caption, vqa_answer, clip_output
 
+
 demo = gr.Interface(
-    fn=generate_caption,
-    inputs=gr.Image(type="pil", label="Upload an image"),
-    outputs=gr.Textbox(label="Generated Caption"),
-    title="Image Captioning Demo",
-    description="Multimodal model: Vision → Language"
+    fn=process_image,
+    inputs=[
+        gr.Image(type="pil", label="Upload an image"),
+        gr.Textbox(label="Ask a question about the image (optional)"),
+        gr.Textbox(
+            label="Enter CLIP classification labels (comma-separated)",
+            placeholder="e.g., man, boy, park, snow, happiness",
+        ),
+    ],
+    outputs=[
+        gr.Textbox(label="Generated Caption"),
+        gr.Textbox(label="VQA Answer"),
+        gr.Textbox(label="CLIP Classification Scores"),
+    ],
+    title="Multimodal AI — Captioning + VQA + Zero-Shot Classification",
 )
 
-if __name__ == "__main__":
-    demo.launch()
+demo.launch()
 
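
For a quick sanity check outside Gradio, the same three pipelines can be exercised directly. This is a minimal sketch, not part of the commit: the path "test.jpg", the question, and the label list are hypothetical examples; the pipeline calls mirror those in app.py.

from PIL import Image
from transformers import pipeline

captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
vqa = pipeline("visual-question-answering", model="Salesforce/blip-vqa-base")
clip = pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch32")

img = Image.open("test.jpg").convert("RGB")  # hypothetical local test image

# Captioning returns a list of dicts with "generated_text"
print(captioner(img)[0]["generated_text"])

# VQA returns a list of dicts with "answer"
print(vqa(image=img, question="What is happening in the image?")[0]["answer"])

# Zero-shot classification returns a list of dicts with "label" and "score"
for item in clip(images=img, candidate_labels=["man", "boy", "park", "snow", "happiness"]):
    print(f"{item['label']}: {item['score']:.3f}")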