ProfRom committed (verified)
Commit 20351ba · 1 parent: 5fc8594

Agelakis - Unit 8 Assignment

Files changed (2):
  1. app.py  +40 -115
  2. requirements.txt  +4 -4
app.py CHANGED
@@ -1,115 +1,40 @@
- import gradio as gr
- from transformers import pipeline
-
- # ----------------------------------------------------------
- # LOAD ALL THREE MODELS USED IN THE MULTIMODAL AI SYSTEM
- # ----------------------------------------------------------
-
- # 1. BLIP Image Captioning Model
- #    - Takes an image as input and generates a natural language
- #      description of its contents.
- caption_pipeline = pipeline(
-     task="image-to-text",
-     model="Salesforce/blip-image-captioning-base"
- )
-
- # 2. BLIP Visual Question Answering Model
- #    - Takes an image AND a natural language question as input.
- #    - Produces a short text answer based on the image content.
- vqa_pipeline = pipeline(
-     task="visual-question-answering",
-     model="Salesforce/blip-vqa-base"
- )
-
- # 3. CLIP Zero-Shot Image Classification Model
- #    - Compares the image with a list of user-provided labels.
- #    - Returns a probability score for each label without training.
- clip_pipeline = pipeline(
-     task="zero-shot-image-classification",
-     model="openai/clip-vit-base-patch32"
- )
-
-
- # ----------------------------------------------------------
- # PROCESS FUNCTION — RUNS ALL THREE AI TASKS
- # ----------------------------------------------------------
- def process_image(image, question, labels):
-     """
-     Runs captioning, VQA, and zero-shot classification on the input image.
-
-     Parameters:
-         image   : Image uploaded by the user.
-         question: Optional natural-language question about the image.
-         labels  : Optional comma-separated classification labels for CLIP.
-
-     Returns:
-         caption (str)     : Generated caption for the image.
-         vqa_answer (str)  : Answer to the user's question.
-         clip_output (str) : Zero-shot classification probabilities.
-     """
-
-     # -----------------------------
-     # IMAGE CAPTIONING USING BLIP
-     # -----------------------------
-     caption_result = caption_pipeline(image)
-     caption = caption_result[0]["generated_text"]  # extract caption text
-
-
-     # ----------------------------------------------------
-     # VISUAL QUESTION ANSWERING (only if question given)
-     # ----------------------------------------------------
-     if question and question.strip():  # check if the user provided a question
-         vqa_result = vqa_pipeline(image=image, question=question)
-         vqa_answer = vqa_result[0]["answer"]
-     else:
-         vqa_answer = "No question provided."
-
-
-     # ----------------------------------------------------
-     # ZERO-SHOT IMAGE CLASSIFICATION USING CLIP
-     # ----------------------------------------------------
-     if labels and labels.strip():  # ensure labels exist
-         # Convert comma-separated text into clean list of labels
-         candidate_labels = [l.strip() for l in labels.split(",") if l.strip()]
-
-         if candidate_labels:
-             # CLIP requires parameter name 'images=' instead of 'image'
-             clip_result = clip_pipeline(images=image, candidate_labels=candidate_labels)
-
-             # Format classification scores nicely for display
-             clip_output = "\n".join(
-                 f"{item['label']}: {round(item['score'] * 100, 1)}%"
-                 for item in clip_result
-             )
-         else:
-             clip_output = "No valid labels provided."
-     else:
-         clip_output = "No labels provided."
-
-     # Return results of all three AI tasks
-     return caption, vqa_answer, clip_output
-
-
- # ----------------------------------------------------------
- # CREATE THE GRADIO USER INTERFACE
- # ----------------------------------------------------------
- demo = gr.Interface(
-     fn=process_image,  # function that executes model inference
-     inputs=[
-         gr.Image(type="pil", label="Upload an image"),  # image input
-         gr.Textbox(label="Ask a question about the image (optional)"),  # VQA input
-         gr.Textbox(
-             label="Enter CLIP classification labels (comma-separated)",
-             placeholder="e.g., man, boy, park, snow, happiness",
-         ),
-     ],
-     outputs=[
-         gr.Textbox(label="Generated Caption"),  # BLIP caption output
-         gr.Textbox(label="VQA Answer"),  # VQA answer output
-         gr.Textbox(label="CLIP Classification Scores"),  # CLIP zero-shot output
-     ],
-     title="Multimodal AI — Captioning + VQA + Zero-Shot Classification",
- )
-
- # Launch the web application on Hugging Face Spaces or locally
- demo.launch()
 
+ import gradio as gr
+ from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+ from PIL import Image
+ import torch
+
+ # Load model
+ model_name = "nlpconnect/vit-gpt2-image-captioning"
+ model = VisionEncoderDecoderModel.from_pretrained(model_name)
+ processor = ViTImageProcessor.from_pretrained(model_name)
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model.to(device)
+
+ # Caption function
+ def predict_caption(image):
+     if image is None:
+         return "Upload an image."
+     if image.mode != "RGB":
+         image = image.convert("RGB")
+
+     pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
+
+     with torch.no_grad():
+         output_ids = model.generate(pixel_values, max_length=32, num_beams=4)
+
+     caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+     return caption.strip()
+
+ # UI
+ demo = gr.Interface(
+     fn=predict_caption,
+     inputs=gr.Image(type="pil", label="Upload Image"),
+     outputs=gr.Textbox(label="Caption"),
+     title="AI Image Captioning",
+     description="Upload an image to get an AI-generated caption."
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
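The rewritten app.py keeps its inference logic in a plain function, so it can be smoke-tested without starting the Gradio server. A minimal sketch, assuming app.py is importable from the working directory and a local test image named sample.jpg exists (both are assumptions, not part of this commit):

    # smoke_test.py (hypothetical helper, not part of this commit)
    from PIL import Image

    from app import predict_caption  # importing app.py loads the model once

    # "sample.jpg" is a placeholder path; any image Pillow can open works.
    image = Image.open("sample.jpg")
    print(predict_caption(image))  # prints a short generated caption

Because demo.launch() is guarded by `if __name__ == "__main__":`, importing the module loads the model but does not start the web server. The same nlpconnect/vit-gpt2-image-captioning checkpoint is also reachable through the one-line transformers API, pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning"), as the removed version of this file did for BLIP.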
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
- gradio==4.25.0
- transformers==4.36.2
- torch
- Pillow
+ gradio
+ transformers
+ torch
+ Pillow
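The updated requirements.txt drops the version pins, so each Space rebuild resolves to whatever gradio, transformers, torch, and Pillow releases are current at build time. If a working build later needs to be re-pinned, one way is to record the versions it actually resolved to; a minimal diagnostic sketch (hypothetical, not part of this commit):

    # check_versions.py (hypothetical diagnostic, not part of this commit)
    import PIL
    import gradio
    import torch
    import transformers

    # Print resolved versions in requirements.txt pin syntax.
    for name, module in [("gradio", gradio), ("transformers", transformers),
                         ("torch", torch), ("Pillow", PIL)]:
        print(f"{name}=={module.__version__}")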