# TestSpace3 / app.py
# ProfRom's picture
# Smallwood - Sanity Check 3
# adbc5fd verified
# raw
# history blame
# 3.99 kB
import gradio as gr
from transformers import pipeline
from PIL import ImageDraw, ImageFont
import textwrap
# --- LOAD MODELS ---
# Pipelines are created once at import time so every Gradio request
# reuses the same loaded weights instead of reloading per call.
print("Loading Models...")
# BLIP base: produces a natural-language caption for an input image.
caption_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# ViT base (patch16, 224px): single-label image classification.
classification_pipeline = pipeline("image-classification", model="google/vit-base-patch16-224")
# No model pinned -- uses the transformers default sentiment checkpoint.
# NOTE(review): unpinned model may drift between transformers releases; pin if reproducibility matters.
sentiment_pipeline = pipeline("sentiment-analysis")
# --- DRAWING FUNCTION ---
def add_caption_to_image(image, text):
    """Draw *text* in a centered, semi-transparent black box near the
    bottom edge of *image* and return the annotated image.

    Parameters
    ----------
    image : PIL.Image.Image
        Image to annotate; it is modified in place (callers pass a copy).
    text : str
        Caption text; wrapped to fit the image width. If it wraps to
        nothing (empty/whitespace), the image is returned untouched.

    Returns
    -------
    PIL.Image.Image
        The same image object, with the caption drawn on it.
    """
    # "RGBA" draw mode makes the 4-tuple fill below actually alpha-blend;
    # a plain Draw on an RGB image would paint the box fully opaque,
    # defeating the intended 180/255 transparency.
    draw = ImageDraw.Draw(image, "RGBA")
    image_width, image_height = image.size

    # 1. Font: prefer a scalable TrueType face, fall back to the bitmap default.
    try:
        font = ImageFont.truetype("DejaVuSans.ttf", 20)
    except IOError:
        font = ImageFont.load_default()

    # 2. Wrap text to an estimated per-line character budget for this width.
    avg_char_width = 12  # rough px per glyph at size 20 (heuristic)
    chars_per_line = max(10, int((image_width - 40) / avg_char_width))
    lines = textwrap.wrap(text, width=chars_per_line)
    if not lines:
        # Nothing to render -- avoid drawing a degenerate empty box.
        return image

    # 3. Box geometry: total height and the widest rendered line.
    line_height = 24
    total_text_height = len(lines) * line_height
    # Clamp so a very tall caption block never starts above the canvas.
    y_start = max(0, image_height - total_text_height - 20)
    max_line_width = 0
    for line in lines:
        bbox = draw.textbbox((0, 0), line, font=font)
        max_line_width = max(max_line_width, bbox[2] - bbox[0])
    box_x = (image_width - max_line_width) / 2

    # 4. Semi-transparent backdrop behind the whole text block.
    padding = 10
    draw.rectangle(
        [
            (box_x - padding, y_start - padding),
            (box_x + max_line_width + padding, y_start + total_text_height + padding),
        ],
        fill=(0, 0, 0, 180),
    )

    # 5. Each line centered horizontally, stacked downward.
    current_y = y_start
    for line in lines:
        bbox = draw.textbbox((0, 0), line, font=font)
        line_width = bbox[2] - bbox[0]
        line_x = (image_width - line_width) / 2
        draw.text((line_x, current_y), line, font=font, fill="white")
        current_y += line_height
    return image
# --- ANALYSIS FUNCTION ---
def multimodal_analysis(input_image):
    """Run the full caption -> annotate -> classify -> sentiment chain.

    Parameters
    ----------
    input_image : PIL.Image.Image or None
        Image from the Gradio input component.

    Returns
    -------
    tuple
        (annotated_image, classification_str, sentiment_str). When the
        input is None, returns (None, prompt text, "N/A"). Individual
        stage failures surface as the string "Error" in that slot.
    """
    if input_image is None:
        return None, "Upload image first", "N/A"
    # Draw on a copy so the user's upload (shown in the input widget)
    # stays unmodified.
    processed_image = input_image.copy()

    # 1. Caption -- everything downstream depends on it, so bail out on failure.
    try:
        caption = caption_pipeline(input_image)[0]['generated_text']
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        return processed_image, "Error", "Error"

    # 2. Burn the caption into the copy.
    final_img = add_caption_to_image(processed_image, caption)

    # 3. Object classification (best-effort; UI shows "Error" on failure).
    try:
        res = classification_pipeline(input_image)
        cls_str = f"{res[0]['label']} ({res[0]['score']:.2f})"
    except Exception:
        cls_str = "Error"

    # 4. Sentiment of the generated caption (best-effort).
    try:
        sent = sentiment_pipeline(caption)[0]['label']
    except Exception:
        sent = "Error"

    return final_img, cls_str, sent
# --- INTERFACE (Removed Theme to fix crash) ---
# Declaration order inside the Blocks context defines the page layout.
with gr.Blocks() as demo:
    gr.Markdown("# πŸ€– Multimodal AI Analyst")
    gr.Markdown("Select an example image below to see: **Image Captioning**, **Vision Classification**, and **NLP Sentiment Analysis** working together.")
    with gr.Row():
        # Left column: image upload plus the trigger button.
        with gr.Column():
            image_input = gr.Image(type="pil", label="Input Image")
            submit_btn = gr.Button("πŸ” Analyze Image", variant="primary")
        # Right column: annotated result image over the two text outputs.
        with gr.Column():
            output_image = gr.Image(label="AI Caption Result")
            with gr.Row():
                output_class = gr.Textbox(label="Object Class")
                output_sent = gr.Textbox(label="Caption Sentiment")
    # EXACT FILES FROM YOUR LIST
    # NOTE(review): paths are relative to the Space root -- assumes these PNGs
    # sit next to app.py in the repo; confirm they exist or Examples will break.
    examples = [
        ["Ashe Catcum with Pikachu.png"],
        ["Beautiful sunrise over ocean.png"],
        ["Cat on a couch.png"],
        ["Female Crying.png"],
        ["Lions Football team huddle.png"],
        ["michael jordan trophy.png"],
        ["Puppies playing in grass.png"],
        ["Red Ferrari.png"],
        ["Siamese cat.png"],
        ["Stormy dark sky lightning.png"]
    ]
    gr.Examples(examples=examples, inputs=image_input)
    # Wire the button: outputs map positionally to multimodal_analysis's
    # (image, classification, sentiment) return tuple.
    submit_btn.click(fn=multimodal_analysis, inputs=image_input, outputs=[output_image, output_class, output_sent])
# Start the Gradio server (blocking call).
demo.launch()