# Hugging Face Spaces app (status header from the web UI removed — it was not valid Python)
| import gradio as gr | |
| from transformers import pipeline | |
| from PIL import ImageDraw, ImageFont | |
| import textwrap | |
# --- LOAD MODELS ---
# All three pipelines are created once at import time so that per-request
# calls in multimodal_analysis() do not pay the model-load cost repeatedly.
print("Loading Models...")
# BLIP base checkpoint: produces a natural-language caption for an input image.
caption_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
# ViT base image classifier; results expose 'label' and 'score' fields (used below).
classification_pipeline = pipeline("image-classification", model="google/vit-base-patch16-224")
# No checkpoint pinned — transformers resolves its default sentiment model at load time.
sentiment_pipeline = pipeline("sentiment-analysis")
| # --- DRAWING FUNCTION --- | |
def add_caption_to_image(image, text):
    """Draw *text*, word-wrapped and centered, on a black box near the
    bottom edge of *image*.

    The image is modified in place and also returned for convenience.
    """
    draw = ImageDraw.Draw(image)
    width, height = image.size

    # Prefer a real TrueType font; fall back to PIL's built-in bitmap font
    # when DejaVuSans.ttf is not present on the host.
    try:
        font = ImageFont.truetype("DejaVuSans.ttf", 20)
    except IOError:
        font = ImageFont.load_default()

    # Wrap the caption to roughly fit the image width, assuming ~12 px per
    # character and a 40 px horizontal margin (minimum 10 chars per line).
    wrap_width = max(10, int((width - 40) / 12))
    lines = textwrap.wrap(text, width=wrap_width)

    # Vertical layout: fixed 24 px per line, block anchored 20 px above the bottom.
    line_height = 24
    block_height = len(lines) * line_height
    top = height - block_height - 20

    # The widest wrapped line determines the backdrop width.
    line_widths = []
    for ln in lines:
        x0, _, x1, _ = draw.textbbox((0, 0), ln, font=font)
        line_widths.append(x1 - x0)
    widest = max(line_widths, default=0)
    left = (width - widest) / 2

    # Backdrop rectangle with 10 px padding on every side.
    # NOTE(review): an RGBA fill on an RGB image renders fully opaque — the
    # alpha value 180 has no effect here; confirm semi-transparency was intended.
    pad = 10
    draw.rectangle(
        [(left - pad, top - pad), (left + widest + pad, top + block_height + pad)],
        fill=(0, 0, 0, 180),
    )

    # Render each line horizontally centered, stepping down one line at a time.
    y = top
    for ln in lines:
        x0, _, x1, _ = draw.textbbox((0, 0), ln, font=font)
        x = (width - (x1 - x0)) / 2
        draw.text((x, y), ln, font=font, fill="white")
        y += line_height

    return image
| # --- ANALYSIS FUNCTION --- | |
def multimodal_analysis(input_image):
    """Run captioning, classification, and caption-sentiment on one image.

    Parameters
    ----------
    input_image : PIL.Image.Image or None
        Image selected in the UI; ``None`` when nothing was uploaded yet.

    Returns
    -------
    tuple
        ``(annotated_image_or_None, classification_str, sentiment_str)``;
        any stage that fails reports the string ``"Error"`` in its slot.
    """
    if input_image is None:
        return None, "Upload image first", "N/A"

    # Work on a copy so the caption is never drawn onto the user's original.
    processed_image = input_image.copy()

    # 1. Caption — everything downstream depends on it, so bail out on failure.
    #    (Was a bare `except:`; narrowed so Ctrl-C / SystemExit still propagate.)
    try:
        caption = caption_pipeline(input_image)[0]['generated_text']
    except Exception:
        return processed_image, "Error", "Error"

    # 2. Burn the caption into the copy.
    final_img = add_caption_to_image(processed_image, caption)

    # 3. Top-1 image classification, formatted as "label (score)".
    try:
        res = classification_pipeline(input_image)
        cls_str = f"{res[0]['label']} ({res[0]['score']:.2f})"
    except Exception:
        cls_str = "Error"

    # 4. Sentiment of the generated caption text (NLP on the vision output).
    try:
        sent = sentiment_pipeline(caption)[0]['label']
    except Exception:
        sent = "Error"

    return final_img, cls_str, sent
# --- INTERFACE (Removed Theme to fix crash) ---
with gr.Blocks() as demo:
    gr.Markdown("# π€ Multimodal AI Analyst")
    gr.Markdown("Select an example image below to see: **Image Captioning**, **Vision Classification**, and **NLP Sentiment Analysis** working together.")

    # Left column: input + trigger. Right column: annotated image and text results.
    with gr.Row():
        with gr.Column():
            img_in = gr.Image(type="pil", label="Input Image")
            run_btn = gr.Button("π Analyze Image", variant="primary")
        with gr.Column():
            out_img = gr.Image(label="AI Caption Result")
            with gr.Row():
                out_cls = gr.Textbox(label="Object Class")
                out_sent = gr.Textbox(label="Caption Sentiment")

    # EXACT FILES FROM YOUR LIST
    example_files = [
        "Ashe Catcum with Pikachu.png",
        "Beautiful sunrise over ocean.png",
        "Cat on a couch.png",
        "Female Crying.png",
        "Lions Football team huddle.png",
        "michael jordan trophy.png",
        "Puppies playing in grass.png",
        "Red Ferrari.png",
        "Siamese cat.png",
        "Stormy dark sky lightning.png",
    ]
    gr.Examples(examples=[[name] for name in example_files], inputs=img_in)

    # Wire the button to the three-output analysis function.
    run_btn.click(fn=multimodal_analysis, inputs=img_in, outputs=[out_img, out_cls, out_sent])

demo.launch()