import gradio as gr
import torch
from PIL import Image
from transformers import (
    BlipProcessor, BlipForConditionalGeneration,
    BlipForQuestionAnswering,
    CLIPProcessor, CLIPModel
)

# ==================== Model Loading ====================
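# Note: the first run downloads each checkpoint (several hundred MB apiece)
# from the Hugging Face Hub and caches it locally.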
print("πŸ”„ Loading models...")

# BLIP Image Captioning Model
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# BLIP Visual Question Answering Model
vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# CLIP Image Classification Model
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

print("βœ… Models loaded successfully!")

# ==================== Function Definitions ====================

def generate_caption(image):
    """Generate image caption"""
    if image is None:
        return "❌ Please upload an image first"
    
    try:
        # Process image
        inputs = caption_processor(image, return_tensors="pt")
        
        # Generate caption
        out = caption_model.generate(**inputs, max_length=50)
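        # (Optional variation, not used here: adding num_beams=5 switches
        # generate() to beam search, which tends to produce slightly more
        # fluent captions at a latency cost.)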
        caption = caption_processor.decode(out[0], skip_special_tokens=True)
        
        return f"πŸ“ Image Caption:\n{caption}"
    
    except Exception as e:
        return f"❌ Processing failed: {str(e)}"


def answer_question(image, question):
    """Visual Question Answering"""
    if image is None:
        return "❌ Please upload an image first"
    if not question.strip():
        return "❌ Please enter a question"
    
    try:
        # Process inputs
        inputs = vqa_processor(image, question, return_tensors="pt")
        
        # Generate answer
        out = vqa_model.generate(**inputs, max_length=20)
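        # (BLIP VQA answers are typically just a word or two, so
        # max_length=20 leaves ample headroom.)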
        answer = vqa_processor.decode(out[0], skip_special_tokens=True)
        
        return f"❓ Question: {question}\n\nβœ… Answer: {answer}"
    
    except Exception as e:
        return f"❌ Processing failed: {str(e)}"


def classify_image(image, categories):
    """Zero-shot Image Classification"""
    if image is None:
        return "❌ Please upload an image first"
    if not categories.strip():
        return "❌ Please enter categories"
    
    try:
        # Parse categories, dropping empty entries (e.g., from a trailing comma)
        category_list = [cat.strip() for cat in categories.split(",") if cat.strip()]
        
        # Process image and text
        inputs = clip_processor(
            text=category_list,
            images=image,
            return_tensors="pt",
            padding=True
        )
        
        # Compute image-text similarity; no_grad() skips autograd bookkeeping
        # since this is inference only
        with torch.no_grad():
            outputs = clip_model(**inputs)
        # logits_per_image has shape (1, num_categories); softmax converts the
        # similarity scores into a probability distribution over the categories
        probs = outputs.logits_per_image.softmax(dim=1)[0]
        
        # Format results
        results = "🎯 Classification Results:\n\n"
        for category, prob in zip(category_list, probs):
            percentage = prob.item() * 100
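            # Simple text bar: one "β–ˆ" block per 5 percentage points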
            bar = "β–ˆ" * int(percentage / 5)
            results += f"{category}: {percentage:.2f}% {bar}\n"
        
        return results
    
    except Exception as e:
        return f"❌ Processing failed: {str(e)}"


def multimodal_chat(image, message, history):
    """Multimodal chat (simplified): answers each message with the VQA model"""
    if image is None:
        return history + [[message, "❌ Please upload an image first to start chatting"]], message
    if not message.strip():
        return history, message
    
    try:
        # Route the question through the VQA model
        inputs = vqa_processor(image, message, return_tensors="pt")
        out = vqa_model.generate(**inputs, max_length=30)
        response = vqa_processor.decode(out[0], skip_special_tokens=True)
        
        # Append the exchange and clear the input box
        history.append([message, response])
        return history, ""
    
    except Exception as e:
        history.append([message, f"❌ Processing failed: {str(e)}"])
        return history, ""


# ==================== Gradio Interface ====================

# Custom CSS
custom_css = """
#title {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 3em;
    font-weight: bold;
    margin-bottom: 10px;
}
#subtitle {
    text-align: center;
    color: #666;
    font-size: 1.2em;
    margin-bottom: 30px;
}
.feature-box {
    border: 2px solid #667eea;
    border-radius: 10px;
    padding: 20px;
    margin: 10px 0;
}
"""

with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    
    # Title
    gr.HTML('<h1 id="title">πŸ€– Vision Language AI Demo</h1>')
    gr.HTML('<p id="subtitle">Interactive application showcasing multiple vision-language AI capabilities</p>')
    
    # Tabbed Interface
    with gr.Tabs():
        
        # Tab 1: Image Captioning
        with gr.Tab("πŸ–ΌοΈ Image Captioning"):
            gr.Markdown("### Upload an image and AI will generate a description")
            with gr.Row():
                with gr.Column():
                    caption_image = gr.Image(type="pil", label="Upload Image")
                    caption_btn = gr.Button("🎨 Generate Caption", variant="primary")
                with gr.Column():
                    caption_output = gr.Textbox(
                        label="Generated Caption",
                        lines=5,
                        placeholder="Caption will appear here..."
                    )
            
            # Examples
            gr.Examples(
                examples=[
                    ["https://images.unsplash.com/photo-1514888286974-6c03e2ca1dba"],
                    ["https://images.unsplash.com/photo-1506748686214-e9df14d4d9d0"],
                ],
                inputs=caption_image,
                label="πŸ“Έ Click on examples to try"
            )
            
            caption_btn.click(
                fn=generate_caption,
                inputs=caption_image,
                outputs=caption_output
            )
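            # Auto-caption whenever a new image is uploaded (the button is a
            # manual fallback)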
            caption_image.change(
                fn=generate_caption,
                inputs=caption_image,
                outputs=caption_output
            )
        
        # Tab 2: Visual Question Answering
        with gr.Tab("πŸ” Visual Question Answering"):
            gr.Markdown("### Upload an image and ask questions, AI will answer based on the image content")
            with gr.Row():
                with gr.Column():
                    vqa_image = gr.Image(type="pil", label="Upload Image")
                    vqa_question = gr.Textbox(
                        label="Enter Question",
                        placeholder="e.g., What color is the car? How many people are there?",
                        lines=2
                    )
                    vqa_btn = gr.Button("πŸ€” Get Answer", variant="primary")
                with gr.Column():
                    vqa_output = gr.Textbox(
                        label="AI Answer",
                        lines=6,
                        placeholder="Answer will appear here..."
                    )
            
            # Common question examples
            gr.Markdown("**πŸ’‘ Common Question Examples:**")
            gr.Markdown("- What is in the image?\n- What color is...?\n- How many ... are there?\n- Is there a ... in the image?")
            
            vqa_btn.click(
                fn=answer_question,
                inputs=[vqa_image, vqa_question],
                outputs=vqa_output
            )
        
        # Tab 3: Image Classification
        with gr.Tab("🏷️ Zero-Shot Classification"):
            gr.Markdown("### Define custom categories and AI will classify the image")
            with gr.Row():
                with gr.Column():
                    classify_image_input = gr.Image(type="pil", label="Upload Image")
                    classify_categories = gr.Textbox(
                        label="Categories (comma-separated)",
                        placeholder="e.g., cat, dog, bird, car, building",
                        value="cat, dog, bird, car, building",
                        lines=2
                    )
                    classify_btn = gr.Button("🎯 Classify", variant="primary")
                with gr.Column():
                    classify_output = gr.Textbox(
                        label="Classification Results",
                        lines=8,
                        placeholder="Results will appear here..."
                    )
            
            gr.Markdown("**πŸ’‘ Tip:** You can input any categories, the model will calculate similarity between the image and each category")
            
            classify_btn.click(
                fn=classify_image,
                inputs=[classify_image_input, classify_categories],
                outputs=classify_output
            )
        
        # Tab 4: Multimodal Chat
        with gr.Tab("πŸ’¬ Multimodal Chat"):
            gr.Markdown("### Upload an image and have a conversation with AI about it")
            with gr.Row():
                with gr.Column(scale=1):
                    chat_image = gr.Image(type="pil", label="Upload Image")
                    gr.Markdown("**πŸ’‘ Conversation Prompts:**")
                    gr.Markdown("- Describe this image\n- What's in the image?\n- Where is this?\n- What is the main color?")
                
                with gr.Column(scale=2):
                    chatbot = gr.Chatbot(label="Chat History", height=400)
                    chat_input = gr.Textbox(
                        label="Enter Message",
                        placeholder="Type your question...",
                        lines=2
                    )
                    with gr.Row():
                        chat_btn = gr.Button("πŸ“€ Send", variant="primary")
                        clear_btn = gr.Button("πŸ—‘οΈ Clear Chat")
            
            chat_btn.click(
                fn=multimodal_chat,
                inputs=[chat_image, chat_input, chatbot],
                outputs=[chatbot, chat_input]
            )
            chat_input.submit(
                fn=multimodal_chat,
                inputs=[chat_image, chat_input, chatbot],
                outputs=[chatbot, chat_input]
            )
            clear_btn.click(lambda: ([], ""), outputs=[chatbot, chat_input])
    
    # Footer
    gr.Markdown("---")
    gr.Markdown("""
    ### πŸ“š About This Application
    - **Models**: BLIP (Captioning & VQA) + CLIP (Classification)
    - **Framework**: Gradio + Transformers
    - **Deployment**: Can be deployed to Hugging Face Spaces
    - **Open Source**: All models are open source
    
    ⚑ **Performance Tip**: Run on Hugging Face Spaces ZeroGPU hardware for significantly faster processing
    """)

# Launch application
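# share=True requests a temporary public *.gradio.live link for local runs; on
# Hugging Face Spaces the flag is typically ignored in favor of the Space URL.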
if __name__ == "__main__":
    demo.launch(share=True)