Spaces:

akhaliq
/

Isaac-0.1

Running on Zero

File size: 11,137 Bytes

import spaces
import gradio as gr
import torch
from PIL import Image
from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor
import os
import tempfile

# Import required modules from perceptron
from perceptron.tensorstream.ops import tensor_stream_token_view, modality_mask
from perceptron.pointing.parser import extract_points

# Define vision type enum
class VisionType:
    """Plain namespace holding the modality id used for image content."""

    # Compared against the result of modality_mask() in decode_tensor_stream()
    # to tell image positions apart from text positions.
    # NOTE(review): presumably mirrors the id perceptron uses internally — confirm.
    image = 1

def document_to_messages(document, vision_token="<image>"):
    """Turn a document (list of item dicts) into chat messages plus loaded images.

    Each item is a dict read through ``.get`` with the keys ``type``
    (``"text"`` or ``"image"``), ``content`` and an optional ``role``
    (defaults to ``"user"``).

    * Text items with non-empty content become a message carrying that content.
    * Image items whose content is an existing file path or an object with a
      ``read`` attribute are opened with PIL; the message content is
      ``vision_token`` and the opened image is collected separately.
    * Items with any other type, empty content, or image content that is
      neither of the above are skipped.

    Returns:
        A ``(messages, images)`` tuple; both lists keep document order.
    """
    chat_messages = []
    loaded_images = []

    for entry in document:
        kind = entry.get("type")
        if kind != "text" and kind != "image":
            continue

        payload = entry.get("content")
        if not payload:
            continue

        if kind == "text":
            chat_messages.append({
                "role": entry.get("role", "user"),
                "content": payload,
            })
            continue

        # Image item: accept an on-disk path or a file-like object only.
        is_existing_path = isinstance(payload, str) and os.path.exists(payload)
        if not (is_existing_path or hasattr(payload, 'read')):
            continue

        loaded_images.append(Image.open(payload))
        chat_messages.append({
            "role": entry.get("role", "user"),
            "content": vision_token,
        })

    return chat_messages, loaded_images

def decode_tensor_stream(tensor_stream, tokenizer):
    """Return the decoded text of a TensorStream, leaving out image positions.

    The token view and the modality mask are both derived from
    ``tensor_stream``; positions whose modality equals ``VisionType.image``
    are dropped before the remaining tokens are handed to
    ``tokenizer.decode``.
    """
    tokens = tensor_stream_token_view(tensor_stream)
    modalities = modality_mask(tensor_stream)

    # Keep only the positions that are not flagged as image content.
    non_image_tokens = tokens[(modalities != VisionType.image)]

    # If the selection still has more than one dimension, decode its first row.
    if len(non_image_tokens.shape) > 1:
        non_image_tokens = non_image_tokens[0]

    return tokenizer.decode(non_image_tokens)

def _to_jpeg_compatible(image):
    """Return ``image`` unchanged if JPEG can store its mode, else an RGB conversion."""
    # Pillow's JPEG writer rejects modes with alpha or a palette (RGBA, LA, P, ...)
    # with an OSError, so those are flattened to RGB first.
    if image.mode in ("L", "RGB", "CMYK"):
        return image
    return image.convert("RGB")


def visualize_predictions(generated_text, image, output_path):
    """Extract bounding boxes from generated text and render them on the input image.

    Args:
        generated_text: Model output that is passed to ``extract_points`` with
            ``expected="box"``.
        image: PIL image the boxes refer to. It is never modified in place.
        output_path: Destination file. When boxes are found the result is
            always written as JPEG; otherwise the format follows the file suffix.

    Returns:
        ``output_path``, after the (possibly annotated) image has been saved there.
    """
    from PIL import ImageDraw, ImageFont

    # Extract bounding boxes from the generated text
    boxes = extract_points(generated_text, expected="box")

    if not boxes:
        # Nothing to draw: save the input as-is. The format is inferred from
        # the suffix, so only convert when the target really is a JPEG file.
        if str(output_path).lower().endswith((".jpg", ".jpeg", ".jpe", ".jfif")):
            image = _to_jpeg_compatible(image)
        image.save(output_path)
        return output_path

    # Get image dimensions
    img_width, img_height = image.size

    # Draw on a copy (in a JPEG-storable mode) so the caller's image is untouched
    # and the final JPEG save cannot fail on alpha/palette images.
    img_with_boxes = _to_jpeg_compatible(image.copy())
    draw = ImageDraw.Draw(img_with_boxes)

    # Try to use a basic font, fall back to default if not available
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
    except (OSError, ImportError):
        # OSError: font file missing/unreadable; ImportError: Pillow built
        # without FreeType support.
        font = ImageFont.load_default()

    # Define colors for different boxes
    colors = ["red", "green", "blue", "yellow", "magenta", "cyan", "orange", "purple"]

    for idx, box in enumerate(boxes):
        color = colors[idx % len(colors)]

        # Extract normalized coordinates (0-1000 range)
        norm_x1, norm_y1 = box.top_left.x, box.top_left.y
        norm_x2, norm_y2 = box.bottom_right.x, box.bottom_right.y

        # Scale coordinates from 0-1000 range to actual image dimensions
        x1 = int((norm_x1 / 1000.0) * img_width)
        y1 = int((norm_y1 / 1000.0) * img_height)
        x2 = int((norm_x2 / 1000.0) * img_width)
        y2 = int((norm_y2 / 1000.0) * img_height)

        # Ensure coordinates are within image bounds
        x1 = max(0, min(x1, img_width - 1))
        y1 = max(0, min(y1, img_height - 1))
        x2 = max(0, min(x2, img_width - 1))
        y2 = max(0, min(y2, img_height - 1))

        # The model may emit corners in the wrong order; recent Pillow raises
        # ValueError when x1 < x0 or y1 < y0, so normalise the ordering.
        x1, x2 = sorted((x1, x2))
        y1, y2 = sorted((y1, y2))

        # Draw the bounding box
        draw.rectangle([x1, y1, x2, y2], outline=color, width=3)

        # Add label if mention exists
        if box.mention:
            # Calculate text position (above the box if possible)
            text_y = max(y1 - 20, 5)

            # Draw text background for better visibility
            text_bbox = draw.textbbox((x1, text_y), box.mention, font=font)
            draw.rectangle(text_bbox, fill=color)
            draw.text((x1, text_y), box.mention, fill="white", font=font)

    # Save the image with bounding boxes
    img_with_boxes.save(output_path, "JPEG")
    return output_path

# Load model and processor once at startup
@spaces.GPU(duration=1500)
def load_model():
    """Load the Isaac-0.1 config, processor and model and place the model on a device.

    All three artefacts are fetched with ``from_pretrained`` using
    ``trust_remote_code=True``. The model is moved to CUDA in bfloat16 when a
    GPU is available, otherwise it stays on the CPU in float32, and is put
    into eval mode.

    NOTE(review): no compilation step happens here; the model is only loaded
    and moved.

    Returns:
        A ``(model, processor, config, device)`` tuple.
    """
    hf_path = "PerceptronAI/Isaac-0.1"

    print("Loading processor and config...")
    config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(hf_path, trust_remote_code=True)

    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)

    # Pick the target device and the matching precision in one place.
    if torch.cuda.is_available():
        device = torch.device("cuda")
        dtype = torch.bfloat16
    else:
        device = torch.device("cpu")
        dtype = torch.float32

    model = model.to(device=device, dtype=dtype)
    model.eval()

    print(f"Model loaded on {device} with dtype {dtype}")
    return model, processor, config, device

# Load everything once at import time; generate_response() below reads these
# module-level names (model, processor, config, device) on every request.
model, processor, config, device = load_model()

@spaces.GPU(duration=120)
def generate_response(image_file, text_prompt, max_tokens=256):
    """Generate a response for an image + prompt and render any predicted boxes.

    Args:
        image_file: Image content forwarded to ``document_to_messages`` (the UI
            supplies a file path).
        text_prompt: User instruction appended after the image.
        max_tokens: Upper bound on newly generated tokens. Cast to ``int``
            because UI sliders may deliver a float.

    Returns:
        A ``(new_text, generated_text, viz_path)`` tuple:
        the text decoded from the newly generated tokens only, the text decoded
        from the full generated sequence (special tokens kept), and the path of
        the rendered visualization or ``None``. On failure the tuple is
        ``("Error: <message>", "", None)``; this function never raises.
    """
    # Local import keeps this block self-contained without touching file imports.
    import traceback

    try:
        # Create document from inputs
        document = [
            {
                "type": "text",
                "content": "<hint>BOX</hint>",
                "role": "user",
            },
            {
                "type": "image",
                "content": image_file,
                "role": "user",
            },
            {
                "type": "text",
                "content": text_prompt,
                "role": "user",
            },
        ]

        # Convert document to messages format
        messages, images = document_to_messages(document, vision_token=config.vision_token)

        # Apply chat template
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # Process with IsaacProcessor
        inputs = processor(text=text, images=images, return_tensors="pt")
        tensor_stream = inputs["tensor_stream"].to(device)
        input_ids = inputs["input_ids"].to(device)
        tokenizer = processor.tokenizer

        # Generate text using the model
        with torch.no_grad():
            generated_ids = model.generate(
                tensor_stream=tensor_stream,
                max_new_tokens=int(max_tokens),
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode the generated text (special tokens kept: the box markup is
        # parsed from this string by visualize_predictions)
        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=False)

        # Extract new tokens only
        prompt_length = input_ids.shape[1]
        if generated_ids.shape[1] > prompt_length:
            new_tokens = generated_ids[0, prompt_length:]
            new_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
        else:
            new_text = "No new tokens generated"

        # Create visualization. A rendering failure must not throw away the
        # text that was already generated, so it degrades to "no image".
        viz_path = None
        if images:
            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
                tmp_path = tmp_file.name
            try:
                viz_path = visualize_predictions(generated_text, images[0], tmp_path)
            except Exception:
                traceback.print_exc()
                # Do not leave the orphaned temp file behind.
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
                viz_path = None

        return new_text, generated_text, viz_path

    except Exception as e:
        # UI boundary: report the problem to the user, but keep the full
        # traceback in the server log instead of silently discarding it.
        traceback.print_exc()
        return f"Error: {str(e)}", "", None

# Create Gradio interface
# Layout: one row with an input column (image, prompt, token slider, button)
# and an output column (response text, hidden full text, annotated image),
# followed by an accordion and a set of examples.
with gr.Blocks(title="HuggingFace Perceptron Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🚀 HuggingFace Perceptron Multimodal AI Demo
    
    This demo showcases the PerceptronAI/Isaac-0.1 model for multimodal understanding and generation.
    Upload an image and provide a text prompt to see the model's response with bounding box visualizations.
    
    **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
    """)
    
    with gr.Row():
        # Left column: user inputs
        with gr.Column():
            # type="filepath" means generate_response receives a path string
            image_input = gr.Image(
                label="Upload Image",
                type="filepath",
                sources=["upload"],
                height=300
            )
            text_input = gr.Textbox(
                label="Text Prompt",
                placeholder="Describe what you want to analyze in the image...",
                lines=3
            )
            # Forwarded to generate_response as max_tokens
            max_tokens_slider = gr.Slider(
                label="Max Tokens",
                minimum=50,
                maximum=512,
                value=256,
                step=50
            )
            generate_btn = gr.Button("Generate Response", variant="primary")
        
        # Right column: model outputs
        with gr.Column():
            new_text_output = gr.Textbox(
                label="Generated Response",
                lines=4,
                interactive=False
            )
            # Hidden by default; toggled by show_full_checkbox below
            full_output = gr.Textbox(
                label="Full Generated Text",
                lines=6,
                interactive=False,
                visible=False
            )
            visualization_output = gr.Image(
                label="Visualization with Bounding Boxes",
                height=300,
                interactive=False
            )
    
    with gr.Accordion("Advanced Options", open=False):
        gr.Markdown("""
        - The model processes both text and images using TensorStream technology
        - Bounding boxes are automatically extracted from the generated text
        - Supports complex multimodal reasoning tasks
        """)
        show_full_checkbox = gr.Checkbox(label="Show Full Generated Text", value=False)
    
    # Event handlers
    # Show or hide the full-text box to match the checkbox state
    show_full_checkbox.change(
        lambda x: gr.Textbox(visible=x),
        inputs=show_full_checkbox,
        outputs=full_output
    )
    
    # The three outputs line up with generate_response's returned 3-tuple
    generate_btn.click(
        fn=generate_response,
        inputs=[image_input, text_input, max_tokens_slider],
        outputs=[new_text_output, full_output, visualization_output]
    )
    
    # Examples
    # Each example row is [image URL, prompt, max tokens], matching `inputs`.
    # NOTE(review): cache_examples=True presumably runs generate_response on
    # these examples ahead of time — confirm against the Gradio docs for the
    # installed version.
    gr.Examples(
        examples=[
            [
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg",
                "Identify all vehicles in the image and describe their positions.",
                200
            ],
            [
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/street.jpg",
                "Analyze the street scene and identify any potential safety concerns.",
                256
            ]
        ],
        inputs=[image_input, text_input, max_tokens_slider],
        outputs=[new_text_output, full_output, visualization_output],
        fn=generate_response,
        cache_examples=True
    )

# Start the Gradio server only when this file is executed as a script.
# NOTE(review): share=True asks Gradio for a public share link; presumably
# unnecessary when hosted on Hugging Face Spaces — confirm.
if __name__ == "__main__":
    demo.launch(share=True)