# Isaac-0.1 / app.py: Gradio demo for the PerceptronAI/Isaac-0.1 multimodal model

import os
import tempfile

import spaces
import gradio as gr
import torch
from PIL import Image
from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor

# Import required modules from perceptron
from perceptron.tensorstream.ops import tensor_stream_token_view, modality_mask
from perceptron.pointing.parser import extract_points
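
# As used below: tensor_stream_token_view flattens a TensorStream into a plain
# token-id view, modality_mask marks which positions belong to which modality,
# and extract_points parses pointing/box annotations out of generated text.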


# Define vision type enum
class VisionType:
    image = 1


def document_to_messages(document, vision_token="<image>"):
    """Convert a Document to messages format compatible with chat templates."""
    messages = []
    images = []
    for item in document:
        itype = item.get("type")
        if itype == "text":
            content = item.get("content")
            if content:
                messages.append({
                    "role": item.get("role", "user"),
                    "content": content,
                })
        elif itype == "image":
            content = item.get("content")
            if content:
                if isinstance(content, str) and os.path.exists(content):
                    img = Image.open(content)
                elif hasattr(content, "read"):  # Gradio file-like object
                    img = Image.open(content)
                else:
                    continue
                images.append(img)
                messages.append({
                    "role": item.get("role", "user"),
                    "content": vision_token,
                })
    return messages, images
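
# Example document, mirroring the structure built in generate_response below
# (the image path and prompt are illustrative):
#   document = [
#       {"type": "text", "content": "<hint>BOX</hint>", "role": "user"},
#       {"type": "image", "content": "photo.jpg", "role": "user"},
#       {"type": "text", "content": "Find the dog.", "role": "user"},
#   ]
#   messages, images = document_to_messages(document, vision_token="<image>")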


def decode_tensor_stream(tensor_stream, tokenizer):
    """Decode a TensorStream to see its text content."""
    token_view = tensor_stream_token_view(tensor_stream)
    mod = modality_mask(tensor_stream)
    # Get text tokens (excluding vision tokens)
    text_tokens = token_view[(mod != VisionType.image)]
    decoded = tokenizer.decode(text_tokens[0] if len(text_tokens.shape) > 1 else text_tokens)
    return decoded
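
# decode_tensor_stream is a standalone debugging helper: it is not called from
# the Gradio flow below, but is useful for inspecting the text the model
# actually receives.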


def visualize_predictions(generated_text, image, output_path):
    """Extract bounding boxes from generated text and render them on the input image."""
    from PIL import ImageDraw, ImageFont

    # Extract bounding boxes from the generated text
    boxes = extract_points(generated_text, expected="box")
    if not boxes:
        # Convert first in case the input has an alpha channel (JPEG output)
        image.convert("RGB").save(output_path)
        return output_path

    # Get image dimensions
    img_width, img_height = image.size

    # Create a copy of the image to draw on
    img_with_boxes = image.copy()
    draw = ImageDraw.Draw(img_with_boxes)

    # Try to use a basic font, fall back to the default if not available
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
    except OSError:
        font = ImageFont.load_default()

    # Define colors for different boxes
    colors = ["red", "green", "blue", "yellow", "magenta", "cyan", "orange", "purple"]

    for idx, box in enumerate(boxes):
        color = colors[idx % len(colors)]

        # Extract normalized coordinates (0-1000 range)
        norm_x1, norm_y1 = box.top_left.x, box.top_left.y
        norm_x2, norm_y2 = box.bottom_right.x, box.bottom_right.y

        # Scale coordinates from the 0-1000 range to actual image dimensions
        x1 = int((norm_x1 / 1000.0) * img_width)
        y1 = int((norm_y1 / 1000.0) * img_height)
        x2 = int((norm_x2 / 1000.0) * img_width)
        y2 = int((norm_y2 / 1000.0) * img_height)

        # Clamp coordinates to the image bounds
        x1 = max(0, min(x1, img_width - 1))
        y1 = max(0, min(y1, img_height - 1))
        x2 = max(0, min(x2, img_width - 1))
        y2 = max(0, min(y2, img_height - 1))

        # Draw the bounding box
        draw.rectangle([x1, y1, x2, y2], outline=color, width=3)

        # Add a label if the box carries a mention
        if box.mention:
            # Place the text above the box when there is room
            text_y = max(y1 - 20, 5)
            # Draw a filled background behind the text for readability
            text_bbox = draw.textbbox((x1, text_y), box.mention, font=font)
            draw.rectangle(text_bbox, fill=color)
            draw.text((x1, text_y), box.mention, fill="white", font=font)

    # Save as JPEG; convert first in case the image has an alpha channel
    img_with_boxes.convert("RGB").save(output_path, "JPEG")
    return output_path
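
# Coordinate convention: parsed box corners are normalized to a 0-1000 grid,
# so a box at (250, 500)-(750, 900) on a 640x480 image maps to pixel
# coordinates (160, 240)-(480, 432).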


# Load model and processor once at startup
@spaces.GPU(duration=1500)
def load_model():
    """Load the Isaac-0.1 model, processor, and config, and move the model to the available device."""
    hf_path = "PerceptronAI/Isaac-0.1"

    print("Loading processor and config...")
    config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(hf_path, trust_remote_code=True)

    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)

    # Move to the appropriate device and dtype
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    model = model.to(device=device, dtype=dtype)
    model.eval()

    print(f"Model loaded on {device} with dtype {dtype}")
    return model, processor, config, device


# Load model during startup
model, processor, config, device = load_model()
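
# Note: on ZeroGPU Spaces, @spaces.GPU attaches a GPU only for the duration of
# each decorated call; the generous duration on load_model leaves headroom for
# the initial weight download and device transfer.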


@spaces.GPU(duration=120)
def generate_response(image_file, text_prompt, max_tokens=256):
    """Generate a response from the Isaac-0.1 model for an image and text prompt."""
    try:
        # Build the document: a BOX hint prompting box outputs, then the
        # image and the user's prompt
        document = [
            {
                "type": "text",
                "content": "<hint>BOX</hint>",
                "role": "user",
            },
            {
                "type": "image",
                "content": image_file,
                "role": "user",
            },
            {
                "type": "text",
                "content": text_prompt,
                "role": "user",
            },
        ]

        # Convert the document to messages format
        messages, images = document_to_messages(document, vision_token=config.vision_token)

        # Apply the chat template
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # Process with the Isaac processor
        inputs = processor(text=text, images=images, return_tensors="pt")
        tensor_stream = inputs["tensor_stream"].to(device)
        input_ids = inputs["input_ids"].to(device)

        # Generate text with the model
        with torch.no_grad():
            generated_ids = model.generate(
                tensor_stream=tensor_stream,
                max_new_tokens=max_tokens,
                do_sample=False,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )

        # Decode the full sequence, keeping special tokens for box parsing
        generated_text = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=False)

        # Decode only the newly generated tokens for display
        if generated_ids.shape[1] > input_ids.shape[1]:
            new_tokens = generated_ids[0, input_ids.shape[1]:]
            new_text = processor.tokenizer.decode(new_tokens, skip_special_tokens=True)
        else:
            new_text = "No new tokens generated"

        # Render any predicted boxes onto the input image
        if images:
            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
                viz_path = tmp_file.name
            viz_path = visualize_predictions(generated_text, images[0], viz_path)
        else:
            viz_path = None

        return new_text, generated_text, viz_path
    except Exception as e:
        return f"Error: {e}", "", None


# Create Gradio interface
with gr.Blocks(title="HuggingFace Perceptron Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🚀 HuggingFace Perceptron Multimodal AI Demo

    This demo showcases the PerceptronAI/Isaac-0.1 model for multimodal understanding and generation.
    Upload an image and provide a text prompt to see the model's response with bounding box visualizations.

    **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
    """)

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(
                label="Upload Image",
                type="filepath",
                sources=["upload"],
                height=300
            )
            text_input = gr.Textbox(
                label="Text Prompt",
                placeholder="Describe what you want to analyze in the image...",
                lines=3
            )
            max_tokens_slider = gr.Slider(
                label="Max Tokens",
                minimum=50,
                maximum=512,
                value=256,
                step=50
            )
            generate_btn = gr.Button("Generate Response", variant="primary")

        with gr.Column():
            new_text_output = gr.Textbox(
                label="Generated Response",
                lines=4,
                interactive=False
            )
            full_output = gr.Textbox(
                label="Full Generated Text",
                lines=6,
                interactive=False,
                visible=False
            )
            visualization_output = gr.Image(
                label="Visualization with Bounding Boxes",
                height=300,
                interactive=False
            )

    with gr.Accordion("Advanced Options", open=False):
        gr.Markdown("""
        - The model processes both text and images using TensorStream technology
        - Bounding boxes are automatically extracted from the generated text
        - Supports complex multimodal reasoning tasks
        """)
        show_full_checkbox = gr.Checkbox(label="Show Full Generated Text", value=False)

    # Event handlers
    # Returning a gr.Textbox with visible=x updates the existing component's
    # visibility (Gradio 4's equivalent of gr.update(visible=x))
    show_full_checkbox.change(
        lambda x: gr.Textbox(visible=x),
        inputs=show_full_checkbox,
        outputs=full_output
    )

    generate_btn.click(
        fn=generate_response,
        inputs=[image_input, text_input, max_tokens_slider],
        outputs=[new_text_output, full_output, visualization_output]
    )

    # Examples
    gr.Examples(
        examples=[
            [
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg",
                "Identify all vehicles in the image and describe their positions.",
                200
            ],
            [
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/street.jpg",
                "Analyze the street scene and identify any potential safety concerns.",
                256
            ]
        ],
        inputs=[image_input, text_input, max_tokens_slider],
        outputs=[new_text_output, full_output, visualization_output],
        fn=generate_response,
        cache_examples=True
    )


if __name__ == "__main__":
    demo.launch(share=True)