# Isaac-0.1 / app.py: Gradio demo for the PerceptronAI/Isaac-0.1 multimodal model

import os
import tempfile

import spaces
import gradio as gr
import torch
from PIL import Image
from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor

# Import required modules from perceptron
from perceptron.tensorstream.ops import tensor_stream_token_view, modality_mask
from perceptron.pointing.parser import extract_points
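
# As used below: tensor_stream_token_view flattens a TensorStream into a plain
# token-id view, modality_mask marks which positions belong to which modality,
# and extract_points parses pointing/box annotations out of generated text.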


# Define vision type enum
class VisionType:
    image = 1


def document_to_messages(document, vision_token="<image>"):
    """Convert a Document to messages format compatible with chat templates."""
    messages = []
    images = []
    for item in document:
        itype = item.get("type")
        if itype == "text":
            content = item.get("content")
            if content:
                messages.append({
                    "role": item.get("role", "user"),
                    "content": content,
                })
        elif itype == "image":
            content = item.get("content")
            if content:
                if isinstance(content, str) and os.path.exists(content):
                    img = Image.open(content)
                elif hasattr(content, "read"):  # Gradio file-like object
                    img = Image.open(content)
                else:
                    continue
                images.append(img)
                messages.append({
                    "role": item.get("role", "user"),
                    "content": vision_token,
                })
    return messages, images
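
# Example document, mirroring the structure built in generate_response below
# (the image path and prompt are illustrative):
#   document = [
#       {"type": "text", "content": "<hint>BOX</hint>", "role": "user"},
#       {"type": "image", "content": "photo.jpg", "role": "user"},
#       {"type": "text", "content": "Find the dog.", "role": "user"},
#   ]
#   messages, images = document_to_messages(document, vision_token="<image>")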


def decode_tensor_stream(tensor_stream, tokenizer):
    """Decode a TensorStream to see its text content."""
    token_view = tensor_stream_token_view(tensor_stream)
    mod = modality_mask(tensor_stream)
    # Get text tokens (excluding vision tokens)
    text_tokens = token_view[(mod != VisionType.image)]
    decoded = tokenizer.decode(text_tokens[0] if len(text_tokens.shape) > 1 else text_tokens)
    return decoded
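
# decode_tensor_stream is a standalone debugging helper: it is not called from
# the Gradio flow below, but is useful for inspecting the text the model
# actually receives.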


def visualize_predictions(generated_text, image, output_path):
    """Extract bounding boxes from generated text and render them on the input image."""
    from PIL import ImageDraw, ImageFont

    # Extract bounding boxes from the generated text
    boxes = extract_points(generated_text, expected="box")
    if not boxes:
        # Convert first in case the input has an alpha channel (JPEG output)
        image.convert("RGB").save(output_path)
        return output_path

    # Get image dimensions
    img_width, img_height = image.size

    # Create a copy of the image to draw on
    img_with_boxes = image.copy()
    draw = ImageDraw.Draw(img_with_boxes)

    # Try to use a basic font, fall back to the default if not available
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
    except OSError:
        font = ImageFont.load_default()

    # Define colors for different boxes
    colors = ["red", "green", "blue", "yellow", "magenta", "cyan", "orange", "purple"]

    for idx, box in enumerate(boxes):
        color = colors[idx % len(colors)]

        # Extract normalized coordinates (0-1000 range)
        norm_x1, norm_y1 = box.top_left.x, box.top_left.y
        norm_x2, norm_y2 = box.bottom_right.x, box.bottom_right.y

        # Scale coordinates from the 0-1000 range to actual image dimensions
        x1 = int((norm_x1 / 1000.0) * img_width)
        y1 = int((norm_y1 / 1000.0) * img_height)
        x2 = int((norm_x2 / 1000.0) * img_width)
        y2 = int((norm_y2 / 1000.0) * img_height)

        # Clamp coordinates to the image bounds
        x1 = max(0, min(x1, img_width - 1))
        y1 = max(0, min(y1, img_height - 1))
        x2 = max(0, min(x2, img_width - 1))
        y2 = max(0, min(y2, img_height - 1))

        # Draw the bounding box
        draw.rectangle([x1, y1, x2, y2], outline=color, width=3)

        # Add a label if the box carries a mention
        if box.mention:
            # Place the text above the box when there is room
            text_y = max(y1 - 20, 5)
            # Draw a filled background behind the text for readability
            text_bbox = draw.textbbox((x1, text_y), box.mention, font=font)
            draw.rectangle(text_bbox, fill=color)
            draw.text((x1, text_y), box.mention, fill="white", font=font)

    # Save as JPEG; convert first in case the image has an alpha channel
    img_with_boxes.convert("RGB").save(output_path, "JPEG")
    return output_path
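
# Coordinate convention: parsed box corners are normalized to a 0-1000 grid,
# so a box at (250, 500)-(750, 900) on a 640x480 image maps to pixel
# coordinates (160, 240)-(480, 432).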


# Load model and processor once at startup
@spaces.GPU(duration=1500)
def load_model():
    """Load the Isaac-0.1 model, processor, and config, and move the model to the available device."""
    hf_path = "PerceptronAI/Isaac-0.1"

    print("Loading processor and config...")
    config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(hf_path, trust_remote_code=True)

    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)

    # Move to the appropriate device and dtype
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    model = model.to(device=device, dtype=dtype)
    model.eval()

    print(f"Model loaded on {device} with dtype {dtype}")
    return model, processor, config, device


# Load model during startup
model, processor, config, device = load_model()
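
# Note: on ZeroGPU Spaces, @spaces.GPU attaches a GPU only for the duration of
# each decorated call; the generous duration on load_model leaves headroom for
# the initial weight download and device transfer.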


@spaces.GPU(duration=120)
def generate_response(image_file, text_prompt, max_tokens=256):
    """Generate a response from the Isaac-0.1 model for an image and text prompt."""
    try:
        # Build the document: a BOX hint prompting box outputs, then the
        # image and the user's prompt
        document = [
            {
                "type": "text",
                "content": "<hint>BOX</hint>",
                "role": "user",
            },
            {
                "type": "image",
                "content": image_file,
                "role": "user",
            },
            {
                "type": "text",
                "content": text_prompt,
                "role": "user",
            },
        ]

        # Convert the document to messages format
        messages, images = document_to_messages(document, vision_token=config.vision_token)

        # Apply the chat template
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # Process with the Isaac processor
        inputs = processor(text=text, images=images, return_tensors="pt")
        tensor_stream = inputs["tensor_stream"].to(device)
        input_ids = inputs["input_ids"].to(device)

        # Generate text with the model
        with torch.no_grad():
            generated_ids = model.generate(
                tensor_stream=tensor_stream,
                max_new_tokens=max_tokens,
                do_sample=False,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )

        # Decode the full sequence, keeping special tokens for box parsing
        generated_text = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=False)

        # Decode only the newly generated tokens for display
        if generated_ids.shape[1] > input_ids.shape[1]:
            new_tokens = generated_ids[0, input_ids.shape[1]:]
            new_text = processor.tokenizer.decode(new_tokens, skip_special_tokens=True)
        else:
            new_text = "No new tokens generated"

        # Render any predicted boxes onto the input image
        if images:
            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
                viz_path = tmp_file.name
            viz_path = visualize_predictions(generated_text, images[0], viz_path)
        else:
            viz_path = None

        return new_text, generated_text, viz_path
    except Exception as e:
        return f"Error: {e}", "", None


# Create Gradio interface
with gr.Blocks(title="HuggingFace Perceptron Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🚀 HuggingFace Perceptron Multimodal AI Demo

    This demo showcases the PerceptronAI/Isaac-0.1 model for multimodal understanding and generation.
    Upload an image and provide a text prompt to see the model's response with bounding box visualizations.

    **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
    """)

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(
                label="Upload Image",
                type="filepath",
                sources=["upload"],
                height=300
            )
            text_input = gr.Textbox(
                label="Text Prompt",
                placeholder="Describe what you want to analyze in the image...",
                lines=3
            )
            max_tokens_slider = gr.Slider(
                label="Max Tokens",
                minimum=50,
                maximum=512,
                value=256,
                step=50
            )
            generate_btn = gr.Button("Generate Response", variant="primary")

        with gr.Column():
            new_text_output = gr.Textbox(
                label="Generated Response",
                lines=4,
                interactive=False
            )
            full_output = gr.Textbox(
                label="Full Generated Text",
                lines=6,
                interactive=False,
                visible=False
            )
            visualization_output = gr.Image(
                label="Visualization with Bounding Boxes",
                height=300,
                interactive=False
            )

    with gr.Accordion("Advanced Options", open=False):
        gr.Markdown("""
        - The model processes both text and images using TensorStream technology
        - Bounding boxes are automatically extracted from the generated text
        - Supports complex multimodal reasoning tasks
        """)
        show_full_checkbox = gr.Checkbox(label="Show Full Generated Text", value=False)

    # Event handlers
    # Returning a gr.Textbox with visible=x updates the existing component's
    # visibility (Gradio 4's equivalent of gr.update(visible=x))
    show_full_checkbox.change(
        lambda x: gr.Textbox(visible=x),
        inputs=show_full_checkbox,
        outputs=full_output
    )

    generate_btn.click(
        fn=generate_response,
        inputs=[image_input, text_input, max_tokens_slider],
        outputs=[new_text_output, full_output, visualization_output]
    )

    # Examples
    gr.Examples(
        examples=[
            [
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg",
                "Identify all vehicles in the image and describe their positions.",
                200
            ],
            [
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/street.jpg",
                "Analyze the street scene and identify any potential safety concerns.",
                256
            ]
        ],
        inputs=[image_input, text_input, max_tokens_slider],
        outputs=[new_text_output, full_output, visualization_output],
        fn=generate_response,
        cache_examples=True
    )


if __name__ == "__main__":
    demo.launch(share=True)