|
|
import os
import tempfile
import traceback

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor

from perceptron.tensorstream.ops import tensor_stream_token_view, modality_mask
from perceptron.pointing.parser import extract_points
|
|
|
|
|
|
|
|
class VisionType:
    """Modality tag values used in TensorStream modality masks."""

    # Value marking image-patch positions in the modality mask
    # (compared against `modality_mask(...)` in decode_tensor_stream).
    image = 1
|
|
|
|
|
def document_to_messages(document, vision_token="<image>"):
    """Convert a Document to messages format compatible with chat templates.

    Each item is a dict with keys ``type`` ("text" or "image"), ``content``,
    and optionally ``role`` (defaults to "user").  Text items become chat
    messages verbatim; image items are loaded and replaced by *vision_token*
    in the message stream so the processor can splice the pixels back in.

    Args:
        document: Iterable of item dicts as described above.
        vision_token: Placeholder string inserted in place of each image.

    Returns:
        Tuple ``(messages, images)``: *messages* is a list of
        ``{"role", "content"}`` dicts, *images* the PIL images in order.
    """
    messages = []
    images = []

    for item in document:
        itype = item.get("type")
        if itype == "text":
            content = item.get("content")
            if content:
                messages.append({
                    "role": item.get("role", "user"),
                    "content": content,
                })
        elif itype == "image":
            content = item.get("content")
            if not content:
                continue
            if isinstance(content, str) and os.path.exists(content):
                img = Image.open(content)
            elif isinstance(content, Image.Image):
                # Already-decoded PIL image: use it directly (previously
                # this case was silently dropped).
                img = content
            elif hasattr(content, 'read'):
                # File-like object (e.g. an uploaded tempfile).
                img = Image.open(content)
            else:
                # Unsupported payload (missing path, raw bytes, ...): skip.
                continue
            images.append(img)
            messages.append({
                "role": item.get("role", "user"),
                "content": vision_token,
            })

    return messages, images
|
|
|
|
|
def decode_tensor_stream(tensor_stream, tokenizer):
    """Decode the text portion of a TensorStream back into a string."""
    tokens = tensor_stream_token_view(tensor_stream)
    modality = modality_mask(tensor_stream)

    # Keep only positions whose modality is not "image" — the text tokens.
    text_tokens = tokens[modality != VisionType.image]

    # Defensive: if indexing somehow yields a batched view, decode row 0.
    if len(text_tokens.shape) > 1:
        text_tokens = text_tokens[0]
    return tokenizer.decode(text_tokens)
|
|
|
|
|
def visualize_predictions(generated_text, image, output_path):
    """Extract bounding boxes from generated text and render them on the input image.

    Boxes are parsed with ``extract_points`` and are scaled from the model's
    0-1000 normalized grid to pixel coordinates.  The annotated image is
    written to *output_path* as JPEG.

    Args:
        generated_text: Raw model output containing box annotations.
        image: Source PIL image.
        output_path: Destination file path (``.jpg``).

    Returns:
        ``output_path`` (also when no boxes were found).
    """
    from PIL import ImageDraw, ImageFont

    boxes = extract_points(generated_text, expected="box")

    if not boxes:
        # No boxes: still emit a file so the caller has an image to show.
        # Convert first — JPEG cannot store an alpha channel (RGBA/P input
        # would otherwise raise OSError on save).
        image.convert("RGB").save(output_path, "JPEG")
        return output_path

    img_width, img_height = image.size

    # Draw on an RGB copy (convert() always returns a new image, and JPEG
    # output requires RGB mode).
    img_with_boxes = image.convert("RGB")
    draw = ImageDraw.Draw(img_with_boxes)

    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
    except OSError:
        # Font file not present on this host: fall back to PIL's builtin.
        font = ImageFont.load_default()

    colors = ["red", "green", "blue", "yellow", "magenta", "cyan", "orange", "purple"]

    for idx, box in enumerate(boxes):
        # Cycle through the palette so every box stays distinguishable.
        color = colors[idx % len(colors)]

        # Model coordinates are normalized to a 0-1000 grid.
        norm_x1, norm_y1 = box.top_left.x, box.top_left.y
        norm_x2, norm_y2 = box.bottom_right.x, box.bottom_right.y

        x1 = int((norm_x1 / 1000.0) * img_width)
        y1 = int((norm_y1 / 1000.0) * img_height)
        x2 = int((norm_x2 / 1000.0) * img_width)
        y2 = int((norm_y2 / 1000.0) * img_height)

        # Clamp to the image bounds to guard against out-of-range outputs.
        x1 = max(0, min(x1, img_width - 1))
        y1 = max(0, min(y1, img_height - 1))
        x2 = max(0, min(x2, img_width - 1))
        y2 = max(0, min(y2, img_height - 1))

        draw.rectangle([x1, y1, x2, y2], outline=color, width=3)

        if box.mention:
            # Place the label just above the box, clamped to the top edge.
            text_y = max(y1 - 20, 5)

            # Filled background behind the label for readability.
            text_bbox = draw.textbbox((x1, text_y), box.mention, font=font)
            draw.rectangle(text_bbox, fill=color)
            draw.text((x1, text_y), box.mention, fill="white", font=font)

    img_with_boxes.save(output_path, "JPEG")
    return output_path
|
|
|
|
|
|
|
|
@spaces.GPU(duration=1500)
def load_model():
    """Load the Perceptron model with AoT compilation.

    Returns:
        Tuple of (model, processor, config, device) with the model in eval
        mode on CUDA/bfloat16 when available, otherwise CPU/float32.
    """
    hf_path = "PerceptronAI/Isaac-0.1"

    print("Loading processor and config...")
    config = AutoConfig.from_pretrained(hf_path, trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(hf_path, trust_remote_code=True)

    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)

    # Prefer CUDA + bfloat16 when a GPU is present; otherwise CPU float32.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    dtype = torch.bfloat16 if use_cuda else torch.float32
    model = model.to(device=device, dtype=dtype).eval()

    print(f"Model loaded on {device} with dtype {dtype}")
    return model, processor, config, device
|
|
|
|
|
|
|
|
model, processor, config, device = load_model() |
|
|
|
|
|
@spaces.GPU(duration=120)
def generate_response(image_file, text_prompt, max_tokens=256):
    """Generate response using Perceptron model.

    Args:
        image_file: Path to the uploaded image (Gradio ``type="filepath"``).
        text_prompt: User question/instruction about the image.
        max_tokens: Upper bound on newly generated tokens.

    Returns:
        Tuple ``(new_text, generated_text, viz_path)``: the newly generated
        answer, the full decoded generation, and the path to the
        box-annotated image (or ``None``).  On failure returns an error
        string plus empty/None placeholders so the UI stays responsive.
    """
    try:
        # The <hint>BOX</hint> prefix asks the model to emit bounding boxes.
        document = [
            {
                "type": "text",
                "content": "<hint>BOX</hint>",
                "role": "user",
            },
            {
                "type": "image",
                "content": image_file,
                "role": "user",
            },
            {
                "type": "text",
                "content": text_prompt,
                "role": "user",
            },
        ]

        messages, images = document_to_messages(document, vision_token=config.vision_token)

        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        inputs = processor(text=text, images=images, return_tensors="pt")
        tensor_stream = inputs["tensor_stream"].to(device)
        input_ids = inputs["input_ids"].to(device)

        with torch.no_grad():
            generated_ids = model.generate(
                tensor_stream=tensor_stream,
                max_new_tokens=max_tokens,
                do_sample=False,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )

        # Full decode keeps special tokens — the box parser needs them.
        generated_text = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=False)

        # Strip the prompt so the UI shows only the newly generated answer.
        if generated_ids.shape[1] > input_ids.shape[1]:
            new_tokens = generated_ids[0, input_ids.shape[1]:]
            new_text = processor.tokenizer.decode(new_tokens, skip_special_tokens=True)
        else:
            new_text = "No new tokens generated"

        # Render any predicted boxes onto the first input image.
        viz_path = None
        if images:
            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
                viz_path = tmp_file.name
            viz_path = visualize_predictions(generated_text, images[0], viz_path)

        return new_text, generated_text, viz_path

    except Exception as e:
        # Surface the error in the UI, but keep the full traceback in the
        # server logs instead of silently discarding it.
        traceback.print_exc()
        return f"Error: {str(e)}", "", None
|
|
|
|
|
|
|
|
# --- Gradio UI -------------------------------------------------------------
# Left column: inputs (image, prompt, token budget). Right column: outputs
# (answer text, optional raw generation, annotated image).
with gr.Blocks(title="HuggingFace Perceptron Demo", theme=gr.themes.Soft()) as demo:
    # NOTE(review): "π" below looks like a mis-encoded emoji — confirm the
    # intended character against the original app before shipping.
    gr.Markdown("""
    # π HuggingFace Perceptron Multimodal AI Demo

    This demo showcases the PerceptronAI/Isaac-0.1 model for multimodal understanding and generation.
    Upload an image and provide a text prompt to see the model's response with bounding box visualizations.

    **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
    """)

    with gr.Row():
        with gr.Column():
            # "filepath" matches generate_response, which passes the path
            # straight into the document as the image content.
            image_input = gr.Image(
                label="Upload Image",
                type="filepath",
                sources=["upload"],
                height=300
            )
            text_input = gr.Textbox(
                label="Text Prompt",
                placeholder="Describe what you want to analyze in the image...",
                lines=3
            )
            max_tokens_slider = gr.Slider(
                label="Max Tokens",
                minimum=50,
                maximum=512,
                value=256,
                step=50
            )
            generate_btn = gr.Button("Generate Response", variant="primary")

        with gr.Column():
            new_text_output = gr.Textbox(
                label="Generated Response",
                lines=4,
                interactive=False
            )
            # Hidden by default; toggled by the checkbox below.
            full_output = gr.Textbox(
                label="Full Generated Text",
                lines=6,
                interactive=False,
                visible=False
            )
            visualization_output = gr.Image(
                label="Visualization with Bounding Boxes",
                height=300,
                interactive=False
            )

    with gr.Accordion("Advanced Options", open=False):
        gr.Markdown("""
        - The model processes both text and images using TensorStream technology
        - Bounding boxes are automatically extracted from the generated text
        - Supports complex multimodal reasoning tasks
        """)
        show_full_checkbox = gr.Checkbox(label="Show Full Generated Text", value=False)

    # Toggle visibility of the raw-generation textbox.
    show_full_checkbox.change(
        lambda x: gr.Textbox(visible=x),
        inputs=show_full_checkbox,
        outputs=full_output
    )

    generate_btn.click(
        fn=generate_response,
        inputs=[image_input, text_input, max_tokens_slider],
        outputs=[new_text_output, full_output, visualization_output]
    )

    # NOTE(review): cache_examples=True runs generate_response at startup to
    # pre-compute these examples — confirm the Space has GPU time for that.
    gr.Examples(
        examples=[
            [
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg",
                "Identify all vehicles in the image and describe their positions.",
                200
            ],
            [
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/street.jpg",
                "Analyze the street scene and identify any potential safety concerns.",
                256
            ]
        ],
        inputs=[image_input, text_input, max_tokens_slider],
        outputs=[new_text_output, full_output, visualization_output],
        fn=generate_response,
        cache_examples=True
    )
|
|
|
|
|
if __name__ == "__main__":
    # share=True requests a public gradio.live tunnel (ignored on HF Spaces,
    # which serve the app directly).
    demo.launch(share=True)