# PaliGemma2 Hugging Face Space — app.py
# (originally published by breadlicker45; commit 33262af)
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import torch
import os
import spaces # Import the spaces module
def load_model():
    """Load (once) and cache the PaliGemma2 processor and model.

    The authentication token is read from the HUGGINGFACEHUB_API_TOKEN
    environment variable. The processor/model pair is cached on the
    function object so repeated calls (one per Gradio request) do not
    re-download and re-instantiate the 28B-parameter model.

    Returns:
        tuple: (processor, model) ready for inference.

    Raises:
        ValueError: if the Hugging Face API token is not set.
    """
    # Reuse the already-loaded pair if a previous call succeeded.
    cached = getattr(load_model, "_cache", None)
    if cached is not None:
        return cached
    token = os.getenv("HUGGINGFACEHUB_API_TOKEN")  # Retrieve token from environment variable
    if not token:
        raise ValueError(
            "Hugging Face API token not found. Please set it in the environment variables."
        )
    # `token=` replaces the deprecated `use_auth_token=` keyword.
    processor = AutoProcessor.from_pretrained(
        "google/paligemma2-28b-pt-896", token=token
    )
    model = AutoModelForImageTextToText.from_pretrained(
        "google/paligemma2-28b-pt-896", token=token, torch_dtype=torch.bfloat16
    )
    # Move model to GPU if available.
    if torch.cuda.is_available():
        model = model.to("cuda")
    load_model._cache = (processor, model)
    return load_model._cache
@spaces.GPU  # Runs on GPU when the Space allocates one
def process_image_and_text(image, text_input):
    """Generate text from an image and a text prompt with PaliGemma2.

    Args:
        image: PIL image supplied by the Gradio `Image` component.
        text_input: prompt string supplied by the Gradio `Textbox`.

    Returns:
        str: the decoded model output for the first (only) batch item.
    """
    processor, model = load_model()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Encode both modalities; floating-point tensors are cast to bfloat16
    # to match the model weights (integer token ids are left untouched).
    model_inputs = processor(text=text_input, images=image, return_tensors="pt")
    model_inputs = model_inputs.to(device, dtype=torch.bfloat16)
    # Inference only — no gradients needed.
    with torch.no_grad():
        output_ids = model.generate(**model_inputs, max_new_tokens=100)
    decoded = processor.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]
if __name__ == "__main__":
    # Wire the model function into a simple two-input, one-output UI.
    demo = gr.Interface(
        fn=process_image_and_text,
        inputs=[
            gr.Image(type="pil", label="Upload an image containing text"),
            gr.Textbox(label="Enter Text Prompt"),
        ],
        outputs=gr.Textbox(label="Extracted/Generated Text"),
        title="Text Reading/Generation with PaliGemma2",
        description="Upload an image and enter a text prompt. The model will generate text based on both.",
    )
    demo.launch()