Hugging Face Space — status: Sleeping
import functools
import os

import gradio as gr
import spaces  # Provides the @spaces.GPU decorator used on ZeroGPU Spaces.
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText
@functools.lru_cache(maxsize=1)
def load_model():
    """Load the PaliGemma2 processor and model, authenticated via env token.

    The result is cached (``lru_cache``) so the ~28B-parameter checkpoint is
    downloaded and instantiated only once per process, instead of on every
    inference request.

    Returns:
        tuple: ``(processor, model)`` — the model is in ``bfloat16`` and is
        moved to CUDA when a GPU is available.

    Raises:
        ValueError: If the ``HUGGINGFACEHUB_API_TOKEN`` environment variable
            is not set.
    """
    token = os.getenv("HUGGINGFACEHUB_API_TOKEN")  # Retrieve token from environment variable
    if not token:
        raise ValueError(
            "Hugging Face API token not found. Please set it in the environment variables."
        )
    # `token=` replaces the deprecated `use_auth_token=` keyword argument.
    processor = AutoProcessor.from_pretrained(
        "google/paligemma2-28b-pt-896", token=token
    )
    model = AutoModelForImageTextToText.from_pretrained(
        "google/paligemma2-28b-pt-896", token=token, torch_dtype=torch.bfloat16
    )
    # Move model to GPU if available.
    if torch.cuda.is_available():
        model = model.to("cuda")
    return processor, model
# Decorate the function that uses the GPU so ZeroGPU allocates hardware
# for the duration of each call (this is why `spaces` is imported).
@spaces.GPU
def process_image_and_text(image, text_input):
    """Generate text from an image and a text prompt using PaliGemma2.

    Args:
        image: PIL image supplied by the Gradio ``gr.Image(type="pil")`` input.
        text_input: Prompt string from the Gradio textbox.

    Returns:
        str: The decoded model output for the first (only) batch element.
    """
    processor, model = load_model()
    # Preprocess, then match the model's device and bfloat16 dtype.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = processor(text=text_input, images=image, return_tensors="pt").to(
        device, dtype=torch.bfloat16
    )
    # Generate predictions without tracking gradients.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=100)
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return text
if __name__ == "__main__":
    # Build the UI components first, then wire them into the Interface.
    image_input = gr.Image(type="pil", label="Upload an image containing text")
    prompt_input = gr.Textbox(label="Enter Text Prompt")
    output_box = gr.Textbox(label="Extracted/Generated Text")
    demo = gr.Interface(
        fn=process_image_and_text,
        inputs=[image_input, prompt_input],
        outputs=output_box,
        title="Text Reading/Generation with PaliGemma2",
        description="Upload an image and enter a text prompt. The model will generate text based on both.",
    )
    demo.launch()