"""Gradio demo that answers a question about an uploaded image.

The script loads the ``AkshaySiraswar/Florence-2-FT-DocVQA`` processor and
model at import time, wraps them in ``generate_response`` and serves that
function through a ``gr.Interface``.
"""
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
from datasets import load_dataset
import gradio as gr

MODEL_ID = "AkshaySiraswar/Florence-2-FT-DocVQA"

# Resolve the target device once; the model is moved to it at load time, so
# request handling never has to move the model again.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the processor and the model
# NOTE(review): force_download=True is kept from the original; it asks for a
# fresh download of the processor files on every start -- confirm that this
# is intended.
processor = AutoProcessor.from_pretrained(
    MODEL_ID, trust_remote_code=True, force_download=True
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, trust_remote_code=True
).to(DEVICE)


def generate_response(image, question):
    """Return the model's decoded answer to ``question`` about ``image``.

    Args:
        image: PIL image coming from the ``gr.Image(type="pil")`` input, or
            ``None`` when no image was supplied.
        question: Text passed to the processor together with the image.

    Returns:
        The first decoded sequence produced by ``model.generate``. Failures
        are not raised: they are reported as a string starting with
        ``"Error processing image:"`` so the UI can display them.
    """
    if image is None:
        # Without this guard the user only sees an AttributeError message
        # about ``NoneType`` having no ``mode``.
        return "Error processing image: no image was provided"

    try:
        if image.mode != "RGB":
            image = image.convert("RGB")

        inputs = processor(text=question, images=image, return_tensors="pt")
        # Put every input tensor on the same device as the model.
        inputs = {key: value.to(DEVICE) for key, value in inputs.items()}

        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_length=1024,
            num_beams=3,
            early_stopping=True,
        )
        return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"Error processing image: {e}"


# Example images for demonstration (update paths as needed)
examples = [
    ["demo.jpg", "what is the address in the page?"],
    ["demo.jpg", "what is the phone number?"],
    ["demo.jpg", "what is the email address?"],
]

# Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=[gr.Image(type="pil"), gr.Textbox(label="Question")],
    outputs=gr.Textbox(label="Response"),
    examples=examples,
    title="Image to Text Extractor",
    description="Upload an image and provide a question. This tool will extract the relevant information from the image based on your question.",
)

# Launch the interface
iface.launch()