"""Gradio app that sends an uploaded image to Mistral's Pixtral model.

The uploaded picture is encoded as a base64 data URL, posted to the chat
completion endpoint together with a text prompt, and the model's reply is
shown both as plain text and as rendered Markdown.
"""

import base64
import json
import os
from io import BytesIO
from typing import Optional, Tuple

import gradio as gr
import requests
from mistralai import Mistral
from PIL import Image

# Name of the environment variable holding the Mistral API key.
_API_KEY_ENV_VAR = "mistral_api_key"
# Vision-capable model used for the request.
_MODEL_NAME = "pixtral-12b-2409"
# Text prompt sent alongside the image.
_PROMPT = "What's in this image?"


def _image_to_data_url(image: Image.Image) -> str:
    """Serialize *image* as PNG and return it as a base64 ``data:`` URL."""
    buffer = BytesIO()
    image.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
    # The MIME type must match the format used in ``image.save`` above.
    return f"data:image/png;base64,{encoded}"


def ocr_with_pixtral(image: Optional[Image.Image]) -> Tuple[str, str]:
    """Ask the Pixtral model to describe *image* and return its answer.

    Args:
        image: The PIL image supplied by the Gradio ``Image`` component, or
            ``None`` when the user clicked the button without uploading.

    Returns:
        The model's reply twice: once for the textbox output and once for
        the Markdown output wired up in the interface below.

    Raises:
        gr.Error: If no image was provided or the API key environment
            variable is not set.
    """
    if image is None:
        raise gr.Error("Please upload an image before extracting text.")

    api_key = os.getenv(_API_KEY_ENV_VAR)
    if not api_key:
        raise gr.Error(
            f"Environment variable '{_API_KEY_ENV_VAR}' is not set."
        )

    client = Mistral(api_key=api_key)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": _PROMPT},
                {"type": "image_url", "image_url": _image_to_data_url(image)},
            ],
        }
    ]

    chat_response = client.chat.complete(model=_MODEL_NAME, messages=messages)

    # Both output components display the same reply.
    content = chat_response.choices[0].message.content
    return content, content


# Create the Gradio interface
with gr.Blocks(title="Receipt Scanner") as app:
    gr.Markdown("# Receipt Scanner")
    gr.Markdown(
        "Upload your receipt and get the extracted text in structured form."
    )

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Upload Image")
            submit_button = gr.Button("Extract Text")
        with gr.Column():
            output_textbox = gr.Textbox(label="Extracted Text", max_lines=5)
            output_markdown = gr.Markdown(label="Extracted Text")

    submit_button.click(
        fn=ocr_with_pixtral,
        inputs=input_image,
        outputs=[output_textbox, output_markdown],
    )

    gr.Markdown("## Instructions")
    gr.Markdown("1. Upload an image containing text")
    gr.Markdown("2. Click 'Extract Text' to process the image")
    gr.Markdown("3. View the extracted text in the output box")

# Launch the app
if __name__ == "__main__":
    app.launch(debug=True)