"""Gradio demo: ask a vision model about an uploaded image and speak the answer.

The user uploads an image and a prompt, picks either the OpenAI or the
Anthropic vision model, and receives the model's text answer together with an
auto-playing audio rendition produced by OpenAI text-to-speech.
"""

import base64
import mimetypes
import operator
import os
import tempfile
from pathlib import Path
from typing import Annotated, List, Optional, TypedDict

import anthropic
import gradio as gr
import soundfile as sf
from openai import OpenAI
from pydantic import BaseModel
from pydub import AudioSegment

OPENAI_MODEL = "gpt-4.1"
ANTHROPIC_MODEL = "claude-3-5-haiku-latest"
NO_IMAGE_MESSAGE = "Please select an Image"

# Fail fast with a readable message. The previous
# `os.environ[K] = os.getenv(K)` was a no-op when the key was set and raised an
# opaque TypeError (environment values must be str) when it was not.
for _required_key in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
    if _required_key not in os.environ:
        raise RuntimeError(f"Environment variable {_required_key} is not set")

client = OpenAI()
anthropic_client = anthropic.Anthropic()


def transform_text_to_speech(text: str) -> str:
    """Synthesize *text* with OpenAI TTS and return an HTML audio player.

    The MP3 returned by the API is converted to WAV with pydub and embedded in
    the returned markup as a base64 ``data:`` URL.

    Args:
        text: The text to speak.

    Returns:
        An HTML snippet containing an auto-playing ``<audio>`` element, or an
        empty string when *text* is empty or only whitespace.
    """
    if not text or not text.strip():
        # Nothing to synthesize; skip the API call entirely.
        return ""

    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=text,
    )

    # Use a per-call scratch directory: fixed file names in the working
    # directory would be overwritten by concurrent requests.
    with tempfile.TemporaryDirectory() as workdir:
        mp3_path = Path(workdir) / "speech.mp3"
        wav_path = Path(workdir) / "speech.wav"

        mp3_path.write_bytes(response.content)

        # Convert mp3 to wav. export() hands back an open file handle; close it
        # so the scratch directory can be removed on every platform.
        AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav").close()

        audio_base64 = base64.b64encode(wav_path.read_bytes()).decode("utf-8")

    # NOTE(review): the original template body was blank in the source (the
    # markup appears to have been lost), leaving audio_base64 unused. This
    # player is reconstructed from the original "HTML audio player with
    # autoplay" comment -- confirm it matches the intended markup.
    return (
        "<audio controls autoplay>"
        f'<source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">'
        "Your browser does not support the audio element."
        "</audio>"
    )


def encode_image(image_path: str) -> str:
    """Return the binary contents of a file as a base64 encoded string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def get_media_type(image_path: str) -> str:
    """Guess the MIME type of *image_path* from its name, defaulting to JPEG."""
    mime_type, _ = mimetypes.guess_type(image_path)
    return mime_type or "image/jpeg"


def anthropic_image_model(image_path: str, prompt: str, temperature: float) -> str:
    """Send the image and prompt to the Anthropic model and return its text reply.

    Args:
        image_path: Path of the image file to send.
        prompt: The user's text prompt, sent alongside the image.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The text of the first content block in the model's response.
    """
    message = anthropic_client.messages.create(
        model=ANTHROPIC_MODEL,
        max_tokens=1000,
        temperature=temperature,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": get_media_type(image_path),
                            "data": encode_image(image_path),
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
    )
    return message.content[0].text


def openai_image_model(image_path: str, prompt: str, temperature: float) -> str:
    """Send the image and prompt to the OpenAI model and return its text reply.

    Args:
        image_path: Path of the image file to send.
        prompt: The user's text prompt, sent alongside the image.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The message content of the first choice in the response.
    """
    # Label the data URL with the file's real media type (as the Anthropic path
    # does) rather than always claiming JPEG.
    media_type = get_media_type(image_path)
    encoded_image = encode_image(image_path)
    response = client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{encoded_image}",
                            "detail": "auto",
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            },
        ],
        temperature=temperature,
        max_tokens=1024,
    )
    return response.choices[0].message.content


def pred(image_input: Optional[str], prompt: str, temperature: float, model: str):
    """Gradio click handler: query the selected model and voice its answer.

    Args:
        image_input: File path of the uploaded image, or ``None`` if no image
            was provided.
        prompt: The user's text prompt.
        temperature: Sampling temperature from the slider.
        model: The model name chosen in the dropdown.

    Returns:
        A ``(text, audio_html)`` pair matching the two output components.
    """
    if image_input is None:
        return NO_IMAGE_MESSAGE, transform_text_to_speech(NO_IMAGE_MESSAGE)

    # Anything other than the OpenAI model is routed to Anthropic.
    if model == OPENAI_MODEL:
        ai_response = openai_image_model(image_input, prompt, temperature)
    else:
        ai_response = anthropic_image_model(image_input, prompt, temperature)
    return ai_response, transform_text_to_speech(ai_response)


# Gradio Interface
# NOTE(review): the source had lost its indentation, so the nesting below
# (prompt and submit button inside the right-hand column) is a reconstruction
# -- confirm against the intended layout.
with gr.Blocks(title="Experimental Setup for Kitchentable.AI") as demo:
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Upload an Image")
            model = gr.Dropdown(
                choices=[OPENAI_MODEL, ANTHROPIC_MODEL],
                label="Select Model",
                value=OPENAI_MODEL,
                interactive=True,
            )
            temperature = gr.Slider(
                minimum=0,
                maximum=0.9999,
                step=0.01,
                label="Temperature",
            )
        with gr.Column():
            question = gr.Textbox(label="Agent Output")
            audio_output = gr.HTML(label="Audio Player")
            prompt = gr.Textbox(label="Prompt", value="Your prompt . . .")
            submit_button = gr.Button("Submit Prompt", elem_id="Submit")

    submit_button.click(
        pred,
        inputs=[image_input, prompt, temperature, model],
        outputs=[question, audio_output],
    )

if __name__ == "__main__":
    demo.launch(share=True)