| from typing import TypedDict, Annotated, List |
| import operator |
| import base64 |
| import gradio as gr |
| from openai import OpenAI |
| from pydub import AudioSegment |
| from pathlib import Path |
| import os |
| import soundfile as sf |
| from pydantic import BaseModel |
| import anthropic |
| import mimetypes |
|
|
def _require_env(name: str) -> str:
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is not set. This replaces the opaque
            ``TypeError: str expected, not NoneType`` that assigning
            ``os.getenv(name)`` back into ``os.environ`` produced.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name} is not set; "
            "export it before starting the app."
        )
    return value


# Shared SDK clients, created once at import time. The keys are validated
# first so a missing key fails immediately with a message naming the variable.
client = OpenAI(api_key=_require_env("OPENAI_API_KEY"))

anthropic_client = anthropic.Anthropic(api_key=_require_env("ANTHROPIC_API_KEY"))
|
|
def transform_text_to_speech(text: str) -> str:
    """Convert *text* to speech and return an HTML ``<audio>`` player for it.

    The text is synthesized with the OpenAI ``tts-1`` model (voice
    ``alloy``), the returned MP3 is converted to WAV with pydub, and the WAV
    bytes are embedded as a base64 ``data:`` URL in an autoplaying player.

    Args:
        text: The text to speak.

    Returns:
        An HTML snippet containing the audio player, or an empty string when
        *text* is empty or whitespace-only (nothing to synthesize).
    """
    import tempfile

    # Skip the network round-trip when there is nothing to speak.
    if not text or not text.strip():
        return ""

    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=text,
    )

    # Use a private scratch directory per call: fixed file names in the
    # working directory would be overwritten by concurrent requests and were
    # never cleaned up.
    with tempfile.TemporaryDirectory() as scratch_dir:
        mp3_path = Path(scratch_dir) / "speech.mp3"
        wav_path = Path(scratch_dir) / "speech.wav"

        mp3_path.write_bytes(response.content)

        # export() hands back the output file handle; close it so the
        # directory can be removed on every platform.
        exported = AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")
        exported.close()

        audio_base64 = base64.b64encode(wav_path.read_bytes()).decode("utf-8")

    audio_html = f"""
    <audio controls autoplay>
    <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
    Your browser does not support the audio element.
    </audio>
    """
    return audio_html
|
|
def encode_image(image_path: str) -> str:
    """Read the file at *image_path* and return its bytes as base64 text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
|
|
|
|
def get_media_type(image_path: str) -> str:
    """Guess the MIME type of *image_path* from its name.

    Falls back to ``"image/jpeg"`` when ``mimetypes`` cannot determine a type.
    """
    guessed = mimetypes.guess_type(image_path)[0]
    if guessed:
        return guessed
    return "image/jpeg"
|
|
|
|
def anthropic_image_model(image_path: str, prompt: str, temperature):
    """Ask the Anthropic ``claude-3-5-haiku-latest`` model about an image.

    The image file is base64-encoded and sent, together with *prompt*, as a
    single user message. The prompt is also echoed to stdout.

    Args:
        image_path: Path of the image file to send.
        prompt: Text placed after the image in the user message.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The ``text`` of the first content block in the reply.
    """
    image_b64 = encode_image(image_path)
    mime = get_media_type(image_path)
    print(prompt)

    image_part = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": mime,
            "data": image_b64,
        },
    }
    text_part = {"type": "text", "text": prompt}
    user_turn = {"role": "user", "content": [image_part, text_part]}

    reply = anthropic_client.messages.create(
        model="claude-3-5-haiku-latest",
        max_tokens=1000,
        temperature=temperature,
        messages=[user_turn],
    )
    first_block = reply.content[0]
    return first_block.text
|
|
|
|
def openai_image_model(image_path: str, prompt: str, temperature) -> str:
    """Ask the OpenAI ``gpt-4.1`` model about an image.

    The image file is base64-encoded and sent as a ``data:`` URL, followed by
    *prompt*, in a single user message.

    Args:
        image_path: Path of the image file to send.
        prompt: Text placed after the image in the user message.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The message content of the first completion choice.
    """
    encoded_image = encode_image(image_path)
    # Label the data URL with the file's real type (as the Anthropic path
    # does) instead of always claiming JPEG.
    media_type = get_media_type(image_path)

    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{encoded_image}",
                            "detail": "auto",
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            },
        ],
        temperature=temperature,
        max_tokens=1024,
    )

    return response.choices[0].message.content
|
|
# Kept only so existing references to this module attribute still resolve.
# pred() no longer reads or writes it: one shared path for all requests is
# unsafe if handlers ever run concurrently.
image_path = ""


def pred(image_input, prompt, temperature, model):
    """Handle a submit click: query the chosen model and voice its answer.

    Args:
        image_input: Path of the uploaded image, or ``None`` when no image
            was selected.
        prompt: Prompt text to send with the image.
        temperature: Sampling temperature forwarded to the model.
        model: Selected model name. ``"gpt-4.1"`` uses the OpenAI path; any
            other value uses the Anthropic path.

    Returns:
        A ``(text, audio_html)`` pair: the model's answer (or a notice asking
        for an image) and the HTML audio player speaking that text.
    """
    if image_input is None:
        notice = "Please select an Image"
        return notice, transform_text_to_speech(notice)

    if model == "gpt-4.1":
        ai_response = openai_image_model(image_input, prompt, temperature)
    else:
        ai_response = anthropic_image_model(image_input, prompt, temperature)

    # The former trailing `return "Error..", None` could never execute and
    # has been removed; failures still propagate to the caller as before.
    return ai_response, transform_text_to_speech(ai_response)
|
|
|
|
|
|
| |
# Two-column page: inputs (image, model, temperature) on the left; the model
# answer, audio player, prompt box and submit button on the right.
# NOTE(review): the source had lost its indentation, so the nesting below is
# reconstructed -- confirm that the column contents match the intended layout.
with gr.Blocks(title="Experimental Setup for Kitchentable.AI") as demo:
    with gr.Row():
        with gr.Column():
            # type="filepath" so the handler is given a path it can open.
            image_input = gr.Image(type="filepath", label="Upload an Image")
            model = gr.Dropdown(
                choices=["gpt-4.1", "claude-3-5-haiku-latest"],
                label="Select Model",
                value="gpt-4.1",
                interactive=True,
            )
            temperature = gr.Slider(
                minimum=0,
                maximum=0.9999,
                step=0.01,
                label="Temperature",
            )

        with gr.Column():
            question = gr.Textbox(label="Agent Output")
            audio_output = gr.HTML(label="Audio Player")
            prompt = gr.Textbox(label="Prompt", value="Your prompt . . .")
            submit_button = gr.Button("Submit Prompt", elem_id="Submit")

    # pred returns (answer text, audio HTML), matching the two outputs.
    submit_button.click(
        pred,
        inputs=[image_input, prompt, temperature, model],
        outputs=[question, audio_output],
    )

# share=True asks Gradio for a shareable link in addition to the local server.
demo.launch(share=True)