Spaces:
Sleeping
Sleeping
| from typing import TypedDict, Annotated, List | |
| import operator | |
| import base64 | |
| import gradio as gr | |
| from openai import OpenAI | |
| from pydub import AudioSegment | |
| from pathlib import Path | |
| import os | |
| import soundfile as sf | |
| from pydantic import BaseModel | |
| import anthropic | |
| import mimetypes | |
# Fail fast with a readable message when a required API key is absent.
# The previous `os.environ[key] = os.getenv(key)` round-trip did nothing when
# the variable was set and raised an opaque TypeError when it was not.
_missing_api_keys = [
    key
    for key in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY")
    if os.getenv(key) is None
]
if _missing_api_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_api_keys)
    )

# Shared SDK clients, constructed without explicit arguments as before.
client = OpenAI()
anthropic_client = anthropic.Anthropic()
def transform_text_to_speech(text: str) -> str:
    """Synthesize *text* to speech and return an autoplaying HTML audio player.

    The text is sent to the OpenAI speech endpoint (model "tts-1", voice
    "alloy"), the returned MP3 bytes are converted to WAV with pydub, and the
    WAV bytes are embedded as a base64 data URI inside an ``<audio>`` element.

    Args:
        text: The text to be spoken.

    Returns:
        An HTML snippet containing the audio player, or an empty string when
        *text* is empty or whitespace-only (there is nothing to synthesize).
    """
    # Nothing to speak: skip the API call and render no player.
    if not text or not text.strip():
        return ""

    import tempfile  # local import: only this function needs it

    # Generate speech from the text
    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=text,
    )

    # Use a private per-call directory instead of fixed speech.mp3/speech.wav
    # files in the working directory: concurrent requests would overwrite
    # each other's audio, and the files were never removed.
    with tempfile.TemporaryDirectory() as work_dir:
        mp3_path = Path(work_dir) / "speech.mp3"
        wav_path = Path(work_dir) / "speech.wav"
        mp3_path.write_bytes(response.content)

        # Convert mp3 to wav. export() hands back the output file handle;
        # close it so the directory can be removed on every platform.
        AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav").close()

        # Read the audio file and encode it to base64
        audio_base64 = base64.b64encode(wav_path.read_bytes()).decode("utf-8")

    # Create an HTML audio player with autoplay
    return (
        "\n<audio controls autoplay>\n"
        f'<source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">\n'
        "Your browser does not support the audio element.\n"
        "</audio>\n"
    )
def encode_image(image_path: str) -> str:
    """Read the file at *image_path* and return its bytes as base64 text."""
    raw_bytes = Path(image_path).read_bytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def get_media_type(image_path: str) -> str:
    """Guess the MIME type of *image_path* from its name.

    Falls back to "image/jpeg" when no type can be guessed.
    """
    guessed = mimetypes.guess_type(image_path)[0]
    if guessed:
        return guessed
    return "image/jpeg"
def anthropic_image_model(image_path: str, prompt: str, temperature):
    """Send an image plus a text prompt to the Anthropic model.

    The image is read from *image_path*, base64-encoded and sent together with
    *prompt* in a single user message (image part first, text part second).

    Args:
        image_path: Path of the image file to send.
        prompt: Text placed after the image in the user message.
        temperature: Sampling temperature forwarded to the API call.

    Returns:
        The ``text`` of the first content block of the reply.
    """
    image_data = encode_image(image_path)
    media_type = get_media_type(image_path)
    print(prompt)

    image_part = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": media_type,
            "data": image_data,
        },
    }
    text_part = {"type": "text", "text": prompt}
    user_message = {"role": "user", "content": [image_part, text_part]}

    reply = anthropic_client.messages.create(
        model="claude-3-5-haiku-latest",
        max_tokens=1000,
        temperature=temperature,
        messages=[user_message],
    )
    first_block = reply.content[0]
    return first_block.text
def openai_image_model(image_path: str, prompt: str, temperature) -> str:
    """Send an image plus a text prompt to the OpenAI chat model.

    The image is read from *image_path*, base64-encoded and embedded as a data
    URL in a single user message (image part first, text part second).

    Args:
        image_path: Path of the image file to send.
        prompt: Text placed after the image in the user message.
        temperature: Sampling temperature forwarded to the API call.

    Returns:
        The message content of the first choice in the response.
    """
    encoded_image = encode_image(image_path)
    # Label the data URL with the file's actual media type rather than always
    # claiming JPEG, matching what the Anthropic path does.
    media_type = get_media_type(image_path)

    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{encoded_image}",
                            "detail": "auto",
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            },
        ],
        temperature=temperature,
        max_tokens=1024,
    )
    return response.choices[0].message.content
image_path = ""


def pred(image_input, prompt, temperature, model):
    """Answer *prompt* about the uploaded image and speak the answer.

    Args:
        image_input: Path of the uploaded image, or None when nothing was
            uploaded.
        prompt: Text prompt sent along with the image.
        temperature: Sampling temperature forwarded to the model call.
        model: "gpt-4.1" selects the OpenAI path; any other value selects the
            Anthropic path.

    Returns:
        A ``(text, audio_html)`` pair: the model's reply (or a request to
        select an image) and the HTML audio player that speaks it.
    """
    global image_path
    # The module-level name is still updated so it keeps reflecting the latest
    # submission, but it is no longer read below: the model call uses this
    # request's own argument so concurrent requests cannot swap images.
    image_path = image_input

    if image_input is None:
        message = "Please select an Image"
        return message, transform_text_to_speech(message)

    if model == "gpt-4.1":
        ai_response = openai_image_model(image_input, prompt, temperature)
    else:
        ai_response = anthropic_image_model(image_input, prompt, temperature)
    return ai_response, transform_text_to_speech(ai_response)
| # Gradio interface: declares the input and output components, wires the submit button to pred, and launches the app. | |
| with gr.Blocks(title = "Experimental Setup for Kitchentable.AI") as demo: | |
| with gr.Row(): | |
| with gr.Column(): | |
| # type="filepath": pred forwards this value to helpers that open it as a file path. | |
| image_input = gr.Image(type="filepath", label="Upload an Image") | |
| # pred sends "gpt-4.1" to the OpenAI path and any other choice to the Anthropic path. | |
| model = gr.Dropdown(choices=["gpt-4.1", "claude-3-5-haiku-latest"],label="Select Model",value="gpt-4.1",interactive=True) | |
| temperature = gr.Slider(minimum=0, maximum=0.9999, step=0.01, label="Temperature") | |
| with gr.Column(): | |
| # Receives the first value returned by pred (the reply text). | |
| question = gr.Textbox(label="Agent Output") | |
| # Receives the second value returned by pred (the HTML audio player). | |
| audio_output = gr.HTML(label="Audio Player") | |
| prompt = gr.Textbox(label="Prompt", value = "Your prompt . . .") | |
| submit_button = gr.Button("Submit Prompt", elem_id="Submit") | |
| # The inputs are listed in the same order as pred's parameters: image_input, prompt, temperature, model. | |
| submit_button.click(pred, inputs=[image_input, prompt, temperature, model], outputs=[question, audio_output]) | |
| # NOTE(review): share=True is passed to launch(); confirm it is wanted where this app is hosted. | |
| demo.launch(share=True) | |