Create app.py
app.py
ADDED
import base64
import mimetypes
import os
from pathlib import Path

import anthropic
import gradio as gr
from openai import OpenAI
from pydub import AudioSegment

# Both clients read their API keys from the environment; fail fast with a
# clear error if either key is missing.
for key in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
    if not os.getenv(key):
        raise RuntimeError(f"Missing environment variable: {key}")

client = OpenAI()

anthropic_client = anthropic.Anthropic()
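
# NOTE: no dependency manifest accompanies this file; inferred from the
# imports above, the third-party requirements are roughly (an assumption,
# versions unpinned):
#   pip install gradio openai anthropic pydub
# pydub also needs the ffmpeg binary on the PATH for the mp3-to-wav
# conversion below (a system package, not a pip package).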

def transform_text_to_speech(text: str) -> str:
    """Synthesize speech for the given text and return an autoplaying HTML audio player."""
    speech_file_path_mp3 = Path.cwd() / "speech.mp3"
    speech_file_path_wav = Path.cwd() / "speech.wav"

    # Generate speech for the text with OpenAI's TTS endpoint
    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=text,
    )
    with open(speech_file_path_mp3, "wb") as f:
        f.write(response.content)

    # Convert mp3 to wav
    audio = AudioSegment.from_mp3(speech_file_path_mp3)
    audio.export(speech_file_path_wav, format="wav")

    # Read the audio file and encode it to base64
    with open(speech_file_path_wav, "rb") as audio_file:
        audio_base64 = base64.b64encode(audio_file.read()).decode("utf-8")

    # Embed the audio in an HTML player with autoplay
    audio_html = f"""
    <audio controls autoplay>
        <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
        Your browser does not support the audio element.
    </audio>
    """
    return audio_html


def encode_image(image_path: str) -> str:
    """Return the binary contents of a file as a base64-encoded string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def get_media_type(image_path: str) -> str:
    """Guess the MIME type of an image file, falling back to JPEG."""
    mime_type, _ = mimetypes.guess_type(image_path)
    return mime_type or "image/jpeg"
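
# Quick usage illustration for the two helpers ("photo.png" is a
# hypothetical file, not part of the app):
#   get_media_type("photo.png")  -> "image/png"
#   encode_image("photo.png")    -> the base64 payload embedded in the requests below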


def anthropic_image_model(image_path: str, prompt: str, temperature: float) -> str:
    """Send the image and prompt to Claude and return the text response."""
    encoded_image = encode_image(image_path)
    media_type = get_media_type(image_path)
    message = anthropic_client.messages.create(
        model="claude-3-5-haiku-latest",
        max_tokens=1000,
        temperature=temperature,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": encoded_image,
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            }
        ],
    )
    return message.content[0].text


def openai_image_model(image_path: str, prompt: str, temperature: float) -> str:
    """Send the image and prompt to GPT and return the text response."""
    encoded_image = encode_image(image_path)
    media_type = get_media_type(image_path)
    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            # Embed the image as a data URL with its detected MIME type
                            "url": f"data:{media_type};base64,{encoded_image}",
                            "detail": "auto",
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            },
        ],
        temperature=temperature,
        max_tokens=1024,
    )
    return response.choices[0].message.content
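
# Both model functions share the (image_path, prompt, temperature) signature,
# so the if/else dispatch in pred() below could also be table-driven.
# A minimal sketch (MODEL_FUNCS is a hypothetical name, not used by the app):
#   MODEL_FUNCS = {
#       "gpt-4.1": openai_image_model,
#       "claude-3-5-haiku-latest": anthropic_image_model,
#   }
#   ai_response = MODEL_FUNCS[model](image_input, prompt, temperature)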


def pred(image_input, prompt, temperature, model):
    """Route the image and prompt to the selected model; return text plus an audio player."""
    if image_input is None:
        message = "Please select an Image"
        return message, transform_text_to_speech(message)

    if not prompt or not prompt.strip():
        message = "Please enter a prompt"
        return message, transform_text_to_speech(message)

    if model == "gpt-4.1":
        ai_response = openai_image_model(image_input, prompt, temperature)
    else:
        ai_response = anthropic_image_model(image_input, prompt, temperature)

    return ai_response, transform_text_to_speech(ai_response)


# Gradio Interface
with gr.Blocks(title="Experimental Setup for Kitchentable.AI") as demo:
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Upload an Image")
            model = gr.Dropdown(
                choices=["gpt-4.1", "claude-3-5-haiku-latest"],
                label="Select Model",
                value="gpt-4.1",
                interactive=True,
            )
            temperature = gr.Slider(minimum=0, maximum=0.9999, step=0.01, label="Temperature")

        with gr.Column():
            question = gr.Textbox(label="Agent Output")
            audio_output = gr.HTML(label="Audio Player")
            # Hint text shown while the box is empty; it is not submitted as the prompt
            prompt = gr.Textbox(label="Prompt", placeholder="Your prompt . . .")
            submit_button = gr.Button("Submit Prompt", elem_id="Submit")

    submit_button.click(
        pred,
        inputs=[image_input, prompt, temperature, model],
        outputs=[question, audio_output],
    )

demo.launch(share=True)
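
# For a quick check without the browser UI, pred() can be called directly.
# A hypothetical smoke test ("test.jpg" is a placeholder path; both API keys
# must be set in the environment) -- run in place of demo.launch(share=True):
#   text, audio_html = pred("test.jpg", "Describe this image.", 0.2, "gpt-4.1")
#   print(text)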