UbaidMajied committed on
Commit
70e232f
·
verified ·
1 Parent(s): e758c77

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +175 -0
app.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import mimetypes
import operator
import os
import tempfile
from pathlib import Path
from typing import Annotated, List, TypedDict

import anthropic
import gradio as gr
import soundfile as sf
from openai import OpenAI
from pydantic import BaseModel
from pydub import AudioSegment
13
+
14
# API client setup.
#
# The original code did `os.environ[k] = os.getenv(k)` for each key, which is
# a no-op when the variable is set and raises TypeError (environ values must
# be str, not None) when it is missing.  Both SDKs read their key from the
# environment on their own, so we only validate presence and fail fast with a
# clear message.
for _required_key in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
    if not os.getenv(_required_key):
        raise RuntimeError(f"Missing required environment variable: {_required_key}")

# Module-level clients shared by all requests; both pick up their API key
# from the environment variables validated above.
client = OpenAI()

anthropic_client = anthropic.Anthropic()
21
+
22
def transform_text_to_speech(text: str) -> str:
    """Synthesize *text* with OpenAI TTS and return an autoplaying HTML audio tag.

    The WAV audio is embedded as a base64 data URI so Gradio can render it via
    a gr.HTML component without serving a static file.

    Args:
        text: The text to speak.

    Returns:
        An HTML ``<audio controls autoplay>`` snippet containing the audio.
    """
    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=text,
    )

    # Work in a per-call temporary directory: the original wrote fixed
    # speech.mp3 / speech.wav paths into the CWD, which concurrent Gradio
    # requests would clobber.  (The f-string prefixes on those literals were
    # also spurious — no placeholders.)
    with tempfile.TemporaryDirectory() as tmp_dir:
        mp3_path = Path(tmp_dir) / "speech.mp3"
        wav_path = Path(tmp_dir) / "speech.wav"

        mp3_path.write_bytes(response.content)

        # Convert MP3 -> WAV (via pydub/ffmpeg) for broad <audio> support.
        AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")

        # Embed the WAV bytes as base64 so the player needs no external file.
        audio_base64 = base64.b64encode(wav_path.read_bytes()).decode("utf-8")

    audio_html = f"""
    <audio controls autoplay>
        <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
        Your browser does not support the audio element.
    </audio>
    """
    return audio_html
52
+
53
def encode_image(image_path: str) -> str:
    """Read the file at *image_path* and return its contents base64-encoded."""
    raw_bytes = Path(image_path).read_bytes()
    return base64.b64encode(raw_bytes).decode('utf-8')
57
+
58
+
59
def get_media_type(image_path: str) -> str:
    """Guess a MIME type from the path's extension, defaulting to image/jpeg."""
    guessed, _encoding = mimetypes.guess_type(image_path)
    if not guessed:
        return "image/jpeg"
    return guessed
62
+
63
+
64
def anthropic_image_model(image_path: str, prompt: str, temperature) -> str:
    """Ask Claude (claude-3-5-haiku-latest) a question about an image.

    Args:
        image_path: Path to the image file on disk.
        prompt: The user's question/instruction about the image.
        temperature: Sampling temperature, passed through to the API.

    Returns:
        The model's text answer (first content block of the response).
    """
    # Removed: a stray debug print(prompt) and a dead commented-out
    # `system=prompt` kwarg from the original.
    encoded_image = encode_image(image_path)
    media_type = get_media_type(image_path)
    message = anthropic_client.messages.create(
        model="claude-3-5-haiku-latest",
        max_tokens=1000,
        temperature=temperature,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": encoded_image,
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            }
        ],
    )
    return message.content[0].text
94
+
95
+
96
def openai_image_model(image_path: str, prompt: str, temperature) -> str:
    """Ask GPT-4.1 a question about an image.

    Args:
        image_path: Path to the image file on disk.
        prompt: The user's question/instruction about the image.
        temperature: Sampling temperature, passed through to the API.

    Returns:
        The model's text answer.  (The original annotated ``-> dict``, but the
        function has always returned ``message.content``, a string.)
    """
    encoded_image = encode_image(image_path)
    # Use the file's real MIME type in the data URI; the original hardcoded
    # image/jpeg even for PNG/GIF uploads, unlike the Anthropic code path.
    media_type = get_media_type(image_path)
    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{encoded_image}",
                            "detail": "auto",
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            },
        ],
        temperature=temperature,
        max_tokens=1024,
    )

    return response.choices[0].message.content
132
+
133
# Module-level cache of the most recent image path handed to pred(); reused
# when the user re-submits a prompt against the same upload.
image_path = ""

def pred(image_input, prompt, temperature, model):
    """Gradio callback: answer *prompt* about *image_input* with the chosen model.

    Args:
        image_input: Filepath of the uploaded image (``gr.Image(type="filepath")``)
            or None when nothing was uploaded.
        prompt: The user's prompt text.
        temperature: Sampling temperature forwarded to the model.
        model: Model selector string; "gpt-4.1" routes to OpenAI, anything
            else to the Anthropic model.

    Returns:
        A pair ``(text, audio_html)``: the model's text answer and an
        autoplaying HTML audio player speaking that answer.
    """
    global image_path

    # Guard first: the original compared image_path against image_input
    # before checking for None, and also carried an unreachable
    # `return "Error..", None` after an unconditional return.
    if image_input is None:
        message = "Please select an Image"
        return message, transform_text_to_speech(message)

    if image_path != image_input:
        image_path = image_input

    if model == "gpt-4.1":
        ai_response = openai_image_model(image_path, prompt, temperature)
    else:
        ai_response = anthropic_image_model(image_path, prompt, temperature)

    return ai_response, transform_text_to_speech(ai_response)
156
+
157
+
158
+
159
# Gradio Interface
# Two-column layout: inputs (image, model choice, temperature) on the left,
# model output, audio player, prompt box and submit button on the right.
# NOTE(review): component construction order defines the on-screen layout,
# so the block is kept exactly as written.
with gr.Blocks(title = "Experimental Setup for Kitchentable.AI") as demo:
    with gr.Row():
        with gr.Column():
            # type="filepath" means pred() receives a path string (or None).
            image_input = gr.Image(type="filepath", label="Upload an Image")
            model = gr.Dropdown(choices=["gpt-4.1", "claude-3-5-haiku-latest"],label="Select Model",value="gpt-4.1",interactive=True)
            # Upper bound 0.9999 keeps the slider strictly below 1.0.
            temperature = gr.Slider(minimum=0, maximum=0.9999, step=0.01, label="Temperature")

        with gr.Column():
            question = gr.Textbox(label="Agent Output")
            # pred() returns an <audio> HTML snippet, rendered here.
            audio_output = gr.HTML(label="Audio Player")
            prompt = gr.Textbox(label="Prompt", value = "Your prompt . . .")
            submit_button = gr.Button("Submit Prompt", elem_id="Submit")

    # Wire the button to pred(); outputs map to (text answer, audio HTML).
    submit_button.click(pred, inputs=[image_input, prompt, temperature, model], outputs=[question, audio_output])

# share=True exposes a public Gradio link in addition to the local server.
demo.launch(share=True)