Nils Durner committed
Commit 43b6937 · 1 Parent(s): 59b8207
basic Whisper support
app.py CHANGED
@@ -12,6 +12,7 @@ log_to_console = False

# constants
image_embed_prefix = "🖼️📎 "
+audio_embed_prefix = "🎙️📎 "

def encode_image(image_data):
    """Generates a prefix for image base64 data in the required format for the

@@ -74,9 +75,14 @@ def add_img(history, files):
    for file in files:
        if log_to_console:
            print(f"add_img {file.name}")
-        (removed line not preserved in this extract)
+
+        if file.name.endswith((".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")):
+            prefix = audio_embed_prefix
+        else:
+            prefix = image_embed_prefix
+        history = history + [(prefix + file.name, None)]

-        gr.Info(f"
+        gr.Info(f"Media added as {file.name}")

    return history

@@ -111,55 +117,78 @@ def bot(message, history, oai_key, system_prompt, seed, temperature, max_tokens,
        api_key=oai_key
    )

-    (49 removed lines: previous bot() body, not preserved in this extract)
+    if model == "whisper":
+        result = ""
+        whisper_prompt = system_prompt
+        for human, assi in history:
+            if human is not None:
+                if human.startswith(audio_embed_prefix):
+                    audio_fn = human.lstrip(audio_embed_prefix)
+                    with open(audio_fn, "rb") as f:
+                        transcription = client.audio.transcriptions.create(
+                            model="whisper-1",
+                            prompt=whisper_prompt,
+                            file=f,
+                            response_format="text"
+                        )
+                    whisper_prompt += f"\n{transcription}"
+                    result += f"\n``` transcript {audio_fn}\n {transcription}\n```"
+                else:
+                    whisper_prompt += f"\n{human}"
+            if assi is not None:
+                whisper_prompt += f"\n{assi}"
+    else:
+        seed_i = None
+        if seed:
+            seed_i = int(seed)
+
+        if log_to_console:
+            print(f"bot history: {str(history)}")
+
+        history_openai_format = []
+        user_msg_parts = []
+        if system_prompt:
+            history_openai_format.append({"role": "system", "content": system_prompt})
+        for human, assi in history:
+            if human is not None:
+                if human.startswith(image_embed_prefix):
+                    with open(human.lstrip(image_embed_prefix), mode="rb") as f:
+                        content = f.read()
+                    user_msg_parts.append({"type": "image_url",
+                                           "image_url": {"url": encode_image(content)}})
+                else:
+                    user_msg_parts.append({"type": "text", "text": human})
+
+            if assi is not None:
+                if user_msg_parts:
+                    history_openai_format.append({"role": "user", "content": user_msg_parts})
+                    user_msg_parts = []
+
+                history_openai_format.append({"role": "assistant", "content": assi})
+
+        if message:
+            user_msg_parts.append({"type": "text", "text": human})
+
+        if user_msg_parts:
+            history_openai_format.append({"role": "user", "content": user_msg_parts})
+
+        if log_to_console:
+            print(f"br_prompt: {str(history_openai_format)}")
+
+        response = client.chat.completions.create(
+            model=model,
+            messages= history_openai_format,
+            temperature=temperature,
+            seed=seed_i,
+            max_tokens=max_tokens
+        )
+
+        if log_to_console:
+            print(f"br_response: {str(response)}")
+
+        result = response.choices[0].message.content
+
+    history[-1][1] = result
    if log_to_console:
        print(f"br_result: {str(history)}")

@@ -192,7 +221,7 @@ with gr.Blocks() as demo:

        oai_key = gr.Textbox(label="OpenAI API Key", elem_id="oai_key")
        model = gr.Dropdown(label="Model", value="gpt-4-turbo", allow_custom_value=True, elem_id="model",
-                           choices=["gpt-4-turbo", "gpt-4-turbo-preview", "gpt-4-1106-preview", "gpt-4", "gpt-4-vision-preview", "gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-1106"])
+                           choices=["gpt-4-turbo", "gpt-4-turbo-preview", "gpt-4-1106-preview", "gpt-4", "gpt-4-vision-preview", "gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-1106", "whisper"])
        system_prompt = gr.TextArea("You are a helpful yet diligent AI assistant. Answer faithfully and factually correct. Respond with 'I do not know' if uncertain.", label="System Prompt", lines=3, max_lines=250, elem_id="system_prompt")
        seed = gr.Textbox(label="Seed", elem_id="seed")
        temp = gr.Slider(0, 1, label="Temperature", elem_id="temp", value=1)

@@ -245,7 +274,7 @@ with gr.Blocks() as demo:

    with gr.Row():
        btn = gr.UploadButton("📁 Upload", size="sm", file_count="multiple")
-        img_btn = gr.UploadButton("🖼️ Upload", size="sm", file_count="multiple", file_types=["image"])
+        img_btn = gr.UploadButton("🖼️ Upload", size="sm", file_count="multiple", file_types=["image", "audio"])
        undo_btn = gr.Button("↩️ Undo")
        undo_btn.click(undo, inputs=[chatbot], outputs=[chatbot])

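Note: a minimal standalone sketch (not part of the commit) of the transcription pattern the new whisper branch in bot() relies on, assuming the OpenAI Python v1 SDK that app.py already uses; the API key and audio file paths below are placeholders.

# Sketch only: transcribe audio files with whisper-1, feeding earlier transcripts
# back in as the prompt, as the new branch in bot() does with whisper_prompt.
from openai import OpenAI

client = OpenAI(api_key="sk-...")  # placeholder key

audio_files = ["part1.mp3", "part2.mp3"]  # hypothetical paths
prompt = "You are a helpful yet diligent AI assistant."  # initial prompt, like the system prompt

for path in audio_files:
    with open(path, "rb") as f:
        # response_format="text" makes the SDK return the transcript as a plain string
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            prompt=prompt,
            response_format="text",
        )
    prompt += f"\n{transcript}"  # carry earlier text into the next call
    print(f"``` transcript {path}\n{transcript}\n```")

Chaining the accumulated text into prompt keeps later segments consistent with earlier ones; Whisper's prompt parameter guides vocabulary and style rather than acting as an instruction.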