Spaces:
Sleeping
Sleeping
File size: 12,337 Bytes
c1e65f4 7f3ea96 c1e65f4 7f3ea96 c1e65f4 eb22f17 9cd2e20 eb22f17 9cd2e20 eb22f17 9cd2e20 c1e65f4 7f3ea96 c1e65f4 7f3ea96 c1e65f4 7f3ea96 c1e65f4 9fcfd45 c1e65f4 7f3ea96 c1e65f4 9fcfd45 c1e65f4 7f3ea96 c1e65f4 5b39d61 7f3ea96 c1e65f4 7f3ea96 c1e65f4 7f3ea96 c1e65f4 7f3ea96 c1e65f4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 | from typing import TypedDict, Annotated, List
import operator
import base64
import gradio as gr
from openai import OpenAI
from pydub import AudioSegment
from pathlib import Path
import os
import soundfile as sf
# Fail fast with a clear message if the key is missing; the original
# os.environ[...] = os.getenv(...) raised an opaque TypeError (environ
# values must be str) when OPENAI_API_KEY was unset.
_api_key = os.getenv("OPENAI_API_KEY")
if _api_key is None:
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")
os.environ["OPENAI_API_KEY"] = _api_key
# Single shared client; picks up OPENAI_API_KEY from the environment.
client = OpenAI()
def encode_image(image_path: str) -> str:
    """Read the file at *image_path* and return its bytes base64-encoded as text."""
    with open(image_path, "rb") as fh:
        raw = fh.read()
    return base64.b64encode(raw).decode("utf-8")
def fast_thinking(image_path: str, prompt: str, temperature) -> str:
    """Send *prompt* plus the image at *image_path* to GPT-4o and return its reply.

    Parameters:
        image_path: path to the user-uploaded image (sent as a JPEG data URL).
        prompt: full text instruction accompanying the image.
        temperature: sampling temperature forwarded to the chat completion.

    Returns:
        The model's reply text.  (The previous ``-> dict`` annotation was
        wrong: ``message.content`` is a string.)
    """
    encoded_image = encode_image(image_path)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{encoded_image}",
                            "detail": "auto",
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            },
        ],
        temperature=temperature,
        max_tokens=1024,
    )
    return response.choices[0].message.content
def get_story(image_path: str, prompt: str, temperature) -> str:
    """Ask GPT-4o for the story prompt's output about the image; return the reply text.

    This body was a byte-for-byte duplicate of :func:`fast_thinking`, so it
    now delegates to it.  Interface and behavior are unchanged; the previous
    ``-> dict`` annotation was wrong (a string is returned).
    """
    return fast_thinking(image_path, prompt, temperature)
def transform_text_to_speech(text: str) -> str:
    """Synthesize *text* with OpenAI TTS and return an autoplaying HTML <audio> tag.

    Side effects: overwrites ``speech.mp3`` and ``speech.wav`` in the current
    working directory on every call.

    Parameters:
        text: the text to speak.

    Returns:
        An HTML snippet embedding the WAV audio as a base64 data URL.
    """
    # The original used f-strings with no placeholders here; plain literals suffice.
    speech_file_path_mp3 = Path.cwd() / "speech.mp3"
    speech_file_path_wav = Path.cwd() / "speech.wav"
    response = client.audio.speech.create(
        model="tts-1",
        voice="onyx",
        input=text,
    )
    speech_file_path_mp3.write_bytes(response.content)
    # The TTS endpoint returns MP3; convert to WAV for the inline player below.
    audio = AudioSegment.from_mp3(speech_file_path_mp3)
    audio.export(speech_file_path_wav, format="wav")
    # Encode the WAV so it can be embedded directly in the page (no file serving).
    audio_base64 = base64.b64encode(speech_file_path_wav.read_bytes()).decode("utf-8")
    # Autoplay so the question is read aloud as soon as it appears.
    audio_html = f"""
    <audio controls autoplay>
        <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
        Your browser does not support the audio element.
    </audio>
    """
    return audio_html
def transform_speech_to_text(audio):
    """Persist a Gradio mic capture to disk and transcribe it with Whisper.

    *audio* is the (sample_rate, numpy_samples) pair Gradio's Audio component
    emits in "numpy" mode; returns the transcription text.
    """
    rate, samples = audio
    wav_path = "saved_audio.wav"
    sf.write(wav_path, samples, rate)
    # Whisper wants a file handle, so round-trip through the saved WAV.
    with open(wav_path, "rb") as fh:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=fh,
        )
    return transcription.text
# Prompt for the very first question about a freshly uploaded photo (iter == 1).
# Placeholders: {role} = persona text, {rules} = UI-supplied generation rules.
CONVERSATION_STARTER_PROMPT = """
### Role
{role}
### Context
The user is an older person who has uploaded a photograph. Your goal is to start a meaningful and inviting conversation about the photo.
### Objective
Ask a simple first question that encourages the user to start talking about the photograph based on the below rules.
### Guidelines
Follow these rules while generating the question:
{rules}
### Output
Provide:
- A single, open-ended question based on the above rules.
Note: Output should be in 1 to 2 lines. Please don't generate anything else.
"""
# Prompt for questions 2-3, early in the conversation (1 < iter <= 3).
# Placeholders: {role}, {history} = running transcript, {rules}.
CONVERSATION_STARTER2_PROMPT = """
### Role
{role}
### Context
The user is an older person who has uploaded a photo, and you are at the start of a conversation about it.
Here is the conversation history about the photo between the user and you (Good friend):
{history}
### Objective
Respond to user's most recent input in the conversation history above and a follow-up question generated based on below rules.
### Guidelines
Follow these rules while generating the follow up question:
{rules}
### Output
Provide:
- Respond to user's most recent input in the conversation history above and a follow-up question generated based on above rules.
Note: Output should be in 2 to 3 lines. Please don't generate anything else.
"""
# Prompt for every question after the third (iter > 3), mid-conversation.
# Same placeholders as CONVERSATION_STARTER2_PROMPT.
CONVERSATION_EXPANDING_PROMPT = """
### Role
{role}
### Context
The user is an older person who has uploaded a photo, and you are in the middle of a conversation about it.
Here is the conversation history about the photo between the user and you (Good friend), reflecting the ongoing dialogue:
{history}
### Objective
Respond to user's most recent input in the conversation history above and a follow-up question generated based on below rules
### Guidelines
Follow these rules while generating the follow up question:
{rules}
### Output
Provide:
- Respond to user's most recent input in the conversation history above and a follow-up question generated based on above rules.
Note: Output should be in 2 to 3 lines. Please don't generate anything else.
"""
# Prompt used by generate_story(): turns the full transcript ({conversation})
# into a short, strictly non-embellished third-person summary.
generate_story_prompt = """
You are a skilled listener and a respectful storyteller. Your goal is to create a **brief, clear, and faithful third-person summary** of the user's responses about their photo—without embellishment.
### **Given:**
- A photograph uploaded by the user.
- A conversation between an energetic and sympathetic friend and the user about the photograph:
{conversation}
### **Your task:**
Turn the user's words in the conversation above into a **short, objective third-person account** that accurately reflects what they said, without adding anything new.
### **Strict Rules:**
1. **Use only direct quotes from the user whenever possible.** If paraphrasing, ensure absolute neutrality. **Mention "the user" only once in the summary, then refer to them naturally (e.g., "they") or restructure sentences to avoid redundancy.**
2. **Do not invent, embellish, or reinterpret any details.** Stick exactly to what the user has said.
3. **Do not infer emotions, sentiment, or context beyond what the user explicitly stated.** No assumptions about happiness, nostalgia, or significance.
4. **Do not describe the photo beyond what the user shared.** The summary should reflect the conversation, not visual analysis.
5. **Write in the third person**, summarizing exactly what the user said.
6. **Keep the summary concise, well-structured, and under four sentences.**
7. If the user hasn't shared much, provide a neutral **one-line summary** and invite them to say more:
- *"You haven't shared details about this photo yet. I'd love to hear the story behind it!"*
### **Output:**
- A concise, well-structured third-person summary in **plain, natural language**.
- No introductions, artistic flourishes, or speculative details.
- **No descriptions of the image unless explicitly mentioned by the user.**
- **No assumptions about mood, significance, or context beyond the user's words.**
"""
# Mutable conversation state shared by the Gradio callbacks below.
# NOTE(review): module-level globals mean all users of a shared Space share one
# conversation — presumably this demo is single-user; confirm before deploying.
memory = ""  # running transcript of alternating "Good Friend:" / "User:" lines
# NOTE(review): `iter` shadows the builtin `iter()`; renaming (e.g. turn_count)
# would require touching every callback, so it is only flagged here.
iter = 1  # 1-based question counter; 1 selects the conversation-starter prompt
image_path = ""  # path of the image currently under discussion; a new upload resets state
def pred(image_input, role, conversation_starter_prompt_rules, conversation_starter2_prompt_rules, conversation_expanding_prompt_rules, temperature, reply, audio_reply):
    """Handle one "Submit Reply" click: record the reply, ask the next question.

    Picks the prompt template by turn number (1 → starter, 2-3 → starter2,
    4+ → expanding), appends both sides of the exchange to the shared
    transcript, and returns the 6 values wired to the Gradio outputs:
    (thinking type, question number, question text, audio HTML,
    cleared reply box, cleared mic).

    The three original branches were copy-paste duplicates differing only in
    the template used; they are unified here with identical behavior.
    """
    global memory
    global iter
    global image_path
    # A different image restarts the conversation from question 1.
    if image_path != image_input:
        image_path = image_input
        iter = 1
        memory = ""
    # Spoken input, when present, overrides the typed reply.
    if audio_reply is not None:
        reply = transform_speech_to_text(audio_reply)
    if reply is None:
        reply = ""  # guard: Gradio can deliver None, which broke the concat below
    if iter == 1:
        # First turn: no user reply is recorded, we only open the conversation.
        prompt = CONVERSATION_STARTER_PROMPT.format(role=role, rules=conversation_starter_prompt_rules)
    else:
        memory += "\n" + "User: " + reply
        if iter <= 3:
            prompt = CONVERSATION_STARTER2_PROMPT.format(role=role, history=memory, rules=conversation_starter2_prompt_rules)
        else:
            prompt = CONVERSATION_EXPANDING_PROMPT.format(role=role, history=memory, rules=conversation_expanding_prompt_rules)
    question = fast_thinking(image_path, prompt, temperature)
    memory += "\n" + "Good Friend: " + question
    iter += 1
    return "Fast", iter - 1, question, transform_text_to_speech(question), "", None
def generate_story(image_input):
    """Handle "Generate Story": summarize the transcript into a short story.

    Returns the 5 values wired to its Gradio outputs:
    (thinking type, story text, audio HTML, cleared reply box, cleared mic).
    Requires at least three completed exchanges (iter >= 4) to have content.
    """
    global memory
    global iter
    global image_path
    if iter < 4:
        # Bug fix: this early return previously yielded only 3 values while
        # the click handler is wired to 5 outputs, which failed at runtime.
        return "Fast", "No Solid Content to generate a Story", transform_text_to_speech("No Solid Content to generate a Story"), "", None
    # Low temperature: the summary must stay faithful, not creative.
    prompt = generate_story_prompt.format(conversation=memory)
    res = get_story(image_path, prompt, 0.1)
    return "Fast", res, transform_text_to_speech(res), "", None
def clear():
    """Reset the shared conversation state and blank every UI widget.

    Returns the 7 values wired to the reset button's outputs:
    (image, thinking type, question number, question, audio HTML, reply, mic).
    """
    global memory
    global iter
    global image_path
    memory, image_path = "", ""
    iter = 1
    return None, "", "", "", None, " ", None
# Gradio Interface: left column collects the image, persona, per-stage prompt
# rules and temperature; right column shows the agent's question, plays its
# audio, and captures the user's typed or spoken reply.
with gr.Blocks(title = "Experimental Setup for Kitchentable.AI") as demo:
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Upload an Image")
            role = gr.Textbox(label="Role")
            conversation_starter_prompt_rules = gr.Textbox(label="Conversation starter prompt rules(Generates question 1)")
            conversation_starter2_prompt_rules = gr.Textbox(label="Conversation starter2 prompt rules(Generates questions 2, 3)")
            conversation_expanding_prompt_rules = gr.Textbox(label="Conversation expanding prompt rules(Generates question after 3)")
            temperature = gr.Slider(minimum=0, maximum=0.9999, step=0.01, label="Temperature")
        with gr.Column():
            thinkingType = gr.Textbox(label="Thinking Type")
            question_number = gr.Textbox(label="Question Number")
            question = gr.Textbox(label="Agent Output")
            # HTML component so the base64 <audio autoplay> snippet plays inline.
            audio_output = gr.HTML(label="Audio Player")
            audio_input = gr.Audio(sources="microphone", type="numpy", value=None)
            reply = gr.Textbox(label="Your reply to the question")
            submit_button = gr.Button("Submit Reply", elem_id="Submit")
            Generate_story = gr.Button("Generate Story", elem_id="Submit")
            reset_setup = gr.Button("Reset Setup", elem_id="Submit")
            # critique = gr.Textbox(label="Agent Fast Thinking question Critique")
            # question2 = gr.Textbox(label="Agent Slow Thinking Question")
    # Wiring: each handler's return tuple must match its outputs list in
    # length and order (pred → 6, generate_story → 5, clear → 7).
    submit_button.click(pred, inputs=[image_input, role, conversation_starter_prompt_rules, conversation_starter2_prompt_rules, conversation_expanding_prompt_rules, temperature, reply, audio_input], outputs=[thinkingType, question_number, question, audio_output, reply, audio_input])
    Generate_story.click(generate_story, inputs = [image_input], outputs = [thinkingType, question, audio_output, reply, audio_input])
    reset_setup.click(clear, inputs = [], outputs = [image_input, thinkingType, question_number, question, audio_output, reply, audio_input])
# Launch the interface
# share=True exposes a public gradio.live URL in addition to the local server.
demo.launch(share=True)