File size: 12,337 Bytes
c1e65f4
 
 
 
 
 
 
 
7f3ea96
c1e65f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f3ea96
 
 
 
 
 
 
 
 
 
 
 
 
c1e65f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb22f17
 
 
 
 
 
 
 
 
 
 
9cd2e20
eb22f17
 
 
 
9cd2e20
eb22f17
 
 
 
 
 
 
 
9cd2e20
c1e65f4
 
 
 
 
 
 
7f3ea96
c1e65f4
 
 
 
 
 
 
 
7f3ea96
 
 
c1e65f4
 
 
 
 
 
7f3ea96
c1e65f4
9fcfd45
c1e65f4
 
 
 
 
7f3ea96
c1e65f4
9fcfd45
c1e65f4
 
 
 
 
7f3ea96
c1e65f4
 
 
 
 
 
 
 
 
 
5b39d61
7f3ea96
c1e65f4
 
 
 
 
 
 
 
 
7f3ea96
c1e65f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f3ea96
c1e65f4
 
 
 
 
 
 
7f3ea96
 
 
c1e65f4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
from typing import TypedDict, Annotated, List
import operator
import base64
import gradio as gr
from openai import OpenAI
from pydub import AudioSegment
from pathlib import Path
import os
import soundfile as sf


# Propagate the API key only when it is actually set. The previous
# `os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")` raised a
# TypeError at import time when the variable was missing, because environ
# values must be str, not None. If the key is absent, OpenAI() itself
# raises a clear authentication error instead.
_api_key = os.getenv("OPENAI_API_KEY")
if _api_key:
  os.environ["OPENAI_API_KEY"] = _api_key

client = OpenAI()

def encode_image(image_path: str) -> str:
  """Read the file at image_path and return its bytes base64-encoded as text."""
  with open(image_path, "rb") as fh:
    raw = fh.read()
  return base64.b64encode(raw).decode('utf-8')


def fast_thinking(image_path: str, prompt: str, temperature) -> str:
  """Send an image plus an instruction prompt to GPT-4o and return the reply text.

  Args:
    image_path: Path to the image file to embed in the request.
    prompt: Instruction text sent alongside the image.
    temperature: Sampling temperature forwarded to the chat completion.

  Returns:
    The assistant message content as a plain string (the original
    annotation claimed ``dict``, but ``message.content`` is a str).
  """
  encoded_image = encode_image(image_path)
  response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "user",
            "content": [
                {
                  "type": "image_url",
                  "image_url": {
                    # Base64 data URL keeps the request self-contained
                    # (no separate file upload step needed).
                    "url": f"data:image/jpeg;base64,{encoded_image}",
                    "detail": "auto"
                  }
                },
                {
                  "type": "text",
                  "text": prompt
                }
            ]
        },
    ],
    temperature=temperature,
    max_tokens=1024,
  )
  return response.choices[0].message.content

def get_story(image_path: str, prompt: str, temperature) -> str:
  """Query GPT-4o with an image and a story-summary prompt; return the reply text.

  This was a byte-for-byte duplicate of ``fast_thinking`` (same model,
  payload, and parameters), so it now delegates to it instead of repeating
  the request-building code. The interface is unchanged; the return
  annotation is corrected from ``dict`` to ``str``.
  """
  return fast_thinking(image_path, prompt, temperature)



def transform_text_to_speech(text: str):
  """Synthesize `text` with OpenAI TTS and return an autoplaying HTML <audio> tag."""
  mp3_path = Path.cwd() / "speech.mp3"
  wav_path = Path.cwd() / "speech.wav"

  tts_response = client.audio.speech.create(
      model="tts-1",
      voice="onyx",
      input=text
  )
  mp3_path.write_bytes(tts_response.content)

  # Convert mp3 to wav for inline browser playback.
  AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")

  # Embed the wav as base64 so the player needs no served file.
  audio_base64 = base64.b64encode(wav_path.read_bytes()).decode('utf-8')

  audio_html = f"""
  <audio controls autoplay>
      <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
      Your browser does not support the audio element.
  </audio>
  """
  return audio_html


def transform_speech_to_text(audio):
  """Write a Gradio (sample_rate, samples) tuple to disk and transcribe it with Whisper."""
  wav_path = "saved_audio.wav"
  rate, samples = audio
  sf.write(wav_path, samples, rate)
  # Whisper needs a real file handle, so read the wav back from disk.
  with open(wav_path, "rb") as fh:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=fh
    )
  return result.text
    
# Prompt templates for the three conversation phases. Placeholders are
# filled via str.format in pred()/generate_story(); the template text is
# runtime behavior and is left untouched.

# Phase 1 (iter == 1): opening question about a freshly uploaded photo.
# Placeholders: {role}, {rules}.
CONVERSATION_STARTER_PROMPT = """
### Role
{role}
### Context
The user is an older person who has uploaded a photograph. Your goal is to start a meaningful and inviting conversation about the photo.
### Objective
Ask a simple first question that encourages the user to start talking about the photograph based on the below rules.
### Guidelines
Follow these rules while generating the question:
{rules}
### Output
Provide:
- A single, open-ended question based on the above rules.
Note: Output should be in 1 to 2 lines. Please don't generate anything else.
"""

# Phase 2 (iter 2-3): early follow-up questions.
# Placeholders: {role}, {history}, {rules}.
CONVERSATION_STARTER2_PROMPT = """
### Role
{role}
### Context
The user is an older person who has uploaded a photo, and you are at the start of a conversation about it.
Here is the conversation history about the photo between the user and you (Good friend):
{history}
### Objective
Respond to user's most recent input in the conversation history above and a follow-up question generated based on below rules.
### Guidelines
Follow these rules while generating the follow up question:
{rules}
### Output
Provide:
- Respond to user's most recent input in the conversation history above and a follow-up question generated based on above rules.
Note: Output should be in 2 to 3 lines. Please don't generate anything else.
"""


# Phase 3 (iter > 3): deepening questions mid-conversation.
# Placeholders: {role}, {history}, {rules}.
CONVERSATION_EXPANDING_PROMPT = """
### Role
{role}
### Context
The user is an older person who has uploaded a photo, and you are in the middle of a conversation about it.
Here is the conversation history about the photo between the user and you (Good friend), reflecting the ongoing dialogue:
{history}
### Objective
Respond to user's most recent input in the conversation history above and a follow-up question generated based on below rules
### Guidelines
Follow these rules while generating the follow up question:
{rules}
### Output
Provide:
- Respond to user's most recent input in the conversation history above and a follow-up question generated based on above rules.
Note: Output should be in 2 to 3 lines. Please don't generate anything else.
"""


# Story-summarization prompt used by generate_story(). Placeholder:
# {conversation}. NOTE(review): lowercase name is inconsistent with the
# UPPER_SNAKE_CASE constants above, but it is referenced elsewhere in the
# file, so it is left unchanged here.
generate_story_prompt = """
You are a skilled listener and a respectful storyteller. Your goal is to create a **brief, clear, and faithful third-person summary** of the user's responses about their photo—without embellishment.

### **Given:**  
- A photograph uploaded by the user.  
- A conversation between an energetic and sympathetic friend and the user about the photograph:  
  {conversation}  

### **Your task:**  
Turn the user's words in the conversation above into a **short, objective third-person account** that accurately reflects what they said, without adding anything new.  

### **Strict Rules:**  
1. **Use only direct quotes from the user whenever possible.** If paraphrasing, ensure absolute neutrality. **Mention "the user" only once in the summary, then refer to them naturally (e.g., "they") or restructure sentences to avoid redundancy.**  
2. **Do not invent, embellish, or reinterpret any details.** Stick exactly to what the user has said.  
3. **Do not infer emotions, sentiment, or context beyond what the user explicitly stated.** No assumptions about happiness, nostalgia, or significance.  
4. **Do not describe the photo beyond what the user shared.** The summary should reflect the conversation, not visual analysis.  
5. **Write in the third person**, summarizing exactly what the user said.  
6. **Keep the summary concise, well-structured, and under four sentences.**  
7. If the user hasn't shared much, provide a neutral **one-line summary** and invite them to say more:  
   - *"You haven't shared details about this photo yet. I'd love to hear the story behind it!"*  

### **Output:**  
- A concise, well-structured third-person summary in **plain, natural language**.  
- No introductions, artistic flourishes, or speculative details.  
- **No descriptions of the image unless explicitly mentioned by the user.**  
- **No assumptions about mood, significance, or context beyond the user's words.**  
"""  



# Mutable module-level conversation state shared by pred(), generate_story()
# and clear(). A Gradio app restart or clear() resets all three.
# NOTE(review): `iter` shadows the `iter` builtin; renaming would require
# touching every function that declares `global iter`, so it is noted here
# rather than changed.
memory = ""       # running transcript of "Good Friend:" / "User:" turns
iter = 1          # 1-based question counter; picks which prompt template pred() uses
image_path = ""   # last image seen; a different upload restarts the conversation

def pred(image_input, role, conversation_starter_prompt_rules, conversation_starter2_prompt_rules, conversation_expanding_prompt_rules,  temperature, reply, audio_reply):
  """Run one conversation turn and return updates for the Gradio outputs.

  Selects the prompt template from the turn counter (`iter`), queries the
  vision model, appends both sides of the exchange to the shared transcript,
  and returns (thinking_type, question_number, text, audio_html,
  cleared_reply, cleared_audio).
  """
  global memory
  global iter
  global image_path

  # A different photo restarts the conversation from question 1.
  if image_path != image_input:
    image_path = image_input
    iter = 1
    memory = ""

  # A spoken reply, when present, replaces the typed one.
  if audio_reply is not None:
    reply = transform_speech_to_text(audio_reply)

  # The three original branches differed only in which template they
  # formatted (and whether a user reply existed yet to record).
  if iter == 1:
    prompt = CONVERSATION_STARTER_PROMPT.format(role=role, rules=conversation_starter_prompt_rules)
  elif iter <= 3:
    memory += "\n" + "User: " + reply
    prompt = CONVERSATION_STARTER2_PROMPT.format(role=role, history=memory, rules=conversation_starter2_prompt_rules)
  else:
    memory += "\n" + "User: " + reply
    prompt = CONVERSATION_EXPANDING_PROMPT.format(role=role, history=memory, rules=conversation_expanding_prompt_rules)

  answer = fast_thinking(image_path, prompt, temperature)
  memory += "\n" + "Good Friend: " + answer
  iter += 1
  return "Fast", iter - 1, answer, transform_text_to_speech(answer), "", None

def generate_story(image_input):
  """Summarize the accumulated conversation into a short third-person story.

  Returns a 5-tuple matching the Gradio click handler's outputs
  [thinkingType, question, audio_output, reply, audio_input].
  """
  global memory
  global iter
  global image_path
  # (the read-only `global generate_story_prompt` declaration was dropped;
  # a declaration is only needed for assignment)

  # Require at least the first three exchanges before writing a story.
  if iter < 4:
    msg = "No Solid Content to generate a Story"
    # BUG FIX: this early return previously produced only 3 values while
    # the click handler wires 5 outputs, which made Gradio error out; pad
    # with the same reply/audio resets as the success path.
    return "Fast", msg, transform_text_to_speech(msg), "", None
  prompt = generate_story_prompt.format(conversation=memory)
  res = get_story(image_path, prompt, 0.1)
  return "Fast", res, transform_text_to_speech(res), "", None

def clear():
  """Reset the shared conversation state and blank every UI component.

  Return order matches the reset handler's outputs: image, thinkingType,
  question_number, question, audio_output, reply, audio_input.
  """
  global memory, iter, image_path
  memory, iter, image_path = "", 1, ""
  return None, "", "", "", None, " ", None



# Gradio Interface: left column holds the experiment inputs (image, role,
# per-phase prompt rules, temperature); right column holds the agent output,
# audio player, and the user's reply controls.
with gr.Blocks(title = "Experimental Setup for Kitchentable.AI") as demo:
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Upload an Image")
            role = gr.Textbox(label="Role")
            conversation_starter_prompt_rules = gr.Textbox(label="Conversation starter prompt rules(Generates question 1)")
            conversation_starter2_prompt_rules = gr.Textbox(label="Conversation starter2 prompt rules(Generates questions 2, 3)")
            conversation_expanding_prompt_rules = gr.Textbox(label="Conversation expanding prompt rules(Generates question after 3)")
            temperature = gr.Slider(minimum=0, maximum=0.9999, step=0.01, label="Temperature")

        with gr.Column():
            thinkingType = gr.Textbox(label="Thinking Type")
            question_number = gr.Textbox(label="Question Number")
            question = gr.Textbox(label="Agent Output")
            # HTML component so the base64 <audio autoplay> snippet from
            # transform_text_to_speech renders and plays in the browser.
            audio_output = gr.HTML(label="Audio Player")
            audio_input = gr.Audio(sources="microphone", type="numpy", value=None)
            reply = gr.Textbox(label="Your reply to the question")
            submit_button = gr.Button("Submit Reply", elem_id="Submit")
            Generate_story = gr.Button("Generate Story", elem_id="Submit")
            reset_setup = gr.Button("Reset Setup", elem_id="Submit")
            # critique = gr.Textbox(label="Agent  Fast Thinking question Critique")
            # question2 = gr.Textbox(label="Agent  Slow Thinking Question")

    # Submit advances the conversation one turn; it also clears the reply
    # textbox and microphone input by returning "" / None for them.
    submit_button.click(pred, inputs=[image_input, role, conversation_starter_prompt_rules, conversation_starter2_prompt_rules, conversation_expanding_prompt_rules, temperature, reply, audio_input], outputs=[thinkingType, question_number, question, audio_output, reply, audio_input])
    # NOTE(review): generate_story's early-return branch yields 3 values
    # while 5 outputs are wired here — confirm against generate_story.
    Generate_story.click(generate_story, inputs = [image_input], outputs = [thinkingType, question, audio_output, reply, audio_input])
    reset_setup.click(clear, inputs = [], outputs = [image_input, thinkingType, question_number, question, audio_output, reply, audio_input])
# Launch the interface
demo.launch(share=True)