Spaces:
Sleeping
Sleeping
| from typing import TypedDict, Annotated, List | |
| import operator | |
| import base64 | |
| import gradio as gr | |
| from openai import OpenAI | |
| from pydub import AudioSegment | |
| from pathlib import Path | |
| import os | |
| import soundfile as sf | |
# Fail fast with a clear error when the key is missing. The original
# `os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")` was a no-op
# when the variable was set and raised a confusing TypeError (None value)
# when it was not.
_api_key = os.getenv("OPENAI_API_KEY")
if _api_key is None:
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")
os.environ["OPENAI_API_KEY"] = _api_key
# Shared OpenAI client used by every helper below (reads the key from the env).
client = OpenAI()
def encode_image(image_path: str) -> str:
    """Return the binary contents of a file as a base64 encoded string."""
    raw = Path(image_path).read_bytes()
    return base64.b64encode(raw).decode("utf-8")
def fast_thinking(image_path: str, prompt: str, temperature) -> str:
    """Send an image plus a text prompt to GPT-4o and return the reply text.

    Args:
        image_path: Path to the image file on disk.
        prompt: Instruction text sent alongside the image.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The text content of the model's first choice.

    Note: the original annotated the return as ``dict`` but actually returns
    ``response.choices[0].message.content`` (a string); the annotation is
    fixed and the dead commented-out LangChain code removed.
    """
    encoded_image = encode_image(image_path)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            # Image is inlined as a base64 data URL;
                            # "auto" lets the API choose the detail level.
                            "url": f"data:image/jpeg;base64,{encoded_image}",
                            "detail": "auto",
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            },
        ],
        temperature=temperature,
        max_tokens=1024,
    )
    return response.choices[0].message.content
def get_story(image_path: str, prompt: str, temperature) -> str:
    """Ask GPT-4o about an image and return the reply text.

    This function was a byte-for-byte duplicate of ``fast_thinking`` (same
    model, same message structure, same ``max_tokens``); it now delegates so
    the vision API call is maintained in exactly one place. The wrong
    ``-> dict`` annotation is also fixed — the result is a string.

    Args:
        image_path: Path to the image file on disk.
        prompt: Instruction text sent alongside the image.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The text content of the model's first choice.
    """
    return fast_thinking(image_path, prompt, temperature)
def transform_text_to_speech(text: str) -> str:
    """Synthesize ``text`` with OpenAI TTS and return an autoplaying HTML audio tag.

    Side effects: writes ``speech.mp3`` (raw API output) and ``speech.wav``
    (converted copy, conversion requires pydub/ffmpeg) into the current
    working directory, overwriting them on every call.

    Args:
        text: The text to speak.

    Returns:
        An HTML snippet embedding the WAV as a base64 data URI inside an
        ``<audio controls autoplay>`` element (rendered by a gr.HTML output).
    """
    # Fixed pointless f-strings around the constant filenames.
    speech_file_path_mp3 = Path.cwd() / "speech.mp3"
    speech_file_path_wav = Path.cwd() / "speech.wav"
    response = client.audio.speech.create(
        model="tts-1",
        voice="onyx",
        input=text,
    )
    speech_file_path_mp3.write_bytes(response.content)
    # Convert mp3 to wav so the inline <audio> element plays reliably.
    audio = AudioSegment.from_mp3(speech_file_path_mp3)
    audio.export(speech_file_path_wav, format="wav")
    # Read the audio file and encode it to base64 for the data URI.
    audio_base64 = base64.b64encode(speech_file_path_wav.read_bytes()).decode("utf-8")
    # Create an HTML audio player with autoplay.
    audio_html = f"""
    <audio controls autoplay>
        <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
        Your browser does not support the audio element.
    </audio>
    """
    return audio_html
def transform_speech_to_text(audio):
    """Transcribe a Gradio microphone recording with Whisper.

    Args:
        audio: Gradio ``type="numpy"`` audio value — a
            ``(sample_rate, samples_array)`` tuple.

    Returns:
        The transcription text returned by the whisper-1 model.
    """
    # Whisper needs a real file, so round-trip the samples through a WAV on
    # disk; "saved_audio.wav" in the CWD is overwritten on every call.
    file_path = "saved_audio.wav"
    sample_rate, audio_data = audio
    sf.write(file_path, audio_data, sample_rate)
    # Transcribe audio
    with open(file_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file
        )
    return transcription.text
# Prompt template for the very first question about a freshly uploaded photo.
# Placeholders: {role} (agent persona textbox), {rules} (starter rules textbox).
CONVERSATION_STARTER_PROMPT = """
### Role
{role}
### Context
The user is an older person who has uploaded a photograph. Your goal is to start a meaningful and inviting conversation about the photo.
### Objective
Ask a simple first question that encourages the user to start talking about the photograph based on the below rules.
### Guidelines
Follow these rules while generating the question:
{rules}
### Output
Provide:
- A single, open-ended question based on the above rules.
Note: Output should be in 1 to 2 lines. Please don't generate anything else.
"""
# Prompt template for questions 2 and 3 (early conversation).
# Placeholders: {role}, {history} (accumulated transcript), {rules}.
CONVERSATION_STARTER2_PROMPT = """
### Role
{role}
### Context
The user is an older person who has uploaded a photo, and you are at the start of a conversation about it.
Here is the conversation history about the photo between the user and you (Good friend):
{history}
### Objective
Respond to user's most recent input in the conversation history above and a follow-up question generated based on below rules.
### Guidelines
Follow these rules while generating the follow up question:
{rules}
### Output
Provide:
- Respond to user's most recent input in the conversation history above and a follow-up question generated based on above rules.
Note: Output should be in 2 to 3 lines. Please don't generate anything else.
"""
# Prompt template for every question after the third (mid-conversation).
# Placeholders: {role}, {history}, {rules}.
CONVERSATION_EXPANDING_PROMPT = """
### Role
{role}
### Context
The user is an older person who has uploaded a photo, and you are in the middle of a conversation about it.
Here is the conversation history about the photo between the user and you (Good friend), reflecting the ongoing dialogue:
{history}
### Objective
Respond to user's most recent input in the conversation history above and a follow-up question generated based on below rules
### Guidelines
Follow these rules while generating the follow up question:
{rules}
### Output
Provide:
- Respond to user's most recent input in the conversation history above and a follow-up question generated based on above rules.
Note: Output should be in 2 to 3 lines. Please don't generate anything else.
"""
# Prompt template used by generate_story() to turn the transcript into a
# faithful third-person summary. Placeholder: {conversation}.
generate_story_prompt = """
You are a skilled listener and a respectful storyteller. Your goal is to create a **brief, clear, and faithful third-person summary** of the user's responses about their photo—without embellishment.
### **Given:**
- A photograph uploaded by the user.
- A conversation between an energetic and sympathetic friend and the user about the photograph:
{conversation}
### **Your task:**
Turn the user's words in the conversation above into a **short, objective third-person account** that accurately reflects what they said, without adding anything new.
### **Strict Rules:**
1. **Use only direct quotes from the user whenever possible.** If paraphrasing, ensure absolute neutrality. **Mention "the user" only once in the summary, then refer to them naturally (e.g., "they") or restructure sentences to avoid redundancy.**
2. **Do not invent, embellish, or reinterpret any details.** Stick exactly to what the user has said.
3. **Do not infer emotions, sentiment, or context beyond what the user explicitly stated.** No assumptions about happiness, nostalgia, or significance.
4. **Do not describe the photo beyond what the user shared.** The summary should reflect the conversation, not visual analysis.
5. **Write in the third person**, summarizing exactly what the user said.
6. **Keep the summary concise, well-structured, and under four sentences.**
7. If the user hasn't shared much, provide a neutral **one-line summary** and invite them to say more:
   - *"You haven't shared details about this photo yet. I'd love to hear the story behind it!"*
### **Output:**
- A concise, well-structured third-person summary in **plain, natural language**.
- No introductions, artistic flourishes, or speculative details.
- **No descriptions of the image unless explicitly mentioned by the user.**
- **No assumptions about mood, significance, or context beyond the user's words.**
"""
| memory = "" | |
| iter = 1 | |
| image_path = "" | |
def pred(image_input, role, conversation_starter_prompt_rules, conversation_starter2_prompt_rules, conversation_expanding_prompt_rules, temperature, reply, audio_reply):
    """Advance the photo conversation by one turn and produce the next question.

    Mutates module-level state: ``memory`` (transcript), ``iter`` (1-based
    question counter) and ``image_path`` (current photo). A spoken reply,
    when recorded, takes precedence over the typed one.

    Args:
        image_input: Filepath of the uploaded image.
        role: Agent persona text.
        conversation_starter_prompt_rules: Rules for question 1.
        conversation_starter2_prompt_rules: Rules for questions 2-3.
        conversation_expanding_prompt_rules: Rules for questions 4+.
        temperature: Sampling temperature for the model.
        reply: The user's typed reply (ignored on the first question).
        audio_reply: Optional microphone recording; transcribed if present.

    Returns:
        A 6-tuple matching the Submit button's outputs:
        (thinking type, question number, question text, audio-player HTML,
         cleared reply textbox, cleared audio input).
    """
    global memory
    global iter
    global image_path
    # A new photo restarts the conversation from question 1.
    if image_path != image_input:
        image_path = image_input
        iter = 1
        memory = ""
    if audio_reply is not None:
        reply = transform_speech_to_text(audio_reply)
    # Choose the prompt template for this stage of the conversation. The
    # original three branches were identical except for the template, so they
    # are consolidated here (behavior unchanged for every reachable state).
    if iter == 1:
        prompt = CONVERSATION_STARTER_PROMPT.format(role=role, rules=conversation_starter_prompt_rules)
    else:
        memory += "\n" + "User: " + reply
        if iter <= 3:
            prompt = CONVERSATION_STARTER2_PROMPT.format(role=role, history=memory, rules=conversation_starter2_prompt_rules)
        else:
            prompt = CONVERSATION_EXPANDING_PROMPT.format(role=role, history=memory, rules=conversation_expanding_prompt_rules)
    question = fast_thinking(image_path, prompt, temperature)
    memory += "\n" + "Good Friend: " + question
    iter += 1
    return "Fast", iter - 1, question, transform_text_to_speech(question), "", None
def generate_story(image_input):
    """Summarize the conversation so far into a short third-person story.

    Note: ``image_input`` is unused — the module-level ``image_path`` set by
    ``pred`` is what is sent to the model.

    Returns:
        A 5-tuple matching the Generate Story button's outputs:
        (thinking type, story text, audio-player HTML, cleared reply,
         cleared audio input).
    """
    global memory
    global iter
    global image_path
    if iter < 4:
        # Fewer than three answered questions: not enough material yet.
        # BUG FIX: the original returned only 3 values here while the button
        # is wired to 5 output components, which made Gradio raise an error.
        msg = "No Solid Content to generate a Story"
        return "Fast", msg, transform_text_to_speech(msg), "", None
    prompt = generate_story_prompt.format(conversation=memory)
    # Low temperature to keep the summary faithful to the transcript.
    res = get_story(image_path, prompt, 0.1)
    return "Fast", res, transform_text_to_speech(res), "", None
def clear():
    """Reset the conversation state and blank every UI component.

    Returns one value per output wired to the Reset button:
    (image, thinking type, question number, question, audio HTML,
     reply textbox, audio input).
    """
    global memory
    global iter
    global image_path
    memory, image_path = "", ""
    iter = 1
    return None, "", "", "", None, " ", None
# Gradio Interface: two-column experiment harness. Left column collects the
# photo and the prompt-engineering knobs; right column shows the agent's
# question, plays its speech, and accepts the user's reply by text or mic.
with gr.Blocks(title = "Experimental Setup for Kitchentable.AI") as demo:
    with gr.Row():
        with gr.Column():
            # Inputs: photo + persona + per-stage prompt rules + temperature.
            image_input = gr.Image(type="filepath", label="Upload an Image")
            role = gr.Textbox(label="Role")
            conversation_starter_prompt_rules = gr.Textbox(label="Conversation starter prompt rules(Generates question 1)")
            conversation_starter2_prompt_rules = gr.Textbox(label="Conversation starter2 prompt rules(Generates questions 2, 3)")
            conversation_expanding_prompt_rules = gr.Textbox(label="Conversation expanding prompt rules(Generates question after 3)")
            temperature = gr.Slider(minimum=0, maximum=0.9999, step=0.01, label="Temperature")
        with gr.Column():
            # Outputs: agent question (+TTS audio) and the user's reply channel.
            thinkingType = gr.Textbox(label="Thinking Type")
            question_number = gr.Textbox(label="Question Number")
            question = gr.Textbox(label="Agent Output")
            audio_output = gr.HTML(label="Audio Player")
            audio_input = gr.Audio(sources="microphone", type="numpy", value=None)
            reply = gr.Textbox(label="Your reply to the question")
            submit_button = gr.Button("Submit Reply", elem_id="Submit")
            Generate_story = gr.Button("Generate Story", elem_id="Submit")
            reset_setup = gr.Button("Reset Setup", elem_id="Submit")
            # critique = gr.Textbox(label="Agent Fast Thinking question Critique")
            # question2 = gr.Textbox(label="Agent Slow Thinking Question")
    # Submit advances the conversation one turn; pred returns a 6-tuple that
    # also clears the reply textbox and the mic recording.
    submit_button.click(pred, inputs=[image_input, role, conversation_starter_prompt_rules, conversation_starter2_prompt_rules, conversation_expanding_prompt_rules, temperature, reply, audio_input], outputs=[thinkingType, question_number, question, audio_output, reply, audio_input])
    # Generate Story summarizes the accumulated transcript (5 outputs).
    Generate_story.click(generate_story, inputs = [image_input], outputs = [thinkingType, question, audio_output, reply, audio_input])
    # Reset wipes the module-level state and blanks all 7 components.
    reset_setup.click(clear, inputs = [], outputs = [image_input, thinkingType, question_number, question, audio_output, reply, audio_input])
# Launch the interface
demo.launch(share=True)