# JD / app.py
# UbaidMajied's picture
# Update app.py
# 9cd2e20 verified
from typing import TypedDict, Annotated, List
import operator
import base64
import gradio as gr
from openai import OpenAI
from pydub import AudioSegment
from pathlib import Path
import os
import soundfile as sf
# Only copy the key back into the environment when it is actually set;
# assigning None into os.environ raises a confusing TypeError.
_api_key = os.getenv("OPENAI_API_KEY")
if _api_key:
    os.environ["OPENAI_API_KEY"] = _api_key
# OpenAI() reads OPENAI_API_KEY from the environment.
client = OpenAI()
def encode_image(image_path: str) -> str:
    """Return the binary contents of a file as a base64 encoded string."""
    raw_bytes = Path(image_path).read_bytes()
    return base64.b64encode(raw_bytes).decode("utf-8")
def fast_thinking(image_path: str, prompt: str, temperature) -> str:
    """Ask GPT-4o about an image and return the model's text reply.

    Args:
        image_path: Path to the image file to send (inlined as a data URL).
        prompt: The text instruction accompanying the image.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The assistant message content (plain text).
    """
    encoded_image = encode_image(image_path)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            # Embed the image directly; "auto" lets the API
                            # choose the analysis detail level.
                            "url": f"data:image/jpeg;base64,{encoded_image}",
                            "detail": "auto",
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            },
        ],
        temperature=temperature,
        max_tokens=1024,
    )
    return response.choices[0].message.content
def get_story(image_path: str, prompt: str, temperature) -> str:
    """Ask GPT-4o about an image and return the model's text reply.

    This function was a byte-for-byte duplicate of ``fast_thinking``; it now
    delegates to it, kept as a separate name for call-site clarity (story
    generation vs. conversation turns).
    """
    return fast_thinking(image_path, prompt, temperature)
def transform_text_to_speech(text: str) -> str:
    """Synthesize ``text`` with OpenAI TTS and return an autoplaying HTML audio tag.

    Writes ``speech.mp3`` and ``speech.wav`` into the working directory as a
    side effect (the files are overwritten on every call).
    """
    # Plain strings: the originals used f-strings with no placeholders.
    speech_file_path_mp3 = Path.cwd() / "speech.mp3"
    speech_file_path_wav = Path.cwd() / "speech.wav"
    response = client.audio.speech.create(
        model="tts-1",
        voice="onyx",
        input=text,
    )
    speech_file_path_mp3.write_bytes(response.content)
    # Convert mp3 to wav for the <audio> data URL below.
    audio = AudioSegment.from_mp3(speech_file_path_mp3)
    audio.export(speech_file_path_wav, format="wav")
    # Encode the wav so it can be embedded inline in the returned HTML.
    audio_base64 = base64.b64encode(speech_file_path_wav.read_bytes()).decode("utf-8")
    # HTML audio player with autoplay, rendered by a gr.HTML component.
    audio_html = f"""
    <audio controls autoplay>
    <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
    Your browser does not support the audio element.
    </audio>
    """
    return audio_html
def transform_speech_to_text(audio):
    """Persist a recorded Gradio audio tuple to disk and transcribe it with Whisper.

    ``audio`` is the ``(sample_rate, numpy_samples)`` pair produced by a
    ``gr.Audio(type="numpy")`` component.
    """
    sample_rate, samples = audio
    wav_path = "saved_audio.wav"
    sf.write(wav_path, samples, sample_rate)
    # Whisper needs a file handle, so re-open what we just wrote.
    with open(wav_path, "rb") as audio_file:
        result = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
        )
    return result.text
# Prompt for the very first question about a freshly uploaded photo
# (used when iter == 1 in pred()).  Placeholders: {role}, {rules}.
CONVERSATION_STARTER_PROMPT = """
### Role
{role}
### Context
The user is an older person who has uploaded a photograph. Your goal is to start a meaningful and inviting conversation about the photo.
### Objective
Ask a simple first question that encourages the user to start talking about the photograph based on the below rules.
### Guidelines
Follow these rules while generating the question:
{rules}
### Output
Provide:
- A single, open-ended question based on the above rules.
Note: Output should be in 1 to 2 lines. Please don't generate anything else.
"""
# Prompt for early follow-up turns (iter 2-3 in pred()).
# Placeholders: {role}, {history}, {rules}.
CONVERSATION_STARTER2_PROMPT = """
### Role
{role}
### Context
The user is an older person who has uploaded a photo, and you are at the start of a conversation about it.
Here is the conversation history about the photo between the user and you (Good friend):
{history}
### Objective
Respond to user's most recent input in the conversation history above and a follow-up question generated based on below rules.
### Guidelines
Follow these rules while generating the follow up question:
{rules}
### Output
Provide:
- Respond to user's most recent input in the conversation history above and a follow-up question generated based on above rules.
Note: Output should be in 2 to 3 lines. Please don't generate anything else.
"""
# Prompt for later conversation turns (iter > 3 in pred()).
# Placeholders: {role}, {history}, {rules}.
CONVERSATION_EXPANDING_PROMPT = """
### Role
{role}
### Context
The user is an older person who has uploaded a photo, and you are in the middle of a conversation about it.
Here is the conversation history about the photo between the user and you (Good friend), reflecting the ongoing dialogue:
{history}
### Objective
Respond to user's most recent input in the conversation history above and a follow-up question generated based on below rules
### Guidelines
Follow these rules while generating the follow up question:
{rules}
### Output
Provide:
- Respond to user's most recent input in the conversation history above and a follow-up question generated based on above rules.
Note: Output should be in 2 to 3 lines. Please don't generate anything else.
"""
# Prompt used by generate_story() to summarize the accumulated conversation.
# Placeholder: {conversation}.
generate_story_prompt = """
You are a skilled listener and a respectful storyteller. Your goal is to create a **brief, clear, and faithful third-person summary** of the user's responses about their photo—without embellishment.
### **Given:**
- A photograph uploaded by the user.
- A conversation between an energetic and sympathetic friend and the user about the photograph:
{conversation}
### **Your task:**
Turn the user's words in the conversation above into a **short, objective third-person account** that accurately reflects what they said, without adding anything new.
### **Strict Rules:**
1. **Use only direct quotes from the user whenever possible.** If paraphrasing, ensure absolute neutrality. **Mention "the user" only once in the summary, then refer to them naturally (e.g., "they") or restructure sentences to avoid redundancy.**
2. **Do not invent, embellish, or reinterpret any details.** Stick exactly to what the user has said.
3. **Do not infer emotions, sentiment, or context beyond what the user explicitly stated.** No assumptions about happiness, nostalgia, or significance.
4. **Do not describe the photo beyond what the user shared.** The summary should reflect the conversation, not visual analysis.
5. **Write in the third person**, summarizing exactly what the user said.
6. **Keep the summary concise, well-structured, and under four sentences.**
7. If the user hasn't shared much, provide a neutral **one-line summary** and invite them to say more:
- *"You haven't shared details about this photo yet. I'd love to hear the story behind it!"*
### **Output:**
- A concise, well-structured third-person summary in **plain, natural language**.
- No introductions, artistic flourishes, or speculative details.
- **No descriptions of the image unless explicitly mentioned by the user.**
- **No assumptions about mood, significance, or context beyond the user's words.**
"""
# Module-level conversation state shared by pred(), generate_story() and clear().
# NOTE(review): global state means all concurrent Gradio sessions share one
# conversation — acceptable for a single-user experimental setup.
memory = ""       # accumulated "Good Friend:"/"User:" transcript
iter = 1          # 1-based turn counter (shadows the builtin `iter`)
image_path = ""   # path of the currently discussed image
def pred(image_input, role, conversation_starter_prompt_rules, conversation_starter2_prompt_rules, conversation_expanding_prompt_rules, temperature, reply, audio_reply):
    """Advance the photo conversation by one turn.

    Returns a 6-tuple matching the Gradio outputs:
    (thinking type, question number, agent text, audio HTML,
     cleared reply textbox, cleared audio input).
    """
    global memory
    global iter
    global image_path

    # A new (or changed) image restarts the conversation from scratch.
    if image_path != image_input:
        image_path = image_input
        iter = 1
        memory = ""

    # Guard: without an image, fast_thinking would crash in open().
    if not image_path:
        return "Fast", 0, "Please upload an image first.", "", "", None

    # A recorded voice reply takes precedence over the typed one.
    if audio_reply is not None:
        reply = transform_speech_to_text(audio_reply)
    reply = reply or ""  # Gradio may pass None for an empty textbox

    # Pick the prompt template for this stage of the conversation.
    if iter == 1:
        # Opening turn: no user reply to log yet.
        prompt = CONVERSATION_STARTER_PROMPT.format(
            role=role, rules=conversation_starter_prompt_rules)
    elif iter <= 3:
        memory += "\n" + "User: " + reply
        prompt = CONVERSATION_STARTER2_PROMPT.format(
            role=role, history=memory, rules=conversation_starter2_prompt_rules)
    else:
        memory += "\n" + "User: " + reply
        prompt = CONVERSATION_EXPANDING_PROMPT.format(
            role=role, history=memory, rules=conversation_expanding_prompt_rules)

    # Shared tail that the three original branches duplicated verbatim.
    question = fast_thinking(image_path, prompt, temperature)
    memory += "\n" + "Good Friend: " + question
    iter += 1
    return "Fast", iter - 1, question, transform_text_to_speech(question), "", None
def generate_story(image_input):
    """Summarize the accumulated conversation into a third-person story.

    Returns a 5-tuple matching the Gradio outputs:
    (thinking type, story text, audio HTML, cleared reply, cleared audio).
    """
    # Reads module-level state only; no `global` declaration needed for reads.
    if iter < 4:
        msg = "No Solid Content to generate a Story"
        # Bug fix: the original early return produced only 3 values while the
        # click handler's outputs list expects 5, breaking the UI update.
        return "Fast", msg, transform_text_to_speech(msg), "", None
    prompt = generate_story_prompt.format(conversation=memory)
    story = get_story(image_path, prompt, 0.1)
    return "Fast", story, transform_text_to_speech(story), "", None
def clear():
    """Reset all conversation state and blank out every UI component.

    Returns a 7-tuple matching the reset button's outputs:
    (image, thinking type, question number, question, audio HTML,
     reply, audio input).
    """
    global memory, iter, image_path
    memory, image_path = "", ""
    iter = 1
    return None, "", "", "", None, " ", None
# Gradio Interface
with gr.Blocks(title = "Experimental Setup for Kitchentable.AI") as demo:
    with gr.Row():
        with gr.Column():
            # Left column: experiment inputs — the photo plus the prompt
            # configuration that shapes each conversation stage.
            image_input = gr.Image(type="filepath", label="Upload an Image")
            role = gr.Textbox(label="Role")
            conversation_starter_prompt_rules = gr.Textbox(label="Conversation starter prompt rules(Generates question 1)")
            conversation_starter2_prompt_rules = gr.Textbox(label="Conversation starter2 prompt rules(Generates questions 2, 3)")
            conversation_expanding_prompt_rules = gr.Textbox(label="Conversation expanding prompt rules(Generates question after 3)")
            temperature = gr.Slider(minimum=0, maximum=0.9999, step=0.01, label="Temperature")
        with gr.Column():
            # Right column: agent output plus the user's reply channels
            # (microphone recording or typed text).
            thinkingType = gr.Textbox(label="Thinking Type")
            question_number = gr.Textbox(label="Question Number")
            question = gr.Textbox(label="Agent Output")
            # HTML component so the <audio autoplay> tag from
            # transform_text_to_speech renders and plays.
            audio_output = gr.HTML(label="Audio Player")
            audio_input = gr.Audio(sources="microphone", type="numpy", value=None)
            reply = gr.Textbox(label="Your reply to the question")
            submit_button = gr.Button("Submit Reply", elem_id="Submit")
            Generate_story = gr.Button("Generate Story", elem_id="Submit")
            reset_setup = gr.Button("Reset Setup", elem_id="Submit")
            # critique = gr.Textbox(label="Agent Fast Thinking question Critique")
            # question2 = gr.Textbox(label="Agent Slow Thinking Question")
    # Wire the buttons: pred advances the conversation one turn,
    # generate_story summarizes it, clear resets everything.
    submit_button.click(pred, inputs=[image_input, role, conversation_starter_prompt_rules, conversation_starter2_prompt_rules, conversation_expanding_prompt_rules, temperature, reply, audio_input], outputs=[thinkingType, question_number, question, audio_output, reply, audio_input])
    Generate_story.click(generate_story, inputs = [image_input], outputs = [thinkingType, question, audio_output, reply, audio_input])
    reset_setup.click(clear, inputs = [], outputs = [image_input, thinkingType, question_number, question, audio_output, reply, audio_input])
# Launch the interface
demo.launch(share=True)