Spaces:

UbaidMajied
/

Colin

Sleeping

App Files Files Community

UbaidMajied committed on Jan 20, 2025

Commit

373bc71

verified ·

1 Parent(s): a444054

Create app.py

Browse files

Files changed (1) hide show

app.py +432 -0

app.py ADDED Viewed

	@@ -0,0 +1,432 @@

+from langgraph.graph import StateGraph, END
+from typing import TypedDict, Annotated, List
+import operator
+from langgraph.checkpoint.sqlite import SqliteSaver
+from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, AIMessage, ChatMessage
+from langchain_core.runnables import chain
+from langchain_openai import ChatOpenAI
+from langchain_core.pydantic_v1 import BaseModel, Field
+from langchain_core.output_parsers import JsonOutputParser
+import base64
+from langchain.chains import TransformChain
+from google.colab import userdata
+from IPython import display
+import gradio as gr
+from openai import OpenAI
+from pydub import AudioSegment
+from pathlib import Path
+import os
+os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
+def encode_image(image_path: str) -> str:
+  """Return the binary contents of a file as a base64 encoded string."""
+  with open(image_path, "rb") as image_file:
+    return base64.b64encode(image_file.read()).decode('utf-8')
+def load_image(inputs: dict) -> dict:
+  """Load image from file and encode it as base64."""
+  image_path = inputs["image_path"]
+  image_base64 = encode_image(image_path)
+  return {"image": image_base64}
+def get_open_ai_api_key() -> str:
+  return userdata.get('OPEN_AI_API_KEY')
+# Shared OpenAI client used by the chat-completion and text-to-speech helpers in this file.
+client = OpenAI()
+def encode_image(image_path):
+  with open(image_path, "rb") as image_file:
+    return base64.b64encode(image_file.read()).decode('utf-8')
+def load_image(inputs: dict) -> dict:
+    """Load image from file and encode it as base64."""
+    image_path = inputs["image_path"]
+    image_base64 = encode_image(image_path)
+    return {"image": image_base64}
+class GenerateQuestion(BaseModel):
+ """Information about an image."""
+ question: str = Field(description= "A single, open-ended question to start the convesation")
+QUESTION_PARSER = JsonOutputParser(pydantic_object=GenerateQuestion)
+# Output schema for the early follow-up turns: a single string that combines an
+# acknowledgement of the user's latest input with a follow-up question about the photograph.
+class GenerateQuestion2(BaseModel):
+ """Information about an image and the user's responses."""
+ acknowledgement_followback_question: str = Field(description= "An acknowledgement to user's most recent input and a follow-up question to gather more information about the photograph.")
+# JSON output parser bound to the GenerateQuestion2 schema.
+QUESTION_PARSER_2 = JsonOutputParser(pydantic_object=GenerateQuestion2)
+# Output schema for the later turns: same single-field shape as GenerateQuestion2,
+# but the follow-up question is meant to expand on the conversation.
+class GenerateQuestion3(BaseModel):
+ """Information about an image and the user's responses."""
+ acknowledgement_followback_question: str = Field(description= "An acknowledgement to user's most recent input and a follow-up question to expand on the conversation.")
+# JSON output parser bound to the GenerateQuestion3 schema.
+QUESTION_PARSER_3 = JsonOutputParser(pydantic_object=GenerateQuestion3)
+# Output schema for the critique step: a critique plus a revised reply / follow-up question.
+class GenerateCritique(BaseModel):
+ """Information about an image."""
+ critique: str = Field(description= "A Critique")
+ question: str = Field(description= "A revised reply and follow up question, if necessary")
+# JSON output parser bound to the GenerateCritique schema.
+CRITIQUE_PARSER = JsonOutputParser(pydantic_object=GenerateCritique)
+@chain
+def image_model(inputs: dict) -> str | list[str] | dict:
+ """Invoke model with image and prompt."""
+ model = ChatOpenAI(temperature=inputs["temperature"], model="gpt-4o", max_tokens=1024)
+ msg = model.invoke(
+             [HumanMessage(
+             content=[
+             {"type": "text", "text": inputs["prompt"]},
+             {"type": "text", "text": inputs["parser"].get_format_instructions()},
+             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{inputs['image']}"}},
+             ])]
+             )
+ return msg.content
+# TransformChain wrapper around load_image: takes "image_path" as input and yields "image" as output.
+load_image_chain = TransformChain(
+    input_variables=["image_path"],
+    output_variables=["image"],
+    transform=load_image
+)
+def fast_thinking(image_path: str, prompt: str, parser, temperature) -> dict:
+  #  vision_chain = load_image_chain | image_model | parser
+  #  return vision_chain.invoke({'image_path': f'{image_path}', 'prompt': prompt, 'parser':parser, "temperature": temperature})
+  encoded_image = encode_image(image_path)
+  response = client.chat.completions.create(
+    model="gpt-4o",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                    {
+                      "type": "image_url",
+                      "image_url": {
+                        "url": f"data:image/jpeg;base64,{encoded_image}",
+                        "detail": "auto"
+                      }
+                    },
+                    {
+                      "type": "text",
+                      "text": prompt
+                    }
+            ]
+        },
+    ],
+    temperature= temperature,
+    max_tokens=1024,
+   )
+  return response.choices[0].message.content
+def get_story(image_path: str, prompt: str, temperature) -> dict:
+  #  vision_chain = load_image_chain | image_model | parser
+  #  return vision_chain.invoke({'image_path': f'{image_path}', 'prompt': prompt, 'parser':parser, "temperature": temperature})
+  encoded_image = encode_image(image_path)
+  response = client.chat.completions.create(
+    model="gpt-4o",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                    {
+                      "type": "image_url",
+                      "image_url": {
+                        "url": f"data:image/jpeg;base64,{encoded_image}",
+                        "detail": "auto"
+                      }
+                    },
+                    {
+                      "type": "text",
+                      "text": prompt
+                    }
+            ]
+        },
+    ],
+    temperature= temperature,
+    max_tokens=1024,
+   )
+  return response.choices[0].message.content
+class AgentState(TypedDict):
+    """State dictionary passed between the question-generator and critique graph nodes."""
+    # Path of the image the nodes send to the model.
+    image_path: str
+    # Prompt used by generate_question_node1.
+    prompt:str
+    # Template with a {question} placeholder, filled in by question_critique_node.
+    critique_prompt:str
+    # Question produced by generate_question_node1.
+    question1: str
+    # Revised question produced by question_critique_node.
+    question2: str
+    # Critique text produced by question_critique_node.
+    critique: str
+    # Sampling temperature forwarded to the model calls.
+    temperature: float
+def generate_question_node1(state: AgentState):
+  res = fast_thinking(state["image_path"], state["prompt"], QUESTION_PARSER_2, state["temperature"])
+  return {"question1": res["question"]}
+def question_critique_node(state: AgentState):
+  critique_prompt = state["critique_prompt"].format(question=state["question1"])
+  res = fast_thinking(state["image_path"],critique_prompt, CRITIQUE_PARSER, state["temperature"])
+  return {"critique": res["critique"], "question2": res["question"]}
+# builder = StateGraph(AgentState)
+# builder.add_node("question_generator1", generate_question_node1)
+# builder.add_node("question_critique", question_critique_node)
+# builder.add_edge("question_generator1", "question_critique")
+# builder.set_entry_point("question_generator1")
+# graph = builder.compile()
+# display.Image(graph.get_graph().draw_png())
+def slow_thinking(image_path: str, prompt:str, critique_prompt:str, temperature):
+  builder = StateGraph(AgentState)
+  builder.add_node("question_generator1", generate_question_node1)
+  builder.add_node("question_critique", question_critique_node)
+  builder.add_edge("question_generator1", "question_critique")
+  builder.set_entry_point("question_generator1")
+  graph = builder.compile()
+  final_state = graph.invoke(
+      {
+          'image_path': image_path,
+          'prompt':prompt,
+          'critique_prompt': critique_prompt,
+          'temperature': temperature
+      }, config={"configurable": {"thread_id": 1}})
+  return final_state
+def transform_text_to_speech(text: str):
+  # Generate speech from transcription
+  speech_file_path_mp3 = Path.cwd() / f"speech.mp3"
+  speech_file_path_wav = Path.cwd() / f"speech.wav"
+  response = client.audio.speech.create (
+                model="tts-1",
+                voice="onyx",
+                input=text
+            )
+  with open(speech_file_path_mp3, "wb") as f:
+      f.write(response.content)
+  # Convert mp3 to wav
+  audio = AudioSegment.from_mp3(speech_file_path_mp3)
+  audio.export(speech_file_path_wav, format="wav")
+  # Read the audio file and encode it to base64
+  with open(speech_file_path_wav, "rb") as audio_file:
+      audio_data = audio_file.read()
+      audio_base64 = base64.b64encode(audio_data).decode('utf-8')
+  # Create an HTML audio player with autoplay
+  audio_html = f"""
+  <audio controls autoplay>
+      <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
+      Your browser does not support the audio element.
+  </audio>
+  """
+  return audio_html
+# Prompt template for the first question of a conversation.
+# Placeholders: {role} and {rules}, both filled in by pred().
+CONVERSATION_STARTER_PROMPT = """
+### Role
+{role}
+### Context
+The user is an older person who has uploaded a photograph. Your goal is to start a meaningful and inviting conversation about the photo.
+### Objective
+Ask a simple first question that encourages the user to start talking about the photograph based on the below rules.
+### Guidelines
+Follow these rules while generating the question:
+{rules}
+### Output
+Provide:
+- A single, open-ended question based on the above rules.
+Note: Output should be in 1 to 2 lines. Please don't generate anything else.
+"""
+# Prompt template for the early follow-up turns (pred() uses it while the turn counter is 2 or 3).
+# Placeholders: {role}, {history} (the running transcript) and {rules}.
+CONVERSATION_STARTER2_PROMPT = """
+### Role
+{role}
+### Context
+The user is an older person who has uploaded a photo, and you are at the start of a conversation about it.
+Here is the conversation history about the photo between the user and you (Good friend):
+{history}
+### Objective
+Respond to user's most recent input in the conversation history above and a follow-up question generated based on below rules.
+### Guidelines
+Follow these rules while generating the follow up question:
+{rules}
+### Output
+Provide:
+- Respond to user's most recent input in the conversation history above and a follow-up question generated based on above rules.
+Note: Output should be in 2 to 3 lines. Please don't generate anything else.
+"""
+# Prompt template for the later turns (pred() uses it once the turn counter exceeds 3).
+# Placeholders: {role}, {history} (the running transcript) and {rules}.
+CONVERSATION_EXPANDING_PROMPT = """
+### Role
+{role}
+### Context
+The user is an older person who has uploaded a photo, and you are in the middle of a conversation about it.
+Here is the conversation history about the photo between the user and you (Good friend), reflecting the ongoing dialogue:
+{history}
+### Objective
+Respond to user's most recent input in the conversation history above and a follow-up question generated based on below rules
+### Guidelines
+Follow these rules while generating the follow up question:
+{rules}
+### Output
+Provide:
+- Respond to user's most recent input in the conversation history above and a follow-up question generated based on above rules.
+Note: Output should be in 2 to 3 lines. Please don't generate anything else.
+"""
+# Prompt template for the story request.
+# Placeholder: {conversation}, filled with the running transcript by generate_story().
+generate_story_prompt = """
+    Given a photograph uploaded by the user and a conversation between a good friend and the user about the photograph:
+    {conversation}
+    Instructions:
+    1. Create a short story that captures the essence of the conversation about the photograph.
+    2. Do not invent new details—base the story entirely on the provided conversation.
+    Provide:
+    1. A concise story in three sentences.
+Note: Please generated only story.
+"""
+# Module-level conversation state shared by pred(), generate_story() and clear().
+# Running transcript made of "Good Friend: ..." and "User: ..." lines.
+memory = ""
+# 1-based turn counter. NOTE(review): this name shadows the built-in iter().
+iter = 1
+# Path of the image the current conversation is about.
+image_path = ""
+def pred(image_input, role, conversation_starter_prompt_rules, conversation_starter2_prompt_rules, conversation_expanding_prompt_rules,  temperature, reply):
+  global memory
+  global iter
+  global image_path
+  if image_path != image_input:
+    image_path = image_input
+    iter = 1
+    memory = ""
+  # Fast Thinking
+  # if iter <= 50:
+  if iter == 1:
+    prompt = CONVERSATION_STARTER_PROMPT.format(role = role, rules=conversation_starter_prompt_rules)
+    res = fast_thinking(image_path, prompt, QUESTION_PARSER, temperature)
+    question = res
+    memory += "\n" + "Good Friend: "+ question
+    iter += 1
+    return "Fast", question, transform_text_to_speech(question)
+  if iter > 1 and iter <= 3:
+    prompt = CONVERSATION_STARTER2_PROMPT.format(role = role, history=memory,rules = conversation_starter2_prompt_rules)
+    res = fast_thinking(image_path, prompt, QUESTION_PARSER_2, temperature)
+    acknowledgement_followback_question = res
+    memory += "\n" + "User: "  + reply
+    memory += "\n" + "Good Friend: "+ acknowledgement_followback_question
+    iter += 1
+    return "Fast", acknowledgement_followback_question, transform_text_to_speech(acknowledgement_followback_question)
+  if  iter > 3:
+    prompt = CONVERSATION_EXPANDING_PROMPT.format(role = role, history=memory, rules = conversation_expanding_prompt_rules)
+    res = fast_thinking(image_path, prompt, QUESTION_PARSER_3, temperature)
+    acknowledgement_followback_question = res
+    memory += "\n" + "User: "  + reply
+    memory += "\n" + "Good Friend: "+ acknowledgement_followback_question
+    iter += 1
+    return "Fast", acknowledgement_followback_question, transform_text_to_speech(acknowledgement_followback_question)
+  # Slow Thinking
+  # else:
+  #     prompt = CONVERSATION_EXPANDING_PROMPT.format(history=memory)
+  #     critique_prompt = CONVERSATION_EXPANDING_PROMPT_CRITIQUE.format(question="{question}", history=memory)
+  #     res = slow_thinking(image_path, prompt, critique_prompt, temperature)
+  #     question = res['question2']
+  #     memory += "\n" + "User: "  + reply
+  #     memory += "\n" + "Good Friend: "+ question
+  #     iter += 1
+  #     return "Slow", res["question1"], res["critique"], res["question2"]
+def generate_story(image_input):
+  global memory
+  global iter
+  global image_path
+  global generate_story_prompt
+  if iter < 4:
+    return "Fast", "No Solid Content to generate a Story", transform_text_to_speech("No Solid Content to generate a Story")
+  prompt = generate_story_prompt.format(conversation = memory)
+  res = get_story(image_path, prompt,  0.5)
+  return "Fast", res, transform_text_to_speech(res)
+def clear():
+  global memory
+  global iter
+  global image_path
+  memory = ""
+  iter = 1
+  image_path = ""
+  return None, "", "", None
+# Gradio Interface
+with gr.Blocks(title = "Experimental Setup for Kitchentable.AI") as demo:
+    with gr.Row():
+        with gr.Column():
+            image_input = gr.Image(type="filepath", label="Upload an Image")
+            role = gr.Textbox(label="Role")
+            conversation_starter_prompt_rules = gr.Textbox(label="Conversation starter prompt rules(Generates question 1)")
+            conversation_starter2_prompt_rules = gr.Textbox(label="Conversation starter2 prompt rules(Generates questions 2, 3)")
+            conversation_expanding_prompt_rules = gr.Textbox(label="Conversation expanding prompt rules(Generates question after 3)")
+            temperature = gr.Slider(minimum=0, maximum=0.9999, step=0.01, label="Temperature")
+        with gr.Column():
+            thinkingType = gr.Textbox(label="Thinking Type")
+            question = gr.Textbox(label="Agent Output")
+            audio_output = gr.HTML(label="Audio Player")
+            reply = gr.Textbox(label="Your reply to the question")
+            submit_button = gr.Button("Submit Reply", elem_id="Submit")
+            Generate_story = gr.Button("Generate Story", elem_id="Submit")
+            reset_setup = gr.Button("Reset Setup", elem_id="Submit")
+            # critique = gr.Textbox(label="Agent  Fast Thinking question Critique")
+            # question2 = gr.Textbox(label="Agent  Slow Thinking Question")
+    submit_button.click(pred, inputs=[image_input, role, conversation_starter_prompt_rules,conversation_starter2_prompt_rules, conversation_expanding_prompt_rules, temperature, reply], outputs=[thinkingType, question, audio_output])
+    Generate_story.click(generate_story, inputs = [image_input], outputs = [thinkingType, question, audio_output])
+    reset_setup.click(clear, inputs = [], outputs = [image_input, thinkingType, question, audio_output])
+# Launch the interface
+demo.launch(share=True)