# Hugging Face Space page header (scrape residue, kept here as comments):
# sravan837's picture
# Update app.py
# dfbb592 verified
import gradio as gr
import json
import asyncio
import edge_tts
import re
import os
from huggingface_hub import InferenceClient
# --- SETTINGS ---
# 1. BRAIN: Llama-3 (Text Generation)
# EXTRACTOR_MODEL is the model used by extract_memory() to build the JSON
# user profile; PERSONALITY_MODEL is the model used to write persona replies.
# Both currently point at the same checkpoint but can be changed separately.
EXTRACTOR_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
PERSONALITY_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
# 2. EARS: Whisper (Speech-to-Text)
# Model used by transcribe_audio() to turn a recorded clip into text.
STT_MODEL = "openai/whisper-large-v3-turbo"
# Default Chat History
# Pre-filled value of the "History" textbox in the UI: thirty numbered
# first-person statements that serve as sample input for profile extraction.
DEFAULT_LOGS = """
1. User: I feel tired after big parties. I need to be alone to recharge.
2. User: I like ideas more than real-world details.
3. User: My desk is messy, but I know where my stuff is.
4. User: I worry that I said the wrong thing.
5. User: I like to plan ahead. Surprises stress me out.
6. User: It is hard for me to understand why people cry over small things.
7. User: I start many hobbies but do not finish them.
8. User: I feel bad when someone criticizes me.
9. User: I take charge in groups to make sure work is done right.
10. User: Logic is more important than feelings.
11. User: I daydream a lot.
12. User: I hate fighting. I want everyone to get along.
13. User: I help others even if it hurts me.
14. User: Boring tasks make me sleepy.
15. User: I need proof before I believe something.
16. User: I love being the center of attention.
17. User: I am bad at talking about my feelings.
18. User: I wait until the last minute to do work.
19. User: Music makes me feel strong emotions.
20. User: I prefer 2 close friends over 20 acquaintances.
21. User: I cannot say "no" to people.
22. User: I always analyze why people act the way they do.
23. User: I like following rules and traditions.
24. User: People say I am too serious.
25. User: I have lots of energy when debating.
26. User: I am scared of the future.
27. User: I trust my gut feeling more than numbers.
28. User: I work better alone.
29. User: I hate losing games.
30. User: I want to know my purpose in life.
"""
# --- HELPER: CLEAN TEXT ---
def clean_text_for_audio(text):
    """Strip stage directions so the text-to-speech voice does not read them.

    Removes single-line spans wrapped in a matching pair of delimiters, such
    as ``(pause)``, ``[sigh]`` and ``*laughs*``. A lone delimiter (for example
    the ``*`` in ``2 * 3``) is left untouched.

    Args:
        text: Reply text to be spoken; may be ``None`` or empty.

    Returns:
        The text without the delimited spans, with runs of spaces/tabs
        collapsed to a single space and surrounding whitespace stripped.
        Returns ``""`` when ``text`` is ``None`` or empty.
    """
    if not text:
        return ""
    # Each alternative pairs an opener with its OWN closer. A shared character
    # class would let "*" close on ")" and swallow ordinary text in between.
    # Newlines are excluded so markdown bullets ("* a\n* b") are not treated
    # as one cue spanning two lines.
    without_cues = re.sub(
        r'\([^()\n]*\)|\[[^\[\]\n]*\]|\*[^*\n]*\*', '', text
    )
    # Dropping a cue from the middle of a sentence leaves two adjacent spaces.
    return re.sub(r'[ \t]{2,}', ' ', without_cues).strip()
# --- PART 1: MEMORY EXTRACTOR ---
def extract_memory(chat_logs, hf_token):
    """Ask the extractor model to summarise chat logs as a JSON user profile.

    Args:
        chat_logs: Raw chat history text to analyse.
        hf_token: Hugging Face API token. Surrounding whitespace (common when
            the token is pasted) is ignored.

    Returns:
        A string for display in the JSON result box:
        * the pretty-printed profile object on success;
        * ``{"error": "..."}`` (pretty-printed) when the logs are empty, the
          request fails, or the reply contains no parseable JSON object;
        * the plain-text message ``"Error: Please paste your Hugging Face
          Token."`` when no token is supplied.
    """
    token = (hf_token or "").strip()
    if not token:
        return "Error: Please paste your Hugging Face Token."
    # Nothing to analyse: avoid a model call that could only invent a profile.
    if not chat_logs or not chat_logs.strip():
        return json.dumps({"error": "No chat logs provided."}, indent=2)

    system_prompt = """
Read the chat logs. Create a simple User Profile in JSON format.
Find these 3 things:
1. "traits": Is the user Introverted? Organized? Anxious?
2. "values": Do they care about Logic? Peace? Winning?
3. "struggles": Do they procrastinate? Have social anxiety?
Return ONLY valid JSON.
"""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": chat_logs},
    ]
    try:
        # Built inside the try so a client/setup failure is reported through
        # the same JSON error path as a failed request.
        client = InferenceClient(token=token)
        response = client.chat_completion(
            model=EXTRACTOR_MODEL,
            messages=messages,
            max_tokens=500,
            temperature=0.1,
        )
        text = (response.choices[0].message.content or "").strip()
        # Keep only the outermost {...}. This also discards any Markdown code
        # fence or chatter around the object, so no separate fence stripping
        # is needed (and backticks inside JSON values are left intact).
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end < start:
            raise ValueError("Model response did not contain a JSON object.")
        profile = json.loads(text[start:end + 1])
        return json.dumps(profile, indent=2)
    except Exception as e:
        # UI boundary: show the failure in the result box instead of raising.
        return json.dumps({"error": str(e)}, indent=2)
# --- PART 2: THE EARS (Speech-to-Text) ---
def transcribe_audio(audio_filepath, hf_token):
    """Transcribe a recorded audio file with the speech-to-text model.

    Args:
        audio_filepath: Path of the recording; a falsy value means there is
            nothing to transcribe.
        hf_token: Hugging Face API token used for the request.

    Returns:
        The recognised text, ``""`` when no recording was given, or a string
        starting with ``"Error listening: "`` when the request fails.
    """
    if audio_filepath:
        stt_client = InferenceClient(token=hf_token)
        try:
            # The API accepts the file path directly.
            result = stt_client.automatic_speech_recognition(
                audio=audio_filepath,
                model=STT_MODEL,
            )
            return result.text
        except Exception as err:
            return f"Error listening: {str(err)}"
    return ""
# --- PART 3: PERSONALITY & VOICE ---
async def generate_response_and_audio(text_input, audio_input, memory_json, persona, hf_token):
    """Answer the user in the chosen persona and synthesise the reply as speech.

    Args:
        text_input: Typed message; used only when ``audio_input`` is ``None``.
        audio_input: Path of a recorded clip, or ``None``. A recording takes
            precedence over typed text and is transcribed first.
        memory_json: JSON text of the user profile. Anything that does not
            parse to a JSON object is treated as an empty profile.
        persona: One of "Calm Mentor", "Witty Friend", "Therapist". An
            unknown value falls back to the "Calm Mentor" prompt and the
            default voice.
        hf_token: Hugging Face API token; surrounding whitespace is ignored.

    Returns:
        A 2-tuple ``(log_text, audio_path)``. ``log_text`` is the conversation
        log or an error message. ``audio_path`` is the generated MP3 path, or
        ``None`` when an error occurred or no audio could be produced.
    """
    token = (hf_token or "").strip()
    if not token:
        return "Error: Please paste your Hugging Face Token.", None

    # LOGIC: Did the user Type or Speak?
    if audio_input is not None:
        user_message = transcribe_audio(audio_input, token) or ""
        # transcribe_audio reports failures as an "Error listening: ..."
        # string. Show it to the user rather than sending it to the model as
        # if the user had said it.
        if user_message.startswith("Error listening:"):
            return user_message, None
    else:
        user_message = text_input or ""
    user_message = user_message.strip()
    if not user_message:
        return "Error: Please type something or record your voice.", None

    # The profile box may be empty, hold an error string, or contain valid
    # JSON that is not an object; in all of those cases use an empty profile.
    try:
        memory = json.loads(memory_json)
    except (TypeError, ValueError):
        memory = {}
    if not isinstance(memory, dict):
        memory = {}

    prompts = {
        "Calm Mentor": "Role: Wise Teacher. Tone: Calm, slow, patient. Advice: Focus on long-term growth.",
        "Witty Friend": "Role: Best Friend. Tone: Funny, fast, sarcastic. Advice: Make jokes and be relatable.",
        "Therapist": "Role: Counselor. Tone: Soft, kind, gentle. Advice: Validate their feelings.",
    }
    # Same tolerance as the voice lookup below: an unknown persona falls back
    # to a default instead of raising KeyError.
    persona_prompt = prompts.get(persona, prompts["Calm Mentor"])
    context = f"""
ABOUT THE USER:
- Personality: {memory.get('traits', 'Unknown')}
- Values: {memory.get('values', 'Unknown')}
- Problems: {memory.get('struggles', 'Unknown')}
"""
    messages = [
        {"role": "system", "content": f"{persona_prompt}\n\n{context}"},
        {"role": "user", "content": user_message},
    ]

    # A. Generate Text Response
    try:
        client = InferenceClient(token=token)
        res = client.chat_completion(
            model=PERSONALITY_MODEL,
            messages=messages,
            max_tokens=250,
            temperature=0.7,
        )
        text_reply = res.choices[0].message.content or ""
    except Exception as e:
        return f"Error: {str(e)}", None

    conversation_log = f" You said: {user_message}\n\n AI: {text_reply}"

    # B. Generate Audio Response
    spoken_text = clean_text_for_audio(text_reply)
    if not spoken_text:
        # Nothing speakable is left; still show the text reply.
        return conversation_log, None
    voice_map = {
        "Calm Mentor": "en-US-ChristopherNeural",
        "Witty Friend": "en-US-EricNeural",
        "Therapist": "en-US-AvaNeural",
    }
    # NOTE(review): fixed file name in the working directory; every call
    # overwrites the previous reply's audio.
    output_file = "response.mp3"
    try:
        communicate = edge_tts.Communicate(spoken_text, voice_map.get(persona, "en-US-AriaNeural"))
        await communicate.save(output_file)
    except Exception as e:
        # A speech failure must not discard the text reply already generated.
        return f"{conversation_log}\n\n(Voice unavailable: {str(e)})", None
    return conversation_log, output_file
# Wrapper for Gradio
def process_interaction(text, audio, memory, persona, token):
    """Synchronous entry point for the UI: run the async responder to completion.

    Passes all five arguments through to generate_response_and_audio and
    returns whatever that coroutine returns.
    """
    pending_reply = generate_response_and_audio(text, audio, memory, persona, token)
    return asyncio.run(pending_reply)
# --- UI LAYOUT ---
with gr.Blocks(title="Multimodal Personality Engine") as demo:
    gr.Markdown("Input: **Text or Voice** | Output: **Text + Voice**")

    # The token field feeds both the profile extraction and the chat handler.
    with gr.Row():
        token_input = gr.Textbox(
            label="Hugging Face Token (Required)",
            type="password",
        )

    with gr.Row():
        # Column 1: Analyze -- turn the chat history into a JSON profile.
        with gr.Column():
            gr.Markdown("### 1. Memory Analysis")
            logs_input = gr.Textbox(
                label="History",
                value=DEFAULT_LOGS,
                lines=5,
            )
            extract_btn = gr.Button("Create Profile")
            memory_output = gr.Code(label="Result (JSON)", language="json")
            extract_btn.click(
                extract_memory,
                inputs=[logs_input, token_input],
                outputs=memory_output,
            )

        # Column 2: Chat -- send a typed or spoken message to the agent.
        with gr.Column():
            gr.Markdown("### 2. Chat with Agent")

            # INPUTS: one tab per input mode.
            with gr.Tab("Type"):
                text_in = gr.Textbox(label="Type here...")
            with gr.Tab("Speak"):
                audio_in = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Record here",
                )
            persona_select = gr.Radio(
                ["Calm Mentor", "Witty Friend", "Therapist"],
                label="Tone",
                value="Calm Mentor",
            )
            send_btn = gr.Button("Send Message")

            # OUTPUTS: the text log and the spoken reply.
            text_out = gr.Textbox(label="Conversation Log", lines=4)
            audio_out = gr.Audio(label="AI Voice Response")

            # The profile JSON shown in column 1 is passed in as the memory.
            send_btn.click(
                process_interaction,
                inputs=[text_in, audio_in, memory_output, persona_select, token_input],
                outputs=[text_out, audio_out],
            )

if __name__ == "__main__":
    demo.queue().launch()