"""Multimodal Personality Engine.

A Gradio app with three stages:
  1. Memory extraction  — summarise chat logs into a JSON user profile (Llama-3).
  2. Speech-to-text     — transcribe a recorded clip (Whisper).
  3. Persona reply      — answer in a chosen tone, then speak it (edge-tts).

All model calls go through the Hugging Face Inference API and require a
user-supplied HF token.
"""

import asyncio
import json
import os
import re

import edge_tts
import gradio as gr
from huggingface_hub import InferenceClient

# --- SETTINGS ---
# 1. BRAIN: Llama-3 (Text Generation)
EXTRACTOR_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
PERSONALITY_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"

# 2. EARS: Whisper (Speech-to-Text)
STT_MODEL = "openai/whisper-large-v3-turbo"

# Default Chat History (seed data for the profile extractor demo)
DEFAULT_LOGS = """
1. User: I feel tired after big parties. I need to be alone to recharge.
2. User: I like ideas more than real-world details.
3. User: My desk is messy, but I know where my stuff is.
4. User: I worry that I said the wrong thing.
5. User: I like to plan ahead. Surprises stress me out.
6. User: It is hard for me to understand why people cry over small things.
7. User: I start many hobbies but do not finish them.
8. User: I feel bad when someone criticizes me.
9. User: I take charge in groups to make sure work is done right.
10. User: Logic is more important than feelings.
11. User: I daydream a lot.
12. User: I hate fighting. I want everyone to get along.
13. User: I help others even if it hurts me.
14. User: Boring tasks make me sleepy.
15. User: I need proof before I believe something.
16. User: I love being the center of attention.
17. User: I am bad at talking about my feelings.
18. User: I wait until the last minute to do work.
19. User: Music makes me feel strong emotions.
20. User: I prefer 2 close friends over 20 acquaintances.
21. User: I cannot say "no" to people.
22. User: I always analyze why people act the way they do.
23. User: I like following rules and traditions.
24. User: People say I am too serious.
25. User: I have lots of energy when debating.
26. User: I am scared of the future.
27. User: I trust my gut feeling more than numbers.
28. User: I work better alone.
29. User: I hate losing games.
30. User: I want to know my purpose in life.
"""


# --- HELPER: CLEAN TEXT ---
def clean_text_for_audio(text):
    """Remove stage directions like ``(pause)`` or ``*laughs*`` so the TTS
    voice does not read them aloud.

    Also strips bracketed ``[...]`` spans; returns the cleaned, stripped text.
    """
    clean = re.sub(r'[\(\[\*].*?[\)\]\*]', '', text)
    return clean.strip()


# --- PART 1: MEMORY EXTRACTOR ---
def extract_memory(chat_logs, hf_token):
    """Summarise *chat_logs* into a JSON user profile via the extractor model.

    Returns a pretty-printed JSON string on success, or a JSON object with an
    ``"error"`` key (or a plain error string for a missing token) on failure.
    """
    if not hf_token:
        return "Error: Please paste your Hugging Face Token."

    client = InferenceClient(token=hf_token)

    system_prompt = """
    Read the chat logs. Create a simple User Profile in JSON format.
    Find these 3 things:
    1. "traits": Is the user Introverted? Organized? Anxious?
    2. "values": Do they care about Logic? Peace? Winning?
    3. "struggles": Do they procrastinate? Have social anxiety?
    Return ONLY valid JSON.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": chat_logs}
    ]

    try:
        response = client.chat_completion(
            model=EXTRACTOR_MODEL,
            messages=messages,
            max_tokens=500,
            temperature=0.1  # low temperature: we want deterministic extraction
        )
        text = response.choices[0].message.content.strip()

        # LLMs often wrap JSON in markdown fences — strip them before parsing.
        if "```" in text:
            text = text.replace("```json", "").replace("```", "")

        # Keep only the outermost {...} span in case of extra chatter.
        start = text.find("{")
        end = text.rfind("}") + 1
        return json.dumps(json.loads(text[start:end]), indent=2)
    except Exception as e:
        # Surface API/parsing failures to the UI as JSON rather than crashing.
        return json.dumps({"error": str(e)}, indent=2)


# --- PART 2: THE EARS (Speech-to-Text) ---
def transcribe_audio(audio_filepath, hf_token):
    """
    Sends the user's recorded audio file to the Whisper model.
    Returns the text string ("" when no file was recorded, or an
    "Error listening: ..." message on API failure).
    """
    if not audio_filepath:
        return ""

    client = InferenceClient(token=hf_token)
    try:
        # Provide the file path directly to the API
        response = client.automatic_speech_recognition(
            model=STT_MODEL,
            audio=audio_filepath
        )
        return response.text
    except Exception as e:
        return f"Error listening: {str(e)}"


# --- PART 3: PERSONALITY & VOICE ---
async def generate_response_and_audio(text_input, audio_input, memory_json, persona, hf_token):
    """Answer the user (typed or spoken) in the selected persona and
    synthesise the reply to ``response.mp3``.

    Returns ``(conversation_text, audio_filepath)``; the audio path is
    ``None`` on any error.
    """
    if not hf_token:
        return "Error: Please paste your Hugging Face Token.", None

    # LOGIC: Did the user Type or Speak?
    if audio_input is not None:
        # If audio exists, convert it to text first
        user_message = transcribe_audio(audio_input, hf_token)
    else:
        # Otherwise use the typed text
        user_message = text_input

    if not user_message:
        return "Error: Please type something or record your voice.", None

    client = InferenceClient(token=hf_token)

    # The profile may be missing or malformed (e.g. the extractor returned an
    # error string) — fall back to an empty profile rather than crashing.
    try:
        memory = json.loads(memory_json)
    except (TypeError, json.JSONDecodeError):
        memory = {}

    prompts = {
        "Calm Mentor": "Role: Wise Teacher. Tone: Calm, slow, patient. Advice: Focus on long-term growth.",
        "Witty Friend": "Role: Best Friend. Tone: Funny, fast, sarcastic. Advice: Make jokes and be relatable.",
        "Therapist": "Role: Counselor. Tone: Soft, kind, gentle. Advice: Validate their feelings."
    }

    context = f"""
    ABOUT THE USER:
    - Personality: {memory.get('traits', 'Unknown')}
    - Values: {memory.get('values', 'Unknown')}
    - Problems: {memory.get('struggles', 'Unknown')}
    """

    messages = [
        # .get with a default keeps this consistent with voice_map below and
        # avoids a KeyError if an unexpected persona value ever arrives.
        {"role": "system", "content": f"{prompts.get(persona, prompts['Calm Mentor'])}\n\n{context}"},
        {"role": "user", "content": user_message}
    ]

    try:
        # A. Generate Text Response
        res = client.chat_completion(
            model=PERSONALITY_MODEL,
            messages=messages,
            max_tokens=250,
            temperature=0.7
        )
        text_reply = res.choices[0].message.content

        # B. Generate Audio Response
        spoken_text = clean_text_for_audio(text_reply)

        voice_map = {
            "Calm Mentor": "en-US-ChristopherNeural",
            "Witty Friend": "en-US-EricNeural",
            "Therapist": "en-US-AvaNeural"
        }

        output_file = "response.mp3"
        communicate = edge_tts.Communicate(spoken_text, voice_map.get(persona, "en-US-AriaNeural"))
        await communicate.save(output_file)

        # Return: (User's Transcribed Text, AI Response, Audio File)
        return f" You said: {user_message}\n\n AI: {text_reply}", output_file
    except Exception as e:
        return f"Error: {str(e)}", None


# Wrapper for Gradio: bridges Gradio's sync callback to the async pipeline.
def process_interaction(text, audio, memory, persona, token):
    return asyncio.run(generate_response_and_audio(text, audio, memory, persona, token))


# --- UI LAYOUT ---
with gr.Blocks(title="Multimodal Personality Engine") as demo:
    gr.Markdown("Input: **Text or Voice** | Output: **Text + Voice**")

    with gr.Row():
        token_input = gr.Textbox(label="Hugging Face Token (Required)", type="password")

    with gr.Row():
        # Column 1: Analyze
        with gr.Column():
            gr.Markdown("### 1. Memory Analysis")
            logs_input = gr.Textbox(label="History", value=DEFAULT_LOGS, lines=5)
            extract_btn = gr.Button("Create Profile")
            memory_output = gr.Code(label="Result (JSON)", language="json")
            extract_btn.click(extract_memory, inputs=[logs_input, token_input], outputs=memory_output)

        # Column 2: Chat
        with gr.Column():
            gr.Markdown("### 2. Chat with Agent")

            # INPUTS
            with gr.Tab("Type"):
                text_in = gr.Textbox(label="Type here...")
            with gr.Tab("Speak"):
                audio_in = gr.Audio(sources=["microphone"], type="filepath", label="Record here")

            persona_select = gr.Radio(["Calm Mentor", "Witty Friend", "Therapist"], label="Tone", value="Calm Mentor")
            send_btn = gr.Button("Send Message")

            # OUTPUTS
            text_out = gr.Textbox(label="Conversation Log", lines=4)
            audio_out = gr.Audio(label="AI Voice Response")

            send_btn.click(
                process_interaction,
                inputs=[text_in, audio_in, memory_output, persona_select, token_input],
                outputs=[text_out, audio_out]
            )

if __name__ == "__main__":
    demo.queue().launch()