Spaces:

Agents-MCP-Hackathon
/

explainor

Sleeping

Emperor555 Claude committed 12 days ago

Commit

d4d880a

1 Parent(s): c1c3a76

Add character voice settings for more expressive TTS

Each persona now has unique voice settings:
- 5-Year-Old: Fast, bouncy, very expressive (stability: 0.3, speed: 1.15)
- Gordon Ramsay: Unpredictable, maximum drama (stability: 0.25, style: 0.9)
- Pirate: Rough, theatrical (stability: 0.35, style: 0.85)
- Shakespeare: Slow, deliberate, poetic (speed: 0.85)
- Surfer Dude: Chill, relaxed flow (speed: 0.9)
- Yoda: Very slow, wise pauses (speed: 0.7)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (4) hide show

app.py +3 -1
src/agent.py +1 -0
src/personas.py +41 -0
src/tts.py +23 -8

app.py CHANGED Viewed

@@ -65,6 +65,7 @@ def explain_topic(topic: str, persona_name: str, audience: str = "", generate_au
     explanation = ""
     sources = []
     voice_id = None
     mcp_tools = []
     # Run the agent pipeline
@@ -88,6 +89,7 @@ def explain_topic(topic: str, persona_name: str, audience: str = "", generate_au
             explanation = update["explanation"]
             sources = update.get("sources", sources)
             voice_id = update["voice_id"]
             mcp_tools = update.get("mcp_tools", [])
             progress(0.8, desc="Explanation ready!")
@@ -99,7 +101,7 @@ def explain_topic(topic: str, persona_name: str, audience: str = "", generate_au
     if generate_audio and explanation and voice_id:
         progress(0.9, desc="Generating audio...")
         try:
-            audio_bytes = generate_speech(explanation, voice_id)
             # Save to temp file for Gradio
             with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
                 f.write(audio_bytes)

     explanation = ""
     sources = []
     voice_id = None
+    voice_settings = None
     mcp_tools = []
     # Run the agent pipeline
             explanation = update["explanation"]
             sources = update.get("sources", sources)
             voice_id = update["voice_id"]
+            voice_settings = update.get("voice_settings")
             mcp_tools = update.get("mcp_tools", [])
             progress(0.8, desc="Explanation ready!")
     if generate_audio and explanation and voice_id:
         progress(0.9, desc="Generating audio...")
         try:
+            audio_bytes = generate_speech(explanation, voice_id, voice_settings)
             # Save to temp file for Gradio
             with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
                 f.write(audio_bytes)

src/agent.py CHANGED Viewed

@@ -307,5 +307,6 @@ Now explain "{topic}" in your unique {persona_name} voice and style. Make it fun
         "persona": persona_name,
         "persona_emoji": persona["emoji"],
         "voice_id": persona["voice_id"],
         "mcp_tools": mcp_tools,
     }

         "persona": persona_name,
         "persona_emoji": persona["emoji"],
         "voice_id": persona["voice_id"],
+        "voice_settings": persona.get("voice_settings"),
         "mcp_tools": mcp_tools,
     }

src/personas.py CHANGED Viewed

@@ -1,5 +1,10 @@
 """Persona definitions for Explainor."""
 PERSONAS = {
     "5-Year-Old": {
         "system_prompt": """You are an excited, curious 5-year-old child explaining things.
@@ -8,6 +13,12 @@ Say things like "Ooh!" and "Wow!" and "You know what?"
 Compare everything to toys, candy, cartoons, and playground activities.
 Keep sentences very short. Use lots of exclamation marks!""",
         "voice_id": "jBpfuIE2acCO8z3wKNLl",  # "Aria" - young, enthusiastic
         "emoji": "👶",
     },
     "Gordon Ramsay": {
@@ -18,6 +29,12 @@ Compare concepts to cooking techniques, ingredients, and kitchen disasters.
 Use phrases like "Listen here!", "It's RAW!", "Absolutely stunning!", and "Donkey!".
 Be dramatic but ultimately make the explanation clear.""",
         "voice_id": "N2lVS1w4EtoT3dr4eOWO",  # "Callum" - British, intense
         "emoji": "👨‍🍳",
     },
     "Pirate": {
@@ -28,6 +45,12 @@ Talk about concepts like they're parts of a treasure map or sea voyage.
 Be dramatic and swashbuckling. Mention your crew, your ship, and rum occasionally.
 End with something about setting sail for knowledge.""",
         "voice_id": "TX3LPaxmHKxFdv7VOQHJ",  # "Liam" - gruff, theatrical
         "emoji": "🏴‍☠️",
     },
     "Shakespeare": {
@@ -38,6 +61,12 @@ Occasionally quote or parody your own famous lines.
 Structure explanations like soliloquies with dramatic pauses.
 Compare technology and modern things to courtly intrigue and theatrical performance.""",
         "voice_id": "onwK4e9ZLuTAKqWW03F9",  # "Daniel" - theatrical British
         "emoji": "🎭",
     },
     "Surfer Dude": {
@@ -48,6 +77,12 @@ Be super chill and positive. Everything is awesome and gives good vibes.
 Use "like" as filler. Talk about concepts like they're waves to ride.
 Keep the energy mellow but enthusiastic.""",
         "voice_id": "ErXwobaYiN019PkySvjV",  # "Antoni" - laid-back American
         "emoji": "🏄",
     },
     "Yoda": {
@@ -59,6 +94,12 @@ Use phrases like "Hmmmm", "Yes, yes", "Much to learn, you have."
 Speak slowly and thoughtfully. Make profound observations.
 Occasionally chuckle wisely: "Hehehehe".""",
         "voice_id": "pqHfZKP75CvOlQylNhV4",  # "Bill" - slow, thoughtful
         "emoji": "🧙",
     },
 }

 """Persona definitions for Explainor."""
+# Voice settings: stability (0-1), similarity_boost (0-1), style (0-1), speed (0.7-1.2, default 1.0)
+# Lower stability = more expressive/variable
+# Higher style = more exaggerated delivery
+# Speed affects pacing
 PERSONAS = {
     "5-Year-Old": {
         "system_prompt": """You are an excited, curious 5-year-old child explaining things.
 Compare everything to toys, candy, cartoons, and playground activities.
 Keep sentences very short. Use lots of exclamation marks!""",
         "voice_id": "jBpfuIE2acCO8z3wKNLl",  # "Aria" - young, enthusiastic
+        "voice_settings": {
+            "stability": 0.3,  # Very expressive, bouncy
+            "similarity_boost": 0.7,
+            "style": 0.8,  # Exaggerated childlike delivery
+            "speed": 1.15,  # Kids talk fast when excited
+        },
         "emoji": "👶",
     },
     "Gordon Ramsay": {
 Use phrases like "Listen here!", "It's RAW!", "Absolutely stunning!", and "Donkey!".
 Be dramatic but ultimately make the explanation clear.""",
         "voice_id": "N2lVS1w4EtoT3dr4eOWO",  # "Callum" - British, intense
+        "voice_settings": {
+            "stability": 0.25,  # Unpredictable, emotional
+            "similarity_boost": 0.8,
+            "style": 0.9,  # Maximum drama!
+            "speed": 1.1,  # Intense, rapid delivery
+        },
         "emoji": "👨‍🍳",
     },
     "Pirate": {
 Be dramatic and swashbuckling. Mention your crew, your ship, and rum occasionally.
 End with something about setting sail for knowledge.""",
         "voice_id": "TX3LPaxmHKxFdv7VOQHJ",  # "Liam" - gruff, theatrical
+        "voice_settings": {
+            "stability": 0.35,  # Rough, varied
+            "similarity_boost": 0.6,
+            "style": 0.85,  # Theatrical pirate flair
+            "speed": 0.95,  # Slightly slower, dramatic
+        },
         "emoji": "🏴‍☠️",
     },
     "Shakespeare": {
 Structure explanations like soliloquies with dramatic pauses.
 Compare technology and modern things to courtly intrigue and theatrical performance.""",
         "voice_id": "onwK4e9ZLuTAKqWW03F9",  # "Daniel" - theatrical British
+        "voice_settings": {
+            "stability": 0.4,  # Theatrical variation
+            "similarity_boost": 0.75,
+            "style": 0.7,  # Dramatic but refined
+            "speed": 0.85,  # Slow, deliberate, poetic
+        },
         "emoji": "🎭",
     },
     "Surfer Dude": {
 Use "like" as filler. Talk about concepts like they're waves to ride.
 Keep the energy mellow but enthusiastic.""",
         "voice_id": "ErXwobaYiN019PkySvjV",  # "Antoni" - laid-back American
+        "voice_settings": {
+            "stability": 0.5,  # Relaxed, flowing
+            "similarity_boost": 0.65,
+            "style": 0.6,  # Chill vibes
+            "speed": 0.9,  # Slooow and chill broooo
+        },
         "emoji": "🏄",
     },
     "Yoda": {
 Speak slowly and thoughtfully. Make profound observations.
 Occasionally chuckle wisely: "Hehehehe".""",
         "voice_id": "pqHfZKP75CvOlQylNhV4",  # "Bill" - slow, thoughtful
+        "voice_settings": {
+            "stability": 0.45,  # Wise, measured variations
+            "similarity_boost": 0.7,
+            "style": 0.5,  # Subtle but distinct
+            "speed": 0.7,  # Slow... speak I do... hmmm
+        },
         "emoji": "🧙",
     },
 }

src/tts.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """ElevenLabs Text-to-Speech integration."""
 import os
-from elevenlabs import ElevenLabs
 def get_client() -> ElevenLabs:
@@ -12,25 +12,40 @@ def get_client() -> ElevenLabs:
     return ElevenLabs(api_key=api_key)
-def generate_speech(text: str, voice_id: str) -> bytes:
     """Generate speech audio from text.
     Args:
         text: The text to convert to speech
         voice_id: ElevenLabs voice ID
     Returns:
         Audio bytes (MP3 format)
     """
     client = get_client()
     # Generate audio
-    audio_generator = client.text_to_speech.convert(
-        voice_id=voice_id,
-        text=text,
-        model_id="eleven_multilingual_v2",
-        output_format="mp3_44100_128",
-    )
     # Collect all audio chunks
     audio_chunks = []

 """ElevenLabs Text-to-Speech integration."""
 import os
+from elevenlabs import ElevenLabs, VoiceSettings
 def get_client() -> ElevenLabs:
     return ElevenLabs(api_key=api_key)
+def generate_speech(text: str, voice_id: str, voice_settings: dict = None) -> bytes:
     """Generate speech audio from text.
     Args:
         text: The text to convert to speech
         voice_id: ElevenLabs voice ID
+        voice_settings: Optional dict with stability, similarity_boost, style, speed
     Returns:
         Audio bytes (MP3 format)
     """
     client = get_client()
+    # Build voice settings if provided
+    settings = None
+    if voice_settings:
+        settings = VoiceSettings(
+            stability=voice_settings.get("stability", 0.5),
+            similarity_boost=voice_settings.get("similarity_boost", 0.75),
+            style=voice_settings.get("style", 0.0),
+            speed=voice_settings.get("speed", 1.0),
+        )
     # Generate audio
+    kwargs = {
+        "voice_id": voice_id,
+        "text": text,
+        "model_id": "eleven_multilingual_v2",
+        "output_format": "mp3_44100_128",
+    }
+    if settings:
+        kwargs["voice_settings"] = settings
+    audio_generator = client.text_to_speech.convert(**kwargs)
     # Collect all audio chunks
     audio_chunks = []