Revrse committed on
Commit
3afcfbc
·
verified ·
1 Parent(s): 0d279e2

Upload 8 files

Browse files
Files changed (5) hide show
  1. README.md +8 -7
  2. app.py +55 -28
  3. config.py +3 -3
  4. models.py +25 -19
  5. requirements.txt +1 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🎯
4
  colorFrom: purple
5
  colorTo: blue
6
  sdk: gradio
7
- sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
@@ -34,7 +34,7 @@ Practice with 9 different sales situations:
34
  ### 🤖 AI-Powered Conversation
35
  - **Speech-to-Text (STT)**: Whisper large-v3 for accurate transcription
36
  - **Text-to-Speech (TTS)**: Parler-TTS with accent customization
37
- - **LLM**: Llama 3.2 for dynamic, context-aware responses
38
 
39
  ### 📊 Comprehensive Feedback Analysis
40
 
@@ -121,9 +121,9 @@ This application is optimized for Hugging Face Spaces with **Zero GPU** (Dynamic
121
  - Upload `requirements.txt`
122
  - Copy content from `README_HF_SPACE.md` to the Space's README.md
123
 
124
- 3. **Set secrets**
125
  - Go to Space settings
126
- - Add `HF_TOKEN` as a secret
127
 
128
  4. **Configure Space**
129
  - The app will automatically start
@@ -172,9 +172,10 @@ SpeakEdge/
172
  - Quality: Natural-sounding voices
173
 
174
  **Language Model**
175
- - Model: `meta-llama/Llama-3.2-3B-Instruct`
176
  - Purpose: Dynamic conversation & feedback generation
177
  - Context: Last 6 messages for coherence
 
178
 
179
  ### Performance Optimization
180
 
@@ -222,7 +223,7 @@ This project is licensed under the MIT License.
222
 
223
  - OpenAI Whisper for STT
224
  - Parler-TTS for multi-accent TTS
225
- - Meta for Llama models
226
  - Hugging Face for hosting and Zero GPU infrastructure
227
 
228
  ## 📞 Support
@@ -231,4 +232,4 @@ For issues, questions, or suggestions, please open an issue on GitHub.
231
 
232
  ---
233
 
234
- **Made with ❤️ for sales professionals looking to elevate their communication game**
 
4
  colorFrom: purple
5
  colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 4.0.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
34
  ### 🤖 AI-Powered Conversation
35
  - **Speech-to-Text (STT)**: Whisper large-v3 for accurate transcription
36
  - **Text-to-Speech (TTS)**: Parler-TTS with accent customization
37
+ - **LLM**: Mistral-7B-Instruct for dynamic, context-aware responses
38
 
39
  ### 📊 Comprehensive Feedback Analysis
40
 
 
121
  - Upload `requirements.txt`
122
  - Copy content from `README_HF_SPACE.md` to the Space's README.md
123
 
124
+ 3. **Set secrets (optional)**
125
  - Go to Space settings
126
+ - Add `HF_TOKEN` as a secret (optional but recommended for better rate limits)
127
 
128
  4. **Configure Space**
129
  - The app will automatically start
 
172
  - Quality: Natural-sounding voices
173
 
174
  **Language Model**
175
+ - Model: `mistralai/Mistral-7B-Instruct-v0.3`
176
  - Purpose: Dynamic conversation & feedback generation
177
  - Context: Last 6 messages for coherence
178
+ - Advantage: No approval needed, excellent performance
179
 
180
  ### Performance Optimization
181
 
 
223
 
224
  - OpenAI Whisper for STT
225
  - Parler-TTS for multi-accent TTS
226
+ - Mistral AI for Mistral-7B-Instruct model
227
  - Hugging Face for hosting and Zero GPU infrastructure
228
 
229
  ## 📞 Support
 
232
 
233
  ---
234
 
235
+ **Made with ❤️ for sales professionals looking to elevate their communication game**
app.py CHANGED
@@ -4,6 +4,7 @@ import os
4
  from datetime import datetime
5
  from typing import List, Dict, Tuple
6
  import numpy as np
 
7
 
8
  from models import ModelManager
9
  from scenarios import SCENARIOS, get_scenario_prompt
@@ -77,14 +78,18 @@ def start_roleplay(scenario: str, accent: str, personality: str, bot_name: str):
77
 
78
  def process_user_audio(audio_input, current_history):
79
  """Process user's audio input and generate bot response"""
80
- if audio_input is None:
81
- return current_history, None, "Please record your audio first."
 
 
 
82
 
83
  # Transcribe user audio
84
  user_text = model_manager.speech_to_text(audio_input)
85
 
86
  if not user_text or user_text.strip() == "":
87
- return current_history, None, "Could not understand audio. Please try again."
 
88
 
89
  # Store user transcript
90
  conversation_state["transcripts"].append({
@@ -102,6 +107,8 @@ def process_user_audio(audio_input, current_history):
102
  current_history = []
103
  current_history.append((user_text, None))
104
 
 
 
105
  # Generate bot response
106
  system_prompt = get_scenario_prompt(
107
  conversation_state["scenario"],
@@ -126,6 +133,11 @@ def process_user_audio(audio_input, current_history):
126
  "timestamp": datetime.now().isoformat()
127
  })
128
 
 
 
 
 
 
129
  # Generate audio for bot response
130
  audio_path = model_manager.text_to_speech(
131
  bot_response,
@@ -133,10 +145,7 @@ def process_user_audio(audio_input, current_history):
133
  conversation_state["bot_config"]["name"]
134
  )
135
 
136
- # Update conversation history
137
- current_history[-1] = (user_text, bot_response)
138
-
139
- return current_history, audio_path, "Bot responded. Your turn!"
140
 
141
  def end_roleplay():
142
  """End the roleplay and generate feedback"""
@@ -194,18 +203,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="SpeakEdge - Sales Communication Pr
194
  info="Choose the sales situation you want to practice"
195
  )
196
 
197
- accent_dropdown = gr.Dropdown(
198
- choices=[
199
- "American",
200
- "British",
201
- "Australian",
202
- "Indian",
203
- "Neutral"
204
- ],
205
- label="Bot Accent",
206
- value="American",
207
- info="Select the accent for your conversation partner"
208
- )
209
 
210
  personality_dropdown = gr.Dropdown(
211
  choices=[
@@ -253,12 +252,16 @@ with gr.Blocks(theme=gr.themes.Soft(), title="SpeakEdge - Sales Communication Pr
253
  audio_input = gr.Audio(
254
  sources=["microphone"],
255
  type="filepath",
256
- label="Your Response (Speak)"
 
 
 
257
  )
258
 
 
 
259
  with gr.Row():
260
- send_btn = gr.Button("📤 Send Audio", variant="primary")
261
- end_btn = gr.Button("🏁 End Roleplay", variant="stop")
262
 
263
  with gr.Row():
264
  with gr.Column(visible=False) as feedback_panel:
@@ -272,7 +275,15 @@ with gr.Blocks(theme=gr.themes.Soft(), title="SpeakEdge - Sales Communication Pr
272
  outputs=[setup_panel, conversation_panel, chatbot, bot_audio_output, status_text]
273
  )
274
 
275
- send_btn.click(
 
 
 
 
 
 
 
 
276
  fn=process_user_audio,
277
  inputs=[audio_input, chatbot],
278
  outputs=[chatbot, bot_audio_output, status_text]
@@ -286,11 +297,27 @@ with gr.Blocks(theme=gr.themes.Soft(), title="SpeakEdge - Sales Communication Pr
286
 
287
  gr.Markdown("""
288
  ---
289
- ### 📝 Tips for Best Results:
290
- - Speak clearly and at a natural pace
291
- - Engage authentically as you would in a real situation
292
- - Practice different scenarios to improve various skills
293
- - Review your feedback carefully to identify improvement areas
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  """)
295
 
296
  if __name__ == "__main__":
 
4
  from datetime import datetime
5
  from typing import List, Dict, Tuple
6
  import numpy as np
7
+ import time
8
 
9
  from models import ModelManager
10
  from scenarios import SCENARIOS, get_scenario_prompt
 
78
 
79
  def process_user_audio(audio_input, current_history):
80
  """Process user's audio input and generate bot response"""
81
+ if audio_input is None or audio_input == "":
82
+ return current_history, None, "Listening..."
83
+
84
+ # Update status
85
+ yield current_history, None, "🎧 Transcribing your speech..."
86
 
87
  # Transcribe user audio
88
  user_text = model_manager.speech_to_text(audio_input)
89
 
90
  if not user_text or user_text.strip() == "":
91
+ yield current_history, None, "Ready - Speak now!"
92
+ return
93
 
94
  # Store user transcript
95
  conversation_state["transcripts"].append({
 
107
  current_history = []
108
  current_history.append((user_text, None))
109
 
110
+ yield current_history, None, "💭 Thinking..."
111
+
112
  # Generate bot response
113
  system_prompt = get_scenario_prompt(
114
  conversation_state["scenario"],
 
133
  "timestamp": datetime.now().isoformat()
134
  })
135
 
136
+ # Update conversation history
137
+ current_history[-1] = (user_text, bot_response)
138
+
139
+ yield current_history, None, "🗣️ Speaking..."
140
+
141
  # Generate audio for bot response
142
  audio_path = model_manager.text_to_speech(
143
  bot_response,
 
145
  conversation_state["bot_config"]["name"]
146
  )
147
 
148
+ yield current_history, audio_path, "🎤 Your turn - Speak now!"
 
 
 
149
 
150
  def end_roleplay():
151
  """End the roleplay and generate feedback"""
 
203
  info="Choose the sales situation you want to practice"
204
  )
205
 
206
+ # Removed accent selector - using single optimized American accent for speed
207
+ accent_dropdown = gr.State("American") # Hidden state
 
 
 
 
 
 
 
 
 
 
208
 
209
  personality_dropdown = gr.Dropdown(
210
  choices=[
 
252
  audio_input = gr.Audio(
253
  sources=["microphone"],
254
  type="filepath",
255
+ label="🎤 Continuous Conversation - Just Speak!",
256
+ streaming=True,
257
+ show_label=True,
258
+ container=True
259
  )
260
 
261
+ gr.Markdown("**💡 Tip:** Speak naturally, pause when done. The bot will automatically respond!")
262
+
263
  with gr.Row():
264
+ end_btn = gr.Button("🏁 End Conversation & Get Feedback", variant="stop", size="lg")
 
265
 
266
  with gr.Row():
267
  with gr.Column(visible=False) as feedback_panel:
 
275
  outputs=[setup_panel, conversation_panel, chatbot, bot_audio_output, status_text]
276
  )
277
 
278
+ # Continuous conversation: Auto-process when audio is provided (streaming)
279
+ audio_input.stop_recording(
280
+ fn=process_user_audio,
281
+ inputs=[audio_input, chatbot],
282
+ outputs=[chatbot, bot_audio_output, status_text]
283
+ )
284
+
285
+ # Also trigger on change for immediate processing
286
+ audio_input.change(
287
  fn=process_user_audio,
288
  inputs=[audio_input, chatbot],
289
  outputs=[chatbot, bot_audio_output, status_text]
 
297
 
298
  gr.Markdown("""
299
  ---
300
+ ### 📝 How It Works:
301
+ 1. **Grant microphone permission** when prompted
302
+ 2. **Bot speaks first** - Listen to the greeting
303
+ 3. **You speak** - Just talk naturally (no need to click anything!)
304
+ 4. **Pause briefly** when you're done speaking
305
+ 5. **Bot responds** - Listen and continue the conversation
306
+ 6. **Repeat** - Keep the conversation flowing naturally
307
+ 7. **End** when done to get your detailed feedback
308
+
309
+ ### ⚡ What to Expect:
310
+ - 🕐 First response: 30-60 seconds (models loading)
311
+ - ⚡ After that: 5-10 seconds per exchange
312
+ - 🎤 Microphone stays active - just speak when ready
313
+ - 🔊 Bot responses play automatically
314
+ - 💬 Natural conversation flow
315
+
316
+ ### 🎯 Pro Tips:
317
+ - Speak clearly and naturally
318
+ - Pause for 1-2 seconds after finishing
319
+ - Let the bot finish speaking before responding
320
+ - Engage as you would in a real call
321
  """)
322
 
323
  if __name__ == "__main__":
config.py CHANGED
@@ -4,9 +4,9 @@ Configuration settings for SpeakEdge
4
 
5
  import os
6
 
7
- # Model configurations
8
- WHISPER_MODEL = os.getenv("WHISPER_MODEL", "openai/whisper-large-v3")
9
- TTS_MODEL = os.getenv("TTS_MODEL", "parler-tts/parler-tts-mini-v1")
10
  LLM_MODEL = os.getenv("LLM_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")
11
 
12
  # Hugging Face token (optional for public models)
 
4
 
5
  import os
6
 
7
+ # Model configurations (optimized for speed)
8
+ WHISPER_MODEL = os.getenv("WHISPER_MODEL", "openai/whisper-medium") # Faster than large
9
+ TTS_MODEL = os.getenv("TTS_MODEL", "parler-tts/parler-tts-tiny-v1") # Faster TTS
10
  LLM_MODEL = os.getenv("LLM_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")
11
 
12
  # Hugging Face token (optional for public models)
models.py CHANGED
@@ -36,7 +36,8 @@ class ModelManager:
36
  """Load Whisper model for STT"""
37
  if self.whisper_pipe is None:
38
  print("Loading Whisper model...")
39
- model_id = "openai/whisper-large-v3"
 
40
 
41
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
42
  model_id,
@@ -55,14 +56,17 @@ class ModelManager:
55
  feature_extractor=processor.feature_extractor,
56
  torch_dtype=self.torch_dtype,
57
  device=self.device,
 
 
58
  )
59
  print("Whisper model loaded successfully!")
60
 
61
  def load_tts(self):
62
- """Load Parler-TTS model for text-to-speech"""
63
  if self.tts_model is None:
64
  print("Loading TTS model...")
65
- model_id = "parler-tts/parler-tts-mini-v1"
 
66
 
67
  self.tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
68
  model_id,
@@ -90,14 +94,19 @@ class ModelManager:
90
 
91
  @spaces.GPU
92
  def speech_to_text(self, audio_path: str) -> str:
93
- """Convert speech to text using Whisper"""
94
  try:
95
  self.load_whisper()
96
 
97
  result = self.whisper_pipe(
98
  audio_path,
99
  return_timestamps=False,
100
- generate_kwargs={"language": "english"}
 
 
 
 
 
101
  )
102
 
103
  return result["text"].strip()
@@ -106,31 +115,28 @@ class ModelManager:
106
  return ""
107
 
108
  @spaces.GPU
109
- def text_to_speech(self, text: str, accent: str, speaker_name: str) -> str:
110
- """Convert text to speech with specified accent"""
111
  try:
112
  self.load_tts()
113
 
114
- # Create description based on accent
115
- accent_descriptions = {
116
- "American": "A clear American English accent, professional and articulate.",
117
- "British": "A refined British English accent, clear and professional.",
118
- "Australian": "An Australian English accent, friendly and clear.",
119
- "Indian": "An Indian English accent, professional and articulate.",
120
- "Neutral": "A neutral English accent, clear and professional."
121
- }
122
 
123
- description = accent_descriptions.get(accent, accent_descriptions["Neutral"])
124
- description += " The speaker has a moderate pace and good enunciation."
 
125
 
126
- # Generate audio
127
  input_ids = self.tts_tokenizer(description, return_tensors="pt").input_ids.to(self.device)
128
  prompt_input_ids = self.tts_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
129
 
130
  generation = self.tts_model.generate(
131
  input_ids=input_ids,
132
  prompt_input_ids=prompt_input_ids,
133
- attention_mask=torch.ones_like(input_ids)
 
 
134
  )
135
 
136
  audio_arr = generation.cpu().numpy().squeeze()
 
36
  """Load Whisper model for STT"""
37
  if self.whisper_pipe is None:
38
  print("Loading Whisper model...")
39
+ # Using medium model for better speed/accuracy balance
40
+ model_id = "openai/whisper-medium"
41
 
42
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
43
  model_id,
 
56
  feature_extractor=processor.feature_extractor,
57
  torch_dtype=self.torch_dtype,
58
  device=self.device,
59
+ chunk_length_s=30,
60
+ batch_size=16,
61
  )
62
  print("Whisper model loaded successfully!")
63
 
64
  def load_tts(self):
65
+ """Load TTS model for text-to-speech"""
66
  if self.tts_model is None:
67
  print("Loading TTS model...")
68
+ # Using smaller, faster TTS model
69
+ model_id = "parler-tts/parler-tts-tiny-v1"
70
 
71
  self.tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
72
  model_id,
 
94
 
95
  @spaces.GPU
96
  def speech_to_text(self, audio_path: str) -> str:
97
+ """Convert speech to text using Whisper - optimized for speed"""
98
  try:
99
  self.load_whisper()
100
 
101
  result = self.whisper_pipe(
102
  audio_path,
103
  return_timestamps=False,
104
+ generate_kwargs={
105
+ "language": "english",
106
+ "task": "transcribe",
107
+ "num_beams": 1, # Faster
108
+ "temperature": 0.0 # More deterministic
109
+ }
110
  )
111
 
112
  return result["text"].strip()
 
115
  return ""
116
 
117
  @spaces.GPU
118
+ def text_to_speech(self, text: str, accent: str = "American", speaker_name: str = None) -> str:
119
+ """Convert text to speech - optimized for speed with American accent"""
120
  try:
121
  self.load_tts()
122
 
123
+ # Simplified: Just use one clear American voice for speed
124
+ description = "A clear American male voice speaks at moderate pace with good enunciation."
 
 
 
 
 
 
125
 
126
+ # Limit text length for faster generation
127
+ if len(text) > 200:
128
+ text = text[:200] + "..."
129
 
130
+ # Generate audio with optimized settings
131
  input_ids = self.tts_tokenizer(description, return_tensors="pt").input_ids.to(self.device)
132
  prompt_input_ids = self.tts_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
133
 
134
  generation = self.tts_model.generate(
135
  input_ids=input_ids,
136
  prompt_input_ids=prompt_input_ids,
137
+ attention_mask=torch.ones_like(input_ids),
138
+ do_sample=False, # Faster, deterministic
139
+ num_beams=1 # Faster generation
140
  )
141
 
142
  audio_arr = generation.cpu().numpy().squeeze()
requirements.txt CHANGED
@@ -8,6 +8,7 @@ spaces>=0.21.0
8
  # Audio processing
9
  torchaudio>=2.0.0
10
  soundfile>=0.12.1
 
11
 
12
  # Speech models
13
  openai-whisper
@@ -19,5 +20,4 @@ protobuf>=3.20.0
19
 
20
  # Utilities
21
  numpy>=1.24.0
22
- scipy>=1.11.0
23
 
 
8
  # Audio processing
9
  torchaudio>=2.0.0
10
  soundfile>=0.12.1
11
+ scipy>=1.11.0
12
 
13
  # Speech models
14
  openai-whisper
 
20
 
21
  # Utilities
22
  numpy>=1.24.0
 
23