Revrse committed on
Commit
3afcfbc
·
verified ·
1 Parent(s): 0d279e2

Upload 8 files

Browse files
Files changed (5) hide show
  1. README.md +8 -7
  2. app.py +55 -28
  3. config.py +3 -3
  4. models.py +25 -19
  5. requirements.txt +1 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🎯
4
  colorFrom: purple
5
  colorTo: blue
6
  sdk: gradio
7
- sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
@@ -34,7 +34,7 @@ Practice with 9 different sales situations:
34
  ### 🤖 AI-Powered Conversation
35
  - **Speech-to-Text (STT)**: Whisper large-v3 for accurate transcription
36
  - **Text-to-Speech (TTS)**: Parler-TTS with accent customization
37
- - **LLM**: Llama 3.2 for dynamic, context-aware responses
38
 
39
  ### 📊 Comprehensive Feedback Analysis
40
 
@@ -121,9 +121,9 @@ This application is optimized for Hugging Face Spaces with **Zero GPU** (Dynamic
121
  - Upload `requirements.txt`
122
  - Copy content from `README_HF_SPACE.md` to the Space's README.md
123
 
124
- 3. **Set secrets**
125
  - Go to Space settings
126
- - Add `HF_TOKEN` as a secret
127
 
128
  4. **Configure Space**
129
  - The app will automatically start
@@ -172,9 +172,10 @@ SpeakEdge/
172
  - Quality: Natural-sounding voices
173
 
174
  **Language Model**
175
- - Model: `meta-llama/Llama-3.2-3B-Instruct`
176
  - Purpose: Dynamic conversation & feedback generation
177
  - Context: Last 6 messages for coherence
 
178
 
179
  ### Performance Optimization
180
 
@@ -222,7 +223,7 @@ This project is licensed under the MIT License.
222
 
223
  - OpenAI Whisper for STT
224
  - Parler-TTS for multi-accent TTS
225
- - Meta for Llama models
226
  - Hugging Face for hosting and Zero GPU infrastructure
227
 
228
  ## 📞 Support
@@ -231,4 +232,4 @@ For issues, questions, or suggestions, please open an issue on GitHub.
231
 
232
  ---
233
 
234
- **Made with ❤️ for sales professionals looking to elevate their communication game**
 
4
  colorFrom: purple
5
  colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 4.0.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
34
  ### 🤖 AI-Powered Conversation
35
  - **Speech-to-Text (STT)**: Whisper large-v3 for accurate transcription
36
  - **Text-to-Speech (TTS)**: Parler-TTS with accent customization
37
+ - **LLM**: Mistral-7B-Instruct for dynamic, context-aware responses
38
 
39
  ### 📊 Comprehensive Feedback Analysis
40
 
 
121
  - Upload `requirements.txt`
122
  - Copy content from `README_HF_SPACE.md` to the Space's README.md
123
 
124
+ 3. **Set secrets (optional)**
125
  - Go to Space settings
126
+ - Add `HF_TOKEN` as a secret (optional but recommended for better rate limits)
127
 
128
  4. **Configure Space**
129
  - The app will automatically start
 
172
  - Quality: Natural-sounding voices
173
 
174
  **Language Model**
175
+ - Model: `mistralai/Mistral-7B-Instruct-v0.3`
176
  - Purpose: Dynamic conversation & feedback generation
177
  - Context: Last 6 messages for coherence
178
+ - Advantage: No approval needed, excellent performance
179
 
180
  ### Performance Optimization
181
 
 
223
 
224
  - OpenAI Whisper for STT
225
  - Parler-TTS for multi-accent TTS
226
+ - Mistral AI for Mistral-7B-Instruct model
227
  - Hugging Face for hosting and Zero GPU infrastructure
228
 
229
  ## 📞 Support
 
232
 
233
  ---
234
 
235
+ **Made with ❤️ for sales professionals looking to elevate their communication game**
app.py CHANGED
@@ -4,6 +4,7 @@ import os
4
  from datetime import datetime
5
  from typing import List, Dict, Tuple
6
  import numpy as np
 
7
 
8
  from models import ModelManager
9
  from scenarios import SCENARIOS, get_scenario_prompt
@@ -77,14 +78,18 @@ def start_roleplay(scenario: str, accent: str, personality: str, bot_name: str):
77
 
78
  def process_user_audio(audio_input, current_history):
79
  """Process user's audio input and generate bot response"""
80
- if audio_input is None:
81
- return current_history, None, "Please record your audio first."
 
 
 
82
 
83
  # Transcribe user audio
84
  user_text = model_manager.speech_to_text(audio_input)
85
 
86
  if not user_text or user_text.strip() == "":
87
- return current_history, None, "Could not understand audio. Please try again."
 
88
 
89
  # Store user transcript
90
  conversation_state["transcripts"].append({
@@ -102,6 +107,8 @@ def process_user_audio(audio_input, current_history):
102
  current_history = []
103
  current_history.append((user_text, None))
104
 
 
 
105
  # Generate bot response
106
  system_prompt = get_scenario_prompt(
107
  conversation_state["scenario"],
@@ -126,6 +133,11 @@ def process_user_audio(audio_input, current_history):
126
  "timestamp": datetime.now().isoformat()
127
  })
128
 
 
 
 
 
 
129
  # Generate audio for bot response
130
  audio_path = model_manager.text_to_speech(
131
  bot_response,
@@ -133,10 +145,7 @@ def process_user_audio(audio_input, current_history):
133
  conversation_state["bot_config"]["name"]
134
  )
135
 
136
- # Update conversation history
137
- current_history[-1] = (user_text, bot_response)
138
-
139
- return current_history, audio_path, "Bot responded. Your turn!"
140
 
141
  def end_roleplay():
142
  """End the roleplay and generate feedback"""
@@ -194,18 +203,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="SpeakEdge - Sales Communication Pr
194
  info="Choose the sales situation you want to practice"
195
  )
196
 
197
- accent_dropdown = gr.Dropdown(
198
- choices=[
199
- "American",
200
- "British",
201
- "Australian",
202
- "Indian",
203
- "Neutral"
204
- ],
205
- label="Bot Accent",
206
- value="American",
207
- info="Select the accent for your conversation partner"
208
- )
209
 
210
  personality_dropdown = gr.Dropdown(
211
  choices=[
@@ -253,12 +252,16 @@ with gr.Blocks(theme=gr.themes.Soft(), title="SpeakEdge - Sales Communication Pr
253
  audio_input = gr.Audio(
254
  sources=["microphone"],
255
  type="filepath",
256
- label="Your Response (Speak)"
 
 
 
257
  )
258
 
 
 
259
  with gr.Row():
260
- send_btn = gr.Button("📤 Send Audio", variant="primary")
261
- end_btn = gr.Button("🏁 End Roleplay", variant="stop")
262
 
263
  with gr.Row():
264
  with gr.Column(visible=False) as feedback_panel:
@@ -272,7 +275,15 @@ with gr.Blocks(theme=gr.themes.Soft(), title="SpeakEdge - Sales Communication Pr
272
  outputs=[setup_panel, conversation_panel, chatbot, bot_audio_output, status_text]
273
  )
274
 
275
- send_btn.click(
 
 
 
 
 
 
 
 
276
  fn=process_user_audio,
277
  inputs=[audio_input, chatbot],
278
  outputs=[chatbot, bot_audio_output, status_text]
@@ -286,11 +297,27 @@ with gr.Blocks(theme=gr.themes.Soft(), title="SpeakEdge - Sales Communication Pr
286
 
287
  gr.Markdown("""
288
  ---
289
- ### 📝 Tips for Best Results:
290
- - Speak clearly and at a natural pace
291
- - Engage authentically as you would in a real situation
292
- - Practice different scenarios to improve various skills
293
- - Review your feedback carefully to identify improvement areas
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  """)
295
 
296
  if __name__ == "__main__":
 
4
  from datetime import datetime
5
  from typing import List, Dict, Tuple
6
  import numpy as np
7
+ import time
8
 
9
  from models import ModelManager
10
  from scenarios import SCENARIOS, get_scenario_prompt
 
78
 
79
  def process_user_audio(audio_input, current_history):
80
  """Process user's audio input and generate bot response"""
81
+ if audio_input is None or audio_input == "":
82
+ return current_history, None, "Listening..."
83
+
84
+ # Update status
85
+ yield current_history, None, "🎧 Transcribing your speech..."
86
 
87
  # Transcribe user audio
88
  user_text = model_manager.speech_to_text(audio_input)
89
 
90
  if not user_text or user_text.strip() == "":
91
+ yield current_history, None, "Ready - Speak now!"
92
+ return
93
 
94
  # Store user transcript
95
  conversation_state["transcripts"].append({
 
107
  current_history = []
108
  current_history.append((user_text, None))
109
 
110
+ yield current_history, None, "💭 Thinking..."
111
+
112
  # Generate bot response
113
  system_prompt = get_scenario_prompt(
114
  conversation_state["scenario"],
 
133
  "timestamp": datetime.now().isoformat()
134
  })
135
 
136
+ # Update conversation history
137
+ current_history[-1] = (user_text, bot_response)
138
+
139
+ yield current_history, None, "🗣️ Speaking..."
140
+
141
  # Generate audio for bot response
142
  audio_path = model_manager.text_to_speech(
143
  bot_response,
 
145
  conversation_state["bot_config"]["name"]
146
  )
147
 
148
+ yield current_history, audio_path, "🎤 Your turn - Speak now!"
 
 
 
149
 
150
  def end_roleplay():
151
  """End the roleplay and generate feedback"""
 
203
  info="Choose the sales situation you want to practice"
204
  )
205
 
206
+ # Removed accent selector - using single optimized American accent for speed
207
+ accent_dropdown = gr.State("American") # Hidden state
 
 
 
 
 
 
 
 
 
 
208
 
209
  personality_dropdown = gr.Dropdown(
210
  choices=[
 
252
  audio_input = gr.Audio(
253
  sources=["microphone"],
254
  type="filepath",
255
+ label="🎤 Continuous Conversation - Just Speak!",
256
+ streaming=True,
257
+ show_label=True,
258
+ container=True
259
  )
260
 
261
+ gr.Markdown("**💡 Tip:** Speak naturally, pause when done. The bot will automatically respond!")
262
+
263
  with gr.Row():
264
+ end_btn = gr.Button("🏁 End Conversation & Get Feedback", variant="stop", size="lg")
 
265
 
266
  with gr.Row():
267
  with gr.Column(visible=False) as feedback_panel:
 
275
  outputs=[setup_panel, conversation_panel, chatbot, bot_audio_output, status_text]
276
  )
277
 
278
+ # Continuous conversation: Auto-process when audio is provided (streaming)
279
+ audio_input.stop_recording(
280
+ fn=process_user_audio,
281
+ inputs=[audio_input, chatbot],
282
+ outputs=[chatbot, bot_audio_output, status_text]
283
+ )
284
+
285
+ # Also trigger on change for immediate processing
286
+ audio_input.change(
287
  fn=process_user_audio,
288
  inputs=[audio_input, chatbot],
289
  outputs=[chatbot, bot_audio_output, status_text]
 
297
 
298
  gr.Markdown("""
299
  ---
300
+ ### 📝 How It Works:
301
+ 1. **Grant microphone permission** when prompted
302
+ 2. **Bot speaks first** - Listen to the greeting
303
+ 3. **You speak** - Just talk naturally (no need to click anything!)
304
+ 4. **Pause briefly** when you're done speaking
305
+ 5. **Bot responds** - Listen and continue the conversation
306
+ 6. **Repeat** - Keep the conversation flowing naturally
307
+ 7. **End** when done to get your detailed feedback
308
+
309
+ ### ⚡ What to Expect:
310
+ - 🕐 First response: 30-60 seconds (models loading)
311
+ - ⚡ After that: 5-10 seconds per exchange
312
+ - 🎤 Microphone stays active - just speak when ready
313
+ - 🔊 Bot responses play automatically
314
+ - 💬 Natural conversation flow
315
+
316
+ ### 🎯 Pro Tips:
317
+ - Speak clearly and naturally
318
+ - Pause for 1-2 seconds after finishing
319
+ - Let the bot finish speaking before responding
320
+ - Engage as you would in a real call
321
  """)
322
 
323
  if __name__ == "__main__":
config.py CHANGED
@@ -4,9 +4,9 @@ Configuration settings for SpeakEdge
4
 
5
  import os
6
 
7
- # Model configurations
8
- WHISPER_MODEL = os.getenv("WHISPER_MODEL", "openai/whisper-large-v3")
9
- TTS_MODEL = os.getenv("TTS_MODEL", "parler-tts/parler-tts-mini-v1")
10
  LLM_MODEL = os.getenv("LLM_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")
11
 
12
  # Hugging Face token (optional for public models)
 
4
 
5
  import os
6
 
7
+ # Model configurations (optimized for speed)
8
+ WHISPER_MODEL = os.getenv("WHISPER_MODEL", "openai/whisper-medium") # Faster than large
9
+ TTS_MODEL = os.getenv("TTS_MODEL", "parler-tts/parler-tts-tiny-v1") # Faster TTS
10
  LLM_MODEL = os.getenv("LLM_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")
11
 
12
  # Hugging Face token (optional for public models)
models.py CHANGED
@@ -36,7 +36,8 @@ class ModelManager:
36
  """Load Whisper model for STT"""
37
  if self.whisper_pipe is None:
38
  print("Loading Whisper model...")
39
- model_id = "openai/whisper-large-v3"
 
40
 
41
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
42
  model_id,
@@ -55,14 +56,17 @@ class ModelManager:
55
  feature_extractor=processor.feature_extractor,
56
  torch_dtype=self.torch_dtype,
57
  device=self.device,
 
 
58
  )
59
  print("Whisper model loaded successfully!")
60
 
61
  def load_tts(self):
62
- """Load Parler-TTS model for text-to-speech"""
63
  if self.tts_model is None:
64
  print("Loading TTS model...")
65
- model_id = "parler-tts/parler-tts-mini-v1"
 
66
 
67
  self.tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
68
  model_id,
@@ -90,14 +94,19 @@ class ModelManager:
90
 
91
  @spaces.GPU
92
  def speech_to_text(self, audio_path: str) -> str:
93
- """Convert speech to text using Whisper"""
94
  try:
95
  self.load_whisper()
96
 
97
  result = self.whisper_pipe(
98
  audio_path,
99
  return_timestamps=False,
100
- generate_kwargs={"language": "english"}
 
 
 
 
 
101
  )
102
 
103
  return result["text"].strip()
@@ -106,31 +115,28 @@ class ModelManager:
106
  return ""
107
 
108
  @spaces.GPU
109
- def text_to_speech(self, text: str, accent: str, speaker_name: str) -> str:
110
- """Convert text to speech with specified accent"""
111
  try:
112
  self.load_tts()
113
 
114
- # Create description based on accent
115
- accent_descriptions = {
116
- "American": "A clear American English accent, professional and articulate.",
117
- "British": "A refined British English accent, clear and professional.",
118
- "Australian": "An Australian English accent, friendly and clear.",
119
- "Indian": "An Indian English accent, professional and articulate.",
120
- "Neutral": "A neutral English accent, clear and professional."
121
- }
122
 
123
- description = accent_descriptions.get(accent, accent_descriptions["Neutral"])
124
- description += " The speaker has a moderate pace and good enunciation."
 
125
 
126
- # Generate audio
127
  input_ids = self.tts_tokenizer(description, return_tensors="pt").input_ids.to(self.device)
128
  prompt_input_ids = self.tts_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
129
 
130
  generation = self.tts_model.generate(
131
  input_ids=input_ids,
132
  prompt_input_ids=prompt_input_ids,
133
- attention_mask=torch.ones_like(input_ids)
 
 
134
  )
135
 
136
  audio_arr = generation.cpu().numpy().squeeze()
 
36
  """Load Whisper model for STT"""
37
  if self.whisper_pipe is None:
38
  print("Loading Whisper model...")
39
+ # Using medium model for better speed/accuracy balance
40
+ model_id = "openai/whisper-medium"
41
 
42
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
43
  model_id,
 
56
  feature_extractor=processor.feature_extractor,
57
  torch_dtype=self.torch_dtype,
58
  device=self.device,
59
+ chunk_length_s=30,
60
+ batch_size=16,
61
  )
62
  print("Whisper model loaded successfully!")
63
 
64
  def load_tts(self):
65
+ """Load TTS model for text-to-speech"""
66
  if self.tts_model is None:
67
  print("Loading TTS model...")
68
+ # Using smaller, faster TTS model
69
+ model_id = "parler-tts/parler-tts-tiny-v1"
70
 
71
  self.tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
72
  model_id,
 
94
 
95
  @spaces.GPU
96
  def speech_to_text(self, audio_path: str) -> str:
97
+ """Convert speech to text using Whisper - optimized for speed"""
98
  try:
99
  self.load_whisper()
100
 
101
  result = self.whisper_pipe(
102
  audio_path,
103
  return_timestamps=False,
104
+ generate_kwargs={
105
+ "language": "english",
106
+ "task": "transcribe",
107
+ "num_beams": 1, # Faster
108
+ "temperature": 0.0 # More deterministic
109
+ }
110
  )
111
 
112
  return result["text"].strip()
 
115
  return ""
116
 
117
  @spaces.GPU
118
+ def text_to_speech(self, text: str, accent: str = "American", speaker_name: str = None) -> str:
119
+ """Convert text to speech - optimized for speed with American accent"""
120
  try:
121
  self.load_tts()
122
 
123
+ # Simplified: Just use one clear American voice for speed
124
+ description = "A clear American male voice speaks at moderate pace with good enunciation."
 
 
 
 
 
 
125
 
126
+ # Limit text length for faster generation
127
+ if len(text) > 200:
128
+ text = text[:200] + "..."
129
 
130
+ # Generate audio with optimized settings
131
  input_ids = self.tts_tokenizer(description, return_tensors="pt").input_ids.to(self.device)
132
  prompt_input_ids = self.tts_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
133
 
134
  generation = self.tts_model.generate(
135
  input_ids=input_ids,
136
  prompt_input_ids=prompt_input_ids,
137
+ attention_mask=torch.ones_like(input_ids),
138
+ do_sample=False, # Faster, deterministic
139
+ num_beams=1 # Faster generation
140
  )
141
 
142
  audio_arr = generation.cpu().numpy().squeeze()
requirements.txt CHANGED
@@ -8,6 +8,7 @@ spaces>=0.21.0
8
  # Audio processing
9
  torchaudio>=2.0.0
10
  soundfile>=0.12.1
 
11
 
12
  # Speech models
13
  openai-whisper
@@ -19,5 +20,4 @@ protobuf>=3.20.0
19
 
20
  # Utilities
21
  numpy>=1.24.0
22
- scipy>=1.11.0
23
 
 
8
  # Audio processing
9
  torchaudio>=2.0.0
10
  soundfile>=0.12.1
11
+ scipy>=1.11.0
12
 
13
  # Speech models
14
  openai-whisper
 
20
 
21
  # Utilities
22
  numpy>=1.24.0
 
23