Update app.py
app.py CHANGED
@@ -180,6 +180,7 @@ class ProfessionalVoiceAgent:
     def transcribe_audio(self, audio) -> str:
         """Convert speech to text using Whisper"""
         if audio is None:
+            logger.warning("No audio input received")
             return ""
 
         try:
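Every line this commit adds routes through a module-level `logger`. The setup itself sits outside the diff, so the following is only a minimal sketch of a configuration consistent with that usage:

import logging

# One-time setup near the top of app.py (assumed; not shown in this diff).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)

With the level at INFO, the logger.debug() calls added below stay silent unless the level is lowered to logging.DEBUG.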
@@ -187,19 +188,27 @@ class ProfessionalVoiceAgent:
             if isinstance(audio, dict):
                 sample_rate = audio.get("sample_rate", 16000)
                 audio_data = audio.get("array", audio.get("data", None))
+                logger.info(f"Audio format: dict, sample_rate={sample_rate}, data shape={audio_data.shape if audio_data is not None else 'None'}")
                 if audio_data is None:
                     logger.error("Audio dict missing 'array' or 'data' key")
                     return "Could not process audio format."
             elif isinstance(audio, tuple):
                 sample_rate, audio_data = audio
+                logger.info(f"Audio format: tuple, sample_rate={sample_rate}, data shape={audio_data.shape}")
             else:
                 audio_data = audio
                 sample_rate = 16000
+                logger.info(f"Audio format: raw array, shape={audio_data.shape}")
 
             # Ensure we have audio data
             if audio_data is None or len(audio_data) == 0:
+                logger.warning("Empty audio data")
                 return "No audio data received."
 
+            # Log audio stats
+            duration_seconds = len(audio_data) / sample_rate
+            logger.info(f"Audio duration: {duration_seconds:.2f}s, sample_rate: {sample_rate}Hz")
+
             # Convert to float32 if needed
             if audio_data.dtype == np.int16:
                 audio_data = audio_data.astype(np.float32) / 32768.0
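This branch normalizes three possible input shapes (dict, tuple, raw array) before the int16-to-float32 conversion. As a standalone sketch of the same normalization, assuming Gradio-style input where a microphone component yields a (sample_rate, np.ndarray) tuple of int16 samples (function name hypothetical):

import numpy as np

def normalize_audio(audio, default_sr=16000):
    """Coerce dict/tuple/raw-array audio into (sample_rate, float32 in [-1, 1])."""
    if isinstance(audio, dict):
        sr = audio.get("sample_rate", default_sr)
        data = audio.get("array", audio.get("data"))
    elif isinstance(audio, tuple):
        sr, data = audio
    else:
        sr, data = default_sr, audio
    if data is None or len(data) == 0:
        raise ValueError("empty audio input")
    data = np.asarray(data)
    if data.dtype == np.int16:
        # int16 PCM spans [-32768, 32767]; dividing maps it into [-1, 1]
        data = data.astype(np.float32) / 32768.0
    if data.ndim > 1:
        data = data.mean(axis=1)  # downmix multi-channel to mono
    return sr, data.astype(np.float32)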
@@ -232,8 +241,14 @@ class ProfessionalVoiceAgent:
             # Generate token ids - optimized for speed
             with torch.cuda.amp.autocast(enabled=self.device.type == "cuda"):
                 with torch.no_grad():
+                    # Force English language to avoid language detection overhead
+                    forced_decoder_ids = self.whisper_processor.get_decoder_prompt_ids(
+                        language="en",
+                        task="transcribe"
+                    )
                     predicted_ids = self.whisper_model.generate(
                         input_features,
+                        forced_decoder_ids=forced_decoder_ids,
                         max_new_tokens=64,  # Reduced for faster processing
                         num_beams=1,  # Greedy decoding for speed
                         do_sample=False  # Deterministic
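The new forced_decoder_ids block pins Whisper's <|en|> and <|transcribe|> prompt tokens so generate() skips per-call language detection. A minimal end-to-end sketch of the same call pattern with the Hugging Face transformers Whisper classes (the checkpoint name is illustrative; the Space's actual model may differ):

import numpy as np
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

# Fix language and task up front instead of letting Whisper detect them.
forced = processor.get_decoder_prompt_ids(language="en", task="transcribe")
with torch.no_grad():
    ids = model.generate(
        inputs.input_features,
        forced_decoder_ids=forced,
        max_new_tokens=64,
        num_beams=1,
        do_sample=False,
    )
print(processor.batch_decode(ids, skip_special_tokens=True)[0])

Recent transformers releases also accept language="en" and task="transcribe" directly as generate() keyword arguments, which supersedes forced_decoder_ids.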
@@ -272,8 +287,12 @@
             for user_msg, bot_msg in conversation_history[-3:]:  # Last 3 exchanges
                 context += f"User: {user_msg}\nAssistant: {bot_msg}\n"
             context += f"User: {text}\nAssistant:"
+            logger.info(f"Input text: '{text}' | History entries: {len(conversation_history)}")
         else:
             context = f"User: {text}\nAssistant:"
+            logger.info(f"Input text: '{text}' | No history")
+
+        logger.debug(f"Full context sent to model:\n{context}")
 
         if self.chat_tokenizer and hasattr(self.chat_model, 'generate'):
             # Tokenize input
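The prompt the new logging reports is plain "User:/Assistant:" turns with a three-exchange window. Sketched in isolation (helper name hypothetical):

def build_context(text, history, max_turns=3):
    """Render the last few (user, bot) exchanges plus the new user message."""
    context = "".join(f"User: {u}\nAssistant: {b}\n" for u, b in history[-max_turns:])
    return context + f"User: {text}\nAssistant:"

# e.g. build_context("What time is it?", [("Hi", "Hello! How can I help?")])

Keeping only the last three exchanges bounds the prompt length so the chat model's context window is not exhausted as the conversation grows.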
@@ -299,10 +318,12 @@
             )
 
             # Decode response
-
+            full_response = self.chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
+            logger.debug(f"Raw model output: '{full_response}'")
 
             # Clean response
-            response =
+            response = full_response.replace(context, "").strip()
+            logger.info(f"Generated response: '{response}'")
 
         else:
             # Use pipeline
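This hunk also fixes previously incomplete code: the old side had a bare "response =". The new cleanup strips the echoed prompt with full_response.replace(context, ""), which works but silently fails if the tokenizer re-spaces the text on round-trip. The usual alternative is to decode only the generated ids by slicing off the prompt length; a sketch, assuming inputs holds the tokenized context passed to generate() and outputs is its return value:

prompt_len = inputs["input_ids"].shape[-1]
new_tokens = outputs[0][prompt_len:]  # drop the echoed prompt ids
response = self.chat_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

This applies to decoder-only chat models, where generate() returns the prompt ids followed by the newly generated ones.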
@@ -334,6 +355,8 @@
             return None
 
         try:
+            logger.info(f"Synthesizing speech for text: '{text}'")
+
             # Truncate if too long and warn
             max_chars = 600
             if len(text) > max_chars:
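The last hunk shows only the start of the truncation guard; the diff cuts off after the length check. A plausible completion that clips at the last sentence boundary under the 600-character cap (helper name and exact behavior are assumptions, not the Space's code):

def truncate_for_tts(text, max_chars=600):
    """Clip overlong text at the last sentence end (or word break) before max_chars."""
    if len(text) <= max_chars:
        return text
    clipped = text[:max_chars]
    cut = max(clipped.rfind(". "), clipped.rfind("! "), clipped.rfind("? "))
    return clipped[:cut + 1] if cut > 0 else clipped.rsplit(" ", 1)[0]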