Spaces: Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -210,19 +210,36 @@ class ProfessionalVoiceAgent:
|
|
| 210 |
logger.info(f"Audio duration: {duration_seconds:.2f}s, sample_rate: {sample_rate}Hz")
|
| 211 |
|
| 212 |
# Convert to float32 if needed
|
|
|
|
| 213 |
if audio_data.dtype == np.int16:
|
|
|
|
| 214 |
audio_data = audio_data.astype(np.float32) / 32768.0
|
| 215 |
elif audio_data.dtype == np.int32:
|
|
|
|
| 216 |
audio_data = audio_data.astype(np.float32) / 2147483648.0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
|
| 218 |
# Handle stereo to mono conversion
|
| 219 |
if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
|
| 220 |
audio_data = np.mean(audio_data, axis=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
|
| 222 |
# Resample to 16kHz if needed (Whisper requirement)
|
| 223 |
if sample_rate != 16000:
|
| 224 |
import librosa
|
| 225 |
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
# Trim silence and limit audio length for speed (max 30 seconds)
|
| 228 |
max_samples = 16000 * 30 # 30 seconds at 16kHz
|
|
@@ -238,6 +255,8 @@ class ProfessionalVoiceAgent:
|
|
| 238 |
return_tensors="pt"
|
| 239 |
).input_features.to(self.device)
|
| 240 |
|
|
|
|
|
|
|
| 241 |
# Generate token ids - optimized for speed
|
| 242 |
with torch.cuda.amp.autocast(enabled=self.device.type == "cuda"):
|
| 243 |
with torch.no_grad():
|
|
@@ -246,6 +265,8 @@ class ProfessionalVoiceAgent:
|
|
| 246 |
language="en",
|
| 247 |
task="transcribe"
|
| 248 |
)
|
|
|
|
|
|
|
| 249 |
predicted_ids = self.whisper_model.generate(
|
| 250 |
input_features,
|
| 251 |
forced_decoder_ids=forced_decoder_ids,
|
|
@@ -254,6 +275,8 @@ class ProfessionalVoiceAgent:
|
|
| 254 |
do_sample=False # Deterministic
|
| 255 |
)
|
| 256 |
|
|
|
|
|
|
|
| 257 |
# Decode token ids to text
|
| 258 |
transcription = self.whisper_processor.batch_decode(
|
| 259 |
predicted_ids,
|
|
|
|
| 210 |
logger.info(f"Audio duration: {duration_seconds:.2f}s, sample_rate: {sample_rate}Hz")
|
| 211 |
|
| 212 |
# Convert to float32 if needed
|
| 213 |
+
logger.info(f"Audio dtype before conversion: {audio_data.dtype}")
|
| 214 |
if audio_data.dtype == np.int16:
|
| 215 |
+
logger.info("Converting from int16 to float32")
|
| 216 |
audio_data = audio_data.astype(np.float32) / 32768.0
|
| 217 |
elif audio_data.dtype == np.int32:
|
| 218 |
+
logger.info("Converting from int32 to float32")
|
| 219 |
audio_data = audio_data.astype(np.float32) / 2147483648.0
|
| 220 |
+
elif audio_data.dtype == np.float64:
|
| 221 |
+
logger.info("Converting from float64 to float32")
|
| 222 |
+
audio_data = audio_data.astype(np.float32)
|
| 223 |
+
logger.info(f"Audio dtype after conversion: {audio_data.dtype}")
|
| 224 |
|
| 225 |
# Handle stereo to mono conversion
|
| 226 |
if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
|
| 227 |
audio_data = np.mean(audio_data, axis=1)
|
| 228 |
+
logger.info(f"Converted stereo to mono, new shape: {audio_data.shape}")
|
| 229 |
+
|
| 230 |
+
# Check audio statistics before resampling
|
| 231 |
+
logger.info(f"Audio stats - min: {audio_data.min():.4f}, max: {audio_data.max():.4f}, mean: {audio_data.mean():.4f}")
|
| 232 |
|
| 233 |
# Resample to 16kHz if needed (Whisper requirement)
|
| 234 |
if sample_rate != 16000:
|
| 235 |
import librosa
|
| 236 |
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
|
| 237 |
+
logger.info(f"Resampled to 16kHz, new length: {len(audio_data)} samples ({len(audio_data)/16000:.2f}s)")
|
| 238 |
+
|
| 239 |
+
# Check if audio is too quiet or silent
|
| 240 |
+
audio_abs_mean = np.abs(audio_data).mean()
|
| 241 |
+
if audio_abs_mean < 0.001:
|
| 242 |
+
logger.warning(f"Audio might be too quiet! Abs mean: {audio_abs_mean}")
|
| 243 |
|
| 244 |
# Trim silence and limit audio length for speed (max 30 seconds)
|
| 245 |
max_samples = 16000 * 30 # 30 seconds at 16kHz
|
|
|
|
| 255 |
return_tensors="pt"
|
| 256 |
).input_features.to(self.device)
|
| 257 |
|
| 258 |
+
logger.info(f"Whisper input_features shape: {input_features.shape}, device: {input_features.device}")
|
| 259 |
+
|
| 260 |
# Generate token ids - optimized for speed
|
| 261 |
with torch.cuda.amp.autocast(enabled=self.device.type == "cuda"):
|
| 262 |
with torch.no_grad():
|
|
|
|
| 265 |
language="en",
|
| 266 |
task="transcribe"
|
| 267 |
)
|
| 268 |
+
logger.info(f"Forced decoder IDs: {forced_decoder_ids}")
|
| 269 |
+
|
| 270 |
predicted_ids = self.whisper_model.generate(
|
| 271 |
input_features,
|
| 272 |
forced_decoder_ids=forced_decoder_ids,
|
|
|
|
| 275 |
do_sample=False # Deterministic
|
| 276 |
)
|
| 277 |
|
| 278 |
+
logger.info(f"Predicted token IDs shape: {predicted_ids.shape}, first 10 IDs: {predicted_ids[0][:10].tolist()}")
|
| 279 |
+
|
| 280 |
# Decode token ids to text
|
| 281 |
transcription = self.whisper_processor.batch_decode(
|
| 282 |
predicted_ids,
|