import torch
from transformers import pipeline
import librosa
import soundfile as sf
import numpy as np

class WhisperTranscriber:
    def __init__(self, model_size="medium"):
        self.model_size = model_size
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.model = pipeline(
            "automatic-speech-recognition",
            model=f"openai/whisper-{model_size}",
            chunk_length_s=30,
            device=self.device,
            batch_size=8,
            # float16 is only safe on GPU; fall back to float32 on CPU
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            return_timestamps=True
        )

    def preprocess_audio(self, audio_path, target_sr=16000):
        # Load audio at its native sample rate
        y, sr = librosa.load(audio_path, sr=None)
        # Resample to 16 kHz (Whisper's expected rate)
        y_resampled = librosa.resample(y=y, orig_sr=sr, target_sr=target_sr)
        # Apply a pre-emphasis filter (boosts high frequencies; not full noise reduction)
        y_cleaned = librosa.effects.preemphasis(y_resampled)
        # Normalize amplitude to [-1, 1]
        y_normalized = librosa.util.normalize(y_cleaned)
        # Trim leading and trailing silence
        y_filtered = librosa.effects.trim(
            y_normalized,
            top_db=30,
            frame_length=2048,
            hop_length=512
        )[0]
        return y_filtered, target_sr

    def transcribe(self, audio_path):
        try:
            # Preprocess audio
            audio_data, sample_rate = self.preprocess_audio(audio_path)
            print(f"Audio loaded and preprocessed - Shape: {audio_data.shape}, Sample rate: {sample_rate}")
            # Transcribe
            result = self.model(
                audio_data,
                generate_kwargs={
                    "task": "transcribe",
                    "language": "en",
                    "max_new_tokens": 256,
                    "temperature": 0.7  # Only used when sampling is enabled; lower values generally hallucinate less
                }
            )
            # Extract transcription with timestamps if available
            if isinstance(result, dict):
                if "chunks" in result:
                    transcription = " ".join([chunk["text"] for chunk in result["chunks"]])
                else:
                    transcription = result["text"]
            else:
                transcription = result
            return transcription
        except Exception as e:
            print(f"Error in transcribe: {str(e)}")
            raise

# Example usage
if __name__ == "__main__":
    transcriber = WhisperTranscriber(model_size="medium")
    transcription = transcriber.transcribe("path_to_your_audio_file.wav")
    print(f"Transcription: {transcription}")