Fine-tuned Wav2Vec2-Large-XLSR-53 model for speech recognition in the Uzbek language

This model is facebook/wav2vec2-large-xlsr-53 fine-tuned on Uzbek using the train splits of Common Voice. When using this model, make sure that your speech input is sampled at 16 kHz.

Usage

from typing import Optional, Tuple

import numpy as np
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

class Wav2Vec2STTModel:
    """Speech-to-text wrapper around a Wav2Vec2 CTC checkpoint.

    The processor and model are loaded once at construction time and moved
    to CUDA when it is available, otherwise to the CPU. Use
    :meth:`transcribe` to turn an audio file into text.
    """

    # Sampling rate (Hz) that every waveform is converted to before inference.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self, model_name: str):
        """Initialize the Wav2Vec2 model and processor.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                processor and the model.

        Raises:
            RuntimeError: If the processor or the model cannot be loaded.
        """
        self.model_name = model_name
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self._load_model()

    def _load_model(self) -> None:
        """Load the processor and model and move the model to ``self.device``.

        Raises:
            RuntimeError: If loading fails for any reason; the original
                exception is attached as ``__cause__``.
        """
        try:
            self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
            self.model = Wav2Vec2ForCTC.from_pretrained(self.model_name).to(self.device)
        except Exception as e:
            raise RuntimeError(f"Failed to load model: {str(e)}") from e

    @staticmethod
    def _to_mono(waveform: torch.Tensor) -> torch.Tensor:
        """Collapse a ``(channels, samples)`` waveform into a 1-D signal.

        Channels are averaged, so a single-channel input keeps its samples
        unchanged. A waveform that is already 1-D is returned as is.
        """
        if waveform.dim() > 1:
            return waveform.mean(dim=0)
        return waveform

    def preprocess_audio(self, file_path: str) -> Tuple[np.ndarray, int]:
        """Load an audio file as a mono waveform at ``TARGET_SAMPLE_RATE``.

        Args:
            file_path: Path of the audio file to read.

        Returns:
            A tuple ``(samples, sampling_rate)`` where ``samples`` is a 1-D
            NumPy array and ``sampling_rate`` is ``TARGET_SAMPLE_RATE``.

        Raises:
            FileNotFoundError: If loading raises ``FileNotFoundError``.
            RuntimeError: On any other loading or resampling failure.
        """
        try:
            speech_array, sampling_rate = torchaudio.load(file_path)

            # Multi-channel files must be downmixed: a bare squeeze() would
            # leave a 2-D array that is no longer a single utterance.
            speech_array = self._to_mono(speech_array)

            # Resample if needed
            if sampling_rate != self.TARGET_SAMPLE_RATE:
                resampler = torchaudio.transforms.Resample(
                    orig_freq=sampling_rate,
                    new_freq=self.TARGET_SAMPLE_RATE,
                )
                speech_array = resampler(speech_array)

            return speech_array.numpy(), self.TARGET_SAMPLE_RATE
        except FileNotFoundError as e:
            raise FileNotFoundError(f"Audio file not found: {file_path}") from e
        except Exception as e:
            raise RuntimeError(f"Audio processing error: {str(e)}") from e

    def _replace_unk(self, transcription: str) -> str:
        """Replace every ``[UNK]`` token with the modifier apostrophe ``ʼ``."""
        return transcription.replace("[UNK]", "ʼ")

    def transcribe(self, file_path: str) -> str:
        """Transcribe an audio file to text.

        Args:
            file_path: Path of the audio file to transcribe.

        Returns:
            The decoded transcription with ``[UNK]`` tokens replaced by ``ʼ``.

        Raises:
            RuntimeError: If any step fails, including a missing file; the
                original exception is attached as ``__cause__``.
        """
        try:
            # Preprocess audio
            speech_array, sampling_rate = self.preprocess_audio(file_path)

            # Process input
            inputs = self.processor(
                speech_array,
                sampling_rate=sampling_rate,
                return_tensors="pt",
            ).to(self.device)

            # Model inference (no gradients are needed for decoding)
            with torch.no_grad():
                logits = self.model(inputs.input_values).logits

            # Greedy CTC decoding: take the most likely token per frame
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = self.processor.batch_decode(predicted_ids)[0]

            # Clean up result
            return self._replace_unk(transcription)

        except Exception as e:
            raise RuntimeError(f"Transcription error: {str(e)}") from e

# Example usage: load the published checkpoint and transcribe one local file.
if __name__ == "__main__":
    try:
        # Building the wrapper loads the processor and the model
        transcriber = Wav2Vec2STTModel("ipilot7/uzbek_speach_to_text")

        # Run the full pipeline on the sample recording and show the text
        text = transcriber.transcribe("1.mp3")
        print("Transcription:", text)

    except Exception as exc:
        # Top-level boundary: report the failure instead of a traceback
        print(f"Error occurred: {exc!s}")

Downloads last month: 33

Safetensors

Model size

0.3B params

Tensor type

F32

Model tree for ipilot7/uzbek_speach_to_text

Base model

facebook/wav2vec2-large-xlsr-53

Finetuned

(370)

this model

ipilot7
/

uzbek_speach_to_text

Fine-tuned Wav2Vec2-Large-XLSR-53 model for speech recognition in the Uzbek language

Usage

Model tree for ipilot7/uzbek_speach_to_text

Dataset used to train ipilot7/uzbek_speach_to_text