| import librosa | |
| import torch | |
| from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor | |
| from transformers import set_seed | |
# Default checkpoint: 256 languages is the first MMS LID model that contains MOS.
_DEFAULT_LID_MODEL_ID = "facebook/mms-lid-256"

# Single source of truth for the sampling rate: the audio is resampled to this
# rate on load and the same value is reported to the feature extractor.
_LID_SAMPLING_RATE = 16_000

# (processor, model) pairs keyed by model id, so repeated calls do not rebuild
# the feature extractor and the classifier from the checkpoint every time.
_LID_MODEL_CACHE = {}


def _load_lid_model(model_id):
    """Return the (processor, model) pair for ``model_id``, loading it on first use."""
    try:
        return _LID_MODEL_CACHE[model_id]
    except KeyError:
        processor = AutoFeatureExtractor.from_pretrained(model_id)
        model = Wav2Vec2ForSequenceClassification.from_pretrained(model_id)
        _LID_MODEL_CACHE[model_id] = (processor, model)
        return processor, model


def identify_language(fp: str, model_id: str = _DEFAULT_LID_MODEL_ID) -> str:
    """
    Identify the language spoken in an audio file.

    Parameters
    ----------
    fp : str
        The file path to the audio file.
    model_id : str, optional
        Identifier of the language-ID checkpoint to use. Defaults to
        ``"facebook/mms-lid-256"``.

    Returns
    -------
    detected_lang : str
        The label of the highest-scoring class, looked up in the model's
        ``id2label`` mapping (the iso3 code of the detected language).

    Raises
    ------
    ValueError
        If the audio file decodes to an empty signal.

    Notes
    -----
    ``set_seed(555)`` is called on every invocation to ensure replicability;
    this reseeds the global random number generators as a side effect.
    The processor and model are loaded once per ``model_id`` and then reused
    for the lifetime of the process.
    """
    # Ensure replicability
    set_seed(555)

    # Load (or reuse) the language ID model
    processor, model = _load_lid_model(model_id)

    # Decode the audio, resampled to the rate passed to the feature extractor
    signal, _ = librosa.load(fp, sr=_LID_SAMPLING_RATE)
    if signal.size == 0:
        # Nothing to classify; fail with a clear message instead of handing
        # an empty array to the model.
        raise ValueError(f"Audio file contains no samples: {fp}")
    inputs = processor(
        signal, sampling_rate=_LID_SAMPLING_RATE, return_tensors="pt"
    )

    # Inference only: no gradients are needed
    with torch.no_grad():
        logits = model(**inputs).logits

    # Highest-scoring class for the first (only) item in the batch
    lang_id = torch.argmax(logits, dim=-1)[0].item()
    return model.config.id2label[lang_id]