# NOTE: file-viewer header captured along with the script (not part of the program):
# BrendaTellez's picture | Update app2.py | f2a40df | raw | history blame | 1.32 kB
import os
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
# Transcribe/classify a user-supplied audio file with a Wav2Vec2 CTC model.
#
# Steps: resolve the Hugging Face credential, load model + tokenizer, read the
# audio file, mix it down to mono, resample it to the model's rate, run the
# model without gradients, and print the decoded arg-max prediction.

# SECURITY: an access token used to be hard-coded here. It must be treated as
# leaked and revoked in the Hugging Face account settings. The credential is now
# read from the environment. If neither variable is set we fall back to True,
# which tells transformers to use the token cached by `huggingface-cli login`
# (this is what the original `use_auth_token=True` did).
auth_token = (
    os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or True
)

# Load the pre-trained model and tokenizer
model_name = "BrendaTellez/sounds2"
model = Wav2Vec2ForCTC.from_pretrained(model_name, use_auth_token=auth_token)
tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name, use_auth_token=auth_token)

# Sampling rate the model expects. `sample_rate` is honoured if the checkpoint's
# config defines it; otherwise 16 kHz is assumed.
# NOTE(review): 16 kHz is the usual wav2vec 2.0 rate - confirm for this checkpoint.
target_sample_rate = getattr(model.config, "sample_rate", 16000)

# Get the audio file from the user
file_path = input("Enter the path to the audio file: ")

# Load the audio file using torchaudio; waveform is (channels, frames)
waveform, sample_rate = torchaudio.load(file_path)

# Mix multi-channel audio down to mono so a single utterance is fed to the model
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

# Resample the audio to match the sample rate expected by the model.
# Compare the file's sample rate (not a tensor dimension) against the target.
if sample_rate != target_sample_rate:
    resampler = torchaudio.transforms.Resample(sample_rate, target_sample_rate)
    waveform = resampler(waveform)

# Tokenize the audio using the model's tokenizer. The channel axis is dropped
# so the tokenizer receives one 1-D utterance and builds the batch axis itself.
inputs = tokenizer(waveform.squeeze(0).numpy(), return_tensors="pt", padding=True)

# Use the model to classify the audio. The tokenizer only returns an attention
# mask when configured to, so fetch it with .get() (None is accepted by the model).
with torch.no_grad():
    logits = model(
        inputs.input_values,
        attention_mask=inputs.get("attention_mask"),
    ).logits

predicted_class_id = torch.argmax(logits, dim=-1)
predicted_class_label = tokenizer.decode(predicted_class_id[0])
print(f"The audio file is classified as: {predicted_class_label}")