Spaces:
Runtime error
Runtime error
File size: 1,806 Bytes
4f4a842 4c28db1 4f4a842 4c28db1 4f4a842 4c28db1 4f4a842 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import gradio as gr
import os
import librosa
from transformers import Wav2Vec2ProcessorWithLM, AutoModelForCTC, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor
import torch
model_name = os.getenv("MODEL_NAME")
auth_token = os.getenv("API_TOKEN")
# Load models
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name, eos_token=None, bos_token=None, use_auth_token=auth_token)
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name, use_auth_token=auth_token)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name, use_auth_token=auth_token)
decoder = processor.decoder
processor = Wav2Vec2ProcessorWithLM(feature_extractor=feature_extractor, tokenizer=tokenizer, decoder=decoder)
model = AutoModelForCTC.from_pretrained(model_name, use_auth_token=auth_token)
def load_data(input_file):
# Read the file
speech, sample_rate = librosa.load(input_file)
# Make it 1-D
if len(speech.shape) > 1:
speech = speech[:,0] + speech[:,1]
# Resampling at 16KHz
if sample_rate !=16_000:
speech = librosa.resample(speech, sample_rate, 16_000)
return speech
def transcribe(input_file):
audio = load_data(input_file)
# audio = input_file
# Tokenize
input_values = processor(audio, return_tensors="pt", sampling_rate=16_000).input_values
# Take logits
with torch.no_grad():
logits = model(input_values).logits.cpu().numpy()[0]
# Decode
text = decoder.decode(logits, beam_width=30)
return text
examples = [
["examples/example1.mp3"],
["examples/example2.mp3"],
]
gr.Interface(
title="Rozpoznání mluvené řeči pro český jazyk",
fn=transcribe,
inputs=gr.inputs.Audio(source="upload", type="filepath"),
outputs="text",
examples=examples
).launch()
|