# Hugging Face Space page capture - status: Build error; file size: 2,747 bytes.
import gradio as gr
import base64
import os
import torch
import numpy as np
#import ast
#import librosa
from transformers import (
AutomaticSpeechRecognitionPipeline,
WhisperForConditionalGeneration,
WhisperTokenizer,
WhisperProcessor,
)
from peft import PeftModel, PeftConfig
# Hub id of the PEFT adapter, and the task passed to the tokenizer/processor.
peft_model_id = "Boadiwaa/LORA-colab-Distil-Whisper-medium2"
task = "transcribe"

# The adapter config names the base checkpoint that every component below
# is loaded from.
peft_config = PeftConfig.from_pretrained(peft_model_id)

# Load the base Whisper model first, then wrap it with the adapter.
model = PeftModel.from_pretrained(
    WhisperForConditionalGeneration.from_pretrained(
        peft_config.base_model_name_or_path,
        device_map="auto",
    ),
    peft_model_id,
)

tokenizer = WhisperTokenizer.from_pretrained(
    peft_config.base_model_name_or_path,
    task=task,
)
processor = WhisperProcessor.from_pretrained(
    peft_config.base_model_name_or_path,
    task=task,
)
feature_extractor = processor.feature_extractor
# forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)

# Speech-recognition pipeline shared by all requests.
pipe = AutomaticSpeechRecognitionPipeline(
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)
#api_key = os.getenv("HF_API_TOKEN")
def transcribe(data):
    """Transcribe base64-encoded audio and return the recognised text.

    Args:
        data: Base64 text (``str`` or bytes-like) holding the contents of an
            audio file. A leading ``data:...;base64,`` data-URI header is
            stripped before decoding.

    Returns:
        The ``"text"`` entry of the speech-recognition pipeline output.

    Raises:
        TypeError: If ``data`` is not a string or bytes-like object.
        ValueError: If ``data`` is not valid base64 or decodes to no bytes.
    """
    import tempfile  # local import: only this function needs it

    if not isinstance(data, (str, bytes, bytearray)):
        raise TypeError(
            "Expected base64 audio as str or bytes, got "
            f"{type(data).__name__}."
        )

    # Drop a data-URI header; otherwise its characters would be decoded as
    # if they were part of the payload and corrupt the audio.
    if isinstance(data, str):
        uri_prefix, comma = "data:", ","
    else:
        uri_prefix, comma = b"data:", b","
    if data.startswith(uri_prefix):
        data = data.partition(comma)[2]

    try:
        audio_bytes = base64.b64decode(data)
    except ValueError as err:  # includes binascii.Error
        raise ValueError("Audio payload is not valid base64.") from err
    if not audio_bytes:
        raise ValueError("Audio payload is empty.")

    # A unique temp file per call keeps concurrent requests from overwriting
    # each other's audio (the previous fixed "audio.wav" was shared).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_file:
        audio_file.write(audio_bytes)
        audio_path = audio_file.name
    print("Success")

    try:
        # Mixed precision only when CUDA is present; avoids the deprecated
        # torch.cuda.amp.autocast() and its warning on CPU-only hosts.
        with torch.autocast(device_type="cuda", enabled=torch.cuda.is_available()):
            return pipe(audio_path, max_new_tokens=255)["text"]
    finally:
        os.remove(audio_path)
#hf_writer = gr.HuggingFaceDatasetSaver(hf_token = api_key,dataset_name="interaction-log2")
# Gradio UI: passes the JSON input value to `transcribe` and shows the
# returned text.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.JSON(),
    outputs="text",
    title="Transcriber for Ghanaian-accented speech (English)",
    description="Realtime demo for Ghanaian-accented speech recognition (in English).",
    article="""
By using this app you consent to your voice being used to train the underlying open-source model further.
INSTRUCTIONS FOR USE:
1. Click on record and speak into your microphone
2. Click on stop and submit after you are done speaking.
3. Speech input should not exceed 40s for optimal results.
4. Please wait a few secs after input to see your results.
NB: You might see "no microphone detected" when you first open the app, CONSIDER THAT A MICROPHONE TEST, record anyway and submit. You might see an Error in the output. Now delete the input by clicking the 'x' at the top and record your main input.
The app should run seamlessly in the subsequent inputs.
""",
)

if __name__ == "__main__":
    # Launch exactly once, and only when run as a script. Previously an
    # unguarded launch ran on import and was followed by a second launch
    # here, so the app restarted after the first server was stopped.
    demo.launch(share=True, show_error=True)