File size: 2,747 Bytes
844fd6a
 
 
 
41c20a5
75831ed
09df261
844fd6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1614463
 
d8c2a3a
 
1614463
d8c2a3a
 
 
09df261
 
a1f5239
09df261
 
844fd6a
 
 
 
a7b2a22
844fd6a
 
 
 
 
 
 
 
 
 
 
 
 
a83121d
844fd6a
 
f5f5ed3
844fd6a
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import gradio as gr
import base64
import os
import torch
import numpy as np
#import ast
#import librosa
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperProcessor,
)
from peft import PeftModel, PeftConfig
# ---------------------------------------------------------------------------
# Model setup: executed once, when the module is loaded.
# ---------------------------------------------------------------------------
# Hub id of the PEFT adapter, and the task handed to the tokenizer/processor.
peft_model_id = "Boadiwaa/LORA-colab-Distil-Whisper-medium2"
task = "transcribe"

# The adapter config names the base checkpoint the adapter is applied to;
# every base component below is loaded from that same checkpoint.
peft_config = PeftConfig.from_pretrained(peft_model_id)
_base_checkpoint = peft_config.base_model_name_or_path

# Load the base Whisper model, then wrap it with the adapter weights.
model = PeftModel.from_pretrained(
    WhisperForConditionalGeneration.from_pretrained(
        _base_checkpoint,
        device_map="auto",
    ),
    peft_model_id,
)

tokenizer = WhisperTokenizer.from_pretrained(_base_checkpoint, task=task)
processor = WhisperProcessor.from_pretrained(_base_checkpoint, task=task)
feature_extractor = processor.feature_extractor
#forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)

# Speech-recognition pipeline used by the request handler in this file.
pipe = AutomaticSpeechRecognitionPipeline(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
    model=model,
)

#api_key = os.getenv("HF_API_TOKEN")

def transcribe(data):
    """Transcribe base64-encoded audio and return the recognised text.

    The decoded bytes are written to a temporary file, whose path is handed
    to the module-level ``pipe`` speech-recognition pipeline.

    Args:
        data: Base64-encoded audio file contents, in any form accepted by
            ``base64.b64decode`` (ASCII string or bytes-like object).

    Returns:
        The ``"text"`` entry of the pipeline's output.

    Raises:
        binascii.Error: If ``data`` is not correctly padded base64
            (raised by ``base64.b64decode``).
    """
    import tempfile

    audio_bytes = base64.b64decode(data)

    # Use a unique file per call instead of a fixed "audio.wav": a shared
    # path lets overlapping requests overwrite each other's audio before the
    # pipeline reads it, and leaves the last upload lying in the working dir.
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    try:
        with os.fdopen(fd, "wb") as audio_file:
            audio_file.write(audio_bytes)
        print("Success")

        with torch.cuda.amp.autocast():
            result = pipe(audio_path, max_new_tokens=255)
        return result["text"]
    finally:
        # Always remove the temporary audio, even if decoding/inference fails.
        os.remove(audio_path)
    
#hf_writer = gr.HuggingFaceDatasetSaver(hf_token = api_key,dataset_name="interaction-log2")
# Gradio interface that exposes `transcribe`: it takes a JSON input component
# and shows the returned transcription as text.
# NOTE(review): the `article` text below describes recording from a microphone,
# but the input component is gr.JSON() -- confirm which client supplies the
# base64 audio payload and whether these instructions still apply.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.JSON(),
    outputs="text",
    title="Transcriber for Ghanaian-accented speech (English)",
    description="Realtime demo for Ghanaian-accented speech recognition (in English).",
    # Longer text passed as the interface's `article`: consent notice + usage steps.
    article = """
    By using this app you consent to your voice being used to train the underlying open-source model further.
    
    INSTRUCTIONS FOR USE:
    1. Click on record and speak into your microphone
    2. Click on stop and submit after you are done speaking.
    3. Speech input should not exceed 40s for optimal results.
    4. Please wait a few secs after input to see your results.
    NB: You might see "no microphone detected" when you first open the app, CONSIDER THAT A MICROPHONE TEST, record anyway and submit. You might see an Error in the output. Now delete the input by clicking the 'x' at the top and record your main input.
    The app should run seamlessly in the subsequent inputs.
    """    
)

# Start the server exactly once, and only when this file is run as a script.
# Previously the app was launched unconditionally at import time and then a
# second time under the __main__ guard (without the share/show_error options).
if __name__ == "__main__":
    demo.launch(share=True, show_error=True)