Spaces:

dmcartor
/

ASR_Starter_Project

Sleeping

File size: 3,050 Bytes

8d647a4
bd771cb
ca4ed36
bd771cb
ca4ed36
 
bd771cb
ca4ed36
e7c0d26
ca4ed36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7dfdabd
bd771cb
8d647a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7dfdabd
 
 
ca4ed36
7dfdabd
ca4ed36
 
7dfdabd
bd771cb
7dfdabd
bd771cb
8d647a4
 
 
 
9eecb2d
8d647a4
 
 
 
 
 
 
 
 
 
bd771cb
 
 
e7c0d26
bd771cb
 
 
8d647a4
 
ca4ed36

import gradio as gr
from transformers import pipeline
import whisper

# Instantiate the "large" Whisper checkpoint once at import time; transcribe() reuses it
model = whisper.load_model(name="large")

# Define the function for ASR with language detection
def transcribe(audio) -> str:
    """Detect the spoken language of an audio clip and transcribe it.

    Only the first 30 seconds are processed: the clip is padded or trimmed
    to a 30-second window before the spectrogram is computed and decoded.

    Args:
        audio: Path to the audio file handed over by the Gradio ``Audio``
            input, or ``None``/empty when nothing was uploaded or recorded.

    Returns:
        A string containing the detected language and the transcription, or
        a short prompt asking for audio when no input was supplied.
    """
    # Submitting the form without audio yields None; answer with a hint
    # instead of crashing inside whisper.load_audio.
    if not audio:
        return "No audio provided. Please upload a file or record audio first."

    # Load audio and pad/trim it to fit 30 seconds
    audio_data = whisper.load_audio(audio)
    audio_data = whisper.pad_or_trim(audio_data)

    # Build the log-Mel spectrogram with the number of Mel bins this
    # checkpoint was trained with (not every checkpoint uses the library
    # default), then move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(audio_data, n_mels=model.dims.n_mels)
    mel = mel.to(model.device)

    # Detect the spoken language: pick the entry with the highest probability
    _, probs = model.detect_language(mel)
    detected_language = max(probs, key=probs.get)

    # Decode the audio. Passing the detected language avoids a second
    # detection pass inside decode(); half precision is only enabled off-CPU
    # because Whisper does not support FP16 inference on CPU.
    options = whisper.DecodingOptions(
        language=detected_language,
        fp16=model.device.type != "cpu",
    )
    result = whisper.decode(model, mel, options)

    return f"Detected language: {detected_language}\n\nTranscription: {result.text}"

# Inference client that respond() streams chat completions from
from huggingface_hub import InferenceClient

client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta")

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat reply for ``message`` given the prior conversation.

    Args:
        message: The latest user message.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs;
            empty entries in a pair are skipped.
        system_message: Text sent as the leading system prompt.
        max_tokens: Forwarded to ``client.chat_completion`` as ``max_tokens``.
        temperature: Forwarded to ``client.chat_completion``.
        top_p: Forwarded to ``client.chat_completion``.

    Yields:
        The reply accumulated so far, once per streamed chunk, so the caller
        can display the answer as it grows.
    """
    messages = [{"role": "system", "content": system_message}]

    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # The loop variable is deliberately not called `message`, so the
    # function parameter is never shadowed by a stream chunk.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        # Some stream chunks carry no choices at all; nothing to append.
        if not chunk.choices:
            continue

        token = chunk.choices[0].delta.content

        # delta.content can be None (e.g. on the closing chunk); appending it
        # would raise TypeError, so only concatenate real text.
        if token:
            response += token
        yield response

# ASR section: a single audio input whose file path is handed to transcribe(),
# with the result shown as plain text
asr_interface = gr.Interface(
    title="ASR Transcription with Language Detection",
    description="Upload an audio file or record audio directly to get the transcription and detected language.",
    fn=transcribe,
    inputs=gr.Audio(label="Upload or record audio", type="filepath"),
    outputs="text",
)

# Chat section: respond() drives the conversation; the extra inputs below are
# listed in the same order as respond()'s trailing parameters
# (system_message, max_tokens, temperature, top_p)
chat_interface = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(label="System message", value="You are a friendly Chatbot."),
        gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=512),
        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7),
        gr.Slider(label="Top-p (nucleus sampling)", minimum=0.1, maximum=1.0, step=0.05, value=0.95),
    ],
)

# Combine the two interfaces into a single Gradio Blocks application.
# Statement order matters here: heading, spacer, ASR interface, separator,
# then the chat interface.
with gr.Blocks() as demo:
    gr.Markdown("# ASR and Chatbot Application")
    gr.Markdown(" ")  # Adding space between the top and the ASR interface
    asr_interface.render()
    gr.Markdown("----")  # Separator between the ASR and chat interfaces
    chat_interface.render()

# Launch the app only when this file is run directly, not when it is imported
if __name__ == "__main__":
    demo.launch()