File size: 2,308 Bytes
9cef7ee
 
 
 
 
 
85e2f97
5d5384f
efc7e86
 
 
 
 
 
 
 
 
9cef7ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be4cc6c
9cef7ee
 
 
 
 
be4cc6c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9cef7ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import whisper
import gradio as gr
from gtts import gTTS
from groq import Groq




# Read the Groq API key from the environment. The canonical name is
# GROQ_API_KEY (which is what the error message below tells users to set);
# "Groq_Api_Key" is still checked for backward compatibility with
# deployments that configured the secret under the old mixed-case name.
api_key = os.environ.get("GROQ_API_KEY") or os.environ.get("Groq_Api_Key")

if not api_key:
    raise ValueError("GROQ_API_KEY is not set. Please add it in the Hugging Face Secrets.")

# Initialize the Groq client used for the Llama chat completions below.
client = Groq(api_key=api_key)

# Load the Whisper "base" speech-to-text model once at startup so each
# request does not pay the model-loading cost.
whisper_model = whisper.load_model("base")

# Function to process audio input
def process_audio_realtime(audio_file):
    """
    Process a single audio clip end-to-end.

    1. Transcribe the audio to text with Whisper.
    2. Send the transcription to a Llama model via the Groq API.
    3. Convert the model's reply back to speech with gTTS.

    Args:
        audio_file: Filesystem path to the input audio (as supplied by the
            Gradio ``Audio`` component with ``type="filepath"``).

    Returns:
        tuple[str, str]: The Llama response text and the path to the
        generated MP3 file.
    """
    # Step 1: Transcribe the audio to text using Whisper.
    transcription = whisper_model.transcribe(audio_file)["text"]

    # Step 2: Process the transcription with the Llama model via the Groq
    # API. max_tokens keeps the reply short so the TTS step stays fast.
    # (A stale duplicate of this call, wrapped in a malformed """" string
    # literal that executed as a dead no-op expression, was removed here.)
    llama_response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant. Please provide a concise and accurate response."},
            {"role": "user", "content": transcription}
        ],
        model="llama3-8b-8192",  # Replace with your actual Llama model name
        max_tokens=70,
        stream=False
    ).choices[0].message.content

    # Step 3: Convert the Llama response to audio using gTTS.
    tts = gTTS(text=llama_response, lang="en")
    audio_output_path = "generated_output.mp3"
    tts.save(audio_output_path)

    return llama_response, audio_output_path

# Create Gradio interface for real-time simulation
# Assemble the Gradio front-end that drives process_audio_realtime.
# Output components are named up front for readability.
text_output = gr.Textbox(label="Processed Text")                  # Llama reply text
audio_output = gr.Audio(type="filepath", label="Generated Audio")  # gTTS result

interface = gr.Interface(
    fn=process_audio_realtime,
    inputs=gr.Audio(type="filepath", label="Input Audio"),  # Removed `source` argument
    outputs=[text_output, audio_output],
    live=True,  # Enable real-time behavior
    title="Real-Time Audio-to-Audio Application",
)

# Launch Gradio app
interface.launch()