File size: 2,899 Bytes
1b6a625
 
 
 
 
 
 
 
 
 
 
c3841bf
1b6a625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import gradio as gr
import assemblyai as aai
from cerebras.cloud.sdk import Cerebras
from gtts import gTTS
import tempfile

# Read service credentials from the environment (must be set before launch).
assembly_api_key = os.getenv("AssemblyVoice")  # AssemblyAI transcription key
cerebras_api_key = os.getenv("CerebrasAI")     # Cerebras inference key

# Configure AssemblyAI's global settings with the transcription key.
aai.settings.api_key = assembly_api_key

# Client for the Cerebras chat-completions API (used for the LLM response).
client = Cerebras(
    api_key=cerebras_api_key,
)

def process_audio(audio):
    """Transcribe recorded audio, generate an LLM reply, and return it as speech.

    Parameters
    ----------
    audio : str | file-like | None
        Path to the recorded audio file (Gradio's ``type="filepath"`` input
        passes a string), or a readable binary object; ``None`` when the user
        submitted without recording.

    Returns
    -------
    str
        Path to an MP3 file containing the spoken response, or a plain error
        message string when no audio arrived or transcription failed.
    """
    # Guard: Gradio sends None when no recording was made.
    if audio is None:
        return "No audio file received."

    if isinstance(audio, str):
        # Already a path on disk (the normal Gradio filepath case).
        audio_file_path = audio
    else:
        # Persist raw audio bytes to a temporary file for the transcriber.
        # NamedTemporaryFile(delete=False) replaces the deprecated, race-prone
        # tempfile.mktemp: the file is created atomically with a unique name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
            f.write(audio.read())
            audio_file_path = f.name

    # Upload the audio to AssemblyAI and wait for the transcription result.
    transcriber = aai.Transcriber()
    transcript = transcriber.transcribe(audio_file_path)

    if transcript.status == aai.TranscriptStatus.error:
        return f"Error transcribing audio: {transcript.error}"

    transcript_text = transcript.text
    print(f"Transcription: {transcript_text}")

    # Generate a reply with Llama 3.3 via Cerebras, streamed chunk by chunk.
    stream = client.chat.completions.create(
        messages=[{
            "role": "system", "content": "Conversation will be started in this chat. Try as much as possible to provide concise and informed responses to the prompt."
        }, {
            "role": "user", "content": transcript_text
        }],
        model="llama-3.3-70b",
        stream=True,
        max_completion_tokens=1024,
        temperature=0.4,
        top_p=1
    )

    # Concatenate the streamed deltas; some chunks carry None content.
    response_text = "".join(chunk.choices[0].delta.content or "" for chunk in stream)
    print(f"Response from LLM: {response_text}")

    # Synthesize the reply with Google Text-to-Speech.
    tts = gTTS(text=response_text, lang='en', slow=False)

    # Save the spoken response to a temporary MP3 and hand the path to Gradio.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tts.save(tmp_file.name)
        audio_path = tmp_file.name

    return audio_path

# Gradio Interface
# Gradio interface: microphone recording in, synthesized MP3 response out.
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Audio(
        type="filepath",
        label="Generated Response Audio",
        show_download_button=True,
        waveform_options=gr.WaveformOptions(
            waveform_color="#01C6FF",
            waveform_progress_color="#0066B4",
            skip_length=2,
            show_controls=False,
        ),
    ),
    title="Xplayn: Voice-to-Audio AI",
    description="Record your voice, and the system will transcribe it, generate a response using Llama 3.3, and return the response as audio.",
)

# Launch only when executed as a script so that importing this module
# (e.g. from tests or tooling) does not start a web server as a side effect.
if __name__ == "__main__":
    interface.launch()