import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions
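# Dependencies (assumed versions): this script expects
#   pip install gradio llama-cpp-python huggingface_hub deepgram-sdk
# deepgram-sdk v3+ provides the listen.rest / speak.rest interface used below.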

# --- Configuration ---
# 1. API KEY: Ensure you have your Deepgram API Key ready
# Ideally, set this in your environment variables as DEEPGRAM_API_KEY
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "YOUR_DEEPGRAM_KEY_HERE")

# 2. Model Config
REPO_ID = "Kezovic/iris-q4gguf-v2"
FILENAME = "llama-3.2-1b-instruct.Q4_K_M.gguf"
CONTEXT_WINDOW = 4096 
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.7 
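# Note: CONTEXT_WINDOW caps the prompt + response tokens handled per call;
# TEMPERATURE near 0 gives near-deterministic replies, higher values more varied ones.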

# --- Initialize Deepgram ---
if DEEPGRAM_API_KEY == "YOUR_DEEPGRAM_KEY_HERE":
    print("WARNING: Please set your DEEPGRAM_API_KEY.")
    
deepgram = DeepgramClient(DEEPGRAM_API_KEY)

# --- Model Loading Function ---
llm = None
def load_llm():
    """Downloads the GGUF model and initializes LlamaCPP."""
    global llm
    print("Downloading LLM...")
    try:
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME
        )
        # n_threads=2 is good for free Hugging Face CPU tiers
        llm = Llama(
            model_path=model_path,
            n_ctx=CONTEXT_WINDOW,
            n_threads=2,
            verbose=False 
        )
        print("LLM loaded successfully!")
        return llm
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# Load model on startup
load_llm()
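# Note: hf_hub_download caches the GGUF locally, so only the first start pays
# the download cost; loading at import time keeps the UI from serving requests
# before the model is ready.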

# --- 1. Speech-to-Text (Deepgram) ---
def transcribe_audio(audio_filepath):
    """Sends audio file to Deepgram and returns text."""
    if not audio_filepath:
        return ""
    
    try:
        with open(audio_filepath, "rb") as audio_file:
            # The Deepgram SDK's FileSource expects raw bytes, not a file handle
            payload = {"buffer": audio_file.read()}
            options = PrerecordedOptions(
                smart_format=True, 
                model="nova-2", 
                language="en-US"
            )
            response = deepgram.listen.rest.v("1").transcribe_file(payload, options)
            return response.results.channels[0].alternatives[0].transcript
    except Exception as e:
        print(f"STT Error: {e}")
        return ""

# --- 2. Text-to-Speech (Deepgram) ---
def text_to_speech(text):
    """Sends text to Deepgram and returns path to audio file."""
    try:
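        # NOTE: a fixed output filename means concurrent requests overwrite each
        # other's audio; tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        # would be a safer choice for a multi-user deployment.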
        filename = "output_response.wav"  # matches the linear16/WAV options below
        options = SpeakOptions(
            model="aura-asteria-en", # Choices: aura-asteria-en, aura-helios-en, etc.
            encoding="linear16", 
            container="wav"
        )
        # Save the audio to a file
        deepgram.speak.rest.v("1").save(filename, {"text": text}, options)
        return filename
    except Exception as e:
        print(f"TTS Error: {e}")
        return None

# --- 3. Main Pipeline Function ---
def process_conversation(audio_input):
    """
    1. Transcribe Audio (STT)
    2. Query LLM
    3. Synthesize Speech (TTS)
    """
    if llm is None:
        return "Model not loaded.", None, "System Error: Model failed to load."

    # Step A: Transcribe
    user_text = transcribe_audio(audio_input)
    if not user_text:
        return "Could not hear audio.", None, ""

    print(f"User said: {user_text}")

    # Step B: LLM Inference
    # Simple "### Human / ### Assistant" prompt format
    full_prompt = f"### Human: {user_text}\n### Assistant:"
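    # Alternative (untested here): llama-cpp-python's
    #   llm.create_chat_completion(messages=[{"role": "user", "content": user_text}])
    # applies the chat template embedded in the GGUF, which may suit
    # Llama 3.2 Instruct better than this raw "### Human/Assistant" format.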
    
    output = llm(
        prompt=full_prompt, 
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        stop=["### Human:"], 
        echo=False
    )
    response_text = output['choices'][0]['text'].strip()
    print(f"LLM said: {response_text}")

    # Step C: Speak Response
    output_audio_path = text_to_speech(response_text)

    # Return: Transcription (for display), Audio (for playback), LLM Text (for display)
    return user_text, output_audio_path, response_text

# --- Gradio UI ---
with gr.Blocks(title=f"Voice Chat with {FILENAME}") as demo:
    gr.Markdown(f"## 🗣️ Deepgram Voice Chat with {FILENAME}")
    
    with gr.Row():
        # Input Column
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone"], 
                type="filepath", 
                label="Speak Now"
            )
            submit_btn = gr.Button("Submit Audio", variant="primary")

        # Output Column
        with gr.Column():
            audio_output = gr.Audio(
                label="Assistant Voice", 
                autoplay=True, # Automatically plays the response
                interactive=False
            )
            # Debugging/Visuals
            user_transcript = gr.Textbox(label="You said:")
            ai_response_text = gr.Textbox(label="AI Response:")

    # Event Listener
    submit_btn.click(
        fn=process_conversation,
        inputs=[audio_input],
        outputs=[user_transcript, audio_output, ai_response_text]
    )

if __name__ == "__main__":
    demo.launch()
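    # Tip: demo.launch(share=True) creates a temporary public URL when
    # running outside Hugging Face Spaces.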