WWMachine committed
Commit 592b831 · verified · 1 Parent(s): f4264e5

Update app.py

Files changed (1)
  1. app.py +94 -112
app.py CHANGED
@@ -3,10 +3,13 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
 from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions
-import time

 # --- Configuration ---
-DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY") # Ensure this is set in Space Settings
+# 1. API KEY: Ensure you have your Deepgram API Key ready
+# Ideally, set this in your environment variables as DEEPGRAM_API_KEY
+DEEPGRAM_API_KEY = "19d640a011569d78395c814e5f875b15cc84deb8"
+
+# 2. Model Config
 REPO_ID = "Kezovic/iris-q4gguf-v2"
 FILENAME = "llama-3.2-1b-instruct.Q4_K_M.gguf"
 CONTEXT_WINDOW = 4096
@@ -14,164 +17,143 @@ MAX_NEW_TOKENS = 512
 TEMPERATURE = 0.7

 # --- Initialize Deepgram ---
-if not DEEPGRAM_API_KEY:
-    print("Error: DEEPGRAM_API_KEY is missing.")
-    deepgram = None
-else:
-    deepgram = DeepgramClient(DEEPGRAM_API_KEY)
+if DEEPGRAM_API_KEY == "YOUR_DEEPGRAM_KEY_HERE":
+    print("WARNING: Please set your DEEPGRAM_API_KEY.")
+
+deepgram = DeepgramClient(DEEPGRAM_API_KEY)

-# --- Load LLM ---
+# --- Model Loading Function ---
 llm = None
 def load_llm():
+    """Downloads the GGUF model and initializes LlamaCPP."""
     global llm
     print("Downloading LLM...")
     try:
-        model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
+        model_path = hf_hub_download(
+            repo_id=REPO_ID,
+            filename=FILENAME
+        )
+        # n_threads=2 is good for free Hugging Face CPU tiers
         llm = Llama(
             model_path=model_path,
             n_ctx=CONTEXT_WINDOW,
             n_threads=2,
             verbose=False
         )
-        print("LLM loaded!")
+        print("LLM loaded successfully!")
+        return llm
     except Exception as e:
         print(f"Error loading model: {e}")
+        return None

+# Load model on startup
 load_llm()

-# --- Helper Functions ---
-
-def transcribe(audio_path):
-    """Converts Speech to Text using Deepgram Nova-2"""
-    if not audio_path or deepgram is None:
-        return None
+# --- 1. Speech-to-Text (Deepgram) ---
+def transcribe_audio(audio_filepath):
+    """Sends audio file to Deepgram and returns text."""
+    if not audio_filepath:
+        return ""
+
     try:
-        with open(audio_path, "rb") as buffer:
+        with open(audio_filepath, "rb") as buffer:
             payload = {"buffer": buffer}
-            options = PrerecordedOptions(smart_format=True, model="nova-2", language="en-US")
+            options = PrerecordedOptions(
+                smart_format=True,
+                model="nova-2",
+                language="en-US"
+            )
             response = deepgram.listen.rest.v("1").transcribe_file(payload, options)
             return response.results.channels[0].alternatives[0].transcript
     except Exception as e:
         print(f"STT Error: {e}")
-        return None
+        return ""

-def speak(text):
-    """Converts Text to Speech using Deepgram Aura"""
-    if not text or deepgram is None:
-        return None
+# --- 2. Text-to-Speech (Deepgram) ---
+def text_to_speech(text):
+    """Sends text to Deepgram and returns path to audio file."""
     try:
-        filename = f"response_{int(time.time())}.mp3"
-        options = SpeakOptions(model="aura-asteria-en", encoding="linear16", container="wav")
+        filename = "output_response.mp3"
+        options = SpeakOptions(
+            model="aura-asteria-en", # Choices: aura-asteria-en, aura-helios-en, etc.
+            encoding="linear16",
+            container="wav"
+        )
+        # Save the audio to a file
         deepgram.speak.rest.v("1").save(filename, {"text": text}, options)
         return filename
     except Exception as e:
         print(f"TTS Error: {e}")
         return None

-# --- Main Logic ---
-
-def run_chat_pipeline(audio_input, history, state_messages):
+# --- 3. Main Pipeline Function ---
+def process_conversation(audio_input):
     """
-    1. Transcribe Audio -> Update UI with User Text
-    2. Query LLM -> Update UI with AI Text
-    3. Generate Audio -> Auto-play response
+    1. Transcribe Audio (STT)
+    2. Query LLM
+    3. Synthesize Speech (TTS)
     """
     if llm is None:
-        return history, state_messages, None
+        return "Model not loaded.", None, "System Error: Model failed to load."

-    # --- Step 1: User Speech to Text ---
-    user_text = transcribe(audio_input)
-
+    # Step A: Transcribe
+    user_text = transcribe_audio(audio_input)
+    print(audio_input)
     if not user_text:
-        # If silence or error, return existing state without changes
-        return history, state_messages, None
+        return "Could not hear audio.", None, ""

-    # Update internal memory (Standard OpenAI/Llama format)
-    state_messages.append({"role": "user", "content": user_text})
-
-    # Update UI History (Gradio Chatbot format: list of [user_msg, bot_msg])
-    # We add the user message temporarily with a pending bot response
-    history.append((user_text, None))
+    print(f"User said: {user_text}")

-    # --- Step 2: LLM Generation ---
-    try:
-        completion = llm.create_chat_completion(
-            messages=state_messages,
-            max_tokens=MAX_NEW_TOKENS,
-            temperature=TEMPERATURE
-        )
-        ai_text = completion['choices'][0]['message']['content']
-    except Exception as e:
-        ai_text = f"Error: {str(e)}"
-
-    # Update internal memory with AI response
-    state_messages.append({"role": "assistant", "content": ai_text})
-
-    # Update UI History: Replace the 'None' with the actual AI text
-    history[-1] = (user_text, ai_text)
-
-    # --- Step 3: Text to Speech ---
-    audio_path = speak(ai_text)
-
-    # Return: Updated Chatbot UI, Updated Internal State, Audio File
-    return history, state_messages, audio_path
-
-# --- Gradio UI Layout ---
-with gr.Blocks(title="Voice Chatbot") as demo:
-    gr.Markdown("## 🎙️ Voice-First AI Chat")
+    # Step B: LLM Inference
+    # Using the prompt format from your original code
+    full_prompt = f"### Human: {user_text}\n### Assistant:"

-    # 1. Visual Conversation History (The "Screen")
-    chatbot = gr.Chatbot(
-        label="Conversation",
-        type="messages", # Uses newer Gradio format if available, else standard
-        height=500
+    output = llm(
+        prompt=full_prompt,
+        max_tokens=MAX_NEW_TOKENS,
+        temperature=TEMPERATURE,
+        stop=["### Human:"],
+        echo=False
     )
+    response_text = output['choices'][0]['text'].strip()
+    print(f"LLM said: {response_text}")

-    # 2. State (Hidden Memory)
-    state_messages = gr.State([]) # Stores [{"role":"user", "content":"..."}, ...]
+    # Step C: Speak Response
+    output_audio_path = text_to_speech(response_text)

-    # 3. Audio Interaction Area
+    # Return: Transcription (for display), Audio (for playback), LLM Text (for display)
+    return user_text, output_audio_path, response_text
+
+# --- Gradio UI ---
+with gr.Blocks(title=f"Voice Chat with {FILENAME}") as demo:
+    gr.Markdown(f"## 🗣️ Deepgram Voice Chat with {FILENAME}")
+
     with gr.Row():
-        with gr.Column(scale=4):
-            # Input Microphone
+        # Input Column
+        with gr.Column():
             audio_input = gr.Audio(
                 sources=["microphone"],
-                type="filepath",
-                label="Record Your Message"
+                type="filepath",
+                label="Speak Now"
             )
-        with gr.Column(scale=1):
-            # Send Button
-            submit_btn = gr.Button("Send Voice 💬", variant="primary")
-            clear_btn = gr.Button("Clear Chat 🗑️")
-
-    # 4. Hidden Output Audio (For Autoplay)
-    # We make it visible=False so it doesn't clutter UI,
-    # but Gradio still plays it if we return it to this component.
-    # Note: Some browsers block autoplay from hidden components.
-    # If it doesn't play, set visible=True.
-    audio_player = gr.Audio(
-        label="AI Voice",
-        autoplay=True,
-        visible=True, # Kept visible for control, can set to False
-        interactive=False
-    )
+            submit_btn = gr.Button("Submit Audio", variant="primary")
+
+        # Output Column
+        with gr.Column():
+            audio_output = gr.Audio(
+                label="Assistant Voice",
+                autoplay=True, # Automatically plays the response
+                interactive=False
+            )
+            # Debugging/Visuals
+            user_transcript = gr.Textbox(label="You said:")
+            ai_response_text = gr.Textbox(label="AI Response:")

-    # --- Event Wiring ---
-
+    # Event Listener
     submit_btn.click(
-        fn=run_chat_pipeline,
-        inputs=[audio_input, chatbot, state_messages],
-        outputs=[chatbot, state_messages, audio_player]
-    )
-
-    # Clear Logic
-    def clear_all():
-        return [], [], None
-
-    clear_btn.click(
-        fn=clear_all,
-        inputs=None,
-        outputs=[chatbot, state_messages, audio_player]
     )

 if __name__ == "__main__":
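
Review notes (editorial, not part of the commit):

1) Key handling: the commit replaces the os.getenv("DEEPGRAM_API_KEY") lookup with a key hardcoded in the source. The "YOUR_DEEPGRAM_KEY_HERE" placeholder check can therefore never fire, and the key itself is now exposed in the Space's public file history, so it should be rotated. A minimal sketch of the environment-variable pattern the new inline comment itself recommends (it assumes the key is stored as a Space secret named DEEPGRAM_API_KEY):

import os

# Read the key from the environment instead of hardcoding it
# (assumes a Space secret named DEEPGRAM_API_KEY is configured).
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
if not DEEPGRAM_API_KEY:
    print("WARNING: DEEPGRAM_API_KEY is not set; add it in Space Settings.")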
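2) Conversation memory: the removed run_chat_pipeline threaded a state_messages list through llm.create_chat_completion(), while the new process_conversation builds a single-turn "### Human: ... ### Assistant:" prompt, so every exchange now starts from an empty context (and the stray print(audio_input) looks like leftover debugging). A sketch of one way to fold history back into the raw-prompt format the commit adopts; the history list of (user, assistant) pairs is hypothetical, not something the new code keeps:

# Hypothetical: carry prior turns into the "### Human:/### Assistant:" format.
# 'history' as a list of (user_text, assistant_text) tuples is assumed here.
def build_prompt(history, user_text):
    turns = [f"### Human: {u}\n### Assistant: {a}" for u, a in history]
    turns.append(f"### Human: {user_text}\n### Assistant:")
    return "\n".join(turns)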
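3) TTS output file: the removed speak() wrote a timestamped response_{int(time.time())}.mp3 (hence the now-deleted import time), while text_to_speech() reuses the fixed name output_response.mp3, so concurrent users can overwrite each other's replies; the .mp3 extension is also misleading, since SpeakOptions(encoding="linear16", container="wav") produces WAV data. A small sketch of one way to make the filenames unique again; the use of tempfile here is an assumption, not part of the commit:

import tempfile

# Create a unique .wav path per request so responses are not overwritten.
def unique_wav_path():
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    return tmp.name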