WWMachine committed
Commit f4264e5 · verified · 1 Parent(s): 9c039c3

Update app.py

Files changed (1)
  1. app.py +112 -94
app.py CHANGED
@@ -3,13 +3,10 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
 from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions
+import time
 
 # --- Configuration ---
-# 1. API KEY: Ensure you have your Deepgram API Key ready
-# Ideally, set this in your environment variables as DEEPGRAM_API_KEY
-DEEPGRAM_API_KEY = "19d640a011569d78395c814e5f875b15cc84deb8"
-
-# 2. Model Config
+DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")  # Ensure this is set in Space Settings
 REPO_ID = "Kezovic/iris-q4gguf-v2"
 FILENAME = "llama-3.2-1b-instruct.Q4_K_M.gguf"
 CONTEXT_WINDOW = 4096
@@ -17,143 +14,164 @@ MAX_NEW_TOKENS = 512
 TEMPERATURE = 0.7
 
 # --- Initialize Deepgram ---
-if DEEPGRAM_API_KEY == "YOUR_DEEPGRAM_KEY_HERE":
-    print("WARNING: Please set your DEEPGRAM_API_KEY.")
-
-deepgram = DeepgramClient(DEEPGRAM_API_KEY)
+if not DEEPGRAM_API_KEY:
+    print("Error: DEEPGRAM_API_KEY is missing.")
+    deepgram = None
+else:
+    deepgram = DeepgramClient(DEEPGRAM_API_KEY)
 
-# --- Model Loading Function ---
+# --- Load LLM ---
 llm = None
 def load_llm():
-    """Downloads the GGUF model and initializes LlamaCPP."""
     global llm
     print("Downloading LLM...")
     try:
-        model_path = hf_hub_download(
-            repo_id=REPO_ID,
-            filename=FILENAME
-        )
-        # n_threads=2 is good for free Hugging Face CPU tiers
+        model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
         llm = Llama(
             model_path=model_path,
             n_ctx=CONTEXT_WINDOW,
             n_threads=2,
             verbose=False
         )
-        print("LLM loaded successfully!")
-        return llm
+        print("LLM loaded!")
     except Exception as e:
         print(f"Error loading model: {e}")
-        return None
 
-# Load model on startup
 load_llm()
 
-# --- 1. Speech-to-Text (Deepgram) ---
-def transcribe_audio(audio_filepath):
-    """Sends audio file to Deepgram and returns text."""
-    if not audio_filepath:
-        return ""
-
+# --- Helper Functions ---
+
+def transcribe(audio_path):
+    """Converts speech to text using Deepgram Nova-2."""
+    if not audio_path or deepgram is None:
+        return None
     try:
-        with open(audio_filepath, "rb") as buffer:
+        with open(audio_path, "rb") as buffer:
             payload = {"buffer": buffer}
-            options = PrerecordedOptions(
-                smart_format=True,
-                model="nova-2",
-                language="en-US"
-            )
+            options = PrerecordedOptions(smart_format=True, model="nova-2", language="en-US")
             response = deepgram.listen.rest.v("1").transcribe_file(payload, options)
             return response.results.channels[0].alternatives[0].transcript
     except Exception as e:
         print(f"STT Error: {e}")
-        return ""
+        return None
 
-# --- 2. Text-to-Speech (Deepgram) ---
-def text_to_speech(text):
-    """Sends text to Deepgram and returns path to audio file."""
+def speak(text):
+    """Converts text to speech using Deepgram Aura."""
+    if not text or deepgram is None:
+        return None
     try:
-        filename = "output_response.mp3"
-        options = SpeakOptions(
-            model="aura-asteria-en",  # Choices: aura-asteria-en, aura-helios-en, etc.
-            encoding="linear16",
-            container="wav"
-        )
-        # Save the audio to a file
+        # container="wav", so give the file a matching .wav extension
+        filename = f"response_{int(time.time())}.wav"
+        options = SpeakOptions(model="aura-asteria-en", encoding="linear16", container="wav")
         deepgram.speak.rest.v("1").save(filename, {"text": text}, options)
         return filename
     except Exception as e:
         print(f"TTS Error: {e}")
         return None
 
-# --- 3. Main Pipeline Function ---
-def process_conversation(audio_input):
+# --- Main Logic ---
+
+def run_chat_pipeline(audio_input, history, state_messages):
     """
-    1. Transcribe Audio (STT)
-    2. Query LLM
-    3. Synthesize Speech (TTS)
+    1. Transcribe audio -> update UI with user text
+    2. Query LLM -> update UI with AI text
+    3. Generate audio -> auto-play response
     """
     if llm is None:
-        return "Model not loaded.", None, "System Error: Model failed to load."
+        return history, state_messages, None
 
-    # Step A: Transcribe
-    user_text = transcribe_audio(audio_input)
-    print(audio_input)
+    # --- Step 1: User Speech to Text ---
+    user_text = transcribe(audio_input)
+
     if not user_text:
-        return "Could not hear audio.", None, ""
-
-    print(f"User said: {user_text}")
+        # On silence or error, return the existing state unchanged
+        return history, state_messages, None
 
-    # Step B: LLM Inference
-    # Using the prompt format from your original code
-    full_prompt = f"### Human: {user_text}\n### Assistant:"
+    # Update internal memory (standard OpenAI/Llama message format)
+    state_messages.append({"role": "user", "content": user_text})
 
-    output = llm(
-        prompt=full_prompt,
-        max_tokens=MAX_NEW_TOKENS,
-        temperature=TEMPERATURE,
-        stop=["### Human:"],
-        echo=False
-    )
-    response_text = output['choices'][0]['text'].strip()
-    print(f"LLM said: {response_text}")
+    # Update UI history (the Chatbot uses type="messages", so it takes
+    # the same {"role": ..., "content": ...} dicts as the internal state)
+    history.append({"role": "user", "content": user_text})
 
-    # Step C: Speak Response
-    output_audio_path = text_to_speech(response_text)
+    # --- Step 2: LLM Generation ---
+    try:
+        completion = llm.create_chat_completion(
+            messages=state_messages,
+            max_tokens=MAX_NEW_TOKENS,
+            temperature=TEMPERATURE
+        )
+        ai_text = completion['choices'][0]['message']['content']
+    except Exception as e:
+        ai_text = f"Error: {str(e)}"
+
+    # Update internal memory with the AI response
+    state_messages.append({"role": "assistant", "content": ai_text})
 
-    # Return: Transcription (for display), Audio (for playback), LLM Text (for display)
-    return user_text, output_audio_path, response_text
+    # Update UI history with the AI response
+    history.append({"role": "assistant", "content": ai_text})
 
-# --- Gradio UI ---
-with gr.Blocks(title=f"Voice Chat with {FILENAME}") as demo:
-    gr.Markdown(f"## 🗣️ Deepgram Voice Chat with {FILENAME}")
+    # --- Step 3: Text to Speech ---
+    audio_path = speak(ai_text)
+
+    # Return: updated Chatbot UI, updated internal state, audio file
+    return history, state_messages, audio_path
+
+# --- Gradio UI Layout ---
+with gr.Blocks(title="Voice Chatbot") as demo:
+    gr.Markdown("## 🎙️ Voice-First AI Chat")
 
+    # 1. Visual conversation history (the "screen")
+    chatbot = gr.Chatbot(
+        label="Conversation",
+        type="messages",  # expects {"role": ..., "content": ...} dicts
+        height=500
+    )
+
+    # 2. State (hidden memory)
+    state_messages = gr.State([])  # stores [{"role": "user", "content": "..."}, ...]
+
+    # 3. Audio interaction area
    with gr.Row():
-        # Input Column
-        with gr.Column():
+        with gr.Column(scale=4):
+            # Input microphone
            audio_input = gr.Audio(
                sources=["microphone"],
-                type="filepath",
-                label="Speak Now"
+                type="filepath",
+                label="Record Your Message"
            )
-            submit_btn = gr.Button("Submit Audio", variant="primary")
-
-        # Output Column
-        with gr.Column():
-            audio_output = gr.Audio(
-                label="Assistant Voice",
-                autoplay=True,  # Automatically plays the response
-                interactive=False
-            )
-            # Debugging/Visuals
-            user_transcript = gr.Textbox(label="You said:")
-            ai_response_text = gr.Textbox(label="AI Response:")
+        with gr.Column(scale=1):
+            submit_btn = gr.Button("Send Voice 💬", variant="primary")
+            clear_btn = gr.Button("Clear Chat 🗑️")
+
+    # 4. Output audio (for autoplay)
+    # This could be visible=False to reduce clutter, but some browsers
+    # block autoplay from hidden components, so it is kept visible.
+    audio_player = gr.Audio(
+        label="AI Voice",
+        autoplay=True,
+        visible=True,
+        interactive=False
+    )
 
-    # Event Listener
+    # --- Event Wiring ---
+
    submit_btn.click(
-        fn=process_conversation,
-        inputs=[audio_input],
-        outputs=[user_transcript, audio_output, ai_response_text]
+        fn=run_chat_pipeline,
+        inputs=[audio_input, chatbot, state_messages],
+        outputs=[chatbot, state_messages, audio_player]
+    )
+
+    # Clear logic
+    def clear_all():
+        return [], [], None
+
+    clear_btn.click(
+        fn=clear_all,
+        inputs=None,
+        outputs=[chatbot, state_messages, audio_player]
    )
 
 if __name__ == "__main__":
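
With the hardcoded key gone, the app expects the key to arrive through the environment, e.g. as a Hugging Face Space secret named DEEPGRAM_API_KEY. A minimal sketch of that pattern, assuming the secret name above (for a local run, export the variable in the shell before starting the app):

import os
from deepgram import DeepgramClient

# The key comes from the environment (Space secret or `export DEEPGRAM_API_KEY=...`),
# never from source code.
api_key = os.getenv("DEEPGRAM_API_KEY")
if not api_key:
    raise RuntimeError("DEEPGRAM_API_KEY is not set; add it as a Space secret.")
deepgram = DeepgramClient(api_key)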
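
The new pipeline keeps conversation memory as a list of {"role", "content"} dicts and passes the whole list to llama_cpp's create_chat_completion, which applies the model's chat template. A minimal sketch of that loop in isolation, runnable without Gradio or Deepgram; MODEL_PATH is a placeholder for a local GGUF file:

from llama_cpp import Llama

MODEL_PATH = "llama-3.2-1b-instruct.Q4_K_M.gguf"  # placeholder path
llm = Llama(model_path=MODEL_PATH, n_ctx=4096, n_threads=2, verbose=False)
messages = []  # accumulated {"role": ..., "content": ...} turns

def chat(user_text):
    # Append the user turn, generate from the full history,
    # then append the assistant turn so it is visible next call.
    messages.append({"role": "user", "content": user_text})
    completion = llm.create_chat_completion(
        messages=messages, max_tokens=512, temperature=0.7
    )
    ai_text = completion["choices"][0]["message"]["content"]
    messages.append({"role": "assistant", "content": ai_text})
    return ai_text

print(chat("Hello!"))
print(chat("What did I just say?"))  # memory carries across turns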
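
transcribe() passes the open file handle as the buffer; Deepgram's own SDK examples read the file into bytes first. A hedged sketch of the same STT/TTS round trip with a bytes payload, assuming Deepgram Python SDK v3, a valid key in the environment, and "input.wav" as a placeholder recording:

import os
from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions

deepgram = DeepgramClient(os.getenv("DEEPGRAM_API_KEY"))

# STT: send raw bytes rather than the file object
with open("input.wav", "rb") as f:
    payload = {"buffer": f.read()}
options = PrerecordedOptions(smart_format=True, model="nova-2", language="en-US")
response = deepgram.listen.rest.v("1").transcribe_file(payload, options)
text = response.results.channels[0].alternatives[0].transcript

# TTS: linear16 in a WAV container, saved under a matching .wav name
speak_options = SpeakOptions(model="aura-asteria-en", encoding="linear16", container="wav")
deepgram.speak.rest.v("1").save("reply.wav", {"text": text}, speak_options)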