1MR committed on
Commit
52296e0
Β·
verified Β·
1 Parent(s): 7895244

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -95
app.py CHANGED
@@ -1,129 +1,86 @@
1
  import argparse
2
- from typing import Generator, Tuple
3
-
4
  import numpy as np
5
- from fastrtc import (
6
- AlgoOptions,
7
- ReplyOnPause,
8
- Stream,
9
- audio_to_bytes,
10
- )
11
- from groq import Groq
12
  from loguru import logger
13
- from process_groq_tts import process_groq_tts
 
 
14
  from simple_math_agent import agent, agent_config
15
- import os
 
16
 
 
17
  os.environ["GROQ_API_KEY"] = "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"
18
- logger.remove()
19
- logger.add(
20
- lambda msg: print(msg),
21
- colorize=True,
22
- format="<green>{time:HH:mm:ss}</green> | <level>{level}</level> | <level>{message}</level>",
23
- )
24
 
25
- groq_client = Groq(api_key="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")
 
 
 
26
 
27
 
28
- def response(
29
- audio: tuple[int, np.ndarray],
30
- ) -> Generator[Tuple[int, np.ndarray], None, None]:
31
  """
32
- Process audio input, transcribe it, generate a response using LangGraph, and deliver TTS audio.
33
-
34
- Args:
35
- audio: Tuple containing sample rate and audio data
36
-
37
- Yields:
38
- Tuples of (sample_rate, audio_array) for audio playback
39
  """
40
  logger.info("πŸŽ™οΈ Received audio input")
41
 
42
- logger.debug("πŸ”„ Transcribing audio...")
43
- import whisper
44
- import wave
45
- import tempfile
46
- import os
 
 
 
 
 
 
 
47
 
 
48
  model = whisper.load_model("base")
49
-
50
- # Create a temporary WAV file
51
- temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
52
- temp_file.close()
53
-
54
- try:
55
- # Convert audio data to bytes and save as WAV
56
- audio_bytes = audio_to_bytes(audio)
57
-
58
- # Save as WAV file using wave module
59
- with wave.open(temp_file.name, 'wb') as wav_file:
60
- wav_file.setnchannels(1) # mono audio
61
- wav_file.setsampwidth(2) # 16-bit audio
62
- wav_file.setframerate(audio[0]) # sample rate
63
- wav_file.writeframes(audio_bytes)
64
-
65
- # Transcribe the audio
66
- result = model.transcribe(temp_file.name, language="ar")
67
- transcript = result["text"]
68
-
69
- finally:
70
- # Clean up the temporary file
71
- if os.path.exists(temp_file.name):
72
- os.remove(temp_file.name)
73
-
74
  logger.info(f'πŸ‘‚ Transcribed: "{transcript}"')
75
 
76
- logger.debug("🧠 Running agent...")
77
  agent_response = agent.invoke(
78
- {"messages": [{"role": "user", "content": transcript}]}, config=agent_config
 
79
  )
80
  response_text = agent_response["messages"][-1].content
81
  logger.info(f'πŸ’¬ Response: "{response_text}"')
82
 
83
- logger.debug("πŸ”Š Generating speech...")
84
  tts_response = groq_client.audio.speech.create(
85
  model="playai-tts-arabic",
86
  voice="Ahmad-PlayAI",
87
  response_format="wav",
88
  input=response_text,
89
  )
90
- yield from process_groq_tts(tts_response)
91
 
 
 
 
92
 
93
- def create_stream() -> Stream:
94
- """
95
- Create and configure a Stream instance with audio capabilities.
96
 
97
- Returns:
98
- Stream: Configured FastRTC Stream instance
99
- """
100
- return Stream(
101
- modality="audio",
102
- mode="send-receive",
103
- handler=ReplyOnPause(
104
- response,
105
- algo_options=AlgoOptions(
106
- speech_threshold=0.5,
107
- ),
108
- ),
109
- )
110
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  if __name__ == "__main__":
113
- parser = argparse.ArgumentParser(description="FastRTC Groq Voice Agent")
114
- parser.add_argument(
115
- "--phone",
116
- action="store_true",
117
- help="Launch with FastRTC phone interface (get a temp phone number)",
118
- )
119
- args = parser.parse_args()
120
-
121
- stream = create_stream()
122
- logger.info("🎧 Stream handler configured")
123
-
124
- if args.phone:
125
- logger.info("Launching with FastRTC phone interface...")
126
- stream.fastphone(share=True)
127
- else:
128
- logger.info("Launching with Gradio UI...")
129
- stream.ui.launch(share=True)
 
1
  import argparse
 
 
2
  import numpy as np
3
+ import tempfile
4
+ import wave
5
+ import os
 
 
 
 
6
  from loguru import logger
7
+ import gradio as gr
8
+ import whisper
9
+ from groq import Groq
10
  from simple_math_agent import agent, agent_config
11
+ from process_groq_tts import process_groq_tts
12
+ from fastrtc import audio_to_bytes # keep using it for conversion
13
 
14
+ # βœ… Set your API key safely
15
  os.environ["GROQ_API_KEY"] = "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"
16
+ groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])
 
 
 
 
 
17
 
18
+ logger.remove()
19
+ logger.add(lambda msg: print(msg),
20
+ colorize=True,
21
+ format="<green>{time:HH:mm:ss}</green> | <level>{level}</level> | <level>{message}</level>")
22
 
23
 
24
+ def process_audio(audio):
 
 
25
  """
26
+ Take an uploaded or recorded audio file, transcribe it, generate an AI response,
27
+ and return the spoken audio as a WAV.
 
 
 
 
 
28
  """
29
  logger.info("πŸŽ™οΈ Received audio input")
30
 
31
+ # If audio is a file path (Gradio format)
32
+ if isinstance(audio, tuple):
33
+ sr, data = audio
34
+ audio_bytes = audio_to_bytes((sr, data))
35
+ temp_path = tempfile.mktemp(suffix=".wav")
36
+ with wave.open(temp_path, 'wb') as wav_file:
37
+ wav_file.setnchannels(1)
38
+ wav_file.setsampwidth(2)
39
+ wav_file.setframerate(sr)
40
+ wav_file.writeframes(audio_bytes)
41
+ else:
42
+ temp_path = audio
43
 
44
+ # 🎧 Transcribe using Whisper
45
  model = whisper.load_model("base")
46
+ result = model.transcribe(temp_path, language="ar")
47
+ transcript = result["text"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  logger.info(f'πŸ‘‚ Transcribed: "{transcript}"')
49
 
50
+ # 🧠 Run agent
51
  agent_response = agent.invoke(
52
+ {"messages": [{"role": "user", "content": transcript}]},
53
+ config=agent_config
54
  )
55
  response_text = agent_response["messages"][-1].content
56
  logger.info(f'πŸ’¬ Response: "{response_text}"')
57
 
58
+ # πŸ”Š Generate speech with Groq
59
  tts_response = groq_client.audio.speech.create(
60
  model="playai-tts-arabic",
61
  voice="Ahmad-PlayAI",
62
  response_format="wav",
63
  input=response_text,
64
  )
 
65
 
66
+ output_path = tempfile.mktemp(suffix=".wav")
67
+ with open(output_path, "wb") as f:
68
+ f.write(tts_response.read())
69
 
70
+ return response_text, output_path
 
 
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
+ # βœ… Use Gradio UI (no RTC, fully compatible with Spaces)
74
+ demo = gr.Interface(
75
+ fn=process_audio,
76
+ inputs=gr.Audio(sources=["microphone", "upload"], type="filepath", label="🎀 Speak or Upload Audio"),
77
+ outputs=[
78
+ gr.Textbox(label="πŸ’¬ Transcription + Response"),
79
+ gr.Audio(label="οΏ½οΏ½οΏ½οΏ½ AI Voice Reply")
80
+ ],
81
+ title="πŸŽ™οΈ Groq Voice Assistant",
82
+ description="Speak Arabic and get an intelligent spoken reply (STT β†’ Agent β†’ TTS)"
83
+ )
84
 
85
  if __name__ == "__main__":
86
+ demo.launch(server_port=None, share=True)