# app.py — Groq voice-to-voice assistant (Whisper STT → Llama 3.1 → Orpheus TTS)
# Designed to run on Hugging Face Spaces.
import os
import gradio as gr
from groq import Groq
from pathlib import Path
# Initialize Groq client using the secret stored in Hugging Face.
# NOTE(review): if GROQ_API_KEY is unset, api_key is None and the Groq
# client will only fail on the first API call, not here — consider
# failing fast with a clear message; confirm desired startup behavior.
api_key = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=api_key)
def _transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* with Groq Whisper.

    Returns the transcribed text.
    """
    with open(audio_path, "rb") as file:
        transcription = client.audio.transcriptions.create(
            file=(audio_path, file.read()),
            model="whisper-large-v3",
            temperature=0,  # deterministic transcription
        )
    return transcription.text


def _generate_reply(user_text):
    """Generate a short assistant reply to *user_text* with Llama 3.1.

    The system prompt caps the reply length so it stays under the Groq
    TTS token limit.
    """
    completion = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {
                "role": "system",
                "content": "You are a concise voice assistant. Your response MUST be under 50 words."
            },
            {"role": "user", "content": user_text}
        ],
        max_tokens=150,
        temperature=0.5,
    )
    return completion.choices[0].message.content


def _synthesize_speech(text, speech_file_path="output_response.wav"):
    """Convert *text* to a WAV file via Groq TTS; return the file path.

    Text is truncated to 1000 characters as a safety margin against the
    TTS engine's token limit.
    """
    safe_audio_text = text[:1000]
    response = client.audio.speech.create(
        model="canopylabs/orpheus-v1-english",
        voice="autumn",
        response_format="wav",
        input=safe_audio_text,
    )
    response.write_to_file(speech_file_path)
    return speech_file_path


def process_voice_assistant(audio_input):
    """Run the full voice pipeline: transcribe, reply, and speak.

    Parameters:
        audio_input: filesystem path to the recorded/uploaded audio,
            or None when nothing was provided.

    Returns:
        A 3-tuple (transcription, reply text, audio file path). On
        missing input or any API failure the third element is None and
        the text fields carry a user-facing message instead.
    """
    if audio_input is None:
        return "No audio provided.", "Please record or upload audio first.", None
    try:
        # --- 1. Audio to Text (Transcription) ---
        user_text = _transcribe_audio(audio_input)
        # --- 2. Text Generation ---
        ai_response_text = _generate_reply(user_text)
        # --- 3. Text to Audio (Speech Synthesis) ---
        speech_file_path = _synthesize_speech(ai_response_text)
        return user_text, ai_response_text, speech_file_path
    except Exception as e:
        # Top-level boundary for the Gradio callback: surface the error
        # in the UI rather than crashing the app.
        error_str = str(e)
        # HTTP 413 from the TTS endpoint means the reply exceeded its
        # payload limit; detection by substring is brittle but matches
        # the original behavior.
        if "413" in error_str:
            return "Audio processed", "The AI response was too long for the voice engine. Try a shorter question.", None
        return "Error", error_str, None
# --- Gradio Interface ---
# Two-column layout: audio input + submit on the left, transcript,
# reply text, and synthesized voice output on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ Groq Voice-to-Voice Assistant")
    gr.Markdown("Deploying on Hugging Face Spaces using Whisper, Llama 3.1, and Orpheus.")

    with gr.Row():
        with gr.Column():
            mic_or_file = gr.Audio(
                label="Input Audio (Mic or Upload)",
                type="filepath",
                sources=["microphone", "upload"],
            )
            run_button = gr.Button("Submit", variant="primary")
        with gr.Column():
            transcript_box = gr.Textbox(label="Transcription")
            reply_box = gr.Textbox(label="AI Response")
            voice_out = gr.Audio(label="AI Voice Output", autoplay=True)

    # Wire the button to the full voice pipeline.
    run_button.click(
        fn=process_voice_assistant,
        inputs=[mic_or_file],
        outputs=[transcript_box, reply_box, voice_out],
    )
# For Hugging Face, we just call launch() without specific ports
# (Spaces supplies the host/port configuration).
if __name__ == "__main__":
    demo.launch()