Spaces:
Sleeping
Sleeping
# --- Dependencies (stdlib first, then third-party) ---
import io
import os

import soundfile as sf
import streamlit as st
from dotenv import load_dotenv
from espnet2.bin.tts_inference import Text2Speech
from groq import Groq
from pydub import AudioSegment
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Pull secrets (GROQ_API_KEY) from a local .env file into the process env.
load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Fail fast: the app cannot do anything useful without a key,
    # so surface the error and halt rendering here.
    st.error("Groq API key not found. Please add it to the .env file.")
    st.stop()

# One Groq client, reused for every chat completion request below.
groq_client = Groq(api_key=GROQ_API_KEY)
# Cache the heavy model objects across Streamlit reruns. The original
# comment intended st.cache_resource but the decorator was missing, so
# both models were re-initialized on every widget interaction.
@st.cache_resource
def load_models():
    """Load and return the speech models used by the app.

    Returns:
        tuple: (stt_pipe, tts_model) where
            stt_pipe  -- transformers ASR pipeline backed by Whisper-small,
            tts_model -- ESPnet Text2Speech model for synthesis.
    """
    # Speech-to-Text: Whisper-small wired into the transformers ASR pipeline.
    processor = AutoProcessor.from_pretrained("openai/whisper-small")
    stt_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")
    stt_pipe = pipeline(
        "automatic-speech-recognition",
        model=stt_model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
    )
    # Text-to-Speech: pretrained ESPnet VCTK model.
    tts_model = Text2Speech.from_pretrained(
        "espnet/espnet_tts_vctk_espnet_spk_voxceleb12_rawnet"
    )
    return stt_pipe, tts_model


stt_pipe, tts_model = load_models()
# --- Streamlit UI / main flow ---
st.title("Voice-Enabled Chatbot")

# Only WAV uploads are accepted; pydub normalizes the container below.
audio_file = st.file_uploader("Upload your voice input", type=['wav'])

if audio_file is not None:
    audio_bytes = audio_file.read()
    # Normalize the upload for Whisper: mono, 16 kHz. Feeding arbitrary
    # channel counts / rates to the ASR pipeline degrades or breaks
    # transcription.
    audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
    audio = audio.set_channels(1).set_frame_rate(16000)
    audio.export("temp.wav", format="wav")

    # Keep the sampling rate and pass it explicitly; the original discarded
    # it, which made the pipeline assume a default rate for the raw array.
    speech, sample_rate = sf.read("temp.wav")
    text = stt_pipe({"array": speech, "sampling_rate": sample_rate})['text']
    st.write("Transcribed Text:", text)

    try:
        # Generate the chat reply with Groq's hosted Mixtral model.
        chat_completion = groq_client.chat.completions.create(
            messages=[{"role": "user", "content": text}],
            model="mixtral-8x7b-32768",
            temperature=0.5,
            max_tokens=1024
        )
        response = chat_completion.choices[0].message.content
        st.write("Generated Response:", response)

        # Synthesize the reply. NOTE(review): this unpacking assumes the
        # ESPnet Text2Speech call returns a tuple with the waveform first;
        # newer espnet2 versions return a dict ("wav" key) — confirm against
        # the pinned espnet version.
        speech, *_ = tts_model(response)
        # Use the model's actual output rate when available instead of a
        # hard-coded 22050 Hz, which distorts pitch/speed on mismatch.
        out_rate = getattr(tts_model, "fs", None) or 22050
        sf.write("response.wav", speech, out_rate)
        st.audio("response.wav")
    except Exception as e:
        # Boundary handler: surface API/TTS failures in the UI rather than
        # crashing the whole Streamlit app.
        st.error(f"Error generating response: {e}")