| import torch |
| from transformers import pipeline, VitsModel, AutoTokenizer |
| import scipy.io.wavfile |
| import gradio as gr |
| import tempfile |
| import os |
| import google.generativeai as genai |
|
|
| |
| |
| |
| |
|
|
| |
# Checkpoint identifiers used by the voice agent.
# Speech-to-text: a Whisper checkpoint.
STT_MODEL = "openai/whisper-tiny"
# Text-to-speech: an MMS-TTS (VITS) checkpoint; "swh" is presumably the
# Swahili voice, matching the Swahili prompts used elsewhere in this file.
TTS_MODEL_ID = "facebook/mms-tts-swh"

# Load everything once at import time so every request reuses the same
# pipeline, tokenizer and model objects.
print("Loading AI components...")
stt_pipe = pipeline(
    task="automatic-speech-recognition",
    model=STT_MODEL,
    device="cpu",
)
tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID)
tts_model = VitsModel.from_pretrained(TTS_MODEL_ID)
|
|
| |
| |
# The Gemini key is taken from the process environment; when it is absent
# the app still starts, but a warning is printed so the operator knows the
# LLM step cannot work.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    print("WARNING: GEMINI_API_KEY not found in environment variables. LLM will not work.")
else:
    genai.configure(api_key=GEMINI_API_KEY)
    print("Gemini API configured successfully from Secrets.")
|
|
def voice_agent_chat(audio_path):
    """Run one voice turn: transcribe the recording, ask Gemini, speak the reply.

    Args:
        audio_path: Path of the recorded audio file, or ``None`` when the
            user has not recorded anything.

    Returns:
        A ``(text, wav_path)`` tuple. ``text`` is the reply (or a status /
        error message). ``wav_path`` is the path of a WAV file holding the
        synthesized reply, or ``None`` when no audio was produced.
    """
    if not GEMINI_API_KEY:
        return "ERROR: Gemini API Key is missing. Please set the GEMINI_API_KEY secret in your Space settings.", None

    if audio_path is None:
        return "Tafadhali rekodi sauti yako.", None

    # --- 1. Speech -> text -------------------------------------------------
    stt_result = stt_pipe(audio_path, generate_kwargs={"language": "swahili"})
    user_text = (stt_result["text"] or "").strip()
    if not user_text:
        # Nothing was transcribed: ask for a new recording instead of
        # sending an empty question to the LLM.
        return "Tafadhali rekodi sauti yako.", None

    # --- 2. Text -> LLM reply ----------------------------------------------
    try:
        model = genai.GenerativeModel('gemini-1.5-flash')

        system_instruction = "Wewe ni SwaGPT, msaidizi wa akili mnemba unayezungumza Kiswahili sanifu. Jibu kwa ufupi sana (sentensi 1-2)."
        prompt = f"{system_instruction}\n\nMtumiaji: {user_text}"

        response = model.generate_content(prompt)
        ai_response = response.text
    except Exception as e:
        # Deliberate best-effort boundary: any API failure is reported to the
        # user as the reply text rather than crashing the request.
        ai_response = f"Tatizo la API: {str(e)}. Huenda umefikia kikomo cha matumizi ya bure."

    if not ai_response or not ai_response.strip():
        # There is nothing to synthesize, so skip the TTS model entirely.
        return ai_response or "", None

    # --- 3. Reply -> speech ------------------------------------------------
    inputs = tts_tokenizer(ai_response, return_tensors="pt")
    with torch.no_grad():
        output = tts_model(**inputs).waveform

    sampling_rate = tts_model.config.sampling_rate

    # Reserve a unique path, then close the handle before writing so that no
    # file descriptor is leaked and the file is not held open while scipy
    # writes to the same path. delete=False keeps the file for the caller.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
        wav_path = temp_file.name
    scipy.io.wavfile.write(wav_path, rate=sampling_rate, data=output.float().numpy().T)

    return ai_response, wav_path
|
|
| |
# Two-column UI: the left column records the user's voice, the right column
# shows the reply as text and as playable audio.
with gr.Blocks(title="SwaGPT Intelligent Voice Agent") as demo:
    gr.Markdown("# 🤖 SwaGPT Intelligent Voice Agent")
    gr.Markdown("Zungumza na SwaGPT! Mfumo huu unatumia Gemini kufikiri na SwaGPT kuzungumza.")

    with gr.Row():
        # Input side: recording widget plus the trigger button.
        with gr.Column():
            gr.Markdown("### 1. Zungumza (Talk)")
            audio_input = gr.Audio(label="Rekodi Sauti", type="filepath")
            submit_btn = gr.Button("Anza Mazungumzo", variant="primary")

        # Output side: reply text and synthesized speech.
        with gr.Column():
            gr.Markdown("### 2. Jibu (Response)")
            chat_text = gr.Textbox(label="Maandishi ya AI")
            audio_output = gr.Audio(label="Sauti ya AI")

    # The handler receives the recorded file path and returns
    # (reply text, reply audio path), mapped onto the two output widgets.
    submit_btn.click(voice_agent_chat, audio_input, [chat_text, audio_output])


# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|
|